summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cyrille Bagard <nocbos@gmail.com> 2017-01-03 21:05:01 (GMT)
committer Cyrille Bagard <nocbos@gmail.com> 2017-01-03 21:05:01 (GMT)
commit 8dc59f6f8a145952ceaec8f6d9ffd59d37ca058e (patch)
tree b9037ec4fceb02ac9ea2e3a5d69ab7b311e81cae
parent 8d3cfbd9814c80452e2b57a6c936cd4f6eea445a (diff)
Avoided highlighting the same content several times.
-rw-r--r-- db.py 69
-rwxr-xr-x htt.py 20
2 files changed, 79 insertions, 10 deletions
diff --git a/db.py b/db.py
index c83e8ee..6b22368 100644
--- a/db.py
+++ b/db.py
@@ -3,6 +3,7 @@
from config import max_age
+import hashlib
import sqlite3
import time
import tweepy
@@ -25,6 +26,7 @@ class LikeMemory():
CREATE TABLE IF NOT EXISTS LikedTweets(
sid INTEGER PRIMARY KEY,
username TEXT,
+ fingerprint TEXT,
timestamp INTEGER,
purged BOOLEAN
)
@@ -35,15 +37,76 @@ class LikeMemory():
self._db.commit()
- def save_liked_status(self, sid, username):
+ def _compute_content_fingerprint(self, content):
+ """Compute the fingerprint of a given status content."""
+
+ # Step 1: filter all hashtags
+
+ cut = content.split(' ')
+
+ content = ' '.join([ c for c in cut if not c.startswith('#') ])
+
+ # Step 2: stop at the first link
+
+ cut = content.split(' ')
+
+ keep = []
+
+ for c in cut:
+
+ if c.startswith('http://') or c.startswith('https://'):
+ break
+
+ keep.append(c)
+
+ content = ' '.join(keep)
+
+ # Step 3: get a fresh start
+
+ separators = ':-!?.'
+
+ content = content.lstrip(' ' + separators)
+
+ # Step 4: Extract a common shared base
+
+ base = content
+
+ for sep in separators:
+
+ pos = base.find(sep)
+
+ if pos != -1:
+ base = base[:pos]
+
+ return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest()
+
+
+ def is_original_content(self, content):
+ """Ensure that a given content has never been seen."""
+
+ fingerprint = self._compute_content_fingerprint(content)
+
+ values = (fingerprint, )
+
+ cursor = self._db.cursor()
+ cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values)
+
+ found = cursor.fetchone()
+
+ return found is None
+
+
+ def save_liked_status(self, sid, username, content):
"""Remember a given liked status."""
+ fingerprint = self._compute_content_fingerprint(content)
+
timestamp = int(time.time())
- values = (sid, username, timestamp, False)
+ values = (sid, username, fingerprint, timestamp, False)
cursor = self._db.cursor()
- cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?)', values)
+ cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values)
self._db.commit()
diff --git a/htt.py b/htt.py
index ea92470..d50156f 100755
--- a/htt.py
+++ b/htt.py
@@ -72,18 +72,24 @@ class StdOutListener(StreamListener):
if like:
- try:
+ if self._memory.is_original_content(content):
- self._api.create_favorite(sid)
+ try:
- self._memory.save_liked_status(sid, username)
+ self._api.create_favorite(sid)
- print('@%s: "%s" (id=%d)' % (username, content, sid))
- print(' -> https://twitter.com/%s/status/%d' % (username, sid))
+ self._memory.save_liked_status(sid, username, content)
- except tweepy.error.TweepError:
+ print('@%s: "%s" (id=%d)' % (username, content, sid))
+ print(' -> https://twitter.com/%s/status/%d' % (username, sid))
- pass
+ except tweepy.error.TweepError:
+
+ pass
+
+ else:
+
+ print('Already seen "%s"' % content)
else: