diff options
author | Cyrille Bagard <nocbos@gmail.com> | 2017-01-03 21:05:01 (GMT) |
---|---|---|
committer | Cyrille Bagard <nocbos@gmail.com> | 2017-01-03 21:05:01 (GMT) |
commit | 8dc59f6f8a145952ceaec8f6d9ffd59d37ca058e (patch) | |
tree | b9037ec4fceb02ac9ea2e3a5d69ab7b311e81cae | |
parent | 8d3cfbd9814c80452e2b57a6c936cd4f6eea445a (diff) |
Avoided highlighting the same content several times.
-rw-r--r-- | db.py | 69 |
-rwxr-xr-x | htt.py | 20 |
2 files changed, 79 insertions, 10 deletions
```diff
--- a/db.py
+++ b/db.py
@@ -3,6 +3,7 @@
 
 from config import max_age
 
+import hashlib
 import sqlite3
 import time
 import tweepy
@@ -25,6 +26,7 @@ class LikeMemory():
             CREATE TABLE IF NOT EXISTS LikedTweets(
                 sid INTEGER PRIMARY KEY,
                 username TEXT,
+                fingerprint TEXT,
                 timestamp INTEGER,
                 purged BOOLEAN
             )
@@ -35,15 +37,76 @@ class LikeMemory():
         self._db.commit()
 
 
-    def save_liked_status(self, sid, username):
+    def _compute_content_fingerprint(self, content):
+        """Compute the fingerprint of a given status content."""
+
+        # Step 1: filter all hashtags
+
+        cut = content.split(' ')
+
+        content = ' '.join([ c for c in cut if not c.startswith('#') ])
+
+        # Step 2: stop at the first link
+
+        cut = content.split(' ')
+
+        keep = []
+
+        for c in cut:
+
+            if c.startswith('http://') or c.startswith('https://'):
+                break
+
+            keep.append(c)
+
+        content = ' '.join(keep)
+
+        # Step 3: get a fresh start
+
+        separators = ':-!?.'
+
+        content = content.lstrip(' ' + separators)
+
+        # Step 4: Extract a common shared base
+
+        base = content
+
+        for sep in separators:
+
+            pos = base.find(sep)
+
+            if pos != -1:
+                base = base[:pos]
+
+        return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest()
+
+
+    def is_original_content(self, content):
+        """Ensure that a given content has never been seen."""
+
+        fingerprint = self._compute_content_fingerprint(content)
+
+        values = (fingerprint, )
+
+        cursor = self._db.cursor()
+        cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values)
+
+        found = cursor.fetchone()
+
+        return found is None
+
+
+    def save_liked_status(self, sid, username, content):
         """Remember a given liked status."""
 
+        fingerprint = self._compute_content_fingerprint(content)
+
         timestamp = int(time.time())
 
-        values = (sid, username, timestamp, False)
+        values = (sid, username, fingerprint, timestamp, False)
 
         cursor = self._db.cursor()
-        cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?)', values)
+        cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values)
 
         self._db.commit()
 
--- a/htt.py
+++ b/htt.py
@@ -72,18 +72,24 @@ class StdOutListener(StreamListener):
 
         if like:
 
-            try:
+            if self._memory.is_original_content(content):
 
-                self._api.create_favorite(sid)
+                try:
 
-                self._memory.save_liked_status(sid, username)
+                    self._api.create_favorite(sid)
 
-                print('@%s: "%s" (id=%d)' % (username, content, sid))
-                print(' -> https://twitter.com/%s/status/%d' % (username, sid))
+                    self._memory.save_liked_status(sid, username, content)
 
-            except tweepy.error.TweepError:
+                    print('@%s: "%s" (id=%d)' % (username, content, sid))
+                    print(' -> https://twitter.com/%s/status/%d' % (username, sid))
 
-                pass
+                except tweepy.error.TweepError:
+
+                    pass
+
+            else:
+
+                print('Already seen "%s"' % content)
 
         else:
 
```