From 8dc59f6f8a145952ceaec8f6d9ffd59d37ca058e Mon Sep 17 00:00:00 2001 From: Cyrille Bagard Date: Tue, 3 Jan 2017 22:05:01 +0100 Subject: Avoided highlighting the same content several times. --- db.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- htt.py | 20 ++++++++++++------- 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/db.py b/db.py index c83e8ee..6b22368 100644 --- a/db.py +++ b/db.py @@ -3,6 +3,7 @@ from config import max_age +import hashlib import sqlite3 import time import tweepy @@ -25,6 +26,7 @@ class LikeMemory(): CREATE TABLE IF NOT EXISTS LikedTweets( sid INTEGER PRIMARY KEY, username TEXT, + fingerprint TEXT, timestamp INTEGER, purged BOOLEAN ) @@ -35,15 +37,76 @@ class LikeMemory(): self._db.commit() - def save_liked_status(self, sid, username): + def _compute_content_fingerprint(self, content): + """Compute the fingerprint of a given status content.""" + + # Step 1: filter all hashtags + + cut = content.split(' ') + + content = ' '.join([ c for c in cut if not c.startswith('#') ]) + + # Step 2: stop at the first link + + cut = content.split(' ') + + keep = [] + + for c in cut: + + if c.startswith('http://') or c.startswith('https://'): + break + + keep.append(c) + + content = ' '.join(keep) + + # Step 3: get a fresh start + + separators = ':-!?.' 
+ + content = content.lstrip(' ' + separators) + + # Step 4: Extract a common shared base + + base = content + + for sep in separators: + + pos = base.find(sep) + + if pos != -1: + base = base[:pos] + + return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest() + + + def is_original_content(self, content): + """Ensure that a given content has never been seen.""" + + fingerprint = self._compute_content_fingerprint(content) + + values = (fingerprint, ) + + cursor = self._db.cursor() + cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values) + + found = cursor.fetchone() + + return found is None + + + def save_liked_status(self, sid, username, content): """Remember a given liked status.""" + fingerprint = self._compute_content_fingerprint(content) + timestamp = int(time.time()) - values = (sid, username, timestamp, False) + values = (sid, username, fingerprint, timestamp, False) cursor = self._db.cursor() - cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?)', values) + cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values) self._db.commit() diff --git a/htt.py b/htt.py index ea92470..d50156f 100755 --- a/htt.py +++ b/htt.py @@ -72,18 +72,24 @@ class StdOutListener(StreamListener): if like: - try: + if self._memory.is_original_content(content): - self._api.create_favorite(sid) + try: - self._memory.save_liked_status(sid, username) + self._api.create_favorite(sid) - print('@%s: "%s" (id=%d)' % (username, content, sid)) - print(' -> https://twitter.com/%s/status/%d' % (username, sid)) + self._memory.save_liked_status(sid, username, content) - except tweepy.error.TweepError: + print('@%s: "%s" (id=%d)' % (username, content, sid)) + print(' -> https://twitter.com/%s/status/%d' % (username, sid)) - pass + except tweepy.error.TweepError: + + pass + + else: + + print('Already seen "%s"' % content) else: -- cgit v0.11.2-87-g4458