diff options
Diffstat (limited to 'db.py')
-rw-r--r-- | db.py | 69 |
1 files changed, 66 insertions, 3 deletions
@@ -3,6 +3,7 @@
 
 from config import max_age
 
+import hashlib
 import sqlite3
 import time
 import tweepy
@@ -25,6 +26,7 @@ class LikeMemory():
             CREATE TABLE IF NOT EXISTS LikedTweets(
                 sid INTEGER PRIMARY KEY,
                 username TEXT,
+                fingerprint TEXT,
                 timestamp INTEGER,
                 purged BOOLEAN
             )
@@ -35,15 +37,76 @@ class LikeMemory():
         self._db.commit()
 
 
-    def save_liked_status(self, sid, username):
+    def _compute_content_fingerprint(self, content):
+        """Compute the fingerprint of a given status content."""
+
+        # Step 1: filter all hashtags
+
+        cut = content.split(' ')
+
+        content = ' '.join([ c for c in cut if not c.startswith('#') ])
+
+        # Step 2: stop at the first link
+
+        cut = content.split(' ')
+
+        keep = []
+
+        for c in cut:
+
+            if c.startswith('http://') or c.startswith('https://'):
+                break
+
+            keep.append(c)
+
+        content = ' '.join(keep)
+
+        # Step 3: get a fresh start
+
+        separators = ':-!?.'
+
+        content = content.lstrip(' ' + separators)
+
+        # Step 4: Extract a common shared base
+
+        base = content
+
+        for sep in separators:
+
+            pos = base.find(sep)
+
+            if pos != -1:
+                base = base[:pos]
+
+        return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest()
+
+
+    def is_original_content(self, content):
+        """Ensure that a given content has never been seen."""
+
+        fingerprint = self._compute_content_fingerprint(content)
+
+        values = (fingerprint, )
+
+        cursor = self._db.cursor()
+        cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values)
+
+        found = cursor.fetchone()
+
+        return found is None
+
+
+    def save_liked_status(self, sid, username, content):
         """Remember a given liked status."""
 
+        fingerprint = self._compute_content_fingerprint(content)
+
         timestamp = int(time.time())
 
-        values = (sid, username, timestamp, False)
+        values = (sid, username, fingerprint, timestamp, False)
 
         cursor = self._db.cursor()
-        cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?)', values)
+        cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values)
 
         self._db.commit()
 