diff options
author Cyrille Bagard <>2017-01-28 11:55:42 (GMT)
committer Cyrille Bagard <>2017-01-28 11:55:42 (GMT)
commit bcb5341a02725b9a6c0d28a211b594b2362c37eb (patch)
parent 8dc59f6f8a145952ceaec8f6d9ffd59d37ca058e (diff)
Defined a new version for tracking content, too aggressive for Twitter.
6 files changed, 392 insertions, 19 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..af9f4aa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# -*- mode: sh -*-
+# Emacs
+# Python
+# Misc
diff --git a/config.py b/config.py
index b1a3471..051de6c 100644
--- a/config.py
+++ b/config.py
@@ -5,8 +5,11 @@
# List of space-separated hashtags to follow, for instance #python #bot
hashtags = '#python #bot'
-# Keywords to find in Tweets we want to highlight
+# Keywords to find in Tweets we want to highlight; underscores will be replaced by spaces.
white_kwds = 'you got the idea'
# Age of old Tweets to get purged in days
max_age = 14
+# List of space-separated accounts to follow
+accounts = 'laughing_bit'
diff --git a/db.py b/db.py
index 6b22368..ae0bf17 100644
--- a/db.py
+++ b/db.py
@@ -9,15 +9,25 @@ import time
import tweepy
+# Shared SQLite connection used by LikeMemory and TrackMemory; set by open_db().
+_db = None
+def open_db():
+    """Open the shared database connection.
+
+    The connection to 'HTT.db' is stored in the module-level _db variable.
+    Calling this function again while a connection is already open keeps
+    the existing connection instead of abandoning it for a new one.
+    """
+    global _db
+    if _db is None:
+        _db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES)
class LikeMemory():
"""Track all liked Tweets."""
def __init__(self, api):
"""Build the Python object."""
- self._api = api
+ global _db
- self._db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES)
+ self._api = api
sqlite3.register_adapter(bool, int)
sqlite3.register_converter("BOOLEAN", lambda v: bool(int(v)))
@@ -32,9 +42,9 @@ class LikeMemory():
- cursor = self._db.cursor()
+ cursor = _db.cursor()
- self._db.commit()
+ _db.commit()
def _compute_content_fingerprint(self, content):
@@ -78,17 +88,19 @@ class LikeMemory():
if pos != -1:
base = base[:pos]
- return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest()
+ return hashlib.md5(base.rstrip(' ').lower().encode('utf-8')).hexdigest()
def is_original_content(self, content):
"""Ensure that a given content has never been seen."""
+ global _db
fingerprint = self._compute_content_fingerprint(content)
values = (fingerprint, )
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values)
found = cursor.fetchone()
@@ -99,25 +111,29 @@ class LikeMemory():
def save_liked_status(self, sid, username, content):
"""Remember a given liked status."""
+ global _db
fingerprint = self._compute_content_fingerprint(content)
timestamp = int(time.time())
values = (sid, username, fingerprint, timestamp, False)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values)
- self._db.commit()
+ _db.commit()
def purge_old_status(self):
"""Purge old seen statuses."""
+ global _db
timestamp = int(time.time()) - max_age * 24 * 60 * 60
values = (timestamp, False)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('SELECT sid FROM LikedTweets WHERE timestamp < ? AND purged = ?', values)
rows = cursor.fetchall()
@@ -137,9 +153,74 @@ class LikeMemory():
values = (True, sid)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('UPDATE LikedTweets SET purged = ? WHERE sid = ?', values)
- self._db.commit()
+ _db.commit()
print('Purged %d liked Tweet%s!' % (len(rows), '' if len(rows) <= 1 else 's'))
+class TrackMemory():
+    """Remember the last seen Tweet of each tracked user.
+
+    Rows are stored in the TrackedUsers table through the module-level _db
+    connection, so open_db() must have been called before building an
+    instance.
+    """
+
+    def __init__(self):
+        """Create the TrackedUsers table if it does not exist yet."""
+        global _db
+        # NOTE(review): the table header and the uid column were missing from
+        # this statement. They are rebuilt from the queries below, which
+        # select on uid and insert (uid, username, last) in that order. The
+        # PRIMARY KEY constraint is an assumption to confirm.
+        sql = '''
+            CREATE TABLE IF NOT EXISTS TrackedUsers(
+                uid INTEGER PRIMARY KEY,
+                username TEXT,
+                last INTEGER
+            )
+        '''
+        cursor = _db.cursor()
+        cursor.execute(sql)
+        _db.commit()
+
+    def get_last_seen_for(self, uid):
+        """Return the status id of the last seen Tweet for a user.
+
+        None is returned when the user has no row in TrackedUsers.
+        """
+        global _db
+        cursor = _db.cursor()
+        cursor.execute('SELECT last FROM TrackedUsers WHERE uid = ?', (uid, ))
+        found = cursor.fetchone()
+        return None if found is None else found[0]
+
+    def set_last_seen_for(self, uid, name, sid):
+        """Record sid as the last seen Tweet for a user.
+
+        A new row (uid, name, sid) is inserted when nothing is known for the
+        user yet; otherwise only the stored status id is updated.
+        """
+        global _db
+        cursor = _db.cursor()
+        if self.get_last_seen_for(uid) is None:
+            cursor.execute('INSERT INTO TrackedUsers VALUES (?, ?, ?)', (uid, name, sid))
+        else:
+            cursor.execute('UPDATE TrackedUsers SET last = ? WHERE uid = ?', (sid, uid))
+        _db.commit()
diff --git a/ b/
index d50156f..dcfd92d 100755
--- a/
+++ b/
@@ -9,6 +9,7 @@ from tweepy.streaming import StreamListener
from auth import *
from config import hashtags, white_kwds
from db import LikeMemory
+from users import listen_to_users
import json
import sys
@@ -16,7 +17,7 @@ import sys
class StdOutListener(StreamListener):
"""A listener handles tweets are the received from the stream."""
- def __init__(self, api, memory):
+ def __init__(self, api, memory, tempo):
"""Build the Python object."""
@@ -24,6 +25,9 @@ class StdOutListener(StreamListener):
self._api = api
self._memory = memory
+ self._tempo = tempo
+ self._previous = None
self._white = [ s.lower() for s in white_kwds.split(' ') ]
@@ -52,8 +56,17 @@ class StdOutListener(StreamListener):
decoded = json.loads(data)
if 'retweeted_status' in decoded:
+ if not ('id' in decoded['retweeted_status']):
+ print(decoded)
sid, username, content = self.get_status_info(decoded['retweeted_status'])
+ if not ('id' in decoded):
+ print(decoded)
sid, username, content = self.get_status_info(decoded)
like = False
@@ -76,8 +89,19 @@ class StdOutListener(StreamListener):
- self._api.create_favorite(sid)
+ if self._tempo:
+ if self._previous != None:
+ self._api.create_favorite(self._previous)
+ self._previous = sid
+ else:
+ self._api.create_favorite(sid)
+ # Save even pending statuses to remember them when looking for original content
self._memory.save_liked_status(sid, username, content)
print('@%s: "%s" (id=%d)' % (username, content, sid))
@@ -114,8 +138,8 @@ if __name__ == '__main__':
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
- api = tweepy.API(auth)
- memory = LikeMemory(api)
+ api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+ #memory = LikeMemory(api)
if len(sys.argv) > 1 and sys.argv[1] == '--purge':
@@ -123,7 +147,9 @@ if __name__ == '__main__':
- listener = StdOutListener(api, memory)
+ listen_to_users(auth, api)
+ #listener = StdOutListener(api, memory, True)
- stream = Stream(auth, listener)
- stream.filter(track=hashtags.split(' '))
+ #stream = Stream(auth, listener)
+ #stream.filter(track=hashtags.split(' '))
diff --git a/taste.py b/taste.py
new file mode 100644
index 0000000..8471473
--- /dev/null
+++ b/taste.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+import tweepy
+from config import white_kwds
+# ANSI escape sequences used to colorize the console output:
+# reset, bold red, bold green and bold yellow.
+COLOR_RESET = "\033[0m"
+COLOR_REJECTED = "\033[1;31m"
+COLOR_ACCEPTED = "\033[1;32m"
+COLOR_ALREADY = "\033[1;33m"
+def get_displayable_content(orig, margin):
+    """Drop the empty lines of a text and indent the following lines.
+
+    Every line after the first one is prefixed with margin spaces, so the
+    text stays aligned when printed after a prefix of that width.
+    """
+    separator = '\n' + ' ' * margin
+    return separator.join(line for line in orig.split('\n') if line)
+def analyse(sid, username, content, api, memory):
+    """Like a Tweet when it mentions a highlighted keyword and is original.
+
+    sid is the status id to like, username its author and content its text.
+    api is used to create the favorite; memory tells whether the content
+    has already been seen and remembers every liked status. The decision
+    (liking, already seen or reject) is printed on the console.
+    """
+    # Underscores in the configured keywords stand for spaces, which allows
+    # multi-word phrases. split() without argument ignores repeated spaces,
+    # so no empty keyword can match an empty word.
+    keywords = [kwd.lower().replace('_', ' ') for kwd in white_kwds.split()]
+    lowered = content.lower()
+    words = lowered.split(' ')
+
+    def matches(kwd):
+        # A phrase containing spaces can never equal a single word, so it
+        # is searched in the whole text; a plain keyword must equal a word.
+        return kwd in lowered if ' ' in kwd else kwd in words
+
+    if not any(matches(kwd) for kwd in keywords):
+        displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username))
+        print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
+        return
+    if not memory.is_original_content(content):
+        displayable = get_displayable_content(content, len('Already seen "'))
+        print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable)
+        return
+    try:
+        api.create_favorite(sid)
+    except tweepy.error.TweepError as err:
+        # Stay best-effort, but report the failure instead of hiding it.
+        print('[!] Unable to like status %s of @%s: %s' % (sid, username, err))
+        return
+    memory.save_liked_status(sid, username, content)
+    displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username))
+    print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
+    # NOTE(review): the format string had lost its placeholders and raised a
+    # TypeError; a link to the status is assumed here - confirm.
+    print(' -> https://twitter.com/%s/status/%d' % (username, sid))
diff --git a/users.py b/users.py
new file mode 100755
index 0000000..e61d3c2
--- /dev/null
+++ b/users.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+import tweepy
+from config import accounts
+from db import open_db, LikeMemory, TrackMemory
+from taste import analyse
+import os
+import pickle
+from random import shuffle
+# Pickle file in which the collected follower ids are cached between runs.
+CACHE_FILENAME = 'ids.cache'
+class UsersListener():
+    """Poll the timelines of tracked accounts and analyse their new Tweets."""
+
+    def __init__(self, api):
+        """Open the database and compute the list of accounts to track.
+
+        api is the tweepy API object used for every request.
+        """
+        super().__init__()
+        self._api = api
+        open_db()
+        self._memory = LikeMemory(api)
+        self._tracker = TrackMemory()
+        self._compute_ids_to_follow(True)
+
+    def _compute_ids_to_follow(self, cached):
+        """Build the shuffled list of account ids to track.
+
+        The followers of every configured account are collected, or reloaded
+        from the cache file when cached is true and the file exists. The ids
+        are then deduplicated, and the accounts already following the
+        authenticated user are removed.
+        """
+        if cached and os.path.isfile(CACHE_FILENAME):
+            # NOTE: pickle can run arbitrary code on load; the cache file
+            # must only ever be written by this program.
+            with open(CACHE_FILENAME, 'rb') as fd:
+                ids = pickle.load(fd)
+            print('[i] Reloaded %u accounts' % len(ids))
+        else:
+            ids = []
+            for master in accounts.split(' '):
+                count = 0
+                try:
+                    for page in tweepy.Cursor(self._api.followers_ids, screen_name=master).pages():
+                        count += len(page)
+                        ids.extend(page)
+                    print('[i] Got %u accounts following %s' % (count, master))
+                except tweepy.error.TweepError:
+                    # Keep the pages received so far and go on with the next account.
+                    print('[!] Error while receiving followers for %s...' % master)
+            with open(CACHE_FILENAME, 'wb') as fd:
+                pickle.dump(ids, fd)
+            print('[i] Loaded %u accounts' % len(ids))
+        ids = list(set(ids))
+        print('[i] Kept %u accounts' % len(ids))
+        # Remove all accounts which already follow the authenticated user.
+        # NOTE(review): the arguments of this Cursor call were truncated; the
+        # followers of the authenticated account are assumed - confirm.
+        me = self._api.me()
+        already = set()
+        for page in tweepy.Cursor(self._api.followers_ids, screen_name=me.screen_name).pages():
+            already.update(page)
+        print('[i] I am followed by %u accounts' % len(already))
+        self._ids = [x for x in ids if x not in already]
+        shuffle(self._ids)
+        print('[i] Tracking %u accounts...' % len(self._ids))
+
+    def _fetch_new_statuses(self, uid):
+        """Return the Tweets of a user which are newer than the last seen one."""
+        since = self._tracker.get_last_seen_for(uid)
+        try:
+            # Keywords are required: positionally, the second argument of
+            # user_timeline() is bound to user_id and not to since_id.
+            return self._api.user_timeline(user_id=uid, since_id=since)
+        except tweepy.error.TweepError as e:
+            # The response is missing when the request could not be sent.
+            code = getattr(e.response, 'status_code', None)
+            # 401: private account ("Not authorized.").
+            # 404: nothing new ("Sorry, that page does not exist.", code 34).
+            if code in (401, 404):
+                return []
+            print(e, code)
+            raise
+
+    def start(self, auth):
+        """Poll every tracked account forever and analyse its new Tweets.
+
+        auth is not used; it is kept for interface compatibility. Any error
+        other than a private account (401) or a missing page (404) is
+        printed and raised again.
+        """
+        while True:
+            for uid in self._ids:
+                first = None
+                for status in self._fetch_new_statuses(uid):
+                    # NOTE(review): the right-hand sides of these assignments
+                    # were lost; the usual tweepy Status attributes are
+                    # assumed - confirm. The user id is taken from the
+                    # enclosing loop, presumably identical to the author of
+                    # every status of this timeline.
+                    sid = status.id
+                    username = status.user.screen_name
+                    # Analyse the text of the original Tweet for retweets.
+                    while hasattr(status, 'retweeted_status'):
+                        status = status.retweeted_status
+                    analyse(sid, username, status.text, self._api, self._memory)
+                    if first is None:
+                        first = username, sid
+                if first is not None:
+                    username, sid = first
+                    self._tracker.set_last_seen_for(uid, username, sid)
+def listen_to_users(auth, api):
+    """Print the API rate limit status, then build the users listener.
+
+    Each resource category is listed with its endpoints; an endpoint whose
+    remaining quota differs from its limit is flagged with '!!'. auth is
+    only needed by the polling loop, which is currently disabled.
+    """
+    data = api.rate_limit_status()
+    for name, category in data['resources'].items():
+        print('%s' % name)
+        for path, props in category.items():
+            changed = props['remaining'] != props['limit']
+            print(' %s %s: %d / %d' % ('!!' if changed else ' ', path, props['remaining'], props['limit']))
+    listener = UsersListener(api)
+    # NOTE(review): polling is left disabled as in the original change,
+    # presumably because it is too aggressive for Twitter - confirm.
+    ####listener.start(auth)