From bcb5341a02725b9a6c0d28a211b594b2362c37eb Mon Sep 17 00:00:00 2001 From: Cyrille Bagard Date: Sat, 28 Jan 2017 12:55:42 +0100 Subject: Defined a new version for tracking content, too aggressive for Twitter. --- .gitignore | 10 ++++ config.py | 5 +- db.py | 103 +++++++++++++++++++++++++++++++---- htt.py | 40 +++++++++++--- taste.py | 76 ++++++++++++++++++++++++++ users.py | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 392 insertions(+), 19 deletions(-) create mode 100644 .gitignore create mode 100644 taste.py create mode 100755 users.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..af9f4aa --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# -*- mode: sh -*- + +# Emacs +*~ + +# Python +__pycache__ + +# Misc +ids.cache diff --git a/config.py b/config.py index b1a3471..051de6c 100644 --- a/config.py +++ b/config.py @@ -5,8 +5,11 @@ # List of space-separated hashtags to follow, for instance #python #bot hashtags = '#python #bot' -# Keywords to find in Tweets we want to highlight +# Keywords to find in Tweets we want to highlight; underscores will be replaced by spaces. white_kwds = 'you got the idea' # Age of old Tweets to get purged in days max_age = 14 + +# List of space-separated accounts to follow +accounts = 'laughing_bit' diff --git a/db.py b/db.py index 6b22368..ae0bf17 100644 --- a/db.py +++ b/db.py @@ -9,15 +9,25 @@ import time import tweepy +_db = None + +def open_db(): + """Open the database.""" + + global _db + + _db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES) + + class LikeMemory(): """Track all liked Tweets.""" def __init__(self, api): """Build the Python object.""" - self._api = api + global _db - self._db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES) + self._api = api sqlite3.register_adapter(bool, int) sqlite3.register_converter("BOOLEAN", lambda v: bool(int(v))) @@ -32,9 +42,9 @@ class LikeMemory(): ) ''' - cursor = self._db.cursor() + cursor = _db.cursor() cursor.execute(sql) - self._db.commit() + _db.commit() def _compute_content_fingerprint(self, content): @@ -78,17 +88,19 @@ class LikeMemory(): if pos != -1: base = base[:pos] - return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest() + return hashlib.md5(base.rstrip(' ').lower().encode('utf-8')).hexdigest() def is_original_content(self, content): """Ensure that a given content has never been seen.""" + global _db + fingerprint = self._compute_content_fingerprint(content) values = (fingerprint, ) - cursor = self._db.cursor() + cursor = _db.cursor() cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values) found = cursor.fetchone() @@ -99,25 +111,29 @@ class LikeMemory(): def save_liked_status(self, sid, username, content): """Remember a given liked status.""" + global _db + fingerprint = self._compute_content_fingerprint(content) timestamp = int(time.time()) values = (sid, username, fingerprint, timestamp, False) - cursor = self._db.cursor() + cursor = _db.cursor() cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values) - self._db.commit() + _db.commit() def purge_old_status(self): """Purge old seen statuses.""" + global _db + timestamp = int(time.time()) - max_age * 24 * 60 * 60 values = (timestamp, False) - cursor = self._db.cursor() + cursor = _db.cursor() cursor.execute('SELECT sid FROM LikedTweets WHERE timestamp < ? AND purged = ?', values) rows = cursor.fetchall() @@ -137,9 +153,74 @@ class LikeMemory(): values = (True, sid) - cursor = self._db.cursor() + cursor = _db.cursor() cursor.execute('UPDATE LikedTweets SET purged = ? WHERE sid = ?', values) - self._db.commit() + _db.commit() print('Purged %d liked Tweet%s!' % (len(rows), '' if len(rows) <= 1 else 's')) + + +class TrackMemory(): + """Remember last seen Tweet for users.""" + + def __init__(self): + """Build the Python object.""" + + global _db + + sql = ''' + CREATE TABLE IF NOT EXISTS TrackedUsers( + uid INTEGER PRIMARY KEY, + username TEXT, + last INTEGER + ) + ''' + + cursor = _db.cursor() + cursor.execute(sql) + _db.commit() + + + def get_last_seen_for(self, uid): + """Get the status id of the last Tweet for a given user.""" + + global _db + + values = (uid, ) + + cursor = _db.cursor() + cursor.execute('SELECT last FROM TrackedUsers WHERE uid = ?', values) + + found = cursor.fetchone() + + if found is None: + last = None + else: + last = found[0] + + return last + + + def set_last_seen_for(self, uid, name, sid): + """Set the status id of the last seen Tweet for a given user.""" + + global _db + + since = self.get_last_seen_for(uid) + + if since is None: + + values = (uid, name, sid) + + cursor = _db.cursor() + cursor.execute('INSERT INTO TrackedUsers VALUES (?, ?, ?)', values) + + else: + + values = (sid, uid) + + cursor = _db.cursor() + cursor.execute('UPDATE TrackedUsers SET last = ? WHERE uid = ?', values) + + _db.commit() diff --git a/htt.py b/htt.py index d50156f..dcfd92d 100755 --- a/htt.py +++ b/htt.py @@ -9,6 +9,7 @@ from tweepy.streaming import StreamListener from auth import * from config import hashtags, white_kwds from db import LikeMemory +from users import listen_to_users import json import sys @@ -16,7 +17,7 @@ import sys class StdOutListener(StreamListener): """A listener handles tweets are the received from the stream.""" - def __init__(self, api, memory): + def __init__(self, api, memory, tempo): """Build the Python object.""" super().__init__() @@ -24,6 +25,9 @@ class StdOutListener(StreamListener): self._api = api self._memory = memory + self._tempo = tempo + self._previous = None + self._white = [ s.lower() for s in white_kwds.split(' ') ] @@ -52,8 +56,17 @@ class StdOutListener(StreamListener): decoded = json.loads(data) if 'retweeted_status' in decoded: + + if not ('id' in decoded['retweeted_status']): + print(decoded) + sid, username, content = self.get_status_info(decoded['retweeted_status']) + else: + + if not ('id' in decoded): + print(decoded) + sid, username, content = self.get_status_info(decoded) like = False @@ -76,8 +89,19 @@ class StdOutListener(StreamListener): try: - self._api.create_favorite(sid) + if self._tempo: + if self._previous != None: + + self._api.create_favorite(self._previous) + + self._previous = sid + + else: + + self._api.create_favorite(sid) + + # Save even pending statuses to remember them when looking for original content self._memory.save_liked_status(sid, username, content) print('@%s: "%s" (id=%d)' % (username, content, sid)) @@ -114,8 +138,8 @@ if __name__ == '__main__': auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) auth.set_access_token(ACCESS_KEY, ACCESS_SECRET) - api = tweepy.API(auth) - memory = LikeMemory(api) + api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) + #memory = LikeMemory(api) if len(sys.argv) > 1 and sys.argv[1] == '--purge': @@ -123,7 +147,9 @@ if __name__ == '__main__': else: - listener = StdOutListener(api, memory) + listen_to_users(auth, api) + + #listener = StdOutListener(api, memory, True) - stream = Stream(auth, listener) - stream.filter(track=hashtags.split(' ')) + #stream = Stream(auth, listener) + #stream.filter(track=hashtags.split(' ')) diff --git a/taste.py b/taste.py new file mode 100644 index 0000000..8471473 --- /dev/null +++ b/taste.py @@ -0,0 +1,76 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +import tweepy +from config import white_kwds + + +COLOR_RESET = "\033[0m" + +COLOR_REJECTED = "\033[1;31m" +COLOR_ACCEPTED = "\033[1;32m" +COLOR_ALREADY = "\033[1;33m" + + +def get_displayable_content(orig, margin): + """Format content to get it displayable.""" + + padding = '\n' + ' ' * margin + + useful = [ l for l in orig.split('\n') if len(l) > 0 ] + + result = padding.join(useful) + + return result + + +def analyse(sid, username, content, api, memory): + """Analyse a Tweet content.""" + + like = False + + words = content.split(' ') + + white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ] + + for kwd in white: + + for w in words: + if w.lower() == kwd: + like = True + break + + if like: + break + + if like: + + if memory.is_original_content(content): + + try: + + api.create_favorite(sid) + + memory.save_liked_status(sid, username, content) + + displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username)) + + print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) + print(' -> https://twitter.com/%s/status/%d' % (username, sid)) + + except tweepy.error.TweepError: + + pass + + else: + + displayable = get_displayable_content(content, len('Already seen "')) + + print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable) + + else: + + displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username)) + + print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) diff --git a/users.py b/users.py new file mode 100755 index 0000000..e61d3c2 --- /dev/null +++ b/users.py @@ -0,0 +1,177 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +import tweepy +from config import accounts +from db import open_db, LikeMemory, TrackMemory +from taste import analyse +import os +import pickle +from random import shuffle + + +CACHE_FILENAME = 'ids.cache' + + +class UsersListener(): + """A listener handles tweets are the received from the stream.""" + + def __init__(self, api): + """Build the Python object.""" + + super().__init__() + + self._api = api + + open_db() + + self._memory = LikeMemory(api) + self._tracker = TrackMemory() + + self._compute_ids_to_follow(True) + + + def _compute_ids_to_follow(self, cached): + """Get the list of accounts to track.""" + + if not os.path.isfile(CACHE_FILENAME): + cached = False + + if cached: + + ids = pickle.load(open(CACHE_FILENAME, 'rb')) + + print('[i] Reloaded %u accounts' % len(ids)) + + else: + + ids = [] + + for master in accounts.split(' '): + + count = 0 + + try: + + for page in tweepy.Cursor(self._api.followers_ids, screen_name=master).pages(): + + count += len(page) + ids.extend(page) + + print('[i] Got %u accounts following %s' % (count, master)) + + except: + + print('[!] Error while receiving followers for %s...' % master) + + + pickle.dump(ids, open(CACHE_FILENAME, 'wb')) + + print('[i] Loaded %u accounts' % len(ids)) + + ids = list(set(ids)) + + print('[i] Kept %u accounts' % len(ids)) + + # Remove all account natively followed + + already = [] + + for page in tweepy.Cursor(self._api.followers_ids, screen_name=self._api.me().name).pages(): + already.extend(page) + + print('[i] I am followed by %u accounts' % len(already)) + + self._ids = [ x for x in ids if x not in already ] + + shuffle(self._ids) + + print('[i] Tracking %u accounts...' % len(self._ids)) + + + def start(self, auth): + """Start the listener.""" + + while True: + + for uid in self._ids: + + since = self._tracker.get_last_seen_for(uid) + + last = [] + + try: + + last = self._api.user_timeline(uid, since) + + except tweepy.error.TweepError as e: + + # Private account + # tweepy.error.TweepError: Not authorized. + if e.response.status_code == 401: + pass + + # Nothing new! + # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}] + elif e.response.status_code == 404: + pass + + else: + print(e, e.response.status_code) + assert(False) + + first = None + + for status in last: + + sid = status.id + uid = status.author.id + username = status.author.screen_name + + while hasattr(status, 'retweeted_status'): + status = status.retweeted_status + + analyse(sid, username, status.text, self._api, self._memory) + + if first is None: + first = uid, username, sid + + if not(first is None): + uid, username, sid = first + self._tracker.set_last_seen_for(uid, username, sid) + + + +def listen_to_users(auth, api): + """Track all tweets written by users.""" + + + data = api.rate_limit_status() + + for c in data['resources'].keys(): + + print('%s' % c) + + category = data['resources'][c] + + for p in category.keys(): + + props = category[p] + changed = props['remaining'] != props['limit'] + + print(' %s %s: %d / %d' % ('!!' if changed else ' ', p, props['remaining'], props['limit'])) + + + + + + + if True: + + listener = UsersListener(api) + + ####listener.start(auth) + + #stream = Stream(auth, listener) + #stream.filter(follow=new) -- cgit v0.11.2-87-g4458