summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cyrille Bagard <nocbos@gmail.com>, 2017-01-28 11:55:42 (GMT)
committer: Cyrille Bagard <nocbos@gmail.com>, 2017-01-28 11:55:42 (GMT)
commit: bcb5341a02725b9a6c0d28a211b594b2362c37eb (patch)
tree: 17d7bb9f909ecfc28da338a5e7563c4845b62d57
parent: 8dc59f6f8a145952ceaec8f6d9ffd59d37ca058e (diff)
Defined a new version for tracking content, too aggressive for Twitter.
-rw-r--r-- .gitignore 10
-rw-r--r-- config.py 5
-rw-r--r-- db.py 103
-rwxr-xr-x htt.py 40
-rw-r--r-- taste.py 76
-rwxr-xr-x users.py 177
6 files changed, 392 insertions, 19 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..af9f4aa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# -*- mode: sh -*-
+
+# Emacs
+*~
+
+# Python
+__pycache__
+
+# Misc
+ids.cache
diff --git a/config.py b/config.py
index b1a3471..051de6c 100644
--- a/config.py
+++ b/config.py
@@ -5,8 +5,11 @@
# List of space-separated hashtags to follow, for instance #python #bot
hashtags = '#python #bot'
-# Keywords to find in Tweets we want to highlight
+# Keywords to find in Tweets we want to highlight; underscores will be replaced by spaces.
white_kwds = 'you got the idea'
# Age of old Tweets to get purged in days
max_age = 14
+
+# List of space-separated accounts to follow
+accounts = 'laughing_bit'
diff --git a/db.py b/db.py
index 6b22368..ae0bf17 100644
--- a/db.py
+++ b/db.py
@@ -9,15 +9,25 @@ import time
import tweepy
+_db = None
+
+def open_db():
+ """Open the database."""
+
+ global _db
+
+ _db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES)
+
+
class LikeMemory():
"""Track all liked Tweets."""
def __init__(self, api):
"""Build the Python object."""
- self._api = api
+ global _db
- self._db = sqlite3.connect('HTT.db', detect_types=sqlite3.PARSE_DECLTYPES)
+ self._api = api
sqlite3.register_adapter(bool, int)
sqlite3.register_converter("BOOLEAN", lambda v: bool(int(v)))
@@ -32,9 +42,9 @@ class LikeMemory():
)
'''
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute(sql)
- self._db.commit()
+ _db.commit()
def _compute_content_fingerprint(self, content):
@@ -78,17 +88,19 @@ class LikeMemory():
if pos != -1:
base = base[:pos]
- return hashlib.md5(base.rstrip(' ').encode('utf-8')).hexdigest()
+ return hashlib.md5(base.rstrip(' ').lower().encode('utf-8')).hexdigest()
def is_original_content(self, content):
"""Ensure that a given content has never been seen."""
+ global _db
+
fingerprint = self._compute_content_fingerprint(content)
values = (fingerprint, )
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('SELECT sid FROM LikedTweets WHERE fingerprint = ?', values)
found = cursor.fetchone()
@@ -99,25 +111,29 @@ class LikeMemory():
def save_liked_status(self, sid, username, content):
"""Remember a given liked status."""
+ global _db
+
fingerprint = self._compute_content_fingerprint(content)
timestamp = int(time.time())
values = (sid, username, fingerprint, timestamp, False)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('INSERT INTO LikedTweets VALUES (?, ?, ?, ?, ?)', values)
- self._db.commit()
+ _db.commit()
def purge_old_status(self):
"""Purge old seen statuses."""
+ global _db
+
timestamp = int(time.time()) - max_age * 24 * 60 * 60
values = (timestamp, False)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('SELECT sid FROM LikedTweets WHERE timestamp < ? AND purged = ?', values)
rows = cursor.fetchall()
@@ -137,9 +153,74 @@ class LikeMemory():
values = (True, sid)
- cursor = self._db.cursor()
+ cursor = _db.cursor()
cursor.execute('UPDATE LikedTweets SET purged = ? WHERE sid = ?', values)
- self._db.commit()
+ _db.commit()
print('Purged %d liked Tweet%s!' % (len(rows), '' if len(rows) <= 1 else 's'))
+
+
+class TrackMemory():
+ """Remember last seen Tweet for users."""
+
+ def __init__(self):
+ """Build the Python object."""
+
+ global _db
+
+ sql = '''
+ CREATE TABLE IF NOT EXISTS TrackedUsers(
+ uid INTEGER PRIMARY KEY,
+ username TEXT,
+ last INTEGER
+ )
+ '''
+
+ cursor = _db.cursor()
+ cursor.execute(sql)
+ _db.commit()
+
+
+ def get_last_seen_for(self, uid):
+ """Get the status id of the last Tweet for a given user."""
+
+ global _db
+
+ values = (uid, )
+
+ cursor = _db.cursor()
+ cursor.execute('SELECT last FROM TrackedUsers WHERE uid = ?', values)
+
+ found = cursor.fetchone()
+
+ if found is None:
+ last = None
+ else:
+ last = found[0]
+
+ return last
+
+
+ def set_last_seen_for(self, uid, name, sid):
+ """Set the status id of the last seen Tweet for a given user."""
+
+ global _db
+
+ since = self.get_last_seen_for(uid)
+
+ if since is None:
+
+ values = (uid, name, sid)
+
+ cursor = _db.cursor()
+ cursor.execute('INSERT INTO TrackedUsers VALUES (?, ?, ?)', values)
+
+ else:
+
+ values = (sid, uid)
+
+ cursor = _db.cursor()
+ cursor.execute('UPDATE TrackedUsers SET last = ? WHERE uid = ?', values)
+
+ _db.commit()
diff --git a/htt.py b/htt.py
index d50156f..dcfd92d 100755
--- a/htt.py
+++ b/htt.py
@@ -9,6 +9,7 @@ from tweepy.streaming import StreamListener
from auth import *
from config import hashtags, white_kwds
from db import LikeMemory
+from users import listen_to_users
import json
import sys
@@ -16,7 +17,7 @@ import sys
class StdOutListener(StreamListener):
"""A listener handles tweets are the received from the stream."""
- def __init__(self, api, memory):
+ def __init__(self, api, memory, tempo):
"""Build the Python object."""
super().__init__()
@@ -24,6 +25,9 @@ class StdOutListener(StreamListener):
self._api = api
self._memory = memory
+ self._tempo = tempo
+ self._previous = None
+
self._white = [ s.lower() for s in white_kwds.split(' ') ]
@@ -52,8 +56,17 @@ class StdOutListener(StreamListener):
decoded = json.loads(data)
if 'retweeted_status' in decoded:
+
+ if not ('id' in decoded['retweeted_status']):
+ print(decoded)
+
sid, username, content = self.get_status_info(decoded['retweeted_status'])
+
else:
+
+ if not ('id' in decoded):
+ print(decoded)
+
sid, username, content = self.get_status_info(decoded)
like = False
@@ -76,8 +89,19 @@ class StdOutListener(StreamListener):
try:
- self._api.create_favorite(sid)
+ if self._tempo:
+ if self._previous != None:
+
+ self._api.create_favorite(self._previous)
+
+ self._previous = sid
+
+ else:
+
+ self._api.create_favorite(sid)
+
+ # Save even pending statuses to remember them when looking for original content
self._memory.save_liked_status(sid, username, content)
print('@%s: "%s" (id=%d)' % (username, content, sid))
@@ -114,8 +138,8 @@ if __name__ == '__main__':
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
- api = tweepy.API(auth)
- memory = LikeMemory(api)
+ api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+ #memory = LikeMemory(api)
if len(sys.argv) > 1 and sys.argv[1] == '--purge':
@@ -123,7 +147,9 @@ if __name__ == '__main__':
else:
- listener = StdOutListener(api, memory)
+ listen_to_users(auth, api)
+
+ #listener = StdOutListener(api, memory, True)
- stream = Stream(auth, listener)
- stream.filter(track=hashtags.split(' '))
+ #stream = Stream(auth, listener)
+ #stream.filter(track=hashtags.split(' '))
diff --git a/taste.py b/taste.py
new file mode 100644
index 0000000..8471473
--- /dev/null
+++ b/taste.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+import tweepy
+from config import white_kwds
+
+
+COLOR_RESET = "\033[0m"
+
+COLOR_REJECTED = "\033[1;31m"
+COLOR_ACCEPTED = "\033[1;32m"
+COLOR_ALREADY = "\033[1;33m"
+
+
+def get_displayable_content(orig, margin):
+ """Format content to get it displayable."""
+
+ padding = '\n' + ' ' * margin
+
+ useful = [ l for l in orig.split('\n') if len(l) > 0 ]
+
+ result = padding.join(useful)
+
+ return result
+
+
+def analyse(sid, username, content, api, memory):
+ """Analyse a Tweet content."""
+
+ like = False
+
+ words = content.split(' ')
+
+ white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ]
+
+ for kwd in white:
+
+ for w in words:
+ if w.lower() == kwd:
+ like = True
+ break
+
+ if like:
+ break
+
+ if like:
+
+ if memory.is_original_content(content):
+
+ try:
+
+ api.create_favorite(sid)
+
+ memory.save_liked_status(sid, username, content)
+
+ displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username))
+
+ print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
+ print(' -> https://twitter.com/%s/status/%d' % (username, sid))
+
+ except tweepy.error.TweepError:
+
+ pass
+
+ else:
+
+ displayable = get_displayable_content(content, len('Already seen "'))
+
+ print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable)
+
+ else:
+
+ displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username))
+
+ print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
diff --git a/users.py b/users.py
new file mode 100755
index 0000000..e61d3c2
--- /dev/null
+++ b/users.py
@@ -0,0 +1,177 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+import tweepy
+from config import accounts
+from db import open_db, LikeMemory, TrackMemory
+from taste import analyse
+import os
+import pickle
+from random import shuffle
+
+
+CACHE_FILENAME = 'ids.cache'
+
+
+class UsersListener():
+ """A listener handles tweets are the received from the stream."""
+
+ def __init__(self, api):
+ """Build the Python object."""
+
+ super().__init__()
+
+ self._api = api
+
+ open_db()
+
+ self._memory = LikeMemory(api)
+ self._tracker = TrackMemory()
+
+ self._compute_ids_to_follow(True)
+
+
+ def _compute_ids_to_follow(self, cached):
+ """Get the list of accounts to track."""
+
+ if not os.path.isfile(CACHE_FILENAME):
+ cached = False
+
+ if cached:
+
+ ids = pickle.load(open(CACHE_FILENAME, 'rb'))
+
+ print('[i] Reloaded %u accounts' % len(ids))
+
+ else:
+
+ ids = []
+
+ for master in accounts.split(' '):
+
+ count = 0
+
+ try:
+
+ for page in tweepy.Cursor(self._api.followers_ids, screen_name=master).pages():
+
+ count += len(page)
+ ids.extend(page)
+
+ print('[i] Got %u accounts following %s' % (count, master))
+
+ except:
+
+ print('[!] Error while receiving followers for %s...' % master)
+
+
+ pickle.dump(ids, open(CACHE_FILENAME, 'wb'))
+
+ print('[i] Loaded %u accounts' % len(ids))
+
+ ids = list(set(ids))
+
+ print('[i] Kept %u accounts' % len(ids))
+
+ # Remove all account natively followed
+
+ already = []
+
+ for page in tweepy.Cursor(self._api.followers_ids, screen_name=self._api.me().name).pages():
+ already.extend(page)
+
+ print('[i] I am followed by %u accounts' % len(already))
+
+ self._ids = [ x for x in ids if x not in already ]
+
+ shuffle(self._ids)
+
+ print('[i] Tracking %u accounts...' % len(self._ids))
+
+
+ def start(self, auth):
+ """Start the listener."""
+
+ while True:
+
+ for uid in self._ids:
+
+ since = self._tracker.get_last_seen_for(uid)
+
+ last = []
+
+ try:
+
+ last = self._api.user_timeline(uid, since)
+
+ except tweepy.error.TweepError as e:
+
+ # Private account
+ # tweepy.error.TweepError: Not authorized.
+ if e.response.status_code == 401:
+ pass
+
+ # Nothing new!
+ # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}]
+ elif e.response.status_code == 404:
+ pass
+
+ else:
+ print(e, e.response.status_code)
+ assert(False)
+
+ first = None
+
+ for status in last:
+
+ sid = status.id
+ uid = status.author.id
+ username = status.author.screen_name
+
+ while hasattr(status, 'retweeted_status'):
+ status = status.retweeted_status
+
+ analyse(sid, username, status.text, self._api, self._memory)
+
+ if first is None:
+ first = uid, username, sid
+
+ if not(first is None):
+ uid, username, sid = first
+ self._tracker.set_last_seen_for(uid, username, sid)
+
+
+
+def listen_to_users(auth, api):
+ """Track all tweets written by users."""
+
+
+ data = api.rate_limit_status()
+
+ for c in data['resources'].keys():
+
+ print('%s' % c)
+
+ category = data['resources'][c]
+
+ for p in category.keys():
+
+ props = category[p]
+ changed = props['remaining'] != props['limit']
+
+ print(' %s %s: %d / %d' % ('!!' if changed else ' ', p, props['remaining'], props['limit']))
+
+
+
+
+
+
+ if True:
+
+ listener = UsersListener(api)
+
+ ####listener.start(auth)
+
+ #stream = Stream(auth, listener)
+ #stream.filter(follow=new)