summary refs log tree commit diff
diff options
context:
space:
mode:
author Cyrille Bagard <nocbos@gmail.com> 2017-11-25 10:27:11 (GMT)
committer Cyrille Bagard <nocbos@gmail.com> 2017-11-25 10:27:11 (GMT)
commit 8c7ba43ddcf5ac6264bd5e85b457025af9fb506a (patch)
tree 3337a7614526ee8ba3387f71cc153cb43a203123
parent bcb5341a02725b9a6c0d28a211b594b2362c37eb (diff)
Tracked live Tweets as well as account content.
-rw-r--r-- config.py 22
-rwxr-xr-x htt.py 148
-rw-r--r-- live.py 98
-rw-r--r-- taste.py 166
-rw-r--r-- [-rwxr-xr-x] users.py 116
5 files changed, 356 insertions, 194 deletions
diff --git a/config.py b/config.py
index 051de6c..4d6f75d 100644
--- a/config.py
+++ b/config.py
@@ -2,14 +2,36 @@
# -*- coding: utf-8 -*-
+# Space-separated list of readable languages
+accepted_languages = 'fr en'
+
# List of space-separated hashtags to follow, for instance #python #bot
hashtags = '#python #bot'
+# Selected keywords to find in Tweets we want to highlight; underscores will be replaced by spaces.
+underlined = 'code'
+
# Keywords to find in Tweets we want to highlight; underscores will be replaced by spaces.
white_kwds = 'you got the idea'
+# Case sensitive keywords to find in Tweets we want to highlight; underscores will be replaced by spaces.
+cs_white_kwds = 'ARM'
+
+# Keywords leading to reject a Tweet; underscores will be replaced by spaces.
+black_kwds = 'trump porn sex download job'
+
+# Case sensitive keywords leading to reject a Tweet; underscores will be replaced by spaces.
+cs_black_kwds = 'REP_ARM REPS_ARM'
+
# Age of old Tweets to get purged in days
max_age = 14
# List of space-separated accounts to follow
accounts = 'laughing_bit'
+
+# List of accounts providing no first hand content
+banned_accounts = 'cnn'
+
+banned_accounts_re = '.*bot .*Bot .*career.* .*Career.* .*_jobs .*_Jobs'
+
+banned_titles_re = '.*Parts.* .*Jobs.*'
diff --git a/htt.py b/htt.py
index dcfd92d..49cec54 100755
--- a/htt.py
+++ b/htt.py
@@ -2,134 +2,13 @@
# -*- coding: utf-8 -*-
+import argparse
import tweepy
-from tweepy import OAuthHandler
-from tweepy import Stream
-from tweepy.streaming import StreamListener
from auth import *
-from config import hashtags, white_kwds
-from db import LikeMemory
+from db import open_db
+from live import listen_live
+from tweepy import OAuthHandler
from users import listen_to_users
-import json
-import sys
-
-
-class StdOutListener(StreamListener):
- """A listener handles tweets are the received from the stream."""
-
- def __init__(self, api, memory, tempo):
- """Build the Python object."""
-
- super().__init__()
-
- self._api = api
- self._memory = memory
-
- self._tempo = tempo
- self._previous = None
-
- self._white = [ s.lower() for s in white_kwds.split(' ') ]
-
-
- def get_status_info(self, data):
- """Parse status data to get information about its author and content."""
-
- # Do not rely on https://dev.twitter.com/overview/api/tweets
- # as the specs seem outdated...
-
- sid = data['id']
- username = data['user']['screen_name']
-
- if 'extended_tweet' in data:
- content = data['extended_tweet']['full_text']
- else:
- content = data['text']
-
- content = content.replace('\n', '')
-
- return sid, username, content
-
-
- def on_data(self, data):
- """Receive Tweets matching the given hashtags."""
-
- decoded = json.loads(data)
-
- if 'retweeted_status' in decoded:
-
- if not ('id' in decoded['retweeted_status']):
- print(decoded)
-
- sid, username, content = self.get_status_info(decoded['retweeted_status'])
-
- else:
-
- if not ('id' in decoded):
- print(decoded)
-
- sid, username, content = self.get_status_info(decoded)
-
- like = False
-
- words = content.split(' ')
-
- for kwd in self._white:
-
- for w in words:
- if w.lower() == kwd:
- like = True
- break
-
- if like:
- break
-
- if like:
-
- if self._memory.is_original_content(content):
-
- try:
-
- if self._tempo:
-
- if self._previous != None:
-
- self._api.create_favorite(self._previous)
-
- self._previous = sid
-
- else:
-
- self._api.create_favorite(sid)
-
- # Save even pending statuses to remember them when looking for original content
- self._memory.save_liked_status(sid, username, content)
-
- print('@%s: "%s" (id=%d)' % (username, content, sid))
- print(' -> https://twitter.com/%s/status/%d' % (username, sid))
-
- except tweepy.error.TweepError:
-
- pass
-
- else:
-
- print('Already seen "%s"' % content)
-
- else:
-
- print('Reject "%s"' % content)
-
- return True
-
-
- def on_error(self, code):
- """Handle errors."""
-
- print('Error:', code)
-
- if code == 420:
- #returning False in on_data disconnects the stream
- return False
if __name__ == '__main__':
@@ -139,17 +18,20 @@ if __name__ == '__main__':
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
- #memory = LikeMemory(api)
- if len(sys.argv) > 1 and sys.argv[1] == '--purge':
+ open_db()
- memory.purge_old_status()
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-p', '--purge', help='Delete old liked Tweets', action='store_true')
+ parser.add_argument('-d', '--daily', help='Analyse contents from selected followers', action='store_true')
- else:
+ args = parser.parse_args()
- listen_to_users(auth, api)
+ if args.purge:
+ memory.purge_old_status()
- #listener = StdOutListener(api, memory, True)
+ elif args.daily:
+ listen_to_users(auth, api)
- #stream = Stream(auth, listener)
- #stream.filter(track=hashtags.split(' '))
+ else:
+ listen_live(auth, api)
diff --git a/live.py b/live.py
new file mode 100644
index 0000000..9f6ad4c
--- /dev/null
+++ b/live.py
@@ -0,0 +1,98 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+from config import hashtags, underlined
+from db import LikeMemory
+from taste import analyse
+from tweepy import Stream
+from tweepy.streaming import StreamListener
+
+
+CACHE_SIZE = 1000 #10000
+
+
+class StdOutListener(StreamListener):
+ """A listener handling the tweets received from the stream."""
+
+ def __init__(self, api):
+ """Build the Python object."""
+
+ super().__init__()
+
+ self._api = api
+ self._memory = LikeMemory(api)
+
+ self._cache = []
+
+ self._tweets_reviewed = 0
+ self._tweets_liked = 0
+
+
+ def on_status(self, status):
+ """Receive Tweets matching the given filter."""
+
+ sid = status.id
+ username = status.author.screen_name
+ displayed = status.author.name
+
+ if hasattr(status, 'lang'):
+ lang = status.lang
+ else:
+ lang = 'unknown'
+
+ while hasattr(status, 'retweeted_status'):
+ status = status.retweeted_status
+
+ cached = [ sid, username, displayed, lang, status.text ]
+ self._cache.insert(0, cached)
+
+ csize = len(self._cache)
+
+ if csize <= CACHE_SIZE:
+
+ if csize % 50 == 0:
+ print('[*] Cache size: %u...' % csize)
+
+ else:
+
+ sid, username, displayed, lang, text = self._cache.pop()
+
+ liked = analyse(sid, username, displayed, lang, text, self._api, self._memory)
+
+ self._tweets_reviewed += 1
+
+ if liked:
+ self._tweets_liked += 1
+
+ if self._tweets_reviewed % 50 == 0:
+ print('[*] Seen and analyzed %u tweet%s, liked %u tweet%s...' \
+ % (self._tweets_reviewed, 's' if self._tweets_reviewed > 1 else '', \
+ self._tweets_liked, 's' if self._tweets_liked > 1 else ''))
+
+ return True
+
+
+ def on_error(self, code):
+ """Handle errors."""
+
+ print('Error:', code)
+
+ if code == 420:
+ # Returning False from on_error disconnects the stream
+ return False
+
+
+def listen_live(auth, api):
+ """Track live tweets matching the configured hashtags and keywords."""
+
+ lst = hashtags.split(' ') + underlined.split(' ') + [ '#re' ]
+
+ targets = [ s.lower().replace('_', ' ') for s in lst ]
+
+ listener = StdOutListener(api)
+
+ print('targets = ', targets)
+
+ stream = Stream(auth, listener)
+ stream.filter(track=targets)
diff --git a/taste.py b/taste.py
index 8471473..95139b4 100644
--- a/taste.py
+++ b/taste.py
@@ -2,8 +2,13 @@
# -*- coding: utf-8 -*-
+import re
+import sys
import tweepy
-from config import white_kwds
+from config import accepted_languages, white_kwds, cs_white_kwds, black_kwds, cs_black_kwds
+from config import banned_accounts, banned_accounts_re, banned_titles_re
+from random import randint
+from time import sleep
COLOR_RESET = "\033[0m"
@@ -13,6 +18,10 @@ COLOR_ACCEPTED = "\033[1;32m"
COLOR_ALREADY = "\033[1;33m"
+# Counter for this session
+_like_count = 0
+
+
def get_displayable_content(orig, margin):
"""Format content to get it displayable."""
@@ -25,16 +34,60 @@ def get_displayable_content(orig, margin):
return result
-def analyse(sid, username, content, api, memory):
+def is_blacklisted(username, displayed):
+ """Define if a given account is blacklisted or not."""
+
+ result = username in banned_accounts.split(' ')
+
+ if not result:
+
+ for exp in banned_accounts_re.split(' '):
+
+ preg = re.compile(exp)
+
+ match = preg.match(username)
+
+ if match:
+ result = True
+
+ if result:
+ break
+
+ if not result:
+
+ for exp in banned_titles_re.split(' '):
+
+ preg = re.compile(exp)
+
+ match = preg.match(displayed)
+
+ if match:
+ result = True
+
+ if result:
+ break
+
+ return result
+
+
+def analyse(sid, username, displayed, lang, content, api, memory):
"""Analyse a Tweet content."""
- like = False
+ global _like_count
- words = content.split(' ')
+ liked = False
- white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ]
+ if not is_blacklisted(username, displayed) and (lang in accepted_languages.split(' ')):
- for kwd in white:
+ like = False
+
+ words = content.split(' ')
+
+ # White list
+
+ white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ]
+
+ for kwd in white:
for w in words:
if w.lower() == kwd:
@@ -44,33 +97,102 @@ def analyse(sid, username, content, api, memory):
if like:
break
- if like:
+ # White list, case sensitive
- if memory.is_original_content(content):
+ if not like:
- try:
+ white = [ s.replace('_', ' ') for s in cs_white_kwds.split(' ') ]
- api.create_favorite(sid)
+ for kwd in white:
- memory.save_liked_status(sid, username, content)
+ for w in words:
+ if w == kwd:
+ like = True
+ break
- displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username))
+ if like:
+ break
- print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
- print(' -> https://twitter.com/%s/status/%d' % (username, sid))
+ # Black list
- except tweepy.error.TweepError:
+ if like:
- pass
+ black = [ s.lower().replace('_', ' ') for s in black_kwds.split(' ') ]
- else:
+ for kwd in black:
+
+ for w in words:
+ if w.lower() == kwd:
+ like = False
+ break
- displayable = get_displayable_content(content, len('Already seen "'))
+ if not like:
+ break
+
+ # Black list, case sensitive
+
+ if like:
+
+ black = [ s.replace('_', ' ') for s in cs_black_kwds.split(' ') ]
+
+ for kwd in black:
+
+ for w in words:
+ if w == kwd:
+ like = False
+ break
+
+ if not like:
+ break
+
+ # Final step
+
+ if like:
+
+ if memory.is_original_content(content):
+
+ try:
+
+ api.create_favorite(sid)
+
+ memory.save_liked_status(sid, username, content)
- print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable)
+ displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username))
+
+ _like_count += 1
+
+ print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
+ print(' %u -> https://twitter.com/%s/status/%d' % (_like_count, username, sid))
+
+ liked = True
+
+ # # Do not be so aggressive!
+ # if _like_count % 100 == 0:
+
+ # time = randint(10, 54)
+ # print('[*] Reached %u likes; sleeping %u minutes...' % (_like_count, time))
+ # sleep(60 * time)
+
+ # # Do not be so aggressive and respect the limits!
+ # if _like_count > 900:
+
+ # print('[*] Enough for today! Reached %u likes...' % _like_count)
+ # sys.exit()
+
+ except tweepy.error.TweepError:
+
+ pass
+
+ else:
+
+ displayable = get_displayable_content(content, len('Already seen "'))
+
+ print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable)
+
+ else:
- else:
+ displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username))
- displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username))
+ print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
- print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable))
+ return liked
diff --git a/users.py b/users.py
index e61d3c2..b0d7fef 100755..100644
--- a/users.py
+++ b/users.py
@@ -4,11 +4,12 @@
import tweepy
from config import accounts
-from db import open_db, LikeMemory, TrackMemory
+from db import LikeMemory, TrackMemory
from taste import analyse
import os
import pickle
-from random import shuffle
+from random import randint, shuffle
+from time import sleep
CACHE_FILENAME = 'ids.cache'
@@ -24,8 +25,6 @@ class UsersListener():
self._api = api
- open_db()
-
self._memory = LikeMemory(api)
self._tracker = TrackMemory()
@@ -65,15 +64,14 @@ class UsersListener():
print('[!] Error while receiving followers for %s...' % master)
-
- pickle.dump(ids, open(CACHE_FILENAME, 'wb'))
-
print('[i] Loaded %u accounts' % len(ids))
ids = list(set(ids))
print('[i] Kept %u accounts' % len(ids))
+ pickle.dump(ids, open(CACHE_FILENAME, 'wb'))
+
# Remove all account natively followed
already = []
@@ -93,54 +91,94 @@ class UsersListener():
def start(self, auth):
"""Start the listener."""
- while True:
+ accounts_reviewed = 0
+ tweets_reviewed = 0
+ tweets_liked = 0
- for uid in self._ids:
+ for uid in self._ids[:900]:
- since = self._tracker.get_last_seen_for(uid)
+ since = self._tracker.get_last_seen_for(uid)
- last = []
+ last = []
- try:
+ try:
+
+ last = self._api.user_timeline(uid, since)
+
+ print('[*] Current search: %u account%s visited, %u tweet%s analyzed, %u tweet%s liked' \
+ % (accounts_reviewed, 's' if accounts_reviewed > 1 else '', \
+ tweets_reviewed, 's' if tweets_reviewed > 1 else '', \
+ tweets_liked, 's' if tweets_liked > 1 else ''))
+
+
+ # Do not be so aggressive!
+ #tempo = randint(2, 7)
+ #print('[*] Enjoying a small break for %u seconds... %u account%s visited, %u tweet%s analyzed' \
+ # % (tempo, accounts_reviewed, 's' if accounts_reviewed > 1 else '', \
+ # tweets_reviewed, 's' if tweets_reviewed > 1 else ''))
+ #sleep(tempo)
+
+ except tweepy.error.TweepError as e:
+
+ # Private account
+ # tweepy.error.TweepError: Not authorized.
+ if e.response.status_code == 401:
+ pass
+
+ # Blocked !
+ # tweepy.error.TweepError: [{'message': 'To protect our users from spam and other malicious activity, this account is temporarily locked. Please log in to https://twitter.com to unlock your account.', 'code': 326}]
+ elif e.response.status_code == 403:
+ print(e, e.response.status_code)
+ assert(False)
+
+ # Nothing new!
+ # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}]
+ elif e.response.status_code == 404:
+ pass
- last = self._api.user_timeline(uid, since)
+ # ???
+ # tweepy.error.TweepError: [{'message': 'Internal error', 'code': 131}]
+ elif e.response.status_code == 500:
+ print('[!] Twitter internal error for uid=%u' % uid)
- except tweepy.error.TweepError as e:
+ else:
+ print(e, e.response.status_code)
+ assert(False)
- # Private account
- # tweepy.error.TweepError: Not authorized.
- if e.response.status_code == 401:
- pass
+ first = None
- # Nothing new!
- # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}]
- elif e.response.status_code == 404:
- pass
+ for status in last:
- else:
- print(e, e.response.status_code)
- assert(False)
+ sid = status.id
+ uid = status.author.id
+ username = status.author.screen_name
+ displayed = status.author.name
- first = None
+ if hasattr(status, 'lang'):
+ lang = status.lang
+ else:
+ lang = 'unknown'
- for status in last:
+ while hasattr(status, 'retweeted_status'):
+ status = status.retweeted_status
- sid = status.id
- uid = status.author.id
- username = status.author.screen_name
+ liked = analyse(sid, username, displayed, lang, status.text, self._api, self._memory)
- while hasattr(status, 'retweeted_status'):
- status = status.retweeted_status
+ if first is None:
+ first = uid, username, sid
- analyse(sid, username, status.text, self._api, self._memory)
+ tweets_reviewed += 1
- if first is None:
- first = uid, username, sid
+ # Do not spam users!
+ if liked:
+ tweets_liked += 1
+ break
- if not(first is None):
- uid, username, sid = first
- self._tracker.set_last_seen_for(uid, username, sid)
+ if not(first is None):
+ uid, username, sid = first
+ self._tracker.set_last_seen_for(uid, username, sid)
+ accounts_reviewed += 1
def listen_to_users(auth, api):
@@ -171,7 +209,7 @@ def listen_to_users(auth, api):
listener = UsersListener(api)
- ####listener.start(auth)
+ listener.start(auth)
#stream = Stream(auth, listener)
#stream.filter(follow=new)