From 8c7ba43ddcf5ac6264bd5e85b457025af9fb506a Mon Sep 17 00:00:00 2001 From: Cyrille Bagard Date: Sat, 25 Nov 2017 11:27:11 +0100 Subject: Tracked live Tweets as well as account content. --- config.py | 22 +++++++++ htt.py | 148 ++++++------------------------------------------------- live.py | 98 +++++++++++++++++++++++++++++++++++++ taste.py | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- users.py | 116 ++++++++++++++++++++++++++++--------------- 5 files changed, 356 insertions(+), 194 deletions(-) create mode 100644 live.py mode change 100755 => 100644 users.py diff --git a/config.py b/config.py index 051de6c..4d6f75d 100644 --- a/config.py +++ b/config.py @@ -2,14 +2,36 @@ # -*- coding: utf-8 -*- +# Space-separated list of readable languages +accepted_languages = 'fr en' + # List of space-separated hashtags to follow, for instance #python #bot hashtags = '#python #bot' +# Selected keywords to find in Tweets we want to highlight; underscores will be replaced by spaces. +underlined = 'code' + # Keywords to find in Tweets we want to highlight; underscores will be replaced by spaces. white_kwds = 'you got the idea' +# Case sensitive keywords to find in Tweets we want to highlight; underscores will be replaced by spaces. +cs_white_kwds = 'ARM' + +# Keywords leading to reject a Tweet; underscores will be replaced by spaces. +black_kwds = 'trump porn sex download job' + +# Case sensitive keywords leading to reject a Tweet; underscores will be replaced by spaces. +cs_black_kwds = 'REP_ARM REPS_ARM' + # Age of old Tweets to get purged in days max_age = 14 # List of space-separated accounts to follow accounts = 'laughing_bit' + +# List of accounts providing no first hand content +banned_accounts = 'cnn' + +banned_accounts_re = '.*bot .*Bot .*career.* .*Career.* .*_jobs .*_Jobs' + +banned_titles_re = '.*Parts.* .*Jobs.*' diff --git a/htt.py b/htt.py index dcfd92d..49cec54 100755 --- a/htt.py +++ b/htt.py @@ -2,134 +2,13 @@ # -*- coding: utf-8 -*- +import argparse import tweepy -from tweepy import OAuthHandler -from tweepy import Stream -from tweepy.streaming import StreamListener from auth import * -from config import hashtags, white_kwds -from db import LikeMemory +from db import open_db +from live import listen_live +from tweepy import OAuthHandler from users import listen_to_users -import json -import sys - - -class StdOutListener(StreamListener): - """A listener handles tweets are the received from the stream.""" - - def __init__(self, api, memory, tempo): - """Build the Python object.""" - - super().__init__() - - self._api = api - self._memory = memory - - self._tempo = tempo - self._previous = None - - self._white = [ s.lower() for s in white_kwds.split(' ') ] - - - def get_status_info(self, data): - """Parse status data to get information about its author and content.""" - - # Do not rely on https://dev.twitter.com/overview/api/tweets - # as the specs seem outdated... - - sid = data['id'] - username = data['user']['screen_name'] - - if 'extended_tweet' in data: - content = data['extended_tweet']['full_text'] - else: - content = data['text'] - - content = content.replace('\n', '') - - return sid, username, content - - - def on_data(self, data): - """Receive Tweets matching the given hashtags.""" - - decoded = json.loads(data) - - if 'retweeted_status' in decoded: - - if not ('id' in decoded['retweeted_status']): - print(decoded) - - sid, username, content = self.get_status_info(decoded['retweeted_status']) - - else: - - if not ('id' in decoded): - print(decoded) - - sid, username, content = self.get_status_info(decoded) - - like = False - - words = content.split(' ') - - for kwd in self._white: - - for w in words: - if w.lower() == kwd: - like = True - break - - if like: - break - - if like: - - if self._memory.is_original_content(content): - - try: - - if self._tempo: - - if self._previous != None: - - self._api.create_favorite(self._previous) - - self._previous = sid - - else: - - self._api.create_favorite(sid) - - # Save even pending statuses to remember them when looking for original content - self._memory.save_liked_status(sid, username, content) - - print('@%s: "%s" (id=%d)' % (username, content, sid)) - print(' -> https://twitter.com/%s/status/%d' % (username, sid)) - - except tweepy.error.TweepError: - - pass - - else: - - print('Already seen "%s"' % content) - - else: - - print('Reject "%s"' % content) - - return True - - - def on_error(self, code): - """Handle errors.""" - - print('Error:', code) - - if code == 420: - #returning False in on_data disconnects the stream - return False if __name__ == '__main__': @@ -139,17 +18,20 @@ if __name__ == '__main__': auth.set_access_token(ACCESS_KEY, ACCESS_SECRET) api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) - #memory = LikeMemory(api) - if len(sys.argv) > 1 and sys.argv[1] == '--purge': + open_db() - memory.purge_old_status() + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--purge', help='Delete old liked Tweets', action='store_true') + parser.add_argument('-d', '--daily', help='Analyse contents from selected followers', action='store_true') - else: + args = parser.parse_args() - listen_to_users(auth, api) + if args.purge: + memory.purge_old_status() - #listener = StdOutListener(api, memory, True) + elif args.daily: + listen_to_users(auth, api) - #stream = Stream(auth, listener) - #stream.filter(track=hashtags.split(' ')) + else: + listen_live(auth, api) diff --git a/live.py b/live.py new file mode 100644 index 0000000..9f6ad4c --- /dev/null +++ b/live.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + + +from config import hashtags, underlined +from db import LikeMemory +from taste import analyse +from tweepy import Stream +from tweepy.streaming import StreamListener + + +CACHE_SIZE = 1000 #10000 + + +class StdOutListener(StreamListener): + """A listener handles tweets are the received from the stream.""" + + def __init__(self, api): + """Build the Python object.""" + + super().__init__() + + self._api = api + self._memory = LikeMemory(api) + + self._cache = [] + + self._tweets_reviewed = 0 + self._tweets_liked = 0 + + + def on_status(self, status): + """Receive Tweets matching the given filter.""" + + sid = status.id + username = status.author.screen_name + displayed = status.author.name + + if hasattr(status, 'lang'): + lang = status.lang + else: + lang = 'unknown' + + while hasattr(status, 'retweeted_status'): + status = status.retweeted_status + + cached = [ sid, username, displayed, lang, status.text ] + self._cache.insert(0, cached) + + csize = len(self._cache) + + if csize <= CACHE_SIZE: + + if csize % 50 == 0: + print('[*] Cache size: %u...' % csize) + + else: + + sid, username, displayed, lang, text = self._cache.pop() + + liked = analyse(sid, username, displayed, lang, text, self._api, self._memory) + + self._tweets_reviewed += 1 + + if liked: + self._tweets_liked += 1 + + if self._tweets_reviewed % 50 == 0: + print('[*] Seen and analyzed %u tweet%s, liked %u tweet%s...' \ + % (self._tweets_reviewed, 's' if self._tweets_reviewed > 1 else '', \ + self._tweets_liked, 's' if self._tweets_liked > 1 else '')) + + return True + + + def on_error(self, code): + """Handle errors.""" + + print('Error:', code) + + if code == 420: + #returning False in on_data disconnects the stream + return False + + +def listen_live(auth, api): + """Track all tweets written by users.""" + + lst = hashtags.split(' ') + underlined.split(' ') + [ '#re' ] + + targets = [ s.lower().replace('_', ' ') for s in lst ] + + listener = StdOutListener(api) + + print('targets = ', targets) + + stream = Stream(auth, listener) + stream.filter(track=targets) diff --git a/taste.py b/taste.py index 8471473..95139b4 100644 --- a/taste.py +++ b/taste.py @@ -2,8 +2,13 @@ # -*- coding: utf-8 -*- +import re +import sys import tweepy -from config import white_kwds +from config import accepted_languages, white_kwds, cs_white_kwds, black_kwds, cs_black_kwds +from config import banned_accounts, banned_accounts_re, banned_titles_re +from random import randint +from time import sleep COLOR_RESET = "\033[0m" @@ -13,6 +18,10 @@ COLOR_ACCEPTED = "\033[1;32m" COLOR_ALREADY = "\033[1;33m" +# Counter for this session +_like_count = 0 + + def get_displayable_content(orig, margin): """Format content to get it displayable.""" @@ -25,16 +34,60 @@ def get_displayable_content(orig, margin): return result -def analyse(sid, username, content, api, memory): +def is_blacklisted(username, displayed): + """Define if a given account is blacklisted or not.""" + + result = username in banned_accounts.split(' ') + + if not result: + + for exp in banned_accounts_re.split(' '): + + preg = re.compile(exp) + + match = preg.match(username) + + if match: + result = True + + if result: + break + + if not result: + + for exp in banned_titles_re.split(' '): + + preg = re.compile(exp) + + match = preg.match(displayed) + + if match: + result = True + + if result: + break + + return result + + +def analyse(sid, username, displayed, lang, content, api, memory): """Analyse a Tweet content.""" - like = False + global _like_count - words = content.split(' ') + liked = False - white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ] + if not is_blacklisted(username, displayed) and (lang in accepted_languages.split(' ')): - for kwd in white: + like = False + + words = content.split(' ') + + # White list + + white = [ s.lower().replace('_', ' ') for s in white_kwds.split(' ') ] + + for kwd in white: for w in words: if w.lower() == kwd: @@ -44,33 +97,102 @@ def analyse(sid, username, content, api, memory): if like: break - if like: + # White list, case sensitive - if memory.is_original_content(content): + if not like: - try: + white = [ s.replace('_', ' ') for s in cs_white_kwds.split(' ') ] - api.create_favorite(sid) + for kwd in white: - memory.save_liked_status(sid, username, content) + for w in words: + if w == kwd: + like = True + break - displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username)) + if like: + break - print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) - print(' -> https://twitter.com/%s/status/%d' % (username, sid)) + # Black list - except tweepy.error.TweepError: + if like: - pass + black = [ s.lower().replace('_', ' ') for s in black_kwds.split(' ') ] - else: + for kwd in black: + + for w in words: + if w.lower() == kwd: + like = False + break - displayable = get_displayable_content(content, len('Already seen "')) + if not like: + break + + # Black list, case sensitive + + if like: + + black = [ s.replace('_', ' ') for s in cs_black_kwds.split(' ') ] + + for kwd in black: + + for w in words: + if w == kwd: + like = False + break + + if not like: + break + + # Final step + + if like: + + if memory.is_original_content(content): + + try: + + api.create_favorite(sid) + + memory.save_liked_status(sid, username, content) - print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable) + displayable = get_displayable_content(content, len('Liking') + len(' @%s: "' % username)) + + _like_count += 1 + + print(COLOR_ACCEPTED + 'Liking' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) + print(' %u -> https://twitter.com/%s/status/%d' % (_like_count, username, sid)) + + liked = True + + # # Do not be so aggressive! + # if _like_count % 100 == 0: + + # time = randint(10, 54) + # print('[*] Reached %u likes; sleeping %u minutes...' % (_like_count, time)) + # sleep(60 * time) + + # # Do not be so aggressive and respect the limits! + # if _like_count > 900: + + # print('[*] Enough for today! Reached %u likes...' % _like_count) + # sys.exit() + + except tweepy.error.TweepError: + + pass + + else: + + displayable = get_displayable_content(content, len('Already seen "')) + + print(COLOR_ALREADY + 'Already seen' + COLOR_RESET + ' "%s"' % displayable) + + else: - else: + displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username)) - displayable = get_displayable_content(content, len('Reject') + len(' @%s: "' % username)) + print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) - print(COLOR_REJECTED + 'Reject' + COLOR_RESET + ' @%s: "%s"' % (username, displayable)) + return liked diff --git a/users.py b/users.py old mode 100755 new mode 100644 index e61d3c2..b0d7fef --- a/users.py +++ b/users.py @@ -4,11 +4,12 @@ import tweepy from config import accounts -from db import open_db, LikeMemory, TrackMemory +from db import LikeMemory, TrackMemory from taste import analyse import os import pickle -from random import shuffle +from random import randint, shuffle +from time import sleep CACHE_FILENAME = 'ids.cache' @@ -24,8 +25,6 @@ class UsersListener(): self._api = api - open_db() - self._memory = LikeMemory(api) self._tracker = TrackMemory() @@ -65,15 +64,14 @@ class UsersListener(): print('[!] Error while receiving followers for %s...' % master) - - pickle.dump(ids, open(CACHE_FILENAME, 'wb')) - print('[i] Loaded %u accounts' % len(ids)) ids = list(set(ids)) print('[i] Kept %u accounts' % len(ids)) + pickle.dump(ids, open(CACHE_FILENAME, 'wb')) + # Remove all account natively followed already = [] @@ -93,54 +91,94 @@ class UsersListener(): def start(self, auth): """Start the listener.""" - while True: + accounts_reviewed = 0 + tweets_reviewed = 0 + tweets_liked = 0 - for uid in self._ids: + for uid in self._ids[:900]: - since = self._tracker.get_last_seen_for(uid) + since = self._tracker.get_last_seen_for(uid) - last = [] + last = [] - try: + try: + + last = self._api.user_timeline(uid, since) + + print('[*] Current search: %u account%s visited, %u tweet%s analyzed, %u tweet%s liked' \ + % (accounts_reviewed, 's' if accounts_reviewed > 1 else '', \ + tweets_reviewed, 's' if tweets_reviewed > 1 else '', \ + tweets_liked, 's' if tweets_liked > 1 else '')) + + + # Do not be so aggressive! + #tempo = randint(2, 7) + #print('[*] Enjoying a small break for %u seconds... %u account%s visited, %u tweet%s analyzed' \ + # % (tempo, accounts_reviewed, 's' if accounts_reviewed > 1 else '', \ + # tweets_reviewed, 's' if tweets_reviewed > 1 else '')) + #sleep(tempo) + + except tweepy.error.TweepError as e: + + # Private account + # tweepy.error.TweepError: Not authorized. + if e.response.status_code == 401: + pass + + # Blocked ! + # tweepy.error.TweepError: [{'message': 'To protect our users from spam and other malicious activity, this account is temporarily locked. Please log in to https://twitter.com to unlock your account.', 'code': 326}] + elif e.response.status_code == 403: + print(e, e.response.status_code) + assert(False) + + # Nothing new! + # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}] + elif e.response.status_code == 404: + pass - last = self._api.user_timeline(uid, since) + # ??? + # tweepy.error.TweepError: [{'message': 'Internal error', 'code': 131}] + elif e.response.status_code == 500: + print('[!] Twitter internal error for uid=%u' % uid) - except tweepy.error.TweepError as e: + else: + print(e, e.response.status_code) + assert(False) - # Private account - # tweepy.error.TweepError: Not authorized. - if e.response.status_code == 401: - pass + first = None - # Nothing new! - # tweepy.error.TweepError: [{'message': 'Sorry, that page does not exist.', 'code': 34}] - elif e.response.status_code == 404: - pass + for status in last: - else: - print(e, e.response.status_code) - assert(False) + sid = status.id + uid = status.author.id + username = status.author.screen_name + displayed = status.author.name - first = None + if hasattr(status, 'lang'): + lang = status.lang + else: + lang = 'unknown' - for status in last: + while hasattr(status, 'retweeted_status'): + status = status.retweeted_status - sid = status.id - uid = status.author.id - username = status.author.screen_name + liked = analyse(sid, username, displayed, lang, status.text, self._api, self._memory) - while hasattr(status, 'retweeted_status'): - status = status.retweeted_status + if first is None: + first = uid, username, sid - analyse(sid, username, status.text, self._api, self._memory) + tweets_reviewed += 1 - if first is None: - first = uid, username, sid + # Do not spam users! + if liked: + tweets_liked += 1 + break - if not(first is None): - uid, username, sid = first - self._tracker.set_last_seen_for(uid, username, sid) + if not(first is None): + uid, username, sid = first + self._tracker.set_last_seen_for(uid, username, sid) + accounts_reviewed += 1 def listen_to_users(auth, api): @@ -171,7 +209,7 @@ def listen_to_users(auth, api): listener = UsersListener(api) - ####listener.start(auth) + listener.start(auth) #stream = Stream(auth, listener) #stream.filter(follow=new) -- cgit v0.11.2-87-g4458