diff options
author | Cyrille Bagard <nocbos@gmail.com> | 2017-11-25 10:55:08 (GMT) |
---|---|---|
committer | Cyrille Bagard <nocbos@gmail.com> | 2017-11-25 10:55:08 (GMT) |
commit | b5c4d532c34ed4d3ac03622b1e249a5d323afd33 (patch) | |
tree | ed89f8d886ee5d6542acecc31336c07f0412c0d7 | |
parent | 8c7ba43ddcf5ac6264bd5e85b457025af9fb506a (diff) |
Filtered some kinds of spam.
-rw-r--r-- | config.py | 3 | ||||
-rw-r--r-- | taste.py | 24 |
2 files changed, 26 insertions, 1 deletion
@@ -35,3 +35,6 @@ banned_accounts = 'cnn' banned_accounts_re = '.*bot .*Bot .*career.* .*Career.* .*_jobs .*_Jobs' banned_titles_re = '.*Parts.* .*Jobs.*' + +# Threshold of accepted uppercased words +sensitive_ratio = 50.0 @@ -7,6 +7,7 @@ import sys import tweepy from config import accepted_languages, white_kwds, cs_white_kwds, black_kwds, cs_black_kwds from config import banned_accounts, banned_accounts_re, banned_titles_re +from config import sensitive_ratio from random import randint from time import sleep @@ -70,6 +71,25 @@ def is_blacklisted(username, displayed): return result +def is_spam(content): + """Define if a given content is suitable or not.""" + + keywords = content.split(' ') + + uc_counter = 0 + + for kw in keywords: + + if kw == kw.upper(): + uc_counter += 1 + + ratio = (uc_counter * 100.0) / len(keywords) + + result = (ratio > sensitive_ratio) + + return result + + def analyse(sid, username, displayed, lang, content, api, memory): """Analyse a Tweet content.""" @@ -77,7 +97,9 @@ def analyse(sid, username, displayed, lang, content, api, memory): liked = False - if not is_blacklisted(username, displayed) and (lang in accepted_languages.split(' ')): + if not is_blacklisted(username, displayed) \ + and not is_spam(content) \ + and (lang in accepted_languages.split(' ')): like = False |