# Taken from commit b5c4d532 ("Filtered some kinds of spam.", Cyrille Bagard,
# 2017-11-25). That commit adds is_spam() to taste.py, defines
# `sensitive_ratio = 50.0` ("Threshold of accepted uppercased words") in
# config.py, imports it into taste.py, and makes analyse() additionally
# require `not is_spam(content)` before processing a Tweet.


def is_spam(content, threshold=None):
    """Tell if a given content looks like spam.

    A content is considered as spam when the percentage of its words
    written fully in uppercase is strictly greater than the threshold.

    content   -- text to inspect.
    threshold -- maximum accepted percentage (0-100) of uppercased words;
                 when None, the module-level sensitive_ratio imported from
                 the configuration is used.

    Return True if the content is rejected as spam, False otherwise.
    An empty or whitespace-only content is never reported as spam.
    """

    if threshold is None:
        threshold = sensitive_ratio

    # Split on any run of whitespace: Tweets may contain newlines or tabs,
    # and consecutive spaces must not produce empty pseudo-words.
    words = content.split()

    if not words:
        return False

    # str.isupper() requires at least one cased character, so numbers,
    # punctuation and emoji are not counted as uppercased words (comparing
    # a word with word.upper() would wrongly match all of them).
    uc_counter = sum(1 for word in words if word.isupper())

    ratio = (uc_counter * 100.0) / len(words)

    return ratio > threshold