From b5c4d532c34ed4d3ac03622b1e249a5d323afd33 Mon Sep 17 00:00:00 2001
From: Cyrille Bagard <nocbos@gmail.com>
Date: Sat, 25 Nov 2017 11:55:08 +0100
Subject: Filtered some kinds of spam.

---
 config.py |  3 +++
 taste.py  | 24 +++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 4d6f75d..daaff2f 100644
--- a/config.py
+++ b/config.py
@@ -35,3 +35,6 @@ banned_accounts = 'cnn'
 banned_accounts_re = '.*bot .*Bot .*career.* .*Career.* .*_jobs .*_Jobs'
 
 banned_titles_re = '.*Parts.* .*Jobs.*'
+
+# Maximum percentage of uppercased words accepted in a content; above it, the content is spam
+sensitive_ratio = 50.0
diff --git a/taste.py b/taste.py
index 95139b4..069e273 100644
--- a/taste.py
+++ b/taste.py
@@ -7,6 +7,7 @@ import sys
 import tweepy
 from config import accepted_languages, white_kwds, cs_white_kwds, black_kwds, cs_black_kwds
 from config import banned_accounts, banned_accounts_re, banned_titles_re
+from config import sensitive_ratio
 from random import randint
 from time import sleep
 
@@ -70,6 +71,25 @@ def is_blacklisted(username, displayed):
     return result
 
 
+def is_spam(content):
+    """Define if a given content is suitable or not.
+
+    Return True when the share of fully uppercased words, in percent,
+    exceeds sensitive_ratio. Words are split on any whitespace; tokens
+    without cased letters (numbers, punctuation) never count as uppercased.
+    A content without any word is never spam.
+    """
+
+    keywords = content.split()
+
+    if not keywords:
+        return False
+
+    uc_counter = sum(1 for kw in keywords if kw.isupper())
+
+    return (uc_counter * 100.0) / len(keywords) > sensitive_ratio
+
+
 def analyse(sid, username, displayed, lang, content, api, memory):
     """Analyse a Tweet content."""
 
@@ -77,7 +97,9 @@ def analyse(sid, username, displayed, lang, content, api, memory):
 
     liked = False
 
-    if not is_blacklisted(username, displayed) and (lang in accepted_languages.split(' ')):
+    if not is_blacklisted(username, displayed) \
+       and not is_spam(content) \
+       and (lang in accepted_languages.split(' ')):
 
         like = False
 
-- 
cgit v0.11.2-87-g4458