summary | refs | log | tree | commit | diff
path: root/nl.py
diff options
context:
space:
mode:
Diffstat (limited to 'nl.py')
-rw-r--r-- nl.py 76
1 files changed, 76 insertions, 0 deletions
diff --git a/nl.py b/nl.py
new file mode 100644
index 0000000..7e697c4
--- /dev/null
+++ b/nl.py
@@ -0,0 +1,76 @@
+
+import nltk
+from nltk import wordpunct_tokenize
+from nltk.corpus import stopwords
+
+
+#
+# See: http://www.nltk.org/
+#
+# Code from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
+#
+
+
+class LanguageIdentifier(object):
+
+ def __init__(self):
+ """Initialize the language detection."""
+
+ # Avoid the following LookupError:
+ # """ Resource 'corpora/stopwords.zip/stopwords/' not found. """
+
+ try:
+ languages = stopwords.fileids()
+ except:
+ nltk.download("stopwords")
+
+
+ def _calculate_languages_ratios(self, text):
+ """
+ Calculate probability of given text to be written in several languages and
+ return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
+
+ @param text: Text whose language want to be detected
+ @type text: str
+
+ @return: Dictionary with languages and unique stopwords seen in analyzed text
+ @rtype: dict
+ """
+
+ languages_ratios = {}
+
+ tokens = wordpunct_tokenize(text)
+ words = [ word.lower() for word in tokens ]
+
+ # Compute per language included in nltk number of unique stopwords appearing in analyzed text
+ for language in stopwords.fileids():
+
+ stopwords_set = set(stopwords.words(language))
+ words_set = set(words)
+ common_elements = words_set.intersection(stopwords_set)
+
+ languages_ratios[language] = len(common_elements) # language "score"
+
+ return languages_ratios
+
+
+ def detect_language(self, text):
+ """
+ Calculate probability of given text to be written in several languages and
+ return the highest scored.
+
+ It uses a stopwords based approach, counting how many unique stopwords
+ are seen in analyzed text.
+
+ @param text: Text whose language want to be detected
+ @type text: str
+
+ @return: Most scored language guessed
+ @rtype: str
+ """
+
+ ratios = self._calculate_languages_ratios(text)
+
+ most_rated_language = max(ratios, key=ratios.get)
+
+ return most_rated_language