1 files changed, 76 insertions, 0 deletions
diff --git a/nl.py b/nl.py
new file mode 100644
index 0000000..7e697c4
--- /dev/null
+++ b/nl.py
@@ -0,0 +1,76 @@
+
+import nltk
+from nltk import wordpunct_tokenize
+from nltk.corpus import stopwords
+
+
+#
+# See: http://www.nltk.org/
+#
+# Code from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
+#
+
+
+class LanguageIdentifier(object):
+    """Guess the language of a text by counting NLTK stopwords it contains."""
+
+    def __init__(self):
+        """
+        Make sure the NLTK stopwords corpus is available.
+
+        The corpus is probed once and downloaded if it is missing, avoiding:
+          LookupError: Resource 'corpora/stopwords.zip/stopwords/' not found.
+        """
+
+        try:
+            stopwords.fileids()
+        except LookupError:
+            # Only a missing corpus triggers the download; any other error
+            # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
+            nltk.download("stopwords")
+
+    def _calculate_languages_ratios(self, text):
+        """
+        Score the given text against every language of the stopwords corpus and
+        return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
+
+        @param text: Text whose language is to be detected
+        @type text: str
+
+        @return: Dictionary mapping each language to the number of its unique
+                 stopwords seen in the analyzed text
+        @rtype: dict
+        """
+
+        # Unique lowercased tokens, built once for all languages.
+        words_set = {token.lower() for token in wordpunct_tokenize(text)}
+
+        languages_ratios = {}
+
+        # Compute, per language included in nltk, the number of unique
+        # stopwords appearing in the analyzed text (the language "score").
+        for language in stopwords.fileids():
+            common_elements = words_set.intersection(stopwords.words(language))
+            languages_ratios[language] = len(common_elements)
+
+        return languages_ratios
+
+    def detect_language(self, text):
+        """
+        Score the given text against every available language and return the
+        highest scored one.
+
+        It uses a stopwords based approach, counting how many unique stopwords
+        of each language are seen in the analyzed text. On a tie (including a
+        text without any stopword) the first best language in the order of
+        the scores dictionary is returned.
+
+        @param text: Text whose language is to be detected
+        @type text: str
+
+        @return: Most scored language guessed
+        @rtype: str
+        """
+
+        ratios = self._calculate_languages_ratios(text)
+        return max(ratios, key=ratios.get)