diff options
Diffstat (limited to 'nl.py')
-rw-r--r-- | nl.py | 76 |
1 files changed, 76 insertions, 0 deletions
import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords


#
# See: http://www.nltk.org/
#
# Code from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
#


class LanguageIdentifier(object):
    """
    Guess the language of a text with a stopwords based approach: the language
    whose NLTK stopword list shares the most unique words with the text wins.
    """

    def __init__(self):
        """
        Initialize the language detection, downloading the NLTK stopwords
        corpus when it is not available locally.
        """

        # Avoid the following LookupError:
        # """ Resource 'corpora/stopwords.zip/stopwords/' not found. """
        #
        # Only LookupError is handled on purpose: any other failure (or a
        # KeyboardInterrupt) must not be silently turned into a download.
        try:
            stopwords.fileids()
        except LookupError:
            nltk.download("stopwords")

    def _calculate_languages_ratios(self, text):
        """
        Calculate probability of given text to be written in several languages and
        return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of unique
                 stopwords of that language seen in the analyzed text
        @rtype: dict
        """

        # Lower-cased unique tokens of the text; built once because it does
        # not depend on the language being scored.
        words_set = set(word.lower() for word in wordpunct_tokenize(text))

        languages_ratios = {}

        # Compute per language included in nltk number of unique stopwords appearing in analyzed text
        for language in stopwords.fileids():
            common_elements = words_set.intersection(stopwords.words(language))
            languages_ratios[language] = len(common_elements)  # language "score"

        return languages_ratios

    def detect_language(self, text):
        """
        Calculate probability of given text to be written in several languages and
        return the highest scored.

        It uses a stopwords based approach, counting how many unique stopwords
        are seen in analyzed text.

        When several languages share the highest score (including the case
        where no stopword matches at all and every score is 0), the first of
        them encountered by max() is returned, so the result is then not
        meaningful.

        @param text: Text whose language is to be detected
        @type text: str

        @return: Most scored language guessed
        @rtype: str

        @raise ValueError: If no stopwords language is available, since there
                           is then nothing to pick from.
        """

        ratios = self._calculate_languages_ratios(text)

        return max(ratios, key=ratios.get)