import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

#
# See: http://www.nltk.org/
#
# Code from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
#


class LanguageIdentifier(object):

    def __init__(self):
        """Initialize the language detection."""
        # Avoid the following LookupError:
        #   """ Resource 'corpora/stopwords.zip/stopwords/' not found. """
        try:
            stopwords.fileids()
        except LookupError:
            nltk.download("stopwords")

    def _calculate_languages_ratios(self, text):
        """
        Score the given text against each language supported by NLTK's
        stopword corpus and return a dictionary that looks like
        {'french': 2, 'spanish': 4, 'english': 0}

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of unique
                 stopwords seen in the analyzed text
        @rtype: dict
        """
        languages_ratios = {}

        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        words_set = set(words)

        # For each language included in NLTK, count the number of unique
        # stopwords appearing in the analyzed text.
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            common_elements = words_set.intersection(stopwords_set)
            languages_ratios[language] = len(common_elements)  # language "score"

        return languages_ratios

    def detect_language(self, text):
        """
        Score the given text against several languages and return the
        highest-scoring one.

        It uses a stopwords-based approach, counting how many unique
        stopwords are seen in the analyzed text.

        @param text: Text whose language is to be detected
        @type text: str

        @return: Highest-scoring language guess
        @rtype: str
        """
        ratios = self._calculate_languages_ratios(text)

        most_rated_language = max(ratios, key=ratios.get)

        return most_rated_language
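

# Minimal usage sketch (not part of the original module): the sample sentence
# and expected output below are illustrative assumptions.
if __name__ == "__main__":
    identifier = LanguageIdentifier()
    sample_text = "This is a short sentence written in English."
    # Expected to print 'english', since that language's stopwords
    # ('this', 'is', 'a', 'in', ...) dominate the token set.
    print(identifier.detect_language(sample_text))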