blob: 7e697c4d3f38aa35ee424b9bd800f4b0b87ffa9c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
#
# See: http://www.nltk.org/
#
# Code from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
#
class LanguageIdentifier(object):
    """
    Guess the language of a text with a stopwords based approach.

    Every language shipped in the NLTK stopwords corpus gets a score equal to
    the number of its distinct stopwords that occur in the analyzed text; the
    language with the highest score is reported.
    """

    def __init__(self):
        """
        Initialize the language detection.

        Probes the NLTK stopwords corpus and downloads it when it is missing.
        """
        # Avoid the following LookupError:
        # """ Resource 'corpora/stopwords.zip/stopwords/' not found. """
        # Only that error means "corpus missing"; anything else (including
        # KeyboardInterrupt) must propagate instead of triggering a download.
        try:
            stopwords.fileids()
        except LookupError:
            nltk.download("stopwords")

    def _calculate_languages_ratios(self, text):
        """
        Score the given text against every language of the stopwords corpus and
        return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of unique
                 stopwords of that language seen in the analyzed text
        @rtype: dict
        """
        # The lower-cased token set does not depend on the language, so it is
        # built once here rather than once per loop iteration.
        words_set = {token.lower() for token in wordpunct_tokenize(text)}

        languages_ratios = {}
        # Compute, per language included in nltk, the number of unique
        # stopwords appearing in the analyzed text.
        for language in stopwords.fileids():
            common_elements = words_set.intersection(stopwords.words(language))
            languages_ratios[language] = len(common_elements)  # language "score"
        return languages_ratios

    def detect_language(self, text):
        """
        Score the given text against every available language and return the
        highest scored one.

        It uses a stopwords based approach, counting how many unique stopwords
        are seen in analyzed text. When several languages share the highest
        score (e.g. the text contains no stopword at all), max() keeps the
        first of them in the dictionary's iteration order.

        @param text: Text whose language is to be detected
        @type text: str

        @return: Most scored language guessed
        @rtype: str
        """
        ratios = self._calculate_languages_ratios(text)
        return max(ratios, key=ratios.get)
|