-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlangDetector.py
More file actions
executable file
·111 lines (84 loc) · 4.11 KB
/
langDetector.py
File metadata and controls
executable file
·111 lines (84 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
#coding:utf-8
# Author: Alejandro Nolla - z0mbiehunt3r
# Purpose: Example for detecting language using a stopwords based approach
# Created: 15/05/13
import sys
try:
from nltk import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
except ImportError:
print '[!] You need to install nltk (http://nltk.org/index.html)'
#----------------------------------------------------------------------
def _calculate_languages_ratios(text):
    """
    Score a text against every stopword list shipped with nltk.

    The text is lowercased and split into runs of word characters; single
    letter tokens other than 'a', 'i' and 'u' are discarded. The score of a
    language is the number of its distinct stopwords found among the remaining
    tokens, so the result looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each nltk stopwords fileid to the number of
             unique stopwords seen in the analyzed text
    @rtype: dict
    """
    # Single letters that are thrown away after tokenizing, e.g. the 's' left
    # over from "that's". 'a', 'i' and 'u' are deliberately not listed
    # (presumably because they are meaningful words on their own - confirm).
    ignored_letters = set('bcdefghjklmnopqrstvwxyz')

    # A word-character tokenizer is used instead of nltk.wordpunct_tokenize(),
    # which would also return every punctuation mark as a separate token.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text.lower())

    # Only distinct words matter for the score, so build the set once here
    # rather than once per language.
    words_set = set(token for token in tokens if token not in ignored_letters)

    languages_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        # Language "score": how many of its stopwords occur in the text.
        languages_ratios[language] = len(words_set.intersection(stopwords_set))

    return languages_ratios
#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language a text is written in.

    Each candidate language is scored by _calculate_languages_ratios() with
    the number of its unique stopwords that occur in the text, and the
    language holding the highest score is returned.

    @param text: Text whose language is to be detected
    @type text: str

    @return: The highest scored language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    # Pick the key whose score is the largest.
    return max(scores, key=lambda name: scores[name])
if __name__ == '__main__':
    # Sample English text, analyzed when no text is given on the command line.
    text = '''
There's a passage I got memorized. Ezekiel 25:17. "The path of the righteous man is beset on all sides\
by the inequities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity\
and good will, shepherds the weak through the valley of the darkness, for he is truly his brother's keeper\
and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger\
those who attempt to poison and destroy My brothers. And you will know I am the Lord when I lay My vengeance\
upon you." Now... I been sayin' that shit for years. And if you ever heard it, that meant your ass. You'd\
be dead right now. I never gave much thought to what it meant. I just thought it was a cold-blooded thing\
to say to a motherfucker before I popped a cap in his ass. But I saw some shit this mornin' made me think\
twice. See, now I'm thinking: maybe it means you're the evil man. And I'm the righteous man. And Mr.\
9mm here... he's the shepherd protecting my righteous ass in the valley of darkness. Or it could mean\
you're the righteous man and I'm the shepherd and it's the world that's evil and selfish. And I'd like\
that. But that shit ain't the truth. The truth is you're the weak. And I'm the tyranny of evil men.\
But I'm tryin', Ringo. I'm tryin' real hard to be the shepherd.
'''
    # The first command line argument, when present, is the text to analyze.
    # Reading sys.argv[1] unconditionally raised IndexError when the script
    # was started without arguments and left the sample text above unused.
    if len(sys.argv) > 1:
        text = sys.argv[1]

    language = detect_language(text)
    # Parenthesized form prints the same on Python 2 and also runs on Python 3.
    print(language)