forked from sloria/TextBlob
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate.py
144 lines (118 loc) · 4.64 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""
Translator module that uses the Google Translate API.
Adapted from Terry Yin's google-translate-python.
Language detection added by Steven Loria.
"""
from __future__ import absolute_import
import codecs
import json
import re
from textblob.compat import PY2, request, urlencode
from textblob.exceptions import TranslatorError, NotTranslated
class Translator(object):
    """A language translator and detector.

    Usage:
    ::

        >>> from textblob.translate import Translator
        >>> t = Translator()
        >>> t.translate('hello', from_lang='en', to_lang='fr')
        u'bonjour'
        >>> t.detect("hola")
        u'es'
    """

    # Endpoint plus the fixed query parameters. ``translate`` and ``detect``
    # append the per-request parameters (``sl``, ``tl``, ``hl``, ``tk``).
    url = "http://translate.google.com/translate_a/t?client=webapp&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&otf=2&ssel=0&tsel=0&kc=1"

    # Headers sent with every request issued by ``_request``.
    headers = {
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) '
            'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19')
    }

    def translate(self, source, from_lang='auto', to_lang='en', host=None, type_=None):
        """Translate the source text from one language to another.

        :param source: Text to translate.
        :param from_lang: Value sent as the ``sl`` query parameter.
        :param to_lang: Value sent as the ``tl`` and ``hl`` query parameters.
        :param host: Proxy host forwarded to ``_request``.
        :param type_: Proxy type forwarded to ``_request``.
        :returns: The element extracted from the decoded JSON response.
        :raises NotTranslated: If the extracted result is empty or equal to
            ``source`` (ignoring surrounding whitespace).
        """
        if PY2:
            source = source.encode('utf-8')
        data = {"q": source}
        url = u'{url}&sl={from_lang}&tl={to_lang}&hl={to_lang}&tk={tk}'.format(
            url=self.url,
            from_lang=from_lang,
            to_lang=to_lang,
            tk=_calculate_tk(source),
        )
        response = self._request(url, host=host, type_=type_, data=data)
        result = json.loads(response)[0][0][0][0]
        if isinstance(result, list):
            try:
                result = result[0]  # ignore detected language
            except IndexError:
                # An empty list is left as is; validation below rejects it.
                pass
        self._validate_translation(source, result)
        return result

    def detect(self, source, host=None, type_=None):
        """Detect the source text's language.

        :param source: Text whose language should be detected.
        :param host: Proxy host forwarded to ``_request``.
        :param type_: Proxy type forwarded to ``_request``.
        :returns: The element at ``[0][0][2]`` of the decoded JSON response.
        :raises TranslatorError: If ``source`` is shorter than 3 (measured
            after UTF-8 encoding when running under ``PY2``).
        """
        if PY2:
            source = source.encode('utf-8')
        if len(source) < 3:
            raise TranslatorError('Must provide a string with at least 3 characters.')
        data = {"q": source}
        url = u'{url}&sl=auto&tk={tk}'.format(url=self.url, tk=_calculate_tk(source))
        response = self._request(url, host=host, type_=type_, data=data)
        language = json.loads(response)[0][0][2]
        return language

    def _validate_translation(self, source, result):
        """Validate API returned expected schema, and that the translated text
        is different than the original string.

        :raises NotTranslated: If ``result`` is falsy, or if it equals
            ``source`` once both are stripped of surrounding whitespace.
        """
        if not result:
            raise NotTranslated('Translation API returned an empty response.')
        if PY2:
            # ``source`` was already encoded by the caller; encode the result
            # as well so both sides of the comparison have the same type.
            result = result.encode('utf-8')
        if result.strip() == source.strip():
            raise NotTranslated('Translation API returned the input string unchanged.')

    def _request(self, url, host=None, type_=None, data=None):
        """Send a request to ``url`` and return the body decoded as UTF-8.

        :param url: Full URL to open.
        :param host: Proxy host; a proxy is set when ``host`` or ``type_`` is truthy.
        :param type_: Proxy type, passed to ``set_proxy`` together with ``host``.
        :param data: Mapping that is URL-encoded and sent as the request body.
            ``None`` sends no body.
        :returns: The response content decoded as UTF-8 text.
        """
        # ``urlencode(None)`` raises, so only encode when a payload was given.
        encoded_data = None if data is None else urlencode(data).encode('utf-8')
        req = request.Request(url=url, headers=self.headers, data=encoded_data)
        if host or type_:
            req.set_proxy(host=host, type=type_)
        resp = request.urlopen(req)
        # try/finally rather than ``with``: keeps working for response objects
        # that are not context managers, and always releases the connection.
        try:
            content = resp.read()
        finally:
            resp.close()
        return content.decode('utf-8')
def _unescape(text):
"""Unescape unicode character codes within a string.
"""
pattern = r'\\{1,2}u[0-9a-fA-F]{4}'
return re.sub(pattern, lambda x: codecs.getdecoder('unicode_escape')(x.group())[0], text)
def _calculate_tk(source):
"""Reverse engineered cross-site request protection."""
# Source: https://github.com/soimort/translate-shell/issues/94#issuecomment-165433715
# Source: http://www.liuxiatool.com/t.php
def c_int(x, nbits=32):
""" C cast to int32, int16, int8... """
return (x & ((1 << (nbits - 1)) - 1)) - (x & (1 << (nbits - 1)))
def c_uint(x, nbits=32):
""" C cast to uint32, uint16, uint8... """
return x & ((1 << nbits) - 1)
tkk = [406398, 561666268 + 1526272306]
b = tkk[0]
if PY2:
d = map(ord, source)
else:
d = source.encode('utf-8')
def RL(a, b):
for c in range(0, len(b) - 2, 3):
d = b[c + 2]
d = ord(d) - 87 if d >= 'a' else int(d)
xa = c_uint(a)
d = xa >> d if b[c + 1] == '+' else xa << d
a = a + d & 4294967295 if b[c] == '+' else a ^ d
return c_int(a)
a = b
for di in d:
a = RL(a + di, "+-a^+6")
a = RL(a, "+-3^+b+-f")
a ^= tkk[1]
a = a if a >= 0 else ((a & 2147483647) + 2147483648)
a %= pow(10, 6)
tk = '{0:d}.{1:d}'.format(a, a ^ b)
return tk