Skip to content

Commit 0793386

Browse files
authored
GH-233: Add lang_detect module (#733)
1 parent dd19616 commit 0793386

File tree

8 files changed

+98
-1
lines changed

8 files changed

+98
-1
lines changed

README.md

+26-1
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,31 @@ Satisfaction, guaranteed.
292292
```
293293
</details>
294294

295+
<details>
296+
<summary><b><a href="">Lang Detect</a></b> - Identifying the Language of Text
297+
<code>⚛️</code>
298+
</summary>
299+
300+
<br/>
301+
302+
Lang Detect API. Thanks to the awesome work of [FastText](https://fasttext.cc/docs/en/language-identification.html)
303+
304+
Install extended dependencies and models
305+
306+
```bash
307+
$ pip install underthesea[lang-detect]
308+
```
309+
310+
Usage examples in script
311+
312+
```python
313+
>>> from underthesea.pipeline.lang_detect import lang_detect
314+
315+
>>> lang_detect("Cựu binh Mỹ trả nhật ký nhẹ lòng khi thấy cuộc sống hòa bình tại Việt Nam")
316+
'vi'
317+
```
318+
</details>
319+
295320
<details>
296321
<summary><b><a href="">Say 🗣️</a></b> - Converting written text into spoken audio
297322
<code>⚛️</code>
@@ -363,7 +388,7 @@ Resource CP_Vietnamese_VLC_v2_2022 is downloaded in ~/.underthesea/datasets/CP_V
363388

364389
* Automatic Speech Recognition
365390
* Machine Translation
366-
* Chatbot (Chat & Speak)
391+
* Chatbot Agent
367392

368393
## Contributing
369394

setup.py

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
],
4343
'prompt': [
4444
'openai'
45+
],
46+
'lang-detect': [
47+
'fasttext '
4548
]
4649
}
4750
setup(

tests/pipeline/lang_detect/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# -*- coding: utf-8 -*-
2+
from unittest import TestCase
3+
from underthesea import lang_detect
4+
5+
6+
class TestLangDetect(TestCase):
    """Checks the label returned by ``lang_detect`` for three sample sentences."""

    def test_lang_detect_1(self):
        """A Vietnamese sentence is expected to be labelled ``vi``."""
        sentence = "Bộ Công Thương xóa một tổng cục, giảm nhiều đầu mối"
        self.assertEqual(lang_detect(sentence), "vi")

    def test_lang_detect_2(self):
        """A French sentence is expected to be labelled ``fr``."""
        sentence = "Ceci est un texte français."
        self.assertEqual(lang_detect(sentence), "fr")

    def test_lang_detect_3(self):
        """A Japanese sentence is expected to be labelled ``ja``."""
        sentence = "如來の妙色身、 世間與に等しきは無し。"
        self.assertEqual(lang_detect(sentence), "ja")

underthesea/__init__.py

+6
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@
4949
except Exception:
5050
pass
5151

52+
def lang_detect(*args, **kwargs):
    """Lazily forward to ``underthesea.pipeline.lang_detect.lang_detect``.

    The real implementation is imported on first call rather than at package
    import time, so importing ``underthesea`` without the optional
    ``lang-detect`` extra neither prints an error nor leaves this name
    undefined. If the implementation cannot be imported, the import error is
    raised to the caller at call time.

    Args:
        *args: Positional arguments passed through unchanged.
        **kwargs: Keyword arguments passed through unchanged.

    Returns:
        Whatever the underlying ``lang_detect`` returns.
    """
    from underthesea.pipeline.lang_detect import lang_detect as _lang_detect

    return _lang_detect(*args, **kwargs)
56+
5257

5358
# lazy loading
5459
def dependency_parse(*args, **kwargs):
@@ -61,6 +66,7 @@ def dependency_parse(*args, **kwargs):
6166
'text_normalize',
6267
'word_tokenize', 'pos_tag', 'chunk',
6368
'ner',
69+
'lang_detect',
6470
'classify', 'sentiment',
6571
'dependency_parse'
6672
]

underthesea/model_fetcher.py

+12
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class UTSModel(Enum):
2323
sa_general = "SA_GENERAL"
2424
sa_bank = "SA_BANK"
2525
sa_bank_v131 = "SA_BANK_V131"
26+
lang_detect_fast_text = "LANG_DETECT_FAST_TEXT"
2627

2728
# flake8: noqa: C901
2829

@@ -161,6 +162,10 @@ def download(model_name):
161162
if model_name == "VIET_TTS_V0_4_1":
162163
ModelFetcher.download_zip(REPO[model_name])
163164

165+
if model_name == "LANG_DETECT_FAST_TEXT":
166+
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
167+
cached_path(url, cache_dir=cache_dir)
168+
164169
@staticmethod
165170
def list(all=False):
166171
models = []
@@ -206,6 +211,13 @@ def get_model_path(model):
206211

207212
if model == UTSModel.sa_bank:
208213
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"
214+
215+
if model == UTSModel.sa_bank:
216+
return Path(UNDERTHESEA_FOLDER) / "models" / "SA_BANK"
217+
218+
if model == "LANG_DETECT_FAST_TEXT":
219+
return Path(UNDERTHESEA_FOLDER) / "models" / "lid.176.bin"
220+
209221
return Path(UNDERTHESEA_FOLDER) / "models" / model
210222

211223

underthesea/models.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,12 @@ VIET_TTS_V0_4_1:
4848
year: 2023
4949
url: https://github.com/undertheseanlp/underthesea/releases/download/resources/viet_tts_v0.4.1.zip
5050
filename: viet_tts_v0.4.1.zip
51+
LANG_DETECT_FAST_TEXT:
52+
cache_dir: models
53+
model_path: LANG_DETECT_FAST_TEXT
54+
type: Lang Detect
55+
license: Open
56+
year: 2020
57+
url: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
58+
filename: lid.176.bin
5159

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import fasttext
2+
import os
3+
from underthesea.model_fetcher import ModelFetcher
4+
5+
fasttext.FastText.eprint = lambda x: None
6+
lang_detect_model = None
7+
8+
9+
def lang_detect(text):
    """Identify the language of ``text`` with the fastText language-ID model.

    On the first call the model file is located through ``ModelFetcher``
    (and downloaded if it is not on disk yet), loaded with
    ``fasttext.load_model`` and cached in the module-level
    ``lang_detect_model``; later calls reuse the cached model.

    Args:
        text: The text to classify. Newline characters are replaced with
            spaces before prediction, because fastText's ``predict`` handles
            one line at a time and rejects input containing a newline.

    Returns:
        The top predicted label with the ``__label__`` prefix removed,
        for example ``"vi"``.

    Raises:
        Exception: Any error raised while downloading or loading the model
            is propagated instead of being swallowed, so the caller sees the
            real cause rather than a later ``AttributeError`` on ``None``.
    """
    global lang_detect_model

    # Resolve and load the model only when nothing is cached yet.
    if lang_detect_model is None:
        model_name = "LANG_DETECT_FAST_TEXT"
        model_path = ModelFetcher.get_model_path(model_name)
        if not os.path.exists(model_path):
            ModelFetcher.download(model_name)
        lang_detect_model = fasttext.load_model(str(model_path))

    # fastText's predict raises on embedded newlines; flatten to a single line.
    single_line = text.replace("\n", " ")
    predictions = lang_detect_model.predict(single_line)
    language = predictions[0][0].replace('__label__', '')
    return language

0 commit comments

Comments
 (0)