
Commit ba1f929

Add budoux

1 parent 6022bf7 commit ba1f929

5 files changed (+111 / -1 lines)

docker_requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
 wtpsplit==1.3.0
 wunsen==0.0.3
 word2word>=1.0.0,<2
+budoux==0.7.0

pythainlp/tokenize/budoux.py (new file)

Lines changed: 95 additions & 0 deletions
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Wrapper for the BudouX tokenizer (https://github.com/google/budoux)

This module provides a small, defensive wrapper around the Python
`budoux` package. The wrapper lazy-imports the package so that importing
`pythainlp.tokenize` will not fail if `budoux` is not installed. When
the wrapper is used and `budoux` is missing, a clear ImportError is
raised with an installation hint.

The BudouX API surface has changed across versions; this wrapper tries
several common entry points (`parse`, `segment`, `break_lines`) and
normalizes the output into a list of strings.
"""
from typing import List

_parser = None


def _init_parser():
    """Lazily initialize and return a budoux parser instance.

    Raises ImportError when `budoux` is not installed.
    """
    try:
        import budoux
    except Exception as exc:  # pragma: no cover - defensive import
        raise ImportError(
            "budoux is not installed. Install it with: pip install budoux"
        ) from exc

    return budoux.load_default_thai_parser()


def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    Returns a list of strings. Raises ImportError with an installation
    hint if `budoux` is not available, and RuntimeError if the installed
    budoux does not expose a supported API.
    """
    if not text or not isinstance(text, str):
        return []

    global _parser
    if _parser is None:
        _parser = _init_parser()

    parser = _parser

    # Call the most likely parse/segment method and normalize the output.
    if callable(getattr(parser, "parse", None)):
        result = parser.parse(text)
    elif callable(getattr(parser, "segment", None)):
        result = parser.segment(text)
    elif callable(getattr(parser, "break_lines", None)):
        result = parser.break_lines(text)
    else:
        raise RuntimeError("Unable to call budoux parser method.")

    # Normalize: allow list[str], list[dict], or str (split on newlines)
    if isinstance(result, str):
        # some implementations return a string with newlines
        return [s for s in result.splitlines() if s]

    if isinstance(result, list):
        out: List[str] = []
        for item in result:
            if isinstance(item, str):
                out.append(item)
            elif isinstance(item, dict):
                # Some APIs may return dict-like segments
                out.append(item["text"] if "text" in item else str(item))
            else:
                out.append(str(item))
        return out

    # Fallback: stringify whatever we got
    return [str(result)]
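
A minimal usage sketch of the wrapper on its own, assuming `budoux`>=0.7.0 is installed (the Thai sample string is an arbitrary example; exact token boundaries depend on BudouX's bundled Thai model):

    from pythainlp.tokenize.budoux import segment

    # Empty or non-str input returns []; otherwise a list of strings.
    print(segment("วันนี้อากาศดี"))
    # e.g. ['วันนี้', 'อากาศดี'] -- boundaries are model-dependent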

pythainlp/tokenize/core.py

Lines changed: 8 additions & 1 deletion
@@ -152,6 +152,8 @@ def word_tokenize(
     * *tltk* - wrapper for
       `TLTK <https://pypi.org/project/tltk/>`_.,
       maximum collocation approach
+    * *budoux* - wrapper for
+      `budoux <https://github.com/google/budoux>`_.
 :Note:
     - The **custom_dict** parameter only works for \
       *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
         "nercut",
         "sefr_cut",
         "tltk",
-        "oskut"
+        "oskut",
+        "budoux",
     ):
         raise NotImplementedError(
             f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
     elif engine == "icu":
         from pythainlp.tokenize.pyicu import segment

+        segments = segment(text)
+    elif engine == "budoux":
+        from pythainlp.tokenize.budoux import segment
+
         segments = segment(text)
     elif engine == "nercut":
         from pythainlp.tokenize.nercut import segment
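
With this change the engine is selectable through the standard tokenizer API; a quick sketch (same caveat: the segmentation shown is a hypothetical example and depends on the model):

    from pythainlp.tokenize import word_tokenize

    tokens = word_tokenize("ทดสอบการตัดคำ", engine="budoux")
    print(tokens)  # a list of str, e.g. ['ทดสอบ', 'การ', 'ตัดคำ']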

setup.py

Lines changed: 2 additions & 0 deletions
@@ -86,6 +86,7 @@
     "thai_nner": ["thai_nner"],
     "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
     "thai2rom": [NUMPY, "torch>=1.0.0"],
+    "budoux": ["budoux>=0.7.0"],
     "translate": [
         'fairseq>=0.10.0,<0.13;python_version<"3.11"',
         'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
         "wtpsplit>=1.0.1",
         "wunsen>=0.0.3",
         "word2word>=1.0.0",
+        "budoux>=0.7.0",
     ],
 }
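
Because the dependency is declared both as its own extra and in the catch-all list, it stays opt-in: assuming a release that includes this commit, `pip install pythainlp[budoux]` (or plain `pip install budoux`, as the wrapper's ImportError hint suggests) is enough to enable the engine.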

tests/extra/testx_tokenize.py

Lines changed: 5 additions & 0 deletions
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
 class WordTokenizeTLTKTestCase(unittest.TestCase):
     def test_word_tokenize_tltk(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
+
+
+class WordTokenizeBudouxTestCase(unittest.TestCase):
+    def test_word_tokenize_budoux(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))
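
Assuming `budoux` is installed and the test packages are importable, the new case can be run in isolation with unittest's standard CLI, e.g. `python -m unittest tests.extra.testx_tokenize.WordTokenizeBudouxTestCase`.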
