
Commit ba1f929

Add budoux

1 parent 6022bf7 commit ba1f929

5 files changed (+111 / -1 lines)

docker_requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
 wtpsplit==1.3.0
 wunsen==0.0.3
 word2word>=1.0.0,<2
+budoux==0.7.0

pythainlp/tokenize/budoux.py (new file)

Lines changed: 95 additions & 0 deletions
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Wrapper for the BudouX tokenizer (https://github.com/google/budoux)

This module provides a small, defensive wrapper around the Python
`budoux` package. The wrapper lazy-imports the package so that importing
`pythainlp.tokenize` will not fail if `budoux` is not installed. When
the wrapper is used and `budoux` is missing, a clear ImportError is
raised with an installation hint.

The BudouX API surface has changed across versions; this wrapper tries
several common entry points (`parse`, `segment`, `break_lines`) and
normalizes the output into a list of strings.
"""
from typing import List

_parser = None


def _init_parser():
    """Lazily initialize and return a budoux parser instance.

    Raises ImportError when `budoux` is not installed.
    """
    try:
        import budoux
    except Exception as exc:  # pragma: no cover - defensive import
        raise ImportError(
            "budoux is not installed. Install it with: pip install budoux"
        ) from exc

    return budoux.load_default_thai_parser()


def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    Returns a list of strings. Raises ImportError with an installation
    hint if `budoux` is not available, and RuntimeError if the installed
    budoux does not expose a supported API.
    """
    if not text or not isinstance(text, str):
        return []

    global _parser
    if _parser is None:
        _parser = _init_parser()

    parser = _parser

    # Call the most likely parse/segment method and normalize the output.
    if callable(getattr(parser, "parse", None)):
        result = parser.parse(text)
    elif callable(getattr(parser, "segment", None)):
        result = parser.segment(text)
    elif callable(getattr(parser, "break_lines", None)):
        result = parser.break_lines(text)
    else:
        raise RuntimeError("Unable to call budoux parser method.")

    # Normalize: allow list[str], list[dict], or str (split on newlines)
    if isinstance(result, str):
        # some implementations return a string with newlines
        return [s for s in result.splitlines() if s]

    if isinstance(result, list):
        out: List[str] = []
        for item in result:
            if isinstance(item, str):
                out.append(item)
            elif isinstance(item, dict):
                # Some APIs may return dict-like segments
                out.append(item["text"] if "text" in item else str(item))
            else:
                out.append(str(item))
        return out

    # Fallback: stringify whatever we got
    return [str(result)]
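
A minimal usage sketch of the wrapper on its own, assuming `budoux`>=0.7.0 is installed (the Thai sample string is an arbitrary example; exact token boundaries depend on BudouX's bundled Thai model):

    from pythainlp.tokenize.budoux import segment

    # Empty or non-str input returns []; otherwise a list of strings.
    print(segment("วันนี้อากาศดี"))
    # e.g. ['วันนี้', 'อากาศดี'] -- boundaries are model-dependent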

pythainlp/tokenize/core.py

Lines changed: 8 additions & 1 deletion
@@ -152,6 +152,8 @@ def word_tokenize(
     * *tltk* - wrapper for
       `TLTK <https://pypi.org/project/tltk/>`_.,
       maximum collocation approach
+    * *budoux* - wrapper for
+      `budoux <https://github.com/google/budoux>`_.
 :Note:
     - The **custom_dict** parameter only works for \
       *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
         "nercut",
         "sefr_cut",
         "tltk",
-        "oskut"
+        "oskut",
+        "budoux",
     ):
         raise NotImplementedError(
             f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
     elif engine == "icu":
         from pythainlp.tokenize.pyicu import segment

+        segments = segment(text)
+    elif engine == "budoux":
+        from pythainlp.tokenize.budoux import segment
+
         segments = segment(text)
     elif engine == "nercut":
         from pythainlp.tokenize.nercut import segment
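
With this change the engine is selectable through the standard tokenizer API; a quick sketch (same caveat: the segmentation shown is a hypothetical example and depends on the model):

    from pythainlp.tokenize import word_tokenize

    tokens = word_tokenize("ทดสอบการตัดคำ", engine="budoux")
    print(tokens)  # a list of str, e.g. ['ทดสอบ', 'การ', 'ตัดคำ']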

setup.py

Lines changed: 2 additions & 0 deletions
@@ -86,6 +86,7 @@
     "thai_nner": ["thai_nner"],
     "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
     "thai2rom": [NUMPY, "torch>=1.0.0"],
+    "budoux": ["budoux>=0.7.0"],
     "translate": [
         'fairseq>=0.10.0,<0.13;python_version<"3.11"',
         'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
         "wtpsplit>=1.0.1",
         "wunsen>=0.0.3",
         "word2word>=1.0.0",
+        "budoux>=0.7.0",
     ],
 }
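
Because the dependency is declared both as its own extra and in the catch-all list, it stays opt-in: assuming a release that includes this commit, `pip install pythainlp[budoux]` (or plain `pip install budoux`, as the wrapper's ImportError hint suggests) is enough to enable the engine.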

tests/extra/testx_tokenize.py

Lines changed: 5 additions & 0 deletions
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
 class WordTokenizeTLTKTestCase(unittest.TestCase):
     def test_word_tokenize_tltk(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
+
+
+class WordTokenizeBudouxTestCase(unittest.TestCase):
+    def test_word_tokenize_budoux(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))
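
Assuming `budoux` is installed and the test packages are importable, the new case can be run in isolation with unittest's standard CLI, e.g. `python -m unittest tests.extra.testx_tokenize.WordTokenizeBudouxTestCase`.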
