|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project |
| 3 | +# SPDX-FileType: SOURCE |
| 4 | +# SPDX-License-Identifier: Apache-2.0 |
| 5 | +""" |
| 6 | +Wrapper for BudouX tokenizer (https://github.com/google/budoux) |
| 7 | +
|
| 8 | +This module provides a small, defensive wrapper around the Python |
| 9 | +`budoux` package. The wrapper lazy-imports the package so importing |
| 10 | +`pythainlp.tokenize` will not fail if `budoux` is not installed. When |
| 11 | +used and `budoux` is missing, a clear ImportError is raised with an |
| 12 | +installation hint. |
| 13 | +
|
| 14 | +The wrapper loads BudouX's default Thai parser and calls the first |
| 15 | +available method among `parse`, `segment`, and `break_lines`, then |
| 16 | +normalizes the output into a list of strings. |
| 17 | +""" |
| 18 | +from typing import List |
| 19 | + |
# Module-level cache for the parser; stays None until it is first needed.
_parser = None


def _init_parser():
    """Create and return BudouX's default Thai parser.

    ``budoux`` is imported inside this function rather than at module
    level, so importing this module does not require the optional
    dependency to be installed.

    :return: whatever ``budoux.load_default_thai_parser()`` returns
    :raises ImportError: if ``budoux`` cannot be imported
    :raises RuntimeError: if the installed ``budoux`` does not provide
        a callable ``load_default_thai_parser``
    """
    try:
        import budoux
    except ImportError as exc:  # pragma: no cover - optional dependency
        # Only a failed import means "not installed"; any other error
        # raised while importing budoux is left to propagate unchanged.
        raise ImportError(
            "budoux is not installed. Install it with: pip install budoux"
        ) from exc

    loader = getattr(budoux, "load_default_thai_parser", None)
    if not callable(loader):
        raise RuntimeError(
            "The installed budoux does not provide "
            "load_default_thai_parser(). "
            "Upgrade it with: pip install --upgrade budoux"
        )
    return loader()
| 37 | + |
| 38 | + |
def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    The parser is created on first use and cached in the module-level
    ``_parser`` variable. The first callable attribute found on the
    parser among ``parse``, ``segment`` and ``break_lines`` (tried in
    that order) is called with `text`, and its result is normalized:

    * a string is split on line boundaries, dropping empty lines;
    * a list or tuple is converted item by item: a dict with a
      ``"text"`` key contributes that value, anything else contributes
      ``str(item)``;
    * any other result is returned as a one-element list holding
      ``str(result)``.

    :param text: text to segment
    :return: list of tokens; an empty list if `text` is empty or is
        not a ``str``
    :raises ImportError: if ``budoux`` is not installed
    :raises RuntimeError: if the parser exposes none of the supported
        methods
    """
    global _parser

    if not text or not isinstance(text, str):
        return []

    if _parser is None:
        _parser = _init_parser()

    # The first callable entry point wins; the order is the priority.
    for method_name in ("parse", "segment", "break_lines"):
        method = getattr(_parser, method_name, None)
        if callable(method):
            result = method(text)
            break
    else:
        raise RuntimeError("Unable to call budoux parser method.")

    if isinstance(result, str):
        # Some implementations return one newline-joined string.
        return [line for line in result.splitlines() if line]

    if isinstance(result, (list, tuple)):
        return [
            item["text"]
            if isinstance(item, dict) and "text" in item
            else str(item)
            for item in result
        ]

    # Fallback: stringify whatever we got.
    return [str(result)]
0 commit comments