@@ -26,7 +26,6 @@ def __init__(self):
         from langid import rank
         from jieba_next import cut_for_search, setLogLevel
         from pypinyin import lazy_pinyin
-        from stopwordsiso import stopwords
         from wordsegment import load, segment
 
         # Turn off jieba debug log.
@@ -38,16 +37,13 @@ def __init__(self):
         self._tokenize_zh_cn = cut_for_search
         self._tokenize_en = segment
         self._pinyin = lazy_pinyin
-        self._stopwords = stopwords
 
         self._punctuation = (
             string.punctuation
             + '!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.·'
         )
 
-    def extract(
-        self, text: str, top_n: int | None = None, strip_stopwords: bool = True
-    ) -> list[str]:
+    def extract(self, text: str, top_n: int | None = None) -> list[str]:
         """Return keywords of given text."""
         # TODO: zh -> en
         # Normalize
@@ -57,8 +53,6 @@ def extract(
         # Invalid token removal
         words = self.strip_invalid_token(words)
         # Stopwords removal
-        if strip_stopwords:
-            words = self.strip_stopwords(words)
         if top_n:
             # Get top n words as keyword
             keywords = Counter(words).most_common(top_n)
@@ -106,13 +100,5 @@ def tokenize(self, text: str) -> list[str]:
     def trans_to_pinyin(self, word: str) -> str | None:
         return ' '.join(self._pinyin(word, errors='ignore'))
 
-    def strip_stopwords(self, words: list[str]) -> list[str]:
-        stw = self._stopwords(['en', 'zh'])
-        new_words = []
-        for word in words:
-            if word not in stw:
-                new_words.append(word)
-        return new_words
-
     def strip_invalid_token(self, tokens: list[str]) -> list[str]:
         return [token for token in tokens if token != '']
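
This change drops the built-in stopword filtering from extract(). Callers that still want the old behavior can filter the returned keywords themselves. A minimal sketch of that caller-side filtering, reusing the same stopwordsiso call and 'en'/'zh' language set the deleted strip_stopwords() relied on; the free function and the usage line are illustrative, not part of this change:

from stopwordsiso import stopwords

def strip_stopwords(words: list[str]) -> list[str]:
    # Same 'en' + 'zh' stopword set the removed method loaded.
    stw = stopwords(['en', 'zh'])
    return [word for word in words if word not in stw]

# Applied to the new extract() signature, e.g.:
# keywords = strip_stopwords(extractor.extract(text))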