@@ -26,7 +26,6 @@ def __init__(self):
         from langid import rank
         from jieba_next import cut_for_search, setLogLevel
         from pypinyin import lazy_pinyin
-        from stopwordsiso import stopwords
         from wordsegment import load, segment
 
         # Turn off jieba debug log.
@@ -38,16 +37,13 @@ def __init__(self):
         self._tokenize_zh_cn = cut_for_search
         self._tokenize_en = segment
         self._pinyin = lazy_pinyin
-        self._stopwords = stopwords
 
         self._punctuation = (
             string.punctuation
             + '!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.·'
         )
 
-    def extract(
-        self, text: str, top_n: int | None = None, strip_stopwords: bool = True
-    ) -> list[str]:
+    def extract(self, text: str, top_n: int | None = None) -> list[str]:
         """Return keywords of given text."""
         # TODO: zh -> en
         # Normalize
@@ -57,8 +53,6 @@ def extract(
         # Invalid token removal
         words = self.strip_invalid_token(words)
         # Stopwords removal
-        if strip_stopwords:
-            words = self.strip_stopwords(words)
         if top_n:
             # Get top n words as keyword
             keywords = Counter(words).most_common(top_n)
@@ -106,13 +100,5 @@ def tokenize(self, text: str) -> list[str]:
     def trans_to_pinyin(self, word: str) -> str | None:
         return ' '.join(self._pinyin(word, errors='ignore'))
 
-    def strip_stopwords(self, words: list[str]) -> list[str]:
-        stw = self._stopwords(['en', 'zh'])
-        new_words = []
-        for word in words:
-            if word not in stw:
-                new_words.append(word)
-        return new_words
-
     def strip_invalid_token(self, tokens: list[str]) -> list[str]:
         return [token for token in tokens if token != '']
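
This change drops the built-in stopword filtering from extract(). Callers that still want the old behavior can filter the returned keywords themselves. A minimal sketch of that caller-side filtering, reusing the same stopwordsiso call and 'en'/'zh' language set the deleted strip_stopwords() relied on; the free function and the usage line are illustrative, not part of this change:

from stopwordsiso import stopwords

def strip_stopwords(words: list[str]) -> list[str]:
    # Same 'en' + 'zh' stopword set the removed method loaded.
    stw = stopwords(['en', 'zh'])
    return [word for word in words if word not in stw]

# Applied to the new extract() signature, e.g.:
# keywords = strip_stopwords(extractor.extract(text))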