Exclude parser when running Spacy model

PJ-Finlay · web-flow · commit 95c6f3375570 · 2024-09-26T15:03:50.000-04:00
This change avoids loading unnecessary pipeline components when loading the spaCy sentence segmentation model, which should improve performance.

From the spaCy documentation:

> The SentenceRecognizer is a simple statistical component that only provides sentence boundaries. Along with being faster and smaller than the parser, its primary advantage is that it’s easier to train because it only requires annotated sentence boundaries rather than full dependency parses. spaCy’s trained pipelines include both a parser and a trained sentence segmenter, which is disabled by default. If you only need sentence boundaries and no parser, you can use the `exclude` or `disable` argument on `spacy.load`.

Source: https://spacy.io/usage/linguistic-features/#sbd-senter
diff --git a/argostranslate/sbd.py b/argostranslate/sbd.py
@@ -18,16 +18,18 @@ def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[st
 
 # Spacy sentence boundary detection Sentencizer
 # https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3
+# https://spacy.io/usage/linguistic-features/#sbd
 
 # Download model:
 # python -m spacy download xx_sent_ud_sm
 class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
     def __init__(self):
         try:
-            self.nlp = spacy.load("xx_sent_ud_sm")
+            self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
         except OSError:
+            # Automatically download the model if it doesn't exist
             spacy.cli.download("xx_sent_ud_sm")
-            self.nlp = spacy.load("xx_sent_ud_sm")
+            self.nlp = spacy.load("xx_sent_ud_sm", exclude=["parser"])
         self.nlp.add_pipe("sentencizer")
 
     def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]: