Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions compass/extraction/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,9 @@ async def _extract_with_ngram_check(
if not cleaned_text:
logger.debug(
"No cleaned text found after extraction on attempt %d "
"for document with source %s. Retrying...",
"of %d for document with source %s. Retrying...",
attempt,
num_tries,
source,
)
continue
Expand All @@ -371,9 +372,10 @@ async def _extract_with_ngram_check(
if ngram_frac >= ngram_thresh:
logger.debug(
"Document extraction for %r passed ngram check on attempt %d "
"with score %.2f (OCR: %r; Document source: %s)",
"of %d with score %.2f (OCR: %r; Document source: %s)",
out_text_key,
attempt + 1,
attempt,
num_tries,
ngram_frac,
doc_is_from_ocr,
source,
Expand All @@ -384,10 +386,11 @@ async def _extract_with_ngram_check(
best_score = max(best_score, ngram_frac)

logger.debug(
"Document extraction for %r failed ngram check on attempt %d "
"with score %.2f (OCR: %r; Document source: %s). Retrying...",
"Document extraction for %r failed ngram check on attempt %d of "
"%d, with score %.2f (OCR: %r; Document source: %s). Retrying...",
out_text_key,
attempt + 1,
attempt,
num_tries,
ngram_frac,
doc_is_from_ocr,
source,
Expand Down
8 changes: 4 additions & 4 deletions compass/plugin/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from abc import ABC, abstractmethod

from compass.plugin.base import BaseExtractionPlugin
from compass.llm.calling import BaseLLMCaller, LLMCaller
from compass.llm.calling import BaseLLMCaller
from compass.extraction import extract_relevant_text_with_ngram_validation
from compass.scripts.download import filter_ordinance_docs
from compass.services.threaded import CLEANED_FP_REGISTRY, CleanedFileWriter
Expand Down Expand Up @@ -238,12 +238,11 @@ async def extract_relevant_text(self, doc, extractor_class, model_config):
model_config : LLMConfig
Configuration for the LLM model to use for text extraction.
"""
llm_caller = LLMCaller(
extractor = extractor_class(
llm_service=model_config.llm_service,
usage_tracker=self.usage_tracker,
**model_config.llm_call_kwargs,
)
extractor = extractor_class(llm_caller)
doc = await extract_relevant_text_with_ngram_validation(
doc,
model_config.text_splitter,
Expand Down Expand Up @@ -316,11 +315,12 @@ async def filter_docs(
),
)

heuristic = await self.get_heuristic()
docs = await filter_ordinance_docs(
docs,
self.jurisdiction,
self.model_configs,
heuristic=self.HEURISTIC(),
heuristic=heuristic,
tech=self.IDENTIFIER,
text_collectors=self.TEXT_COLLECTORS,
usage_tracker=self.usage_tracker,
Expand Down
10 changes: 0 additions & 10 deletions compass/plugin/noop.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,6 @@ def _store_chunk(self, parser, chunk_ind):
class NoOpTextExtractor(BaseTextExtractor):
"""NoOp text extractor that returns the full text"""

def __init__(self, llm_caller):
"""

Parameters
----------
llm_caller : LLMCaller
LLM Caller instance used to extract ordinance info with.
"""
self.llm_caller = llm_caller

async def return_original(self, text_chunks): # noqa: PLR6301
"""No processing, just return original text

Expand Down
Loading