diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index c569d144c..63deb52c0 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -359,8 +359,9 @@ async def _extract_with_ngram_check( if not cleaned_text: logger.debug( "No cleaned text found after extraction on attempt %d " - "for document with source %s. Retrying...", + "of %d for document with source %s. Retrying...", attempt, + num_tries, source, ) continue @@ -371,9 +372,10 @@ async def _extract_with_ngram_check( if ngram_frac >= ngram_thresh: logger.debug( "Document extraction for %r passed ngram check on attempt %d " - "with score %.2f (OCR: %r; Document source: %s)", + "of %d with score %.2f (OCR: %r; Document source: %s)", out_text_key, - attempt + 1, + attempt, + num_tries, ngram_frac, doc_is_from_ocr, source, @@ -384,10 +386,11 @@ async def _extract_with_ngram_check( best_score = max(best_score, ngram_frac) logger.debug( - "Document extraction for %r failed ngram check on attempt %d " - "with score %.2f (OCR: %r; Document source: %s). Retrying...", + "Document extraction for %r failed ngram check on attempt %d of " + "%d, with score %.2f (OCR: %r; Document source: %s). Retrying...", out_text_key, - attempt + 1, + attempt, + num_tries, ngram_frac, doc_is_from_ocr, source, diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 5e4fd26c2..52af3dcba 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from compass.plugin.base import BaseExtractionPlugin -from compass.llm.calling import BaseLLMCaller, LLMCaller +from compass.llm.calling import BaseLLMCaller from compass.extraction import extract_relevant_text_with_ngram_validation from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CLEANED_FP_REGISTRY, CleanedFileWriter @@ -238,12 +238,11 @@ async def extract_relevant_text(self, doc, extractor_class, model_config): model_config : LLMConfig Configuration for the LLM model to use for text extraction. """ - llm_caller = LLMCaller( + extractor = extractor_class( llm_service=model_config.llm_service, usage_tracker=self.usage_tracker, **model_config.llm_call_kwargs, ) - extractor = extractor_class(llm_caller) doc = await extract_relevant_text_with_ngram_validation( doc, model_config.text_splitter, @@ -316,11 +315,12 @@ async def filter_docs( ), ) + heuristic = await self.get_heuristic() docs = await filter_ordinance_docs( docs, self.jurisdiction, self.model_configs, - heuristic=self.HEURISTIC(), + heuristic=heuristic, tech=self.IDENTIFIER, text_collectors=self.TEXT_COLLECTORS, usage_tracker=self.usage_tracker, diff --git a/compass/plugin/noop.py b/compass/plugin/noop.py index 7d7ede507..bdf44629e 100644 --- a/compass/plugin/noop.py +++ b/compass/plugin/noop.py @@ -76,16 +76,6 @@ def _store_chunk(self, parser, chunk_ind): class NoOpTextExtractor(BaseTextExtractor): """NoOp text extractor that returns the full text""" - def __init__(self, llm_caller): - """ - - Parameters - ---------- - llm_caller : LLMCaller - LLM Caller instance used to extract ordinance info with. 
- """ - self.llm_caller = llm_caller - async def return_original(self, text_chunks): # noqa: PLR6301 """No processing, just return original text diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index b3c4bb13f..4ab68576a 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -1,12 +1,9 @@ """COMPASS one-shot extraction plugin""" -import json import logging -import hashlib import importlib.resources -from pathlib import Path - -from platformdirs import user_data_dir +from asyncio import Semaphore +from enum import StrEnum, auto from compass.llm.calling import SchemaOutputLLMCaller from compass.plugin import ( @@ -17,21 +14,41 @@ PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceExtractionPlugin, + KeywordBasedHeuristic, +) +from compass.plugin.one_shot.generators import ( + generate_query_templates, + generate_website_keywords, + generate_heuristic_keywords, ) -from compass.plugin.one_shot.generators import generate_query_templates from compass.plugin.one_shot.components import ( SchemaBasedTextCollector, + SchemaBasedTextExtractor, SchemaOrdinanceParser, ) +from compass.plugin.one_shot.cache import key_from_cache, key_to_cache +from compass.services.threaded import CLEANED_FP_REGISTRY from compass.utilities.io import load_config from compass.utilities.enums import LLMTasks +from compass.exceptions import COMPASSPluginConfigurationError logger = logging.getLogger(__name__) _SCHEMA_DIR = importlib.resources.files("compass.plugin.one_shot.schemas") +_QT_SEMAPHORE = Semaphore(1) +_WK_SEMAPHORE = Semaphore(1) +_HK_SEMAPHORE = Semaphore(1) + + +class _CacheKey(StrEnum): + """LLM generated content cache keys""" + QUERY_TEMPLATES = auto() + WEBSITE_KEYWORDS = auto() + HEURISTIC_KEYWORDS = auto() -def create_schema_based_one_shot_extraction_plugin(config, tech): + +def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 """Create a one-shot extraction plugin based on a configuration Parameters @@ -39,14 +56,15 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): config : dict or path-like One-shot configuration dictionary. If not a dictionary, should be a path to a file containing the configuration (supported - formats: JSON, JSON5, YAML, TOML). See the wind ordinance schema + formats: JSON, JSON5, YAML, TOML). See the + `wind ordinance schema `_ for an example. The configuration must include the following keys: - `schema`: A dictionary representing the schema of the - output. Can also be a path to a file that contains the - schema (supported formats: JSON, JSON5, YAML, TOML). See - the wind ordinance schema for an example. + output. Can also be a path to a file that contains the + schema (supported formats: JSON, JSON5, YAML, TOML). See + the wind ordinance schema for an example. The configuration can also include the following optional keys: @@ -61,10 +79,20 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): that is being processed. If not provided, the LLM will be used to generate search engine queries based on the schema input. - - `website_keywords`: A list of keywords to use for - filtering websites during document retrieval. If not - provided, the LLM will be used to generate website - keywords based on the schema input. + - `website_keywords`: A dictionary mapping keywords to + scores for filtering websites during document retrieval. + If not provided, the LLM will be used to generate + website keywords based on the schema input. 
+ - `heuristic_keywords`: A dictionary containing the keyword + lists used by the heuristic document filter. The + dictionary must include ``not_tech_words``, + ``good_tech_keywords``, ``good_tech_acronyms``, and + ``good_tech_phrases`` keys. Alternatively, this input can + simply be ``True``, in which case the LLM will be used to + generate heuristic keyword lists based on the schema + input. If ``False``, ``None``, or not provided, a `NoOp` + heuristic that always returns ``True`` will be used (not + recommended if doing website crawling). - `collection_prompts`: A list of prompts to use for collecting relevant text from documents. Alternatively, this input can simply be ``True``, in which case the LLM @@ -78,7 +106,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): the text extraction prompts. If ``False``, ``None``, or not provided, the entire document text will be used for extraction (no text consolidation). - - `cache_query_templates`: Boolean flag indicating + - `cache_llm_generated_content`: Boolean flag indicating whether or not to cache generated query templates and website keywords for future use. By default, ``True``. Caching is recommended since the generation of query @@ -109,7 +137,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): text_collectors = _collectors_from_config(config) text_extractors = _extractors_from_config( - config, in_label=text_collectors[-1].OUT_LABEL + config, in_label=text_collectors[-1].OUT_LABEL, tech=tech ) parsers = _parser_from_config( config, in_label=text_extractors[-1].OUT_LABEL @@ -122,35 +150,11 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): IDENTIFIER = tech """str: Identifier for extraction task """ - # TODO: implement dynamic generation of the heuristic based on - # the extraction schema HEURISTIC = NoOpHeuristic """BaseHeuristic: Class with a ``check()`` method""" - # TODO: implement dynamic generation of the website keywords - # based on the extraction schema - WEBSITE_KEYWORDS = { - "pdf": 23040, - "zoning": 11520, - "ordinance": 5760, - r"renewable%20energy": 1440, - r"renewable+energy": 1440, - "renewable energy": 1440, - "planning": 720, - "plan": 360, - "government": 180, - "code": 60, - "area": 60, - r"land%20development": 15, - r"land+development": 15, - "land development": 15, - "land": 3, - "environment": 3, - "energy": 3, - "renewable": 3, - "municipal": 1, - "department": 1, - } + HEURISTIC_KEYWORDS = None + """dict: Keyword lists for heuristic content filtering""" TEXT_COLLECTORS = text_collectors """Classes for collecting text chunks from docs""" @@ -164,6 +168,34 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): QUERY_TEMPLATES = [] # set by user or LLM-generated """List: List of search engine query templates""" + WEBSITE_KEYWORDS = {} # set by user or LLM-generated + """dict: Keyword weight mapping for link crawl prioritization""" + + async def get_heuristic(self): + """Get a `BaseHeuristic` instance with a `check()` method + + The ``check()`` method should accept a string of text and + return ``True`` if the text passes the heuristic check and + ``False`` otherwise. 
+ """ + if self.HEURISTIC_KEYWORDS and self.HEURISTIC is not NoOpHeuristic: + return self.HEURISTIC() + + if not config.get("heuristic_keywords"): + return NoOpHeuristic() + + hk = await self._get_heuristic_keywords() + + class SchemaBasedHeuristic(KeywordBasedHeuristic): + NOT_TECH_WORDS = hk["NOT_TECH_WORDS"] + GOOD_TECH_KEYWORDS = hk["GOOD_TECH_KEYWORDS"] + GOOD_TECH_ACRONYMS = hk["GOOD_TECH_ACRONYMS"] + GOOD_TECH_PHRASES = hk["GOOD_TECH_PHRASES"] + + self.__class__.HEURISTIC_KEYWORDS = hk + self.__class__.HEURISTIC = SchemaBasedHeuristic + return self.HEURISTIC() + async def get_query_templates(self): """Get a list of query templates for document retrieval @@ -179,35 +211,168 @@ async def get_query_templates(self): return self.QUERY_TEMPLATES if qt := config.get("query_templates"): - self.QUERY_TEMPLATES = qt + self.__class__.QUERY_TEMPLATES = qt return qt - qt = _qt_from_cache(self.IDENTIFIER, config["schema"]) + qt = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.QUERY_TEMPLATES, + ) if qt: - self.QUERY_TEMPLATES = qt + self.__class__.QUERY_TEMPLATES = qt return qt - model_config = self.model_configs.get( - LLMTasks.PLUGIN_GENERATION, - self.model_configs[LLMTasks.DEFAULT], - ) - schema_llm = SchemaOutputLLMCaller( - llm_service=model_config.llm_service, - usage_tracker=self.usage_tracker, - **model_config.llm_call_kwargs, - ) - logger.debug("Generating query templates...") - qt = await generate_query_templates( - schema_llm, config["schema"], add_think_prompt=True - ) - logger.debug("Generated the following query templates:\n%r", qt) - self.QUERY_TEMPLATES = qt - - if config.get("cache_query_templates", True): - _qt_to_cache(self.IDENTIFIER, config["schema"], qt) + async with _QT_SEMAPHORE: + if self.QUERY_TEMPLATES: + return self.QUERY_TEMPLATES + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating query templates...") + qt = await generate_query_templates( + schema_llm, config["schema"], add_think_prompt=True + ) + logger.debug( + "Generated the following query templates:\n%r", qt + ) + self.__class__.QUERY_TEMPLATES = qt + + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.QUERY_TEMPLATES, + value=qt, + ) return qt + async def get_website_keywords(self): + """Get a dict of website search keyword scores + + Returns + ------- + dict + Dictionary mapping keywords to scores that indicate + links which should be prioritized when performing a + website scrape for a document. 
+ """ + if self.WEBSITE_KEYWORDS: + return self.WEBSITE_KEYWORDS + + if wk := config.get("website_keywords"): + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + return wk + + wk = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.WEBSITE_KEYWORDS, + ) + if wk: + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + return wk + + async with _WK_SEMAPHORE: + if self.WEBSITE_KEYWORDS: + return self.WEBSITE_KEYWORDS + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating website keywords...") + wk = await generate_website_keywords( + schema_llm, + config["schema"], + add_think_prompt=True, + ) + logger.debug( + "Generated the following website keywords:\n%r", wk + ) + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.WEBSITE_KEYWORDS, + value=wk, + ) + + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + + return wk + + async def _get_heuristic_keywords(self): + """Get keyword lists for the heuristic document filter""" + if self.HEURISTIC_KEYWORDS: + return self.HEURISTIC_KEYWORDS + + if isinstance(hk := config.get("heuristic_keywords"), dict): + hk = _normalize_heuristic_keywords(hk) + self.__class__.HEURISTIC_KEYWORDS = hk + return hk + + hk = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.HEURISTIC_KEYWORDS, + ) + if hk: + hk = _normalize_heuristic_keywords(hk) + self.__class__.HEURISTIC_KEYWORDS = hk + return hk + + async with _HK_SEMAPHORE: + if self.HEURISTIC_KEYWORDS: + return self.HEURISTIC_KEYWORDS + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating heuristic keywords...") + hk = await generate_heuristic_keywords( + schema_llm, + config["schema"], + add_think_prompt=True, + ) + hk = _normalize_heuristic_keywords(hk) + logger.debug( + "Generated the following heuristic keywords:\n%r", hk + ) + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.HEURISTIC_KEYWORDS, + value=hk, + ) + + self.__class__.HEURISTIC_KEYWORDS = hk + + return hk + def _validate_query_templates(self): """NoOp validation for query templates @@ -215,6 +380,13 @@ def _validate_query_templates(self): runtime whether or not they will be valid. """ + def _validate_website_keywords(self): + """NoOp validation for website keywords + + Since keywords can be generated by LLM, we don't know until + runtime whether or not they will be valid. 
+ """ + register_plugin(SchemaBasedExtractionPlugin) @@ -225,38 +397,41 @@ def _collectors_from_config(config): if cp is True: schema_fp = _SCHEMA_DIR / "validate_chunk.json5" - class PluginCollector(SchemaBasedTextCollector): + class PluginTextCollector(SchemaBasedTextCollector): OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label SCHEMA = config["schema"] OUTPUT_SCHEMA = load_config(schema_fp) - return [PluginCollector] + return [PluginTextCollector] if cp: - class PluginCollector(PromptBasedTextCollector): + class PluginTextCollector(PromptBasedTextCollector): OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label PROMPTS = cp - return [PluginCollector] + return [PluginTextCollector] return [NoOpTextCollector] -def _extractors_from_config(config, in_label): +def _extractors_from_config(config, in_label, tech): """Create a TextExtractor subclass based on a config dict""" tep = config.get("text_extraction_prompts") if tep is True: - # TODO: When implementing this, don't forget to register the - # text output file name so it gets store in the - # cleaned outputs directory - msg = ( - "LLM-based text extraction not implemented yet. If you would like " - "to see this feature implemented, please submit an issue or, " - "better yet, a pull request!" - ) - raise NotImplementedError(msg) + schema_fp = _SCHEMA_DIR / "extract_text.json5" + + class PluginTextExtractor(SchemaBasedTextExtractor): + IN_LABEL = in_label + OUT_LABEL = "copied_relevant_text" + SCHEMA = config["schema"] + OUTPUT_SCHEMA = load_config(schema_fp) + + CLEANED_FP_REGISTRY.setdefault(tech.casefold(), {})[ + "copied_relevant_text" + ] = "Text for Extraction.txt" + return [PluginTextExtractor] if tep: @@ -290,83 +465,85 @@ class PluginParser(SchemaOrdinanceParser): return [PluginParser] -def _qt_from_cache(identifier, schema): - """Get cached query templates for a given schema if they exist""" - # cspell: disable-next-line - data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) - cache_fp = data_dir / "qt_cache.json" - if not cache_fp.exists(): - return None - - logger.debug("Loading query templates from cache at %s", cache_fp) - qt = json.loads(cache_fp.read_text(encoding="utf-8")) - if identifier.casefold() not in qt: - return None - - potential_qt = qt[identifier.casefold()] - m = hashlib.sha256() - m.update(str(schema).encode()) - if potential_qt.get("sha256") != m.hexdigest(): - return None - - templates = potential_qt["templates"] - logger.debug( - "Found query templates for %r in cache:\n%r", identifier, templates - ) - return templates - - -def _qt_to_cache(identifier, schema, qt): - """Cache generated query templates for future use""" - # cspell: disable-next-line - data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) - data_dir.mkdir(parents=True, exist_ok=True) - cache_fp = data_dir / "qt_cache.json" - if not cache_fp.exists(): - logger.debug( - "Cache file for query templates not found at %s. 
Creating new " - "cache with current query templates for %r", - cache_fp, - identifier, - ) - cache = { - identifier.casefold(): { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return - - logger.debug("Loading query templates from cache at %s", cache_fp) - cache = json.loads(cache_fp.read_text(encoding="utf-8")) - if identifier.casefold() not in cache: - logger.debug( - "Adding query templates for %r to cache at %s", - identifier, - cache_fp, +def _augment_website_keywords(keywords): + """Add URL-encoded variants for multi-word keywords""" + augmented = dict(keywords) + for keyword, score in list(augmented.items()): + if not isinstance(keyword, str): + continue + + if " " not in keyword: + continue + + encoded = keyword.replace(" ", "%20") + if encoded not in augmented: + augmented[encoded] = score + + plus_encoded = keyword.replace(" ", "+") + if plus_encoded not in augmented: + augmented[plus_encoded] = score + + return augmented + + +def _normalize_heuristic_keywords(raw): + """Normalize heuristic keyword lists into required structure""" + if not isinstance(raw, dict): + msg = "Heuristic keywords must be a dictionary of keyword lists." + raise COMPASSPluginConfigurationError(msg) + + expected_keys = { + "NOT_TECH_WORDS", + "GOOD_TECH_KEYWORDS", + "GOOD_TECH_ACRONYMS", + "GOOD_TECH_PHRASES", + } + + normalized = {} + for raw_key, value in raw.items(): + if not isinstance(raw_key, str): + msg = "Heuristic keyword keys must be strings." + raise COMPASSPluginConfigurationError(msg) + + target_key = ( + raw_key.strip().replace(" ", "_").replace("-", "_").upper() ) - cache[identifier.casefold()] = { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return - - potential_qt = cache[identifier.casefold()] - m = hashlib.sha256() - m.update(str(schema).encode()) - if potential_qt.get("sha256") == m.hexdigest(): - logger.debug( - "Query templates for %r already in cache and schema hash " - "matches, so not updating cache", - identifier, + if target_key not in expected_keys: + msg = f"Unexpected heuristic keyword list: {raw_key!r}." 
+ raise COMPASSPluginConfigurationError(msg) + + normalized[target_key] = _normalize_keyword_list(value) + + missing = expected_keys - set(normalized) + if missing: + msg = ( + f"Heuristic keywords are missing required lists: {sorted(missing)}" ) - return + raise COMPASSPluginConfigurationError(msg) - cache[identifier.casefold()] = { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return + empty = [key for key, value in normalized.items() if not value] + if empty: + msg = f"Heuristic keyword lists must not be empty: {sorted(empty)}" + raise COMPASSPluginConfigurationError(msg) + + return normalized + + +def _normalize_keyword_list(items): + """Normalize keyword list entries""" + normalized = set() + for item in items: + if not isinstance(item, str): + continue + + keyword = item.strip() + if not keyword: + continue + + keyword = keyword.casefold() + if keyword in normalized: + continue + + normalized.add(keyword) + + return list(normalized) diff --git a/compass/plugin/one_shot/cache.py b/compass/plugin/one_shot/cache.py new file mode 100644 index 000000000..b7e0579e1 --- /dev/null +++ b/compass/plugin/one_shot/cache.py @@ -0,0 +1,167 @@ +"""Schema-based cache for storing LLM-generated outputs""" + +import json +import logging +import hashlib +from pathlib import Path + +from platformdirs import user_data_dir + + +logger = logging.getLogger(__name__) +_CACHE_FP = "llm-generation_cache.json" +_SHA256_KEY = "sha256" + + +def key_from_cache(identifier, schema, key): + """[NOT PUBLIC API] Get cached value for key/schema combination + + Parameters + ---------- + identifier : str + A string identifier for the technology of the extraction schema + (e.g. "wind", "solar", "building_codes", etc.). + schema : dict + The extraction schema that is being used for the LLM-based + one-shot extraction. This is used to ensure that cached content + is only returned if the schema matches, which helps ensure + that cached content is relevant and accurate for the current + extraction task. + key : str + The specific key for the cached content to retrieve, (e.g. + "query_templates", "website_keywords", etc.). + + Returns + ------- + list or dict or None + The cached value for the specified key/schema combination, or + ``None`` if no valid cached value is found. + """ + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + cache_fp = data_dir / _CACHE_FP + cache = _load_cache(cache_fp) + + tech_cache = cache.get(identifier.casefold(), {}) + if not tech_cache: + logger.debug("Did not find cache for %r", identifier) + return None + + if tech_cache.get(_SHA256_KEY) != _schema_hash(schema): + logger.debug( + "Cache for %r exists but schema hash did not match", identifier + ) + return None + + out = tech_cache.get(key) + if not out: + logger.debug( + "Cache for %r exists and schema hash matches but no %r found", + identifier, + str(key), + ) + return None + + logger.debug("Found %r for %r in cache:\n%r", str(key), identifier, out) + return out + + +def key_to_cache(identifier, schema, key, value): + """[NOT PUBLIC API] Cache key/value for given schema/tech combo + + Parameters + ---------- + identifier : str + A string identifier for the technology of the extraction schema + (e.g. "wind", "solar", "building_codes", etc.). + schema : dict + The extraction schema that is being used for the LLM-based + one-shot extraction. 
This is used to ensure that cached content + is only returned if the schema matches, which helps ensure + that cached content is relevant and accurate for the current + extraction task. + key : str + The specific key for the cached content to retrieve, (e.g. + "query_templates", "website_keywords", etc.). + value : list or dict + The value to cache for the specified key/schema combination. + This should be the output of an LLM generation function that is + being cached for future reuse. The value should be + JSON-serializable since it will be stored in a JSON file on + disk. Examples of values include a list of query templates for + document retrieval, or a dictionary of website keywords and + their relevance weights for link crawling prioritization. The + value should be relevant to the technology and extraction task + specified by the schema, and should be generated based on the + content of the schema to ensure that it is useful and accurate + for future extractions using the same schema. + """ + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + data_dir.mkdir(parents=True, exist_ok=True) + cache_fp = data_dir / _CACHE_FP + + logger.debug("Loading %r from cache at %s", str(key), cache_fp) + cache = _load_cache(cache_fp) + schema_hash = _schema_hash(schema) + + if identifier.casefold() not in cache: + logger.debug( + "Adding %r for %r to cache at %s", + str(key), + identifier, + cache_fp, + ) + cache[identifier.casefold()] = {key: value, _SHA256_KEY: schema_hash} + _write_cache(cache_fp, cache) + return + + potential_qt = cache[identifier.casefold()] + if potential_qt.get(_SHA256_KEY) == schema_hash: + if key in potential_qt: + logger.debug( + "%r for %r already in cache and schema hash " + "matches, so not updating cache", + str(key), + identifier, + ) + return + + logger.debug( + "Schema hash matches but %r is missing. 
Updating cache for %r " + "at %s", + str(key), + identifier, + cache_fp, + ) + potential_qt[key] = value + _write_cache(cache_fp, cache) + return + + cache[identifier.casefold()] = {key: value, _SHA256_KEY: schema_hash} + _write_cache(cache_fp, cache) + + +def _load_cache(cache_fp): + """Load cache file contents as a dict""" + if not cache_fp.exists(): + return {} + + logger.debug("Loading LLM generation cache at %s", cache_fp) + return json.loads(cache_fp.read_text(encoding="utf-8")) + + +def _write_cache(cache_fp, cache): + """Write cache file contents to disk""" + cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") + + +def _schema_hash(schema): + """Get SHA256 hash of the schema for cache validation""" + m = hashlib.sha256() + m.update( + json.dumps(schema, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + ) + return m.hexdigest() diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 2c10b9665..6274725ab 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -1,12 +1,14 @@ """COMPASS extraction schema-based plugin component implementations""" +import asyncio import logging from abc import ABC, abstractmethod import pandas as pd +from elm import ApiBase from compass.llm.calling import SchemaOutputLLMCaller -from compass.plugin import BaseParser, BaseTextCollector +from compass.plugin import BaseParser, BaseTextCollector, BaseTextExtractor from compass.utilities.enums import LLMUsageCategory from compass.utilities.parsing import merge_overlapping_texts @@ -36,13 +38,65 @@ {text} Think before you answer.\ +""" +_TEXT_EXTRACTOR_SYSTEM_PROMPT = """\ +You are a text extraction assistant. Your job is to extract only verbatim, \ +**unmodified** excerpts from the provided text. Do not interpret or \ +paraphrase. Do not summarize. Only return exactly copied segments that match \ +the specified extraction scope/domain. If the relevant content appears within \ +a table, return the entire table, including headers and footers, exactly as \ +formatted.\ +""" +_TEXT_EXTRACTOR_MAIN_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information for the \ +domain relevant to the following extraction schema: + +{schema} + +The extracted text will be used for structured data extraction following this \ +schema, so it must be both **comprehensive** (retaining all relevant details) \ +and **focused** (excluding unrelated content), with **zero rewriting or \ +paraphrasing**. Ensure that all retained information is **directly +applicable** to the extraction task while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to the extraction schema domain** from the \ +provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Formatting & Structure ##: +- **Preserve _all_ section titles, headers, and numberings** for reference. +- **Maintain the original wording, formatting, and structure** to ensure \ +accuracy. + +2. ## Output Handling ##: +- This is a strict extraction task — act like a text filter, **not** a \ +summarizer or writer. +- Do not add, explain, reword, or summarize anything. +- The output must be a **copy-paste** of the original excerpt. **Absolutely \ +no paraphrasing or rewriting.** +- The output must consist **only** of contiguous or discontiguous verbatim \ +blocks copied from the input. +- The only allowed change is to remove irrelevant sections of text. 
You can \ +remove irrelevant text from within sections, but you cannot add any new text \ +or modify the text you keep in any way. +- If **no relevant text** is found, return null. + +# TEXT # + +{text} + """ _DATA_PARSER_MAIN_PROMPT = """\ Extract all {desc}features from the following text: {text} -Think before you answer""" +Think before you answer\ +""" _DATA_PARSER_SYSTEM_PROMPT = """\ You are a legal scholar extracting structured data from {desc}documents. \ Follow all instructions in the schema descriptions carefully.\ @@ -152,6 +206,80 @@ def _store_chunk(self, parser, chunk_ind): ) +class SchemaBasedTextExtractor(SchemaOutputLLMCaller, BaseTextExtractor): + """Schema-based text extractor""" + + @property + @abstractmethod + def SCHEMA(self): # noqa: N802 + """dict: Extraction schema""" + raise NotImplementedError + + @property + @abstractmethod + def OUTPUT_SCHEMA(self): # noqa: N802 + """dict: Validation output schema""" + raise NotImplementedError + + @property + def parsers(self): + """Iterable of parsers provided by this extractor + + Yields + ------ + name : str + Name describing the type of text output by the parser. + parser : callable + Async function that takes a ``text_chunks`` input and + outputs parsed text. + """ + yield self.OUT_LABEL, self._process + + async def _process(self, text_chunks): + """Perform extraction processing""" + + logger.info( + "Extracting summary text from %d text chunks asynchronously...", + len(text_chunks), + ) + outer_task_name = asyncio.current_task().get_name() + summaries = [ + asyncio.create_task( + self.call( + sys_msg=_TEXT_EXTRACTOR_SYSTEM_PROMPT, + content=_TEXT_EXTRACTOR_MAIN_PROMPT.format( + schema=self.SCHEMA, text=chunk + ), + response_format={ + "type": "json_schema", + "json_schema": { + "name": "text_extraction", + "strict": True, + "schema": self.OUTPUT_SCHEMA, + }, + }, + usage_sub_label=self._USAGE_LABEL, + ), + name=outer_task_name, + ) + for chunk in text_chunks + ] + summary_chunks = await asyncio.gather(*summaries) + summary_chunks = [ + chunk.get("domain_relevant_text") for chunk in summary_chunks + ] + + text_summary = merge_overlapping_texts(summary_chunks) + logger.debug( + "Final summary contains %d tokens", + ApiBase.count_tokens( + text_summary, + model=self.kwargs.get("model", "gpt-4"), + ), + ) + return text_summary + + class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser): """Base class for parsing structured data""" diff --git a/compass/plugin/one_shot/generators.py b/compass/plugin/one_shot/generators.py index 20b08ff69..a8fa50af6 100644 --- a/compass/plugin/one_shot/generators.py +++ b/compass/plugin/one_shot/generators.py @@ -1,5 +1,6 @@ """COMPASS one-shot extraction plugin generators""" +import operator import importlib.resources from elm.utilities.retry import async_retry_with_exponential_backoff @@ -49,6 +50,57 @@ news, or reports).\ """ +_KEYWORD_GENERATOR_SYSTEM_PROMPT = """\ +You are an expert search strategist for regulatory documents. \ +Goal: Given an extraction schema (JSON) for an ordinance domain, generate \ +high-quality website keywords and weights for prioritizing crawl links. + +Input: +- schema_json: a JSON schema describing features/requirements to extract. + +Output: +- Produce an array of keyword/weight objects with integer weights. +- Do not include extra keys or any markdown. + +Guidelines: +- Derive terms from the schema title/description, feature names, and \ +definitions. Prefer official/legal terminology in the schema. 
+- Focus on keywords likely to appear in legal document URLs or link text. +- Include terms that indicate governing document types \ +(e.g., "ordinance", "zoning", "code", "regulations", "chapter", "section"). +- Include domain-specific synonyms and abbreviations present in the schema. +- Weights are relative: higher means more relevant for link prioritization. +- Avoid jurisdiction-specific entities. +""" + +_HEURISTIC_GENERATOR_SYSTEM_PROMPT = """\ +You are an expert in ordinance discovery and regulatory text filtering. \ +Goal: Given an extraction schema (JSON) for an ordinance domain, generate \ +keyword lists for a heuristic text check that detects domain-relevant \ +content and excludes look-alike words. + +Input: +- schema_json: a JSON schema describing features/requirements to extract. + +Output: +- Provide four keyword lists in the response schema. +- Do not include extra keys or any markdown. + +Guidelines: +- Derive terms from schema title/description, feature names, and \ +definitions. Prefer official/legal terminology and abbreviations. +- not_tech_words should include common look-alikes or near matches that \ +appear in non-domain contexts and could cause false positives. These will \ +be removed from the text before performing a keyword-based relevance check. \ +- good_tech_keywords should include single-word indicators likely to \ +appear in ordinance text. +- good_tech_acronyms should include short acronyms and abbreviations used \ +in legal documents for the domain. +- good_tech_phrases should include multi-word phrases (at least 2 words) \ +that indicate domain relevance. +- Avoid jurisdiction-specific names; keep keywords general. +""" + @async_retry_with_exponential_backoff( base_delay=1, @@ -129,6 +181,175 @@ async def generate_query_templates( return out +@async_retry_with_exponential_backoff( + base_delay=1, + exponential_base=4, + jitter=True, + max_retries=3, + errors=(COMPASSRuntimeError,), +) +async def generate_website_keywords( + schema_llm, extraction_schema, add_think_prompt=True +): + """Generate website keyword weights for document retrieval + + Parameters + ---------- + schema_llm : SchemaOutputLLMCaller + A LLM caller configured to output structured data according to a + provided schema. This function relies on the LLM to generate the + keyword weights, so the quality of the generated keywords will + depend on the capabilities of the LLM being used and how well it + can interpret the provided extraction schema. Highly recommended + to use the most powerful/capable instruction-tuned model for + this function. + extraction_schema : dict + A dictionary representing the schema of the desired extraction + task. The keywords will be generated based on the content of + this schema, so it should be as detailed and specific as + possible, and should include domain-specific terminology if + applicable. See the wind ordinance schema for an example. + add_think_prompt : bool, optional + Option to add a "Think before you answer" instruction to the end + of the prompt (useful for thinking models). + By default, ``True``. + + Returns + ------- + dict + Dictionary mapping keywords to integer weights for website link + prioritization. + + Raises + ------ + COMPASSRuntimeError + If the LLM fails to return any valid keyword weights after 3 + attempts. 
+ """ + + keyword_schema_fp = _SCHEMA_DIR / "website_keywords.json5" + keyword_schema = load_config(keyword_schema_fp) + main_prompt = ( + "Generate website keyword weights for the following extraction " + f"schema:\n\n{extraction_schema}" + ) + if add_think_prompt: + main_prompt = f"{main_prompt}\n\nThink before you answer" + + response = await schema_llm.call( + sys_msg=_KEYWORD_GENERATOR_SYSTEM_PROMPT, + content=main_prompt, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "website_keyword_generation", + "strict": True, + "schema": keyword_schema, + }, + }, + usage_sub_label=LLMUsageCategory.PLUGIN_GENERATION, + ) + out = _normalize_website_keywords(response.get("keywords")) + if not out: + msg = ( + "LLM did not return any valid website keywords. " + f"Received response: {response}" + ) + raise COMPASSRuntimeError(msg) + + return out + + +@async_retry_with_exponential_backoff( + base_delay=1, + exponential_base=4, + jitter=True, + max_retries=3, + errors=(COMPASSRuntimeError,), +) +async def generate_heuristic_keywords( + schema_llm, extraction_schema, add_think_prompt=True +): + """Generate keyword lists for a heuristic text check + + Parameters + ---------- + schema_llm : SchemaOutputLLMCaller + A LLM caller configured to output structured data according to a + provided schema. This function relies on the LLM to generate the + heuristic keyword lists, so the quality of the generated output + will depend on the capabilities of the LLM being used and how + well it can interpret the provided extraction schema. + extraction_schema : dict + A dictionary representing the schema of the desired extraction + task. The keyword lists will be generated based on the content + of this schema, so it should be as detailed and specific as + possible, and should include domain-specific terminology if + applicable. See the wind ordinance schema for an example. + add_think_prompt : bool, optional + Option to add a "Think before you answer" instruction to the end + of the prompt (useful for thinking models). + By default, ``True``. + + Returns + ------- + dict + Dictionary containing the keyword lists for a heuristic text + check: ``not_tech_words``, ``good_tech_keywords``, + ``good_tech_acronyms``, and ``good_tech_phrases``. + + Raises + ------ + COMPASSRuntimeError + If the LLM fails to return any valid heuristic keywords after 3 + attempts. + """ + + heuristic_schema_fp = _SCHEMA_DIR / "heuristic_keywords.json5" + heuristic_schema = load_config(heuristic_schema_fp) + main_prompt = ( + "Generate heuristic keyword lists for the following extraction " + f"schema:\n\n{extraction_schema}" + ) + if add_think_prompt: + main_prompt = f"{main_prompt}\n\nThink before you answer" + + response = await schema_llm.call( + sys_msg=_HEURISTIC_GENERATOR_SYSTEM_PROMPT, + content=main_prompt, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "heuristic_keyword_generation", + "strict": True, + "schema": heuristic_schema, + }, + }, + usage_sub_label=LLMUsageCategory.PLUGIN_GENERATION, + ) + + if not response: + msg = ( + "LLM did not return any heuristic keywords. 
" + f"Received response: {response}" + ) + raise COMPASSRuntimeError(msg) + + return response + + +def _normalize_website_keywords(raw): + """Normalize keyword weights into a deduplicated dict""" + if not raw: + return {} + + items = _parse_llm_kw_to_list(raw) + if not items: + return {} + + return _de_duplicate_keywords(items) + + def _is_formattable(q): """True if the query template is formattable with a jurisdiction""" try: @@ -137,3 +358,38 @@ def _is_formattable(q): return False return True + + +def _parse_llm_kw_to_list(llm_kw): + """Parse LLM output into a list of (keyword, weight) tuples""" + items = [] + for item in llm_kw: + if isinstance(item, str): + items.append((item, 1)) + elif isinstance(item, dict): + items.append((item.get("keyword"), item.get("weight", 1))) + return items + + +def _de_duplicate_keywords(items): + """Process keywords by normalizing and keeping the highest weight""" + deduped = {} + sorted_items = sorted(items, key=operator.itemgetter(1), reverse=True) + for keyword, weight in sorted_items: + if not isinstance(keyword, str): + continue + + normalized = keyword.strip().casefold() + if not normalized or normalized.isdigit(): + continue + try: + int_weight = int(weight) + except (TypeError, ValueError): + continue + + if int_weight < 1: + continue + + deduped.setdefault(normalized, int_weight) + + return deduped diff --git a/compass/plugin/one_shot/schemas/extract_text.json5 b/compass/plugin/one_shot/schemas/extract_text.json5 new file mode 100644 index 000000000..fef3f9f7a --- /dev/null +++ b/compass/plugin/one_shot/schemas/extract_text.json5 @@ -0,0 +1,12 @@ +{ + "type": "object", + "description": "Response containing all relevant text extracted from the input chunk based on the extraction schema/domain. The extracted text should be both **comprehensive and focused** in order to maximize extraction accuracy. The output should be **verbatim text copied from the input chunk, without any paraphrasing or rewriting**. Only text irrelevant to the extraction should be dropped. If no relevant text is found, the LLM should return null.", + "additionalProperties": false, + "required": ["domain_relevant_text"], + "properties": { + "domain_relevant_text": { + "type": ["string", "null"], + "description": "The text extracted from the input chunk that is relevant to the extraction schema/domain. This should be **verbatim text copied from the input chunk, without any paraphrasing or rewriting, but possibly with some irrelevant text removed**. 
If no relevant text is found, this field should be null.", + } + }, +} \ No newline at end of file diff --git a/compass/plugin/one_shot/schemas/heuristic_keywords.json5 b/compass/plugin/one_shot/schemas/heuristic_keywords.json5 new file mode 100644 index 000000000..1f8f7ecbc --- /dev/null +++ b/compass/plugin/one_shot/schemas/heuristic_keywords.json5 @@ -0,0 +1,181 @@ +{ + "title": "Heuristic Keyword Lists", + "description": "Schema for LLM-generated heuristic keyword lists used to filter ordinance text.", + "type": "object", + "additionalProperties": false, + "required": [ + "not_tech_words", + "good_tech_keywords", + "good_tech_acronyms", + "good_tech_phrases" + ], + "properties": { + "not_tech_words": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_keywords": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_acronyms": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_phrases": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "$descriptions": { + "general": [ + "Return ONLY the fields allowed by this schema.", + "Provide four arrays of strings.", + "Derive terms from the extraction schema (title, descriptions, features).", + "Avoid jurisdiction-specific names; keep keywords general.", + ], + "not_tech_words": [ + "Include common look-alike words or near matches that are not the target domain.", + "Prefer words/phrases likely to cause false positives for a keyword-based lookup.", + "Do not include empty strings or whitespace-only entries.", + "Include as many distinct entries as possible (at least 3, ideally 10-20).", + ], + "good_tech_keywords": [ + "Include single-word indicators likely to appear in ordinance text.", + "Prefer official/legal terminology and abbreviations.", + "Avoid numbers-only keywords.", + "Limit this to 3-5 of the most relevant keywords.", + "Do not include extraction fields as keywords unless they are highly specific to the domain.", + ], + "good_tech_acronyms": [ + "Include short acronyms or abbreviations used in legal documents.", + "Avoid adding acronyms that are ambiguous outside the domain.", + "Limit this to 5-10 of the most relevant acronyms.", + "Do not include extraction fields or units as acronyms unless they are highly specific to the domain.", + ], + "good_tech_phrases": [ + "Include multi-word phrases (at least 2 words) that indicate domain relevance.", + "Avoid near-duplicate phrases or trivial variants.", + "Limit this to 5-10 of the most relevant phrases.", + "Do not include extraction fields as phrases unless they are highly specific to the domain.", + ], + "quality_checks": [ + "Avoid duplicate strings across lists.", + "Use lower-case strings when possible.", + ] + }, + "$examples": [ + { + "not_tech_words": [ + "micro wecs", + "small wecs", + "mini wecs", + "private wecs", + "personal wecs", + "pwecs", + "rewind", + "small wind", + "micro wind", + "mini wind", + "private wind", + "personal wind", + "swecs", + "windbreak", + "windiest", + "winds", + "windshield", + "window", + "windy", + "wind attribute", + "wind blow", + "wind break", + "wind current", + "wind damage", + "wind data", + "wind direction", + "wind draft", + "wind erosion", + "wind energy resource atlas", + "wind load", + "wind movement", + "wind orient", + "wind resource", + "wind runway", + "prevailing wind", + "downwind", + ], + "good_tech_keywords": [ + "wind", + "setback", + "turbine", + ], + "good_tech_acronyms": [ + "wecs", + "wes", + "lwet", + "uwet", + "wef" + ], + 
"good_tech_phrases": [ + "wind energy conversion", + "wind turbine", + "wind tower", + "wind farm", + "wind energy system", + "wind energy farm", + "utility wind energy system", + ] + }, + { + "not_tech_words": [ + "concentrated solar", + "csp", + "micro secs", + "small secs", + "mini secs", + "private secs", + "personal secs", + "psecs", + "solaris", + "small solar", + "micro solar", + "mini solar", + "private solar", + "personal solar", + "swecs", + "solar break", + "solar damage", + "solar data", + "solar resource", + ], + "good_tech_keywords": [ + "solar", + "setback", + "photovoltaic", + ], + "good_tech_acronyms": [ + "secs", + "sef", + "ses", + "cses" + ], + "good_tech_phrases": [ + "commercial solar energy system", + "solar energy conversion", + "solar energy system", + "solar panel", + "solar farm", + "solar energy farm", + "utility solar energy system", + ] + } + ] +} diff --git a/compass/plugin/one_shot/schemas/website_keywords.json5 b/compass/plugin/one_shot/schemas/website_keywords.json5 new file mode 100644 index 000000000..6f2d023af --- /dev/null +++ b/compass/plugin/one_shot/schemas/website_keywords.json5 @@ -0,0 +1,91 @@ +{ + "title": "Website Keyword Weights", + "description": "Schema for LLM-generated website keyword weights used to score crawl links.", + "type": "object", + "additionalProperties": false, + "required": ["keywords"], + "properties": { + "keywords": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["keyword", "weight"], + "properties": { + "keyword": { + "type": "string" + }, + "weight": { + "type": "integer", + "minimum": 1 + } + } + } + } + }, + "$descriptions": { + "general": [ + "Return ONLY the fields allowed by this schema.", + "Provide a single array of keyword/weight objects.", + "Each item must include \"keyword\" and \"weight\".", + "Weights are relative: higher means more relevant to crawl scoring.", + "Derive keywords from the extraction schema (title, descriptions, features).", + "Focus on legal document types and domain terminology, not news." + ], + "keyword_content": [ + "Include ordinance/code/zoning terminology if relevant.", + "Include domain-specific abbreviations or acronyms from the schema.", + "Prefer terms likely to appear in URLs or link text.", + "Do not include empty strings or whitespace-only keywords.", + "Avoid repeating the same keyword with different casing.", + "Avoid numbers-only keywords.", + "Avoid using specific extraction fields as keywords.", + "Avoid jurisdiction-specific names; keep keywords general." + ], + "quality_checks": [ + "Include at least 10 distinct keywords.", + "Avoid near-duplicate keywords or trivial variants.", + "Each keyword must be unique.", + "Use integer weights only." 
+ ] + }, + "$examples": [ + { + "keywords": [ + {"keyword": "pdf", "weight": 92160}, + {"keyword": "secs", "weight": 46080}, + {"keyword": "solar", "weight": 23040}, + {"keyword": "zoning", "weight": 11520}, + {"keyword": "ordinance", "weight": 5760}, + {"keyword": "renewable energy", "weight": 1440}, + {"keyword": "planning", "weight": 720}, + {"keyword": "plan", "weight": 360}, + {"keyword": "government", "weight": 180}, + {"keyword": "code", "weight": 60}, + {"keyword": "area", "weight": 60}, + {"keyword": "land development", "weight": 15}, + {"keyword": "land", "weight": 3}, + {"keyword": "environment", "weight": 3}, + {"keyword": "energy", "weight": 3}, + {"keyword": "renewable", "weight": 3}, + {"keyword": "municipal", "weight": 1}, + {"keyword": "department", "weight": 1} + ] + }, + { + "keywords": [ + {"keyword": "pdf", "weight": 15000}, + {"keyword": "wind energy", "weight": 12000}, + {"keyword": "wecs", "weight": 11000}, + {"keyword": "wind turbine", "weight": 10000}, + {"keyword": "ordinance", "weight": 9000}, + {"keyword": "regulation", "weight": 7500}, + {"keyword": "zoning", "weight": 6000}, + {"keyword": "code", "weight": 3000}, + {"keyword": "permit", "weight": 1500}, + {"keyword": "land use", "weight": 1200}, + {"keyword": "planning", "weight": 800} + ] + } + ] +} diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index f26e2d17e..92d233e9f 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -13,6 +13,7 @@ from elm import ApiBase from compass.llm.calling import ( + LLMCaller, BaseLLMCaller, ChatLLMCaller, JSONFromTextLLMCaller, @@ -57,7 +58,7 @@ } -class BaseTextExtractor(ABC): +class BaseTextExtractor(BaseLLMCaller, ABC): """Extract succinct extraction text from input""" TASK_DESCRIPTION = "Condensing text for extraction" @@ -66,6 +67,8 @@ class BaseTextExtractor(ABC): TASK_ID = "text_extraction" """ID to use for this extraction for linking with LLM configs""" + _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY + @property @abstractmethod def IN_LABEL(self): # noqa: N802 @@ -389,7 +392,7 @@ def _store_chunk(self, parser, chunk_ind): ) -class PromptBasedTextExtractor(BaseTextExtractor, ABC): +class PromptBasedTextExtractor(LLMCaller, BaseTextExtractor, ABC): """Text extractor based on a chain of prompts""" SYSTEM_MESSAGE = ( @@ -434,6 +437,9 @@ class PromptBasedTextExtractor(BaseTextExtractor, ABC): **Absolutely no paraphrasing or rewriting.** - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - The only allowed change is to remove irrelevant sections of text. + You can remove irrelevant text from within sections, but you cannot + add any new text or modify the text you keep in any way. - If **no relevant text** is found, return the response: 'No relevant text.' """ @@ -443,8 +449,6 @@ class PromptBasedTextExtractor(BaseTextExtractor, ABC): ) """Prompt component instructing model output guidelines""" - _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY - @property @abstractmethod def PROMPTS(self): # noqa: N802 @@ -502,16 +506,6 @@ def __init_subclass__(cls, **kwargs): last_index = len(cls.PROMPTS) - 1 cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") - def __init__(self, llm_caller): - """ - - Parameters - ---------- - llm_caller : LLMCaller - LLM Caller instance used to extract ordinance info with. 
- """ - self.llm_caller = llm_caller - @property def parsers(self): """Iterable of parsers provided by this extractor @@ -545,7 +539,7 @@ async def _process(self, text_chunks, instructions, is_valid_chunk=None): outer_task_name = asyncio.current_task().get_name() summaries = [ asyncio.create_task( - self.llm_caller.call( + self.call( sys_msg=self.SYSTEM_MESSAGE, content=f"{instructions}\n\n# TEXT #\n\n{chunk}", usage_sub_label=self._USAGE_LABEL, @@ -565,8 +559,7 @@ async def _process(self, text_chunks, instructions, is_valid_chunk=None): logger.debug( "Final summary contains %d tokens", ApiBase.count_tokens( - text_summary, - model=self.llm_caller.kwargs.get("model", "gpt-4"), + text_summary, model=self.kwargs.get("model", "gpt-4") ), ) return text_summary diff --git a/compass/services/threaded.py b/compass/services/threaded.py index bc5f146ff..0632ac31f 100644 --- a/compass/services/threaded.py +++ b/compass/services/threaded.py @@ -542,12 +542,13 @@ def _dump_jurisdiction_info( def _compile_doc_info(doc): """Put together meta information about a single document""" year, month, day = doc.attrs.get("date") or (None, None, None) + out_fp = doc.attrs.get("source_fp", doc.attrs.get("out_fp")) return { "source": doc.attrs.get("source"), "effective_year": year if year is not None and year > 0 else None, "effective_month": month if month is not None and month > 0 else None, "effective_day": day if day is not None and day > 0 else None, - "ord_filename": Path(doc.attrs.get("out_fp") or "unknown").name, + "ord_filename": Path(out_fp or "unknown").name, "num_pages": len(doc.pages), "checksum": doc.attrs.get("checksum"), "is_pdf": isinstance(doc, PDFDocument), diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index c90e05ee5..6efcad8dd 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -10,4 +10,5 @@ get started with ``COMPASS``: execution_basics/README one_shot_schema_extraction/README + parse_existing_docs/CLI/README parse_existing_docs/code/README diff --git a/examples/one_shot_schema_extraction/README.rst b/examples/one_shot_schema_extraction/README.rst index a89770c4c..d44d07351 100644 --- a/examples/one_shot_schema_extraction/README.rst +++ b/examples/one_shot_schema_extraction/README.rst @@ -145,11 +145,11 @@ The key options are listed below: - ``website_keywords``: Keyword weights for document search prioritization. - ``collection_prompts``: Prompt list for chunk filtering, or ``true`` to auto-generate. - ``text_extraction_prompts``: Prompt list for text consolidation, or ``true`` to auto-generate. -- ``cache_query_templates``: Cache generated query templates and keywords. By default, ``true``. +- ``cache_llm_generated_content``: Cache LLM-generated query templates and keywords. By default, ``true``. - ``extraction_system_prompt``: Optional system prompt override for extraction. -See `this documentation `_ +See `this documentation `_ for further details. 
If you want full control over all of the options above, you can specify them directly in the config diff --git a/examples/one_shot_schema_extraction/plugin_config.yaml b/examples/one_shot_schema_extraction/plugin_config.yaml index 4da8700c5..e5cda9287 100644 --- a/examples/one_shot_schema_extraction/plugin_config.yaml +++ b/examples/one_shot_schema_extraction/plugin_config.yaml @@ -15,16 +15,12 @@ website_keywords: wind: 23040 zoning: 11520 ordinance: 5760 - renewable%20energy: 1440 - renewable+energy: 1440 renewable energy: 1440 planning: 720 plan: 360 government: 180 code: 60 area: 60 - land%20development: 15 - land+development: 15 land development: 15 land: 3 environment: 3 @@ -33,6 +29,62 @@ website_keywords: municipal: 1 department: 1 +heuristic_keywords: + good_tech_keywords: + - "wind" + - "setback" + good_tech_acronyms: + - "wecs" + - "wes" + - "lwet" + - "uwet" + - "wef" + good_tech_phrases: + - "wind energy conversion" + - "wind turbine" + - "wind tower" + - "wind farm" + - "wind energy system" + - "wind energy farm" + - "utility wind energy system" + not_tech_words: + - "micro wecs" + - "small wecs" + - "mini wecs" + - "private wecs" + - "personal wecs" + - "pwecs" + - "rewind" + - "small wind" + - "micro wind" + - "mini wind" + - "private wind" + - "personal wind" + - "swecs" + - "windbreak" + - "windiest" + - "winds" + - "windshield" + - "window" + - "windy" + - "wind attribute" + - "wind blow" + - "wind break" + - "wind current" + - "wind damage" + - "wind data" + - "wind direction" + - "wind draft" + - "wind erosion" + - "wind energy resource atlas" + - "wind load" + - "wind movement" + - "wind orient" + - "wind resource" + - "wind runway" + - "prevailing wind" + - "downwind" + collection_prompts: - key: contains_ord_info label: contains ordinance info @@ -64,15 +116,15 @@ text_extraction_prompts: - Do **not** include text that does not pertain to wind energy systems. 3. ## Formatting & Structure ##: - - **Preserve _all_ section titles, headers, and numberings** for reference. - - **Maintain the original wording, formatting, and structure** to ensure accuracy. + - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. 4. ## Output Handling ##: - - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. - Do not add, explain, reword, or summarize anything. - - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** - - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. - - If **no relevant text** is found, return the response: 'No relevant text.' + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' - key: cleaned_text_for_extraction out_fn: "{jurisdiction} Utility Scale Wind Ordinance.txt" @@ -103,14 +155,14 @@ text_extraction_prompts: - Do **not** include text that does not pertain at all to wind energy systems. 3. ## Formatting & Structure ##: - - **Preserve _all_ section titles, headers, and numberings** for reference. - - **Maintain the original wording, formatting, and structure** to ensure accuracy. 
+ - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. 4. ## Output Handling ##: - - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. - Do not add, explain, reword, or summarize anything. - - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** - - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. - - If **no relevant text** is found, return the response: 'No relevant text.' + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' extraction_system_prompt: "You are a legal scholar extracting structured data from wind energy ordinances. Follow all instructions in the schema descriptions carefully. Only extract requirements for large, commercial, utility-scale wind energy systems." diff --git a/examples/one_shot_schema_extraction/plugin_config_simple.json5 b/examples/one_shot_schema_extraction/plugin_config_simple.json5 index 92b4e9260..9295dcc48 100644 --- a/examples/one_shot_schema_extraction/plugin_config_simple.json5 +++ b/examples/one_shot_schema_extraction/plugin_config_simple.json5 @@ -2,10 +2,17 @@ // Always required for one-shot schema extraction plugins "schema": "./wind_schema.json", - // The default value for ``cache_query_templates`` is ``true``, - // but we include it here anyway for completeness and to - // demonstrate that it can be set to ``false`` if desired. - "cache_query_templates": true, + // The default value for ``cache_llm_generated_content`` is + // ``true``, but we include it here anyway for completeness + // and to demonstrate that it can be set to ``false`` if desired. + "cache_llm_generated_content": true, + + // By setting this option to ``true``, we indicate that we would + // like a keyword-based heuristic to be applied, but would like + // to use the LLM to generate heuristic keywords based on the + // extraction schema (instead of providing custom heuristic + // keywords). + "heuristic_keywords": true, // By setting this option to ``true``, we indicate that we would // like a text collection (filter) step, but would like to simply diff --git a/examples/parse_existing_docs/CLI/README.rst b/examples/parse_existing_docs/CLI/README.rst new file mode 100644 index 000000000..77224df3d --- /dev/null +++ b/examples/parse_existing_docs/CLI/README.rst @@ -0,0 +1,91 @@ +********************************* +Parsing Existing Docs via the CLI +********************************* + +If you already have documents that you want to run data extraction on, +you can skip web search and run COMPASS directly against local files. +This example shows the minimal CLI setup for processing local documents. + +Prerequisites +============= +Be sure to go over the +`COMPASS Execution Basics `_ +to understand how to set up a run environment and model run configuration. +You will be re-using the same execution pattern here with an added input to +point COMPASS to your local files. 
+
+Compile Document Info
+=====================
+The key to running COMPASS against local files is compiling information
+about those documents in a form that COMPASS can consume. To do this, we
+need to generate a mapping of jurisdiction codes to lists of document
+metadata dicts, where each dict contains, at minimum, a ``source_fp`` key
+that points to the local file path.
+
+For example, a minimal local document specification would look like this:
+
+.. literalinclude:: local_docs_minimal.json5
+   :language: json5
+
+This mapping can be saved as a config file using any of the formats
+supported by COMPASS (JSON, JSON5, YAML, or TOML).
+
+Since we didn't include any additional metadata beyond the required
+``source_fp``, COMPASS will perform all of the same document processing
+steps that a document retrieved via search would go through, including
+legal text validation and date extraction. To skip some or all of these
+steps, you can include additional metadata fields in the document dicts
+as described in the
+`COMPASS documentation `_.
+Below is an example of a more fully specified document mapping that
+includes multiple documents, each with additional metadata fields to
+skip certain processing steps:
+
+.. literalinclude:: local_docs.json5
+   :language: json5
+
+If you have many local documents, you can also build this mapping
+programmatically; see the sketch at the end of this README.
+
+
+Updating COMPASS Run Config
+===========================
+Once the local document mapping is compiled, you can point COMPASS to it via
+the main run config. You will also need to disable search so that COMPASS
+doesn't attempt to retrieve documents from the web in addition to processing
+your local files. The rest of the config can be set up as a typical COMPASS
+run config with ``out_dir``, ``tech``, and any other relevant settings. Below
+is a simple example:
+
+.. literalinclude:: config.json5
+   :language: json5
+
+.. NOTE::
+   If you are not sure whether your local docs contain the relevant information
+   to be extracted, you can leave the web search enabled and COMPASS will
+   fall back to a web search if no structured data is extracted from the
+   local documents.
+
+Of course, your jurisdiction CSV should still list the jurisdictions you
+would like to process:
+
+.. literalinclude:: jurisdictions.csv
+   :language: text
+
+In this way, you can build up a corpus of local docs, point your config to the
+document mapping, and only ever process the jurisdiction(s) you are interested in.
+
+
+Running COMPASS
+===============
+Once everything is configured, you can execute a model run as described in the
+`COMPASS Execution Basics `_:
+
+.. code-block:: shell
+
+    compass process -c config.json5
+
+If you are using ``pixi``:
+
+.. code-block:: shell
+
+    pixi run compass process -c config.json5
+
+Outputs are written under ``./outputs`` by default.
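+
+
+Generating the Document Mapping Programmatically
+==================================================
+If you have more than a handful of local documents, writing the mapping by
+hand gets tedious. The short sketch below shows one way to build it with
+plain Python. Treat it as an illustration only: the jurisdiction code, the
+directory layout, and the output filename are assumptions made for this
+example, and the ``checksum`` field is optional metadata rather than
+something COMPASS requires.
+
+.. code-block:: python
+
+    import json
+    import hashlib
+    from pathlib import Path
+
+    def sha256_of(path):
+        """Return a 'sha256:<hex>' digest string for a local file."""
+        digest = hashlib.sha256(Path(path).read_bytes()).hexdigest()
+        return f"sha256:{digest}"
+
+    # Map each jurisdiction code to the local PDFs you have for it;
+    # "18031" (Decatur County, IN) is simply the code used in this example
+    docs_on_disk = {"18031": [Path("../Decatur County, Indiana.pdf")]}
+
+    mapping = {
+        code: [
+            {
+                "source_fp": str(fp),
+                # Optional metadata; include it if you want it recorded
+                # alongside the run output
+                "checksum": sha256_of(fp),
+            }
+            for fp in paths
+        ]
+        for code, paths in docs_on_disk.items()
+    }
+
+    # COMPASS accepts JSON, JSON5, YAML, or TOML for this mapping; plain
+    # JSON is the easiest to write from Python
+    Path("local_docs.json").write_text(json.dumps(mapping, indent=2))
+
+Point ``known_local_docs`` in your run config at the file this writes, just as
+shown above for ``local_docs.json5``.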
diff --git a/examples/parse_existing_docs/CLI/config.json5 b/examples/parse_existing_docs/CLI/config.json5 new file mode 100644 index 000000000..d9f0796e2 --- /dev/null +++ b/examples/parse_existing_docs/CLI/config.json5 @@ -0,0 +1,13 @@ +{ + // Same as a typical COMPASS config + "out_dir": "./outputs", + "jurisdiction_fp": "./jurisdictions.csv", + "tech": "wind", + + // NEW: Point to local docs mapping + "known_local_docs": "./local_docs.json5", + + // NEW: Disable web search since we already have local docs + "perform_se_search": false, + "perform_website_search": false +} diff --git a/examples/parse_existing_docs/CLI/jurisdictions.csv b/examples/parse_existing_docs/CLI/jurisdictions.csv new file mode 100644 index 000000000..509b1423a --- /dev/null +++ b/examples/parse_existing_docs/CLI/jurisdictions.csv @@ -0,0 +1,3 @@ +County,State +Decatur,Indiana +Franklin,Indiana diff --git a/examples/parse_existing_docs/CLI/local_docs.json5 b/examples/parse_existing_docs/CLI/local_docs.json5 new file mode 100755 index 000000000..816dcf26a --- /dev/null +++ b/examples/parse_existing_docs/CLI/local_docs.json5 @@ -0,0 +1,25 @@ +{ + "18031": [ + { + "source_fp": "../Decatur County, Indiana.pdf", + "source": "https://decaturcounty.in.gov/download/zoning-ordinance-article-13-wind-energy-conversion-system-wecs?refresh=68ffda0d84a6e1761597965&wpdmdl=6638", + "date": [null, null, null], // [year, month, day] - Skips date extraction if given + "check_if_legal_doc": false, // Skip legal doc check + + // Optional metadata fields - not required but can be helpful for metadata in the run output + "checksum": "sha256:1f68616ac8c4f26ca6cacf85023f210f7a453c002ca9159eb42252470b503386", + "from_ocr": false, + }, + ], + "18047": [ + { + "source_fp": "../Franklin County, Indiana.pdf", + "source": "https://www.franklincounty.in.gov/wp-content/uploads/2023/05/80.06.06-Commercial-and-Intermediate-Energy-Systems.pdf", + "date": [2023, 5, null], // Same as above... + "check_if_legal_doc": false, + + "checksum": "sha256:6ff5f90301ffba6ac4a8dd4d629201fe7f5cbffa7c5ae6fc8951e978d11be1fa", + "from_ocr": false, + } + ], +} \ No newline at end of file diff --git a/examples/parse_existing_docs/CLI/local_docs_minimal.json5 b/examples/parse_existing_docs/CLI/local_docs_minimal.json5 new file mode 100644 index 000000000..6a2becf83 --- /dev/null +++ b/examples/parse_existing_docs/CLI/local_docs_minimal.json5 @@ -0,0 +1,7 @@ +{ + "18031": [ + { + "source_fp": "../Decatur County, Indiana.pdf" + } + ] +} diff --git a/examples/parse_existing_docs/Franklin County, Indiana.pdf b/examples/parse_existing_docs/Franklin County, Indiana.pdf new file mode 100644 index 000000000..043719188 Binary files /dev/null and b/examples/parse_existing_docs/Franklin County, Indiana.pdf differ