diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index c569d144c..63deb52c0 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -359,8 +359,9 @@ async def _extract_with_ngram_check( if not cleaned_text: logger.debug( "No cleaned text found after extraction on attempt %d " - "for document with source %s. Retrying...", + "of %d for document with source %s. Retrying...", attempt, + num_tries, source, ) continue @@ -371,9 +372,10 @@ async def _extract_with_ngram_check( if ngram_frac >= ngram_thresh: logger.debug( "Document extraction for %r passed ngram check on attempt %d " - "with score %.2f (OCR: %r; Document source: %s)", + "of %d with score %.2f (OCR: %r; Document source: %s)", out_text_key, - attempt + 1, + attempt, + num_tries, ngram_frac, doc_is_from_ocr, source, @@ -384,10 +386,11 @@ async def _extract_with_ngram_check( best_score = max(best_score, ngram_frac) logger.debug( - "Document extraction for %r failed ngram check on attempt %d " - "with score %.2f (OCR: %r; Document source: %s). Retrying...", + "Document extraction for %r failed ngram check on attempt %d of " + "%d, with score %.2f (OCR: %r; Document source: %s). Retrying...", out_text_key, - attempt + 1, + attempt, + num_tries, ngram_frac, doc_is_from_ocr, source, diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 5e4fd26c2..52af3dcba 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from compass.plugin.base import BaseExtractionPlugin -from compass.llm.calling import BaseLLMCaller, LLMCaller +from compass.llm.calling import BaseLLMCaller from compass.extraction import extract_relevant_text_with_ngram_validation from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CLEANED_FP_REGISTRY, CleanedFileWriter @@ -238,12 +238,11 @@ async def extract_relevant_text(self, doc, extractor_class, model_config): model_config : LLMConfig Configuration for the LLM model to use for text extraction. """ - llm_caller = LLMCaller( + extractor = extractor_class( llm_service=model_config.llm_service, usage_tracker=self.usage_tracker, **model_config.llm_call_kwargs, ) - extractor = extractor_class(llm_caller) doc = await extract_relevant_text_with_ngram_validation( doc, model_config.text_splitter, @@ -316,11 +315,12 @@ async def filter_docs( ), ) + heuristic = await self.get_heuristic() docs = await filter_ordinance_docs( docs, self.jurisdiction, self.model_configs, - heuristic=self.HEURISTIC(), + heuristic=heuristic, tech=self.IDENTIFIER, text_collectors=self.TEXT_COLLECTORS, usage_tracker=self.usage_tracker, diff --git a/compass/plugin/noop.py b/compass/plugin/noop.py index 7d7ede507..bdf44629e 100644 --- a/compass/plugin/noop.py +++ b/compass/plugin/noop.py @@ -76,16 +76,6 @@ def _store_chunk(self, parser, chunk_ind): class NoOpTextExtractor(BaseTextExtractor): """NoOp text extractor that returns the full text""" - def __init__(self, llm_caller): - """ - - Parameters - ---------- - llm_caller : LLMCaller - LLM Caller instance used to extract ordinance info with. 
- """ - self.llm_caller = llm_caller - async def return_original(self, text_chunks): # noqa: PLR6301 """No processing, just return original text diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py index b3c4bb13f..4ab68576a 100644 --- a/compass/plugin/one_shot/base.py +++ b/compass/plugin/one_shot/base.py @@ -1,12 +1,9 @@ """COMPASS one-shot extraction plugin""" -import json import logging -import hashlib import importlib.resources -from pathlib import Path - -from platformdirs import user_data_dir +from asyncio import Semaphore +from enum import StrEnum, auto from compass.llm.calling import SchemaOutputLLMCaller from compass.plugin import ( @@ -17,21 +14,41 @@ PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceExtractionPlugin, + KeywordBasedHeuristic, +) +from compass.plugin.one_shot.generators import ( + generate_query_templates, + generate_website_keywords, + generate_heuristic_keywords, ) -from compass.plugin.one_shot.generators import generate_query_templates from compass.plugin.one_shot.components import ( SchemaBasedTextCollector, + SchemaBasedTextExtractor, SchemaOrdinanceParser, ) +from compass.plugin.one_shot.cache import key_from_cache, key_to_cache +from compass.services.threaded import CLEANED_FP_REGISTRY from compass.utilities.io import load_config from compass.utilities.enums import LLMTasks +from compass.exceptions import COMPASSPluginConfigurationError logger = logging.getLogger(__name__) _SCHEMA_DIR = importlib.resources.files("compass.plugin.one_shot.schemas") +_QT_SEMAPHORE = Semaphore(1) +_WK_SEMAPHORE = Semaphore(1) +_HK_SEMAPHORE = Semaphore(1) + + +class _CacheKey(StrEnum): + """LLM generated content cache keys""" + QUERY_TEMPLATES = auto() + WEBSITE_KEYWORDS = auto() + HEURISTIC_KEYWORDS = auto() -def create_schema_based_one_shot_extraction_plugin(config, tech): + +def create_schema_based_one_shot_extraction_plugin(config, tech): # noqa: C901 """Create a one-shot extraction plugin based on a configuration Parameters @@ -39,14 +56,15 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): config : dict or path-like One-shot configuration dictionary. If not a dictionary, should be a path to a file containing the configuration (supported - formats: JSON, JSON5, YAML, TOML). See the wind ordinance schema + formats: JSON, JSON5, YAML, TOML). See the + `wind ordinance schema `_ for an example. The configuration must include the following keys: - `schema`: A dictionary representing the schema of the - output. Can also be a path to a file that contains the - schema (supported formats: JSON, JSON5, YAML, TOML). See - the wind ordinance schema for an example. + output. Can also be a path to a file that contains the + schema (supported formats: JSON, JSON5, YAML, TOML). See + the wind ordinance schema for an example. The configuration can also include the following optional keys: @@ -61,10 +79,20 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): that is being processed. If not provided, the LLM will be used to generate search engine queries based on the schema input. - - `website_keywords`: A list of keywords to use for - filtering websites during document retrieval. If not - provided, the LLM will be used to generate website - keywords based on the schema input. + - `website_keywords`: A dictionary mapping keywords to + scores for filtering websites during document retrieval. + If not provided, the LLM will be used to generate + website keywords based on the schema input. 
+ - `heuristic_keywords`: A dictionary containing the keyword + lists used by the heuristic document filter. The + dictionary must include ``not_tech_words``, + ``good_tech_keywords``, ``good_tech_acronyms``, and + ``good_tech_phrases`` keys. Alternatively, this input can + simply be ``True``, in which case the LLM will be used to + generate heuristic keyword lists based on the schema + input. If ``False``, ``None``, or not provided, a `NoOp` + heuristic that always returns ``True`` will be used (not + recommended if doing website crawling). - `collection_prompts`: A list of prompts to use for collecting relevant text from documents. Alternatively, this input can simply be ``True``, in which case the LLM @@ -78,7 +106,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): the text extraction prompts. If ``False``, ``None``, or not provided, the entire document text will be used for extraction (no text consolidation). - - `cache_query_templates`: Boolean flag indicating + - `cache_llm_generated_content`: Boolean flag indicating whether or not to cache generated query templates and website keywords for future use. By default, ``True``. Caching is recommended since the generation of query @@ -109,7 +137,7 @@ def create_schema_based_one_shot_extraction_plugin(config, tech): text_collectors = _collectors_from_config(config) text_extractors = _extractors_from_config( - config, in_label=text_collectors[-1].OUT_LABEL + config, in_label=text_collectors[-1].OUT_LABEL, tech=tech ) parsers = _parser_from_config( config, in_label=text_extractors[-1].OUT_LABEL @@ -122,35 +150,11 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): IDENTIFIER = tech """str: Identifier for extraction task """ - # TODO: implement dynamic generation of the heuristic based on - # the extraction schema HEURISTIC = NoOpHeuristic """BaseHeuristic: Class with a ``check()`` method""" - # TODO: implement dynamic generation of the website keywords - # based on the extraction schema - WEBSITE_KEYWORDS = { - "pdf": 23040, - "zoning": 11520, - "ordinance": 5760, - r"renewable%20energy": 1440, - r"renewable+energy": 1440, - "renewable energy": 1440, - "planning": 720, - "plan": 360, - "government": 180, - "code": 60, - "area": 60, - r"land%20development": 15, - r"land+development": 15, - "land development": 15, - "land": 3, - "environment": 3, - "energy": 3, - "renewable": 3, - "municipal": 1, - "department": 1, - } + HEURISTIC_KEYWORDS = None + """dict: Keyword lists for heuristic content filtering""" TEXT_COLLECTORS = text_collectors """Classes for collecting text chunks from docs""" @@ -164,6 +168,34 @@ class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): QUERY_TEMPLATES = [] # set by user or LLM-generated """List: List of search engine query templates""" + WEBSITE_KEYWORDS = {} # set by user or LLM-generated + """dict: Keyword weight mapping for link crawl prioritization""" + + async def get_heuristic(self): + """Get a `BaseHeuristic` instance with a `check()` method + + The ``check()`` method should accept a string of text and + return ``True`` if the text passes the heuristic check and + ``False`` otherwise. 
+ """ + if self.HEURISTIC_KEYWORDS and self.HEURISTIC is not NoOpHeuristic: + return self.HEURISTIC() + + if not config.get("heuristic_keywords"): + return NoOpHeuristic() + + hk = await self._get_heuristic_keywords() + + class SchemaBasedHeuristic(KeywordBasedHeuristic): + NOT_TECH_WORDS = hk["NOT_TECH_WORDS"] + GOOD_TECH_KEYWORDS = hk["GOOD_TECH_KEYWORDS"] + GOOD_TECH_ACRONYMS = hk["GOOD_TECH_ACRONYMS"] + GOOD_TECH_PHRASES = hk["GOOD_TECH_PHRASES"] + + self.__class__.HEURISTIC_KEYWORDS = hk + self.__class__.HEURISTIC = SchemaBasedHeuristic + return self.HEURISTIC() + async def get_query_templates(self): """Get a list of query templates for document retrieval @@ -179,35 +211,168 @@ async def get_query_templates(self): return self.QUERY_TEMPLATES if qt := config.get("query_templates"): - self.QUERY_TEMPLATES = qt + self.__class__.QUERY_TEMPLATES = qt return qt - qt = _qt_from_cache(self.IDENTIFIER, config["schema"]) + qt = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.QUERY_TEMPLATES, + ) if qt: - self.QUERY_TEMPLATES = qt + self.__class__.QUERY_TEMPLATES = qt return qt - model_config = self.model_configs.get( - LLMTasks.PLUGIN_GENERATION, - self.model_configs[LLMTasks.DEFAULT], - ) - schema_llm = SchemaOutputLLMCaller( - llm_service=model_config.llm_service, - usage_tracker=self.usage_tracker, - **model_config.llm_call_kwargs, - ) - logger.debug("Generating query templates...") - qt = await generate_query_templates( - schema_llm, config["schema"], add_think_prompt=True - ) - logger.debug("Generated the following query templates:\n%r", qt) - self.QUERY_TEMPLATES = qt - - if config.get("cache_query_templates", True): - _qt_to_cache(self.IDENTIFIER, config["schema"], qt) + async with _QT_SEMAPHORE: + if self.QUERY_TEMPLATES: + return self.QUERY_TEMPLATES + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating query templates...") + qt = await generate_query_templates( + schema_llm, config["schema"], add_think_prompt=True + ) + logger.debug( + "Generated the following query templates:\n%r", qt + ) + self.__class__.QUERY_TEMPLATES = qt + + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.QUERY_TEMPLATES, + value=qt, + ) return qt + async def get_website_keywords(self): + """Get a dict of website search keyword scores + + Returns + ------- + dict + Dictionary mapping keywords to scores that indicate + links which should be prioritized when performing a + website scrape for a document. 
+ """ + if self.WEBSITE_KEYWORDS: + return self.WEBSITE_KEYWORDS + + if wk := config.get("website_keywords"): + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + return wk + + wk = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.WEBSITE_KEYWORDS, + ) + if wk: + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + return wk + + async with _WK_SEMAPHORE: + if self.WEBSITE_KEYWORDS: + return self.WEBSITE_KEYWORDS + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating website keywords...") + wk = await generate_website_keywords( + schema_llm, + config["schema"], + add_think_prompt=True, + ) + logger.debug( + "Generated the following website keywords:\n%r", wk + ) + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.WEBSITE_KEYWORDS, + value=wk, + ) + + wk = _augment_website_keywords(wk) + self.__class__.WEBSITE_KEYWORDS = wk + + return wk + + async def _get_heuristic_keywords(self): + """Get keyword lists for the heuristic document filter""" + if self.HEURISTIC_KEYWORDS: + return self.HEURISTIC_KEYWORDS + + if isinstance(hk := config.get("heuristic_keywords"), dict): + hk = _normalize_heuristic_keywords(hk) + self.__class__.HEURISTIC_KEYWORDS = hk + return hk + + hk = key_from_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.HEURISTIC_KEYWORDS, + ) + if hk: + hk = _normalize_heuristic_keywords(hk) + self.__class__.HEURISTIC_KEYWORDS = hk + return hk + + async with _HK_SEMAPHORE: + if self.HEURISTIC_KEYWORDS: + return self.HEURISTIC_KEYWORDS + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating heuristic keywords...") + hk = await generate_heuristic_keywords( + schema_llm, + config["schema"], + add_think_prompt=True, + ) + hk = _normalize_heuristic_keywords(hk) + logger.debug( + "Generated the following heuristic keywords:\n%r", hk + ) + if config.get("cache_llm_generated_content", True): + key_to_cache( + self.IDENTIFIER, + config["schema"], + key=_CacheKey.HEURISTIC_KEYWORDS, + value=hk, + ) + + self.__class__.HEURISTIC_KEYWORDS = hk + + return hk + def _validate_query_templates(self): """NoOp validation for query templates @@ -215,6 +380,13 @@ def _validate_query_templates(self): runtime whether or not they will be valid. """ + def _validate_website_keywords(self): + """NoOp validation for website keywords + + Since keywords can be generated by LLM, we don't know until + runtime whether or not they will be valid. 
+ """ + register_plugin(SchemaBasedExtractionPlugin) @@ -225,38 +397,41 @@ def _collectors_from_config(config): if cp is True: schema_fp = _SCHEMA_DIR / "validate_chunk.json5" - class PluginCollector(SchemaBasedTextCollector): + class PluginTextCollector(SchemaBasedTextCollector): OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label SCHEMA = config["schema"] OUTPUT_SCHEMA = load_config(schema_fp) - return [PluginCollector] + return [PluginTextCollector] if cp: - class PluginCollector(PromptBasedTextCollector): + class PluginTextCollector(PromptBasedTextCollector): OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label PROMPTS = cp - return [PluginCollector] + return [PluginTextCollector] return [NoOpTextCollector] -def _extractors_from_config(config, in_label): +def _extractors_from_config(config, in_label, tech): """Create a TextExtractor subclass based on a config dict""" tep = config.get("text_extraction_prompts") if tep is True: - # TODO: When implementing this, don't forget to register the - # text output file name so it gets store in the - # cleaned outputs directory - msg = ( - "LLM-based text extraction not implemented yet. If you would like " - "to see this feature implemented, please submit an issue or, " - "better yet, a pull request!" - ) - raise NotImplementedError(msg) + schema_fp = _SCHEMA_DIR / "extract_text.json5" + + class PluginTextExtractor(SchemaBasedTextExtractor): + IN_LABEL = in_label + OUT_LABEL = "copied_relevant_text" + SCHEMA = config["schema"] + OUTPUT_SCHEMA = load_config(schema_fp) + + CLEANED_FP_REGISTRY.setdefault(tech.casefold(), {})[ + "copied_relevant_text" + ] = "Text for Extraction.txt" + return [PluginTextExtractor] if tep: @@ -290,83 +465,85 @@ class PluginParser(SchemaOrdinanceParser): return [PluginParser] -def _qt_from_cache(identifier, schema): - """Get cached query templates for a given schema if they exist""" - # cspell: disable-next-line - data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) - cache_fp = data_dir / "qt_cache.json" - if not cache_fp.exists(): - return None - - logger.debug("Loading query templates from cache at %s", cache_fp) - qt = json.loads(cache_fp.read_text(encoding="utf-8")) - if identifier.casefold() not in qt: - return None - - potential_qt = qt[identifier.casefold()] - m = hashlib.sha256() - m.update(str(schema).encode()) - if potential_qt.get("sha256") != m.hexdigest(): - return None - - templates = potential_qt["templates"] - logger.debug( - "Found query templates for %r in cache:\n%r", identifier, templates - ) - return templates - - -def _qt_to_cache(identifier, schema, qt): - """Cache generated query templates for future use""" - # cspell: disable-next-line - data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) - data_dir.mkdir(parents=True, exist_ok=True) - cache_fp = data_dir / "qt_cache.json" - if not cache_fp.exists(): - logger.debug( - "Cache file for query templates not found at %s. 
Creating new " - "cache with current query templates for %r", - cache_fp, - identifier, - ) - cache = { - identifier.casefold(): { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return - - logger.debug("Loading query templates from cache at %s", cache_fp) - cache = json.loads(cache_fp.read_text(encoding="utf-8")) - if identifier.casefold() not in cache: - logger.debug( - "Adding query templates for %r to cache at %s", - identifier, - cache_fp, +def _augment_website_keywords(keywords): + """Add URL-encoded variants for multi-word keywords""" + augmented = dict(keywords) + for keyword, score in list(augmented.items()): + if not isinstance(keyword, str): + continue + + if " " not in keyword: + continue + + encoded = keyword.replace(" ", "%20") + if encoded not in augmented: + augmented[encoded] = score + + plus_encoded = keyword.replace(" ", "+") + if plus_encoded not in augmented: + augmented[plus_encoded] = score + + return augmented + + +def _normalize_heuristic_keywords(raw): + """Normalize heuristic keyword lists into required structure""" + if not isinstance(raw, dict): + msg = "Heuristic keywords must be a dictionary of keyword lists." + raise COMPASSPluginConfigurationError(msg) + + expected_keys = { + "NOT_TECH_WORDS", + "GOOD_TECH_KEYWORDS", + "GOOD_TECH_ACRONYMS", + "GOOD_TECH_PHRASES", + } + + normalized = {} + for raw_key, value in raw.items(): + if not isinstance(raw_key, str): + msg = "Heuristic keyword keys must be strings." + raise COMPASSPluginConfigurationError(msg) + + target_key = ( + raw_key.strip().replace(" ", "_").replace("-", "_").upper() ) - cache[identifier.casefold()] = { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return - - potential_qt = cache[identifier.casefold()] - m = hashlib.sha256() - m.update(str(schema).encode()) - if potential_qt.get("sha256") == m.hexdigest(): - logger.debug( - "Query templates for %r already in cache and schema hash " - "matches, so not updating cache", - identifier, + if target_key not in expected_keys: + msg = f"Unexpected heuristic keyword list: {raw_key!r}." 
+ raise COMPASSPluginConfigurationError(msg) + + normalized[target_key] = _normalize_keyword_list(value) + + missing = expected_keys - set(normalized) + if missing: + msg = ( + f"Heuristic keywords are missing required lists: {sorted(missing)}" ) - return + raise COMPASSPluginConfigurationError(msg) - cache[identifier.casefold()] = { - "templates": qt, - "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), - } - cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") - return + empty = [key for key, value in normalized.items() if not value] + if empty: + msg = f"Heuristic keyword lists must not be empty: {sorted(empty)}" + raise COMPASSPluginConfigurationError(msg) + + return normalized + + +def _normalize_keyword_list(items): + """Normalize keyword list entries""" + normalized = set() + for item in items: + if not isinstance(item, str): + continue + + keyword = item.strip() + if not keyword: + continue + + keyword = keyword.casefold() + if keyword in normalized: + continue + + normalized.add(keyword) + + return list(normalized) diff --git a/compass/plugin/one_shot/cache.py b/compass/plugin/one_shot/cache.py new file mode 100644 index 000000000..b7e0579e1 --- /dev/null +++ b/compass/plugin/one_shot/cache.py @@ -0,0 +1,167 @@ +"""Schema-based cache for storing LLM-generated outputs""" + +import json +import logging +import hashlib +from pathlib import Path + +from platformdirs import user_data_dir + + +logger = logging.getLogger(__name__) +_CACHE_FP = "llm-generation_cache.json" +_SHA256_KEY = "sha256" + + +def key_from_cache(identifier, schema, key): + """[NOT PUBLIC API] Get cached value for key/schema combination + + Parameters + ---------- + identifier : str + A string identifier for the technology of the extraction schema + (e.g. "wind", "solar", "building_codes", etc.). + schema : dict + The extraction schema that is being used for the LLM-based + one-shot extraction. This is used to ensure that cached content + is only returned if the schema matches, which helps ensure + that cached content is relevant and accurate for the current + extraction task. + key : str + The specific key for the cached content to retrieve, (e.g. + "query_templates", "website_keywords", etc.). + + Returns + ------- + list or dict or None + The cached value for the specified key/schema combination, or + ``None`` if no valid cached value is found. + """ + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + cache_fp = data_dir / _CACHE_FP + cache = _load_cache(cache_fp) + + tech_cache = cache.get(identifier.casefold(), {}) + if not tech_cache: + logger.debug("Did not find cache for %r", identifier) + return None + + if tech_cache.get(_SHA256_KEY) != _schema_hash(schema): + logger.debug( + "Cache for %r exists but schema hash did not match", identifier + ) + return None + + out = tech_cache.get(key) + if not out: + logger.debug( + "Cache for %r exists and schema hash matches but no %r found", + identifier, + str(key), + ) + return None + + logger.debug("Found %r for %r in cache:\n%r", str(key), identifier, out) + return out + + +def key_to_cache(identifier, schema, key, value): + """[NOT PUBLIC API] Cache key/value for given schema/tech combo + + Parameters + ---------- + identifier : str + A string identifier for the technology of the extraction schema + (e.g. "wind", "solar", "building_codes", etc.). + schema : dict + The extraction schema that is being used for the LLM-based + one-shot extraction. 
This is used to ensure that cached content + is only returned if the schema matches, which helps ensure + that cached content is relevant and accurate for the current + extraction task. + key : str + The specific key for the cached content to retrieve, (e.g. + "query_templates", "website_keywords", etc.). + value : list or dict + The value to cache for the specified key/schema combination. + This should be the output of an LLM generation function that is + being cached for future reuse. The value should be + JSON-serializable since it will be stored in a JSON file on + disk. Examples of values include a list of query templates for + document retrieval, or a dictionary of website keywords and + their relevance weights for link crawling prioritization. The + value should be relevant to the technology and extraction task + specified by the schema, and should be generated based on the + content of the schema to ensure that it is useful and accurate + for future extractions using the same schema. + """ + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + data_dir.mkdir(parents=True, exist_ok=True) + cache_fp = data_dir / _CACHE_FP + + logger.debug("Loading %r from cache at %s", str(key), cache_fp) + cache = _load_cache(cache_fp) + schema_hash = _schema_hash(schema) + + if identifier.casefold() not in cache: + logger.debug( + "Adding %r for %r to cache at %s", + str(key), + identifier, + cache_fp, + ) + cache[identifier.casefold()] = {key: value, _SHA256_KEY: schema_hash} + _write_cache(cache_fp, cache) + return + + potential_qt = cache[identifier.casefold()] + if potential_qt.get(_SHA256_KEY) == schema_hash: + if key in potential_qt: + logger.debug( + "%r for %r already in cache and schema hash " + "matches, so not updating cache", + str(key), + identifier, + ) + return + + logger.debug( + "Schema hash matches but %r is missing. 
Updating cache for %r " + "at %s", + str(key), + identifier, + cache_fp, + ) + potential_qt[key] = value + _write_cache(cache_fp, cache) + return + + cache[identifier.casefold()] = {key: value, _SHA256_KEY: schema_hash} + _write_cache(cache_fp, cache) + + +def _load_cache(cache_fp): + """Load cache file contents as a dict""" + if not cache_fp.exists(): + return {} + + logger.debug("Loading LLM generation cache at %s", cache_fp) + return json.loads(cache_fp.read_text(encoding="utf-8")) + + +def _write_cache(cache_fp, cache): + """Write cache file contents to disk""" + cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") + + +def _schema_hash(schema): + """Get SHA256 hash of the schema for cache validation""" + m = hashlib.sha256() + m.update( + json.dumps(schema, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + ) + return m.hexdigest() diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 2c10b9665..6274725ab 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -1,12 +1,14 @@ """COMPASS extraction schema-based plugin component implementations""" +import asyncio import logging from abc import ABC, abstractmethod import pandas as pd +from elm import ApiBase from compass.llm.calling import SchemaOutputLLMCaller -from compass.plugin import BaseParser, BaseTextCollector +from compass.plugin import BaseParser, BaseTextCollector, BaseTextExtractor from compass.utilities.enums import LLMUsageCategory from compass.utilities.parsing import merge_overlapping_texts @@ -36,13 +38,65 @@ {text} Think before you answer.\ +""" +_TEXT_EXTRACTOR_SYSTEM_PROMPT = """\ +You are a text extraction assistant. Your job is to extract only verbatim, \ +**unmodified** excerpts from the provided text. Do not interpret or \ +paraphrase. Do not summarize. Only return exactly copied segments that match \ +the specified extraction scope/domain. If the relevant content appears within \ +a table, return the entire table, including headers and footers, exactly as \ +formatted.\ +""" +_TEXT_EXTRACTOR_MAIN_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information for the \ +domain relevant to the following extraction schema: + +{schema} + +The extracted text will be used for structured data extraction following this \ +schema, so it must be both **comprehensive** (retaining all relevant details) \ +and **focused** (excluding unrelated content), with **zero rewriting or \ +paraphrasing**. Ensure that all retained information is **directly +applicable** to the extraction task while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to the extraction schema domain** from the \ +provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Formatting & Structure ##: +- **Preserve _all_ section titles, headers, and numberings** for reference. +- **Maintain the original wording, formatting, and structure** to ensure \ +accuracy. + +2. ## Output Handling ##: +- This is a strict extraction task — act like a text filter, **not** a \ +summarizer or writer. +- Do not add, explain, reword, or summarize anything. +- The output must be a **copy-paste** of the original excerpt. **Absolutely \ +no paraphrasing or rewriting.** +- The output must consist **only** of contiguous or discontiguous verbatim \ +blocks copied from the input. +- The only allowed change is to remove irrelevant sections of text. 
You can \ +remove irrelevant text from within sections, but you cannot add any new text \ +or modify the text you keep in any way. +- If **no relevant text** is found, return null. + +# TEXT # + +{text} + """ _DATA_PARSER_MAIN_PROMPT = """\ Extract all {desc}features from the following text: {text} -Think before you answer""" +Think before you answer\ +""" _DATA_PARSER_SYSTEM_PROMPT = """\ You are a legal scholar extracting structured data from {desc}documents. \ Follow all instructions in the schema descriptions carefully.\ @@ -152,6 +206,80 @@ def _store_chunk(self, parser, chunk_ind): ) +class SchemaBasedTextExtractor(SchemaOutputLLMCaller, BaseTextExtractor): + """Schema-based text extractor""" + + @property + @abstractmethod + def SCHEMA(self): # noqa: N802 + """dict: Extraction schema""" + raise NotImplementedError + + @property + @abstractmethod + def OUTPUT_SCHEMA(self): # noqa: N802 + """dict: Validation output schema""" + raise NotImplementedError + + @property + def parsers(self): + """Iterable of parsers provided by this extractor + + Yields + ------ + name : str + Name describing the type of text output by the parser. + parser : callable + Async function that takes a ``text_chunks`` input and + outputs parsed text. + """ + yield self.OUT_LABEL, self._process + + async def _process(self, text_chunks): + """Perform extraction processing""" + + logger.info( + "Extracting summary text from %d text chunks asynchronously...", + len(text_chunks), + ) + outer_task_name = asyncio.current_task().get_name() + summaries = [ + asyncio.create_task( + self.call( + sys_msg=_TEXT_EXTRACTOR_SYSTEM_PROMPT, + content=_TEXT_EXTRACTOR_MAIN_PROMPT.format( + schema=self.SCHEMA, text=chunk + ), + response_format={ + "type": "json_schema", + "json_schema": { + "name": "text_extraction", + "strict": True, + "schema": self.OUTPUT_SCHEMA, + }, + }, + usage_sub_label=self._USAGE_LABEL, + ), + name=outer_task_name, + ) + for chunk in text_chunks + ] + summary_chunks = await asyncio.gather(*summaries) + summary_chunks = [ + chunk.get("domain_relevant_text") for chunk in summary_chunks + ] + + text_summary = merge_overlapping_texts(summary_chunks) + logger.debug( + "Final summary contains %d tokens", + ApiBase.count_tokens( + text_summary, + model=self.kwargs.get("model", "gpt-4"), + ), + ) + return text_summary + + class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser): """Base class for parsing structured data""" diff --git a/compass/plugin/one_shot/generators.py b/compass/plugin/one_shot/generators.py index 20b08ff69..a8fa50af6 100644 --- a/compass/plugin/one_shot/generators.py +++ b/compass/plugin/one_shot/generators.py @@ -1,5 +1,6 @@ """COMPASS one-shot extraction plugin generators""" +import operator import importlib.resources from elm.utilities.retry import async_retry_with_exponential_backoff @@ -49,6 +50,57 @@ news, or reports).\ """ +_KEYWORD_GENERATOR_SYSTEM_PROMPT = """\ +You are an expert search strategist for regulatory documents. \ +Goal: Given an extraction schema (JSON) for an ordinance domain, generate \ +high-quality website keywords and weights for prioritizing crawl links. + +Input: +- schema_json: a JSON schema describing features/requirements to extract. + +Output: +- Produce an array of keyword/weight objects with integer weights. +- Do not include extra keys or any markdown. + +Guidelines: +- Derive terms from the schema title/description, feature names, and \ +definitions. Prefer official/legal terminology in the schema. 
+- Focus on keywords likely to appear in legal document URLs or link text. +- Include terms that indicate governing document types \ +(e.g., "ordinance", "zoning", "code", "regulations", "chapter", "section"). +- Include domain-specific synonyms and abbreviations present in the schema. +- Weights are relative: higher means more relevant for link prioritization. +- Avoid jurisdiction-specific entities. +""" + +_HEURISTIC_GENERATOR_SYSTEM_PROMPT = """\ +You are an expert in ordinance discovery and regulatory text filtering. \ +Goal: Given an extraction schema (JSON) for an ordinance domain, generate \ +keyword lists for a heuristic text check that detects domain-relevant \ +content and excludes look-alike words. + +Input: +- schema_json: a JSON schema describing features/requirements to extract. + +Output: +- Provide four keyword lists in the response schema. +- Do not include extra keys or any markdown. + +Guidelines: +- Derive terms from schema title/description, feature names, and \ +definitions. Prefer official/legal terminology and abbreviations. +- not_tech_words should include common look-alikes or near matches that \ +appear in non-domain contexts and could cause false positives. These will \ +be removed from the text before performing a keyword-based relevance check. \ +- good_tech_keywords should include single-word indicators likely to \ +appear in ordinance text. +- good_tech_acronyms should include short acronyms and abbreviations used \ +in legal documents for the domain. +- good_tech_phrases should include multi-word phrases (at least 2 words) \ +that indicate domain relevance. +- Avoid jurisdiction-specific names; keep keywords general. +""" + @async_retry_with_exponential_backoff( base_delay=1, @@ -129,6 +181,175 @@ async def generate_query_templates( return out +@async_retry_with_exponential_backoff( + base_delay=1, + exponential_base=4, + jitter=True, + max_retries=3, + errors=(COMPASSRuntimeError,), +) +async def generate_website_keywords( + schema_llm, extraction_schema, add_think_prompt=True +): + """Generate website keyword weights for document retrieval + + Parameters + ---------- + schema_llm : SchemaOutputLLMCaller + A LLM caller configured to output structured data according to a + provided schema. This function relies on the LLM to generate the + keyword weights, so the quality of the generated keywords will + depend on the capabilities of the LLM being used and how well it + can interpret the provided extraction schema. Highly recommended + to use the most powerful/capable instruction-tuned model for + this function. + extraction_schema : dict + A dictionary representing the schema of the desired extraction + task. The keywords will be generated based on the content of + this schema, so it should be as detailed and specific as + possible, and should include domain-specific terminology if + applicable. See the wind ordinance schema for an example. + add_think_prompt : bool, optional + Option to add a "Think before you answer" instruction to the end + of the prompt (useful for thinking models). + By default, ``True``. + + Returns + ------- + dict + Dictionary mapping keywords to integer weights for website link + prioritization. + + Raises + ------ + COMPASSRuntimeError + If the LLM fails to return any valid keyword weights after 3 + attempts. 
+ """ + + keyword_schema_fp = _SCHEMA_DIR / "website_keywords.json5" + keyword_schema = load_config(keyword_schema_fp) + main_prompt = ( + "Generate website keyword weights for the following extraction " + f"schema:\n\n{extraction_schema}" + ) + if add_think_prompt: + main_prompt = f"{main_prompt}\n\nThink before you answer" + + response = await schema_llm.call( + sys_msg=_KEYWORD_GENERATOR_SYSTEM_PROMPT, + content=main_prompt, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "website_keyword_generation", + "strict": True, + "schema": keyword_schema, + }, + }, + usage_sub_label=LLMUsageCategory.PLUGIN_GENERATION, + ) + out = _normalize_website_keywords(response.get("keywords")) + if not out: + msg = ( + "LLM did not return any valid website keywords. " + f"Received response: {response}" + ) + raise COMPASSRuntimeError(msg) + + return out + + +@async_retry_with_exponential_backoff( + base_delay=1, + exponential_base=4, + jitter=True, + max_retries=3, + errors=(COMPASSRuntimeError,), +) +async def generate_heuristic_keywords( + schema_llm, extraction_schema, add_think_prompt=True +): + """Generate keyword lists for a heuristic text check + + Parameters + ---------- + schema_llm : SchemaOutputLLMCaller + A LLM caller configured to output structured data according to a + provided schema. This function relies on the LLM to generate the + heuristic keyword lists, so the quality of the generated output + will depend on the capabilities of the LLM being used and how + well it can interpret the provided extraction schema. + extraction_schema : dict + A dictionary representing the schema of the desired extraction + task. The keyword lists will be generated based on the content + of this schema, so it should be as detailed and specific as + possible, and should include domain-specific terminology if + applicable. See the wind ordinance schema for an example. + add_think_prompt : bool, optional + Option to add a "Think before you answer" instruction to the end + of the prompt (useful for thinking models). + By default, ``True``. + + Returns + ------- + dict + Dictionary containing the keyword lists for a heuristic text + check: ``not_tech_words``, ``good_tech_keywords``, + ``good_tech_acronyms``, and ``good_tech_phrases``. + + Raises + ------ + COMPASSRuntimeError + If the LLM fails to return any valid heuristic keywords after 3 + attempts. + """ + + heuristic_schema_fp = _SCHEMA_DIR / "heuristic_keywords.json5" + heuristic_schema = load_config(heuristic_schema_fp) + main_prompt = ( + "Generate heuristic keyword lists for the following extraction " + f"schema:\n\n{extraction_schema}" + ) + if add_think_prompt: + main_prompt = f"{main_prompt}\n\nThink before you answer" + + response = await schema_llm.call( + sys_msg=_HEURISTIC_GENERATOR_SYSTEM_PROMPT, + content=main_prompt, + response_format={ + "type": "json_schema", + "json_schema": { + "name": "heuristic_keyword_generation", + "strict": True, + "schema": heuristic_schema, + }, + }, + usage_sub_label=LLMUsageCategory.PLUGIN_GENERATION, + ) + + if not response: + msg = ( + "LLM did not return any heuristic keywords. 
" + f"Received response: {response}" + ) + raise COMPASSRuntimeError(msg) + + return response + + +def _normalize_website_keywords(raw): + """Normalize keyword weights into a deduplicated dict""" + if not raw: + return {} + + items = _parse_llm_kw_to_list(raw) + if not items: + return {} + + return _de_duplicate_keywords(items) + + def _is_formattable(q): """True if the query template is formattable with a jurisdiction""" try: @@ -137,3 +358,38 @@ def _is_formattable(q): return False return True + + +def _parse_llm_kw_to_list(llm_kw): + """Parse LLM output into a list of (keyword, weight) tuples""" + items = [] + for item in llm_kw: + if isinstance(item, str): + items.append((item, 1)) + elif isinstance(item, dict): + items.append((item.get("keyword"), item.get("weight", 1))) + return items + + +def _de_duplicate_keywords(items): + """Process keywords by normalizing and keeping the highest weight""" + deduped = {} + sorted_items = sorted(items, key=operator.itemgetter(1), reverse=True) + for keyword, weight in sorted_items: + if not isinstance(keyword, str): + continue + + normalized = keyword.strip().casefold() + if not normalized or normalized.isdigit(): + continue + try: + int_weight = int(weight) + except (TypeError, ValueError): + continue + + if int_weight < 1: + continue + + deduped.setdefault(normalized, int_weight) + + return deduped diff --git a/compass/plugin/one_shot/schemas/extract_text.json5 b/compass/plugin/one_shot/schemas/extract_text.json5 new file mode 100644 index 000000000..fef3f9f7a --- /dev/null +++ b/compass/plugin/one_shot/schemas/extract_text.json5 @@ -0,0 +1,12 @@ +{ + "type": "object", + "description": "Response containing all relevant text extracted from the input chunk based on the extraction schema/domain. The extracted text should be both **comprehensive and focused** in order to maximize extraction accuracy. The output should be **verbatim text copied from the input chunk, without any paraphrasing or rewriting**. Only text irrelevant to the extraction should be dropped. If no relevant text is found, the LLM should return null.", + "additionalProperties": false, + "required": ["domain_relevant_text"], + "properties": { + "domain_relevant_text": { + "type": ["string", "null"], + "description": "The text extracted from the input chunk that is relevant to the extraction schema/domain. This should be **verbatim text copied from the input chunk, without any paraphrasing or rewriting, but possibly with some irrelevant text removed**. 
If no relevant text is found, this field should be null.", + } + }, +} \ No newline at end of file diff --git a/compass/plugin/one_shot/schemas/heuristic_keywords.json5 b/compass/plugin/one_shot/schemas/heuristic_keywords.json5 new file mode 100644 index 000000000..1f8f7ecbc --- /dev/null +++ b/compass/plugin/one_shot/schemas/heuristic_keywords.json5 @@ -0,0 +1,181 @@ +{ + "title": "Heuristic Keyword Lists", + "description": "Schema for LLM-generated heuristic keyword lists used to filter ordinance text.", + "type": "object", + "additionalProperties": false, + "required": [ + "not_tech_words", + "good_tech_keywords", + "good_tech_acronyms", + "good_tech_phrases" + ], + "properties": { + "not_tech_words": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_keywords": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_acronyms": { + "type": "array", + "items": { + "type": "string" + } + }, + "good_tech_phrases": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "$descriptions": { + "general": [ + "Return ONLY the fields allowed by this schema.", + "Provide four arrays of strings.", + "Derive terms from the extraction schema (title, descriptions, features).", + "Avoid jurisdiction-specific names; keep keywords general.", + ], + "not_tech_words": [ + "Include common look-alike words or near matches that are not the target domain.", + "Prefer words/phrases likely to cause false positives for a keyword-based lookup.", + "Do not include empty strings or whitespace-only entries.", + "Include as many distinct entries as possible (at least 3, ideally 10-20).", + ], + "good_tech_keywords": [ + "Include single-word indicators likely to appear in ordinance text.", + "Prefer official/legal terminology and abbreviations.", + "Avoid numbers-only keywords.", + "Limit this to 3-5 of the most relevant keywords.", + "Do not include extraction fields as keywords unless they are highly specific to the domain.", + ], + "good_tech_acronyms": [ + "Include short acronyms or abbreviations used in legal documents.", + "Avoid adding acronyms that are ambiguous outside the domain.", + "Limit this to 5-10 of the most relevant acronyms.", + "Do not include extraction fields or units as acronyms unless they are highly specific to the domain.", + ], + "good_tech_phrases": [ + "Include multi-word phrases (at least 2 words) that indicate domain relevance.", + "Avoid near-duplicate phrases or trivial variants.", + "Limit this to 5-10 of the most relevant phrases.", + "Do not include extraction fields as phrases unless they are highly specific to the domain.", + ], + "quality_checks": [ + "Avoid duplicate strings across lists.", + "Use lower-case strings when possible.", + ] + }, + "$examples": [ + { + "not_tech_words": [ + "micro wecs", + "small wecs", + "mini wecs", + "private wecs", + "personal wecs", + "pwecs", + "rewind", + "small wind", + "micro wind", + "mini wind", + "private wind", + "personal wind", + "swecs", + "windbreak", + "windiest", + "winds", + "windshield", + "window", + "windy", + "wind attribute", + "wind blow", + "wind break", + "wind current", + "wind damage", + "wind data", + "wind direction", + "wind draft", + "wind erosion", + "wind energy resource atlas", + "wind load", + "wind movement", + "wind orient", + "wind resource", + "wind runway", + "prevailing wind", + "downwind", + ], + "good_tech_keywords": [ + "wind", + "setback", + "turbine", + ], + "good_tech_acronyms": [ + "wecs", + "wes", + "lwet", + "uwet", + "wef" + ], + 
"good_tech_phrases": [ + "wind energy conversion", + "wind turbine", + "wind tower", + "wind farm", + "wind energy system", + "wind energy farm", + "utility wind energy system", + ] + }, + { + "not_tech_words": [ + "concentrated solar", + "csp", + "micro secs", + "small secs", + "mini secs", + "private secs", + "personal secs", + "psecs", + "solaris", + "small solar", + "micro solar", + "mini solar", + "private solar", + "personal solar", + "swecs", + "solar break", + "solar damage", + "solar data", + "solar resource", + ], + "good_tech_keywords": [ + "solar", + "setback", + "photovoltaic", + ], + "good_tech_acronyms": [ + "secs", + "sef", + "ses", + "cses" + ], + "good_tech_phrases": [ + "commercial solar energy system", + "solar energy conversion", + "solar energy system", + "solar panel", + "solar farm", + "solar energy farm", + "utility solar energy system", + ] + } + ] +} diff --git a/compass/plugin/one_shot/schemas/website_keywords.json5 b/compass/plugin/one_shot/schemas/website_keywords.json5 new file mode 100644 index 000000000..6f2d023af --- /dev/null +++ b/compass/plugin/one_shot/schemas/website_keywords.json5 @@ -0,0 +1,91 @@ +{ + "title": "Website Keyword Weights", + "description": "Schema for LLM-generated website keyword weights used to score crawl links.", + "type": "object", + "additionalProperties": false, + "required": ["keywords"], + "properties": { + "keywords": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["keyword", "weight"], + "properties": { + "keyword": { + "type": "string" + }, + "weight": { + "type": "integer", + "minimum": 1 + } + } + } + } + }, + "$descriptions": { + "general": [ + "Return ONLY the fields allowed by this schema.", + "Provide a single array of keyword/weight objects.", + "Each item must include \"keyword\" and \"weight\".", + "Weights are relative: higher means more relevant to crawl scoring.", + "Derive keywords from the extraction schema (title, descriptions, features).", + "Focus on legal document types and domain terminology, not news." + ], + "keyword_content": [ + "Include ordinance/code/zoning terminology if relevant.", + "Include domain-specific abbreviations or acronyms from the schema.", + "Prefer terms likely to appear in URLs or link text.", + "Do not include empty strings or whitespace-only keywords.", + "Avoid repeating the same keyword with different casing.", + "Avoid numbers-only keywords.", + "Avoid using specific extraction fields as keywords.", + "Avoid jurisdiction-specific names; keep keywords general." + ], + "quality_checks": [ + "Include at least 10 distinct keywords.", + "Avoid near-duplicate keywords or trivial variants.", + "Each keyword must be unique.", + "Use integer weights only." 
+ ] + }, + "$examples": [ + { + "keywords": [ + {"keyword": "pdf", "weight": 92160}, + {"keyword": "secs", "weight": 46080}, + {"keyword": "solar", "weight": 23040}, + {"keyword": "zoning", "weight": 11520}, + {"keyword": "ordinance", "weight": 5760}, + {"keyword": "renewable energy", "weight": 1440}, + {"keyword": "planning", "weight": 720}, + {"keyword": "plan", "weight": 360}, + {"keyword": "government", "weight": 180}, + {"keyword": "code", "weight": 60}, + {"keyword": "area", "weight": 60}, + {"keyword": "land development", "weight": 15}, + {"keyword": "land", "weight": 3}, + {"keyword": "environment", "weight": 3}, + {"keyword": "energy", "weight": 3}, + {"keyword": "renewable", "weight": 3}, + {"keyword": "municipal", "weight": 1}, + {"keyword": "department", "weight": 1} + ] + }, + { + "keywords": [ + {"keyword": "pdf", "weight": 15000}, + {"keyword": "wind energy", "weight": 12000}, + {"keyword": "wecs", "weight": 11000}, + {"keyword": "wind turbine", "weight": 10000}, + {"keyword": "ordinance", "weight": 9000}, + {"keyword": "regulation", "weight": 7500}, + {"keyword": "zoning", "weight": 6000}, + {"keyword": "code", "weight": 3000}, + {"keyword": "permit", "weight": 1500}, + {"keyword": "land use", "weight": 1200}, + {"keyword": "planning", "weight": 800} + ] + } + ] +} diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index f26e2d17e..92d233e9f 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -13,6 +13,7 @@ from elm import ApiBase from compass.llm.calling import ( + LLMCaller, BaseLLMCaller, ChatLLMCaller, JSONFromTextLLMCaller, @@ -57,7 +58,7 @@ } -class BaseTextExtractor(ABC): +class BaseTextExtractor(BaseLLMCaller, ABC): """Extract succinct extraction text from input""" TASK_DESCRIPTION = "Condensing text for extraction" @@ -66,6 +67,8 @@ class BaseTextExtractor(ABC): TASK_ID = "text_extraction" """ID to use for this extraction for linking with LLM configs""" + _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY + @property @abstractmethod def IN_LABEL(self): # noqa: N802 @@ -389,7 +392,7 @@ def _store_chunk(self, parser, chunk_ind): ) -class PromptBasedTextExtractor(BaseTextExtractor, ABC): +class PromptBasedTextExtractor(LLMCaller, BaseTextExtractor, ABC): """Text extractor based on a chain of prompts""" SYSTEM_MESSAGE = ( @@ -434,6 +437,9 @@ class PromptBasedTextExtractor(BaseTextExtractor, ABC): **Absolutely no paraphrasing or rewriting.** - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - The only allowed change is to remove irrelevant sections of text. + You can remove irrelevant text from within sections, but you cannot + add any new text or modify the text you keep in any way. - If **no relevant text** is found, return the response: 'No relevant text.' """ @@ -443,8 +449,6 @@ class PromptBasedTextExtractor(BaseTextExtractor, ABC): ) """Prompt component instructing model output guidelines""" - _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY - @property @abstractmethod def PROMPTS(self): # noqa: N802 @@ -502,16 +506,6 @@ def __init_subclass__(cls, **kwargs): last_index = len(cls.PROMPTS) - 1 cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") - def __init__(self, llm_caller): - """ - - Parameters - ---------- - llm_caller : LLMCaller - LLM Caller instance used to extract ordinance info with. 
- """ - self.llm_caller = llm_caller - @property def parsers(self): """Iterable of parsers provided by this extractor @@ -545,7 +539,7 @@ async def _process(self, text_chunks, instructions, is_valid_chunk=None): outer_task_name = asyncio.current_task().get_name() summaries = [ asyncio.create_task( - self.llm_caller.call( + self.call( sys_msg=self.SYSTEM_MESSAGE, content=f"{instructions}\n\n# TEXT #\n\n{chunk}", usage_sub_label=self._USAGE_LABEL, @@ -565,8 +559,7 @@ async def _process(self, text_chunks, instructions, is_valid_chunk=None): logger.debug( "Final summary contains %d tokens", ApiBase.count_tokens( - text_summary, - model=self.llm_caller.kwargs.get("model", "gpt-4"), + text_summary, model=self.kwargs.get("model", "gpt-4") ), ) return text_summary diff --git a/compass/services/threaded.py b/compass/services/threaded.py index bc5f146ff..0632ac31f 100644 --- a/compass/services/threaded.py +++ b/compass/services/threaded.py @@ -542,12 +542,13 @@ def _dump_jurisdiction_info( def _compile_doc_info(doc): """Put together meta information about a single document""" year, month, day = doc.attrs.get("date") or (None, None, None) + out_fp = doc.attrs.get("source_fp", doc.attrs.get("out_fp")) return { "source": doc.attrs.get("source"), "effective_year": year if year is not None and year > 0 else None, "effective_month": month if month is not None and month > 0 else None, "effective_day": day if day is not None and day > 0 else None, - "ord_filename": Path(doc.attrs.get("out_fp") or "unknown").name, + "ord_filename": Path(out_fp or "unknown").name, "num_pages": len(doc.pages), "checksum": doc.attrs.get("checksum"), "is_pdf": isinstance(doc, PDFDocument), diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index c90e05ee5..6efcad8dd 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -10,4 +10,5 @@ get started with ``COMPASS``: execution_basics/README one_shot_schema_extraction/README + parse_existing_docs/CLI/README parse_existing_docs/code/README diff --git a/examples/one_shot_schema_extraction/README.rst b/examples/one_shot_schema_extraction/README.rst index a89770c4c..d44d07351 100644 --- a/examples/one_shot_schema_extraction/README.rst +++ b/examples/one_shot_schema_extraction/README.rst @@ -145,11 +145,11 @@ The key options are listed below: - ``website_keywords``: Keyword weights for document search prioritization. - ``collection_prompts``: Prompt list for chunk filtering, or ``true`` to auto-generate. - ``text_extraction_prompts``: Prompt list for text consolidation, or ``true`` to auto-generate. -- ``cache_query_templates``: Cache generated query templates and keywords. By default, ``true``. +- ``cache_llm_generated_content``: Cache LLM-generated query templates and keywords. By default, ``true``. - ``extraction_system_prompt``: Optional system prompt override for extraction. -See `this documentation `_ +See `this documentation `_ for further details. 
If you want full control over all of the options above, you can specify them directly in the config diff --git a/examples/one_shot_schema_extraction/plugin_config.yaml b/examples/one_shot_schema_extraction/plugin_config.yaml index 4da8700c5..e5cda9287 100644 --- a/examples/one_shot_schema_extraction/plugin_config.yaml +++ b/examples/one_shot_schema_extraction/plugin_config.yaml @@ -15,16 +15,12 @@ website_keywords: wind: 23040 zoning: 11520 ordinance: 5760 - renewable%20energy: 1440 - renewable+energy: 1440 renewable energy: 1440 planning: 720 plan: 360 government: 180 code: 60 area: 60 - land%20development: 15 - land+development: 15 land development: 15 land: 3 environment: 3 @@ -33,6 +29,62 @@ website_keywords: municipal: 1 department: 1 +heuristic_keywords: + good_tech_keywords: + - "wind" + - "setback" + good_tech_acronyms: + - "wecs" + - "wes" + - "lwet" + - "uwet" + - "wef" + good_tech_phrases: + - "wind energy conversion" + - "wind turbine" + - "wind tower" + - "wind farm" + - "wind energy system" + - "wind energy farm" + - "utility wind energy system" + not_tech_words: + - "micro wecs" + - "small wecs" + - "mini wecs" + - "private wecs" + - "personal wecs" + - "pwecs" + - "rewind" + - "small wind" + - "micro wind" + - "mini wind" + - "private wind" + - "personal wind" + - "swecs" + - "windbreak" + - "windiest" + - "winds" + - "windshield" + - "window" + - "windy" + - "wind attribute" + - "wind blow" + - "wind break" + - "wind current" + - "wind damage" + - "wind data" + - "wind direction" + - "wind draft" + - "wind erosion" + - "wind energy resource atlas" + - "wind load" + - "wind movement" + - "wind orient" + - "wind resource" + - "wind runway" + - "prevailing wind" + - "downwind" + collection_prompts: - key: contains_ord_info label: contains ordinance info @@ -64,15 +116,15 @@ text_extraction_prompts: - Do **not** include text that does not pertain to wind energy systems. 3. ## Formatting & Structure ##: - - **Preserve _all_ section titles, headers, and numberings** for reference. - - **Maintain the original wording, formatting, and structure** to ensure accuracy. + - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. 4. ## Output Handling ##: - - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. - Do not add, explain, reword, or summarize anything. - - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** - - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. - - If **no relevant text** is found, return the response: 'No relevant text.' + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' - key: cleaned_text_for_extraction out_fn: "{jurisdiction} Utility Scale Wind Ordinance.txt" @@ -103,14 +155,14 @@ text_extraction_prompts: - Do **not** include text that does not pertain at all to wind energy systems. 3. ## Formatting & Structure ##: - - **Preserve _all_ section titles, headers, and numberings** for reference. - - **Maintain the original wording, formatting, and structure** to ensure accuracy. 
+ - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. 4. ## Output Handling ##: - - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. - Do not add, explain, reword, or summarize anything. - - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** - - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. - - If **no relevant text** is found, return the response: 'No relevant text.' + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' extraction_system_prompt: "You are a legal scholar extracting structured data from wind energy ordinances. Follow all instructions in the schema descriptions carefully. Only extract requirements for large, commercial, utility-scale wind energy systems." diff --git a/examples/one_shot_schema_extraction/plugin_config_simple.json5 b/examples/one_shot_schema_extraction/plugin_config_simple.json5 index 92b4e9260..9295dcc48 100644 --- a/examples/one_shot_schema_extraction/plugin_config_simple.json5 +++ b/examples/one_shot_schema_extraction/plugin_config_simple.json5 @@ -2,10 +2,17 @@ // Always required for one-shot schema extraction plugins "schema": "./wind_schema.json", - // The default value for ``cache_query_templates`` is ``true``, - // but we include it here anyway for completeness and to - // demonstrate that it can be set to ``false`` if desired. - "cache_query_templates": true, + // The default value for ``cache_llm_generated_content`` is + // ``true``, but we include it here anyway for completeness + // and to demonstrate that it can be set to ``false`` if desired. + "cache_llm_generated_content": true, + + // By setting this option to ``true``, we indicate that we would + // like a keyword-based heuristic to be applied, but would like + // to use the LLM to generate heuristic keywords based on the + // extraction schema (instead of providing custom heuristic + // keywords). + "heuristic_keywords": true, // By setting this option to ``true``, we indicate that we would // like a text collection (filter) step, but would like to simply diff --git a/examples/parse_existing_docs/CLI/README.rst b/examples/parse_existing_docs/CLI/README.rst new file mode 100644 index 000000000..77224df3d --- /dev/null +++ b/examples/parse_existing_docs/CLI/README.rst @@ -0,0 +1,91 @@ +********************************* +Parsing Existing Docs via the CLI +********************************* + +If you already have documents that you want to run data extraction on, +you can skip web search and run COMPASS directly against local files. +This example shows the minimal CLI setup for processing local documents. + +Prerequisites +============= +Be sure to go over the +`COMPASS Execution Basics `_ +to understand how to set up a run environment and model run configuration. +You will be re-using the same execution pattern here with an added input to +point COMPASS to your local files. 
+
+Compile Document Info
+=====================
+The key to running COMPASS against local files is compiling information
+about those documents in a form that COMPASS can consume. To do this, we
+need to generate a mapping of jurisdiction codes to lists of document
+metadata dicts, where each dict contains, at minimum, a ``source_fp`` key
+that points to the local file path.
+
+For example, a minimal local document specification would look like this:
+
+.. literalinclude:: local_docs_minimal.json5
+   :language: json5
+
+This mapping can be saved as a config file using any of the formats
+supported by COMPASS (JSON, JSON5, YAML, or TOML).
+
+Since we didn't include any additional metadata beyond the required
+``source_fp``, COMPASS will perform all of the same document processing
+steps that a document retrieved via search would go through, including
+legal text validation and date extraction. To skip some or all of these
+steps, you can include additional metadata fields in the document dicts
+as described in the
+`COMPASS documentation `_.
+Below is an example of a more fully specified document mapping that
+includes multiple documents, each with additional metadata fields to
+skip certain processing steps:
+
+.. literalinclude:: local_docs.json5
+   :language: json5
+
+If you have many local documents, you can also build this mapping
+programmatically; see the sketch at the end of this README.
+
+
+Updating COMPASS Run Config
+===========================
+Once the local document mapping is compiled, you can point COMPASS to it via
+the main run config. You will also need to disable search so that COMPASS
+doesn't attempt to retrieve documents from the web in addition to processing
+your local files. The rest of the config can be set up as a typical COMPASS
+run config with ``out_dir``, ``tech``, and any other relevant settings. Below
+is a simple example:
+
+.. literalinclude:: config.json5
+   :language: json5
+
+.. NOTE::
+   If you are not sure whether your local docs contain the relevant information
+   to be extracted, you can leave the web search enabled and COMPASS will
+   fall back to a web search if no structured data is extracted from the
+   local documents.
+
+Of course, your jurisdiction CSV should still list the jurisdictions you
+would like to process:
+
+.. literalinclude:: jurisdictions.csv
+   :language: text
+
+In this way, you can build up a corpus of local docs, point your config to the
+document mapping, and only ever process the jurisdiction(s) you are interested in.
+
+
+Running COMPASS
+===============
+Once everything is configured, you can execute a model run as described in the
+`COMPASS Execution Basics `_:
+
+.. code-block:: shell
+
+    compass process -c config.json5
+
+If you are using ``pixi``:
+
+.. code-block:: shell
+
+    pixi run compass process -c config.json5
+
+Outputs are written under ``./outputs`` by default.
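+
+
+Generating the Document Mapping Programmatically
+==================================================
+If you have more than a handful of local documents, writing the mapping by
+hand gets tedious. The short sketch below shows one way to build it with
+plain Python. Treat it as an illustration only: the jurisdiction code, the
+directory layout, and the output filename are assumptions made for this
+example, and the ``checksum`` field is optional metadata rather than
+something COMPASS requires.
+
+.. code-block:: python
+
+    import json
+    import hashlib
+    from pathlib import Path
+
+    def sha256_of(path):
+        """Return a 'sha256:<hex>' digest string for a local file."""
+        digest = hashlib.sha256(Path(path).read_bytes()).hexdigest()
+        return f"sha256:{digest}"
+
+    # Map each jurisdiction code to the local PDFs you have for it;
+    # "18031" (Decatur County, IN) is simply the code used in this example
+    docs_on_disk = {"18031": [Path("../Decatur County, Indiana.pdf")]}
+
+    mapping = {
+        code: [
+            {
+                "source_fp": str(fp),
+                # Optional metadata; include it if you want it recorded
+                # alongside the run output
+                "checksum": sha256_of(fp),
+            }
+            for fp in paths
+        ]
+        for code, paths in docs_on_disk.items()
+    }
+
+    # COMPASS accepts JSON, JSON5, YAML, or TOML for this mapping; plain
+    # JSON is the easiest to write from Python
+    Path("local_docs.json").write_text(json.dumps(mapping, indent=2))
+
+Point ``known_local_docs`` in your run config at the file this writes, just as
+shown above for ``local_docs.json5``.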
diff --git a/examples/parse_existing_docs/CLI/config.json5 b/examples/parse_existing_docs/CLI/config.json5 new file mode 100644 index 000000000..d9f0796e2 --- /dev/null +++ b/examples/parse_existing_docs/CLI/config.json5 @@ -0,0 +1,13 @@ +{ + // Same as a typical COMPASS config + "out_dir": "./outputs", + "jurisdiction_fp": "./jurisdictions.csv", + "tech": "wind", + + // NEW: Point to local docs mapping + "known_local_docs": "./local_docs.json5", + + // NEW: Disable web search since we already have local docs + "perform_se_search": false, + "perform_website_search": false +} diff --git a/examples/parse_existing_docs/CLI/jurisdictions.csv b/examples/parse_existing_docs/CLI/jurisdictions.csv new file mode 100644 index 000000000..509b1423a --- /dev/null +++ b/examples/parse_existing_docs/CLI/jurisdictions.csv @@ -0,0 +1,3 @@ +County,State +Decatur,Indiana +Franklin,Indiana diff --git a/examples/parse_existing_docs/CLI/local_docs.json5 b/examples/parse_existing_docs/CLI/local_docs.json5 new file mode 100755 index 000000000..816dcf26a --- /dev/null +++ b/examples/parse_existing_docs/CLI/local_docs.json5 @@ -0,0 +1,25 @@ +{ + "18031": [ + { + "source_fp": "../Decatur County, Indiana.pdf", + "source": "https://decaturcounty.in.gov/download/zoning-ordinance-article-13-wind-energy-conversion-system-wecs?refresh=68ffda0d84a6e1761597965&wpdmdl=6638", + "date": [null, null, null], // [year, month, day] - Skips date extraction if given + "check_if_legal_doc": false, // Skip legal doc check + + // Optional metadata fields - not required but can be helpful for metadata in the run output + "checksum": "sha256:1f68616ac8c4f26ca6cacf85023f210f7a453c002ca9159eb42252470b503386", + "from_ocr": false, + }, + ], + "18047": [ + { + "source_fp": "../Franklin County, Indiana.pdf", + "source": "https://www.franklincounty.in.gov/wp-content/uploads/2023/05/80.06.06-Commercial-and-Intermediate-Energy-Systems.pdf", + "date": [2023, 5, null], // Same as above... + "check_if_legal_doc": false, + + "checksum": "sha256:6ff5f90301ffba6ac4a8dd4d629201fe7f5cbffa7c5ae6fc8951e978d11be1fa", + "from_ocr": false, + } + ], +} \ No newline at end of file diff --git a/examples/parse_existing_docs/CLI/local_docs_minimal.json5 b/examples/parse_existing_docs/CLI/local_docs_minimal.json5 new file mode 100644 index 000000000..6a2becf83 --- /dev/null +++ b/examples/parse_existing_docs/CLI/local_docs_minimal.json5 @@ -0,0 +1,7 @@ +{ + "18031": [ + { + "source_fp": "../Decatur County, Indiana.pdf" + } + ] +} diff --git a/examples/parse_existing_docs/Franklin County, Indiana.pdf b/examples/parse_existing_docs/Franklin County, Indiana.pdf new file mode 100644 index 000000000..043719188 Binary files /dev/null and b/examples/parse_existing_docs/Franklin County, Indiana.pdf differ