diff --git a/docs/config/yaml.md b/docs/config/yaml.md index ae4d42790..2c2ba7eb5 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -171,9 +171,45 @@ Where to put all vectors for the system. Configured for lancedb by default. This - `url` **str** (only for AI Search) - AI Search endpoint - `api_key` **str** (optional - only for AI Search) - The AI Search api key to use. - `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used. -- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default` +- `index_prefix` **str** - (optional) A prefix for the indexes you will create for embeddings. This stores all indexes (tables) for a given dataset ingest. - `database_name` **str** - (cosmosdb only) Name of the database. -- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exist. Default=`True` +- `embeddings_schema` **list[dict[str, str]]** (optional) - Enables customization for each of your embeddings. + - `<supported_embedding>`: + - `index_name` **str**: (optional) - Name for the specific embedding index table. + - `id_field` **str**: (optional) - Field name to be used as id. Default=`id` + - `vector_field` **str**: (optional) - Field name to be used as vector. Default=`vector` + - `vector_size` **int**: (optional) - Vector size for the embeddings. 
Default=`3072` + +The supported embeddings are: + +- `text_unit.text` +- `document.text` +- `entity.title` +- `entity.description` +- `relationship.description` +- `community.title` +- `community.summary` +- `community.full_content` + +For example: + +```yaml +vector_store: + type: lancedb + db_uri: output/lancedb + container_name: christmas-carol + index_prefix: "christmas-carol" + embeddings_schema: + text_unit.text: + index_name: "text-unit-embeddings" + id_field: "id_custom" + vector_field: "vector_custom" + vector_size: 3072 + entity.description: + id_field: "id_custom" + +``` + ## Workflow Configurations diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 6f36169af..999343950 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -402,8 +402,8 @@ class VectorStoreDefaults: type: ClassVar[str] = VectorStoreType.LanceDB.value db_uri: str = str(Path(DEFAULT_OUTPUT_BASE_DIR) / "lancedb") - container_name: str = "default" overwrite: bool = True + index_prefix: None = None url: None = None api_key: None = None audience: None = None diff --git a/graphrag/config/embeddings.py b/graphrag/config/embeddings.py index f15023857..60711b8aa 100644 --- a/graphrag/config/embeddings.py +++ b/graphrag/config/embeddings.py @@ -30,13 +30,13 @@ def create_index_name( - container_name: str, embedding_name: str, validate: bool = True + index_prefix: str, embedding_name: str, validate: bool = True ) -> str: """ Create a index name for the embedding store. Within any given vector store, we can have multiple sets of embeddings organized into projects. - The `container` param is used for this partitioning, and is added as a prefix to the index name for differentiation. + The `index_prefix` param is used for this partitioning, and is added as a prefix to the index name for differentiation. 
The embedding name is fixed, with the available list defined in graphrag.index.config.embeddings @@ -45,4 +45,7 @@ def create_index_name( if validate and embedding_name not in all_embeddings: msg = f"Invalid embedding name: {embedding_name}" raise KeyError(msg) - return f"{container_name}-{embedding_name}".replace(".", "-") + + if index_prefix: + return f"{index_prefix}-{embedding_name}".replace(".", "-") + return embedding_name.replace(".", "-") diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py index 69b05015f..1cbccf74d 100644 --- a/graphrag/config/init_content.py +++ b/graphrag/config/init_content.py @@ -77,7 +77,6 @@ vector_store: type: {vector_store_defaults.type} db_uri: {vector_store_defaults.db_uri} - container_name: {vector_store_defaults.container_name} ### Workflow settings ### diff --git a/graphrag/config/models/vector_store_config.py b/graphrag/config/models/vector_store_config.py index ac00679e4..c2b3e61de 100644 --- a/graphrag/config/models/vector_store_config.py +++ b/graphrag/config/models/vector_store_config.py @@ -72,9 +72,9 @@ def _validate_url(self) -> None: default=vector_store_defaults.audience, ) - container_name: str = Field( - description="The container name to use.", - default=vector_store_defaults.container_name, + index_prefix: str | None = Field( + description="The index prefix to use.", + default=vector_store_defaults.index_prefix, ) database_name: str | None = Field( diff --git a/graphrag/index/workflows/generate_text_embeddings.py b/graphrag/index/workflows/generate_text_embeddings.py index c15ff3e07..5c20bce45 100644 --- a/graphrag/index/workflows/generate_text_embeddings.py +++ b/graphrag/index/workflows/generate_text_embeddings.py @@ -255,6 +255,14 @@ def _create_vector_store( else: single_embedding_config = raw_config + if ( + single_embedding_config.index_name is not None + and vector_store_config.index_prefix + ): + single_embedding_config.index_name = ( + 
f"{vector_store_config.index_prefix}-{single_embedding_config.index_name}" + ) + if single_embedding_config.index_name is None: single_embedding_config.index_name = index_name @@ -270,9 +278,9 @@ def _create_vector_store( def _get_index_name(vector_store_config: VectorStoreConfig, embedding_name: str) -> str: - container_name = vector_store_config.container_name - index_name = create_index_name(container_name, embedding_name) + index_prefix = vector_store_config.index_prefix or "" + index_name = create_index_name(index_prefix, embedding_name) - msg = f"using vector store {vector_store_config.type} with container_name {container_name} for embedding {embedding_name}: {index_name}" + msg = f"using vector store {vector_store_config.type} with index prefix {index_prefix} for embedding {embedding_name}: {index_name}" logger.info(msg) return index_name diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index ecd489a51..9769cf99b 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -114,7 +114,7 @@ def assert_vector_store_configs( assert actual.url == expected.url assert actual.api_key == expected.api_key assert actual.audience == expected.audience - assert actual.container_name == expected.container_name + assert actual.index_prefix == expected.index_prefix assert actual.database_name == expected.database_name