Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions docs/config/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,45 @@ Where to put all vectors for the system. Configured for lancedb by default. This
- `url` **str** (only for AI Search) - AI Search endpoint
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
- `index_prefix` **str** - (optional) A prefix for the indexes you will create for embeddings. When set, it is prepended to the name of every index (table) created for a given dataset ingest.
- `database_name` **str** - (cosmosdb only) Name of the database.
- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
- `embeddings_schema` **list[dict[str, str]]** (optional) - Enables customization for each of your embeddings.
- `<supported_embedding>`:
- `index_name` **str**: (optional) - Name for the specific embedding index table.
- `id_field` **str**: (optional) - Field name to be used as id. Default=`id`
- `vector_field` **str**: (optional) - Field name to be used as vector. Default=`vector`
- `vector_size` **int**: (optional) - Vector size for the embeddings. Default=`3072`

The supported embeddings are:

- `text_unit.text`
- `document.text`
- `entity.title`
- `entity.description`
- `relationship.description`
- `community.title`
- `community.summary`
- `community.full_content`

For example:

```yaml
vector_store:
type: lancedb
db_uri: output/lancedb
container_name: christmas-carol
index_prefix: "christmas-carol"
embeddings_schema:
text_unit.text:
index_name: "text-unit-embeddings"
id_field: "id_custom"
vector_field: "vector_custom"
vector_size: 3072
entity.description:
id_field: "id_custom"

```


## Workflow Configurations

Expand Down
2 changes: 1 addition & 1 deletion graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,8 @@ class VectorStoreDefaults:

type: ClassVar[str] = VectorStoreType.LanceDB.value
db_uri: str = str(Path(DEFAULT_OUTPUT_BASE_DIR) / "lancedb")
container_name: str = "default"
overwrite: bool = True
index_prefix: None = None
url: None = None
api_key: None = None
audience: None = None
Expand Down
9 changes: 6 additions & 3 deletions graphrag/config/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@


def create_index_name(
container_name: str, embedding_name: str, validate: bool = True
index_prefix: str, embedding_name: str, validate: bool = True
) -> str:
"""
Create an index name for the embedding store.

Within any given vector store, we can have multiple sets of embeddings organized into projects.
The `container` param is used for this partitioning, and is added as a prefix to the index name for differentiation.
The optional `index_prefix` param is used for this partitioning and, when non-empty, is added as a prefix to the index name for differentiation.

The embedding name is fixed, with the available list defined in graphrag.index.config.embeddings

Expand All @@ -45,4 +45,7 @@ def create_index_name(
if validate and embedding_name not in all_embeddings:
msg = f"Invalid embedding name: {embedding_name}"
raise KeyError(msg)
return f"{container_name}-{embedding_name}".replace(".", "-")

if index_prefix:
return f"{index_prefix}-{embedding_name}".replace(".", "-")
return embedding_name.replace(".", "-")
2 changes: 1 addition & 1 deletion graphrag/config/init_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
vector_store:
type: {vector_store_defaults.type}
db_uri: {vector_store_defaults.db_uri}
container_name: {vector_store_defaults.container_name}
index_prefix: {vector_store_defaults.index_prefix}
### Workflow settings ###
Expand Down
6 changes: 3 additions & 3 deletions graphrag/config/models/vector_store_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def _validate_url(self) -> None:
default=vector_store_defaults.audience,
)

container_name: str = Field(
description="The container name to use.",
default=vector_store_defaults.container_name,
index_prefix: str | None = Field(
description="The index prefix to use.",
default=vector_store_defaults.index_prefix,
)

database_name: str | None = Field(
Expand Down
14 changes: 11 additions & 3 deletions graphrag/index/workflows/generate_text_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,14 @@ def _create_vector_store(
else:
single_embedding_config = raw_config

if (
single_embedding_config.index_name is not None
and vector_store_config.index_prefix
):
single_embedding_config.index_name = (
f"{vector_store_config.index_prefix}-{single_embedding_config.index_name}"
)

if single_embedding_config.index_name is None:
single_embedding_config.index_name = index_name

Expand All @@ -270,9 +278,9 @@ def _create_vector_store(


def _get_index_name(vector_store_config: VectorStoreConfig, embedding_name: str) -> str:
container_name = vector_store_config.container_name
index_name = create_index_name(container_name, embedding_name)
index_prefix = vector_store_config.index_prefix or ""
index_name = create_index_name(index_prefix, embedding_name)

msg = f"using vector store {vector_store_config.type} with container_name {container_name} for embedding {embedding_name}: {index_name}"
msg = f"using vector store {vector_store_config.type} with index prefix {index_prefix} for embedding {embedding_name}: {index_name}"
logger.info(msg)
return index_name
2 changes: 1 addition & 1 deletion tests/unit/config/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def assert_vector_store_configs(
assert actual.url == expected.url
assert actual.api_key == expected.api_key
assert actual.audience == expected.audience
assert actual.container_name == expected.container_name
assert actual.index_prefix == expected.index_prefix
assert actual.database_name == expected.database_name


Expand Down
Loading