Skip to content
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
"devDependencies": {
"ajv": "^8.17.1",
"bunchee": "5.3.2",
"change-case": "^5.4.4",
"natural": "^8.0.1"
},
"dependencies": {
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/schema/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export * from "./node";
export { DATA_KEY, TYPE_KEY, fromDocStore, type DocJson } from "./python";
export { FileReader, TransformComponent, type BaseReader } from "./type";
export { EngineResponse } from "./type/engine–response";
export * from "./zod";
32 changes: 11 additions & 21 deletions packages/core/src/schema/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ export enum NodeRelationship {
}

export enum ObjectType {
TEXT = "TEXT",
IMAGE = "IMAGE",
INDEX = "INDEX",
DOCUMENT = "DOCUMENT",
IMAGE_DOCUMENT = "IMAGE_DOCUMENT",
TEXT = "1",
IMAGE = "2",
INDEX = "3",
DOCUMENT = "4",
IMAGE_DOCUMENT = "5", // Python side doesn't have this enum
}

export enum MetadataMode {
Expand Down Expand Up @@ -76,7 +76,6 @@ export abstract class BaseNode<T extends Metadata = Metadata> {
excludedEmbedMetadataKeys,
excludedLlmMetadataKeys,
relationships,
hash,
embedding,
} = init || {};
this.id_ = id_ ?? randomUUID();
Expand Down Expand Up @@ -177,13 +176,12 @@ export abstract class BaseNode<T extends Metadata = Metadata> {
return {
...this,
type: this.type,
// hash is an accessor property, so it's not included in the rest operator
hash: this.hash,
// no `hash` here to align with Python side
};
}

clone(): BaseNode {
return jsonToNode(this.toMutableJSON()) as BaseNode;
return jsonToNode(this.toMutableJSON(), this.type);
}

/**
Expand Down Expand Up @@ -224,27 +222,19 @@ export class TextNode<T extends Metadata = Metadata> extends BaseNode<T> {
init;
this.text = text ?? "";
this.textTemplate = textTemplate ?? "";
if (startCharIdx) {
if (startCharIdx !== undefined) {
this.startCharIdx = startCharIdx;
}
if (endCharIdx) {
if (endCharIdx !== undefined) {
this.endCharIdx = endCharIdx;
}
this.metadataSeparator = metadataSeparator ?? "\n";
}

/**
* Generate a hash of the text node.
* The ID is not part of the hash as it can change independent of content.
* @returns
*/
generateHash() {
const hashFunction = createSHA256();
hashFunction.update(`type=${this.type}`);
hashFunction.update(
`startCharIdx=${this.startCharIdx} endCharIdx=${this.endCharIdx}`,
);
hashFunction.update(this.getContent(MetadataMode.ALL));
const docIdentity = this.text + JSON.stringify(this.metadata);
hashFunction.update(docIdentity);
return hashFunction.digest();
}

Expand Down
72 changes: 72 additions & 0 deletions packages/core/src/schema/python.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/**
* Python adapter for the schema.
*/
import { jsonToNode, ObjectType } from "./node";

export const TYPE_KEY = "__type__";
export const DATA_KEY = "__data__";

/**
 * Builds a new object whose top-level keys are the camelCase form of the keys
 * of `json`.
 *
 * Only the first level is renamed: every value is carried over by reference,
 * so nested objects keep their original key casing. Underscores are handed to
 * `change-case` as suffix characters, so a key such as `id_` keeps its
 * trailing underscore while `node_id` becomes `nodeId`.
 *
 * @param json - object whose keys should be renamed
 * @returns a fresh object with camelCased keys and the original values
 */
async function camelCaseJson(json: Record<string, any>) {
  // `change-case` is loaded on demand through a dynamic import.
  const changeCase = await import("change-case");
  const renamed: Record<string, any> = {};
  for (const [originalKey, originalValue] of Object.entries(json)) {
    const camelKey = changeCase.camelCase(originalKey, {
      suffixCharacters: "_",
    });
    renamed[camelKey] = originalValue;
  }
  return renamed;
}

/**
 * Lookup from the numeric-string type tags ("1"–"4") to the matching
 * `ObjectType` member. `fromDocStore` consults this table first when resolving
 * the `__type__` tag of a stored document.
 *
 * NOTE(review): there is no entry for `ObjectType.IMAGE_DOCUMENT`, so a tag
 * equal to that member's value is rejected by `fromDocStore` — confirm this is
 * intended for documents written by the JS side.
 */
const PYTHON_TO_JS_TYPE_MAP = {
"1": ObjectType.TEXT,
"2": ObjectType.IMAGE,
"3": ObjectType.INDEX,
"4": ObjectType.DOCUMENT,
};

/**
 * Fallback lookup for type tags spelled as enum member names ("TEXT",
 * "DOCUMENT", ...). `fromDocStore` uses it when the tag is not found in
 * `PYTHON_TO_JS_TYPE_MAP`.
 *
 * Presumably these are the tags written before `ObjectType` values became
 * numeric strings — TODO confirm.
 *
 * NOTE(review): `IMAGE_DOCUMENT` resolves to `ObjectType.DOCUMENT`, not
 * `ObjectType.IMAGE_DOCUMENT` — confirm the downgrade is deliberate.
 */
const LEGACY_JS_MAP = {
TEXT: ObjectType.TEXT,
IMAGE: ObjectType.IMAGE,
INDEX: ObjectType.INDEX,
DOCUMENT: ObjectType.DOCUMENT,
IMAGE_DOCUMENT: ObjectType.DOCUMENT,
};

/**
 * Serialized form of a node as accepted by `fromDocStore`.
 *
 * - `__type__`: type tag, looked up in `PYTHON_TO_JS_TYPE_MAP` and then
 *   `LEGACY_JS_MAP`.
 * - `__data__`: the node's fields as a JSON-encoded string (it is passed to
 *   `JSON.parse`, not used as an object).
 */
export type DocJson = {
[TYPE_KEY]: string;
[DATA_KEY]: string;
};

/**
 * Converts a parsed node payload to camelCase keys, including the entries of
 * its `relationships` map.
 *
 * The top-level keys are renamed via `camelCaseJson`. Each value inside
 * `relationships` is then renamed as well: a single object is converted
 * directly, and an array is converted element by element. Other values are
 * left untouched.
 *
 * Note that `camelCaseJson` copies values by reference, so the
 * `relationships` object of the input is updated in place.
 *
 * @param data - parsed JSON payload of a stored node
 * @returns the payload with camelCased keys
 */
async function fromImpl(data: Record<string, unknown>) {
  const convertedJson = await camelCaseJson(data);
  if (convertedJson.relationships) {
    for (const [key, value] of Object.entries(convertedJson.relationships)) {
      // Arrays must be tested first: `typeof [] === "object"`, so checking the
      // object case first would turn a list of related nodes into a single
      // index-keyed object and never reach the array branch.
      if (Array.isArray(value)) {
        convertedJson.relationships[key] = await Promise.all(
          value.map((v) =>
            // Only object elements have keys to rename; pass anything else
            // through unchanged instead of exploding it into an object.
            typeof v === "object" && v !== null ? camelCaseJson(v) : v,
          ),
        );
      } else if (typeof value === "object" && value !== null) {
        convertedJson.relationships[key] = await camelCaseJson(value);
      }
    }
  }
  return convertedJson;
}

/**
 * Rebuilds a node from its serialized doc-store form.
 *
 * The `__type__` tag is resolved through `PYTHON_TO_JS_TYPE_MAP` first and
 * `LEGACY_JS_MAP` second. The `__data__` string is parsed as JSON, its keys
 * are converted to camelCase, and the result is handed to `jsonToNode`
 * together with the resolved type.
 *
 * @param param0 - serialized document with `__type__` and `__data__` entries
 * @returns the node produced by `jsonToNode`
 * @throws Error when the type tag is not a key of either lookup table
 * @throws SyntaxError when `__data__` is not valid JSON (from `JSON.parse`)
 */
export async function fromDocStore({
  [TYPE_KEY]: type,
  [DATA_KEY]: data,
}: DocJson) {
  // Own-property checks only: the `in` operator also matches inherited names
  // such as "toString" or "constructor", which would let a bogus tag through
  // and pass a function to `jsonToNode` as the object type.
  const hasOwnKey = (table: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(table, key);

  const objectType = hasOwnKey(PYTHON_TO_JS_TYPE_MAP, type)
    ? PYTHON_TO_JS_TYPE_MAP[type as keyof typeof PYTHON_TO_JS_TYPE_MAP]
    : hasOwnKey(LEGACY_JS_MAP, type)
      ? LEGACY_JS_MAP[type as keyof typeof LEGACY_JS_MAP]
      : undefined;
  if (objectType === undefined) {
    throw new Error(`Invalid type: ${type}`);
  }

  const convertedJson = await fromImpl(JSON.parse(data));
  return jsonToNode(convertedJson, objectType);
}
9 changes: 6 additions & 3 deletions packages/core/tests/decorator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ describe("chunkSizeCheck", () => {
env.ENABLE_CHUNK_SIZE_CHECK = "true";

let message = "";
const consoleMock = vi
.spyOn(console, "warn")
.mockImplementation((msg) => (message += msg + "\n"));
vi.spyOn(console, "warn").mockImplementation(
(msg) => (message += msg + "\n"),
);

Settings.chunkSize = 0;

const node = new TextNode();
expect(message).toEqual("");
node.setContent("a".repeat(1024));
expect(message).toBe("");
node.getContent();
expect(message).toContain("is larger than chunk size");
});
});
77 changes: 72 additions & 5 deletions packages/core/tests/schema-node.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,74 @@
import { Document, TextNode } from "@llamaindex/core/schema";
import {
Document,
ObjectType,
TextNode,
// The schema entry point re-exports the adapter as `fromDocStore`; alias it
// to the name this test calls.
fromDocStore as fromPythonDocStore,
} from "@llamaindex/core/schema";
import { beforeEach, describe, expect, test } from "vitest";

describe("Python", () => {
test("from python doc store", async () => {
const node = await fromPythonDocStore({
__data__: JSON.stringify({
id_: "e86be4a7-2ad0-4c3c-937b-3140f562e7a7",
embedding: null,
metadata: {},
excluded_embed_metadata_keys: [],
excluded_llm_metadata_keys: [],
relationships: {
"1": {
node_id: "e1fe8fd0-f470-40cd-bc2e-be3a220cef94",
node_type: "4",
metadata: {},
hash: "191a8fdcf068d3ac831da23cde07a92efe1432243c7f628d1009aa2ecdf6cb03",
class_name: "RelatedNodeInfo",
},
},
text: "This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.",
mimetype: "text/plain",
start_char_idx: 0,
end_char_idx: 1599,
text_template: "{metadata_str}\n\n{content}",
metadata_template: "{key}: {value}",
metadata_seperator: "\n",
class_name: "TextNode",
}),
__type__: "1",
});
expect(node.startCharIdx).toBe(0);
expect(node.endCharIdx).toBe(1599);
expect(node).toMatchInlineSnapshot(`
{
"embedding": null,
"endCharIdx": 1599,
"excludedEmbedMetadataKeys": [],
"excludedLlmMetadataKeys": [],
"id_": "e86be4a7-2ad0-4c3c-937b-3140f562e7a7",
"metadata": {},
"metadataSeparator": "
",
"relationships": {
"1": {
"className": "RelatedNodeInfo",
"hash": "191a8fdcf068d3ac831da23cde07a92efe1432243c7f628d1009aa2ecdf6cb03",
"metadata": {},
"nodeId": "e1fe8fd0-f470-40cd-bc2e-be3a220cef94",
"nodeType": "4",
},
},
"startCharIdx": 0,
"text": "This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.",
"textTemplate": "{metadata_str}

{content}",
"type": "1",
}
`);
expect(node.id_).toBe("e86be4a7-2ad0-4c3c-937b-3140f562e7a7");
expect(node.type).toBe(ObjectType.TEXT);
});
});

describe("Document", () => {
let document: Document;

Expand All @@ -10,7 +78,7 @@ describe("Document", () => {

test("should generate a hash", () => {
expect(document.hash).toMatchInlineSnapshot(
`"1mkNkQC30mZlBBG48DNuG2WSKcTQ32DImC+4JUoVijg="`,
`"oznYDHYUGHArYnhRy9lj63IvEt/rNg1EH5EjwtPU/Pc="`,
);
});

Expand All @@ -30,7 +98,7 @@ describe("TextNode", () => {

test("should generate a hash", () => {
expect(node.hash).toMatchInlineSnapshot(
`"nTSKdUTYqR52MPv/brvb4RTGeqedTEqG9QN8KSAj2Do="`,
`"oznYDHYUGHArYnhRy9lj63IvEt/rNg1EH5EjwtPU/Pc="`,
);
});

Expand All @@ -52,7 +120,6 @@ describe("TextNode", () => {
"embedding": undefined,
"excludedEmbedMetadataKeys": [],
"excludedLlmMetadataKeys": [],
"hash": "Z6SWgFPlalaeblMGQGw0KS3qKgmZdEWXKfzEp/K+QN0=",
"id_": Any<String>,
"metadata": {
"something": 1,
Expand All @@ -63,7 +130,7 @@ describe("TextNode", () => {
"relationships": {},
"text": "Hello World",
"textTemplate": "",
"type": "TEXT",
"type": "1",
}
`,
);
Expand Down
14 changes: 10 additions & 4 deletions packages/llamaindex/src/ingestion/IngestionCache.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import type { BaseNode, TransformComponent } from "@llamaindex/core/schema";
import { MetadataMode } from "@llamaindex/core/schema";
import {
type BaseNode,
fromDocStore,
MetadataMode,
type TransformComponent,
} from "@llamaindex/core/schema";
import { createSHA256 } from "@llamaindex/env";
import { docToJson, jsonToDoc } from "../storage/docStore/utils.js";
import { docToJson } from "../storage/docStore/utils.js";
import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js";
import type { BaseKVStore } from "../storage/kvStore/types.js";

Expand Down Expand Up @@ -63,6 +67,8 @@ export class IngestionCache {
if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) {
return undefined;
}
return json[this.nodesKey].map((doc: any) => jsonToDoc(doc));
return Promise.all(
json[this.nodesKey].map((doc: any) => fromDocStore(doc)),
);
}
}
13 changes: 8 additions & 5 deletions packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import type { BaseNode } from "@llamaindex/core/schema";
import { ObjectType } from "@llamaindex/core/schema";
import {
type BaseNode,
fromDocStore,
ObjectType,
} from "@llamaindex/core/schema";
import _ from "lodash";
import { DEFAULT_NAMESPACE } from "../constants.js";
import type { BaseKVStore } from "../kvStore/types.js";
import type { RefDocInfo } from "./types.js";
import { BaseDocumentStore } from "./types.js";
import { docToJson, isValidDocJson, jsonToDoc } from "./utils.js";
import { docToJson, isValidDocJson } from "./utils.js";

type DocMetaData = { docHash: string; refDocId?: string };

Expand All @@ -29,7 +32,7 @@ export class KVDocumentStore extends BaseDocumentStore {
for (const key in jsonDict) {
const value = jsonDict[key];
if (isValidDocJson(value)) {
docs[key] = jsonToDoc(value);
docs[key] = await fromDocStore(value);
} else {
console.warn(`Invalid JSON for docId ${key}`);
}
Expand Down Expand Up @@ -94,7 +97,7 @@ export class KVDocumentStore extends BaseDocumentStore {
if (!isValidDocJson(json)) {
throw new Error(`Invalid JSON for docId ${docId}`);
}
return jsonToDoc(json);
return fromDocStore(json);
}

async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> {
Expand Down
30 changes: 1 addition & 29 deletions packages/llamaindex/src/storage/docStore/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { BaseNode } from "@llamaindex/core/schema";
import { Document, ObjectType, TextNode } from "@llamaindex/core/schema";
import { ObjectType } from "@llamaindex/core/schema";

const TYPE_KEY = "__type__";
const DATA_KEY = "__data__";
Expand All @@ -24,31 +24,3 @@ export function docToJson(doc: BaseNode): DocJson {
[TYPE_KEY]: doc.type,
};
}

/**
 * Rebuilds a node from its serialized doc-store representation.
 *
 * The `__data__` entry is parsed as JSON and the `__type__` entry selects the
 * class to instantiate: `Document` for `ObjectType.DOCUMENT`, `TextNode` for
 * `ObjectType.TEXT`.
 *
 * @param docDict - serialized document
 * @returns the reconstructed `Document` or `TextNode`
 * @throws Error when the type tag is neither DOCUMENT nor TEXT
 */
export function jsonToDoc(docDict: DocJson): BaseNode {
  const kind = docDict[TYPE_KEY];
  // Parse before dispatching so malformed payloads fail first, regardless of type.
  const fields = JSON.parse(docDict[DATA_KEY]);

  switch (kind) {
    case ObjectType.DOCUMENT:
      return new Document({
        text: fields.text,
        id_: fields.id_,
        embedding: fields.embedding,
        hash: fields.hash,
        metadata: fields.metadata,
      });
    case ObjectType.TEXT:
      return new TextNode({
        text: fields.text,
        id_: fields.id_,
        hash: fields.hash,
        metadata: fields.metadata,
        relationships: fields.relationships,
      });
    default:
      throw new Error(`Unknown doc type: ${kind}`);
  }
}
Loading