Skip to content

Commit b916a18

Browse files
committed
Implement custom chunk strategy for yaml text
1 parent 67dc21c commit b916a18

File tree

4 files changed

+163
-2
lines changed

4 files changed

+163
-2
lines changed

src/chunker.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import {
2+
ChunkResult,
3+
} from "@magda/semantic-indexer-sdk";
4+
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
5+
6+
/**
 * Splits YAML-style embedding text into overlapping chunks.
 *
 * Splitting is delegated to a `RecursiveCharacterTextSplitter` configured to
 * break preferentially before the known section headings ("Columns",
 * "Description", "Title", ...), then on newlines, spaces and finally single
 * characters. Separators are kept and whitespace is not stripped, so the
 * chunk texts remain verbatim slices of the input.
 */
export class Chunker {
    /**
     * Factor applied to both constructor arguments before they are handed to
     * the character-based splitter.
     * NOTE(review): presumably an approximate characters-per-token ratio —
     * confirm against the caller's units.
     */
    private static readonly SIZE_FACTOR = 4;

    private readonly chunkSize: number;
    private readonly overlap: number;
    private readonly splitter: RecursiveCharacterTextSplitter;

    /**
     * @param chunkSize Maximum chunk size, before scaling by the size factor.
     * @param overlap Overlap between consecutive chunks, before scaling.
     * @throws Error when `overlap >= chunkSize`, when `chunkSize` is not a
     *   positive finite number, or when `overlap` is not a non-negative
     *   finite number.
     */
    constructor(chunkSize: number, overlap: number) {
        // Checked first so every input rejected historically keeps its message.
        if (overlap >= chunkSize) {
            throw new Error("Overlap must be smaller than chunk size");
        }
        // NaN and non-positive sizes slip through the comparison above and
        // would otherwise reach the splitter as nonsensical limits.
        if (!Number.isFinite(chunkSize) || chunkSize <= 0) {
            throw new Error("Chunk size must be a positive finite number");
        }
        if (!Number.isFinite(overlap) || overlap < 0) {
            throw new Error("Overlap must be a non-negative finite number");
        }

        this.chunkSize = chunkSize * Chunker.SIZE_FACTOR;
        this.overlap = overlap * Chunker.SIZE_FACTOR;
        this.splitter = new RecursiveCharacterTextSplitter({
            separators: [
                "\nColumns",
                "\nDescription",
                "\nTitle",
                "\nKeywords",
                "\nThemes",
                "\nTemporal coverage",
                "\nLanguages",
                "\nPublisher",
                "\n",
                " ",
                ""
            ],
            chunkSize: this.chunkSize,
            chunkOverlap: this.overlap,
            keepSeparator: true,
            stripWhitespace: false,
        });
    }

    /**
     * Chunks `text`.
     *
     * @param text Text to split.
     * @returns An empty array for empty input; a single chunk covering the
     *   whole text when it fits within the scaled chunk size; otherwise the
     *   splitter's chunks, with whitespace-only chunks that carry no overlap
     *   folded into the chunk before them.
     */
    async chunk(text: string): Promise<ChunkResult[]> {
        if (!text) {
            return [];
        }

        if (text.length <= this.chunkSize) {
            return [{
                text: text,
                position: 0,
                length: text.length,
                overlap: 0
            }];
        }

        const pieces = await this.splitter.splitTextWithMetadata(text);

        // Single pass: a blank, overlap-free piece is appended to the last
        // kept chunk instead of being emitted on its own. The first piece is
        // always kept because there is nothing before it to absorb it.
        const merged: ChunkResult[] = [];
        for (const piece of pieces) {
            const previous = merged.length > 0 ? merged[merged.length - 1] : undefined;
            if (previous !== undefined && piece.overlap === 0 && piece.text.trim() === '') {
                previous.text = previous.text + piece.text;
                previous.length = previous.length + piece.length;
            } else {
                merged.push(piece);
            }
        }

        return merged;
    }
}

src/createEmbeddingText.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,14 @@ export const createEmbeddingText: CreateEmbeddingText = async ({
109109
const yamlText = toYaml({
110110
Title: title || undefined,
111111
Format: format || "CSV",
112-
Description: description || undefined,
113112
"File name": fileName || undefined,
114113
"Temporal coverage": temporalCoverage || undefined,
115114
Licence: licence || undefined,
116115
Publisher: publisher || undefined,
117116
Themes: themes || undefined,
118117
Keywords: keywords || undefined,
119118
Languages: languages || undefined,
119+
Description: description || undefined,
120120
Columns: columns || undefined
121121
});
122122

src/index.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,29 @@
11
import semanticIndexer, {
22
SemanticIndexerOptions,
33
commonYargs,
4+
ChunkStrategyType,
45
} from "@magda/semantic-indexer-sdk";
56
import { csvSemanticIndexerArgs } from "./csvSemanticIndexerArgs.js";
67
import { createEmbeddingText } from "./createEmbeddingText.js";
8+
import { Chunker } from "./chunker.js";
79

810
const port = csvSemanticIndexerArgs.port;
911
const args = commonYargs(port, `http://localhost:${port}`);
1012

13+
const chunker = new Chunker(csvSemanticIndexerArgs.chunkSizeLimit, csvSemanticIndexerArgs.overlap);
14+
15+
// Adapter that exposes the module-level Chunker instance through the
// chunk-strategy callback type expected by the semantic indexer options.
const chunkStrategy: ChunkStrategyType = async (text: string) => chunker.chunk(text);
18+
1119
const options: SemanticIndexerOptions = {
1220
argv: args,
1321
id: csvSemanticIndexerArgs.id,
1422
itemType: "storageObject",
1523
formatTypes: ["csv"],
1624
autoDownloadFile: true,
17-
createEmbeddingText: createEmbeddingText
25+
createEmbeddingText: createEmbeddingText,
26+
chunkStrategy: chunkStrategy,
1827
};
1928

2029
semanticIndexer(options);

src/test/chunker.spec.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import { expect } from "chai";
2+
import { Chunker } from "../chunker.js";
3+
4+
describe("Chunker", () => {
    /** Minimal structural view of a chunk used by the shared assertions. */
    type ChunkLike = { text: string; position: number; length: number; overlap: number };

    /**
     * Asserts the invariants every multi-chunk result must satisfy:
     * the result is non-empty, each chunk's `length` matches its text, each
     * chunk is the exact slice of the source at `position`, and dropping each
     * chunk's overlapping prefix then concatenating rebuilds the source.
     */
    const expectChunksToRebuild = (source: string, pieces: ReadonlyArray<ChunkLike>): void => {
        expect(pieces).to.not.be.empty;

        for (const piece of pieces) {
            expect(piece.length).to.equal(piece.text.length);
            expect(source.slice(piece.position, piece.position + piece.length))
                .to.equal(piece.text);
        }

        const rebuilt = pieces
            .map((piece) => piece.text.slice(piece.overlap))
            .join('');
        expect(rebuilt).to.equal(source);
    };

    it("should chunk yaml text with proper overlapping", async () => {
        const source = `Title: Dataset Title
Publisher: Example Publisher
Themes: [Environment, Climate]
Keywords: [weather, rainfall]
Languages: [en, zh]
Description: This is a long description for testing chunker logic
Columns:
- column1
- column2
- column3
- column4
- column5
`;
        const pieces = await new Chunker(10, 3).chunk(source);

        expectChunksToRebuild(source, pieces);
    });

    it("should chunk yaml text with newline characters in description", async () => {
        const source = `
Title: Another Dataset
Description: This is a very long description\nthat will be chunked into\nmultiple chunks.
Publisher: Test Publisher
Themes: [Data, Analysis]
Columns:
- column1
- column2
- column3
- column4
- column5
`;
        const pieces = await new Chunker(40, 8).chunk(source);

        expectChunksToRebuild(source, pieces);
    });

    it("should handle text smaller than chunk size", async () => {
        const source = `Title: Small Dataset`;
        const pieces = await new Chunker(50, 5).chunk(source);

        // Short input is returned as one chunk covering the whole text.
        const expected = [{
            text: source,
            length: source.length,
            position: 0,
            overlap: 0
        }];
        expect(pieces).to.deep.equal(expected);
    });

    it("should throw error when overlap >= chunk size", () => {
        const construct = () => new Chunker(20, 25);
        expect(construct).to.throw("Overlap must be smaller than chunk size");
    });

    it("should handle empty text", async () => {
        const pieces = await new Chunker(50, 5).chunk("");
        expect(pieces).to.be.empty;
    });
});

0 commit comments

Comments
 (0)