Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ env:
WEAVIATE_134: 1.34.20
WEAVIATE_135: 1.35.16
WEAVIATE_136: 1.36.10
WEAVIATE_137: 1.37.0-rc.1
WEAVIATE_137: 1.37.2

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down
7 changes: 7 additions & 0 deletions src/collections/config/types/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ export type Stopwords = {
removals: string[];
};

/**
 * User-facing text analyzer settings that can be attached to a property via
 * its `textAnalyzer` field.
 */
export type TextAnalyzerConfig = {
/**
 * ASCII folding setting. Either a plain boolean, or an object carrying an
 * `ignore` list of strings. The object form is translated to
 * `asciiFold: true` plus `asciiFoldIgnore: ignore` when sent to the server.
 */
asciiFold?: boolean | { ignore: string[] };
/**
 * Name of the stopword preset to use: a `StopwordsPreset` value or any other
 * string (presumably the name of a user-defined entry in the collection's
 * `invertedIndex.stopwordPresets` — confirm against the server docs).
 */
stopwordPreset?: StopwordsPreset | string;
};

export type InvertedIndexConfig = {
bm25: {
k1: number;
Expand All @@ -45,6 +50,7 @@ export type InvertedIndexConfig = {
indexPropertyLength: boolean;
indexNullState: boolean;
stopwords: Stopwords;
stopwordPresets?: { [presetName: string]: string[] };
};

export type ObjectTTLConfig = {
Expand Down Expand Up @@ -95,6 +101,7 @@ export type PropertyConfig = {
nestedProperties?: PropertyConfig[];
tokenization: Tokenization | string;
vectorizerConfig?: PropertyVectorizerConfig;
textAnalyzer?: TextAnalyzerConfig;
};

export type ReferenceConfig = {
Expand Down
50 changes: 49 additions & 1 deletion src/collections/config/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import {
RerankerConfig,
SQConfig,
ShardingConfig,
TextAnalyzerConfig,
VectorConfig,
VectorDistance,
VectorIndexConfigDynamic,
Expand All @@ -63,6 +64,49 @@ import {
VectorizerConfig,
} from './types/index.js';

/**
 * Translates the user-facing `TextAnalyzerConfig` into the flat wire shape
 * Weaviate's REST API expects (`asciiFold: boolean`,
 * `asciiFoldIgnore: string[]`, `stopwordPreset: string`).
 *
 * The ergonomic `asciiFold: boolean | { ignore: string[] }` union is flattened
 * as follows:
 * - a boolean is forwarded unchanged;
 * - `{ ignore }` becomes `asciiFold: true` plus `asciiFoldIgnore: ignore`;
 * - a missing (or `null`) `asciiFold` leaves both wire fields unset.
 *
 * `stopwordPreset` is copied through as-is; the key is always present on the
 * result and is `undefined` when no preset was supplied.
 *
 * @param config The user-facing analyzer configuration, if any.
 * @returns The flat wire object, or `undefined` when `config` is `null` or `undefined`.
 */
export const textAnalyzerConfigToWire = (
  config?: TextAnalyzerConfig
): { asciiFold?: boolean; asciiFoldIgnore?: string[]; stopwordPreset?: string } | undefined => {
  if (config == undefined) return undefined;
  const out: { asciiFold?: boolean; asciiFoldIgnore?: string[]; stopwordPreset?: string } = {
    stopwordPreset: config.stopwordPreset,
  };
  const { asciiFold } = config;
  if (typeof asciiFold === 'boolean') {
    out.asciiFold = asciiFold;
  } else if (typeof asciiFold === 'object' && asciiFold !== null) {
    // `typeof null` is 'object' too, so the explicit null check keeps a `null`
    // coming from untyped callers from crashing on `.ignore`.
    out.asciiFold = true;
    out.asciiFoldIgnore = asciiFold.ignore;
  }
  return out;
};

/**
 * Converts the flat analyzer shape returned by the server back into the
 * user-facing `TextAnalyzerConfig`, so a value passed to
 * `client.collections.create({...})` reads back the same way from
 * `collection.config.get()`.
 *
 * A non-empty `asciiFoldIgnore` takes precedence and yields the
 * `{ ignore: [...] }` object form; otherwise a boolean `asciiFold` is passed
 * through unchanged.
 *
 * @param wire The analyzer object as received from the server, if any.
 * @returns The user-facing config, or `undefined` when `wire` is absent or
 *   carries none of the recognised fields.
 */
export const textAnalyzerConfigFromWire = (wire?: {
  asciiFold?: boolean;
  asciiFoldIgnore?: string[];
  stopwordPreset?: string;
}): TextAnalyzerConfig | undefined => {
  if (wire == null) {
    return undefined;
  }

  const { asciiFold, asciiFoldIgnore, stopwordPreset } = wire;
  const result: TextAnalyzerConfig = {};

  if (stopwordPreset != null) {
    result.stopwordPreset = stopwordPreset;
  }

  // Only a populated ignore list selects the object form of `asciiFold`.
  const ignoreList = asciiFoldIgnore && asciiFoldIgnore.length > 0 ? asciiFoldIgnore : undefined;
  if (ignoreList !== undefined) {
    result.asciiFold = { ignore: ignoreList };
  } else if (asciiFold === true || asciiFold === false) {
    result.asciiFold = asciiFold;
  }

  // An object with no recognised fields collapses to `undefined`.
  return Object.keys(result).length > 0 ? result : undefined;
};

export class ReferenceTypeGuards {
static isSingleTarget<T>(ref: ReferenceConfigCreate<T>): ref is ReferenceSingleTargetConfigCreate<T> {
return (ref as ReferenceSingleTargetConfigCreate<T>).targetCollection !== undefined;
Expand All @@ -76,7 +120,8 @@ export const resolveProperty = <T>(
prop: PropertyConfigCreate<T>,
vectorizers?: string[]
): WeaviateProperty => {
const { dataType, nestedProperties, skipVectorization, vectorizePropertyName, ...rest } = prop;
const { dataType, nestedProperties, skipVectorization, vectorizePropertyName, textAnalyzer, ...rest } =
prop as PropertyConfigCreate<T> & { textAnalyzer?: TextAnalyzerConfig };
Comment thread
g-despot marked this conversation as resolved.
const moduleConfig: any = {};
vectorizers?.forEach((vectorizer) => {
moduleConfig[vectorizer] = {
Expand All @@ -91,6 +136,7 @@ export const resolveProperty = <T>(
? nestedProperties.map((prop) => resolveNestedProperty(prop))
: undefined,
moduleConfig: Object.keys(moduleConfig).length > 0 ? moduleConfig : undefined,
textAnalyzer: textAnalyzerConfigToWire(textAnalyzer),
};
};

Expand Down Expand Up @@ -393,6 +439,7 @@ class ConfigMapping {
bm25: ConfigMapping.bm25(v.bm25),
cleanupIntervalSeconds: v.cleanupIntervalSeconds,
stopwords: ConfigMapping.stopwords(v.stopwords),
stopwordPresets: v.stopwordPresets,
indexNullState: v.indexNullState ? v.indexNullState : false,
indexPropertyLength: v.indexPropertyLength ? v.indexPropertyLength : false,
indexTimestamps: v.indexTimestamps ? v.indexTimestamps : false,
Expand Down Expand Up @@ -741,6 +788,7 @@ class ConfigMapping {
? ConfigMapping.properties(prop.nestedProperties)
: undefined,
tokenization: prop.tokenization ? prop.tokenization : 'none',
textAnalyzer: textAnalyzerConfigFromWire(prop.textAnalyzer),
};
});
}
Expand Down
6 changes: 6 additions & 0 deletions src/collections/configure/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ const configure = {
* @param {'en' | 'none'} [options.stopwordsPreset] The stopwords preset to use.
* @param {string[]} [options.stopwordsAdditions] Additional stopwords to add.
* @param {string[]} [options.stopwordsRemovals] Stopwords to remove.
* @param {{ [presetName: string]: string[] }} [options.stopwordPresets] User-defined named stopword lists, referenced by name from a property's `textAnalyzer.stopwordPreset`. Requires Weaviate >= 1.37.2.
*/
invertedIndex: (options: {
bm25b?: number;
Expand All @@ -100,6 +101,7 @@ const configure = {
stopwordsPreset?: 'en' | 'none';
stopwordsAdditions?: string[];
stopwordsRemovals?: string[];
stopwordPresets?: { [presetName: string]: string[] };
}): InvertedIndexConfigCreate => {
return {
bm25:
Expand All @@ -121,6 +123,7 @@ const configure = {
removals: options.stopwordsRemovals,
}
: undefined,
stopwordPresets: options.stopwordPresets,
};
},
objectTTL: {
Expand Down Expand Up @@ -238,6 +241,7 @@ const reconfigure = {
* @param {'en' | 'none'} [options.stopwordsPreset] The stopwords preset to use.
* @param {string[]} [options.stopwordsAdditions] Additional stopwords to add.
* @param {string[]} [options.stopwordsRemovals] Stopwords to remove.
* @param {{ [presetName: string]: string[] }} [options.stopwordPresets] User-defined named stopword lists, referenced by name from a property's `textAnalyzer.stopwordPreset`. Requires Weaviate >= 1.37.2.
*/
invertedIndex: (options: {
bm25b?: number;
Expand All @@ -246,6 +250,7 @@ const reconfigure = {
stopwordsPreset?: 'en' | 'none';
stopwordsAdditions?: string[];
stopwordsRemovals?: string[];
stopwordPresets?: { [presetName: string]: string[] };
}): InvertedIndexConfigUpdate => {
return {
bm25:
Expand All @@ -264,6 +269,7 @@ const reconfigure = {
removals: options.stopwordsRemovals,
}
: undefined,
stopwordPresets: options.stopwordPresets,
};
},
/**
Expand Down
4 changes: 4 additions & 0 deletions src/collections/configure/types/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
ObjectTTLConfig,
QuantizerConfig,
ReplicationConfig,
TextAnalyzerConfig,
} from '../../config/types/index.js';
import { DataType, QuantizerRecursivePartial } from '../../types/index.js';
import { NonRefKeys, RefKeys } from '../../types/internal.js';
Expand All @@ -30,6 +31,7 @@ export type InvertedIndexConfigUpdate = {
additions?: string[];
removals?: string[];
};
stopwordPresets?: { [presetName: string]: string[] };
};

export type ObjectTTLConfigCreate = RecursivePartial<ObjectTTLConfig>;
Expand Down Expand Up @@ -99,6 +101,7 @@ export type PropertyConfigCreateBase = {
tokenization?: WeaviateProperty['tokenization'];
skipVectorization?: boolean;
vectorizePropertyName?: boolean;
textAnalyzer?: TextAnalyzerConfig;
};

export type NestedPropertyConfigCreateBase = {
Expand All @@ -121,6 +124,7 @@ export type PropertyConfigCreate<T> = T extends undefined
tokenization?: WeaviateProperty['tokenization'];
skipVectorization?: boolean;
vectorizePropertyName?: boolean;
textAnalyzer?: TextAnalyzerConfig;
} & NestedDataTypeConfig<T>
: {
[K in NonRefKeys<T>]: RequiresNested<DataType<T[K]>> extends true
Expand Down
18 changes: 6 additions & 12 deletions src/openapi/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -554,8 +554,6 @@ export interface definitions {
*/
alias?: string;
};
/** @description resources applicable for MCP actions */
mcp?: { [key: string]: unknown };
Comment thread
bevzzz marked this conversation as resolved.
/**
* @description Allowed actions in weaviate.
* @enum {string}
Expand Down Expand Up @@ -838,7 +836,7 @@ export interface definitions {
usingBlockMaxWAND?: boolean;
/** @description User-defined dictionary for tokenization. */
tokenizerUserDict?: definitions['TokenizerUserDictConfig'][];
/** @description User-defined named stopword lists. Each key is a preset name that can be referenced by a property's textAnalyzer.stopwordPreset field. The value is an array of stopword strings. */
/** @description User-defined named stopword lists. Each key is a preset name that can be referenced by a property's textAnalyzer.stopwordPreset field. The value is an array of stopword strings. Preset names must not be empty or whitespace-only; each list must contain at least one word; individual words must not be empty or whitespace-only. */
stopwordPresets?: { [key: string]: string[] };
};
/** @description Configure how replication is executed in a cluster */
Expand Down Expand Up @@ -908,17 +906,13 @@ export interface definitions {
| 'gse_ch';
/** @description Optional text analyzer configuration (e.g. ASCII folding). */
analyzerConfig?: definitions['TextAnalyzerConfig'];
/** @description Optional named stopword configurations. Each key is a preset name that can be referenced by analyzerConfig.stopwordPreset. Each value is a StopwordConfig (with optional preset, additions, and removals). */
stopwordPresets?: { [key: string]: definitions['StopwordConfig'] };
/** @description Optional fallback stopword configuration. Used when analyzerConfig.stopwordPreset is not set. Shape matches InvertedIndexConfig.stopwords on a collection. When analyzerConfig.stopwordPreset is not set and this field is omitted, word tokenization defaults to preset 'en'. Mutually exclusive with stopwordPresets — pass one or the other, not both. */
stopwords?: definitions['StopwordConfig'];
/** @description Optional user-defined named stopword presets. Shape matches InvertedIndexConfig.stopwordPresets on a collection: each key is a preset name, each value is a plain list of stopwords. A preset name that matches a built-in ('en', 'none') fully replaces the built-in. Preset names must not be empty or whitespace-only; each word list must contain at least one word; individual words must not be empty or whitespace-only. Mutually exclusive with stopwords — pass one or the other, not both. */
stopwordPresets?: { [key: string]: string[] };
};
/** @description Response from the tokenize endpoint. */
/** @description Response from the tokenize endpoints. Returns `indexed` text and text used at `query` time */
TokenizeResponse: {
/** @description The tokenization method that was applied. */
tokenization?: string;
/** @description The text analyzer configuration that was used, if any. */
analyzerConfig?: definitions['TextAnalyzerConfig'];
/** @description The stopword configuration that was used, if any. */
stopwordConfig?: definitions['StopwordConfig'];
/** @description The tokens as they would be stored in the inverted index. */
indexed?: string[];
/** @description The tokens as they would be used for query matching (e.g., after stopword removal). */
Expand Down
51 changes: 35 additions & 16 deletions src/tokenize/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { textAnalyzerConfigToWire } from '../collections/config/utils.js';
import { Stopwords, Tokenization } from '../collections/types/index.js';
import ConnectionGRPC from '../connection/grpc.js';
import { WeaviateInvalidInputError } from '../errors.js';
import {
WeaviatePropertyTokenizeRequest,
WeaviateTokenizeRequest,
Expand All @@ -10,31 +12,35 @@ import { TextAnalyzerConfig, TokenizeResult } from './types.js';
import { parseResult } from './util.js';

const tokenize = (connection: ConnectionGRPC, dbVersionSupport: DbVersionSupport): Tokenize => {
const parseTextAnalyzerConfig = (config?: TextAnalyzerConfig) => {
if (config == undefined) return undefined;
const out = { stopwordPreset: config.stopwordPreset ? String(config.stopwordPreset) : undefined };
if (typeof config?.asciiFold === 'boolean') {
return { ...out, asciiFold: config?.asciiFold };
} else if (typeof config?.asciiFold === 'object') {
return {
...out,
asciiFold: true,
asciiFoldIgnore: config?.asciiFold.ignore,
};
}
return out;
};
return {
text: (text, tokenization, opts) => {
if (opts?.stopwords !== undefined && opts?.stopwordPresets !== undefined) {
return Promise.reject(
new WeaviateInvalidInputError(
'stopwords and stopwordPresets are mutually exclusive; pass at most one'
)
);
}
const hasStopwordsField = opts?.stopwords !== undefined || opts?.stopwordPresets !== undefined;
return dbVersionSupport
.supportsTokenize()
.then(({ supports, message }) => (supports ? Promise.resolve() : Promise.reject(new Error(message))))
.then(() =>
hasStopwordsField
? dbVersionSupport
.supportsTokenizeStopwords()
.then(({ supports, message }) =>
supports ? Promise.resolve() : Promise.reject(new Error(message))
)
: Promise.resolve()
)
.then(() =>
connection
.postReturn<WeaviateTokenizeRequest, WeaviateTokenizeResponse>('/tokenize', {
text,
tokenization,
analyzerConfig: parseTextAnalyzerConfig(opts?.analyzerConfig),
analyzerConfig: textAnalyzerConfigToWire(opts?.analyzerConfig),
stopwords: opts?.stopwords,
stopwordPresets: opts?.stopwordPresets,
})
Comment thread
g-despot marked this conversation as resolved.
Comment thread
g-despot marked this conversation as resolved.
.then(parseResult)
Expand All @@ -60,7 +66,20 @@ export interface Tokenize {
tokenization: Tokenization,
opts?: {
analyzerConfig?: TextAnalyzerConfig;
stopwordPresets?: Record<string, Partial<Stopwords>>;
/**
* One-off stopwords block applied directly to this request. Mirrors the
* collection-level `invertedIndexConfig.stopwords` shape (preset +
* additions + removals). Mutually exclusive with `stopwordPresets`.
*/
stopwords?: Partial<Stopwords>;
/**
* User-defined named stopword lists. Keyed by preset name; each value is a
* flat array of stopword strings. Mirrors the wire format accepted by
* Weaviate's `/v1/tokenize` endpoint (>= v1.37.2) and the schema-level
* `invertedIndexConfig.stopwordPresets`. Mutually exclusive with
* `stopwords`.
*/
stopwordPresets?: { [presetName: string]: string[] };
}
) => Promise<TokenizeResult>;
forProperty: (collection: string, property: string, text: string) => Promise<TokenizeResult>;
Expand Down
4 changes: 2 additions & 2 deletions src/tokenize/types.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { Stopwords } from '../collections/types/index.js';
import { StopwordsPreset } from '../collections/types/index.js';

export type TextAnalyzerConfig = {
asciiFold?: boolean | { ignore: string[] };
stopwordPreset?: Stopwords | string;
stopwordPreset?: StopwordsPreset | string;
};

export type TokenizeResult = {
Expand Down
7 changes: 7 additions & 0 deletions src/utils/dbVersion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,13 @@ export class DbVersionSupport {
supports: version.isAtLeast(1, 37, 0),
message: this.errorMessage('Tokenize endpoint', version.show(), '1.37.0'),
}));

/**
 * Checks whether the connected server version is at least 1.37.2, the
 * minimum this client requires for the tokenize `stopwords` /
 * `stopwordPresets` options.
 *
 * @returns A promise resolving to the detected `version`, a `supports` flag,
 *   and the `message` to surface when the feature is unsupported.
 */
supportsTokenizeStopwords = () =>
  this.dbVersionProvider.getVersion().then((version) => {
    const supports = version.isAtLeast(1, 37, 2);
    const message = this.errorMessage(
      'Tokenize endpoint stopwords / stopwordPresets',
      version.show(),
      '1.37.2'
    );
    return { version, supports, message };
  });
}

const EMPTY_VERSION = '';
Expand Down
Loading
Loading