Skip to content

Commit 67dc21c

Browse files
committed
set up helm chart
1 parent 25bdb0b commit 67dc21c

File tree

9 files changed

+313
-1
lines changed

9 files changed

+313
-1
lines changed

Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FROM node:18-alpine

# Place the entire build context under /usr/src/app.
# WORKDIR creates the directory when it does not exist yet.
WORKDIR /usr/src/app
COPY . .

# Run from the component directory and start dist/index.js with node.
WORKDIR /usr/src/app/component
ENTRYPOINT ["node", "/usr/src/app/component/dist/index.js"]

README.md

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,67 @@
11
# magda-csv-semantic-indexer
2-
A Magda semantic indexer can index CSV files
2+
3+
![Version: 1.0.0-alpha.0](https://img.shields.io/badge/Version-1.0.0--alpha.0-informational?style=flat-square)
4+
5+
A Helm chart for Magda CSV Semantic Indexer
6+
7+
**Homepage:** <https://github.com/magda-io/magda-csv-semantic-indexer>
8+
9+
## Source Code
10+
11+
* <https://github.com/magda-io/magda-csv-semantic-indexer>
12+
13+
## Requirements
14+
15+
Kubernetes: `>= 1.14.0-0`
16+
17+
| Repository | Name | Version |
18+
|------------|------|---------|
19+
| oci://ghcr.io/magda-io/charts | magda-common | 5.2.0 |
20+
21+
## Values
22+
23+
| Key | Type | Default | Description |
24+
|-----|------|---------|-------------|
25+
| defaultAdminUserId | string | `"00000000-0000-4000-8000-000000000000"` | |
26+
| defaultSemanticIndexerConfig.bulkEmbeddingsSize | int | `1` | |
27+
| defaultSemanticIndexerConfig.bulkIndexSize | int | `50` | |
28+
| defaultSemanticIndexerConfig.chunkSizeLimit | int | `512` | |
30+
| defaultSemanticIndexerConfig.id | string | `"csv-semantic-indexer"` | |
31+
| defaultSemanticIndexerConfig.indexName | string | `"semantic-index"` | |
32+
| defaultSemanticIndexerConfig.indexVersion | int | `1` | |
33+
| defaultSemanticIndexerConfig.overlap | int | `50` | |
35+
| embeddingApiURL | string | `"http://magda-embedding-api"` | |
36+
| global | object | `{"image":{},"rollingUpdate":{},"searchEngine":{"defaultDatasetBucket":"magda-datasets","semanticIndexer":{"indexName":null,"indexVersion":null,"knnVectorFieldConfig":{"compressionLevel":null,"dimension":768,"efConstruction":100,"efSearch":100,"encoder":{"clip":false,"name":"sq","type":"fp16"},"m":16,"mode":"in_memory","spaceType":"l2"},"numberOfReplicas":0,"numberOfShards":1}}}` | only for providing appropriate default value for helm lint |
37+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.compressionLevel | string | `nil` | The compression_level mapping parameter selects a quantization encoder that reduces vector memory consumption by the given factor. |
38+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.dimension | int | `768` | Dimension of the embedding vectors. |
39+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.efConstruction | int | `100` | Similar to efSearch but used during index construction. Higher values improve search quality but increase index build time. |
40+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.efSearch | int | `100` | The size of the candidate queue during search. Larger values may improve search quality but increase search latency. |
41+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.encoder | object | `{"clip":false,"name":"sq","type":"fp16"}` | FAISS Encoder configuration (If compressionLevel is set, encoder will be ignored). |
42+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.m | int | `16` | The maximum number of graph edges per vector. Higher values increase memory usage but may improve search quality. |
43+
| global.searchEngine.semanticIndexer.knnVectorFieldConfig.mode | string | `"in_memory"` | Vector workload mode: `on_disk` or `in_memory`. |
44+
| image.name | string | `"data61/magda-csv-semantic-indexer"` | |
45+
| image.pullPolicy | string | `"IfNotPresent"` | |
46+
| image.repository | string | `"localhost:5000"` | |
47+
| image.tag | string | `"latest"` | |
48+
| minioConfig.defaultDatasetBucket | string | `""` | |
49+
| minioConfig.endPoint | string | `"magda-minio"` | |
50+
| minioConfig.port | int | `9000` | |
51+
| minioConfig.region | string | `""` | |
52+
| minioConfig.useSSL | bool | `false` | |
53+
| opensearchURL | string | `"http://opensearch:9200"` | |
54+
| port | int | `6305` | Service port configuration |
55+
| resources.limits.cpu | string | `"100m"` | |
56+
| resources.requests.cpu | string | `"50m"` | |
57+
| resources.requests.memory | string | `"200Mi"` | |
58+
| semanticIndexer.bulkEmbeddingsSize | int | `nil` | Number of strings sent to the embedding API for processing in a single request |
59+
| semanticIndexer.bulkIndexSize | int | `nil` | Number of documents we send to OpenSearch for bulk processing in a single request |
60+
| semanticIndexer.chunkSizeLimit | int | `nil` | The maximum number of tokens in a single chunk. |
61+
| semanticIndexer.id | string | `"csv-semantic-indexer-5"` | Semantic indexer ID |
62+
| semanticIndexer.indexName | string | `nil` | index name |
63+
| semanticIndexer.indexVersion | int | `nil` | index version |
64+
| semanticIndexer.overlap | int | `nil` | The number of overlapping tokens between chunks. |
65+
66+
----------------------------------------------
67+
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Chart metadata for the Magda CSV Semantic Indexer.
apiVersion: v2
name: magda-csv-semantic-indexer
description: A Helm chart for Magda CSV Semantic Indexer
version: "1.0.0-alpha.0"
kubeVersion: ">= 1.14.0-0"
home: "https://github.com/magda-io/magda-csv-semantic-indexer"
sources:
  - "https://github.com/magda-io/magda-csv-semantic-indexer"
dependencies:
  # Presumably supplies the `magda.*` helper templates that the Deployment
  # includes (image, pull policy, pull secrets) - confirm against magda-common.
  - name: magda-common
    version: "5.2.0"
    repository: "oci://ghcr.io/magda-io/charts"
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{{- /*
  magda-csv-semantic-indexer.semanticIndexer.values

  Renders the effective semantic indexer configuration as a raw JSON string
  (the ConfigMap stores it under `semantic-indexer.json`).

  Resolution order per key:
    - id, chunkSizeLimit, overlap, bulkEmbeddingsSize, bulkIndexSize:
        .Values.semanticIndexer.<key>
        -> .Values.defaultSemanticIndexerConfig.<key>
    - indexName, indexVersion:
        .Values.semanticIndexer.<key>
        -> .Values.global.searchEngine.semanticIndexer.<key>
        -> .Values.defaultSemanticIndexerConfig.<key>
    - numberOfShards, numberOfReplicas, knnVectorFieldConfig:
        always copied from .Values.global.searchEngine.semanticIndexer

  Every level is resolved through `default dict`, so a null or missing
  `semanticIndexer`, `global`, `global.searchEngine` or
  `defaultSemanticIndexerConfig` yields empty lookups instead of a
  nil-pointer template error.

  Notes:
    - `default` treats 0 and "" as unset, so an explicit value of 0
      (e.g. indexVersion: 0) falls through to the next source.
    - When `.Values.semanticIndexer` is a map, the resolved values are
      written back into that map via `set`.
*/}}
{{- define "magda-csv-semantic-indexer.semanticIndexer.values" }}
{{- $semanticIndexer := get .Values "semanticIndexer" | default dict }}
{{- $global := .Values.global | default dict }}
{{- $searchEngine := get $global "searchEngine" | default dict }}
{{- $globalSemanticIndexer := get $searchEngine "semanticIndexer" | default dict }}
{{- $defaultConfig := .Values.defaultSemanticIndexerConfig | default dict }}

{{- /* Chart-local value first, then (where applicable) the global value, then the built-in default. */}}
{{- $id := $semanticIndexer.id | default $defaultConfig.id }}
{{- $indexVersion := $semanticIndexer.indexVersion | default (get $globalSemanticIndexer "indexVersion") | default $defaultConfig.indexVersion }}
{{- $actualIndexName := $semanticIndexer.indexName | default (get $globalSemanticIndexer "indexName") | default $defaultConfig.indexName }}
{{- $chunkSizeLimit := $semanticIndexer.chunkSizeLimit | default $defaultConfig.chunkSizeLimit }}
{{- $overlap := $semanticIndexer.overlap | default $defaultConfig.overlap }}
{{- $bulkEmbeddingsSize := $semanticIndexer.bulkEmbeddingsSize | default $defaultConfig.bulkEmbeddingsSize }}
{{- $bulkIndexSize := $semanticIndexer.bulkIndexSize | default $defaultConfig.bulkIndexSize }}

{{- /* Index topology and vector field settings always come from the global search engine config. */}}
{{- $_ := set $semanticIndexer "id" $id }}
{{- $_ := set $semanticIndexer "numberOfShards" (get $globalSemanticIndexer "numberOfShards") }}
{{- $_ := set $semanticIndexer "numberOfReplicas" (get $globalSemanticIndexer "numberOfReplicas") }}
{{- $_ := set $semanticIndexer "knnVectorFieldConfig" (get $globalSemanticIndexer "knnVectorFieldConfig") }}

{{- $_ := set $semanticIndexer "indexName" $actualIndexName }}
{{- $_ := set $semanticIndexer "indexVersion" $indexVersion }}
{{- $_ := set $semanticIndexer "chunkSizeLimit" $chunkSizeLimit }}
{{- $_ := set $semanticIndexer "overlap" $overlap }}
{{- $_ := set $semanticIndexer "bulkEmbeddingsSize" $bulkEmbeddingsSize }}
{{- $_ := set $semanticIndexer "bulkIndexSize" $bulkIndexSize }}

{{- $semanticIndexer | mustToRawJson }}
{{- end }}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{{- /*
  ConfigMap carrying the two JSON documents mounted into the container:
    - semantic-indexer.json: output of the semanticIndexer.values helper
    - minio.json: .Values.minioConfig with defaultDatasetBucket resolved
      (minioConfig value -> global.defaultDatasetBucket -> "magda-datasets")
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Chart.Name }}-config"
data:
  semantic-indexer.json: {{ include "magda-csv-semantic-indexer.semanticIndexer.values" . | quote }}
  {{- $minio := .Values.minioConfig }}
  {{- $bucket := $minio.defaultDatasetBucket | default .Values.global.defaultDatasetBucket | default "magda-datasets" }}
  {{- /* The resolved bucket name is written back into the minioConfig map before serialising. */}}
  {{- $_ := set $minio "defaultDatasetBucket" $bucket }}
  minio.json: {{ $minio | mustToRawJson | quote }}
12+
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
{{- /*
  Deployment for the CSV semantic indexer.

  - Runs a single replica of the image resolved by the `magda.image` helper.
  - Mounts the "<chart name>-config" ConfigMap at /etc/config and points the
    process at semantic-indexer.json / minio.json via command-line flags.
  - id / chunkSizeLimit / overlap flags fall back from .Values.semanticIndexer
    to .Values.defaultSemanticIndexerConfig.

  Both maps are read through `default dict` so a null `semanticIndexer`
  does not abort rendering with a nil-pointer error.
*/}}
{{- $semanticIndexer := .Values.semanticIndexer | default dict }}
{{- $defaultConfig := .Values.defaultSemanticIndexerConfig | default dict -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: magda-csv-semantic-indexer
spec:
  replicas: 1
  strategy:
    rollingUpdate:
      maxUnavailable: {{ .Values.global.rollingUpdate.maxUnavailable | default 0 }}
  selector:
    matchLabels:
      service: magda-csv-semantic-indexer
  template:
    metadata:
      labels:
        service: magda-csv-semantic-indexer
    spec:
      {{- include "magda.imagePullSecrets" . | indent 6 }}
      containers:
      - name: magda-csv-semantic-indexer
        image: {{ include "magda.image" . | quote }}
        imagePullPolicy: {{ include "magda.imagePullPolicy" . | quote }}
        command:
        - "node"
        - "/usr/src/app/component/dist/index.js"
        - "--semanticIndexerConfig"
        - "/etc/config/semantic-indexer.json"
        - "--minioConfig"
        - "/etc/config/minio.json"
        - "--opensearchApiURL"
        - "{{ .Values.opensearchURL }}"
        - "--embeddingApiURL"
        - "{{ .Values.embeddingApiURL }}"
        - "--id"
        - "{{ $semanticIndexer.id | default $defaultConfig.id }}"
        - "--chunkSizeLimit"
        - "{{ $semanticIndexer.chunkSizeLimit | default $defaultConfig.chunkSizeLimit }}"
        - "--overlap"
        - "{{ $semanticIndexer.overlap | default $defaultConfig.overlap }}"
        {{- if .Values.global.enableLivenessProbes }}
        {{- /*
          NOTE(review): the probe (and NODE_PORT below) use port 80, while the
          Service targetPort and the PORT env var use .Values.port.
          Confirm which port the process actually listens on.
        */}}
        livenessProbe:
          httpGet:
            path: "/healthz"
            port: 80
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 10
        {{- end }}
        resources:
          {{- toYaml .Values.resources | nindent 10 }}
        env:
        - name: NODE_PORT
          value: "80"
        - name: REGISTRY_URL
          value: "http://registry-api/v0"
        - name: REGISTRY_READ_ONLY_URL
          value: "http://registry-api-read-only/v0"
        - name: ENABLE_MULTI_TENANTS
          {{- if .Values.global.enableMultiTenants }}
          value: "true"
          {{- else }}
          value: "false"
          {{- end }}
        - name: TENANT_URL
          value: "http://tenant-api/v0"
        - name: USER_ID
          # Env values must be strings; quote so the rendered ID is never re-typed by the YAML parser.
          value: {{ .Values.global.defaultAdminUserId | default .Values.defaultAdminUserId | quote }}
        - name: INTERNAL_URL
          value: "http://magda-csv-semantic-indexer"
        - name: JWT_SECRET
          valueFrom:
            secretKeyRef:
              name: auth-secrets
              key: jwt-secret
        - name: MINIO_SECRET_KEY
          valueFrom:
            secretKeyRef:
              name: storage-secrets
              key: secretkey
        - name: MINIO_ACCESS_KEY
          valueFrom:
            secretKeyRef:
              name: storage-secrets
              key: accesskey
        - name: PORT
          value: "{{ .Values.port }}"
        volumeMounts:
        - name: "{{ .Chart.Name }}-config"
          mountPath: "/etc/config"
          readOnly: true
      volumes:
      - name: "{{ .Chart.Name }}-config"
        configMap:
          name: "{{ .Chart.Name }}-config"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{{- /*
  Service selecting pods labelled `service: magda-csv-semantic-indexer`;
  port 80 is forwarded to container port `.Values.port`.
*/ -}}
apiVersion: v1
kind: Service
metadata:
  name: "magda-csv-semantic-indexer"
spec:
  selector:
    service: magda-csv-semantic-indexer
  ports:
  - name: http
    port: 80
    targetPort: {{ .Values.port }}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# -- only for providing appropriate default value for helm lint
global:
  image: {}
  rollingUpdate: {}
  # Kept directly under `global` because the ConfigMap template reads
  # `.Values.global.defaultDatasetBucket`.
  defaultDatasetBucket: "magda-datasets"
  searchEngine:
    semanticIndexer:
      indexName: null
      indexVersion: null
      numberOfShards: 1
      numberOfReplicas: 0
      knnVectorFieldConfig:
        # -- Vector workload mode: `on_disk` or `in_memory`.
        mode: "in_memory"
        # -- Dimension of the embedding vectors.
        dimension: 768
        # -- The compression_level mapping parameter selects a quantization encoder that reduces vector memory consumption by the given factor.
        compressionLevel: null
        # Supported values: l1, l2, innerProduct, cosine, linf
        spaceType: "l2"
        # -- Similar to efSearch but used during index construction. Higher values improve search quality but increase index build time.
        efConstruction: 100
        # -- The size of the candidate queue during search. Larger values may improve search quality but increase search latency.
        efSearch: 100
        # -- The maximum number of graph edges per vector. Higher values increase memory usage but may improve search quality.
        m: 16
        # -- FAISS Encoder configuration (If compressionLevel is set, encoder will be ignored).
        encoder:
          name: "sq"
          type: "fp16"
          clip: false
32+
33+
# Passed to the indexer process as `--opensearchApiURL`.
opensearchURL: "http://opensearch:9200"
# Passed to the indexer process as `--embeddingApiURL`.
embeddingApiURL: "http://magda-embedding-api"

# -- Service port configuration
port: 6305
38+
39+
# Fallback settings used by the chart templates whenever the matching
# `semanticIndexer.*` value is left unset.
# Each key appears exactly once: duplicate mapping keys are invalid YAML and
# most parsers silently keep only the last occurrence.
defaultSemanticIndexerConfig:
  id: "csv-semantic-indexer"
  indexName: "semantic-index"
  indexVersion: 1
  chunkSizeLimit: 512
  overlap: 50
  bulkEmbeddingsSize: 1
  bulkIndexSize: 50
49+
50+
# Per-deployment overrides. Any key left empty/null falls back to
# `defaultSemanticIndexerConfig` in the chart templates.
semanticIndexer:
  # -- (string) Semantic indexer ID
  id: ""
  # -- (string) Index name
  indexName: null
  # -- (int) Index version
  indexVersion: null
  # -- (int) The maximum number of tokens in a single chunk.
  chunkSizeLimit: null
  # -- (int) The number of overlapping tokens between chunks.
  overlap: null
  # -- (int) Number of strings sent to the embedding API for processing in a single request
  bulkEmbeddingsSize: null
  # -- (int) Number of documents sent to OpenSearch for bulk processing in a single request
  bulkIndexSize: null
65+
66+
# Serialised to `minio.json` by the ConfigMap template.
minioConfig:
  endPoint: "magda-minio"
  port: 9000
  region: ""
  useSSL: false
  # Left empty so the ConfigMap template resolves it from
  # `global.defaultDatasetBucket`, then "magda-datasets".
  defaultDatasetBucket: ""
72+
73+
# Image settings for this chart; optional overrides are listed but disabled.
image:
  name: "magda-csv-semantic-indexer"
  # tag:
  # pullPolicy:
  # imagePullSecret:

# Image defaults; presumably consumed by the `magda.image*` helpers
# when `image` omits a field - confirm against magda-common.
defaultImage:
  repository: "ghcr.io/magda-io"
  pullPolicy: "IfNotPresent"
  imagePullSecret: false
83+
84+
# Used for the USER_ID env var when `global.defaultAdminUserId` is not set.
defaultAdminUserId: "00000000-0000-4000-8000-000000000000"

# Rendered verbatim into the container's `resources` field.
resources:
  requests:
    cpu: "50m"
    memory: "200Mi"
  limits:
    cpu: "100m"

deploy/test-deploy.yaml

Whitespace-only changes.

0 commit comments

Comments
 (0)