Skip to content

Commit 1560407

Browse files
mongodben (Ben Perlmutter)
and
Ben Perlmutter
authored
(EAI-871): Cronjobs to programmatically update datasets (#631)
* pageDataset * stub test file * aggregates + fix ts issue * test clean up * script to upload docs dataset * fix broken test * stubbin * working e2e code example upload * checkpoint work * working e2e * fix build issues * Fix broken test * fix merge issues * add datasets drone * datasets scripts to staging * fix docker dir * fix code example dataset script * fix filter? * Apply suggestions from code review * implement NL feedback --------- Co-authored-by: Ben Perlmutter <[email protected]>
1 parent c8c8495 commit 1560407

File tree

9 files changed

+312
-20
lines changed

9 files changed

+312
-20
lines changed

.drone.yml

+157-2
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ trigger:
149149

150150
steps:
151151
# Deploys docker image associated with staging build that triggered promotion
152-
- name: deploy-staging-chat-server
152+
- name: deploy-staging-chatbot-server
153153
image: quay.io/mongodb/drone-helm:v3
154154
settings:
155155
chart: mongodb/web-app
@@ -237,7 +237,8 @@ steps:
237237
namespace: docs
238238
release: chat-server-qa
239239
values: image.tag=git-${DRONE_COMMIT_SHA:0:7}-qa,image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME}-chat-server,ingress.enabled=true,ingress.hosts[0]=chat-qa.docs.staging.corp.mongodb.com
240-
values_files: ["packages/mongodb-chatbot-server/environments/qa.yml"]
240+
values_files:
241+
["packages/mongodb-chatbot-server/environments/qa.yml"]
241242
api_server: https://api.staging.corp.mongodb.com
242243
kubernetes_token:
243244
from_secret: staging_kubernetes_token
@@ -963,3 +964,157 @@ steps:
963964
api_server: https://api.prod.corp.mongodb.com
964965
kubernetes_token:
965966
from_secret: prod_kubernetes_token
967+
968+
# ---
969+
# Datasets service
970+
# ---
971+
---
972+
depends_on: ["test-all"]
973+
kind: pipeline
974+
type: kubernetes
975+
name: staging-build-datasets-service
976+
977+
trigger:
978+
branch:
979+
- main
980+
event:
981+
- push
982+
paths:
983+
include:
984+
- packages/datasets/**/*
985+
- packages/mongodb-rag-core/**/*
986+
- package-lock.json
987+
- package.json
988+
989+
steps:
990+
# Builds and publishes Docker image for staging
991+
- name: publish-staging-datasets-service
992+
image: plugins/kaniko-ecr
993+
settings:
994+
dockerfile: datasets.dockerfile
995+
create_repository: true
996+
registry: 795250896452.dkr.ecr.us-east-1.amazonaws.com
997+
repo: docs/${DRONE_REPO_NAME}-datasets-service
998+
tags:
999+
- git-${DRONE_COMMIT_SHA:0:7}-staging
1000+
- latest-staging
1001+
access_key:
1002+
from_secret: ecr_access_key
1003+
secret_key:
1004+
from_secret: ecr_secret_key
1005+
1006+
# Promotes current drone build to staging environment
1007+
- name: promote-staging-datasets-service
1008+
image: drone/cli:1.4.0-alpine
1009+
commands:
1010+
- drone build promote mongodb/chatbot ${DRONE_BUILD_NUMBER} staging
1011+
environment:
1012+
DRONE_SERVER: ${DRONE_SYSTEM_PROTO}://${DRONE_SYSTEM_HOST}
1013+
DRONE_TOKEN:
1014+
from_secret: drone_token
1015+
1016+
---
1017+
kind: pipeline
1018+
type: kubernetes
1019+
name: staging-deploy-datasets-service
1020+
1021+
trigger:
1022+
event:
1023+
- promote
1024+
target:
1025+
- staging
1026+
paths:
1027+
include:
1028+
- packages/datasets/**/*
1029+
- packages/mongodb-rag-core/**/*
1030+
- package-lock.json
1031+
- package.json
1032+
1033+
branch:
1034+
- main
1035+
1036+
steps:
1037+
# Deploys docker image associated with staging build that triggered promotion
1038+
- name: deploy-staging-datasets-service
1039+
image: quay.io/mongodb/drone-helm:v3
1040+
settings:
1041+
chart: mongodb/cronjobs
1042+
chart_version: 1.10.0
1043+
add_repos: [mongodb=https://10gen.github.io/helm-charts]
1044+
namespace: docs
1045+
release: datasets-service
1046+
values: image.tag=git-${DRONE_COMMIT_SHA:0:7}-staging,image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME}-datasets-service
1047+
values_files:
1048+
- packages/datasets/environments/staging.yml
1049+
api_server: https://api.staging.corp.mongodb.com
1050+
kubernetes_token:
1051+
from_secret: staging_kubernetes_token
1052+
1053+
---
1054+
depends_on: ["test-all"]
1055+
kind: pipeline
1056+
type: kubernetes
1057+
name: production-build-datasets-service
1058+
1059+
trigger:
1060+
event:
1061+
- tag
1062+
ref:
1063+
include:
1064+
- refs/tags/datasets-v*
1065+
1066+
steps:
1067+
# Builds and publishes Docker image for production
1068+
- name: publish-production-datasets-service
1069+
image: plugins/kaniko-ecr
1070+
settings:
1071+
dockerfile: datasets.dockerfile
1072+
create_repository: true
1073+
registry: 795250896452.dkr.ecr.us-east-1.amazonaws.com
1074+
repo: docs/${DRONE_REPO_NAME}-datasets-service
1075+
tags:
1076+
- git-${DRONE_COMMIT_SHA:0:7}-production
1077+
- ${DRONE_TAG}
1078+
access_key:
1079+
from_secret: ecr_access_key
1080+
secret_key:
1081+
from_secret: ecr_secret_key
1082+
1083+
# Promotes current drone build to production environment
1084+
- name: promote-production-datasets-service
1085+
image: drone/cli:1.4.0-alpine
1086+
commands:
1087+
- drone build promote mongodb/chatbot ${DRONE_BUILD_NUMBER} production
1088+
environment:
1089+
DRONE_SERVER: ${DRONE_SYSTEM_PROTO}://${DRONE_SYSTEM_HOST}
1090+
DRONE_TOKEN:
1091+
from_secret: drone_token
1092+
---
1093+
kind: pipeline
1094+
type: kubernetes
1095+
name: production-deploy-datasets-service
1096+
1097+
trigger:
1098+
event:
1099+
- promote
1100+
target:
1101+
- production
1102+
ref:
1103+
include:
1104+
- refs/tags/datasets-v*
1105+
1106+
steps:
1107+
- name: deploy-production-datasets-service
1108+
image: quay.io/mongodb/drone-helm:v3
1109+
settings:
1110+
chart: mongodb/cronjobs
1111+
chart_version: 1.10.0
1112+
add_repos: [mongodb=https://10gen.github.io/helm-charts]
1113+
namespace: docs
1114+
release: datasets-service
1115+
values: image.tag=git-${DRONE_COMMIT_SHA:0:7}-production,image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME}-datasets-service
1116+
values_files:
1117+
- packages/datasets/environments/production.yml
1118+
api_server: https://api.prod.corp.mongodb.com
1119+
kubernetes_token:
1120+
from_secret: prod_kubernetes_token

datasets.dockerfile

+10
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
# Build stage
2+
FROM node:18 as builder
3+
4+
WORKDIR /bin
5+
COPY . ./
6+
RUN npm install lerna && npm run bootstrap && npm run build -- --scope=datasets --include-dependencies
7+
8+
ENV NODE_ENV=production
9+
10+
WORKDIR /bin/packages/datasets
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
image:
2+
repository: node
3+
tag: 18
4+
5+
cronJobs:
6+
- name: upload-docs-dataset
7+
schedule: "0 6 * * 6" # every Saturday at 6am UTC
8+
command: ["npm", "run", "uploadContentDatasetToHuggingFace"]
9+
env:
10+
MONGODB_DATABASE_NAME: docs-chatbot-prod
11+
HUGGINGFACE_DOCS_CONTENT_REPO: mongodb-eai/docs
12+
envSecrets:
13+
MONGODB_CONNECTION_URI: docs-chatbot-prod
14+
HUGGINGFACE_ACCESS_TOKEN: docs-chatbot-prod
15+
resources:
16+
# guaranteed amount of resources
17+
requests:
18+
cpu: 500m
19+
memory: 2Gi
20+
# maximum allowed resources
21+
limits:
22+
cpu: 1
23+
memory: 5Gi
24+
backoffLimit: 3
25+
26+
- name: upload-code-example-dataset
27+
schedule: "0 8 * * 6" # every Saturday at 8am UTC
28+
command: ["npm", "run", "uploadCodeExampleDatasetToHuggingFace"]
29+
env:
30+
MONGODB_DATABASE_NAME: docs-chatbot-prod
31+
OPENAI_API_VERSION: "2024-06-01"
32+
HUGGINGFACE_DOCS_CODE_EXAMPLES_REPO: mongodb-eai/code-examples
33+
OPENAI_CHAT_COMPLETION_DEPLOYMENT: gpt-4o-mini
34+
envSecrets:
35+
MONGODB_CONNECTION_URI: docs-chatbot-prod
36+
HUGGINGFACE_ACCESS_TOKEN: docs-chatbot-prod
37+
OPENAI_ENDPOINT: docs-chatbot-prod
38+
OPENAI_API_KEY: docs-chatbot-prod
39+
resources:
40+
# guaranteed amount of resources
41+
requests:
42+
cpu: 500m
43+
memory: 2Gi
44+
# maximum allowed resources
45+
limits:
46+
cpu: 1
47+
memory: 5Gi
48+
backoffLimit: 3
+49
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
image:
2+
repository: node
3+
tag: 18
4+
5+
cronJobs:
6+
- name: upload-content-dataset
7+
schedule: "0 6 * * 6" # every Saturday at 6am UTC (1AM EST)
8+
command: ["npm", "run", "uploadContentDatasetToHuggingFace"]
9+
env:
10+
MONGODB_DATABASE_NAME: docs-chatbot-staging
11+
HUGGINGFACE_DOCS_CONTENT_REPO: mongodb-eai/test_integration
12+
envSecrets:
13+
MONGODB_CONNECTION_URI: docs-chatbot-staging
14+
# TODO: add this to k8s
15+
HUGGINGFACE_ACCESS_TOKEN: docs-chatbot-staging
16+
resources:
17+
# guaranteed amount of resources
18+
requests:
19+
cpu: 100m
20+
memory: 2Gi
21+
# maximum allowed resources
22+
limits:
23+
cpu: 500m
24+
memory: 5Gi
25+
backoffLimit: 3
26+
27+
- name: upload-code-example-dataset
28+
schedule: "0 0 31 2 *" # Never runs, b.c February 31 doesn't exist. Must trigger manually
29+
command: ["npm", "run", "uploadCodeExampleDatasetToHuggingFace"]
30+
env:
31+
MONGODB_DATABASE_NAME: docs-chatbot-staging
32+
OPENAI_API_VERSION: "2024-06-01"
33+
HUGGINGFACE_DOCS_CODE_EXAMPLES_REPO: mongodb-eai/test_integration
34+
OPENAI_CHAT_COMPLETION_DEPLOYMENT: gpt-4o-mini
35+
envSecrets:
36+
MONGODB_CONNECTION_URI: docs-chatbot-staging
37+
HUGGINGFACE_ACCESS_TOKEN: docs-chatbot-staging
38+
OPENAI_ENDPOINT: docs-chatbot-staging
39+
OPENAI_API_KEY: docs-chatbot-staging
40+
resources:
41+
# guaranteed amount of resources
42+
requests:
43+
cpu: 100m
44+
memory: 2Gi
45+
# maximum allowed resources
46+
limits:
47+
cpu: 500m
48+
memory: 5Gi
49+
backoffLimit: 3

packages/datasets/src/EnvVars.ts

+6-1
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,11 @@ export const OPENAI_ENV_VARS = {
77

88
export const HUGGINGFACE = {
99
HUGGINGFACE_ACCESS_TOKEN: "",
10+
};
11+
12+
export const HUGGINGFACE_DOCS_CONTENT = {
1013
HUGGINGFACE_DOCS_CONTENT_REPO: "",
11-
HUGGINGFACE_DOCS_CODE_EXAMPLE_REPO: "",
14+
};
15+
export const HUGGINGFACE_DOCS_CODE_EXAMPLES = {
16+
HUGGINGFACE_DOCS_CODE_EXAMPLES_REPO: "",
1217
};

packages/datasets/src/README.md

+7
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
# MongoDB Education Datasets
2+
3+
This package contains scripts to generate datasets related to MongoDB Education.
4+
5+
These datasets can be useful for use in AI systems, such as RAG chatbots or fine-tuning LLMs.
6+
7+
Public datasets may be found on the [MongoDB Education AI HuggingFace space](https://huggingface.co/mongodb-eai).

0 commit comments

Comments (0)