
Commit 6a4cebb

Authored by: Roopan-Microsoft, AjitPadhi-Microsoft, Pavan-Microsoft, ross-p-smith, gpickett
build: merging dev changes to main branch (#1599)
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Ajit Padhi <[email protected]>
Co-authored-by: Pavan-Microsoft <[email protected]>
Co-authored-by: Ross Smith <[email protected]>
Co-authored-by: gpickett <[email protected]>
Co-authored-by: Francia Riesco <[email protected]>
Co-authored-by: Francia Riesco <[email protected]>
Co-authored-by: Prajwal D C <[email protected]>
Co-authored-by: Harmanpreet-Microsoft <[email protected]>
Co-authored-by: UtkarshMishra-Microsoft <[email protected]>
Co-authored-by: Priyanka-Microsoft <[email protected]>
Co-authored-by: Prasanjeet-Microsoft <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent 405d4bc commit 6a4cebb


57 files changed: +5589 −1808 lines

.env.sample

+14 −8

@@ -22,8 +22,9 @@ AZURE_SEARCH_DATASOURCE_NAME=
 # Azure OpenAI for generating the answer and computing the embedding of the documents
 AZURE_OPENAI_RESOURCE=
 AZURE_OPENAI_API_KEY=
-AZURE_OPENAI_MODEL_INFO="{\"model\":\"gpt-35-turbo-16k\",\"modelName\":\"gpt-35-turbo-16k\",\"modelVersion\":\"0613\"}"
-AZURE_OPENAI_EMBEDDING_MODEL_INFO="{\"model\":\"text-embedding-ada-002\",\"modelName\":\"text-embedding-ada-002\",\"modelVersion\":\"2\"}"
+AZURE_OPENAI_MODEL=gpt-35-turbo
+AZURE_OPENAI_MODEL_NAME=gpt-35-turbo
+AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002
 AZURE_OPENAI_TEMPERATURE=0
 AZURE_OPENAI_TOP_P=1.0
 AZURE_OPENAI_MAX_TOKENS=1000
@@ -35,10 +36,12 @@ AZURE_OPENAI_STREAM=True
 AzureWebJobsStorage=
 BACKEND_URL=http://localhost:7071
 DOCUMENT_PROCESSING_QUEUE_NAME=
-# Azure Blob Storage for storing the original documents to be processed
-AZURE_BLOB_STORAGE_INFO="{\"containerName\":\"documents\",\"accountName\":\"\",\"accountKey\":\"\"}"
+AZURE_BLOB_ACCOUNT_NAME=
+AZURE_BLOB_ACCOUNT_KEY=
+AZURE_BLOB_CONTAINER_NAME=
 # Azure Form Recognizer for extracting the text from the documents
-AZURE_FORM_RECOGNIZER_INFO="{\"endpoint\":\"\",\"key\":\"\"}"
+AZURE_FORM_RECOGNIZER_ENDPOINT=
+AZURE_FORM_RECOGNIZER_KEY=
 # Azure AI Content Safety for filtering out the inappropriate questions or answers
 AZURE_CONTENT_SAFETY_ENDPOINT=
 AZURE_CONTENT_SAFETY_KEY=
@@ -60,8 +63,11 @@ AZURE_KEY_VAULT_ENDPOINT=
 # Chat conversation type to decide between custom or byod (bring your own data) conversation type
 CONVERSATION_FLOW=
 # Chat History CosmosDB Integration Settings
-AZURE_COSMOSDB_INFO="{\"accountName\":\"cosmos-abc123\",\"databaseName\":\"db_conversation_history\",\"containerName\":\"conversations\"}"
-AZURE_COSMOSDB_ACCOUNT_KEY=
+AZURE_COSMOSDB_ACCOUNT_NAME=
+AZURE_COSMOSDB_DATABASE_NAME=
+AZURE_COSMOSDB_CONVERSATIONS_CONTAINER_NAME=
 AZURE_COSMOSDB_ENABLE_FEEDBACK=
-AZURE_POSTGRESQL_INFO="{\"user\":\"\",\"dbname\":\"postgres\",\"host\":\"\"}"
+AZURE_POSTGRESQL_HOST_NAME=
+AZURE_POSTGRESQL_DATABASE_NAME=
+AZURE_POSTGRESQL_USER=
 DATABASE_TYPE="CosmosDB"
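
The JSON-encoded *_INFO settings are replaced by flat variables. A minimal sketch (not part of this commit; variable names are taken from the sample above, default values mirror it) of how application code could read the flattened Azure OpenAI settings with only the standard library:

```python
import os

# Illustrative only: read the flattened Azure OpenAI settings introduced above.
openai_settings = {
    "model": os.environ.get("AZURE_OPENAI_MODEL", "gpt-35-turbo"),
    "model_name": os.environ.get("AZURE_OPENAI_MODEL_NAME", "gpt-35-turbo"),
    "embedding_model": os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
}
print(openai_settings)
```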

.github/workflows/build-docker-images.yml

+4 −7

@@ -1,9 +1,7 @@
 name: Build Docker Images

 on:
-  workflow_run:
-    workflows: [Tests]
-    types: [completed]
+  push:
     branches:
       - main
       - dev
@@ -22,7 +20,6 @@ on:

 jobs:
   docker-build:
-    if: ${{ github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success' }}
     strategy:
       matrix:
         include:
@@ -34,9 +31,9 @@ jobs:
             dockerfile: docker/Frontend.Dockerfile
     uses: ./.github/workflows/build-docker.yml
     with:
-      registry: ${{ github.event.workflow_run.head_branch == 'main' && 'fruoccopublic.azurecr.io' || 'cwydcontainerreg.azurecr.io'}}
-      username: ${{ github.event.workflow_run.head_branch == 'main' && 'fruoccopublic' || 'cwydcontainerreg'}}
+      registry: ${{ github.ref_name == 'main' && 'fruoccopublic.azurecr.io' || 'cwydcontainerreg.azurecr.io'}}
+      username: ${{ github.ref_name == 'main' && 'fruoccopublic' || 'cwydcontainerreg'}}
       app_name: ${{ matrix.app_name }}
       dockerfile: ${{ matrix.dockerfile }}
-      push: ${{ github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'dev' || github.event.workflow_run.head_branch == 'demo' }}
+      push: ${{ github.ref_name == 'main' || github.ref_name == 'dev' || github.ref_name == 'demo' }}
     secrets: inherit

.github/workflows/build-docker.yml

+2 −3

@@ -28,7 +28,6 @@ jobs:
   docker-build:
     runs-on: ubuntu-latest
     steps:
-
       - name: Checkout
         uses: actions/checkout@v4

@@ -61,7 +60,7 @@ jobs:
           context: .
           file: ${{ inputs.dockerfile }}
           push: ${{ inputs.push }}
-          cache-from: type=registry,ref=${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || 'latest' }}
+          cache-from: type=registry,ref=${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.head_ref || github.ref_name }}
           tags: |
-            ${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || 'latest' }}
+            ${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.head_ref || 'default' }}
             ${{ inputs.registry }}/${{ inputs.app_name}}:${{ steps.date.outputs.date }}_${{ github.run_number }}

.github/workflows/sync-branches.yml

+1 −1

@@ -15,7 +15,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0 # Fetch all history for accurate branch comparison

README.md

+1

@@ -50,6 +50,7 @@ Welcome to the *Chat with your data* Solution accelerator repository! The *Chat



+
 ### About this repo

 This repository provides an end-to-end solution for users who want to query their data with natural language. It includes a well designed ingestion mechanism for multiple file types, an easy deployment, and a support team for maintenance. The accelerator demonstrates both Push or Pull Ingestion; the choice of orchestration (Semantic Kernel, LangChain, OpenAI Functions or [Prompt Flow](docs/prompt_flow.md)) and should be the minimum components needed to implement a RAG pattern. It is not intended to be put into Production as-is without experimentation or evaluation of your data. It provides the following features:

azure.yaml

+7 −1

@@ -5,7 +5,13 @@ metadata:

 hooks:
   postprovision:
-    run: ./infra/prompt-flow/create-prompt-flow.sh
+    # run: ./infra/prompt-flow/create-prompt-flow.sh
+    posix:
+      shell: sh
+      run: chmod +x ./scripts/parse_env.sh && ./scripts/parse_env.sh
+    windows:
+      shell: pwsh
+      run: ./scripts/parse_env.ps1
 services:
   web:
     project: ./code

code/backend/batch/batch_push_results.py

+4 −1

@@ -28,19 +28,22 @@ def _get_file_name_from_message(message_body) -> str:
 )
 def batch_push_results(msg: func.QueueMessage) -> None:
     message_body = json.loads(msg.get_body().decode("utf-8"))
-    logger.debug("Process Document Event queue function triggered: %s", message_body)
+    logger.info("Process Document Event queue function triggered: %s", message_body)

     event_type = message_body.get("eventType", "")
     # We handle "" in this scenario for backwards compatibility
     # This function is primarily triggered by an Event Grid queue message from the blob storage
     # However, it can also be triggered using a legacy schema from BatchStartProcessing
     if event_type in ("", "Microsoft.Storage.BlobCreated"):
+        logger.info("Handling 'Blob Created' event with message body: %s", message_body)
         _process_document_created_event(message_body)

     elif event_type == "Microsoft.Storage.BlobDeleted":
+        logger.info("Handling 'Blob Deleted' event with message body: %s", message_body)
         _process_document_deleted_event(message_body)

     else:
+        logger.exception("Received an unrecognized event type: %s", event_type)
         raise NotImplementedError(f"Unknown event type received: {event_type}")
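
For reference, a minimal sketch (illustrative only, not taken from the repository) of the kind of Event Grid "Blob Created" message body this queue trigger dispatches on; the storage account, container, and blob names are hypothetical:

```python
import json

# Hypothetical message body; only "eventType" matters for the dispatch above.
sample_message = json.dumps(
    {
        "eventType": "Microsoft.Storage.BlobCreated",
        "data": {"url": "https://examplestorage.blob.core.windows.net/documents/report.pdf"},
    }
)

event_type = json.loads(sample_message).get("eventType", "")
assert event_type == "Microsoft.Storage.BlobCreated"
```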

code/backend/batch/utilities/helpers/azure_blob_storage_client.py

+1 −1

@@ -247,7 +247,7 @@ def get_container_sas(self):
             user_delegation_key=self.user_delegation_key,
             account_key=self.account_key,
             permission="r",
-            expiry=datetime.utcnow() + timedelta(hours=1),
+            expiry=datetime.utcnow() + timedelta(days=365 * 5),
         )

     def get_blob_sas(self, file_name):
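
The container SAS expiry is extended from one hour to roughly five years. A minimal sketch of generating such a read-only container SAS with azure-storage-blob's generate_container_sas helper, assuming that is what the wrapped method relies on; the account and container names are placeholders:

```python
from datetime import datetime, timedelta

from azure.storage.blob import generate_container_sas

# Sketch only: read-only container SAS with the new long-lived expiry.
sas_token = generate_container_sas(
    account_name="examplestorage",   # placeholder
    container_name="documents",      # placeholder
    account_key="<account-key>",     # placeholder
    permission="r",
    expiry=datetime.utcnow() + timedelta(days=365 * 5),
)
```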

code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py

+8

@@ -1,10 +1,13 @@
+import logging
 from azure.core.credentials import AzureKeyCredential
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.identity import DefaultAzureCredential
 import html
 import traceback
 from .env_helper import EnvHelper

+logger = logging.getLogger(__name__)
+

 class AzureFormRecognizerClient:
     def __init__(self) -> None:
@@ -75,6 +78,8 @@ def begin_analyze_document_from_url(
         model_id = "prebuilt-layout" if use_layout else "prebuilt-read"

         try:
+            logger.info("Method begin_analyze_document_from_url started")
+            logger.info(f"Model ID selected: {model_id}")
             poller = self.document_analysis_client.begin_analyze_document_from_url(
                 model_id, document_url=source_url
             )
@@ -144,4 +149,7 @@ def begin_analyze_document_from_url(

             return page_map
         except Exception as e:
+            logger.exception(f"Exception in begin_analyze_document_from_url: {e}")
             raise ValueError(f"Error: {traceback.format_exc()}. Error: {e}")
+        finally:
+            logger.info("Method begin_analyze_document_from_url ended")

code/backend/batch/utilities/helpers/config/config_helper.py

+12 −6

@@ -52,6 +52,9 @@ def __init__(self, config: dict):
         )
         self.enable_chat_history = config["enable_chat_history"]
         self.database_type = config.get("database_type", self.env_helper.DATABASE_TYPE)
+        self.conversational_flow = config.get(
+            "conversational_flow", self.env_helper.CONVERSATION_FLOW
+        )

     def get_available_document_types(self) -> list[str]:
         document_types = {
@@ -187,21 +190,27 @@ def _set_new_config_properties(config: dict, default_config: dict):
     @staticmethod
     @functools.cache
     def get_active_config_or_default():
+        logger.info("Method get_active_config_or_default started")
         env_helper = EnvHelper()
         config = ConfigHelper.get_default_config()

         if env_helper.LOAD_CONFIG_FROM_BLOB_STORAGE:
+            logger.info("Loading configuration from Blob Storage")
             blob_client = AzureBlobStorageClient(container_name=CONFIG_CONTAINER_NAME)

             if blob_client.file_exists(CONFIG_FILE_NAME):
+                logger.info("Configuration file found in Blob Storage")
                 default_config = config
                 config_file = blob_client.download_file(CONFIG_FILE_NAME)
                 config = json.loads(config_file)

                 ConfigHelper._set_new_config_properties(config, default_config)
             else:
-                logger.info("Returning default config")
+                logger.info(
+                    "Configuration file not found in Blob Storage, using default configuration"
+                )

+        logger.info("Method get_active_config_or_default ended")
         return Config(config)

     @staticmethod
@@ -247,11 +256,7 @@ def get_default_config():
             logger.info("Loading default config from %s", config_file_path)
             ConfigHelper._default_config = json.loads(
                 Template(f.read()).substitute(
-                    ORCHESTRATION_STRATEGY=(
-                        OrchestrationStrategy.SEMANTIC_KERNEL.value
-                        if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
-                        else env_helper.ORCHESTRATION_STRATEGY
-                    ),
+                    ORCHESTRATION_STRATEGY=env_helper.ORCHESTRATION_STRATEGY,
                     LOG_USER_INTERACTIONS=(
                         False
                         if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
@@ -262,6 +267,7 @@ def get_default_config():
                         if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
                         else True
                     ),
+                    CONVERSATION_FLOW=env_helper.CONVERSATION_FLOW,
                     DATABASE_TYPE=env_helper.DATABASE_TYPE,
                 )
             )

code/backend/batch/utilities/helpers/config/default.json

+1 −1

@@ -9,7 +9,7 @@
     "enable_post_answering_prompt": false,
     "ai_assistant_type": "default",
     "enable_content_safety": true,
-    "conversational_flow": "custom"
+    "conversational_flow": "${CONVERSATION_FLOW}"
   },
   "example": {
     "documents": "{\n \"retrieved_documents\": [\n {\n \"[doc1]\": {\n \"content\": \"Dual Transformer Encoder (DTE) DTE (https://dev.azure.com/TScience/TSciencePublic/_wiki/wikis/TSciencePublic.wiki/82/Dual-Transformer-Encoder) DTE is a general pair-oriented sentence representation learning framework based on transformers. It provides training, inference and evaluation for sentence similarity models. Model Details DTE can be used to train a model for sentence similarity with the following features: - Build upon existing transformer-based text representations (e.g.TNLR, BERT, RoBERTa, BAG-NLR) - Apply smoothness inducing technology to improve the representation robustness - SMART (https://arxiv.org/abs/1911.03437) SMART - Apply NCE (Noise Contrastive Estimation) based similarity learning to speed up training of 100M pairs We use pretrained DTE model\"\n }\n },\n {\n \"[doc2]\": {\n \"content\": \"trained on internal data. You can find more details here - Models.md (https://dev.azure.com/TScience/_git/TSciencePublic?path=%2FDualTransformerEncoder%2FMODELS.md&version=GBmaster&_a=preview) Models.md DTE-pretrained for In-context Learning Research suggests that finetuned transformers can be used to retrieve semantically similar exemplars for e.g. KATE (https://arxiv.org/pdf/2101.06804.pdf) KATE . They show that finetuned models esp. tuned on related tasks give the maximum boost to GPT-3 in-context performance. DTE have lot of pretrained models that are trained on intent classification tasks. We can use these model embedding to find natural language utterances which are similar to our test utterances at test time. The steps are: 1. Embed\"\n }\n },\n {\n \"[doc3]\": {\n \"content\": \"train and test utterances using DTE model 2. For each test embedding, find K-nearest neighbors. 3. Prefix the prompt with nearest embeddings. The following diagram from the above paper (https://arxiv.org/pdf/2101.06804.pdf) the above paper visualizes this process: DTE-Finetuned This is an extension of DTE-pretrained method where we further finetune the embedding models for prompt crafting task. In summary, we sample random prompts from our training data and use them for GPT-3 inference for the another part of training data. Some prompts work better and lead to right results whereas other prompts lead\"\n }\n },\n {\n \"[doc4]\": {\n \"content\": \"to wrong completions. We finetune the model on the downstream task of whether a prompt is good or not based on whether it leads to right or wrong completion. This approach is similar to this paper: Learning To Retrieve Prompts for In-Context Learning (https://arxiv.org/pdf/2112.08633.pdf) this paper: Learning To Retrieve Prompts for In-Context Learning . This method is very general but it may require a lot of data to actually finetune a model to learn how to retrieve examples suitable for the downstream inference model like GPT-3.\"\n }\n }\n ]\n}",

code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py

+11

@@ -15,11 +15,16 @@ class IntegratedVectorizationEmbedder(EmbedderBase):
     def __init__(self, env_helper: EnvHelper):
         self.env_helper = env_helper
         self.llm_helper: LLMHelper = LLMHelper()
+        logger.info("Initialized IntegratedVectorizationEmbedder.")

     def embed_file(self, source_url: str, file_name: str = None):
+        logger.info(
+            f"Starting embed_file for source_url: {source_url}, file_name: {file_name}."
+        )
         self.process_using_integrated_vectorization(source_url=source_url)

     def process_using_integrated_vectorization(self, source_url: str):
+        logger.info(f"Starting integrated vectorization for source_url: {source_url}.")
         config = ConfigHelper.get_active_config_or_default()
         try:
             search_datasource = AzureSearchDatasource(self.env_helper)
@@ -35,14 +40,20 @@ def process_using_integrated_vectorization(self, source_url: str):
                 self.env_helper.AZURE_SEARCH_INDEXER_NAME,
                 skillset_name=search_skillset_result.name,
             )
+            logger.info("Integrated vectorization process completed successfully.")
             return indexer_result
         except Exception as e:
             logger.error(f"Error processing {source_url}: {e}")
             raise e

     def reprocess_all(self):
+        logger.info("Starting reprocess_all operation.")
         search_indexer = AzureSearchIndexer(self.env_helper)
         if search_indexer.indexer_exists(self.env_helper.AZURE_SEARCH_INDEXER_NAME):
+            logger.info(
+                f"Running indexer: {self.env_helper.AZURE_SEARCH_INDEXER_NAME}."
+            )
             search_indexer.run_indexer(self.env_helper.AZURE_SEARCH_INDEXER_NAME)
         else:
+            logger.info("Indexer does not exist. Starting full processing.")
             self.process_using_integrated_vectorization(source_url="all")
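
A minimal sketch of the check-and-run indexer pattern that AzureSearchIndexer is assumed to wrap, using the azure-search-documents SDK; the search endpoint and indexer name are placeholders:

```python
from azure.identity import DefaultAzureCredential
from azure.search.documents.indexes import SearchIndexerClient

# Sketch only: run an existing indexer; otherwise a full (re)process is needed.
client = SearchIndexerClient(
    endpoint="https://example-search.search.windows.net",  # placeholder
    credential=DefaultAzureCredential(),
)
indexer_name = "example-indexer"  # placeholder
if indexer_name in client.get_indexer_names():
    client.run_indexer(indexer_name)
```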

code/backend/batch/utilities/helpers/embedders/postgres_embedder.py

+13

@@ -20,6 +20,7 @@

 class PostgresEmbedder(EmbedderBase):
     def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
+        logger.info("Initializing PostgresEmbedder.")
         self.env_helper = env_helper
         self.llm_helper = LLMHelper()
         self.azure_postgres_helper = AzurePostgresHelper()
@@ -33,6 +34,7 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
             self.embedding_configs[ext] = processor

     def embed_file(self, source_url: str, file_name: str):
+        logger.info(f"Embedding file: {file_name} from source: {source_url}")
         file_extension = file_name.split(".")[-1].lower()
         embedding_config = self.embedding_configs.get(file_extension)
         self.__embed(
@@ -48,32 +50,42 @@ def embed_file(self, source_url: str, file_name: str):
     def __embed(
         self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig
     ):
+        logger.info(f"Starting embedding process for source: {source_url}")
         documents_to_upload: List[SourceDocument] = []
         if (
             embedding_config.use_advanced_image_processing
             and file_extension
             in self.config.get_advanced_image_processing_image_types()
         ):
+            logger.error(
+                "Advanced image processing is not supported in PostgresEmbedder."
+            )
             raise NotImplementedError(
                 "Advanced image processing is not supported in PostgresEmbedder."
             )
         else:
+            logger.info(f"Loading documents from source: {source_url}")
             documents: List[SourceDocument] = self.document_loading.load(
                 source_url, embedding_config.loading
             )
             documents = self.document_chunking.chunk(
                 documents, embedding_config.chunking
             )
+            logger.info("Chunked into document chunks.")

             for document in documents:
                 documents_to_upload.append(self.__convert_to_search_document(document))

             if documents_to_upload:
+                logger.info(
+                    f"Uploading {len(documents_to_upload)} documents to vector store."
+                )
                 self.azure_postgres_helper.create_vector_store(documents_to_upload)
             else:
                 logger.warning("No documents to upload.")

     def __convert_to_search_document(self, document: SourceDocument):
+        logger.info(f"Generating embeddings for document ID: {document.id}")
         embedded_content = self.llm_helper.generate_embeddings(document.content)
         metadata = {
             "id": document.id,
@@ -84,6 +96,7 @@ def __convert_to_search_document(self, document: SourceDocument):
             "offset": document.offset,
             "page_number": document.page_number,
         }
+        logger.info(f"Metadata generated for document ID: {document.id}")
         return {
             "id": document.id,
             "content": document.content,