Merge pull request #31 from Azure-Samples/docx

pamelafox · web-flow · commit 25b19b2447bf · 2025-10-14T23:07:12.000-07:00
Add markitdown example and optimize dev container for Codespaces
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -0,0 +1,5 @@
+FROM mcr.microsoft.com/devcontainers/python:3.12-bookworm
+
+COPY requirements.txt /tmp/pip-tmp/
+RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
+    && rm -rf /tmp/pip-tmp
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -1,6 +1,9 @@
 {
-    "name": "azure-openai-keyless",
-    "image": "mcr.microsoft.com/devcontainers/python:3.12-bullseye",
+    "name": "azure-openai-entity-extraction",
+    "build": {
+        "dockerfile": "Dockerfile",
+        "context": ".."
+    },
     "features": {
         "ghcr.io/azure/azure-dev/azd:latest": {}
     },
@@ -15,6 +18,8 @@
                 "humao.rest-client"
             ],
             "python.defaultInterpreterPath": "/usr/local/bin/python",
+            "python.analysis.typeCheckingMode": "off",
+            "remote.autoForwardPorts": false,
             "[python]": {
                 "editor.formatOnSave": true,
                 "editor.codeActionsOnSave": {
@@ -24,9 +29,5 @@
 			}
         }
     },
-    "postCreateCommand": "pip install -r requirements.txt",
-    "remoteUser": "vscode",
-    "hostRequirements": {
-        "memory": "8gb"
-    }
+    "remoteUser": "vscode"
 }
diff --git a/basic_githubmodels.py b/basic_githubmodels.py
@@ -29,7 +29,7 @@ class CalendarEvent(BaseModel):
 )
 
 message = completion.choices[0].message
-if (message.refusal):
+if message.refusal:
     rich.print(message.refusal)
 else:
     rich.print(message.parsed)
diff --git a/example_doc.docx b/example_doc.docx
diff --git a/extract_pdf_receipt.py b/extract_pdf_receipt.py
@@ -59,7 +59,7 @@ class Receipt(BaseModel):
 completion = client.beta.chat.completions.parse(
     model=model_name,
     messages=[
-        {"role": "system", "content": "Extract the information the receipt"},
+        {"role": "system", "content": "Extract the information from the receipt"},
         {"role": "user", "content": md_text},
     ],
     response_format=Receipt,
diff --git a/extract_word_docx.py b/extract_word_docx.py
@@ -0,0 +1,65 @@
+import logging
+import os
+
+import openai
+from dotenv import load_dotenv
+from markitdown import MarkItDown
+from pydantic import BaseModel
+from rich import print
+
+logging.basicConfig(level=logging.WARNING)
+load_dotenv(override=True)
+
+if os.getenv("OPENAI_HOST", "github") == "azure":  # OPENAI_HOST selects the backend; unset means GitHub Models
+    import azure.identity
+
+    if not os.getenv("AZURE_OPENAI_SERVICE") or not os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT"):
+        logging.warning("AZURE_OPENAI_SERVICE and AZURE_OPENAI_GPT_DEPLOYMENT env variables are empty. See README.")
+        raise SystemExit(1)  # not exit(): that helper is injected by the site module and may be absent
+    credential = azure.identity.AzureDeveloperCliCredential(tenant_id=os.getenv("AZURE_TENANT_ID"))
+    token_provider = azure.identity.get_bearer_token_provider(
+        credential, "https://cognitiveservices.azure.com/.default"
+    )
+    client = openai.OpenAI(
+        base_url=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com/openai/v1",
+        api_key=token_provider,  # bearer-token provider from azure.identity, not a static key
+    )
+    model_name = os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT")
+else:
+    if not os.getenv("GITHUB_TOKEN"):
+        logging.warning("GITHUB_TOKEN env variable is empty. See README.")
+        raise SystemExit(1)  # same exit status as before, without relying on the site-provided exit()
+    client = openai.OpenAI(
+        base_url="https://models.github.ai/inference",
+        api_key=os.environ["GITHUB_TOKEN"],
+        default_query={"api-version": "2024-08-01-preview"},
+    )
+    model_name = "openai/gpt-4o"
+
+
+# Define models for Structured Outputs
+class DocumentMetadata(BaseModel):  # schema passed as response_format to the parse() call below
+    title: str  # document title
+    author: str | None  # may be None (presumably when the document names no author; confirm)
+    headings: list[str]  # headings found in the document
+
+
+# Use markitdown to convert docx to markdown
+md = MarkItDown(enable_plugins=False)
+markdown_text = md.convert("example_doc.docx").text_content  # text of the converted example document
+
+# Send request to LLM to extract using Structured Outputs
+completion = client.beta.chat.completions.parse(
+    model=model_name,  # set by whichever backend branch ran above
+    messages=[
+        {"role": "system", "content": "Extract the document title, author, and a list of all headings."},
+        {"role": "user", "content": markdown_text},  # the whole converted document is the user message
+    ],
+    response_format=DocumentMetadata,
+)
+
+message = completion.choices[0].message  # only the first choice is inspected
+if message.refusal:  # a truthy refusal is printed in place of the parsed result
+    print(message.refusal)  # print here is rich.print (imported at the top of the file)
+else:
+    print(message.parsed)
diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,4 @@ beautifulsoup4
 pymupdf4llm
 azure-ai-inference
 dotenv-azd
+markitdown[docx]

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ class CalendarEvent(BaseModel):`
`29`	`29`	`)`
`30`	`30`
`31`	`31`	`message = completion.choices[0].message`
`32`		`-if (message.refusal):`
	`32`	`+if message.refusal:`
`33`	`33`	`    rich.print(message.refusal)`
`34`	`34`	`else:`
`35`	`35`	`    rich.print(message.parsed)`