Skip to content

Commit 25b19b2

Browse files
authored
Merge pull request #31 from Azure-Samples/docx
Add markitdown example and optimize dev container for Codespaces
2 parents f688ddf + bb48350 commit 25b19b2

File tree

7 files changed: 81 additions (+81) and 9 deletions (-9).

.devcontainer/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
FROM mcr.microsoft.com/devcontainers/python:3.12-bookworm
2+
3+
COPY requirements.txt /tmp/pip-tmp/
4+
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
5+
&& rm -rf /tmp/pip-tmp

.devcontainer/devcontainer.json

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
{
2-
"name": "azure-openai-keyless",
3-
"image": "mcr.microsoft.com/devcontainers/python:3.12-bullseye",
2+
"name": "azure-openai-entity-extraction",
3+
"build": {
4+
"dockerfile": "Dockerfile",
5+
"context": ".."
6+
},
47
"features": {
58
"ghcr.io/azure/azure-dev/azd:latest": {}
69
},
@@ -15,6 +18,8 @@
1518
"humao.rest-client"
1619
],
1720
"python.defaultInterpreterPath": "/usr/local/bin/python",
21+
"python.analysis.typeCheckingMode": "off",
22+
"remote.autoForwardPorts": false,
1823
"[python]": {
1924
"editor.formatOnSave": true,
2025
"editor.codeActionsOnSave": {
@@ -24,9 +29,5 @@
2429
}
2530
}
2631
},
27-
"postCreateCommand": "pip install -r requirements.txt",
28-
"remoteUser": "vscode",
29-
"hostRequirements": {
30-
"memory": "8gb"
31-
}
32+
"remoteUser": "vscode"
3233
}

basic_githubmodels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class CalendarEvent(BaseModel):
2929
)
3030

3131
message = completion.choices[0].message
32-
if (message.refusal):
32+
if message.refusal:
3333
rich.print(message.refusal)
3434
else:
3535
rich.print(message.parsed)

example_doc.docx

18.3 KB
Binary file not shown.

extract_pdf_receipt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class Receipt(BaseModel):
5959
completion = client.beta.chat.completions.parse(
6060
model=model_name,
6161
messages=[
62-
{"role": "system", "content": "Extract the information the receipt"},
62+
{"role": "system", "content": "Extract the information from the receipt"},
6363
{"role": "user", "content": md_text},
6464
],
6565
response_format=Receipt,

extract_word_docx.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import logging
2+
import os
3+
4+
import openai
5+
from dotenv import load_dotenv
6+
from markitdown import MarkItDown
7+
from pydantic import BaseModel
8+
from rich import print
9+
10+
logging.basicConfig(level=logging.WARNING)
11+
load_dotenv(override=True)
12+
13+
if os.getenv("OPENAI_HOST", "github") == "azure":
14+
import azure.identity
15+
16+
if not os.getenv("AZURE_OPENAI_SERVICE") or not os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT"):
17+
logging.warning("AZURE_OPENAI_SERVICE and AZURE_OPENAI_GPT_DEPLOYMENT env variables are empty. See README.")
18+
exit(1)
19+
credential = azure.identity.AzureDeveloperCliCredential(tenant_id=os.getenv("AZURE_TENANT_ID"))
20+
token_provider = azure.identity.get_bearer_token_provider(
21+
credential, "https://cognitiveservices.azure.com/.default"
22+
)
23+
client = openai.OpenAI(
24+
base_url=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com/openai/v1",
25+
api_key=token_provider,
26+
)
27+
model_name = os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT")
28+
else:
29+
if not os.getenv("GITHUB_TOKEN"):
30+
logging.warning("GITHUB_TOKEN env variable is empty. See README.")
31+
exit(1)
32+
client = openai.OpenAI(
33+
base_url="https://models.github.ai/inference",
34+
api_key=os.environ["GITHUB_TOKEN"],
35+
default_query={"api-version": "2024-08-01-preview"},
36+
)
37+
model_name = "openai/gpt-4o"
38+
39+
40+
# Define models for Structured Outputs
41+
class DocumentMetadata(BaseModel):
42+
title: str
43+
author: str | None
44+
headings: list[str]
45+
46+
47+
# Use markitdown to convert docx to markdown
48+
md = MarkItDown(enable_plugins=False)
49+
markdown_text = md.convert("example_doc.docx").text_content
50+
51+
# Send request to LLM to extract using Structured Outputs
52+
completion = client.beta.chat.completions.parse(
53+
model=model_name,
54+
messages=[
55+
{"role": "system", "content": "Extract the document title, author, and a list of all headings."},
56+
{"role": "user", "content": markdown_text},
57+
],
58+
response_format=DocumentMetadata,
59+
)
60+
61+
message = completion.choices[0].message
62+
if message.refusal:
63+
print(message.refusal)
64+
else:
65+
print(message.parsed)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ beautifulsoup4
77
pymupdf4llm
88
azure-ai-inference
99
dotenv-azd
10+
markitdown[docx]

Comments: 0 commit comments