-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Fix media type inference for URLs with query parameters #3501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
fedexman
wants to merge
4
commits into
pydantic:main
Choose a base branch
from
fedexman:feature/parse-url-to-handle-query-string-in-mediatype
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import base64 | ||
| import hashlib | ||
| import mimetypes | ||
| from abc import ABC, abstractmethod | ||
| from collections.abc import Sequence | ||
| from dataclasses import KW_ONLY, dataclass, field, replace | ||
|
|
@@ -10,6 +11,7 @@ | |
| from os import PathLike | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Annotated, Any, Literal, TypeAlias, cast, overload | ||
| from urllib.parse import urlparse | ||
|
|
||
| import pydantic | ||
| import pydantic_core | ||
|
|
@@ -25,6 +27,26 @@ | |
| if TYPE_CHECKING: | ||
| from .models.instrumented import InstrumentationSettings | ||
|
|
||
| # Register manually MIME types that are not in the standard library | ||
| # Document types | ||
| mimetypes.add_type('text/markdown', '.mdx') | ||
| mimetypes.add_type('text/x-asciidoc', '.asciidoc') | ||
|
|
||
| # Video types | ||
| mimetypes.add_type('video/3gpp', '.three_gp') | ||
| mimetypes.add_type('video/x-flv', '.flv') | ||
| mimetypes.add_type('video/x-matroska', '.mkv') | ||
| mimetypes.add_type('video/x-ms-wmv', '.wmv') | ||
|
|
||
| # Audio types | ||
| mimetypes.add_type('audio/flac', '.flac') | ||
| mimetypes.add_type('audio/mpeg', '.mp3') | ||
| mimetypes.add_type('audio/ogg', '.oga') | ||
| # override stdlib mimetypes that use x- prefix with standard types | ||
| mimetypes.add_type('audio/aac', '.aac') | ||
| mimetypes.add_type('audio/aiff', '.aiff') | ||
| mimetypes.add_type('audio/wav', '.wav') | ||
|
Comment on lines
+30
to
+48
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will affect the global `mimetypes` database for the current interpreter. Let's instantiate an explicit `mimetypes.MimeTypes` instance instead. You can then use your instance's `guess_type()` method. |
||
|
|
||
|
|
||
| AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg', 'audio/ogg', 'audio/flac', 'audio/aiff', 'audio/aac'] | ||
| ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp'] | ||
|
|
@@ -226,38 +248,27 @@ def __init__( | |
| ) | ||
| self.kind = kind | ||
|
|
||
| def _infer_media_type(self) -> VideoMediaType: | ||
| def _infer_media_type(self) -> str: | ||
| """Return the media type of the video, based on the url.""" | ||
| if self.url.endswith('.mkv'): | ||
| return 'video/x-matroska' | ||
| elif self.url.endswith('.mov'): | ||
| return 'video/quicktime' | ||
| elif self.url.endswith('.mp4'): | ||
| return 'video/mp4' | ||
| elif self.url.endswith('.webm'): | ||
| return 'video/webm' | ||
| elif self.url.endswith('.flv'): | ||
| return 'video/x-flv' | ||
| elif self.url.endswith(('.mpeg', '.mpg')): | ||
| return 'video/mpeg' | ||
| elif self.url.endswith('.wmv'): | ||
| return 'video/x-ms-wmv' | ||
| elif self.url.endswith('.three_gp'): | ||
| return 'video/3gpp' | ||
| # Assume that YouTube videos are mp4 because there would be no extension | ||
| # to infer from. This should not be a problem, as Gemini disregards media | ||
| # type for YouTube URLs. | ||
| elif self.is_youtube: | ||
| if self.is_youtube: | ||
| return 'video/mp4' | ||
| else: | ||
|
|
||
| mime_type, _ = guess_type(self.url) | ||
| if mime_type is None: | ||
| raise ValueError( | ||
| f'Could not infer media type from video URL: {self.url}. Explicitly provide a `media_type` instead.' | ||
| ) | ||
| return mime_type | ||
|
|
||
| @property | ||
| def is_youtube(self) -> bool: | ||
| """True if the URL has a YouTube domain.""" | ||
| return self.url.startswith(('https://youtu.be/', 'https://youtube.com/', 'https://www.youtube.com/')) | ||
| parsed = urlparse(self.url) | ||
| hostname = parsed.hostname or '' | ||
| return hostname in ('youtu.be', 'youtube.com', 'www.youtube.com') | ||
|
|
||
| @property | ||
| def format(self) -> VideoFormat: | ||
|
|
@@ -302,28 +313,18 @@ def __init__( | |
| ) | ||
| self.kind = kind | ||
|
|
||
| def _infer_media_type(self) -> AudioMediaType: | ||
| def _infer_media_type(self) -> str: | ||
| """Return the media type of the audio file, based on the url. | ||
|
|
||
| References: | ||
| - Gemini: https://ai.google.dev/gemini-api/docs/audio#supported-formats | ||
| """ | ||
| if self.url.endswith('.mp3'): | ||
| return 'audio/mpeg' | ||
| if self.url.endswith('.wav'): | ||
| return 'audio/wav' | ||
| if self.url.endswith('.flac'): | ||
| return 'audio/flac' | ||
| if self.url.endswith('.oga'): | ||
| return 'audio/ogg' | ||
| if self.url.endswith('.aiff'): | ||
| return 'audio/aiff' | ||
| if self.url.endswith('.aac'): | ||
| return 'audio/aac' | ||
|
|
||
| raise ValueError( | ||
| f'Could not infer media type from audio URL: {self.url}. Explicitly provide a `media_type` instead.' | ||
| ) | ||
| mime_type, _ = guess_type(self.url) | ||
| if mime_type is None: | ||
| raise ValueError( | ||
| f'Could not infer media type from audio URL: {self.url}. Explicitly provide a `media_type` instead.' | ||
| ) | ||
| return mime_type | ||
|
|
||
| @property | ||
| def format(self) -> AudioFormat: | ||
|
|
@@ -365,20 +366,14 @@ def __init__( | |
| ) | ||
| self.kind = kind | ||
|
|
||
| def _infer_media_type(self) -> ImageMediaType: | ||
| def _infer_media_type(self) -> str: | ||
| """Return the media type of the image, based on the url.""" | ||
| if self.url.endswith(('.jpg', '.jpeg')): | ||
| return 'image/jpeg' | ||
| elif self.url.endswith('.png'): | ||
| return 'image/png' | ||
| elif self.url.endswith('.gif'): | ||
| return 'image/gif' | ||
| elif self.url.endswith('.webp'): | ||
| return 'image/webp' | ||
| else: | ||
| mime_type, _ = guess_type(self.url) | ||
| if mime_type is None: | ||
| raise ValueError( | ||
| f'Could not infer media type from image URL: {self.url}. Explicitly provide a `media_type` instead.' | ||
| ) | ||
| return mime_type | ||
|
|
||
| @property | ||
| def format(self) -> ImageFormat: | ||
|
|
@@ -425,33 +420,12 @@ def __init__( | |
|
|
||
| def _infer_media_type(self) -> str: | ||
| """Return the media type of the document, based on the url.""" | ||
| # Common document types are hardcoded here as mime-type support for these | ||
| # extensions varies across operating systems. | ||
| if self.url.endswith(('.md', '.mdx', '.markdown')): | ||
| return 'text/markdown' | ||
| elif self.url.endswith('.asciidoc'): | ||
| return 'text/x-asciidoc' | ||
| elif self.url.endswith('.txt'): | ||
| return 'text/plain' | ||
| elif self.url.endswith('.pdf'): | ||
| return 'application/pdf' | ||
| elif self.url.endswith('.rtf'): | ||
| return 'application/rtf' | ||
| elif self.url.endswith('.doc'): | ||
| return 'application/msword' | ||
| elif self.url.endswith('.docx'): | ||
| return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' | ||
| elif self.url.endswith('.xls'): | ||
| return 'application/vnd.ms-excel' | ||
| elif self.url.endswith('.xlsx'): | ||
| return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' | ||
|
|
||
| type_, _ = guess_type(self.url) | ||
| if type_ is None: | ||
| mime_type, _ = guess_type(self.url) | ||
| if mime_type is None: | ||
| raise ValueError( | ||
| f'Could not infer media type from document URL: {self.url}. Explicitly provide a `media_type` instead.' | ||
| ) | ||
| return type_ | ||
| return mime_type | ||
|
|
||
| @property | ||
| def format(self) -> DocumentFormat: | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why aren't we using the `mimetypes` stdlib module?
`mimetypes.guess_type()` already parses URLs, and the current implementation doesn't take into account case insensitivity, etc. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Viicos Interestingly, we already use that in `DocumentUrl._infer_media_type`, after checking a bunch of types ourselves :/
@fedexman Can you see if we can use `mimetypes.guess_type()` for all of these?
The method can be changed to just return `str` rather than `XMediaType`, as I don't think that type is used on any public fields.