Commit a9589e1

Merge branch 'main' into transformers-save-dtensor
2 parents 94abfb4 + 785835f

22 files changed: +263 −37 lines

.github/conda/meta.yaml (+2)

@@ -21,6 +21,7 @@ requirements:
     - typing-extensions
     - packaging
     - pyyaml
+    - hf-xet >=1.1.0,<2.0.0
   run:
     - python
     - pip
@@ -30,6 +31,7 @@ requirements:
     - typing-extensions
     - packaging
     - pyyaml
+    - hf-xet >=1.1.0,<2.0.0
 
 test:
   imports:

docs/source/en/guides/manage-cache.md (+1 −1)

@@ -21,7 +21,7 @@ The caching system is designed as follows:
 ├─ <SPACES>
 ```
 
-The `<CACHE_DIR>` is usually your user's home directory. However, it is customizable with the `cache_dir` argument on all methods, or by specifying either `HF_HOME` or `HF_HUB_CACHE` environment variable.
+The default `<CACHE_DIR>` is `~/.cache/huggingface/hub`. However, it is customizable with the `cache_dir` argument on all methods, or by specifying either `HF_HOME` or `HF_HUB_CACHE` environment variable.
 
 Models, datasets and spaces share a common root. Each of these repositories contains the
 repository type, the namespace (organization or username) if it exists and the
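For reference, a minimal sketch of the two override mechanisms the changed line mentions; the repo, filename, and paths are placeholders:

```python
import os

# Option 1: redirect the whole cache via environment variable
# (set before huggingface_hub is imported).
os.environ["HF_HUB_CACHE"] = "/data/hf-cache"

from huggingface_hub import hf_hub_download

# Option 2: override per call with the `cache_dir` argument.
path = hf_hub_download(repo_id="gpt2", filename="config.json", cache_dir="/tmp/hf-cache")
print(path)  # resolves under /tmp/hf-cache instead of the default <CACHE_DIR>
```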

docs/source/en/package_reference/environment_variables.md (+5)

@@ -86,6 +86,7 @@ Integer value to define the number of seconds to wait for server response when d
 ## Xet
 
 ### Other Xet environment variables
+* [`HF_HUB_DISABLE_XET`](../package_reference/environment_variables#hfhubdisablexet)
 * [`HF_XET_CACHE`](../package_reference/environment_variables#hfxetcache)
 * [`HF_XET_HIGH_PERFORMANCE`](../package_reference/environment_variables#hfxethighperformance)
 * [`HF_XET_RECONSTRUCT_WRITE_SEQUENTIALLY`](../package_reference/environment_variables#hfxetreconstructwritesequentially)
@@ -164,6 +165,10 @@ Each library defines its own policy (i.e. which usage to monitor) but the core i
 
 You can set `HF_HUB_DISABLE_TELEMETRY=1` as environment variable to globally disable telemetry.
 
+### HF_HUB_DISABLE_XET
+
+Set to disable using `hf-xet`, even if it is available in your Python environment. Since `hf-xet` is used automatically whenever it is installed, this variable allows explicitly opting out of it.
+
 ### HF_HUB_ENABLE_HF_TRANSFER
 
 Set to `True` for faster uploads and downloads from the Hub using `hf_transfer`.
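A minimal sketch of the new opt-out, using the same import path the CLI change in this commit relies on; the variable is read from the environment each time availability is checked, per the `_runtime.py` change at the end of this diff:

```python
import os

os.environ["HF_HUB_DISABLE_XET"] = "1"  # opt out even if hf-xet is installed

from huggingface_hub.utils._runtime import is_xet_available

assert not is_xet_available()  # transfers fall back to regular HTTP
```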

setup.py (+2 −2)

@@ -14,7 +14,7 @@ def get_version() -> str:
 install_requires = [
     "filelock",
     "fsspec>=2023.5.0",
-    "hf-xet>=1.0.2,<2.0.0; platform_machine=='x86_64' or platform_machine=='amd64' or platform_machine=='arm64' or platform_machine=='aarch64'",
+    "hf-xet>=1.1.0,<2.0.0; platform_machine=='x86_64' or platform_machine=='amd64' or platform_machine=='arm64' or platform_machine=='aarch64'",
     "packaging>=20.9",
     "pyyaml>=5.1",
     "requests",
@@ -56,7 +56,7 @@ def get_version() -> str:
     "keras<3.0",
 ]
 
-extras["hf_xet"] = ["hf_xet>=1.0.2,<2.0.0"]
+extras["hf_xet"] = ["hf_xet>=1.1.0,<2.0.0"]
 
 extras["testing"] = (
     extras["cli"]

src/huggingface_hub/_commit_api.py (+23 −4)

@@ -530,7 +530,7 @@ def _upload_xet_files(
     if len(additions) == 0:
         return
     # at this point, we know that hf_xet is installed
-    from hf_xet import upload_files
+    from hf_xet import upload_bytes, upload_files
 
     try:
         xet_connection_info = fetch_xet_connection_info_from_repo_info(
@@ -571,8 +571,10 @@ def token_refresher() -> Tuple[str, int]:
     num_chunks_num_digits = int(math.log10(num_chunks)) + 1
     for i, chunk in enumerate(chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES)):
         _chunk = [op for op in chunk]
-        paths = [str(op.path_or_fileobj) for op in _chunk]
-        expected_size = sum([os.path.getsize(path) for path in paths])
+
+        bytes_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, bytes)]
+        paths_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, (str, Path))]
+        expected_size = sum(op.upload_info.size for op in bytes_ops + paths_ops)
 
         if num_chunks > 1:
             description = f"Uploading Batch [{str(i + 1).zfill(num_chunks_num_digits)}/{num_chunks}]..."
@@ -592,7 +594,24 @@ def token_refresher() -> Tuple[str, int]:
         def update_progress(increment: int):
             progress.update(increment)
 
-        upload_files(paths, xet_endpoint, access_token_info, token_refresher, update_progress, repo_type)
+        if len(paths_ops) > 0:
+            upload_files(
+                [str(op.path_or_fileobj) for op in paths_ops],
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                update_progress,
+                repo_type,
+            )
+        if len(bytes_ops) > 0:
+            upload_bytes(
+                [op.path_or_fileobj for op in bytes_ops],
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                update_progress,
+                repo_type,
+            )
     return
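A standalone sketch of the dispatch introduced above: in-memory `bytes` payloads are routed to `upload_bytes`, on-disk paths to `upload_files`. The values below are hypothetical stand-ins for `CommitOperationAdd.path_or_fileobj`:

```python
from pathlib import Path

chunk = [b"raw in-memory payload", "weights/model.safetensors", Path("tokenizer.json")]

bytes_ops = [op for op in chunk if isinstance(op, bytes)]
paths_ops = [op for op in chunk if isinstance(op, (str, Path))]

# Every addition is routed exactly once, so progress accounting stays consistent
# with expected_size computed over bytes_ops + paths_ops.
assert len(bytes_ops) + len(paths_ops) == len(chunk)
```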

src/huggingface_hub/_upload_large_folder.py (+1 −1)

@@ -209,7 +209,7 @@ def target_chunk(self) -> int:
     def update_chunk(self, success: bool, nb_items: int, duration: float) -> None:
         with self._chunk_lock:
             if not success:
-                logger.warn(f"Failed to commit {nb_items} files at once. Will retry with less files in next batch.")
+                logger.warning(f"Failed to commit {nb_items} files at once. Will retry with less files in next batch.")
                 self._chunk_idx -= 1
             elif nb_items >= COMMIT_SIZE_SCALE[self._chunk_idx] and duration < 40:
                 logger.info(f"Successfully committed {nb_items} at once. Increasing the limit for next batch.")
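Context for the one-liner above: `Logger.warn` is a deprecated alias of `Logger.warning` in the standard library (Python 3.13 emits a `DeprecationWarning` when it is called), hence the rename:

```python
import logging

logger = logging.getLogger(__name__)

logger.warning("Failed to commit %d files at once.", 3)  # preferred spelling
# logger.warn(...) is the deprecated alias the diff removes
```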

src/huggingface_hub/commands/upload.py (+2 −1)

@@ -59,6 +59,7 @@
 from huggingface_hub.errors import RevisionNotFoundError
 from huggingface_hub.hf_api import HfApi
 from huggingface_hub.utils import disable_progress_bars, enable_progress_bars
+from huggingface_hub.utils._runtime import is_xet_available
 
 
 logger = logging.get_logger(__name__)
@@ -215,7 +216,7 @@ def _upload(self) -> str:
         if self.delete is not None and len(self.delete) > 0:
             warnings.warn("Ignoring `--delete` since a single file is uploaded.")
 
-        if not HF_HUB_ENABLE_HF_TRANSFER:
+        if not is_xet_available() and not HF_HUB_ENABLE_HF_TRANSFER:
             logger.info(
                 "Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See"
                 " https://huggingface.co/docs/huggingface_hub/hf_transfer for more details."

src/huggingface_hub/file_download.py (+6 −4)

@@ -582,7 +582,7 @@ def xet_get(
     """
     try:
-        from hf_xet import PyPointerFile, download_files  # type: ignore[no-redef]
+        from hf_xet import PyXetDownloadInfo, download_files  # type: ignore[no-redef]
     except ImportError:
         raise ValueError(
             "To use optimized download using Xet storage, you need to install the hf_xet package. "
@@ -597,8 +597,10 @@ def token_refresher() -> Tuple[str, int]:
             raise ValueError("Failed to refresh token using xet metadata.")
         return connection_info.access_token, connection_info.expiration_unix_epoch
 
-    pointer_files = [
-        PyPointerFile(path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, filesize=expected_size)
+    xet_download_info = [
+        PyXetDownloadInfo(
+            destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
+        )
     ]
 
     if not displayed_filename:
@@ -623,7 +625,7 @@ def progress_updater(progress_bytes: float):
         progress.update(progress_bytes)
 
     download_files(
-        pointer_files,
+        xet_download_info,
         endpoint=connection_info.endpoint,
         token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
         token_refresher=token_refresher,

src/huggingface_hub/hf_api.py (+21 −12)

@@ -4475,18 +4475,17 @@ def preupload_lfs_files(
             expand="xetEnabled",
             token=token,
         ).xet_enabled
-        has_binary_data = any(
-            isinstance(addition.path_or_fileobj, (bytes, io.BufferedIOBase))
-            for addition in new_lfs_additions_to_upload
+        has_buffered_io_data = any(
+            isinstance(addition.path_or_fileobj, io.BufferedIOBase) for addition in new_lfs_additions_to_upload
         )
-        if xet_enabled and not has_binary_data and is_xet_available():
+        if xet_enabled and not has_buffered_io_data and is_xet_available():
             logger.info("Uploading files using Xet Storage..")
             _upload_xet_files(**upload_kwargs, create_pr=create_pr)  # type: ignore [arg-type]
         else:
             if xet_enabled and is_xet_available():
-                if has_binary_data:
+                if has_buffered_io_data:
                     logger.warning(
-                        "Uploading files as bytes or binary IO objects is not supported by Xet Storage. "
+                        "Uploading files as a binary IO buffer is not supported by Xet Storage. "
                         "Falling back to HTTP upload."
                     )
             _upload_lfs_files(**upload_kwargs, num_threads=num_threads)  # type: ignore [arg-type]
@@ -7573,6 +7572,7 @@ def create_inference_endpoint(
         revision: Optional[str] = None,
         task: Optional[str] = None,
         custom_image: Optional[Dict] = None,
+        env: Optional[Dict[str, str]] = None,
         secrets: Optional[Dict[str, str]] = None,
         type: InferenceEndpointType = InferenceEndpointType.PROTECTED,
         domain: Optional[str] = None,
@@ -7616,6 +7616,8 @@ def create_inference_endpoint(
             custom_image (`Dict`, *optional*):
                 A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
                 Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
+            env (`Dict[str, str]`, *optional*):
+                Non-secret environment variables to inject in the container environment.
             secrets (`Dict[str, str]`, *optional*):
                 Secret values to inject in the container environment.
             type ([`InferenceEndpointType]`, *optional*):
@@ -7678,14 +7680,14 @@ def create_inference_endpoint(
         ...     type="protected",
         ...     instance_size="x1",
         ...     instance_type="nvidia-a10g",
+        ...     env={
+        ...         "MAX_BATCH_PREFILL_TOKENS": "2048",
+        ...         "MAX_INPUT_LENGTH": "1024",
+        ...         "MAX_TOTAL_TOKENS": "1512",
+        ...         "MODEL_ID": "/repository"
+        ...     },
         ...     custom_image={
         ...         "health_route": "/health",
-        ...         "env": {
-        ...             "MAX_BATCH_PREFILL_TOKENS": "2048",
-        ...             "MAX_INPUT_LENGTH": "1024",
-        ...             "MAX_TOTAL_TOKENS": "1512",
-        ...             "MODEL_ID": "/repository"
-        ...         },
         ...         "url": "ghcr.io/huggingface/text-generation-inference:1.1.0",
         ...     },
         ...     secrets={"MY_SECRET_KEY": "secret_value"},
@@ -7723,6 +7725,8 @@ def create_inference_endpoint(
             },
             "type": type,
         }
+        if env:
+            payload["model"]["env"] = env
         if secrets:
             payload["model"]["secrets"] = secrets
         if domain is not None or path is not None:
@@ -7897,6 +7901,7 @@ def update_inference_endpoint(
         revision: Optional[str] = None,
         task: Optional[str] = None,
         custom_image: Optional[Dict] = None,
+        env: Optional[Dict[str, str]] = None,
        secrets: Optional[Dict[str, str]] = None,
         # Route update
         domain: Optional[str] = None,
@@ -7942,6 +7947,8 @@ def update_inference_endpoint(
             custom_image (`Dict`, *optional*):
                 A custom Docker image to use for the Inference Endpoint. This is useful if you want to deploy an
                 Inference Endpoint running on the `text-generation-inference` (TGI) framework (see examples).
+            env (`Dict[str, str]`, *optional*):
+                Non-secret environment variables to inject in the container environment.
             secrets (`Dict[str, str]`, *optional*):
                 Secret values to inject in the container environment.
@@ -7992,6 +7999,8 @@ def update_inference_endpoint(
             payload["model"]["task"] = task
         if custom_image is not None:
             payload["model"]["image"] = {"custom": custom_image}
+        if env is not None:
+            payload["model"]["env"] = env
         if secrets is not None:
             payload["model"]["secrets"] = secrets
         if domain is not None:
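The docstring example above covers `create_inference_endpoint`; a matching hedged sketch for the updated `update_inference_endpoint`, with a hypothetical endpoint name:

```python
from huggingface_hub import HfApi

api = HfApi()
# Update only the non-secret container environment of an existing endpoint.
endpoint = api.update_inference_endpoint(
    "my-endpoint-name",  # hypothetical endpoint
    env={"MAX_INPUT_LENGTH": "2048"},
)
print(endpoint.status)
```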

src/huggingface_hub/inference/_client.py (+7 −1)

@@ -883,7 +883,13 @@ def chat_completion(
         payload_model = model or self.model
 
         # Get the provider helper
-        provider_helper = get_provider_helper(self.provider, task="conversational", model=payload_model)
+        provider_helper = get_provider_helper(
+            self.provider,
+            task="conversational",
+            model=model_id_or_url
+            if model_id_or_url is not None and model_id_or_url.startswith(("http://", "https://"))
+            else payload_model,
+        )
 
         # Prepare the payload
         parameters = {
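The case this change targets, sketched: when the `model` argument is a raw URL, provider resolution now keys off the URL (routing to `hf-inference`, per the `_providers/__init__.py` change below) instead of the configured provider. The local server is a placeholder:

```python
from huggingface_hub import InferenceClient

client = InferenceClient()
response = client.chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    model="http://localhost:8080",  # hypothetical local TGI server
)
print(response.choices[0].message.content)
```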

src/huggingface_hub/inference/_generated/_async_client.py (+7 −1)

@@ -923,7 +923,13 @@ async def chat_completion(
         payload_model = model or self.model
 
         # Get the provider helper
-        provider_helper = get_provider_helper(self.provider, task="conversational", model=payload_model)
+        provider_helper = get_provider_helper(
+            self.provider,
+            task="conversational",
+            model=model_id_or_url
+            if model_id_or_url is not None and model_id_or_url.startswith(("http://", "https://"))
+            else payload_model,
+        )
 
         # Prepare the payload
         parameters = {

src/huggingface_hub/inference/_generated/types/chat_completion.py (+1 −1)

@@ -45,7 +45,7 @@ class ChatCompletionInputMessage(BaseInferenceType):
     tool_calls: Optional[List[ChatCompletionInputToolCall]] = None
 
 
-ChatCompletionInputGrammarTypeType = Literal["json", "regex"]
+ChatCompletionInputGrammarTypeType = Literal["json", "regex", "json_schema"]
 
 
 @dataclass_with_extra
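A hedged sketch of what the widened literal permits, assuming the serving backend accepts the `json_schema` grammar type; the schema itself is a placeholder:

```python
# Passed as `response_format=...` to chat_completion; mirrors
# ChatCompletionInputGrammarType(type="json_schema", value=<schema>).
response_format = {
    "type": "json_schema",
    "value": {
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
}
```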

src/huggingface_hub/inference/_generated/types/text_generation.py (+1 −1)

@@ -8,7 +8,7 @@
 from .base import BaseInferenceType, dataclass_with_extra
 
 
-TypeEnum = Literal["json", "regex"]
+TypeEnum = Literal["json", "regex", "json_schema"]
 
 
 @dataclass_with_extra

src/huggingface_hub/inference/_providers/__init__.py (+5 −3)

@@ -23,7 +23,7 @@
 from .nebius import NebiusConversationalTask, NebiusTextGenerationTask, NebiusTextToImageTask
 from .novita import NovitaConversationalTask, NovitaTextGenerationTask, NovitaTextToVideoTask
 from .openai import OpenAIConversationalTask
-from .replicate import ReplicateTask, ReplicateTextToSpeechTask
+from .replicate import ReplicateTask, ReplicateTextToImageTask, ReplicateTextToSpeechTask
 from .sambanova import SambanovaConversationalTask, SambanovaFeatureExtractionTask
 from .together import TogetherConversationalTask, TogetherTextGenerationTask, TogetherTextToImageTask
 
@@ -115,7 +115,7 @@
         "conversational": OpenAIConversationalTask(),
     },
     "replicate": {
-        "text-to-image": ReplicateTask("text-to-image"),
+        "text-to-image": ReplicateTextToImageTask(),
         "text-to-speech": ReplicateTextToSpeechTask(),
         "text-to-video": ReplicateTask("text-to-video"),
     },
@@ -147,7 +147,9 @@ def get_provider_helper(
         ValueError: If provider or task is not supported
     """
 
-    if model is None and provider in (None, "auto"):
+    if (model is None and provider in (None, "auto")) or (
+        model is not None and model.startswith(("http://", "https://"))
+    ):
         provider = "hf-inference"
 
     if provider is None:
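A standalone sketch of the new resolution rule; `resolve_provider` is a hypothetical stand-in for the condition inside `get_provider_helper`:

```python
def resolve_provider(provider, model):
    # A URL model always short-circuits to the hf-inference provider.
    if (model is None and provider in (None, "auto")) or (
        model is not None and model.startswith(("http://", "https://"))
    ):
        return "hf-inference"
    return provider

assert resolve_provider(None, None) == "hf-inference"
assert resolve_provider("replicate", "https://my-endpoint.example") == "hf-inference"
assert resolve_provider("replicate", "some-org/some-model") == "replicate"
```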

src/huggingface_hub/inference/_providers/hf_inference.py (+1 −1)

@@ -42,7 +42,7 @@ def _prepare_url(self, api_key: str, mapped_model: str) -> str:
             return mapped_model
         return (
             # Feature-extraction and sentence-similarity are the only cases where we handle models with several tasks.
-            f"{self.base_url}/pipeline/{self.task}/{mapped_model}"
+            f"{self.base_url}/models/{mapped_model}/pipeline/{self.task}"
             if self.task in ("feature-extraction", "sentence-similarity")
             # Otherwise, we use the default endpoint
             else f"{self.base_url}/models/{mapped_model}"

src/huggingface_hub/inference/_providers/replicate.py (+13)

@@ -47,6 +47,19 @@ def get_response(self, response: Union[bytes, Dict], request_params: Optional[Re
         return get_session().get(output_url).content
 
 
+class ReplicateTextToImageTask(ReplicateTask):
+    def __init__(self):
+        super().__init__("text-to-image")
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[Dict]:
+        payload: Dict = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)  # type: ignore[assignment]
+        if provider_mapping_info.adapter_weights_path is not None:
+            payload["input"]["lora_weights"] = f"https://huggingface.co/{provider_mapping_info.hf_model_id}"
+        return payload
+
+
 class ReplicateTextToSpeechTask(ReplicateTask):
     def __init__(self):
         super().__init__("text-to-speech")
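A minimal sketch of the payload shape the override produces when a LoRA adapter is mapped; the prompt and model id are hypothetical, and the base `{"input": ...}` layout is assumed from the parent `ReplicateTask`:

```python
payload = {"input": {"prompt": "an astronaut riding a horse"}}  # assumed base payload

hf_model_id = "user/my-flux-lora"  # hypothetical adapter repo on the Hub
payload["input"]["lora_weights"] = f"https://huggingface.co/{hf_model_id}"
# -> Replicate loads the adapter weights directly from the Hub URL
```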

src/huggingface_hub/utils/_runtime.py (+4)

@@ -154,6 +154,10 @@ def get_hf_transfer_version() -> str:
 
 # xet
 def is_xet_available() -> bool:
+    # since hf_xet is automatically used if available, allow explicit disabling via environment variable
+    if constants._is_true(os.environ.get("HF_HUB_DISABLE_XET")):  # type: ignore
+        return False
+
     return is_package_available("hf_xet")
