Pass down storage options #5673

Merged · 4 commits · Mar 28, 2023
7 changes: 6 additions & 1 deletion src/datasets/builder.py
@@ -251,6 +251,8 @@ class DatasetBuilder:
             `os.path.join(data_dir, "**")` as `data_files`.
             For builders that require manual download, it must be the path to the local directory containing the
             manually downloaded data.
+        storage_options (`dict`, *optional*):
+            Key/value pairs to be passed on to the dataset file-system backend, if any.
         writer_batch_size (`int`, *optional*):
             Batch size used by the ArrowWriter.
             It defines the number of samples that are kept in memory before writing them
@@ -299,6 +301,7 @@ def __init__(
         repo_id: Optional[str] = None,
         data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
         data_dir: Optional[str] = None,
+        storage_options: Optional[dict] = None,
         writer_batch_size: Optional[int] = None,
         name="deprecated",
         **config_kwargs,
@@ -315,6 +318,7 @@ def __init__(
         self.base_path = base_path
         self.use_auth_token = use_auth_token
         self.repo_id = repo_id
+        self.storage_options = storage_options
         self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

         if data_files is not None and not isinstance(data_files, DataFilesDict):
@@ -778,6 +782,7 @@ def download_and_prepare(
                     use_etag=False,
                     num_proc=num_proc,
                     use_auth_token=use_auth_token,
+                    storage_options=self.storage_options,
                 )  # We don't use etag for data files to speed up the process

             dl_manager = DownloadManager(
@@ -1251,7 +1256,7 @@ def as_streaming_dataset(

         dl_manager = StreamingDownloadManager(
             base_path=base_path or self.base_path,
-            download_config=DownloadConfig(use_auth_token=self.use_auth_token),
+            download_config=DownloadConfig(use_auth_token=self.use_auth_token, storage_options=self.storage_options),
             dataset_name=self.name,
             data_dir=self.config.data_dir,
         )
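Taken together, the `builder.py` hunks store a single dict at construction time and hand it to both download paths: the cached path via `DownloadConfig` in `download_and_prepare()`, and the streaming path via `StreamingDownloadManager` in `as_streaming_dataset()`. A minimal sketch of that flow (an illustration, not the library's code; `BuilderSketch` is a made-up stand-in for `DatasetBuilder`):

```python
# Illustrative sketch only; a trimmed stand-in for DatasetBuilder.
from typing import Optional

from datasets.download import DownloadConfig, DownloadManager, StreamingDownloadManager


class BuilderSketch:
    def __init__(self, storage_options: Optional[dict] = None):
        self.storage_options = storage_options  # stored once at construction...

    def download_and_prepare(self) -> DownloadManager:
        # ...forwarded to the cached-download layer,
        config = DownloadConfig(storage_options=self.storage_options)
        return DownloadManager(download_config=config)

    def as_streaming_dataset(self) -> StreamingDownloadManager:
        # ...and to the streaming download manager alike.
        config = DownloadConfig(storage_options=self.storage_options)
        return StreamingDownloadManager(download_config=config)
```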
3 changes: 3 additions & 0 deletions src/datasets/download/download_config.py
@@ -42,6 +42,8 @@ class DownloadConfig:
         ignore_url_params (`bool`, defaults to `False`):
             Whether to strip all query parameters and fragments from
             the download URL before using it for caching the file.
+        storage_options (`dict`, *optional*):
+            Key/value pairs to be passed on to the dataset file-system backend, if any.
         download_desc (`str`, *optional*):
             A description to be displayed alongside with the progress bar while downloading the files.
     """
@@ -60,6 +62,7 @@ class DownloadConfig:
     max_retries: int = 1
     use_auth_token: Optional[Union[str, bool]] = None
     ignore_url_params: bool = False
+    storage_options: Optional[Dict] = None
     download_desc: Optional[str] = None

     def copy(self) -> "DownloadConfig":
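Since `DownloadConfig` is the object every download call site already receives, adding the field here lets the options flow through with no further plumbing. A small example; the `requests_timeout` value mirrors the default that `file_utils.py` previously hard-coded (see that diff below):

```python
# A DownloadConfig carrying fsspec storage options.
from datasets import DownloadConfig

config = DownloadConfig(storage_options={"requests_timeout": 10.0})

# copy() deep-copies the fields, so per-call tweaks don't leak back.
per_call = config.copy()
per_call.download_desc = "Downloading data files"
assert config.download_desc is None
```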
13 changes: 13 additions & 0 deletions src/datasets/load.py
@@ -1409,6 +1409,7 @@ def load_dataset_builder(
     download_mode: Optional[Union[DownloadMode, str]] = None,
     revision: Optional[Union[str, Version]] = None,
     use_auth_token: Optional[Union[bool, str]] = None,
+    storage_options: Optional[Dict] = None,
     **config_kwargs,
 ) -> DatasetBuilder:
     """Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
@@ -1469,6 +1470,10 @@
         use_auth_token (`str` or `bool`, *optional*):
             Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
             If `True`, or not specified, will get token from `"~/.huggingface"`.
+        storage_options (`dict`, *optional*, defaults to `None`):
+            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.
+
+            <Added version="2.11.0"/>
         **config_kwargs (additional keyword arguments):
             Keyword arguments to be passed to the [`BuilderConfig`]
             and used in the [`DatasetBuilder`].
@@ -1524,6 +1529,7 @@
         hash=hash,
         features=features,
         use_auth_token=use_auth_token,
+        storage_options=storage_options,
         **builder_kwargs,
         **config_kwargs,
     )
@@ -1550,6 +1556,7 @@ def load_dataset(
     task: Optional[Union[str, TaskTemplate]] = None,
     streaming: bool = False,
     num_proc: Optional[int] = None,
+    storage_options: Optional[Dict] = None,
     **config_kwargs,
 ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
     """Load a dataset from the Hugging Face Hub, or a local dataset.
@@ -1671,6 +1678,10 @@
             Multiprocessing is disabled by default.

             <Added version="2.7.0"/>
+        storage_options (`dict`, *optional*, defaults to `None`):
+            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.
+
+            <Added version="2.11.0"/>
         **config_kwargs (additional keyword arguments):
             Keyword arguments to be passed to the `BuilderConfig`
             and used in the [`DatasetBuilder`].
@@ -1764,6 +1775,7 @@
         download_mode=download_mode,
         revision=revision,
         use_auth_token=use_auth_token,
+        storage_options=storage_options,
         **config_kwargs,
     )

@@ -1782,6 +1794,7 @@
         verification_mode=verification_mode,
         try_from_hf_gcs=try_from_hf_gcs,
         num_proc=num_proc,
+        storage_options=storage_options,
     )

     # Build dataset for splits
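With `load.py` updated, the parameter is user-facing on both `load_dataset` and `load_dataset_builder`. A hedged usage sketch: the bucket, file path, and credentials below are placeholders, and S3 access additionally requires `s3fs` to be installed:

```python
# Usage sketch with a placeholder S3 path and s3fs credentials.
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="s3://my-bucket/data/train.jsonl",  # placeholder, not a real bucket
    streaming=True,
    storage_options={"key": "<aws-access-key-id>", "secret": "<aws-secret-access-key>"},
)
```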
14 changes: 8 additions & 6 deletions src/datasets/utils/file_utils.py
@@ -192,6 +192,7 @@ def cached_path(
             max_retries=download_config.max_retries,
             use_auth_token=download_config.use_auth_token,
             ignore_url_params=download_config.ignore_url_params,
+            storage_options=download_config.storage_options,
             download_desc=download_config.download_desc,
         )
     elif os.path.exists(url_or_filename):
@@ -328,17 +329,17 @@ def _request_with_retry(
     return response


-def fsspec_head(url, timeout=10.0):
+def fsspec_head(url, storage_options=None):
     _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
-    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options={"requests_timeout": timeout})
+    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options)
     if len(paths) > 1:
         raise ValueError(f"HEAD can be called with at most one path but was called with {paths}")
     return fs.info(paths[0])


-def fsspec_get(url, temp_file, timeout=10.0, desc=None):
+def fsspec_get(url, temp_file, storage_options=None, desc=None):
     _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
-    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options={"requests_timeout": timeout})
+    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options)
     if len(paths) > 1:
         raise ValueError(f"GET can be called with at most one path but was called with {paths}")
     callback = fsspec.callbacks.TqdmCallback(
@@ -445,6 +446,7 @@ def get_from_cache(
     max_retries=0,
     use_auth_token=None,
     ignore_url_params=False,
+    storage_options=None,
     download_desc=None,
 ) -> str:
     """
@@ -499,7 +501,7 @@
         if scheme == "ftp":
             connected = ftp_head(url)
         elif scheme not in ("http", "https"):
-            response = fsspec_head(url)
+            response = fsspec_head(url, storage_options=storage_options)
             # s3fs uses "ETag", gcsfs uses "etag"
             etag = (response.get("ETag", None) or response.get("etag", None)) if use_etag else None
             connected = True
@@ -604,7 +606,7 @@ def _resumable_file_manager():
             if scheme == "ftp":
                 ftp_get(url, temp_file)
             elif scheme not in ("http", "https"):
-                fsspec_get(url, temp_file, desc=download_desc)
+                fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
             else:
                 http_get(
                     url,
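The `file_utils.py` hunks carry the behavioral change: `fsspec_head` and `fsspec_get` drop the hard-coded `{"requests_timeout": timeout}` dict and instead pass whatever the caller supplied straight to `fsspec.get_fs_token_paths`, which forwards it to the filesystem class selected by the URL's protocol. A standalone `fsspec` illustration (placeholder URL; `anon` is an `s3fs` option):

```python
# Standalone fsspec illustration, independent of datasets internals.
import fsspec

fs, _, paths = fsspec.get_fs_token_paths(
    "s3://my-bucket/file.txt",       # placeholder; "s3://" selects s3fs
    storage_options={"anon": True},  # becomes s3fs.S3FileSystem(anon=True)
)
info = fs.info(paths[0])            # roughly what fsspec_head() returns
fs.get_file(paths[0], "local.bin")  # roughly fsspec_get(), minus the tqdm callback
```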