Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for storage_options for load_dataset API #5919

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5316,7 +5316,12 @@ def path_in_repo(_index, shard):
for data_file in data_files
if data_file.startswith(f"data/{split}-") and data_file not in shards_path_in_repo
]
deleted_size = sum(xgetsize(hf_hub_url(repo_id, data_file), token=token) for data_file in data_files_to_delete)

download_config = DownloadConfig(token=token)
deleted_size = sum(
xgetsize(hf_hub_url(repo_id, data_file), download_config=download_config)
for data_file in data_files_to_delete
)

def delete_file(file):
api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)
Expand Down
249 changes: 104 additions & 145 deletions src/datasets/download/streaming_download_manager.py

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions src/datasets/features/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pyarrow as pa

from .. import config
from ..download.download_config import DownloadConfig
from ..download.streaming_download_manager import xopen, xsplitext
from ..table import array_cast
from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
Expand Down Expand Up @@ -172,13 +173,15 @@ def decode_example(
if file is None:
token_per_repo_id = token_per_repo_id or {}
source_url = path.split("::")[-1]
repo_id = None
try:
repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
token = token_per_repo_id[repo_id]
token_per_repo_id[repo_id]
except (ValueError, KeyError):
token = None
pass
Comment on lines -177 to +181
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you should revert this no ? we need to get the token from token_per_repo_id

Copy link
Contributor Author

@janineguo janineguo Jul 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need, I can change the setting of DownloadConfig.


with xopen(path, "rb", token=token) as f:
download_config = DownloadConfig(token=None if repo_id is None else token_per_repo_id[repo_id])
with xopen(path, "rb", download_config=download_config) as f:
array, sampling_rate = sf.read(f)

else:
Expand Down
10 changes: 7 additions & 3 deletions src/datasets/features/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pyarrow as pa

from .. import config
from ..download.download_config import DownloadConfig
from ..download.streaming_download_manager import xopen
from ..table import array_cast
from ..utils.file_utils import is_local_path
Expand Down Expand Up @@ -167,10 +168,13 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Imag
source_url = path.split("::")[-1]
try:
repo_id = string_to_dict(source_url, config.HUB_DATASETS_URL)["repo_id"]
token = token_per_repo_id.get(repo_id)
token_per_repo_id.get(repo_id)
download_config = DownloadConfig(token=token_per_repo_id.get(repo_id))
except ValueError:
token = None
with xopen(path, "rb", token=token) as f:
use_auth_token = None
download_config = DownloadConfig(token=use_auth_token)

with xopen(path, "rb", download_config=download_config) as f:
bytes_ = BytesIO(f.read())
image = PIL.Image.open(bytes_)
else:
Expand Down
16 changes: 7 additions & 9 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def _create_importable_file(


def infer_module_for_data_files(
data_files_list: DataFilesList, token: Optional[Union[bool, str]] = None
data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> Optional[Tuple[str, str]]:
"""Infer module (and builder kwargs) from list of data files.

Expand All @@ -352,8 +352,7 @@ def infer_module_for_data_files(

Args:
data_files_list (DataFilesList): List of data files.
token (bool or str, optional): Whether to use token or token to authenticate on the Hugging Face Hub
for private remote files.
        download_config (DownloadConfig, optional): Download configuration, mainly used to pass a token or storage_options to support different platforms and auth types.

Returns:
tuple[str, str]: Tuple with
Expand All @@ -376,19 +375,18 @@ def sort_key(ext_count: Tuple[str, int]) -> Tuple[int, bool]:
if ext in _EXTENSION_TO_MODULE:
return _EXTENSION_TO_MODULE[ext]
elif ext == ".zip":
return infer_module_for_data_files_in_archives(data_files_list, token=token)
return infer_module_for_data_files_in_archives(data_files_list, download_config=download_config)
return None, {}


def infer_module_for_data_files_in_archives(
data_files_list: DataFilesList, token: Optional[Union[bool, str]]
data_files_list: DataFilesList, download_config: Optional[DownloadConfig]
) -> Optional[Tuple[str, str]]:
"""Infer module (and builder kwargs) from list of archive data files.

Args:
data_files_list (DataFilesList): List of data files.
token (bool or str, optional): Whether to use token or token to authenticate on the Hugging Face Hub
for private remote files.
        download_config (DownloadConfig, optional): Download configuration, mainly used to pass a token or storage_options to support different platforms and auth types.

Returns:
tuple[str, str]: Tuple with
Expand All @@ -405,7 +403,7 @@ def infer_module_for_data_files_in_archives(
extracted = xjoin(StreamingDownloadManager().extract(filepath), "**")
archived_files += [
f.split("::")[0]
for f in xglob(extracted, recursive=True, token=token)[
for f in xglob(extracted, recursive=True, download_config=download_config)[
: config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE
]
]
Expand Down Expand Up @@ -785,7 +783,7 @@ def get_module(self) -> DatasetModule:
allowed_extensions=ALL_ALLOWED_EXTENSIONS,
)
split_modules = {
split: infer_module_for_data_files(data_files_list, token=self.download_config.token)
split: infer_module_for_data_files(data_files_list, download_config=self.download_config)
for split, data_files_list in data_files.items()
}
module_name, builder_kwargs = next(iter(split_modules.values()))
Expand Down
17 changes: 9 additions & 8 deletions src/datasets/streaming.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import importlib
import inspect
from functools import wraps
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING, Optional

from .download.download_config import DownloadConfig
from .download.streaming_download_manager import (
xbasename,
xdirname,
Expand Down Expand Up @@ -39,7 +40,7 @@
from .builder import DatasetBuilder


def extend_module_for_streaming(module_path, token: Optional[Union[str, bool]] = None):
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
"""Extend the module to support streaming.

We patch some functions in the module to use `fsspec` to support data streaming:
Expand All @@ -55,8 +56,7 @@ def extend_module_for_streaming(module_path, token: Optional[Union[str, bool]] =

Args:
module_path: Path to the module to be extended.
token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, or not specified, will get token from `"~/.huggingface"`.
        download_config (DownloadConfig, optional): Download configuration, mainly used to pass a token or storage_options to support different platforms and auth types.
"""

module = importlib.import_module(module_path)
Expand All @@ -68,7 +68,7 @@ def extend_module_for_streaming(module_path, token: Optional[Union[str, bool]] =
def wrap_auth(function):
@wraps(function)
def wrapper(*args, **kwargs):
return function(*args, token=token, **kwargs)
return function(*args, download_config=download_config, **kwargs)

wrapper._decorator_name_ = "wrap_auth"
return wrapper
Expand Down Expand Up @@ -109,14 +109,15 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
builder (:class:`DatasetBuilder`): Dataset builder instance.
"""
# this extends the open and os.path.join functions for data streaming
extend_module_for_streaming(builder.__module__, token=builder.token)
download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.use_auth_token)
extend_module_for_streaming(builder.__module__, download_config=download_config)
# if needed, we also have to extend additional internal imports (like wmt14 -> wmt_utils)
if not builder.__module__.startswith("datasets."): # check that it's not a packaged builder like csv
for imports in get_imports(inspect.getfile(builder.__class__)):
if imports[0] == "internal":
internal_import_name = imports[1]
internal_module_name = ".".join(builder.__module__.split(".")[:-1] + [internal_import_name])
extend_module_for_streaming(internal_module_name, token=builder.token)
extend_module_for_streaming(internal_module_name, download_config=download_config)

# builders can inherit from other builders that might use streaming functionality
# (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)
Expand All @@ -129,4 +130,4 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
if issubclass(cls, DatasetBuilder) and cls.__module__ != DatasetBuilder.__module__
] # check it's not a standard builder from datasets.builder
for module in parent_builder_modules:
extend_module_for_streaming(module, token=builder.token)
extend_module_for_streaming(module, download_config=download_config)
47 changes: 28 additions & 19 deletions tests/test_streaming_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from fsspec.registry import _registry as _fsspec_registry
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem

from datasets.download.download_config import DownloadConfig
from datasets.download.streaming_download_manager import (
StreamingDownloadManager,
_get_extraction_protocol,
Expand Down Expand Up @@ -236,8 +237,9 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
assert xexists(root_url + "data/text_data.txt", token=hf_token)
assert not xexists(root_url + "file_that_doesnt_exist.txt", token=hf_token)
download_config = DownloadConfig(token=hf_token)
assert xexists(root_url + "data/text_data.txt", download_config=download_config)
assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -320,12 +322,13 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
assert len(xlistdir("zip://::" + root_url, token=hf_token)) == 1
assert len(xlistdir("zip://main_dir::" + root_url, token=hf_token)) == 2
download_config = DownloadConfig(token=hf_token)
assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1
assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2
with pytest.raises(FileNotFoundError):
xlistdir("zip://qwertyuiop::" + root_url, token=hf_token)
xlistdir("zip://qwertyuiop::" + root_url, download_config=download_config)
with pytest.raises(NotImplementedError):
xlistdir(root_url, token=hf_token)
xlistdir(root_url, download_config=download_config)


@pytest.mark.parametrize(
Expand All @@ -348,11 +351,13 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
assert xisdir("zip://::" + root_url, token=hf_token) is True
assert xisdir("zip://main_dir::" + root_url, token=hf_token) is True
assert xisdir("zip://qwertyuiop::" + root_url, token=hf_token) is False

download_config = DownloadConfig(token=hf_token)
assert xisdir("zip://::" + root_url, download_config=download_config) is True
assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True
assert xisdir("zip://qwertyuiop::" + root_url, download_config=download_config) is False
with pytest.raises(NotImplementedError):
xisdir(root_url, token=hf_token)
xisdir(root_url, download_config=download_config)


@pytest.mark.parametrize(
Expand All @@ -374,8 +379,9 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
assert xisfile(root_url + "data/text_data.txt", token=hf_token) is True
assert xisfile(root_url + "qwertyuiop", token=hf_token) is False
download_config = DownloadConfig(token=hf_token)
assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True
assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False


@pytest.mark.parametrize(
Expand All @@ -397,9 +403,10 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
assert xgetsize(root_url + "data/text_data.txt", token=hf_token) == 39
download_config = DownloadConfig(token=hf_token)
assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39
with pytest.raises(FileNotFoundError):
xgetsize(root_url + "qwertyuiop", token=hf_token)
xgetsize(root_url + "qwertyuiop", download_config=download_config)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -440,8 +447,9 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
assert len(xglob("zip://**::" + root_url, token=hf_token)) == 3
assert len(xglob("zip://qwertyuiop/*::" + root_url, token=hf_token)) == 0
download_config = DownloadConfig(token=hf_token)
assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3
assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0


@pytest.mark.parametrize(
Expand Down Expand Up @@ -478,9 +486,10 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec):
@pytest.mark.integration
def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
assert len(list(xwalk("zip://::" + root_url, token=hf_token))) == 2
assert len(list(xwalk("zip://main_dir::" + root_url, token=hf_token))) == 1
assert len(list(xwalk("zip://qwertyuiop::" + root_url, token=hf_token))) == 0
download_config = DownloadConfig(token=hf_token)
assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2
assert len(list(xwalk("zip://main_dir::" + root_url, download_config=download_config))) == 1
assert len(list(xwalk("zip://qwertyuiop::" + root_url, download_config=download_config))) == 0


@pytest.mark.parametrize(
Expand Down