From dbc158bcfb74a534ecbae4013711d0558fd57b6d Mon Sep 17 00:00:00 2001 From: humengyu Date: Thu, 8 May 2025 20:24:57 +0800 Subject: [PATCH 1/4] Fix race conditions on symlink-less filesystems by extending lock coverage --- src/huggingface_hub/file_download.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 2b3a4a1c4b..98f3afe861 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1130,19 +1130,21 @@ def _hf_hub_download_to_cache_dir( # In that case store a ref. _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) - # If file already exists, return it (except if force_download=True) - if not force_download: - if os.path.exists(pointer_path): - return pointer_path - - if os.path.exists(blob_path): - # we have the blob already, but not the pointer - _create_symlink(blob_path, pointer_path, new_blob=False) - return pointer_path - # Prevent parallel downloads of the same file with a lock. # etag could be duplicated across repos, lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) + + # If file already exists, return it (except if force_download=True) + with WeakFileLock(lock_path): + if not force_download: + if os.path.exists(pointer_path): + return pointer_path + + if os.path.exists(blob_path): + # we have the blob already, but not the pointer + _create_symlink(blob_path, pointer_path, new_blob=False) + return pointer_path # Some Windows versions do not allow for paths longer than 255 characters. # In this case, we must specify it as an extended path by using the "\\?\" prefix. @@ -1154,7 +1156,6 @@ def _hf_hub_download_to_cache_dir( # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache) - Path(lock_path).parent.mkdir(parents=True, exist_ok=True) with WeakFileLock(lock_path): _download_to_tmp_and_move( incomplete_path=Path(blob_path + ".incomplete"), From e1f3295769b41db8340d17e844a365bf543b4236 Mon Sep 17 00:00:00 2001 From: humengyu Date: Fri, 16 May 2025 18:22:16 +0800 Subject: [PATCH 2/4] Update src/huggingface_hub/file_download.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: célina --- src/huggingface_hub/file_download.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 98f3afe861..65ea17e284 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1135,16 +1135,17 @@ def _hf_hub_download_to_cache_dir( lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") Path(lock_path).parent.mkdir(parents=True, exist_ok=True) - # If file already exists, return it (except if force_download=True) - with WeakFileLock(lock_path): - if not force_download: - if os.path.exists(pointer_path): - return pointer_path - - if os.path.exists(blob_path): - # we have the blob already, but not the pointer + + # pointer already exists -> immediate return + if not force_download and os.path.exists(pointer_path): + return pointer_path + + # Blob exists but pointer must be (safely) created -> take the lock + if not force_download and os.path.exists(blob_path): + with WeakFileLock(lock_path): + if not os.path.exists(pointer_path): _create_symlink(blob_path, pointer_path, new_blob=False) - return pointer_path + return pointer_path # Some Windows versions do not allow for paths longer than 255 characters. # In this case, we must specify it as an extended path by using the "\\?\" prefix. From afaf0d85359cca848051cb80651fe95ffdc6ef79 Mon Sep 17 00:00:00 2001 From: humengyu Date: Fri, 16 May 2025 20:01:11 +0800 Subject: [PATCH 3/4] move normalization of lock_path before mkdir --- src/huggingface_hub/file_download.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 65ea17e284..72a88f4119 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1133,6 +1133,15 @@ def _hf_hub_download_to_cache_dir( # Prevent parallel downloads of the same file with a lock. # etag could be duplicated across repos, lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") + + # Some Windows versions do not allow for paths longer than 255 characters. + # In this case, we must specify it as an extended path by using the "\\?\" prefix. + if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: + lock_path = "\\\\?\\" + os.path.abspath(lock_path) + + if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: + blob_path = "\\\\?\\" + os.path.abspath(blob_path) + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) @@ -1147,14 +1156,6 @@ def _hf_hub_download_to_cache_dir( _create_symlink(blob_path, pointer_path, new_blob=False) return pointer_path - # Some Windows versions do not allow for paths longer than 255 characters. - # In this case, we must specify it as an extended path by using the "\\?\" prefix. - if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: - lock_path = "\\\\?\\" + os.path.abspath(lock_path) - - if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: - blob_path = "\\\\?\\" + os.path.abspath(blob_path) - # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache) with WeakFileLock(lock_path): From eb28de282b08cc8661b67885d79663449a383508 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 19 May 2025 09:17:28 +0000 Subject: [PATCH 4/4] Apply style fixes --- src/huggingface_hub/file_download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 72a88f4119..dfcce9f769 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1144,7 +1144,6 @@ def _hf_hub_download_to_cache_dir( Path(lock_path).parent.mkdir(parents=True, exist_ok=True) - # pointer already exists -> immediate return if not force_download and os.path.exists(pointer_path): return pointer_path