Skip to content

Commit 8958517

Browse files
committed
Add providing location for fetch
Extends aboutcode-org#54. Added filename deduction (from the Content-Disposition header or the URL). Fetch and its helper functions now use pathlib's Path. Signed-off-by: Mateusz Perc <[email protected]>
1 parent 42110e9 commit 8958517

File tree

2 files changed

+53
-21
lines changed

2 files changed

+53
-21
lines changed

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@ six==1.14.0
1616
urllib3==1.25.8
1717
wcwidth==0.1.8
1818
zipp==1.2.0
19+
kiss-headers==2.3.0
20+

src/fetchcode/__init__.py

+51-21
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616

1717
from ftplib import FTP
1818
from mimetypes import MimeTypes
19-
import os
20-
import tempfile
2119
from urllib.parse import urlparse
20+
from kiss_headers import parse_it
21+
from pathlib import Path
2222

2323
import requests
24+
import tempfile
2425

2526

2627
class Response:
@@ -41,14 +42,35 @@ def __init__(self, location, content_type, size, url):
4142
def fetch_http(url, location):
4243
"""
4344
Return a `Response` object built from fetching the content at a HTTP/HTTPS based `url` URL string
44-
saving the content in a file at `location`
45+
Saving the content in a file at `location`
46+
If `location` is an existing directory - try to deduce the filename
47+
If deduction failed, save the content in a temporary file created at a `location`
4548
"""
4649
r = requests.get(url)
47-
with open(location, 'wb') as f:
50+
51+
if Path.is_dir(location):
52+
content_disposition = parse_it(r.headers).get("content-disposition") or {}
53+
filename_priority = [
54+
content_disposition.get("filename*"),
55+
content_disposition.get("filename"),
56+
Path(urlparse(url).path).name,
57+
]
58+
filename_found = False
59+
for filename in filename_priority:
60+
if filename is not None and len(filename):
61+
filename_found = True
62+
location = location / filename
63+
break
64+
if not filename_found:
65+
location = Path(
66+
tempfile.NamedTemporaryFile(dir=location, delete=False).name
67+
)
68+
69+
with open(location, "wb") as f:
4870
f.write(r.content)
4971

50-
content_type = r.headers.get('content-type')
51-
size = r.headers.get('content-length')
72+
content_type = r.headers.get("content-type")
73+
size = r.headers.get("content-length")
5274
size = int(size) if size else None
5375

5476
resp = Response(location=location, content_type=content_type, size=size, url=url)
@@ -59,49 +81,57 @@ def fetch_http(url, location):
5981
def fetch_ftp(url, location):
6082
"""
6183
Return a `Response` object built from fetching the content at a FTP based `url` URL string
62-
saving the content in a file at `location`
84+
Saving the content in a file at `location`
85+
If `location` is an existing directory - deduce the filename from the URL
6386
"""
6487
url_parts = urlparse(url)
6588

6689
netloc = url_parts.netloc
67-
path = url_parts.path
68-
dir, file = os.path.split(path)
90+
path = Path(url_parts.path)
91+
directory = path.parent
92+
filename = path.name
93+
94+
if Path.is_dir(location):
95+
location /= filename
6996

7097
ftp = FTP(netloc)
7198
ftp.login()
7299

73-
size = ftp.size(path)
100+
size = ftp.size(str(path))
74101
mime = MimeTypes()
75-
mime_type = mime.guess_type(file)
102+
mime_type = mime.guess_type(filename)
76103
if mime_type:
77104
content_type = mime_type[0]
78105
else:
79106
content_type = None
80107

81-
ftp.cwd(dir)
82-
file = 'RETR {}'.format(file)
83-
with open(location, 'wb') as f:
84-
ftp.retrbinary(file, f.write)
108+
ftp.cwd(str(directory))
109+
filename = "RETR {}".format(filename)
110+
with open(location, "wb") as f:
111+
ftp.retrbinary(filename, f.write)
85112
ftp.close()
86113

87114
resp = Response(location=location, content_type=content_type, size=size, url=url)
88115
return resp
89116

90117

91-
def fetch(url):
118+
def fetch(url, location=None):
92119
"""
93-
Return a `Response` object built from fetching the content at the `url` URL string and store content at a temporary file.
120+
Return a `Response` object built from fetching the content at the `url` URL string and store content at a provided `location`
121+
If `location` is None, save the content in a newly created temporary file
122+
If `location` is an existing directory - try to deduce the filename
94123
"""
95124

96-
temp = tempfile.NamedTemporaryFile(delete=False)
97-
location = temp.name
125+
if location is None:
126+
temp = tempfile.NamedTemporaryFile(delete=False)
127+
location = temp.name
98128

99129
url_parts = urlparse(url)
100130
scheme = url_parts.scheme
101131

102-
fetchers = {'ftp': fetch_ftp, 'http': fetch_http, 'https': fetch_http}
132+
fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http}
103133

104134
if scheme in fetchers:
105135
return fetchers.get(scheme)(url, location)
106136

107-
raise Exception('Not a supported/known scheme.')
137+
raise Exception("Not a supported/known scheme.")

0 commit comments

Comments
 (0)