Skip to content

Commit 084dde2

Browse files
quepopaalexanderr
authored and committed
Add providing location for fetch
- Extends aboutcode-org#54.
- Added filename deduction (from the Content-Disposition header or the URL).
- `fetch` and its helper functions now use pathlib's `Path`.

Signed-off-by: Mateusz Perc <[email protected]>
1 parent 719ea3c commit 084dde2

File tree

2 files changed

+53
-21
lines changed

2 files changed

+53
-21
lines changed

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ six==1.14.0
1616
urllib3==1.26.5
1717
wcwidth==0.1.8
1818
zipp==1.2.0
19+
kiss-headers==2.3.0

src/fetchcode/__init__.py

+52-21
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@
1616

1717
from ftplib import FTP
1818
from mimetypes import MimeTypes
19-
import os
20-
import tempfile
19+
from pathlib import Path
20+
from pathlib import PurePosixPath
2121
from urllib.parse import urlparse
22+
from kiss_headers import parse_it
2223

2324
import requests
25+
import tempfile
2426

2527

2628
class Response:
@@ -41,14 +43,35 @@ def __init__(self, location, content_type, size, url):
4143
def fetch_http(url, location):
4244
"""
4345
Return a `Response` object built from fetching the content at a HTTP/HTTPS based `url` URL string
44-
saving the content in a file at `location`
46+
Saving the content in a file at `location`
47+
If `location` is an existing directory - try to deduce the filename
48+
If deduction failed, save the content in a temporary file created at a `location`
4549
"""
4650
r = requests.get(url)
47-
with open(location, 'wb') as f:
51+
52+
if Path.is_dir(location):
53+
content_disposition = parse_it(r.headers).get("content-disposition") or {}
54+
filename_priority = [
55+
content_disposition.get("filename*"),
56+
content_disposition.get("filename"),
57+
PurePosixPath(urlparse(url).path).name,
58+
]
59+
filename_found = False
60+
for filename in filename_priority:
61+
if filename is not None and len(filename):
62+
filename_found = True
63+
location /= filename
64+
break
65+
if not filename_found:
66+
location = Path(
67+
tempfile.NamedTemporaryFile(dir=location, delete=False).name
68+
)
69+
70+
with open(location, "wb") as f:
4871
f.write(r.content)
4972

50-
content_type = r.headers.get('content-type')
51-
size = r.headers.get('content-length')
73+
content_type = r.headers.get("content-type")
74+
size = r.headers.get("content-length")
5275
size = int(size) if size else None
5376

5477
resp = Response(location=location, content_type=content_type, size=size, url=url)
@@ -59,49 +82,57 @@ def fetch_http(url, location):
def fetch_ftp(url, location):
    """
    Return a `Response` object built from fetching the content at a FTP based `url` URL string
    Saving the content in a file at `location`
    If `location` is an existing directory - deduce the filename from the URL

    `location` may be a `str` or any path-like object; it is converted to a `Path`
    and the returned `Response` carries the final `Path` the content was written to.
    The FTP session is anonymous: `login()` is called without credentials.
    """
    # Accept plain strings too: `is_dir()` and the `/` operator need a `Path`.
    location = Path(location)

    url_parts = urlparse(url)
    netloc = url_parts.netloc
    # Remote FTP paths are POSIX-style regardless of the local platform.
    remote_path = PurePosixPath(url_parts.path)
    remote_directory = remote_path.parent
    filename = remote_path.name

    if location.is_dir():
        location = location / filename

    # `guess_type` always returns a `(type, encoding)` tuple; the type is
    # None when it cannot be guessed, so no extra truthiness check is needed.
    content_type = MimeTypes().guess_type(filename)[0]

    # The context manager closes the connection even when an FTP command or
    # the local write fails part-way through.
    with FTP(netloc) as ftp:
        ftp.login()
        size = ftp.size(str(remote_path))
        ftp.cwd(str(remote_directory))
        with open(location, "wb") as f:
            ftp.retrbinary("RETR {}".format(filename), f.write)

    return Response(location=location, content_type=content_type, size=size, url=url)

90118

91-
def fetch(url):
119+
def fetch(url, location=None):
92120
"""
93-
Return a `Response` object built from fetching the content at the `url` URL string and store content at a temporary file.
121+
Return a `Response` object built from fetching the content at the `url` URL string and store content at a provided `location`
122+
If `location` is None, save the content in a newly created temporary file
123+
If `location` is an existing directory - try to deduce the filename
94124
"""
95125

96-
temp = tempfile.NamedTemporaryFile(delete=False)
97-
location = temp.name
126+
if location is None:
127+
temp = tempfile.NamedTemporaryFile(delete=False)
128+
location = Path(temp.name)
98129

99130
url_parts = urlparse(url)
100131
scheme = url_parts.scheme
101132

102-
fetchers = {'ftp': fetch_ftp, 'http': fetch_http, 'https': fetch_http}
133+
fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http}
103134

104135
if scheme in fetchers:
105136
return fetchers.get(scheme)(url, location)
106137

107-
raise Exception('Not a supported/known scheme.')
138+
raise Exception("Not a supported/known scheme.")

0 commit comments

Comments
 (0)