Skip to content

Commit 3193c01

Browse files
committed
Add providing location for fetch
- Extends #54
- Added filename deduction (Content-Disposition header / URL)
- `fetch` and its helper functions now use pathlib's `Path`

Signed-off-by: Mateusz Perc <[email protected]>
1 parent 42110e9 commit 3193c01

File tree

2 files changed

+50
-21
lines changed

2 files changed

+50
-21
lines changed

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ six==1.14.0
1616
urllib3==1.25.8
1717
wcwidth==0.1.8
1818
zipp==1.2.0
19+
kiss-headers==2.3.0

src/fetchcode/__init__.py

+49-21
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616

1717
from ftplib import FTP
1818
from mimetypes import MimeTypes
19-
import os
20-
import tempfile
2119
from urllib.parse import urlparse
20+
from kiss_headers import parse_it
21+
from pathlib import Path
2222

2323
import requests
24+
import tempfile
2425

2526

2627
class Response:
@@ -41,14 +42,33 @@ def __init__(self, location, content_type, size, url):
4142
def fetch_http(url, location):
    """
    Return a `Response` object built from fetching the content at a HTTP/HTTPS based `url` URL string
    saving the content in a file at `location`.

    `location` may be a string or a `pathlib.Path`.
    If `location` is an existing directory, try to deduce the filename, in this order, from the
    `filename*` and `filename` attributes of the Content-Disposition header and then from the
    URL path. Only the last path component of a deduced name is used.
    If deduction failed, save the content in a temporary file created in `location`.
    """
    r = requests.get(url)

    # `location` may be a plain string (this is what `fetch` passes for its
    # default temporary file), so wrap it before using any Path method.
    target = Path(location)
    if target.is_dir():
        content_disposition = parse_it(r.headers).get("content-disposition") or {}
        # NOTE(review): `filename*` may use the RFC 5987 `charset'lang'value`
        # form; it is used here exactly as kiss_headers returns it - confirm
        # whether it needs decoding.
        candidates = (
            content_disposition.get("filename*"),
            content_disposition.get("filename"),
            urlparse(url).path,
        )

        filename = None
        for candidate in candidates:
            if not isinstance(candidate, str):
                continue
            # The name is controlled by the remote server or the URL: keep only
            # its final component so it cannot point outside of `location`.
            name = Path(candidate.replace("\\", "/")).name
            if name and name != "..":
                filename = name
                break

        if filename is not None:
            location = target / filename
        else:
            # Only the unique name is needed; close the handle right away, the
            # file is reopened for writing below.
            with tempfile.NamedTemporaryFile(dir=target, delete=False) as temp:
                location = Path(temp.name)

    with open(location, "wb") as f:
        f.write(r.content)

    content_type = r.headers.get("content-type")
    size = r.headers.get("content-length")
    size = int(size) if size else None

    resp = Response(location=location, content_type=content_type, size=size, url=url)
    return resp
@@ -59,49 +79,57 @@ def fetch_http(url, location):
5979
def fetch_ftp(url, location):
    """
    Return a `Response` object built from fetching the content at a FTP based `url` URL string
    saving the content in a file at `location`.

    `location` may be a string or a `pathlib.Path`.
    If `location` is an existing directory - deduce the filename from the URL.
    The host, the optional port and the optional `user:password` credentials are
    taken from the URL; without credentials an anonymous login is performed.
    """
    from pathlib import PurePosixPath
    from urllib.parse import unquote

    url_parts = urlparse(url)

    # FTP paths always use forward slashes; a pure POSIX path keeps them intact
    # on every platform (an OS-specific Path would use backslashes on Windows).
    remote_path = PurePosixPath(url_parts.path)
    directory = remote_path.parent
    filename = remote_path.name

    # `location` may be a plain string, so wrap it before using Path methods.
    target = Path(location)
    if target.is_dir():
        location = target / filename

    # guess_type() always returns a (type, encoding) tuple; the type is None
    # when it cannot be guessed.
    content_type = MimeTypes().guess_type(filename)[0]

    ftp = FTP()
    try:
        # A port of 0 makes ftplib fall back to the default FTP port.
        ftp.connect(url_parts.hostname or "", url_parts.port or 0)
        if url_parts.username:
            ftp.login(unquote(url_parts.username), unquote(url_parts.password or ""))
        else:
            ftp.login()

        size = ftp.size(str(remote_path))

        ftp.cwd(str(directory))
        with open(location, "wb") as f:
            ftp.retrbinary("RETR {}".format(filename), f.write)
    finally:
        # Release the connection even when the transfer fails.
        ftp.close()

    resp = Response(location=location, content_type=content_type, size=size, url=url)
    return resp
89114

90115

91-
def fetch(url, location=None):
    """
    Return a `Response` object built from fetching the content at the `url` URL string and store content at a provided `location`.

    `location` may be a string or a `pathlib.Path`.
    If `location` is None, save the content in a newly created temporary file.
    If `location` is an existing directory - try to deduce the filename.
    Raise an Exception if the scheme of `url` is not one of ftp, http or https.
    """
    scheme = urlparse(url).scheme

    # Validate the scheme first so that an unsupported URL does not leave an
    # empty temporary file behind.
    if scheme == "ftp":
        fetcher = fetch_ftp
    elif scheme in ("http", "https"):
        fetcher = fetch_http
    else:
        raise Exception("Not a supported/known scheme.")

    if location is None:
        # Only the unique name is needed: close the handle immediately, the
        # fetcher reopens the file for writing.
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            location = temp.name

    # The fetchers call Path methods on `location`, so always hand them a Path.
    return fetcher(url, Path(location))

0 commit comments

Comments
 (0)