Skip to content

Commit 3805f4e

Browse files
committed
Added headers to requests.get calls
1 parent 0d1f938 commit 3805f4e

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

Diff for: bitextor/bitextor_wget.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -132,14 +132,20 @@ def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
132132

133133
if '//' not in args.url:
134134
args.url = '%s%s' % ('http://', args.url)
135+
headers = requests.utils.default_headers()
135136

137+
headers.update(
138+
{
139+
'User-Agent': 'Mozilla/5.0 (compatible; Bitextor/8 +https://github.com/bitextor/bitextor)',
140+
}
141+
)
136142
connection_error, fixed_url = check_connection(args.url)
137143

138144
if not connection_error:
139145
args.url = fixed_url
140146

141147
try:
142-
robots = requests.get(args.url + "/robots.txt", timeout=15).text.split("\n")
148+
robots = requests.get(args.url + "/robots.txt", timeout=15, headers=headers).text.split("\n")
143149
for line in robots:
144150
if "Crawl-delay" in line:
145151
try:

Diff for: bitextor/utils/common.py

+8 -1
Original file line number | Diff line number | Diff line change
@@ -155,10 +155,17 @@ def check_lengths(file_path_from, file_path_to, throw=True):
155155
def check_connection(url):
156156
connection_error = False
157157
connection = None
158+
headers = requests.utils.default_headers()
159+
160+
headers.update(
161+
{
162+
'User-Agent': 'Mozilla/5.0 (compatible; Bitextor/8 +https://github.com/bitextor/bitextor)',
163+
}
164+
)
158165

159166
for check in range(2):
160167
try:
161-
connection = requests.get(url, timeout=15)
168+
connection = requests.get(url, timeout=15, headers=headers)
162169
except requests.exceptions.ConnectTimeout:
163170
if check:
164171
connection_error = True

0 commit comments

Comments
 (0)