-
-
Notifications
You must be signed in to change notification settings - Fork 64
Open
Description
Overview
When using `requests.Session` with `capture_http` inside a loop that creates a new WARC file on each iteration, a `ValueError: write to closed file` is raised.
However, when calling `requests.get` directly, without a session, everything works as expected.
Below is the code snippet that uses `requests.Session`, followed by the exception it raises:
from warcio.capture_http import capture_http
from requests.sessions import Session
import requests
HEADERS = {
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cache-control': 'max-age=0',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}
session = Session()
for i in range(3):
fn = f'example-session-error-{i}.warc.gz'
with capture_http(fn):
print(f"Scraping {fn}")
session.get('https://httpbin.org/ip')
Below is the exception raised:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-99e75b92ba45> in <module>
4 with capture_http(fn):
5 print(f"Scraping {fn}")
----> 6 session.get('https://httpbin.org/ip')
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in get(self, url, **kwargs)
540
541 kwargs.setdefault('allow_redirects', True)
--> 542 return self.request('GET', url, **kwargs)
543
544 def options(self, url, **kwargs):
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
527 }
528 send_kwargs.update(settings)
--> 529 resp = self.send(prep, **send_kwargs)
530
531 return resp
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in send(self, request, **kwargs)
685
686 if not stream:
--> 687 r.content
688
689 return r
~/anaconda3/lib/python3.8/site-packages/requests/models.py in content(self)
836 self._content = None
837 else:
--> 838 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
839
840 self._content_consumed = True
~/anaconda3/lib/python3.8/site-packages/requests/models.py in generate()
758 if hasattr(self.raw, 'stream'):
759 try:
--> 760 for chunk in self.raw.stream(chunk_size, decode_content=True):
761 yield chunk
762 except ProtocolError as e:
~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in stream(self, amt, decode_content)
577 else:
578 while not is_fp_closed(self._fp):
--> 579 data = self.read(amt=amt, decode_content=decode_content)
580
581 if data:
~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
520 else:
521 cache_content = False
--> 522 data = self._fp.read(amt) if not fp_closed else b""
523 if (
524 amt != 0 and not data
~/anaconda3/lib/python3.8/http/client.py in read(self, amt)
456 # Amount is given, implement using readinto
457 b = bytearray(amt)
--> 458 n = self.readinto(b)
459 return memoryview(b)[:n].tobytes()
460 else:
~/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
508 self.length -= n
509 if not self.length:
--> 510 self._close_conn()
511 return n
512
~/anaconda3/lib/python3.8/http/client.py in _close_conn(self)
410 fp = self.fp
411 self.fp = None
--> 412 fp.close()
413
414 def close(self):
~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in close(self)
63
64 def close(self):
---> 65 self.recorder.done()
66 if self.fp:
67 return self.fp.close()
~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in done(self)
196
197 with self.lock:
--> 198 self.writer.write_request_response_pair(request, response)
199 finally:
200 self.request_out.close()
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write_request_response_pair(self, req, resp, params)
31 req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
32
---> 33 self._do_write_req_resp(req, resp, params)
34
35 def write_record(self, record, params=None): #pragma: no cover
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _do_write_req_resp(self, req, resp, params)
138
139 def _do_write_req_resp(self, req, resp, params):
--> 140 self._write_warc_record(self.out, resp)
141 self._write_warc_record(self.out, req)
142
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _write_warc_record(self, out, record)
89 # write record headers -- encoded as utf-8
90 # WARC headers can be utf-8 per spec
---> 91 out.write(record.rec_headers.to_bytes(encoding='utf-8'))
92
93 # write headers buffer, if any
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write(self, buff)
120 # buff = buff.encode('utf-8')
121 buff = self.compressor.compress(buff)
--> 122 self.out.write(buff)
123
124 def flush(self):
ValueError: write to closed file
The following snippet is intended to do the same thing as above without using a session, and it works as expected:
for i in range(3):
fn = f'example-session-error-{i}.warc.gz'
with capture_http(fn):
print(f"Scraping {fn}")
requests.get('https://httpbin.org/ip', headers=HEADERS)
Environment
Python - 3.8.5
requests - 2.27.1
warcio - 1.7.4
Any help regarding this issue would be massively appreciated.
Metadata
Metadata
Assignees
Labels
No labels