Skip to content

Trying to write to closed file when using requests.Session #147

@maxyousif15

Description

@maxyousif15

Overview

When a single requests.Session is reused inside a loop, with each iteration wrapped in its own capture_http block that writes to a new WARC file, a ValueError ("write to closed file") is raised.
However, when calling requests.get directly, without a session, everything works as expected.

Below is the code snippet that uses requests.Session, followed by the exception it raises:

from warcio.capture_http import capture_http
from requests.sessions import Session
import requests


HEADERS = {
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


session = Session()
for i in range(3):
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        session.get('https://httpbin.org/ip')

Below is the exception raised:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-99e75b92ba45> in <module>
      4     with capture_http(fn):
      5         print(f"Scraping {fn}")
----> 6         session.get('https://httpbin.org/ip')

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in get(self, url, **kwargs)
    540 
    541         kwargs.setdefault('allow_redirects', True)
--> 542         return self.request('GET', url, **kwargs)
    543 
    544     def options(self, url, **kwargs):

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    527         }
    528         send_kwargs.update(settings)
--> 529         resp = self.send(prep, **send_kwargs)
    530 
    531         return resp

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in send(self, request, **kwargs)
    685 
    686         if not stream:
--> 687             r.content
    688 
    689         return r

~/anaconda3/lib/python3.8/site-packages/requests/models.py in content(self)
    836                 self._content = None
    837             else:
--> 838                 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
    839 
    840         self._content_consumed = True

~/anaconda3/lib/python3.8/site-packages/requests/models.py in generate()
    758             if hasattr(self.raw, 'stream'):
    759                 try:
--> 760                     for chunk in self.raw.stream(chunk_size, decode_content=True):
    761                         yield chunk
    762                 except ProtocolError as e:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in stream(self, amt, decode_content)
    577         else:
    578             while not is_fp_closed(self._fp):
--> 579                 data = self.read(amt=amt, decode_content=decode_content)
    580 
    581                 if data:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    520             else:
    521                 cache_content = False
--> 522                 data = self._fp.read(amt) if not fp_closed else b""
    523                 if (
    524                     amt != 0 and not data

~/anaconda3/lib/python3.8/http/client.py in read(self, amt)
    456             # Amount is given, implement using readinto
    457             b = bytearray(amt)
--> 458             n = self.readinto(b)
    459             return memoryview(b)[:n].tobytes()
    460         else:

~/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
    508             self.length -= n
    509             if not self.length:
--> 510                 self._close_conn()
    511         return n
    512 

~/anaconda3/lib/python3.8/http/client.py in _close_conn(self)
    410         fp = self.fp
    411         self.fp = None
--> 412         fp.close()
    413 
    414     def close(self):

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in close(self)
     63 
     64     def close(self):
---> 65         self.recorder.done()
     66         if self.fp:
     67             return self.fp.close()

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in done(self)
    196 
    197             with self.lock:
--> 198                 self.writer.write_request_response_pair(request, response)
    199         finally:
    200             self.request_out.close()

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write_request_response_pair(self, req, resp, params)
     31             req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
     32 
---> 33         self._do_write_req_resp(req, resp, params)
     34 
     35     def write_record(self, record, params=None):  #pragma: no cover

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _do_write_req_resp(self, req, resp, params)
    138 
    139     def _do_write_req_resp(self, req, resp, params):
--> 140         self._write_warc_record(self.out, resp)
    141         self._write_warc_record(self.out, req)
    142 

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _write_warc_record(self, out, record)
     89         # write record headers -- encoded as utf-8
     90         # WARC headers can be utf-8 per spec
---> 91         out.write(record.rec_headers.to_bytes(encoding='utf-8'))
     92 
     93         # write headers buffer, if any

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write(self, buff)
    120         #    buff = buff.encode('utf-8')
    121         buff = self.compressor.compress(buff)
--> 122         self.out.write(buff)
    123 
    124     def flush(self):

ValueError: write to closed file

The following code snippet is intended to do the same thing as above, but without a session, and it works as expected:

for i in range(3):
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        requests.get('https://httpbin.org/ip', headers=HEADERS)

Environment

Python - 3.8.5
requests - 2.27.1
warcio - 1.7.4

Any help regarding this issue would be massively appreciated.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions