
Commit 8d4f63b

Merge pull request #75 from scrapy-plugins/crawlera-on-demand

Enable Crawlera On Demand

2 parents 77a98eb + 06b19bd

3 files changed (+204, -34 lines)

docs/settings.rst
Lines changed: 12 additions & 0 deletions

```diff
@@ -63,3 +63,15 @@ CRAWLERA_BACKOFF_MAX
 Default: ``180``
 
 Max value for exponential backoff as showed in the formula above.
+
+CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES
+------------------------------------
+
+Default: ``[]``
+
+List of HTTP response status codes that warrant enabling Crawlera for the
+corresponding domain.
+
+When a response with one of these HTTP status codes is received after a request
+that did not go through Crawlera, the request is retried with Crawlera, and any
+new request to the same domain is also sent through Crawlera.
```
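For example (a sketch, not part of this commit), a project that wants Crawlera only as a per-domain fallback could combine the new setting with the plugin's existing ones in its settings.py; the status codes 403 and 503 and the key below are placeholders:

```python
# settings.py, illustrative sketch: codes and key are placeholders.
# Crawlera stays disabled until one of the listed status codes is seen,
# then kicks in for the offending domain only.
CRAWLERA_ENABLED = False
CRAWLERA_APIKEY = '<your apikey>'
CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES = [403, 503]

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
```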

scrapy_crawlera/middleware.py
Lines changed: 40 additions & 8 deletions

```diff
@@ -27,7 +27,9 @@ class CrawleraMiddleware(object):
     backoff_step = 15
     backoff_max = 180
     exp_backoff = None
+    force_enable_on_http_codes = []
     max_auth_retry_times = 10
+    enabled_for_domain = {}
 
     _settings = [
         ('apikey', str),
@@ -37,6 +39,7 @@ class CrawleraMiddleware(object):
         ('preserve_delay', bool),
         ('backoff_step', int),
         ('backoff_max', int),
+        ('force_enable_on_http_codes', list),
     ]
 
     def __init__(self, crawler):
@@ -54,15 +57,23 @@ def from_crawler(cls, crawler):
 
     def open_spider(self, spider):
         self.enabled = self.is_enabled(spider)
-        if not self.enabled:
-            return
-
         self.spider = spider
 
         for k, type_ in self._settings:
             setattr(self, k, self._get_setting_value(spider, k, type_))
 
+        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
+        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
+
+        if not self.enabled and not self.force_enable_on_http_codes:
+            return
+
+        if not self.apikey:
+            logging.warning("Crawlera can't be used without a APIKEY", extra={'spider': spider})
+            return
+
         self._proxyauth = self.get_proxyauth(spider)
+
         logging.info(
             "Using crawlera at %s (apikey: %s)" % (self.url, self.apikey[:7]),
             extra={'spider': spider},
@@ -77,9 +88,6 @@ def open_spider(self, spider):
             extra={'spider': spider},
         )
 
-        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
-        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
-
     def _settings_get(self, type_, *a, **kw):
         if type_ is int:
             return self.crawler.settings.getint(*a, **kw)
@@ -165,7 +173,11 @@ def _is_auth_error(self, response):
 
     def process_response(self, request, response, spider):
         if not self._is_enabled_for_request(request):
-            return response
+            return self._handle_not_enabled_response(request, response)
+
+        if not self._is_crawlera_response(response):
+            return request
+
         key = self._get_slot_key(request)
         self._restore_original_delay(request)
 
@@ -216,6 +228,13 @@ def process_exception(self, request, exception, spider):
         self._clear_dns_cache()
         self._set_custom_delay(request, self.connection_refused_delay)
 
+    def _handle_not_enabled_response(self, request, response):
+        if self._should_enable_for_response(response):
+            domain = self._get_url_domain(request.url)
+            self.enabled_for_domain[domain] = True
+            return request
+        return response
+
     def _retry_auth(self, response, request, spider):
         logging.warning(
             "Retrying crawlera request for authentication issue",
@@ -232,8 +251,21 @@ def _clear_dns_cache(self):
         # so client can reconnect trough DNS failover.
         dnscache.pop(urlparse(self.url).hostname, None)
 
+    def _should_enable_for_response(self, response):
+        return response.status in self.force_enable_on_http_codes
+
     def _is_enabled_for_request(self, request):
-        return self.enabled and not request.meta.get('dont_proxy', False)
+        domain = self._get_url_domain(request.url)
+        domain_enabled = self.enabled_for_domain.get(domain, False)
+        dont_proxy = request.meta.get('dont_proxy', False)
+        return (domain_enabled or self.enabled) and not dont_proxy
+
+    def _get_url_domain(self, url):
+        parsed = urlparse(url)
+        return parsed.netloc
+
+    def _is_crawlera_response(self, response):
+        return bool(response.headers.get("X-Crawlera-Version"))
 
     def _get_slot_key(self, request):
         return request.meta.get('download_slot')
```
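To make the resulting on-demand flow concrete, here is a minimal sketch (not from the commit) that hand-wires a middleware instance, bypassing __init__ and open_spider purely for illustration, with enabled off and 503 as an assumed ban code:

```python
# Sketch of the on-demand flow: in a real crawl Scrapy builds the
# middleware via from_crawler(); here only the attributes the code
# path below touches are set up by hand.
from scrapy.http import Request, Response
from scrapy_crawlera import CrawleraMiddleware

mw = CrawleraMiddleware.__new__(CrawleraMiddleware)
mw.enabled = False                      # Crawlera off globally
mw.force_enable_on_http_codes = [503]   # assumed ban code
mw.enabled_for_domain = {}

request = Request('https://example.com/page')
response = Response(request.url, status=503, request=request)

# Crawlera is not enabled for this request and the 503 matches
# force_enable_on_http_codes, so the middleware flags the domain and
# returns the request itself; Scrapy reschedules it, now via Crawlera.
out = mw.process_response(request, response, spider=None)
assert out is request
assert mw.enabled_for_domain['example.com'] is True

# From now on the domain passes _is_enabled_for_request(), even
# though the global `enabled` flag is still False.
assert mw._is_enabled_for_request(Request('https://example.com/other'))
```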
