@@ -27,7 +27,9 @@ class CrawleraMiddleware(object):
     backoff_step = 15
     backoff_max = 180
     exp_backoff = None
+    force_enable_on_http_codes = []
     max_auth_retry_times = 10
+    enabled_for_domain = {}
 
     _settings = [
         ('apikey', str),
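One caveat worth flagging here (an editor's observation, not part of the patch): both new defaults are mutable class attributes, so every instance shares the same list and dict until an instance reassigns them. A minimal illustration of the pitfall:

class M(object):
    enabled_for_domain = {}

a, b = M(), M()
a.enabled_for_domain['example.com'] = True
print(b.enabled_for_domain)  # {'example.com': True} -- state leaks across instances

In practice Scrapy builds one middleware instance per crawler, so this rarely bites, but it is the classic shared-mutable-default pattern.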
@@ -37,6 +39,7 @@ class CrawleraMiddleware(object):
         ('preserve_delay', bool),
         ('backoff_step', int),
         ('backoff_max', int),
+        ('force_enable_on_http_codes', list),
     ]
 
     def __init__(self, crawler):
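Registering the key in _settings is what exposes it to project configuration. A sketch of how a project would set it, assuming each entry maps to an uppercased CRAWLERA_-prefixed Scrapy setting, as the existing keys do:

# settings.py (sketch; names assume the CRAWLERA_<KEY> convention)
CRAWLERA_ENABLED = False                          # start with the proxy off
CRAWLERA_APIKEY = '<your apikey>'
CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES = [403, 503]  # switch it on per-domain after bans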
@@ -54,15 +57,23 @@ def from_crawler(cls, crawler):
 
     def open_spider(self, spider):
         self.enabled = self.is_enabled(spider)
-        if not self.enabled:
-            return
-
         self.spider = spider
 
         for k, type_ in self._settings:
             setattr(self, k, self._get_setting_value(spider, k, type_))
 
+        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
+        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
+
+        if not self.enabled and not self.force_enable_on_http_codes:
+            return
+
+        if not self.apikey:
+            logging.warning("Crawlera can't be used without an APIKEY", extra={'spider': spider})
+            return
+
         self._proxyauth = self.get_proxyauth(spider)
+
         logging.info(
             "Using crawlera at %s (apikey: %s)" % (self.url, self.apikey[:7]),
             extra={'spider': spider},
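The reordering matters: settings, headers and the backoff generator are now initialized before the early return, so they exist even when the middleware starts disabled and is only force-enabled later. For readers unfamiliar with the helper, a sketch of what a full-jitter exp_backoff(step, max) generator like the one consumed here typically looks like (illustrative, not the library's exact code):

import random
from itertools import count

def exp_backoff(step, max_delay):
    # Delays grow as step * 2**attempt, capped at max_delay, with full jitter.
    # With the defaults above: 15, 30, 60, 120, 180, 180, ...
    for attempt in count():
        delay = min(step * 2 ** attempt, max_delay)
        yield random.uniform(0, delay)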
@@ -77,9 +88,6 @@ def open_spider(self, spider):
             extra={'spider': spider},
         )
 
-        self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
-        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
-
     def _settings_get(self, type_, *a, **kw):
         if type_ is int:
             return self.crawler.settings.getint(*a, **kw)
@@ -165,7 +173,11 @@ def _is_auth_error(self, response):
 
     def process_response(self, request, response, spider):
         if not self._is_enabled_for_request(request):
-            return response
+            return self._handle_not_enabled_response(request, response)
+
+        if not self._is_crawlera_response(response):
+            return request
+
         key = self._get_slot_key(request)
         self._restore_original_delay(request)
@@ -216,6 +228,13 @@ def process_exception(self, request, exception, spider):
             self._clear_dns_cache()
             self._set_custom_delay(request, self.connection_refused_delay)
 
+    def _handle_not_enabled_response(self, request, response):
+        if self._should_enable_for_response(response):
+            domain = self._get_url_domain(request.url)
+            self.enabled_for_domain[domain] = True
+            return request
+        return response
+
     def _retry_auth(self, response, request, spider):
         logging.warning(
             "Retrying crawlera request for authentication issue",
@@ -232,8 +251,21 @@ def _clear_dns_cache(self):
         # so client can reconnect through DNS failover.
         dnscache.pop(urlparse(self.url).hostname, None)
 
+    def _should_enable_for_response(self, response):
+        return response.status in self.force_enable_on_http_codes
+
     def _is_enabled_for_request(self, request):
-        return self.enabled and not request.meta.get('dont_proxy', False)
+        domain = self._get_url_domain(request.url)
+        domain_enabled = self.enabled_for_domain.get(domain, False)
+        dont_proxy = request.meta.get('dont_proxy', False)
+        return (domain_enabled or self.enabled) and not dont_proxy
+
+    def _get_url_domain(self, url):
+        parsed = urlparse(url)
+        return parsed.netloc
+
+    def _is_crawlera_response(self, response):
+        return bool(response.headers.get("X-Crawlera-Version"))
 
     def _get_slot_key(self, request):
         return request.meta.get('download_slot')
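Note that _get_url_domain returns urlparse(url).netloc, which is the full network location (host plus any port or userinfo), so example.com, example.com:8080 and www.example.com are all tracked as distinct "domains" in enabled_for_domain:

from urllib.parse import urlparse  # Python 3 path; the module imports its own urlparse

urlparse('https://example.com/page').netloc       # 'example.com'
urlparse('https://example.com:8080/page').netloc  # 'example.com:8080'
urlparse('https://www.example.com/page').netloc   # 'www.example.com'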