Commit b4a7548 (1 parent: 370500b)

Make exp backoff an iterator, add configurability

File tree: 3 files changed (+29 −37 lines)

scrapy_crawlera/middleware.py (9 additions, 20 deletions)

```diff
@@ -10,7 +10,7 @@
 from scrapy.exceptions import ScrapyDeprecationWarning
 from twisted.internet.error import ConnectionRefusedError, ConnectionDone
 
-from scrapy_crawlera.utils import exp_backoff_full_jitter
+from scrapy_crawlera.utils import exp_backoff
 
 
 class CrawleraMiddleware(object):
@@ -24,9 +24,9 @@ class CrawleraMiddleware(object):
     preserve_delay = False
     header_prefix = 'X-Crawlera-'
     conflicting_headers = ('X-Crawlera-Profile', 'X-Crawlera-UA')
-    noslaves_attempts = 0
-    noslaves_base_delay = 15
-    noslaves_max_delay = 180
+    backoff_step = 15
+    backoff_max = 180
+    exp_backoff = None
 
     _settings = [
         ('apikey', str),
@@ -36,6 +36,8 @@ class CrawleraMiddleware(object):
         ('maxbans', int),
         ('download_timeout', int),
         ('preserve_delay', bool),
+        ('backoff_step', int),
+        ('backoff_max', int),
     ]
 
     def __init__(self, crawler):
@@ -71,6 +73,7 @@ def open_spider(self, spider):
                 "To avoid this behaviour you can use the CRAWLERA_PRESERVE_DELAY setting but keep in mind that this may slow down the crawl significantly")
 
         self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
+        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
 
     def _settings_get(self, type_, *a, **kw):
         if type_ is int:
@@ -158,8 +161,7 @@ def process_response(self, request, response, spider):
             self._restore_original_delay(request)
 
         if self._is_no_available_proxies(response):
-            after = self._get_noslaves_delay()
-            self._set_custom_delay(request, after)
+            self._set_custom_delay(request, next(self.exp_backoff))
         else:
             self._reset_noslaves_delay()
 
@@ -207,22 +209,9 @@ def _get_slot(self, request):
         key = self._get_slot_key(request)
         return key, self.crawler.engine.downloader.slots.get(key)
 
-    def _get_noslaves_delay(self):
-        """
-        Returns the amount of delay to use in case of no available proxies,
-        also increments the number of attempts due to no proxies
-        """
-        delay = exp_backoff_full_jitter(
-            self.noslaves_attempts,
-            self.noslaves_max_delay,
-            self.noslaves_base_delay
-        )
-        self.noslaves_attempts += 1
-        return delay
-
     def _reset_noslaves_delay(self):
         """Reset the number of attempts due to no available proxies"""
-        self.noslaves_attempts = 0
+        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
 
     def _set_custom_delay(self, request, delay):
         """Set custom delay for slot and save original one."""
```

scrapy_crawlera/utils.py (10 additions, 11 deletions)

```diff
@@ -1,17 +1,16 @@
 import math
 import random
 
-
-def exp_backoff(attempt, cap, base):
-    """ Exponential backoff time """
-    # this is a numerically stable version of
-    # min(cap, base * 2 ** attempt)
-    max_attempts = math.log(cap / base, 2)
-    if attempt <= max_attempts:
-        return base * 2 ** attempt
-    return cap
+from itertools import count
 
 
-def exp_backoff_full_jitter(attempt, cap, base):
+def exp_backoff(step, max):
     """ Exponential backoff time with Full Jitter """
-    return random.uniform(0, exp_backoff(attempt, cap, base))
+    # this is a numerically stable version of
+    # min(max, step * 2 ** attempt)
+    max_attempts = math.log(max / step, 2)
+    for attempt in count(0, 1):
+        if attempt < max_attempts:
+            yield random.uniform(0, step * 2 ** attempt)
+        else:
+            yield random.uniform(0, max)
```
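The rewritten exp_backoff is an infinite generator implementing capped exponential backoff with Full Jitter: the n-th delay is drawn uniformly from [0, min(max, step * 2 ** n)]. Below is a self-contained, runnable copy of the same logic with a short driver; the only deviation is renaming the max parameter to max_delay to avoid shadowing the builtin:

```python
import math
import random
from itertools import count


def exp_backoff(step, max_delay):
    """Exponential backoff with Full Jitter, as an infinite generator."""
    # numerically stable version of min(max_delay, step * 2 ** attempt):
    # comparing attempt against log2(max_delay / step) avoids evaluating
    # ever-larger powers of two once the cap has been reached
    max_attempts = math.log(max_delay / step, 2)
    for attempt in count(0, 1):
        if attempt < max_attempts:
            # uncapped region: draw uniformly from [0, step * 2 ** attempt]
            yield random.uniform(0, step * 2 ** attempt)
        else:
            # capped region: draw uniformly from [0, max_delay]
            yield random.uniform(0, max_delay)


delays = exp_backoff(15, 180)
print([round(next(delays), 1) for _ in range(6)])
# e.g. [7.3, 22.1, 41.9, 96.4, 152.0, 33.7] -- random, each value
# bounded above by min(180, 15 * 2 ** attempt)
```

Full Jitter (randomizing over the whole interval rather than adding noise to the deterministic value) is the variant that best de-synchronizes many clients backing off against the same resource.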

tests/test_crawlera.py (10 additions, 6 deletions)

```diff
@@ -14,6 +14,8 @@
 from scrapy_crawlera import CrawleraMiddleware
 import os
 
+from scrapy_crawlera.utils import exp_backoff
+
 
 class MockedSlot(object):
 
@@ -478,9 +480,12 @@ def test_noslaves_delays(self, random_uniform_patch):
         url = 'http://www.scrapytest.org'
         ban_url = 'http://ban.me'
         max_delay = 70
-        initial_delay = 15
+        backoff_step = 15
         default_delay = 0
 
+        self.settings['CRAWLERA_BACKOFF_STEP'] = backoff_step
+        self.settings['CRAWLERA_BACKOFF_MAX'] = max_delay
+
         self.spider.crawlera_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
@@ -497,13 +502,12 @@ def test_noslaves_delays(self, random_uniform_patch):
 
         # delays grow exponentially
         mw.process_response(noslaves_req, noslaves_res, self.spider)
-        self.assertEqual(slot.delay, initial_delay)
-
+        self.assertEqual(slot.delay, backoff_step)
         mw.process_response(noslaves_req, noslaves_res, self.spider)
-        self.assertEqual(slot.delay, initial_delay * 2 ** 1)
+        self.assertEqual(slot.delay, backoff_step * 2 ** 1)
 
         mw.process_response(noslaves_req, noslaves_res, self.spider)
-        self.assertEqual(slot.delay, initial_delay * 2 ** 2)
+        self.assertEqual(slot.delay, backoff_step * 2 ** 2)
 
         mw.process_response(noslaves_req, noslaves_res, self.spider)
         self.assertEqual(slot.delay, max_delay)
@@ -517,7 +521,7 @@ def test_noslaves_delays(self, random_uniform_patch):
         self.assertEqual(slot.delay, default_delay)
 
         mw.process_response(noslaves_req, noslaves_res, self.spider)
-        self.assertEqual(slot.delay, initial_delay)
+        self.assertEqual(slot.delay, backoff_step)
 
         good_req = Request(url, meta={'download_slot': slot_key})
         good_res = Response(
```
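The exact equality assertions only work because the test patches random.uniform (the random_uniform_patch argument in the signature above). A minimal sketch of that determinism, assuming, as the asserted values imply, that the patch makes uniform(a, b) return its upper bound b:

```python
from unittest import mock

from scrapy_crawlera.utils import exp_backoff

# pin random.uniform(a, b) to its upper bound b, removing the jitter
with mock.patch('random.uniform', side_effect=lambda a, b: b):
    delays = exp_backoff(15, 70)
    print([next(delays) for _ in range(4)])
# -> [15, 30, 60, 70]: step * 2 ** attempt, then the 70-second cap,
# matching the slot.delay assertions in the test
```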
