From 2c6f027d3574e598044378eff0d393974238b82d Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:49:23 -0300 Subject: [PATCH 1/3] Allow sorting query parameters. Gives a nice increase in cache hits for naive apps that send unordered queries. --- cachecontrol/adapter.py | 6 +++++- cachecontrol/caches/file_cache.py | 4 ++-- cachecontrol/controller.py | 25 +++++++++++++++++-------- cachecontrol/wrapper.py | 6 ++++-- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index 195d49d9..dd4287b4 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -17,17 +17,20 @@ def __init__(self, cache=None, serializer=None, heuristic=None, cacheable_methods=None, + sort_query=False, *args, **kw): super(CacheControlAdapter, self).__init__(*args, **kw) self.cache = cache or DictCache() self.heuristic = heuristic self.cacheable_methods = cacheable_methods or ('GET',) + self.sort_query = sort_query controller_factory = controller_class or CacheController self.controller = controller_factory( self.cache, cache_etags=cache_etags, serializer=serializer, + sort_query=sort_query ) def send(self, request, cacheable_methods=None, **kw): @@ -117,7 +120,8 @@ def _update_chunk_length(self): # See if we should invalidate the cache. if request.method in self.invalidating_methods and resp.ok: - cache_url = self.controller.cache_url(request.url) + cache_url = self.controller.cache_url(request.url, + sort_query=self.sort_query) self.cache.delete(cache_url) # Give the request a from_cache attr to let people use it diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index be9b94dd..456c4565 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -124,10 +124,10 @@ def delete(self, key): pass -def url_to_file_path(url, filecache): +def url_to_file_path(url, filecache, sort_query=False): """Return the file cache path based on the URL. This does not ensure the file exists! """ - key = CacheController.cache_url(url) + key = CacheController.cache_url(url, sort_query=sort_query) return filecache._fn(key) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 1e1f9b6e..19cf03d5 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -31,14 +31,15 @@ class CacheController(object): """An interface to see if request should cached or not. """ def __init__(self, cache=None, cache_etags=True, serializer=None, - status_codes=None): + status_codes=None, sort_query=False): self.cache = cache or DictCache() self.cache_etags = cache_etags self.serializer = serializer or Serializer() self.cacheable_status_codes = status_codes or (200, 203, 300, 301) + self.sort_query = sort_query @classmethod - def _urlnorm(cls, uri): + def _urlnorm(cls, uri, sort_query=False): """Normalize the URL to create a safe key for the cache""" (scheme, authority, path, query, fragment) = parse_uri(uri) if not scheme or not authority: @@ -50,6 +51,14 @@ def _urlnorm(cls, uri): if not path: path = "/" + # Sorting the query might induce behavior changes in the response. + # Use with care. However, assuming param randomization, a query with + # four params has a 96% chance of missing the cache on the second + # request if a response has already been recorded. The chance of a + # hit grows to 50% after a dozen requests. + if query and sort_query: + query = '&'.join(sorted(query.split('&'))) + # Could do syntax based normalization of the URI before # computing the digest. See Section 6.2.2 of Std 66. request_uri = query and "?".join([path, query]) or path @@ -58,8 +67,8 @@ def _urlnorm(cls, uri): return defrag_uri @classmethod - def cache_url(cls, uri): - return cls._urlnorm(uri) + def cache_url(cls, uri, sort_query=False): + return cls._urlnorm(uri, sort_query=sort_query) def parse_cache_control(self, headers): """ @@ -90,7 +99,7 @@ def cached_request(self, request): Return a cached response if it exists in the cache, otherwise return False. """ - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) logger.debug('Looking up "%s" in the cache', cache_url) cc = self.parse_cache_control(request.headers) @@ -207,7 +216,7 @@ def cached_request(self, request): return False def conditional_headers(self, request): - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) resp = self.serializer.loads(request, self.cache.get(cache_url)) new_headers = {} @@ -255,7 +264,7 @@ def cache_response(self, request, response, body=None, cc_req = self.parse_cache_control(request.headers) cc = self.parse_cache_control(response_headers) - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) logger.debug('Updating cache with response from "%s"', cache_url) # Delete it from the cache if we happen to have it stored there @@ -317,7 +326,7 @@ def update_cached_response(self, request, response): This should only ever be called when we've sent an ETag and gotten a 304 as the response. """ - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) cached_response = self.serializer.loads( request, diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index b50a6e27..4b707afe 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -9,7 +9,8 @@ def CacheControl(sess, heuristic=None, controller_class=None, adapter_class=None, - cacheable_methods=None): + cacheable_methods=None, + sort_query=False): cache = cache or DictCache() adapter_class = adapter_class or CacheControlAdapter @@ -19,7 +20,8 @@ def CacheControl(sess, serializer=serializer, heuristic=heuristic, controller_class=controller_class, - cacheable_methods=cacheable_methods + cacheable_methods=cacheable_methods, + sort_query=sort_query ) sess.mount('http://', adapter) sess.mount('https://', adapter) From 43bd91ef7db9f387eb3b5642171f5a654d1d9881 Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:51:17 -0300 Subject: [PATCH 2/3] Some tests. Probably worth pruning. --- tests/test_adapter.py | 7 ++++ tests/test_cache_control.py | 29 ++++++++++++++++ tests/test_storage_filecache.py | 59 ++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 1 deletion(-) diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 556c6d3d..bd55f14a 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -54,3 +54,10 @@ def test_close(self): sess.close() assert cache.close.called + +def test_cache_control_sets_sort_query(): + for bool_ in (True, False): + sess = CacheControl(Session(), mock.Mock(spec=DictCache), + sort_query=bool_) + assert sess.adapters['http://'].sort_query == bool_ + assert sess.adapters['http://'].controller.sort_query == bool_ diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 77b8f90d..7254458f 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -4,6 +4,8 @@ import pytest from mock import ANY, Mock import time +from random import shuffle +import string from cachecontrol import CacheController from cachecontrol.cache import DictCache @@ -225,3 +227,30 @@ def test_cached_request_with_bad_max_age_headers_not_returned(self): self.c.cache = DictCache({self.url: resp}) assert not self.req({}) + +def test_cache_url_sorting(): + letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) + suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) + + def get_param(url): + """Mock losing order when processing params""" + shuffle(letter_n_numbers) + params = {k: v for v, k in letter_n_numbers} + url = url.replace(suff, '') + query = '&' + '&'.join('%s=%s' % item for item in params.items()) + return url + query + + no_query = 'http://example.com' + unsorted_query = 'http://example.com?b=2&c=3&a=1' + suff + sorted_query = 'http://example.com?a=1&b=2&c=3' + suff + + cache_url = CacheController.cache_url + assert cache_url(no_query, sort_query=True) == cache_url(no_query) + assert cache_url(unsorted_query) != cache_url(sorted_query) + assert cache_url(unsorted_query, True) == cache_url(sorted_query) + randomized = get_param(unsorted_query) + assert randomized != unsorted_query + assert cache_url(randomized) != cache_url(sorted_query) + assert cache_url(randomized, True) == cache_url(sorted_query) + randomized_again = get_param(unsorted_query) + assert cache_url(randomized, True) == cache_url(randomized_again, True) diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index f075a079..f13df00a 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -4,12 +4,13 @@ import os import string -from random import randint, sample +from random import randint, sample, shuffle import pytest import requests from cachecontrol import CacheControl from cachecontrol.caches import FileCache +from cachecontrol.caches.file_cache import url_to_file_path from lockfile import LockFile from lockfile.mkdirlockfile import MkdirLockFile @@ -110,3 +111,59 @@ def test_lock_class(self, tmpdir): lock_class = object() cache = FileCache(str(tmpdir), lock_class=lock_class) assert cache.lock_class is lock_class + + def test_url_to_file_path(self, tmpdir): + cache = FileCache(str(tmpdir)) + # We'll add a long sorted suffix so that unsorted queries have a low + # collision probability (about 3.8e-23 for each sorted/unsorted + # comparison). + letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) + suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) + + def get_param(url): + """Mock losing order when processing params""" + shuffle(letter_n_numbers) + params = {k: v for v, k in letter_n_numbers} + url = url.replace(suff, '') + query = '&' + '&'.join('%s=%s' % item for item in params.items()) + return url + query + + urls = { + 'no_query': 'http://example.com', + 'unsorted_query': 'http://example.com?b=2&c=3&a=1' + suff, + 'sorted_query': 'http://example.com?a=1&b=2&c=3' + suff, + 'unsorted_empty_value': 'http://example.com?b=&c=&a=1' + suff, + 'sorted_empty_value': 'http://example.com?a=1&b=&c=3' + suff, + 'unsorted_repeated_key': 'http://example.com?b=2&c=3&b=0' + '&c=5&a=1' + suff, + 'sorted_repeated_key': 'http://example.com?a=1&b=0&b=2&c=3&' + 'c=5' + suff} + + unoquery = url_to_file_path(urls['no_query'], cache, sort_query=False) + snoquery = url_to_file_path(urls['no_query'], cache, sort_query=True) + assert unoquery == snoquery + urls.pop('no_query') + + sortedpaths = {urlname: url_to_file_path(urlvalue, cache, True) for + urlname, urlvalue in urls.items()} + + for key, value in urls.items(): + if key.startswith('sorted'): + assert url_to_file_path(value, cache, True) == sortedpaths[key] + + unsortedpaths = {urlname: url_to_file_path(urlvalue, cache, False) for + urlname, urlvalue in urls.items()} + + for key, url in urls.items(): + if key.startswith('unsorted'): + assert sortedpaths[key] != unsortedpaths[key] + else: + assert sortedpaths[key] == unsortedpaths[key] + + # Randomize and sort params + sort_param = url_to_file_path(get_param(url), cache, True) + assert sort_param == sortedpaths[key] + + # Randomize but don't sort params + unsort_param = url_to_file_path(get_param(url), cache, False) + assert unsort_param != unsortedpaths[key] From cf1debbe12f8d1c0e10a0c4bc487e0569efdb918 Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:51:49 -0300 Subject: [PATCH 3/3] Add a section about query sorting to the tips doc. --- docs/tips.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/docs/tips.rst b/docs/tips.rst index b603300f..3a7d4e38 100644 --- a/docs/tips.rst +++ b/docs/tips.rst @@ -53,6 +53,17 @@ If you are caching requests that use a large number of query string parameters, consider sorting them to ensure that the request is properly cached. +.. note:: + + Assuming random parameter order, a request with four parameters + has a >95% chance of missing the cache on the second time, because + it grows with the number of permutations (1/24 for four + parameters, 1/125 for five, etc.). However, this problem doesn't + arise while the order stays the same. So if the same dict is used + as params many times it'll likely hit the cache. FileCache suffers + more from this when used on different Python runs, as this makes + order more random. + Requests supports passing both dictionaries and lists of tuples as the param argument in a request. For example: :: @@ -60,3 +71,24 @@ param argument in a request. For example: :: By ordering your params, you can be sure the cache key will be consistent across requests and you are caching effectively. + +For convenience, the sorting of query parameters can be done in +cachecontrol itself. For example: :: + + # Unsorted + sess = cachecontrol.CacheControl(requests.Session(), sort_query=False, + heuristic=ExpiresAfter(days=1)) + params = [('a', 'b'), ('c', 'd')] + resp = sess.get('http://www.reddit.com', params=params) + print(resp.from_cache) # False (first fetch) + resp = sess.get('http://www.reddit.com', params=reversed(params)) + print(resp.from_cache) # False (!) + + sess = cachecontrol.CacheControl(requests.Session(), sort_query=True, + heuristic=ExpiresAfter(days=1)) + params = [('a', 'b'), ('c', 'd')] + resp = sess.get('http://www.reddit.com', params=params) + print(resp.from_cache) # False (first fetch) + resp = sess.get('http://www.reddit.com', params=reversed(params)) + print(resp.from_cache) # True :) +