Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow sorting query parameters #147

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cachecontrol/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,20 @@ def __init__(self, cache=None,
serializer=None,
heuristic=None,
cacheable_methods=None,
sort_query=False,
*args, **kw):
super(CacheControlAdapter, self).__init__(*args, **kw)
self.cache = cache or DictCache()
self.heuristic = heuristic
self.cacheable_methods = cacheable_methods or ('GET',)
self.sort_query = sort_query

controller_factory = controller_class or CacheController
self.controller = controller_factory(
self.cache,
cache_etags=cache_etags,
serializer=serializer,
sort_query=sort_query
)

def send(self, request, cacheable_methods=None, **kw):
Expand Down Expand Up @@ -117,7 +120,8 @@ def _update_chunk_length(self):

# See if we should invalidate the cache.
if request.method in self.invalidating_methods and resp.ok:
cache_url = self.controller.cache_url(request.url)
cache_url = self.controller.cache_url(request.url,
sort_query=self.sort_query)
self.cache.delete(cache_url)

# Give the request a from_cache attr to let people use it
Expand Down
4 changes: 2 additions & 2 deletions cachecontrol/caches/file_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ def delete(self, key):
pass


def url_to_file_path(url, filecache):
def url_to_file_path(url, filecache, sort_query=False):
"""Return the file cache path based on the URL.

This does not ensure the file exists!
"""
key = CacheController.cache_url(url)
key = CacheController.cache_url(url, sort_query=sort_query)
return filecache._fn(key)
25 changes: 17 additions & 8 deletions cachecontrol/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,15 @@ class CacheController(object):
"""An interface to see if request should cached or not.
"""
def __init__(self, cache=None, cache_etags=True, serializer=None,
status_codes=None):
status_codes=None, sort_query=False):
self.cache = cache or DictCache()
self.cache_etags = cache_etags
self.serializer = serializer or Serializer()
self.cacheable_status_codes = status_codes or (200, 203, 300, 301)
self.sort_query = sort_query

@classmethod
def _urlnorm(cls, uri):
def _urlnorm(cls, uri, sort_query=False):
"""Normalize the URL to create a safe key for the cache"""
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
Expand All @@ -50,6 +51,14 @@ def _urlnorm(cls, uri):
if not path:
path = "/"

# Sorting the query might induce behavior changes in the response.
# Use with care. However, assuming param randomization, a query with
# four params has a 96% chance of missing the cache on the second
# request if a response has already been recorded. The chance of a
# hit grows to 50% after a dozen requests.
if query and sort_query:
query = '&'.join(sorted(query.split('&')))

# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
Expand All @@ -58,8 +67,8 @@ def _urlnorm(cls, uri):
return defrag_uri

@classmethod
def cache_url(cls, uri):
return cls._urlnorm(uri)
def cache_url(cls, uri, sort_query=False):
return cls._urlnorm(uri, sort_query=sort_query)

def parse_cache_control(self, headers):
"""
Expand Down Expand Up @@ -90,7 +99,7 @@ def cached_request(self, request):
Return a cached response if it exists in the cache, otherwise
return False.
"""
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
logger.debug('Looking up "%s" in the cache', cache_url)
cc = self.parse_cache_control(request.headers)

Expand Down Expand Up @@ -207,7 +216,7 @@ def cached_request(self, request):
return False

def conditional_headers(self, request):
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
resp = self.serializer.loads(request, self.cache.get(cache_url))
new_headers = {}

Expand Down Expand Up @@ -255,7 +264,7 @@ def cache_response(self, request, response, body=None,
cc_req = self.parse_cache_control(request.headers)
cc = self.parse_cache_control(response_headers)

cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
logger.debug('Updating cache with response from "%s"', cache_url)

# Delete it from the cache if we happen to have it stored there
Expand Down Expand Up @@ -317,7 +326,7 @@ def update_cached_response(self, request, response):
This should only ever be called when we've sent an ETag and
gotten a 304 as the response.
"""
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)

cached_response = self.serializer.loads(
request,
Expand Down
6 changes: 4 additions & 2 deletions cachecontrol/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def CacheControl(sess,
heuristic=None,
controller_class=None,
adapter_class=None,
cacheable_methods=None):
cacheable_methods=None,
sort_query=False):

cache = cache or DictCache()
adapter_class = adapter_class or CacheControlAdapter
Expand All @@ -19,7 +20,8 @@ def CacheControl(sess,
serializer=serializer,
heuristic=heuristic,
controller_class=controller_class,
cacheable_methods=cacheable_methods
cacheable_methods=cacheable_methods,
sort_query=sort_query
)
sess.mount('http://', adapter)
sess.mount('https://', adapter)
Expand Down
32 changes: 32 additions & 0 deletions docs/tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,42 @@ If you are caching requests that use a large number of query string
parameters, consider sorting them to ensure that the request is
properly cached.

.. note::

Assuming random parameter order, a request with four parameters
has a >95% chance of missing the cache on a second request, because
the hit probability shrinks with the number of permutations (a 1/24
chance of a hit for four parameters, 1/120 for five, etc.). However, this problem doesn't
arise while the order stays the same. So if the same dict is used
as params many times it'll likely hit the cache. FileCache suffers
more from this when used on different Python runs, as this makes
order more random.

Requests supports passing both dictionaries and lists of tuples as the
param argument in a request. For example: ::

requests.get(url, params=sorted([('foo', 'one'), ('bar', 'two')]))

By ordering your params, you can be sure the cache key will be
consistent across requests and you are caching effectively.

For convenience, the sorting of query parameters can be done in
cachecontrol itself. For example: ::

# Unsorted
sess = cachecontrol.CacheControl(requests.Session(), sort_query=False,
heuristic=ExpiresAfter(days=1))
params = [('a', 'b'), ('c', 'd')]
resp = sess.get('http://www.reddit.com', params=params)
print(resp.from_cache) # False (first fetch)
resp = sess.get('http://www.reddit.com', params=reversed(params))
print(resp.from_cache) # False (!)

sess = cachecontrol.CacheControl(requests.Session(), sort_query=True,
heuristic=ExpiresAfter(days=1))
params = [('a', 'b'), ('c', 'd')]
resp = sess.get('http://www.reddit.com', params=params)
print(resp.from_cache) # False (first fetch)
resp = sess.get('http://www.reddit.com', params=reversed(params))
print(resp.from_cache) # True :)

7 changes: 7 additions & 0 deletions tests/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,10 @@ def test_close(self):

sess.close()
assert cache.close.called

def test_cache_control_sets_sort_query():
    """CacheControl() must propagate sort_query to both the mounted
    adapter and its controller, for either boolean value."""
    for flag in (True, False):
        wrapped = CacheControl(Session(), mock.Mock(spec=DictCache),
                               sort_query=flag)
        http_adapter = wrapped.adapters['http://']
        assert http_adapter.sort_query == flag
        assert http_adapter.controller.sort_query == flag
29 changes: 29 additions & 0 deletions tests/test_cache_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import pytest
from mock import ANY, Mock
import time
from random import shuffle
import string

from cachecontrol import CacheController
from cachecontrol.cache import DictCache
Expand Down Expand Up @@ -225,3 +227,30 @@ def test_cached_request_with_bad_max_age_headers_not_returned(self):
self.c.cache = DictCache({self.url: resp})

assert not self.req({})

def test_cache_url_sorting():
    """cache_url(sort_query=True) must give order-independent cache keys."""
    # Append 23 extra params (d=4 .. z=26) so a shuffled query is
    # virtually guaranteed to differ from the sorted one.
    pairs = list(enumerate(string.ascii_lowercase[3:], start=4))
    suff = '&' + '&'.join('%s=%s' % (letter, num) for num, letter in pairs)

    def get_param(url):
        """Mock losing order when processing params"""
        shuffle(pairs)
        reordered = {letter: num for num, letter in pairs}
        stripped = url.replace(suff, '')
        tail = '&' + '&'.join('%s=%s' % kv for kv in reordered.items())
        return stripped + tail

    no_query = 'http://example.com'
    unsorted_query = 'http://example.com?b=2&c=3&a=1' + suff
    sorted_query = 'http://example.com?a=1&b=2&c=3' + suff

    cache_url = CacheController.cache_url

    # Without a query string, sort_query must make no difference.
    assert cache_url(no_query, sort_query=True) == cache_url(no_query)

    # Param order changes the key unless sorting is enabled.
    assert cache_url(unsorted_query) != cache_url(sorted_query)
    assert cache_url(unsorted_query, True) == cache_url(sorted_query)

    randomized = get_param(unsorted_query)
    assert randomized != unsorted_query
    assert cache_url(randomized) != cache_url(sorted_query)
    assert cache_url(randomized, True) == cache_url(sorted_query)

    # Two independently shuffled queries collapse onto one sorted key.
    randomized_again = get_param(unsorted_query)
    assert cache_url(randomized, True) == cache_url(randomized_again, True)
59 changes: 58 additions & 1 deletion tests/test_storage_filecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import os
import string

from random import randint, sample
from random import randint, sample, shuffle

import pytest
import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
from cachecontrol.caches.file_cache import url_to_file_path
from lockfile import LockFile
from lockfile.mkdirlockfile import MkdirLockFile

Expand Down Expand Up @@ -110,3 +111,59 @@ def test_lock_class(self, tmpdir):
lock_class = object()
cache = FileCache(str(tmpdir), lock_class=lock_class)
assert cache.lock_class is lock_class

def test_url_to_file_path(self, tmpdir):
    """url_to_file_path(sort_query=True) must map order-permuted queries
    onto the same cache file path, and must be a no-op for URLs whose
    query is already sorted (or absent).

    NOTE(review): method of the enclosing test class; ``tmpdir`` is
    presumably the pytest fixture giving a fresh FileCache directory.
    """
    cache = FileCache(str(tmpdir))
    # We'll add a long sorted suffix so that unsorted queries have a low
    # collision probability (about 3.8e-23 for each sorted/unsorted
    # comparison).
    # (ascii_lowercase[3:] is d..z -> 23 extra params; 1/23! ~= 3.9e-23
    # chance a shuffle reproduces the original order.)
    letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4))
    suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers)

    def get_param(url):
        """Mock losing order when processing params"""
        # Shuffle in place, strip the known suffix, then re-append it in
        # the new order: same params as `url`, randomized order.
        shuffle(letter_n_numbers)
        params = {k: v for v, k in letter_n_numbers}
        url = url.replace(suff, '')
        query = '&' + '&'.join('%s=%s' % item for item in params.items())
        return url + query

    # unsorted_* / sorted_* pairs cover plain, empty-value, and
    # repeated-key queries; every URL carries the long sorted suffix.
    urls = {
        'no_query': 'http://example.com',
        'unsorted_query': 'http://example.com?b=2&c=3&a=1' + suff,
        'sorted_query': 'http://example.com?a=1&b=2&c=3' + suff,
        'unsorted_empty_value': 'http://example.com?b=&c=&a=1' + suff,
        'sorted_empty_value': 'http://example.com?a=1&b=&c=3' + suff,
        'unsorted_repeated_key': 'http://example.com?b=2&c=3&b=0'
                                 '&c=5&a=1' + suff,
        'sorted_repeated_key': 'http://example.com?a=1&b=0&b=2&c=3&'
                               'c=5' + suff}

    # A URL with no query string hashes identically either way.
    unoquery = url_to_file_path(urls['no_query'], cache, sort_query=False)
    snoquery = url_to_file_path(urls['no_query'], cache, sort_query=True)
    assert unoquery == snoquery
    urls.pop('no_query')

    sortedpaths = {urlname: url_to_file_path(urlvalue, cache, True) for
                   urlname, urlvalue in urls.items()}

    # Sorting is deterministic: a repeated call yields the same path.
    for key, value in urls.items():
        if key.startswith('sorted'):
            assert url_to_file_path(value, cache, True) == sortedpaths[key]

    unsortedpaths = {urlname: url_to_file_path(urlvalue, cache, False) for
                     urlname, urlvalue in urls.items()}

    for key, url in urls.items():
        # sort_query only changes the path when the query was unsorted.
        if key.startswith('unsorted'):
            assert sortedpaths[key] != unsortedpaths[key]
        else:
            assert sortedpaths[key] == unsortedpaths[key]

        # Randomize and sort params
        sort_param = url_to_file_path(get_param(url), cache, True)
        assert sort_param == sortedpaths[key]

        # Randomize but don't sort params
        unsort_param = url_to_file_path(get_param(url), cache, False)
        assert unsort_param != unsortedpaths[key]