Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow sorting query parameters #147

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cachecontrol/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,20 @@ def __init__(self, cache=None,
serializer=None,
heuristic=None,
cacheable_methods=None,
sort_query=False,
*args, **kw):
super(CacheControlAdapter, self).__init__(*args, **kw)
self.cache = cache or DictCache()
self.heuristic = heuristic
self.cacheable_methods = cacheable_methods or ('GET',)
self.sort_query = sort_query

controller_factory = controller_class or CacheController
self.controller = controller_factory(
self.cache,
cache_etags=cache_etags,
serializer=serializer,
sort_query=sort_query
)

def send(self, request, cacheable_methods=None, **kw):
Expand Down Expand Up @@ -117,7 +120,8 @@ def _update_chunk_length(self):

# See if we should invalidate the cache.
if request.method in self.invalidating_methods and resp.ok:
cache_url = self.controller.cache_url(request.url)
cache_url = self.controller.cache_url(request.url,
sort_query=self.sort_query)
self.cache.delete(cache_url)

# Give the request a from_cache attr to let people use it
Expand Down
4 changes: 2 additions & 2 deletions cachecontrol/caches/file_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ def delete(self, key):
pass


def url_to_file_path(url, filecache):
def url_to_file_path(url, filecache, sort_query=False):
"""Return the file cache path based on the URL.

This does not ensure the file exists!
"""
key = CacheController.cache_url(url)
key = CacheController.cache_url(url, sort_query=sort_query)
return filecache._fn(key)
25 changes: 17 additions & 8 deletions cachecontrol/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,15 @@ class CacheController(object):
"""An interface to see if request should cached or not.
"""
def __init__(self, cache=None, cache_etags=True, serializer=None,
status_codes=None):
status_codes=None, sort_query=False):
self.cache = cache or DictCache()
self.cache_etags = cache_etags
self.serializer = serializer or Serializer()
self.cacheable_status_codes = status_codes or (200, 203, 300, 301)
self.sort_query = sort_query

@classmethod
def _urlnorm(cls, uri):
def _urlnorm(cls, uri, sort_query=False):
"""Normalize the URL to create a safe key for the cache"""
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
Expand All @@ -50,6 +51,14 @@ def _urlnorm(cls, uri):
if not path:
path = "/"

# Sorting the query might induce behavior changes in the response.
# Use with care. However, assuming param randomization, a query with
# four params has a 96% chance of missing the cache on the second
# request if a response has already been recorded. The chance of a
# hit grows to 50% after a dozen requests.
if query and sort_query:
query = '&'.join(sorted(query.split('&')))

# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
Expand All @@ -58,8 +67,8 @@ def _urlnorm(cls, uri):
return defrag_uri

@classmethod
def cache_url(cls, uri):
return cls._urlnorm(uri)
def cache_url(cls, uri, sort_query=False):
return cls._urlnorm(uri, sort_query=sort_query)

def parse_cache_control(self, headers):
"""
Expand Down Expand Up @@ -90,7 +99,7 @@ def cached_request(self, request):
Return a cached response if it exists in the cache, otherwise
return False.
"""
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
logger.debug('Looking up "%s" in the cache', cache_url)
cc = self.parse_cache_control(request.headers)

Expand Down Expand Up @@ -207,7 +216,7 @@ def cached_request(self, request):
return False

def conditional_headers(self, request):
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
resp = self.serializer.loads(request, self.cache.get(cache_url))
new_headers = {}

Expand Down Expand Up @@ -255,7 +264,7 @@ def cache_response(self, request, response, body=None,
cc_req = self.parse_cache_control(request.headers)
cc = self.parse_cache_control(response_headers)

cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)
logger.debug('Updating cache with response from "%s"', cache_url)

# Delete it from the cache if we happen to have it stored there
Expand Down Expand Up @@ -317,7 +326,7 @@ def update_cached_response(self, request, response):
This should only ever be called when we've sent an ETag and
gotten a 304 as the response.
"""
cache_url = self.cache_url(request.url)
cache_url = self.cache_url(request.url, sort_query=self.sort_query)

cached_response = self.serializer.loads(
request,
Expand Down
6 changes: 4 additions & 2 deletions cachecontrol/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def CacheControl(sess,
heuristic=None,
controller_class=None,
adapter_class=None,
cacheable_methods=None):
cacheable_methods=None,
sort_query=False):

cache = cache or DictCache()
adapter_class = adapter_class or CacheControlAdapter
Expand All @@ -19,7 +20,8 @@ def CacheControl(sess,
serializer=serializer,
heuristic=heuristic,
controller_class=controller_class,
cacheable_methods=cacheable_methods
cacheable_methods=cacheable_methods,
sort_query=sort_query
)
sess.mount('http://', adapter)
sess.mount('https://', adapter)
Expand Down
32 changes: 32 additions & 0 deletions docs/tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,42 @@ If you are caching requests that use a large number of query string
parameters, consider sorting them to ensure that the request is
properly cached.

.. note::

Assuming random parameter order, a request with four parameters
has a >95% chance of missing the cache on a second request, because
the hit probability shrinks with the number of permutations (a 1/24
chance of a hit for four parameters, 1/120 for five, etc.). However, this problem doesn't
arise while the order stays the same. So if the same dict is used
as params many times it'll likely hit the cache. FileCache suffers
more from this when used on different Python runs, as this makes
order more random.

Requests supports passing both dictionaries and lists of tuples as the
param argument in a request. For example: ::

requests.get(url, params=sorted([('foo', 'one'), ('bar', 'two')]))

By ordering your params, you can be sure the cache key will be
consistent across requests and you are caching effectively.

For convenience, the sorting of query parameters can be done in
cachecontrol itself. For example: ::

# Unsorted
sess = cachecontrol.CacheControl(requests.Session(), sort_query=False,
heuristic=ExpiresAfter(days=1))
params = [('a', 'b'), ('c', 'd')]
resp = sess.get('http://www.reddit.com', params=params)
print(resp.from_cache) # False (first fetch)
resp = sess.get('http://www.reddit.com', params=reversed(params))
print(resp.from_cache) # False (!)

sess = cachecontrol.CacheControl(requests.Session(), sort_query=True,
heuristic=ExpiresAfter(days=1))
params = [('a', 'b'), ('c', 'd')]
resp = sess.get('http://www.reddit.com', params=params)
print(resp.from_cache) # False (first fetch)
resp = sess.get('http://www.reddit.com', params=reversed(params))
print(resp.from_cache) # True :)

7 changes: 7 additions & 0 deletions tests/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,10 @@ def test_close(self):

sess.close()
assert cache.close.called

def test_cache_control_sets_sort_query():
    """CacheControl() must propagate sort_query to both the mounted
    adapter and its controller, for either boolean value."""
    for flag in (True, False):
        wrapped = CacheControl(Session(), mock.Mock(spec=DictCache),
                               sort_query=flag)
        http_adapter = wrapped.adapters['http://']
        assert http_adapter.sort_query == flag
        assert http_adapter.controller.sort_query == flag
29 changes: 29 additions & 0 deletions tests/test_cache_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import pytest
from mock import ANY, Mock
import time
from random import shuffle
import string

from cachecontrol import CacheController
from cachecontrol.cache import DictCache
Expand Down Expand Up @@ -225,3 +227,30 @@ def test_cached_request_with_bad_max_age_headers_not_returned(self):
self.c.cache = DictCache({self.url: resp})

assert not self.req({})

def test_cache_url_sorting():
    """cache_url(sort_query=True) must give order-independent cache keys."""
    # Append 23 extra params (d=4 .. z=26) so a shuffled query is
    # virtually guaranteed to differ from the sorted one.
    pairs = list(enumerate(string.ascii_lowercase[3:], start=4))
    suff = '&' + '&'.join('%s=%s' % (letter, num) for num, letter in pairs)

    def get_param(url):
        """Mock losing order when processing params"""
        shuffle(pairs)
        reordered = {letter: num for num, letter in pairs}
        stripped = url.replace(suff, '')
        tail = '&' + '&'.join('%s=%s' % kv for kv in reordered.items())
        return stripped + tail

    no_query = 'http://example.com'
    unsorted_query = 'http://example.com?b=2&c=3&a=1' + suff
    sorted_query = 'http://example.com?a=1&b=2&c=3' + suff

    cache_url = CacheController.cache_url

    # Without a query string, sort_query must make no difference.
    assert cache_url(no_query, sort_query=True) == cache_url(no_query)

    # Param order changes the key unless sorting is enabled.
    assert cache_url(unsorted_query) != cache_url(sorted_query)
    assert cache_url(unsorted_query, True) == cache_url(sorted_query)

    randomized = get_param(unsorted_query)
    assert randomized != unsorted_query
    assert cache_url(randomized) != cache_url(sorted_query)
    assert cache_url(randomized, True) == cache_url(sorted_query)

    # Two independently shuffled queries collapse onto one sorted key.
    randomized_again = get_param(unsorted_query)
    assert cache_url(randomized, True) == cache_url(randomized_again, True)
59 changes: 58 additions & 1 deletion tests/test_storage_filecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import os
import string

from random import randint, sample
from random import randint, sample, shuffle

import pytest
import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
from cachecontrol.caches.file_cache import url_to_file_path
from lockfile import LockFile
from lockfile.mkdirlockfile import MkdirLockFile

Expand Down Expand Up @@ -110,3 +111,59 @@ def test_lock_class(self, tmpdir):
lock_class = object()
cache = FileCache(str(tmpdir), lock_class=lock_class)
assert cache.lock_class is lock_class

def test_url_to_file_path(self, tmpdir):
    """url_to_file_path(sort_query=True) must map order-permuted queries
    onto the same cache file path, and must be a no-op for URLs whose
    query is already sorted (or absent).

    NOTE(review): method of the enclosing test class; ``tmpdir`` is
    presumably the pytest fixture giving a fresh FileCache directory.
    """
    cache = FileCache(str(tmpdir))
    # We'll add a long sorted suffix so that unsorted queries have a low
    # collision probability (about 3.8e-23 for each sorted/unsorted
    # comparison).
    # (ascii_lowercase[3:] is d..z -> 23 extra params; 1/23! ~= 3.9e-23
    # chance a shuffle reproduces the original order.)
    letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4))
    suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers)

    def get_param(url):
        """Mock losing order when processing params"""
        # Shuffle in place, strip the known suffix, then re-append it in
        # the new order: same params as `url`, randomized order.
        shuffle(letter_n_numbers)
        params = {k: v for v, k in letter_n_numbers}
        url = url.replace(suff, '')
        query = '&' + '&'.join('%s=%s' % item for item in params.items())
        return url + query

    # unsorted_* / sorted_* pairs cover plain, empty-value, and
    # repeated-key queries; every URL carries the long sorted suffix.
    urls = {
        'no_query': 'http://example.com',
        'unsorted_query': 'http://example.com?b=2&c=3&a=1' + suff,
        'sorted_query': 'http://example.com?a=1&b=2&c=3' + suff,
        'unsorted_empty_value': 'http://example.com?b=&c=&a=1' + suff,
        'sorted_empty_value': 'http://example.com?a=1&b=&c=3' + suff,
        'unsorted_repeated_key': 'http://example.com?b=2&c=3&b=0'
                                 '&c=5&a=1' + suff,
        'sorted_repeated_key': 'http://example.com?a=1&b=0&b=2&c=3&'
                               'c=5' + suff}

    # A URL with no query string hashes identically either way.
    unoquery = url_to_file_path(urls['no_query'], cache, sort_query=False)
    snoquery = url_to_file_path(urls['no_query'], cache, sort_query=True)
    assert unoquery == snoquery
    urls.pop('no_query')

    sortedpaths = {urlname: url_to_file_path(urlvalue, cache, True) for
                   urlname, urlvalue in urls.items()}

    # Sorting is deterministic: a repeated call yields the same path.
    for key, value in urls.items():
        if key.startswith('sorted'):
            assert url_to_file_path(value, cache, True) == sortedpaths[key]

    unsortedpaths = {urlname: url_to_file_path(urlvalue, cache, False) for
                     urlname, urlvalue in urls.items()}

    for key, url in urls.items():
        # sort_query only changes the path when the query was unsorted.
        if key.startswith('unsorted'):
            assert sortedpaths[key] != unsortedpaths[key]
        else:
            assert sortedpaths[key] == unsortedpaths[key]

        # Randomize and sort params
        sort_param = url_to_file_path(get_param(url), cache, True)
        assert sort_param == sortedpaths[key]

        # Randomize but don't sort params
        unsort_param = url_to_file_path(get_param(url), cache, False)
        assert unsort_param != unsortedpaths[key]