Skip to content

Commit 5a63183

Browse files
committed
The default size of the re module's compiled regular expression cache has
been increased from 100 to 500 and the cache replacement policy has changed from simply clearing the entire cache on overflow to randomly forgetting 20% of the existing cached compiled regular expressions. This is a performance win for applications that use a lot of regular expressions and limits the impact of the performance hit anytime the cache is exceeded.
1 parent f5ae1ef commit 5a63183

File tree

3 files changed

+106
-3
lines changed

3 files changed

+106
-3
lines changed

Lib/re.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,40 @@ def escape(pattern):
254254

255255
_pattern_type = type(sre_compile.compile("", 0))
256256

257-
_MAXCACHE = 100
257+
_MAXCACHE = 500
258+
259+
def _shrink_cache(cache_dict, max_length, divisor=5):
260+
"""Make room in the given cache.
261+
262+
Args:
263+
cache_dict: The cache dictionary to modify.
264+
max_length: Maximum # of entries in cache_dict before it is shrunk.
265+
divisor: Cache will shrink to max_length - 1/divisor*max_length items.
266+
"""
267+
# Toss out a fraction of the entries at random to make room for new ones.
268+
# A random algorithm was chosen as opposed to simply cache_dict.popitem()
269+
# as popitem could penalize the same regular expression repeatedly based
270+
# on its internal hash value. Being random should spread the cache miss
271+
# love around.
272+
cache_keys = tuple(cache_dict.keys())
273+
overage = len(cache_keys) - max_length
274+
if overage < 0:
275+
# Cache is already within limits. Normally this should not happen
276+
# but it could due to multithreading.
277+
return
278+
number_to_toss = max_length // divisor + overage
279+
# The import is done here to avoid a circular depencency.
280+
import random
281+
if not hasattr(random, 'sample'):
282+
# Do nothing while resolving the circular dependency:
283+
# re->random->warnings->tokenize->string->re
284+
return
285+
for doomed_key in random.sample(cache_keys, number_to_toss):
286+
try:
287+
del cache_dict[doomed_key]
288+
except KeyError:
289+
# Ignore problems if the cache changed from another thread.
290+
pass
258291

259292
def _compile(*key):
260293
# internal: compile pattern
@@ -272,7 +305,7 @@ def _compile(*key):
272305
raise TypeError("first argument must be string or compiled pattern")
273306
p = sre_compile.compile(pattern, flags)
274307
if len(_cache) >= _MAXCACHE:
275-
_cache.clear()
308+
_shrink_cache(_cache, _MAXCACHE)
276309
_cache[cachekey] = p
277310
return p
278311

@@ -284,7 +317,7 @@ def _compile_repl(*key):
284317
repl, pattern = key
285318
p = sre_parse.parse_template(repl, pattern)
286319
if len(_cache_repl) >= _MAXCACHE:
287-
_cache_repl.clear()
320+
_shrink_cache(_cache_repl, _MAXCACHE)
288321
_cache_repl[key] = p
289322
return p
290323

Lib/test/test_re.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,8 +874,71 @@ def run_re_tests():
874874
if result is None:
875875
print('=== Fails on unicode-sensitive match', t)
876876

877+
878+
class ReCacheTests(unittest.TestCase):
    """Tests tied to the re._shrink_cache cache-eviction implementation."""

    def setUp(self):
        # Remember the module-wide limit so tearDown can put it back.
        self._orig_maxcache = re._MAXCACHE

    def tearDown(self):
        re._MAXCACHE = self._orig_maxcache

    def test_compile_cache_overflow(self):
        # NOTE: If a profiler or debugger is tracing code and compiling
        # regular expressions while tracing through this test... expect
        # the test to fail.  This test is not concurrency safe.

        # Use a small limit and build exactly that many distinct
        # one-character patterns, starting at 'a'.
        re._MAXCACHE = 20
        limit = re._MAXCACHE
        first_code = ord('a')
        patterns = tuple(chr(first_code + offset) for offset in range(limit))

        # Fill the compiled-pattern cache right up to the limit.
        re._cache.clear()
        for pattern in patterns:
            re._compile(pattern, 0)
        self.assertEqual(limit, len(re._cache))

        # Same for the replacement-template cache.
        re._cache_repl.clear()
        for pattern in patterns:
            re._compile_repl(pattern*2, pattern)
        self.assertEqual(limit, len(re._cache_repl))

        # One more entry overflows each cache: afterwards there must be
        # spare room, yet the cache must not have been wiped out entirely.
        re._compile('A', 0)
        compiled_count = len(re._cache)
        self.assertLess(compiled_count, limit)
        self.assertGreater(compiled_count, 1)

        re._compile_repl('A', 'A')
        template_count = len(re._cache_repl)
        self.assertLess(template_count, limit)
        self.assertGreater(template_count, 1)

    def test_shrink_cache_at_limit(self):
        entries = {n: n for n in range(6)}
        re._shrink_cache(entries, 6, divisor=3)
        self.assertEqual(4, len(entries))

    def test_shrink_cache_empty(self):
        entries = {}
        re._shrink_cache(entries, 6, divisor=3)
        # Nothing to evict; the call simply must not raise.
        self.assertEqual(0, len(entries))

    def test_shrink_cache_overflowing(self):
        entries = {n: n for n in range(6)}
        re._shrink_cache(entries, 4, divisor=2)
        # Started above the maximum, so it must end up below it.
        self.assertEqual(2, len(entries))

    def test_shrink_cache_underflow(self):
        entries = {n: n for n in range(6)}
        # Below the maximum already: nothing should be removed.
        re._shrink_cache(entries, 9, divisor=3)
        self.assertEqual(6, len(entries))
938+
877939
def test_main():
    """Run the ReTests and ReCacheTests suites, then run_re_tests()."""
    for test_case in (ReTests, ReCacheTests):
        run_unittest(test_case)
    run_re_tests()
880943

881944
if __name__ == "__main__":

Misc/NEWS

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,13 @@ C-API
473473
Library
474474
-------
475475

476+
- The default size of the re module's compiled regular expression cache has
477+
been increased from 100 to 500 and the cache replacement policy has changed
478+
from simply clearing the entire cache on overflow to randomly forgetting 20%
479+
of the existing cached compiled regular expressions. This is a performance
480+
win for applications that use a lot of regular expressions and limits the
481+
impact of the performance hit anytime the cache is exceeded.
482+
476483
- Issue #7113: Speed up loading in configparser. Patch by Łukasz Langa.
477484

478485
- Issue #9032: XML-RPC client retries the request on EPIPE error. The EPIPE

0 commit comments

Comments
 (0)