From e79afd9acda243ecbc6927a2d233cef8e9ffc164 Mon Sep 17 00:00:00 2001 From: Eric Snow Date: Wed, 15 Dec 2021 17:54:09 -0700 Subject: [PATCH 1/5] Move the unicode identifiers (_Py_Identifier) to _PyRuntimeState. --- Include/internal/pycore_global_objects.h | 11 ++++++ Include/internal/pycore_runtime.h | 2 - Include/internal/pycore_unicodeobject.h | 15 ------- Objects/unicodeobject.c | 50 +++++++++++------------- Python/pystate.c | 14 ++++--- 5 files changed, 43 insertions(+), 49 deletions(-) diff --git a/Include/internal/pycore_global_objects.h b/Include/internal/pycore_global_objects.h index 6cae3bca6be45a..0862238598b293 100644 --- a/Include/internal/pycore_global_objects.h +++ b/Include/internal/pycore_global_objects.h @@ -54,6 +54,17 @@ struct _Py_global_objects { * -_PY_NSMALLNEGINTS (inclusive) to _PY_NSMALLPOSINTS (exclusive). */ PyLongObject small_ints[_PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS]; + + /* Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() */ + struct _Py_unicode_ids { + PyThread_type_lock lock; + // next_index value must be preserved when Py_Initialize()/Py_Finalize() + // is called multiple times: see _PyUnicode_FromId() implementation. + Py_ssize_t next_index; + + Py_ssize_t size; + PyObject **array; + } unicode_ids; } singletons; }; diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h index 725c859ea7853d..d3b2b59b85e3b8 100644 --- a/Include/internal/pycore_runtime.h +++ b/Include/internal/pycore_runtime.h @@ -116,8 +116,6 @@ typedef struct pyruntimestate { void *open_code_userdata; _Py_AuditHookEntry *audit_hook_head; - struct _Py_unicode_runtime_ids unicode_ids; - struct _Py_global_objects global_objects; // If anything gets added after global_objects then // _PyRuntimeState_reset() needs to get updated to clear it. 
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index c50c42011a9349..1ad751d180a181 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -19,13 +19,6 @@ extern void _PyUnicode_Fini(PyInterpreterState *); /* other API */ -struct _Py_unicode_runtime_ids { - PyThread_type_lock lock; - // next_index value must be preserved when Py_Initialize()/Py_Finalize() - // is called multiple times: see _PyUnicode_FromId() implementation. - Py_ssize_t next_index; -}; - /* fs_codec.encoding is initialized to NULL. Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */ struct _Py_unicode_fs_codec { @@ -35,11 +28,6 @@ struct _Py_unicode_fs_codec { _Py_error_handler error_handler; }; -struct _Py_unicode_ids { - Py_ssize_t size; - PyObject **array; -}; - struct _Py_unicode_state { // The empty Unicode object is a singleton to improve performance. PyObject *empty_string; @@ -57,9 +45,6 @@ struct _Py_unicode_state { count of a string is: s->ob_refcnt + (s->state ? 
2 : 0) */ PyObject *interned; - - // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() - struct _Py_unicode_ids ids; }; extern void _PyUnicode_ClearInterned(PyInterpreterState *); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 14449bce70839f..fada8c0955ac4a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -233,6 +233,8 @@ static int unicode_is_singleton(PyObject *unicode); #endif +#define IDENTIFIERS _Py_SINGLETON(unicode_ids) + static struct _Py_unicode_state* get_unicode_state(void) { @@ -2331,30 +2333,25 @@ PyUnicode_FromString(const char *u) PyObject * _PyUnicode_FromId(_Py_Identifier *id) { - PyInterpreterState *interp = _PyInterpreterState_GET(); - struct _Py_unicode_ids *ids = &interp->unicode.ids; - Py_ssize_t index = _Py_atomic_size_get(&id->index); if (index < 0) { - struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; - - PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); + PyThread_acquire_lock(IDENTIFIERS.lock, WAIT_LOCK); // Check again to detect concurrent access. Another thread can have // initialized the index while this thread waited for the lock. 
index = _Py_atomic_size_get(&id->index); if (index < 0) { - assert(rt_ids->next_index < PY_SSIZE_T_MAX); - index = rt_ids->next_index; - rt_ids->next_index++; + assert(IDENTIFIERS.next_index < PY_SSIZE_T_MAX); + index = IDENTIFIERS.next_index; + IDENTIFIERS.next_index++; _Py_atomic_size_set(&id->index, index); } - PyThread_release_lock(rt_ids->lock); + PyThread_release_lock(IDENTIFIERS.lock); } assert(index >= 0); PyObject *obj; - if (index < ids->size) { - obj = ids->array[index]; + if (index < IDENTIFIERS.size) { + obj = IDENTIFIERS.array[index]; if (obj) { // Return a borrowed reference return obj; @@ -2368,22 +2365,22 @@ _PyUnicode_FromId(_Py_Identifier *id) } PyUnicode_InternInPlace(&obj); - if (index >= ids->size) { + if (index >= IDENTIFIERS.size) { // Overallocate to reduce the number of realloc Py_ssize_t new_size = Py_MAX(index * 2, 16); - Py_ssize_t item_size = sizeof(ids->array[0]); - PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); + Py_ssize_t item_size = sizeof(IDENTIFIERS.array[0]); + PyObject **new_array = PyMem_Realloc(IDENTIFIERS.array, new_size * item_size); if (new_array == NULL) { PyErr_NoMemory(); return NULL; } - memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); - ids->array = new_array; - ids->size = new_size; + memset(&new_array[IDENTIFIERS.size], 0, (new_size - IDENTIFIERS.size) * item_size); + IDENTIFIERS.array = new_array; + IDENTIFIERS.size = new_size; } // The array stores a strong reference - ids->array[index] = obj; + IDENTIFIERS.array[index] = obj; // Return a borrowed reference return obj; @@ -2391,15 +2388,14 @@ _PyUnicode_FromId(_Py_Identifier *id) static void -unicode_clear_identifiers(struct _Py_unicode_state *state) +unicode_clear_identifiers(void) { - struct _Py_unicode_ids *ids = &state->ids; - for (Py_ssize_t i=0; i < ids->size; i++) { - Py_XDECREF(ids->array[i]); + for (Py_ssize_t i=0; i < IDENTIFIERS.size; i++) { + Py_XDECREF(IDENTIFIERS.array[i]); } - ids->size = 0; - 
PyMem_Free(ids->array); - ids->array = NULL; + IDENTIFIERS.size = 0; + PyMem_Free(IDENTIFIERS.array); + IDENTIFIERS.array = NULL; // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid // after Py_Finalize(). } @@ -16095,7 +16091,7 @@ _PyUnicode_Fini(PyInterpreterState *interp) _PyUnicode_FiniEncodings(&state->fs_codec); - unicode_clear_identifiers(state); + unicode_clear_identifiers(); for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(state->latin1[i]); diff --git a/Python/pystate.c b/Python/pystate.c index 463b248f22336e..709c621087a937 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -120,8 +120,9 @@ init_runtime(_PyRuntimeState *runtime, // Set it to the ID of the main thread of the main interpreter. runtime->main_thread = PyThread_get_thread_ident(); - runtime->unicode_ids.next_index = unicode_next_index; - runtime->unicode_ids.lock = unicode_ids_mutex; + struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids; + ids->next_index = unicode_next_index; + ids->lock = unicode_ids_mutex; runtime->_initialized = 1; } @@ -137,7 +138,8 @@ _PyRuntimeState_Init(_PyRuntimeState *runtime) _Py_AuditHookEntry *audit_hook_head = runtime->audit_hook_head; // bpo-42882: Preserve next_index value if Py_Initialize()/Py_Finalize() // is called multiple times. 
- Py_ssize_t unicode_next_index = runtime->unicode_ids.next_index; + struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids; + Py_ssize_t unicode_next_index = ids->next_index; PyThread_type_lock lock1, lock2, lock3; if (alloc_for_runtime(&lock1, &lock2, &lock3) != 0) { @@ -164,7 +166,8 @@ _PyRuntimeState_Fini(_PyRuntimeState *runtime) FREE_LOCK(runtime->interpreters.mutex); FREE_LOCK(runtime->xidregistry.mutex); - FREE_LOCK(runtime->unicode_ids.lock); + struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids; + FREE_LOCK(ids->lock); #undef FREE_LOCK PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc); @@ -186,7 +189,8 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime) int reinit_interp = _PyThread_at_fork_reinit(&runtime->interpreters.mutex); int reinit_xidregistry = _PyThread_at_fork_reinit(&runtime->xidregistry.mutex); - int reinit_unicode_ids = _PyThread_at_fork_reinit(&runtime->unicode_ids.lock); + struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids; + int reinit_unicode_ids = _PyThread_at_fork_reinit(&ids->lock); PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc); From 7c6c441b294b312d38df68de330c14bd17b406e3 Mon Sep 17 00:00:00 2001 From: Eric Snow Date: Wed, 15 Dec 2021 18:02:45 -0700 Subject: [PATCH 2/5] Move the interned strings to _PyRuntimeState. 
--- Include/internal/pycore_global_objects.h | 13 ++++++++++ Include/internal/pycore_unicodeobject.h | 10 -------- Objects/unicodeobject.c | 31 +++++++++++------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Include/internal/pycore_global_objects.h b/Include/internal/pycore_global_objects.h index 0862238598b293..2122c9bf598716 100644 --- a/Include/internal/pycore_global_objects.h +++ b/Include/internal/pycore_global_objects.h @@ -45,6 +45,8 @@ extern "C" { _PyRuntime.global_objects.NAME #define _Py_SINGLETON(NAME) \ _Py_GLOBAL_OBJECT(singletons.NAME) +#define _Py_CACHED_OBJECT(NAME) \ + _Py_GLOBAL_OBJECT(cached.NAME) struct _Py_global_objects { struct { @@ -66,6 +68,17 @@ struct _Py_global_objects { PyObject **array; } unicode_ids; } singletons; + struct { + /* This dictionary holds all interned unicode strings. Note that references + to strings in this dictionary are *not* counted in the string's ob_refcnt. + When the interned string reaches a refcnt of 0 the string deallocation + function will delete the reference from this dictionary. + + Another way to look at this is that to say that the actual reference + count of a string is: s->ob_refcnt + (s->state ? 2 : 0) + */ + PyObject *unicode_interned; + } cached; }; #define _Py_global_objects_INIT { \ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 1ad751d180a181..7551fe5d468b40 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -35,16 +35,6 @@ struct _Py_unicode_state { shared as well. */ PyObject *latin1[256]; struct _Py_unicode_fs_codec fs_codec; - - /* This dictionary holds all interned unicode strings. Note that references - to strings in this dictionary are *not* counted in the string's ob_refcnt. - When the interned string reaches a refcnt of 0 the string deallocation - function will delete the reference from this dictionary. 
- - Another way to look at this is that to say that the actual reference - count of a string is: s->ob_refcnt + (s->state ? 2 : 0) - */ - PyObject *interned; }; extern void _PyUnicode_ClearInterned(PyInterpreterState *); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fada8c0955ac4a..8ddfa9e502b576 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -234,6 +234,7 @@ static int unicode_is_singleton(PyObject *unicode); #define IDENTIFIERS _Py_SINGLETON(unicode_ids) +#define INTERNED _Py_CACHED_OBJECT(unicode_interned) static struct _Py_unicode_state* get_unicode_state(void) @@ -1952,7 +1953,6 @@ unicode_dealloc(PyObject *unicode) case SSTATE_INTERNED_MORTAL: { - struct _Py_unicode_state *state = get_unicode_state(); /* Revive the dead object temporarily. PyDict_DelItem() removes two references (key and value) which were ignored by PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2 @@ -1960,7 +1960,7 @@ unicode_dealloc(PyObject *unicode) PyDict_DelItem(). 
*/ assert(Py_REFCNT(unicode) == 0); Py_SET_REFCNT(unicode, 3); - if (PyDict_DelItem(state->interned, unicode) != 0) { + if (PyDict_DelItem(INTERNED, unicode) != 0) { _PyErr_WriteUnraisableMsg("deletion of interned string failed", NULL); } @@ -15592,16 +15592,15 @@ PyUnicode_InternInPlace(PyObject **p) return; } - struct _Py_unicode_state *state = get_unicode_state(); - if (state->interned == NULL) { - state->interned = PyDict_New(); - if (state->interned == NULL) { + if (INTERNED == NULL) { + INTERNED = PyDict_New(); + if (INTERNED == NULL) { PyErr_Clear(); /* Don't leave an exception */ return; } } - PyObject *t = PyDict_SetDefault(state->interned, s, s); + PyObject *t = PyDict_SetDefault(INTERNED, s, s); if (t == NULL) { PyErr_Clear(); return; @@ -15654,11 +15653,10 @@ PyUnicode_InternFromString(const char *cp) void _PyUnicode_ClearInterned(PyInterpreterState *interp) { - struct _Py_unicode_state *state = &interp->unicode; - if (state->interned == NULL) { + if (INTERNED == NULL) { return; } - assert(PyDict_CheckExact(state->interned)); + assert(PyDict_CheckExact(INTERNED)); /* Interned unicode strings are not forcibly deallocated; rather, we give them their stolen references back, and then clear and DECREF the @@ -15666,13 +15664,13 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp) #ifdef INTERNED_STATS fprintf(stderr, "releasing %zd interned strings\n", - PyDict_GET_SIZE(state->interned)); + PyDict_GET_SIZE(INTERNED)); Py_ssize_t immortal_size = 0, mortal_size = 0; #endif Py_ssize_t pos = 0; PyObject *s, *ignored_value; - while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) { + while (PyDict_Next(INTERNED, &pos, &s, &ignored_value)) { assert(PyUnicode_IS_READY(s)); switch (PyUnicode_CHECK_INTERNED(s)) { @@ -15703,8 +15701,8 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp) mortal_size, immortal_size); #endif - PyDict_Clear(state->interned); - Py_CLEAR(state->interned); + PyDict_Clear(INTERNED); + Py_CLEAR(INTERNED); } @@ -16075,8 +16073,7 
@@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) static inline int unicode_is_finalizing(void) { - struct _Py_unicode_state *state = get_unicode_state(); - return (state->interned == NULL); + return (INTERNED == NULL); } #endif @@ -16087,7 +16084,7 @@ _PyUnicode_Fini(PyInterpreterState *interp) struct _Py_unicode_state *state = &interp->unicode; // _PyUnicode_ClearInterned() must be called before - assert(state->interned == NULL); + assert(INTERNED == NULL); _PyUnicode_FiniEncodings(&state->fs_codec); From 58d9c0beef998ce8f320ed33700e8a54222e9f44 Mon Sep 17 00:00:00 2001 From: Eric Snow Date: Thu, 16 Dec 2021 17:22:23 -0700 Subject: [PATCH 3/5] Only clean up if it's the main interpreter. --- Objects/unicodeobject.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8ddfa9e502b576..c2045e37515542 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2388,8 +2388,11 @@ _PyUnicode_FromId(_Py_Identifier *id) static void -unicode_clear_identifiers(void) +unicode_clear_identifiers(PyInterpreterState *interp) { + if (!_Py_IsMainInterpreter(interp)) { + return; + } for (Py_ssize_t i=0; i < IDENTIFIERS.size; i++) { Py_XDECREF(IDENTIFIERS.array[i]); } @@ -15653,6 +15656,9 @@ PyUnicode_InternFromString(const char *cp) void _PyUnicode_ClearInterned(PyInterpreterState *interp) { + if (!_Py_IsMainInterpreter(interp)) { + return; + } if (INTERNED == NULL) { return; } @@ -16088,7 +16094,7 @@ _PyUnicode_Fini(PyInterpreterState *interp) _PyUnicode_FiniEncodings(&state->fs_codec); - unicode_clear_identifiers(); + unicode_clear_identifiers(interp); for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(state->latin1[i]); From 01f19dc3582f4cffb382a4c495ca34412796176b Mon Sep 17 00:00:00 2001 From: Eric Snow Date: Thu, 16 Dec 2021 17:26:26 -0700 Subject: [PATCH 4/5] Add a NEWS entry. 
--- .../Core and Builtins/2021-12-16-17-26-17.bpo-46006.vAP3Et.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-12-16-17-26-17.bpo-46006.vAP3Et.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-16-17-26-17.bpo-46006.vAP3Et.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-17-26-17.bpo-46006.vAP3Et.rst new file mode 100644 index 00000000000000..702ca75982699c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-17-26-17.bpo-46006.vAP3Et.rst @@ -0,0 +1,3 @@ +Move the interned strings and Py_IDENTIFIER strings back to the +process-global runtime state instead of the per-interpreter state (at least +for now). From b60bd334c908cdedd11270775f24ae904d9decde Mon Sep 17 00:00:00 2001 From: Eric Snow Date: Fri, 17 Dec 2021 09:51:34 -0700 Subject: [PATCH 5/5] Only expect NULL if the main interpreter. --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c2045e37515542..12219eff0a6d86 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -16090,7 +16090,7 @@ _PyUnicode_Fini(PyInterpreterState *interp) struct _Py_unicode_state *state = &interp->unicode; // _PyUnicode_ClearInterned() must be called before - assert(INTERNED == NULL); + assert(INTERNED == NULL || !_Py_IsMainInterpreter(interp)); _PyUnicode_FiniEncodings(&state->fs_codec);