Skip to content

bpo-46006: Move the interned strings and identifiers to _PyRuntimeState. #30131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Include/internal/pycore_global_objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ extern "C" {
_PyRuntime.global_objects.NAME
#define _Py_SINGLETON(NAME) \
_Py_GLOBAL_OBJECT(singletons.NAME)
#define _Py_CACHED_OBJECT(NAME) \
_Py_GLOBAL_OBJECT(cached.NAME)

struct _Py_global_objects {
struct {
Expand All @@ -54,7 +56,29 @@ struct _Py_global_objects {
* -_PY_NSMALLNEGINTS (inclusive) to _PY_NSMALLPOSINTS (exclusive).
*/
PyLongObject small_ints[_PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS];

/* Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() */
struct _Py_unicode_ids {
PyThread_type_lock lock;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this lock unnecessary? The GIL is held whenever an identifier is used, isn't it?

// next_index value must be preserved when Py_Initialize()/Py_Finalize()
// is called multiple times: see _PyUnicode_FromId() implementation.
Py_ssize_t next_index;

Py_ssize_t size;
PyObject **array;
} unicode_ids;
} singletons;
struct {
/* This dictionary holds all interned unicode strings. Note that references
to strings in this dictionary are *not* counted in the string's ob_refcnt.
When the interned string reaches a refcnt of 0 the string deallocation
function will delete the reference from this dictionary.

Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
PyObject *unicode_interned;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you use "string" or "str" rather than "unicode"? Python 2 is history 🙂

} cached;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please drop this struct.

};

#define _Py_global_objects_INIT { \
Expand Down
2 changes: 0 additions & 2 deletions Include/internal/pycore_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ typedef struct pyruntimestate {
void *open_code_userdata;
_Py_AuditHookEntry *audit_hook_head;

struct _Py_unicode_runtime_ids unicode_ids;

struct _Py_global_objects global_objects;
// If anything gets added after global_objects then
// _PyRuntimeState_reset() needs to get updated to clear it.
Expand Down
25 changes: 0 additions & 25 deletions Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,6 @@ extern void _PyUnicode_Fini(PyInterpreterState *);

/* other API */

struct _Py_unicode_runtime_ids {
PyThread_type_lock lock;
// next_index value must be preserved when Py_Initialize()/Py_Finalize()
// is called multiple times: see _PyUnicode_FromId() implementation.
Py_ssize_t next_index;
};

/* fs_codec.encoding is initialized to NULL.
Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
struct _Py_unicode_fs_codec {
Expand All @@ -35,31 +28,13 @@ struct _Py_unicode_fs_codec {
_Py_error_handler error_handler;
};

struct _Py_unicode_ids {
Py_ssize_t size;
PyObject **array;
};

struct _Py_unicode_state {
// The empty Unicode object is a singleton to improve performance.
PyObject *empty_string;
/* Single character Unicode strings in the Latin-1 range are being
shared as well. */
PyObject *latin1[256];
struct _Py_unicode_fs_codec fs_codec;

/* This dictionary holds all interned unicode strings. Note that references
to strings in this dictionary are *not* counted in the string's ob_refcnt.
When the interned string reaches a refcnt of 0 the string deallocation
function will delete the reference from this dictionary.

Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
PyObject *interned;

// Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
struct _Py_unicode_ids ids;
};

extern void _PyUnicode_ClearInterned(PyInterpreterState *);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Move the interned strings and Py_IDENTIFIER strings back to the
process-global runtime state instead of the per-interpreter state (at least
for now).
87 changes: 43 additions & 44 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ static int unicode_is_singleton(PyObject *unicode);
#endif


#define IDENTIFIERS _Py_SINGLETON(unicode_ids)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you drop the IDENTIFIERS and INTERNED macros? They impair readability.
You can leave _Py_SINGLETON as it conveys some meaning.

#define INTERNED _Py_CACHED_OBJECT(unicode_interned)

static struct _Py_unicode_state*
get_unicode_state(void)
{
Expand Down Expand Up @@ -1950,15 +1953,14 @@ unicode_dealloc(PyObject *unicode)

case SSTATE_INTERNED_MORTAL:
{
struct _Py_unicode_state *state = get_unicode_state();
/* Revive the dead object temporarily. PyDict_DelItem() removes two
references (key and value) which were ignored by
PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
to prevent calling unicode_dealloc() again. Adjust refcnt after
PyDict_DelItem(). */
assert(Py_REFCNT(unicode) == 0);
Py_SET_REFCNT(unicode, 3);
if (PyDict_DelItem(state->interned, unicode) != 0) {
if (PyDict_DelItem(INTERNED, unicode) != 0) {
_PyErr_WriteUnraisableMsg("deletion of interned string failed",
NULL);
}
Expand Down Expand Up @@ -2331,30 +2333,25 @@ PyUnicode_FromString(const char *u)
PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
struct _Py_unicode_ids *ids = &interp->unicode.ids;

Py_ssize_t index = _Py_atomic_size_get(&id->index);
if (index < 0) {
struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;

PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
PyThread_acquire_lock(IDENTIFIERS.lock, WAIT_LOCK);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drop this, and assert that the GIL is held?

// Check again to detect concurrent access. Another thread can have
// initialized the index while this thread waited for the lock.
index = _Py_atomic_size_get(&id->index);
if (index < 0) {
assert(rt_ids->next_index < PY_SSIZE_T_MAX);
index = rt_ids->next_index;
rt_ids->next_index++;
assert(IDENTIFIERS.next_index < PY_SSIZE_T_MAX);
index = IDENTIFIERS.next_index;
IDENTIFIERS.next_index++;
_Py_atomic_size_set(&id->index, index);
}
PyThread_release_lock(rt_ids->lock);
PyThread_release_lock(IDENTIFIERS.lock);
}
assert(index >= 0);

PyObject *obj;
if (index < ids->size) {
obj = ids->array[index];
if (index < IDENTIFIERS.size) {
obj = IDENTIFIERS.array[index];
if (obj) {
// Return a borrowed reference
return obj;
Expand All @@ -2368,38 +2365,40 @@ _PyUnicode_FromId(_Py_Identifier *id)
}
PyUnicode_InternInPlace(&obj);

if (index >= ids->size) {
if (index >= IDENTIFIERS.size) {
// Overallocate to reduce the number of realloc calls
Py_ssize_t new_size = Py_MAX(index * 2, 16);
Py_ssize_t item_size = sizeof(ids->array[0]);
PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
Py_ssize_t item_size = sizeof(IDENTIFIERS.array[0]);
PyObject **new_array = PyMem_Realloc(IDENTIFIERS.array, new_size * item_size);
if (new_array == NULL) {
PyErr_NoMemory();
return NULL;
}
memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
ids->array = new_array;
ids->size = new_size;
memset(&new_array[IDENTIFIERS.size], 0, (new_size - IDENTIFIERS.size) * item_size);
IDENTIFIERS.array = new_array;
IDENTIFIERS.size = new_size;
}

// The array stores a strong reference
ids->array[index] = obj;
IDENTIFIERS.array[index] = obj;

// Return a borrowed reference
return obj;
}


static void
unicode_clear_identifiers(struct _Py_unicode_state *state)
unicode_clear_identifiers(PyInterpreterState *interp)
{
struct _Py_unicode_ids *ids = &state->ids;
for (Py_ssize_t i=0; i < ids->size; i++) {
Py_XDECREF(ids->array[i]);
if (!_Py_IsMainInterpreter(interp)) {
return;
}
for (Py_ssize_t i=0; i < IDENTIFIERS.size; i++) {
Py_XDECREF(IDENTIFIERS.array[i]);
}
ids->size = 0;
PyMem_Free(ids->array);
ids->array = NULL;
IDENTIFIERS.size = 0;
PyMem_Free(IDENTIFIERS.array);
IDENTIFIERS.array = NULL;
// Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
// after Py_Finalize().
}
Expand Down Expand Up @@ -15596,16 +15595,15 @@ PyUnicode_InternInPlace(PyObject **p)
return;
}

struct _Py_unicode_state *state = get_unicode_state();
if (state->interned == NULL) {
state->interned = PyDict_New();
if (state->interned == NULL) {
if (INTERNED == NULL) {
INTERNED = PyDict_New();
if (INTERNED == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
}

PyObject *t = PyDict_SetDefault(state->interned, s, s);
PyObject *t = PyDict_SetDefault(INTERNED, s, s);
if (t == NULL) {
PyErr_Clear();
return;
Expand Down Expand Up @@ -15658,25 +15656,27 @@ PyUnicode_InternFromString(const char *cp)
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
struct _Py_unicode_state *state = &interp->unicode;
if (state->interned == NULL) {
if (!_Py_IsMainInterpreter(interp)) {
return;
}
assert(PyDict_CheckExact(state->interned));
if (INTERNED == NULL) {
return;
}
assert(PyDict_CheckExact(INTERNED));

/* Interned unicode strings are not forcibly deallocated; rather, we give
them their stolen references back, and then clear and DECREF the
interned dict. */

#ifdef INTERNED_STATS
fprintf(stderr, "releasing %zd interned strings\n",
PyDict_GET_SIZE(state->interned));
PyDict_GET_SIZE(INTERNED));

Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
Py_ssize_t pos = 0;
PyObject *s, *ignored_value;
while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
while (PyDict_Next(INTERNED, &pos, &s, &ignored_value)) {
assert(PyUnicode_IS_READY(s));

switch (PyUnicode_CHECK_INTERNED(s)) {
Expand Down Expand Up @@ -15707,8 +15707,8 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
mortal_size, immortal_size);
#endif

PyDict_Clear(state->interned);
Py_CLEAR(state->interned);
PyDict_Clear(INTERNED);
Py_CLEAR(INTERNED);
}


Expand Down Expand Up @@ -16079,8 +16079,7 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
static inline int
unicode_is_finalizing(void)
{
struct _Py_unicode_state *state = get_unicode_state();
return (state->interned == NULL);
return (INTERNED == NULL);
}
#endif

Expand All @@ -16091,11 +16090,11 @@ _PyUnicode_Fini(PyInterpreterState *interp)
struct _Py_unicode_state *state = &interp->unicode;

// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
assert(state->interned == NULL);
assert(INTERNED == NULL || !_Py_IsMainInterpreter(interp));

_PyUnicode_FiniEncodings(&state->fs_codec);

unicode_clear_identifiers(state);
unicode_clear_identifiers(interp);

for (Py_ssize_t i = 0; i < 256; i++) {
Py_CLEAR(state->latin1[i]);
Expand Down
14 changes: 9 additions & 5 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,9 @@ init_runtime(_PyRuntimeState *runtime,
// Set it to the ID of the main thread of the main interpreter.
runtime->main_thread = PyThread_get_thread_ident();

runtime->unicode_ids.next_index = unicode_next_index;
runtime->unicode_ids.lock = unicode_ids_mutex;
struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids;
ids->next_index = unicode_next_index;
ids->lock = unicode_ids_mutex;

runtime->_initialized = 1;
}
Expand All @@ -137,7 +138,8 @@ _PyRuntimeState_Init(_PyRuntimeState *runtime)
_Py_AuditHookEntry *audit_hook_head = runtime->audit_hook_head;
// bpo-42882: Preserve next_index value if Py_Initialize()/Py_Finalize()
// is called multiple times.
Py_ssize_t unicode_next_index = runtime->unicode_ids.next_index;
struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids;
Py_ssize_t unicode_next_index = ids->next_index;

PyThread_type_lock lock1, lock2, lock3;
if (alloc_for_runtime(&lock1, &lock2, &lock3) != 0) {
Expand All @@ -164,7 +166,8 @@ _PyRuntimeState_Fini(_PyRuntimeState *runtime)

FREE_LOCK(runtime->interpreters.mutex);
FREE_LOCK(runtime->xidregistry.mutex);
FREE_LOCK(runtime->unicode_ids.lock);
struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids;
FREE_LOCK(ids->lock);

#undef FREE_LOCK
PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
Expand All @@ -186,7 +189,8 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime)

int reinit_interp = _PyThread_at_fork_reinit(&runtime->interpreters.mutex);
int reinit_xidregistry = _PyThread_at_fork_reinit(&runtime->xidregistry.mutex);
int reinit_unicode_ids = _PyThread_at_fork_reinit(&runtime->unicode_ids.lock);
struct _Py_unicode_ids *ids = &runtime->global_objects.singletons.unicode_ids;
int reinit_unicode_ids = _PyThread_at_fork_reinit(&ids->lock);

PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);

Expand Down