From ed1e37fd0109847353c46fecdb6ea9047b73fc4b Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 26 Mar 2024 16:01:16 -0700 Subject: [PATCH 1/5] Use QSBR to avoid locking in PyType_IsSubtype --- Include/internal/pycore_pyatomic_ft_wrappers.h | 3 +++ Objects/typeobject.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_pyatomic_ft_wrappers.h b/Include/internal/pycore_pyatomic_ft_wrappers.h index e441600d54e1aa..969cda7514c1d2 100644 --- a/Include/internal/pycore_pyatomic_ft_wrappers.h +++ b/Include/internal/pycore_pyatomic_ft_wrappers.h @@ -23,6 +23,8 @@ extern "C" { #define FT_ATOMIC_LOAD_SSIZE(value) _Py_atomic_load_ssize(&value) #define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) \ _Py_atomic_load_ssize_relaxed(&value) +#define FT_ATOMIC_STORE_PTR(value, new_value) \ + _Py_atomic_store_ptr(&value, new_value) #define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) \ _Py_atomic_store_ptr_relaxed(&value, new_value) #define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) \ @@ -32,6 +34,7 @@ extern "C" { #else #define FT_ATOMIC_LOAD_SSIZE(value) value #define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) value +#define FT_ATOMIC_STORE_PTR(value, new_value) value = new_value #define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) value = new_value #define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) value = new_value #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 82822784aaf407..be70f715b28976 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -2,6 +2,7 @@ #include "Python.h" #include "pycore_abstract.h" // _PySequence_IterSearch() +#include "pycore_pyatomic_ft_wrappers.h" #include "pycore_call.h" // _PyObject_VectorcallTstate() #include "pycore_code.h" // CO_FAST_FREE #include "pycore_dict.h" // _PyDict_KeysSize() @@ -404,7 +405,13 @@ set_tp_mro(PyTypeObject *self, PyObject *mro) /* Other checks are done via set_tp_bases. 
*/ _Py_SetImmortal(mro); } - self->tp_mro = mro; +#ifdef Py_GIL_DISABLED + if (self->tp_mro != NULL) { + // Allow concurrent reads from PyType_IsSubtype + _PyObject_GC_SET_SHARED(self->tp_mro); + } +#endif + FT_ATOMIC_STORE_PTR(self->tp_mro, mro); } static inline void @@ -2342,9 +2349,8 @@ int PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) { #ifdef Py_GIL_DISABLED - PyObject *mro = _PyType_GetMRO(a); + PyObject *mro = _Py_atomic_load_ptr(&a->tp_mro); int res = is_subtype_with_mro(mro, a, b); - Py_XDECREF(mro); return res; #else return is_subtype_with_mro(lookup_tp_mro(a), a, b); From ce38dbf8a592ae4541576496c608c7bac26779cd Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 27 Mar 2024 13:19:01 -0700 Subject: [PATCH 2/5] Make sure GC actually delays freeing --- Objects/typeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index be70f715b28976..0018eda5708f2c 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -408,7 +408,7 @@ set_tp_mro(PyTypeObject *self, PyObject *mro) #ifdef Py_GIL_DISABLED if (self->tp_mro != NULL) { // Allow concurrent reads from PyType_IsSubtype - _PyObject_GC_SET_SHARED(self->tp_mro); + _PyObject_GC_SET_SHARED_INLINE(self->tp_mro); } #endif FT_ATOMIC_STORE_PTR(self->tp_mro, mro); From 21b40de5d8698275a5134ccb778cc83806496666 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 27 Mar 2024 13:22:54 -0700 Subject: [PATCH 3/5] Refactor PyType_IsSubtype and _PyType_GetMRO --- Include/internal/pycore_pyatomic_ft_wrappers.h | 2 ++ Objects/typeobject.c | 13 +++---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Include/internal/pycore_pyatomic_ft_wrappers.h b/Include/internal/pycore_pyatomic_ft_wrappers.h index 969cda7514c1d2..2514f51f1b0086 100644 --- a/Include/internal/pycore_pyatomic_ft_wrappers.h +++ b/Include/internal/pycore_pyatomic_ft_wrappers.h @@ -20,6 +20,7 @@ extern "C" { #endif #ifdef Py_GIL_DISABLED +#define 
FT_ATOMIC_LOAD_PTR(value) _Py_atomic_load_ptr(&value) #define FT_ATOMIC_LOAD_SSIZE(value) _Py_atomic_load_ssize(&value) #define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) \ _Py_atomic_load_ssize_relaxed(&value) @@ -32,6 +33,7 @@ extern "C" { #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \ _Py_atomic_store_ssize_relaxed(&value, new_value) #else +#define FT_ATOMIC_LOAD_PTR(value) value #define FT_ATOMIC_LOAD_SSIZE(value) value #define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) value #define FT_ATOMIC_STORE_PTR(value, new_value) value = new_value diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 0018eda5708f2c..d7a7c330b5eaab 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -367,15 +367,14 @@ clear_tp_bases(PyTypeObject *self) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { - ASSERT_TYPE_LOCK_HELD(); - return self->tp_mro; + return FT_ATOMIC_LOAD_PTR(self->tp_mro); } PyObject * _PyType_GetMRO(PyTypeObject *self) { + PyObject *mro = lookup_tp_mro(self); #ifdef Py_GIL_DISABLED - PyObject *mro = _Py_atomic_load_ptr_relaxed(&self->tp_mro); if (mro == NULL) { return NULL; } @@ -389,7 +388,7 @@ _PyType_GetMRO(PyTypeObject *self) END_TYPE_LOCK() return mro; #else - return Py_XNewRef(lookup_tp_mro(self)); + return Py_XNewRef(mro); #endif } @@ -2348,13 +2347,7 @@ is_subtype_with_mro(PyObject *a_mro, PyTypeObject *a, PyTypeObject *b) int PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) { -#ifdef Py_GIL_DISABLED - PyObject *mro = _Py_atomic_load_ptr(&a->tp_mro); - int res = is_subtype_with_mro(mro, a, b); - return res; -#else return is_subtype_with_mro(lookup_tp_mro(a), a, b); -#endif } /* Routines to do a method lookup in the type without looking in the From 0ee1aa3a3c60af9e8af935dbd201b6a2b6fd569b Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 27 Mar 2024 17:08:23 -0700 Subject: [PATCH 4/5] Don't add tuples tagged for delayed reclamation to free lists --- Objects/tupleobject.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/Objects/tupleobject.c b/Objects/tupleobject.c index d9dc00da368a84..7fbe3fafe7f408 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -1161,6 +1161,13 @@ static inline int maybe_freelist_push(PyTupleObject *op) { #ifdef WITH_FREELISTS +#ifdef Py_GIL_DISABLED + if (_PyObject_GC_IS_SHARED_INLINE((PyObject *) op)) { + // There may still be threads that are concurrently reading from the + // tuple. + return 0; + } +#endif struct _Py_object_freelists *freelists = _Py_object_freelists_GET(); if (Py_SIZE(op) == 0) { return 0; From e5c7b59d2f79a90e365e833ab99c9a363014fe33 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 28 Mar 2024 09:26:13 -0700 Subject: [PATCH 5/5] Temporarily remove thread safety when performing PyType_IsSubtype --- Include/internal/pycore_pyatomic_ft_wrappers.h | 5 ----- Objects/tupleobject.c | 7 ------- Objects/typeobject.c | 18 ++++++------------ 3 files changed, 6 insertions(+), 24 deletions(-) diff --git a/Include/internal/pycore_pyatomic_ft_wrappers.h b/Include/internal/pycore_pyatomic_ft_wrappers.h index 2514f51f1b0086..e441600d54e1aa 100644 --- a/Include/internal/pycore_pyatomic_ft_wrappers.h +++ b/Include/internal/pycore_pyatomic_ft_wrappers.h @@ -20,12 +20,9 @@ extern "C" { #endif #ifdef Py_GIL_DISABLED -#define FT_ATOMIC_LOAD_PTR(value) _Py_atomic_load_ptr(&value) #define FT_ATOMIC_LOAD_SSIZE(value) _Py_atomic_load_ssize(&value) #define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) \ _Py_atomic_load_ssize_relaxed(&value) -#define FT_ATOMIC_STORE_PTR(value, new_value) \ - _Py_atomic_store_ptr(&value, new_value) #define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) \ _Py_atomic_store_ptr_relaxed(&value, new_value) #define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) \ @@ -33,10 +30,8 @@ extern "C" { #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \ _Py_atomic_store_ssize_relaxed(&value, new_value) #else -#define FT_ATOMIC_LOAD_PTR(value) value #define FT_ATOMIC_LOAD_SSIZE(value) value #define 
FT_ATOMIC_LOAD_SSIZE_RELAXED(value) value -#define FT_ATOMIC_STORE_PTR(value, new_value) value = new_value #define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) value = new_value #define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) value = new_value #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index 7fbe3fafe7f408..d9dc00da368a84 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -1161,13 +1161,6 @@ static inline int maybe_freelist_push(PyTupleObject *op) { #ifdef WITH_FREELISTS -#ifdef Py_GIL_DISABLED - if (_PyObject_GC_IS_SHARED_INLINE((PyObject *) op)) { - // There may still be threads that are concurrently reading from the - // tuple. - return 0; - } -#endif struct _Py_object_freelists *freelists = _Py_object_freelists_GET(); if (Py_SIZE(op) == 0) { return 0; diff --git a/Objects/typeobject.c b/Objects/typeobject.c index d7a7c330b5eaab..2ef79fbf17b329 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -2,7 +2,6 @@ #include "Python.h" #include "pycore_abstract.h" // _PySequence_IterSearch() -#include "pycore_pyatomic_ft_wrappers.h" #include "pycore_call.h" // _PyObject_VectorcallTstate() #include "pycore_code.h" // CO_FAST_FREE #include "pycore_dict.h" // _PyDict_KeysSize() @@ -367,14 +366,15 @@ clear_tp_bases(PyTypeObject *self) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { - return FT_ATOMIC_LOAD_PTR(self->tp_mro); + ASSERT_TYPE_LOCK_HELD(); + return self->tp_mro; } PyObject * _PyType_GetMRO(PyTypeObject *self) { - PyObject *mro = lookup_tp_mro(self); #ifdef Py_GIL_DISABLED + PyObject *mro = _Py_atomic_load_ptr_relaxed(&self->tp_mro); if (mro == NULL) { return NULL; } @@ -388,7 +388,7 @@ _PyType_GetMRO(PyTypeObject *self) END_TYPE_LOCK() return mro; #else - return Py_XNewRef(mro); + return Py_XNewRef(lookup_tp_mro(self)); #endif } @@ -404,13 +404,7 @@ set_tp_mro(PyTypeObject *self, PyObject *mro) /* Other checks are done via 
set_tp_bases. */ _Py_SetImmortal(mro); } -#ifdef Py_GIL_DISABLED - if (self->tp_mro != NULL) { - // Allow concurrent reads from PyType_IsSubtype - _PyObject_GC_SET_SHARED_INLINE(self->tp_mro); - } -#endif - FT_ATOMIC_STORE_PTR(self->tp_mro, mro); + self->tp_mro = mro; } static inline void @@ -2347,7 +2341,7 @@ is_subtype_with_mro(PyObject *a_mro, PyTypeObject *a, PyTypeObject *b) int PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) { - return is_subtype_with_mro(lookup_tp_mro(a), a, b); + return is_subtype_with_mro(a->tp_mro, a, b); } /* Routines to do a method lookup in the type without looking in the