From 2a918e4dd7baff17b01930fb813538a93dc5a701 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 7 Aug 2022 03:53:15 +0100
Subject: [PATCH 01/47] Allow Linux perf profiler to see Python calls

---
 Include/cpython/initconfig.h                  |   1 +
 Include/internal/pycore_ceval.h               |   1 +
 Makefile.pre.in                               |   7 +-
 ...2-08-20-18-36-40.gh-issue-96143.nh3GFM.rst |   5 +
 Objects/asm_trampoline.sx                     |  21 +++
 Objects/perf_trampoline.c                     | 165 ++++++++++++++++++
 PCbuild/_freeze_module.vcxproj                |   1 +
 PCbuild/_freeze_module.vcxproj.filters        |   3 +
 PCbuild/pythoncore.vcxproj                    |   1 +
 PCbuild/pythoncore.vcxproj.filters            |   3 +
 Python/clinic/sysmodule.c.h                   |  38 +++-
 Python/initconfig.c                           |  24 +++
 Python/pylifecycle.c                          |   5 +
 Python/sysmodule.c                            |  38 ++++
 configure                                     |  22 +++
 configure.ac                                  |  14 ++
 pyconfig.h.in                                 |   3 +
 17 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
 create mode 100644 Objects/asm_trampoline.sx
 create mode 100644 Objects/perf_trampoline.c

diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h
index 3b6d59389f26b9..c6057a4c3ed945 100644
--- a/Include/cpython/initconfig.h
+++ b/Include/cpython/initconfig.h
@@ -142,6 +142,7 @@ typedef struct PyConfig {
     unsigned long hash_seed;
     int faulthandler;
     int tracemalloc;
+    int perf_profiling;
     int import_time;
     int code_debug_ranges;
     int show_ref_count;
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 1b999301938c59..5a658aa17027a3 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -65,6 +65,7 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
     PyThreadState *tstate,
     PyObject *globals);
 
+extern int _PyPerfTrampoline_Init(int activate);
 
 static inline PyObject*
 _PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 414e6045b4d69e..587c421a4dcf8a 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -475,7 +475,9 @@ OBJECT_OBJS=	\
 		Objects/unicodeobject.o \
 		Objects/unicodectype.o \
 		Objects/unionobject.o \
-		Objects/weakrefobject.o
+		Objects/weakrefobject.o \
+		Objects/perf_trampoline.o \
+		@PERF_TRAMPOLINE_OBJ@
 
 DEEPFREEZE_OBJS = Python/deepfreeze/deepfreeze.o
 
@@ -2318,6 +2320,9 @@ config.status:	$(srcdir)/configure
 
 .PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre
 
+Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.sx
+	$(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
+
 # Some make's put the object file in the current directory
 .c.o:
 	$(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
new file mode 100644
index 00000000000000..66bd70536a669c
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
@@ -0,0 +1,5 @@
+Add a new ``-X perf`` Python command line option as well as
+:func:`sys._activate_perf_trampoline` and
+:func:`sys._deactivate_perf_trampoline` function in the :mod:`sys` module
+that allows to set/unset the interpreter in a way that the Linux ``perf``
+profiler can detect Python calls. Patch by Pablo Galindo.
diff --git a/Objects/asm_trampoline.sx b/Objects/asm_trampoline.sx
new file mode 100644
index 00000000000000..74ebc9384b7095
--- /dev/null
+++ b/Objects/asm_trampoline.sx
@@ -0,0 +1,21 @@
+    .text
+    .globl	_Py_trampoline_func_start
+_Py_trampoline_func_start:
+#ifdef __x86_64__
+    push   %rbp
+    mov    %rsp,%rbp
+    mov    %rdi,%rax
+    mov    %rsi,%rdi
+    mov    %rdx,%rsi
+    mov    %ecx,%edx
+    call   *%rax
+    pop    %rbp
+    ret
+#endif // __x86_64__
+#ifdef __aarch64__
+    TODO
+#endif
+    .globl	_Py_trampoline_func_end
+_Py_trampoline_func_end:
+    .section        .note.GNU-stack,"",@progbits
+
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
new file mode 100644
index 00000000000000..e23e06b658d89a
--- /dev/null
+++ b/Objects/perf_trampoline.c
@@ -0,0 +1,165 @@
+#include "Python.h"
+#include "pycore_ceval.h"
+#include "pycore_frame.h"
+#include "pycore_interp.h"
+
+#ifdef HAVE_PERF_TRAMPOLINE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
+                                  int throwflag);
+typedef PyObject *(*py_trampoline)(py_evaluator, PyThreadState *,
+                                   _PyInterpreterFrame *, int throwflag);
+extern void *_Py_trampoline_func_start;
+extern void *_Py_trampoline_func_end;
+
+typedef struct {
+    char *start_addr;
+    char *current_addr;
+    size_t size;
+    size_t size_left;
+    size_t code_size;
+} code_arena_t;
+
+static Py_ssize_t extra_code_index = -1;
+static code_arena_t code_arena;
+
+static int
+new_code_arena()
+{
+    size_t page_size = sysconf(_SC_PAGESIZE);
+    char *memory = mmap(NULL,  // address
+                        page_size, PROT_READ | PROT_WRITE | PROT_EXEC,
+                        MAP_PRIVATE | MAP_ANONYMOUS,
+                        -1,  // fd (not used here)
+                        0);  // offset (not used here)
+    if (!memory) {
+        Py_FatalError("Failed to allocate new code arena");
+        return -1;
+    }
+    void *start = &_Py_trampoline_func_start;
+    void *end = &_Py_trampoline_func_end;
+    size_t code_size = end - start;
+
+    long n_copies = page_size / code_size;
+    for (int i = 0; i < n_copies; i++) {
+        memcpy(memory + i * code_size, start, code_size * sizeof(char));
+    }
+
+    mprotect(memory, page_size, PROT_READ | PROT_EXEC);
+
+    code_arena.start_addr = memory;
+    code_arena.current_addr = memory;
+    code_arena.size = page_size;
+    code_arena.size_left = page_size;
+    code_arena.code_size = code_size;
+    return 0;
+}
+
+static inline py_trampoline
+code_arena_new_code(code_arena_t *code_arena)
+{
+    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
+    code_arena->size_left -= code_arena->code_size;
+    code_arena->current_addr += code_arena->code_size;
+    return trampoline;
+}
+
+static inline py_trampoline
+compile_trampoline(void)
+{
+    if (code_arena.size_left <= code_arena.code_size) {
+        if (new_code_arena() < 0) {
+            return NULL;
+        }
+    }
+
+    assert(code_arena.size_left <= code_arena.size);
+    return code_arena_new_code(&code_arena);
+}
+
+static inline FILE *
+perf_map_open(pid_t pid)
+{
+    char filename[100];
+    snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", pid);
+    FILE *res = fopen(filename, "a");
+    if (!res) {
+        _Py_FatalErrorFormat(__func__, "Couldn't open %s: errno(%d)", filename, errno);
+        return NULL;
+    }
+    return res;
+}
+
+static inline int
+perf_map_close(FILE *fp)
+{
+    if (fp) {
+        return fclose(fp);
+    }
+    return 0;
+}
+
+static void
+perf_map_write_entry(FILE *method_file, const void *code_addr,
+                     unsigned int code_size, const char *entry,
+                     const char *file)
+{
+    fprintf(method_file, "%lx %x py::%s:%s\n", (unsigned long)code_addr,
+            code_size, entry, file);
+}
+
+static PyObject *
+py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
+                        int throw)
+{
+    PyCodeObject *co = frame->f_code;
+    py_trampoline f = NULL;
+    _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
+    if (f == NULL) {
+        if (extra_code_index == -1) {
+            extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
+        }
+        py_trampoline new_trampoline = compile_trampoline();
+        if (new_trampoline == NULL) {
+            return NULL;
+        }
+        FILE *pfile = perf_map_open(getpid());
+        if (pfile == NULL) {
+            return NULL;
+        }
+        perf_map_write_entry(pfile, new_trampoline, code_arena.code_size,
+                             PyUnicode_AsUTF8(co->co_qualname),
+                             PyUnicode_AsUTF8(co->co_filename));
+        perf_map_close(pfile);
+        _PyCode_SetExtra((PyObject *)co, extra_code_index,
+                         (void *)new_trampoline);
+        f = new_trampoline;
+    }
+    assert(f != NULL);
+    return f(_PyEval_EvalFrameDefault, ts, frame, throw);
+}
+#endif
+
+int
+_PyPerfTrampoline_Init(int activate)
+{
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (!activate) {
+        tstate->interp->eval_frame = NULL;
+    }
+    else {
+#ifdef HAVE_PERF_TRAMPOLINE
+        tstate->interp->eval_frame = py_trampoline_evaluator;
+        if (new_code_arena() < 0) {
+            return -1;
+        }
+#endif
+    }
+    return 0;
+}
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index 5821c3d9e4d860..5d4a843465995e 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -129,6 +129,7 @@
     <ClCompile Include="..\Objects\cellobject.c" />
     <ClCompile Include="..\Objects\classobject.c" />
     <ClCompile Include="..\Objects\codeobject.c" />
+    <ClCompile Include="..\Objects\perf_trampoline.c" />
     <ClCompile Include="..\Objects\complexobject.c" />
     <ClCompile Include="..\Objects\descrobject.c" />
     <ClCompile Include="..\Objects\dictobject.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index b657f56e28f248..d061ef911339f8 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -85,6 +85,9 @@
     <ClCompile Include="..\Objects\codeobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Objects\perf_trampolie.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\Python\compile.c">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 3ff4be51872485..dcc214299426da 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -430,6 +430,7 @@
     <ClCompile Include="..\Objects\cellobject.c" />
     <ClCompile Include="..\Objects\classobject.c" />
     <ClCompile Include="..\Objects\codeobject.c" />
+    <ClCompile Include="..\Objects\perf_trampoline.c" />
     <ClCompile Include="..\Objects\complexobject.c" />
     <ClCompile Include="..\Objects\descrobject.c" />
     <ClCompile Include="..\Objects\dictobject.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 64d248dfafd81e..13c9ab549bbe18 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -926,6 +926,9 @@
     <ClCompile Include="..\Objects\codeobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
+    <ClCompile Include="..\Objects\perf_trampoline.c">
+      <Filter>Objects</Filter>
+    </ClCompile>
     <ClCompile Include="..\Objects\complexobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h
index beaf21c85bcff2..df3d412f01d94f 100644
--- a/Python/clinic/sysmodule.c.h
+++ b/Python/clinic/sysmodule.c.h
@@ -1127,6 +1127,42 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
 
 #endif /* defined(ANDROID_API_LEVEL) */
 
+PyDoc_STRVAR(sys__activate_perf_trampoline__doc__,
+"_activate_perf_trampoline($module, /)\n"
+"--\n"
+"\n"
+"Activate the perf profiler trampoline.");
+
+#define SYS__ACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
+    {"_activate_perf_trampoline", (PyCFunction)sys__activate_perf_trampoline, METH_NOARGS, sys__activate_perf_trampoline__doc__},
+
+static PyObject *
+sys__activate_perf_trampoline_impl(PyObject *module);
+
+static PyObject *
+sys__activate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return sys__activate_perf_trampoline_impl(module);
+}
+
+PyDoc_STRVAR(sys__deactivate_perf_trampoline__doc__,
+"_deactivate_perf_trampoline($module, /)\n"
+"--\n"
+"\n"
+"Activate the perf profiler trampoline.");
+
+#define SYS__DEACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
+    {"_deactivate_perf_trampoline", (PyCFunction)sys__deactivate_perf_trampoline, METH_NOARGS, sys__deactivate_perf_trampoline__doc__},
+
+static PyObject *
+sys__deactivate_perf_trampoline_impl(PyObject *module);
+
+static PyObject *
+sys__deactivate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return sys__deactivate_perf_trampoline_impl(module);
+}
+
 #ifndef SYS_GETWINDOWSVERSION_METHODDEF
     #define SYS_GETWINDOWSVERSION_METHODDEF
 #endif /* !defined(SYS_GETWINDOWSVERSION_METHODDEF) */
@@ -1170,4 +1206,4 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
 #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF
     #define SYS_GETANDROIDAPILEVEL_METHODDEF
 #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */
-/*[clinic end generated code: output=38446a4c76e2f3b6 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=639b32664762e04a input=a9049054013a1b77]*/
diff --git a/Python/initconfig.c b/Python/initconfig.c
index 70f0363297f3e0..f870d1a032794b 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -745,6 +745,7 @@ _PyConfig_InitCompatConfig(PyConfig *config)
     config->use_hash_seed = -1;
     config->faulthandler = -1;
     config->tracemalloc = -1;
+    config->perf_profiling = -1;
     config->module_search_paths_set = 0;
     config->parse_argv = 0;
     config->site_import = -1;
@@ -829,6 +830,7 @@ PyConfig_InitIsolatedConfig(PyConfig *config)
     config->use_hash_seed = 0;
     config->faulthandler = 0;
     config->tracemalloc = 0;
+    config->perf_profiling = 0;
     config->safe_path = 1;
     config->pathconfig_warnings = 0;
 #ifdef MS_WINDOWS
@@ -940,6 +942,7 @@ _PyConfig_Copy(PyConfig *config, const PyConfig *config2)
     COPY_ATTR(_install_importlib);
     COPY_ATTR(faulthandler);
     COPY_ATTR(tracemalloc);
+    COPY_ATTR(perf_profiling);
     COPY_ATTR(import_time);
     COPY_ATTR(code_debug_ranges);
     COPY_ATTR(show_ref_count);
@@ -1050,6 +1053,7 @@ _PyConfig_AsDict(const PyConfig *config)
     SET_ITEM_UINT(hash_seed);
     SET_ITEM_INT(faulthandler);
     SET_ITEM_INT(tracemalloc);
+    SET_ITEM_INT(perf_profiling);
     SET_ITEM_INT(import_time);
     SET_ITEM_INT(code_debug_ranges);
     SET_ITEM_INT(show_ref_count);
@@ -1331,6 +1335,7 @@ _PyConfig_FromDict(PyConfig *config, PyObject *dict)
     CHECK_VALUE("hash_seed", config->hash_seed <= MAX_HASH_SEED);
     GET_UINT(faulthandler);
     GET_UINT(tracemalloc);
+    GET_UINT(perf_profiling);
     GET_UINT(import_time);
     GET_UINT(code_debug_ranges);
     GET_UINT(show_ref_count);
@@ -1687,6 +1692,16 @@ config_read_env_vars(PyConfig *config)
     return _PyStatus_OK();
 }
 
+static PyStatus
+config_init_perf_profiling(PyConfig *config)
+{
+    const wchar_t *xoption = config_get_xoption(config, L"perf");
+    if (xoption) {
+        config->perf_profiling = 1;
+    }
+    return _PyStatus_OK();
+
+}
 
 static PyStatus
 config_init_tracemalloc(PyConfig *config)
@@ -1788,6 +1803,12 @@ config_read_complex_options(PyConfig *config)
             return status;
         }
     }
+    if (config->tracemalloc < 0) {
+        status = config_init_perf_profiling(config);
+        if (_PyStatus_EXCEPTION(status)) {
+            return status;
+        }
+    }
 
     if (config->pycache_prefix == NULL) {
         status = config_init_pycache_prefix(config);
@@ -2104,6 +2125,9 @@ config_read(PyConfig *config, int compute_path_config)
     if (config->tracemalloc < 0) {
         config->tracemalloc = 0;
     }
+    if (config->perf_profiling < 0) {
+        config->perf_profiling = 0;
+    }
     if (config->use_hash_seed < 0) {
         config->use_hash_seed = 0;
         config->hash_seed = 0;
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index bb646f1a0ee2d0..be9a9e1f2543ea 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1149,6 +1149,11 @@ init_interp_main(PyThreadState *tstate)
         if (_PyTraceMalloc_Init(config->tracemalloc) < 0) {
             return _PyStatus_ERR("can't initialize tracemalloc");
         }
+
+        if (_PyPerfTrampoline_Init(config->perf_profiling) < 0) {
+            return _PyStatus_ERR("can't initialize the perf trampoline");
+        }
+
     }
 
     status = init_sys_streams(tstate);
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index b8009b2db45f7b..9f35fdfda7b190 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1994,6 +1994,42 @@ sys_getandroidapilevel_impl(PyObject *module)
 }
 #endif   /* ANDROID_API_LEVEL */
 
+/*[clinic input]
+sys._activate_perf_trampoline
+
+Activate the perf profiler trampoline.
+[clinic start generated code]*/
+
+static PyObject *
+sys__activate_perf_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=248f6dc862887fd0 input=67667f43ffabb1e4]*/
+{
+    if  (_PyPerfTrampoline_Init(1) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/*[clinic input]
+sys._deactivate_perf_trampoline
+
+Activate the perf profiler trampoline.
+[clinic start generated code]*/
+
+static PyObject *
+sys__deactivate_perf_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=7dde745eb7ba5e54 input=3d4fbb4aef9ad3d8]*/
+{
+    if  (_PyPerfTrampoline_Init(0) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+
+
 static PyMethodDef sys_methods[] = {
     /* Might as well keep this in alphabetic order */
     SYS_ADDAUDITHOOK_METHODDEF
@@ -2047,6 +2083,8 @@ static PyMethodDef sys_methods[] = {
      METH_VARARGS | METH_KEYWORDS, set_asyncgen_hooks_doc},
     SYS_GET_ASYNCGEN_HOOKS_METHODDEF
     SYS_GETANDROIDAPILEVEL_METHODDEF
+    SYS__ACTIVATE_PERF_TRAMPOLINE_METHODDEF
+    SYS__DEACTIVATE_PERF_TRAMPOLINE_METHODDEF
     SYS_UNRAISABLEHOOK_METHODDEF
 #ifdef Py_STATS
     SYS__STATS_ON_METHODDEF
diff --git a/configure b/configure
index 1801f806ae137f..e2d18ad665f2d1 100755
--- a/configure
+++ b/configure
@@ -861,6 +861,7 @@ LIBEXPAT_CFLAGS
 TZPATH
 LIBUUID_LIBS
 LIBUUID_CFLAGS
+PERF_TRAMPOLINE_OBJ
 SHLIBS
 CFLAGSFORSHARED
 LINKFORSHARED
@@ -11418,6 +11419,27 @@ esac
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SHLIBS" >&5
 $as_echo "$SHLIBS" >&6; }
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking perf trampoline" >&5
+$as_echo_n "checking perf trampoline... " >&6; }
+case $host in #(
+  x86_64-*-linux-*) :
+    perf_trampoline=yes ;; #(
+  *) :
+    perf_trampoline=no
+ ;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $perf_trampoline" >&5
+$as_echo "$perf_trampoline" >&6; }
+
+if test "x$perf_trampoline" = xyes; then :
+
+
+$as_echo "#define HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
+
+  PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+
+fi
+
 
 # checks for libraries
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sendfile in -lsendfile" >&5
diff --git a/configure.ac b/configure.ac
index bb9fec07242f8e..86effabfc8c6e6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3425,6 +3425,20 @@ case "$ac_sys_system" in
 esac
 AC_MSG_RESULT($SHLIBS)
 
+dnl perf trampoline is Linux specific and requires an arch-specific
+dnl trampoline in asssembly.
+AC_MSG_CHECKING([perf trampoline])
+AS_CASE([$host],
+  [x86_64-*-linux-*], [perf_trampoline=yes],
+  [perf_trampoline=no]
+)
+AC_MSG_RESULT([$perf_trampoline])
+
+AS_VAR_IF([perf_trampoline], [yes], [
+  AC_DEFINE([HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
+  PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+])
+AC_SUBST([PERF_TRAMPOLINE_OBJ])
 
 # checks for libraries
 AC_CHECK_LIB(sendfile, sendfile)
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 10e7ad12fa982c..f826d8983ea2ae 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -872,6 +872,9 @@
 /* Define to 1 if you have the `pause' function. */
 #undef HAVE_PAUSE
 
+/* Define to 1 if you have the perf trampoline. */
+#undef HAVE_PERF_TRAMPOLINE
+
 /* Define to 1 if you have the `pipe' function. */
 #undef HAVE_PIPE
 

From cea1420b1a31d3b638d8bd2853f0453b094d3df0 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sat, 20 Aug 2022 19:30:07 +0100
Subject: [PATCH 02/47] Add perf profiler test; rename asm_trampoline.sx to .S

---
 Lib/test/test_perf_profiler.py                | 128 ++++++++++++++++++
 Makefile.pre.in                               |   2 +-
 .../{asm_trampoline.sx => asm_trampoline.S}   |   0
 3 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 Lib/test/test_perf_profiler.py
 rename Objects/{asm_trampoline.sx => asm_trampoline.S} (100%)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
new file mode 100644
index 00000000000000..0c96b3a4dfcf1b
--- /dev/null
+++ b/Lib/test/test_perf_profiler.py
@@ -0,0 +1,128 @@
+import unittest
+import subprocess
+import re
+import sys
+import sysconfig
+from test.support.script_helper import make_script
+from test.support.os_helper import temp_dir
+
+
+def get_perf_version():
+    try:
+        cmd = ["perf", "--version"]
+        proc = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
+        )
+        with proc:
+            version, stderr = proc.communicate()
+
+        if proc.returncode:
+            raise Exception(
+                f"Command {' '.join(cmd)!r} failed "
+                f"with exit code {proc.returncode}: "
+                f"stdout={version!r} stderr={stderr!r}"
+            )
+    except OSError:
+        raise unittest.SkipTest("Couldn't find perf on the path")
+
+    match = re.search(r"^perf version\s+(.*)", version)
+    if match is None:
+        raise Exception("unable to parse perf version: %r" % version)
+    return (version, match.group(1))
+
+
+_, version = get_perf_version()
+
+if not version:
+    raise unittest.SkipTest("Could not find valid perf tool")
+
+if "no-omit-frame-pointe" not in sysconfig.get_config_var("CFLAGS"):
+    raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
+
+
+def run_perf(cwd, *args, **env_vars):
+    if env_vars:
+        env = os.environ.copy()
+        env.update(env_vars)
+    else:
+        env = None
+    # -nx: Do not execute commands from any .gdbinit initialization files
+    #      (issue #22188)
+    output_file = cwd + "/perf_output.perf"
+    base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
+    proc = subprocess.Popen(
+        base_cmd + args,
+        # Redirect stdin to prevent GDB from messing with
+        # the terminal settings
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env=env,
+    )
+    with proc:
+        out, err = proc.communicate()
+    base_cmd = ("perf", "script")
+    proc = subprocess.Popen(
+        ("perf", "script", "-i", output_file),
+        # Redirect stdin to prevent GDB from messing with
+        # the terminal settings
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env=env,
+    )
+    with proc:
+        out, err = proc.communicate()
+    return out.decode("utf-8", "replace"), err.decode("utf-8", "replace")
+
+
+class TestPerfProfiler(unittest.TestCase):
+    def test_python_calls_appear_in_the_stack_if_perf_activated(self):
+        with temp_dir() as script_dir:
+            code = """if 1:
+                def foo(n):
+                    x = 0
+                    for i in range(n):
+                        x += i
+
+                def bar(n):
+                    foo(n)
+
+                def baz(n):
+                    bar(n)
+
+                baz(10000000)
+                """
+            script = make_script(script_dir, "perftest", code)
+            stdout, stderr = run_perf(script_dir, sys.executable, "-Xperf", script)
+            self.assertEqual(stderr, "")
+
+            self.assertIn(f"py::foo:{script}", stdout)
+            self.assertIn(f"py::bar:{script}", stdout)
+            self.assertIn(f"py::baz:{script}", stdout)
+
+    def test_python_calls_do_not_appear_in_the_stack_if_perf_activated(self):
+        with temp_dir() as script_dir:
+            code = """if 1:
+                def foo(n):
+                    x = 0
+                    for i in range(n):
+                        x += i
+
+                def bar(n):
+                    foo(n)
+
+                def baz(n):
+                    bar(n)
+
+                baz(10000000)
+                """
+            script = make_script(script_dir, "perftest", code)
+            stdout, stderr = run_perf(script_dir, sys.executable, script)
+            self.assertEqual(stderr, "")
+
+            self.assertNotIn(f"py::foo:{script}", stdout)
+            self.assertNotIn(f"py::bar:{script}", stdout)
+            self.assertNotIn(f"py::baz:{script}", stdout)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 587c421a4dcf8a..027aefcb61312d 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -2320,7 +2320,7 @@ config.status:	$(srcdir)/configure
 
 .PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre
 
-Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.sx
+Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.S
 	$(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
 
 # Some make's put the object file in the current directory
diff --git a/Objects/asm_trampoline.sx b/Objects/asm_trampoline.S
similarity index 100%
rename from Objects/asm_trampoline.sx
rename to Objects/asm_trampoline.S

From 4107c530242e61020c44f4b35841de95ac843a91 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Sat, 20 Aug 2022 19:31:40 +0100
Subject: [PATCH 03/47] Fix perf_trampoline.c typo in _freeze_module.vcxproj.filters

Co-authored-by: Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com>
---
 PCbuild/_freeze_module.vcxproj.filters | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index d061ef911339f8..b96b31fc557cd3 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -85,7 +85,7 @@
     <ClCompile Include="..\Objects\codeobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\Objects\perf_trampolie.c">
+    <ClCompile Include="..\Objects\perf_trampoline.c">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\Python\compile.c">

From 5e34e6617be4c010f969b397c170ce24d5182657 Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Sun, 21 Aug 2022 12:59:18 +0200
Subject: [PATCH 04/47] munmap pages on shutdown, keep FILE open

---
 Include/internal/pycore_ceval.h |   2 +
 Modules/posixmodule.c           |   5 ++
 Objects/perf_trampoline.c       | 100 ++++++++++++++++++++++++--------
 Python/pylifecycle.c            |   1 +
 4 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 5a658aa17027a3..8dea61ad0de9c7 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -66,6 +66,8 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
     PyObject *globals);
 
 extern int _PyPerfTrampoline_Init(int activate);
+extern int _PyPerfTrampoline_Fini(void);
+extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
 
 static inline PyObject*
 _PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
index d45fa231ae5e2a..3810bc87c1fbab 100644
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
@@ -606,6 +606,11 @@ PyOS_AfterFork_Child(void)
     }
     assert(_PyThreadState_GET() == tstate);
 
+    status = _PyPerfTrampoline_AfterFork_Child();
+    if (_PyStatus_EXCEPTION(status)) {
+        goto fatal_error;
+    }
+
     run_at_forkers(tstate->interp->after_forkers_child, 0);
     return;
 
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index e23e06b658d89a..b69e72cd05e4c5 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -18,23 +18,30 @@ typedef PyObject *(*py_trampoline)(py_evaluator, PyThreadState *,
 extern void *_Py_trampoline_func_start;
 extern void *_Py_trampoline_func_end;
 
-typedef struct {
+struct code_arena_st {
     char *start_addr;
     char *current_addr;
     size_t size;
     size_t size_left;
     size_t code_size;
-} code_arena_t;
+    struct code_arena_st *prev;
+};
+
+typedef struct code_arena_st code_arena_t;
 
 static Py_ssize_t extra_code_index = -1;
-static code_arena_t code_arena;
+static code_arena_t *code_arena;
+static FILE *perf_map_file;
 
 static int
-new_code_arena()
+new_code_arena(void)
 {
-    size_t page_size = sysconf(_SC_PAGESIZE);
+    // non-trivial programs typically need 64 to 256 kiB.
+    size_t mem_size = 4096 * 16;
+    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
     char *memory = mmap(NULL,  // address
-                        page_size, PROT_READ | PROT_WRITE | PROT_EXEC,
+                        mem_size,
+                        PROT_READ | PROT_WRITE | PROT_EXEC,
                         MAP_PRIVATE | MAP_ANONYMOUS,
                         -1,  // fd (not used here)
                         0);  // offset (not used here)
@@ -46,21 +53,43 @@ new_code_arena()
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
 
-    long n_copies = page_size / code_size;
+    long n_copies = mem_size / code_size;
     for (int i = 0; i < n_copies; i++) {
         memcpy(memory + i * code_size, start, code_size * sizeof(char));
     }
 
-    mprotect(memory, page_size, PROT_READ | PROT_EXEC);
+    mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+
+    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
+    if (new_arena == NULL) {
+        Py_FatalError("Failed to allocate new code arena struct");
+        return -1;
+    }
 
-    code_arena.start_addr = memory;
-    code_arena.current_addr = memory;
-    code_arena.size = page_size;
-    code_arena.size_left = page_size;
-    code_arena.code_size = code_size;
+    new_arena->start_addr = memory;
+    new_arena->current_addr = memory;
+    new_arena->size = mem_size;
+    new_arena->size_left = mem_size;
+    new_arena->code_size = code_size;
+    new_arena->prev = code_arena;
+    code_arena = new_arena;
     return 0;
 }
 
+static void
+free_code_arenas(void)
+{
+    code_arena_t *cur = code_arena;
+    code_arena_t *prev;
+    code_arena = NULL; // invalid static pointer
+    while(cur) {
+        munmap(cur->start_addr, cur->size);
+        prev = cur->prev;
+        PyMem_RawFree(cur);
+        cur = prev;
+    }
+}
+
 static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
@@ -73,27 +102,32 @@ code_arena_new_code(code_arena_t *code_arena)
 static inline py_trampoline
 compile_trampoline(void)
 {
-    if (code_arena.size_left <= code_arena.code_size) {
+    if ((code_arena == NULL) || (code_arena->size_left <= code_arena->code_size)) {
         if (new_code_arena() < 0) {
             return NULL;
         }
     }
 
-    assert(code_arena.size_left <= code_arena.size);
-    return code_arena_new_code(&code_arena);
+    assert(code_arena->size_left <= code_arena->size);
+    return code_arena_new_code(code_arena);
 }
 
 static inline FILE *
-perf_map_open(pid_t pid)
+perf_map_get_file(void)
 {
+    if (perf_map_file) {
+        return perf_map_file;
+    }
     char filename[100];
+    pid_t pid = getpid();
+    // TODO: %d is incorrect if pid_t is long long
     snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", pid);
-    FILE *res = fopen(filename, "a");
-    if (!res) {
+    perf_map_file = fopen(filename, "a");
+    if (!perf_map_file) {
         _Py_FatalErrorFormat(__func__, "Couldn't open %s: errno(%d)", filename, errno);
         return NULL;
     }
-    return res;
+    return perf_map_file;
 }
 
 static inline int
@@ -112,6 +146,7 @@ perf_map_write_entry(FILE *method_file, const void *code_addr,
 {
     fprintf(method_file, "%lx %x py::%s:%s\n", (unsigned long)code_addr,
             code_size, entry, file);
+    fflush(method_file);
 }
 
 static PyObject *
@@ -129,14 +164,13 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
         if (new_trampoline == NULL) {
             return NULL;
         }
-        FILE *pfile = perf_map_open(getpid());
+        FILE *pfile = perf_map_get_file();
         if (pfile == NULL) {
             return NULL;
         }
-        perf_map_write_entry(pfile, new_trampoline, code_arena.code_size,
+        perf_map_write_entry(pfile, new_trampoline, code_arena->code_size,
                              PyUnicode_AsUTF8(co->co_qualname),
                              PyUnicode_AsUTF8(co->co_filename));
-        perf_map_close(pfile);
         _PyCode_SetExtra((PyObject *)co, extra_code_index,
                          (void *)new_trampoline);
         f = new_trampoline;
@@ -163,3 +197,23 @@ _PyPerfTrampoline_Init(int activate)
     }
     return 0;
 }
+
+int
+_PyPerfTrampoline_Fini(void)
+{
+#ifdef HAVE_PERF_TRAMPOLINE
+    free_code_arenas();
+    perf_map_close(perf_map_file);
+#endif
+    return 0;
+}
+
+PyStatus
+_PyPerfTrampoline_AfterFork_Child(void)
+{
+#ifdef HAVE_PERF_TRAMPOLINE
+    // close file in child.
+    perf_map_close(perf_map_file);
+#endif
+    return PyStatus_Ok();
+}
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index be9a9e1f2543ea..05755ab46d9765 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1728,6 +1728,7 @@ finalize_interp_clear(PyThreadState *tstate)
         _PyArg_Fini();
         _Py_ClearFileSystemEncoding();
         _Py_Deepfreeze_Fini();
+        _PyPerfTrampoline_Fini();
     }
 
     finalize_interp_types(tstate->interp);

From a26a8503b3df44eb93a658e32db06fe172a4b697 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 21 Aug 2022 16:59:51 +0100
Subject: [PATCH 05/47] Fix tests

---
 Lib/test/test_embed.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index c546bb08e297c3..70d7367ea9e64f 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -436,6 +436,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         'hash_seed': 0,
         'faulthandler': 0,
         'tracemalloc': 0,
+        'perf_profiling': 0,
         'import_time': 0,
         'code_debug_ranges': 1,
         'show_ref_count': 0,
@@ -520,6 +521,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         use_hash_seed=0,
         faulthandler=0,
         tracemalloc=0,
+        perf_profiling=0,
         pathconfig_warnings=0,
     )
     if MS_WINDOWS:
@@ -828,6 +830,7 @@ def test_init_from_config(self):
             'use_hash_seed': 1,
             'hash_seed': 123,
             'tracemalloc': 2,
+            'perf_profiling': 0,
             'import_time': 1,
             'code_debug_ranges': 0,
             'show_ref_count': 1,
@@ -890,6 +893,7 @@ def test_init_compat_env(self):
             'use_hash_seed': 1,
             'hash_seed': 42,
             'tracemalloc': 2,
+            'perf_profiling': 0,
             'import_time': 1,
             'code_debug_ranges': 0,
             'malloc_stats': 1,
@@ -921,6 +925,7 @@ def test_init_python_env(self):
             'use_hash_seed': 1,
             'hash_seed': 42,
             'tracemalloc': 2,
+            'perf_profiling': 0,
             'import_time': 1,
             'code_debug_ranges': 0,
             'malloc_stats': 1,

From 8170b244eb0f05a63f6faec1274e8375b1f6b4f6 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 21 Aug 2022 17:40:35 +0100
Subject: [PATCH 06/47] Skip tests if sanitizer is active

---
 Lib/test/test_perf_profiler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 0c96b3a4dfcf1b..c39e34a25f13b0 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -5,6 +5,7 @@
 import sysconfig
 from test.support.script_helper import make_script
 from test.support.os_helper import temp_dir
+from test.support import check_sanitizer
 
 
 def get_perf_version():
@@ -39,6 +40,9 @@ def get_perf_version():
 if "no-omit-frame-pointe" not in sysconfig.get_config_var("CFLAGS"):
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
+if check_sanitizer(address=True, memory=True, ub=True):
+    raise unittest.SkipTest("Perf unwinding doesn't work with sanitizers")
+
 
 def run_perf(cwd, *args, **env_vars):
     if env_vars:

From 9df1c93dbfe2d47b9c80b1d0d3162b435601a3cf Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Sun, 21 Aug 2022 21:05:13 +0200
Subject: [PATCH 07/47] Add ARM64 code generated by aarch64-linux-gnu-gcc

---
 Objects/asm_trampoline.S | 15 ++++++++++++---
 configure                |  6 ++++--
 configure.ac             |  5 +++--
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
index 74ebc9384b7095..93ddbd625df252 100644
--- a/Objects/asm_trampoline.S
+++ b/Objects/asm_trampoline.S
@@ -12,10 +12,19 @@ _Py_trampoline_func_start:
     pop    %rbp
     ret
 #endif // __x86_64__
-#ifdef __aarch64__
-    TODO
+#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    // ARM64 little endian, 64bit ABI
+    // generate with aarch64-linux-gnu-gcc 12.1
+    stp     x29, x30, [sp, -16]!
+    mov     x29, sp
+    mov     x4, x0
+    mov     x0, x1
+    mov     x1, x2
+    mov     w2, w3
+    blr     x4
+    ldp     x29, x30, [sp], 16
+    ret
 #endif
     .globl	_Py_trampoline_func_end
 _Py_trampoline_func_end:
     .section        .note.GNU-stack,"",@progbits
-
diff --git a/configure b/configure
index e2d18ad665f2d1..c96ac527eb4bfc 100755
--- a/configure
+++ b/configure
@@ -11421,8 +11421,10 @@ $as_echo "$SHLIBS" >&6; }
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking perf trampoline" >&5
 $as_echo_n "checking perf trampoline... " >&6; }
-case $host in #(
-  x86_64-*-linux-*) :
+case $PLATFORM_TRIPLET in #(
+  x86_64-linux-gnu) :
+    perf_trampoline=yes ;; #(
+  aarch64-linux-gnu) :
     perf_trampoline=yes ;; #(
   *) :
     perf_trampoline=no
diff --git a/configure.ac b/configure.ac
index 86effabfc8c6e6..0e06d5b6fbadd7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3428,8 +3428,9 @@ AC_MSG_RESULT($SHLIBS)
 dnl perf trampoline is Linux specific and requires an arch-specific
 dnl trampoline in asssembly.
 AC_MSG_CHECKING([perf trampoline])
-AS_CASE([$host],
-  [x86_64-*-linux-*], [perf_trampoline=yes],
+AS_CASE([$PLATFORM_TRIPLET],
+  [x86_64-linux-gnu], [perf_trampoline=yes],
+  [aarch64-linux-gnu], [perf_trampoline=yes],
   [perf_trampoline=no]
 )
 AC_MSG_RESULT([$perf_trampoline])

From d8f396d1b0fd7dc25630a75b9faefc4efe847372 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 21 Aug 2022 22:07:42 +0100
Subject: [PATCH 08/47] Address review comments

---
 Include/internal/pycore_ceval.h |  1 +
 Lib/test/test_perf_profiler.py  | 32 ++++++---------------
 Objects/perf_trampoline.c       | 30 ++++++++++++++------
 Python/clinic/sysmodule.c.h     | 50 ++++++++++++++++++++++-----------
 Python/initconfig.c             |  2 +-
 Python/sysmodule.c              | 37 ++++++++++++++++++------
 6 files changed, 94 insertions(+), 58 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 8dea61ad0de9c7..4d281bd640702d 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -67,6 +67,7 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
 
 extern int _PyPerfTrampoline_Init(int activate);
 extern int _PyPerfTrampoline_Fini(void);
+extern int _PyIsPerfTrampolineActive(void);
 extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
 
 static inline PyObject*
diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index c39e34a25f13b0..f40b5b54152868 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -3,6 +3,7 @@
 import re
 import sys
 import sysconfig
+import os
 from test.support.script_helper import make_script
 from test.support.os_helper import temp_dir
 from test.support import check_sanitizer
@@ -11,20 +12,13 @@
 def get_perf_version():
     try:
         cmd = ["perf", "--version"]
-        proc = subprocess.Popen(
+        proc = subprocess.run(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
         )
-        with proc:
-            version, stderr = proc.communicate()
-
-        if proc.returncode:
-            raise Exception(
-                f"Command {' '.join(cmd)!r} failed "
-                f"with exit code {proc.returncode}: "
-                f"stdout={version!r} stderr={stderr!r}"
-            )
-    except OSError:
+    except subprocess.SubprocessError:
         raise unittest.SkipTest("Couldn't find perf on the path")
+    
+    version = proc.stdout
 
     match = re.search(r"^perf version\s+(.*)", version)
     if match is None:
@@ -50,32 +44,22 @@ def run_perf(cwd, *args, **env_vars):
         env.update(env_vars)
     else:
         env = None
-    # -nx: Do not execute commands from any .gdbinit initialization files
-    #      (issue #22188)
     output_file = cwd + "/perf_output.perf"
     base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
-    proc = subprocess.Popen(
+    proc = subprocess.run(
         base_cmd + args,
-        # Redirect stdin to prevent GDB from messing with
-        # the terminal settings
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         env=env,
     )
-    with proc:
-        out, err = proc.communicate()
     base_cmd = ("perf", "script")
-    proc = subprocess.Popen(
+    proc = subprocess.run(
         ("perf", "script", "-i", output_file),
-        # Redirect stdin to prevent GDB from messing with
-        # the terminal settings
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         env=env,
     )
-    with proc:
-        out, err = proc.communicate()
-    return out.decode("utf-8", "replace"), err.decode("utf-8", "replace")
+    return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode("utf-8", "replace")
 
 
 class TestPerfProfiler(unittest.TestCase):
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index b69e72cd05e4c5..d6e633cc53c2c9 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -41,7 +41,7 @@ new_code_arena(void)
     assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
     char *memory = mmap(NULL,  // address
                         mem_size,
-                        PROT_READ | PROT_WRITE | PROT_EXEC,
+                        PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS,
                         -1,  // fd (not used here)
                         0);  // offset (not used here)
@@ -53,8 +53,8 @@ new_code_arena(void)
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
 
-    long n_copies = mem_size / code_size;
-    for (int i = 0; i < n_copies; i++) {
+    size_t n_copies = mem_size / code_size;
+    for (size_t i = 0; i < n_copies; i++) {
         memcpy(memory + i * code_size, start, code_size * sizeof(char));
     }
 
@@ -144,7 +144,8 @@ perf_map_write_entry(FILE *method_file, const void *code_addr,
                      unsigned int code_size, const char *entry,
                      const char *file)
 {
-    fprintf(method_file, "%lx %x py::%s:%s\n", (unsigned long)code_addr,
+    assert(entry != NULL && file != NULL);
+    fprintf(method_file, "%p %x py::%s:%s\n", code_addr,
             code_size, entry, file);
     fflush(method_file);
 }
@@ -157,6 +158,10 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     py_trampoline f = NULL;
     _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
     if (f == NULL) {
+        FILE *pfile = perf_map_get_file();
+        if (pfile == NULL) {
+            return NULL;
+        }
         if (extra_code_index == -1) {
             extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         }
@@ -164,10 +169,6 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
         if (new_trampoline == NULL) {
             return NULL;
         }
-        FILE *pfile = perf_map_get_file();
-        if (pfile == NULL) {
-            return NULL;
-        }
         perf_map_write_entry(pfile, new_trampoline, code_arena->code_size,
                              PyUnicode_AsUTF8(co->co_qualname),
                              PyUnicode_AsUTF8(co->co_filename));
@@ -178,7 +179,19 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     assert(f != NULL);
     return f(_PyEval_EvalFrameDefault, ts, frame, throw);
 }
+#endif // HAVE_PERF_TRAMPOLINE
+
+int
+_PyIsPerfTrampolineActive(void)
+{
+#ifdef HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    return tstate->interp->eval_frame == py_trampoline_evaluator;
 #endif
+    return 0;
+}
+
+
 
 int
 _PyPerfTrampoline_Init(int activate)
@@ -214,6 +227,7 @@ _PyPerfTrampoline_AfterFork_Child(void)
 #ifdef HAVE_PERF_TRAMPOLINE
     // close file in child.
     perf_map_close(perf_map_file);
+    perf_map_file = NULL;
 #endif
     return PyStatus_Ok();
 }
diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h
index df3d412f01d94f..272da518f70660 100644
--- a/Python/clinic/sysmodule.c.h
+++ b/Python/clinic/sysmodule.c.h
@@ -1127,40 +1127,58 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
 
 #endif /* defined(ANDROID_API_LEVEL) */
 
-PyDoc_STRVAR(sys__activate_perf_trampoline__doc__,
-"_activate_perf_trampoline($module, /)\n"
+PyDoc_STRVAR(sys_activate_perf_trampoline__doc__,
+"activate_perf_trampoline($module, /)\n"
 "--\n"
 "\n"
 "Activate the perf profiler trampoline.");
 
-#define SYS__ACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
-    {"_activate_perf_trampoline", (PyCFunction)sys__activate_perf_trampoline, METH_NOARGS, sys__activate_perf_trampoline__doc__},
+#define SYS_ACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
+    {"activate_perf_trampoline", (PyCFunction)sys_activate_perf_trampoline, METH_NOARGS, sys_activate_perf_trampoline__doc__},
 
 static PyObject *
-sys__activate_perf_trampoline_impl(PyObject *module);
+sys_activate_perf_trampoline_impl(PyObject *module);
 
 static PyObject *
-sys__activate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+sys_activate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
 {
-    return sys__activate_perf_trampoline_impl(module);
+    return sys_activate_perf_trampoline_impl(module);
 }
 
-PyDoc_STRVAR(sys__deactivate_perf_trampoline__doc__,
-"_deactivate_perf_trampoline($module, /)\n"
+PyDoc_STRVAR(sys_deactivate_perf_trampoline__doc__,
+"deactivate_perf_trampoline($module, /)\n"
 "--\n"
 "\n"
-"Activate the perf profiler trampoline.");
+"Dectivate the perf profiler trampoline.");
+
+#define SYS_DEACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
+    {"deactivate_perf_trampoline", (PyCFunction)sys_deactivate_perf_trampoline, METH_NOARGS, sys_deactivate_perf_trampoline__doc__},
+
+static PyObject *
+sys_deactivate_perf_trampoline_impl(PyObject *module);
+
+static PyObject *
+sys_deactivate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return sys_deactivate_perf_trampoline_impl(module);
+}
+
+PyDoc_STRVAR(sys_is_perf_trampoline_active__doc__,
+"is_perf_trampoline_active($module, /)\n"
+"--\n"
+"\n"
+"Returns *True* if the perf profiler trampoline is active.");
 
-#define SYS__DEACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
-    {"_deactivate_perf_trampoline", (PyCFunction)sys__deactivate_perf_trampoline, METH_NOARGS, sys__deactivate_perf_trampoline__doc__},
+#define SYS_IS_PERF_TRAMPOLINE_ACTIVE_METHODDEF    \
+    {"is_perf_trampoline_active", (PyCFunction)sys_is_perf_trampoline_active, METH_NOARGS, sys_is_perf_trampoline_active__doc__},
 
 static PyObject *
-sys__deactivate_perf_trampoline_impl(PyObject *module);
+sys_is_perf_trampoline_active_impl(PyObject *module);
 
 static PyObject *
-sys__deactivate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+sys_is_perf_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
 {
-    return sys__deactivate_perf_trampoline_impl(module);
+    return sys_is_perf_trampoline_active_impl(module);
 }
 
 #ifndef SYS_GETWINDOWSVERSION_METHODDEF
@@ -1206,4 +1224,4 @@ sys__deactivate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
 #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF
     #define SYS_GETANDROIDAPILEVEL_METHODDEF
 #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */
-/*[clinic end generated code: output=639b32664762e04a input=a9049054013a1b77]*/
+/*[clinic end generated code: output=4b43e2be96492326 input=a9049054013a1b77]*/
diff --git a/Python/initconfig.c b/Python/initconfig.c
index f870d1a032794b..d0d6e2eb8338ee 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -1803,7 +1803,7 @@ config_read_complex_options(PyConfig *config)
             return status;
         }
     }
-    if (config->tracemalloc < 0) {
+    if (config->perf_profiling < 0) {
         status = config_init_perf_profiling(config);
         if (_PyStatus_EXCEPTION(status)) {
             return status;
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 9f35fdfda7b190..80a0e73e0aba62 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1995,14 +1995,14 @@ sys_getandroidapilevel_impl(PyObject *module)
 #endif   /* ANDROID_API_LEVEL */
 
 /*[clinic input]
-sys._activate_perf_trampoline
+sys.activate_perf_trampoline
 
 Activate the perf profiler trampoline.
 [clinic start generated code]*/
 
 static PyObject *
-sys__activate_perf_trampoline_impl(PyObject *module)
-/*[clinic end generated code: output=248f6dc862887fd0 input=67667f43ffabb1e4]*/
+sys_activate_perf_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=7f97c60d4f580b85 input=666a2d744a97a220]*/
 {
     if  (_PyPerfTrampoline_Init(1) < 0) {
         return NULL;
@@ -2012,14 +2012,14 @@ sys__activate_perf_trampoline_impl(PyObject *module)
 
 
 /*[clinic input]
-sys._deactivate_perf_trampoline
+sys.deactivate_perf_trampoline
 
-Activate the perf profiler trampoline.
+Dectivate the perf profiler trampoline.
 [clinic start generated code]*/
 
 static PyObject *
-sys__deactivate_perf_trampoline_impl(PyObject *module)
-/*[clinic end generated code: output=7dde745eb7ba5e54 input=3d4fbb4aef9ad3d8]*/
+sys_deactivate_perf_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=5ba2f93711f85b6e input=d85cf6e3cd37d81e]*/
 {
     if  (_PyPerfTrampoline_Init(0) < 0) {
         return NULL;
@@ -2027,6 +2027,24 @@ sys__deactivate_perf_trampoline_impl(PyObject *module)
     Py_RETURN_NONE;
 }
 
+/*[clinic input]
+sys.is_perf_trampoline_active
+
+Returns *True* if the perf profiler trampoline is active.
+[clinic start generated code]*/
+
+static PyObject *
+sys_is_perf_trampoline_active_impl(PyObject *module)
+/*[clinic end generated code: output=7bbf80001165b590 input=59f045e52c228654]*/
+{
+#ifdef HAVE_PERF_TRAMPOLINE
+    if (_PyIsPerfTrampolineActive()) {
+        Py_RETURN_TRUE;
+    }
+#endif
+    Py_RETURN_FALSE;
+}
+
 
 
 
@@ -2083,8 +2101,9 @@ static PyMethodDef sys_methods[] = {
      METH_VARARGS | METH_KEYWORDS, set_asyncgen_hooks_doc},
     SYS_GET_ASYNCGEN_HOOKS_METHODDEF
     SYS_GETANDROIDAPILEVEL_METHODDEF
-    SYS__ACTIVATE_PERF_TRAMPOLINE_METHODDEF
-    SYS__DEACTIVATE_PERF_TRAMPOLINE_METHODDEF
+    SYS_ACTIVATE_PERF_TRAMPOLINE_METHODDEF
+    SYS_IS_PERF_TRAMPOLINE_ACTIVE_METHODDEF
+    SYS_DEACTIVATE_PERF_TRAMPOLINE_METHODDEF
     SYS_UNRAISABLEHOOK_METHODDEF
 #ifdef Py_STATS
     SYS__STATS_ON_METHODDEF

From d35c5d701429f89165a4cfa7fa80f5f0cb7544ab Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Mon, 22 Aug 2022 11:24:57 +0200
Subject: [PATCH 09/47] Secure fopen, use unraisable, continue on error

---
 Objects/perf_trampoline.c | 53 +++++++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index d6e633cc53c2c9..a41a705733ea1d 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -5,6 +5,7 @@
 
 #ifdef HAVE_PERF_TRAMPOLINE
 
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>
@@ -29,6 +30,12 @@ struct code_arena_st {
 
 typedef struct code_arena_st code_arena_t;
 
+typedef enum {
+    PERF_STATUS_FAILED = -1,
+    PERF_STATUS_NO_INIT = 0,
+} perf_status_t;
+
+static perf_status_t perf_status = PERF_STATUS_NO_INIT;
 static Py_ssize_t extra_code_index = -1;
 static code_arena_t *code_arena;
 static FILE *perf_map_file;
@@ -46,7 +53,10 @@ new_code_arena(void)
                         -1,  // fd (not used here)
                         0);  // offset (not used here)
     if (!memory) {
-        Py_FatalError("Failed to allocate new code arena");
+        PyErr_SetFromErrno(PyExc_OSError);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to create new mmap for perf trampoline", NULL);
+        perf_status = PERF_STATUS_FAILED;
         return -1;
     }
     void *start = &_Py_trampoline_func_start;
@@ -57,12 +67,19 @@ new_code_arena(void)
     for (size_t i = 0; i < n_copies; i++) {
         memcpy(memory + i * code_size, start, code_size * sizeof(char));
     }
-
-    mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+    // Some systems may prevent us from creating executable code on the fly.
+    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+    if (res == -1) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", NULL);
+    }
 
     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
     if (new_arena == NULL) {
-        Py_FatalError("Failed to allocate new code arena struct");
+        PyErr_NoMemory();
+        _PyErr_WriteUnraisableMsg(
+            "Failed to allocate new code arena struct", NULL);
         return -1;
     }
 
@@ -107,7 +124,6 @@ compile_trampoline(void)
             return NULL;
         }
     }
-
     assert(code_arena->size_left <= code_arena->size);
     return code_arena_new_code(code_arena);
 }
@@ -120,11 +136,23 @@ perf_map_get_file(void)
     }
     char filename[100];
     pid_t pid = getpid();
-    // TODO: %d is incorrect if pid_t is long long
-    snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", pid);
-    perf_map_file = fopen(filename, "a");
+    // Location and file name of perf map is hard-coded in perf tool.
+    // Use exclusive create flag wit nofollow to prevent symlink attacks.
+    int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
+    snprintf(filename, sizeof(filename)-1, "/tmp/perf-%jd.map", (intmax_t)pid);
+    int fd = open(filename, flags, 0600);
+    if (fd == -1) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file", NULL);
+        return NULL;
+    }
+    perf_map_file = fdopen(fd, "w");
     if (!perf_map_file) {
-        _Py_FatalErrorFormat(__func__, "Couldn't open %s: errno(%d)", filename, errno);
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
+        close(fd);
         return NULL;
     }
     return perf_map_file;
@@ -154,20 +182,23 @@ static PyObject *
 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                         int throw)
 {
+    if (perf_status == PERF_STATUS_FAILED) {
+        return _PyEval_EvalFrameDefault(ts, frame, throw);
+    }
     PyCodeObject *co = frame->f_code;
     py_trampoline f = NULL;
     _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
     if (f == NULL) {
         FILE *pfile = perf_map_get_file();
         if (pfile == NULL) {
-            return NULL;
+            return _PyEval_EvalFrameDefault(ts, frame, throw);
         }
         if (extra_code_index == -1) {
             extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         }
         py_trampoline new_trampoline = compile_trampoline();
         if (new_trampoline == NULL) {
-            return NULL;
+            return _PyEval_EvalFrameDefault(ts, frame, throw);
         }
         perf_map_write_entry(pfile, new_trampoline, code_arena->code_size,
                              PyUnicode_AsUTF8(co->co_qualname),

From 2664b127a4f482bb2337a86007415013bbcec93e Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Mon, 22 Aug 2022 11:30:43 +0200
Subject: [PATCH 10/47] Clean up resources on failure, reset status to NO_INIT

---
 Objects/perf_trampoline.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index a41a705733ea1d..fafd79f6a8efbf 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -71,6 +71,7 @@ new_code_arena(void)
     int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
     if (res == -1) {
         PyErr_SetFromErrno(PyExc_OSError);
+        munmap(memory, mem_size);
         _PyErr_WriteUnraisableMsg(
             "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", NULL);
     }
@@ -78,6 +79,7 @@ new_code_arena(void)
     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
     if (new_arena == NULL) {
         PyErr_NoMemory();
+        munmap(memory, mem_size);
         _PyErr_WriteUnraisableMsg(
             "Failed to allocate new code arena struct", NULL);
         return -1;
@@ -151,8 +153,8 @@ perf_map_get_file(void)
     if (!perf_map_file) {
         perf_status = PERF_STATUS_FAILED;
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
         close(fd);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
         return NULL;
     }
     return perf_map_file;
@@ -164,6 +166,7 @@ perf_map_close(FILE *fp)
     if (fp) {
         return fclose(fp);
     }
+    perf_status = PERF_STATUS_NO_INIT;
     return 0;
 }
 

From e6c365a99a5d59bd2166d0177976ee0c6728acd0 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 12:40:06 +0100
Subject: [PATCH 11/47] Allow setting custom trampoline callbacks

---
 Include/internal/pycore_ceval.h |  16 +++
 Lib/test/test_perf_profiler.py  |   4 +-
 Objects/perf_trampoline.c       | 172 +++++++++++++++++++-------------
 Python/pylifecycle.c            |   7 +-
 Python/sysmodule.c              |   2 +-
 5 files changed, 130 insertions(+), 71 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 4d281bd640702d..df7eb1bc8a7c6a 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -65,11 +65,27 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
     PyThreadState *tstate,
     PyObject *globals);
 
+// Trampoline API
+
+typedef void* (*trampoline_state_init)(void);
+typedef void (*trampoline_state_write)(void* state, const void *code_addr,
+                                       unsigned int code_size, PyCodeObject* code);
+typedef int (*trampoline_state_free)(void* state);
+extern int _PyPerfTrampoline_SetCallbacks(
+    trampoline_state_init init_state,
+    trampoline_state_write write_state,
+    trampoline_state_free free_state
+);
+
 extern int _PyPerfTrampoline_Init(int activate);
 extern int _PyPerfTrampoline_Fini(void);
 extern int _PyIsPerfTrampolineActive(void);
 extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
 
+extern void* _Py_perf_map_get_file(void);
+extern void _Py_perf_map_write_entry(void*, const void*, unsigned int, PyCodeObject*);
+extern int _Py_perf_map_close(void*);
+
 static inline PyObject*
 _PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
 {
diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index f40b5b54152868..fdd8f588765bf9 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -11,11 +11,11 @@
 
 def get_perf_version():
     try:
-        cmd = ["perf", "--version"]
+        cmd = ["perf", "version"]
         proc = subprocess.run(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
         )
-    except subprocess.SubprocessError:
+    except (subprocess.SubprocessError, OSError):
         raise unittest.SkipTest("Couldn't find perf on the path")
     
     version = proc.stdout
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index fafd79f6a8efbf..2f2c2843979174 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -28,18 +28,92 @@ struct code_arena_st {
     struct code_arena_st *prev;
 };
 
-typedef struct code_arena_st code_arena_t;
-
 typedef enum {
     PERF_STATUS_FAILED = -1,
     PERF_STATUS_NO_INIT = 0,
+    PERF_STATUS_OK = 1,
 } perf_status_t;
 
+typedef struct code_arena_st code_arena_t;
+
+struct trampoline_api_st {
+    trampoline_state_init init_state;
+    trampoline_state_write write_state;
+    trampoline_state_free free_state;
+    void* state;
+};
+
+typedef struct trampoline_api_st trampoline_api_t;
+
 static perf_status_t perf_status = PERF_STATUS_NO_INIT;
 static Py_ssize_t extra_code_index = -1;
 static code_arena_t *code_arena;
+static trampoline_api_t trampoline_api;
 static FILE *perf_map_file;
 
+void*
+_Py_perf_map_get_file(void)
+{
+    if (perf_map_file) {
+        return perf_map_file;
+    }
+    char filename[100];
+    pid_t pid = getpid();
+    // Location and file name of perf map is hard-coded in perf tool.
+    // Use exclusive create flag wit nofollow to prevent symlink attacks.
+    int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
+    snprintf(filename, sizeof(filename)-1, "/tmp/perf-%jd.map", (intmax_t)pid);
+    int fd = open(filename, flags, 0600);
+    if (fd == -1) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file", NULL);
+        return NULL;
+    }
+    perf_map_file = fdopen(fd, "w");
+    if (!perf_map_file) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        close(fd);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
+        return NULL;
+    }
+    return perf_map_file;
+}
+
+int
+_Py_perf_map_close(void* state)
+{
+    FILE *fp = (FILE*)state;
+    if (fp) {
+        return fclose(fp);
+    }
+    perf_map_file = NULL;
+    perf_status = PERF_STATUS_NO_INIT;
+    return 0;
+}
+
+void
+_Py_perf_map_write_entry(void* state, const void *code_addr,
+                     unsigned int code_size, PyCodeObject* co)
+{
+    assert(file != NULL);
+    FILE *method_file = (FILE*)state;
+    const char* entry = PyUnicode_AsUTF8(co->co_qualname);
+    if (entry == NULL) {
+        _PyErr_WriteUnraisableMsg( "Failed to get qualname from code object", NULL);
+        return;
+    }
+    const char* filename = PyUnicode_AsUTF8(co->co_filename);
+    if (filename == NULL) {
+        _PyErr_WriteUnraisableMsg( "Failed to get filename from code object", NULL);
+        return;
+    }
+    fprintf(method_file, "%p %x py::%s:%s\n", code_addr,
+            code_size, entry, filename);
+    fflush(method_file);
+}
+
 static int
 new_code_arena(void)
 {
@@ -130,88 +204,34 @@ compile_trampoline(void)
     return code_arena_new_code(code_arena);
 }
 
-static inline FILE *
-perf_map_get_file(void)
-{
-    if (perf_map_file) {
-        return perf_map_file;
-    }
-    char filename[100];
-    pid_t pid = getpid();
-    // Location and file name of perf map is hard-coded in perf tool.
-    // Use exclusive create flag wit nofollow to prevent symlink attacks.
-    int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
-    snprintf(filename, sizeof(filename)-1, "/tmp/perf-%jd.map", (intmax_t)pid);
-    int fd = open(filename, flags, 0600);
-    if (fd == -1) {
-        perf_status = PERF_STATUS_FAILED;
-        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file", NULL);
-        return NULL;
-    }
-    perf_map_file = fdopen(fd, "w");
-    if (!perf_map_file) {
-        perf_status = PERF_STATUS_FAILED;
-        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        close(fd);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
-        return NULL;
-    }
-    return perf_map_file;
-}
-
-static inline int
-perf_map_close(FILE *fp)
-{
-    if (fp) {
-        return fclose(fp);
-    }
-    perf_status = PERF_STATUS_NO_INIT;
-    return 0;
-}
-
-static void
-perf_map_write_entry(FILE *method_file, const void *code_addr,
-                     unsigned int code_size, const char *entry,
-                     const char *file)
-{
-    assert(entry != NULL && file != NULL);
-    fprintf(method_file, "%p %x py::%s:%s\n", code_addr,
-            code_size, entry, file);
-    fflush(method_file);
-}
-
 static PyObject *
 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                         int throw)
 {
-    if (perf_status == PERF_STATUS_FAILED) {
-        return _PyEval_EvalFrameDefault(ts, frame, throw);
+    if (perf_status == PERF_STATUS_FAILED || perf_status == PERF_STATUS_NO_INIT) {
+        goto default_eval;
     }
     PyCodeObject *co = frame->f_code;
     py_trampoline f = NULL;
     _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
     if (f == NULL) {
-        FILE *pfile = perf_map_get_file();
-        if (pfile == NULL) {
-            return _PyEval_EvalFrameDefault(ts, frame, throw);
-        }
         if (extra_code_index == -1) {
             extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         }
         py_trampoline new_trampoline = compile_trampoline();
         if (new_trampoline == NULL) {
-            return _PyEval_EvalFrameDefault(ts, frame, throw);
+            goto default_eval;
         }
-        perf_map_write_entry(pfile, new_trampoline, code_arena->code_size,
-                             PyUnicode_AsUTF8(co->co_qualname),
-                             PyUnicode_AsUTF8(co->co_filename));
+        trampoline_api.write_state(trampoline_api.state, new_trampoline,
+                                   code_arena->code_size, co);
         _PyCode_SetExtra((PyObject *)co, extra_code_index,
                          (void *)new_trampoline);
         f = new_trampoline;
     }
     assert(f != NULL);
     return f(_PyEval_EvalFrameDefault, ts, frame, throw);
+default_eval:
+    return _PyEval_EvalFrameDefault(ts, frame, throw);
 }
 #endif // HAVE_PERF_TRAMPOLINE
 
@@ -225,7 +245,26 @@ _PyIsPerfTrampolineActive(void)
     return 0;
 }
 
-
+int _PyPerfTrampoline_SetCallbacks(
+    trampoline_state_init init_state,
+    trampoline_state_write write_state,
+    trampoline_state_free free_state
+) {
+    if (trampoline_api.state) {
+        Py_FatalError("Trampoline state already initialized");
+        return -1;
+    }
+    trampoline_api.init_state = init_state;
+    trampoline_api.write_state = write_state;
+    trampoline_api.free_state = free_state;
+    void* state = trampoline_api.init_state();
+    if (state == NULL) {
+        return -1;
+    }
+    trampoline_api.state = state;
+    perf_status = PERF_STATUS_OK;
+    return 0;
+}
 
 int
 _PyPerfTrampoline_Init(int activate)
@@ -250,7 +289,7 @@ _PyPerfTrampoline_Fini(void)
 {
 #ifdef HAVE_PERF_TRAMPOLINE
     free_code_arenas();
-    perf_map_close(perf_map_file);
+    trampoline_api.free_state(trampoline_api.state);
 #endif
     return 0;
 }
@@ -260,8 +299,7 @@ _PyPerfTrampoline_AfterFork_Child(void)
 {
 #ifdef HAVE_PERF_TRAMPOLINE
     // close file in child.
-    perf_map_close(perf_map_file);
-    perf_map_file = NULL;
+    trampoline_api.free_state(trampoline_api.state);
 #endif
     return PyStatus_Ok();
 }
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 05755ab46d9765..ed05d0277bb2df 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1150,7 +1150,12 @@ init_interp_main(PyThreadState *tstate)
             return _PyStatus_ERR("can't initialize tracemalloc");
         }
 
-        if (_PyPerfTrampoline_Init(config->perf_profiling) < 0) {
+        if (_PyPerfTrampoline_SetCallbacks(
+                _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
+            ) < 0 ||
+            _PyPerfTrampoline_Init(
+                config->perf_profiling
+            ) < 0) {
             return _PyStatus_ERR("can't initialize the perf trampoline");
         }
 
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 80a0e73e0aba62..71c976ea3339b6 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2004,7 +2004,7 @@ static PyObject *
 sys_activate_perf_trampoline_impl(PyObject *module)
 /*[clinic end generated code: output=7f97c60d4f580b85 input=666a2d744a97a220]*/
 {
-    if  (_PyPerfTrampoline_Init(1) < 0) {
+    if (_PyPerfTrampoline_Init(1) < 0) {
         return NULL;
     }
     Py_RETURN_NONE;

From 5513fb15c191f2f3bc04b43dca34fd8012ea709a Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 12:50:00 +0100
Subject: [PATCH 12/47] Add comment to asm file

---
 Objects/asm_trampoline.S | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
index 93ddbd625df252..4151af9d331ab2 100644
--- a/Objects/asm_trampoline.S
+++ b/Objects/asm_trampoline.S
@@ -1,5 +1,12 @@
     .text
     .globl	_Py_trampoline_func_start
+# The following assembly is equivalent to:
+# PyObject *
+# trampoline(py_evaluator evaluator, PyThreadState *ts,
+#            _PyInterpreterFrame *f, int throwflag)
+# {
+#     return evaluator(ts, f, throwflag);
+# }
 _Py_trampoline_func_start:
 #ifdef __x86_64__
     push   %rbp

From 76c7dc00ecb7546f60315dd9e70612e995af594c Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 12:56:11 +0100
Subject: [PATCH 13/47] fixup! Merge pull request #36 from tiran/perf-file

---
 Objects/perf_trampoline.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 2f2c2843979174..565427deddcbed 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -49,8 +49,8 @@ static perf_status_t perf_status = PERF_STATUS_NO_INIT;
 static Py_ssize_t extra_code_index = -1;
 static code_arena_t *code_arena;
 static trampoline_api_t trampoline_api;
-static FILE *perf_map_file;
 
+static FILE *perf_map_file;
 void*
 _Py_perf_map_get_file(void)
 {
@@ -97,7 +97,7 @@ void
 _Py_perf_map_write_entry(void* state, const void *code_addr,
                      unsigned int code_size, PyCodeObject* co)
 {
-    assert(file != NULL);
+    assert(state != NULL);
     FILE *method_file = (FILE*)state;
     const char* entry = PyUnicode_AsUTF8(co->co_qualname);
     if (entry == NULL) {
@@ -250,6 +250,7 @@ int _PyPerfTrampoline_SetCallbacks(
     trampoline_state_write write_state,
     trampoline_state_free free_state
 ) {
+#ifdef HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
         Py_FatalError("Trampoline state already initialized");
         return -1;
@@ -263,6 +264,7 @@ int _PyPerfTrampoline_SetCallbacks(
     }
     trampoline_api.state = state;
     perf_status = PERF_STATUS_OK;
+#endif
     return 0;
 }
 

From a545b3cc7d9a262b6231a7193953c4f8a00037cc Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 13:40:08 +0100
Subject: [PATCH 14/47] Add comments to the perf_trampoline file and format
 file

---
 Lib/test/test_perf_profiler.py |   4 +
 Objects/perf_trampoline.c      | 238 ++++++++++++++++++++++++++-------
 2 files changed, 193 insertions(+), 49 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index fdd8f588765bf9..413c55bf912993 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -4,6 +4,7 @@
 import sys
 import sysconfig
 import os
+from test import support
 from test.support.script_helper import make_script
 from test.support.os_helper import temp_dir
 from test.support import check_sanitizer
@@ -25,6 +26,9 @@ def get_perf_version():
         raise Exception("unable to parse perf version: %r" % version)
     return (version, match.group(1))
 
+if not support.has_subprocess_support:
+    raise unittest.SkipTest("test module requires subprocess")
+
 
 _, version = get_perf_version()
 
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 565427deddcbed..6fd2966b5d1eea 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -1,3 +1,128 @@
+/*
+
+Perf trampoline instrumentation
+===============================
+
+This file contains instrumentation to allow to associate
+calls to the CPython eval loop back to the names of the Python
+fuctions and filename being executed.
+
+Many natve performance profilers like the Linux perf tools are
+only available to 'see' the C stack when sampling from the profiled
+process. This means that if we have the following python code:
+
+    import time
+    def foo(n):
+        # Some CPU intensive code
+
+    def bar(n):
+        foo(n)
+
+    def baz(n):
+        bar(n)
+
+    baz(10000000)
+
+A performace profiler that is only able to see native frames will
+produce the following backtrace whe sampling from foo():
+
+    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    ...
+
+    Py_RunMain
+
+Because the profiler is only able to see the native frames and the native
+function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
+then the profiler and any reporter generated by it will not be able to
+associate the names of the Python functions and the filenames associated with
+those calls, rendering the results useless in the Python world.
+
+To fix this problem, we introduce the concept of a trampoline frame. A
+trampoline frame is a piece of code that is uniqued per Python code object that
+is executed before entering the CPython eval loop. This piece of code just
+calls the original Python evaluattion function (_PyEval_EvalFrameDefault) and
+forwards all the arguments received. In this way, when a profiler samples
+frames from the previous example it will see;
+
+    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+    [Jit compiled code 3]
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+    [Jit compiled code 2]
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+    [Jit compiled code 1]
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    ...
+
+    Py_RunMain
+
+When we generate every unique copy of the trampoline (what here we called "[Jit
+compiled code N]") we write the relationship between the compiled code and the
+Python function that is associated with it. Every profiler requires this
+information in a different format. For example, the Linux "perf" profiler
+requires a file in "/tmp/perf-PID.map" (name and location not configurable)
+with the following format:
+
+    <compiled code address> <compiled code size> <name of the compiled code>
+
+If this file is available when "perf" generates reports, it will automatically
+associate every trampoline with the Python function that it is associated with
+allowing it to generate reports that include Python information. These reports
+then can also be filtered in a way that *only* Python information appears.
+
+Notice that for this to work, there must be a unique copy of the trampoline
+per Python code object even if the code in the trampoline is the same. To
+achieve this we have an assembly template in Objects/asm_trampoline.S that is
+compiled into the Python executable/shared library. This template generates a
+symbol that maps the start of the assembly code and another that marks the end
+of the assembly code for the trampoline.  Then, every time we need a unique
+trampoline for a Python code object, we copy the assembly code into a mmaped
+area that has executable permissions and we return the start of that area as
+our trampoline function.
+
+Asking for a mmap-ed memory area for trampoline is very wastefull so we
+allocate big arenas of memory in a single mmap call, we populate the entire
+arena with copies of the trampoline (this allows us to now have to invalidate
+the icache for the instructions in the page) and then we return the next
+available chunk every time someone asks for a new trampoline. We keep a linked
+list of arenas in case the current memory arena is exhausted and another one is
+needed.
+*/
+
 #include "Python.h"
 #include "pycore_ceval.h"
 #include "pycore_frame.h"
@@ -16,31 +141,37 @@ typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                   int throwflag);
 typedef PyObject *(*py_trampoline)(py_evaluator, PyThreadState *,
                                    _PyInterpreterFrame *, int throwflag);
-extern void *_Py_trampoline_func_start;
-extern void *_Py_trampoline_func_end;
+
+extern void *_Py_trampoline_func_start;  // Start of the template of the
+                                         // assembly trampoline
+extern void *
+    _Py_trampoline_func_end;  // End of the template of the assembly trampoline
 
 struct code_arena_st {
-    char *start_addr;
-    char *current_addr;
-    size_t size;
-    size_t size_left;
-    size_t code_size;
-    struct code_arena_st *prev;
+    char *start_addr;    // Start of the memory arena
+    char *current_addr;  // Address of the current trampoline within the arena
+    size_t size;         // Size of the memory arena
+    size_t size_left;    // Remaining size of the memory arena
+    size_t code_size;    // Size of the code of every trampoline in the arena
+    struct code_arena_st
+        *prev;  // Previous arena, or NULL if this is the first arena.
 };
 
 typedef enum {
-    PERF_STATUS_FAILED = -1,
-    PERF_STATUS_NO_INIT = 0,
-    PERF_STATUS_OK = 1,
+    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
+    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
+    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
 } perf_status_t;
 
 typedef struct code_arena_st code_arena_t;
 
 struct trampoline_api_st {
-    trampoline_state_init init_state;
-    trampoline_state_write write_state;
-    trampoline_state_free free_state;
-    void* state;
+    trampoline_state_init
+        init_state;  // Callback to initialize the trampoline state
+    trampoline_state_write
+        write_state;  // Callback to register every trampoline being created
+    trampoline_state_free free_state;  // Callback to free the trampoline state
+    void *state;
 };
 
 typedef struct trampoline_api_st trampoline_api_t;
@@ -51,7 +182,7 @@ static code_arena_t *code_arena;
 static trampoline_api_t trampoline_api;
 
 static FILE *perf_map_file;
-void*
+void *
 _Py_perf_map_get_file(void)
 {
     if (perf_map_file) {
@@ -62,7 +193,8 @@ _Py_perf_map_get_file(void)
     // Location and file name of perf map is hard-coded in perf tool.
     // Use exclusive create flag wit nofollow to prevent symlink attacks.
     int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
-    snprintf(filename, sizeof(filename)-1, "/tmp/perf-%jd.map", (intmax_t)pid);
+    snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map",
+             (intmax_t)pid);
     int fd = open(filename, flags, 0600);
     if (fd == -1) {
         perf_status = PERF_STATUS_FAILED;
@@ -75,16 +207,17 @@ _Py_perf_map_get_file(void)
         perf_status = PERF_STATUS_FAILED;
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
         close(fd);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
+        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle",
+                                  NULL);
         return NULL;
     }
     return perf_map_file;
 }
 
 int
-_Py_perf_map_close(void* state)
+_Py_perf_map_close(void *state)
 {
-    FILE *fp = (FILE*)state;
+    FILE *fp = (FILE *)state;
     if (fp) {
         return fclose(fp);
     }
@@ -94,23 +227,25 @@ _Py_perf_map_close(void* state)
 }
 
 void
-_Py_perf_map_write_entry(void* state, const void *code_addr,
-                     unsigned int code_size, PyCodeObject* co)
+_Py_perf_map_write_entry(void *state, const void *code_addr,
+                         unsigned int code_size, PyCodeObject *co)
 {
     assert(state != NULL);
-    FILE *method_file = (FILE*)state;
-    const char* entry = PyUnicode_AsUTF8(co->co_qualname);
+    FILE *method_file = (FILE *)state;
+    const char *entry = PyUnicode_AsUTF8(co->co_qualname);
     if (entry == NULL) {
-        _PyErr_WriteUnraisableMsg( "Failed to get qualname from code object", NULL);
+        _PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
+                                  NULL);
         return;
     }
-    const char* filename = PyUnicode_AsUTF8(co->co_filename);
+    const char *filename = PyUnicode_AsUTF8(co->co_filename);
     if (filename == NULL) {
-        _PyErr_WriteUnraisableMsg( "Failed to get filename from code object", NULL);
+        _PyErr_WriteUnraisableMsg("Failed to get filename from code object",
+                                  NULL);
         return;
     }
-    fprintf(method_file, "%p %x py::%s:%s\n", code_addr,
-            code_size, entry, filename);
+    fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry,
+            filename);
     fflush(method_file);
 }
 
@@ -120,12 +255,11 @@ new_code_arena(void)
     // non-trivial programs typically need 64 to 256 kiB.
     size_t mem_size = 4096 * 16;
     assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
-    char *memory = mmap(NULL,  // address
-                        mem_size,
-                        PROT_READ | PROT_WRITE,
-                        MAP_PRIVATE | MAP_ANONYMOUS,
-                        -1,  // fd (not used here)
-                        0);  // offset (not used here)
+    char *memory =
+        mmap(NULL,  // address
+             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+             -1,  // fd (not used here)
+             0);  // offset (not used here)
     if (!memory) {
         PyErr_SetFromErrno(PyExc_OSError);
         _PyErr_WriteUnraisableMsg(
@@ -147,15 +281,16 @@ new_code_arena(void)
         PyErr_SetFromErrno(PyExc_OSError);
         munmap(memory, mem_size);
         _PyErr_WriteUnraisableMsg(
-            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", NULL);
+            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
+            NULL);
     }
 
     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
     if (new_arena == NULL) {
         PyErr_NoMemory();
         munmap(memory, mem_size);
-        _PyErr_WriteUnraisableMsg(
-            "Failed to allocate new code arena struct", NULL);
+        _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
+                                  NULL);
         return -1;
     }
 
@@ -174,8 +309,8 @@ free_code_arenas(void)
 {
     code_arena_t *cur = code_arena;
     code_arena_t *prev;
-    code_arena = NULL; // invalid static pointer
-    while(cur) {
+    code_arena = NULL;  // invalid static pointer
+    while (cur) {
         munmap(cur->start_addr, cur->size);
         prev = cur->prev;
         PyMem_RawFree(cur);
@@ -195,7 +330,8 @@ code_arena_new_code(code_arena_t *code_arena)
 static inline py_trampoline
 compile_trampoline(void)
 {
-    if ((code_arena == NULL) || (code_arena->size_left <= code_arena->code_size)) {
+    if ((code_arena == NULL) ||
+        (code_arena->size_left <= code_arena->code_size)) {
         if (new_code_arena() < 0) {
             return NULL;
         }
@@ -208,13 +344,16 @@ static PyObject *
 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                         int throw)
 {
-    if (perf_status == PERF_STATUS_FAILED || perf_status == PERF_STATUS_NO_INIT) {
+    if (perf_status == PERF_STATUS_FAILED ||
+        perf_status == PERF_STATUS_NO_INIT) {
         goto default_eval;
     }
     PyCodeObject *co = frame->f_code;
     py_trampoline f = NULL;
     _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
     if (f == NULL) {
+        // This is the first time we see this code object so we need
+        // to compile a trampoline for it.
         if (extra_code_index == -1) {
             extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         }
@@ -231,9 +370,10 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     assert(f != NULL);
     return f(_PyEval_EvalFrameDefault, ts, frame, throw);
 default_eval:
+    // Something failed, fall back to the default evaluator.
     return _PyEval_EvalFrameDefault(ts, frame, throw);
 }
-#endif // HAVE_PERF_TRAMPOLINE
+#endif  // HAVE_PERF_TRAMPOLINE
 
 int
 _PyIsPerfTrampolineActive(void)
@@ -245,11 +385,11 @@ _PyIsPerfTrampolineActive(void)
     return 0;
 }
 
-int _PyPerfTrampoline_SetCallbacks(
-    trampoline_state_init init_state,
-    trampoline_state_write write_state,
-    trampoline_state_free free_state
-) {
+int
+_PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
+                               trampoline_state_write write_state,
+                               trampoline_state_free free_state)
+{
 #ifdef HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
         Py_FatalError("Trampoline state already initialized");
@@ -258,7 +398,7 @@ int _PyPerfTrampoline_SetCallbacks(
     trampoline_api.init_state = init_state;
     trampoline_api.write_state = write_state;
     trampoline_api.free_state = free_state;
-    void* state = trampoline_api.init_state();
+    void *state = trampoline_api.init_state();
     if (state == NULL) {
         return -1;
     }

From 5130c8d9c2121dcb1ad2ad45833f75d5f8efcdbb Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 13:44:02 +0100
Subject: [PATCH 15/47] Correct News entry

---
 .../2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst        | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
index 66bd70536a669c..dfd701fce8a932 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
@@ -1,5 +1,6 @@
 Add a new ``-X perf`` Python command line option as well as
-:func:`sys._activate_perf_trampoline` and
-:func:`sys._deactivate_perf_trampoline` function in the :mod:`sys` module
-that allows to set/unset the interpreter in a way that the Linux ``perf``
-profiler can detect Python calls. Patch by Pablo Galindo.
+:func:`sys.activate_perf_trampoline` and :func:`sys.deactivate_perf_trampoline`
+function in the :mod:`sys` module that allows to set/unset the interpreter in a
+way that the Linux ``perf`` profiler can detect Python calls. The new
+:func:`sys.is_perf_trampoline_active` function allows to query the state of the
+perf trampoline. Patch by Pablo Galindo.

From 991366b14d744da32de16d4b3c7b5bb0f45438cc Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 13:52:08 +0100
Subject: [PATCH 16/47] Update Lib/test/test_perf_profiler.py

Co-authored-by: Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com>
---
 Lib/test/test_perf_profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 413c55bf912993..40450a79e116ab 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -35,7 +35,7 @@ def get_perf_version():
 if not version:
     raise unittest.SkipTest("Could not find valid perf tool")
 
-if "no-omit-frame-pointe" not in sysconfig.get_config_var("CFLAGS"):
+if "no-omit-frame-pointer" not in sysconfig.get_config_var("CFLAGS"):
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
 if check_sanitizer(address=True, memory=True, ub=True):

From 0a0e53db1acfba60ddbad767fc4097828a38b437 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 13:52:36 +0100
Subject: [PATCH 17/47] Rename perf macro

---
 Objects/perf_trampoline.c | 14 +++++++-------
 Python/sysmodule.c        |  2 +-
 configure                 |  2 +-
 configure.ac              |  2 +-
 pyconfig.h.in             |  6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 6fd2966b5d1eea..baebcd426b661a 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -128,7 +128,7 @@ needed.
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
 
 #include <fcntl.h>
 #include <stdio.h>
@@ -373,12 +373,12 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     // Something failed, fall back to the default evaluator.
     return _PyEval_EvalFrameDefault(ts, frame, throw);
 }
-#endif  // HAVE_PERF_TRAMPOLINE
+#endif  // _PY_HAVE_PERF_TRAMPOLINE
 
 int
 _PyIsPerfTrampolineActive(void)
 {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     return tstate->interp->eval_frame == py_trampoline_evaluator;
 #endif
@@ -390,7 +390,7 @@ _PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
                                trampoline_state_write write_state,
                                trampoline_state_free free_state)
 {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
         Py_FatalError("Trampoline state already initialized");
         return -1;
@@ -416,7 +416,7 @@ _PyPerfTrampoline_Init(int activate)
         tstate->interp->eval_frame = NULL;
     }
     else {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
         tstate->interp->eval_frame = py_trampoline_evaluator;
         if (new_code_arena() < 0) {
             return -1;
@@ -429,7 +429,7 @@ _PyPerfTrampoline_Init(int activate)
 int
 _PyPerfTrampoline_Fini(void)
 {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     free_code_arenas();
     trampoline_api.free_state(trampoline_api.state);
 #endif
@@ -439,7 +439,7 @@ _PyPerfTrampoline_Fini(void)
 PyStatus
 _PyPerfTrampoline_AfterFork_Child(void)
 {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     // close file in child.
     trampoline_api.free_state(trampoline_api.state);
 #endif
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 71c976ea3339b6..23ff254be522e5 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2037,7 +2037,7 @@ static PyObject *
 sys_is_perf_trampoline_active_impl(PyObject *module)
 /*[clinic end generated code: output=7bbf80001165b590 input=59f045e52c228654]*/
 {
-#ifdef HAVE_PERF_TRAMPOLINE
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     if (_PyIsPerfTrampolineActive()) {
         Py_RETURN_TRUE;
     }
diff --git a/configure b/configure
index c96ac527eb4bfc..30135532000cf2 100755
--- a/configure
+++ b/configure
@@ -11436,7 +11436,7 @@ $as_echo "$perf_trampoline" >&6; }
 if test "x$perf_trampoline" = xyes; then :
 
 
-$as_echo "#define HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
+$as_echo "#define _PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
 
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
 
diff --git a/configure.ac b/configure.ac
index 0e06d5b6fbadd7..438dd052ce45ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3436,7 +3436,7 @@ AS_CASE([$PLATFORM_TRIPLET],
 AC_MSG_RESULT([$perf_trampoline])
 
 AS_VAR_IF([perf_trampoline], [yes], [
-  AC_DEFINE([HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
+  AC_DEFINE([_PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
 ])
 AC_SUBST([PERF_TRAMPOLINE_OBJ])
diff --git a/pyconfig.h.in b/pyconfig.h.in
index f826d8983ea2ae..6ceefc7cf19ae3 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -872,9 +872,6 @@
 /* Define to 1 if you have the `pause' function. */
 #undef HAVE_PAUSE
 
-/* Define to 1 if you have the perf trampoline. */
-#undef HAVE_PERF_TRAMPOLINE
-
 /* Define to 1 if you have the `pipe' function. */
 #undef HAVE_PIPE
 
@@ -1802,6 +1799,9 @@
 /* framework name */
 #undef _PYTHONFRAMEWORK
 
+/* Define to 1 if you have the perf trampoline. */
+#undef _PY_HAVE_PERF_TRAMPOLINE
+
 /* Define to force use of thread-safe errno, h_errno, and other functions */
 #undef _REENTRANT
 

From 7ea33715b00ecdb2ed7cb4718b5ef3c474d24a5d Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 13:54:03 +0100
Subject: [PATCH 18/47] Fix some typos

---
 Objects/perf_trampoline.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index baebcd426b661a..e2a94b0e4ea683 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -5,9 +5,9 @@ Perf trampoline instrumentation
 
 This file contains instrumentation to allow to associate
 calls to the CPython eval loop back to the names of the Python
-fuctions and filename being executed.
+functions and filename being executed.
 
-Many natve performance profilers like the Linux perf tools are
+Many native performance profilers like the Linux perf tools are
 only available to 'see' the C stack when sampling from the profiled
 process. This means that if we have the following python code:
 
@@ -23,8 +23,8 @@ process. This means that if we have the following python code:
 
     baz(10000000)
 
-A performace profiler that is only able to see native frames will
-produce the following backtrace whe sampling from foo():
+A performance profiler that is only able to see native frames will
+produce the following backtrace when sampling from foo():
 
     _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
     _PyEval_Vector
@@ -57,9 +57,9 @@ associate the names of the Python functions and the filenames associated with
 those calls, rendering the results useless in the Python world.
 
 To fix this problem, we introduce the concept of a trampoline frame. A
-trampoline frame is a piece of code that is uniqued per Python code object that
+trampoline frame is a piece of code that is unique per Python code object that
 is executed before entering the CPython eval loop. This piece of code just
-calls the original Python evaluattion function (_PyEval_EvalFrameDefault) and
+calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
 forwards all the arguments received. In this way, when a profiler samples
 frames from the previous example it will see;
 
@@ -114,7 +114,7 @@ trampoline for a Python code object, we copy the assembly code into a mmaped
 area that has executable permissions and we return the start of that area as
 our trampoline function.
 
-Asking for a mmap-ed memory area for trampoline is very wastefull so we
+Asking for a mmap-ed memory area for trampoline is very wasteful so we
 allocate big arenas of memory in a single mmap call, we populate the entire
 arena with copies of the trampoline (this allows us to now have to invalidate
 the icache for the instructions in the page) and then we return the next

From 680db66ebe5c763712ffd367114a3d64cf017ce4 Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Mon, 22 Aug 2022 14:38:01 +0200
Subject: [PATCH 19/47] Improve perf profiler tests

- check for subprocess support
- look in PY_CORE_CFLAGS (contains custom CFLAGS, too)
- add frame pointers in debug builds
- check cheap stuff first
- workaround for Fedora
---
 Lib/test/test_perf_profiler.py | 45 +++++++++++++++-------------------
 configure                      |  6 +++++
 configure.ac                   |  5 ++++
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 40450a79e116ab..387f119d73d9d2 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -7,40 +7,35 @@
 from test import support
 from test.support.script_helper import make_script
 from test.support.os_helper import temp_dir
-from test.support import check_sanitizer
 
 
-def get_perf_version():
-    try:
-        cmd = ["perf", "version"]
-        proc = subprocess.run(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
-        )
-    except (subprocess.SubprocessError, OSError):
-        raise unittest.SkipTest("Couldn't find perf on the path")
-    
-    version = proc.stdout
-
-    match = re.search(r"^perf version\s+(.*)", version)
-    if match is None:
-        raise Exception("unable to parse perf version: %r" % version)
-    return (version, match.group(1))
-
 if not support.has_subprocess_support:
     raise unittest.SkipTest("test module requires subprocess")
 
-
-_, version = get_perf_version()
-
-if not version:
-    raise unittest.SkipTest("Could not find valid perf tool")
-
-if "no-omit-frame-pointer" not in sysconfig.get_config_var("CFLAGS"):
+if "no-omit-frame-pointer" not in sysconfig.get_config_var("PY_CORE_CFLAGS"):
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
-if check_sanitizer(address=True, memory=True, ub=True):
+if support.check_sanitizer(address=True, memory=True, ub=True):
     raise unittest.SkipTest("Perf unwinding doesn't work with sanitizers")
 
+def check_perf_command():
+    try:
+        cmd = ["perf", "--help"]
+        stdout = subprocess.check_output(
+            cmd, universal_newlines=True
+        )
+    except (subprocess.SubprocessError, OSError):
+        raise unittest.SkipTest("Couldn't find perf on the path")
+
+    # perf version does not return a version number on Fedora. Use presence
+    # of "perf.data" in help as indicator that it's perf from Linux tools.
+    if "perf.data" not in stdout:
+        raise unittest.SkipTest(
+            "perf command does not look like Linux tool perf"
+        )
+    
+check_perf_command()
+
 
 def run_perf(cwd, *args, **env_vars):
     if env_vars:
diff --git a/configure b/configure
index 30135532000cf2..05d0702cfa8b98 100755
--- a/configure
+++ b/configure
@@ -11440,6 +11440,12 @@ $as_echo "#define _PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
 
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
 
+    if test "x$Py_DEBUG" = xtrue; then :
+
+    as_fn_append BASECFLAGS " -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"
+
+fi
+
 fi
 
 
diff --git a/configure.ac b/configure.ac
index 438dd052ce45ac..a3449d143f5bac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3438,6 +3438,11 @@ AC_MSG_RESULT([$perf_trampoline])
 AS_VAR_IF([perf_trampoline], [yes], [
   AC_DEFINE([_PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+
+  dnl perf needs frame pointers for unwinding, include compiler option in debug builds
+  AS_VAR_IF([Py_DEBUG], [true], [
+    AS_VAR_APPEND([BASECFLAGS], [" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"])
+  ])
 ])
 AC_SUBST([PERF_TRAMPOLINE_OBJ])
 

From 1263a29ed8ed8b89140a36659b1076a477c0e7d9 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 14:00:48 +0100
Subject: [PATCH 20/47] Add guard for initialization

---
 Python/pylifecycle.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index ed05d0277bb2df..50c31b8a1830e1 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1150,6 +1150,8 @@ init_interp_main(PyThreadState *tstate)
             return _PyStatus_ERR("can't initialize tracemalloc");
         }
 
+
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
         if (_PyPerfTrampoline_SetCallbacks(
                 _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
             ) < 0 ||
@@ -1158,7 +1160,7 @@ init_interp_main(PyThreadState *tstate)
             ) < 0) {
             return _PyStatus_ERR("can't initialize the perf trampoline");
         }
-
+#endif
     }
 
     status = init_sys_streams(tstate);

From a42bde5a7179efe8960b8d0615e3afaaea40285f Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 14:02:24 +0100
Subject: [PATCH 21/47] Add acks

---
 .../2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst              | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
index dfd701fce8a932..2d039ad3e0fe7f 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
@@ -3,4 +3,5 @@ Add a new ``-X perf`` Python command line option as well as
 function in the :mod:`sys` module that allows to set/unset the interpreter in a
 way that the Linux ``perf`` profiler can detect Python calls. The new
 :func:`sys.is_perf_trampoline_active` function allows to query the state of the
-perf trampoline. Patch by Pablo Galindo.
+perf trampoline. Design by Pablo Galindo. Patch by Pablo Galindo and Christian Heimes
+with contributions from Gregory P. Smith and Mark Shannon.

From b780d2ae1b32388bfa4162f91326ada06d360d65 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 14:12:03 +0100
Subject: [PATCH 22/47] Initialize perf file lazily

---
 Objects/perf_trampoline.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index e2a94b0e4ea683..85eff9969e264c 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -398,11 +398,7 @@ _PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
     trampoline_api.init_state = init_state;
     trampoline_api.write_state = write_state;
     trampoline_api.free_state = free_state;
-    void *state = trampoline_api.init_state();
-    if (state == NULL) {
-        return -1;
-    }
-    trampoline_api.state = state;
+    trampoline_api.state = NULL;
     perf_status = PERF_STATUS_OK;
 #endif
     return 0;
@@ -421,8 +417,16 @@ _PyPerfTrampoline_Init(int activate)
         if (new_code_arena() < 0) {
             return -1;
         }
+        if (trampoline_api.state == NULL) {
+            void *state = trampoline_api.init_state();
+            if (state == NULL) {
+                return -1;
+            }
+            trampoline_api.state = state;
+        }
 #endif
     }
+    perf_status = PERF_STATUS_OK;
     return 0;
 }
 

From 04bf416e32a337382f8161070fd7fd4775f7de62 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 14:24:07 +0100
Subject: [PATCH 23/47] Address review comments

---
 Objects/perf_trampoline.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 85eff9969e264c..2b003e1a7fdd15 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -121,6 +121,12 @@ the icache for the instructions in the page) and then we return the next
 available chunk every time someone asks for a new trampoline. We keep a linked
 list of arenas in case the current memory arena is exhausted and another one is
 needed.
+
+For the best results, Python should be compiled with
+CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
+profilers to unwind using only the frame pointer and not on DWARF debug
+information (note that as trampolines are dynamically generated there won't be
+any DWARF information available for them).
 */
 
 #include "Python.h"
@@ -128,6 +134,12 @@ needed.
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 
+typedef enum {
+    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
+    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
+    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
+} perf_status_t;
+
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
 
 #include <fcntl.h>
@@ -157,12 +169,6 @@ struct code_arena_st {
         *prev;  // Pointer to the arena  or NULL if this is the first arena.
 };
 
-typedef enum {
-    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
-    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
-    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
-} perf_status_t;
-
 typedef struct code_arena_st code_arena_t;
 
 struct trampoline_api_st {
@@ -283,6 +289,7 @@ new_code_arena(void)
         _PyErr_WriteUnraisableMsg(
             "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
             NULL);
+        retur - 1;
     }
 
     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));

From 7558df2f57f551285f4b3b2270d923c316a81170 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 16:41:39 +0100
Subject: [PATCH 24/47] Complain if there is already an evaluator frame when
 deactivating/activating

---
 Lib/test/test_perf_profiler.py | 13 ++++++++++++-
 Objects/perf_trampoline.c      | 11 +++++++++--
 Python/sysmodule.c             |  2 --
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 387f119d73d9d2..ba3ccc5ac241a5 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -12,7 +12,13 @@
 if not support.has_subprocess_support:
     raise unittest.SkipTest("test module requires subprocess")
 
-if "no-omit-frame-pointer" not in sysconfig.get_config_var("PY_CORE_CFLAGS"):
+def is_unwinding_realiable():
+    cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
+    if not cflags:
+        return False
+    return "no-omit-frame-pointer" in cflags
+
+if not is_unwinding_realiable():
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
 if support.check_sanitizer(address=True, memory=True, ub=True):
@@ -51,12 +57,17 @@ def run_perf(cwd, *args, **env_vars):
         stderr=subprocess.PIPE,
         env=env,
     )
+    if proc.returncode:
+        print(proc.stderr)
+        raise ValueError(f"Perf failed with return code {proc.returncode}")
+
     base_cmd = ("perf", "script")
     proc = subprocess.run(
         ("perf", "script", "-i", output_file),
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         env=env,
+        check=True,
     )
     return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode("utf-8", "replace")
 
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 2b003e1a7fdd15..f20afe46140c2c 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -289,7 +289,7 @@ new_code_arena(void)
         _PyErr_WriteUnraisableMsg(
             "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
             NULL);
-        retur - 1;
+        return -1;
     }
 
     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
@@ -415,6 +415,13 @@ int
 _PyPerfTrampoline_Init(int activate)
 {
     PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame &&
+        tstate->interp->eval_frame != py_trampoline_evaluator) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Trampoline cannot be initialized as a custom eval "
+                        "frame is already present");
+        return -1;
+    }
     if (!activate) {
         tstate->interp->eval_frame = NULL;
     }
@@ -431,9 +438,9 @@ _PyPerfTrampoline_Init(int activate)
             }
             trampoline_api.state = state;
         }
+        perf_status = PERF_STATUS_OK;
 #endif
     }
-    perf_status = PERF_STATUS_OK;
     return 0;
 }
 
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 23ff254be522e5..55b026c0b73574 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2046,8 +2046,6 @@ sys_is_perf_trampoline_active_impl(PyObject *module)
 }
 
 
-
-
 static PyMethodDef sys_methods[] = {
     /* Might as well keep this in alphabetic order */
     SYS_ADDAUDITHOOK_METHODDEF

From d1ebc88c19c835478e13c1883e230b8242e9eed5 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 17:12:13 +0100
Subject: [PATCH 25/47] Fix some errors on CI

---
 Lib/test/test_perf_profiler.py | 43 +++++++++++++++++++++++++++-------
 Objects/perf_trampoline.c      |  4 ++--
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index ba3ccc5ac241a5..aa61c1c1effacf 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -12,34 +12,59 @@
 if not support.has_subprocess_support:
     raise unittest.SkipTest("test module requires subprocess")
 
+
 def is_unwinding_realiable():
     cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
     if not cflags:
         return False
     return "no-omit-frame-pointer" in cflags
 
+
 if not is_unwinding_realiable():
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
 if support.check_sanitizer(address=True, memory=True, ub=True):
     raise unittest.SkipTest("Perf unwinding doesn't work with sanitizers")
 
+
 def check_perf_command():
     try:
         cmd = ["perf", "--help"]
-        stdout = subprocess.check_output(
-            cmd, universal_newlines=True
-        )
+        stdout = subprocess.check_output(cmd, universal_newlines=True)
     except (subprocess.SubprocessError, OSError):
         raise unittest.SkipTest("Couldn't find perf on the path")
 
     # perf version does not return a version number on Fedora. Use presence
     # of "perf.data" in help as indicator that it's perf from Linux tools.
     if "perf.data" not in stdout:
-        raise unittest.SkipTest(
-            "perf command does not look like Linux tool perf"
-        )
-    
+        raise unittest.SkipTest("perf command does not look like Linux tool perf")
+
+    # Check that we can run a simple perf run
+    with temp_dir() as script_dir:
+        try:
+            output_file = script_dir + "/perf_output.perf"
+            cmd = (
+                "perf",
+                "record",
+                "-g",
+                "--call-graph=fp",
+                "-o",
+                output_file,
+                "--",
+                sys.executable,
+                "-c",
+                'print("hello")',
+            )
+            stdout = subprocess.check_output(
+                cmd, cwd=script_dir, universal_newlines=True, stderr=subprocess.STDOUT
+            )
+        except (subprocess.SubprocessError, OSError):
+            raise unittest.SkipTest("Couldn't run perf on simple script")
+
+        if "hello" not in stdout:
+            raise unittest.SkipTest("perf run did not work correctly")
+
+
 check_perf_command()
 
 
@@ -69,7 +94,9 @@ def run_perf(cwd, *args, **env_vars):
         env=env,
         check=True,
     )
-    return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode("utf-8", "replace")
+    return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
+        "utf-8", "replace"
+    )
 
 
 class TestPerfProfiler(unittest.TestCase):
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index f20afe46140c2c..238bb1a1fee5b9 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -414,6 +414,7 @@ _PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
 int
 _PyPerfTrampoline_Init(int activate)
 {
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     if (tstate->interp->eval_frame &&
         tstate->interp->eval_frame != py_trampoline_evaluator) {
@@ -426,7 +427,6 @@ _PyPerfTrampoline_Init(int activate)
         tstate->interp->eval_frame = NULL;
     }
     else {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
         tstate->interp->eval_frame = py_trampoline_evaluator;
         if (new_code_arena() < 0) {
             return -1;
@@ -439,8 +439,8 @@ _PyPerfTrampoline_Init(int activate)
             trampoline_api.state = state;
         }
         perf_status = PERF_STATUS_OK;
-#endif
     }
+#endif
     return 0;
 }
 

From a83a31bb3b2a60bc17bedd2a98fa6b20382b5d5d Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Mon, 22 Aug 2022 20:23:18 +0200
Subject: [PATCH 26/47] Reorder arguments to speed up trampoline

---
 Objects/asm_trampoline.S  | 21 ++++++---------------
 Objects/perf_trampoline.c | 10 +++++++---
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
index 4151af9d331ab2..460707717df003 100644
--- a/Objects/asm_trampoline.S
+++ b/Objects/asm_trampoline.S
@@ -2,21 +2,16 @@
     .globl	_Py_trampoline_func_start
 # The following assembly is equivalent to:
 # PyObject *
-# trampoline(py_evaluator evaluator, PyThreadState *ts,
-#            _PyInterpreterFrame *f, int throwflag)
+# trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
+#            int throwflag, py_evaluator evaluator)
 # {
 #     return evaluator(ts, f, throwflag);
 # }
 _Py_trampoline_func_start:
 #ifdef __x86_64__
-    push   %rbp
-    mov    %rsp,%rbp
-    mov    %rdi,%rax
-    mov    %rsi,%rdi
-    mov    %rdx,%rsi
-    mov    %ecx,%edx
-    call   *%rax
-    pop    %rbp
+    sub    $8, %rsp
+    call    *%rcx
+    add    $8, %rsp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
@@ -24,11 +19,7 @@ _Py_trampoline_func_start:
     // generate with aarch64-linux-gnu-gcc 12.1
     stp     x29, x30, [sp, -16]!
     mov     x29, sp
-    mov     x4, x0
-    mov     x0, x1
-    mov     x1, x2
-    mov     w2, w3
-    blr     x4
+    blr     x3
     ldp     x29, x30, [sp], 16
     ret
 #endif
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 238bb1a1fee5b9..371093660be888 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -149,10 +149,14 @@ typedef enum {
 #include <sys/types.h>
 #include <unistd.h>
 
+/* The function pointer is passed as last argument. The other three arguments
+ * are passed in the same order as the function requires. This results in
+ * shorter, more efficient ASM code for trampoline.
+ */
 typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                   int throwflag);
-typedef PyObject *(*py_trampoline)(py_evaluator, PyThreadState *,
-                                   _PyInterpreterFrame *, int throwflag);
+typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *,
+                                   int, py_evaluator);
 
 extern void *_Py_trampoline_func_start;  // Start of the template of the
                                          // assembly trampoline
@@ -375,7 +379,7 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
         f = new_trampoline;
     }
     assert(f != NULL);
-    return f(_PyEval_EvalFrameDefault, ts, frame, throw);
+    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
 default_eval:
     // Something failed, fall back to the default evaluator.
     return _PyEval_EvalFrameDefault(ts, frame, throw);

From 0febd84a22d63df4f62bb7500b0d4861718b2640 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 20:28:46 +0100
Subject: [PATCH 27/47] Preserve frame pointer

---
 Objects/asm_trampoline.S | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
index 460707717df003..2c12f683eeb68c 100644
--- a/Objects/asm_trampoline.S
+++ b/Objects/asm_trampoline.S
@@ -9,9 +9,10 @@
 # }
 _Py_trampoline_func_start:
 #ifdef __x86_64__
-    sub    $8, %rsp
+    pushq   %rbp
+    movq    %rsp, %rbp
     call    *%rcx
-    add    $8, %rsp
+    popq    %rbp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)

From dc5a6a5fafe1878ae7172604fc46d4ea2f068ad1 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 20:57:10 +0100
Subject: [PATCH 28/47] Support perf backend and better handle forks

---
 Objects/perf_trampoline.c   | 21 +++++++----
 Python/clinic/sysmodule.c.h | 70 +++++++++++++++++++++++++------------
 Python/pylifecycle.c        | 13 ++++---
 Python/sysmodule.c          | 41 +++++++++++++++-------
 4 files changed, 96 insertions(+), 49 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 371093660be888..33467bf3f7d0d9 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -155,8 +155,8 @@ typedef enum {
  */
 typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                   int throwflag);
-typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *,
-                                   int, py_evaluator);
+typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
+                                   py_evaluator);
 
 extern void *_Py_trampoline_func_start;  // Start of the template of the
                                          // assembly trampoline
@@ -403,8 +403,7 @@ _PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
-        Py_FatalError("Trampoline state already initialized");
-        return -1;
+        _PyPerfTrampoline_Fini();
     }
     trampoline_api.init_state = init_state;
     trampoline_api.write_state = write_state;
@@ -453,7 +452,11 @@ _PyPerfTrampoline_Fini(void)
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
     free_code_arenas();
-    trampoline_api.free_state(trampoline_api.state);
+    if (trampoline_api.state) {
+        trampoline_api.free_state(trampoline_api.state);
+        trampoline_api.state = NULL;
+    }
+    extra_code_index = -1;
 #endif
     return 0;
 }
@@ -462,8 +465,12 @@ PyStatus
 _PyPerfTrampoline_AfterFork_Child(void)
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
-    // close file in child.
-    trampoline_api.free_state(trampoline_api.state);
+    // Restart trampoline in file in child.
+    int was_active = _PyIsPerfTrampolineActive();
+    _PyPerfTrampoline_Fini();
+    if (was_active) {
+        _PyPerfTrampoline_Init(1);
+    }
 #endif
     return PyStatus_Ok();
 }
diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h
index 272da518f70660..08e325774713d6 100644
--- a/Python/clinic/sysmodule.c.h
+++ b/Python/clinic/sysmodule.c.h
@@ -1127,58 +1127,84 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
 
 #endif /* defined(ANDROID_API_LEVEL) */
 
-PyDoc_STRVAR(sys_activate_perf_trampoline__doc__,
-"activate_perf_trampoline($module, /)\n"
+PyDoc_STRVAR(sys_activate_stack_trampoline__doc__,
+"activate_stack_trampoline($module, backend=\'perf\', /)\n"
 "--\n"
 "\n"
 "Activate the perf profiler trampoline.");
 
-#define SYS_ACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
-    {"activate_perf_trampoline", (PyCFunction)sys_activate_perf_trampoline, METH_NOARGS, sys_activate_perf_trampoline__doc__},
+#define SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF    \
+    {"activate_stack_trampoline", _PyCFunction_CAST(sys_activate_stack_trampoline), METH_FASTCALL, sys_activate_stack_trampoline__doc__},
 
 static PyObject *
-sys_activate_perf_trampoline_impl(PyObject *module);
+sys_activate_stack_trampoline_impl(PyObject *module, const char *backend);
 
 static PyObject *
-sys_activate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+sys_activate_stack_trampoline(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
 {
-    return sys_activate_perf_trampoline_impl(module);
+    PyObject *return_value = NULL;
+    const char *backend = "perf";
+
+    if (!_PyArg_CheckPositional("activate_stack_trampoline", nargs, 0, 1)) {
+        goto exit;
+    }
+    if (nargs < 1) {
+        goto skip_optional;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("activate_stack_trampoline", "argument 1", "str", args[0]);
+        goto exit;
+    }
+    Py_ssize_t backend_length;
+    backend = PyUnicode_AsUTF8AndSize(args[0], &backend_length);
+    if (backend == NULL) {
+        goto exit;
+    }
+    if (strlen(backend) != (size_t)backend_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+skip_optional:
+    return_value = sys_activate_stack_trampoline_impl(module, backend);
+
+exit:
+    return return_value;
 }
 
-PyDoc_STRVAR(sys_deactivate_perf_trampoline__doc__,
-"deactivate_perf_trampoline($module, /)\n"
+PyDoc_STRVAR(sys_deactivate_stack_trampoline__doc__,
+"deactivate_stack_trampoline($module, /)\n"
 "--\n"
 "\n"
 "Dectivate the perf profiler trampoline.");
 
-#define SYS_DEACTIVATE_PERF_TRAMPOLINE_METHODDEF    \
-    {"deactivate_perf_trampoline", (PyCFunction)sys_deactivate_perf_trampoline, METH_NOARGS, sys_deactivate_perf_trampoline__doc__},
+#define SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF    \
+    {"deactivate_stack_trampoline", (PyCFunction)sys_deactivate_stack_trampoline, METH_NOARGS, sys_deactivate_stack_trampoline__doc__},
 
 static PyObject *
-sys_deactivate_perf_trampoline_impl(PyObject *module);
+sys_deactivate_stack_trampoline_impl(PyObject *module);
 
 static PyObject *
-sys_deactivate_perf_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+sys_deactivate_stack_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
 {
-    return sys_deactivate_perf_trampoline_impl(module);
+    return sys_deactivate_stack_trampoline_impl(module);
 }
 
-PyDoc_STRVAR(sys_is_perf_trampoline_active__doc__,
-"is_perf_trampoline_active($module, /)\n"
+PyDoc_STRVAR(sys_is_stack_trampoline_active__doc__,
+"is_stack_trampoline_active($module, /)\n"
 "--\n"
 "\n"
 "Returns *True* if the perf profiler trampoline is active.");
 
-#define SYS_IS_PERF_TRAMPOLINE_ACTIVE_METHODDEF    \
-    {"is_perf_trampoline_active", (PyCFunction)sys_is_perf_trampoline_active, METH_NOARGS, sys_is_perf_trampoline_active__doc__},
+#define SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF    \
+    {"is_stack_trampoline_active", (PyCFunction)sys_is_stack_trampoline_active, METH_NOARGS, sys_is_stack_trampoline_active__doc__},
 
 static PyObject *
-sys_is_perf_trampoline_active_impl(PyObject *module);
+sys_is_stack_trampoline_active_impl(PyObject *module);
 
 static PyObject *
-sys_is_perf_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
+sys_is_stack_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
 {
-    return sys_is_perf_trampoline_active_impl(module);
+    return sys_is_stack_trampoline_active_impl(module);
 }
 
 #ifndef SYS_GETWINDOWSVERSION_METHODDEF
@@ -1224,4 +1250,4 @@ sys_is_perf_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
 #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF
     #define SYS_GETANDROIDAPILEVEL_METHODDEF
 #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */
-/*[clinic end generated code: output=4b43e2be96492326 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=9fd2e37197f55a7f input=a9049054013a1b77]*/
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 50c31b8a1830e1..44952c9b83c5b8 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1152,13 +1152,12 @@ init_interp_main(PyThreadState *tstate)
 
 
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
-        if (_PyPerfTrampoline_SetCallbacks(
-                _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
-            ) < 0 ||
-            _PyPerfTrampoline_Init(
-                config->perf_profiling
-            ) < 0) {
-            return _PyStatus_ERR("can't initialize the perf trampoline");
+        if (config->perf_profiling) {
+            if (_PyPerfTrampoline_SetCallbacks(
+                    _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
+                ) < 0 || _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
+                return _PyStatus_ERR("can't initialize the perf trampoline");
+            }
         }
 #endif
     }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 55b026c0b73574..ee1d05619cbe71 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1995,15 +1995,30 @@ sys_getandroidapilevel_impl(PyObject *module)
 #endif   /* ANDROID_API_LEVEL */
 
 /*[clinic input]
-sys.activate_perf_trampoline
+sys.activate_stack_trampoline
+
+    backend: str = "perf"
+    /
 
 Activate the perf profiler trampoline.
 [clinic start generated code]*/
 
 static PyObject *
-sys_activate_perf_trampoline_impl(PyObject *module)
-/*[clinic end generated code: output=7f97c60d4f580b85 input=666a2d744a97a220]*/
-{
+sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
+/*[clinic end generated code: output=5783cdeb51874b43 input=58d7244062b933a8]*/
+{
+    if (strcmp(backend, "perf") == 0) {
+        if (_PyPerfTrampoline_SetCallbacks(
+                _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
+            ) < 0 ) {
+            PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
+            return NULL;
+        }
+    }
+    else {
+        PyErr_Format(PyExc_ValueError, "unsuported invalid backend: %s", backend);
+        return NULL;
+    }
     if (_PyPerfTrampoline_Init(1) < 0) {
         return NULL;
     }
@@ -2012,14 +2027,14 @@ sys_activate_perf_trampoline_impl(PyObject *module)
 
 
 /*[clinic input]
-sys.deactivate_perf_trampoline
+sys.deactivate_stack_trampoline
 
 Dectivate the perf profiler trampoline.
 [clinic start generated code]*/
 
 static PyObject *
-sys_deactivate_perf_trampoline_impl(PyObject *module)
-/*[clinic end generated code: output=5ba2f93711f85b6e input=d85cf6e3cd37d81e]*/
+sys_deactivate_stack_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=b50da25465df0ef1 input=491f4fc1ed615736]*/
 {
     if  (_PyPerfTrampoline_Init(0) < 0) {
         return NULL;
@@ -2028,14 +2043,14 @@ sys_deactivate_perf_trampoline_impl(PyObject *module)
 }
 
 /*[clinic input]
-sys.is_perf_trampoline_active
+sys.is_stack_trampoline_active
 
 Returns *True* if the perf profiler trampoline is active.
 [clinic start generated code]*/
 
 static PyObject *
-sys_is_perf_trampoline_active_impl(PyObject *module)
-/*[clinic end generated code: output=7bbf80001165b590 input=59f045e52c228654]*/
+sys_is_stack_trampoline_active_impl(PyObject *module)
+/*[clinic end generated code: output=ab2746de0ad9d293 input=061fa5776ac9dd59]*/
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
     if (_PyIsPerfTrampolineActive()) {
@@ -2099,9 +2114,9 @@ static PyMethodDef sys_methods[] = {
      METH_VARARGS | METH_KEYWORDS, set_asyncgen_hooks_doc},
     SYS_GET_ASYNCGEN_HOOKS_METHODDEF
     SYS_GETANDROIDAPILEVEL_METHODDEF
-    SYS_ACTIVATE_PERF_TRAMPOLINE_METHODDEF
-    SYS_IS_PERF_TRAMPOLINE_ACTIVE_METHODDEF
-    SYS_DEACTIVATE_PERF_TRAMPOLINE_METHODDEF
+    SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF
+    SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF
+    SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF
     SYS_UNRAISABLEHOOK_METHODDEF
 #ifdef Py_STATS
     SYS__STATS_ON_METHODDEF

From be72b920030909e321bc1aa964a17827e57e6a36 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 21:26:34 +0100
Subject: [PATCH 29/47] Fix more fork problems

---
 Objects/perf_trampoline.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 33467bf3f7d0d9..4200cdb8484f1a 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -228,12 +228,13 @@ int
 _Py_perf_map_close(void *state)
 {
     FILE *fp = (FILE *)state;
+    int ret = 0;
     if (fp) {
-        return fclose(fp);
+        ret = fclose(fp);
     }
     perf_map_file = NULL;
     perf_status = PERF_STATUS_NO_INIT;
-    return 0;
+    return ret;
 }
 
 void
@@ -361,8 +362,11 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     }
     PyCodeObject *co = frame->f_code;
     py_trampoline f = NULL;
-    _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
-    if (f == NULL) {
+    int ret = -1;
+    if (extra_code_index != -1) {
+        ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
+    }
+    if (ret != 0 || f == NULL) {
         // This is the first time we see this code object so we need
         // to compile a trampoline for it.
         if (extra_code_index == -1) {
@@ -452,7 +456,7 @@ _PyPerfTrampoline_Fini(void)
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
     free_code_arenas();
-    if (trampoline_api.state) {
+    if (trampoline_api.state != NULL) {
         trampoline_api.free_state(trampoline_api.state);
         trampoline_api.state = NULL;
     }

From b5739f4897588d91bf9825e677e1e458d90222e5 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 21:26:54 +0100
Subject: [PATCH 30/47] Update Lib/test/test_perf_profiler.py

Co-authored-by: chalggg <mmichal601@gmail.com>
---
 Lib/test/test_perf_profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index aa61c1c1effacf..8afc3b3a135fb2 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -13,7 +13,7 @@
     raise unittest.SkipTest("test module requires subprocess")
 
 
-def is_unwinding_realiable():
+def is_unwinding_reliable():
     cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
     if not cflags:
         return False

From 04c0c142a814b3f54fa673de97faec8596925aef Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 21:37:01 +0100
Subject: [PATCH 31/47] Handle missing backends

---
 Python/sysmodule.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index ee1d05619cbe71..628679fbe61e49 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2008,12 +2008,17 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 /*[clinic end generated code: output=5783cdeb51874b43 input=58d7244062b933a8]*/
 {
     if (strcmp(backend, "perf") == 0) {
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
         if (_PyPerfTrampoline_SetCallbacks(
                 _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
             ) < 0 ) {
             PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
             return NULL;
         }
+#else
+        PyErr_SetString(PyExc_ValueError, "perf trampoline not available");
+        return NULL;
+#endif
     }
     else {
         PyErr_Format(PyExc_ValueError, "unsuported invalid backend: %s", backend);

From e810ce68bae6910f7b9c385802b7127c89c11b9e Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 21:37:37 +0100
Subject: [PATCH 32/47] Update Lib/test/test_perf_profiler.py

Co-authored-by: chalggg <mmichal601@gmail.com>
---
 Lib/test/test_perf_profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 8afc3b3a135fb2..2c0c495029692b 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -20,7 +20,7 @@ def is_unwinding_reliable():
     return "no-omit-frame-pointer" in cflags
 
 
-if not is_unwinding_realiable():
+if not is_unwinding_reliable():
     raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
 
 if support.check_sanitizer(address=True, memory=True, ub=True):

From bc8bf4ec3a15b6f38ca4a51fb0ca130d055d1d70 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 22 Aug 2022 23:39:32 +0100
Subject: [PATCH 33/47] clean up perf files

---
 Lib/test/test_perf_profiler.py | 53 ++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 2c0c495029692b..a5c66e00b99b94 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -1,9 +1,9 @@
 import unittest
 import subprocess
-import re
 import sys
 import sysconfig
 import os
+import pathlib
 from test import support
 from test.support.script_helper import make_script
 from test.support.os_helper import temp_dir
@@ -76,27 +76,36 @@ def run_perf(cwd, *args, **env_vars):
         env = None
     output_file = cwd + "/perf_output.perf"
     base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
-    proc = subprocess.run(
-        base_cmd + args,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        env=env,
-    )
-    if proc.returncode:
-        print(proc.stderr)
-        raise ValueError(f"Perf failed with return code {proc.returncode}")
-
-    base_cmd = ("perf", "script")
-    proc = subprocess.run(
-        ("perf", "script", "-i", output_file),
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        env=env,
-        check=True,
-    )
-    return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
-        "utf-8", "replace"
-    )
+    prev_perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+    try:
+        proc = subprocess.run(
+            base_cmd + args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+        )
+        if proc.returncode:
+            print(proc.stderr)
+            raise ValueError(f"Perf failed with return code {proc.returncode}")
+
+        base_cmd = ("perf", "script")
+        proc = subprocess.run(
+            ("perf", "script", "-i", output_file),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env,
+            check=True,
+        )
+        return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
+            "utf-8", "replace"
+        )
+    finally:
+        # Clean up the perf map file at the end
+        files_to_delete = (
+            set(pathlib.Path("/tmp/").glob("perf-*.map")) - prev_perf_files
+        )
+        for file in files_to_delete:
+            file.unlink()
 
 
 class TestPerfProfiler(unittest.TestCase):

From 0252845bd2fade29686fe77ef855184de1bc48fa Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 00:09:08 +0100
Subject: [PATCH 34/47] Update Misc/NEWS.d/next/Core and
 Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst

---
 .../2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
index 2d039ad3e0fe7f..1b6f60862ddc60 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
@@ -4,4 +4,4 @@ function in the :mod:`sys` module that allows to set/unset the interpreter in a
 way that the Linux ``perf`` profiler can detect Python calls. The new
 :func:`sys.is_perf_trampoline_active` function allows to query the state of the
 perf trampoline. Design by Pablo Galindo. Patch by Pablo Galindo and Christian Heimes
-with contributions from Gregory P. Smith and Mark Shannon.
+with contributions from Gregory P. Smith [Google] and Mark Shannon.

From 264bed72d5582880731fb7c69ca37c00f0b49985 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 13:11:39 +0100
Subject: [PATCH 35/47] Test fork support, fix some fork problems and improve
 test file

---
 Lib/test/test_perf_profiler.py | 190 ++++++++++++++++++++++++++-------
 Makefile.pre.in                |   1 +
 Objects/perf_trampoline.c      |  17 +--
 configure                      |   4 +
 configure.ac                   |   3 +
 5 files changed, 167 insertions(+), 48 deletions(-)

diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index a5c66e00b99b94..35c85dc68fe513 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -13,6 +13,116 @@
     raise unittest.SkipTest("test module requires subprocess")
 
 
+def supports_trampoline_profiling():
+    perf_trampoline = sysconfig.get_config_var("PERF_TRAMPOLINE_SUPPORT")
+    if not perf_trampoline:
+        return False
+    return int(perf_trampoline) == 1
+
+
+if not supports_trampoline_profiling():
+    raise unittest.SkipTest("perf trampoline profiling not supported")
+
+
+class TestPerfTrampoline(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        files_to_delete = (
+            set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
+        )
+        for file in files_to_delete:
+            file.unlink()
+
+    def test_trampoline_works(self):
+        code = """if 1:
+                def foo():
+                    pass
+
+                def bar():
+                    foo()
+
+                def baz():
+                    bar()
+
+                baz()
+                """
+        with subprocess.Popen(
+            [sys.executable, "-Xperf", "-c", code],
+            universal_newlines=True,
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+        ) as process:
+            stdout, stderr = process.communicate()
+
+        self.assertEqual(process.returncode, 0)
+        self.assertEqual(stderr, "")
+        self.assertEqual(stdout, "")
+        perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+        self.assertTrue(perf_file.exists())
+
+    def test_trampoline_works_with_forks(self):
+        code = """if 1:
+                import os, sys
+
+                def foo_fork():
+                    pass
+
+                def bar_fork():
+                    foo_fork()
+
+                def baz_fork():
+                    bar_fork()
+
+                def foo():
+                    pid = os.fork()
+                    if pid == 0:
+                        print(os.getpid())
+                        baz_fork()
+                    else:
+                        _, status = os.waitpid(-1, 0)
+                        sys.exit(status)
+
+                def bar():
+                    foo()
+
+                def baz():
+                    bar()
+
+                baz()
+                """
+        with temp_dir() as script_dir:
+            script = make_script(script_dir, "perftest", code)
+            with subprocess.Popen(
+                [sys.executable, "-Xperf", script],
+                universal_newlines=True,
+                stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            ) as process:
+                stdout, stderr = process.communicate()
+
+        self.assertEqual(process.returncode, 0)
+        self.assertEqual(stderr, "")
+        child_pid = int(stdout.strip())
+        perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+        perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map")
+        self.assertTrue(perf_file.exists())
+        self.assertTrue(perf_child_file.exists())
+
+        perf_file_contents = perf_file.read_text()
+        self.assertIn(f"py::foo:{script}", perf_file_contents)
+        self.assertIn(f"py::bar:{script}", perf_file_contents)
+        self.assertIn(f"py::baz:{script}", perf_file_contents)
+
+        child_perf_file_contents = perf_child_file.read_text()
+        self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents)
+        self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
+        self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents)
+
+
 def is_unwinding_reliable():
     cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
     if not cflags:
@@ -20,24 +130,17 @@ def is_unwinding_reliable():
     return "no-omit-frame-pointer" in cflags
 
 
-if not is_unwinding_reliable():
-    raise unittest.SkipTest("Unwinding without frame pointer is unreliable")
-
-if support.check_sanitizer(address=True, memory=True, ub=True):
-    raise unittest.SkipTest("Perf unwinding doesn't work with sanitizers")
-
-
-def check_perf_command():
+def perf_command_works():
     try:
         cmd = ["perf", "--help"]
         stdout = subprocess.check_output(cmd, universal_newlines=True)
     except (subprocess.SubprocessError, OSError):
-        raise unittest.SkipTest("Couldn't find perf on the path")
+        return False
 
     # perf version does not return a version number on Fedora. Use presence
     # of "perf.data" in help as indicator that it's perf from Linux tools.
     if "perf.data" not in stdout:
-        raise unittest.SkipTest("perf command does not look like Linux tool perf")
+        return False
 
     # Check that we can run a simple perf run
     with temp_dir() as script_dir:
@@ -59,13 +162,12 @@ def check_perf_command():
                 cmd, cwd=script_dir, universal_newlines=True, stderr=subprocess.STDOUT
             )
         except (subprocess.SubprocessError, OSError):
-            raise unittest.SkipTest("Couldn't run perf on simple script")
+            return False
 
         if "hello" not in stdout:
-            raise unittest.SkipTest("perf run did not work correctly")
-
+            return False
 
-check_perf_command()
+    return True
 
 
 def run_perf(cwd, *args, **env_vars):
@@ -76,39 +178,45 @@ def run_perf(cwd, *args, **env_vars):
         env = None
     output_file = cwd + "/perf_output.perf"
     base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
-    prev_perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
-    try:
-        proc = subprocess.run(
-            base_cmd + args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env,
-        )
-        if proc.returncode:
-            print(proc.stderr)
-            raise ValueError(f"Perf failed with return code {proc.returncode}")
+    proc = subprocess.run(
+        base_cmd + args,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env=env,
+    )
+    if proc.returncode:
+        print(proc.stderr)
+        raise ValueError(f"Perf failed with return code {proc.returncode}")
 
-        base_cmd = ("perf", "script")
-        proc = subprocess.run(
-            ("perf", "script", "-i", output_file),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env,
-            check=True,
-        )
-        return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
-            "utf-8", "replace"
-        )
-    finally:
-        # Clean up the perf map file at the end
+    base_cmd = ("perf", "script")
+    proc = subprocess.run(
+        ("perf", "script", "-i", output_file),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env=env,
+        check=True,
+    )
+    return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
+        "utf-8", "replace"
+    )
+
+
+@unittest.skipUnless(perf_command_works(), "perf command doesn't work")
+@unittest.skipUnless(is_unwinding_reliable(), "Unwinding is unreliable")
+@support.skip_if_sanitizer(address=True, memory=True, ub=True)
+class TestPerfProfiler(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+
+    def tearDown(self) -> None:
+        super().tearDown()
         files_to_delete = (
-            set(pathlib.Path("/tmp/").glob("perf-*.map")) - prev_perf_files
+            set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
         )
         for file in files_to_delete:
             file.unlink()
 
-
-class TestPerfProfiler(unittest.TestCase):
     def test_python_calls_appear_in_the_stack_if_perf_activated(self):
         with temp_dir() as script_dir:
             code = """if 1:
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 027aefcb61312d..a70681a876afc1 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -54,6 +54,7 @@ DTRACE=         @DTRACE@
 DFLAGS=         @DFLAGS@
 DTRACE_HEADERS= @DTRACE_HEADERS@
 DTRACE_OBJS=    @DTRACE_OBJS@
+PERF_TRAMPOLINE_SUPPORT= @PERF_TRAMPOLINE_SUPPORT@
 
 GNULD=		@GNULD@
 
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 4200cdb8484f1a..0e81b9235e424d 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -362,16 +362,11 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     }
     PyCodeObject *co = frame->f_code;
     py_trampoline f = NULL;
-    int ret = -1;
-    if (extra_code_index != -1) {
-        ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
-    }
+    assert(extra_code_index != -1);
+    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
     if (ret != 0 || f == NULL) {
         // This is the first time we see this code object so we need
         // to compile a trampoline for it.
-        if (extra_code_index == -1) {
-            extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
-        }
         py_trampoline new_trampoline = compile_trampoline();
         if (new_trampoline == NULL) {
             goto default_eval;
@@ -445,6 +440,10 @@ _PyPerfTrampoline_Init(int activate)
             }
             trampoline_api.state = state;
         }
+        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
+        if (extra_code_index == -1) {
+            return -1;
+        }
         perf_status = PERF_STATUS_OK;
     }
 #endif
@@ -455,6 +454,10 @@ int
 _PyPerfTrampoline_Fini(void)
 {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
+        tstate->interp->eval_frame = NULL;
+    }
     free_code_arenas();
     if (trampoline_api.state != NULL) {
         trampoline_api.free_state(trampoline_api.state);
diff --git a/configure b/configure
index 05d0702cfa8b98..7fbef76a826097 100755
--- a/configure
+++ b/configure
@@ -862,6 +862,7 @@ TZPATH
 LIBUUID_LIBS
 LIBUUID_CFLAGS
 PERF_TRAMPOLINE_OBJ
+PERF_TRAMPOLINE_SUPPORT
 SHLIBS
 CFLAGSFORSHARED
 LINKFORSHARED
@@ -11433,12 +11434,15 @@ esac
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $perf_trampoline" >&5
 $as_echo "$perf_trampoline" >&6; }
 
+
+PERF_TRAMPOLINE_SUPPORT=
 if test "x$perf_trampoline" = xyes; then :
 
 
 $as_echo "#define _PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
 
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+  PERF_TRAMPOLINE_SUPPORT=1
 
     if test "x$Py_DEBUG" = xtrue; then :
 
diff --git a/configure.ac b/configure.ac
index a3449d143f5bac..84ae7235accc61 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3435,9 +3435,12 @@ AS_CASE([$PLATFORM_TRIPLET],
 )
 AC_MSG_RESULT([$perf_trampoline])
 
+AC_SUBST(PERF_TRAMPOLINE_SUPPORT)
+PERF_TRAMPOLINE_SUPPORT=
 AS_VAR_IF([perf_trampoline], [yes], [
   AC_DEFINE([_PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+  PERF_TRAMPOLINE_SUPPORT=1
 
   dnl perf needs frame pointers for unwinding, include compiler option in debug builds
   AS_VAR_IF([Py_DEBUG], [true], [

From a31a498ac3b1c56191354daa604eb0ad06945a50 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 13:42:06 +0100
Subject: [PATCH 36/47] Add more tests

---
 Include/internal/pycore_ceval.h |  6 ++-
 Lib/test/test_perf_profiler.py  | 96 +++++++++++++++++++++++++++++----
 Objects/perf_trampoline.c       | 22 ++++++--
 Python/sysmodule.c              | 18 ++++---
 4 files changed, 123 insertions(+), 19 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index df7eb1bc8a7c6a..59c56032f0b0ce 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -76,7 +76,11 @@ extern int _PyPerfTrampoline_SetCallbacks(
     trampoline_state_write write_state,
     trampoline_state_free free_state
 );
-
+extern void _PyPerfTrampoline_GetCallbacks(
+    trampoline_state_init *init_state,
+    trampoline_state_write *write_state,
+    trampoline_state_free *free_state
+);
 extern int _PyPerfTrampoline_Init(int activate);
 extern int _PyPerfTrampoline_Fini(void);
 extern int _PyIsPerfTrampolineActive(void);
diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 35c85dc68fe513..254eafed294089 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -5,7 +5,11 @@
 import os
 import pathlib
 from test import support
-from test.support.script_helper import make_script
+from test.support.script_helper import (
+    make_script,
+    assert_python_failure,
+    assert_python_ok,
+)
 from test.support.os_helper import temp_dir
 
 
@@ -50,19 +54,25 @@ def baz():
 
                 baz()
                 """
-        with subprocess.Popen(
-            [sys.executable, "-Xperf", "-c", code],
-            universal_newlines=True,
-            stderr=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-        ) as process:
-            stdout, stderr = process.communicate()
+        with temp_dir() as script_dir:
+            script = make_script(script_dir, "perftest", code)
+            with subprocess.Popen(
+                [sys.executable, "-Xperf", script],
+                universal_newlines=True,
+                stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            ) as process:
+                stdout, stderr = process.communicate()
 
-        self.assertEqual(process.returncode, 0)
         self.assertEqual(stderr, "")
         self.assertEqual(stdout, "")
+
         perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
         self.assertTrue(perf_file.exists())
+        perf_file_contents = perf_file.read_text()
+        self.assertIn(f"py::foo:{script}", perf_file_contents)
+        self.assertIn(f"py::bar:{script}", perf_file_contents)
+        self.assertIn(f"py::baz:{script}", perf_file_contents)
 
     def test_trampoline_works_with_forks(self):
         code = """if 1:
@@ -122,6 +132,74 @@ def baz():
         self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
         self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents)
 
+    def test_sys_api(self):
+        code = """if 1:
+                import sys
+                def foo():
+                    pass
+
+                def spam():
+                    pass
+
+                def bar():
+                    sys.deactivate_stack_trampoline()
+                    foo()
+                    sys.activate_stack_trampoline("perf")
+                    spam()
+
+                def baz():
+                    bar()
+
+                sys.activate_stack_trampoline("perf")
+                baz()
+                """
+        with temp_dir() as script_dir:
+            script = make_script(script_dir, "perftest", code)
+            with subprocess.Popen(
+                [sys.executable, script],
+                universal_newlines=True,
+                stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            ) as process:
+                stdout, stderr = process.communicate()
+
+        self.assertEqual(stderr, "")
+        self.assertEqual(stdout, "")
+
+        perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+        self.assertTrue(perf_file.exists())
+        perf_file_contents = perf_file.read_text()
+        self.assertNotIn(f"py::foo:{script}", perf_file_contents)
+        self.assertIn(f"py::spam:{script}", perf_file_contents)
+        self.assertIn(f"py::bar:{script}", perf_file_contents)
+        self.assertIn(f"py::baz:{script}", perf_file_contents)
+
+    def test_sys_api_with_existing_trampoline(self):
+        code = """if 1:
+                import sys
+                sys.activate_stack_trampoline("perf")
+                sys.activate_stack_trampoline("perf")
+                """
+        assert_python_ok("-c", code)
+
+    def test_sys_api_with_invalid_trampoline(self):
+        code = """if 1:
+                import sys
+                sys.activate_stack_trampoline("invalid")
+                """
+        rc, out, err = assert_python_failure("-c", code)
+        self.assertIn("invalid backend: invalid", err.decode())
+
+    def test_sys_api_get_status(self):
+        code = """if 1:
+                import sys
+                sys.activate_stack_trampoline("perf")
+                assert sys.is_stack_trampoline_active() is True
+                sys.deactivate_stack_trampoline()
+                assert sys.is_stack_trampoline_active() is False
+                """
+        assert_python_ok("-c", code)
+
 
 def is_unwinding_reliable():
     cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 0e81b9235e424d..068bd2892badd3 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -209,7 +209,6 @@ _Py_perf_map_get_file(void)
     if (fd == -1) {
         perf_status = PERF_STATUS_FAILED;
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file", NULL);
         return NULL;
     }
     perf_map_file = fdopen(fd, "w");
@@ -217,8 +216,6 @@ _Py_perf_map_get_file(void)
         perf_status = PERF_STATUS_FAILED;
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
         close(fd);
-        _PyErr_WriteUnraisableMsg("Failed to create perf map file handle",
-                                  NULL);
         return NULL;
     }
     return perf_map_file;
@@ -395,6 +392,25 @@ _PyIsPerfTrampolineActive(void)
     return 0;
 }
 
+void
+_PyPerfTrampoline_GetCallbacks(trampoline_state_init *init_state,
+                               trampoline_state_write *write_state,
+                               trampoline_state_free *free_state)
+{
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
+    if (init_state) {
+        *init_state = trampoline_api.init_state;
+    }
+    if (write_state) {
+        *write_state = trampoline_api.write_state;
+    }
+    if (free_state) {
+        *free_state = trampoline_api.free_state;
+    }
+#endif
+    return;
+}
+
 int
 _PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
                                trampoline_state_write write_state,
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 628679fbe61e49..7313d6098d6472 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2009,11 +2009,17 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 {
     if (strcmp(backend, "perf") == 0) {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
-        if (_PyPerfTrampoline_SetCallbacks(
-                _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
-            ) < 0 ) {
-            PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
-            return NULL;
+        trampoline_state_init init_callback = NULL; 
+        _PyPerfTrampoline_GetCallbacks(&init_callback, NULL, NULL);
+        if (init_callback != _Py_perf_map_get_file) {
+            if ( _PyPerfTrampoline_SetCallbacks(
+                    _Py_perf_map_get_file,
+                    _Py_perf_map_write_entry,
+                    _Py_perf_map_close
+                ) < 0 ) {
+                PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
+                return NULL;
+            }
         }
 #else
         PyErr_SetString(PyExc_ValueError, "perf trampoline not available");
@@ -2021,7 +2027,7 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 #endif
     }
     else {
-        PyErr_Format(PyExc_ValueError, "unsuported invalid backend: %s", backend);
+        PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend);
         return NULL;
     }
     if (_PyPerfTrampoline_Init(1) < 0) {

From f591e8d69e7afd62a689f0614a6b97efdc85fa42 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 14:14:49 +0100
Subject: [PATCH 37/47] Update Objects/perf_trampoline.c

Co-authored-by: Christian Heimes <christian@python.org>
---
 Objects/perf_trampoline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 068bd2892badd3..32d3072faa235c 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -123,7 +123,7 @@ list of arenas in case the current memory arena is exhausted and another one is
 needed.
 
 For the best results, Python should be compiled with
-CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"` as this allows
+CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
 profilers to unwind using only the frame pointer and not on DWARF debug
 information (note that as trampilines are dynamically generated there won't be
 any DWARF information available for them).

From 0af2a08f0fec59790f885e6bd6508afba6e70396 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 14:16:28 +0100
Subject: [PATCH 38/47] make argument mandatory

---
 Python/clinic/sysmodule.c.h | 23 ++++++++---------------
 Python/sysmodule.c          |  4 ++--
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h
index 08e325774713d6..63709dc102fc48 100644
--- a/Python/clinic/sysmodule.c.h
+++ b/Python/clinic/sysmodule.c.h
@@ -1128,35 +1128,29 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
 #endif /* defined(ANDROID_API_LEVEL) */
 
 PyDoc_STRVAR(sys_activate_stack_trampoline__doc__,
-"activate_stack_trampoline($module, backend=\'perf\', /)\n"
+"activate_stack_trampoline($module, backend, /)\n"
 "--\n"
 "\n"
 "Activate the perf profiler trampoline.");
 
 #define SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF    \
-    {"activate_stack_trampoline", _PyCFunction_CAST(sys_activate_stack_trampoline), METH_FASTCALL, sys_activate_stack_trampoline__doc__},
+    {"activate_stack_trampoline", (PyCFunction)sys_activate_stack_trampoline, METH_O, sys_activate_stack_trampoline__doc__},
 
 static PyObject *
 sys_activate_stack_trampoline_impl(PyObject *module, const char *backend);
 
 static PyObject *
-sys_activate_stack_trampoline(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
+sys_activate_stack_trampoline(PyObject *module, PyObject *arg)
 {
     PyObject *return_value = NULL;
-    const char *backend = "perf";
+    const char *backend;
 
-    if (!_PyArg_CheckPositional("activate_stack_trampoline", nargs, 0, 1)) {
-        goto exit;
-    }
-    if (nargs < 1) {
-        goto skip_optional;
-    }
-    if (!PyUnicode_Check(args[0])) {
-        _PyArg_BadArgument("activate_stack_trampoline", "argument 1", "str", args[0]);
+    if (!PyUnicode_Check(arg)) {
+        _PyArg_BadArgument("activate_stack_trampoline", "argument", "str", arg);
         goto exit;
     }
     Py_ssize_t backend_length;
-    backend = PyUnicode_AsUTF8AndSize(args[0], &backend_length);
+    backend = PyUnicode_AsUTF8AndSize(arg, &backend_length);
     if (backend == NULL) {
         goto exit;
     }
@@ -1164,7 +1158,6 @@ sys_activate_stack_trampoline(PyObject *module, PyObject *const *args, Py_ssize_
         PyErr_SetString(PyExc_ValueError, "embedded null character");
         goto exit;
     }
-skip_optional:
     return_value = sys_activate_stack_trampoline_impl(module, backend);
 
 exit:
@@ -1250,4 +1243,4 @@ sys_is_stack_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
 #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF
     #define SYS_GETANDROIDAPILEVEL_METHODDEF
 #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */
-/*[clinic end generated code: output=9fd2e37197f55a7f input=a9049054013a1b77]*/
+/*[clinic end generated code: output=75d0508065ec2818 input=a9049054013a1b77]*/
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 7313d6098d6472..cf31f7c19c8f55 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1997,7 +1997,7 @@ sys_getandroidapilevel_impl(PyObject *module)
 /*[clinic input]
 sys.activate_stack_trampoline
 
-    backend: str = "perf"
+    backend: str
     /
 
 Activate the perf profiler trampoline.
@@ -2005,7 +2005,7 @@ Activate the perf profiler trampoline.
 
 static PyObject *
 sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
-/*[clinic end generated code: output=5783cdeb51874b43 input=58d7244062b933a8]*/
+/*[clinic end generated code: output=5783cdeb51874b43 input=b09020e3a17c78c5]*/
 {
     if (strcmp(backend, "perf") == 0) {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE

From 861ae09e9c8e8d76f9434869dad0ad6c8e4b393a Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Tue, 23 Aug 2022 15:27:53 +0200
Subject: [PATCH 39/47] Use struct for perf callbacks

---
 Include/internal/pycore_ceval.h | 33 +++++++++---------
 Objects/perf_trampoline.c       | 60 +++++++++++++++++----------------
 Python/pylifecycle.c            |  5 ++-
 Python/sysmodule.c              | 12 +++----
 4 files changed, 52 insertions(+), 58 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 59c56032f0b0ce..4119913de1c11a 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -67,28 +67,25 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
 
 // Trampoline API
 
-typedef void* (*trampoline_state_init)(void);
-typedef void (*trampoline_state_write)(void* state, const void *code_addr,
-                                       unsigned int code_size, PyCodeObject* code);
-typedef int (*trampoline_state_free)(void* state);
-extern int _PyPerfTrampoline_SetCallbacks(
-    trampoline_state_init init_state,
-    trampoline_state_write write_state,
-    trampoline_state_free free_state
-);
-extern void _PyPerfTrampoline_GetCallbacks(
-    trampoline_state_init *init_state,
-    trampoline_state_write *write_state,
-    trampoline_state_free *free_state
-);
+typedef struct {
+    // Callback to initialize the trampoline state
+    void* (*init_state)(void);
+    // Callback to register every trampoline being created
+    void (*write_state)(void* state, const void *code_addr,
+                        unsigned int code_size, PyCodeObject* code);
+    // Callback to free the trampoline state
+    int (*free_state)(void* state);
+} _PyPerf_Callbacks;
+
+extern int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *);
+extern void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *);
 extern int _PyPerfTrampoline_Init(int activate);
 extern int _PyPerfTrampoline_Fini(void);
 extern int _PyIsPerfTrampolineActive(void);
 extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
-
-extern void* _Py_perf_map_get_file(void);
-extern void _Py_perf_map_write_entry(void*, const void*, unsigned int, PyCodeObject*);
-extern int _Py_perf_map_close(void*);
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
+extern _PyPerf_Callbacks _Py_perfmap_callbacks;
+#endif
 
 static inline PyObject*
 _PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 068bd2892badd3..89a58fcfc7a4fa 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -176,11 +176,10 @@ struct code_arena_st {
 typedef struct code_arena_st code_arena_t;
 
 struct trampoline_api_st {
-    trampoline_state_init
-        init_state;  // Callback to initialize the trampoline state
-    trampoline_state_write
-        write_state;  // Callback to register every trampoline being created
-    trampoline_state_free free_state;  // Callback to free the trampoline state
+    void* (*init_state)(void);
+    void (*write_state)(void* state, const void *code_addr,
+                        unsigned int code_size, PyCodeObject* code);
+    int (*free_state)(void* state);
     void *state;
 };
 
@@ -192,8 +191,9 @@ static code_arena_t *code_arena;
 static trampoline_api_t trampoline_api;
 
 static FILE *perf_map_file;
-void *
-_Py_perf_map_get_file(void)
+
+static void *
+perf_map_get_file(void)
 {
     if (perf_map_file) {
         return perf_map_file;
@@ -221,8 +221,8 @@ _Py_perf_map_get_file(void)
     return perf_map_file;
 }
 
-int
-_Py_perf_map_close(void *state)
+static int
+perf_map_close(void *state)
 {
     FILE *fp = (FILE *)state;
     int ret = 0;
@@ -234,8 +234,8 @@ _Py_perf_map_close(void *state)
     return ret;
 }
 
-void
-_Py_perf_map_write_entry(void *state, const void *code_addr,
+static void
+perf_map_write_entry(void *state, const void *code_addr,
                          unsigned int code_size, PyCodeObject *co)
 {
     assert(state != NULL);
@@ -257,6 +257,12 @@ _Py_perf_map_write_entry(void *state, const void *code_addr,
     fflush(method_file);
 }
 
+_PyPerf_Callbacks _Py_perfmap_callbacks = {
+    &perf_map_get_file,
+    &perf_map_write_entry,
+    &perf_map_close
+};
+
 static int
 new_code_arena(void)
 {
@@ -393,36 +399,32 @@ _PyIsPerfTrampolineActive(void)
 }
 
 void
-_PyPerfTrampoline_GetCallbacks(trampoline_state_init *init_state,
-                               trampoline_state_write *write_state,
-                               trampoline_state_free *free_state)
+_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
-    if (init_state) {
-        *init_state = trampoline_api.init_state;
-    }
-    if (write_state) {
-        *write_state = trampoline_api.write_state;
-    }
-    if (free_state) {
-        *free_state = trampoline_api.free_state;
+    if (callbacks == NULL) {
+        return;
     }
+#ifdef _PY_HAVE_PERF_TRAMPOLINE
+    callbacks->init_state = trampoline_api.init_state;
+    callbacks->write_state = trampoline_api.write_state;
+    callbacks->free_state = trampoline_api.free_state;
 #endif
     return;
 }
 
 int
-_PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
-                               trampoline_state_write write_state,
-                               trampoline_state_free free_state)
+_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
 {
+    if (callbacks == NULL) {
+        return -1;
+    }
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
         _PyPerfTrampoline_Fini();
     }
-    trampoline_api.init_state = init_state;
-    trampoline_api.write_state = write_state;
-    trampoline_api.free_state = free_state;
+    trampoline_api.init_state = callbacks->init_state;
+    trampoline_api.write_state = callbacks->write_state;
+    trampoline_api.free_state = callbacks->free_state;
     trampoline_api.state = NULL;
     perf_status = PERF_STATUS_OK;
 #endif
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 44952c9b83c5b8..31b7297b26e299 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1153,9 +1153,8 @@ init_interp_main(PyThreadState *tstate)
 
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
         if (config->perf_profiling) {
-            if (_PyPerfTrampoline_SetCallbacks(
-                    _Py_perf_map_get_file, _Py_perf_map_write_entry, _Py_perf_map_close
-                ) < 0 || _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
+            if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ||
+                    _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
                 return _PyStatus_ERR("can't initialize the perf trampoline");
             }
         }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 7313d6098d6472..02253ffe9cc923 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2009,14 +2009,10 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 {
     if (strcmp(backend, "perf") == 0) {
 #ifdef _PY_HAVE_PERF_TRAMPOLINE
-        trampoline_state_init init_callback = NULL; 
-        _PyPerfTrampoline_GetCallbacks(&init_callback, NULL, NULL);
-        if (init_callback != _Py_perf_map_get_file) {
-            if ( _PyPerfTrampoline_SetCallbacks(
-                    _Py_perf_map_get_file,
-                    _Py_perf_map_write_entry,
-                    _Py_perf_map_close
-                ) < 0 ) {
+        _PyPerf_Callbacks cur_cb;
+        _PyPerfTrampoline_GetCallbacks(&cur_cb);
+        if (cur_cb.init_state != _Py_perfmap_callbacks.init_state) {
+            if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ) {
                 PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
                 return NULL;
             }

From 3058cf0ce6d7f709683d734d65f9f7c37b542bff Mon Sep 17 00:00:00 2001
From: Christian Heimes <christian@python.org>
Date: Tue, 23 Aug 2022 15:41:57 +0200
Subject: [PATCH 40/47] Rename macro to PY_HAVE_PERF_TRAMPOLINE

---
 Include/internal/pycore_ceval.h |  2 +-
 Lib/test/test_perf_profiler.py  |  2 +-
 Makefile.pre.in                 |  1 -
 Objects/perf_trampoline.c       | 16 ++++++++--------
 Python/pylifecycle.c            |  2 +-
 Python/sysmodule.c              |  4 ++--
 configure                       |  6 +-----
 configure.ac                    |  5 +----
 pyconfig.h.in                   |  6 +++---
 9 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 4119913de1c11a..f0949e6fa0da92 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -83,7 +83,7 @@ extern int _PyPerfTrampoline_Init(int activate);
 extern int _PyPerfTrampoline_Fini(void);
 extern int _PyIsPerfTrampolineActive(void);
 extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
 extern _PyPerf_Callbacks _Py_perfmap_callbacks;
 #endif
 
diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
index 254eafed294089..c2aad85b652e35 100644
--- a/Lib/test/test_perf_profiler.py
+++ b/Lib/test/test_perf_profiler.py
@@ -18,7 +18,7 @@
 
 
 def supports_trampoline_profiling():
-    perf_trampoline = sysconfig.get_config_var("PERF_TRAMPOLINE_SUPPORT")
+    perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE")
     if not perf_trampoline:
         return False
     return int(perf_trampoline) == 1
diff --git a/Makefile.pre.in b/Makefile.pre.in
index a70681a876afc1..027aefcb61312d 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -54,7 +54,6 @@ DTRACE=         @DTRACE@
 DFLAGS=         @DFLAGS@
 DTRACE_HEADERS= @DTRACE_HEADERS@
 DTRACE_OBJS=    @DTRACE_OBJS@
-PERF_TRAMPOLINE_SUPPORT= @PERF_TRAMPOLINE_SUPPORT@
 
 GNULD=		@GNULD@
 
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
index 89a58fcfc7a4fa..ac1f8c5aaeb592 100644
--- a/Objects/perf_trampoline.c
+++ b/Objects/perf_trampoline.c
@@ -140,7 +140,7 @@ typedef enum {
     PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
 } perf_status_t;
 
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
 
 #include <fcntl.h>
 #include <stdio.h>
@@ -386,12 +386,12 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
     // Something failed, fall back to the default evaluator.
     return _PyEval_EvalFrameDefault(ts, frame, throw);
 }
-#endif  // _PY_HAVE_PERF_TRAMPOLINE
+#endif  // PY_HAVE_PERF_TRAMPOLINE
 
 int
 _PyIsPerfTrampolineActive(void)
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     return tstate->interp->eval_frame == py_trampoline_evaluator;
 #endif
@@ -404,7 +404,7 @@ _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
     if (callbacks == NULL) {
         return;
     }
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     callbacks->init_state = trampoline_api.init_state;
     callbacks->write_state = trampoline_api.write_state;
     callbacks->free_state = trampoline_api.free_state;
@@ -418,7 +418,7 @@ _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
     if (callbacks == NULL) {
         return -1;
     }
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state) {
         _PyPerfTrampoline_Fini();
     }
@@ -434,7 +434,7 @@ _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
 int
 _PyPerfTrampoline_Init(int activate)
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     if (tstate->interp->eval_frame &&
         tstate->interp->eval_frame != py_trampoline_evaluator) {
@@ -471,7 +471,7 @@ _PyPerfTrampoline_Init(int activate)
 int
 _PyPerfTrampoline_Fini(void)
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     if (tstate->interp->eval_frame == py_trampoline_evaluator) {
         tstate->interp->eval_frame = NULL;
@@ -489,7 +489,7 @@ _PyPerfTrampoline_Fini(void)
 PyStatus
 _PyPerfTrampoline_AfterFork_Child(void)
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     // Restart trampoline in file in child.
     int was_active = _PyIsPerfTrampolineActive();
     _PyPerfTrampoline_Fini();
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 31b7297b26e299..8ce6d71651c10b 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1151,7 +1151,7 @@ init_interp_main(PyThreadState *tstate)
         }
 
 
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
         if (config->perf_profiling) {
             if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ||
                     _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 02253ffe9cc923..07c81c92cd85f8 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2008,7 +2008,7 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 /*[clinic end generated code: output=5783cdeb51874b43 input=58d7244062b933a8]*/
 {
     if (strcmp(backend, "perf") == 0) {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
         _PyPerf_Callbacks cur_cb;
         _PyPerfTrampoline_GetCallbacks(&cur_cb);
         if (cur_cb.init_state != _Py_perfmap_callbacks.init_state) {
@@ -2059,7 +2059,7 @@ static PyObject *
 sys_is_stack_trampoline_active_impl(PyObject *module)
 /*[clinic end generated code: output=ab2746de0ad9d293 input=061fa5776ac9dd59]*/
 {
-#ifdef _PY_HAVE_PERF_TRAMPOLINE
+#ifdef PY_HAVE_PERF_TRAMPOLINE
     if (_PyIsPerfTrampolineActive()) {
         Py_RETURN_TRUE;
     }
diff --git a/configure b/configure
index 7fbef76a826097..74ec2afe31cf5d 100755
--- a/configure
+++ b/configure
@@ -862,7 +862,6 @@ TZPATH
 LIBUUID_LIBS
 LIBUUID_CFLAGS
 PERF_TRAMPOLINE_OBJ
-PERF_TRAMPOLINE_SUPPORT
 SHLIBS
 CFLAGSFORSHARED
 LINKFORSHARED
@@ -11434,15 +11433,12 @@ esac
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $perf_trampoline" >&5
 $as_echo "$perf_trampoline" >&6; }
 
-
-PERF_TRAMPOLINE_SUPPORT=
 if test "x$perf_trampoline" = xyes; then :
 
 
-$as_echo "#define _PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
+$as_echo "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
 
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
-  PERF_TRAMPOLINE_SUPPORT=1
 
     if test "x$Py_DEBUG" = xtrue; then :
 
diff --git a/configure.ac b/configure.ac
index 84ae7235accc61..ecf88646295ab7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3435,12 +3435,9 @@ AS_CASE([$PLATFORM_TRIPLET],
 )
 AC_MSG_RESULT([$perf_trampoline])
 
-AC_SUBST(PERF_TRAMPOLINE_SUPPORT)
-PERF_TRAMPOLINE_SUPPORT=
 AS_VAR_IF([perf_trampoline], [yes], [
-  AC_DEFINE([_PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
+  AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
   PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
-  PERF_TRAMPOLINE_SUPPORT=1
 
   dnl perf needs frame pointers for unwinding, include compiler option in debug builds
   AS_VAR_IF([Py_DEBUG], [true], [
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 6ceefc7cf19ae3..1ce09855f5559d 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1568,6 +1568,9 @@
 /* Define if you want to coerce the C locale to a UTF-8 based locale */
 #undef PY_COERCE_C_LOCALE
 
+/* Define to 1 if you have the perf trampoline. */
+#undef PY_HAVE_PERF_TRAMPOLINE
+
 /* Define to 1 to build the sqlite module with loadable extensions support. */
 #undef PY_SQLITE_ENABLE_LOAD_EXTENSION
 
@@ -1799,9 +1802,6 @@
 /* framework name */
 #undef _PYTHONFRAMEWORK
 
-/* Define to 1 if you have the perf trampoline. */
-#undef _PY_HAVE_PERF_TRAMPOLINE
-
 /* Define to force use of thread-safe errno, h_errno, and other functions */
 #undef _REENTRANT
 

From be612a9f8162bfaadbd485200962d2f7c70eb11e Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Tue, 23 Aug 2022 23:16:02 +0100
Subject: [PATCH 41/47] Allow gdb to unwind

---
 Objects/asm_trampoline.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
index 2c12f683eeb68c..460707717df003 100644
--- a/Objects/asm_trampoline.S
+++ b/Objects/asm_trampoline.S
@@ -9,10 +9,9 @@
 # }
 _Py_trampoline_func_start:
 #ifdef __x86_64__
-    pushq   %rbp
-    movq    %rsp, %rbp
+    sub    $8, %rsp
     call    *%rcx
-    popq    %rbp
+    add    $8, %rsp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)

From c27f8b1b551ffd9b5418ff6f4e8203d28930d300 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Thu, 25 Aug 2022 14:45:29 +0100
Subject: [PATCH 42/47] Add docs

Signed-off-by: Pablo Galindo <pablogsal@gmail.com>
---
 Doc/howto/index.rst           |   1 +
 Doc/howto/instrumentation.rst |   2 +-
 Doc/howto/perf_profiling.rst  | 201 ++++++++++++++++++++++++++++++++++
 3 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 Doc/howto/perf_profiling.rst

diff --git a/Doc/howto/index.rst b/Doc/howto/index.rst
index 8a378e6659efc4..f521276a5a83c5 100644
--- a/Doc/howto/index.rst
+++ b/Doc/howto/index.rst
@@ -30,6 +30,7 @@ Currently, the HOWTOs are:
    ipaddress.rst
    clinic.rst
    instrumentation.rst
+   perf_profiling.rst
    annotations.rst
    isolating-extensions.rst
 
diff --git a/Doc/howto/instrumentation.rst b/Doc/howto/instrumentation.rst
index 4ce15c69dac90b..ec17928751680a 100644
--- a/Doc/howto/instrumentation.rst
+++ b/Doc/howto/instrumentation.rst
@@ -3,7 +3,7 @@
 .. _instrumentation:
 
 ===============================================
-Instrumenting CPython with DTrace and SystemTap
+instrumenting cpython with dtrace and systemtap
 ===============================================
 
 :author: David Malcolm
diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst
new file mode 100644
index 00000000000000..f87fd316cd5f21
--- /dev/null
+++ b/Doc/howto/perf_profiling.rst
@@ -0,0 +1,201 @@
+.. highlight:: shell-session
+
+.. _perf_profiling:
+
+==============================================
+Python support for the Linux ``perf`` profiler
+==============================================
+
+:author: Pablo Galindo
+
+The Linux ``perf`` profiler is a very powerful tool that allows you to profile and
+obtain information about the performance of your application. It is a very powerful
+profiler that also has a very vibrant ecosystem of tools that aid with the analysis
+of the data that it produces.
+
+The main problem with using the ``perf`` profiler with Python applications is that
+``perf`` only allows to get information about native symbols, this is, the names of
+the functions and procedures written in C. This means that the names and file names
+of the Python functions in your code will not appear in the output of ``perf``.
+
+Since Python 3.12, the interpreter can run in a special mode that allows Python
+functions to appear in the output of the ``perf`` profiler. When this mode is
+enabled, the interpreter will interpose a small piece of code compiled on the
+fly before the execution of every Python function and it will teach ``perf`` the
+relationship between this piece of code and the associated Python function using
+`perf map files`_.
+
+.. warning::
+
+    Support for the ``perf`` profiler is only currently available for Linux on
+    selected architectures. Check the output of the configure build step or
+    check the output of ``python.exe -m sysconfig | grep HAVE_PERF_TRAMPOLINE``
+    to see if your system is supported.
+
+For example, consider the following script:
+
+.. code-block:: python
+
+    def foo(n):
+        result = 0
+        for _ in range(n):
+            result += 1
+        return result
+
+    def bar(n):
+        foo(n)
+
+    def baz(n):
+        bar(n)
+
+    if __name__ == "__main__":
+        baz(1000000)
+
+We can run ``perf`` to sample CPU stack traces at 9999 Hertz::
+
+    $ perf record -F 9999 -g -o perf.data python my_script.py
+
+Then we can use perf report to analyze the data:
+
+.. code-block:: shell-session
+
+    $ perf report --stdio -n -g
+
+    # Children      Self       Samples  Command     Shared Object       Symbol
+    # ........  ........  ............  ..........  ..................  ..........................................
+    #
+        91.08%     0.00%             0  python.exe  python.exe          [.] _start
+                |
+                ---_start
+                |
+                    --90.71%--__libc_start_main
+                            Py_BytesMain
+                            |
+                            |--56.88%--pymain_run_python.constprop.0
+                            |          |
+                            |          |--56.13%--_PyRun_AnyFileObject
+                            |          |          _PyRun_SimpleFileObject
+                            |          |          |
+                            |          |          |--55.02%--run_mod
+                            |          |          |          |
+                            |          |          |           --54.65%--PyEval_EvalCode
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     |
+                            |          |          |                     |--51.67%--_PyEval_EvalFrameDefault
+                            |          |          |                     |          |
+                            |          |          |                     |          |--11.52%--_PyLong_Add
+                            |          |          |                     |          |          |
+                            |          |          |                     |          |          |--2.97%--_PyObject_Malloc
+    ...
+
+As you can see, the Python functions are not shown in the output; only ``_PyEval_EvalFrameDefault``
+(the function that evaluates the Python bytecode) shows up. Unfortunately that's not very useful because all Python
+functions use the same C function to evaluate bytecode so we cannot know which Python function corresponds to which
+bytecode-evaluating function.
+
+Instead, if we run the same experiment with perf support activated we get:
+
+.. code-block:: shell-session
+
+    $ perf report --stdio -n -g
+
+    # Children      Self       Samples  Command     Shared Object       Symbol
+    # ........  ........  ............  ..........  ..................  .....................................................................
+    #
+        90.58%     0.36%             1  python.exe  python.exe          [.] _start
+                |
+                ---_start
+                |
+                    --89.86%--__libc_start_main
+                            Py_BytesMain
+                            |
+                            |--55.43%--pymain_run_python.constprop.0
+                            |          |
+                            |          |--54.71%--_PyRun_AnyFileObject
+                            |          |          _PyRun_SimpleFileObject
+                            |          |          |
+                            |          |          |--53.62%--run_mod
+                            |          |          |          |
+                            |          |          |           --53.26%--PyEval_EvalCode
+                            |          |          |                     py::<module>:/src/script.py
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     py::baz:/src/script.py
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     py::bar:/src/script.py
+                            |          |          |                     _PyEval_EvalFrameDefault
+                            |          |          |                     PyObject_Vectorcall
+                            |          |          |                     _PyEval_Vector
+                            |          |          |                     py::foo:/src/script.py
+                            |          |          |                     |
+                            |          |          |                     |--51.81%--_PyEval_EvalFrameDefault
+                            |          |          |                     |          |
+                            |          |          |                     |          |--13.77%--_PyLong_Add
+                            |          |          |                     |          |          |
+                            |          |          |                     |          |          |--3.26%--_PyObject_Malloc
+
+
+
+Enabling perf profiling mode
+----------------------------
+
+There are two main ways to activate the perf profiling mode. If you want it to be
+active from the start of the Python interpreter, you can use the ``-Xperf`` option::
+
+    $ python -Xperf my_script.py
+
+There is also support for dynamically activating and deactivating the perf
+profiling mode by using the APIs in the :mod:`sys` module:
+
+.. code-block:: python
+
+    import sys
+    sys.activate_stack_trampoline("perf")
+
+    # Run some code with Perf profiling active
+
+    sys.deactivate_stack_trampoline()
+
+    # Perf profiling is not active anymore
+
+These APIs can be handy if you want to activate/deactivate profiling mode in
+response to a signal or other communication mechanism with your process.
+
+
+
+Now we can analyze the data with ``perf report``::
+
+    $ perf report -g -i perf.data
+
+
+How to obtain the best results
+-------------------------------
+
+For the best results, Python should be compiled with
+``CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"`` as this allows
+profilers to unwind using only the frame pointer and not rely on DWARF debug
+information. This is because the code that is interposed to allow perf
+support is dynamically generated, so it doesn't have any DWARF debugging
+information available.
+
+You can check if your system has been compiled with this flag by running::
+
+    $ python -m sysconfig | grep 'no-omit-frame-pointer'
+
+If you don't see any output it means that your interpreter has not been compiled with
+frame pointers and therefore it may not be able to show Python functions in the output
+of ``perf``.
+
+.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt
+

From e27a2c498a606d1505e289c8c67f6565cf60d9cb Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Thu, 25 Aug 2022 15:26:35 +0100
Subject: [PATCH 43/47] fixup! Add docs

---
 Doc/howto/instrumentation.rst |  2 +-
 Doc/howto/perf_profiling.rst  | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/Doc/howto/instrumentation.rst b/Doc/howto/instrumentation.rst
index ec17928751680a..4ce15c69dac90b 100644
--- a/Doc/howto/instrumentation.rst
+++ b/Doc/howto/instrumentation.rst
@@ -3,7 +3,7 @@
 .. _instrumentation:
 
 ===============================================
-instrumenting cpython with dtrace and systemtap
+Instrumenting CPython with DTrace and SystemTap
 ===============================================
 
 :author: David Malcolm
diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst
index f87fd316cd5f21..f9a9efda70bbd2 100644
--- a/Doc/howto/perf_profiling.rst
+++ b/Doc/howto/perf_profiling.rst
@@ -9,9 +9,9 @@ Python support for the Linux ``perf`` profiler
 :author: Pablo Galindo
 
 The Linux ``perf`` profiler is a very powerful tool that allows you to profile and
-obtain information about the performance of your application. It is a very powerful
-profiler that also has a very vibrant ecosystem of tools that aid with the analysis
-of the data that it produces.
+obtain information about the performance of your application. ``perf`` also has
+a very vibrant ecosystem of tools that aid with the analysis of the data that it
+produces.
 
 The main problem with using the ``perf`` profiler with Python applications is that
 ``perf`` only allows to get information about native symbols, this is, the names of
@@ -29,7 +29,7 @@ relationship between this piece of code and the associated Python function using
 
     Support for the ``perf`` profiler is only currently available for Linux on
     selected architectures. Check the output of the configure build step or
-    check the output of ``python.exe -m sysconfig | grep HAVE_PERF_TRAMPOLINE``
+    check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE``
     to see if your system is supported.
 
 For example, consider the following script:
@@ -197,5 +197,4 @@ If you don't see any output it means that your interpreter has not been compiled
 frame pointers and therefore it may not be able to show Python functions in the output
 of ``perf``.
 
-.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt
-
+.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt
\ No newline at end of file

From 81c7f4b8e01befe94aa830b23d98cb56e0664ff3 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Thu, 25 Aug 2022 16:44:46 +0100
Subject: [PATCH 44/47] fixup! fixup! Add docs

---
 Doc/howto/perf_profiling.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst
index f9a9efda70bbd2..2e1bb48af8c88e 100644
--- a/Doc/howto/perf_profiling.rst
+++ b/Doc/howto/perf_profiling.rst
@@ -197,4 +197,4 @@ If you don't see any output it means that your interpreter has not been compiled
 frame pointers and therefore it may not be able to show Python functions in the output
 of ``perf``.
 
-.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt
\ No newline at end of file
+.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt

From ef0650bbd2fdac0b93dc293014b4354a213beebc Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith" <greg@krypto.org>
Date: Sun, 28 Aug 2022 21:02:40 -0700
Subject: [PATCH 45/47] Update sys API names in the NEWS entry.

They now match the current code.
---
 .../2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst             | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
index 1b6f60862ddc60..30f44fd453a547 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst	
@@ -1,7 +1,7 @@
 Add a new ``-X perf`` Python command line option as well as
-:func:`sys.activate_perf_trampoline` and :func:`sys.deactivate_perf_trampoline`
+:func:`sys.activate_stack_trampoline` and :func:`sys.deactivate_stack_trampoline`
 function in the :mod:`sys` module that allows to set/unset the interpreter in a
 way that the Linux ``perf`` profiler can detect Python calls. The new
-:func:`sys.is_perf_trampoline_active` function allows to query the state of the
+:func:`sys.is_stack_trampoline_active` function allows to query the state of the
 perf trampoline. Design by Pablo Galindo. Patch by Pablo Galindo and Christian Heimes
 with contributions from Gregory P. Smith [Google] and Mark Shannon.

From d8932d2cefd035ba226f0c9bbc8ed43a99090e78 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 29 Aug 2022 19:22:35 +0100
Subject: [PATCH 46/47] Add environment variable

---
 Python/initconfig.c | 10 ++++++++++
 Python/sysmodule.c  | 10 +++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Python/initconfig.c b/Python/initconfig.c
index d0d6e2eb8338ee..b7bb5716753693 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -1695,6 +1695,16 @@ config_read_env_vars(PyConfig *config)
 static PyStatus
 config_init_perf_profiling(PyConfig *config)
 {
+    int active = 0;
+    const char *env = config_get_env(config, "PYTHONPERFSUPPORT");
+    if (env) {
+        if (_Py_str_to_int(env, &active) != 0) {
+            active = 0;
+        }
+        if (active) {
+            config->perf_profiling = 1;
+        }
+    }
     const wchar_t *xoption = config_get_xoption(config, L"perf");
     if (xoption) {
         config->perf_profiling = 1;
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 5f08cefe57f197..75e64553d88c9f 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2066,8 +2066,8 @@ static PyObject *
 sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
 /*[clinic end generated code: output=5783cdeb51874b43 input=b09020e3a17c78c5]*/
 {
-    if (strcmp(backend, "perf") == 0) {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
+    if (strcmp(backend, "perf") == 0) {
         _PyPerf_Callbacks cur_cb;
         _PyPerfTrampoline_GetCallbacks(&cur_cb);
         if (cur_cb.init_state != _Py_perfmap_callbacks.init_state) {
@@ -2076,10 +2076,6 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
                 return NULL;
             }
         }
-#else
-        PyErr_SetString(PyExc_ValueError, "perf trampoline not available");
-        return NULL;
-#endif
     }
     else {
         PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend);
@@ -2089,6 +2085,10 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
         return NULL;
     }
     Py_RETURN_NONE;
+#else
+    PyErr_SetString(PyExc_ValueError, "perf trampoline not available");
+    return NULL;
+#endif
 }
 
 

From e3f846e3b445bb9d6912e5de0fc9c6a5d38799c7 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Mon, 29 Aug 2022 20:44:12 +0100
Subject: [PATCH 47/47] Document the env var and the -X option

---
 Doc/c-api/init_config.rst | 14 ++++++++++++++
 Doc/using/cmdline.rst     | 13 +++++++++++++
 Python/initconfig.c       |  5 +++++
 3 files changed, 32 insertions(+)

diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst
index 2074ec4e0e8ea5..c4a342ee811ca9 100644
--- a/Doc/c-api/init_config.rst
+++ b/Doc/c-api/init_config.rst
@@ -1155,6 +1155,20 @@ PyConfig
 
       Default: ``-1`` in Python mode, ``0`` in isolated mode.
 
+   .. c:member:: int perf_profiling
+
+      Enable compatibility mode with the perf profiler?
+
+      If non-zero, initialize the perf trampoline. See :ref:`perf_profiling`
+      for more information.
+
+      Set by :option:`-X perf <-X>` command line option and by the
+      :envvar:`PYTHONPERFSUPPORT` environment variable.
+
+      Default: ``-1``.
+
+      .. versionadded:: 3.12
+
    .. c:member:: int use_environment
 
       Use :ref:`environment variables <using-on-envvars>`?
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 6678d476fa831b..5ecc882d818fce 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -535,6 +535,12 @@ Miscellaneous options
      development (running from the source tree) then the default is "off".
      Note that the "importlib_bootstrap" and "importlib_bootstrap_external"
      frozen modules are always used, even if this flag is set to "off".
+   * ``-X perf`` to activate compatibility mode with the ``perf`` profiler.
+     When this option is activated, the Linux ``perf`` profiler will be able to
+     report Python calls. This option is only available on some platforms and
+     will do nothing if it is not supported on the current system. The default value
+     is "off". See also :envvar:`PYTHONPERFSUPPORT` and :ref:`perf_profiling`
+     for more information.
 
    It also allows passing arbitrary values and retrieving them through the
    :data:`sys._xoptions` dictionary.
@@ -1025,6 +1031,13 @@ conflict.
 
    .. versionadded:: 3.11
 
+.. envvar:: PYTHONPERFSUPPORT
+
+   If this variable is set to a nonzero value, it activates compatibility mode
+   with the ``perf`` profiler so Python calls can be detected by it. See the
+   :ref:`perf_profiling` section for more information.
+
+   .. versionadded:: 3.12
 
 
 Debug-mode variables
diff --git a/Python/initconfig.c b/Python/initconfig.c
index b7bb5716753693..33a8f276b19cbf 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -118,6 +118,11 @@ The following implementation-specific options are available:\n\
    files are desired as well as suppressing the extra visual location indicators \n\
    when the interpreter displays tracebacks.\n\
 \n\
+-X perf: activate support for the Linux \"perf\" profiler by activating the \"perf\"\n\
+    trampoline. When this option is activated, the Linux \"perf\" profiler will be \n\
+    able to report Python calls. This option is only available on some platforms and will \n\
+    do nothing if is not supported on the current system. The default value is \"off\".\n\
+\n\
 -X frozen_modules=[on|off]: whether or not frozen modules should be used.\n\
    The default is \"on\" (or \"off\" if you are running a local build).";