Skip to content

Commit a2d4277

Browse files
suo authored and facebook-github-bot committed
pin PythonActors to a single thread (#267)
Summary: Pull Request resolved: #267 As we did with StreamActor before it, it is good to run PythonActor handlers on one thread consistently, since a lot of the Python code we have assumes that it is called from a single thread, makes use of thread-local state, etc. So, we spawn one thread per PythonActor for it to run on. TODO: this does NOT preserve thread-local state between async endpoints and sync endpoints, which is a huge issue. I think the correct solution there is to detect whether an actor has any async endpoints, and if so, switch to a mode where absolutely all Python code is run on the asyncio event loop. That's a deeper refactor though, so will do this to unblock for now. Reviewed By: mariusae Differential Revision: D76603819 fbshipit-source-id: d8101084fe41a10d163a26dd450e1e0e4a2614ac
1 parent 7186689 commit a2d4277

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed

monarch_hyperactor/src/actor.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,11 @@ use pyo3::types::PyType;
3636
use serde::Deserialize;
3737
use serde::Serialize;
3838
use serde_bytes::ByteBuf;
39+
use tokio::runtime::Handle;
3940
use tokio::sync::Mutex;
4041
use tokio::sync::oneshot;
42+
use tokio::task::JoinHandle;
43+
use tracing::span::Id;
4144

4245
use crate::mailbox::PyMailbox;
4346
use crate::proc::InstanceWrapper;
@@ -284,6 +287,56 @@ impl Actor for PythonActor {
284287
Ok(Self { actor })
285288
})?)
286289
}
290+
291+
/// Specialize spawn_server_task for PythonActor, because we want to run the stream on a
292+
/// dedicated OS thread. We do this to guarantee tha all Python code is
293+
/// executed on the same thread, since often Python code uses thread-local
294+
/// state or otherwise assumes that it is called only from a single thread.
295+
fn spawn_server_task<F>(future: F) -> JoinHandle<F::Output>
296+
where
297+
F: Future + Send + 'static,
298+
F::Output: Send + 'static,
299+
{
300+
let (join_tx, join_rx) = tokio::sync::oneshot::channel();
301+
// It is important that we spawn a standalone thread for the work here,
302+
// as opposed to using `spawn_blocking` to spawn a tokio-managed thread.
303+
// This is because the worker stream may call uninterruptible FFI code
304+
// that can deadlock (CUDA, NCCL).
305+
// If we use a tokio-managed blocking thread, then runtime teardown will
306+
// try to wait for tasks on that thread to reach an await point, and
307+
// hang forever.
308+
let builder = std::thread::Builder::new().name("python-actor".to_string());
309+
let _thread_handle = builder.spawn(move || {
310+
// Spawn a new thread with a single-threaded tokio runtime to run the
311+
// actor loop. We avoid the current-threaded runtime, so that we can
312+
// use `block_in_place` for nested async-to-sync-to-async flows.
313+
let rt = tokio::runtime::Builder::new_multi_thread()
314+
.worker_threads(1)
315+
.enable_io()
316+
.build()
317+
.unwrap();
318+
rt.block_on(async {
319+
tokio::task::block_in_place(|| {
320+
// Allow e.g. destructing py objects on this thread, which
321+
// can happen at shutdown when the a stream actors env map
322+
// for rvalues is dropped (e.g. P1673311499).
323+
// https://github.com/PyO3/pyo3/discussions/3499
324+
Python::with_gil(|py| {
325+
py.allow_threads(|| {
326+
let result = Handle::current().block_on(future);
327+
if join_tx.send(result).is_err() {
328+
panic!("could not send join result")
329+
}
330+
})
331+
})
332+
})
333+
})
334+
});
335+
336+
// In order to bridge the synchronous join handle with the async world,
337+
// smuggle the result through a channel.
338+
tokio::spawn(async move { join_rx.await.unwrap() })
339+
}
287340
}
288341

289342
/// Get the event loop state to run PythonActor handlers in. We construct a

python/tests/test_python_actors.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import operator
99
import os
1010
import re
11+
import threading
1112
from types import ModuleType
1213
from unittest.mock import AsyncMock, patch
1314

@@ -549,3 +550,44 @@ def _patch_output(msg):
549550

550551
with pytest.raises(monarch.actor_mesh.ActorError, match="ValueError: bad rank"):
551552
await fut
553+
554+
555+
class TLSActor(Actor):
    """An actor that manages thread-local state.

    Every endpoint reads or mutates a counter stored in a
    ``threading.local``, so observed values depend on which OS thread each
    endpoint handler runs on.
    """

    def __init__(self):
        self.local = threading.local()
        # NOTE(review): `value` is initialized only for the thread running
        # __init__; an endpoint invoked on a different thread would raise
        # AttributeError instead of seeing 0 — presumably intentional, since
        # this actor exists to pin down handler thread affinity.
        self.local.value = 0

    @endpoint
    def increment(self):
        # Sync endpoint: bumps the counter of the thread it runs on.
        self.local.value += 1

    @endpoint
    async def increment_async(self):
        # Async endpoint: bumps the counter of the event-loop thread.
        self.local.value += 1

    @endpoint
    def get(self):
        # Sync endpoint: returns the counter as seen from its thread.
        return self.local.value

    @endpoint
    async def get_async(self):
        # Async endpoint: returns the counter as seen from the event-loop
        # thread.
        return self.local.value
577+
578+
579+
async def test_actor_tls() -> None:
    """Test that thread-local state is respected.

    Two sync increments must be visible to a subsequent sync read, which
    holds only if all sync endpoints run on one consistent OS thread.
    """
    pm = await proc_mesh(gpus=1)
    am = await pm.spawn("tls", TLSActor)
    await am.increment.call_one()
    # TODO(suo): TLS is NOT preserved across async/sync endpoints, because currently
    # we run async endpoints on a different thread than sync ones.
    # Will fix this in a followup diff.

    # await am.increment_async.call_one()
    await am.increment.call_one()
    # await am.increment_async.call_one()

    # Both sync increments landed on the same thread-local, so the sync
    # view of the counter is 2.
    assert 2 == await am.get.call_one()
    # assert 4 == await am.get_async.call_one()

0 commit comments

Comments
 (0)