Bug report
Bug description:
On a MacBook Air M4, there are 3 mach_vm_read_overwrite() per get_stack_trace(), and they're all 16384 bytes (page):
2026-05-09T00:42:52.737520000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % sudo dtrace -q -Z -x dynvarsize=64m -x bufsize=128m \
-c "./python.exe -m profiling.sampling run -r 1000khz -d 15 --pstats -o /dev/null /tmp/busy.py" \
-n '
pid$target::*get_stack_trace*:entry { self->gst = timestamp; }
pid$target::*get_stack_trace*:return /self->gst/ {
@gst_avg = avg(timestamp - self->gst);
@gst_n = count(); self->gst = 0;
}
pid$target::mach_vm_read_overwrite:entry {
self->vmr = timestamp;
@vmr_size[arg2] = count();
}
pid$target::mach_vm_read_overwrite:return /self->vmr/ {
@vmr_avg = avg(timestamp - self->vmr);
@vmr_n = count();
self->vmr = 0;
}
END {
printf("\n=== get_stack_trace ===\n");
printf("avg ns "); printa(@gst_avg); printf("count "); printa(@gst_n);
printf("\n=== mach_vm_read_overwrite ===\n");
printf("avg ns "); printa(@vmr_avg); printf("count "); printa(@vmr_n);
printf("\n=== mach_vm_read_overwrite read sizes (bytes) ===\n");
printa("%d bytes %@d\n", @vmr_size);
}'
Captured 897,532 samples in 15.00 seconds
Sample rate: 59,835.42 samples/sec
Error rate: 0.00
Raw unwinder rate: 69,908.92 samples/sec (14.30 us/sample)
Warning: missed 14102479 samples from the expected total of 15000011 (94.02%)
=== get_stack_trace ===
avg ns
13104
count
911514
=== mach_vm_read_overwrite ===
avg ns
3137
count
2734564
=== mach_vm_read_overwrite read sizes (bytes) ===
880 bytes 1
8 bytes 2
16384 bytes 2734561
2026-05-09T00:43:11.123030000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % cat /tmp/busy.py
x = 0
while True:
x = (x + 1) % 1000003
2026-05-09T00:43:22.664550000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % git diff
diff --git i/Lib/profiling/sampling/sample.py w/Lib/profiling/sampling/sample.py
index 5bbe2483581..36ed33337aa 100644
--- i/Lib/profiling/sampling/sample.py
+++ w/Lib/profiling/sampling/sample.py
@@ -181,6 +181,19 @@ def sample(self, collector, duration_sec=None, *, async_aware=False):
print(f"Sample rate: {fmt(sample_rate, 2)} samples/sec")
print(f"Error rate: {fmt(error_rate, 2)}")
+ # Raw RemoteUnwinder() throughput (tight loop, no scheduler/sleep)
+ try:
+ raw_start = time.perf_counter()
+ raw_n = 0
+ while time.perf_counter() - raw_start < 0.2:
+ self._get_stack_trace(async_aware=async_aware)
+ raw_n += 1
+ raw_elapsed = time.perf_counter() - raw_start
+ if raw_n and raw_elapsed:
+ print(f"Raw unwinder rate: {fmt(raw_n/raw_elapsed, 2)} samples/sec ({fmt(1e6*raw_elapsed/raw_n, 2)} us/sample)")
+ except Exception:
+ pass
+
# Print unwinder stats if stats collection is enabled
if self.collect_stats:
self._print_unwinder_stats()
2026-05-09T00:43:23.960838000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) %
(The ns are heavily taxed by DTrace, so should be taken with a grain of salt.)
However, the cache is invalidated after reading every sample, so they're very short-lived:
|
_Py_RemoteDebug_ClearCache(&self->handle); |
I believe that's where the reads happen:
|
if (_Py_RemoteDebug_PagedReadRemoteMemory( |
|
&self->handle, |
|
current_interpreter, |
|
INTERP_STATE_BUFFER_SIZE, |
|
interp_state_buffer) < 0) { |
|
set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer"); |
|
Py_CLEAR(result); |
|
goto exit; |
|
} |
|
int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( |
|
&unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); |
|
Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( |
|
&unwinder->handle, |
|
address, |
|
SIZEOF_INTERP_FRAME, |
|
frame |
|
); |
These are all relatively small structs, between 80-8000 bytes.
(My understanding is that InterpState will nearly always be on a different page than TS / Frame, but they can be.)
The larger the mach_vm_read_overwrite, the slower it gets:
[130] 2026-05-09T00:44:06.004867000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % cat /tmp/vmread_bench.c
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>
#include <time.h>
/*
 * Micro-benchmark: average cost of mach_vm_read_overwrite() per call as a
 * function of the read size (64 B .. 16 KiB), reading from our own task to
 * mimic the remote-debugging reads.  Prints ns/op for each size.
 */
int main(void) {
    /* static: keeps two 16 KiB buffers off the stack; zero-initialized. */
    static char src[16384], dst[16384];
    mach_port_t task = mach_task_self();
    mach_vm_size_t out;
    for (size_t sz = 64; sz <= 16384; sz *= 2) {
        uint64_t t0 = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
        for (int i = 0; i < 100000; i++) {
            /* Check the result: if the call fails we would otherwise be
             * timing failing syscalls and printing meaningless numbers. */
            kern_return_t kr = mach_vm_read_overwrite(
                task, (mach_vm_address_t)src, sz,
                (mach_vm_address_t)dst, &out);
            if (kr != KERN_SUCCESS) {
                fprintf(stderr, "mach_vm_read_overwrite failed: %d\n", kr);
                return 1;
            }
        }
        uint64_t t1 = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
        printf("%6zu B: %.0f ns/op\n", sz, (t1 - t0) / 100000.0);
    }
    return 0;
}
2026-05-09T00:44:06.177740000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % cc /tmp/vmread_bench.c -o /tmp/vmread_bench
2026-05-09T00:44:14.349092000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % /tmp/vmread_bench
64 B: 635 ns/op
128 B: 388 ns/op
256 B: 358 ns/op
512 B: 371 ns/op
1024 B: 391 ns/op
2048 B: 415 ns/op
4096 B: 489 ns/op
8192 B: 651 ns/op
16384 B: 938 ns/op
Similarly on Linux?
2026-05-08T22:53:09.556034847+0000 maurycy@weiss /home/maurycy/cpython (main 57ef219*) # cat /tmp/vm_readv_bench.c
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <time.h>
#include <unistd.h>
/*
 * Micro-benchmark: average cost of process_vm_readv() per call as a
 * function of the read size (64 B .. 16 KiB), reading our own address
 * space to mimic the remote-debugging reads.  Prints ns/op per size.
 */
int main(void) {
    /* static: keeps two 16 KiB buffers off the stack; zero-initialized. */
    static char src[16384], dst[16384];
    pid_t self = getpid();
    for (size_t sz = 64; sz <= 16384; sz *= 2) {
        struct iovec local = { dst, sz };
        struct iovec remote = { src, sz };
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < 100000; i++) {
            /* process_vm_readv() can fail or short-read (e.g. under a
             * restrictive Yama ptrace_scope); without this check the
             * benchmark would silently time failing syscalls. */
            if (process_vm_readv(self, &local, 1, &remote, 1, 0)
                    != (ssize_t)sz) {
                perror("process_vm_readv");
                return 1;
            }
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);
        long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L
                + (t1.tv_nsec - t0.tv_nsec);
        printf("%6zu B: %ld ns/op\n", sz, ns / 100000);
    }
    return 0;
}
2026-05-08T22:53:09.928097987+0000 maurycy@weiss /home/maurycy/cpython (main 57ef219*) # cc /tmp/vm_readv_bench.c -o /tmp/vm_readv_bench
2026-05-08T22:53:24.934249589+0000 maurycy@weiss /home/maurycy/cpython (main 57ef219*) # /tmp/vm_readv_bench
64 B: 434 ns/op
128 B: 285 ns/op
256 B: 250 ns/op
512 B: 256 ns/op
1024 B: 265 ns/op
2048 B: 329 ns/op
4096 B: 369 ns/op
8192 B: 493 ns/op
16384 B: 779 ns/op
I think that one approach is to invalidate more intelligently (or even have two caches: stable and not so stable), using code_object_generation or tlbc_generation but I cannot figure out a trick for TS/frame.
Using just _Py_RemoteDebug_ReadRemoteMemory:
diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c
index bbdfce3f720..7e565763927 100644
--- a/Modules/_remote_debugging/frames.c
+++ b/Modules/_remote_debugging/frames.c
@@ -197,7 +197,7 @@ parse_frame_object(
char frame[SIZEOF_INTERP_FRAME];
*address_of_code_object = 0;
- Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
+ Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
&unwinder->handle,
address,
SIZEOF_INTERP_FRAME,
diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c
index 172f8711a2a..f034fbbf15f 100644
--- a/Modules/_remote_debugging/module.c
+++ b/Modules/_remote_debugging/module.c
@@ -537,7 +537,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
while (current_interpreter != 0) {
// Read interpreter state to get the interpreter ID
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
- if (_Py_RemoteDebug_PagedReadRemoteMemory(
+ if (_Py_RemoteDebug_ReadRemoteMemory(
&self->handle,
current_interpreter,
INTERP_STATE_BUFFER_SIZE,
diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c
index 4daa5e5f92b..31d83f561a8 100644
--- a/Modules/_remote_debugging/threads.c
+++ b/Modules/_remote_debugging/threads.c
@@ -303,7 +303,7 @@ unwind_stack_for_thread(
StackChunkList chunks = {0};
char ts[SIZEOF_THREAD_STATE];
- int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
+ int bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
&unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
if (bytes_read < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
We get:
2026-05-09T01:01:17.577709000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (remote-debugging-no-paged 7606bb1*) % sudo dtrace -q -Z -x dynvarsize=64m -x bufsize=128m \
-c "./python.exe -m profiling.sampling run -r 1000khz -d 15 --pstats -o /dev/null /tmp/busy.py" \
-n '
pid$target::*get_stack_trace*:entry { self->gst = timestamp; }
pid$target::*get_stack_trace*:return /self->gst/ {
@gst_avg = avg(timestamp - self->gst);
@gst_n = count(); self->gst = 0;
}
pid$target::mach_vm_read_overwrite:entry {
self->vmr = timestamp;
@vmr_size[arg2] = count();
}
pid$target::mach_vm_read_overwrite:return /self->vmr/ {
@vmr_avg = avg(timestamp - self->vmr);
@vmr_n = count();
self->vmr = 0;
}
END {
printf("\n=== get_stack_trace ===\n");
printf("avg ns "); printa(@gst_avg); printf("count "); printa(@gst_n);
printf("\n=== mach_vm_read_overwrite ===\n");
printf("avg ns "); printa(@vmr_avg); printf("count "); printa(@vmr_n);
printf("\n=== mach_vm_read_overwrite read sizes (bytes) ===\n");
printa("%d bytes %@d\n", @vmr_size);
}'
Captured 1,077,222 samples in 15.00 seconds
Sample rate: 71,814.76 samples/sec
Error rate: 0.00
Raw unwinder rate: 87,653.50 samples/sec (11.41 us/sample)
Warning: missed 13922786 samples from the expected total of 15000008 (92.82%)
=== get_stack_trace ===
avg ns
10381
count
1094753
=== mach_vm_read_overwrite ===
avg ns
2570
count
3284291
=== mach_vm_read_overwrite read sizes (bytes) ===
8 bytes 2
16384 bytes 20
7992 bytes 1094753
880 bytes 1094754
88 bytes 1094762
Note that mach_vm_read_overwrite is now 2570 ns vs. 3137 ns before.
As a result:
[130] 2026-05-09T01:08:09.192039000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (remote-debugging-no-paged 7606bb1*) % sudo -E ./python.exe -m profiling.sampling run -r 1000khz -d 15 --pstats -o /dev/null /tmp/busy.py
Captured 2,657,910 samples in 15.00 seconds
Sample rate: 177,193.96 samples/sec
Error rate: 0.00
Raw unwinder rate: 297,398.45 samples/sec (3.36 us/sample)
Warning: missed 12342093 samples from the expected total of 15000003 (82.28%)
2026-05-09T01:08:28.996839000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (remote-debugging-no-paged 7606bb1*) %
Versus, on the main branch (57ef219 revision):
2026-05-09T01:12:48.469595000+0200 maurycy@gimel /Users/maurycy/src/github.com/maurycy/cpython (main 57ef219*) % sudo -E ./python.exe -m profiling.sampling run -r 1000khz -d 15 --pstats -o /dev/null /tmp/busy.py
Captured 1,687,897 samples in 15.00 seconds
Sample rate: 112,526.41 samples/sec
Error rate: 0.00
Raw unwinder rate: 132,402.68 samples/sec (7.55 us/sample)
Warning: missed 13312110 samples from the expected total of 15000007 (88.75%)
That's:
- sampling rate: 112 kHz -> 177 kHz (still not great)
- sampling: 8.88 μs -> 5.64 μs
- raw unwinder per sample 7.55 μs -> 3.36 μs
PS. Truth be told, I think that only tstate.current_frame and frame.instr_ptr have really high churn, and it's still not that efficient.
PS2. What do you think about mach_vm_remap(copy=FALSE)? ;->
CPython versions tested on:
CPython main branch
Operating systems tested on:
macOS
Linked PRs
Bug report
Bug description:
On a MacBook Air M4, there are 3
mach_vm_read_overwrite() per get_stack_trace(), and they're all 16384 bytes (page). (The
ns are heavily taxed by DTrace, so should be taken with a grain of salt.) However, the cache is invalidated after reading every sample, so they're very short-lived:
cpython/Modules/_remote_debugging/module.c
Line 717 in 57ef219
I believe that's where the reads happen:
cpython/Modules/_remote_debugging/module.c
Lines 540 to 548 in 57ef219
cpython/Modules/_remote_debugging/threads.c
Lines 306 to 307 in 57ef219
cpython/Modules/_remote_debugging/frames.c
Lines 200 to 205 in 57ef219
These are all relatively small structs, between 80-8000 bytes.
(My understanding is that
InterpState will nearly always be on a different page than TS / Frame, but they can be.) The larger the
mach_vm_read_overwrite, the slower it gets. Similarly on Linux?
I think that one approach is to invalidate more intelligently (or even have two caches: stable and not so stable), using
code_object_generation or tlbc_generation, but I cannot figure out a trick for TS/frame. Using just
_Py_RemoteDebug_ReadRemoteMemory, we get:
Note that
mach_vm_read_overwrite is now 2570 ns vs. 3137 ns before. As a result:
Versus, on the
main branch (57ef219 revision), that's:
PS. Truth be told, I think that only
tstate.current_frame and frame.instr_ptr have really high churn, and it's still not that efficient. PS2. What do you think about
mach_vm_remap(copy=FALSE)? ;->
CPython versions tested on:
CPython main branch
Operating systems tested on:
macOS
Linked PRs