Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Lib/profiling/sampling/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,33 @@ def _print_unwinder_stats(self):
print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")

batched_attempts = stats.get('batched_read_attempts', 0)
batched_successes = stats.get('batched_read_successes', 0)
batched_misses = stats.get('batched_read_misses', 0)
segments_requested = stats.get('batched_read_segments_requested', 0)
segments_completed = stats.get('batched_read_segments_completed', 0)
if batched_attempts > 0:
batched_success_rate = stats.get('batched_read_success_rate', 0.0)
batched_miss_rate = 100.0 - batched_success_rate
segment_completion_rate = stats.get(
'batched_read_segment_completion_rate', 0.0
)

print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
print(f" Attempts: {batched_attempts:n}")
print(
f" Successes: {batched_successes:n} "
f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
)
print(
f" Misses: {batched_misses:n} "
f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
)
print(
f" Segments read: {segments_completed:n}/{segments_requested:n} "
f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
)

# Memory operations
memory_reads = stats.get('memory_reads', 0)
memory_bytes = stats.get('memory_bytes_read', 0)
Expand Down
7 changes: 7 additions & 0 deletions Lib/test/test_external_inspection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3767,6 +3767,13 @@ def test_get_stats(self):
"frames_read_from_cache",
"frames_read_from_memory",
"frame_cache_hit_rate",
"batched_read_attempts",
"batched_read_successes",
"batched_read_misses",
"batched_read_segments_requested",
"batched_read_segments_completed",
"batched_read_success_rate",
"batched_read_segment_completion_rate",
]
for key in expected_keys:
self.assertIn(key, stats)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix excessive overhead in the Tachyon profiler when inspecting a remote
process by avoiding repeated remote page-cache scans, batching predicted
remote reads, and reusing cached profiler result objects. Patch by Pablo
Galindo and Maurycy Pawłowski-Wieroński.
68 changes: 65 additions & 3 deletions Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ extern "C" {
#include "internal/pycore_llist.h" // struct llist_node
#include "internal/pycore_long.h" // _PyLong_GetZero
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
#include "internal/pycore_stackref.h" // Py_TAG_BITS
#include "../../Python/remote_debug.h"

Expand Down Expand Up @@ -215,6 +216,8 @@ typedef struct {
PyObject *file_name;
int first_lineno;
PyObject *linetable; // bytes
PyObject *last_frame_info;
ptrdiff_t last_addrq;
uintptr_t addr_code_adaptive;
} CachedCodeMetadata;

Expand All @@ -224,11 +227,34 @@ typedef struct {

// One slot of the unwinder's frame cache array (RemoteUnwinderObject.frame_cache).
// A slot is looked up either by thread_id (frame_cache_find) or by
// thread_state_addr (frame_cache_find_by_tstate).
typedef struct {
uint64_t thread_id; // 0 = empty slot
// Thread state address passed to frame_cache_store(); lookup key for
// frame_cache_find_by_tstate(). Reset to 0 when the slot is invalidated.
uintptr_t thread_state_addr;
// Addresses copied in by frame_cache_store(); only the first num_addrs
// entries are meaningful.
// NOTE(review): presumably one remote frame address per cached frame - confirm.
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
// Number of valid entries in addrs[]; never exceeds FRAME_CACHE_MAX_FRAMES.
Py_ssize_t num_addrs;
// PyLong built from thread_id by frame_cache_store() the first time the
// slot is filled, then kept until the slot is cleared.
PyObject *thread_id_obj; // owned reference, NULL if empty
PyObject *frame_list; // owned reference, NULL if empty
} FrameCacheEntry;

#define INTERPRETER_THREAD_CACHE_SIZE 32
#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
#endif

// One slot of the unwinder's cached_tstates[] table
// (INTERPRETER_THREAD_CACHE_SIZE slots): pairs an interpreter address with a
// thread state address remembered for it.
// NOTE(review): which thread state of the interpreter is cached here, and how
// an empty slot is marked, is not visible from this header - confirm against
// the code that fills the table.
typedef struct {
// Address of the interpreter this slot describes (the lookup key).
uintptr_t interpreter_addr;
// Thread state address cached for that interpreter.
uintptr_t thread_state_addr;
} InterpreterThreadCacheEntry;

// Carries already-read thread state and/or frame buffers across helpers so the
// downstream callee can skip a remote read. Address fields are caller-supplied
// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
// successfully populated them.
//
// The struct only stores pointers; it does not own the buffers.
// NOTE(review): buffer sizes and lifetimes are defined by whoever fills the
// struct - confirm at the call sites before holding on to these pointers.
typedef struct {
// Already-read thread state bytes, or NULL if they were not prefetched.
const char *tstate;
// Caller-supplied thread state address (presumably the address the
// tstate buffer corresponds to - confirm).
uintptr_t tstate_addr;
// Already-read frame bytes, or NULL if they were not prefetched.
const char *frame;
// Caller-supplied frame address (presumably the address the frame
// buffer corresponds to - confirm).
uintptr_t frame_addr;
} RemoteReadPrefetch;

/* Statistics for profiling performance analysis */
typedef struct {
uint64_t total_samples; // Total number of get_stack_trace calls
Expand All @@ -242,14 +268,40 @@ typedef struct {
uint64_t code_object_cache_hits; // Code object cache hits
uint64_t code_object_cache_misses; // Code object cache misses
uint64_t stale_cache_invalidations; // Times stale entries were cleared
uint64_t batched_read_attempts; // Batched remote-read attempts
uint64_t batched_read_successes; // Attempts that read all requested segments
uint64_t batched_read_misses; // Attempts that fell back or partially read
uint64_t batched_read_segments_requested; // Segments requested by batched reads
uint64_t batched_read_segments_completed; // Segments completed by batched reads
} UnwinderStats;

/*
 * Branch hint: marks `value` as expected to be false.
 *
 * GCC and Clang get __builtin_expect(); every other compiler gets the bare
 * expression. Use the result only as a truth test: the builtin form yields
 * 0 or 1, while the fallback yields `value` unchanged.
 */
#if !defined(__GNUC__) && !defined(__clang__)
#  define REMOTE_DEBUG_UNLIKELY(value) (value)
#else
#  define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
#endif

/* Stats tracking macros - no-op when stats collection is disabled */
#define STATS_INC(unwinder, field) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)

#define STATS_ADD(unwinder, field, val) \
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)

/*
 * Record the outcome of one batched remote read.
 *
 * When stats collection is enabled this bumps batched_read_attempts, adds
 * `requested` and `completed` to the segment counters, and classifies the
 * attempt as a success only if every requested segment completed (otherwise
 * as a miss). It is a no-op when collection is disabled.
 *
 * `requested` and `completed` are each evaluated exactly once (and not at all
 * when collection is disabled), so the counters and the success/miss decision
 * always see the same values even if an argument has side effects. Both are
 * converted to uint64_t before being compared. `unwinder` is still expanded
 * more than once, as in STATS_INC / STATS_ADD, so pass a plain pointer.
 */
#define STATS_BATCHED_READ(unwinder, requested, completed) \
    do { \
        if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
            const uint64_t stats_batched_requested_ = (uint64_t)(requested); \
            const uint64_t stats_batched_completed_ = (uint64_t)(completed); \
            (unwinder)->stats.batched_read_attempts++; \
            (unwinder)->stats.batched_read_segments_requested += stats_batched_requested_; \
            (unwinder)->stats.batched_read_segments_completed += stats_batched_completed_; \
            if (stats_batched_completed_ == stats_batched_requested_) { \
                (unwinder)->stats.batched_read_successes++; \
            } \
            else { \
                (unwinder)->stats.batched_read_misses++; \
            } \
        } \
    } while(0)

typedef struct {
PyTypeObject *RemoteDebugging_Type;
Expand Down Expand Up @@ -302,9 +354,14 @@ typedef struct {
int cache_frames;
int collect_stats; // whether to collect statistics
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
// L1 single-entry shortcut over cached_tstates[]: most workloads sample one
// interpreter, so check this pair before hashing into the table below.
uintptr_t cached_tstate_interpreter_addr;
uintptr_t cached_tstate_addr;
RemoteDebuggingState *cached_state;
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
UnwinderStats stats; // statistics for performance analysis
InterpreterThreadCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
#ifdef Py_GIL_DISABLED
uint32_t tlbc_generation;
_Py_hashtable_t *tlbc_cache;
Expand Down Expand Up @@ -361,11 +418,13 @@ typedef struct {
typedef struct {
/* Inputs */
uintptr_t frame_addr; // Starting frame address
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers

/* Outputs */
PyObject *frame_info; // List to append FrameInfo objects
Expand Down Expand Up @@ -548,6 +607,7 @@ extern int process_frame_chain(
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
Expand All @@ -566,6 +626,7 @@ extern int frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);

Expand Down Expand Up @@ -605,7 +666,8 @@ extern PyObject* unwind_stack_for_thread(
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
uintptr_t main_thread_tstate
uintptr_t main_thread_tstate,
const RemoteReadPrefetch *prefetch
);

/* Thread stopping functions (for blocking mode) */
Expand Down
9 changes: 8 additions & 1 deletion Modules/_remote_debugging/clinic/module.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions Modules/_remote_debugging/code_objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
meta->func_name = func;
meta->file_name = file;
meta->linetable = linetable;
meta->last_frame_info = NULL;
meta->last_addrq = -1;
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;

Expand Down Expand Up @@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
#endif
; // Empty statement to avoid C23 extension warning

if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
Comment thread
pablogsal marked this conversation as resolved.
*result = Py_NewRef(meta->last_frame_info);
return 0;
}

LocationInfo info = {0};
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
PyBytes_GET_SIZE(meta->linetable),
Expand Down Expand Up @@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
goto error;
}

if (!unwinder->opcodes) {
Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
meta->last_addrq = addrq;
}

*result = tuple;
return 0;

Expand Down
26 changes: 26 additions & 0 deletions Modules/_remote_debugging/frame_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
Expand All @@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
return NULL;
}

/*
 * Find the frame cache slot whose recorded thread state address equals
 * tstate_addr.
 *
 * Returns a borrowed pointer into unwinder->frame_cache, or NULL when the
 * cache has not been allocated, tstate_addr is 0, or no slot matches.
 * The scan is linear over all FRAME_CACHE_MAX_THREADS slots; the first match
 * wins.
 */
FrameCacheEntry *
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
{
    FrameCacheEntry *slot = unwinder->frame_cache;

    /* 0 is the value stored in cleared slots, so it can never be a real key. */
    if (slot == NULL || tstate_addr == 0) {
        return NULL;
    }

    FrameCacheEntry *const end = slot + FRAME_CACHE_MAX_THREADS;
    for (; slot != end; ++slot) {
        if (slot->thread_state_addr != tstate_addr) {
            continue;
        }
        assert(slot->num_addrs <= FRAME_CACHE_MAX_FRAMES);
        return slot;
    }
    return NULL;
}

// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
Expand Down Expand Up @@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
}
if (!found) {
// Clear this entry
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
Expand Down Expand Up @@ -216,6 +234,7 @@ frame_cache_store(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
Expand Down Expand Up @@ -257,6 +276,13 @@ frame_cache_store(
return -1;
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
return -1;
}
}
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);
Expand Down
Loading
Loading