diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h
index e473110eca7415..5d1f44988a6df1 100644
--- a/Include/cpython/pystats.h
+++ b/Include/cpython/pystats.h
@@ -144,6 +144,7 @@ typedef struct _optimization_stats {
     uint64_t unknown_callee;
     uint64_t trace_immediately_deopts;
     uint64_t executors_invalidated;
+    uint64_t fitness_terminated_traces;
     UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
     uint64_t unsupported_opcode[256];
     uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index c4b084642668a9..6366335fd92584 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -449,6 +449,10 @@ typedef struct _PyOptimizationConfig {
     uint16_t side_exit_initial_value;
     uint16_t side_exit_initial_backoff;
 
+    // Trace fitness thresholds
+    uint16_t fitness_initial;
+    uint16_t fitness_initial_side;
+
     // Optimization flags
     bool specialization_enabled;
     bool uops_optimize_enabled;
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index cf01c620476ff7..48caa6648232b6 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -15,6 +15,23 @@ extern "C" {
 
 #include "pycore_optimizer_types.h"
 #include <stdbool.h>
+/* Default fitness configuration values for trace quality control.
+ * FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be overridden via
+ * PYTHON_JIT_FITNESS_INITIAL and PYTHON_JIT_FITNESS_INITIAL_SIDE */
+#define FITNESS_PER_INSTRUCTION 2
+#define FITNESS_BRANCH_BASE 5
+#define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION)
+#define FITNESS_INITIAL_SIDE (FITNESS_INITIAL * 3 / 5)
+#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL / 10)
+
+/* Exit quality constants for fitness-based trace termination.
+ * Higher values mean better places to stop the trace. */
+
+#define EXIT_QUALITY_DEFAULT 200
+#define EXIT_QUALITY_CLOSE_LOOP (4 * EXIT_QUALITY_DEFAULT)
+#define EXIT_QUALITY_ENTER_EXECUTOR (2 * EXIT_QUALITY_DEFAULT + 100)
+#define EXIT_QUALITY_SPECIALIZABLE (EXIT_QUALITY_DEFAULT / 4)
+
 
 typedef struct _PyJitUopBuffer {
     _PyUOpInstruction *start;
@@ -101,7 +118,8 @@ typedef struct _PyJitTracerPreviousState {
 } _PyJitTracerPreviousState;
 
 typedef struct _PyJitTracerTranslatorState {
-    int jump_backward_seen;
+    int32_t fitness;    // Current trace fitness, starts high, decrements
+    int frame_depth;    // Current inline depth (0 = root frame)
 } _PyJitTracerTranslatorState;
 
 typedef struct _PyJitTracerState {
diff --git a/Python/optimizer.c b/Python/optimizer.c
index f09bf778587b12..81b6c228265e52 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
 };
 
-#define CONFIDENCE_RANGE 1000
-#define CONFIDENCE_CUTOFF 333
 
 
 #ifdef Py_DEBUG
 #define DPRINTF(level, ...) \
@@ -598,6 +596,46 @@ add_to_trace(
     ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
 
 
+/* Compute branch fitness penalty based on how likely the traced path is.
+ * The penalty is small when the traced path is common, large when rare.
+ * A branch that historically goes the other way gets a heavy penalty. */
+static inline int
+compute_branch_penalty(uint16_t history, bool branch_taken)
+{
+    int taken_count = _Py_popcount32((uint32_t)history);
+    int on_trace_count = branch_taken ? taken_count : 16 - taken_count;
+    int off_trace = 16 - on_trace_count;
+    /* Linear scaling: off_trace ranges from 0 (fully biased our way)
+     * to 16 (fully biased against us), so the penalty ranges from
+     * FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 32. */
+    return FITNESS_BRANCH_BASE + off_trace * 2;
+}
+
+/* Compute exit quality for the current trace position.
+ * Higher values mean better places to stop the trace. */
+static inline int32_t
+compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
+                     const _PyJitTracerState *tracer)
+{
+    if (target_instr == tracer->initial_state.start_instr ||
+        target_instr == tracer->initial_state.close_loop_instr) {
+        return EXIT_QUALITY_CLOSE_LOOP;
+    }
+    if (target_instr->op.code == ENTER_EXECUTOR) {
+        return EXIT_QUALITY_ENTER_EXECUTOR;
+    }
+    if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
+        return EXIT_QUALITY_SPECIALIZABLE;
+    }
+    return EXIT_QUALITY_DEFAULT;
+}
+
+static inline int32_t
+compute_frame_penalty(const _PyOptimizationConfig *cfg)
+{
+    return (int32_t)cfg->fitness_initial / 30 + 1;
+}
+
 static int
 is_terminator(const _PyUOpInstruction *uop)
 {
@@ -734,13 +772,11 @@ _PyJit_translate_single_bytecode_to_trace(
         DPRINTF(2, "Unsupported: oparg too large\n");
     unsupported:
     {
-        // Rewind to previous instruction and replace with _EXIT_TRACE.
         _PyUOpInstruction *curr = uop_buffer_last(trace);
         while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
             trace->next--;
             curr = uop_buffer_last(trace);
         }
-        assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
         if (curr->opcode == _SET_IP) {
             int32_t old_target = (int32_t)uop_get_target(curr);
             curr->opcode = _DEOPT;
@@ -763,6 +799,23 @@ _PyJit_translate_single_bytecode_to_trace(
         return 1;
     }
 
+    // Fitness-based trace quality check (before reserving space for this instruction)
+    _PyJitTracerTranslatorState *ts = &tracer->translator_state;
+    int32_t eq = compute_exit_quality(target_instr, opcode, tracer);
+    DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n",
+            _PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth);
+
+    // Check if fitness is depleted — should we stop the trace?
+    if (ts->fitness < eq) {
+        // This is a tracer heuristic rather than normal program control flow,
+        // so leave operand1 clear and let the resulting side exit increase chain_depth.
+        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
+        OPT_STAT_INC(fitness_terminated_traces);
+        DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n",
+                _PyOpcode_OpName[opcode], oparg, ts->fitness, eq);
+        goto done;
+    }
+
     // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
     trace->end -= 2;
 
@@ -789,12 +842,7 @@ _PyJit_translate_single_bytecode_to_trace(
 
     trace->end -= needs_guard_ip;
     int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
-    if (uop_buffer_remaining_space(trace) < space_needed) {
-        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
-                space_needed, uop_buffer_remaining_space(trace));
-        OPT_STAT_INC(trace_too_long);
-        goto done;
-    }
+    assert(uop_buffer_remaining_space(trace) > space_needed);
 
     ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
 
@@ -816,6 +864,12 @@ _PyJit_translate_single_bytecode_to_trace(
             assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr));
             uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
             ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
+            int bp = compute_branch_penalty(target_instr[1].cache, jump_happened);
+            tracer->translator_state.fitness -= bp;
+            DPRINTF(3, "    branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n",
+                    bp, target_instr[1].cache, jump_happened,
+                    tracer->translator_state.fitness);
+
             break;
         }
         case JUMP_BACKWARD_JIT:
@@ -823,6 +877,9 @@ _PyJit_translate_single_bytecode_to_trace(
         case JUMP_BACKWARD_NO_JIT:
         case JUMP_BACKWARD:
             ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
+            tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE;
+            DPRINTF(3, "    backward edge penalty: -%d -> fitness=%d\n",
+                    FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness);
             _Py_FALLTHROUGH;
         case JUMP_BACKWARD_NO_INTERRUPT:
         {
@@ -945,6 +1002,37 @@ _PyJit_translate_single_bytecode_to_trace(
             assert(next->op.code == STORE_FAST);
             operand = next->op.arg;
         }
+        else if (uop == _PUSH_FRAME) {
+            _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
+            ts_depth->frame_depth++;
+            assert(ts_depth->frame_depth < MAX_ABSTRACT_FRAME_DEPTH);
+            int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
+            int32_t cost = frame_penalty * ts_depth->frame_depth;
+            ts_depth->fitness -= cost;
+            DPRINTF(3, "    _PUSH_FRAME: depth=%d, penalty=-%d (per_frame=%d) -> fitness=%d\n",
+                    ts_depth->frame_depth, cost, frame_penalty,
+                    ts_depth->fitness);
+        }
+        else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
+            _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
+            int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
+            if (ts_depth->frame_depth <= 0) {
+                // Underflow: returning from a frame we didn't enter
+                ts_depth->fitness -= frame_penalty * 2;
+                DPRINTF(3, "    %s: underflow penalty=-%d -> fitness=%d\n",
+                        _PyOpcode_uop_name[uop], frame_penalty * 2,
+                        ts_depth->fitness);
+            }
+            else {
+                // Reward returning: small inlined calls should be encouraged
+                ts_depth->fitness += frame_penalty;
+                DPRINTF(3, "    %s: return reward=+%d, depth=%d -> fitness=%d\n",
+                        _PyOpcode_uop_name[uop], frame_penalty,
+                        ts_depth->frame_depth - 1,
+                        ts_depth->fitness);
+            }
+            ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1;
+        }
         else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
             PyObject *recorded_value = tracer->prev_state.recorded_value;
             tracer->prev_state.recorded_value = NULL;
@@ -986,7 +1074,13 @@ _PyJit_translate_single_bytecode_to_trace(
         ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
         goto done;
     }
-    DPRINTF(2, "Trace continuing\n");
+    // Update fitness AFTER translation, BEFORE returning to continue tracing.
+    // This ensures the next iteration's fitness check reflects the cost of
+    // all instructions translated so far.
+    tracer->translator_state.fitness -= FITNESS_PER_INSTRUCTION;
+    DPRINTF(3, "    per-insn cost: -%d -> fitness=%d\n",
+            FITNESS_PER_INSTRUCTION, tracer->translator_state.fitness);
+    DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
     return 1;
 done:
     DPRINTF(2, "Trace done\n");
@@ -1069,6 +1163,17 @@ _PyJit_TryInitializeTracing(
     assert(curr_instr->op.code == JUMP_BACKWARD_JIT ||
            curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
     tracer->initial_state.jump_backward_instr = curr_instr;
+    // Initialize fitness tracking state
+    const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
+    _PyJitTracerTranslatorState *ts = &tracer->translator_state;
+    bool is_side_trace = (exit != NULL);
+    ts->fitness = is_side_trace
+        ? (int32_t)cfg->fitness_initial_side
+        : (int32_t)cfg->fitness_initial;
+    ts->frame_depth = 0;
+    DPRINTF(3, "Fitness init: %s trace, fitness=%d\n",
+            is_side_trace ? "side" : "root", ts->fitness);
+
     tracer->is_tracing = true;
     return 1;
 }
@@ -2101,7 +2206,11 @@ _PyDumpExecutors(FILE *out)
     fprintf(out, " node [colorscheme=greys9]\n");
     PyInterpreterState *interp = PyInterpreterState_Get();
     for (size_t i = 0; i < interp->executor_count; i++) {
-        executor_to_gv(interp->executor_ptrs[i], out);
+        _PyExecutorObject *exec = interp->executor_ptrs[i];
+        if (exec->vm_data.code == NULL) {
+            continue;
+        }
+        executor_to_gv(exec, out);
     }
     fprintf(out, "}\n\n");
     return 0;
diff --git a/Python/pystate.c b/Python/pystate.c
index 143175da0f45c7..78eab7cc7d2459 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -635,6 +635,22 @@ init_interpreter(PyInterpreterState *interp,
                 "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
                 SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);
 
+    // Trace fitness configuration
+    init_policy(&interp->opt_config.fitness_initial,
+                "PYTHON_JIT_FITNESS_INITIAL",
+                FITNESS_INITIAL, 100, 10000);
+    init_policy(&interp->opt_config.fitness_initial_side,
+                "PYTHON_JIT_FITNESS_INITIAL_SIDE",
+                FITNESS_INITIAL_SIDE, 50, 5000);
+    /* The tracer starts at start_instr, so initial fitness must not be below
+     * the close-loop exit quality or tracing will terminate immediately. */
+    if (interp->opt_config.fitness_initial < EXIT_QUALITY_CLOSE_LOOP) {
+        interp->opt_config.fitness_initial = EXIT_QUALITY_CLOSE_LOOP;
+    }
+    if (interp->opt_config.fitness_initial_side < EXIT_QUALITY_CLOSE_LOOP) {
+        interp->opt_config.fitness_initial_side = EXIT_QUALITY_CLOSE_LOOP;
+    }
+
     interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
     interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
     if (interp != &runtime->_main_interpreter) {
diff --git a/Python/pystats.c b/Python/pystats.c
index a057ad884566d8..2fac2db1b738c7 100644
--- a/Python/pystats.c
+++ b/Python/pystats.c
@@ -274,6 +274,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats)
     fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence);
     fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee);
     fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated);
+    fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces);
 
     print_histogram(out, "Trace length", stats->trace_length_hist);
     print_histogram(out, "Trace run length", stats->trace_run_length_hist);