diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt index 819d6875b58..7681760ea65 100644 --- a/.cspell.dict/cpython.txt +++ b/.cspell.dict/cpython.txt @@ -44,6 +44,7 @@ copyslot cpucount defaultdict denom +deopt dictbytype DICTFLAG dictoffset diff --git a/.cspell.json b/.cspell.json index bbc13e6fded..0d41568618a 100644 --- a/.cspell.json +++ b/.cspell.json @@ -60,6 +60,7 @@ "dedentations", "dedents", "deduped", + "deoptimize", "downcastable", "downcasted", "dumpable", @@ -73,6 +74,7 @@ "interps", "jitted", "jitting", + "kwonly", "lossily", "makeunicodedata", "microbenchmark", diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs index 4363ffaa768..b21af84a51c 100644 --- a/crates/codegen/src/ir.rs +++ b/crates/codegen/src/ir.rs @@ -457,11 +457,13 @@ impl CodeInfo { .map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte)) .chain([CodeUnit { op, arg: lo_arg }]), ); - // Emit CACHE code units after the instruction - instructions.extend(core::iter::repeat_n( - CodeUnit::new(Instruction::Cache, 0.into()), - cache_count, - )); + // Emit CACHE code units after the instruction (all zeroed) + if cache_count > 0 { + instructions.extend(core::iter::repeat_n( + CodeUnit::new(Instruction::Cache, 0.into()), + cache_count, + )); + } current_offset = offset_after; } next_block = block.next; diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index cece1fb77fa..cec04b9edd9 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -343,6 +343,11 @@ pub struct CodeUnit { const _: () = assert!(mem::size_of::() == 2); +/// Adaptive specialization: number of executions before attempting specialization. +pub const ADAPTIVE_WARMUP_VALUE: u8 = 50; +/// Adaptive specialization: backoff counter after de-optimization. +pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250; + impl CodeUnit { pub const fn new(op: Instruction, arg: OpArgByte) -> Self { Self { op, arg } @@ -391,7 +396,11 @@ impl TryFrom<&[u8]> for CodeUnits { return Err(Self::Error::InvalidBytecode); } - value.chunks_exact(2).map(CodeUnit::try_from).collect() + let units: Self = value + .chunks_exact(2) + .map(CodeUnit::try_from) + .collect::>()?; + Ok(units) } } @@ -441,6 +450,140 @@ impl CodeUnits { core::ptr::write(op_ptr, new_op.into()); } } + + /// Write a u16 value into a CACHE code unit at `index`. + /// Each CodeUnit is 2 bytes (#[repr(C)]: op u8 + arg u8), so one u16 fits exactly. + /// + /// # Safety + /// - `index` must be in bounds and point to a CACHE entry. + /// - The caller must ensure no concurrent reads/writes to the same slot. + pub unsafe fn write_cache_u16(&self, index: usize, value: u16) { + unsafe { + let units = &mut *self.0.get(); + let ptr = units.as_mut_ptr().add(index) as *mut u8; + core::ptr::write_unaligned(ptr as *mut u16, value); + } + } + + /// Read a u16 value from a CACHE code unit at `index`. + /// + /// # Panics + /// Panics if `index` is out of bounds. + pub fn read_cache_u16(&self, index: usize) -> u16 { + let units = unsafe { &*self.0.get() }; + assert!(index < units.len(), "read_cache_u16: index out of bounds"); + let ptr = units.as_ptr().wrapping_add(index) as *const u8; + unsafe { core::ptr::read_unaligned(ptr as *const u16) } + } + + /// Write a u32 value across two consecutive CACHE code units starting at `index`. + /// + /// # Safety + /// Same requirements as `write_cache_u16`. + pub unsafe fn write_cache_u32(&self, index: usize, value: u32) { + unsafe { + self.write_cache_u16(index, value as u16); + self.write_cache_u16(index + 1, (value >> 16) as u16); + } + } + + /// Read a u32 value from two consecutive CACHE code units starting at `index`. + /// + /// # Panics + /// Panics if `index + 1` is out of bounds. + pub fn read_cache_u32(&self, index: usize) -> u32 { + let lo = self.read_cache_u16(index) as u32; + let hi = self.read_cache_u16(index + 1) as u32; + lo | (hi << 16) + } + + /// Write a u64 value across four consecutive CACHE code units starting at `index`. + /// + /// # Safety + /// Same requirements as `write_cache_u16`. + pub unsafe fn write_cache_u64(&self, index: usize, value: u64) { + unsafe { + self.write_cache_u32(index, value as u32); + self.write_cache_u32(index + 2, (value >> 32) as u32); + } + } + + /// Read a u64 value from four consecutive CACHE code units starting at `index`. + /// + /// # Panics + /// Panics if `index + 3` is out of bounds. + pub fn read_cache_u64(&self, index: usize) -> u64 { + let lo = self.read_cache_u32(index) as u64; + let hi = self.read_cache_u32(index + 2) as u64; + lo | (hi << 32) + } + + /// Read the adaptive counter from the first CACHE entry's `arg` byte. + /// This preserves `op = Instruction::Cache`, unlike `read_cache_u16`. + pub fn read_adaptive_counter(&self, index: usize) -> u8 { + let units = unsafe { &*self.0.get() }; + u8::from(units[index].arg) + } + + /// Write the adaptive counter to the first CACHE entry's `arg` byte. + /// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`. + /// + /// # Safety + /// - `index` must be in bounds and point to a CACHE entry. + pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) { + let units = unsafe { &mut *self.0.get() }; + units[index].arg = OpArgByte::from(value); + } + + /// Produce a clean copy of the bytecode suitable for serialization + /// (marshal) and `co_code`. Specialized opcodes are mapped back to their + /// base variants via `deoptimize()` and all CACHE entries are zeroed. + pub fn original_bytes(&self) -> Vec { + let units = unsafe { &*self.0.get() }; + let mut out = Vec::with_capacity(units.len() * 2); + let len = units.len(); + let mut i = 0; + while i < len { + let op = units[i].op.deoptimize(); + let caches = op.cache_entries(); + out.push(u8::from(op)); + out.push(u8::from(units[i].arg)); + // Zero-fill all CACHE entries (counter + cached data) + for _ in 0..caches { + i += 1; + out.push(0); // op = Cache = 0 + out.push(0); // arg = 0 + } + i += 1; + } + out + } + + /// Initialize adaptive warmup counters for all cacheable instructions. + /// Called lazily at RESUME (first execution of a code object). + /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`. + pub fn quicken(&self) { + let units = unsafe { &mut *self.0.get() }; + let len = units.len(); + let mut i = 0; + while i < len { + let op = units[i].op; + let caches = op.cache_entries(); + if caches > 0 { + // Don't write adaptive counter for instrumented opcodes; + // specialization is skipped while monitoring is active. + if !op.is_instrumented() { + let cache_base = i + 1; + if cache_base < len { + units[cache_base].arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE); + } + } + i += 1 + caches; + } else { + i += 1; + } + } + } } /// A Constant (which usually encapsulates data within it) diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs index c1c5e8cd847..e7b13ff21d2 100644 --- a/crates/compiler-core/src/bytecode/instruction.rs +++ b/crates/compiler-core/src/bytecode/instruction.rs @@ -512,6 +512,126 @@ impl Instruction { }) } + /// Map a specialized opcode back to its adaptive (base) variant. + /// `_PyOpcode_Deopt` + pub fn deoptimize(self) -> Self { + match self { + // LOAD_ATTR specializations + Self::LoadAttrClass + | Self::LoadAttrClassWithMetaclassCheck + | Self::LoadAttrGetattributeOverridden + | Self::LoadAttrInstanceValue + | Self::LoadAttrMethodLazyDict + | Self::LoadAttrMethodNoDict + | Self::LoadAttrMethodWithValues + | Self::LoadAttrModule + | Self::LoadAttrNondescriptorNoDict + | Self::LoadAttrNondescriptorWithValues + | Self::LoadAttrProperty + | Self::LoadAttrSlot + | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() }, + // BINARY_OP specializations + Self::BinaryOpAddFloat + | Self::BinaryOpAddInt + | Self::BinaryOpAddUnicode + | Self::BinaryOpExtend + | Self::BinaryOpInplaceAddUnicode + | Self::BinaryOpMultiplyFloat + | Self::BinaryOpMultiplyInt + | Self::BinaryOpSubscrDict + | Self::BinaryOpSubscrGetitem + | Self::BinaryOpSubscrListInt + | Self::BinaryOpSubscrListSlice + | Self::BinaryOpSubscrStrInt + | Self::BinaryOpSubscrTupleInt + | Self::BinaryOpSubtractFloat + | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() }, + // CALL specializations + Self::CallAllocAndEnterInit + | Self::CallBoundMethodExactArgs + | Self::CallBoundMethodGeneral + | Self::CallBuiltinClass + | Self::CallBuiltinFast + | Self::CallBuiltinFastWithKeywords + | Self::CallBuiltinO + | Self::CallIsinstance + | Self::CallLen + | Self::CallListAppend + | Self::CallMethodDescriptorFast + | Self::CallMethodDescriptorFastWithKeywords + | Self::CallMethodDescriptorNoargs + | Self::CallMethodDescriptorO + | Self::CallNonPyGeneral + | Self::CallPyExactArgs + | Self::CallPyGeneral + | Self::CallStr1 + | Self::CallTuple1 + | Self::CallType1 => Self::Call { + nargs: Arg::marker(), + }, + // CALL_KW specializations + Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw { + nargs: Arg::marker(), + }, + // TO_BOOL specializations + Self::ToBoolAlwaysTrue + | Self::ToBoolBool + | Self::ToBoolInt + | Self::ToBoolList + | Self::ToBoolNone + | Self::ToBoolStr => Self::ToBool, + // COMPARE_OP specializations + Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => { + Self::CompareOp { op: Arg::marker() } + } + // CONTAINS_OP specializations + Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()), + // FOR_ITER specializations + Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => { + Self::ForIter { + target: Arg::marker(), + } + } + // LOAD_GLOBAL specializations + Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()), + // STORE_ATTR specializations + Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => { + Self::StoreAttr { idx: Arg::marker() } + } + // LOAD_SUPER_ATTR specializations + Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => { + Self::LoadSuperAttr { arg: Arg::marker() } + } + // STORE_SUBSCR specializations + Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr, + // UNPACK_SEQUENCE specializations + Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => { + Self::UnpackSequence { + size: Arg::marker(), + } + } + // SEND specializations + Self::SendGen => Self::Send { + target: Arg::marker(), + }, + // LOAD_CONST specializations + Self::LoadConstImmortal | Self::LoadConstMortal => { + Self::LoadConst { idx: Arg::marker() } + } + // RESUME specializations + Self::ResumeCheck => Self::Resume { arg: Arg::marker() }, + // JUMP_BACKWARD specializations + Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward { + target: Arg::marker(), + }, + // Instrumented opcodes map back to their base + _ => match self.to_base() { + Some(base) => base, + None => self, + }, + } + } + /// Number of CACHE code units that follow this instruction. /// _PyOpcode_Caches pub fn cache_entries(self) -> usize { @@ -626,8 +746,11 @@ impl Instruction { | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => 1, - // Everything else: 0 cache entries - _ => 0, + // Instrumented opcodes have the same cache entries as their base + _ => match self.to_base() { + Some(base) => base.cache_entries(), + None => 0, + }, } } } diff --git a/crates/compiler-core/src/marshal.rs b/crates/compiler-core/src/marshal.rs index 11df127920a..310bad9d868 100644 --- a/crates/compiler-core/src/marshal.rs +++ b/crates/compiler-core/src/marshal.rs @@ -662,9 +662,8 @@ pub fn serialize_value( pub fn serialize_code(buf: &mut W, code: &CodeObject) { write_len(buf, code.instructions.len()); - // SAFETY: it's ok to transmute CodeUnit to [u8; 2] - let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() }; - buf.write_slice(instructions_bytes); + let original = code.instructions.original_bytes(); + buf.write_slice(&original); write_len(buf, code.locations.len()); for (start, end) in &*code.locations { diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs index 1708477004e..126d0216546 100644 --- a/crates/vm/src/builtins/code.rs +++ b/crates/vm/src/builtins/code.rs @@ -346,6 +346,8 @@ pub struct PyCode { pub instrumentation_version: AtomicU64, /// Side-table for INSTRUMENTED_LINE / INSTRUMENTED_INSTRUCTION. pub monitoring_data: PyMutex>, + /// Whether adaptive counters have been initialized (lazy quickening). + pub quickened: core::sync::atomic::AtomicBool, } impl Deref for PyCode { @@ -363,6 +365,7 @@ impl PyCode { source_path: AtomicPtr::new(sp), instrumentation_version: AtomicU64::new(0), monitoring_data: PyMutex::new(None), + quickened: core::sync::atomic::AtomicBool::new(false), } } @@ -681,7 +684,12 @@ impl PyCode { #[pygetset] pub fn co_code(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { - // SAFETY: CodeUnit is #[repr(C)] with size 2, so we can safely transmute to bytes + vm.ctx.new_bytes(self.code.instructions.original_bytes()) + } + + #[pygetset] + pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { + // Return current (possibly quickened/specialized) bytecode let bytes = unsafe { core::slice::from_raw_parts( self.code.instructions.as_ptr() as *const u8, @@ -691,12 +699,6 @@ impl PyCode { vm.ctx.new_bytes(bytes.to_vec()) } - #[pygetset] - pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { - // RustPython doesn't have adaptive/specialized bytecode, so return regular co_code - self.co_code(vm) - } - #[pygetset] pub fn co_freevars(&self, vm: &VirtualMachine) -> PyTupleRef { let names = self diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 58f818cc7a7..489482d1933 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -22,6 +22,7 @@ use crate::{ Callable, Comparable, Constructor, GetAttr, GetDescriptor, PyComparisonOp, Representable, }, }; +use core::sync::atomic::{AtomicU32, Ordering::Relaxed}; use itertools::Itertools; #[cfg(feature = "jit")] use rustpython_jit::CompiledCode; @@ -72,10 +73,13 @@ pub struct PyFunction { annotate: PyMutex>, module: PyMutex, doc: PyMutex, + func_version: AtomicU32, #[cfg(feature = "jit")] jitted_code: OnceCell, } +static FUNC_VERSION_COUNTER: AtomicU32 = AtomicU32::new(1); + unsafe impl Traverse for PyFunction { fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { self.globals.traverse(tracer_fn); @@ -200,6 +204,7 @@ impl PyFunction { annotate: PyMutex::new(None), module: PyMutex::new(module), doc: PyMutex::new(doc), + func_version: AtomicU32::new(FUNC_VERSION_COUNTER.fetch_add(1, Relaxed)), #[cfg(feature = "jit")] jitted_code: OnceCell::new(), }; @@ -593,6 +598,68 @@ impl Py { pub fn invoke(&self, func_args: FuncArgs, vm: &VirtualMachine) -> PyResult { self.invoke_with_locals(func_args, None, vm) } + + /// Returns the function version, or 0 if invalidated. + #[inline] + pub fn func_version(&self) -> u32 { + self.func_version.load(Relaxed) + } + + /// Check if this function is eligible for exact-args call specialization. + /// Returns true if: no VARARGS, no VARKEYWORDS, no kwonly args, not generator/coroutine, + /// and effective_nargs matches co_argcount. + pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool { + let code = self.code.lock(); + let flags = code.flags; + flags.contains(bytecode::CodeFlags::NEWLOCALS) + && !flags.intersects( + bytecode::CodeFlags::VARARGS + | bytecode::CodeFlags::VARKEYWORDS + | bytecode::CodeFlags::GENERATOR + | bytecode::CodeFlags::COROUTINE, + ) + && code.kwonlyarg_count == 0 + && code.arg_count == effective_nargs + } + + /// Fast path for calling a simple function with exact positional args. + /// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args. + /// Only valid when: no VARARGS, no VARKEYWORDS, no kwonlyargs, not generator/coroutine, + /// and nargs == co_argcount. + pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult { + let code = self.code.lock().clone(); + + let locals = ArgMapping::from_dict_exact(vm.ctx.new_dict()); + + let frame = Frame::new( + code.clone(), + Scope::new(Some(locals), self.globals.clone()), + self.builtins.clone(), + self.closure.as_ref().map_or(&[], |c| c.as_slice()), + Some(self.to_owned().into()), + vm, + ) + .into_ref(&vm.ctx); + + // Copy args directly into fastlocals + { + let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + for (i, arg) in args.iter().enumerate() { + fastlocals[i] = Some(arg.clone()); + } + } + + // Handle cell2arg + if let Some(cell2arg) = code.cell2arg.as_deref() { + let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { + let x = fastlocals[*arg_idx as usize].take(); + frame.set_cell_contents(cell_idx, x); + } + } + + vm.run_frame(frame) + } } impl PyPayload for PyFunction { @@ -615,12 +682,7 @@ impl PyFunction { #[pygetset(setter)] fn set___code__(&self, code: PyRef) { *self.code.lock() = code; - // TODO: jit support - // #[cfg(feature = "jit")] - // { - // // If available, clear cached compiled code. - // let _ = self.jitted_code.take(); - // } + self.func_version.store(0, Relaxed); } #[pygetset] @@ -629,7 +691,8 @@ impl PyFunction { } #[pygetset(setter)] fn set___defaults__(&self, defaults: Option) { - self.defaults_and_kwdefaults.lock().0 = defaults + self.defaults_and_kwdefaults.lock().0 = defaults; + self.func_version.store(0, Relaxed); } #[pygetset] @@ -638,7 +701,8 @@ impl PyFunction { } #[pygetset(setter)] fn set___kwdefaults__(&self, kwdefaults: Option) { - self.defaults_and_kwdefaults.lock().1 = kwdefaults + self.defaults_and_kwdefaults.lock().1 = kwdefaults; + self.func_version.store(0, Relaxed); } // {"__closure__", T_OBJECT, OFF(func_closure), READONLY}, diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 86865e9e083..b3a3c206c68 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -28,7 +28,14 @@ use crate::{ Representable, SLOT_DEFS, SetAttr, TypeDataRef, TypeDataRefMut, TypeDataSlot, }, }; -use core::{any::Any, borrow::Borrow, ops::Deref, pin::Pin, ptr::NonNull}; +use core::{ + any::Any, + borrow::Borrow, + ops::Deref, + pin::Pin, + ptr::NonNull, + sync::atomic::{AtomicU32, Ordering}, +}; use indexmap::{IndexMap, map::Entry}; use itertools::Itertools; use num_traits::ToPrimitive; @@ -44,8 +51,12 @@ pub struct PyType { pub attributes: PyRwLock, pub slots: PyTypeSlots, pub heaptype_ext: Option>>, + /// Type version tag for inline caching. 0 means unassigned/invalidated. + pub tp_version_tag: AtomicU32, } +static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1); + unsafe impl crate::object::Traverse for PyType { fn traverse(&self, tracer_fn: &mut crate::object::TraverseFn<'_>) { self.base.traverse(tracer_fn); @@ -188,6 +199,34 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py, b: &Py) -> b } impl PyType { + /// Assign a fresh version tag. Returns 0 on overflow (all caches invalidated). + pub fn assign_version_tag(&self) -> u32 { + loop { + let current = NEXT_TYPE_VERSION.load(Ordering::Relaxed); + let Some(next) = current.checked_add(1) else { + return 0; // Overflow: version space exhausted + }; + if NEXT_TYPE_VERSION + .compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + self.tp_version_tag.store(current, Ordering::Release); + return current; + } + } + } + + /// Invalidate this type's version tag and cascade to all subclasses. + pub fn modified(&self) { + self.tp_version_tag.store(0, Ordering::Release); + let subclasses = self.subclasses.read(); + for weak_ref in subclasses.iter() { + if let Some(sub) = weak_ref.upgrade() { + sub.downcast_ref::().unwrap().modified(); + } + } + } + pub fn new_simple_heap( name: &str, base: &Py, @@ -365,6 +404,7 @@ impl PyType { attributes: PyRwLock::new(attrs), slots, heaptype_ext: Some(Pin::new(Box::new(heaptype_ext))), + tp_version_tag: AtomicU32::new(0), }, metaclass, None, @@ -418,6 +458,7 @@ impl PyType { attributes: PyRwLock::new(attrs), slots, heaptype_ext: None, + tp_version_tag: AtomicU32::new(0), }, metaclass, None, @@ -799,6 +840,9 @@ impl PyType { } update_mro_recursively(zelf, vm)?; + // Invalidate inline caches + zelf.modified(); + // TODO: do any old slots need to be cleaned up first? zelf.init_slots(&vm.ctx); @@ -1903,6 +1947,9 @@ impl SetAttr for PyType { ))); } } + // Invalidate inline caches that depend on this type's attributes + zelf.modified(); + if attr_name.as_wtf8().starts_with("__") && attr_name.as_wtf8().ends_with("__") { if assign { zelf.update_slot::(attr_name, &vm.ctx); diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 663885c579d..08ce117fd48 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -4,18 +4,18 @@ use crate::{ AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef, TryFromObject, VirtualMachine, builtins::{ - PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator, - PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback, - PyType, PyUtf8Str, + PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef, + PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, + PyTemplate, PyTraceback, PyType, PyUtf8Str, asyncgenerator::PyAsyncGenWrappedValue, - float::PyFloat, frame::stack_analysis, function::{PyCell, PyCellRef, PyFunction}, - int::PyInt, range::PyRangeIterator, tuple::{PyTuple, PyTupleRef}, }, - bytecode::{self, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod}, + bytecode::{ + self, ADAPTIVE_BACKOFF_VALUE, Arg, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod, + }, convert::{IntoObject, ToPyResult}, coroutine::Coro, exceptions::ExceptionCtor, @@ -34,7 +34,7 @@ use core::cell::UnsafeCell; use core::iter::zip; use core::sync::atomic; use core::sync::atomic::AtomicPtr; -use core::sync::atomic::Ordering::Relaxed; +use core::sync::atomic::Ordering::{Acquire, Relaxed}; use indexmap::IndexMap; use itertools::Itertools; use malachite_bigint::BigInt; @@ -1124,7 +1124,24 @@ impl ExecutingFrame<'_> { } match instruction { - Instruction::BinaryOp { op } => self.execute_bin_op(vm, op.get(arg)), + Instruction::BinaryOp { op } => { + let op_val = op.get(arg); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + let counter = self.code.instructions.read_adaptive_counter(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, counter - 1); + } + } else { + self.specialize_binary_op(vm, op_val, instr_idx, cache_base); + } + + self.execute_bin_op(vm, op_val) + } // TODO: In CPython, this does in-place unicode concatenation when // refcount is 1. Falls back to regular iadd for now. Instruction::BinaryOpInplaceAddUnicode => { @@ -1239,7 +1256,20 @@ impl ExecutingFrame<'_> { } Instruction::Call { nargs } => { // Stack: [callable, self_or_null, arg1, ..., argN] - let args = self.collect_positional_args(nargs.get(arg)); + let nargs_val = nargs.get(arg); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let counter = self.code.instructions.read_adaptive_counter(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, counter - 1); + } + } else { + self.specialize_call(vm, nargs_val, instr_idx, cache_base); + } + let args = self.collect_positional_args(nargs_val); self.execute_call(args, vm) } Instruction::CallKw { nargs } => { @@ -2282,6 +2312,10 @@ impl ExecutingFrame<'_> { } Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)), Instruction::Resume { .. } => { + // Lazy quickening: initialize adaptive counters on first execution + if !self.code.quickened.swap(true, atomic::Ordering::Relaxed) { + self.code.instructions.quicken(); + } // Check if bytecode needs re-instrumentation let global_ver = vm .state @@ -2644,6 +2678,298 @@ impl ExecutingFrame<'_> { self.push_value(vm.ctx.new_bool(!value).into()); Ok(None) } + // Specialized LOAD_ATTR opcodes + Instruction::LoadAttrMethodNoDict => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { + // Cache hit: load the cached method descriptor + let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); + let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + let owner = self.pop_value(); + self.push_value(func); + self.push_value(owner); + Ok(None) + } else { + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } + } + Instruction::LoadAttrMethodWithValues => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let attr_name = self.code.names[oparg.name_idx() as usize]; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { + // Check instance dict doesn't shadow the method + let shadowed = if let Some(dict) = owner.dict() { + match dict.get_item_opt(attr_name, vm) { + Ok(Some(_)) => true, + Ok(None) => false, + Err(_) => { + // Dict lookup error → deoptimize to safe path + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::LoadAttr { idx: Arg::marker() }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return self.load_attr_slow(vm, oparg); + } + } + } else { + false + }; + + if !shadowed { + // Cache hit: load the cached method descriptor + let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); + let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + let owner = self.pop_value(); + self.push_value(func); + self.push_value(owner); + return Ok(None); + } + } + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } + Instruction::LoadAttrInstanceValue => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let attr_name = self.code.names[oparg.name_idx() as usize]; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { + // Type version matches — no data descriptor for this attr. + // Try direct dict lookup, skipping full descriptor protocol. + if let Some(dict) = owner.dict() + && let Some(value) = dict.get_item_opt(attr_name, vm)? + { + self.pop_value(); + self.push_value(value); + return Ok(None); + } + // Not in instance dict — fall through to class lookup via slow path + } + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } + // Specialized BINARY_OP opcodes + Instruction::BinaryOpAddInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() + b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Add); + self.execute_bin_op(vm, bytecode::BinaryOperator::Add) + } + } + Instruction::BinaryOpSubtractInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() - b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract); + self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract) + } + } + Instruction::BinaryOpMultiplyInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() * b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply); + self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply) + } + } + Instruction::BinaryOpAddFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() + b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Add); + self.execute_bin_op(vm, bytecode::BinaryOperator::Add) + } + } + Instruction::BinaryOpSubtractFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() - b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract); + self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract) + } + } + Instruction::BinaryOpMultiplyFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() * b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply); + self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply) + } + } + Instruction::CallPyExactArgs => { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); + let nargs: u32 = arg.into(); + // Stack: [callable, self_or_null, arg1, ..., argN] + let callable = self.nth_value(nargs + 1); + if let Some(func) = callable.downcast_ref::() + && func.func_version() == cached_version + && cached_version != 0 + { + let args: Vec = self.pop_multiple(nargs as usize).collect(); + let _null = self.pop_value_opt(); // self_or_null (NULL) + let callable = self.pop_value(); + let func = callable.downcast_ref::().unwrap(); + let result = func.invoke_exact_args(&args, vm)?; + self.push_value(result); + Ok(None) + } else { + // Deoptimize + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::Call { + nargs: Arg::marker(), + }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + let args = self.collect_positional_args(nargs); + self.execute_call(args, vm) + } + } + Instruction::CallBoundMethodExactArgs => { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); + let nargs: u32 = arg.into(); + // Stack: [callable, self_val, arg1, ..., argN] + let callable = self.nth_value(nargs + 1); + if let Some(func) = callable.downcast_ref::() + && func.func_version() == cached_version + && cached_version != 0 + { + let pos_args: Vec = self.pop_multiple(nargs as usize).collect(); + let self_val = self.pop_value(); + let callable = self.pop_value(); + let func = callable.downcast_ref::().unwrap(); + let mut all_args = Vec::with_capacity(pos_args.len() + 1); + all_args.push(self_val); + all_args.extend(pos_args); + let result = func.invoke_exact_args(&all_args, vm)?; + self.push_value(result); + Ok(None) + } else { + // Deoptimize + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::Call { + nargs: Arg::marker(), + }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + let args = self.collect_positional_args(nargs); + self.execute_call(args, vm) + } + } // All INSTRUMENTED_* opcodes delegate to a cold function to keep // the hot instruction loop free of monitoring overhead. _ => self.execute_instrumented(instruction, arg, vm), @@ -4111,6 +4437,134 @@ impl ExecutingFrame<'_> { } fn load_attr(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + let counter = self.code.instructions.read_adaptive_counter(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, counter - 1); + } + } else { + self.specialize_load_attr(vm, oparg, instr_idx, cache_base); + } + + self.load_attr_slow(vm, oparg) + } + + fn specialize_load_attr( + &mut self, + _vm: &VirtualMachine, + oparg: LoadAttr, + instr_idx: usize, + cache_base: usize, + ) { + let obj = self.top_value(); + let cls = obj.class(); + + // Only specialize if getattro is the default (PyBaseObject::getattro) + let is_default_getattro = cls + .slots + .getattro + .load() + .is_some_and(|f| f as usize == PyBaseObject::getattro as *const () as usize); + if !is_default_getattro { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return; + } + + // Get or assign type version + let mut type_version = cls.tp_version_tag.load(Acquire); + if type_version == 0 { + type_version = cls.assign_version_tag(); + } + if type_version == 0 { + // Version counter overflow — backoff to avoid re-attempting every execution + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return; + } + + let attr_name = self.code.names[oparg.name_idx() as usize]; + + // Look up attr in class via MRO + let cls_attr = cls.get_attr(attr_name); + let has_dict = obj.dict().is_some(); + + if oparg.is_method() { + // Method specialization + if let Some(ref descr) = cls_attr + && descr + .class() + .slots + .flags + .has_feature(PyTypeFlags::METHOD_DESCRIPTOR) + { + let descr_ptr = &**descr as *const PyObject as u64; + unsafe { + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + self.code + .instructions + .write_cache_u64(cache_base + 5, descr_ptr); + } + + let new_op = if !has_dict { + Instruction::LoadAttrMethodNoDict + } else { + Instruction::LoadAttrMethodWithValues + }; + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + } + return; + } + // Can't specialize this method call + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } else { + // Regular attribute access + let has_data_descr = cls_attr.as_ref().is_some_and(|descr| { + let descr_cls = descr.class(); + descr_cls.slots.descr_get.load().is_some() + && descr_cls.slots.descr_set.load().is_some() + }); + + if !has_data_descr && has_dict { + // Instance attribute access — skip class descriptor check + unsafe { + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttrInstanceValue); + } + } else { + // Data descriptor or no dict — can't easily specialize + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + } + } + + fn load_attr_slow(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult { let attr_name = self.code.names[oparg.name_idx() as usize]; let parent = self.pop_value(); @@ -4135,6 +4589,141 @@ impl ExecutingFrame<'_> { Ok(None) } + fn specialize_binary_op( + &mut self, + vm: &VirtualMachine, + op: bytecode::BinaryOperator, + instr_idx: usize, + cache_base: usize, + ) { + let b = self.top_value(); + let a = self.nth_value(1); + + let new_op = match op { + bytecode::BinaryOperator::Add => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpAddInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpAddFloat) + } else { + None + } + } + bytecode::BinaryOperator::Subtract => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpSubtractInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpSubtractFloat) + } else { + None + } + } + bytecode::BinaryOperator::Multiply => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpMultiplyInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpMultiplyFloat) + } else { + None + } + } + _ => None, + }; + + if let Some(new_op) = new_op { + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + } + } else { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + } + + fn deoptimize_binary_op(&mut self, _op: bytecode::BinaryOperator) { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::BinaryOp { op: Arg::marker() }); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + + fn specialize_call( + &mut self, + _vm: &VirtualMachine, + nargs: u32, + instr_idx: usize, + cache_base: usize, + ) { + // Stack: [callable, self_or_null, arg1, ..., argN] + // callable is at position nargs + 1 from top + // self_or_null is at position nargs from top + let stack = &self.state.stack; + let stack_len = stack.len(); + let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let callable = self.nth_value(nargs + 1); + + if let Some(func) = callable.downcast_ref::() { + let version = func.func_version(); + if version == 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return; + } + + let effective_nargs = if self_or_null_is_some { + nargs + 1 + } else { + nargs + }; + + if func.can_specialize_call(effective_nargs) { + let new_op = if self_or_null_is_some { + Instruction::CallBoundMethodExactArgs + } else { + Instruction::CallPyExactArgs + }; + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + // Store func_version in cache (after counter) + self.code + .instructions + .write_cache_u32(cache_base + 1, version); + } + return; + } + } + + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + fn load_super_attr(&mut self, vm: &VirtualMachine, oparg: LoadSuperAttr) -> FrameResult { let attr_name = self.code.names[oparg.name_idx() as usize]; diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs index b48045f2163..41ddfa26b2e 100644 --- a/crates/vm/src/object/core.rs +++ b/crates/vm/src/object/core.rs @@ -1927,6 +1927,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::new(Default::default()), slots: PyType::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let object_payload = PyType { base: None, @@ -1936,6 +1937,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::new(Default::default()), slots: object::PyBaseObject::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let type_type_ptr = Box::into_raw(Box::new(partially_init!( PyInner:: { @@ -1997,6 +1999,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::default(), slots: PyWeak::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let weakref_type = PyRef::new_ref(weakref_type, type_type.clone(), None); // Static type: untrack from GC (was tracked by new_ref because PyType has HAS_TRAVERSE) diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs index e223e249e54..858ea83b8a7 100644 --- a/crates/vm/src/stdlib/sys/monitoring.rs +++ b/crates/vm/src/stdlib/sys/monitoring.rs @@ -270,13 +270,29 @@ pub fn instrument_code(code: &PyCode, events: u32) { } } - // Phase 3: Remove regular INSTRUMENTED_* → restore base opcodes - for i in 0..len { - let op = code.code.instructions[i].op; - if let Some(base) = op.to_base() { - unsafe { - code.code.instructions.replace_op(i, base); + // Phase 3: Remove regular INSTRUMENTED_* and specialized opcodes → restore base opcodes. + // Also clear all CACHE entries so specialization starts fresh. + { + let mut i = 0; + while i < len { + let op = code.code.instructions[i].op; + let base_op = op.deoptimize(); + if u8::from(base_op) != u8::from(op) { + unsafe { + code.code.instructions.replace_op(i, base_op); + } + } + let caches = base_op.cache_entries(); + // Zero all CACHE entries (the op+arg bytes may have been overwritten + // by specialization with arbitrary data like pointers). + for c in 1..=caches { + if i + c < len { + unsafe { + code.code.instructions.write_cache_u16(i + c, 0); + } + } } + i += 1 + caches; } }