diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt
index 819d6875b58..7681760ea65 100644
--- a/.cspell.dict/cpython.txt
+++ b/.cspell.dict/cpython.txt
@@ -44,6 +44,7 @@ copyslot
 cpucount
 defaultdict
 denom
+deopt
 dictbytype
 DICTFLAG
 dictoffset
diff --git a/.cspell.json b/.cspell.json
index bbc13e6fded..0d41568618a 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -60,6 +60,7 @@
     "dedentations",
     "dedents",
     "deduped",
+    "deoptimize",
     "downcastable",
     "downcasted",
     "dumpable",
@@ -73,6 +74,7 @@
     "interps",
     "jitted",
     "jitting",
+    "kwonly",
     "lossily",
     "makeunicodedata",
     "microbenchmark",
diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs
index 4363ffaa768..b21af84a51c 100644
--- a/crates/codegen/src/ir.rs
+++ b/crates/codegen/src/ir.rs
@@ -457,11 +457,13 @@ impl CodeInfo {
                             .map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte))
                             .chain([CodeUnit { op, arg: lo_arg }]),
                     );
-                    // Emit CACHE code units after the instruction
-                    instructions.extend(core::iter::repeat_n(
-                        CodeUnit::new(Instruction::Cache, 0.into()),
-                        cache_count,
-                    ));
+                    // Emit CACHE code units after the instruction (all zeroed)
+                    if cache_count > 0 {
+                        instructions.extend(core::iter::repeat_n(
+                            CodeUnit::new(Instruction::Cache, 0.into()),
+                            cache_count,
+                        ));
+                    }
                     current_offset = offset_after;
                 }
                 next_block = block.next;
diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index cece1fb77fa..cec04b9edd9 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -343,6 +343,11 @@ pub struct CodeUnit {
 
 const _: () = assert!(mem::size_of::<CodeUnit>() == 2);
 
+/// Adaptive counter seed written by `CodeUnits::quicken`; counted down to 0 before specializing.
+pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
+/// Adaptive counter value written back when a specialized instruction de-optimizes.
+pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;
+
 impl CodeUnit {
     pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
         Self { op, arg }
@@ -391,7 +396,11 @@ impl TryFrom<&[u8]> for CodeUnits {
             return Err(Self::Error::InvalidBytecode);
         }
 
-        value.chunks_exact(2).map(CodeUnit::try_from).collect()
+        let units: Self = value
+            .chunks_exact(2)
+            .map(CodeUnit::try_from)
+            .collect::<Result<_, _>>()?;
+        Ok(units)
     }
 }
 
@@ -441,6 +450,140 @@ impl CodeUnits {
             core::ptr::write(op_ptr, new_op.into());
         }
     }
+
+    /// Write a u16 value into a CACHE code unit at `index`, in native byte order.
+    /// Each CodeUnit is 2 bytes (#[repr(C)]: op u8 + arg u8), so the write covers both bytes.
+    ///
+    /// # Safety
+    /// - `index` must be in bounds (it is not checked here) and point to a CACHE entry.
+    /// - The caller must ensure no concurrent reads/writes to the same slot.
+    pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
+        unsafe {
+            let units = &mut *self.0.get();
+            let ptr = units.as_mut_ptr().add(index) as *mut u8; // start of the 2-byte unit
+            core::ptr::write_unaligned(ptr as *mut u16, value); // overwrites `op` and `arg`
+        }
+    }
+
+    /// Read the u16 stored across the `op` and `arg` bytes of the unit at `index` (native order).
+    ///
+    /// # Panics
+    /// Panics if `index` is out of bounds.
+    pub fn read_cache_u16(&self, index: usize) -> u16 {
+        let units = unsafe { &*self.0.get() };
+        assert!(index < units.len(), "read_cache_u16: index out of bounds");
+        let ptr = units.as_ptr().wrapping_add(index) as *const u8; // in bounds per the assert
+        unsafe { core::ptr::read_unaligned(ptr as *const u16) }
+    }
+
+    /// Write a u32 value across two consecutive CACHE code units starting at `index`.
+    ///
+    /// # Safety
+    /// Same requirements as `write_cache_u16`, applied to both `index` and `index + 1`.
+    pub unsafe fn write_cache_u32(&self, index: usize, value: u32) {
+        unsafe {
+            self.write_cache_u16(index, value as u16); // low 16 bits (cast truncates)
+            self.write_cache_u16(index + 1, (value >> 16) as u16); // high 16 bits
+        }
+    }
+
+    /// Read a u32 spanning two CACHE units: low half at `index`, high half at `index + 1`.
+    ///
+    /// # Panics
+    /// Panics if `index + 1` is out of bounds.
+    pub fn read_cache_u32(&self, index: usize) -> u32 {
+        let low_half = u32::from(self.read_cache_u16(index));
+        let high_half = u32::from(self.read_cache_u16(index + 1));
+        (high_half << 16) | low_half
+    }
+
+    /// Write a u64 value across four consecutive CACHE code units starting at `index`.
+    ///
+    /// # Safety
+    /// Same requirements as `write_cache_u16`, applied to `index` through `index + 3`.
+    pub unsafe fn write_cache_u64(&self, index: usize, value: u64) {
+        unsafe {
+            self.write_cache_u32(index, value as u32); // low 32 bits (cast truncates)
+            self.write_cache_u32(index + 2, (value >> 32) as u32); // high 32 bits
+        }
+    }
+
+    /// Read a u64 spanning four CACHE units: low u32 at `index`, high u32 at `index + 2`.
+    ///
+    /// # Panics
+    /// Panics if `index + 3` is out of bounds.
+    pub fn read_cache_u64(&self, index: usize) -> u64 {
+        let low_word = u64::from(self.read_cache_u32(index));
+        let high_word = u64::from(self.read_cache_u32(index + 2));
+        (high_word << 32) | low_word
+    }
+
+    /// Read the adaptive counter from the `arg` byte of the CACHE entry at `index`.
+    /// Only `arg` is inspected (`op` is ignored); panics if `index` is out of bounds.
+    pub fn read_adaptive_counter(&self, index: usize) -> u8 {
+        let units = unsafe { &*self.0.get() };
+        u8::from(units[index].arg)
+    }
+
+    /// Write the adaptive counter to the `arg` byte of the CACHE entry at `index`.
+    /// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`.
+    ///
+    /// # Safety
+    /// - `index` must point to a CACHE entry (an out-of-bounds `index` panics on the indexing).
+    pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
+        let units = unsafe { &mut *self.0.get() }; // NOTE(review): exclusive access assumed
+        units[index].arg = OpArgByte::from(value);
+    }
+
+    /// Produce a clean copy of the bytecode suitable for serialization
+    /// (marshal) and `co_code`. Specialized opcodes are mapped back to their
+    /// base variants via `deoptimize()` and all CACHE entries are zeroed.
+    /// The result is always exactly `2 * len` bytes: a CACHE run that would
+    /// extend past the end of the bytecode is clamped instead of padded.
+    pub fn original_bytes(&self) -> Vec<u8> {
+        // NOTE(review): shared read through the cell; assumes no concurrent writer.
+        let units = unsafe { &*self.0.get() };
+        let len = units.len();
+        let mut out = Vec::with_capacity(len * 2);
+        let mut i = 0;
+        while i < len {
+            let op = units[i].op.deoptimize();
+            out.extend([u8::from(op), u8::from(units[i].arg)]);
+            // Zero-fill the CACHE entries (counter + cached data) that follow,
+            // never emitting more units than the bytecode actually holds;
+            // a zeroed unit is `op = Cache = 0`, `arg = 0`.
+            let caches = op.cache_entries().min(len - i - 1);
+            out.resize(out.len() + 2 * caches, 0);
+            i += 1 + caches;
+        }
+        out
+    }
+
+    /// Seed the adaptive warmup counter of every instruction that owns CACHE entries.
+    ///
+    /// Invoked lazily from RESUME, on the first execution of a code object.
+    /// The counter goes into the `arg` byte of the first CACHE entry, so that entry
+    /// keeps `op = Instruction::Cache`.
+    pub fn quicken(&self) {
+        let units = unsafe { &mut *self.0.get() };
+        let total = units.len();
+        let mut pos = 0;
+        while pos < total {
+            let opcode = units[pos].op;
+            let cache_len = opcode.cache_entries();
+            let first_cache = pos + 1;
+            // Instrumented opcodes get no counter: specialization is skipped while
+            // monitoring is active. Opcodes without CACHE entries have no counter slot.
+            let seed_counter = cache_len > 0 && !opcode.is_instrumented();
+            // Only write the counter slot when it lies inside the buffer;
+            // a slot past the end is silently left alone.
+            if seed_counter && first_cache < total {
+                units[first_cache].arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE);
+            }
+            // Step over the instruction and its CACHE entries (none for plain opcodes).
+            pos = first_cache + cache_len;
+        }
+    }
 }
 
 /// A Constant (which usually encapsulates data within it)
diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs
index c1c5e8cd847..e7b13ff21d2 100644
--- a/crates/compiler-core/src/bytecode/instruction.rs
+++ b/crates/compiler-core/src/bytecode/instruction.rs
@@ -512,6 +512,126 @@ impl Instruction {
         })
     }
 
+    /// Map a specialized or instrumented opcode back to its adaptive (base) variant.
+    /// Counterpart of `_PyOpcode_Deopt`; explicit arms return variants built with `Arg::marker()`.
+    pub fn deoptimize(self) -> Self {
+        match self {
+            // LOAD_ATTR specializations
+            Self::LoadAttrClass
+            | Self::LoadAttrClassWithMetaclassCheck
+            | Self::LoadAttrGetattributeOverridden
+            | Self::LoadAttrInstanceValue
+            | Self::LoadAttrMethodLazyDict
+            | Self::LoadAttrMethodNoDict
+            | Self::LoadAttrMethodWithValues
+            | Self::LoadAttrModule
+            | Self::LoadAttrNondescriptorNoDict
+            | Self::LoadAttrNondescriptorWithValues
+            | Self::LoadAttrProperty
+            | Self::LoadAttrSlot
+            | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() },
+            // BINARY_OP specializations
+            Self::BinaryOpAddFloat
+            | Self::BinaryOpAddInt
+            | Self::BinaryOpAddUnicode
+            | Self::BinaryOpExtend
+            | Self::BinaryOpInplaceAddUnicode
+            | Self::BinaryOpMultiplyFloat
+            | Self::BinaryOpMultiplyInt
+            | Self::BinaryOpSubscrDict
+            | Self::BinaryOpSubscrGetitem
+            | Self::BinaryOpSubscrListInt
+            | Self::BinaryOpSubscrListSlice
+            | Self::BinaryOpSubscrStrInt
+            | Self::BinaryOpSubscrTupleInt
+            | Self::BinaryOpSubtractFloat
+            | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() },
+            // CALL specializations
+            Self::CallAllocAndEnterInit
+            | Self::CallBoundMethodExactArgs
+            | Self::CallBoundMethodGeneral
+            | Self::CallBuiltinClass
+            | Self::CallBuiltinFast
+            | Self::CallBuiltinFastWithKeywords
+            | Self::CallBuiltinO
+            | Self::CallIsinstance
+            | Self::CallLen
+            | Self::CallListAppend
+            | Self::CallMethodDescriptorFast
+            | Self::CallMethodDescriptorFastWithKeywords
+            | Self::CallMethodDescriptorNoargs
+            | Self::CallMethodDescriptorO
+            | Self::CallNonPyGeneral
+            | Self::CallPyExactArgs
+            | Self::CallPyGeneral
+            | Self::CallStr1
+            | Self::CallTuple1
+            | Self::CallType1 => Self::Call {
+                nargs: Arg::marker(),
+            },
+            // CALL_KW specializations
+            Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw {
+                nargs: Arg::marker(),
+            },
+            // TO_BOOL specializations
+            Self::ToBoolAlwaysTrue
+            | Self::ToBoolBool
+            | Self::ToBoolInt
+            | Self::ToBoolList
+            | Self::ToBoolNone
+            | Self::ToBoolStr => Self::ToBool,
+            // COMPARE_OP specializations
+            Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => {
+                Self::CompareOp { op: Arg::marker() }
+            }
+            // CONTAINS_OP specializations
+            Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()),
+            // FOR_ITER specializations
+            Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => {
+                Self::ForIter {
+                    target: Arg::marker(),
+                }
+            }
+            // LOAD_GLOBAL specializations
+            Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()),
+            // STORE_ATTR specializations
+            Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => {
+                Self::StoreAttr { idx: Arg::marker() }
+            }
+            // LOAD_SUPER_ATTR specializations
+            Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => {
+                Self::LoadSuperAttr { arg: Arg::marker() }
+            }
+            // STORE_SUBSCR specializations
+            Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr,
+            // UNPACK_SEQUENCE specializations
+            Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => {
+                Self::UnpackSequence {
+                    size: Arg::marker(),
+                }
+            }
+            // SEND specializations
+            Self::SendGen => Self::Send {
+                target: Arg::marker(),
+            },
+            // LOAD_CONST specializations
+            Self::LoadConstImmortal | Self::LoadConstMortal => {
+                Self::LoadConst { idx: Arg::marker() }
+            }
+            // RESUME specializations
+            Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
+            // JUMP_BACKWARD specializations
+            Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward {
+                target: Arg::marker(),
+            },
+            // Anything else: take `to_base()` when it yields a base opcode, otherwise keep `self`
+            _ => match self.to_base() {
+                Some(base) => base,
+                None => self,
+            },
+        }
+    }
+
     /// Number of CACHE code units that follow this instruction.
     /// _PyOpcode_Caches
     pub fn cache_entries(self) -> usize {
@@ -626,8 +746,11 @@ impl Instruction {
             | Self::UnpackSequenceTuple
             | Self::UnpackSequenceTwoTuple => 1,
 
-            // Everything else: 0 cache entries
-            _ => 0,
+            // Instrumented opcodes have the same cache entries as their base
+            _ => match self.to_base() {
+                Some(base) => base.cache_entries(),
+                None => 0,
+            },
         }
     }
 }
diff --git a/crates/compiler-core/src/marshal.rs b/crates/compiler-core/src/marshal.rs
index 11df127920a..310bad9d868 100644
--- a/crates/compiler-core/src/marshal.rs
+++ b/crates/compiler-core/src/marshal.rs
@@ -662,9 +662,8 @@ pub fn serialize_value<W: Write, D: Dumpable>(
 
 pub fn serialize_code<W: Write, C: Constant>(buf: &mut W, code: &CodeObject<C>) {
     write_len(buf, code.instructions.len());
-    // SAFETY: it's ok to transmute CodeUnit to [u8; 2]
-    let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() };
-    buf.write_slice(instructions_bytes);
+    let original = code.instructions.original_bytes();
+    buf.write_slice(&original);
 
     write_len(buf, code.locations.len());
     for (start, end) in &*code.locations {
diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs
index 1708477004e..126d0216546 100644
--- a/crates/vm/src/builtins/code.rs
+++ b/crates/vm/src/builtins/code.rs
@@ -346,6 +346,8 @@ pub struct PyCode {
     pub instrumentation_version: AtomicU64,
     /// Side-table for INSTRUMENTED_LINE / INSTRUMENTED_INSTRUCTION.
     pub monitoring_data: PyMutex<Option<CoMonitoringData>>,
+    /// Whether adaptive counters have been initialized (lazy quickening).
+    pub quickened: core::sync::atomic::AtomicBool,
 }
 
 impl Deref for PyCode {
@@ -363,6 +365,7 @@ impl PyCode {
             source_path: AtomicPtr::new(sp),
             instrumentation_version: AtomicU64::new(0),
             monitoring_data: PyMutex::new(None),
+            quickened: core::sync::atomic::AtomicBool::new(false),
         }
     }
 
@@ -681,7 +684,12 @@ impl PyCode {
 
     #[pygetset]
     pub fn co_code(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
-        // SAFETY: CodeUnit is #[repr(C)] with size 2, so we can safely transmute to bytes
+        vm.ctx.new_bytes(self.code.instructions.original_bytes())
+    }
+
+    #[pygetset]
+    pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
+        // Return current (possibly quickened/specialized) bytecode
         let bytes = unsafe {
             core::slice::from_raw_parts(
                 self.code.instructions.as_ptr() as *const u8,
@@ -691,12 +699,6 @@ impl PyCode {
         vm.ctx.new_bytes(bytes.to_vec())
     }
 
-    #[pygetset]
-    pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
-        // RustPython doesn't have adaptive/specialized bytecode, so return regular co_code
-        self.co_code(vm)
-    }
-
     #[pygetset]
     pub fn co_freevars(&self, vm: &VirtualMachine) -> PyTupleRef {
         let names = self
diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs
index 58f818cc7a7..489482d1933 100644
--- a/crates/vm/src/builtins/function.rs
+++ b/crates/vm/src/builtins/function.rs
@@ -22,6 +22,7 @@ use crate::{
         Callable, Comparable, Constructor, GetAttr, GetDescriptor, PyComparisonOp, Representable,
     },
 };
+use core::sync::atomic::{AtomicU32, Ordering::Relaxed};
 use itertools::Itertools;
 #[cfg(feature = "jit")]
 use rustpython_jit::CompiledCode;
@@ -72,10 +73,13 @@ pub struct PyFunction {
     annotate: PyMutex<Option<PyObjectRef>>,
     module: PyMutex<PyObjectRef>,
     doc: PyMutex<PyObjectRef>,
+    func_version: AtomicU32,
     #[cfg(feature = "jit")]
     jitted_code: OnceCell<CompiledCode>,
 }
 
+static FUNC_VERSION_COUNTER: AtomicU32 = AtomicU32::new(1);
+
 unsafe impl Traverse for PyFunction {
     fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) {
         self.globals.traverse(tracer_fn);
@@ -200,6 +204,7 @@ impl PyFunction {
             annotate: PyMutex::new(None),
             module: PyMutex::new(module),
             doc: PyMutex::new(doc),
+            func_version: AtomicU32::new(FUNC_VERSION_COUNTER.fetch_add(1, Relaxed)),
             #[cfg(feature = "jit")]
             jitted_code: OnceCell::new(),
         };
@@ -593,6 +598,68 @@ impl Py<PyFunction> {
     pub fn invoke(&self, func_args: FuncArgs, vm: &VirtualMachine) -> PyResult {
         self.invoke_with_locals(func_args, None, vm)
     }
+
+    /// Returns the function version; 0 once `__code__`/`__defaults__`/`__kwdefaults__` was set.
+    #[inline]
+    pub fn func_version(&self) -> u32 {
+        self.func_version.load(Relaxed)
+    }
+
+    /// Check if this function is eligible for exact-args call specialization.
+    /// Returns true if: NEWLOCALS is set, no VARARGS, no VARKEYWORDS, no kwonly args,
+    /// not generator/coroutine, and effective_nargs matches co_argcount.
+    pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool {
+        let code = self.code.lock();
+        let flags = code.flags;
+        flags.contains(bytecode::CodeFlags::NEWLOCALS)
+            && !flags.intersects(
+                bytecode::CodeFlags::VARARGS
+                    | bytecode::CodeFlags::VARKEYWORDS
+                    | bytecode::CodeFlags::GENERATOR
+                    | bytecode::CodeFlags::COROUTINE,
+            )
+            && code.kwonlyarg_count == 0
+            && code.arg_count == effective_nargs
+    }
+
+    /// Fast path for calling a simple function with exact positional args.
+    /// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args.
+    /// Only valid when: no VARARGS, no VARKEYWORDS, no kwonlyargs, not generator/coroutine,
+    /// and nargs == co_argcount (the conditions tested by `can_specialize_call`).
+    pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult {
+        let code = self.code.lock().clone();
+
+        let locals = ArgMapping::from_dict_exact(vm.ctx.new_dict());
+
+        let frame = Frame::new(
+            code.clone(),
+            Scope::new(Some(locals), self.globals.clone()),
+            self.builtins.clone(),
+            self.closure.as_ref().map_or(&[], |c| c.as_slice()),
+            Some(self.to_owned().into()),
+            vm,
+        )
+        .into_ref(&vm.ctx);
+
+        // Copy args directly into fastlocals: positional arg `i` fills local slot `i`
+        {
+            let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
+            for (i, arg) in args.iter().enumerate() {
+                fastlocals[i] = Some(arg.clone());
+            }
+        }
+
+        // Handle cell2arg: each entry != -1 moves that argument out of its slot into the cell
+        if let Some(cell2arg) = code.cell2arg.as_deref() {
+            let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
+            for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
+                let x = fastlocals[*arg_idx as usize].take();
+                frame.set_cell_contents(cell_idx, x);
+            }
+        }
+
+        vm.run_frame(frame)
+    }
 }
 
 impl PyPayload for PyFunction {
@@ -615,12 +682,7 @@ impl PyFunction {
     #[pygetset(setter)]
     fn set___code__(&self, code: PyRef<PyCode>) {
         *self.code.lock() = code;
-        // TODO: jit support
-        // #[cfg(feature = "jit")]
-        // {
-        //     // If available, clear cached compiled code.
-        //     let _ = self.jitted_code.take();
-        // }
+        self.func_version.store(0, Relaxed);
     }
 
     #[pygetset]
@@ -629,7 +691,8 @@ impl PyFunction {
     }
     #[pygetset(setter)]
     fn set___defaults__(&self, defaults: Option<PyTupleRef>) {
-        self.defaults_and_kwdefaults.lock().0 = defaults
+        self.defaults_and_kwdefaults.lock().0 = defaults;
+        self.func_version.store(0, Relaxed);
     }
 
     #[pygetset]
@@ -638,7 +701,8 @@ impl PyFunction {
     }
     #[pygetset(setter)]
     fn set___kwdefaults__(&self, kwdefaults: Option<PyDictRef>) {
-        self.defaults_and_kwdefaults.lock().1 = kwdefaults
+        self.defaults_and_kwdefaults.lock().1 = kwdefaults;
+        self.func_version.store(0, Relaxed);
     }
 
     // {"__closure__",   T_OBJECT,     OFF(func_closure), READONLY},
diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs
index 86865e9e083..b3a3c206c68 100644
--- a/crates/vm/src/builtins/type.rs
+++ b/crates/vm/src/builtins/type.rs
@@ -28,7 +28,14 @@ use crate::{
         Representable, SLOT_DEFS, SetAttr, TypeDataRef, TypeDataRefMut, TypeDataSlot,
     },
 };
-use core::{any::Any, borrow::Borrow, ops::Deref, pin::Pin, ptr::NonNull};
+use core::{
+    any::Any,
+    borrow::Borrow,
+    ops::Deref,
+    pin::Pin,
+    ptr::NonNull,
+    sync::atomic::{AtomicU32, Ordering},
+};
 use indexmap::{IndexMap, map::Entry};
 use itertools::Itertools;
 use num_traits::ToPrimitive;
@@ -44,8 +51,12 @@ pub struct PyType {
     pub attributes: PyRwLock<PyAttributes>,
     pub slots: PyTypeSlots,
     pub heaptype_ext: Option<Pin<Box<HeapTypeExt>>>,
+    /// Type version tag for inline caching. 0 means unassigned/invalidated.
+    pub tp_version_tag: AtomicU32,
 }
 
+static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1);
+
 unsafe impl crate::object::Traverse for PyType {
     fn traverse(&self, tracer_fn: &mut crate::object::TraverseFn<'_>) {
         self.base.traverse(tracer_fn);
@@ -188,6 +199,34 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py<PyType>, b: &Py<PyType>) -> b
 }
 
 impl PyType {
+    /// Take the next global version number as this type's tag. Returns 0 on overflow.
+    pub fn assign_version_tag(&self) -> u32 {
+        let claimed = NEXT_TYPE_VERSION.fetch_update(
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+            |version| version.checked_add(1),
+        );
+        match claimed {
+            Ok(tag) => {
+                self.tp_version_tag.store(tag, Ordering::Release);
+                tag
+            }
+            // `checked_add` refused to advance: the version space is exhausted.
+            Err(_) => 0,
+        }
+    }
+
+    /// Reset this type's version tag to 0, then do the same for every live subclass, recursively.
+    pub fn modified(&self) {
+        self.tp_version_tag.store(0, Ordering::Release);
+        // The read guard on the subclass list is held for the whole cascade.
+        let children = self.subclasses.read();
+        for child in children.iter().filter_map(|weak| weak.upgrade()) {
+            // Weak references that no longer upgrade were filtered out above.
+            child.downcast_ref::<PyType>().unwrap().modified();
+        }
+    }
+
     pub fn new_simple_heap(
         name: &str,
         base: &Py<PyType>,
@@ -365,6 +404,7 @@ impl PyType {
                 attributes: PyRwLock::new(attrs),
                 slots,
                 heaptype_ext: Some(Pin::new(Box::new(heaptype_ext))),
+                tp_version_tag: AtomicU32::new(0),
             },
             metaclass,
             None,
@@ -418,6 +458,7 @@ impl PyType {
                 attributes: PyRwLock::new(attrs),
                 slots,
                 heaptype_ext: None,
+                tp_version_tag: AtomicU32::new(0),
             },
             metaclass,
             None,
@@ -799,6 +840,9 @@ impl PyType {
         }
         update_mro_recursively(zelf, vm)?;
 
+        // Invalidate inline caches
+        zelf.modified();
+
         // TODO: do any old slots need to be cleaned up first?
         zelf.init_slots(&vm.ctx);
 
@@ -1903,6 +1947,9 @@ impl SetAttr for PyType {
                 )));
             }
         }
+        // Invalidate inline caches that depend on this type's attributes
+        zelf.modified();
+
         if attr_name.as_wtf8().starts_with("__") && attr_name.as_wtf8().ends_with("__") {
             if assign {
                 zelf.update_slot::<true>(attr_name, &vm.ctx);
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index 663885c579d..08ce117fd48 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -4,18 +4,18 @@ use crate::{
     AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef,
     TryFromObject, VirtualMachine,
     builtins::{
-        PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator,
-        PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback,
-        PyType, PyUtf8Str,
+        PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef,
+        PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned,
+        PyTemplate, PyTraceback, PyType, PyUtf8Str,
         asyncgenerator::PyAsyncGenWrappedValue,
-        float::PyFloat,
         frame::stack_analysis,
         function::{PyCell, PyCellRef, PyFunction},
-        int::PyInt,
         range::PyRangeIterator,
         tuple::{PyTuple, PyTupleRef},
     },
-    bytecode::{self, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod},
+    bytecode::{
+        self, ADAPTIVE_BACKOFF_VALUE, Arg, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod,
+    },
     convert::{IntoObject, ToPyResult},
     coroutine::Coro,
     exceptions::ExceptionCtor,
@@ -34,7 +34,7 @@ use core::cell::UnsafeCell;
 use core::iter::zip;
 use core::sync::atomic;
 use core::sync::atomic::AtomicPtr;
-use core::sync::atomic::Ordering::Relaxed;
+use core::sync::atomic::Ordering::{Acquire, Relaxed};
 use indexmap::IndexMap;
 use itertools::Itertools;
 use malachite_bigint::BigInt;
@@ -1124,7 +1124,24 @@ impl ExecutingFrame<'_> {
         }
 
         match instruction {
-            Instruction::BinaryOp { op } => self.execute_bin_op(vm, op.get(arg)),
+            Instruction::BinaryOp { op } => {
+                let op_val = op.get(arg);
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+
+                let counter = self.code.instructions.read_adaptive_counter(cache_base);
+                if counter > 0 {
+                    unsafe {
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, counter - 1);
+                    }
+                } else {
+                    self.specialize_binary_op(vm, op_val, instr_idx, cache_base);
+                }
+
+                self.execute_bin_op(vm, op_val)
+            }
             // TODO: In CPython, this does in-place unicode concatenation when
             // refcount is 1. Falls back to regular iadd for now.
             Instruction::BinaryOpInplaceAddUnicode => {
@@ -1239,7 +1256,20 @@ impl ExecutingFrame<'_> {
             }
             Instruction::Call { nargs } => {
                 // Stack: [callable, self_or_null, arg1, ..., argN]
-                let args = self.collect_positional_args(nargs.get(arg));
+                let nargs_val = nargs.get(arg);
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let counter = self.code.instructions.read_adaptive_counter(cache_base);
+                if counter > 0 {
+                    unsafe {
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, counter - 1);
+                    }
+                } else {
+                    self.specialize_call(vm, nargs_val, instr_idx, cache_base);
+                }
+                let args = self.collect_positional_args(nargs_val);
                 self.execute_call(args, vm)
             }
             Instruction::CallKw { nargs } => {
@@ -2282,6 +2312,10 @@ impl ExecutingFrame<'_> {
             }
             Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)),
             Instruction::Resume { .. } => {
+                // Lazy quickening: initialize adaptive counters on first execution
+                if !self.code.quickened.swap(true, atomic::Ordering::Relaxed) {
+                    self.code.instructions.quicken();
+                }
                 // Check if bytecode needs re-instrumentation
                 let global_ver = vm
                     .state
@@ -2644,6 +2678,298 @@ impl ExecutingFrame<'_> {
                 self.push_value(vm.ctx.new_bool(!value).into());
                 Ok(None)
             }
+            // Specialized LOAD_ATTR opcodes
+            Instruction::LoadAttrMethodNoDict => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Cache hit: load the cached method descriptor
+                    let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5);
+                    let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned();
+                    let owner = self.pop_value();
+                    self.push_value(func);
+                    self.push_value(owner);
+                    Ok(None)
+                } else {
+                    // De-optimize
+                    unsafe {
+                        self.code
+                            .instructions
+                            .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    self.load_attr_slow(vm, oparg)
+                }
+            }
+            Instruction::LoadAttrMethodWithValues => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let attr_name = self.code.names[oparg.name_idx() as usize];
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Check instance dict doesn't shadow the method
+                    let shadowed = if let Some(dict) = owner.dict() {
+                        match dict.get_item_opt(attr_name, vm) {
+                            Ok(Some(_)) => true,
+                            Ok(None) => false,
+                            Err(_) => {
+                                // Dict lookup error → deoptimize to safe path
+                                unsafe {
+                                    self.code.instructions.replace_op(
+                                        instr_idx,
+                                        Instruction::LoadAttr { idx: Arg::marker() },
+                                    );
+                                    self.code
+                                        .instructions
+                                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                                }
+                                return self.load_attr_slow(vm, oparg);
+                            }
+                        }
+                    } else {
+                        false
+                    };
+
+                    if !shadowed {
+                        // Cache hit: load the cached method descriptor
+                        let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5);
+                        let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned();
+                        let owner = self.pop_value();
+                        self.push_value(func);
+                        self.push_value(owner);
+                        return Ok(None);
+                    }
+                }
+                // De-optimize
+                unsafe {
+                    self.code
+                        .instructions
+                        .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                    self.code
+                        .instructions
+                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                }
+                self.load_attr_slow(vm, oparg)
+            }
+            Instruction::LoadAttrInstanceValue => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let attr_name = self.code.names[oparg.name_idx() as usize];
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Type version matches — no data descriptor for this attr.
+                    // Try direct dict lookup, skipping full descriptor protocol.
+                    if let Some(dict) = owner.dict()
+                        && let Some(value) = dict.get_item_opt(attr_name, vm)?
+                    {
+                        self.pop_value();
+                        self.push_value(value);
+                        return Ok(None);
+                    }
+                    // Not in instance dict — fall through to class lookup via slow path
+                }
+                // De-optimize
+                unsafe {
+                    self.code
+                        .instructions
+                        .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                    self.code
+                        .instructions
+                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                }
+                self.load_attr_slow(vm, oparg)
+            }
+            // Specialized BINARY_OP opcodes
+            Instruction::BinaryOpAddInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() + b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Add);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Add)
+                }
+            }
+            Instruction::BinaryOpSubtractInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() - b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract)
+                }
+            }
+            Instruction::BinaryOpMultiplyInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() * b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply)
+                }
+            }
+            Instruction::BinaryOpAddFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() + b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Add);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Add)
+                }
+            }
+            Instruction::BinaryOpSubtractFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() - b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract)
+                }
+            }
+            Instruction::BinaryOpMultiplyFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() * b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply)
+                }
+            }
+            Instruction::CallPyExactArgs => {
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let cached_version = self.code.instructions.read_cache_u32(cache_base + 1);
+                let nargs: u32 = arg.into();
+                // Stack: [callable, self_or_null, arg1, ..., argN]
+                let callable = self.nth_value(nargs + 1);
+                if let Some(func) = callable.downcast_ref::<PyFunction>()
+                    && func.func_version() == cached_version
+                    && cached_version != 0
+                {
+                    let args: Vec<PyObjectRef> = self.pop_multiple(nargs as usize).collect();
+                    let _null = self.pop_value_opt(); // self_or_null (NULL)
+                    let callable = self.pop_value();
+                    let func = callable.downcast_ref::<PyFunction>().unwrap();
+                    let result = func.invoke_exact_args(&args, vm)?;
+                    self.push_value(result);
+                    Ok(None)
+                } else {
+                    // Deoptimize
+                    unsafe {
+                        self.code.instructions.replace_op(
+                            instr_idx,
+                            Instruction::Call {
+                                nargs: Arg::marker(),
+                            },
+                        );
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    let args = self.collect_positional_args(nargs);
+                    self.execute_call(args, vm)
+                }
+            }
+            Instruction::CallBoundMethodExactArgs => {
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let cached_version = self.code.instructions.read_cache_u32(cache_base + 1);
+                let nargs: u32 = arg.into();
+                // Stack: [callable, self_val, arg1, ..., argN]
+                let callable = self.nth_value(nargs + 1);
+                if let Some(func) = callable.downcast_ref::<PyFunction>()
+                    && func.func_version() == cached_version
+                    && cached_version != 0
+                {
+                    let pos_args: Vec<PyObjectRef> = self.pop_multiple(nargs as usize).collect();
+                    let self_val = self.pop_value();
+                    let callable = self.pop_value();
+                    let func = callable.downcast_ref::<PyFunction>().unwrap();
+                    let mut all_args = Vec::with_capacity(pos_args.len() + 1);
+                    all_args.push(self_val);
+                    all_args.extend(pos_args);
+                    let result = func.invoke_exact_args(&all_args, vm)?;
+                    self.push_value(result);
+                    Ok(None)
+                } else {
+                    // Deoptimize
+                    unsafe {
+                        self.code.instructions.replace_op(
+                            instr_idx,
+                            Instruction::Call {
+                                nargs: Arg::marker(),
+                            },
+                        );
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    let args = self.collect_positional_args(nargs);
+                    self.execute_call(args, vm)
+                }
+            }
             // All INSTRUMENTED_* opcodes delegate to a cold function to keep
             // the hot instruction loop free of monitoring overhead.
             _ => self.execute_instrumented(instruction, arg, vm),
@@ -4111,6 +4437,134 @@ impl ExecutingFrame<'_> {
     }
 
     fn load_attr(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult {
+        let instr_idx = self.lasti() as usize - 1;
+        let cache_base = instr_idx + 1;
+
+        let counter = self.code.instructions.read_adaptive_counter(cache_base);
+        if counter > 0 {
+            unsafe {
+                self.code
+                    .instructions
+                    .write_adaptive_counter(cache_base, counter - 1);
+            }
+        } else {
+            self.specialize_load_attr(vm, oparg, instr_idx, cache_base);
+        }
+
+        self.load_attr_slow(vm, oparg)
+    }
+
+    /// Attempts to rewrite the LOAD_ATTR at `instr_idx` into a specialized opcode,
+    /// judging by the object currently on top of the stack:
+    ///   * method load, no instance dict   -> `LoadAttrMethodNoDict`
+    ///   * method load, with instance dict -> `LoadAttrMethodWithValues`
+    ///   * plain load, instance dict, no class data descriptor -> `LoadAttrInstanceValue`
+    /// Cache layout written here: type version (u32) at `cache_base + 1` and, for
+    /// method loads, the descriptor address (u64) at `cache_base + 5`.
+    /// Every path that cannot specialize resets the adaptive counter at `cache_base`
+    /// to `ADAPTIVE_BACKOFF_VALUE` and leaves the opcode untouched.
+    fn specialize_load_attr(
+        &mut self,
+        _vm: &VirtualMachine,
+        oparg: LoadAttr,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        // Shared fallback used by every bail-out below.
+        let code_units = &self.code.instructions;
+        let back_off = || unsafe {
+            code_units.write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+        };
+
+        let receiver = self.top_value();
+        let receiver_type = receiver.class();
+
+        // Guard: only types using the default getattro (PyBaseObject::getattro).
+        let default_getattro = PyBaseObject::getattro as *const () as usize;
+        let uses_default_getattro = receiver_type
+            .slots
+            .getattro
+            .load()
+            .is_some_and(|f| f as usize == default_getattro);
+        if !uses_default_getattro {
+            back_off();
+            return;
+        }
+
+        // Guard: the type needs a non-zero version tag; assign one if it has none.
+        let type_version = match receiver_type.tp_version_tag.load(Acquire) {
+            0 => receiver_type.assign_version_tag(),
+            existing => existing,
+        };
+        if type_version == 0 {
+            // Version counter overflow — backoff to avoid re-attempting every execution
+            back_off();
+            return;
+        }
+
+        // Look the name up on the class via MRO; note whether the instance has a dict.
+        let attr_name = self.code.names[oparg.name_idx() as usize];
+        let class_attr = receiver_type.get_attr(attr_name);
+        let has_dict = receiver.dict().is_some();
+
+        if oparg.is_method() {
+            // Method load: requires a class attribute whose type has the
+            // METHOD_DESCRIPTOR flag.
+            let method_descr = class_attr.as_ref().filter(|descr| {
+                descr
+                    .class()
+                    .slots
+                    .flags
+                    .has_feature(PyTypeFlags::METHOD_DESCRIPTOR)
+            });
+            let Some(descr) = method_descr else {
+                // Can't specialize this method call
+                back_off();
+                return;
+            };
+
+            // The descriptor is cached as a bare address; no reference is taken here.
+            // NOTE(review): presumably the type-version guard keeps it valid — confirm.
+            let descr_ptr = &**descr as *const PyObject as u64;
+            let new_op = if has_dict {
+                Instruction::LoadAttrMethodWithValues
+            } else {
+                Instruction::LoadAttrMethodNoDict
+            };
+            // Fill the cache entries first, then switch the opcode.
+            unsafe {
+                code_units.write_cache_u32(cache_base + 1, type_version);
+                code_units.write_cache_u64(cache_base + 5, descr_ptr);
+                code_units.replace_op(instr_idx, new_op);
+            }
+            return;
+        }
+
+        // Plain attribute load. A class attribute whose type provides both descr_get
+        // and descr_set is treated as a data descriptor and blocks specialization.
+        let class_has_data_descr = match class_attr.as_ref() {
+            Some(descr) => {
+                let descr_type = descr.class();
+                descr_type.slots.descr_get.load().is_some()
+                    && descr_type.slots.descr_set.load().is_some()
+            }
+            None => false,
+        };
+
+        if has_dict && !class_has_data_descr {
+            // Instance attribute access — skip class descriptor check
+            // Only the type version is cached for this form.
+            unsafe {
+                code_units.write_cache_u32(cache_base + 1, type_version);
+                code_units.replace_op(instr_idx, Instruction::LoadAttrInstanceValue);
+            }
+        } else {
+            // Data descriptor or no dict — can't easily specialize
+            back_off();
+        }
+    }
+
+    fn load_attr_slow(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult {
         let attr_name = self.code.names[oparg.name_idx() as usize];
         let parent = self.pop_value();
 
@@ -4135,6 +4589,141 @@ impl ExecutingFrame<'_> {
         Ok(None)
     }
 
+    /// Picks a specialized opcode for the BINARY_OP at `instr_idx` based on the two
+    /// operands on top of the stack.
+    ///
+    /// Only `Add`, `Subtract` and `Multiply` are considered, and only when both operands
+    /// pass the exact-`PyInt` check or both pass the exact-`PyFloat` check. On success
+    /// only the opcode is replaced; in every other case the adaptive counter at
+    /// `cache_base` is set to `ADAPTIVE_BACKOFF_VALUE`.
+    fn specialize_binary_op(
+        &mut self,
+        vm: &VirtualMachine,
+        op: bytecode::BinaryOperator,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        // `rhs` is the top of the stack, `lhs` the value just below it.
+        let rhs = self.top_value();
+        let lhs = self.nth_value(1);
+
+        // (int form, float form) for the operators that can be specialized.
+        let forms = match op {
+            bytecode::BinaryOperator::Add => Some((
+                Instruction::BinaryOpAddInt,
+                Instruction::BinaryOpAddFloat,
+            )),
+            bytecode::BinaryOperator::Subtract => Some((
+                Instruction::BinaryOpSubtractInt,
+                Instruction::BinaryOpSubtractFloat,
+            )),
+            bytecode::BinaryOperator::Multiply => Some((
+                Instruction::BinaryOpMultiplyInt,
+                Instruction::BinaryOpMultiplyFloat,
+            )),
+            _ => None,
+        };
+
+        // The int pairing is tested first; the float pairing only if that fails.
+        let specialized = forms.and_then(|(int_form, float_form)| {
+            let both_exact_int = lhs.downcast_ref_if_exact::<PyInt>(vm).is_some()
+                && rhs.downcast_ref_if_exact::<PyInt>(vm).is_some();
+            if both_exact_int {
+                return Some(int_form);
+            }
+
+            let both_exact_float = lhs.downcast_ref_if_exact::<PyFloat>(vm).is_some()
+                && rhs.downcast_ref_if_exact::<PyFloat>(vm).is_some();
+            if both_exact_float {
+                Some(float_form)
+            } else {
+                None
+            }
+        });
+
+        match specialized {
+            Some(new_op) => unsafe {
+                // Specialized: swap the opcode in place; the counter is not touched.
+                self.code.instructions.replace_op(instr_idx, new_op);
+            },
+            None => unsafe {
+                // No specialized form for this operator/operand pairing: back off.
+                self.code
+                    .instructions
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            },
+        }
+    }
+
+    /// Restores the generic `BinaryOp` opcode at instruction `lasti() - 1` and resets
+    /// its adaptive counter to `ADAPTIVE_BACKOFF_VALUE`. `_op` is not consulted.
+    fn deoptimize_binary_op(&mut self, _op: bytecode::BinaryOperator) {
+        let op_slot = self.lasti() as usize - 1;
+        let generic = Instruction::BinaryOp { op: Arg::marker() };
+        let code_units = &self.code.instructions;
+        unsafe {
+            code_units.replace_op(op_slot, generic);
+            // The counter lives in the code unit right after the opcode.
+            code_units.write_adaptive_counter(op_slot + 1, ADAPTIVE_BACKOFF_VALUE);
+        }
+    }
+
+    /// Tries to specialize the CALL at `instr_idx` when the callee is a `PyFunction`.
+    /// On success the function version is stored at `cache_base + 1` and the opcode
+    /// becomes `CallBoundMethodExactArgs` (a `self` value is on the stack) or
+    /// `CallPyExactArgs` (the slot is empty). Otherwise the adaptive counter at
+    /// `cache_base` is reset to `ADAPTIVE_BACKOFF_VALUE`.
+    fn specialize_call(
+        &mut self,
+        _vm: &VirtualMachine,
+        nargs: u32,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        // Stack: [callable, self_or_null, arg1, ..., argN]
+        // callable is at position nargs + 1 from top
+        // self_or_null is at position nargs from top
+        let stack = &self.state.stack;
+        let stack_len = stack.len();
+        let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some();
+        let callable = self.nth_value(nargs + 1);
+
+        if let Some(func) = callable.downcast_ref::<PyFunction>() {
+            let version = func.func_version();
+            // A present `self` counts as one extra positional argument.
+            let effective_nargs = if self_or_null_is_some {
+                nargs + 1
+            } else {
+                nargs
+            };
+
+            // Version 0 is never cached: the specialized handlers treat it as a miss.
+            if version != 0 && func.can_specialize_call(effective_nargs) {
+                let new_op = if self_or_null_is_some {
+                    Instruction::CallBoundMethodExactArgs
+                } else {
+                    Instruction::CallPyExactArgs
+                };
+                unsafe {
+                    // Store func_version (after counter) BEFORE switching the opcode so
+                    // the specialized op is never paired with a stale cached version.
+                    self.code
+                        .instructions
+                        .write_cache_u32(cache_base + 1, version);
+                    self.code.instructions.replace_op(instr_idx, new_op);
+                }
+                return;
+            }
+        }
+
+        // Not specializable: back off before the next attempt.
+        unsafe {
+            self.code
+                .instructions
+                .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+        }
+    }
+
     fn load_super_attr(&mut self, vm: &VirtualMachine, oparg: LoadSuperAttr) -> FrameResult {
         let attr_name = self.code.names[oparg.name_idx() as usize];
 
diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs
index b48045f2163..41ddfa26b2e 100644
--- a/crates/vm/src/object/core.rs
+++ b/crates/vm/src/object/core.rs
@@ -1927,6 +1927,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
             attributes: PyRwLock::new(Default::default()),
             slots: PyType::make_slots(),
             heaptype_ext: None,
+            tp_version_tag: core::sync::atomic::AtomicU32::new(0),
         };
         let object_payload = PyType {
             base: None,
@@ -1936,6 +1937,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
             attributes: PyRwLock::new(Default::default()),
             slots: object::PyBaseObject::make_slots(),
             heaptype_ext: None,
+            tp_version_tag: core::sync::atomic::AtomicU32::new(0),
         };
         let type_type_ptr = Box::into_raw(Box::new(partially_init!(
             PyInner::<PyType> {
@@ -1997,6 +1999,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
         attributes: PyRwLock::default(),
         slots: PyWeak::make_slots(),
         heaptype_ext: None,
+        tp_version_tag: core::sync::atomic::AtomicU32::new(0),
     };
     let weakref_type = PyRef::new_ref(weakref_type, type_type.clone(), None);
     // Static type: untrack from GC (was tracked by new_ref because PyType has HAS_TRAVERSE)
diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs
index e223e249e54..858ea83b8a7 100644
--- a/crates/vm/src/stdlib/sys/monitoring.rs
+++ b/crates/vm/src/stdlib/sys/monitoring.rs
@@ -270,13 +270,29 @@ pub fn instrument_code(code: &PyCode, events: u32) {
         }
     }
 
-    // Phase 3: Remove regular INSTRUMENTED_* → restore base opcodes
-    for i in 0..len {
-        let op = code.code.instructions[i].op;
-        if let Some(base) = op.to_base() {
-            unsafe {
-                code.code.instructions.replace_op(i, base);
+    // Phase 3: Remove regular INSTRUMENTED_* and specialized opcodes → restore base opcodes.
+    // Also clear all CACHE entries so specialization starts fresh.
+    {
+        let mut i = 0;
+        while i < len {
+            let op = code.code.instructions[i].op;
+            let base_op = op.deoptimize();
+            if u8::from(base_op) != u8::from(op) {
+                unsafe {
+                    code.code.instructions.replace_op(i, base_op);
+                }
+            }
+            let caches = base_op.cache_entries();
+            // Zero all CACHE entries (the op+arg bytes may have been overwritten
+            // by specialization with arbitrary data like pointers).
+            for c in 1..=caches {
+                if i + c < len {
+                    unsafe {
+                        code.code.instructions.write_cache_u16(i + c, 0);
+                    }
+                }
             }
+            i += 1 + caches;
         }
     }