From 9228b69f279d53c722f6099ea3e4ec4660ec81d6 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 11 Dec 2024 17:51:31 -0800 Subject: [PATCH] pulley: Initial scaffold of SIMD support This commit fills out some of the initial infrastructure necessary for supporting the SIMD proposal to WebAssembly in the Pulley interpreter, namely 128-bit simd. The `VRegVal` union has been filled out with various types, endianness questions are settled, and initial implementations of a suite of opcodes are added to get a basic set of tests working throughout the backend. cc #9783 --- cranelift/codegen/meta/src/pulley.rs | 60 ++-- .../codegen/src/isa/pulley_shared/abi.rs | 21 +- .../codegen/src/isa/pulley_shared/inst.isle | 10 + .../codegen/src/isa/pulley_shared/inst/mod.rs | 14 +- .../codegen/src/isa/pulley_shared/lower.isle | 32 +++ crates/wast-util/src/lib.rs | 9 - pulley/src/decode.rs | 20 +- pulley/src/disas.rs | 25 +- pulley/src/encode.rs | 24 +- pulley/src/interp.rs | 262 +++++++++++++++++- pulley/src/lib.rs | 41 +++ pulley/src/regs.rs | 18 +- 12 files changed, 474 insertions(+), 62 deletions(-) diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs index fa22191d1bba..2c95b6e5366b 100644 --- a/cranelift/codegen/meta/src/pulley.rs +++ b/cranelift/codegen/meta/src/pulley.rs @@ -27,10 +27,23 @@ const OPS: &[Inst<'_>] = pulley_interpreter::for_each_op!(define); const EXTENDED_OPS: &[Inst<'_>] = pulley_interpreter::for_each_extended_op!(define); enum Operand<'a> { - Normal { name: &'a str, ty: &'a str }, - Writable { name: &'a str, ty: &'a str }, - TrapCode { name: &'a str, ty: &'a str }, - Binop { reg: &'a str }, + Normal { + name: &'a str, + ty: &'a str, + }, + Writable { + name: &'a str, + ty: &'a str, + }, + TrapCode { + name: &'a str, + ty: &'a str, + }, + Binop { + dst: &'a str, + src1: &'a str, + src2: &'a str, + }, } impl Inst<'_> { @@ -38,8 +51,23 @@ impl Inst<'_> { self.fields .iter() .map(|(name, ty)| match (*name, *ty) { - 
("operands", "BinaryOperands < XReg >") => Operand::Binop { reg: "XReg" }, - ("operands", "BinaryOperands < FReg >") => Operand::Binop { reg: "FReg" }, + ("operands", binop) => { + // Parse "BinaryOperands < A >" as A/A/A (dst/src1/src2) + // Parse "BinaryOperands < A, B >" as A/B/A + // Parse "BinaryOperands < A, B, C >" as A/B/C + let mut parts = binop + .strip_prefix("BinaryOperands <") + .unwrap() + .strip_suffix(">") + .unwrap() + .trim() + .split(',') + .map(|x| x.trim()); + let dst = parts.next().unwrap(); + let src1 = parts.next().unwrap_or(dst); + let src2 = parts.next().unwrap_or(dst); + Operand::Binop { dst, src1, src2 } + } ("dst", ty) => Operand::Writable { name, ty }, (name, ty) => Operand::Normal { name, ty }, }) @@ -109,7 +137,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> { pat.push_str(","); format_string.push_str(&format!(" // trap={{{name}:?}}")); } - Operand::Binop { reg: _ } => { + Operand::Binop { .. } => { pat.push_str("dst, src1, src2,"); format_string.push_str(" {dst}, {src1}, {src2}"); locals.push_str(&format!("let dst = reg_name(*dst.to_reg());\n")); @@ -161,7 +189,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> { } } Operand::TrapCode { .. } => {} - Operand::Binop { reg: _ } => { + Operand::Binop { .. } => { pat.push_str("dst, src1, src2,"); uses.push("src1"); uses.push("src2"); @@ -221,7 +249,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> { pat.push_str(","); trap.push_str(&format!("sink.add_trap({name});\n")); } - Operand::Binop { reg: _ } => { + Operand::Binop { .. 
} => { pat.push_str("dst, src1, src2,"); args.push_str( "pulley_interpreter::regs::BinaryOperands::new(dst, src1, src2),", @@ -265,10 +293,10 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> { Operand::Writable { name, ty } => { isle.push_str(&format!("\n ({name} Writable{ty})")); } - Operand::Binop { reg } => { - isle.push_str(&format!("\n (dst Writable{reg})")); - isle.push_str(&format!("\n (src1 {reg})")); - isle.push_str(&format!("\n (src2 {reg})")); + Operand::Binop { dst, src1, src2 } => { + isle.push_str(&format!("\n (dst Writable{dst})")); + isle.push_str(&format!("\n (src1 {src1})")); + isle.push_str(&format!("\n (src2 {src2})")); } } } @@ -303,13 +331,13 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> { assert!(result.is_none(), "{} has >1 result", inst.snake_name); result = Some(ty); } - Operand::Binop { reg } => { - isle.push_str(&format!("{reg} {reg}")); + Operand::Binop { dst, src1, src2 } => { + isle.push_str(&format!("{src1} {src2}")); rule.push_str("src1 src2"); ops.push("src1"); ops.push("src2"); assert!(result.is_none(), "{} has >1 result", inst.snake_name); - result = Some(reg); + result = Some(dst); } } isle.push_str(" "); diff --git a/cranelift/codegen/src/isa/pulley_shared/abi.rs b/cranelift/codegen/src/isa/pulley_shared/abi.rs index e2c9317d1d93..292e8b680ac3 100644 --- a/cranelift/codegen/src/isa/pulley_shared/abi.rs +++ b/cranelift/codegen/src/isa/pulley_shared/abi.rs @@ -510,17 +510,18 @@ where _target_vector_bytes: u32, _isa_flags: &PulleyFlags, ) -> u32 { + // Spill slots are the size of a "word" or a pointer, but Pulley + // registers are 8-byte for integers/floats regardless of pointer size. + // Calculate the number of slots necessary to store 8 bytes. 
+ let slots_for_8bytes = match P::pointer_width() { + PointerWidth::PointerWidth32 => 2, + PointerWidth::PointerWidth64 => 1, + }; match rc { - // Spilling an integer or float register requires spilling 8 bytes, - // and spill slots are defined in terms of "word bytes" or the size - // of a pointer. That means on 32-bit pulley we need to take up two - // spill slots where on 64-bit pulley we need to only take up one - // spill slot for integers. - RegClass::Int | RegClass::Float => match P::pointer_width() { - PointerWidth::PointerWidth32 => 2, - PointerWidth::PointerWidth64 => 1, - }, - RegClass::Vector => unreachable!(), + // Int/float registers are 8 bytes + RegClass::Int | RegClass::Float => slots_for_8bytes, + // Vector registers are 16 bytes + RegClass::Vector => 2 * slots_for_8bytes, } } diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 015b547fb96f..384912269c71 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -414,6 +414,16 @@ (rule (pulley_fstore amode src ty flags) (SideEffectNoResult.Inst (MInst.FStore amode src ty flags))) +(decl pulley_vload (Amode Type MemFlags) VReg) +(rule (pulley_vload amode ty flags) + (let ((dst WritableVReg (temp_writable_vreg)) + (_ Unit (emit (MInst.VLoad dst amode ty flags)))) + dst)) + +(decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult) +(rule (pulley_vstore amode src ty flags) + (SideEffectNoResult.Inst (MInst.VStore amode src ty flags))) + (decl gen_br_table (XReg MachLabel BoxVecMachLabel) Unit) (rule (gen_br_table idx default labels) (emit (MInst.BrTable idx default labels))) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index e2560639d1f0..11aac8e7c304 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -453,18 +453,8 
@@ where } fn worst_case_size() -> CodeOffset { - // `BrIfXeq32 { a, b, taken, not_taken }` expands to `br_if_xeq32 a, b, taken; jump not_taken`. - // - // The first instruction is seven bytes long: - // * 1 byte opcode - // * 1 byte `a` register encoding - // * 1 byte `b` register encoding - // * 4 byte `taken` displacement - // - // And the second instruction is five bytes long: - // * 1 byte opcode - // * 4 byte `not_taken` displacement - 12 + // `Vconst128 { dst, imm }` is 18 bytes (opcode + dst + 16-byte imm) + 18 } fn ref_type_regclass(_settings: &settings::Flags) -> RegClass { diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle index e1df6602706b..a97d26bc7589 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.isle +++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle @@ -143,6 +143,11 @@ (rule (lower (has_type $I64 (iadd a b))) (pulley_xadd64 a b)) +(rule (lower (has_type $I8X16 (iadd a b))) (pulley_vaddi8x16 a b)) +(rule (lower (has_type $I16X8 (iadd a b))) (pulley_vaddi16x8 a b)) +(rule (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b)) +(rule (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b)) + ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8 (isub a b))) @@ -192,6 +197,11 @@ (rule (lower (has_type $I64 (ishl a b))) (pulley_xshl64 a b)) +(rule (lower (has_type $I8X16 (ishl a b))) (pulley_vshli8x16 a b)) +(rule (lower (has_type $I16X8 (ishl a b))) (pulley_vshli16x8 a b)) +(rule (lower (has_type $I32X4 (ishl a b))) (pulley_vshli32x4 a b)) +(rule (lower (has_type $I64X2 (ishl a b))) (pulley_vshli64x2 a b)) + ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I32 (ushr a b))) @@ -200,6 +210,11 @@ (rule (lower (has_type $I64 (ushr a b))) (pulley_xshr64_u a b)) +(rule (lower (has_type $I8X16 (ushr a b))) (pulley_vshri8x16_u a b)) +(rule (lower (has_type 
$I16X8 (ushr a b))) (pulley_vshri16x8_u a b)) +(rule (lower (has_type $I32X4 (ushr a b))) (pulley_vshri32x4_u a b)) +(rule (lower (has_type $I64X2 (ushr a b))) (pulley_vshri64x2_u a b)) + ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I32 (sshr a b))) @@ -208,6 +223,11 @@ (rule (lower (has_type $I64 (sshr a b))) (pulley_xshr64_s a b)) +(rule (lower (has_type $I8X16 (sshr a b))) (pulley_vshri8x16_s a b)) +(rule (lower (has_type $I16X8 (sshr a b))) (pulley_vshri16x8_s a b)) +(rule (lower (has_type $I32X4 (sshr a b))) (pulley_vshri32x4_s a b)) +(rule (lower (has_type $I64X2 (sshr a b))) (pulley_vshri64x2_s a b)) + ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (fits_in_32 _) (band a b))) @@ -414,6 +434,9 @@ (rule 1 (lower (has_type $I64 (sload32 flags addr offset))) (pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64))) +(rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset))) + (pulley_vload (amode addr offset) ty flags)) + ;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (store flags src @ (value_type (ty_int ty)) addr offset)) @@ -431,6 +454,9 @@ (rule (lower (istore32 flags src addr offset)) (side_effect (pulley_xstore (amode addr offset) src $I32 flags))) +(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset)) + (side_effect (pulley_vstore (amode addr offset) src ty flags))) + ;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (stack_addr stack_slot offset)) @@ -622,6 +648,8 @@ (rule (lower (has_type $F32 (fadd a b))) (pulley_fadd32 a b)) (rule (lower (has_type $F64 (fadd a b))) (pulley_fadd64 a b)) +(rule (lower (has_type $F32X4 (fadd a b))) (pulley_vaddf32x4 a b)) +(rule (lower (has_type $F64X2 (fadd a b))) (pulley_vaddf64x2 a b)) ;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -687,3 
+715,7 @@ (rule (lower (has_type $F32 (fabs a))) (pulley_fabs32 a)) (rule (lower (has_type $F64 (fabs a))) (pulley_fabs64 a)) + +;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant a)))) (pulley_vconst128 a)) diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index 00551c46f29e..7c99c6eaaa83 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -403,10 +403,8 @@ impl WastTest { "misc_testsuite/simd/almost-extmul.wast", "misc_testsuite/simd/canonicalize-nan.wast", "misc_testsuite/simd/cvt-from-uint.wast", - "misc_testsuite/simd/interesting-float-splat.wast", "misc_testsuite/simd/issue4807.wast", "misc_testsuite/simd/issue6725-no-egraph-panic.wast", - "misc_testsuite/simd/issue_3173_select_v128.wast", "misc_testsuite/simd/issue_3327_bnot_lowering.wast", "misc_testsuite/simd/load_splat_out_of_bounds.wast", "misc_testsuite/simd/replace-lane-preserve.wast", @@ -418,11 +416,6 @@ impl WastTest { "misc_testsuite/threads/MP_wait.wast", "misc_testsuite/threads/SB_atomic.wast", "misc_testsuite/threads/load-store-alignment.wast", - "misc_testsuite/winch/_simd_address.wast", - "misc_testsuite/winch/_simd_const.wast", - "misc_testsuite/winch/_simd_load.wast", - "misc_testsuite/winch/_simd_multivalue.wast", - "misc_testsuite/winch/_simd_store.wast", "spec_testsuite/proposals/annotations/simd_lane.wast", "spec_testsuite/proposals/multi-memory/simd_memory-multi.wast", "spec_testsuite/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast", @@ -433,7 +426,6 @@ impl WastTest { "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast", "spec_testsuite/proposals/relaxed-simd/relaxed_min_max.wast", "spec_testsuite/proposals/threads/atomic.wast", - "spec_testsuite/simd_address.wast", "spec_testsuite/simd_align.wast", "spec_testsuite/simd_bit_shift.wast", "spec_testsuite/simd_bitwise.wast", @@ -484,7 +476,6 @@ impl WastTest { 
"spec_testsuite/simd_load_splat.wast", "spec_testsuite/simd_load_zero.wast", "spec_testsuite/simd_splat.wast", - "spec_testsuite/simd_store.wast", "spec_testsuite/simd_store16_lane.wast", "spec_testsuite/simd_store32_lane.wast", "spec_testsuite/simd_store64_lane.wast", diff --git a/pulley/src/decode.rs b/pulley/src/decode.rs index bcd57017283d..d11fbe482d85 100644 --- a/pulley/src/decode.rs +++ b/pulley/src/decode.rs @@ -303,6 +303,15 @@ impl Decode for u64 { } } +impl Decode for u128 { + fn decode(bytecode: &mut T) -> Result + where + T: BytecodeStream, + { + Ok(u128::from_le_bytes(bytecode.read()?)) + } +} + impl Decode for i8 { fn decode(bytecode: &mut T) -> Result where @@ -339,6 +348,15 @@ impl Decode for i64 { } } +impl Decode for i128 { + fn decode(bytecode: &mut T) -> Result + where + T: BytecodeStream, + { + Ok(i128::from_le_bytes(bytecode.read()?)) + } +} + impl Decode for XReg { fn decode(bytecode: &mut T) -> Result where @@ -404,7 +422,7 @@ impl Decode for ExtendedOpcode { } } -impl Decode for BinaryOperands { +impl Decode for BinaryOperands { fn decode(bytecode: &mut T) -> Result where T: BytecodeStream, diff --git a/pulley/src/disas.rs b/pulley/src/disas.rs index 301bf2c345d6..fedff6ea14be 100644 --- a/pulley/src/disas.rs +++ b/pulley/src/disas.rs @@ -149,6 +149,12 @@ impl Disas for i64 { } } +impl Disas for i128 { + fn disas(&self, _position: usize, disas: &mut String) { + write!(disas, "{self}").unwrap(); + } +} + impl Disas for u8 { fn disas(&self, _position: usize, disas: &mut String) { write!(disas, "{self}").unwrap(); @@ -173,6 +179,12 @@ impl Disas for u64 { } } +impl Disas for u128 { + fn disas(&self, _position: usize, disas: &mut String) { + write!(disas, "{self}").unwrap(); + } +} + impl Disas for PcRelOffset { fn disas(&self, position: usize, disas: &mut String) { let offset = isize::try_from(i32::from(*self)).unwrap(); @@ -192,9 +204,18 @@ fn disas_list(position: usize, disas: &mut String, iter: impl IntoIter } } -impl Disas for 
BinaryOperands { +impl Disas for BinaryOperands +where + D: Reg + Disas, + S1: Reg + Disas, + S2: Reg + Disas, +{ fn disas(&self, position: usize, disas: &mut String) { - disas_list(position, disas, [self.dst, self.src1, self.src2]) + self.dst.disas(position, disas); + write!(disas, ", ").unwrap(); + self.src1.disas(position, disas); + write!(disas, ", ").unwrap(); + self.src2.disas(position, disas); } } diff --git a/pulley/src/encode.rs b/pulley/src/encode.rs index 1891b158a7af..c1d7d2dab610 100644 --- a/pulley/src/encode.rs +++ b/pulley/src/encode.rs @@ -59,6 +59,17 @@ impl Encode for u64 { } } +impl Encode for u128 { + const WIDTH: u8 = 16; + + fn encode(&self, sink: &mut E) + where + E: Extend, + { + sink.extend(self.to_le_bytes()); + } +} + impl Encode for i8 { const WIDTH: u8 = 1; @@ -103,6 +114,17 @@ impl Encode for i64 { } } +impl Encode for i128 { + const WIDTH: u8 = 16; + + fn encode(&self, sink: &mut E) + where + E: Extend, + { + sink.extend(self.to_le_bytes()); + } +} + impl Encode for XReg { const WIDTH: u8 = 1; @@ -147,7 +169,7 @@ impl Encode for PcRelOffset { } } -impl Encode for BinaryOperands { +impl Encode for BinaryOperands { const WIDTH: u8 = 2; fn encode(&self, sink: &mut E) diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 4800015ddfc2..2579790cda43 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -540,11 +540,30 @@ impl fmt::LowerHex for VRegVal { } } +/// 128-bit vector registers. +/// +/// This register is always stored in little-endian order and has different +/// constraints than `XRegVal` and `FRegVal` above. Notably all fields of this +/// union are the same width so all bits are always defined. Note that +/// little-endian is required though so bitcasts between different shapes of +/// vectors work. This union cannot be stored in big-endian. 
#[derive(Copy, Clone)] +#[repr(align(16))] union VRegUnion { - // TODO: need to figure out how we are going to handle portability of lane - // ordering on top of each lane's endianness. u128: u128, + i8x16: [i8; 16], + i16x8: [i16; 8], + i32x4: [i32; 4], + i64x2: [i64; 2], + u8x16: [u8; 16], + u16x8: [u16; 8], + u32x4: [u32; 4], + u64x2: [u64; 2], + // Note that these are `u32` and `u64`, not f32/f64. That's only because + // f32/f64 don't have `.to_le()` and `::from_le()` so need to go through the + // bits anyway. + f32x4: [u32; 4], + f64x2: [u64; 2], } impl Default for VRegVal { @@ -569,6 +588,96 @@ impl VRegVal { pub fn set_u128(&mut self, val: u128) { self.0.u128 = val.to_le(); } + + fn get_i8x16(&self) -> [i8; 16] { + let val = unsafe { self.0.i8x16 }; + val.map(|e| i8::from_le(e)) + } + + fn set_i8x16(&mut self, val: [i8; 16]) { + self.0.i8x16 = val.map(|e| e.to_le()); + } + + fn get_u8x16(&self) -> [u8; 16] { + let val = unsafe { self.0.u8x16 }; + val.map(|e| u8::from_le(e)) + } + + fn set_u8x16(&mut self, val: [u8; 16]) { + self.0.u8x16 = val.map(|e| e.to_le()); + } + + fn get_i16x8(&self) -> [i16; 8] { + let val = unsafe { self.0.i16x8 }; + val.map(|e| i16::from_le(e)) + } + + fn set_i16x8(&mut self, val: [i16; 8]) { + self.0.i16x8 = val.map(|e| e.to_le()); + } + + fn get_u16x8(&self) -> [u16; 8] { + let val = unsafe { self.0.u16x8 }; + val.map(|e| u16::from_le(e)) + } + + fn set_u16x8(&mut self, val: [u16; 8]) { + self.0.u16x8 = val.map(|e| e.to_le()); + } + + fn get_i32x4(&self) -> [i32; 4] { + let val = unsafe { self.0.i32x4 }; + val.map(|e| i32::from_le(e)) + } + + fn set_i32x4(&mut self, val: [i32; 4]) { + self.0.i32x4 = val.map(|e| e.to_le()); + } + + fn get_u32x4(&self) -> [u32; 4] { + let val = unsafe { self.0.u32x4 }; + val.map(|e| u32::from_le(e)) + } + + fn set_u32x4(&mut self, val: [u32; 4]) { + self.0.u32x4 = val.map(|e| e.to_le()); + } + + fn get_i64x2(&self) -> [i64; 2] { + let val = unsafe { self.0.i64x2 }; + val.map(|e| i64::from_le(e)) + 
} + + fn set_i64x2(&mut self, val: [i64; 2]) { + self.0.i64x2 = val.map(|e| e.to_le()); + } + + fn get_u64x2(&self) -> [u64; 2] { + let val = unsafe { self.0.u64x2 }; + val.map(|e| u64::from_le(e)) + } + + fn set_u64x2(&mut self, val: [u64; 2]) { + self.0.u64x2 = val.map(|e| e.to_le()); + } + + fn get_f64x2(&self) -> [f64; 2] { + let val = unsafe { self.0.f64x2 }; + val.map(|e| f64::from_bits(u64::from_le(e))) + } + + fn set_f64x2(&mut self, val: [f64; 2]) { + self.0.f64x2 = val.map(|e| e.to_bits().to_le()); + } + + fn get_f32x4(&self) -> [f32; 4] { + let val = unsafe { self.0.f32x4 }; + val.map(|e| f32::from_bits(u32::from_le(e))) + } + + fn set_f32x4(&mut self, val: [f32; 4]) { + self.0.f32x4 = val.map(|e| e.to_bits().to_le()); + } } /// The machine state for a Pulley virtual machine: the various registers and @@ -2417,6 +2526,155 @@ impl OpVisitor for Interpreter<'_> { self.state[dst].set_f64(a.wasm_abs()); ControlFlow::Continue(()) } + + fn vaddi8x16(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_i8x16(); + let b = self.state[operands.src2].get_i8x16(); + for (a, b) in a.iter_mut().zip(b) { + *a = a.wrapping_add(b); // wrap on overflow like Wasm; plain add panics in debug builds + } + self.state[operands.dst].set_i8x16(a); + ControlFlow::Continue(()) + } + + fn vaddi16x8(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_i16x8(); + let b = self.state[operands.src2].get_i16x8(); + for (a, b) in a.iter_mut().zip(b) { + *a = a.wrapping_add(b); // wrap on overflow like Wasm; plain add panics in debug builds + } + self.state[operands.dst].set_i16x8(a); + ControlFlow::Continue(()) + } + + fn vaddi32x4(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_i32x4(); + let b = self.state[operands.src2].get_i32x4(); + for (a, b) in a.iter_mut().zip(b) { + *a = a.wrapping_add(b); // wrap on overflow like Wasm; plain add panics in debug builds + } + self.state[operands.dst].set_i32x4(a); + ControlFlow::Continue(()) + } + + fn vaddi64x2(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_i64x2(); + let b = 
self.state[operands.src2].get_i64x2(); + for (a, b) in a.iter_mut().zip(b) { + *a = a.wrapping_add(b); // wrap on overflow like Wasm; plain add panics in debug builds + } + self.state[operands.dst].set_i64x2(a); + ControlFlow::Continue(()) + } + + fn vaddf32x4(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_f32x4(); + let b = self.state[operands.src2].get_f32x4(); + for (a, b) in a.iter_mut().zip(b) { + *a += b; + } + self.state[operands.dst].set_f32x4(a); + ControlFlow::Continue(()) + } + + fn vaddf64x2(&mut self, operands: BinaryOperands) -> ControlFlow { + let mut a = self.state[operands.src1].get_f64x2(); + let b = self.state[operands.src2].get_f64x2(); + for (a, b) in a.iter_mut().zip(b) { + *a += b; + } + self.state[operands.dst].set_f64x2(a); + ControlFlow::Continue(()) + } + + fn vshli8x16(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i8x16(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i8x16(a.map(|a| a.wrapping_shl(b))); + ControlFlow::Continue(()) + } + + fn vshli16x8(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i16x8(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i16x8(a.map(|a| a.wrapping_shl(b))); + ControlFlow::Continue(()) + } + + fn vshli32x4(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i32x4(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i32x4(a.map(|a| a.wrapping_shl(b))); + ControlFlow::Continue(()) + } + + fn vshli64x2(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i64x2(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i64x2(a.map(|a| a.wrapping_shl(b))); + ControlFlow::Continue(()) + } + + fn vshri8x16_s(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i8x16(); + let b = self.state[operands.src2].get_u32(); 
+ self.state[operands.dst].set_i8x16(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri16x8_s(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i16x8(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i16x8(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri32x4_s(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i32x4(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i32x4(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri64x2_s(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_i64x2(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_i64x2(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri8x16_u(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_u8x16(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_u8x16(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri16x8_u(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_u16x8(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_u16x8(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri32x4_u(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_u32x4(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_u32x4(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn vshri64x2_u(&mut self, operands: BinaryOperands) -> ControlFlow { + let a = self.state[operands.src1].get_u64x2(); + let b = self.state[operands.src2].get_u32(); + self.state[operands.dst].set_u64x2(a.map(|a| a.wrapping_shr(b))); + ControlFlow::Continue(()) + } + + fn 
vconst128(&mut self, dst: VReg, val: u128) -> ControlFlow { + self.state[dst].set_u128(val); + ControlFlow::Continue(()) + } } impl ExtendedOpVisitor for Interpreter<'_> { diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index bb8b83f7994a..563ee1df6755 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -563,6 +563,47 @@ macro_rules! for_each_op { fneg64 = Fneg64 { dst: FReg, src: FReg }; /// `dst = |src|` fabs64 = Fabs64 { dst: FReg, src: FReg }; + + /// `dst = imm` + vconst128 = Vconst128 { dst: VReg, imm: u128 }; + + /// `dst = src1 + src2` + vaddi8x16 = VAddI8x16 { operands: BinaryOperands }; + /// `dst = src1 + src2` + vaddi16x8 = VAddI16x8 { operands: BinaryOperands }; + /// `dst = src1 + src2` + vaddi32x4 = VAddI32x4 { operands: BinaryOperands }; + /// `dst = src1 + src2` + vaddi64x2 = VAddI64x2 { operands: BinaryOperands }; + /// `dst = src1 + src2` + vaddf32x4 = VAddF32x4 { operands: BinaryOperands }; + /// `dst = src1 + src2` + vaddf64x2 = VAddF64x2 { operands: BinaryOperands }; + + /// `dst = src1 << src2` + vshli8x16 = VShlI8x16 { operands: BinaryOperands }; + /// `dst = src1 << src2` + vshli16x8 = VShlI16x8 { operands: BinaryOperands }; + /// `dst = src1 << src2` + vshli32x4 = VShlI32x4 { operands: BinaryOperands }; + /// `dst = src1 << src2` + vshli64x2 = VShlI64x2 { operands: BinaryOperands }; + /// `dst = src1 >> src2` (signed) + vshri8x16_s = VShrI8x16S { operands: BinaryOperands }; + /// `dst = src1 >> src2` (signed) + vshri16x8_s = VShrI16x8S { operands: BinaryOperands }; + /// `dst = src1 >> src2` (signed) + vshri32x4_s = VShrI32x4S { operands: BinaryOperands }; + /// `dst = src1 >> src2` (signed) + vshri64x2_s = VShrI64x2S { operands: BinaryOperands }; + /// `dst = src1 >> src2` (unsigned) + vshri8x16_u = VShrI8x16U { operands: BinaryOperands }; + /// `dst = src1 >> src2` (unsigned) + vshri16x8_u = VShrI16x8U { operands: BinaryOperands }; + /// `dst = src1 >> src2` (unsigned) + vshri32x4_u = VShrI32x4U { operands: BinaryOperands }; + 
/// `dst = src1 >> src2` (unsigned) + vshri64x2_u = VShrI64x2U { operands: BinaryOperands }; } }; } diff --git a/pulley/src/regs.rs b/pulley/src/regs.rs index deaa08deb19f..00262bf233ff 100644 --- a/pulley/src/regs.rs +++ b/pulley/src/regs.rs @@ -164,18 +164,18 @@ impl fmt::Debug for AnyReg { /// Operands to a binary operation, packed into a 16-bit word (5 bits per register). #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub struct BinaryOperands { +pub struct BinaryOperands { /// The destination register, packed in bits 0..5. - pub dst: R, + pub dst: D, /// The first source register, packed in bits 5..10. - pub src1: R, + pub src1: S1, /// The second source register, packed in bits 10..15. - pub src2: R, + pub src2: S2, } -impl BinaryOperands { +impl BinaryOperands { /// Convenience constructor for applying `Into` - pub fn new(dst: impl Into, src1: impl Into, src2: impl Into) -> Self { + pub fn new(dst: impl Into, src1: impl Into, src2: impl Into) -> Self { Self { dst: dst.into(), src1: src1.into(), @@ -194,9 +194,9 @@ impl BinaryOperands { /// Convert from dense 16 bit encoding. The topmost bit is ignored. pub fn from_bits(bits: u16) -> Self { Self { - dst: R::new((bits & 0b11111) as u8).unwrap(), - src1: R::new(((bits >> 5) & 0b11111) as u8).unwrap(), - src2: R::new(((bits >> 10) & 0b11111) as u8).unwrap(), + dst: D::new((bits & 0b11111) as u8).unwrap(), + src1: S1::new(((bits >> 5) & 0b11111) as u8).unwrap(), + src2: S2::new(((bits >> 10) & 0b11111) as u8).unwrap(), } } }