From 41f446044e75b0694e688a53a42f8c8ddddd76b4 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Sun, 5 Jan 2025 20:25:05 -0800
Subject: [PATCH 1/6] emulate missing x86 shift instructions

---
 src/coreclr/jit/gentree.cpp          | 66 +++++++++++++++++++++-------
 src/coreclr/jit/hwintrinsicxarch.cpp | 15 -------
 src/coreclr/jit/importercalls.cpp    |  2 +-
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f48cbc4e02dfd2..90734e4ae043c6 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -20924,8 +20924,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
 
             unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;
 
-            GenTree* nonConstantByteShiftCountOp = NULL;
-
             if (op2->IsCnsIntOrI())
             {
                 op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21089,16 +21087,19 @@ GenTree* Compiler::gtNewSimdBinOpNode(
         }
 
 #if defined(TARGET_XARCH)
-        case GT_RSZ:
         case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
         {
-            // We don't have actual instructions for shifting bytes, so we'll emulate them
-            // by shifting 32-bit values and masking off the bits that should be zeroed.
+            // This emulates byte shift shift instructions, which don't exist in x86 SIMD,
+            // plus arithmetic shift of qwords, which did not exist pre-AVX-512.
 
-            assert(varTypeIsByte(simdBaseType));
+            assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));
 
-            intrinsic =
-                GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
+            // We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
+            genTreeOps simdOp = op == GT_RSH ? GT_RSZ : op;
+            intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, simdOp, op1, op2ForLookup,
+                                                                     genActualType(simdBaseType), simdSize, false);
 
             assert(intrinsic != NI_Illegal);
 
             GenTree* maskAmountOp;
@@ -21106,22 +21107,55 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             if (op2->IsCnsIntOrI())
             {
                 ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
-                ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);
-
-                maskAmountOp = gtNewIconNode(mask, type);
+                if (varTypeIsByte(simdBaseType))
+                {
+                    ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
+                    maskAmountOp = gtNewLconNode(mask);
+                }
             }
             else
             {
                 assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));
 
-                GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
-                maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
+                if (op == GT_RSH)
+                {
+                    // For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
+                    // the mask will be evaluated before the shift. We swap the copied operand with the shift amount
+                    // operand here in order to preserve correct evaluation order for the masked shift count.
+                    std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
+                }
+
+                maskAmountOp =
+                    varTypeIsByte(simdBaseType)
+                        ? 
gtNewOperNode(op, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) + : gtNewOperNode(op, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); } - GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize); - GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); + if (op == GT_RSH) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + GenTree* signOp = + gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize); + + CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType; + GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize); + GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); - return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize); + return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize); + } + else + { + GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize); + GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); + + return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize); + } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 7ba8c6615f3617..67bf92ce38ef73 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3443,21 +3443,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - if (varTypeIsByte(simdBaseType)) - { - // byte and sbyte would require more work to support - break; - } - - if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE)) - { - if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) - { - // long, ulong, and double would require more work to support - break; - } - } - if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? GT_RSZ : GT_RSH; diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 37ba933a75a97a..6d737b4f59d10f 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, bool betterToExpand = false; - // Allow some lighweight intrinsics in Tier0 which can improve throughput + // Allow some lightweight intrinsics in Tier0 which can improve throughput // we're fine if intrinsic decides to not expand itself in this case unlike mustExpand. 
if (!mustExpand && opts.Tier0OptimizationEnabled()) { From 478732dba9c81bfcc561119b94d839dc6dcc585d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 6 Jan 2025 13:01:09 -0800 Subject: [PATCH 2/6] disable vpsraq emulation on 32-bit --- src/coreclr/jit/hwintrinsicxarch.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 67bf92ce38ef73..eb91979e4ff252 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3443,6 +3443,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); +#if defined(TARGET_X86) + if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) + { + if (!compOpportunisticallyDependsOn(InstructionSet_EVEX)) + { + // we need vpsraq to handle signed long, use software fallback + break; + } + } +#endif // TARGET_X86 + if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? GT_RSZ : GT_RSH; From 5f4ad0f0ab88471c99d8b304b3390364898f019d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 6 Jan 2025 16:20:06 -0800 Subject: [PATCH 3/6] use logical shift for mask --- src/coreclr/jit/gentree.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 90734e4ae043c6..8b9e7dec2530eb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21097,9 +21097,9 @@ GenTree* Compiler::gtNewSimdBinOpNode( assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH))); // We will emulate arithmetic shift by using logical shift and then masking in the sign bits. - genTreeOps simdOp = op == GT_RSH ? GT_RSZ : op; - intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, simdOp, op1, op2ForLookup, - genActualType(simdBaseType), simdSize, false); + genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op; + intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup, + genActualType(simdBaseType), simdSize, false); assert(intrinsic != NI_Illegal); GenTree* maskAmountOp; @@ -21133,8 +21133,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( maskAmountOp = varTypeIsByte(simdBaseType) - ? gtNewOperNode(op, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) - : gtNewOperNode(op, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); + ? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) + : gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); } if (op == GT_RSH) From 85ba0bb732589bf67de629b5d5a0d1472f4f987f Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 7 Jan 2025 12:44:37 -0800 Subject: [PATCH 4/6] fix disasm for shift instructions --- src/coreclr/jit/emitxarch.cpp | 10 +++++++++- src/coreclr/jit/gentree.cpp | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 70f54f021c9375..ba9ed336da57d3 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -12358,10 +12358,18 @@ void emitter::emitDispIns( reg2 = reg3; reg3 = tmp; } + + // Shift instructions take xmm for the 3rd operand regardless of instruction size. 
+ emitAttr attr3 = attr; + if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(id->idIns()) & INS_TT_MEM128) != 0)) + { + attr3 = EA_16BYTE; + } + printf("%s", emitRegName(id->idReg1(), attr)); emitDispEmbMasking(id); printf(", %s, ", emitRegName(reg2, attr)); - printf("%s", emitRegName(reg3, attr)); + printf("%s", emitRegName(reg3, attr3)); emitDispEmbRounding(id); break; } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 8b9e7dec2530eb..453af79d953142 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21091,8 +21091,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( case GT_RSH: case GT_RSZ: { - // This emulates byte shift shift instructions, which don't exist in x86 SIMD, - // plus arithmetic shift of qwords, which did not exist pre-AVX-512. + // This emulates byte shift instructions, which don't exist in x86 SIMD, + // plus arithmetic shift of qwords, which did not exist before AVX-512. assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH))); From 19946a1a1668dffdc4fac844421386e6ba2129c2 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 8 Jan 2025 11:24:50 -0800 Subject: [PATCH 5/6] allow vpsraq emulation on 32-bit for const shift amount --- src/coreclr/jit/emitxarch.cpp | 4 ++-- src/coreclr/jit/hwintrinsicxarch.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ba9ed336da57d3..91ade208c29dc7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -12359,10 +12359,10 @@ void emitter::emitDispIns( reg3 = tmp; } - // Shift instructions take xmm for the 3rd operand regardless of instruction size. emitAttr attr3 = attr; - if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(id->idIns()) & INS_TT_MEM128) != 0)) + if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0)) { + // Shift instructions take xmm for the 3rd operand regardless of instruction size. 
attr3 = EA_16BYTE; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index eb91979e4ff252..9f3ab5639d7fbe 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3446,9 +3446,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, #if defined(TARGET_X86) if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) { - if (!compOpportunisticallyDependsOn(InstructionSet_EVEX)) + if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) { - // we need vpsraq to handle signed long, use software fallback + // we need vpsraq to handle signed long with non-const shift amount, use software fallback break; } } From 584901264addc02748e2265bb92a3b6c7118f164 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 9 Jan 2025 12:14:42 -0800 Subject: [PATCH 6/6] review feedback --- src/coreclr/jit/gentree.cpp | 20 ++++++++++++++++---- src/coreclr/jit/hwintrinsicxarch.cpp | 3 ++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 22e5b51785587b..3b46510bde396f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7936,12 +7936,26 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type) switch (type) { + case TYP_BYTE: + case TYP_UBYTE: + { + return gtNewIconNode(0xFF); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return gtNewIconNode(0xFFFF); + } + case TYP_INT: + case TYP_UINT: { return gtNewIconNode(-1); } case TYP_LONG: + case TYP_ULONG: { return gtNewLconNode(-1); } @@ -21132,10 +21146,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1)); } - maskAmountOp = - varTypeIsByte(simdBaseType) - ? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) - : gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); + maskAmountOp = gtNewOperNode(instrOp, genActualType(simdBaseType), gtNewAllBitsSetConNode(simdBaseType), + shiftCountDup); } if (op == GT_RSH) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 9f3ab5639d7fbe..4d0ebbe19b9f9f 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3448,7 +3448,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) { - // we need vpsraq to handle signed long with non-const shift amount, use software fallback + // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const + // amount. Otherwise, more work is required for long types, so we fall back to managed for now. break; } }
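
Reviewer note (illustrative, not part of the patch series): the two sketches below model the emulations from patch 1 in plain SSE2 intrinsics. The function names and the template parameter N are invented for the example. The first sketch is the byte shift: x86 SIMD has no byte-granularity shift (there is no psllb), so the patch shifts 32-bit lanes and masks off the bits that crossed in from a neighboring byte, using (0xFF << N) & 0xFF for left shifts and the mirror-image 0xFF >> N for logical right shifts.

#include <emmintrin.h> // SSE2

// Left-shift each byte lane of 'v' by the constant N (0-7).
template <int N>
__m128i ShiftLeftLogicalBytes(__m128i v)
{
    // Shift dword lanes; bits cross byte boundaries here...
    __m128i shifted = _mm_slli_epi32(v, N);

    // ...then zero every bit that was shifted in from a lower byte.
    __m128i mask = _mm_set1_epi8(static_cast<char>((0xFF << N) & 0xFF));
    return _mm_and_si128(shifted, mask);
}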
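The second sketch is the arithmetic right shift of 64-bit lanes, which has no dedicated instruction below AVX-512 (vpsraq): perform a logical shift, then bit-select between the shifted value and a lane of replicated sign bits using the constant mask ~0ULL >> N. This is the same ConditionalSelect shape the patch builds; the sign lane here comes from the classic shuffle+srai trick rather than the patch's (0 > op1) compare, since pcmpgtq is not available in baseline SSE2.

#include <emmintrin.h> // SSE2

// Arithmetic right shift of each int64 lane of 'v' by the constant N (0-63).
template <int N>
__m128i ShiftRightArithmetic64(__m128i v)
{
    // Logical shift leaves zeros in the vacated high bits.
    __m128i logical = _mm_srli_epi64(v, N);

    // Replicate each lane's sign bit across all 64 bits.
    __m128i sign = _mm_srai_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31);

    // Select the low (64 - N) bits from the logical shift and the high N bits
    // from the sign fill: equivalent to ConditionalSelect(keep, logical, sign).
    __m128i keep = _mm_set1_epi64x(static_cast<long long>(~0ULL >> N));
    return _mm_or_si128(_mm_and_si128(logical, keep), _mm_andnot_si128(keep, sign));
}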
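On the disassembly fix in patches 4 and 5: the vector-by-scalar shift forms always take their count operand as a 128-bit register or 128-bit memory location, even at 256- and 512-bit vector lengths, which is what the INS_TT_MEM128 tuple-type check detects. The attr3 override makes the emitter print, for example, vpsraq zmm0, zmm1, xmm2 instead of displaying the count register at the full vector width as zmm2 (hand-written illustration, not captured emitter output).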