diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 70f54f021c9375..91ade208c29dc7 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12358,10 +12358,18 @@ void emitter::emitDispIns(
                 reg2 = reg3;
                 reg3 = tmp;
             }
+
+            emitAttr attr3 = attr;
+            if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0))
+            {
+                // Shift instructions take xmm for the 3rd operand regardless of instruction size.
+                attr3 = EA_16BYTE;
+            }
+
             printf("%s", emitRegName(id->idReg1(), attr));
             emitDispEmbMasking(id);
             printf(", %s, ", emitRegName(reg2, attr));
-            printf("%s", emitRegName(reg3, attr));
+            printf("%s", emitRegName(reg3, attr3));
             emitDispEmbRounding(id);
             break;
         }
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 166a5ff7bc585d..3b46510bde396f 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -7936,12 +7936,26 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type)
 
     switch (type)
     {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return gtNewIconNode(0xFF);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return gtNewIconNode(0xFFFF);
+        }
+
         case TYP_INT:
+        case TYP_UINT:
        {
             return gtNewIconNode(-1);
        }
 
         case TYP_LONG:
+        case TYP_ULONG:
        {
             return gtNewLconNode(-1);
        }
@@ -20925,8 +20939,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
 
             unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;
 
-            GenTree* nonConstantByteShiftCountOp = NULL;
-
             if (op2->IsCnsIntOrI())
             {
                 op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21090,16 +21102,19 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             }
 
 #if defined(TARGET_XARCH)
-        case GT_RSZ:
         case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
         {
-            // We don't have actual instructions for shifting bytes, so we'll emulate them
-            // by shifting 32-bit values and masking off the bits that should be zeroed.
+            // This emulates byte shift instructions, which don't exist in x86 SIMD,
+            // plus arithmetic shift of qwords, which did not exist before AVX-512.
 
-            assert(varTypeIsByte(simdBaseType));
+            assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));
 
-            intrinsic =
-                GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
+            // We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
+            genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op;
+            intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup,
+                                                                     genActualType(simdBaseType), simdSize, false);
             assert(intrinsic != NI_Illegal);
 
             GenTree* maskAmountOp;
@@ -21107,22 +21122,53 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             if (op2->IsCnsIntOrI())
             {
                 ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
-                ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);
-
-                maskAmountOp = gtNewIconNode(mask, type);
+                if (varTypeIsByte(simdBaseType))
+                {
+                    ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
+                    maskAmountOp = gtNewLconNode(mask);
+                }
             }
             else
             {
                 assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));
 
-                GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
-                maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
+                if (op == GT_RSH)
+                {
+                    // For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
+                    // the mask will be evaluated before the shift. We swap the copied operand with the shift amount
+                    // operand here in order to preserve correct evaluation order for the masked shift count.
+                    std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
+                }
+
+                maskAmountOp = gtNewOperNode(instrOp, genActualType(simdBaseType), gtNewAllBitsSetConNode(simdBaseType),
+                                             shiftCountDup);
             }
 
-            GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
-            GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+            if (op == GT_RSH)
+            {
+                GenTree* op1Dup = fgMakeMultiUse(&op1);
+                GenTree* signOp =
+                    gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize);
+
+                CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType;
+                GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize);
+                GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+
+                return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize);
+            }
+            else
+            {
+                GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
+                GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
 
-            return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+                return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+            }
         }
 
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index 7ba8c6615f3617..4d0ebbe19b9f9f 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -3443,20 +3443,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         {
             assert(sig->numArgs == 2);
 
-            if (varTypeIsByte(simdBaseType))
-            {
-                // byte and sbyte would require more work to support
-                break;
-            }
-
-            if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
+#if defined(TARGET_X86)
+            if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE))
             {
-                if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+                if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI())
                 {
-                    // long, ulong, and double would require more work to support
+                    // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const
+                    // amount. Otherwise, more work is required for long types, so we fall back to managed for now.
                     break;
                 }
             }
+#endif // TARGET_X86
 
             if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
             {
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index 37ba933a75a97a..6d737b4f59d10f 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,
 
     bool betterToExpand = false;
 
-    // Allow some lighweight intrinsics in Tier0 which can improve throughput
+    // Allow some lightweight intrinsics in Tier0 which can improve throughput
     // we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
    if (!mustExpand && opts.Tier0OptimizationEnabled())
    {
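
Note on the gentree.cpp change: for element types with no native arithmetic-shift instruction, the new GT_RSH path lowers to a logical shift, a broadcast mask of the bits that survive the shift, and a conditional select that fills the vacated high bits based on a sign comparison. Below is a minimal scalar sketch of that identity for a single 64-bit lane; EmulatedShiftRightArithmetic is an illustrative name, not a JIT helper, and the mapping to the IR nodes is only indicated in comments.

#include <cassert>
#include <cstdint>

// Scalar model of the GT_RSH lowering sketched above: logical shift, then a
// conditional select between the shifted bits and an all-ones/all-zeros sign lane.
static int64_t EmulatedShiftRightArithmetic(int64_t value, unsigned shiftCount)
{
    assert(shiftCount < 64);

    uint64_t logical = static_cast<uint64_t>(value) >> shiftCount;        // the GT_RSZ intrinsic
    uint64_t mask    = 0xFFFFFFFFFFFFFFFFULL >> shiftCount;               // maskAmountOp: AllBitsSet shifted by the count
    uint64_t sign    = (0 > value) ? ~0ULL : 0ULL;                        // signOp: compare-greater of zero against op1

    // gtNewSimdCndSelNode(maskOp, shiftOp, signOp): keep the shifted bits where the
    // mask is set and take the sign fill where it is clear.
    return static_cast<int64_t>((logical & mask) | (sign & ~mask));
}

int main()
{
    assert(EmulatedShiftRightArithmetic(-16, 2) == -4);
    assert(EmulatedShiftRightArithmetic(16, 2) == 4);
    assert(EmulatedShiftRightArithmetic(-1, 63) == -1);
    return 0;
}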
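
The byte paths rely on a related identity: shifting a wider lane and then masking off the bits that would have crossed a byte boundary gives the same result as a per-byte shift. A minimal scalar sketch under that assumption, again with illustrative names rather than JIT helpers:

#include <cassert>
#include <cstdint>

// Scalar model of the byte-shift emulation: perform the shift in a wider lane
// (the JIT uses a wider SIMD shift) and mask the current byte afterwards.
static uint8_t EmulatedByteShiftLeft(uint8_t value, unsigned shiftCount)
{
    assert(shiftCount < 8);
    uint32_t wide = static_cast<uint32_t>(value) << shiftCount;
    uint32_t mask = (0xFF << shiftCount) & 0xFF; // clears the low bits a lower neighbour byte would pollute in SIMD
    return static_cast<uint8_t>(wide & mask);
}

static uint8_t EmulatedByteShiftRightLogical(uint8_t value, unsigned shiftCount)
{
    assert(shiftCount < 8);
    uint32_t wide = static_cast<uint32_t>(value) >> shiftCount;
    uint32_t mask = 0xFF >> shiftCount; // clears the high bits a higher neighbour byte would pollute in SIMD
    return static_cast<uint8_t>(wide & mask);
}

int main()
{
    assert(EmulatedByteShiftLeft(0x81, 1) == 0x02);
    assert(EmulatedByteShiftRightLogical(0x81, 4) == 0x08);
    return 0;
}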