From 41f446044e75b0694e688a53a42f8c8ddddd76b4 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Sun, 5 Jan 2025 20:25:05 -0800
Subject: [PATCH 1/6] emulate missing x86 shift instructions

---
 src/coreclr/jit/gentree.cpp          | 66 +++++++++++++++++++++-------
 src/coreclr/jit/hwintrinsicxarch.cpp | 15 -------
 src/coreclr/jit/importercalls.cpp    |  2 +-
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f48cbc4e02dfd2..90734e4ae043c6 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -20924,8 +20924,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
 
             unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;
 
-            GenTree* nonConstantByteShiftCountOp = NULL;
-
             if (op2->IsCnsIntOrI())
             {
                 op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21089,16 +21087,19 @@ GenTree* Compiler::gtNewSimdBinOpNode(
         }
 
 #if defined(TARGET_XARCH)
-        case GT_RSZ:
         case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
         {
-            // We don't have actual instructions for shifting bytes, so we'll emulate them
-            // by shifting 32-bit values and masking off the bits that should be zeroed.
+            // This emulates byte shift shift instructions, which don't exist in x86 SIMD,
+            // plus arithmetic shift of qwords, which did not exist pre-AVX-512.
 
-            assert(varTypeIsByte(simdBaseType));
+            assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));
 
-            intrinsic =
-                GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
+            // We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
+            genTreeOps simdOp = op == GT_RSH ? GT_RSZ : op;
+            intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, simdOp, op1, op2ForLookup,
+                                                                     genActualType(simdBaseType), simdSize, false);
 
             assert(intrinsic != NI_Illegal);
 
             GenTree* maskAmountOp;
@@ -21106,22 +21107,55 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             if (op2->IsCnsIntOrI())
             {
                 ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
-                ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);
-
-                maskAmountOp = gtNewIconNode(mask, type);
+                if (varTypeIsByte(simdBaseType))
+                {
+                    ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
+                    maskAmountOp = gtNewLconNode(mask);
+                }
             }
             else
             {
                 assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));
 
-                GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
-                maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
+                if (op == GT_RSH)
+                {
+                    // For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
+                    // the mask will be evaluated before the shift. We swap the copied operand with the shift amount
+                    // operand here in order to preserve correct evaluation order for the masked shift count.
+                    std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
+                }
+
+                maskAmountOp =
+                    varTypeIsByte(simdBaseType)
+                        ? 
gtNewOperNode(op, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) + : gtNewOperNode(op, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); } - GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize); - GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); + if (op == GT_RSH) + { + GenTree* op1Dup = fgMakeMultiUse(&op1); + GenTree* signOp = + gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize); + + CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType; + GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize); + GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); - return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize); + return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize); + } + else + { + GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize); + GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); + + return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize); + } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 7ba8c6615f3617..67bf92ce38ef73 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3443,21 +3443,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - if (varTypeIsByte(simdBaseType)) - { - // byte and sbyte would require more work to support - break; - } - - if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE)) - { - if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) - { - // long, ulong, and double would require more work to support - break; - } - } - if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? GT_RSZ : GT_RSH; diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 37ba933a75a97a..6d737b4f59d10f 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, bool betterToExpand = false; - // Allow some lighweight intrinsics in Tier0 which can improve throughput + // Allow some lightweight intrinsics in Tier0 which can improve throughput // we're fine if intrinsic decides to not expand itself in this case unlike mustExpand. 
if (!mustExpand && opts.Tier0OptimizationEnabled()) { From 478732dba9c81bfcc561119b94d839dc6dcc585d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 6 Jan 2025 13:01:09 -0800 Subject: [PATCH 2/6] disable vpsraq emulation on 32-bit --- src/coreclr/jit/hwintrinsicxarch.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 67bf92ce38ef73..eb91979e4ff252 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3443,6 +3443,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); +#if defined(TARGET_X86) + if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) + { + if (!compOpportunisticallyDependsOn(InstructionSet_EVEX)) + { + // we need vpsraq to handle signed long, use software fallback + break; + } + } +#endif // TARGET_X86 + if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? GT_RSZ : GT_RSH; From 5f4ad0f0ab88471c99d8b304b3390364898f019d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 6 Jan 2025 16:20:06 -0800 Subject: [PATCH 3/6] use logical shift for mask --- src/coreclr/jit/gentree.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 90734e4ae043c6..8b9e7dec2530eb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21097,9 +21097,9 @@ GenTree* Compiler::gtNewSimdBinOpNode( assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH))); // We will emulate arithmetic shift by using logical shift and then masking in the sign bits. - genTreeOps simdOp = op == GT_RSH ? GT_RSZ : op; - intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, simdOp, op1, op2ForLookup, - genActualType(simdBaseType), simdSize, false); + genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op; + intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup, + genActualType(simdBaseType), simdSize, false); assert(intrinsic != NI_Illegal); GenTree* maskAmountOp; @@ -21133,8 +21133,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( maskAmountOp = varTypeIsByte(simdBaseType) - ? gtNewOperNode(op, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) - : gtNewOperNode(op, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); + ? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) + : gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); } if (op == GT_RSH) From 85ba0bb732589bf67de629b5d5a0d1472f4f987f Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 7 Jan 2025 12:44:37 -0800 Subject: [PATCH 4/6] fix disasm for shift instructions --- src/coreclr/jit/emitxarch.cpp | 10 +++++++++- src/coreclr/jit/gentree.cpp | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 70f54f021c9375..ba9ed336da57d3 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -12358,10 +12358,18 @@ void emitter::emitDispIns( reg2 = reg3; reg3 = tmp; } + + // Shift instructions take xmm for the 3rd operand regardless of instruction size. 
+ emitAttr attr3 = attr; + if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(id->idIns()) & INS_TT_MEM128) != 0)) + { + attr3 = EA_16BYTE; + } + printf("%s", emitRegName(id->idReg1(), attr)); emitDispEmbMasking(id); printf(", %s, ", emitRegName(reg2, attr)); - printf("%s", emitRegName(reg3, attr)); + printf("%s", emitRegName(reg3, attr3)); emitDispEmbRounding(id); break; } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 8b9e7dec2530eb..453af79d953142 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21091,8 +21091,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( case GT_RSH: case GT_RSZ: { - // This emulates byte shift shift instructions, which don't exist in x86 SIMD, - // plus arithmetic shift of qwords, which did not exist pre-AVX-512. + // This emulates byte shift instructions, which don't exist in x86 SIMD, + // plus arithmetic shift of qwords, which did not exist before AVX-512. assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH))); From 19946a1a1668dffdc4fac844421386e6ba2129c2 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 8 Jan 2025 11:24:50 -0800 Subject: [PATCH 5/6] allow vpsraq emulation on 32-bit for const shift amount --- src/coreclr/jit/emitxarch.cpp | 4 ++-- src/coreclr/jit/hwintrinsicxarch.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ba9ed336da57d3..91ade208c29dc7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -12359,10 +12359,10 @@ void emitter::emitDispIns( reg3 = tmp; } - // Shift instructions take xmm for the 3rd operand regardless of instruction size. emitAttr attr3 = attr; - if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(id->idIns()) & INS_TT_MEM128) != 0)) + if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0)) { + // Shift instructions take xmm for the 3rd operand regardless of instruction size. 
attr3 = EA_16BYTE; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index eb91979e4ff252..9f3ab5639d7fbe 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3446,9 +3446,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, #if defined(TARGET_X86) if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) { - if (!compOpportunisticallyDependsOn(InstructionSet_EVEX)) + if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) { - // we need vpsraq to handle signed long, use software fallback + // we need vpsraq to handle signed long with non-const shift amount, use software fallback break; } } From 584901264addc02748e2265bb92a3b6c7118f164 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 9 Jan 2025 12:14:42 -0800 Subject: [PATCH 6/6] review feedback --- src/coreclr/jit/gentree.cpp | 20 ++++++++++++++++---- src/coreclr/jit/hwintrinsicxarch.cpp | 3 ++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 22e5b51785587b..3b46510bde396f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7936,12 +7936,26 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type) switch (type) { + case TYP_BYTE: + case TYP_UBYTE: + { + return gtNewIconNode(0xFF); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return gtNewIconNode(0xFFFF); + } + case TYP_INT: + case TYP_UINT: { return gtNewIconNode(-1); } case TYP_LONG: + case TYP_ULONG: { return gtNewLconNode(-1); } @@ -21132,10 +21146,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1)); } - maskAmountOp = - varTypeIsByte(simdBaseType) - ? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup) - : gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup); + maskAmountOp = gtNewOperNode(instrOp, genActualType(simdBaseType), gtNewAllBitsSetConNode(simdBaseType), + shiftCountDup); } if (op == GT_RSH) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 9f3ab5639d7fbe..4d0ebbe19b9f9f 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -3448,7 +3448,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) { - // we need vpsraq to handle signed long with non-const shift amount, use software fallback + // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const + // amount. Otherwise, more work is required for long types, so we fall back to managed for now. break; } }
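
Reviewer note (illustrative, not part of the patch series): the two sketches below model the emulations from patch 1 in plain SSE2 intrinsics. The function names and the template parameter N are invented for the example. The first sketch is the byte shift: x86 SIMD has no byte-granularity shift (there is no psllb), so the patch shifts 32-bit lanes and masks off the bits that crossed in from a neighboring byte, using (0xFF << N) & 0xFF for left shifts and the mirror-image 0xFF >> N for logical right shifts.

#include <emmintrin.h> // SSE2

// Left-shift each byte lane of 'v' by the constant N (0-7).
template <int N>
__m128i ShiftLeftLogicalBytes(__m128i v)
{
    // Shift dword lanes; bits cross byte boundaries here...
    __m128i shifted = _mm_slli_epi32(v, N);

    // ...then zero every bit that was shifted in from a lower byte.
    __m128i mask = _mm_set1_epi8(static_cast<char>((0xFF << N) & 0xFF));
    return _mm_and_si128(shifted, mask);
}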
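The second sketch is the arithmetic right shift of 64-bit lanes, which has no dedicated instruction below AVX-512 (vpsraq): perform a logical shift, then bit-select between the shifted value and a lane of replicated sign bits using the constant mask ~0ULL >> N. This is the same ConditionalSelect shape the patch builds; the sign lane here comes from the classic shuffle+srai trick rather than the patch's (0 > op1) compare, since pcmpgtq is not available in baseline SSE2.

#include <emmintrin.h> // SSE2

// Arithmetic right shift of each int64 lane of 'v' by the constant N (0-63).
template <int N>
__m128i ShiftRightArithmetic64(__m128i v)
{
    // Logical shift leaves zeros in the vacated high bits.
    __m128i logical = _mm_srli_epi64(v, N);

    // Replicate each lane's sign bit across all 64 bits.
    __m128i sign = _mm_srai_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31);

    // Select the low (64 - N) bits from the logical shift and the high N bits
    // from the sign fill: equivalent to ConditionalSelect(keep, logical, sign).
    __m128i keep = _mm_set1_epi64x(static_cast<long long>(~0ULL >> N));
    return _mm_or_si128(_mm_and_si128(logical, keep), _mm_andnot_si128(keep, sign));
}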
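On the disassembly fix in patches 4 and 5: the vector-by-scalar shift forms always take their count operand as a 128-bit register or 128-bit memory location, even at 256- and 512-bit vector lengths, which is what the INS_TT_MEM128 tuple-type check detects. The attr3 override makes the emitter print, for example, vpsraq zmm0, zmm1, xmm2 instead of displaying the count register at the full vector width as zmm2 (hand-written illustration, not captured emitter output).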