From a872cbd30c9a174211b314f0cb78906dcb1c4486 Mon Sep 17 00:00:00 2001 From: Aleksandar Micic Date: Mon, 20 Jan 2025 12:13:57 -0500 Subject: [PATCH] Force outlining for array EA API Macros for Array Effective Address calculations are inherently inlined and seem to create a lot of pressure on either registers or the code cache in the Bytecode interpreter. They are rewritten in C and forced to be outlined specifically for GNU compilers on X and Z, where we saw a regression when Offheap was introduced (which made the macros more complex, creating even more pressure). For other platforms, where we did not see a regression, we continue to inline (it is currently unknown whether outlining would have a negative or positive effect). Hence we still keep it in a header (*.h) file. Signed-off-by: Aleksandar Micic --- runtime/oti/j9accessbarrier.h | 13 ++---- runtime/oti/j9accessbarrierhelpers.h | 69 ++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/runtime/oti/j9accessbarrier.h b/runtime/oti/j9accessbarrier.h index 86eb5494ed9..4fb381b0dfb 100644 --- a/runtime/oti/j9accessbarrier.h +++ b/runtime/oti/j9accessbarrier.h @@ -161,14 +161,11 @@ typedef struct J9IndexableObject* mm_j9array_t; * else * discontiguous */ -#define J9JAVAARRAY_EA(vmThread, array, index, elemType) \ - ((J9IndexableObjectLayout_NoDataAddr_NoArraylet == (vmThread)->indexableObjectLayout) \ - ? J9JAVAARRAYCONTIGUOUS_BASE_EA(vmThread, array, index, elemType) \ - : ((J9IndexableObjectLayout_DataAddr_NoArraylet == (vmThread)->indexableObjectLayout) \ - ? J9JAVAARRAYCONTIGUOUS_WITH_DATAADDRESS_VIRTUALLARGEOBJECTHEAPENABLED_EA(vmThread, array, index, elemType) \ - : (J9ISCONTIGUOUSARRAY(vmThread, array) \ - ? J9JAVAARRAYCONTIGUOUS_EA(vmThread, array, index, elemType) \ - : J9JAVAARRAYDISCONTIGUOUS_EA(vmThread, array, index, elemType)))) + + + +/* Effective Address calculations for callers using vmThread are passed to the C implementation, which may force outlining on some platforms. 
*/ +#define J9JAVAARRAY_EA(vmThread, array, index, elemType) j9javaArray_##elemType##_EA(vmThread, (J9IndexableObject *)(array), index) #define J9JAVAARRAY_EA_VM(javaVM, array, index, elemType) \ ((J9IndexableObjectLayout_NoDataAddr_NoArraylet == (javaVM)->indexableObjectLayout) \ diff --git a/runtime/oti/j9accessbarrierhelpers.h b/runtime/oti/j9accessbarrierhelpers.h index e97a2efbea0..06ca460a7bf 100644 --- a/runtime/oti/j9accessbarrierhelpers.h +++ b/runtime/oti/j9accessbarrierhelpers.h @@ -23,6 +23,75 @@ #ifndef J9ACCESSBARRIERHELPERS_H #define J9ACCESSBARRIERHELPERS_H +#if defined (J9VM_ENV_DATA64) +#if (defined(__GNUC__) && (defined(J9HAMMER) || defined(S390))) +/* Forcing non-inlining on GNU for X and Z, where inlining seems to create much register or code cache pressure within Bytecode Interpreter */ +__attribute__ ((noinline)) +#else /* (defined(__GNUC__) && (defined(J9HAMMER) || defined(S390))) */ +VMINLINE +#endif /* (defined(__GNUC__) && (defined(J9HAMMER) || defined(S390))) */ +static UDATA j9javaArray_BA(J9VMThread *vmThread, J9IndexableObject *array, UDATA *index, U_8 elementSize) +{ + UDATA baseAddress = (UDATA)array; + /* Returns the address to which the caller adds (*index * elementSize) to reach the element; in the discontiguous case below *index is rewritten to be relative to the selected leaf. Start just past the contiguous header, whose size depends on whether references are compressed. */ + if (J9VMTHREAD_COMPRESS_OBJECT_REFERENCES(vmThread)) { + baseAddress += sizeof(J9IndexableObjectContiguousCompressed); + } else { + baseAddress += sizeof(J9IndexableObjectContiguousFull); + } + /* Adjust (or recompute) the base according to the indexable object layout recorded on the thread */ + if (J9IndexableObjectLayout_NoDataAddr_NoArraylet == vmThread->indexableObjectLayout) { + /* Standard GCs: nothing extra to do - just explicitly listed for clarity */ + } else if (J9IndexableObjectLayout_DataAddr_NoArraylet == vmThread->indexableObjectLayout) { + /* Balanced Offheap; dereference dataAddr that is just after the (base) header */ + baseAddress = *(UDATA *)baseAddress; + } else { + /* GCs that may have arraylet (Balanced arraylet or Metronome) - will recalculate baseAddress from scratch */ + if (J9ISCONTIGUOUSARRAY(vmThread, array)) { + baseAddress = (UDATA)array + vmThread->contiguousIndexableHeaderSize; + } else { +
fj9object_t *arrayoid = (fj9object_t *)((UDATA)array + vmThread->discontiguousIndexableHeaderSize); + /* While arrayletLeafSize is UDATA, the result of this division will fit into U_32 (simply because Java can't have more array elements) */ + U_32 elementsPerLeaf = (U_32)(J9VMTHREAD_JAVAVM(vmThread)->arrayletLeafSize / elementSize); + U_32 leafIndex = ((U_32)*index) / elementsPerLeaf; + *index = ((U_32)*index) % elementsPerLeaf; + /* Read the leaf entry from the arrayoid: a 32-bit token converted to a pointer when references are compressed, otherwise a full-width value used as the address directly */ + if (J9VMTHREAD_COMPRESS_OBJECT_REFERENCES(vmThread)) { + U_32 leafToken = *((U_32 *)arrayoid + leafIndex); + baseAddress = (UDATA)J9_CONVERT_POINTER_FROM_TOKEN__(vmThread, leafToken); + } else { + UDATA leafToken = *((UDATA *)arrayoid + leafIndex); + baseAddress = leafToken; + } + } + } + + /* *index has been modified only on the discontiguous path; on every other path it is left as passed in */ + return baseAddress; +} + /* Defines j9javaArray_<elemType>_EA(), which returns a pointer to the element at 'index': the base address from j9javaArray_BA() plus the (possibly leaf-relative) index scaled by sizeof(elemType) */ +#define J9JAVAARRAY_C_EA(elemType) \ +VMINLINE static elemType *j9javaArray_##elemType##_EA(J9VMThread *vmThread, J9IndexableObject *array, UDATA index) \ +{ \ + UDATA baseAddress = j9javaArray_BA(vmThread, array, &index, (U_8)sizeof(elemType)); \ + /* Intentionally inlining this to treat sizeof value as an immediate value */ \ + return (elemType *)(baseAddress + index * sizeof(elemType)); \ +} \ + +/* Generate the j9javaArray_<elemType>_EA() bodies, one per element type listed below. NOTE(review): these exist only when J9VM_ENV_DATA64 is defined, while the J9JAVAARRAY_EA define shown in j9accessbarrier.h has no visible matching guard - confirm non-64-bit builds are covered. */ + +J9JAVAARRAY_C_EA(I_8) +J9JAVAARRAY_C_EA(U_8) +J9JAVAARRAY_C_EA(I_16) +J9JAVAARRAY_C_EA(U_16) +J9JAVAARRAY_C_EA(I_32) +J9JAVAARRAY_C_EA(U_32) +J9JAVAARRAY_C_EA(I_64) +J9JAVAARRAY_C_EA(U_64) +J9JAVAARRAY_C_EA(IDATA) +J9JAVAARRAY_C_EA(UDATA) +#endif /* defined (J9VM_ENV_DATA64) */ + /** * These helpers could be written as macros (where the body of methods would be wrapped around oval parenthesis, which would mean that the last expression in the block * is return value of the block). However, it is not fully supported by ANSI, but only select C compilers, like GNU C: https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html).