-
-
Notifications
You must be signed in to change notification settings - Fork 206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update Optimizer (float arithmetic and more) #472
base: master
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1914,106 +1914,87 @@ OP_BREAK: | |
; OP_FLOAT_MUL
; Multiplies the single-precision value at [esi+4] by the one at [esi+8]
; and leaves the 32-bit result pattern in eax. xmm0 is used as scratch.
; The x87 sequence (fld/fmul/fstp/pop eax) that used to precede the SSE
; code was removed: its eax result was overwritten by the movd below.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_MUL:
        GO_ON   j_float_mul, OP_FLOAT_DIV
j_float_mul:
        movss   xmm0, dword [esi+4]     ; xmm0 = first operand
        mulss   xmm0, dword [esi+8]     ; xmm0 *= second operand
        movd    eax, xmm0               ; eax = raw bits of the product
        CHECKCODESIZE j_float_mul
|
||
; OP_FLOAT_DIV
; Divides the single-precision value at [esi+4] by the one at [esi+8]
; and leaves the 32-bit result pattern in eax. xmm0 is used as scratch.
; The x87 sequence (fld/fdiv/fstp/pop eax) that used to precede the SSE
; code was removed: its eax result was overwritten by the movd below.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_DIV:
        GO_ON   j_float_div, OP_FLOAT_ADD
j_float_div:
        movss   xmm0, dword [esi+4]     ; xmm0 = dividend
        divss   xmm0, dword [esi+8]     ; xmm0 /= divisor
        movd    eax, xmm0               ; eax = raw bits of the quotient
        CHECKCODESIZE j_float_div
|
||
; OP_FLOAT_ADD
; Adds the single-precision values at [esi+4] and [esi+8] and leaves the
; 32-bit result pattern in eax. xmm0 is used as scratch.
; The x87 sequence (fld/fadd/fstp/pop eax) that used to precede the SSE
; code was removed: its eax result was overwritten by the movd below.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_ADD:
        GO_ON   j_float_add, OP_FLOAT_SUB
j_float_add:
        movss   xmm0, dword [esi+4]     ; xmm0 = first addend
        addss   xmm0, dword [esi+8]     ; xmm0 += second addend
        movd    eax, xmm0               ; eax = raw bits of the sum
        CHECKCODESIZE j_float_add
|
||
; OP_FLOAT_SUB
; Subtracts the single-precision value at [esi+8] from the one at [esi+4]
; and leaves the 32-bit result pattern in eax. xmm0 is used as scratch.
; The x87 sequence (fld/fsub/fstp/pop eax) that used to precede the SSE
; code was removed: its eax result was overwritten by the movd below.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_SUB:
        GO_ON   j_float_sub, OP_FLOAT_TO
j_float_sub:
        movss   xmm0, dword [esi+4]     ; xmm0 = minuend
        subss   xmm0, dword [esi+8]     ; xmm0 -= subtrahend
        movd    eax, xmm0               ; eax = raw bits of the difference
        CHECKCODESIZE j_float_sub
|
||
; OP_FLOAT_TO
; Converts the signed 32-bit integer at [esi+4] to single precision and
; leaves the 32-bit float pattern in eax. xmm0 is used as scratch.
; The x87 sequence (fild/fstp/pop eax) that used to precede the SSE code
; was removed: its eax result was overwritten by the movd below.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_TO:
        GO_ON   j_float_to, OP_FLOAT_ROUND
j_float_to:
        cvtsi2ss xmm0, dword [esi+4]    ; xmm0 = (float)int operand
        movd    eax, xmm0               ; eax = raw bits of the float
        CHECKCODESIZE j_float_to
|
||
; OP_FLOAT_ROUND
; Converts the single-precision value at [esi+4] to a signed 32-bit integer
; in eax, using the method selected by the dword at [esi+8] (masked to its
; low two bits, as the x87 implementation did):
;   0 = round   -> floor(x + [g_round_nearest])   (legacy behaviour)
;   1 = floor   -> toward -infinity
;   2 = ceil    -> toward +infinity
;   3 = zero    -> truncate toward zero
; Clobbers eax, ebp, xmm0, xmm1 and flags. ebp is used as scratch, as the
; code this replaces already did.
;
; Fixes relative to the previous SSE attempt (raised in review):
;   * floor was trunc(x) - signbit(x), which is off by one for negative
;     integral inputs (floor(-2.0) produced -3);
;   * ceil was round-to-nearest of x + 0.5, which is off by one for odd
;     integral inputs (ceil(3.0) produced 4);
;   * round used cvtss2si (nearest-even), so 2.5 became 2 instead of the
;     3 that floor(x + 0.5) returns; the legacy result is restored here.
; The dead x87 sequence that preceded the SSE code was removed as well.
;
; NOTE(review): g_round_nearest is defined outside this excerpt; it is
; presumably the float constant 0.5 -- confirm.
; GO_ON and CHECKCODESIZE are macros defined outside this excerpt.
OP_FLOAT_ROUND:
        GO_ON   j_float_round, OP_FLOAT_CMP
j_float_round:
        ; Work in double precision: every float converts exactly, and the
        ; bias addition cannot round the way a single-precision add could.
        cvtss2sd xmm0, dword [esi+4]    ; xmm0 = (double)x
        mov     ebp, dword [esi+8]
        and     ebp, 3                  ; sanity: method in 0..3
        jnz     .truncate
        ; method 0: bias the value, then finish through the floor path
        cvtss2sd xmm1, dword [g_round_nearest]
        addsd   xmm0, xmm1              ; xmm0 = x + bias
        inc     ebp                     ; ebp = 1 (floor)
.truncate:
        cvttsd2si eax, xmm0             ; eax = trunc(value)
        ; 0x80000000 is the "integer indefinite" result for NaN and
        ; out-of-range inputs (and the exact result for -2^31): return it
        ; untouched rather than adjusting it.
        cmp     eax, 0x80000000
        je      .done
        cvtsi2sd xmm1, eax              ; xmm1 = (double)trunc(value)
        cmp     ebp, 2
        ja      .done                   ; method 3: truncation is the answer
        je      .ceil                   ; method 2
        ; floor: truncation rounded up only if value < trunc(value)
        comisd  xmm0, xmm1              ; CF = (value < trunc)
        sbb     eax, 0                  ; eax -= CF
        jmp     .done
.ceil:
        ; ceil: truncation rounded down only if trunc(value) < value
        comisd  xmm1, xmm0              ; CF = (trunc < value)
        adc     eax, 0                  ; eax += CF
.done:
        CHECKCODESIZE j_float_round
|
||
OP_FLOAT_CMP: | ||
GO_ON j_float_cmp, OP_INVALID | ||
j_float_cmp: | ||
fld dword [esi+8] | ||
fld dword [esi+4] | ||
fucompp | ||
fnstsw ax | ||
fwait | ||
sahf | ||
movss xmm0, dword [esi+8] | ||
movss xmm1, dword [esi+4] | ||
ucomiss xmm1, xmm0 | ||
cmovz eax, [g_flagsjit+4] | ||
cmova eax, [g_flagsjit+8] | ||
cmovb eax, [g_flagsjit+0] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
#ifndef __AMXXLOG_H__ | ||
#define __AMXXLOG_H__ | ||
|
||
|
||
Review comment: This boosts performance by 1000% |
||
class CLog | ||
{ | ||
private: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#include "cpuinfo.h" | ||
|
||
|
||
// cpuid(info, infoType): executes CPUID for leaf `infoType`, sub-leaf 0, and
// stores EAX, EBX, ECX, EDX in info[0..3]. The helper is file-local on every
// platform so the generic name `cpuid` cannot clash with other translation
// units (it used to be a macro on Windows and an external-linkage function
// elsewhere).
#ifdef _WIN32

// Windows: MSVC intrinsic
#include <intrin.h>
static void cpuid(int info[4], int infoType)
{
	__cpuidex(info, infoType, 0);
}

#else

// GCC/Clang: <cpuid.h> helper macro
#include <cpuid.h>
static void cpuid(int info[4], int infoType)
{
	__cpuid_count(infoType, 0, info[0], info[1], info[2], info[3]);
}

#endif
|
||
// Returns true when bit `bit` (0-31) is set in a CPUID output register.
// The register is treated as unsigned so testing bit 31 never shifts into
// the sign bit of an int.
static bool cpuid_bit(int reg, unsigned bit)
{
	return ((static_cast<unsigned>(reg) >> bit) & 1u) != 0;
}

// CPU Info
//
// Queries CPUID once and caches the feature bits exposed by the has_*()
// accessors. Every flag starts out false, so a flag whose CPUID leaf is not
// available on this processor reads as "not supported" instead of holding an
// indeterminate value.
//
// Only the CPUID feature bits are examined; nothing here checks whether the
// operating system has enabled the corresponding register state.
CPUInfo::CPUInfo()
	: m_has_cmov(false)
	, m_has_mmx(false)
	, m_has_sse(false)
	, m_has_sse2(false)
	, m_has_sse3(false)
	, m_has_ssse3(false)
	, m_has_sse4_1(false)
	, m_has_sse4_2(false)
	, m_has_clmul(false)
	, m_has_fma3(false)
	, m_has_aes(false)
	, m_has_avx(false)
	, m_has_avx2(false)
	, m_has_f16c(false)
	, m_has_rdrand(false)
	, m_has_xop(false)
	, m_has_avx512F(false)
	, m_has_avx512CD(false)
	, m_has_avx512PF(false)
	, m_has_avx512ER(false)
	, m_has_avx512VL(false)
	, m_has_avx512BW(false)
	, m_has_avx512DQ(false)
	, m_has_avx512IFMA(false)
	, m_has_avx512VBMI(false)
{
	int info[4];

	// Leaf 0: EAX = highest supported basic leaf.
	cpuid(info, 0);
	int nIds = info[0];

	// Leaf 0x80000000: EAX = highest supported extended leaf.
	cpuid(info, 0x80000000);
	unsigned nExIds = info[0];

	// Detect Features
	if (nIds >= 0x00000001)
	{
		// Leaf 1: info[3] = EDX, info[2] = ECX.
		cpuid(info, 0x00000001);
		m_has_cmov   = cpuid_bit(info[3], 15);
		m_has_mmx    = cpuid_bit(info[3], 23);

		m_has_sse    = cpuid_bit(info[3], 25);
		m_has_sse2   = cpuid_bit(info[3], 26);
		m_has_sse3   = cpuid_bit(info[2], 0);
		m_has_ssse3  = cpuid_bit(info[2], 9);
		m_has_sse4_1 = cpuid_bit(info[2], 19);
		m_has_sse4_2 = cpuid_bit(info[2], 20);

		m_has_clmul  = cpuid_bit(info[2], 1);
		m_has_fma3   = cpuid_bit(info[2], 12);
		m_has_aes    = cpuid_bit(info[2], 25);
		m_has_avx    = cpuid_bit(info[2], 28);
		m_has_f16c   = cpuid_bit(info[2], 29);
		m_has_rdrand = cpuid_bit(info[2], 30);
	}
	if (nIds >= 0x00000007)
	{
		// Leaf 7, sub-leaf 0: info[1] = EBX, info[2] = ECX.
		cpuid(info, 0x00000007);

		m_has_avx2       = cpuid_bit(info[1], 5);
		m_has_avx512F    = cpuid_bit(info[1], 16);
		m_has_avx512CD   = cpuid_bit(info[1], 28);
		m_has_avx512PF   = cpuid_bit(info[1], 26);
		m_has_avx512ER   = cpuid_bit(info[1], 27);
		m_has_avx512VL   = cpuid_bit(info[1], 31);
		m_has_avx512BW   = cpuid_bit(info[1], 30);
		m_has_avx512DQ   = cpuid_bit(info[1], 17);
		m_has_avx512IFMA = cpuid_bit(info[1], 21);
		m_has_avx512VBMI = cpuid_bit(info[2], 1);
	}
	if (nExIds >= 0x80000001)
	{
		// Extended leaf 0x80000001: info[2] = ECX.
		cpuid(info, 0x80000001);

		m_has_xop = cpuid_bit(info[2], 11);
	}
}
|
||
|
||
bool CPUInfo::has_cmov() const { return m_has_cmov; } | ||
bool CPUInfo::has_mmx() const { return m_has_mmx; } | ||
bool CPUInfo::has_sse() const { return m_has_sse; } | ||
bool CPUInfo::has_sse2() const { return m_has_sse2; } | ||
bool CPUInfo::has_sse3() const { return m_has_sse3; } | ||
bool CPUInfo::has_ssse3() const { return m_has_ssse3; } | ||
bool CPUInfo::has_sse4_1() const { return m_has_sse4_1; } | ||
bool CPUInfo::has_sse4_2() const { return m_has_sse4_2; } | ||
bool CPUInfo::has_clmul() const { return m_has_clmul; } | ||
bool CPUInfo::has_fma3() const { return m_has_fma3; } | ||
bool CPUInfo::has_aes() const { return m_has_aes; } | ||
bool CPUInfo::has_avx() const { return m_has_avx; } | ||
bool CPUInfo::has_avx2() const { return m_has_avx2; } | ||
bool CPUInfo::has_f16c() const { return m_has_f16c; } | ||
bool CPUInfo::has_rdrand() const { return m_has_rdrand; } | ||
bool CPUInfo::has_xop() const { return m_has_xop; } | ||
bool CPUInfo::has_avx512F() const { return m_has_avx512F; } | ||
bool CPUInfo::has_avx512CD() const { return m_has_avx512CD; } | ||
bool CPUInfo::has_avx512PF() const { return m_has_avx512PF; } | ||
bool CPUInfo::has_avx512ER() const { return m_has_avx512ER; } | ||
bool CPUInfo::has_avx512VL() const { return m_has_avx512VL; } | ||
bool CPUInfo::has_avx512BW() const { return m_has_avx512BW; } | ||
bool CPUInfo::has_avx512DQ() const { return m_has_avx512DQ; } | ||
bool CPUInfo::has_avx512IFMA() const { return m_has_avx512IFMA; } | ||
bool CPUInfo::has_avx512VBMI() const { return m_has_avx512VBMI; } | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#ifndef _INCLUDE_AMXMODX_CPUINFO_H | ||
#define _INCLUDE_AMXMODX_CPUINFO_H | ||
|
||
// Holds one boolean per processor feature and exposes each through a
// read-only has_*() query.
class CPUInfo
{
public:
	CPUInfo();

	// Baseline integer / MMX
	bool has_cmov() const;
	bool has_mmx() const;

	// SSE family
	bool has_sse() const;
	bool has_sse2() const;
	bool has_sse3() const;
	bool has_ssse3() const;
	bool has_sse4_1() const;
	bool has_sse4_2() const;

	// Miscellaneous extensions
	bool has_clmul() const;
	bool has_fma3() const;
	bool has_aes() const;
	bool has_avx() const;
	bool has_avx2() const;
	bool has_f16c() const;
	bool has_rdrand() const;
	bool has_xop() const;

	// AVX-512 subsets
	bool has_avx512F() const;
	bool has_avx512CD() const;
	bool has_avx512PF() const;
	bool has_avx512ER() const;
	bool has_avx512VL() const;
	bool has_avx512BW() const;
	bool has_avx512DQ() const;
	bool has_avx512IFMA() const;
	bool has_avx512VBMI() const;

private:
	// Cached feature flags; declaration order is kept as-is so the object
	// layout does not change.
	bool m_has_cmov;
	bool m_has_mmx;

	bool m_has_sse;
	bool m_has_sse2;
	bool m_has_sse3;
	bool m_has_ssse3;
	bool m_has_sse4_1;
	bool m_has_sse4_2;

	bool m_has_clmul;
	bool m_has_fma3;
	bool m_has_aes;
	bool m_has_avx;
	bool m_has_avx2;
	bool m_has_f16c;
	bool m_has_rdrand;
	bool m_has_xop;

	bool m_has_avx512F;
	bool m_has_avx512CD;
	bool m_has_avx512PF;
	bool m_has_avx512ER;
	bool m_has_avx512VL;
	bool m_has_avx512BW;
	bool m_has_avx512DQ;
	bool m_has_avx512IFMA;
	bool m_has_avx512VBMI;
};
|
||
|
||
#endif //_INCLUDE_AMXMODX_CPUINFO_H |
Review comment:
This is correct, but it breaks compatibility. Oddly, amxmodx doesn't do regular "banker's rounding"; instead it basically computes `floor(x + 0.5)`. Therefore, without this change, rounding 2.5 gives you 3, and with your change rounding 2.5 gives you 2.

Personally, I wouldn't mind breaking compatibility here, but considering this could easily break some plugins, Arkshine and others might not agree with this change, and they'd probably be right.
See this PR for some extra info.