Use SSE when available for floating-point operations (bug 5841, r=her).

This commit is contained in:
David Anderson 2013-08-08 20:26:36 -07:00
parent f031ad23f6
commit bf325b72f1
7 changed files with 339 additions and 41 deletions

View File

@ -0,0 +1,33 @@
/**
* vim: set ts=8 sts=2 sw=2 tw=99 et:
* =============================================================================
* SourcePawn JIT SDK
* Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved.
* =============================================================================
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, version 3.0, as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, AlliedModders LLC gives you permission to link the
* code of this program (as well as its derivative works) to "Half-Life 2," the
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
* by the Valve Corporation. You must obey the GNU General Public License in
* all respects for all other code used. Additionally, AlliedModders LLC grants
* this exception to all derivative works. AlliedModders LLC defines further
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
* or <http://www.sourcemod.net/license.php>.
*
* Version: $Id$
*/
#include <assembler-x86.h>
// Out-of-line definition (storage) for the static feature table declared in
// AssemblerX86. The CPUFeatures constructor clears every flag, so all features
// read as unavailable until SetFeatures() installs detected values.
CPUFeatures AssemblerX86::X86Features;

View File

@ -94,6 +94,25 @@ struct FloatRegister
}
};
// Flags describing which x86 instruction-set extensions are usable. One flag
// per extension; a flag is true only once detection code has set it.
struct CPUFeatures
{
  // Start with every feature marked as unavailable.
  CPUFeatures()
    : fpu(false),
      mmx(false),
      sse(false),
      sse2(false),
      sse3(false),
      ssse3(false),
      sse4_1(false),
      sse4_2(false),
      avx(false),
      avx2(false)
  {
  }

  bool fpu;
  bool mmx;
  bool sse;
  bool sse2;
  bool sse3;
  bool ssse3;
  bool sse4_1;
  bool sse4_2;
  bool avx;
  bool avx2;
};
const Register eax = { 0 };
const Register ecx = { 1 };
const Register edx = { 2 };
@ -299,7 +318,19 @@ struct Operand
class AssemblerX86 : public Assembler
{
private:
// Table of processor features shared by every assembler instance (it is a
// static member). It must be filled in at startup through SetFeatures()
// before any feature-gated instruction emitter is used; the emitters assert
// on these flags.
static CPUFeatures X86Features;
public:
// Installs |features| as the class-wide feature table by copying it into
// X86Features. Later calls overwrite the previous contents.
static void SetFeatures(const CPUFeatures &features) {
X86Features = features;
}
// Returns a read-only reference to the class-wide feature table
// (X86Features). The reference aliases the static member, so it reflects any
// later SetFeatures() call.
static const CPUFeatures &Features() {
return X86Features;
}
// Emits a register-to-register 32-bit move using opcode 0x89
// (MOV r/m32, r32): |src| is passed as the reg operand and |dest| as the
// r/m operand.
void movl(Register dest, Register src) {
emit1(0x89, src.code, dest.code);
}
@ -712,6 +743,74 @@ class AssemblerX86 : public Assembler
outOfMemory_ = true;
}
// Emits the two-byte CPUID instruction (0F A2). The instruction takes its
// leaf selector from EAX (and ECX for sub-leaves) and overwrites EAX, EBX,
// ECX and EDX when executed, so callers must preserve any of those they need.
void cpuid() {
emit2(0x0f, 0xa2);
}
// SSE operations can only be used if the feature detection function has
// been run *and* detected the appropriate level of functionality.
// Emits MOVSS xmm, xmm/m32 (F3 0F 10 /r): loads a scalar single-precision
// value from |src| into |dest|. Asserts that SSE was detected.
void movss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x10, dest.code, src);
}
// Emits CVTTSS2SI r32, xmm (F3 0F 2C /r): converts a scalar single to a
// 32-bit integer with truncation, writing |dest|. Asserts that SSE was
// detected.
// NOTE(review): |src| is typed Register, but for this opcode the r/m register
// operand names an XMM register; presumably callers pass the XMM register
// index through src.code - confirm.
void cvttss2si(Register dest, Register src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2c, dest.code, src.code);
}
// Emits CVTTSS2SI r32, m32 (F3 0F 2C /r): truncating conversion of the
// single-precision value described by |src| into |dest|. Asserts that SSE was
// detected.
void cvttss2si(Register dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2c, dest.code, src);
}
// Emits CVTSS2SI r32, xmm (F3 0F 2D /r): converts a scalar single to a
// 32-bit integer using the rounding mode currently selected in MXCSR, writing
// |dest|. Asserts that SSE was detected.
// NOTE(review): |src| is typed Register, but for this opcode the r/m register
// operand names an XMM register; presumably callers pass the XMM register
// index through src.code - confirm.
void cvtss2si(Register dest, Register src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2d, dest.code, src.code);
}
// Emits CVTSS2SI r32, m32 (F3 0F 2D /r): MXCSR-rounded conversion of the
// single-precision value described by |src| into |dest|. Asserts that SSE was
// detected.
void cvtss2si(Register dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2d, dest.code, src);
}
// Emits CVTSI2SS xmm, r32 (F3 0F 2A /r): converts the 32-bit integer in
// |src| to a scalar single in |dest|. Asserts that SSE was detected.
void cvtsi2ss(FloatRegister dest, Register src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2a, dest.code, src.code);
}
// Emits CVTSI2SS xmm, m32 (F3 0F 2A /r): converts the 32-bit integer
// described by |src| to a scalar single in |dest|. Asserts that SSE was
// detected.
void cvtsi2ss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x2a, dest.code, src);
}
// Emits ADDSS xmm, xmm/m32 (F3 0F 58 /r): dest = dest + src as scalar
// singles. Asserts that SSE was detected.
void addss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x58, dest.code, src);
}
// Emits SUBSS xmm, xmm/m32 (F3 0F 5C /r): dest = dest - src as scalar
// singles. Asserts that SSE was detected.
void subss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x5c, dest.code, src);
}
// Emits MULSS xmm, xmm/m32 (F3 0F 59 /r): dest = dest * src as scalar
// singles. Asserts that SSE was detected.
void mulss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x59, dest.code, src);
}
// Emits DIVSS xmm, xmm/m32 (F3 0F 5E /r): dest = dest / src as scalar
// singles. Asserts that SSE was detected.
void divss(FloatRegister dest, const Operand &src) {
assert(Features().sse);
emit3(0xf3, 0x0f, 0x5e, dest.code, src);
}
// Emits UCOMISS xmm, xmm (0F 2E /r): unordered compare of two scalar singles
// that reports its result through EFLAGS. Like every other SSE emitter in
// this class, it asserts that SSE was detected before encoding anything.
// NOTE(review): |right| is typed Register, but for this opcode the r/m
// register operand names an XMM register; presumably callers pass the XMM
// register index through right.code - confirm.
void ucomiss(FloatRegister left, Register right) {
  assert(Features().sse);
  emit2(0x0f, 0x2e, left.code, right.code);
}
// Emits UCOMISS xmm, m32 (0F 2E /r): unordered compare of |left| against the
// single-precision value described by |right|, reporting through EFLAGS.
// Asserts that SSE was detected.
void ucomiss(FloatRegister left, const Operand &right) {
  assert(Features().sse);
  emit2(0x0f, 0x2e, left.code, right);
}
// SSE2-only instructions.
// Emits MOVD r32, xmm (66 0F 7E /r): copies the low 32 bits of |src| into the
// general-purpose register |dest|. Asserts that SSE2 was detected.
//
// For this opcode the ModRM reg field selects the XMM *source* and the r/m
// field selects the general-purpose *destination*, so src.code goes in the
// reg slot and dest.code in the r/m slot. (Passing them the other way round
// only happens to work when both register codes are equal, e.g. eax/xmm0.)
void movd(Register dest, FloatRegister src) {
  assert(Features().sse2);
  emit3(0x66, 0x0f, 0x7e, src.code, dest.code);
}
// Emits opcode 66 0F 7E /r with dest.code in the ModRM reg field and |src| as
// the r/m operand. Asserts that SSE2 was detected.
// NOTE(review): 66 0F 7E /r is MOVD r/m32, xmm, i.e. it *stores* the XMM
// register selected by the reg field into the r/m operand. As encoded here
// that looks like a store of xmm<dest.code> to the memory at |src| rather
// than a load into a general-purpose register, which does not match the
// parameter names/types - confirm the intended instruction before using this
// overload.
void movd(Register dest, const Operand &src) {
assert(Features().sse2);
emit3(0x66, 0x0f, 0x7e, dest.code, src);
}
static void PatchRel32Absolute(uint8_t *ip, void *ptr) {
int32_t delta = uint32_t(ptr) - uint32_t(ip);
*reinterpret_cast<int32_t *>(ip - 4) = delta;
@ -806,6 +905,22 @@ class AssemblerX86 : public Assembler
emit(reg, operand);
}
// Appends a three-byte opcode sequence (|prefix1|, |prefix2|, |opcode|, in
// that order) at the current output position and advances the position by
// three. ensureSpace() is called first to make room in the buffer.
void emit3(uint8_t prefix1, uint8_t prefix2, uint8_t opcode) {
ensureSpace();
*pos_++ = prefix1;
*pos_++ = prefix2;
*pos_++ = opcode;
}
// Appends a three-byte opcode sequence followed by a register-direct ModRM
// byte: kModeReg in the top two bits, |reg| in bits 3-5 and |opreg| in the
// low three bits.
//
// Both register codes must fit in three bits; a larger |opreg| would spill
// into the reg/mode bits and silently encode a different instruction, so both
// are validated before any byte is written.
void emit3(uint8_t prefix1, uint8_t prefix2, uint8_t opcode, uint8_t reg, uint8_t opreg) {
  assert(reg <= 7);
  assert(opreg <= 7);
  emit3(prefix1, prefix2, opcode);
  *pos_++ = (kModeReg << 6) | (reg << 3) | opreg;
}
// Appends a three-byte opcode sequence and then delegates to
// emit(reg, operand) to encode |reg| together with the operand bytes for
// |operand|.
void emit3(uint8_t prefix1, uint8_t prefix2, uint8_t opcode, uint8_t reg, const Operand &operand) {
emit3(prefix1, prefix2, opcode);
emit(reg, operand);
}
template <typename T>
void shift_cl(const T &t, uint8_t r) {
emit1(0xd3, r, t);

View File

@ -0,0 +1,104 @@
/**
* vim: set ts=8 sts=2 sw=2 tw=99 et:
* =============================================================================
* SourcePawn JIT SDK
* Copyright (C) 2004-2008 AlliedModders LLC. All rights reserved.
* =============================================================================
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, version 3.0, as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, AlliedModders LLC gives you permission to link the
* code of this program (as well as its derivative works) to "Half-Life 2," the
* "Source Engine," the "SourcePawn JIT," and any Game MODs that run on software
* by the Valve Corporation. You must obey the GNU General Public License in
* all respects for all other code used. Additionally, AlliedModders LLC grants
* this exception to all derivative works. AlliedModders LLC defines further
* exceptions, found in LICENSE.txt (as of this writing, version JULY-31-2007),
* or <http://www.sourcemod.net/license.php>.
*
* Version: $Id$
*/
#ifndef _include_sourcepawn_macroassembler_x86h__
#define _include_sourcepawn_macroassembler_x86h__
#include <assembler.h>
#include <ke_vector.h>
#include <string.h>
#include <assembler-x86.h>
// Assembler extension that knows how to build and run a small CPUID probe so
// the class-wide CPUFeatures table can be filled in at startup.
class MacroAssemblerX86 : public AssemblerX86
{
 public:
  // Emits into |masm| a function with three stack-passed pointer arguments,
  // read from [ebp+8], [ebp+12] and [ebp+16] after a standard frame setup:
  //   arg 0 receives ECX from CPUID leaf 1,
  //   arg 1 receives EDX from CPUID leaf 1,
  //   arg 2 receives EBX from CPUID leaf 7 (sub-leaf 0), or 0 when the
  //         processor reports a maximum basic leaf below 7.
  // EBX is saved and restored around the probe because CPUID overwrites it.
  static void GenerateFeatureDetection(MacroAssemblerX86 &masm) {
    masm.push(ebp);
    masm.movl(ebp, esp);
    masm.push(ebx);

    {
      // Get ECX, EDX feature bits at the first CPUID level.
      masm.movl(eax, 1);
      masm.cpuid();
      masm.movl(eax, Operand(ebp, 8));
      masm.movl(Operand(eax, 0), ecx);
      masm.movl(eax, Operand(ebp, 12));
      masm.movl(Operand(eax, 0), edx);
    }

    // Zero out bits we're not guaranteed to get.
    masm.movl(eax, Operand(ebp, 16));
    masm.movl(Operand(eax, 0), 0);

    Label skip_level_7;
    {
      // Get EBX feature bits at 7th CPUID level. Leaf 0 returns the highest
      // supported basic leaf in EAX; leaf 7 is only queried when it exists.
      masm.movl(eax, 0);
      masm.cpuid();
      masm.cmpl(eax, 7);
      masm.j(below, &skip_level_7);
      masm.movl(eax, 7);
      masm.movl(ecx, 0);
      masm.cpuid();
      masm.movl(eax, Operand(ebp, 16));
      masm.movl(Operand(eax, 0), ebx);
    }
    masm.bind(&skip_level_7);

    masm.pop(ebx);
    masm.pop(ebp);
    masm.ret();
  }

  // Calls the probe at |code| (which must be executable code produced from
  // GenerateFeatureDetection), decodes the returned CPUID bits and installs
  // the result with SetFeatures().
  //
  // AVX is only reported when the OSXSAVE bit is also set: without operating
  // system support for extended state, AVX instructions fault even if the
  // processor implements them. AVX2 is in turn only reported together with
  // AVX.
  // NOTE(review): full AVX detection additionally requires reading XCR0 via
  // XGETBV to confirm the OS enabled YMM state; this assembler has no XGETBV
  // emitter here, so that check is not performed.
  static void RunFeatureDetection(void *code) {
    typedef void (*fn_t)(int *reg_ecx, int *reg_edx, int *reg_ebx);

    // Initialised defensively; the probe is expected to write all three.
    int reg_ecx = 0;
    int reg_edx = 0;
    int reg_ebx = 0;
    ((fn_t)code)(&reg_ecx, &reg_edx, &reg_ebx);

    CPUFeatures features;

    // CPUID leaf 1, EDX.
    features.fpu = !!(reg_edx & (1 << 0));
    features.mmx = !!(reg_edx & (1 << 23));
    features.sse = !!(reg_edx & (1 << 25));
    features.sse2 = !!(reg_edx & (1 << 26));

    // CPUID leaf 1, ECX.
    features.sse3 = !!(reg_ecx & (1 << 0));
    features.ssse3 = !!(reg_ecx & (1 << 9));
    features.sse4_1 = !!(reg_ecx & (1 << 19));
    features.sse4_2 = !!(reg_ecx & (1 << 20));

    // AVX needs both the processor bit (28) and OS XSAVE support (27).
    const bool osxsave = !!(reg_ecx & (1 << 27));
    features.avx = osxsave && !!(reg_ecx & (1 << 28));

    // CPUID leaf 7 sub-leaf 0, EBX. AVX2 is unusable without AVX.
    features.avx2 = features.avx && !!(reg_ebx & (1 << 5));

    SetFeatures(features);
  }

 private:
};
#endif // _include_sourcepawn_macroassembler_x86h__

View File

@ -37,7 +37,8 @@ binary.AddSourceFiles('sourcepawn/jit', [
'zlib/uncompr.c',
'zlib/zutil.c',
'md5/md5.cpp',
'../../knight/shared/KeCodeAllocator.cpp'
'../../knight/shared/KeCodeAllocator.cpp',
'../../public/jit/x86/assembler-x86.cpp'
])
SM.AutoVersion('sourcepawn/jit', binary)
SM.ExtractDebugInfo(extension, binary)

View File

@ -34,6 +34,7 @@ OBJECTS = dll_exports.cpp \
zlib/zutil.c \
OBJECTS += ../../knight/shared/KeCodeAllocator.cpp
OBJECTS += ../../public/jit/x86/assembler-x86.cpp
##############################################
### CONFIGURE ANY OTHER FLAGS/OPTIONS HERE ###
@ -75,6 +76,7 @@ ifeq "$(GCC_VERSION)" "4"
endif
OBJ_LINUX := $(OBJECTS:../../knight/shared/%.cpp=$(BIN_DIR)/knight/%.o)
OBJ_LINUX := $(OBJ_LINUX:../../public/jit/x86/%.cpp=$(BIN_DIR)/%.o)
OBJ_LINUX := $(OBJ_LINUX:%.cpp=$(BIN_DIR)/%.o)
OBJ_LINUX := $(OBJ_LINUX:%.c=$(BIN_DIR)/%.o)
@ -89,6 +91,9 @@ $(BIN_DIR)/%.o: %.cpp
$(BIN_DIR)/knight/%.o: ../../knight/shared/%.cpp
$(CXX) $(INCLUDE) $(CFLAGS) $(CXXFLAGS) -o $@ -c $<
$(BIN_DIR)/assembler-x86.o: ../../public/jit/x86/assembler-x86.cpp
$(CXX) $(INCLUDE) $(CFLAGS) $(CXXFLAGS) -o $@ -c $<
all:
mkdir -p $(BIN_DIR)/x86
mkdir -p $(BIN_DIR)/md5

View File

@ -983,10 +983,15 @@ Compiler::emitOp(OPCODE op)
break;
case OP_FLOAT:
if (MacroAssemblerX86::Features().sse2) {
__ cvtsi2ss(xmm0, Operand(edi, 0));
__ movd(pri, xmm0);
} else {
__ fild32(Operand(edi, 0));
__ subl(esp, 4);
__ fstp32(Operand(esp, 0));
__ pop(pri);
}
__ addl(stk, 4);
break;
@ -994,34 +999,52 @@ Compiler::emitOp(OPCODE op)
case OP_FLOATSUB:
case OP_FLOATMUL:
case OP_FLOATDIV:
if (MacroAssemblerX86::Features().sse2) {
__ movss(xmm0, Operand(stk, 0));
if (op == OP_FLOATADD)
__ addss(xmm0, Operand(stk, 4));
else if (op == OP_FLOATSUB)
__ subss(xmm0, Operand(stk, 4));
else if (op == OP_FLOATMUL)
__ mulss(xmm0, Operand(stk, 4));
else if (op == OP_FLOATDIV)
__ divss(xmm0, Operand(stk, 4));
__ movd(pri, xmm0);
} else {
__ subl(esp, 4);
__ fld32(Operand(edi, 0));
__ fld32(Operand(stk, 0));
if (op == OP_FLOATADD)
__ fadd32(Operand(edi, 4));
__ fadd32(Operand(stk, 4));
else if (op == OP_FLOATSUB)
__ fsub32(Operand(edi, 4));
__ fsub32(Operand(stk, 4));
else if (op == OP_FLOATMUL)
__ fmul32(Operand(edi, 4));
__ fmul32(Operand(stk, 4));
else if (op == OP_FLOATDIV)
__ fdiv32(Operand(edi, 4));
__ fdiv32(Operand(stk, 4));
__ fstp32(Operand(esp, 0));
__ pop(pri);
}
__ addl(stk, 8);
break;
case OP_RND_TO_NEAREST:
{
if (MacroAssemblerX86::Features().sse) {
// Assume no one is touching MXCSR.
__ cvtss2si(pri, Operand(stk, 0));
} else {
static float kRoundToNearest = 0.5f;
// From http://wurstcaptures.untergrund.net/assembler_tricks.html#fastfloorf
__ fld32(Operand(edi, 0));
__ fld32(Operand(stk, 0));
__ fadd32(st0, st0);
__ fadd32(Operand(ExternalAddress(&kRoundToNearest)));
__ subl(esp, 4);
__ fistp32(Operand(esp, 0));
__ pop(pri);
__ sarl(pri, 1);
}
__ addl(stk, 4);
break;
}
@ -1030,7 +1053,7 @@ Compiler::emitOp(OPCODE op)
{
static float kRoundToCeil = -0.5f;
// From http://wurstcaptures.untergrund.net/assembler_tricks.html#fastfloorf
__ fld32(Operand(edi, 0));
__ fld32(Operand(stk, 0));
__ fadd32(st0, st0);
__ fsubr32(Operand(ExternalAddress(&kRoundToCeil)));
__ subl(esp, 4);
@ -1043,7 +1066,10 @@ Compiler::emitOp(OPCODE op)
}
case OP_RND_TO_ZERO:
__ fld32(Operand(edi, 0));
if (MacroAssemblerX86::Features().sse) {
__ cvttss2si(pri, Operand(stk, 0));
} else {
__ fld32(Operand(stk, 0));
__ subl(esp, 8);
__ fstcw(Operand(esp, 4));
__ movl(Operand(esp, 0), 0xfff);
@ -1052,6 +1078,7 @@ Compiler::emitOp(OPCODE op)
__ pop(pri);
__ fldcw(Operand(esp, 0));
__ addl(esp, 4);
}
__ addl(stk, 4);
break;
@ -1071,10 +1098,15 @@ Compiler::emitOp(OPCODE op)
case OP_FLOATCMP:
{
Label bl, ab, done;
__ fld32(Operand(edi, 0));
__ fld32(Operand(edi, 4));
if (MacroAssemblerX86::Features().sse) {
__ movss(xmm0, Operand(stk, 4));
__ ucomiss(xmm0, Operand(stk, 0));
} else {
__ fld32(Operand(stk, 0));
__ fld32(Operand(stk, 4));
__ fucomip(st1);
__ fstp(st0);
}
__ j(above, &ab);
__ j(below, &bl);
__ xorl(pri, pri);
@ -1869,6 +1901,14 @@ bool JITX86::InitializeJIT()
if (!m_pJitGenArray)
return false;
MacroAssemblerX86 masm;
MacroAssemblerX86::GenerateFeatureDetection(masm);
void *code = LinkCode(masm);
if (!code)
return false;
MacroAssemblerX86::RunFeatureDetection(code);
KE_FreeCode(g_pCodeCache, code);
return true;
}

View File

@ -35,7 +35,7 @@
#include <sp_vm_types.h>
#include <sp_vm_api.h>
#include <KeCodeAllocator.h>
#include <assembler-x86.h>
#include <macro-assembler-x86.h>
#include <ke_vector.h>
#include "jit_shared.h"
#include "BaseRuntime.h"