8076276: Add support for AVX512

Reviewed-by: kvn, roland
Michael C Berg 2015-05-08 11:49:20 -07:00 committed by Vladimir Kozlov
parent 58a1361125
commit 4fca8dbb1f
46 changed files with 6234 additions and 1316 deletions

File diff suppressed because it is too large.


@ -438,7 +438,7 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC {
};
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
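Above, the LP64 FPU state area doubles from 512 bytes to 1024; 512 bytes is the classic FXSAVE frame, and the doubling appears to leave room for the wider AVX-512 state saved elsewhere in this commit. A minimal sketch of the arithmetic, assuming an 8-byte LP64 word:

#include <cstdio>
int main() {
  const int wordSize  = 8;                   // assumed LP64 word size in bytes
  const int old_words = 512 / wordSize;      // FXSAVE frame: 64 words
  const int new_words = 512 * 2 / wordSize;  // with AVX-512 headroom: 128 words
  printf("%d -> %d words\n", old_words, new_words);
  return 0;
}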
@ -503,7 +503,8 @@ class Assembler : public AbstractAssembler {
REX_WRXB = 0x4F,
VEX_3bytes = 0xC4,
VEX_2bytes = 0xC5
VEX_2bytes = 0xC5,
EVEX_4bytes = 0x62
};
enum VexPrefix {
@ -513,6 +514,14 @@ class Assembler : public AbstractAssembler {
VEX_W = 0x80
};
enum EvexPrefix {
EVEX_F = 0x04,
EVEX_V = 0x08,
EVEX_Rb = 0x10,
EVEX_X = 0x40,
EVEX_Z = 0x80
};
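These values are bit positions within the EVEX payload bytes; in the Intel SDM's terms, the P2 byte carries z, L'L, b, V', and the opmask selector aaa. A minimal sketch, assuming that layout, of composing P2 (real encoders also store V' inverted, like vvvv, which is omitted here):

#include <cstdint>
uint8_t evex_p2_sketch(bool zero_masking, int vector_len, bool broadcast,
                       bool evex_v, int mask_reg /* aaa, selects k0..k7 */) {
  uint8_t p2 = 0;
  if (zero_masking) p2 |= 0x80;                  // EVEX_Z: zeroing vs. merging masking
  p2 |= (uint8_t)((vector_len & 0x3) << 5);      // L'L: 0 = 128, 1 = 256, 2 = 512 bit
  if (broadcast)    p2 |= 0x10;                  // EVEX_Rb: embedded broadcast/rounding
  if (evex_v)       p2 |= 0x08;                  // EVEX_V: the V' register extension
  p2 |= (uint8_t)(mask_reg & 0x7);               // aaa: opmask register
  return p2;                                     // e.g. 512-bit with k1 gives 0x41
}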
enum VexSimdPrefix {
VEX_SIMD_NONE = 0x0,
VEX_SIMD_66 = 0x1,
@ -527,6 +536,37 @@ class Assembler : public AbstractAssembler {
VEX_OPCODE_0F_3A = 0x3
};
enum AvxVectorLen {
AVX_128bit = 0x0,
AVX_256bit = 0x1,
AVX_512bit = 0x2,
AVX_NoVec = 0x4
};
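The first three values map directly onto the VEX/EVEX vector-length field, and AVX_NoVec marks encodings with no vector operand; a small helper makes the widths explicit:

// Sketch: operand width implied by each AvxVectorLen value.
static int vector_len_in_bytes(int vector_len) {
  switch (vector_len) {
    case 0  /* AVX_128bit */: return 16;
    case 1  /* AVX_256bit */: return 32;
    case 2  /* AVX_512bit */: return 64;
    default /* AVX_NoVec  */: return 0;   // no vector operand
  }
}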
enum EvexTupleType {
EVEX_FV = 0,
EVEX_HV = 4,
EVEX_FVM = 6,
EVEX_T1S = 7,
EVEX_T1F = 11,
EVEX_T2 = 13,
EVEX_T4 = 15,
EVEX_T8 = 17,
EVEX_HVM = 18,
EVEX_QVM = 19,
EVEX_OVM = 20,
EVEX_M128 = 21,
EVEX_DUP = 22,
EVEX_ETUP = 23
};
enum EvexInputSizeInBits {
EVEX_8bit = 0,
EVEX_16bit = 1,
EVEX_32bit = 2,
EVEX_64bit = 3
};
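Tuple type, input size, and vector length jointly select the scale factor N by which EVEX compresses 8-bit displacements (disp8*N); that is the decision query_compressed_disp_byte makes later in this file. A minimal sketch for the simplest case, assuming the full-vector (EVEX_FV) tuple with no broadcast, where N is just the vector width in bytes:

// Sketch (assumption): disp8*N fit test for a full-vector tuple, N = 16/32/64.
static bool fits_in_compressed_disp8(int disp, int n) {
  if (disp % n != 0) return false;          // must be an exact multiple of N
  int scaled = disp / n;
  return -128 <= scaled && scaled <= 127;   // e.g. disp 128, N 64 -> disp8 of 2
}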
enum WhichOperand {
// input to locate_operand, and format code for relocations
imm_operand = 0, // embedded 32-bit|64-bit immediate operand
@ -554,6 +594,11 @@ class Assembler : public AbstractAssembler {
private:
int evex_encoding;
int input_size_in_bits;
int avx_vector_len;
int tuple_type;
bool is_evex_instruction;
// 64bit prefixes
int prefix_and_encode(int reg_enc, bool byteinst = false);
@ -580,108 +625,143 @@ private:
void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
bool vector256);
int vector_len);
void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
bool is_extended_context, bool is_merge_context,
int vector_len, bool no_mask_reg );
void vex_prefix(Address adr, int nds_enc, int xreg_enc,
VexSimdPrefix pre, VexOpcode opc,
bool vex_w, bool vector256);
bool vex_w, int vector_len,
bool legacy_mode = false, bool no_mask_reg = false);
void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, bool vector256 = false) {
VexSimdPrefix pre, int vector_len = AVX_128bit,
bool no_mask_reg = false, bool legacy_mode = false) {
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
}
void vex_prefix_0F38(Register dst, Register nds, Address src) {
void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, int vector_len = AVX_128bit,
bool no_mask_reg = false) {
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
}
void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = false;
bool vector256 = false;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, no_mask_reg);
}
void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = true;
bool vector256 = false;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, no_mask_reg);
}
int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
VexSimdPrefix pre, VexOpcode opc,
bool vex_w, bool vector256);
bool vex_w, int vector_len,
bool legacy_mode, bool no_mask_reg);
int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = false;
bool vector256 = false;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = true;
bool vector256 = false;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, bool vector256 = false,
VexOpcode opc = VEX_OPCODE_0F) {
VexSimdPrefix pre, int vector_len = AVX_128bit,
VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
bool no_mask_reg = false) {
int src_enc = src->encoding();
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
}
void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, bool vector256 = false);
VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);
void simd_prefix(XMMRegister dst, Address src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
simd_prefix(dst, xnoreg, src, pre, opc);
void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
}
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
simd_prefix(src, dst, pre);
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
simd_prefix(src, dst, pre, no_mask_reg);
}
void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre) {
VexSimdPrefix pre, bool no_mask_reg = false) {
bool rex_w = true;
simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
}
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, bool vector256 = false);
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit,
bool legacy_mode = false);
int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit);
int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit);
// Move/convert 32-bit integer value.
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
VexSimdPrefix pre) {
VexSimdPrefix pre, bool no_mask_reg) {
// It is OK to cast from Register to XMMRegister to pass argument here
// since only encoding is used in simd_prefix_and_encode() and number of
// Gen and Xmm registers are the same.
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
}
int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
return simd_prefix_and_encode(dst, xnoreg, src, pre);
int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
}
int simd_prefix_and_encode(Register dst, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool no_mask_reg = false) {
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
}
// Move/convert 64-bit integer value.
int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
VexSimdPrefix pre) {
VexSimdPrefix pre, bool no_mask_reg = false) {
bool rex_w = true;
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
}
int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
}
int simd_prefix_and_encode_q(Register dst, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool no_mask_reg = false) {
bool rex_w = true;
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
}
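Throughout these helpers the old bool vector256 parameter becomes an int vector_len, with no_mask_reg and legacy_mode flags appended; call sites migrate one-for-one:

// Sketch: migrating a call site from the boolean to the length encoding.
//   old: vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, /* vector256 */ true);
//   new: vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, AVX_256bit);
static int vector256_to_vector_len(bool vector256) {
  return vector256 ? 1 /* AVX_256bit */ : 0 /* AVX_128bit */;
}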
// Helper functions for groups of instructions
@ -692,14 +772,28 @@ private:
void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
void emit_arith(int op1, int op2, Register dst, Register src);
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
Address src, VexSimdPrefix pre, bool vector256);
Address src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false, bool legacy_mode = false);
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
Address src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
XMMRegister src, VexSimdPrefix pre, bool vector256);
XMMRegister src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false, bool legacy_mode = false);
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
XMMRegister src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false);
bool emit_compressed_disp_byte(int &disp);
void emit_operand(Register reg,
Register base, Register index, Address::ScaleFactor scale,
@ -825,7 +919,9 @@ private:
public:
// Creation
Assembler(CodeBuffer* code) : AbstractAssembler(code) {}
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
init_attributes();
}
// Decoding
static address locate_operand(address inst, WhichOperand which);
@ -833,11 +929,21 @@ private:
// Utilities
static bool is_polling_page_far() NOT_LP64({ return false;});
static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
int cur_tuple_type, int in_size_in_bits, int cur_encoding);
// Generic instructions
// Does 32bit or 64bit as needed for the platform. In some sense these
// belong in macro assembler but there is no need for both varieties to exist
void init_attributes(void) {
evex_encoding = 0;
input_size_in_bits = 0;
avx_vector_len = AVX_NoVec;
tuple_type = EVEX_ETUP;
is_evex_instruction = false;
}
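These fields act as per-instruction scratch state: an emitter records the tuple type, input size, and vector length before emitting its operand so displacement compression can pick N, and the defaults above are restored between instructions. A self-contained miniature of the pattern (not the HotSpot types; the reset points are a simplification):

struct EvexStateSketch {
  int  tuple_type;
  int  input_size_in_bits;
  int  avx_vector_len;
  bool is_evex_instruction;
  void reset() {                  // mirrors init_attributes()
    tuple_type = 23;              // EVEX_ETUP: no tuple selected
    input_size_in_bits = 0;
    avx_vector_len = 4;           // AVX_NoVec
    is_evex_instruction = false;
  }
};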
void lea(Register dst, Address src);
void mov(Register dst, Register src);
@ -1338,6 +1444,12 @@ private:
void movb(Address dst, int imm8);
void movb(Register dst, Address src);
void kmovq(KRegister dst, KRegister src);
void kmovql(KRegister dst, Register src);
void kmovdl(KRegister dst, Register src);
void kmovq(Address dst, KRegister src);
void kmovq(KRegister dst, Address src);
void movdl(XMMRegister dst, Register src);
void movdl(Register dst, XMMRegister src);
void movdl(XMMRegister dst, Address src);
@ -1361,6 +1473,11 @@ private:
void vmovdqu(XMMRegister dst, Address src);
void vmovdqu(XMMRegister dst, XMMRegister src);
// Move Unaligned 512bit Vector
void evmovdqu(Address dst, XMMRegister src, int vector_len);
void evmovdqu(XMMRegister dst, Address src, int vector_len);
void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
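A minimal usage sketch in the usual MacroAssembler stub style (rsi, rdi, and xmm0 are illustrative choices; the fill loop later in this commit uses the same store form):

__ evmovdqu(xmm0, Address(rsi, 0), Assembler::AVX_512bit);   // 64-byte load
__ evmovdqu(Address(rdi, 0), xmm0, Assembler::AVX_512bit);   // 64-byte store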
// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);
@ -1486,10 +1603,10 @@ private:
// Pack with unsigned saturation
void packuswb(XMMRegister dst, XMMRegister src);
void packuswb(XMMRegister dst, Address src);
void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Permutation of 64bit words
void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
void pause();
@ -1734,54 +1851,54 @@ private:
// Add Packed Floating-Point Values
void addpd(XMMRegister dst, XMMRegister src);
void addps(XMMRegister dst, XMMRegister src);
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Subtract Packed Floating-Point Values
void subpd(XMMRegister dst, XMMRegister src);
void subps(XMMRegister dst, XMMRegister src);
void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Multiply Packed Floating-Point Values
void mulpd(XMMRegister dst, XMMRegister src);
void mulps(XMMRegister dst, XMMRegister src);
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Divide Packed Floating-Point Values
void divpd(XMMRegister dst, XMMRegister src);
void divps(XMMRegister dst, XMMRegister src);
void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
void andps(XMMRegister dst, XMMRegister src);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Bitwise Logical XOR of Packed Floating-Point Values
void xorpd(XMMRegister dst, XMMRegister src);
void xorps(XMMRegister dst, XMMRegister src);
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Add horizontal packed integers
void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void phaddw(XMMRegister dst, XMMRegister src);
void phaddd(XMMRegister dst, XMMRegister src);
@ -1790,36 +1907,38 @@ private:
void paddw(XMMRegister dst, XMMRegister src);
void paddd(XMMRegister dst, XMMRegister src);
void paddq(XMMRegister dst, XMMRegister src);
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Sub packed integers
void psubb(XMMRegister dst, XMMRegister src);
void psubw(XMMRegister dst, XMMRegister src);
void psubd(XMMRegister dst, XMMRegister src);
void psubq(XMMRegister dst, XMMRegister src);
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Multiply packed integers (only shorts and ints)
void pmullw(XMMRegister dst, XMMRegister src);
void pmulld(XMMRegister dst, XMMRegister src);
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Shift left packed integers
void psllw(XMMRegister dst, int shift);
@ -1828,12 +1947,12 @@ private:
void psllw(XMMRegister dst, XMMRegister shift);
void pslld(XMMRegister dst, XMMRegister shift);
void psllq(XMMRegister dst, XMMRegister shift);
void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
// Logical shift right packed integers
void psrlw(XMMRegister dst, int shift);
@ -1842,42 +1961,43 @@ private:
void psrlw(XMMRegister dst, XMMRegister shift);
void psrld(XMMRegister dst, XMMRegister shift);
void psrlq(XMMRegister dst, XMMRegister shift);
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
// Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
void psraw(XMMRegister dst, int shift);
void psrad(XMMRegister dst, int shift);
void psraw(XMMRegister dst, XMMRegister shift);
void psrad(XMMRegister dst, XMMRegister shift);
void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
// And packed integers
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Or packed integers
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Xor packed integers
void pxor(XMMRegister dst, XMMRegister src);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Copy low 128bit into high 128bit of YMM registers.
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vextractf128h(XMMRegister dst, XMMRegister src);
void vextracti128h(XMMRegister dst, XMMRegister src);
// Load/store high 128bit of YMM registers which does not destroy other half.
void vinsertf128h(XMMRegister dst, Address src);
@ -1885,9 +2005,25 @@ private:
void vextractf128h(Address dst, XMMRegister src);
void vextracti128h(Address dst, XMMRegister src);
// Copy low 256bit into high 256bit of ZMM registers.
void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vextracti64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(Address dst, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, Address src);
// Copy targeted 128bit segments of the ZMM registers
void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
// duplicate 4-byte integer data from src into 8 locations in dest
void vpbroadcastd(XMMRegister dst, XMMRegister src);
// duplicate 4-byte integer data from src into the lanes selected by vector_len in dest
void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
// Carry-Less Multiplication Quadword
void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);


@ -233,13 +233,30 @@ void FrameMap::initialize() {
_xmm_regs[13] = xmm13;
_xmm_regs[14] = xmm14;
_xmm_regs[15] = xmm15;
_xmm_regs[16] = xmm16;
_xmm_regs[17] = xmm17;
_xmm_regs[18] = xmm18;
_xmm_regs[19] = xmm19;
_xmm_regs[20] = xmm20;
_xmm_regs[21] = xmm21;
_xmm_regs[22] = xmm22;
_xmm_regs[23] = xmm23;
_xmm_regs[24] = xmm24;
_xmm_regs[25] = xmm25;
_xmm_regs[26] = xmm26;
_xmm_regs[27] = xmm27;
_xmm_regs[28] = xmm28;
_xmm_regs[29] = xmm29;
_xmm_regs[30] = xmm30;
_xmm_regs[31] = xmm31;
#endif // _LP64
for (int i = 0; i < 8; i++) {
_caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
}
for (int i = 0; i < nof_caller_save_xmm_regs ; i++) {
int num_caller_save_xmm_regs = get_num_caller_save_xmms();
for (int i = 0; i < num_caller_save_xmm_regs; i++) {
_caller_save_xmm_regs[i] = LIR_OprFact::single_xmm(i);
}


@ -152,6 +152,16 @@
return range;
}
static int get_num_caller_save_xmms(void) {
int num_caller_save_xmm_regs = nof_caller_save_xmm_regs;
#ifdef _LP64
if (UseAVX < 3) {
num_caller_save_xmm_regs = num_caller_save_xmm_regs / 2;
}
#endif
return num_caller_save_xmm_regs;
}
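On 64-bit this halves the caller-save XMM count when AVX-512 is absent, since xmm16..xmm31 are reachable only through EVEX encodings. A worked sketch, assuming the LP64 table now holds 32 registers:

// Sketch (assumption): caller-save XMM count on LP64 after this commit.
static int num_caller_save_xmms_sketch(int use_avx) {
  int n = 32;                  // assumed nof_caller_save_xmm_regs on LP64
  if (use_avx < 3) n /= 2;     // no EVEX: only xmm0..xmm15 are encodable
  return n;                    // 16 on AVX/AVX2 hardware, 32 on AVX-512
}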
static int nof_caller_save_cpu_regs() { return adjust_reg_range(pd_nof_caller_save_cpu_regs_frame_map); }
static int last_cpu_reg() { return adjust_reg_range(pd_last_cpu_reg); }
static int last_byte_reg() { return adjust_reg_range(pd_last_byte_reg); }


@ -85,8 +85,9 @@ inline void LinearScan::pd_add_temps(LIR_Op* op) {
tty->print_cr("killing XMMs for trig");
}
#endif
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
int op_id = op->id();
for (int xmm = 0; xmm < FrameMap::nof_caller_save_xmm_regs; xmm++) {
for (int xmm = 0; xmm < num_caller_save_xmm_regs; xmm++) {
LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(xmm);
add_temp(reg_num(opr), op_id, noUse, T_ILLEGAL);
}
@ -100,6 +101,10 @@ inline void LinearScan::pd_add_temps(LIR_Op* op) {
// Implementation of LinearScanWalker
inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
int last_xmm_reg = pd_last_xmm_reg;
if (UseAVX < 3) {
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
}
if (allocator()->gen()->is_vreg_flag_set(cur->reg_num(), LIRGenerator::byte_reg)) {
assert(cur->type() != T_FLOAT && cur->type() != T_DOUBLE, "cpu regs only");
_first_reg = pd_first_byte_reg;
@ -107,7 +112,7 @@ inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
return true;
} else if ((UseSSE >= 1 && cur->type() == T_FLOAT) || (UseSSE >= 2 && cur->type() == T_DOUBLE)) {
_first_reg = pd_first_xmm_reg;
_last_reg = pd_last_xmm_reg;
_last_reg = last_xmm_reg;
return true;
}


@ -323,7 +323,7 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
LP64_ONLY(num_rt_args = 0);
LP64_ONLY(assert((reg_save_frame_size * VMRegImpl::stack_slot_size) % 16 == 0, "must be 16 byte aligned");)
int frame_size_in_slots = reg_save_frame_size + num_rt_args; // args + thread
sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word );
sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word);
// record saved value locations in an OopMap
// locations are offsets from sp after runtime call; num_rt_args is number of arguments in call, including thread
@ -362,6 +362,13 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
map->set_callee_saved(VMRegImpl::stack2reg(r15H_off + num_rt_args), r15->as_VMReg()->next());
#endif // _LP64
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
#ifdef _LP64
if (UseAVX < 3) {
xmm_bypass_limit = xmm_bypass_limit / 2;
}
#endif
if (save_fpu_registers) {
if (UseSSE < 2) {
int fpu_off = float_regs_as_doubles_off;
@ -380,11 +387,13 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
if (UseSSE >= 2) {
int xmm_off = xmm_regs_as_doubles_off;
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
// %%% This is really a waste but we'll keep things as they were for now
if (true) {
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
if (n < xmm_bypass_limit) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
// %%% This is really a waste but we'll keep things as they were for now
if (true) {
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
}
}
xmm_off += 2;
}
@ -393,8 +402,10 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
} else if (UseSSE == 1) {
int xmm_off = xmm_regs_as_doubles_off;
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
if (n < xmm_bypass_limit) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
}
xmm_off += 2;
}
assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@ -474,6 +485,24 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
if (UseAVX > 2) {
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
}
#endif // _LP64
} else if (UseSSE == 1) {
// save XMM registers as float because double not supported without SSE2
@ -516,6 +545,24 @@ static void restore_fpu(StubAssembler* sasm, bool restore_fpu_registers = true)
__ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
__ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
__ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
if (UseAVX > 2) {
__ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
__ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
__ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
__ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
__ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
__ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
__ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
__ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
__ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
__ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
__ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
__ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
__ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
__ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
__ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
__ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
}
#endif // _LP64
} else if (UseSSE == 1) {
// restore XMM registers
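The new offsets continue the existing layout: each register occupies one jdouble slot, so xmmN sits at xmm_regs_as_doubles_off * stack_slot_size + 8 * N, which gives 128 for xmm16 through 248 for xmm31 (only the low 64 bits of each are saved here, as with xmm0..xmm15):

// Sketch: byte offset of xmmN inside the doubles save area.
static int xmm_save_offset(int n) {
  return 8 * n;   // xmm16 -> 128, xmm31 -> 248
}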


@ -25,6 +25,7 @@
#include "precompiled.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/optoreg.hpp"
// processor dependent initialization for i486
@ -37,4 +38,24 @@ void Compile::pd_compiler2_init() {
ConditionalMoveLimit = 0;
}
#endif // AMD64
if (UseAVX < 3) {
int delta = XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
int bottom = ConcreteRegisterImpl::max_fpr;
int top = bottom + delta;
int middle = bottom + (delta / 2);
int xmm_slots = XMMRegisterImpl::max_slots_per_register;
int lower = xmm_slots / 2;
// mark as bad every register slot we cannot reach when AVX is below 3; the array holds all slots
// Note: vm2opto is allocated to ConcreteRegisterImpl::number_of_registers
for (int i = bottom; i < middle; i += xmm_slots) {
for (OptoReg::Name j = OptoReg::Name(i + lower); j<OptoReg::Name(i + xmm_slots); j = OptoReg::add(j, 1)) {
OptoReg::invalidate(j);
}
}
// mark the upper ZMM bank and all of the mask registers bad in this case
for (OptoReg::Name i = OptoReg::Name(middle); i<OptoReg::Name(_last_Mach_Reg - 1); i = OptoReg::add(i, 1)) {
OptoReg::invalidate(i);
}
}
}
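When AVX-512 is absent, C2 trims its register file after the fact: each of xmm0..xmm15 loses its upper eight 32-bit slots (the ZMM-only half), and everything above xmm15, including the mask registers, is invalidated outright. The slot arithmetic with this commit's constants:

#include <cstdio>
int main() {
  const int xmm_slots = 16;                  // max_slots_per_register: 512 bits
  const int num_xmm   = 32;
  const int delta     = xmm_slots * num_xmm; // 512 OptoReg slots of XMM state
  const int lower     = xmm_slots / 2;       // slots 0..7 hold the low 256 bits
  // xmm0..xmm15 each keep 'lower' slots (YMM width); the bank from 'middle'
  // upward (xmm16..xmm31 plus the k registers) is invalidated wholesale.
  printf("delta=%d, slots kept per low register=%d\n", delta, lower);
  return 0;
}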


@ -125,7 +125,7 @@
// Entry frames
#ifdef AMD64
#ifdef _WIN64
entry_frame_after_call_words = 28,
entry_frame_after_call_words = 60,
entry_frame_call_wrapper_offset = 2,
arg_reg_save_area_bytes = 32 // Register argument save area


@ -3996,21 +3996,21 @@ void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src
}
}
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vandpd(dst, nds, as_Address(src), vector256);
vandpd(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
vandpd(dst, nds, Address(rscratch1, 0), vector256);
vandpd(dst, nds, Address(rscratch1, 0), vector_len);
}
}
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vandps(dst, nds, as_Address(src), vector256);
vandps(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
vandps(dst, nds, Address(rscratch1, 0), vector256);
vandps(dst, nds, Address(rscratch1, 0), vector_len);
}
}
@ -4068,21 +4068,21 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src
}
}
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vxorpd(dst, nds, as_Address(src), vector256);
vxorpd(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
vxorpd(dst, nds, Address(rscratch1, 0), vector256);
vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
}
}
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vxorps(dst, nds, as_Address(src), vector256);
vxorps(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
vxorps(dst, nds, Address(rscratch1, 0), vector256);
vxorps(dst, nds, Address(rscratch1, 0), vector_len);
}
}
@ -4561,6 +4561,14 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
} else if (UseSSE >= 2) {
if (UseAVX > 2) {
movl(rbx, 0xffff);
#ifdef _LP64
kmovql(k1, rbx);
#else
kmovdl(k1, rbx);
#endif
}
#ifdef COMPILER2
if (MaxVectorSize > 16) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
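The kmovql/kmovdl block above seeds k1 with 0xffff, which makes the default opmask a no-op: a 512-bit vector of 32-bit elements has 16 lanes, one mask bit each (the stubs initialize k1 because the encoders in this commit fall back to it when no mask register is requested). The lane arithmetic:

// Sketch: one opmask bit per 32-bit lane of a 512-bit vector.
static const int lanes     = 512 / 32;            // 16 lanes
static const int full_mask = (1 << lanes) - 1;    // 0xffff: every lane enabled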
@ -7063,8 +7071,39 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
{
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
if (UseAVX > 2) {
movl(rtmp, 0xffff);
#ifdef _LP64
kmovql(k1, rtmp);
#else
kmovdl(k1, rtmp);
#endif
}
movdl(xtmp, value);
if (UseAVX >= 2 && UseUnalignedLoadStores) {
if (UseAVX > 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
subl(count, 16 << shift);
jcc(Assembler::less, L_check_fill_32_bytes);
align(16);
BIND(L_fill_64_bytes_loop);
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
addptr(to, 64);
subl(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
} else if (UseAVX == 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
vpbroadcastd(xtmp, xtmp);
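In both vector paths count is in elements while the stores move fixed byte-sized chunks; assuming shift encodes the element size as 4 >> shift bytes (consistent with the 8 << shift step of the 32-byte path), 16 << shift elements is always exactly one 64-byte store:

// Sketch (assumption): elements consumed per 64-byte store in the fill loop.
static int elements_per_64_byte_store(int shift) {
  int element_size = 4 >> shift;   // 4, 2, or 1 bytes for shift = 0, 1, 2
  return 64 / element_size;        // equals 16 << shift
}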
@ -7200,11 +7239,11 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
bind(L_copy_32_chars);
vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
jccb(Assembler::notZero, L_copy_32_chars_exit);
vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
bind(L_chars_32_check);
@ -7227,13 +7266,13 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
vptest(tmp2Reg, tmp1Reg);
jccb(Assembler::notZero, L_copy_16_chars_exit);
vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
} else {
if (UseAVX > 0) {
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
} else {
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
por(tmp2Reg, tmp3Reg);
@ -7776,7 +7815,7 @@ void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegi
if (UseAVX > 0) {
vpclmulhdq(xtmp, xK, xcrc); // [123:64]
vpclmulldq(xcrc, xK, xcrc); // [63:0]
vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
pxor(xcrc, xtmp);
} else {
movdqa(xtmp, xcrc);
@ -7920,7 +7959,7 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
if (UseAVX > 0) {
vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
vpand(xmm3, xmm0, xmm2, false /* vector256 */);
vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
} else {
movdqa(xmm2, xmm0);


@ -1024,13 +1024,13 @@ public:
void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); }
void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); }
@ -1058,25 +1058,25 @@ public:
// AVX Vector instructions
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
Assembler::vpxor(dst, nds, src, vector256);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
Assembler::vpxor(dst, nds, src, vector_len);
else
Assembler::vxorpd(dst, nds, src, vector256);
Assembler::vxorpd(dst, nds, src, vector_len);
}
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
Assembler::vpxor(dst, nds, src, vector256);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
Assembler::vpxor(dst, nds, src, vector_len);
else
Assembler::vxorpd(dst, nds, src, vector256);
Assembler::vxorpd(dst, nds, src, vector_len);
}
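The fallback is sound because XOR is a pure bitwise operation: on AVX1-only hardware there is no 256-bit integer vpxor, and the FP-domain vxorpd produces the identical bit pattern (at worst with a domain-crossing penalty). The dispatch condition in miniature:

// Sketch: choose the integer form when legal, else the FP-domain equivalent.
static bool use_integer_vpxor(int use_avx, int vector_len) {
  return use_avx > 1 || vector_len < 1;   // 256-bit integer ops need AVX2
}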
// Simple version for AVX2 256bit vectors


@ -68,6 +68,22 @@ REGISTER_DEFINITION(XMMRegister, xmm12);
REGISTER_DEFINITION(XMMRegister, xmm13);
REGISTER_DEFINITION(XMMRegister, xmm14);
REGISTER_DEFINITION(XMMRegister, xmm15);
REGISTER_DEFINITION(XMMRegister, xmm16);
REGISTER_DEFINITION(XMMRegister, xmm17);
REGISTER_DEFINITION(XMMRegister, xmm18);
REGISTER_DEFINITION(XMMRegister, xmm19);
REGISTER_DEFINITION(XMMRegister, xmm20);
REGISTER_DEFINITION(XMMRegister, xmm21);
REGISTER_DEFINITION(XMMRegister, xmm22);
REGISTER_DEFINITION(XMMRegister, xmm23);
REGISTER_DEFINITION(XMMRegister, xmm24);
REGISTER_DEFINITION(XMMRegister, xmm25);
REGISTER_DEFINITION(XMMRegister, xmm26);
REGISTER_DEFINITION(XMMRegister, xmm27);
REGISTER_DEFINITION(XMMRegister, xmm28);
REGISTER_DEFINITION(XMMRegister, xmm29);
REGISTER_DEFINITION(XMMRegister, xmm30);
REGISTER_DEFINITION(XMMRegister, xmm31);
REGISTER_DEFINITION(Register, c_rarg0);
REGISTER_DEFINITION(Register, c_rarg1);
@ -123,5 +139,15 @@ REGISTER_DEFINITION(MMXRegister, mmx5 );
REGISTER_DEFINITION(MMXRegister, mmx6 );
REGISTER_DEFINITION(MMXRegister, mmx7 );
REGISTER_DEFINITION(KRegister, knoreg);
REGISTER_DEFINITION(KRegister, k0);
REGISTER_DEFINITION(KRegister, k1);
REGISTER_DEFINITION(KRegister, k2);
REGISTER_DEFINITION(KRegister, k3);
REGISTER_DEFINITION(KRegister, k4);
REGISTER_DEFINITION(KRegister, k5);
REGISTER_DEFINITION(KRegister, k6);
REGISTER_DEFINITION(KRegister, k7);
// JSR 292
REGISTER_DEFINITION(Register, rbp_mh_SP_save);


@ -31,11 +31,13 @@ const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers;
const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1;
#endif // AMD64
const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
2 * FloatRegisterImpl::number_of_registers;
2 * FloatRegisterImpl::number_of_registers;
const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
8 * XMMRegisterImpl::number_of_registers;
XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
const int ConcreteRegisterImpl::max_kpr = ConcreteRegisterImpl::max_xmm +
KRegisterImpl::max_slots_per_register * KRegisterImpl::number_of_registers;
const char* RegisterImpl::name() const {
const char* names[number_of_registers] = {
#ifndef AMD64
@ -59,8 +61,17 @@ const char* XMMRegisterImpl::name() const {
const char* names[number_of_registers] = {
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
#ifdef AMD64
,"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
,"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
,"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
,"xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31"
#endif // AMD64
};
return is_valid() ? names[encoding()] : "xnoreg";
}
const char* KRegisterImpl::name() const {
const char* names[number_of_registers] = {
"k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7"
};
return is_valid() ? names[encoding()] : "knoreg";
}


@ -45,10 +45,12 @@ class RegisterImpl: public AbstractRegisterImpl {
enum {
#ifndef AMD64
number_of_registers = 8,
number_of_byte_registers = 4
number_of_byte_registers = 4,
max_slots_per_register = 1
#else
number_of_registers = 16,
number_of_byte_registers = 16
number_of_byte_registers = 16,
max_slots_per_register = 1
#endif // AMD64
};
@ -143,9 +145,11 @@ class XMMRegisterImpl: public AbstractRegisterImpl {
public:
enum {
#ifndef AMD64
number_of_registers = 8
number_of_registers = 8,
max_slots_per_register = 16 // 512-bit
#else
number_of_registers = 16
number_of_registers = 32,
max_slots_per_register = 16 // 512-bit
#endif // AMD64
};
@ -183,6 +187,22 @@ CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm12, (12));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm13, (13));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm14, (14));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm15, (15));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm16, (16));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm17, (17));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm18, (18));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm19, (19));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm20, (20));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm21, (21));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm22, (22));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm23, (23));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm24, (24));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm25, (25));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm26, (26));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm27, (27));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm28, (28));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm29, (29));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm30, (30));
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm31, (31));
#endif // AMD64
// Only used by the 32bit stubGenerator. These can't be described by vmreg and hence
@ -200,6 +220,46 @@ CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx5 , ( 5));
CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx6 , ( 6));
CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx7 , ( 7));
// Use XMMRegister as shortcut
class KRegisterImpl;
typedef KRegisterImpl* KRegister;
inline KRegister as_KRegister(int encoding) {
return (KRegister)(intptr_t)encoding;
}
// The implementation of opmask (K) registers for the IA32 architecture
class KRegisterImpl : public AbstractRegisterImpl {
public:
enum {
number_of_registers = 8,
max_slots_per_register = 1
};
// construction
friend KRegister as_KRegister(int encoding);
inline VMReg as_VMReg();
// derived registers, offsets, and addresses
KRegister successor() const { return as_KRegister(encoding() + 1); }
// accessors
int encoding() const { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this)); return (intptr_t)this; }
bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
const char* name() const;
};
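KRegisterImpl reuses the pointer-as-encoding idiom of the other register types: as_KRegister casts the encoding to a pointer and the accessors read it back, so k3 is literally the pointer value 3. A self-contained miniature of the idiom (not the HotSpot types; formally undefined behavior, but exactly the trick the VM relies on):

#include <cassert>
#include <cstdint>
struct KRegSketch;
typedef KRegSketch* KReg;
inline KReg as_kreg(int enc) { return (KReg)(intptr_t)enc; }
struct KRegSketch {
  int  encoding() const { return (int)(intptr_t)this; }
  bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < 8; }
};
int main() {
  assert(as_kreg(3)->encoding() == 3);
  assert(!as_kreg(8)->is_valid());   // only k0..k7 exist
  return 0;
}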
// The mask registers, for chips with AVX3 (AVX-512) and above
CONSTANT_REGISTER_DECLARATION(KRegister, knoreg, (-1));
CONSTANT_REGISTER_DECLARATION(KRegister, k0, (0));
CONSTANT_REGISTER_DECLARATION(KRegister, k1, (1));
CONSTANT_REGISTER_DECLARATION(KRegister, k2, (2));
CONSTANT_REGISTER_DECLARATION(KRegister, k3, (3));
CONSTANT_REGISTER_DECLARATION(KRegister, k4, (4));
CONSTANT_REGISTER_DECLARATION(KRegister, k5, (5));
CONSTANT_REGISTER_DECLARATION(KRegister, k6, (6));
CONSTANT_REGISTER_DECLARATION(KRegister, k7, (7));
// Need to know the total number of registers of all sorts for SharedInfo.
// Define a class that exports it.
@ -211,18 +271,20 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl {
// There is no requirement that any ordering here matches any ordering c2 gives
// it's optoregs.
number_of_registers = RegisterImpl::number_of_registers +
number_of_registers = RegisterImpl::number_of_registers +
#ifdef AMD64
RegisterImpl::number_of_registers + // "H" half of a 64bit register
RegisterImpl::number_of_registers + // "H" half of a 64bit register
#endif // AMD64
2 * FloatRegisterImpl::number_of_registers +
8 * XMMRegisterImpl::number_of_registers +
1 // eflags
2 * FloatRegisterImpl::number_of_registers +
XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers +
KRegisterImpl::number_of_registers + // mask registers
1 // eflags
};
static const int max_gpr;
static const int max_fpr;
static const int max_xmm;
static const int max_kpr;
};
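A quick sanity check of the new slot total on AMD64, from the constants above (illustrative arithmetic only):

// 16 GPRs + 16 "H" halves + 2*8 x87 slots
// + 32 XMM registers * 16 slots each + 8 mask registers + eflags
const int expected_slots = 16 + 16 + 2*8 + 32*16 + 8 + 1;  // 569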


@ -117,9 +117,9 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
int vect_words = 0;
#ifdef COMPILER2
if (save_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
// Save upper half of YMM registes
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
// Save upper half of ZMM/YMM registers
vect_words = 8 * 16 / wordSize;
additional_frame_words += vect_words;
}
@ -216,6 +216,17 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ vextractf128h(Address(rsp, 80),xmm5);
__ vextractf128h(Address(rsp, 96),xmm6);
__ vextractf128h(Address(rsp,112),xmm7);
if (UseAVX > 2) {
__ subptr(rsp, 256); // Save upper half of ZMM registers
__ vextractf64x4h(Address(rsp, 0), xmm0);
__ vextractf64x4h(Address(rsp, 32), xmm1);
__ vextractf64x4h(Address(rsp, 64), xmm2);
__ vextractf64x4h(Address(rsp, 96), xmm3);
__ vextractf64x4h(Address(rsp, 128), xmm4);
__ vextractf64x4h(Address(rsp, 160), xmm5);
__ vextractf64x4h(Address(rsp, 192), xmm6);
__ vextractf64x4h(Address(rsp, 224), xmm7);
}
}
// Set an oopmap for the call site. This oopmap will map all
@ -283,8 +294,8 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
int additional_frame_bytes = 0;
#ifdef COMPILER2
if (restore_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
additional_frame_bytes = 128;
}
#else
@ -324,6 +335,18 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ addptr(rsp, additional_frame_bytes);
if (UseAVX > 2) {
additional_frame_bytes = 256;
__ vinsertf64x4h(xmm0, Address(rsp, 0));
__ vinsertf64x4h(xmm1, Address(rsp, 32));
__ vinsertf64x4h(xmm2, Address(rsp, 64));
__ vinsertf64x4h(xmm3, Address(rsp, 96));
__ vinsertf64x4h(xmm4, Address(rsp, 128));
__ vinsertf64x4h(xmm5, Address(rsp, 160));
__ vinsertf64x4h(xmm6, Address(rsp, 192));
__ vinsertf64x4h(xmm7, Address(rsp, 224));
__ addptr(rsp, additional_frame_bytes);
}
}
__ pop_FPU_state();
__ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers


@ -86,7 +86,23 @@ class RegisterSaver {
DEF_XMM_OFFS(13),
DEF_XMM_OFFS(14),
DEF_XMM_OFFS(15),
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
DEF_XMM_OFFS(16),
DEF_XMM_OFFS(17),
DEF_XMM_OFFS(18),
DEF_XMM_OFFS(19),
DEF_XMM_OFFS(20),
DEF_XMM_OFFS(21),
DEF_XMM_OFFS(22),
DEF_XMM_OFFS(23),
DEF_XMM_OFFS(24),
DEF_XMM_OFFS(25),
DEF_XMM_OFFS(26),
DEF_XMM_OFFS(27),
DEF_XMM_OFFS(28),
DEF_XMM_OFFS(29),
DEF_XMM_OFFS(30),
DEF_XMM_OFFS(31),
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
fpu_stateH_end,
r15_off, r15H_off,
r14_off, r14H_off,
@ -136,13 +152,21 @@ class RegisterSaver {
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
int vect_words = 0;
int num_xmm_regs = 16;
if (UseAVX > 2) {
num_xmm_regs = 32;
}
#ifdef COMPILER2
if (save_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
// Save upper half of YMM registes
vect_words = 16 * 16 / wordSize;
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
// Save upper half of YMM registers
vect_words = 16 * num_xmm_regs / wordSize;
additional_frame_words += vect_words;
if (UseAVX > 2) {
// Save upper half of ZMM registers as well
additional_frame_words += vect_words;
}
}
#else
assert(!save_vectors, "vectors are generated only by C2");
@ -150,7 +174,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
// Always make the frame size 16-byte aligned
int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
reg_save_size*BytesPerInt, 16);
reg_save_size*BytesPerInt, num_xmm_regs);
// OopMap frame size is in compiler stack slots (jint's) not bytes or words
int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
// The caller will allocate additional_frame_words
@ -169,24 +193,77 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ push_CPU_state(); // Push a multiple of 16 bytes
if (vect_words > 0) {
assert(vect_words*wordSize == 256, "");
__ subptr(rsp, 256); // Save upper half of YMM registes
__ vextractf128h(Address(rsp, 0),xmm0);
__ vextractf128h(Address(rsp, 16),xmm1);
__ vextractf128h(Address(rsp, 32),xmm2);
__ vextractf128h(Address(rsp, 48),xmm3);
__ vextractf128h(Address(rsp, 64),xmm4);
__ vextractf128h(Address(rsp, 80),xmm5);
__ vextractf128h(Address(rsp, 96),xmm6);
__ vextractf128h(Address(rsp,112),xmm7);
__ vextractf128h(Address(rsp,128),xmm8);
__ vextractf128h(Address(rsp,144),xmm9);
__ vextractf128h(Address(rsp,160),xmm10);
__ vextractf128h(Address(rsp,176),xmm11);
__ vextractf128h(Address(rsp,192),xmm12);
__ vextractf128h(Address(rsp,208),xmm13);
__ vextractf128h(Address(rsp,224),xmm14);
__ vextractf128h(Address(rsp,240),xmm15);
assert(vect_words*wordSize >= 256, "");
__ subptr(rsp, 256); // Save upper half of YMM registers (0..15)
__ vextractf128h(Address(rsp, 0), xmm0);
__ vextractf128h(Address(rsp, 16), xmm1);
__ vextractf128h(Address(rsp, 32), xmm2);
__ vextractf128h(Address(rsp, 48), xmm3);
__ vextractf128h(Address(rsp, 64), xmm4);
__ vextractf128h(Address(rsp, 80), xmm5);
__ vextractf128h(Address(rsp, 96), xmm6);
__ vextractf128h(Address(rsp, 112), xmm7);
__ vextractf128h(Address(rsp, 128), xmm8);
__ vextractf128h(Address(rsp, 144), xmm9);
__ vextractf128h(Address(rsp, 160), xmm10);
__ vextractf128h(Address(rsp, 176), xmm11);
__ vextractf128h(Address(rsp, 192), xmm12);
__ vextractf128h(Address(rsp, 208), xmm13);
__ vextractf128h(Address(rsp, 224), xmm14);
__ vextractf128h(Address(rsp, 240), xmm15);
if (UseAVX > 2) {
__ subptr(rsp, 256); // Save upper half of YMM registers (16..31)
__ vextractf128h(Address(rsp, 0), xmm16);
__ vextractf128h(Address(rsp, 16), xmm17);
__ vextractf128h(Address(rsp, 32), xmm18);
__ vextractf128h(Address(rsp, 48), xmm19);
__ vextractf128h(Address(rsp, 64), xmm20);
__ vextractf128h(Address(rsp, 80), xmm21);
__ vextractf128h(Address(rsp, 96), xmm22);
__ vextractf128h(Address(rsp, 112), xmm23);
__ vextractf128h(Address(rsp, 128), xmm24);
__ vextractf128h(Address(rsp, 144), xmm25);
__ vextractf128h(Address(rsp, 160), xmm26);
__ vextractf128h(Address(rsp, 176), xmm27);
__ vextractf128h(Address(rsp, 192), xmm28);
__ vextractf128h(Address(rsp, 208), xmm29);
__ vextractf128h(Address(rsp, 224), xmm30);
__ vextractf128h(Address(rsp, 240), xmm31);
// Now handle the ZMM registers (0..31)
__ subptr(rsp, 1024); // Save upper half of ZMM registers
__ vextractf64x4h(Address(rsp, 0), xmm0);
__ vextractf64x4h(Address(rsp, 32), xmm1);
__ vextractf64x4h(Address(rsp, 64), xmm2);
__ vextractf64x4h(Address(rsp, 96), xmm3);
__ vextractf64x4h(Address(rsp, 128), xmm4);
__ vextractf64x4h(Address(rsp, 160), xmm5);
__ vextractf64x4h(Address(rsp, 192), xmm6);
__ vextractf64x4h(Address(rsp, 224), xmm7);
__ vextractf64x4h(Address(rsp, 256), xmm8);
__ vextractf64x4h(Address(rsp, 288), xmm9);
__ vextractf64x4h(Address(rsp, 320), xmm10);
__ vextractf64x4h(Address(rsp, 352), xmm11);
__ vextractf64x4h(Address(rsp, 384), xmm12);
__ vextractf64x4h(Address(rsp, 416), xmm13);
__ vextractf64x4h(Address(rsp, 448), xmm14);
__ vextractf64x4h(Address(rsp, 480), xmm15);
__ vextractf64x4h(Address(rsp, 512), xmm16);
__ vextractf64x4h(Address(rsp, 544), xmm17);
__ vextractf64x4h(Address(rsp, 576), xmm18);
__ vextractf64x4h(Address(rsp, 608), xmm19);
__ vextractf64x4h(Address(rsp, 640), xmm20);
__ vextractf64x4h(Address(rsp, 672), xmm21);
__ vextractf64x4h(Address(rsp, 704), xmm22);
__ vextractf64x4h(Address(rsp, 736), xmm23);
__ vextractf64x4h(Address(rsp, 768), xmm24);
__ vextractf64x4h(Address(rsp, 800), xmm25);
__ vextractf64x4h(Address(rsp, 832), xmm26);
__ vextractf64x4h(Address(rsp, 864), xmm27);
__ vextractf64x4h(Address(rsp, 896), xmm28);
__ vextractf64x4h(Address(rsp, 928), xmm29);
__ vextractf64x4h(Address(rsp, 960), xmm30);
__ vextractf64x4h(Address(rsp, 992), xmm31);
}
}
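The unrolled stores above advance by a fixed 32-byte stride per saved ZMM upper half (16 bytes per YMM upper half); the ZMM block is equivalent to this compact loop, shown only as a sketch of the layout:

// Illustrative loop form of the unrolled vextractf64x4h sequence above.
__ subptr(rsp, 32 * num_xmm_regs);                             // 1024 bytes for 32 registers
for (int i = 0; i < num_xmm_regs; i++) {
  __ vextractf64x4h(Address(rsp, i * 32), as_XMMRegister(i));  // high 256 bits of zmm[i]
}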
if (frame::arg_reg_save_area_bytes != 0) {
// Allocate argument register save area
@ -235,6 +312,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
}
// %%% These should all be a waste but we'll keep things as they were for now
if (true) {
@ -269,6 +364,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
}
}
return map;
@ -281,9 +394,9 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
}
#ifdef COMPILER2
if (restore_vectors) {
// Restore upper half of YMM registes.
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
// Restore upper half of YMM registers (0..15)
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
@ -301,6 +414,60 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
__ vinsertf128h(xmm14, Address(rsp,224));
__ vinsertf128h(xmm15, Address(rsp,240));
__ addptr(rsp, 256);
if (UseAVX > 2) {
// Restore upper half of YMM registers (16..31)
__ vinsertf128h(xmm16, Address(rsp, 0));
__ vinsertf128h(xmm17, Address(rsp, 16));
__ vinsertf128h(xmm18, Address(rsp, 32));
__ vinsertf128h(xmm19, Address(rsp, 48));
__ vinsertf128h(xmm20, Address(rsp, 64));
__ vinsertf128h(xmm21, Address(rsp, 80));
__ vinsertf128h(xmm22, Address(rsp, 96));
__ vinsertf128h(xmm23, Address(rsp,112));
__ vinsertf128h(xmm24, Address(rsp,128));
__ vinsertf128h(xmm25, Address(rsp,144));
__ vinsertf128h(xmm26, Address(rsp,160));
__ vinsertf128h(xmm27, Address(rsp,176));
__ vinsertf128h(xmm28, Address(rsp,192));
__ vinsertf128h(xmm29, Address(rsp,208));
__ vinsertf128h(xmm30, Address(rsp,224));
__ vinsertf128h(xmm31, Address(rsp,240));
__ addptr(rsp, 256);
// Restore upper half of ZMM registers.
__ vinsertf64x4h(xmm0, Address(rsp, 0));
__ vinsertf64x4h(xmm1, Address(rsp, 32));
__ vinsertf64x4h(xmm2, Address(rsp, 64));
__ vinsertf64x4h(xmm3, Address(rsp, 96));
__ vinsertf64x4h(xmm4, Address(rsp, 128));
__ vinsertf64x4h(xmm5, Address(rsp, 160));
__ vinsertf64x4h(xmm6, Address(rsp, 192));
__ vinsertf64x4h(xmm7, Address(rsp, 224));
__ vinsertf64x4h(xmm8, Address(rsp, 256));
__ vinsertf64x4h(xmm9, Address(rsp, 288));
__ vinsertf64x4h(xmm10, Address(rsp, 320));
__ vinsertf64x4h(xmm11, Address(rsp, 352));
__ vinsertf64x4h(xmm12, Address(rsp, 384));
__ vinsertf64x4h(xmm13, Address(rsp, 416));
__ vinsertf64x4h(xmm14, Address(rsp, 448));
__ vinsertf64x4h(xmm15, Address(rsp, 480));
__ vinsertf64x4h(xmm16, Address(rsp, 512));
__ vinsertf64x4h(xmm17, Address(rsp, 544));
__ vinsertf64x4h(xmm18, Address(rsp, 576));
__ vinsertf64x4h(xmm19, Address(rsp, 608));
__ vinsertf64x4h(xmm20, Address(rsp, 640));
__ vinsertf64x4h(xmm21, Address(rsp, 672));
__ vinsertf64x4h(xmm22, Address(rsp, 704));
__ vinsertf64x4h(xmm23, Address(rsp, 736));
__ vinsertf64x4h(xmm24, Address(rsp, 768));
__ vinsertf64x4h(xmm25, Address(rsp, 800));
__ vinsertf64x4h(xmm26, Address(rsp, 832));
__ vinsertf64x4h(xmm27, Address(rsp, 864));
__ vinsertf64x4h(xmm28, Address(rsp, 896));
__ vinsertf64x4h(xmm29, Address(rsp, 928));
__ vinsertf64x4h(xmm30, Address(rsp, 960));
__ vinsertf64x4h(xmm31, Address(rsp, 992));
__ addptr(rsp, 1024);
}
}
#else
assert(!restore_vectors, "vectors are generated only by C2");


@ -166,6 +166,13 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(saved_rdi, rdi);
__ movptr(saved_rsi, rsi);
__ movptr(saved_rbx, rbx);
// provide initial value for required masks
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovdl(k1, rbx);
}
// save and initialize %mxcsr
if (sse_save) {
Label skip_ldmx;
@ -794,7 +801,10 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_64_bytes_loop);
if (UseUnalignedLoadStores) {
if (UseAVX >= 2) {
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
__ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, 0));
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ vmovdqu(xmm1, Address(from, 32));
@ -833,7 +843,7 @@ class StubGenerator: public StubCodeGenerator {
__ subl(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
if (UseUnalignedLoadStores && (UseAVX >= 2)) {
if (UseUnalignedLoadStores && (UseAVX == 2)) {
// clean upper bits of YMM registers
__ vpxor(xmm0, xmm0);
__ vpxor(xmm1, xmm1);
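For reference, the 64-bytes-per-iteration pattern the stub emits corresponds roughly to this intrinsics loop (standard AVX-512 intrinsics assumed; the stub generates evmovdqu directly and handles the tail separately):

#include <immintrin.h>
#include <stddef.h>

static void copy_512bit_blocks(void* to, const void* from, size_t bytes) {
  char* d = (char*)to;
  const char* s = (const char*)from;
  for (size_t i = 0; i + 64 <= bytes; i += 64) {
    __m512i v = _mm512_loadu_si512(s + i);  // one evmovdqu load
    _mm512_storeu_si512(d + i, v);          // one evmovdqu store
  }
}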


@ -137,8 +137,10 @@ class StubGenerator: public StubCodeGenerator {
// [ return_from_Java ] <--- rsp
// [ argument word n ]
// ...
// -28 [ argument word 1 ]
// -27 [ saved xmm15 ] <--- rsp_after_call
// -60 [ argument word 1 ]
// -59 [ saved xmm31 ] <--- rsp_after_call
// [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank)
// -27 [ saved xmm15 ]
// [ saved xmm7-xmm14 ]
// -9 [ saved xmm6 ] (each xmm register takes 2 slots)
// -7 [ saved r15 ]
@ -166,7 +168,7 @@ class StubGenerator: public StubCodeGenerator {
enum call_stub_layout {
#ifdef _WIN64
xmm_save_first = 6, // save from xmm6
xmm_save_last = 15, // to xmm15
xmm_save_last = 31, // to xmm31
xmm_save_base = -9,
rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
r15_off = -7,
@ -262,9 +264,19 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(r13_save, r13);
__ movptr(r14_save, r14);
__ movptr(r15_save, r15);
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovql(k1, rbx);
}
#ifdef _WIN64
for (int i = 6; i <= 15; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
if (UseAVX > 2) {
for (int i = 6; i <= 31; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
}
} else {
for (int i = 6; i <= 15; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
}
}
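The two branches above differ only in their loop bound; an equivalent compact form (sketch):

int last_xmm_save = (UseAVX > 2) ? 31 : 15;
for (int i = xmm_save_first; i <= last_xmm_save; i++) {
  __ movdqu(xmm_save(i), as_XMMRegister(i));
}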
const Address rdi_save(rbp, rdi_off * wordSize);
@ -1318,7 +1330,10 @@ class StubGenerator: public StubCodeGenerator {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX >= 2) {
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
@ -1395,7 +1410,10 @@ class StubGenerator: public StubCodeGenerator {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX >= 2) {
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
__ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));


@ -35,7 +35,7 @@
int VM_Version::_cpu;
int VM_Version::_model;
int VM_Version::_stepping;
int VM_Version::_cpuFeatures;
uint64_t VM_Version::_cpuFeatures;
const char* VM_Version::_features_str = "";
VM_Version::CpuidInfo VM_Version::_cpuid_info = { 0, };
@ -45,7 +45,7 @@ address VM_Version::_cpuinfo_segv_addr = 0;
address VM_Version::_cpuinfo_cont_addr = 0;
static BufferBlob* stub_blob;
static const int stub_size = 600;
static const int stub_size = 1000;
extern "C" {
typedef void (*get_cpu_info_stub_t)(void*);
@ -60,15 +60,16 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
address generate_get_cpu_info() {
// Flags to test CPU type.
const uint32_t HS_EFL_AC = 0x40000;
const uint32_t HS_EFL_ID = 0x200000;
const uint32_t HS_EFL_AC = 0x40000;
const uint32_t HS_EFL_ID = 0x200000;
// Values for when we don't have a CPUID instruction.
const int CPU_FAMILY_SHIFT = 8;
const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT);
const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT);
const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done, wrapup;
Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check;
StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
# define __ _masm->
@ -241,53 +242,6 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ movl(Address(rsi, 0), rax);
__ movl(Address(rsi, 4), rdx);
__ andl(rax, 0x6); // xcr0 bits sse | ymm
__ cmpl(rax, 0x6);
__ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported
//
// Some OSs have a bug when upper 128bits of YMM
// registers are not restored after a signal processing.
// Generate SEGV here (reference through NULL)
// and check upper YMM bits after it.
//
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
intx saved_useavx = UseAVX;
intx saved_usesse = UseSSE;
UseAVX = 1;
UseSSE = 2;
// load value into all 32 bytes of ymm7 register
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ pshufd(xmm0, xmm0, 0x00);
__ vinsertf128h(xmm0, xmm0, xmm0);
__ vmovdqu(xmm7, xmm0);
#ifdef _LP64
__ vmovdqu(xmm8, xmm0);
__ vmovdqu(xmm15, xmm0);
#endif
__ xorl(rsi, rsi);
VM_Version::set_cpuinfo_segv_addr( __ pc() );
// Generate SEGV
__ movl(rax, Address(rsi, 0));
VM_Version::set_cpuinfo_cont_addr( __ pc() );
// Returns here after signal. Save xmm0 to check it later.
__ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
__ vmovdqu(Address(rsi, 0), xmm0);
__ vmovdqu(Address(rsi, 32), xmm7);
#ifdef _LP64
__ vmovdqu(Address(rsi, 64), xmm8);
__ vmovdqu(Address(rsi, 96), xmm15);
#endif
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;
UseSSE = saved_usesse;
//
// cpuid(0x7) Structured Extended Features
//
@ -364,9 +318,143 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ movl(Address(rsi,12), rdx);
//
// return
// Check if OS has enabled XGETBV instruction to access XCR0
// (OSXSAVE feature flag) and CPU supports AVX
//
__ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
__ movl(rcx, 0x18000000); // cpuid1 bits osxsave | avx
__ andl(rcx, Address(rsi, 8)); // cpuid1 bits osxsave | avx
__ cmpl(rcx, 0x18000000);
__ jccb(Assembler::notEqual, done); // jump if AVX is not supported
__ movl(rax, 0x6);
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
__ cmpl(rax, 0x6);
__ jccb(Assembler::equal, start_simd_check); // jump to the SIMD check if AVX is supported
// we need to bridge farther than imm8, so we use this island as a thunk
__ bind(done);
__ jmp(wrapup);
__ bind(start_simd_check);
//
// Some OSs have a bug when upper 128/256bits of YMM/ZMM
// registers are not restored after a signal processing.
// Generate SEGV here (reference through NULL)
// and check upper YMM/ZMM bits after it.
//
intx saved_useavx = UseAVX;
intx saved_usesse = UseSSE;
// check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
__ movl(rax, 0x10000);
__ andl(rax, Address(rsi, 4)); // sef_cpuid7 ebx bit 16: avx512f
__ cmpl(rax, 0x10000);
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
// check _cpuid_info.xem_xcr0_eax.bits.opmask
// check _cpuid_info.xem_xcr0_eax.bits.zmm512
// check _cpuid_info.xem_xcr0_eax.bits.zmm32
__ movl(rax, 0xE0);
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
__ cmpl(rax, 0xE0);
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
// EVEX setup: run in lowest evex mode
VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
UseAVX = 3;
UseSSE = 2;
// load value into all 64 bytes of zmm7 register
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ movl(rcx, 0xffff);
#ifdef _LP64
__ kmovql(k1, rcx);
#else
__ kmovdl(k1, rcx);
#endif
__ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
__ jmp(save_restore_except);
__ bind(legacy_setup);
// AVX setup
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
UseAVX = 1;
UseSSE = 2;
// load value into all 32 bytes of ymm7 register
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ pshufd(xmm0, xmm0, 0x00);
__ vinsertf128h(xmm0, xmm0, xmm0);
__ vmovdqu(xmm7, xmm0);
#ifdef _LP64
__ vmovdqu(xmm8, xmm0);
__ vmovdqu(xmm15, xmm0);
#endif
VM_Version::clean_cpuFeatures();
__ bind(save_restore_except);
__ xorl(rsi, rsi);
VM_Version::set_cpuinfo_segv_addr(__ pc());
// Generate SEGV
__ movl(rax, Address(rsi, 0));
VM_Version::set_cpuinfo_cont_addr(__ pc());
// Returns here after signal. Save xmm0 to check it later.
// check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
__ movl(rax, 0x10000);
__ andl(rax, Address(rsi, 4));
__ cmpl(rax, 0x10000);
__ jccb(Assembler::notEqual, legacy_save_restore);
// check _cpuid_info.xem_xcr0_eax.bits.opmask
// check _cpuid_info.xem_xcr0_eax.bits.zmm512
// check _cpuid_info.xem_xcr0_eax.bits.zmm32
__ movl(rax, 0xE0);
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
__ cmpl(rax, 0xE0);
__ jccb(Assembler::notEqual, legacy_save_restore);
// EVEX check: run in lowest evex mode
VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
UseAVX = 3;
UseSSE = 2;
__ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
__ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;
UseSSE = saved_usesse;
__ jmp(wrapup);
__ bind(legacy_save_restore);
// AVX check
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
UseAVX = 1;
UseSSE = 2;
__ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
__ vmovdqu(Address(rsi, 0), xmm0);
__ vmovdqu(Address(rsi, 32), xmm7);
#ifdef _LP64
__ vmovdqu(Address(rsi, 64), xmm8);
__ vmovdqu(Address(rsi, 96), xmm15);
#endif
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;
UseSSE = saved_usesse;
__ bind(wrapup);
__ popf();
__ pop(rsi);
__ pop(rbx);
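The generated checks reduce to the following host-side logic; a standalone sketch using GCC/Clang builtins (assumed available with -mxsave; the stub performs the same tests with emitted cpuid/xgetbv):

#include <cpuid.h>
#include <immintrin.h>

static bool os_supports_evex() {
  unsigned a, b, c, d;
  if (!__get_cpuid(1, &a, &b, &c, &d)) return false;
  if ((c & 0x18000000u) != 0x18000000u) return false; // OSXSAVE | AVX
  unsigned long long xcr0 = _xgetbv(0);
  if ((xcr0 & 0x6) != 0x6) return false;              // SSE and YMM state enabled
  if (!__get_cpuid_count(7, 0, &a, &b, &c, &d)) return false;
  if ((b & 0x10000u) == 0) return false;              // cpuid7 ebx bit 16: avx512f
  return (xcr0 & 0xE0) == 0xE0;                       // opmask | zmm512 | zmm32
}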
@ -459,6 +547,29 @@ void VM_Version::get_processor_features() {
if (UseSSE < 1)
_cpuFeatures &= ~CPU_SSE;
// first try initial setting and detect what we can support
if (UseAVX > 0) {
if (UseAVX > 2 && supports_evex()) {
UseAVX = 3;
} else if (UseAVX > 1 && supports_avx2()) {
UseAVX = 2;
} else if (UseAVX > 0 && supports_avx()) {
UseAVX = 1;
} else {
UseAVX = 0;
}
} else if (UseAVX < 0) {
UseAVX = 0;
}
if (UseAVX < 3) {
_cpuFeatures &= ~CPU_AVX512F;
_cpuFeatures &= ~CPU_AVX512DQ;
_cpuFeatures &= ~CPU_AVX512CD;
_cpuFeatures &= ~CPU_AVX512BW;
_cpuFeatures &= ~CPU_AVX512VL;
}
if (UseAVX < 2)
_cpuFeatures &= ~CPU_AVX2;
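Condensed, the cascade resolves the requested level against what the hardware reports (equivalent sketch):

UseAVX = (UseAVX <= 0)                    ? 0
       : (UseAVX > 2 && supports_evex()) ? 3
       : (UseAVX > 1 && supports_avx2()) ? 2
       : supports_avx()                  ? 1
                                         : 0;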
@ -474,7 +585,7 @@ void VM_Version::get_processor_features() {
}
char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@ -504,7 +615,8 @@ void VM_Version::get_processor_features() {
(supports_tscinv() ? ", tscinv": ""),
(supports_bmi1() ? ", bmi1" : ""),
(supports_bmi2() ? ", bmi2" : ""),
(supports_adx() ? ", adx" : ""));
(supports_adx() ? ", adx" : ""),
(supports_evex() ? ", evex" : ""));
_features_str = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@ -521,13 +633,6 @@ void VM_Version::get_processor_features() {
if (!supports_sse ()) // Drop to 0 if no SSE support
UseSSE = 0;
if (UseAVX > 2) UseAVX=2;
if (UseAVX < 0) UseAVX=0;
if (!supports_avx2()) // Drop to 1 if no AVX2 support
UseAVX = MIN2((intx)1,UseAVX);
if (!supports_avx ()) // Drop to 0 if no AVX support
UseAVX = 0;
// Use AES instructions if available.
if (supports_aes()) {
if (FLAG_IS_DEFAULT(UseAES)) {
@ -598,7 +703,8 @@ void VM_Version::get_processor_features() {
if ((_model == CPU_MODEL_HASWELL_E3) ||
(_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) ||
(_model == CPU_MODEL_BROADWELL && _stepping < 4)) {
if (!UnlockExperimentalVMOptions) {
// currently a collision between SKL and HSW_E3
if (!UnlockExperimentalVMOptions && UseAVX < 3) {
vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag.");
} else {
warning("UseRTMLocking is only available as experimental option on this platform.");
@ -651,10 +757,10 @@ void VM_Version::get_processor_features() {
if (MaxVectorSize > 0) {
if (!is_power_of_2(MaxVectorSize)) {
warning("MaxVectorSize must be a power of 2");
FLAG_SET_DEFAULT(MaxVectorSize, 32);
FLAG_SET_DEFAULT(MaxVectorSize, 64);
}
if (MaxVectorSize > 32) {
FLAG_SET_DEFAULT(MaxVectorSize, 32);
if (MaxVectorSize > 64) {
FLAG_SET_DEFAULT(MaxVectorSize, 64);
}
if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
// 32 bytes vectors (in YMM) are only supported with AVX+
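Net effect of the clamping (hypothetical helper; names assumed, and the last branch continues past this hunk):

static intx clamped_max_vector_size(intx requested) {
  if (!is_power_of_2(requested)) requested = 64;  // warn, then take the default
  if (requested > 64)            requested = 64;  // 512-bit ceiling with EVEX
  if (requested > 16 && (UseAVX == 0 || !os_supports_avx_vectors()))
    requested = 16;                               // YMM/ZMM need AVX plus OS support
  return requested;
}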


@ -208,20 +208,33 @@ public:
bmi2 : 1,
erms : 1,
: 1,
rtm : 1,
: 7,
adx : 1,
: 12;
rtm : 1,
: 4,
avx512f : 1,
avx512dq : 1,
: 1,
adx : 1,
: 6,
avx512pf : 1,
avx512er : 1,
avx512cd : 1,
: 1,
avx512bw : 1,
avx512vl : 1;
} bits;
};
union XemXcr0Eax {
uint32_t value;
struct {
uint32_t x87 : 1,
sse : 1,
ymm : 1,
: 29;
uint32_t x87 : 1,
sse : 1,
ymm : 1,
: 2,
opmask : 1,
zmm512 : 1,
zmm32 : 1,
: 24;
} bits;
};
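Usage sketch for the widened union (illustrative values):

VM_Version::XemXcr0Eax x;
x.value = 0xE7;  // x87 | sse | ymm | opmask | zmm512 | zmm32
bool evex_state_ok = x.bits.opmask && x.bits.zmm512 && x.bits.zmm32;  // true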
@ -229,43 +242,51 @@ protected:
static int _cpu;
static int _model;
static int _stepping;
static int _cpuFeatures; // features returned by the "cpuid" instruction
// 0 if this instruction is not available
static uint64_t _cpuFeatures; // features returned by the "cpuid" instruction
// 0 if this instruction is not available
static const char* _features_str;
static address _cpuinfo_segv_addr; // address of instruction which causes SEGV
static address _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV
enum {
CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX)
CPU_CMOV = (1 << 1),
CPU_FXSR = (1 << 2),
CPU_HT = (1 << 3),
CPU_MMX = (1 << 4),
CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
// may not necessarily support other 3dnow instructions
CPU_SSE = (1 << 6),
CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
CPU_SSSE3 = (1 << 9),
CPU_SSE4A = (1 << 10),
CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12),
CPU_POPCNT = (1 << 13),
CPU_LZCNT = (1 << 14),
CPU_TSC = (1 << 15),
CPU_TSCINV = (1 << 16),
CPU_AVX = (1 << 17),
CPU_AVX2 = (1 << 18),
CPU_AES = (1 << 19),
CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
CPU_BMI1 = (1 << 22),
CPU_BMI2 = (1 << 23),
CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
CPU_ADX = (1 << 25)
CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX)
CPU_CMOV = (1 << 1),
CPU_FXSR = (1 << 2),
CPU_HT = (1 << 3),
CPU_MMX = (1 << 4),
CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
// may not necessarily support other 3dnow instructions
CPU_SSE = (1 << 6),
CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
CPU_SSSE3 = (1 << 9),
CPU_SSE4A = (1 << 10),
CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12),
CPU_POPCNT = (1 << 13),
CPU_LZCNT = (1 << 14),
CPU_TSC = (1 << 15),
CPU_TSCINV = (1 << 16),
CPU_AVX = (1 << 17),
CPU_AVX2 = (1 << 18),
CPU_AES = (1 << 19),
CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
CPU_BMI1 = (1 << 22),
CPU_BMI2 = (1 << 23),
CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
CPU_ADX = (1 << 25),
CPU_AVX512F = (1 << 26), // AVX 512bit foundation instructions
CPU_AVX512DQ = (1 << 27),
CPU_AVX512PF = (1 << 28),
CPU_AVX512ER = (1 << 29),
CPU_AVX512CD = (1 << 30),
CPU_AVX512BW = (1 << 31)
} cpuFeatureFlags;
#define CPU_AVX512VL 0x100000000 // EVEX instructions with smaller vector length; enums are limited to 32-bit
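Because the enum is 32-bit, CPU_AVX512VL lives in a #define; with _cpuFeatures widened to uint64_t both kinds of flag are tested the same way (sketch):

uint64_t features = CPU_AVX512F | (uint64_t)CPU_AVX512VL;
bool has_vl = (features & CPU_AVX512VL) != 0;  // true; bit 32 survives in 64 bits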
enum {
// AMD
CPU_FAMILY_AMD_11H = 0x11,
@ -282,7 +303,8 @@ protected:
CPU_MODEL_IVYBRIDGE_EP = 0x3a,
CPU_MODEL_HASWELL_E3 = 0x3c,
CPU_MODEL_HASWELL_E7 = 0x3f,
CPU_MODEL_BROADWELL = 0x3d
CPU_MODEL_BROADWELL = 0x3d,
CPU_MODEL_SKYLAKE = CPU_MODEL_HASWELL_E3
} cpuExtendedFamily;
// cpuid information block. All info derived from executing cpuid with
@ -376,6 +398,9 @@ protected:
// Space to save ymm registers after signal handle
int ymm_save[8*4]; // Save ymm0, ymm7, ymm8, ymm15
// Space to save zmm registers after signal handle
int zmm_save[16*4]; // Save zmm0, zmm7, zmm8, zmm31
};
// The actual cpuid info block
@ -404,8 +429,8 @@ protected:
return result;
}
static uint32_t feature_flags() {
uint32_t result = 0;
static uint64_t feature_flags() {
uint64_t result = 0;
if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0)
result |= CPU_CX8;
if (_cpuid_info.std_cpuid1_edx.bits.cmov != 0)
@ -440,6 +465,24 @@ protected:
result |= CPU_AVX;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
result |= CPU_AVX2;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&
_cpuid_info.xem_xcr0_eax.bits.opmask != 0 &&
_cpuid_info.xem_xcr0_eax.bits.zmm512 != 0 &&
_cpuid_info.xem_xcr0_eax.bits.zmm32 != 0) {
result |= CPU_AVX512F;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512cd != 0)
result |= CPU_AVX512CD;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
result |= CPU_AVX512DQ;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
result |= CPU_AVX512PF;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)
result |= CPU_AVX512ER;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512bw != 0)
result |= CPU_AVX512BW;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
result |= CPU_AVX512VL;
}
}
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
result |= CPU_BMI1;
@ -484,18 +527,31 @@ protected:
}
static bool os_supports_avx_vectors() {
if (!supports_avx()) {
return false;
}
// Verify that OS save/restore all bits of AVX registers
// during signal processing.
int nreg = 2 LP64_ONLY(+2);
for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
return false;
bool retVal = false;
if (supports_evex()) {
// Verify that OS save/restore all bits of EVEX registers
// during signal processing.
int nreg = 2 LP64_ONLY(+2);
retVal = true;
for (int i = 0; i < 16 * nreg; i++) { // 64 bytes per zmm register
if (_cpuid_info.zmm_save[i] != ymm_test_value()) {
retVal = false;
break;
}
}
} else if (supports_avx()) {
// Verify that OS save/restore all bits of AVX registers
// during signal processing.
int nreg = 2 LP64_ONLY(+2);
retVal = true;
for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
retVal = false;
break;
}
}
}
return true;
return retVal;
}
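Both branches share the same shape; a compact equivalent (sketch, helper name hypothetical):

static bool lanes_match(const int* save, int words) {
  for (int i = 0; i < words; i++) {
    if (save[i] != VM_Version::ymm_test_value()) return false;
  }
  return true;
}
// supports_evex() ? lanes_match(_cpuid_info.zmm_save, 16 * nreg)
//                 : lanes_match(_cpuid_info.ymm_save,  8 * nreg)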
static void get_processor_features();
@ -515,6 +571,7 @@ public:
static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
static ByteSize ymm_save_offset() { return byte_offset_of(CpuidInfo, ymm_save); }
static ByteSize zmm_save_offset() { return byte_offset_of(CpuidInfo, zmm_save); }
// The value used to check ymm register after signal handle
static int ymm_test_value() { return 0xCAFEBABE; }
@ -527,6 +584,7 @@ public:
static void clean_cpuFeatures() { _cpuFeatures = 0; }
static void set_avx_cpuFeatures() { _cpuFeatures = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
static void set_evex_cpuFeatures() { _cpuFeatures = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); }
// Initialization
@ -636,7 +694,14 @@ public:
static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; }
static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; }
static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; }
static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
static bool supports_evex() { return (_cpuFeatures & CPU_AVX512F) != 0; }
static bool supports_avx512dq() { return (_cpuFeatures & CPU_AVX512DQ) != 0; }
static bool supports_avx512pf() { return (_cpuFeatures & CPU_AVX512PF) != 0; }
static bool supports_avx512er() { return (_cpuFeatures & CPU_AVX512ER) != 0; }
static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }


@ -47,13 +47,22 @@ void VMRegImpl::set_regName() {
}
XMMRegister xreg = ::as_XMMRegister(0);
for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
for (int j = 0 ; j < 8 ; j++) {
for (; i < ConcreteRegisterImpl::max_xmm;) {
for (int j = 0 ; j < XMMRegisterImpl::max_slots_per_register ; j++) {
regName[i++] = xreg->name();
}
xreg = xreg->successor();
}
KRegister kreg = ::as_KRegister(0);
for (; i < ConcreteRegisterImpl::max_kpr;) {
for (int j = 0; j < KRegisterImpl::max_slots_per_register; j++) {
regName[i++] = kreg->name();
}
kreg = kreg->successor();
}
for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
regName[i] = "NON-GPR-FPR-XMM";
regName[i] = "NON-GPR-FPR-XMM-KREG";
}
}


@ -36,7 +36,24 @@ inline bool is_FloatRegister() {
}
inline bool is_XMMRegister() {
return value() >= ConcreteRegisterImpl::max_fpr && value() < ConcreteRegisterImpl::max_xmm;
int uarch_max_xmm = ConcreteRegisterImpl::max_xmm;
#ifdef _LP64
if (UseAVX < 3) {
int half_xmm = (XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers) / 2;
uarch_max_xmm -= half_xmm;
}
#endif
return (value() >= ConcreteRegisterImpl::max_fpr && value() < uarch_max_xmm);
}
inline bool is_KRegister() {
if (UseAVX > 2) {
return value() >= ConcreteRegisterImpl::max_xmm && value() < ConcreteRegisterImpl::max_kpr;
} else {
return false;
}
}
inline Register as_Register() {
@ -59,7 +76,13 @@ inline FloatRegister as_FloatRegister() {
inline XMMRegister as_XMMRegister() {
assert( is_XMMRegister() && is_even(value()), "must be" );
// Yuk
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 4);
}
inline KRegister as_KRegister() {
assert(is_KRegister(), "must be");
// Yuk
return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm));
}
inline bool is_concrete() {


@ -39,7 +39,11 @@ inline VMReg FloatRegisterImpl::as_VMReg() {
}
inline VMReg XMMRegisterImpl::as_VMReg() {
return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
return VMRegImpl::as_VMReg((encoding() << 4) + ConcreteRegisterImpl::max_fpr);
}
inline VMReg KRegisterImpl::as_VMReg() {
return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_xmm);
}
#endif // CPU_X86_VM_VMREG_X86_INLINE_HPP
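Round-trip sketch of the encodings above (illustrative): XMM VMRegs are now 16 slots apart, and K VMRegs are packed right after max_xmm.

VMReg x = xmm2->as_VMReg();          // (2 << 4) + ConcreteRegisterImpl::max_fpr
assert(x->as_XMMRegister() == xmm2, "xmm round trip");
VMReg k = k3->as_VMReg();            // ConcreteRegisterImpl::max_xmm + 3
assert(k->as_KRegister() == k3, "k round trip");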


@ -101,6 +101,17 @@ reg_def FPR6L( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg());
reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
//
// Empty fill registers, which are never used, but supply alignment to xmm regs
//
reg_def FILL0( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(2));
reg_def FILL1( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(3));
reg_def FILL2( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(4));
reg_def FILL3( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(5));
reg_def FILL4( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(6));
reg_def FILL5( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(7));
reg_def FILL6( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(8));
reg_def FILL7( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(9));
// Specify priority of register selection within phases of register
// allocation. Highest priority is first. A useful heuristic is to
@ -112,7 +123,8 @@ reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
alloc_class chunk0( ECX, EBX, EBP, EDI, EAX, EDX, ESI, ESP,
FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
FPR6L, FPR6H, FPR7L, FPR7H );
FPR6L, FPR6H, FPR7L, FPR7H,
FILL0, FILL1, FILL2, FILL3, FILL4, FILL5, FILL6, FILL7);
//----------Architecture Description Register Classes--------------------------
@ -131,7 +143,7 @@ reg_class any_reg_with_ebp(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
// Class for all registers (excluding EBP)
reg_class any_reg_no_ebp(EAX, EDX, EDI, ESI, ECX, EBX, ESP);
// Dynamic register class that selects at runtime between register classes
// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer).
// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer).
// Equivalent to: return PreserveFramePointer ? any_no_ebp_reg : any_reg;
reg_class_dynamic any_reg(any_reg_no_ebp, any_reg_with_ebp, %{ PreserveFramePointer %});
@ -279,7 +291,9 @@ static int pre_call_resets_size() {
size += 6; // fldcw
}
if (C->max_vector_size() > 16) {
size += 3; // vzeroupper
if (UseAVX <= 2) {
size += 3; // vzeroupper
}
}
return size;
}
@ -288,7 +302,7 @@ static int pre_call_resets_size() {
// from the start of the call to the point where the return address
// will point.
int MachCallStaticJavaNode::ret_addr_offset() {
return 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
return 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
}
int MachCallDynamicJavaNode::ret_addr_offset() {
@ -767,6 +781,12 @@ static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset
// Helper for XMM registers. Extra opcode bits, limited syntax.
static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
int in_size_in_bits = Assembler::EVEX_32bit;
int evex_encoding = 0;
if (reg_lo+1 == reg_hi) {
in_size_in_bits = Assembler::EVEX_64bit;
evex_encoding = Assembler::VEX_W;
}
if (cbuf) {
MacroAssembler _masm(cbuf);
if (reg_lo+1 == reg_hi) { // double move?
@ -799,7 +819,17 @@ static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
}
#endif
}
int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
bool is_single_byte = false;
if ((UseAVX > 2) && (offset != 0)) {
is_single_byte = Assembler::query_compressed_disp_byte(offset, true, 0, Assembler::EVEX_T1S, in_size_in_bits, evex_encoding);
}
int offset_size = 0;
if (UseAVX > 2) {
offset_size = (offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
} else {
offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
}
size += (UseAVX > 2) ? 2 : 0; // Need an additional two bytes for EVEX
// VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
return size+5+offset_size;
}
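What query_compressed_disp_byte decides, in short: under EVEX a one-byte displacement is scaled by the operand tuple size N (disp8*N), so the short form is available only when the offset is an exact multiple of N whose quotient fits in a signed byte. Sketch (helper name hypothetical; N comes from the tuple type, e.g. 8 for a 64-bit T1S operand):

static bool fits_evex_disp8(int offset, int n) {
  if (offset % n != 0) return false;       // must be a multiple of the tuple size
  int scaled = offset / n;
  return -128 <= scaled && scaled <= 127;  // must fit in a signed byte
}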
@ -835,8 +865,8 @@ static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst
#endif
}
// VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
// Only MOVAPS SSE prefix uses 1 byte.
int sz = 4;
// Only MOVAPS SSE prefix uses 1 byte. EVEX uses an additional 2 bytes.
int sz = (UseAVX > 2) ? 6 : 4;
if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
return size + sz;
@ -854,7 +884,7 @@ static int impl_movgpr2x_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int
st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
#endif
}
return 4;
return (UseAVX > 2) ? 6 : 4;
}
@ -870,7 +900,7 @@ static int impl_movx2gpr_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int
st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
#endif
}
return 4;
return (UseAVX > 2) ? 6 : 4;
}
static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
@ -941,9 +971,8 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
calc_size += 3+src_offset_size + 3+dst_offset_size;
break;
case Op_VecX:
calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
break;
case Op_VecY:
case Op_VecZ:
calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
break;
default:
@ -974,6 +1003,11 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
__ vmovdqu(xmm0, Address(rsp, src_offset));
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
break;
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();
@ -1009,6 +1043,12 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
"vmovdqu [rsp + #%d], xmm0\n\t"
"vmovdqu xmm0, [rsp - #32]",
src_offset, dst_offset);
break;
case Op_VecZ:
st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
"vmovdqu xmm0, [rsp + #%d]\n\t"
"vmovdqu [rsp + #%d], xmm0\n\t"
"vmovdqu xmm0, [rsp - #64]",
src_offset, dst_offset);
break;
default:
ShouldNotReachHere();
@ -1042,7 +1082,7 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo
uint ireg = ideal_reg();
assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
// mem -> mem
int src_offset = ra_->reg2offset(src_first);
@ -3998,7 +4038,7 @@ operand regFPR1(regFPR reg) %{
// XMM Float register operands
operand regF() %{
predicate( UseSSE>=1 );
constraint(ALLOC_IN_RC(float_reg));
constraint(ALLOC_IN_RC(float_reg_legacy));
match(RegF);
format %{ %}
interface(REG_INTER);
@ -4007,12 +4047,45 @@ operand regF() %{
// XMM Double register operands
operand regD() %{
predicate( UseSSE>=2 );
constraint(ALLOC_IN_RC(double_reg));
constraint(ALLOC_IN_RC(double_reg_legacy));
match(RegD);
format %{ %}
interface(REG_INTER);
%}
// Vectors : note, we use legacy registers to avoid extra (unneeded in 32-bit VM)
// runtime code generation via reg_class_dynamic.
operand vecS() %{
constraint(ALLOC_IN_RC(vectors_reg_legacy));
match(VecS);
format %{ %}
interface(REG_INTER);
%}
operand vecD() %{
constraint(ALLOC_IN_RC(vectord_reg_legacy));
match(VecD);
format %{ %}
interface(REG_INTER);
%}
operand vecX() %{
constraint(ALLOC_IN_RC(vectorx_reg_legacy));
match(VecX);
format %{ %}
interface(REG_INTER);
%}
operand vecY() %{
constraint(ALLOC_IN_RC(vectory_reg_legacy));
match(VecY);
format %{ %}
interface(REG_INTER);
%}
//----------Memory Operands----------------------------------------------------
// Direct Memory Operand
@ -5020,11 +5093,11 @@ instruct bytes_reverse_unsigned_short(rRegI dst, eFlagsReg cr) %{
match(Set dst (ReverseBytesUS dst));
effect(KILL cr);
format %{ "BSWAP $dst\n\t"
format %{ "BSWAP $dst\n\t"
"SHR $dst,16\n\t" %}
ins_encode %{
__ bswapl($dst$$Register);
__ shrl($dst$$Register, 16);
__ shrl($dst$$Register, 16);
%}
ins_pipe( ialu_reg );
%}
@ -5033,11 +5106,11 @@ instruct bytes_reverse_short(rRegI dst, eFlagsReg cr) %{
match(Set dst (ReverseBytesS dst));
effect(KILL cr);
format %{ "BSWAP $dst\n\t"
format %{ "BSWAP $dst\n\t"
"SAR $dst,16\n\t" %}
ins_encode %{
__ bswapl($dst$$Register);
__ sarl($dst$$Register, 16);
__ sarl($dst$$Register, 16);
%}
ins_pipe( ialu_reg );
%}
@ -6525,7 +6598,7 @@ instruct membar_volatile(eFlagsReg cr) %{
effect(KILL cr);
ins_cost(400);
format %{
format %{
$$template
if (os::is_MP()) {
$$emit$$"LOCK ADDL [ESP + #0], 0\t! membar_volatile"
@ -8288,10 +8361,10 @@ instruct xorI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
// Xor Register with Immediate -1
instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
match(Set dst (XorI dst imm));
match(Set dst (XorI dst imm));
size(2);
format %{ "NOT $dst" %}
format %{ "NOT $dst" %}
ins_encode %{
__ notl($dst$$Register);
%}
@ -8939,7 +9012,7 @@ instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
// Xor Long Register with Immediate -1
instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
match(Set dst (XorL dst imm));
match(Set dst (XorL dst imm));
format %{ "NOT $dst.lo\n\t"
"NOT $dst.hi" %}
ins_encode %{
@ -8994,7 +9067,7 @@ instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{
effect(KILL cr);
ins_cost(100);
format %{ "ADD $dst.lo,$dst.lo\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADD $dst.lo,$dst.lo\n\t"
"ADC $dst.hi,$dst.hi" %}
ins_encode %{
@ -9013,9 +9086,9 @@ instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{
effect(KILL cr);
ins_cost(100);
format %{ "ADD $dst.lo,$dst.lo\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADD $dst.lo,$dst.lo\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADC $dst.hi,$dst.hi\n\t"
"ADD $dst.lo,$dst.lo\n\t"
"ADC $dst.hi,$dst.hi" %}
ins_encode %{
@ -11168,7 +11241,6 @@ instruct convL2I_reg( rRegI dst, eRegL src ) %{
ins_pipe( ialu_reg_reg );
%}
instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
match(Set dst (MoveF2I src));
effect( DEF dst, USE src );
@ -11400,7 +11472,7 @@ instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlag
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
"SHL ECX,1\t# Convert doublewords to words\n\t"
"REP STOS\t# store EAX into [EDI++] while ECX--" %}
ins_encode %{
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
%}
ins_pipe( pipe_slow );
@ -11413,7 +11485,7 @@ instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy,
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
"SHL ECX,3\t# Convert doublewords to bytes\n\t"
"REP STOSB\t# store EAX into [EDI++] while ECX--" %}
ins_encode %{
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
%}
ins_pipe( pipe_slow );


@ -172,7 +172,7 @@ reg_class no_reg();
// Class for all pointer registers (including RSP and RBP)
reg_class any_reg_with_rbp(RAX, RAX_H,
RDX, RDX_H,
RBP, RBP_H,
RBP, RBP_H,
RDI, RDI_H,
RSI, RSI_H,
RCX, RCX_H,
@ -189,7 +189,7 @@ reg_class any_reg_with_rbp(RAX, RAX_H,
// Class for all pointer registers (including RSP, but excluding RBP)
reg_class any_reg_no_rbp(RAX, RAX_H,
RDX, RDX_H,
RDX, RDX_H,
RDI, RDI_H,
RSI, RSI_H,
RCX, RCX_H,
@ -205,10 +205,10 @@ reg_class any_reg_no_rbp(RAX, RAX_H,
R15, R15_H);
// Dynamic register class that selects at runtime between register classes
// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer).
// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer).
// Equivalent to: return PreserveFramePointer ? any_reg_no_rbp : any_reg_with_rbp;
reg_class_dynamic any_reg(any_reg_no_rbp, any_reg_with_rbp, %{ PreserveFramePointer %});
// Class for all pointer registers (excluding RSP)
reg_class ptr_reg_with_rbp(RAX, RAX_H,
RDX, RDX_H,
@ -226,7 +226,7 @@ reg_class ptr_reg_with_rbp(RAX, RAX_H,
// Class for all pointer registers (excluding RSP and RBP)
reg_class ptr_reg_no_rbp(RAX, RAX_H,
RDX, RDX_H,
RDX, RDX_H,
RDI, RDI_H,
RSI, RSI_H,
RCX, RCX_H,
@ -536,7 +536,11 @@ source %{
#define __ _masm.
static int clear_avx_size() {
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
if (UseAVX > 2) {
return 0; // vzeroupper is ignored
} else {
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
}
}
// !!!!! Special hack to get all types of calls to specify the byte offset
@ -545,7 +549,7 @@ static int clear_avx_size() {
int MachCallStaticJavaNode::ret_addr_offset()
{
int offset = 5; // 5 bytes from start of call to where return address points
offset += clear_avx_size();
offset += clear_avx_size();
return offset;
}
@ -860,7 +864,7 @@ void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
st->print("subq rsp, #%d\t# Create frame",framesize);
st->print("\n\t");
framesize -= wordSize;
st->print("movq [rsp + #%d], rbp\t# Save rbp",framesize);
st->print("movq [rsp + #%d], rbp\t# Save rbp",framesize);
if (PreserveFramePointer) {
st->print("\n\t");
st->print("movq rbp, [rsp + #%d]\t# Save the caller's SP into rbp", (framesize + wordSize));
@ -1070,6 +1074,11 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
__ vmovdqu(xmm0, Address(rsp, src_offset));
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
break;
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();
@ -1103,6 +1112,13 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
"vmovdqu xmm0, [rsp - #32]",
src_offset, dst_offset);
break;
case Op_VecZ:
st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
"vmovdqu xmm0, [rsp + #%d]\n\t"
"vmovdqu [rsp + #%d], xmm0\n\t"
"vmovdqu xmm0, [rsp - #64]",
src_offset, dst_offset);
break;
default:
ShouldNotReachHere();
}
@ -1136,7 +1152,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
if (bottom_type()->isa_vect() != NULL) {
uint ireg = ideal_reg();
assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
// mem -> mem
int src_offset = ra_->reg2offset(src_first);
@ -1573,7 +1589,7 @@ uint MachUEPNode::size(PhaseRegAlloc* ra_) const
return MachNode::size(ra_); // too many variables; just compute it
// the hard way
}
//=============================================================================
@ -2832,7 +2848,7 @@ frame
RAX_H_num // Op_RegL
};
// Excluded flags and vector registers.
assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type");
assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type");
return OptoRegPair(hi[ideal_reg], lo[ideal_reg]);
%}
%}
@ -3335,7 +3351,7 @@ operand no_rax_rdx_RegI()
// Pointer Register
operand any_RegP()
%{
constraint(ALLOC_IN_RC(any_reg));
constraint(ALLOC_IN_RC(any_reg));
match(RegP);
match(rax_RegP);
match(rbx_RegP);
@ -3589,20 +3605,51 @@ operand rFlagsRegUCF() %{
%}
// Float register operands
operand regF()
%{
constraint(ALLOC_IN_RC(float_reg));
match(RegF);
operand regF() %{
constraint(ALLOC_IN_RC(float_reg));
match(RegF);
format %{ %}
interface(REG_INTER);
%}
// Double register operands
operand regD() %{
constraint(ALLOC_IN_RC(double_reg));
match(RegD);
format %{ %}
interface(REG_INTER);
%}
// Vectors
operand vecS() %{
constraint(ALLOC_IN_RC(vectors_reg));
match(VecS);
format %{ %}
interface(REG_INTER);
%}
// Double register operands
operand regD()
%{
constraint(ALLOC_IN_RC(double_reg));
match(RegD);
operand vecD() %{
constraint(ALLOC_IN_RC(vectord_reg));
match(VecD);
format %{ %}
interface(REG_INTER);
%}
operand vecX() %{
constraint(ALLOC_IN_RC(vectorx_reg));
match(VecX);
format %{ %}
interface(REG_INTER);
%}
operand vecY() %{
constraint(ALLOC_IN_RC(vectory_reg));
match(VecY);
format %{ %}
interface(REG_INTER);
@ -4947,7 +4994,7 @@ instruct loadI2L_immU31(rRegL dst, memory mem, immU31 mask, rFlagsReg cr) %{
%}
// Load Unsigned Integer into Long Register
instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask)
instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask)
%{
match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
@ -10374,7 +10421,7 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
format %{ "xorq rax, rax\t# ClearArray:\n\t"
"rep stosq\t# Store rax to *rdi++ while rcx--" %}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
%}
ins_pipe(pipe_slow);
@ -10389,7 +10436,7 @@ instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dum
format %{ "xorq rax, rax\t# ClearArray:\n\t"
"shlq rcx,3\t# Convert doublewords to bytes\n\t"
"rep stosb\t# Store rax to *rdi++ while rcx--" %}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
%}
ins_pipe( pipe_slow );

View File

@ -929,6 +929,7 @@ const char *ArchDesc::getIdealType(const char *idealOp) {
case 'D': return "TypeVect::VECTD";
case 'X': return "TypeVect::VECTX";
case 'Y': return "TypeVect::VECTY";
case 'Z': return "TypeVect::VECTZ";
default:
internal_err("Vector type %s with unrecognized type\n",idealOp);
}

View File

@ -3919,6 +3919,7 @@ bool MatchRule::is_base_register(FormDict &globals) const {
strcmp(opType,"VecD")==0 ||
strcmp(opType,"VecX")==0 ||
strcmp(opType,"VecY")==0 ||
strcmp(opType,"VecZ")==0 ||
strcmp(opType,"Reg" )==0) ) {
return 1;
}
@ -4048,6 +4049,7 @@ int MatchRule::is_expensive() const {
strcmp(opType,"AddReductionVF")==0 ||
strcmp(opType,"AddReductionVD")==0 ||
strcmp(opType,"MulReductionVI")==0 ||
strcmp(opType,"MulReductionVL")==0 ||
strcmp(opType,"MulReductionVF")==0 ||
strcmp(opType,"MulReductionVD")==0 ||
0 /* 0 to line up columns nicely */ )
@ -4139,12 +4141,12 @@ bool MatchRule::is_vector() const {
static const char *vector_list[] = {
"AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
"SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
"MulVS","MulVI","MulVF","MulVD",
"MulVS","MulVI","MulVL","MulVF","MulVD",
"DivVF","DivVD",
"AndV" ,"XorV" ,"OrV",
"AddReductionVI", "AddReductionVL",
"AddReductionVF", "AddReductionVD",
"MulReductionVI",
"MulReductionVI", "MulReductionVL",
"MulReductionVF", "MulReductionVD",
"LShiftCntV","RShiftCntV",
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",

View File

@ -1290,7 +1290,8 @@ void LinearScan::build_intervals() {
#ifdef X86
}
if (UseSSE > 0) {
for (i = 0; i < FrameMap::nof_caller_save_xmm_regs; i++) {
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
for (i = 0; i < num_caller_save_xmm_regs; i ++) {
LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(i);
assert(opr->is_valid() && opr->is_register(), "FrameMap should not return invalid operands");
assert(reg_numHi(opr) == -1, "missing addition of range for hi-register");
@ -2098,7 +2099,13 @@ LIR_Opr LinearScan::calc_operand_for_interval(const Interval* interval) {
case T_FLOAT: {
#ifdef X86
if (UseSSE >= 1) {
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
int last_xmm_reg = pd_last_xmm_reg;
#ifdef _LP64
if (UseAVX < 3) {
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
}
#endif
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
assert(interval->assigned_regHi() == any_reg, "must not have hi register");
return LIR_OprFact::single_xmm(assigned_reg - pd_first_xmm_reg);
}
@ -2112,7 +2119,13 @@ LIR_Opr LinearScan::calc_operand_for_interval(const Interval* interval) {
case T_DOUBLE: {
#ifdef X86
if (UseSSE >= 2) {
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
int last_xmm_reg = pd_last_xmm_reg;
#ifdef _LP64
if (UseAVX < 3) {
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
}
#endif
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
assert(interval->assigned_regHi() == any_reg, "must not have hi register (double xmm values are stored in one register)");
return LIR_OprFact::double_xmm(assigned_reg - pd_first_xmm_reg);
}
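
The repeated UseAVX < 3 guard computes the last allocatable xmm register: the frame map sizes its xmm area for the full AVX-512 set, but below AVX-512 only the lower half (xmm0..xmm15 on LP64) exists. A sketch of the arithmetic (the value 32 for pd_nof_xmm_regs_frame_map and the base index are assumptions for x86_64, not taken from this diff):

// Illustrative only: mirrors the guard above under the assumption that the
// LP64 frame map sizes its xmm area for 32 registers.
const int pd_nof_xmm_regs_frame_map = 32;  // assumed AVX-512 register count
const int pd_first_xmm_reg = 0;            // illustrative base index

int last_xmm_reg(int use_avx) {
  if (use_avx < 3) {
    // Only xmm0..xmm15 exist pre-AVX-512: halve the range.
    return pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;   // 15
  }
  return pd_first_xmm_reg + pd_nof_xmm_regs_frame_map - 1;           // 31
}
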
@ -3600,7 +3613,8 @@ void RegisterVerifier::process_operations(LIR_List* ops, IntervalList* input_sta
}
#ifdef X86
for (j = 0; j < FrameMap::nof_caller_save_xmm_regs; j++) {
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
for (j = 0; j < num_caller_save_xmm_regs; j++) {
state_put(input_state, reg_num(FrameMap::caller_save_xmm_reg_at(j)), NULL);
}
#endif
@ -4514,12 +4528,20 @@ void Interval::print(outputStream* out) const {
if (reg_num() < LIR_OprDesc::vreg_base) {
type_name = "fixed";
// need a temporary operand for fixed intervals because type() cannot be called
#ifdef X86
int last_xmm_reg = pd_last_xmm_reg;
#ifdef _LP64
if (UseAVX < 3) {
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
}
#endif
#endif
if (assigned_reg() >= pd_first_cpu_reg && assigned_reg() <= pd_last_cpu_reg) {
opr = LIR_OprFact::single_cpu(assigned_reg());
} else if (assigned_reg() >= pd_first_fpu_reg && assigned_reg() <= pd_last_fpu_reg) {
opr = LIR_OprFact::single_fpu(assigned_reg() - pd_first_fpu_reg);
#ifdef X86
} else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= pd_last_xmm_reg) {
} else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= last_xmm_reg) {
opr = LIR_OprFact::single_xmm(assigned_reg() - pd_first_xmm_reg);
#endif
} else {

View File

@ -96,7 +96,7 @@
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
"Align a loop if padding size in bytes is less or equal to this value") \
\
product(intx, MaxVectorSize, 32, \
product(intx, MaxVectorSize, 64, \
"Max vector size in bytes, " \
"actual size could be less depending on elements type") \
\
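
The new default of 64 bytes corresponds exactly to one 512-bit zmm register, the largest shape the matcher will now ask about via vector_size_supported(T_FLOAT, 16). The correspondence, as a compile-time check (illustrative, not HotSpot code):

constexpr int max_vector_size_bytes = 64;              // new default above
constexpr int vecz_elems            = 16;              // floats per VecZ
static_assert(vecz_elems * 4 == max_vector_size_bytes, // 4 bytes per float
              "MaxVectorSize 64 == one 512-bit register");
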

View File

@ -907,6 +907,13 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
lrg.set_num_regs(RegMask::SlotsPerVecY);
lrg.set_reg_pressure(1);
break;
case Op_VecZ:
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecZ), "sanity");
assert(RegMask::num_registers(Op_VecZ) == RegMask::SlotsPerVecZ, "sanity");
assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecZ), "vector should be aligned");
lrg.set_num_regs(RegMask::SlotsPerVecZ);
lrg.set_reg_pressure(1);
break;
default:
ShouldNotReachHere();
}
@ -1514,7 +1521,7 @@ uint PhaseChaitin::Select( ) {
int n_regs = lrg->num_regs();
assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
if (n_regs == 1 || !lrg->_fat_proj) {
assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity");
lrg->Clear(); // Clear the mask
lrg->Insert(reg); // Set regmask to match selected reg
// For vectors and pairs, also insert the low bit of the pair

View File

@ -141,7 +141,7 @@ public:
// Number of registers this live range uses when it colors
private:
uint8_t _num_regs; // 2 for Longs and Doubles, 1 for all else
uint16_t _num_regs; // 2 for Longs and Doubles, 1 for all else
// except _num_regs is kill count for fat_proj
public:
int num_regs() const { return _num_regs; }
@ -150,7 +150,7 @@ public:
private:
// Number of physical registers this live range uses when it colors
// Architecture and register-set dependent
uint8_t _reg_pressure;
uint16_t _reg_pressure;
public:
void set_reg_pressure(int i) { _reg_pressure = i; }
int reg_pressure() const { return _reg_pressure; }
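
The widening from uint8_t is forced by the kill-count use noted in the comment: a fat_proj live range can kill every slot of every vector register, and with 32 xmm registers of 16 slots each that count reaches 512, past the uint8_t range. A small compile-time check of that arithmetic (the register counts are assumptions for x86_64 with AVX-512):

#include <cstdint>

constexpr int xmm_regs       = 32;  // assumed: xmm0..xmm31 with AVX-512
constexpr int slots_per_vecz = 16;  // 64 bytes in 4-byte mask slots
constexpr int max_kill       = xmm_regs * slots_per_vecz;  // 512

static_assert(max_kill > UINT8_MAX,   "kill count no longer fits in uint8_t");
static_assert(max_kill <= UINT16_MAX, "uint16_t is wide enough");
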

View File

@ -282,6 +282,8 @@ macro(SubVD)
macro(MulVS)
macro(MulVI)
macro(MulReductionVI)
macro(MulVL)
macro(MulReductionVL)
macro(MulVF)
macro(MulReductionVF)
macro(MulVD)

View File

@ -3110,6 +3110,7 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
case Op_AddReductionVF:
case Op_AddReductionVD:
case Op_MulReductionVI:
case Op_MulReductionVL:
case Op_MulReductionVF:
case Op_MulReductionVD:
break;

View File

@ -83,6 +83,7 @@ Matcher::Matcher()
idealreg2spillmask [Op_VecD] = NULL;
idealreg2spillmask [Op_VecX] = NULL;
idealreg2spillmask [Op_VecY] = NULL;
idealreg2spillmask [Op_VecZ] = NULL;
idealreg2debugmask [Op_RegI] = NULL;
idealreg2debugmask [Op_RegN] = NULL;
@ -94,6 +95,7 @@ Matcher::Matcher()
idealreg2debugmask [Op_VecD] = NULL;
idealreg2debugmask [Op_VecX] = NULL;
idealreg2debugmask [Op_VecY] = NULL;
idealreg2debugmask [Op_VecZ] = NULL;
idealreg2mhdebugmask[Op_RegI] = NULL;
idealreg2mhdebugmask[Op_RegN] = NULL;
@ -105,6 +107,7 @@ Matcher::Matcher()
idealreg2mhdebugmask[Op_VecD] = NULL;
idealreg2mhdebugmask[Op_VecX] = NULL;
idealreg2mhdebugmask[Op_VecY] = NULL;
idealreg2mhdebugmask[Op_VecZ] = NULL;
debug_only(_mem_node = NULL;) // Ideal memory node consumed by mach node
}
@ -413,7 +416,7 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) {
void Matcher::init_first_stack_mask() {
// Allocate storage for spill masks as masks for the appropriate load type.
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5));
idealreg2spillmask [Op_RegN] = &rms[0];
idealreg2spillmask [Op_RegI] = &rms[1];
@ -440,6 +443,7 @@ void Matcher::init_first_stack_mask() {
idealreg2spillmask [Op_VecD] = &rms[19];
idealreg2spillmask [Op_VecX] = &rms[20];
idealreg2spillmask [Op_VecY] = &rms[21];
idealreg2spillmask [Op_VecZ] = &rms[22];
OptoReg::Name i;
@ -523,6 +527,18 @@ void Matcher::init_first_stack_mask() {
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
}
if (Matcher::vector_size_supported(T_FLOAT,16)) {
// For VecZ we need enough alignment and 64 bytes (16 slots) for spills.
OptoReg::Name in = OptoReg::add(_in_arg_limit, -1);
for (int k = 1; (in >= init_in) && (k < RegMask::SlotsPerVecZ); k++) {
aligned_stack_mask.Remove(in);
in = OptoReg::add(in, -1);
}
aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecZ);
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ];
idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask);
}
if (UseFPUForSpilling) {
// This mask logic assumes that the spill operations are
@ -862,6 +878,10 @@ void Matcher::init_spill_mask( Node *ret ) {
MachNode *spillVectY = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
}
if (Matcher::vector_size_supported(T_FLOAT,16)) {
MachNode *spillVectZ = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTZ));
idealreg2regmask[Op_VecZ] = &spillVectZ->out_RegMask();
}
}
#ifdef ASSERT

View File

@ -42,6 +42,7 @@ const char *NodeClassNames[] = {
"VecD",
"VecX",
"VecY",
"VecZ",
"_last_machine_leaf",
#include "classes.hpp"
"_last_class_name",

View File

@ -40,6 +40,7 @@ enum Opcodes {
macro(VecD) // Machine vectord register
macro(VecX) // Machine vectorx register
macro(VecY) // Machine vectory register
macro(VecZ) // Machine vectorz register
macro(RegFlags) // Machine flags register
_last_machine_leaf, // Split between regular opcodes and machine
#include "classes.hpp"

View File

@ -103,6 +103,10 @@ class OptoReg VALUE_OBJ_CLASS_SPEC {
return r - stack0();
}
static void invalidate(Name n) {
vm2opto[n] = Bad;
}
// convert a stack slot number into an OptoReg::Name
static OptoReg::Name stack2reg( int idx) {
return Name(stack0() + idx);

View File

@ -1880,8 +1880,8 @@ void Compile::ScheduleAndBundle() {
if (!do_scheduling())
return;
// Scheduling code works only with pairs (8 bytes) maximum.
if (max_vector_size() > 8)
// Scheduling code works only with pairs (16 bytes) maximum.
if (max_vector_size() > 16)
return;
TracePhase tp("isched", &timers[_t_instrSched]);

View File

@ -114,11 +114,14 @@ const RegMask RegMask::Empty(
//=============================================================================
bool RegMask::is_vector(uint ireg) {
return (ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY);
return (ireg == Op_VecS || ireg == Op_VecD ||
ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ );
}
int RegMask::num_registers(uint ireg) {
switch(ireg) {
case Op_VecZ:
return 16;
case Op_VecY:
return 8;
case Op_VecX:
@ -233,7 +236,8 @@ int RegMask::is_bound_pair() const {
return true;
}
static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };
// only indices that are powers of 2 are accessed, so index 3 is filled in only for storage.
static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 };
//------------------------------find_first_set---------------------------------
// Find the lowest-numbered register set in the mask. Return the
// HIGHEST register number in the set, or BAD if no sets.
@ -254,7 +258,7 @@ OptoReg::Name RegMask::find_first_set(const int size) const {
// Clear out partial bits; leave only aligned adjacent bit pairs
void RegMask::clear_to_sets(const int size) {
if (size == 1) return;
assert(2 <= size && size <= 8, "update low bits table");
assert(2 <= size && size <= 16, "update low bits table");
assert(is_power_of_2(size), "sanity");
int low_bits_mask = low_bits[size>>2];
for (int i = 0; i < RM_SIZE; i++) {
@ -268,6 +272,9 @@ void RegMask::clear_to_sets(const int size) {
sets |= (sets>>2); // Smear 2 hi-bits into a set
if (size > 4) {
sets |= (sets>>4); // Smear 4 hi-bits into a set
if (size > 8) {
sets |= (sets>>8); // Smear 8 hi-bits into a set
}
}
}
_A[i] = sets;
@ -279,7 +286,7 @@ void RegMask::clear_to_sets(const int size) {
// Smear out partial bits to aligned adjacent bit sets
void RegMask::smear_to_sets(const int size) {
if (size == 1) return;
assert(2 <= size && size <= 8, "update low bits table");
assert(2 <= size && size <= 16, "update low bits table");
assert(is_power_of_2(size), "sanity");
int low_bits_mask = low_bits[size>>2];
for (int i = 0; i < RM_SIZE; i++) {
@ -294,6 +301,9 @@ void RegMask::smear_to_sets(const int size) {
sets |= (sets<<2); // Smear 2 lo-bits into a set
if (size > 4) {
sets |= (sets<<4); // Smear 4 lo-bits into a set
if (size > 8) {
sets |= (sets<<8); // Smear 8 lo-bits into a set
}
}
}
_A[i] = sets;
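
The new size > 8 rung extends the same doubling cascade one more step: each OR folds in a shifted copy of the mask, so a single seed bit grows to 2, 4, 8, and now 16 bits. A runnable check of the cascade (plain C++, not HotSpot code):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t sets = 0x00010000u;  // lone low bit of the upper 16-slot group
  sets |= (sets << 1);          // smear to 2 bits
  sets |= (sets << 2);          // smear to 4 bits
  sets |= (sets << 4);          // smear to 8 bits
  sets |= (sets << 8);          // smear to 16 bits -- the new VecZ step
  printf("0x%08x\n", sets);     // prints 0xffff0000
  return 0;
}
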
@ -304,7 +314,7 @@ void RegMask::smear_to_sets(const int size) {
//------------------------------is_aligned_set--------------------------------
bool RegMask::is_aligned_sets(const int size) const {
if (size == 1) return true;
assert(2 <= size && size <= 8, "update low bits table");
assert(2 <= size && size <= 16, "update low bits table");
assert(is_power_of_2(size), "sanity");
int low_bits_mask = low_bits[size>>2];
// Assert that the register mask contains only bit sets.
@ -330,7 +340,7 @@ bool RegMask::is_aligned_sets(const int size) const {
// Works also for size 1.
int RegMask::is_bound_set(const int size) const {
if( is_AllStack() ) return false;
assert(1 <= size && size <= 8, "update low bits table");
assert(1 <= size && size <= 16, "update low bits table");
int bit = -1; // Set to hold the one bit allowed
for (int i = 0; i < RM_SIZE; i++) {
if (_A[i] ) { // Found some bits
@ -346,10 +356,12 @@ int RegMask::is_bound_set(const int size) const {
if (((-1) & ~(bit-1)) != _A[i])
return false; // Found many bits, so fail
i++; // Skip iteration forward and check high part
// The lower 24 bits should be 0 since it is split case and size <= 8.
int set = bit>>24;
// The lower (32-size) bits should be 0 since it is split case.
int clear_bit_size = 32-size;
int shift_back_size = 32-clear_bit_size;
int set = bit>>clear_bit_size;
set = set & -set; // Remove sign extension.
set = (((set << size) - 1) >> 8);
set = (((set << size) - 1) >> shift_back_size);
if (i >= RM_SIZE || _A[i] != set)
return false; // Require expected low bits in next word
}
@ -375,7 +387,7 @@ bool RegMask::is_UP() const {
//------------------------------Size-------------------------------------------
// Compute size of register mask in bits
uint RegMask::Size() const {
extern uint8_t bitsInByte[256];
extern uint8_t bitsInByte[512];
uint sum = 0;
for( int i = 0; i < RM_SIZE; i++ )
sum +=

View File

@ -98,7 +98,8 @@ public:
SlotsPerVecS = 1,
SlotsPerVecD = 2,
SlotsPerVecX = 4,
SlotsPerVecY = 8 };
SlotsPerVecY = 8,
SlotsPerVecZ = 16 };
// A constructor only used by the ADLC output. All mask fields are filled
// in directly. Calls to this look something like RM(1,2,3,4);
@ -299,13 +300,13 @@ public:
static bool can_represent(OptoReg::Name reg) {
// NOTE: -1 in computation reflects the usage of the last
// bit of the regmask as an infinite stack flag and
// -7 is to keep mask aligned for largest value (VecY).
// -7 is to keep mask aligned for largest value (VecZ).
return (int)reg < (int)(CHUNK_SIZE-1);
}
static bool can_represent_arg(OptoReg::Name reg) {
// NOTE: -SlotsPerVecY in computation reflects the need
// to keep mask aligned for largest value (VecY).
return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY);
// NOTE: -SlotsPerVecZ in computation reflects the need
// to keep mask aligned for largest value (VecZ).
return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecZ);
}
};

View File

@ -68,16 +68,19 @@ Type::TypeInfo Type::_type_info[Type::lastype] = {
{ Bad, T_ILLEGAL, "vectord:", false, Op_RegD, relocInfo::none }, // VectorD
{ Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX
{ Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY
{ Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ
#elif defined(PPC64)
{ Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS
{ Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD
{ Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX
{ Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY
{ Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ
#else // all other
{ Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS
{ Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD
{ Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX
{ Bad, T_ILLEGAL, "vectory:", false, Op_VecY, relocInfo::none }, // VectorY
{ Bad, T_ILLEGAL, "vectorz:", false, Op_VecZ, relocInfo::none }, // VectorZ
#endif
{ Bad, T_ADDRESS, "anyptr:", false, Op_RegP, relocInfo::none }, // AnyPtr
{ Bad, T_ADDRESS, "rawptr:", false, Op_RegP, relocInfo::none }, // RawPtr
@ -503,10 +506,14 @@ void Type::Initialize_shared(Compile* current) {
if (Matcher::vector_size_supported(T_FLOAT,8)) {
TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
}
if (Matcher::vector_size_supported(T_FLOAT,16)) {
TypeVect::VECTZ = TypeVect::make(T_FLOAT,16);
}
mreg2type[Op_VecS] = TypeVect::VECTS;
mreg2type[Op_VecD] = TypeVect::VECTD;
mreg2type[Op_VecX] = TypeVect::VECTX;
mreg2type[Op_VecY] = TypeVect::VECTY;
mreg2type[Op_VecZ] = TypeVect::VECTZ;
// Restore working type arena.
current->set_type_arena(save);
@ -798,6 +805,7 @@ const Type::TYPES Type::dual_type[Type::lastype] = {
Bad, // VectorD - handled in v-call
Bad, // VectorX - handled in v-call
Bad, // VectorY - handled in v-call
Bad, // VectorZ - handled in v-call
Bad, // AnyPtr - handled in v-call
Bad, // RawPtr - handled in v-call
@ -2051,6 +2059,7 @@ const TypeVect *TypeVect::VECTS = NULL; // 32-bit vectors
const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors
const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors
const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors
const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors
//------------------------------make-------------------------------------------
const TypeVect* TypeVect::make(const Type *elem, uint length) {
@ -2070,6 +2079,8 @@ const TypeVect* TypeVect::make(const Type *elem, uint length) {
return (TypeVect*)(new TypeVectX(elem, length))->hashcons();
case Op_VecY:
return (TypeVect*)(new TypeVectY(elem, length))->hashcons();
case Op_VecZ:
return (TypeVect*)(new TypeVectZ(elem, length))->hashcons();
}
ShouldNotReachHere();
return NULL;
@ -2093,7 +2104,8 @@ const Type *TypeVect::xmeet( const Type *t ) const {
case VectorS:
case VectorD:
case VectorX:
case VectorY: { // Meeting 2 vectors?
case VectorY:
case VectorZ: { // Meeting 2 vectors?
const TypeVect* v = t->is_vect();
assert( base() == v->base(), "");
assert(length() == v->length(), "");
@ -2151,6 +2163,8 @@ void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const {
st->print("vectorx["); break;
case VectorY:
st->print("vectory["); break;
case VectorZ:
st->print("vectorz["); break;
default:
ShouldNotReachHere();
}

View File

@ -57,6 +57,7 @@ class TypeVectS;
class TypeVectD;
class TypeVectX;
class TypeVectY;
class TypeVectZ;
class TypePtr;
class TypeRawPtr;
class TypeOopPtr;
@ -90,6 +91,7 @@ public:
VectorD, // 64bit Vector types
VectorX, // 128bit Vector types
VectorY, // 256bit Vector types
VectorZ, // 512bit Vector types
AnyPtr, // Any old raw, klass, inst, or array pointer
RawPtr, // Raw (non-oop) pointers
@ -729,6 +731,7 @@ public:
static const TypeVect *VECTD;
static const TypeVect *VECTX;
static const TypeVect *VECTY;
static const TypeVect *VECTZ;
#ifndef PRODUCT
virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping
@ -755,6 +758,11 @@ class TypeVectY : public TypeVect {
TypeVectY(const Type* elem, uint length) : TypeVect(VectorY, elem, length) {}
};
class TypeVectZ : public TypeVect {
friend class TypeVect;
TypeVectZ(const Type* elem, uint length) : TypeVect(VectorZ, elem, length) {}
};
//------------------------------TypePtr----------------------------------------
// Class of machine Pointer Types: raw data, instances or arrays.
// If the _base enum is AnyPtr, then this refers to all of the above.
@ -1568,12 +1576,12 @@ inline const TypeAry *Type::is_ary() const {
}
inline const TypeVect *Type::is_vect() const {
assert( _base >= VectorS && _base <= VectorY, "Not a Vector" );
assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" );
return (TypeVect*)this;
}
inline const TypeVect *Type::isa_vect() const {
return (_base >= VectorS && _base <= VectorY) ? (TypeVect*)this : NULL;
return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL;
}
inline const TypePtr *Type::is_ptr() const {

View File

@ -77,6 +77,9 @@ int VectorNode::opcode(int sopc, BasicType bt) {
case T_INT: return Op_MulVI;
}
ShouldNotReachHere();
case Op_MulL:
assert(bt == T_LONG, "must be");
return Op_MulVL;
case Op_MulF:
assert(bt == T_FLOAT, "must be");
return Op_MulVF;
@ -267,6 +270,7 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
case Op_MulVS: return new MulVSNode(n1, n2, vt);
case Op_MulVI: return new MulVINode(n1, n2, vt);
case Op_MulVL: return new MulVLNode(n1, n2, vt);
case Op_MulVF: return new MulVFNode(n1, n2, vt);
case Op_MulVD: return new MulVDNode(n1, n2, vt);
@ -463,6 +467,10 @@ int ReductionNode::opcode(int opc, BasicType bt) {
assert(bt == T_INT, "must be");
vopc = Op_MulReductionVI;
break;
case Op_MulL:
assert(bt == T_LONG, "must be");
vopc = Op_MulReductionVL;
break;
case Op_MulF:
assert(bt == T_FLOAT, "must be");
vopc = Op_MulReductionVF;
@ -492,6 +500,7 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
}

View File

@ -90,6 +90,30 @@ class AddVINode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------AddVLNode--------------------------------------
// Vector add long
class AddVLNode : public VectorNode {
public:
AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------AddVFNode--------------------------------------
// Vector add float
class AddVFNode : public VectorNode {
public:
AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------AddVDNode--------------------------------------
// Vector add double
class AddVDNode : public VectorNode {
public:
AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------ReductionNode------------------------------------
// Perform reduction of a vector
class ReductionNode : public Node {
@ -121,22 +145,6 @@ public:
virtual uint ideal_reg() const { return Op_RegL; }
};
//------------------------------AddVLNode--------------------------------------
// Vector add long
class AddVLNode : public VectorNode {
public:
AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------AddVFNode--------------------------------------
// Vector add float
class AddVFNode : public VectorNode {
public:
AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------AddReductionVFNode--------------------------------------
// Vector add float as a reduction
class AddReductionVFNode : public ReductionNode {
@ -147,14 +155,6 @@ public:
virtual uint ideal_reg() const { return Op_RegF; }
};
//------------------------------AddVDNode--------------------------------------
// Vector add double
class AddVDNode : public VectorNode {
public:
AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------AddReductionVDNode--------------------------------------
// Vector add double as a reduction
class AddReductionVDNode : public ReductionNode {
@ -229,6 +229,30 @@ class MulVINode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------MulVLNode--------------------------------------
// Vector multiply long
class MulVLNode : public VectorNode {
public:
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------MulVFNode--------------------------------------
// Vector multiply float
class MulVFNode : public VectorNode {
public:
MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------MulVDNode--------------------------------------
// Vector multiply double
class MulVDNode : public VectorNode {
public:
MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
virtual int Opcode() const;
};
//------------------------------MulReductionVINode--------------------------------------
// Vector multiply int as a reduction
class MulReductionVINode : public ReductionNode {
@ -239,12 +263,14 @@ public:
virtual uint ideal_reg() const { return Op_RegI; }
};
//------------------------------MulVFNode--------------------------------------
// Vector multiply float
class MulVFNode : public VectorNode {
public:
MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
//------------------------------MulReductionVLNode--------------------------------------
// Vector multiply long as a reduction
class MulReductionVLNode : public ReductionNode {
public:
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
virtual const Type* bottom_type() const { return TypeLong::LONG; }
virtual uint ideal_reg() const { return Op_RegL; }
};
//------------------------------MulReductionVFNode--------------------------------------
@ -257,14 +283,6 @@ public:
virtual uint ideal_reg() const { return Op_RegF; }
};
//------------------------------MulVDNode--------------------------------------
// Vector multiply double
class MulVDNode : public VectorNode {
public:
MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
virtual int Opcode() const;
};
//------------------------------MulReductionVDNode--------------------------------------
// Vector multiply double as a reduction
class MulReductionVDNode : public ReductionNode {

View File

@ -2010,6 +2010,8 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
declare_c2_type(SubVFNode, VectorNode) \
declare_c2_type(SubVDNode, VectorNode) \
declare_c2_type(MulVSNode, VectorNode) \
declare_c2_type(MulVLNode, VectorNode) \
declare_c2_type(MulReductionVLNode, ReductionNode) \
declare_c2_type(MulVINode, VectorNode) \
declare_c2_type(MulReductionVINode, ReductionNode) \
declare_c2_type(MulVFNode, VectorNode) \

View File

@ -0,0 +1,92 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 8076276
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations: long test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
*
*/
public class SumRed_Long
{
public static void main(String[] args) throws Exception {
long[] a = new long[256*1024];
long[] b = new long[256*1024];
long[] c = new long[256*1024];
long[] d = new long[256*1024];
sumReductionInit(a,b,c);
long total = 0;
long valid = 262144000;
for(int j = 0; j < 2000; j++) {
total = sumReductionImplement(a,b,c,d,total);
}
total = (int)total;
if(total == valid) {
System.out.println("Success");
} else {
System.out.println("Invalid sum of elements variable in total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception("Failed");
}
}
public static void sumReductionInit(
long[] a,
long[] b,
long[] c)
{
for(int j = 0; j < 1; j++)
{
for(int i = 0; i < a.length; i++)
{
a[i] = i * 1 + j;
b[i] = i * 1 - j;
c[i] = i + j;
}
}
}
public static long sumReductionImplement(
long[] a,
long[] b,
long[] c,
long[] d,
long total)
{
for(int i = 0; i < a.length; i++)
{
d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
total += d[i];
}
return total;
}
}