mirror of
https://github.com/openjdk/jdk.git
synced 2026-05-11 14:11:36 +00:00
8076276: Add support for AVX512
Reviewed-by: kvn, roland
This commit is contained in:
parent
58a1361125
commit
4fca8dbb1f
File diff suppressed because it is too large
Load Diff
@ -438,7 +438,7 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC {
|
||||
|
||||
};
|
||||
|
||||
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
|
||||
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
|
||||
|
||||
// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
|
||||
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
|
||||
@ -503,7 +503,8 @@ class Assembler : public AbstractAssembler {
|
||||
REX_WRXB = 0x4F,
|
||||
|
||||
VEX_3bytes = 0xC4,
|
||||
VEX_2bytes = 0xC5
|
||||
VEX_2bytes = 0xC5,
|
||||
EVEX_4bytes = 0x62
|
||||
};
|
||||
|
||||
enum VexPrefix {
|
||||
@ -513,6 +514,14 @@ class Assembler : public AbstractAssembler {
|
||||
VEX_W = 0x80
|
||||
};
|
||||
|
||||
enum ExexPrefix {
|
||||
EVEX_F = 0x04,
|
||||
EVEX_V = 0x08,
|
||||
EVEX_Rb = 0x10,
|
||||
EVEX_X = 0x40,
|
||||
EVEX_Z = 0x80
|
||||
};
|
||||
|
||||
enum VexSimdPrefix {
|
||||
VEX_SIMD_NONE = 0x0,
|
||||
VEX_SIMD_66 = 0x1,
|
||||
@ -527,6 +536,37 @@ class Assembler : public AbstractAssembler {
|
||||
VEX_OPCODE_0F_3A = 0x3
|
||||
};
|
||||
|
||||
enum AvxVectorLen {
|
||||
AVX_128bit = 0x0,
|
||||
AVX_256bit = 0x1,
|
||||
AVX_512bit = 0x2,
|
||||
AVX_NoVec = 0x4
|
||||
};
|
||||
|
||||
enum EvexTupleType {
|
||||
EVEX_FV = 0,
|
||||
EVEX_HV = 4,
|
||||
EVEX_FVM = 6,
|
||||
EVEX_T1S = 7,
|
||||
EVEX_T1F = 11,
|
||||
EVEX_T2 = 13,
|
||||
EVEX_T4 = 15,
|
||||
EVEX_T8 = 17,
|
||||
EVEX_HVM = 18,
|
||||
EVEX_QVM = 19,
|
||||
EVEX_OVM = 20,
|
||||
EVEX_M128 = 21,
|
||||
EVEX_DUP = 22,
|
||||
EVEX_ETUP = 23
|
||||
};
|
||||
|
||||
enum EvexInputSizeInBits {
|
||||
EVEX_8bit = 0,
|
||||
EVEX_16bit = 1,
|
||||
EVEX_32bit = 2,
|
||||
EVEX_64bit = 3
|
||||
};
|
||||
|
||||
enum WhichOperand {
|
||||
// input to locate_operand, and format code for relocations
|
||||
imm_operand = 0, // embedded 32-bit|64-bit immediate operand
|
||||
@ -554,6 +594,11 @@ class Assembler : public AbstractAssembler {
|
||||
|
||||
private:
|
||||
|
||||
int evex_encoding;
|
||||
int input_size_in_bits;
|
||||
int avx_vector_len;
|
||||
int tuple_type;
|
||||
bool is_evex_instruction;
|
||||
|
||||
// 64bit prefixes
|
||||
int prefix_and_encode(int reg_enc, bool byteinst = false);
|
||||
@ -580,108 +625,143 @@ private:
|
||||
|
||||
void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
|
||||
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
|
||||
bool vector256);
|
||||
int vector_len);
|
||||
|
||||
void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
|
||||
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
|
||||
bool is_extended_context, bool is_merge_context,
|
||||
int vector_len, bool no_mask_reg );
|
||||
|
||||
void vex_prefix(Address adr, int nds_enc, int xreg_enc,
|
||||
VexSimdPrefix pre, VexOpcode opc,
|
||||
bool vex_w, bool vector256);
|
||||
bool vex_w, int vector_len,
|
||||
bool legacy_mode = false, bool no_mask_reg = false);
|
||||
|
||||
void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
|
||||
VexSimdPrefix pre, bool vector256 = false) {
|
||||
VexSimdPrefix pre, int vector_len = AVX_128bit,
|
||||
bool no_mask_reg = false, bool legacy_mode = false) {
|
||||
int dst_enc = dst->encoding();
|
||||
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
|
||||
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
|
||||
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
|
||||
}
|
||||
|
||||
void vex_prefix_0F38(Register dst, Register nds, Address src) {
|
||||
void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
|
||||
VexSimdPrefix pre, int vector_len = AVX_128bit,
|
||||
bool no_mask_reg = false) {
|
||||
int dst_enc = dst->encoding();
|
||||
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
|
||||
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
|
||||
}
|
||||
|
||||
void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
|
||||
bool vex_w = false;
|
||||
bool vector256 = false;
|
||||
int vector_len = AVX_128bit;
|
||||
vex_prefix(src, nds->encoding(), dst->encoding(),
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
|
||||
vector_len, no_mask_reg);
|
||||
}
|
||||
|
||||
void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
|
||||
void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
|
||||
bool vex_w = true;
|
||||
bool vector256 = false;
|
||||
int vector_len = AVX_128bit;
|
||||
vex_prefix(src, nds->encoding(), dst->encoding(),
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
|
||||
vector_len, no_mask_reg);
|
||||
}
|
||||
int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
|
||||
VexSimdPrefix pre, VexOpcode opc,
|
||||
bool vex_w, bool vector256);
|
||||
bool vex_w, int vector_len,
|
||||
bool legacy_mode, bool no_mask_reg);
|
||||
|
||||
int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
|
||||
int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
|
||||
bool vex_w = false;
|
||||
bool vector256 = false;
|
||||
int vector_len = AVX_128bit;
|
||||
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
|
||||
false, no_mask_reg);
|
||||
}
|
||||
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
|
||||
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
|
||||
bool vex_w = true;
|
||||
bool vector256 = false;
|
||||
int vector_len = AVX_128bit;
|
||||
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
|
||||
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
|
||||
false, no_mask_reg);
|
||||
}
|
||||
int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
|
||||
VexSimdPrefix pre, bool vector256 = false,
|
||||
VexOpcode opc = VEX_OPCODE_0F) {
|
||||
VexSimdPrefix pre, int vector_len = AVX_128bit,
|
||||
VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
|
||||
bool no_mask_reg = false) {
|
||||
int src_enc = src->encoding();
|
||||
int dst_enc = dst->encoding();
|
||||
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
|
||||
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
|
||||
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
|
||||
}
|
||||
|
||||
void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, bool vector256 = false);
|
||||
VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);
|
||||
|
||||
void simd_prefix(XMMRegister dst, Address src,
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
|
||||
simd_prefix(dst, xnoreg, src, pre, opc);
|
||||
void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
|
||||
bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
|
||||
simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
|
||||
}
|
||||
|
||||
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
|
||||
simd_prefix(src, dst, pre);
|
||||
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
|
||||
simd_prefix(src, dst, pre, no_mask_reg);
|
||||
}
|
||||
void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
|
||||
VexSimdPrefix pre) {
|
||||
VexSimdPrefix pre, bool no_mask_reg = false) {
|
||||
bool rex_w = true;
|
||||
simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
|
||||
simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
|
||||
}
|
||||
|
||||
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, bool vector256 = false);
|
||||
VexSimdPrefix pre, bool no_mask_reg,
|
||||
VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, int vector_len = AVX_128bit,
|
||||
bool legacy_mode = false);
|
||||
|
||||
int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
|
||||
VexSimdPrefix pre, bool no_mask_reg,
|
||||
VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, int vector_len = AVX_128bit);
|
||||
|
||||
int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
|
||||
VexSimdPrefix pre, bool no_mask_reg,
|
||||
VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool rex_w = false, int vector_len = AVX_128bit);
|
||||
|
||||
// Move/convert 32-bit integer value.
|
||||
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
|
||||
VexSimdPrefix pre) {
|
||||
VexSimdPrefix pre, bool no_mask_reg) {
|
||||
// It is OK to cast from Register to XMMRegister to pass argument here
|
||||
// since only encoding is used in simd_prefix_and_encode() and number of
|
||||
// Gen and Xmm registers are the same.
|
||||
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
|
||||
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
|
||||
}
|
||||
int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
|
||||
return simd_prefix_and_encode(dst, xnoreg, src, pre);
|
||||
int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
|
||||
return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
|
||||
}
|
||||
int simd_prefix_and_encode(Register dst, XMMRegister src,
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
|
||||
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool no_mask_reg = false) {
|
||||
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
|
||||
}
|
||||
|
||||
// Move/convert 64-bit integer value.
|
||||
int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
|
||||
VexSimdPrefix pre) {
|
||||
VexSimdPrefix pre, bool no_mask_reg = false) {
|
||||
bool rex_w = true;
|
||||
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
|
||||
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
|
||||
}
|
||||
int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
|
||||
return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
|
||||
int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
|
||||
return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
|
||||
}
|
||||
int simd_prefix_and_encode_q(Register dst, XMMRegister src,
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
|
||||
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
|
||||
bool no_mask_reg = false) {
|
||||
bool rex_w = true;
|
||||
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
|
||||
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
|
||||
}
|
||||
|
||||
// Helper functions for groups of instructions
|
||||
@ -692,14 +772,28 @@ private:
|
||||
void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
|
||||
void emit_arith(int op1, int op2, Register dst, Register src);
|
||||
|
||||
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
|
||||
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
|
||||
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
|
||||
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
|
||||
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
|
||||
void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
|
||||
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
|
||||
void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
|
||||
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
|
||||
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
|
||||
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
|
||||
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
|
||||
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
|
||||
Address src, VexSimdPrefix pre, bool vector256);
|
||||
Address src, VexSimdPrefix pre, int vector_len,
|
||||
bool no_mask_reg = false, bool legacy_mode = false);
|
||||
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
|
||||
Address src, VexSimdPrefix pre, int vector_len,
|
||||
bool no_mask_reg = false);
|
||||
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
|
||||
XMMRegister src, VexSimdPrefix pre, bool vector256);
|
||||
XMMRegister src, VexSimdPrefix pre, int vector_len,
|
||||
bool no_mask_reg = false, bool legacy_mode = false);
|
||||
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
|
||||
XMMRegister src, VexSimdPrefix pre, int vector_len,
|
||||
bool no_mask_reg = false);
|
||||
|
||||
bool emit_compressed_disp_byte(int &disp);
|
||||
|
||||
void emit_operand(Register reg,
|
||||
Register base, Register index, Address::ScaleFactor scale,
|
||||
@ -825,7 +919,9 @@ private:
|
||||
public:
|
||||
|
||||
// Creation
|
||||
Assembler(CodeBuffer* code) : AbstractAssembler(code) {}
|
||||
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
|
||||
init_attributes();
|
||||
}
|
||||
|
||||
// Decoding
|
||||
static address locate_operand(address inst, WhichOperand which);
|
||||
@ -833,11 +929,21 @@ private:
|
||||
|
||||
// Utilities
|
||||
static bool is_polling_page_far() NOT_LP64({ return false;});
|
||||
static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
|
||||
int cur_tuple_type, int in_size_in_bits, int cur_encoding);
|
||||
|
||||
// Generic instructions
|
||||
// Does 32bit or 64bit as needed for the platform. In some sense these
|
||||
// belong in macro assembler but there is no need for both varieties to exist
|
||||
|
||||
void init_attributes(void) {
|
||||
evex_encoding = 0;
|
||||
input_size_in_bits = 0;
|
||||
avx_vector_len = AVX_NoVec;
|
||||
tuple_type = EVEX_ETUP;
|
||||
is_evex_instruction = false;
|
||||
}
|
||||
|
||||
void lea(Register dst, Address src);
|
||||
|
||||
void mov(Register dst, Register src);
|
||||
@ -1338,6 +1444,12 @@ private:
|
||||
void movb(Address dst, int imm8);
|
||||
void movb(Register dst, Address src);
|
||||
|
||||
void kmovq(KRegister dst, KRegister src);
|
||||
void kmovql(KRegister dst, Register src);
|
||||
void kmovdl(KRegister dst, Register src);
|
||||
void kmovq(Address dst, KRegister src);
|
||||
void kmovq(KRegister dst, Address src);
|
||||
|
||||
void movdl(XMMRegister dst, Register src);
|
||||
void movdl(Register dst, XMMRegister src);
|
||||
void movdl(XMMRegister dst, Address src);
|
||||
@ -1361,6 +1473,11 @@ private:
|
||||
void vmovdqu(XMMRegister dst, Address src);
|
||||
void vmovdqu(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// Move Unaligned 512bit Vector
|
||||
void evmovdqu(Address dst, XMMRegister src, int vector_len);
|
||||
void evmovdqu(XMMRegister dst, Address src, int vector_len);
|
||||
void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
|
||||
// Move lower 64bit to high 64bit in 128bit register
|
||||
void movlhps(XMMRegister dst, XMMRegister src);
|
||||
|
||||
@ -1486,10 +1603,10 @@ private:
|
||||
// Pack with unsigned saturation
|
||||
void packuswb(XMMRegister dst, XMMRegister src);
|
||||
void packuswb(XMMRegister dst, Address src);
|
||||
void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
// Pemutation of 64bit words
|
||||
void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
|
||||
void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
|
||||
|
||||
void pause();
|
||||
|
||||
@ -1734,54 +1851,54 @@ private:
|
||||
// Add Packed Floating-Point Values
|
||||
void addpd(XMMRegister dst, XMMRegister src);
|
||||
void addps(XMMRegister dst, XMMRegister src);
|
||||
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Subtract Packed Floating-Point Values
|
||||
void subpd(XMMRegister dst, XMMRegister src);
|
||||
void subps(XMMRegister dst, XMMRegister src);
|
||||
void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Multiply Packed Floating-Point Values
|
||||
void mulpd(XMMRegister dst, XMMRegister src);
|
||||
void mulps(XMMRegister dst, XMMRegister src);
|
||||
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Divide Packed Floating-Point Values
|
||||
void divpd(XMMRegister dst, XMMRegister src);
|
||||
void divps(XMMRegister dst, XMMRegister src);
|
||||
void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Bitwise Logical AND of Packed Floating-Point Values
|
||||
void andpd(XMMRegister dst, XMMRegister src);
|
||||
void andps(XMMRegister dst, XMMRegister src);
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Bitwise Logical XOR of Packed Floating-Point Values
|
||||
void xorpd(XMMRegister dst, XMMRegister src);
|
||||
void xorps(XMMRegister dst, XMMRegister src);
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Add horizontal packed integers
|
||||
void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void phaddw(XMMRegister dst, XMMRegister src);
|
||||
void phaddd(XMMRegister dst, XMMRegister src);
|
||||
|
||||
@ -1790,36 +1907,38 @@ private:
|
||||
void paddw(XMMRegister dst, XMMRegister src);
|
||||
void paddd(XMMRegister dst, XMMRegister src);
|
||||
void paddq(XMMRegister dst, XMMRegister src);
|
||||
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Sub packed integers
|
||||
void psubb(XMMRegister dst, XMMRegister src);
|
||||
void psubw(XMMRegister dst, XMMRegister src);
|
||||
void psubd(XMMRegister dst, XMMRegister src);
|
||||
void psubq(XMMRegister dst, XMMRegister src);
|
||||
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Multiply packed integers (only shorts and ints)
|
||||
void pmullw(XMMRegister dst, XMMRegister src);
|
||||
void pmulld(XMMRegister dst, XMMRegister src);
|
||||
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Shift left packed integers
|
||||
void psllw(XMMRegister dst, int shift);
|
||||
@ -1828,12 +1947,12 @@ private:
|
||||
void psllw(XMMRegister dst, XMMRegister shift);
|
||||
void pslld(XMMRegister dst, XMMRegister shift);
|
||||
void psllq(XMMRegister dst, XMMRegister shift);
|
||||
void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
|
||||
// Logical shift right packed integers
|
||||
void psrlw(XMMRegister dst, int shift);
|
||||
@ -1842,42 +1961,43 @@ private:
|
||||
void psrlw(XMMRegister dst, XMMRegister shift);
|
||||
void psrld(XMMRegister dst, XMMRegister shift);
|
||||
void psrlq(XMMRegister dst, XMMRegister shift);
|
||||
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
|
||||
// Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
|
||||
void psraw(XMMRegister dst, int shift);
|
||||
void psrad(XMMRegister dst, int shift);
|
||||
void psraw(XMMRegister dst, XMMRegister shift);
|
||||
void psrad(XMMRegister dst, XMMRegister shift);
|
||||
void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
|
||||
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
|
||||
void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
|
||||
// And packed integers
|
||||
void pand(XMMRegister dst, XMMRegister src);
|
||||
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Or packed integers
|
||||
void por(XMMRegister dst, XMMRegister src);
|
||||
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Xor packed integers
|
||||
void pxor(XMMRegister dst, XMMRegister src);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
|
||||
|
||||
// Copy low 128bit into high 128bit of YMM registers.
|
||||
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vextractf128h(XMMRegister dst, XMMRegister src);
|
||||
void vextracti128h(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// Load/store high 128bit of YMM registers which does not destroy other half.
|
||||
void vinsertf128h(XMMRegister dst, Address src);
|
||||
@ -1885,9 +2005,25 @@ private:
|
||||
void vextractf128h(Address dst, XMMRegister src);
|
||||
void vextracti128h(Address dst, XMMRegister src);
|
||||
|
||||
// Copy low 256bit into high 256bit of ZMM registers.
|
||||
void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vextracti64x4h(XMMRegister dst, XMMRegister src);
|
||||
void vextractf64x4h(XMMRegister dst, XMMRegister src);
|
||||
void vextractf64x4h(Address dst, XMMRegister src);
|
||||
void vinsertf64x4h(XMMRegister dst, Address src);
|
||||
|
||||
// Copy targeted 128bit segments of the ZMM registers
|
||||
void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
|
||||
void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
|
||||
void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
|
||||
|
||||
// duplicate 4-bytes integer data from src into 8 locations in dest
|
||||
void vpbroadcastd(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// duplicate 4-bytes integer data from src into vector_len locations in dest
|
||||
void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
|
||||
// Carry-Less Multiplication Quadword
|
||||
void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
|
||||
void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
|
||||
|
||||
@ -233,13 +233,30 @@ void FrameMap::initialize() {
|
||||
_xmm_regs[13] = xmm13;
|
||||
_xmm_regs[14] = xmm14;
|
||||
_xmm_regs[15] = xmm15;
|
||||
_xmm_regs[16] = xmm16;
|
||||
_xmm_regs[17] = xmm17;
|
||||
_xmm_regs[18] = xmm18;
|
||||
_xmm_regs[19] = xmm19;
|
||||
_xmm_regs[20] = xmm20;
|
||||
_xmm_regs[21] = xmm21;
|
||||
_xmm_regs[22] = xmm22;
|
||||
_xmm_regs[23] = xmm23;
|
||||
_xmm_regs[24] = xmm24;
|
||||
_xmm_regs[25] = xmm25;
|
||||
_xmm_regs[26] = xmm26;
|
||||
_xmm_regs[27] = xmm27;
|
||||
_xmm_regs[28] = xmm28;
|
||||
_xmm_regs[29] = xmm29;
|
||||
_xmm_regs[30] = xmm30;
|
||||
_xmm_regs[31] = xmm31;
|
||||
#endif // _LP64
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
_caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nof_caller_save_xmm_regs ; i++) {
|
||||
int num_caller_save_xmm_regs = get_num_caller_save_xmms();
|
||||
for (int i = 0; i < num_caller_save_xmm_regs; i++) {
|
||||
_caller_save_xmm_regs[i] = LIR_OprFact::single_xmm(i);
|
||||
}
|
||||
|
||||
|
||||
@ -152,6 +152,16 @@
|
||||
return range;
|
||||
}
|
||||
|
||||
static int get_num_caller_save_xmms(void) {
|
||||
int num_caller_save_xmm_regs = nof_caller_save_xmm_regs;
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
num_caller_save_xmm_regs = num_caller_save_xmm_regs / 2;
|
||||
}
|
||||
#endif
|
||||
return num_caller_save_xmm_regs;
|
||||
}
|
||||
|
||||
static int nof_caller_save_cpu_regs() { return adjust_reg_range(pd_nof_caller_save_cpu_regs_frame_map); }
|
||||
static int last_cpu_reg() { return adjust_reg_range(pd_last_cpu_reg); }
|
||||
static int last_byte_reg() { return adjust_reg_range(pd_last_byte_reg); }
|
||||
|
||||
@ -85,8 +85,9 @@ inline void LinearScan::pd_add_temps(LIR_Op* op) {
|
||||
tty->print_cr("killing XMMs for trig");
|
||||
}
|
||||
#endif
|
||||
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
|
||||
int op_id = op->id();
|
||||
for (int xmm = 0; xmm < FrameMap::nof_caller_save_xmm_regs; xmm++) {
|
||||
for (int xmm = 0; xmm < num_caller_save_xmm_regs; xmm++) {
|
||||
LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(xmm);
|
||||
add_temp(reg_num(opr), op_id, noUse, T_ILLEGAL);
|
||||
}
|
||||
@ -100,6 +101,10 @@ inline void LinearScan::pd_add_temps(LIR_Op* op) {
|
||||
// Implementation of LinearScanWalker
|
||||
|
||||
inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
|
||||
int last_xmm_reg = pd_last_xmm_reg;
|
||||
if (UseAVX < 3) {
|
||||
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
|
||||
}
|
||||
if (allocator()->gen()->is_vreg_flag_set(cur->reg_num(), LIRGenerator::byte_reg)) {
|
||||
assert(cur->type() != T_FLOAT && cur->type() != T_DOUBLE, "cpu regs only");
|
||||
_first_reg = pd_first_byte_reg;
|
||||
@ -107,7 +112,7 @@ inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
|
||||
return true;
|
||||
} else if ((UseSSE >= 1 && cur->type() == T_FLOAT) || (UseSSE >= 2 && cur->type() == T_DOUBLE)) {
|
||||
_first_reg = pd_first_xmm_reg;
|
||||
_last_reg = pd_last_xmm_reg;
|
||||
_last_reg = last_xmm_reg;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -323,7 +323,7 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
|
||||
LP64_ONLY(num_rt_args = 0);
|
||||
LP64_ONLY(assert((reg_save_frame_size * VMRegImpl::stack_slot_size) % 16 == 0, "must be 16 byte aligned");)
|
||||
int frame_size_in_slots = reg_save_frame_size + num_rt_args; // args + thread
|
||||
sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word );
|
||||
sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word);
|
||||
|
||||
// record saved value locations in an OopMap
|
||||
// locations are offsets from sp after runtime call; num_rt_args is number of arguments in call, including thread
|
||||
@ -362,6 +362,13 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(r15H_off + num_rt_args), r15->as_VMReg()->next());
|
||||
#endif // _LP64
|
||||
|
||||
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
xmm_bypass_limit = xmm_bypass_limit / 2;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (save_fpu_registers) {
|
||||
if (UseSSE < 2) {
|
||||
int fpu_off = float_regs_as_doubles_off;
|
||||
@ -380,11 +387,13 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
|
||||
if (UseSSE >= 2) {
|
||||
int xmm_off = xmm_regs_as_doubles_off;
|
||||
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
// %%% This is really a waste but we'll keep things as they were for now
|
||||
if (true) {
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
|
||||
if (n < xmm_bypass_limit) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
// %%% This is really a waste but we'll keep things as they were for now
|
||||
if (true) {
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
|
||||
}
|
||||
}
|
||||
xmm_off += 2;
|
||||
}
|
||||
@ -393,8 +402,10 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
|
||||
} else if (UseSSE == 1) {
|
||||
int xmm_off = xmm_regs_as_doubles_off;
|
||||
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
if (n < xmm_bypass_limit) {
|
||||
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
|
||||
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
|
||||
}
|
||||
xmm_off += 2;
|
||||
}
|
||||
assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
|
||||
@ -474,6 +485,24 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
|
||||
if (UseAVX > 2) {
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
|
||||
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
|
||||
}
|
||||
#endif // _LP64
|
||||
} else if (UseSSE == 1) {
|
||||
// save XMM registers as float because double not supported without SSE2
|
||||
@ -516,6 +545,24 @@ static void restore_fpu(StubAssembler* sasm, bool restore_fpu_registers = true)
|
||||
__ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
|
||||
__ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
|
||||
__ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
|
||||
if (UseAVX > 2) {
|
||||
__ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
|
||||
__ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
|
||||
__ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
|
||||
__ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
|
||||
__ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
|
||||
__ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
|
||||
__ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
|
||||
__ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
|
||||
__ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
|
||||
__ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
|
||||
__ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
|
||||
__ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
|
||||
__ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
|
||||
__ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
|
||||
__ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
|
||||
__ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
|
||||
}
|
||||
#endif // _LP64
|
||||
} else if (UseSSE == 1) {
|
||||
// restore XMM registers
|
||||
|
||||
@ -25,6 +25,7 @@
|
||||
#include "precompiled.hpp"
|
||||
#include "opto/compile.hpp"
|
||||
#include "opto/node.hpp"
|
||||
#include "opto/optoreg.hpp"
|
||||
|
||||
// processor dependent initialization for i486
|
||||
|
||||
@ -37,4 +38,24 @@ void Compile::pd_compiler2_init() {
|
||||
ConditionalMoveLimit = 0;
|
||||
}
|
||||
#endif // AMD64
|
||||
|
||||
if (UseAVX < 3) {
|
||||
int delta = XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
|
||||
int bottom = ConcreteRegisterImpl::max_fpr;
|
||||
int top = bottom + delta;
|
||||
int middle = bottom + (delta / 2);
|
||||
int xmm_slots = XMMRegisterImpl::max_slots_per_register;
|
||||
int lower = xmm_slots / 2;
|
||||
// mark bad every register that we cannot get to if AVX less than 3, we have all slots in the array
|
||||
// Note: vm2opto is allocated to ConcreteRegisterImpl::number_of_registers
|
||||
for (int i = bottom; i < middle; i += xmm_slots) {
|
||||
for (OptoReg::Name j = OptoReg::Name(i + lower); j<OptoReg::Name(i + xmm_slots); j = OptoReg::add(j, 1)) {
|
||||
OptoReg::invalidate(j);
|
||||
}
|
||||
}
|
||||
// mark the upper zmm bank bad and all the mask registers bad in this case
|
||||
for (OptoReg::Name i = OptoReg::Name(middle); i<OptoReg::Name(_last_Mach_Reg - 1); i = OptoReg::add(i, 1)) {
|
||||
OptoReg::invalidate(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -125,7 +125,7 @@
|
||||
// Entry frames
|
||||
#ifdef AMD64
|
||||
#ifdef _WIN64
|
||||
entry_frame_after_call_words = 28,
|
||||
entry_frame_after_call_words = 60,
|
||||
entry_frame_call_wrapper_offset = 2,
|
||||
|
||||
arg_reg_save_area_bytes = 32 // Register argument save area
|
||||
|
||||
@ -3996,21 +3996,21 @@ void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
|
||||
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
|
||||
if (reachable(src)) {
|
||||
vandpd(dst, nds, as_Address(src), vector256);
|
||||
vandpd(dst, nds, as_Address(src), vector_len);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vandpd(dst, nds, Address(rscratch1, 0), vector256);
|
||||
vandpd(dst, nds, Address(rscratch1, 0), vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
|
||||
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
|
||||
if (reachable(src)) {
|
||||
vandps(dst, nds, as_Address(src), vector256);
|
||||
vandps(dst, nds, as_Address(src), vector_len);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vandps(dst, nds, Address(rscratch1, 0), vector256);
|
||||
vandps(dst, nds, Address(rscratch1, 0), vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4068,21 +4068,21 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
|
||||
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
|
||||
if (reachable(src)) {
|
||||
vxorpd(dst, nds, as_Address(src), vector256);
|
||||
vxorpd(dst, nds, as_Address(src), vector_len);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorpd(dst, nds, Address(rscratch1, 0), vector256);
|
||||
vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
|
||||
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
|
||||
if (reachable(src)) {
|
||||
vxorps(dst, nds, as_Address(src), vector256);
|
||||
vxorps(dst, nds, as_Address(src), vector_len);
|
||||
} else {
|
||||
lea(rscratch1, src);
|
||||
vxorps(dst, nds, Address(rscratch1, 0), vector256);
|
||||
vxorps(dst, nds, Address(rscratch1, 0), vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4561,6 +4561,14 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
|
||||
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
|
||||
} else if (UseSSE >= 2) {
|
||||
if (UseAVX > 2) {
|
||||
movl(rbx, 0xffff);
|
||||
#ifdef _LP64
|
||||
kmovql(k1, rbx);
|
||||
#else
|
||||
kmovdl(k1, rbx);
|
||||
#endif
|
||||
}
|
||||
#ifdef COMPILER2
|
||||
if (MaxVectorSize > 16) {
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
@ -7063,8 +7071,39 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
{
|
||||
assert( UseSSE >= 2, "supported cpu only" );
|
||||
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
|
||||
if (UseAVX > 2) {
|
||||
movl(rtmp, 0xffff);
|
||||
#ifdef _LP64
|
||||
kmovql(k1, rtmp);
|
||||
#else
|
||||
kmovdl(k1, rtmp);
|
||||
#endif
|
||||
}
|
||||
movdl(xtmp, value);
|
||||
if (UseAVX >= 2 && UseUnalignedLoadStores) {
|
||||
if (UseAVX > 2 && UseUnalignedLoadStores) {
|
||||
// Fill 64-byte chunks
|
||||
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
|
||||
evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
|
||||
|
||||
subl(count, 16 << shift);
|
||||
jcc(Assembler::less, L_check_fill_32_bytes);
|
||||
align(16);
|
||||
|
||||
BIND(L_fill_64_bytes_loop);
|
||||
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
|
||||
addptr(to, 64);
|
||||
subl(count, 16 << shift);
|
||||
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
|
||||
|
||||
BIND(L_check_fill_32_bytes);
|
||||
addl(count, 8 << shift);
|
||||
jccb(Assembler::less, L_check_fill_8_bytes);
|
||||
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
|
||||
addptr(to, 32);
|
||||
subl(count, 8 << shift);
|
||||
|
||||
BIND(L_check_fill_8_bytes);
|
||||
} else if (UseAVX == 2 && UseUnalignedLoadStores) {
|
||||
// Fill 64-byte chunks
|
||||
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
|
||||
vpbroadcastd(xtmp, xtmp);
|
||||
@ -7200,11 +7239,11 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
||||
bind(L_copy_32_chars);
|
||||
vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
|
||||
vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
|
||||
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
|
||||
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
|
||||
vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
|
||||
jccb(Assembler::notZero, L_copy_32_chars_exit);
|
||||
vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
|
||||
vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
|
||||
vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
|
||||
vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
|
||||
vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
|
||||
|
||||
bind(L_chars_32_check);
|
||||
@ -7227,13 +7266,13 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
||||
vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
|
||||
vptest(tmp2Reg, tmp1Reg);
|
||||
jccb(Assembler::notZero, L_copy_16_chars_exit);
|
||||
vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
|
||||
vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
|
||||
vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
|
||||
vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
|
||||
} else {
|
||||
if (UseAVX > 0) {
|
||||
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
|
||||
movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
|
||||
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
|
||||
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
|
||||
} else {
|
||||
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
|
||||
por(tmp2Reg, tmp3Reg);
|
||||
@ -7776,7 +7815,7 @@ void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegi
|
||||
if (UseAVX > 0) {
|
||||
vpclmulhdq(xtmp, xK, xcrc); // [123:64]
|
||||
vpclmulldq(xcrc, xK, xcrc); // [63:0]
|
||||
vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
|
||||
vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
|
||||
pxor(xcrc, xtmp);
|
||||
} else {
|
||||
movdqa(xtmp, xcrc);
|
||||
@ -7920,7 +7959,7 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
|
||||
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
|
||||
if (UseAVX > 0) {
|
||||
vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
|
||||
vpand(xmm3, xmm0, xmm2, false /* vector256 */);
|
||||
vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
|
||||
vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
|
||||
} else {
|
||||
movdqa(xmm2, xmm0);
|
||||
|
||||
@ -1024,13 +1024,13 @@ public:
|
||||
void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); }
|
||||
void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
|
||||
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
|
||||
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
|
||||
|
||||
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
|
||||
void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
|
||||
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
|
||||
void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
|
||||
void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
|
||||
void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
|
||||
|
||||
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
|
||||
void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); }
|
||||
@ -1058,25 +1058,25 @@ public:
|
||||
|
||||
// AVX Vector instructions
|
||||
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
|
||||
void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
|
||||
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
|
||||
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
|
||||
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
|
||||
Assembler::vpxor(dst, nds, src, vector256);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||
if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
|
||||
Assembler::vpxor(dst, nds, src, vector_len);
|
||||
else
|
||||
Assembler::vxorpd(dst, nds, src, vector256);
|
||||
Assembler::vxorpd(dst, nds, src, vector_len);
|
||||
}
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
|
||||
if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
|
||||
Assembler::vpxor(dst, nds, src, vector256);
|
||||
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
|
||||
if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
|
||||
Assembler::vpxor(dst, nds, src, vector_len);
|
||||
else
|
||||
Assembler::vxorpd(dst, nds, src, vector256);
|
||||
Assembler::vxorpd(dst, nds, src, vector_len);
|
||||
}
|
||||
|
||||
// Simple version for AVX2 256bit vectors
|
||||
|
||||
@ -68,6 +68,22 @@ REGISTER_DEFINITION(XMMRegister, xmm12);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm13);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm14);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm15);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm16);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm17);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm18);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm19);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm20);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm21);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm22);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm23);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm24);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm25);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm26);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm27);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm28);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm29);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm30);
|
||||
REGISTER_DEFINITION(XMMRegister, xmm31);
|
||||
|
||||
REGISTER_DEFINITION(Register, c_rarg0);
|
||||
REGISTER_DEFINITION(Register, c_rarg1);
|
||||
@ -123,5 +139,15 @@ REGISTER_DEFINITION(MMXRegister, mmx5 );
|
||||
REGISTER_DEFINITION(MMXRegister, mmx6 );
|
||||
REGISTER_DEFINITION(MMXRegister, mmx7 );
|
||||
|
||||
REGISTER_DEFINITION(KRegister, knoreg);
|
||||
REGISTER_DEFINITION(KRegister, k0);
|
||||
REGISTER_DEFINITION(KRegister, k1);
|
||||
REGISTER_DEFINITION(KRegister, k2);
|
||||
REGISTER_DEFINITION(KRegister, k3);
|
||||
REGISTER_DEFINITION(KRegister, k4);
|
||||
REGISTER_DEFINITION(KRegister, k5);
|
||||
REGISTER_DEFINITION(KRegister, k6);
|
||||
REGISTER_DEFINITION(KRegister, k7);
|
||||
|
||||
// JSR 292
|
||||
REGISTER_DEFINITION(Register, rbp_mh_SP_save);
|
||||
|
||||
@ -31,11 +31,13 @@ const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers;
|
||||
const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1;
|
||||
#endif // AMD64
|
||||
|
||||
|
||||
const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
|
||||
2 * FloatRegisterImpl::number_of_registers;
|
||||
2 * FloatRegisterImpl::number_of_registers;
|
||||
const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
|
||||
8 * XMMRegisterImpl::number_of_registers;
|
||||
XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
|
||||
const int ConcreteRegisterImpl::max_kpr = ConcreteRegisterImpl::max_xmm +
|
||||
KRegisterImpl::max_slots_per_register * KRegisterImpl::number_of_registers;
|
||||
|
||||
const char* RegisterImpl::name() const {
|
||||
const char* names[number_of_registers] = {
|
||||
#ifndef AMD64
|
||||
@ -59,8 +61,17 @@ const char* XMMRegisterImpl::name() const {
|
||||
const char* names[number_of_registers] = {
|
||||
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
|
||||
#ifdef AMD64
|
||||
,"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
|
||||
,"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
|
||||
,"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
|
||||
,"xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31"
|
||||
#endif // AMD64
|
||||
};
|
||||
return is_valid() ? names[encoding()] : "xnoreg";
|
||||
}
|
||||
|
||||
const char* KRegisterImpl::name() const {
|
||||
const char* names[number_of_registers] = {
|
||||
"k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7"
|
||||
};
|
||||
return is_valid() ? names[encoding()] : "knoreg";
|
||||
}
|
||||
|
||||
@ -45,10 +45,12 @@ class RegisterImpl: public AbstractRegisterImpl {
|
||||
enum {
|
||||
#ifndef AMD64
|
||||
number_of_registers = 8,
|
||||
number_of_byte_registers = 4
|
||||
number_of_byte_registers = 4,
|
||||
max_slots_per_register = 1
|
||||
#else
|
||||
number_of_registers = 16,
|
||||
number_of_byte_registers = 16
|
||||
number_of_byte_registers = 16,
|
||||
max_slots_per_register = 1
|
||||
#endif // AMD64
|
||||
};
|
||||
|
||||
@ -143,9 +145,11 @@ class XMMRegisterImpl: public AbstractRegisterImpl {
|
||||
public:
|
||||
enum {
|
||||
#ifndef AMD64
|
||||
number_of_registers = 8
|
||||
number_of_registers = 8,
|
||||
max_slots_per_register = 16 // 512-bit
|
||||
#else
|
||||
number_of_registers = 16
|
||||
number_of_registers = 32,
|
||||
max_slots_per_register = 16 // 512-bit
|
||||
#endif // AMD64
|
||||
};
|
||||
|
||||
@ -183,6 +187,22 @@ CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm12, (12));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm13, (13));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm14, (14));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm15, (15));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm16, (16));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm17, (17));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm18, (18));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm19, (19));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm20, (20));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm21, (21));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm22, (22));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm23, (23));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm24, (24));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm25, (25));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm26, (26));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm27, (27));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm28, (28));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm29, (29));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm30, (30));
|
||||
CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm31, (31));
|
||||
#endif // AMD64
|
||||
|
||||
// Only used by the 32bit stubGenerator. These can't be described by vmreg and hence
|
||||
@ -200,6 +220,46 @@ CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx5 , ( 5));
|
||||
CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx6 , ( 6));
|
||||
CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx7 , ( 7));
|
||||
|
||||
// Use XMMRegister as shortcut
|
||||
class KRegisterImpl;
|
||||
typedef KRegisterImpl* KRegister;
|
||||
|
||||
inline KRegister as_KRegister(int encoding) {
|
||||
return (KRegister)(intptr_t)encoding;
|
||||
}
|
||||
|
||||
// The implementation of XMM registers for the IA32 architecture
|
||||
class KRegisterImpl : public AbstractRegisterImpl {
|
||||
public:
|
||||
enum {
|
||||
number_of_registers = 8,
|
||||
max_slots_per_register = 1
|
||||
};
|
||||
|
||||
// construction
|
||||
friend KRegister as_KRegister(int encoding);
|
||||
|
||||
inline VMReg as_VMReg();
|
||||
|
||||
// derived registers, offsets, and addresses
|
||||
KRegister successor() const { return as_KRegister(encoding() + 1); }
|
||||
|
||||
// accessors
|
||||
int encoding() const { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this)); return (intptr_t)this; }
|
||||
bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
|
||||
const char* name() const;
|
||||
};
|
||||
|
||||
// The Mask registers, for AVX3 enabled and up chips
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, knoreg, (-1));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k0, (0));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k1, (1));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k2, (2));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k3, (3));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k4, (4));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k5, (5));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k6, (6));
|
||||
CONSTANT_REGISTER_DECLARATION(KRegister, k7, (7));
|
||||
|
||||
// Need to know the total number of registers of all sorts for SharedInfo.
|
||||
// Define a class that exports it.
|
||||
@ -211,18 +271,20 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl {
|
||||
// There is no requirement that any ordering here matches any ordering c2 gives
|
||||
// it's optoregs.
|
||||
|
||||
number_of_registers = RegisterImpl::number_of_registers +
|
||||
number_of_registers = RegisterImpl::number_of_registers +
|
||||
#ifdef AMD64
|
||||
RegisterImpl::number_of_registers + // "H" half of a 64bit register
|
||||
RegisterImpl::number_of_registers + // "H" half of a 64bit register
|
||||
#endif // AMD64
|
||||
2 * FloatRegisterImpl::number_of_registers +
|
||||
8 * XMMRegisterImpl::number_of_registers +
|
||||
1 // eflags
|
||||
2 * FloatRegisterImpl::number_of_registers +
|
||||
XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers +
|
||||
KRegisterImpl::number_of_registers + // mask registers
|
||||
1 // eflags
|
||||
};
|
||||
|
||||
static const int max_gpr;
|
||||
static const int max_fpr;
|
||||
static const int max_xmm;
|
||||
static const int max_kpr;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -117,9 +117,9 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
int vect_words = 0;
|
||||
#ifdef COMPILER2
|
||||
if (save_vectors) {
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
|
||||
// Save upper half of YMM registes
|
||||
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
|
||||
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
|
||||
// Save upper half of ZMM/YMM registers :
|
||||
vect_words = 8 * 16 / wordSize;
|
||||
additional_frame_words += vect_words;
|
||||
}
|
||||
@ -216,6 +216,17 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
__ vextractf128h(Address(rsp, 80),xmm5);
|
||||
__ vextractf128h(Address(rsp, 96),xmm6);
|
||||
__ vextractf128h(Address(rsp,112),xmm7);
|
||||
if (UseAVX > 2) {
|
||||
__ subptr(rsp, 256); // Save upper half of ZMM registes
|
||||
__ vextractf64x4h(Address(rsp, 0), xmm0);
|
||||
__ vextractf64x4h(Address(rsp, 32), xmm1);
|
||||
__ vextractf64x4h(Address(rsp, 64), xmm2);
|
||||
__ vextractf64x4h(Address(rsp, 96), xmm3);
|
||||
__ vextractf64x4h(Address(rsp, 128), xmm4);
|
||||
__ vextractf64x4h(Address(rsp, 160), xmm5);
|
||||
__ vextractf64x4h(Address(rsp, 192), xmm6);
|
||||
__ vextractf64x4h(Address(rsp, 224), xmm7);
|
||||
}
|
||||
}
|
||||
|
||||
// Set an oopmap for the call site. This oopmap will map all
|
||||
@ -283,8 +294,8 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
|
||||
int additional_frame_bytes = 0;
|
||||
#ifdef COMPILER2
|
||||
if (restore_vectors) {
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
|
||||
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
|
||||
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
|
||||
additional_frame_bytes = 128;
|
||||
}
|
||||
#else
|
||||
@ -324,6 +335,18 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
|
||||
__ vinsertf128h(xmm6, Address(rsp, 96));
|
||||
__ vinsertf128h(xmm7, Address(rsp,112));
|
||||
__ addptr(rsp, additional_frame_bytes);
|
||||
if (UseAVX > 2) {
|
||||
additional_frame_bytes = 256;
|
||||
__ vinsertf64x4h(xmm0, Address(rsp, 0));
|
||||
__ vinsertf64x4h(xmm1, Address(rsp, 32));
|
||||
__ vinsertf64x4h(xmm2, Address(rsp, 64));
|
||||
__ vinsertf64x4h(xmm3, Address(rsp, 96));
|
||||
__ vinsertf64x4h(xmm4, Address(rsp, 128));
|
||||
__ vinsertf64x4h(xmm5, Address(rsp, 160));
|
||||
__ vinsertf64x4h(xmm6, Address(rsp, 192));
|
||||
__ vinsertf64x4h(xmm7, Address(rsp, 224));
|
||||
__ addptr(rsp, additional_frame_bytes);
|
||||
}
|
||||
}
|
||||
__ pop_FPU_state();
|
||||
__ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
|
||||
|
||||
@ -86,7 +86,23 @@ class RegisterSaver {
|
||||
DEF_XMM_OFFS(13),
|
||||
DEF_XMM_OFFS(14),
|
||||
DEF_XMM_OFFS(15),
|
||||
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
|
||||
DEF_XMM_OFFS(16),
|
||||
DEF_XMM_OFFS(17),
|
||||
DEF_XMM_OFFS(18),
|
||||
DEF_XMM_OFFS(19),
|
||||
DEF_XMM_OFFS(20),
|
||||
DEF_XMM_OFFS(21),
|
||||
DEF_XMM_OFFS(22),
|
||||
DEF_XMM_OFFS(23),
|
||||
DEF_XMM_OFFS(24),
|
||||
DEF_XMM_OFFS(25),
|
||||
DEF_XMM_OFFS(26),
|
||||
DEF_XMM_OFFS(27),
|
||||
DEF_XMM_OFFS(28),
|
||||
DEF_XMM_OFFS(29),
|
||||
DEF_XMM_OFFS(30),
|
||||
DEF_XMM_OFFS(31),
|
||||
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
|
||||
fpu_stateH_end,
|
||||
r15_off, r15H_off,
|
||||
r14_off, r14H_off,
|
||||
@ -136,13 +152,21 @@ class RegisterSaver {
|
||||
|
||||
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
|
||||
int vect_words = 0;
|
||||
int num_xmm_regs = 16;
|
||||
if (UseAVX > 2) {
|
||||
num_xmm_regs = 32;
|
||||
}
|
||||
#ifdef COMPILER2
|
||||
if (save_vectors) {
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
|
||||
// Save upper half of YMM registes
|
||||
vect_words = 16 * 16 / wordSize;
|
||||
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
|
||||
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
|
||||
// Save upper half of YMM registers
|
||||
vect_words = 16 * num_xmm_regs / wordSize;
|
||||
additional_frame_words += vect_words;
|
||||
if (UseAVX > 2) {
|
||||
// Save upper half of ZMM registers as well
|
||||
additional_frame_words += vect_words;
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(!save_vectors, "vectors are generated only by C2");
|
||||
@ -150,7 +174,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
|
||||
// Always make the frame size 16-byte aligned
|
||||
int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
|
||||
reg_save_size*BytesPerInt, 16);
|
||||
reg_save_size*BytesPerInt, num_xmm_regs);
|
||||
// OopMap frame size is in compiler stack slots (jint's) not bytes or words
|
||||
int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
|
||||
// The caller will allocate additional_frame_words
|
||||
@ -169,24 +193,77 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
__ push_CPU_state(); // Push a multiple of 16 bytes
|
||||
|
||||
if (vect_words > 0) {
|
||||
assert(vect_words*wordSize == 256, "");
|
||||
__ subptr(rsp, 256); // Save upper half of YMM registes
|
||||
__ vextractf128h(Address(rsp, 0),xmm0);
|
||||
__ vextractf128h(Address(rsp, 16),xmm1);
|
||||
__ vextractf128h(Address(rsp, 32),xmm2);
|
||||
__ vextractf128h(Address(rsp, 48),xmm3);
|
||||
__ vextractf128h(Address(rsp, 64),xmm4);
|
||||
__ vextractf128h(Address(rsp, 80),xmm5);
|
||||
__ vextractf128h(Address(rsp, 96),xmm6);
|
||||
__ vextractf128h(Address(rsp,112),xmm7);
|
||||
__ vextractf128h(Address(rsp,128),xmm8);
|
||||
__ vextractf128h(Address(rsp,144),xmm9);
|
||||
__ vextractf128h(Address(rsp,160),xmm10);
|
||||
__ vextractf128h(Address(rsp,176),xmm11);
|
||||
__ vextractf128h(Address(rsp,192),xmm12);
|
||||
__ vextractf128h(Address(rsp,208),xmm13);
|
||||
__ vextractf128h(Address(rsp,224),xmm14);
|
||||
__ vextractf128h(Address(rsp,240),xmm15);
|
||||
assert(vect_words*wordSize >= 256, "");
|
||||
__ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
|
||||
__ vextractf128h(Address(rsp, 0), xmm0);
|
||||
__ vextractf128h(Address(rsp, 16), xmm1);
|
||||
__ vextractf128h(Address(rsp, 32), xmm2);
|
||||
__ vextractf128h(Address(rsp, 48), xmm3);
|
||||
__ vextractf128h(Address(rsp, 64), xmm4);
|
||||
__ vextractf128h(Address(rsp, 80), xmm5);
|
||||
__ vextractf128h(Address(rsp, 96), xmm6);
|
||||
__ vextractf128h(Address(rsp, 112), xmm7);
|
||||
__ vextractf128h(Address(rsp, 128), xmm8);
|
||||
__ vextractf128h(Address(rsp, 144), xmm9);
|
||||
__ vextractf128h(Address(rsp, 160), xmm10);
|
||||
__ vextractf128h(Address(rsp, 176), xmm11);
|
||||
__ vextractf128h(Address(rsp, 192), xmm12);
|
||||
__ vextractf128h(Address(rsp, 208), xmm13);
|
||||
__ vextractf128h(Address(rsp, 224), xmm14);
|
||||
__ vextractf128h(Address(rsp, 240), xmm15);
|
||||
if (UseAVX > 2) {
|
||||
__ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
|
||||
__ vextractf128h(Address(rsp, 0), xmm16);
|
||||
__ vextractf128h(Address(rsp, 16), xmm17);
|
||||
__ vextractf128h(Address(rsp, 32), xmm18);
|
||||
__ vextractf128h(Address(rsp, 48), xmm19);
|
||||
__ vextractf128h(Address(rsp, 64), xmm20);
|
||||
__ vextractf128h(Address(rsp, 80), xmm21);
|
||||
__ vextractf128h(Address(rsp, 96), xmm22);
|
||||
__ vextractf128h(Address(rsp, 112), xmm23);
|
||||
__ vextractf128h(Address(rsp, 128), xmm24);
|
||||
__ vextractf128h(Address(rsp, 144), xmm25);
|
||||
__ vextractf128h(Address(rsp, 160), xmm26);
|
||||
__ vextractf128h(Address(rsp, 176), xmm27);
|
||||
__ vextractf128h(Address(rsp, 192), xmm28);
|
||||
__ vextractf128h(Address(rsp, 208), xmm29);
|
||||
__ vextractf128h(Address(rsp, 224), xmm30);
|
||||
__ vextractf128h(Address(rsp, 240), xmm31);
|
||||
// Now handle the ZMM registers (0..31)
|
||||
__ subptr(rsp, 1024); // Save upper half of ZMM registes
|
||||
__ vextractf64x4h(Address(rsp, 0), xmm0);
|
||||
__ vextractf64x4h(Address(rsp, 32), xmm1);
|
||||
__ vextractf64x4h(Address(rsp, 64), xmm2);
|
||||
__ vextractf64x4h(Address(rsp, 96), xmm3);
|
||||
__ vextractf64x4h(Address(rsp, 128), xmm4);
|
||||
__ vextractf64x4h(Address(rsp, 160), xmm5);
|
||||
__ vextractf64x4h(Address(rsp, 192), xmm6);
|
||||
__ vextractf64x4h(Address(rsp, 224), xmm7);
|
||||
__ vextractf64x4h(Address(rsp, 256), xmm8);
|
||||
__ vextractf64x4h(Address(rsp, 288), xmm9);
|
||||
__ vextractf64x4h(Address(rsp, 320), xmm10);
|
||||
__ vextractf64x4h(Address(rsp, 352), xmm11);
|
||||
__ vextractf64x4h(Address(rsp, 384), xmm12);
|
||||
__ vextractf64x4h(Address(rsp, 416), xmm13);
|
||||
__ vextractf64x4h(Address(rsp, 448), xmm14);
|
||||
__ vextractf64x4h(Address(rsp, 480), xmm15);
|
||||
__ vextractf64x4h(Address(rsp, 512), xmm16);
|
||||
__ vextractf64x4h(Address(rsp, 544), xmm17);
|
||||
__ vextractf64x4h(Address(rsp, 576), xmm18);
|
||||
__ vextractf64x4h(Address(rsp, 608), xmm19);
|
||||
__ vextractf64x4h(Address(rsp, 640), xmm20);
|
||||
__ vextractf64x4h(Address(rsp, 672), xmm21);
|
||||
__ vextractf64x4h(Address(rsp, 704), xmm22);
|
||||
__ vextractf64x4h(Address(rsp, 736), xmm23);
|
||||
__ vextractf64x4h(Address(rsp, 768), xmm24);
|
||||
__ vextractf64x4h(Address(rsp, 800), xmm25);
|
||||
__ vextractf64x4h(Address(rsp, 832), xmm26);
|
||||
__ vextractf64x4h(Address(rsp, 864), xmm27);
|
||||
__ vextractf64x4h(Address(rsp, 896), xmm28);
|
||||
__ vextractf64x4h(Address(rsp, 928), xmm29);
|
||||
__ vextractf64x4h(Address(rsp, 960), xmm30);
|
||||
__ vextractf64x4h(Address(rsp, 992), xmm31);
|
||||
}
|
||||
}
|
||||
if (frame::arg_reg_save_area_bytes != 0) {
|
||||
// Allocate argument register save area
|
||||
@ -235,6 +312,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
|
||||
if (UseAVX > 2) {
|
||||
map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
|
||||
}
|
||||
|
||||
// %%% These should all be a waste but we'll keep things as they were for now
|
||||
if (true) {
|
||||
@ -269,6 +364,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
|
||||
map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
|
||||
if (UseAVX > 2) {
|
||||
map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
|
||||
map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
|
||||
}
|
||||
}
|
||||
|
||||
return map;
|
||||
@ -281,9 +394,9 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
|
||||
}
|
||||
#ifdef COMPILER2
|
||||
if (restore_vectors) {
|
||||
// Restore upper half of YMM registes.
|
||||
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
|
||||
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
|
||||
// Restore upper half of YMM registes (0..15)
|
||||
assert(UseAVX > 0, "512bit vectors are supported only with AVX");
|
||||
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
|
||||
__ vinsertf128h(xmm0, Address(rsp, 0));
|
||||
__ vinsertf128h(xmm1, Address(rsp, 16));
|
||||
__ vinsertf128h(xmm2, Address(rsp, 32));
|
||||
@ -301,6 +414,60 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
|
||||
__ vinsertf128h(xmm14, Address(rsp,224));
|
||||
__ vinsertf128h(xmm15, Address(rsp,240));
|
||||
__ addptr(rsp, 256);
|
||||
if (UseAVX > 2) {
|
||||
// Restore upper half of YMM registes (16..31)
|
||||
__ vinsertf128h(xmm16, Address(rsp, 0));
|
||||
__ vinsertf128h(xmm17, Address(rsp, 16));
|
||||
__ vinsertf128h(xmm18, Address(rsp, 32));
|
||||
__ vinsertf128h(xmm19, Address(rsp, 48));
|
||||
__ vinsertf128h(xmm20, Address(rsp, 64));
|
||||
__ vinsertf128h(xmm21, Address(rsp, 80));
|
||||
__ vinsertf128h(xmm22, Address(rsp, 96));
|
||||
__ vinsertf128h(xmm23, Address(rsp,112));
|
||||
__ vinsertf128h(xmm24, Address(rsp,128));
|
||||
__ vinsertf128h(xmm25, Address(rsp,144));
|
||||
__ vinsertf128h(xmm26, Address(rsp,160));
|
||||
__ vinsertf128h(xmm27, Address(rsp,176));
|
||||
__ vinsertf128h(xmm28, Address(rsp,192));
|
||||
__ vinsertf128h(xmm29, Address(rsp,208));
|
||||
__ vinsertf128h(xmm30, Address(rsp,224));
|
||||
__ vinsertf128h(xmm31, Address(rsp,240));
|
||||
__ addptr(rsp, 256);
|
||||
// Restore upper half of ZMM registes.
|
||||
__ vinsertf64x4h(xmm0, Address(rsp, 0));
|
||||
__ vinsertf64x4h(xmm1, Address(rsp, 32));
|
||||
__ vinsertf64x4h(xmm2, Address(rsp, 64));
|
||||
__ vinsertf64x4h(xmm3, Address(rsp, 96));
|
||||
__ vinsertf64x4h(xmm4, Address(rsp, 128));
|
||||
__ vinsertf64x4h(xmm5, Address(rsp, 160));
|
||||
__ vinsertf64x4h(xmm6, Address(rsp, 192));
|
||||
__ vinsertf64x4h(xmm7, Address(rsp, 224));
|
||||
__ vinsertf64x4h(xmm8, Address(rsp, 256));
|
||||
__ vinsertf64x4h(xmm9, Address(rsp, 288));
|
||||
__ vinsertf64x4h(xmm10, Address(rsp, 320));
|
||||
__ vinsertf64x4h(xmm11, Address(rsp, 352));
|
||||
__ vinsertf64x4h(xmm12, Address(rsp, 384));
|
||||
__ vinsertf64x4h(xmm13, Address(rsp, 416));
|
||||
__ vinsertf64x4h(xmm14, Address(rsp, 448));
|
||||
__ vinsertf64x4h(xmm15, Address(rsp, 480));
|
||||
__ vinsertf64x4h(xmm16, Address(rsp, 512));
|
||||
__ vinsertf64x4h(xmm17, Address(rsp, 544));
|
||||
__ vinsertf64x4h(xmm18, Address(rsp, 576));
|
||||
__ vinsertf64x4h(xmm19, Address(rsp, 608));
|
||||
__ vinsertf64x4h(xmm20, Address(rsp, 640));
|
||||
__ vinsertf64x4h(xmm21, Address(rsp, 672));
|
||||
__ vinsertf64x4h(xmm22, Address(rsp, 704));
|
||||
__ vinsertf64x4h(xmm23, Address(rsp, 736));
|
||||
__ vinsertf64x4h(xmm24, Address(rsp, 768));
|
||||
__ vinsertf64x4h(xmm25, Address(rsp, 800));
|
||||
__ vinsertf64x4h(xmm26, Address(rsp, 832));
|
||||
__ vinsertf64x4h(xmm27, Address(rsp, 864));
|
||||
__ vinsertf64x4h(xmm28, Address(rsp, 896));
|
||||
__ vinsertf64x4h(xmm29, Address(rsp, 928));
|
||||
__ vinsertf64x4h(xmm30, Address(rsp, 960));
|
||||
__ vinsertf64x4h(xmm31, Address(rsp, 992));
|
||||
__ subptr(rsp, 1024);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(!restore_vectors, "vectors are generated only by C2");
|
||||
|
||||
@ -166,6 +166,13 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ movptr(saved_rdi, rdi);
|
||||
__ movptr(saved_rsi, rsi);
|
||||
__ movptr(saved_rbx, rbx);
|
||||
|
||||
// provide initial value for required masks
|
||||
if (UseAVX > 2) {
|
||||
__ movl(rbx, 0xffff);
|
||||
__ kmovdl(k1, rbx);
|
||||
}
|
||||
|
||||
// save and initialize %mxcsr
|
||||
if (sse_save) {
|
||||
Label skip_ldmx;
|
||||
@ -794,7 +801,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ BIND(L_copy_64_bytes_loop);
|
||||
|
||||
if (UseUnalignedLoadStores) {
|
||||
if (UseAVX >= 2) {
|
||||
if (UseAVX > 2) {
|
||||
__ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
|
||||
__ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
|
||||
} else if (UseAVX == 2) {
|
||||
__ vmovdqu(xmm0, Address(from, 0));
|
||||
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
|
||||
__ vmovdqu(xmm1, Address(from, 32));
|
||||
@ -833,7 +843,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ subl(qword_count, 8);
|
||||
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
|
||||
|
||||
if (UseUnalignedLoadStores && (UseAVX >= 2)) {
|
||||
if (UseUnalignedLoadStores && (UseAVX == 2)) {
|
||||
// clean upper bits of YMM registers
|
||||
__ vpxor(xmm0, xmm0);
|
||||
__ vpxor(xmm1, xmm1);
|
||||
|
||||
@ -137,8 +137,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// [ return_from_Java ] <--- rsp
|
||||
// [ argument word n ]
|
||||
// ...
|
||||
// -28 [ argument word 1 ]
|
||||
// -27 [ saved xmm15 ] <--- rsp_after_call
|
||||
// -60 [ argument word 1 ]
|
||||
// -59 [ saved xmm31 ] <--- rsp after_call
|
||||
// [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank)
|
||||
// -27 [ saved xmm15 ]
|
||||
// [ saved xmm7-xmm14 ]
|
||||
// -9 [ saved xmm6 ] (each xmm register takes 2 slots)
|
||||
// -7 [ saved r15 ]
|
||||
@ -166,7 +168,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
enum call_stub_layout {
|
||||
#ifdef _WIN64
|
||||
xmm_save_first = 6, // save from xmm6
|
||||
xmm_save_last = 15, // to xmm15
|
||||
xmm_save_last = 31, // to xmm31
|
||||
xmm_save_base = -9,
|
||||
rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
|
||||
r15_off = -7,
|
||||
@ -262,9 +264,19 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ movptr(r13_save, r13);
|
||||
__ movptr(r14_save, r14);
|
||||
__ movptr(r15_save, r15);
|
||||
if (UseAVX > 2) {
|
||||
__ movl(rbx, 0xffff);
|
||||
__ kmovql(k1, rbx);
|
||||
}
|
||||
#ifdef _WIN64
|
||||
for (int i = 6; i <= 15; i++) {
|
||||
__ movdqu(xmm_save(i), as_XMMRegister(i));
|
||||
if (UseAVX > 2) {
|
||||
for (int i = 6; i <= 31; i++) {
|
||||
__ movdqu(xmm_save(i), as_XMMRegister(i));
|
||||
}
|
||||
} else {
|
||||
for (int i = 6; i <= 15; i++) {
|
||||
__ movdqu(xmm_save(i), as_XMMRegister(i));
|
||||
}
|
||||
}
|
||||
|
||||
const Address rdi_save(rbp, rdi_off * wordSize);
|
||||
@ -1318,7 +1330,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
Label L_end;
|
||||
// Copy 64-bytes per iteration
|
||||
__ BIND(L_loop);
|
||||
if (UseAVX >= 2) {
|
||||
if (UseAVX > 2) {
|
||||
__ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
|
||||
__ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
|
||||
} else if (UseAVX == 2) {
|
||||
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
|
||||
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
|
||||
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
|
||||
@ -1395,7 +1410,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
Label L_end;
|
||||
// Copy 64-bytes per iteration
|
||||
__ BIND(L_loop);
|
||||
if (UseAVX >= 2) {
|
||||
if (UseAVX > 2) {
|
||||
__ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
|
||||
__ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
|
||||
} else if (UseAVX == 2) {
|
||||
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
|
||||
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
|
||||
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
|
||||
|
||||
@ -35,7 +35,7 @@
|
||||
int VM_Version::_cpu;
|
||||
int VM_Version::_model;
|
||||
int VM_Version::_stepping;
|
||||
int VM_Version::_cpuFeatures;
|
||||
uint64_t VM_Version::_cpuFeatures;
|
||||
const char* VM_Version::_features_str = "";
|
||||
VM_Version::CpuidInfo VM_Version::_cpuid_info = { 0, };
|
||||
|
||||
@ -45,7 +45,7 @@ address VM_Version::_cpuinfo_segv_addr = 0;
|
||||
address VM_Version::_cpuinfo_cont_addr = 0;
|
||||
|
||||
static BufferBlob* stub_blob;
|
||||
static const int stub_size = 600;
|
||||
static const int stub_size = 1000;
|
||||
|
||||
extern "C" {
|
||||
typedef void (*get_cpu_info_stub_t)(void*);
|
||||
@ -60,15 +60,16 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
|
||||
|
||||
address generate_get_cpu_info() {
|
||||
// Flags to test CPU type.
|
||||
const uint32_t HS_EFL_AC = 0x40000;
|
||||
const uint32_t HS_EFL_ID = 0x200000;
|
||||
const uint32_t HS_EFL_AC = 0x40000;
|
||||
const uint32_t HS_EFL_ID = 0x200000;
|
||||
// Values for when we don't have a CPUID instruction.
|
||||
const int CPU_FAMILY_SHIFT = 8;
|
||||
const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT);
|
||||
const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
|
||||
const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT);
|
||||
const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
|
||||
|
||||
Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
|
||||
Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
|
||||
Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done, wrapup;
|
||||
Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check;
|
||||
|
||||
StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
|
||||
# define __ _masm->
|
||||
@ -241,53 +242,6 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
|
||||
__ movl(Address(rsi, 0), rax);
|
||||
__ movl(Address(rsi, 4), rdx);
|
||||
|
||||
__ andl(rax, 0x6); // xcr0 bits sse | ymm
|
||||
__ cmpl(rax, 0x6);
|
||||
__ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported
|
||||
|
||||
//
|
||||
// Some OSs have a bug when upper 128bits of YMM
|
||||
// registers are not restored after a signal processing.
|
||||
// Generate SEGV here (reference through NULL)
|
||||
// and check upper YMM bits after it.
|
||||
//
|
||||
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
|
||||
intx saved_useavx = UseAVX;
|
||||
intx saved_usesse = UseSSE;
|
||||
UseAVX = 1;
|
||||
UseSSE = 2;
|
||||
|
||||
// load value into all 32 bytes of ymm7 register
|
||||
__ movl(rcx, VM_Version::ymm_test_value());
|
||||
|
||||
__ movdl(xmm0, rcx);
|
||||
__ pshufd(xmm0, xmm0, 0x00);
|
||||
__ vinsertf128h(xmm0, xmm0, xmm0);
|
||||
__ vmovdqu(xmm7, xmm0);
|
||||
#ifdef _LP64
|
||||
__ vmovdqu(xmm8, xmm0);
|
||||
__ vmovdqu(xmm15, xmm0);
|
||||
#endif
|
||||
|
||||
__ xorl(rsi, rsi);
|
||||
VM_Version::set_cpuinfo_segv_addr( __ pc() );
|
||||
// Generate SEGV
|
||||
__ movl(rax, Address(rsi, 0));
|
||||
|
||||
VM_Version::set_cpuinfo_cont_addr( __ pc() );
|
||||
// Returns here after signal. Save xmm0 to check it later.
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
|
||||
__ vmovdqu(Address(rsi, 0), xmm0);
|
||||
__ vmovdqu(Address(rsi, 32), xmm7);
|
||||
#ifdef _LP64
|
||||
__ vmovdqu(Address(rsi, 64), xmm8);
|
||||
__ vmovdqu(Address(rsi, 96), xmm15);
|
||||
#endif
|
||||
|
||||
VM_Version::clean_cpuFeatures();
|
||||
UseAVX = saved_useavx;
|
||||
UseSSE = saved_usesse;
|
||||
|
||||
//
|
||||
// cpuid(0x7) Structured Extended Features
|
||||
//
|
||||
@ -364,9 +318,143 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
|
||||
__ movl(Address(rsi,12), rdx);
|
||||
|
||||
//
|
||||
// return
|
||||
// Check if OS has enabled XGETBV instruction to access XCR0
|
||||
// (OSXSAVE feature flag) and CPU supports AVX
|
||||
//
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
|
||||
__ movl(rcx, 0x18000000); // cpuid1 bits osxsave | avx
|
||||
__ andl(rcx, Address(rsi, 8)); // cpuid1 bits osxsave | avx
|
||||
__ cmpl(rcx, 0x18000000);
|
||||
__ jccb(Assembler::notEqual, done); // jump if AVX is not supported
|
||||
|
||||
__ movl(rax, 0x6);
|
||||
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
|
||||
__ cmpl(rax, 0x6);
|
||||
__ jccb(Assembler::equal, start_simd_check); // return if AVX is not supported
|
||||
|
||||
// we need to bridge farther than imm8, so we use this island as a thunk
|
||||
__ bind(done);
|
||||
__ jmp(wrapup);
|
||||
|
||||
__ bind(start_simd_check);
|
||||
//
|
||||
// Some OSs have a bug when upper 128/256bits of YMM/ZMM
|
||||
// registers are not restored after a signal processing.
|
||||
// Generate SEGV here (reference through NULL)
|
||||
// and check upper YMM/ZMM bits after it.
|
||||
//
|
||||
intx saved_useavx = UseAVX;
|
||||
intx saved_usesse = UseSSE;
|
||||
// check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
|
||||
__ movl(rax, 0x10000);
|
||||
__ andl(rax, Address(rsi, 4)); // xcr0 bits sse | ymm
|
||||
__ cmpl(rax, 0x10000);
|
||||
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.opmask
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.zmm512
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.zmm32
|
||||
__ movl(rax, 0xE0);
|
||||
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
|
||||
__ cmpl(rax, 0xE0);
|
||||
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
|
||||
|
||||
// EVEX setup: run in lowest evex mode
|
||||
VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
|
||||
UseAVX = 3;
|
||||
UseSSE = 2;
|
||||
// load value into all 64 bytes of zmm7 register
|
||||
__ movl(rcx, VM_Version::ymm_test_value());
|
||||
__ movdl(xmm0, rcx);
|
||||
__ movl(rcx, 0xffff);
|
||||
#ifdef _LP64
|
||||
__ kmovql(k1, rcx);
|
||||
#else
|
||||
__ kmovdl(k1, rcx);
|
||||
#endif
|
||||
__ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
|
||||
__ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
|
||||
#ifdef _LP64
|
||||
__ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
|
||||
__ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
|
||||
#endif
|
||||
VM_Version::clean_cpuFeatures();
|
||||
__ jmp(save_restore_except);
|
||||
|
||||
__ bind(legacy_setup);
|
||||
// AVX setup
|
||||
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
|
||||
UseAVX = 1;
|
||||
UseSSE = 2;
|
||||
// load value into all 32 bytes of ymm7 register
|
||||
__ movl(rcx, VM_Version::ymm_test_value());
|
||||
|
||||
__ movdl(xmm0, rcx);
|
||||
__ pshufd(xmm0, xmm0, 0x00);
|
||||
__ vinsertf128h(xmm0, xmm0, xmm0);
|
||||
__ vmovdqu(xmm7, xmm0);
|
||||
#ifdef _LP64
|
||||
__ vmovdqu(xmm8, xmm0);
|
||||
__ vmovdqu(xmm15, xmm0);
|
||||
#endif
|
||||
VM_Version::clean_cpuFeatures();
|
||||
|
||||
__ bind(save_restore_except);
|
||||
__ xorl(rsi, rsi);
|
||||
VM_Version::set_cpuinfo_segv_addr(__ pc());
|
||||
// Generate SEGV
|
||||
__ movl(rax, Address(rsi, 0));
|
||||
|
||||
VM_Version::set_cpuinfo_cont_addr(__ pc());
|
||||
// Returns here after signal. Save xmm0 to check it later.
|
||||
|
||||
// check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
|
||||
__ movl(rax, 0x10000);
|
||||
__ andl(rax, Address(rsi, 4));
|
||||
__ cmpl(rax, 0x10000);
|
||||
__ jccb(Assembler::notEqual, legacy_save_restore);
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.opmask
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.zmm512
|
||||
// check _cpuid_info.xem_xcr0_eax.bits.zmm32
|
||||
__ movl(rax, 0xE0);
|
||||
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
|
||||
__ cmpl(rax, 0xE0);
|
||||
__ jccb(Assembler::notEqual, legacy_save_restore);
|
||||
|
||||
// EVEX check: run in lowest evex mode
|
||||
VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
|
||||
UseAVX = 3;
|
||||
UseSSE = 2;
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
|
||||
__ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
|
||||
__ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
|
||||
#ifdef _LP64
|
||||
__ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
|
||||
__ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
|
||||
#endif
|
||||
VM_Version::clean_cpuFeatures();
|
||||
UseAVX = saved_useavx;
|
||||
UseSSE = saved_usesse;
|
||||
__ jmp(wrapup);
|
||||
|
||||
__ bind(legacy_save_restore);
|
||||
// AVX check
|
||||
VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
|
||||
UseAVX = 1;
|
||||
UseSSE = 2;
|
||||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
|
||||
__ vmovdqu(Address(rsi, 0), xmm0);
|
||||
__ vmovdqu(Address(rsi, 32), xmm7);
|
||||
#ifdef _LP64
|
||||
__ vmovdqu(Address(rsi, 64), xmm8);
|
||||
__ vmovdqu(Address(rsi, 96), xmm15);
|
||||
#endif
|
||||
VM_Version::clean_cpuFeatures();
|
||||
UseAVX = saved_useavx;
|
||||
UseSSE = saved_usesse;
|
||||
|
||||
__ bind(wrapup);
|
||||
__ popf();
|
||||
__ pop(rsi);
|
||||
__ pop(rbx);
|
||||
@ -459,6 +547,29 @@ void VM_Version::get_processor_features() {
|
||||
if (UseSSE < 1)
|
||||
_cpuFeatures &= ~CPU_SSE;
|
||||
|
||||
// first try initial setting and detect what we can support
|
||||
if (UseAVX > 0) {
|
||||
if (UseAVX > 2 && supports_evex()) {
|
||||
UseAVX = 3;
|
||||
} else if (UseAVX > 1 && supports_avx2()) {
|
||||
UseAVX = 2;
|
||||
} else if (UseAVX > 0 && supports_avx()) {
|
||||
UseAVX = 1;
|
||||
} else {
|
||||
UseAVX = 0;
|
||||
}
|
||||
} else if (UseAVX < 0) {
|
||||
UseAVX = 0;
|
||||
}
|
||||
|
||||
if (UseAVX < 3) {
|
||||
_cpuFeatures &= ~CPU_AVX512F;
|
||||
_cpuFeatures &= ~CPU_AVX512DQ;
|
||||
_cpuFeatures &= ~CPU_AVX512CD;
|
||||
_cpuFeatures &= ~CPU_AVX512BW;
|
||||
_cpuFeatures &= ~CPU_AVX512VL;
|
||||
}
|
||||
|
||||
if (UseAVX < 2)
|
||||
_cpuFeatures &= ~CPU_AVX2;
|
||||
|
||||
@ -474,7 +585,7 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
|
||||
char buf[256];
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
cores_per_cpu(), threads_per_core(),
|
||||
cpu_family(), _model, _stepping,
|
||||
(supports_cmov() ? ", cmov" : ""),
|
||||
@ -504,7 +615,8 @@ void VM_Version::get_processor_features() {
|
||||
(supports_tscinv() ? ", tscinv": ""),
|
||||
(supports_bmi1() ? ", bmi1" : ""),
|
||||
(supports_bmi2() ? ", bmi2" : ""),
|
||||
(supports_adx() ? ", adx" : ""));
|
||||
(supports_adx() ? ", adx" : ""),
|
||||
(supports_evex() ? ", evex" : ""));
|
||||
_features_str = os::strdup(buf);
|
||||
|
||||
// UseSSE is set to the smaller of what hardware supports and what
|
||||
@ -521,13 +633,6 @@ void VM_Version::get_processor_features() {
|
||||
if (!supports_sse ()) // Drop to 0 if no SSE support
|
||||
UseSSE = 0;
|
||||
|
||||
if (UseAVX > 2) UseAVX=2;
|
||||
if (UseAVX < 0) UseAVX=0;
|
||||
if (!supports_avx2()) // Drop to 1 if no AVX2 support
|
||||
UseAVX = MIN2((intx)1,UseAVX);
|
||||
if (!supports_avx ()) // Drop to 0 if no AVX support
|
||||
UseAVX = 0;
|
||||
|
||||
// Use AES instructions if available.
|
||||
if (supports_aes()) {
|
||||
if (FLAG_IS_DEFAULT(UseAES)) {
|
||||
@ -598,7 +703,8 @@ void VM_Version::get_processor_features() {
|
||||
if ((_model == CPU_MODEL_HASWELL_E3) ||
|
||||
(_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) ||
|
||||
(_model == CPU_MODEL_BROADWELL && _stepping < 4)) {
|
||||
if (!UnlockExperimentalVMOptions) {
|
||||
// currently a collision between SKL and HSW_E3
|
||||
if (!UnlockExperimentalVMOptions && UseAVX < 3) {
|
||||
vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag.");
|
||||
} else {
|
||||
warning("UseRTMLocking is only available as experimental option on this platform.");
|
||||
@ -651,10 +757,10 @@ void VM_Version::get_processor_features() {
|
||||
if (MaxVectorSize > 0) {
|
||||
if (!is_power_of_2(MaxVectorSize)) {
|
||||
warning("MaxVectorSize must be a power of 2");
|
||||
FLAG_SET_DEFAULT(MaxVectorSize, 32);
|
||||
FLAG_SET_DEFAULT(MaxVectorSize, 64);
|
||||
}
|
||||
if (MaxVectorSize > 32) {
|
||||
FLAG_SET_DEFAULT(MaxVectorSize, 32);
|
||||
if (MaxVectorSize > 64) {
|
||||
FLAG_SET_DEFAULT(MaxVectorSize, 64);
|
||||
}
|
||||
if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
|
||||
// 32 bytes vectors (in YMM) are only supported with AVX+
|
||||
|
||||
@ -208,20 +208,33 @@ public:
|
||||
bmi2 : 1,
|
||||
erms : 1,
|
||||
: 1,
|
||||
rtm : 1,
|
||||
: 7,
|
||||
adx : 1,
|
||||
: 12;
|
||||
rtm : 1,
|
||||
: 4,
|
||||
avx512f : 1,
|
||||
avx512dq : 1,
|
||||
: 1,
|
||||
adx : 1,
|
||||
: 6,
|
||||
avx512pf : 1,
|
||||
avx512er : 1,
|
||||
avx512cd : 1,
|
||||
: 1,
|
||||
avx512bw : 1,
|
||||
avx512vl : 1;
|
||||
} bits;
|
||||
};
|
||||
|
||||
union XemXcr0Eax {
|
||||
uint32_t value;
|
||||
struct {
|
||||
uint32_t x87 : 1,
|
||||
sse : 1,
|
||||
ymm : 1,
|
||||
: 29;
|
||||
uint32_t x87 : 1,
|
||||
sse : 1,
|
||||
ymm : 1,
|
||||
: 2,
|
||||
opmask : 1,
|
||||
zmm512 : 1,
|
||||
zmm32 : 1,
|
||||
: 24;
|
||||
} bits;
|
||||
};
|
||||
|
||||
@ -229,43 +242,51 @@ protected:
|
||||
static int _cpu;
|
||||
static int _model;
|
||||
static int _stepping;
|
||||
static int _cpuFeatures; // features returned by the "cpuid" instruction
|
||||
// 0 if this instruction is not available
|
||||
static uint64_t _cpuFeatures; // features returned by the "cpuid" instruction
|
||||
// 0 if this instruction is not available
|
||||
static const char* _features_str;
|
||||
|
||||
static address _cpuinfo_segv_addr; // address of instruction which causes SEGV
|
||||
static address _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV
|
||||
|
||||
enum {
|
||||
CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX)
|
||||
CPU_CMOV = (1 << 1),
|
||||
CPU_FXSR = (1 << 2),
|
||||
CPU_HT = (1 << 3),
|
||||
CPU_MMX = (1 << 4),
|
||||
CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
|
||||
// may not necessarily support other 3dnow instructions
|
||||
CPU_SSE = (1 << 6),
|
||||
CPU_SSE2 = (1 << 7),
|
||||
CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
|
||||
CPU_SSSE3 = (1 << 9),
|
||||
CPU_SSE4A = (1 << 10),
|
||||
CPU_SSE4_1 = (1 << 11),
|
||||
CPU_SSE4_2 = (1 << 12),
|
||||
CPU_POPCNT = (1 << 13),
|
||||
CPU_LZCNT = (1 << 14),
|
||||
CPU_TSC = (1 << 15),
|
||||
CPU_TSCINV = (1 << 16),
|
||||
CPU_AVX = (1 << 17),
|
||||
CPU_AVX2 = (1 << 18),
|
||||
CPU_AES = (1 << 19),
|
||||
CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions
|
||||
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
|
||||
CPU_BMI1 = (1 << 22),
|
||||
CPU_BMI2 = (1 << 23),
|
||||
CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
|
||||
CPU_ADX = (1 << 25)
|
||||
CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX)
|
||||
CPU_CMOV = (1 << 1),
|
||||
CPU_FXSR = (1 << 2),
|
||||
CPU_HT = (1 << 3),
|
||||
CPU_MMX = (1 << 4),
|
||||
CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
|
||||
// may not necessarily support other 3dnow instructions
|
||||
CPU_SSE = (1 << 6),
|
||||
CPU_SSE2 = (1 << 7),
|
||||
CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
|
||||
CPU_SSSE3 = (1 << 9),
|
||||
CPU_SSE4A = (1 << 10),
|
||||
CPU_SSE4_1 = (1 << 11),
|
||||
CPU_SSE4_2 = (1 << 12),
|
||||
CPU_POPCNT = (1 << 13),
|
||||
CPU_LZCNT = (1 << 14),
|
||||
CPU_TSC = (1 << 15),
|
||||
CPU_TSCINV = (1 << 16),
|
||||
CPU_AVX = (1 << 17),
|
||||
CPU_AVX2 = (1 << 18),
|
||||
CPU_AES = (1 << 19),
|
||||
CPU_ERMS = (1 << 20), // enhanced 'rep movsb/stosb' instructions
|
||||
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
|
||||
CPU_BMI1 = (1 << 22),
|
||||
CPU_BMI2 = (1 << 23),
|
||||
CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
|
||||
CPU_ADX = (1 << 25),
|
||||
CPU_AVX512F = (1 << 26), // AVX 512bit foundation instructions
|
||||
CPU_AVX512DQ = (1 << 27),
|
||||
CPU_AVX512PF = (1 << 28),
|
||||
CPU_AVX512ER = (1 << 29),
|
||||
CPU_AVX512CD = (1 << 30),
|
||||
CPU_AVX512BW = (1 << 31)
|
||||
} cpuFeatureFlags;
|
||||
|
||||
#define CPU_AVX512VL 0x100000000 // EVEX instructions with smaller vector length : enums are limited to 32bit
|
||||
|
||||
enum {
|
||||
// AMD
|
||||
CPU_FAMILY_AMD_11H = 0x11,
|
||||
@ -282,7 +303,8 @@ protected:
|
||||
CPU_MODEL_IVYBRIDGE_EP = 0x3a,
|
||||
CPU_MODEL_HASWELL_E3 = 0x3c,
|
||||
CPU_MODEL_HASWELL_E7 = 0x3f,
|
||||
CPU_MODEL_BROADWELL = 0x3d
|
||||
CPU_MODEL_BROADWELL = 0x3d,
|
||||
CPU_MODEL_SKYLAKE = CPU_MODEL_HASWELL_E3
|
||||
} cpuExtendedFamily;
|
||||
|
||||
// cpuid information block. All info derived from executing cpuid with
|
||||
@ -376,6 +398,9 @@ protected:
|
||||
|
||||
// Space to save ymm registers after signal handle
|
||||
int ymm_save[8*4]; // Save ymm0, ymm7, ymm8, ymm15
|
||||
|
||||
// Space to save zmm registers after signal handle
|
||||
int zmm_save[16*4]; // Save zmm0, zmm7, zmm8, zmm31
|
||||
};
|
||||
|
||||
// The actual cpuid info block
|
||||
@ -404,8 +429,8 @@ protected:
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint32_t feature_flags() {
|
||||
uint32_t result = 0;
|
||||
static uint64_t feature_flags() {
|
||||
uint64_t result = 0;
|
||||
if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0)
|
||||
result |= CPU_CX8;
|
||||
if (_cpuid_info.std_cpuid1_edx.bits.cmov != 0)
|
||||
@ -440,6 +465,24 @@ protected:
|
||||
result |= CPU_AVX;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
|
||||
result |= CPU_AVX2;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&
|
||||
_cpuid_info.xem_xcr0_eax.bits.opmask != 0 &&
|
||||
_cpuid_info.xem_xcr0_eax.bits.zmm512 != 0 &&
|
||||
_cpuid_info.xem_xcr0_eax.bits.zmm32 != 0) {
|
||||
result |= CPU_AVX512F;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512cd != 0)
|
||||
result |= CPU_AVX512CD;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
|
||||
result |= CPU_AVX512DQ;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
|
||||
result |= CPU_AVX512PF;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)
|
||||
result |= CPU_AVX512ER;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512bw != 0)
|
||||
result |= CPU_AVX512BW;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
|
||||
result |= CPU_AVX512VL;
|
||||
}
|
||||
}
|
||||
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
|
||||
result |= CPU_BMI1;
|
||||
@ -484,18 +527,31 @@ protected:
|
||||
}
|
||||
|
||||
static bool os_supports_avx_vectors() {
|
||||
if (!supports_avx()) {
|
||||
return false;
|
||||
}
|
||||
// Verify that OS save/restore all bits of AVX registers
|
||||
// during signal processing.
|
||||
int nreg = 2 LP64_ONLY(+2);
|
||||
for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
|
||||
if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
|
||||
return false;
|
||||
bool retVal = false;
|
||||
if (supports_evex()) {
|
||||
// Verify that OS save/restore all bits of EVEX registers
|
||||
// during signal processing.
|
||||
int nreg = 2 LP64_ONLY(+2);
|
||||
retVal = true;
|
||||
for (int i = 0; i < 16 * nreg; i++) { // 64 bytes per zmm register
|
||||
if (_cpuid_info.zmm_save[i] != ymm_test_value()) {
|
||||
retVal = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (supports_avx()) {
|
||||
// Verify that OS save/restore all bits of AVX registers
|
||||
// during signal processing.
|
||||
int nreg = 2 LP64_ONLY(+2);
|
||||
retVal = true;
|
||||
for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
|
||||
if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
|
||||
retVal = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return retVal;
|
||||
}
|
||||
|
||||
static void get_processor_features();
|
||||
@ -515,6 +571,7 @@ public:
|
||||
static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
|
||||
static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
|
||||
static ByteSize ymm_save_offset() { return byte_offset_of(CpuidInfo, ymm_save); }
|
||||
static ByteSize zmm_save_offset() { return byte_offset_of(CpuidInfo, zmm_save); }
|
||||
|
||||
// The value used to check ymm register after signal handle
|
||||
static int ymm_test_value() { return 0xCAFEBABE; }
|
||||
@ -527,6 +584,7 @@ public:
|
||||
|
||||
static void clean_cpuFeatures() { _cpuFeatures = 0; }
|
||||
static void set_avx_cpuFeatures() { _cpuFeatures = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
|
||||
static void set_evex_cpuFeatures() { _cpuFeatures = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); }
|
||||
|
||||
|
||||
// Initialization
|
||||
@ -636,7 +694,14 @@ public:
|
||||
static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; }
|
||||
static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; }
|
||||
static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; }
|
||||
static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
|
||||
static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
|
||||
static bool supports_evex() { return (_cpuFeatures & CPU_AVX512F) != 0; }
|
||||
static bool supports_avx512dq() { return (_cpuFeatures & CPU_AVX512DQ) != 0; }
|
||||
static bool supports_avx512pf() { return (_cpuFeatures & CPU_AVX512PF) != 0; }
|
||||
static bool supports_avx512er() { return (_cpuFeatures & CPU_AVX512ER) != 0; }
|
||||
static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
|
||||
static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
|
||||
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
|
||||
// Intel features
|
||||
static bool is_intel_family_core() { return is_intel() &&
|
||||
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
|
||||
|
||||
@ -47,13 +47,22 @@ void VMRegImpl::set_regName() {
|
||||
}
|
||||
|
||||
XMMRegister xreg = ::as_XMMRegister(0);
|
||||
for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
|
||||
for (int j = 0 ; j < 8 ; j++) {
|
||||
for (; i < ConcreteRegisterImpl::max_xmm;) {
|
||||
for (int j = 0 ; j < XMMRegisterImpl::max_slots_per_register ; j++) {
|
||||
regName[i++] = xreg->name();
|
||||
}
|
||||
xreg = xreg->successor();
|
||||
}
|
||||
|
||||
KRegister kreg = ::as_KRegister(0);
|
||||
for (; i < ConcreteRegisterImpl::max_kpr;) {
|
||||
for (int j = 0; j < KRegisterImpl::max_slots_per_register; j++) {
|
||||
regName[i++] = kreg->name();
|
||||
}
|
||||
kreg = kreg->successor();
|
||||
}
|
||||
|
||||
for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
|
||||
regName[i] = "NON-GPR-FPR-XMM";
|
||||
regName[i] = "NON-GPR-FPR-XMM-KREG";
|
||||
}
|
||||
}
|
||||
|
||||
@ -36,7 +36,24 @@ inline bool is_FloatRegister() {
|
||||
}
|
||||
|
||||
inline bool is_XMMRegister() {
|
||||
return value() >= ConcreteRegisterImpl::max_fpr && value() < ConcreteRegisterImpl::max_xmm;
|
||||
int uarch_max_xmm = ConcreteRegisterImpl::max_xmm;
|
||||
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
int half_xmm = (XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers) / 2;
|
||||
uarch_max_xmm -= half_xmm;
|
||||
}
|
||||
#endif
|
||||
|
||||
return (value() >= ConcreteRegisterImpl::max_fpr && value() < uarch_max_xmm);
|
||||
}
|
||||
|
||||
inline bool is_KRegister() {
|
||||
if (UseAVX > 2) {
|
||||
return value() >= ConcreteRegisterImpl::max_xmm && value() < ConcreteRegisterImpl::max_kpr;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
inline Register as_Register() {
|
||||
@ -59,7 +76,13 @@ inline FloatRegister as_FloatRegister() {
|
||||
inline XMMRegister as_XMMRegister() {
|
||||
assert( is_XMMRegister() && is_even(value()), "must be" );
|
||||
// Yuk
|
||||
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
|
||||
return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 4);
|
||||
}
|
||||
|
||||
inline KRegister as_KRegister() {
|
||||
assert(is_KRegister(), "must be");
|
||||
// Yuk
|
||||
return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm));
|
||||
}
|
||||
|
||||
inline bool is_concrete() {
|
||||
|
||||
@ -39,7 +39,11 @@ inline VMReg FloatRegisterImpl::as_VMReg() {
|
||||
}
|
||||
|
||||
inline VMReg XMMRegisterImpl::as_VMReg() {
|
||||
return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
|
||||
return VMRegImpl::as_VMReg((encoding() << 4) + ConcreteRegisterImpl::max_fpr);
|
||||
}
|
||||
|
||||
inline VMReg KRegisterImpl::as_VMReg() {
|
||||
return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_xmm);
|
||||
}
|
||||
|
||||
#endif // CPU_X86_VM_VMREG_X86_INLINE_HPP
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -101,6 +101,17 @@ reg_def FPR6L( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg());
|
||||
reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
|
||||
reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
|
||||
reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
|
||||
//
|
||||
// Empty fill registers, which are never used, but supply alignment to xmm regs
|
||||
//
|
||||
reg_def FILL0( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(2));
|
||||
reg_def FILL1( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(3));
|
||||
reg_def FILL2( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(4));
|
||||
reg_def FILL3( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(5));
|
||||
reg_def FILL4( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(6));
|
||||
reg_def FILL5( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(7));
|
||||
reg_def FILL6( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(8));
|
||||
reg_def FILL7( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(9));
|
||||
|
||||
// Specify priority of register selection within phases of register
|
||||
// allocation. Highest priority is first. A useful heuristic is to
|
||||
@ -112,7 +123,8 @@ reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
|
||||
alloc_class chunk0( ECX, EBX, EBP, EDI, EAX, EDX, ESI, ESP,
|
||||
FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
|
||||
FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
|
||||
FPR6L, FPR6H, FPR7L, FPR7H );
|
||||
FPR6L, FPR6H, FPR7L, FPR7H,
|
||||
FILL0, FILL1, FILL2, FILL3, FILL4, FILL5, FILL6, FILL7);
|
||||
|
||||
|
||||
//----------Architecture Description Register Classes--------------------------
|
||||
@ -131,7 +143,7 @@ reg_class any_reg_with_ebp(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
|
||||
// Class for all registers (excluding EBP)
|
||||
reg_class any_reg_no_ebp(EAX, EDX, EDI, ESI, ECX, EBX, ESP);
|
||||
// Dynamic register class that selects at runtime between register classes
|
||||
// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer).
|
||||
// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer).
|
||||
// Equivalent to: return PreserveFramePointer ? any_no_ebp_reg : any_reg;
|
||||
reg_class_dynamic any_reg(any_reg_no_ebp, any_reg_with_ebp, %{ PreserveFramePointer %});
|
||||
|
||||
@ -279,7 +291,9 @@ static int pre_call_resets_size() {
|
||||
size += 6; // fldcw
|
||||
}
|
||||
if (C->max_vector_size() > 16) {
|
||||
size += 3; // vzeroupper
|
||||
if(UseAVX <= 2) {
|
||||
size += 3; // vzeroupper
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
@ -288,7 +302,7 @@ static int pre_call_resets_size() {
|
||||
// from the start of the call to the point where the return address
|
||||
// will point.
|
||||
int MachCallStaticJavaNode::ret_addr_offset() {
|
||||
return 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
|
||||
return 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
|
||||
}
|
||||
|
||||
int MachCallDynamicJavaNode::ret_addr_offset() {
|
||||
@ -767,6 +781,12 @@ static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset
|
||||
// Helper for XMM registers. Extra opcode bits, limited syntax.
|
||||
static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
|
||||
int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
|
||||
int in_size_in_bits = Assembler::EVEX_32bit;
|
||||
int evex_encoding = 0;
|
||||
if (reg_lo+1 == reg_hi) {
|
||||
in_size_in_bits = Assembler::EVEX_64bit;
|
||||
evex_encoding = Assembler::VEX_W;
|
||||
}
|
||||
if (cbuf) {
|
||||
MacroAssembler _masm(cbuf);
|
||||
if (reg_lo+1 == reg_hi) { // double move?
|
||||
@ -799,7 +819,17 @@ static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
|
||||
}
|
||||
#endif
|
||||
}
|
||||
int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
|
||||
bool is_single_byte = false;
|
||||
if ((UseAVX > 2) && (offset != 0)) {
|
||||
is_single_byte = Assembler::query_compressed_disp_byte(offset, true, 0, Assembler::EVEX_T1S, in_size_in_bits, evex_encoding);
|
||||
}
|
||||
int offset_size = 0;
|
||||
if (UseAVX > 2 ) {
|
||||
offset_size = (offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
|
||||
} else {
|
||||
offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
|
||||
}
|
||||
size += (UseAVX > 2) ? 2 : 0; // Need an additional two bytes for EVEX
|
||||
// VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
|
||||
return size+5+offset_size;
|
||||
}
|
||||
@ -835,8 +865,8 @@ static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst
|
||||
#endif
|
||||
}
|
||||
// VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
|
||||
// Only MOVAPS SSE prefix uses 1 byte.
|
||||
int sz = 4;
|
||||
// Only MOVAPS SSE prefix uses 1 byte. EVEX uses an additional 2 bytes.
|
||||
int sz = (UseAVX > 2) ? 6 : 4;
|
||||
if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
|
||||
UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
|
||||
return size + sz;
|
||||
@ -854,7 +884,7 @@ static int impl_movgpr2x_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int
|
||||
st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
|
||||
#endif
|
||||
}
|
||||
return 4;
|
||||
return (UseAVX> 2) ? 6 : 4;
|
||||
}
|
||||
|
||||
|
||||
@ -870,7 +900,7 @@ static int impl_movx2gpr_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int
|
||||
st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
|
||||
#endif
|
||||
}
|
||||
return 4;
|
||||
return (UseAVX> 2) ? 6 : 4;
|
||||
}
|
||||
|
||||
static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
|
||||
@ -941,9 +971,8 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
|
||||
calc_size += 3+src_offset_size + 3+dst_offset_size;
|
||||
break;
|
||||
case Op_VecX:
|
||||
calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
|
||||
break;
|
||||
case Op_VecY:
|
||||
case Op_VecZ:
|
||||
calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
|
||||
break;
|
||||
default:
|
||||
@ -974,6 +1003,11 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
|
||||
__ vmovdqu(xmm0, Address(rsp, src_offset));
|
||||
__ vmovdqu(Address(rsp, dst_offset), xmm0);
|
||||
__ vmovdqu(xmm0, Address(rsp, -32));
|
||||
case Op_VecZ:
|
||||
__ evmovdqu(Address(rsp, -64), xmm0, 2);
|
||||
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
|
||||
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
|
||||
__ evmovdqu(xmm0, Address(rsp, -64), 2);
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
@ -1009,6 +1043,12 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
|
||||
"vmovdqu [rsp + #%d], xmm0\n\t"
|
||||
"vmovdqu xmm0, [rsp - #32]",
|
||||
src_offset, dst_offset);
|
||||
case Op_VecZ:
|
||||
st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
|
||||
"vmovdqu xmm0, [rsp + #%d]\n\t"
|
||||
"vmovdqu [rsp + #%d], xmm0\n\t"
|
||||
"vmovdqu xmm0, [rsp - #64]",
|
||||
src_offset, dst_offset);
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
@ -1042,7 +1082,7 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo
|
||||
uint ireg = ideal_reg();
|
||||
assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
|
||||
assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
|
||||
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
|
||||
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
|
||||
if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
|
||||
// mem -> mem
|
||||
int src_offset = ra_->reg2offset(src_first);
|
||||
@ -3998,7 +4038,7 @@ operand regFPR1(regFPR reg) %{
|
||||
// XMM Float register operands
|
||||
operand regF() %{
|
||||
predicate( UseSSE>=1 );
|
||||
constraint(ALLOC_IN_RC(float_reg));
|
||||
constraint(ALLOC_IN_RC(float_reg_legacy));
|
||||
match(RegF);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
@ -4007,12 +4047,45 @@ operand regF() %{
|
||||
// XMM Double register operands
|
||||
operand regD() %{
|
||||
predicate( UseSSE>=2 );
|
||||
constraint(ALLOC_IN_RC(double_reg));
|
||||
constraint(ALLOC_IN_RC(double_reg_legacy));
|
||||
match(RegD);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Vectors : note, we use legacy registers to avoid extra (unneeded in 32-bit VM)
|
||||
// runtime code generation via reg_class_dynamic.
|
||||
operand vecS() %{
|
||||
constraint(ALLOC_IN_RC(vectors_reg_legacy));
|
||||
match(VecS);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vecD() %{
|
||||
constraint(ALLOC_IN_RC(vectord_reg_legacy));
|
||||
match(VecD);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vecX() %{
|
||||
constraint(ALLOC_IN_RC(vectorx_reg_legacy));
|
||||
match(VecX);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vecY() %{
|
||||
constraint(ALLOC_IN_RC(vectory_reg_legacy));
|
||||
match(VecY);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
//----------Memory Operands----------------------------------------------------
|
||||
// Direct Memory Operand
|
||||
@ -5020,11 +5093,11 @@ instruct bytes_reverse_unsigned_short(rRegI dst, eFlagsReg cr) %{
|
||||
match(Set dst (ReverseBytesUS dst));
|
||||
effect(KILL cr);
|
||||
|
||||
format %{ "BSWAP $dst\n\t"
|
||||
format %{ "BSWAP $dst\n\t"
|
||||
"SHR $dst,16\n\t" %}
|
||||
ins_encode %{
|
||||
__ bswapl($dst$$Register);
|
||||
__ shrl($dst$$Register, 16);
|
||||
__ shrl($dst$$Register, 16);
|
||||
%}
|
||||
ins_pipe( ialu_reg );
|
||||
%}
|
||||
@ -5033,11 +5106,11 @@ instruct bytes_reverse_short(rRegI dst, eFlagsReg cr) %{
|
||||
match(Set dst (ReverseBytesS dst));
|
||||
effect(KILL cr);
|
||||
|
||||
format %{ "BSWAP $dst\n\t"
|
||||
format %{ "BSWAP $dst\n\t"
|
||||
"SAR $dst,16\n\t" %}
|
||||
ins_encode %{
|
||||
__ bswapl($dst$$Register);
|
||||
__ sarl($dst$$Register, 16);
|
||||
__ sarl($dst$$Register, 16);
|
||||
%}
|
||||
ins_pipe( ialu_reg );
|
||||
%}
|
||||
@ -6525,7 +6598,7 @@ instruct membar_volatile(eFlagsReg cr) %{
|
||||
effect(KILL cr);
|
||||
ins_cost(400);
|
||||
|
||||
format %{
|
||||
format %{
|
||||
$$template
|
||||
if (os::is_MP()) {
|
||||
$$emit$$"LOCK ADDL [ESP + #0], 0\t! membar_volatile"
|
||||
@ -8288,10 +8361,10 @@ instruct xorI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
|
||||
|
||||
// Xor Register with Immediate -1
|
||||
instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
|
||||
match(Set dst (XorI dst imm));
|
||||
match(Set dst (XorI dst imm));
|
||||
|
||||
size(2);
|
||||
format %{ "NOT $dst" %}
|
||||
format %{ "NOT $dst" %}
|
||||
ins_encode %{
|
||||
__ notl($dst$$Register);
|
||||
%}
|
||||
@ -8939,7 +9012,7 @@ instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
|
||||
|
||||
// Xor Long Register with Immediate -1
|
||||
instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
|
||||
match(Set dst (XorL dst imm));
|
||||
match(Set dst (XorL dst imm));
|
||||
format %{ "NOT $dst.lo\n\t"
|
||||
"NOT $dst.hi" %}
|
||||
ins_encode %{
|
||||
@ -8994,7 +9067,7 @@ instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{
|
||||
effect(KILL cr);
|
||||
ins_cost(100);
|
||||
format %{ "ADD $dst.lo,$dst.lo\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADD $dst.lo,$dst.lo\n\t"
|
||||
"ADC $dst.hi,$dst.hi" %}
|
||||
ins_encode %{
|
||||
@ -9013,9 +9086,9 @@ instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{
|
||||
effect(KILL cr);
|
||||
ins_cost(100);
|
||||
format %{ "ADD $dst.lo,$dst.lo\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADD $dst.lo,$dst.lo\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADC $dst.hi,$dst.hi\n\t"
|
||||
"ADD $dst.lo,$dst.lo\n\t"
|
||||
"ADC $dst.hi,$dst.hi" %}
|
||||
ins_encode %{
|
||||
@ -11168,7 +11241,6 @@ instruct convL2I_reg( rRegI dst, eRegL src ) %{
|
||||
ins_pipe( ialu_reg_reg );
|
||||
%}
|
||||
|
||||
|
||||
instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
|
||||
match(Set dst (MoveF2I src));
|
||||
effect( DEF dst, USE src );
|
||||
@ -11400,7 +11472,7 @@ instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlag
|
||||
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
|
||||
"SHL ECX,1\t# Convert doublewords to words\n\t"
|
||||
"REP STOS\t# store EAX into [EDI++] while ECX--" %}
|
||||
ins_encode %{
|
||||
ins_encode %{
|
||||
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
@ -11413,7 +11485,7 @@ instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy,
|
||||
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
|
||||
"SHL ECX,3\t# Convert doublewords to bytes\n\t"
|
||||
"REP STOSB\t# store EAX into [EDI++] while ECX--" %}
|
||||
ins_encode %{
|
||||
ins_encode %{
|
||||
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
|
||||
@ -172,7 +172,7 @@ reg_class no_reg();
|
||||
// Class for all pointer registers (including RSP and RBP)
|
||||
reg_class any_reg_with_rbp(RAX, RAX_H,
|
||||
RDX, RDX_H,
|
||||
RBP, RBP_H,
|
||||
RBP, RBP_H,
|
||||
RDI, RDI_H,
|
||||
RSI, RSI_H,
|
||||
RCX, RCX_H,
|
||||
@ -189,7 +189,7 @@ reg_class any_reg_with_rbp(RAX, RAX_H,
|
||||
|
||||
// Class for all pointer registers (including RSP, but excluding RBP)
|
||||
reg_class any_reg_no_rbp(RAX, RAX_H,
|
||||
RDX, RDX_H,
|
||||
RDX, RDX_H,
|
||||
RDI, RDI_H,
|
||||
RSI, RSI_H,
|
||||
RCX, RCX_H,
|
||||
@ -205,10 +205,10 @@ reg_class any_reg_no_rbp(RAX, RAX_H,
|
||||
R15, R15_H);
|
||||
|
||||
// Dynamic register class that selects at runtime between register classes
|
||||
// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer).
|
||||
// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer).
|
||||
// Equivalent to: return PreserveFramePointer ? any_reg_no_rbp : any_reg_with_rbp;
|
||||
reg_class_dynamic any_reg(any_reg_no_rbp, any_reg_with_rbp, %{ PreserveFramePointer %});
|
||||
|
||||
|
||||
// Class for all pointer registers (excluding RSP)
|
||||
reg_class ptr_reg_with_rbp(RAX, RAX_H,
|
||||
RDX, RDX_H,
|
||||
@ -226,7 +226,7 @@ reg_class ptr_reg_with_rbp(RAX, RAX_H,
|
||||
|
||||
// Class for all pointer registers (excluding RSP and RBP)
|
||||
reg_class ptr_reg_no_rbp(RAX, RAX_H,
|
||||
RDX, RDX_H,
|
||||
RDX, RDX_H,
|
||||
RDI, RDI_H,
|
||||
RSI, RSI_H,
|
||||
RCX, RCX_H,
|
||||
@ -536,7 +536,11 @@ source %{
|
||||
#define __ _masm.
|
||||
|
||||
static int clear_avx_size() {
|
||||
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
|
||||
if(UseAVX > 2) {
|
||||
return 0; // vzeroupper is ignored
|
||||
} else {
|
||||
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
|
||||
}
|
||||
}
|
||||
|
||||
// !!!!! Special hack to get all types of calls to specify the byte offset
|
||||
@ -545,7 +549,7 @@ static int clear_avx_size() {
|
||||
int MachCallStaticJavaNode::ret_addr_offset()
|
||||
{
|
||||
int offset = 5; // 5 bytes from start of call to where return address points
|
||||
offset += clear_avx_size();
|
||||
offset += clear_avx_size();
|
||||
return offset;
|
||||
}
|
||||
|
||||
@ -860,7 +864,7 @@ void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
|
||||
st->print("subq rsp, #%d\t# Create frame",framesize);
|
||||
st->print("\n\t");
|
||||
framesize -= wordSize;
|
||||
st->print("movq [rsp + #%d], rbp\t# Save rbp",framesize);
|
||||
st->print("movq [rsp + #%d], rbp\t# Save rbp",framesize);
|
||||
if (PreserveFramePointer) {
|
||||
st->print("\n\t");
|
||||
st->print("movq rbp, [rsp + #%d]\t# Save the caller's SP into rbp", (framesize + wordSize));
|
||||
@ -1070,6 +1074,11 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
|
||||
__ vmovdqu(xmm0, Address(rsp, src_offset));
|
||||
__ vmovdqu(Address(rsp, dst_offset), xmm0);
|
||||
__ vmovdqu(xmm0, Address(rsp, -32));
|
||||
case Op_VecZ:
|
||||
__ evmovdqu(Address(rsp, -64), xmm0, 2);
|
||||
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
|
||||
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
|
||||
__ evmovdqu(xmm0, Address(rsp, -64), 2);
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
@ -1103,6 +1112,13 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
|
||||
"vmovdqu xmm0, [rsp - #32]",
|
||||
src_offset, dst_offset);
|
||||
break;
|
||||
case Op_VecZ:
|
||||
st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
|
||||
"vmovdqu xmm0, [rsp + #%d]\n\t"
|
||||
"vmovdqu [rsp + #%d], xmm0\n\t"
|
||||
"vmovdqu xmm0, [rsp - #64]",
|
||||
src_offset, dst_offset);
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
@ -1136,7 +1152,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
|
||||
if (bottom_type()->isa_vect() != NULL) {
|
||||
uint ireg = ideal_reg();
|
||||
assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
|
||||
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
|
||||
assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
|
||||
if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
|
||||
// mem -> mem
|
||||
int src_offset = ra_->reg2offset(src_first);
|
||||
@ -1573,7 +1589,7 @@ uint MachUEPNode::size(PhaseRegAlloc* ra_) const
|
||||
return MachNode::size(ra_); // too many variables; just compute it
|
||||
// the hard way
|
||||
}
|
||||
|
||||
|
||||
|
||||
//=============================================================================
|
||||
|
||||
@ -2832,7 +2848,7 @@ frame
|
||||
RAX_H_num // Op_RegL
|
||||
};
|
||||
// Excluded flags and vector registers.
|
||||
assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type");
|
||||
assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type");
|
||||
return OptoRegPair(hi[ideal_reg], lo[ideal_reg]);
|
||||
%}
|
||||
%}
|
||||
@ -3335,7 +3351,7 @@ operand no_rax_rdx_RegI()
|
||||
// Pointer Register
|
||||
operand any_RegP()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(any_reg));
|
||||
constraint(ALLOC_IN_RC(any_reg));
|
||||
match(RegP);
|
||||
match(rax_RegP);
|
||||
match(rbx_RegP);
|
||||
@ -3589,20 +3605,51 @@ operand rFlagsRegUCF() %{
|
||||
%}
|
||||
|
||||
// Float register operands
|
||||
operand regF()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(float_reg));
|
||||
match(RegF);
|
||||
operand regF() %{
|
||||
constraint(ALLOC_IN_RC(float_reg));
|
||||
match(RegF);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Double register operands
|
||||
operand regD() %{
|
||||
constraint(ALLOC_IN_RC(double_reg));
|
||||
match(RegD);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Vectors
|
||||
operand vecS() %{
|
||||
constraint(ALLOC_IN_RC(vectors_reg));
|
||||
match(VecS);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Double register operands
|
||||
operand regD()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(double_reg));
|
||||
match(RegD);
|
||||
operand vecD() %{
|
||||
constraint(ALLOC_IN_RC(vectord_reg));
|
||||
match(VecD);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vecX() %{
|
||||
constraint(ALLOC_IN_RC(vectorx_reg));
|
||||
match(VecX);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vecY() %{
|
||||
constraint(ALLOC_IN_RC(vectory_reg));
|
||||
match(VecY);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
@ -4947,7 +4994,7 @@ instruct loadI2L_immU31(rRegL dst, memory mem, immU31 mask, rFlagsReg cr) %{
|
||||
%}
|
||||
|
||||
// Load Unsigned Integer into Long Register
|
||||
instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask)
|
||||
instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask)
|
||||
%{
|
||||
match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
|
||||
|
||||
@ -10374,7 +10421,7 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
|
||||
|
||||
format %{ "xorq rax, rax\t# ClearArray:\n\t"
|
||||
"rep stosq\t# Store rax to *rdi++ while rcx--" %}
|
||||
ins_encode %{
|
||||
ins_encode %{
|
||||
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||
%}
|
||||
ins_pipe(pipe_slow);
|
||||
@ -10389,7 +10436,7 @@ instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dum
|
||||
format %{ "xorq rax, rax\t# ClearArray:\n\t"
|
||||
"shlq rcx,3\t# Convert doublewords to bytes\n\t"
|
||||
"rep stosb\t# Store rax to *rdi++ while rcx--" %}
|
||||
ins_encode %{
|
||||
ins_encode %{
|
||||
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
|
||||
@ -929,6 +929,7 @@ const char *ArchDesc::getIdealType(const char *idealOp) {
|
||||
case 'D': return "TypeVect::VECTD";
|
||||
case 'X': return "TypeVect::VECTX";
|
||||
case 'Y': return "TypeVect::VECTY";
|
||||
case 'Z': return "TypeVect::VECTZ";
|
||||
default:
|
||||
internal_err("Vector type %s with unrecognized type\n",idealOp);
|
||||
}
|
||||
|
||||
@ -3919,6 +3919,7 @@ bool MatchRule::is_base_register(FormDict &globals) const {
|
||||
strcmp(opType,"VecD")==0 ||
|
||||
strcmp(opType,"VecX")==0 ||
|
||||
strcmp(opType,"VecY")==0 ||
|
||||
strcmp(opType,"VecZ")==0 ||
|
||||
strcmp(opType,"Reg" )==0) ) {
|
||||
return 1;
|
||||
}
|
||||
@ -4048,6 +4049,7 @@ int MatchRule::is_expensive() const {
|
||||
strcmp(opType,"AddReductionVF")==0 ||
|
||||
strcmp(opType,"AddReductionVD")==0 ||
|
||||
strcmp(opType,"MulReductionVI")==0 ||
|
||||
strcmp(opType,"MulReductionVL")==0 ||
|
||||
strcmp(opType,"MulReductionVF")==0 ||
|
||||
strcmp(opType,"MulReductionVD")==0 ||
|
||||
0 /* 0 to line up columns nicely */ )
|
||||
@ -4139,12 +4141,12 @@ bool MatchRule::is_vector() const {
|
||||
static const char *vector_list[] = {
|
||||
"AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
|
||||
"SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
|
||||
"MulVS","MulVI","MulVF","MulVD",
|
||||
"MulVS","MulVI","MulVL","MulVF","MulVD",
|
||||
"DivVF","DivVD",
|
||||
"AndV" ,"XorV" ,"OrV",
|
||||
"AddReductionVI", "AddReductionVL",
|
||||
"AddReductionVF", "AddReductionVD",
|
||||
"MulReductionVI",
|
||||
"MulReductionVI", "MulReductionVL",
|
||||
"MulReductionVF", "MulReductionVD",
|
||||
"LShiftCntV","RShiftCntV",
|
||||
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
|
||||
|
||||
@ -1290,7 +1290,8 @@ void LinearScan::build_intervals() {
|
||||
#ifdef X86
|
||||
}
|
||||
if (UseSSE > 0) {
|
||||
for (i = 0; i < FrameMap::nof_caller_save_xmm_regs; i++) {
|
||||
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
|
||||
for (i = 0; i < num_caller_save_xmm_regs; i ++) {
|
||||
LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(i);
|
||||
assert(opr->is_valid() && opr->is_register(), "FrameMap should not return invalid operands");
|
||||
assert(reg_numHi(opr) == -1, "missing addition of range for hi-register");
|
||||
@ -2098,7 +2099,13 @@ LIR_Opr LinearScan::calc_operand_for_interval(const Interval* interval) {
|
||||
case T_FLOAT: {
|
||||
#ifdef X86
|
||||
if (UseSSE >= 1) {
|
||||
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
|
||||
int last_xmm_reg = pd_last_xmm_reg;
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
|
||||
}
|
||||
#endif
|
||||
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
|
||||
assert(interval->assigned_regHi() == any_reg, "must not have hi register");
|
||||
return LIR_OprFact::single_xmm(assigned_reg - pd_first_xmm_reg);
|
||||
}
|
||||
@ -2112,7 +2119,13 @@ LIR_Opr LinearScan::calc_operand_for_interval(const Interval* interval) {
|
||||
case T_DOUBLE: {
|
||||
#ifdef X86
|
||||
if (UseSSE >= 2) {
|
||||
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
|
||||
int last_xmm_reg = pd_last_xmm_reg;
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
|
||||
}
|
||||
#endif
|
||||
assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
|
||||
assert(interval->assigned_regHi() == any_reg, "must not have hi register (double xmm values are stored in one register)");
|
||||
return LIR_OprFact::double_xmm(assigned_reg - pd_first_xmm_reg);
|
||||
}
|
||||
@ -3600,7 +3613,8 @@ void RegisterVerifier::process_operations(LIR_List* ops, IntervalList* input_sta
|
||||
}
|
||||
|
||||
#ifdef X86
|
||||
for (j = 0; j < FrameMap::nof_caller_save_xmm_regs; j++) {
|
||||
int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
|
||||
for (j = 0; j < num_caller_save_xmm_regs; j++) {
|
||||
state_put(input_state, reg_num(FrameMap::caller_save_xmm_reg_at(j)), NULL);
|
||||
}
|
||||
#endif
|
||||
@ -4514,12 +4528,20 @@ void Interval::print(outputStream* out) const {
|
||||
if (reg_num() < LIR_OprDesc::vreg_base) {
|
||||
type_name = "fixed";
|
||||
// need a temporary operand for fixed intervals because type() cannot be called
|
||||
#ifdef X86
|
||||
int last_xmm_reg = pd_last_xmm_reg;
|
||||
#ifdef _LP64
|
||||
if (UseAVX < 3) {
|
||||
last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
if (assigned_reg() >= pd_first_cpu_reg && assigned_reg() <= pd_last_cpu_reg) {
|
||||
opr = LIR_OprFact::single_cpu(assigned_reg());
|
||||
} else if (assigned_reg() >= pd_first_fpu_reg && assigned_reg() <= pd_last_fpu_reg) {
|
||||
opr = LIR_OprFact::single_fpu(assigned_reg() - pd_first_fpu_reg);
|
||||
#ifdef X86
|
||||
} else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= pd_last_xmm_reg) {
|
||||
} else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= last_xmm_reg) {
|
||||
opr = LIR_OprFact::single_xmm(assigned_reg() - pd_first_xmm_reg);
|
||||
#endif
|
||||
} else {
|
||||
|
||||
@ -96,7 +96,7 @@
|
||||
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
|
||||
"Align a loop if padding size in bytes is less or equal to this value") \
|
||||
\
|
||||
product(intx, MaxVectorSize, 32, \
|
||||
product(intx, MaxVectorSize, 64, \
|
||||
"Max vector size in bytes, " \
|
||||
"actual size could be less depending on elements type") \
|
||||
\
|
||||
|
||||
@ -907,6 +907,13 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
|
||||
lrg.set_num_regs(RegMask::SlotsPerVecY);
|
||||
lrg.set_reg_pressure(1);
|
||||
break;
|
||||
case Op_VecZ:
|
||||
assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecZ), "sanity");
|
||||
assert(RegMask::num_registers(Op_VecZ) == RegMask::SlotsPerVecZ, "sanity");
|
||||
assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecZ), "vector should be aligned");
|
||||
lrg.set_num_regs(RegMask::SlotsPerVecZ);
|
||||
lrg.set_reg_pressure(1);
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
@ -1514,7 +1521,7 @@ uint PhaseChaitin::Select( ) {
|
||||
int n_regs = lrg->num_regs();
|
||||
assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
|
||||
if (n_regs == 1 || !lrg->_fat_proj) {
|
||||
assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
|
||||
assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity");
|
||||
lrg->Clear(); // Clear the mask
|
||||
lrg->Insert(reg); // Set regmask to match selected reg
|
||||
// For vectors and pairs, also insert the low bit of the pair
|
||||
|
||||
@ -141,7 +141,7 @@ public:
|
||||
|
||||
// Number of registers this live range uses when it colors
|
||||
private:
|
||||
uint8_t _num_regs; // 2 for Longs and Doubles, 1 for all else
|
||||
uint16_t _num_regs; // 2 for Longs and Doubles, 1 for all else
|
||||
// except _num_regs is kill count for fat_proj
|
||||
public:
|
||||
int num_regs() const { return _num_regs; }
|
||||
@ -150,7 +150,7 @@ public:
|
||||
private:
|
||||
// Number of physical registers this live range uses when it colors
|
||||
// Architecture and register-set dependent
|
||||
uint8_t _reg_pressure;
|
||||
uint16_t _reg_pressure;
|
||||
public:
|
||||
void set_reg_pressure(int i) { _reg_pressure = i; }
|
||||
int reg_pressure() const { return _reg_pressure; }
|
||||
|
||||
@ -282,6 +282,8 @@ macro(SubVD)
|
||||
macro(MulVS)
|
||||
macro(MulVI)
|
||||
macro(MulReductionVI)
|
||||
macro(MulVL)
|
||||
macro(MulReductionVL)
|
||||
macro(MulVF)
|
||||
macro(MulReductionVF)
|
||||
macro(MulVD)
|
||||
|
||||
@ -3110,6 +3110,7 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
|
||||
case Op_AddReductionVF:
|
||||
case Op_AddReductionVD:
|
||||
case Op_MulReductionVI:
|
||||
case Op_MulReductionVL:
|
||||
case Op_MulReductionVF:
|
||||
case Op_MulReductionVD:
|
||||
break;
|
||||
|
||||
@ -83,6 +83,7 @@ Matcher::Matcher()
|
||||
idealreg2spillmask [Op_VecD] = NULL;
|
||||
idealreg2spillmask [Op_VecX] = NULL;
|
||||
idealreg2spillmask [Op_VecY] = NULL;
|
||||
idealreg2spillmask [Op_VecZ] = NULL;
|
||||
|
||||
idealreg2debugmask [Op_RegI] = NULL;
|
||||
idealreg2debugmask [Op_RegN] = NULL;
|
||||
@ -94,6 +95,7 @@ Matcher::Matcher()
|
||||
idealreg2debugmask [Op_VecD] = NULL;
|
||||
idealreg2debugmask [Op_VecX] = NULL;
|
||||
idealreg2debugmask [Op_VecY] = NULL;
|
||||
idealreg2debugmask [Op_VecZ] = NULL;
|
||||
|
||||
idealreg2mhdebugmask[Op_RegI] = NULL;
|
||||
idealreg2mhdebugmask[Op_RegN] = NULL;
|
||||
@ -105,6 +107,7 @@ Matcher::Matcher()
|
||||
idealreg2mhdebugmask[Op_VecD] = NULL;
|
||||
idealreg2mhdebugmask[Op_VecX] = NULL;
|
||||
idealreg2mhdebugmask[Op_VecY] = NULL;
|
||||
idealreg2mhdebugmask[Op_VecZ] = NULL;
|
||||
|
||||
debug_only(_mem_node = NULL;) // Ideal memory node consumed by mach node
|
||||
}
|
||||
@ -413,7 +416,7 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) {
|
||||
void Matcher::init_first_stack_mask() {
|
||||
|
||||
// Allocate storage for spill masks as masks for the appropriate load type.
|
||||
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
|
||||
RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5));
|
||||
|
||||
idealreg2spillmask [Op_RegN] = &rms[0];
|
||||
idealreg2spillmask [Op_RegI] = &rms[1];
|
||||
@ -440,6 +443,7 @@ void Matcher::init_first_stack_mask() {
|
||||
idealreg2spillmask [Op_VecD] = &rms[19];
|
||||
idealreg2spillmask [Op_VecX] = &rms[20];
|
||||
idealreg2spillmask [Op_VecY] = &rms[21];
|
||||
idealreg2spillmask [Op_VecZ] = &rms[22];
|
||||
|
||||
OptoReg::Name i;
|
||||
|
||||
@ -523,6 +527,18 @@ void Matcher::init_first_stack_mask() {
|
||||
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
|
||||
*idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
|
||||
idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
|
||||
}
|
||||
if (Matcher::vector_size_supported(T_FLOAT,16)) {
|
||||
// For VecZ we need enough alignment and 64 bytes (16 slots) for spills.
|
||||
OptoReg::Name in = OptoReg::add(_in_arg_limit, -1);
|
||||
for (int k = 1; (in >= init_in) && (k < RegMask::SlotsPerVecZ); k++) {
|
||||
aligned_stack_mask.Remove(in);
|
||||
in = OptoReg::add(in, -1);
|
||||
}
|
||||
aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecZ);
|
||||
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
|
||||
*idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ];
|
||||
idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask);
|
||||
}
|
||||
if (UseFPUForSpilling) {
|
||||
// This mask logic assumes that the spill operations are
|
||||
@ -862,6 +878,10 @@ void Matcher::init_spill_mask( Node *ret ) {
|
||||
MachNode *spillVectY = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
|
||||
idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
|
||||
}
|
||||
if (Matcher::vector_size_supported(T_FLOAT,16)) {
|
||||
MachNode *spillVectZ = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTZ));
|
||||
idealreg2regmask[Op_VecZ] = &spillVectZ->out_RegMask();
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ASSERT
|
||||
|
||||
@ -42,6 +42,7 @@ const char *NodeClassNames[] = {
|
||||
"VecD",
|
||||
"VecX",
|
||||
"VecY",
|
||||
"VecZ",
|
||||
"_last_machine_leaf",
|
||||
#include "classes.hpp"
|
||||
"_last_class_name",
|
||||
|
||||
@ -40,6 +40,7 @@ enum Opcodes {
|
||||
macro(VecD) // Machine vectord register
|
||||
macro(VecX) // Machine vectorx register
|
||||
macro(VecY) // Machine vectory register
|
||||
macro(VecZ) // Machine vectorz register
|
||||
macro(RegFlags) // Machine flags register
|
||||
_last_machine_leaf, // Split between regular opcodes and machine
|
||||
#include "classes.hpp"
|
||||
|
||||
@ -103,6 +103,10 @@ class OptoReg VALUE_OBJ_CLASS_SPEC {
|
||||
return r - stack0();
|
||||
}
|
||||
|
||||
static void invalidate(Name n) {
|
||||
vm2opto[n] = Bad;
|
||||
}
|
||||
|
||||
// convert a stack slot number into an OptoReg::Name
|
||||
static OptoReg::Name stack2reg( int idx) {
|
||||
return Name(stack0() + idx);
|
||||
|
||||
@ -1880,8 +1880,8 @@ void Compile::ScheduleAndBundle() {
|
||||
if (!do_scheduling())
|
||||
return;
|
||||
|
||||
// Scheduling code works only with pairs (8 bytes) maximum.
|
||||
if (max_vector_size() > 8)
|
||||
// Scheduling code works only with pairs (16 bytes) maximum.
|
||||
if (max_vector_size() > 16)
|
||||
return;
|
||||
|
||||
TracePhase tp("isched", &timers[_t_instrSched]);
|
||||
|
||||
@ -114,11 +114,14 @@ const RegMask RegMask::Empty(
|
||||
|
||||
//=============================================================================
|
||||
bool RegMask::is_vector(uint ireg) {
|
||||
return (ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY);
|
||||
return (ireg == Op_VecS || ireg == Op_VecD ||
|
||||
ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ );
|
||||
}
|
||||
|
||||
int RegMask::num_registers(uint ireg) {
|
||||
switch(ireg) {
|
||||
case Op_VecZ:
|
||||
return 16;
|
||||
case Op_VecY:
|
||||
return 8;
|
||||
case Op_VecX:
|
||||
@ -233,7 +236,8 @@ int RegMask::is_bound_pair() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };
|
||||
// only indicies of power 2 are accessed, so index 3 is only filled in for storage.
|
||||
static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 };
|
||||
//------------------------------find_first_set---------------------------------
|
||||
// Find the lowest-numbered register set in the mask. Return the
|
||||
// HIGHEST register number in the set, or BAD if no sets.
|
||||
@ -254,7 +258,7 @@ OptoReg::Name RegMask::find_first_set(const int size) const {
|
||||
// Clear out partial bits; leave only aligned adjacent bit pairs
|
||||
void RegMask::clear_to_sets(const int size) {
|
||||
if (size == 1) return;
|
||||
assert(2 <= size && size <= 8, "update low bits table");
|
||||
assert(2 <= size && size <= 16, "update low bits table");
|
||||
assert(is_power_of_2(size), "sanity");
|
||||
int low_bits_mask = low_bits[size>>2];
|
||||
for (int i = 0; i < RM_SIZE; i++) {
|
||||
@ -268,6 +272,9 @@ void RegMask::clear_to_sets(const int size) {
|
||||
sets |= (sets>>2); // Smear 2 hi-bits into a set
|
||||
if (size > 4) {
|
||||
sets |= (sets>>4); // Smear 4 hi-bits into a set
|
||||
if (size > 8) {
|
||||
sets |= (sets>>8); // Smear 8 hi-bits into a set
|
||||
}
|
||||
}
|
||||
}
|
||||
_A[i] = sets;
|
||||
@ -279,7 +286,7 @@ void RegMask::clear_to_sets(const int size) {
|
||||
// Smear out partial bits to aligned adjacent bit sets
|
||||
void RegMask::smear_to_sets(const int size) {
|
||||
if (size == 1) return;
|
||||
assert(2 <= size && size <= 8, "update low bits table");
|
||||
assert(2 <= size && size <= 16, "update low bits table");
|
||||
assert(is_power_of_2(size), "sanity");
|
||||
int low_bits_mask = low_bits[size>>2];
|
||||
for (int i = 0; i < RM_SIZE; i++) {
|
||||
@ -294,6 +301,9 @@ void RegMask::smear_to_sets(const int size) {
|
||||
sets |= (sets<<2); // Smear 2 lo-bits into a set
|
||||
if (size > 4) {
|
||||
sets |= (sets<<4); // Smear 4 lo-bits into a set
|
||||
if (size > 8) {
|
||||
sets |= (sets<<8); // Smear 8 lo-bits into a set
|
||||
}
|
||||
}
|
||||
}
|
||||
_A[i] = sets;
|
||||
@ -304,7 +314,7 @@ void RegMask::smear_to_sets(const int size) {
|
||||
//------------------------------is_aligned_set--------------------------------
|
||||
bool RegMask::is_aligned_sets(const int size) const {
|
||||
if (size == 1) return true;
|
||||
assert(2 <= size && size <= 8, "update low bits table");
|
||||
assert(2 <= size && size <= 16, "update low bits table");
|
||||
assert(is_power_of_2(size), "sanity");
|
||||
int low_bits_mask = low_bits[size>>2];
|
||||
// Assert that the register mask contains only bit sets.
|
||||
@ -330,7 +340,7 @@ bool RegMask::is_aligned_sets(const int size) const {
|
||||
// Works also for size 1.
|
||||
int RegMask::is_bound_set(const int size) const {
|
||||
if( is_AllStack() ) return false;
|
||||
assert(1 <= size && size <= 8, "update low bits table");
|
||||
assert(1 <= size && size <= 16, "update low bits table");
|
||||
int bit = -1; // Set to hold the one bit allowed
|
||||
for (int i = 0; i < RM_SIZE; i++) {
|
||||
if (_A[i] ) { // Found some bits
|
||||
@ -346,10 +356,12 @@ int RegMask::is_bound_set(const int size) const {
|
||||
if (((-1) & ~(bit-1)) != _A[i])
|
||||
return false; // Found many bits, so fail
|
||||
i++; // Skip iteration forward and check high part
|
||||
// The lower 24 bits should be 0 since it is split case and size <= 8.
|
||||
int set = bit>>24;
|
||||
// The lower (32-size) bits should be 0 since it is split case.
|
||||
int clear_bit_size = 32-size;
|
||||
int shift_back_size = 32-clear_bit_size;
|
||||
int set = bit>>clear_bit_size;
|
||||
set = set & -set; // Remove sign extension.
|
||||
set = (((set << size) - 1) >> 8);
|
||||
set = (((set << size) - 1) >> shift_back_size);
|
||||
if (i >= RM_SIZE || _A[i] != set)
|
||||
return false; // Require expected low bits in next word
|
||||
}
|
||||
@ -375,7 +387,7 @@ bool RegMask::is_UP() const {
|
||||
//------------------------------Size-------------------------------------------
|
||||
// Compute size of register mask in bits
|
||||
uint RegMask::Size() const {
|
||||
extern uint8_t bitsInByte[256];
|
||||
extern uint8_t bitsInByte[512];
|
||||
uint sum = 0;
|
||||
for( int i = 0; i < RM_SIZE; i++ )
|
||||
sum +=
|
||||
|
||||
@ -98,7 +98,8 @@ public:
|
||||
SlotsPerVecS = 1,
|
||||
SlotsPerVecD = 2,
|
||||
SlotsPerVecX = 4,
|
||||
SlotsPerVecY = 8 };
|
||||
SlotsPerVecY = 8,
|
||||
SlotsPerVecZ = 16 };
|
||||
|
||||
// A constructor only used by the ADLC output. All mask fields are filled
|
||||
// in directly. Calls to this look something like RM(1,2,3,4);
|
||||
@ -299,13 +300,13 @@ public:
|
||||
static bool can_represent(OptoReg::Name reg) {
|
||||
// NOTE: -1 in computation reflects the usage of the last
|
||||
// bit of the regmask as an infinite stack flag and
|
||||
// -7 is to keep mask aligned for largest value (VecY).
|
||||
// -7 is to keep mask aligned for largest value (VecZ).
|
||||
return (int)reg < (int)(CHUNK_SIZE-1);
|
||||
}
|
||||
static bool can_represent_arg(OptoReg::Name reg) {
|
||||
// NOTE: -SlotsPerVecY in computation reflects the need
|
||||
// to keep mask aligned for largest value (VecY).
|
||||
return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY);
|
||||
// NOTE: -SlotsPerVecZ in computation reflects the need
|
||||
// to keep mask aligned for largest value (VecZ).
|
||||
return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecZ);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -68,16 +68,19 @@ Type::TypeInfo Type::_type_info[Type::lastype] = {
|
||||
{ Bad, T_ILLEGAL, "vectord:", false, Op_RegD, relocInfo::none }, // VectorD
|
||||
{ Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX
|
||||
{ Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY
|
||||
{ Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ
|
||||
#elif defined(PPC64)
|
||||
{ Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS
|
||||
{ Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD
|
||||
{ Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX
|
||||
{ Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY
|
||||
{ Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ
|
||||
#else // all other
|
||||
{ Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS
|
||||
{ Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD
|
||||
{ Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX
|
||||
{ Bad, T_ILLEGAL, "vectory:", false, Op_VecY, relocInfo::none }, // VectorY
|
||||
{ Bad, T_ILLEGAL, "vectorz:", false, Op_VecZ, relocInfo::none }, // VectorZ
|
||||
#endif
|
||||
{ Bad, T_ADDRESS, "anyptr:", false, Op_RegP, relocInfo::none }, // AnyPtr
|
||||
{ Bad, T_ADDRESS, "rawptr:", false, Op_RegP, relocInfo::none }, // RawPtr
|
||||
@ -503,10 +506,14 @@ void Type::Initialize_shared(Compile* current) {
|
||||
if (Matcher::vector_size_supported(T_FLOAT,8)) {
|
||||
TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
|
||||
}
|
||||
if (Matcher::vector_size_supported(T_FLOAT,16)) {
|
||||
TypeVect::VECTZ = TypeVect::make(T_FLOAT,16);
|
||||
}
|
||||
mreg2type[Op_VecS] = TypeVect::VECTS;
|
||||
mreg2type[Op_VecD] = TypeVect::VECTD;
|
||||
mreg2type[Op_VecX] = TypeVect::VECTX;
|
||||
mreg2type[Op_VecY] = TypeVect::VECTY;
|
||||
mreg2type[Op_VecZ] = TypeVect::VECTZ;
|
||||
|
||||
// Restore working type arena.
|
||||
current->set_type_arena(save);
|
||||
@ -798,6 +805,7 @@ const Type::TYPES Type::dual_type[Type::lastype] = {
|
||||
Bad, // VectorD - handled in v-call
|
||||
Bad, // VectorX - handled in v-call
|
||||
Bad, // VectorY - handled in v-call
|
||||
Bad, // VectorZ - handled in v-call
|
||||
|
||||
Bad, // AnyPtr - handled in v-call
|
||||
Bad, // RawPtr - handled in v-call
|
||||
@ -2051,6 +2059,7 @@ const TypeVect *TypeVect::VECTS = NULL; // 32-bit vectors
|
||||
const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors
|
||||
const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors
|
||||
const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors
|
||||
const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors
|
||||
|
||||
//------------------------------make-------------------------------------------
|
||||
const TypeVect* TypeVect::make(const Type *elem, uint length) {
|
||||
@ -2070,6 +2079,8 @@ const TypeVect* TypeVect::make(const Type *elem, uint length) {
|
||||
return (TypeVect*)(new TypeVectX(elem, length))->hashcons();
|
||||
case Op_VecY:
|
||||
return (TypeVect*)(new TypeVectY(elem, length))->hashcons();
|
||||
case Op_VecZ:
|
||||
return (TypeVect*)(new TypeVectZ(elem, length))->hashcons();
|
||||
}
|
||||
ShouldNotReachHere();
|
||||
return NULL;
|
||||
@ -2093,7 +2104,8 @@ const Type *TypeVect::xmeet( const Type *t ) const {
|
||||
case VectorS:
|
||||
case VectorD:
|
||||
case VectorX:
|
||||
case VectorY: { // Meeting 2 vectors?
|
||||
case VectorY:
|
||||
case VectorZ: { // Meeting 2 vectors?
|
||||
const TypeVect* v = t->is_vect();
|
||||
assert( base() == v->base(), "");
|
||||
assert(length() == v->length(), "");
|
||||
@ -2151,6 +2163,8 @@ void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const {
|
||||
st->print("vectorx["); break;
|
||||
case VectorY:
|
||||
st->print("vectory["); break;
|
||||
case VectorZ:
|
||||
st->print("vectorz["); break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
@ -57,6 +57,7 @@ class TypeVectS;
|
||||
class TypeVectD;
|
||||
class TypeVectX;
|
||||
class TypeVectY;
|
||||
class TypeVectZ;
|
||||
class TypePtr;
|
||||
class TypeRawPtr;
|
||||
class TypeOopPtr;
|
||||
@ -90,6 +91,7 @@ public:
|
||||
VectorD, // 64bit Vector types
|
||||
VectorX, // 128bit Vector types
|
||||
VectorY, // 256bit Vector types
|
||||
VectorZ, // 512bit Vector types
|
||||
|
||||
AnyPtr, // Any old raw, klass, inst, or array pointer
|
||||
RawPtr, // Raw (non-oop) pointers
|
||||
@ -729,6 +731,7 @@ public:
|
||||
static const TypeVect *VECTD;
|
||||
static const TypeVect *VECTX;
|
||||
static const TypeVect *VECTY;
|
||||
static const TypeVect *VECTZ;
|
||||
|
||||
#ifndef PRODUCT
|
||||
virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping
|
||||
@ -755,6 +758,11 @@ class TypeVectY : public TypeVect {
|
||||
TypeVectY(const Type* elem, uint length) : TypeVect(VectorY, elem, length) {}
|
||||
};
|
||||
|
||||
class TypeVectZ : public TypeVect {
|
||||
friend class TypeVect;
|
||||
TypeVectZ(const Type* elem, uint length) : TypeVect(VectorZ, elem, length) {}
|
||||
};
|
||||
|
||||
//------------------------------TypePtr----------------------------------------
|
||||
// Class of machine Pointer Types: raw data, instances or arrays.
|
||||
// If the _base enum is AnyPtr, then this refers to all of the above.
|
||||
@ -1568,12 +1576,12 @@ inline const TypeAry *Type::is_ary() const {
|
||||
}
|
||||
|
||||
inline const TypeVect *Type::is_vect() const {
|
||||
assert( _base >= VectorS && _base <= VectorY, "Not a Vector" );
|
||||
assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" );
|
||||
return (TypeVect*)this;
|
||||
}
|
||||
|
||||
inline const TypeVect *Type::isa_vect() const {
|
||||
return (_base >= VectorS && _base <= VectorY) ? (TypeVect*)this : NULL;
|
||||
return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL;
|
||||
}
|
||||
|
||||
inline const TypePtr *Type::is_ptr() const {
|
||||
|
||||
@ -77,6 +77,9 @@ int VectorNode::opcode(int sopc, BasicType bt) {
|
||||
case T_INT: return Op_MulVI;
|
||||
}
|
||||
ShouldNotReachHere();
|
||||
case Op_MulL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
return Op_MulVL;
|
||||
case Op_MulF:
|
||||
assert(bt == T_FLOAT, "must be");
|
||||
return Op_MulVF;
|
||||
@ -267,6 +270,7 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
|
||||
|
||||
case Op_MulVS: return new MulVSNode(n1, n2, vt);
|
||||
case Op_MulVI: return new MulVINode(n1, n2, vt);
|
||||
case Op_MulVL: return new MulVLNode(n1, n2, vt);
|
||||
case Op_MulVF: return new MulVFNode(n1, n2, vt);
|
||||
case Op_MulVD: return new MulVDNode(n1, n2, vt);
|
||||
|
||||
@ -463,6 +467,10 @@ int ReductionNode::opcode(int opc, BasicType bt) {
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_MulReductionVI;
|
||||
break;
|
||||
case Op_MulL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
vopc = Op_MulReductionVL;
|
||||
break;
|
||||
case Op_MulF:
|
||||
assert(bt == T_FLOAT, "must be");
|
||||
vopc = Op_MulReductionVF;
|
||||
@ -492,6 +500,7 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
|
||||
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
|
||||
case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
|
||||
}
|
||||
|
||||
@ -90,6 +90,30 @@ class AddVINode : public VectorNode {
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddVLNode--------------------------------------
|
||||
// Vector add long
|
||||
class AddVLNode : public VectorNode {
|
||||
public:
|
||||
AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddVFNode--------------------------------------
|
||||
// Vector add float
|
||||
class AddVFNode : public VectorNode {
|
||||
public:
|
||||
AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddVDNode--------------------------------------
|
||||
// Vector add double
|
||||
class AddVDNode : public VectorNode {
|
||||
public:
|
||||
AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------ReductionNode------------------------------------
|
||||
// Perform reduction of a vector
|
||||
class ReductionNode : public Node {
|
||||
@ -121,22 +145,6 @@ public:
|
||||
virtual uint ideal_reg() const { return Op_RegL; }
|
||||
};
|
||||
|
||||
//------------------------------AddVLNode--------------------------------------
|
||||
// Vector add long
|
||||
class AddVLNode : public VectorNode {
|
||||
public:
|
||||
AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddVFNode--------------------------------------
|
||||
// Vector add float
|
||||
class AddVFNode : public VectorNode {
|
||||
public:
|
||||
AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVFNode--------------------------------------
|
||||
// Vector add float as a reduction
|
||||
class AddReductionVFNode : public ReductionNode {
|
||||
@ -147,14 +155,6 @@ public:
|
||||
virtual uint ideal_reg() const { return Op_RegF; }
|
||||
};
|
||||
|
||||
//------------------------------AddVDNode--------------------------------------
|
||||
// Vector add double
|
||||
class AddVDNode : public VectorNode {
|
||||
public:
|
||||
AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVDNode--------------------------------------
|
||||
// Vector add double as a reduction
|
||||
class AddReductionVDNode : public ReductionNode {
|
||||
@ -229,6 +229,30 @@ class MulVINode : public VectorNode {
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulVLNode--------------------------------------
|
||||
// Vector multiply long
|
||||
class MulVLNode : public VectorNode {
|
||||
public:
|
||||
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulVFNode--------------------------------------
|
||||
// Vector multiply float
|
||||
class MulVFNode : public VectorNode {
|
||||
public:
|
||||
MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulVDNode--------------------------------------
|
||||
// Vector multiply double
|
||||
class MulVDNode : public VectorNode {
|
||||
public:
|
||||
MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVINode--------------------------------------
|
||||
// Vector multiply int as a reduction
|
||||
class MulReductionVINode : public ReductionNode {
|
||||
@ -239,12 +263,14 @@ public:
|
||||
virtual uint ideal_reg() const { return Op_RegI; }
|
||||
};
|
||||
|
||||
//------------------------------MulVFNode--------------------------------------
|
||||
// Vector multiply float
|
||||
class MulVFNode : public VectorNode {
|
||||
public:
|
||||
MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
|
||||
//------------------------------MulReductionVLNode--------------------------------------
|
||||
// Vector multiply int as a reduction
|
||||
class MulReductionVLNode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return TypeLong::LONG; }
|
||||
virtual uint ideal_reg() const { return Op_RegI; }
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVFNode--------------------------------------
|
||||
@ -257,14 +283,6 @@ public:
|
||||
virtual uint ideal_reg() const { return Op_RegF; }
|
||||
};
|
||||
|
||||
//------------------------------MulVDNode--------------------------------------
|
||||
// Vector multiply double
|
||||
class MulVDNode : public VectorNode {
|
||||
public:
|
||||
MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVDNode--------------------------------------
|
||||
// Vector multiply double as a reduction
|
||||
class MulReductionVDNode : public ReductionNode {
|
||||
|
||||
@ -2010,6 +2010,8 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
||||
declare_c2_type(SubVFNode, VectorNode) \
|
||||
declare_c2_type(SubVDNode, VectorNode) \
|
||||
declare_c2_type(MulVSNode, VectorNode) \
|
||||
declare_c2_type(MulVLNode, VectorNode) \
|
||||
declare_c2_type(MulReductionVLNode, ReductionNode) \
|
||||
declare_c2_type(MulVINode, VectorNode) \
|
||||
declare_c2_type(MulReductionVINode, ReductionNode) \
|
||||
declare_c2_type(MulVFNode, VectorNode) \
|
||||
|
||||
92
hotspot/test/compiler/loopopts/superword/SumRed_Long.java
Normal file
92
hotspot/test/compiler/loopopts/superword/SumRed_Long.java
Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8076276
|
||||
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : long test
|
||||
*
|
||||
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
|
||||
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
|
||||
*
|
||||
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
|
||||
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
|
||||
*
|
||||
*/
|
||||
|
||||
public class SumRed_Long
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
long[] a = new long[256*1024];
|
||||
long[] b = new long[256*1024];
|
||||
long[] c = new long[256*1024];
|
||||
long[] d = new long[256*1024];
|
||||
sumReductionInit(a,b,c);
|
||||
long total = 0;
|
||||
long valid = 262144000;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = sumReductionImplement(a,b,c,d,total);
|
||||
}
|
||||
total = (int)total;
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void sumReductionInit(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c)
|
||||
{
|
||||
for(int j = 0; j < 1; j++)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i * 1 + j;
|
||||
b[i] = i * 1 - j;
|
||||
c[i] = i + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static long sumReductionImplement(
|
||||
long[] a,
|
||||
long[] b,
|
||||
long[] c,
|
||||
long[] d,
|
||||
long total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user