mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-18 03:43:15 +00:00
8380079: Add separate flag for platforms on which copy and clear operations are faster with AVX3Threshold set to 0
Reviewed-by: kvn, asmehra, sviswanathan
This commit is contained in:
parent
7695b1f9c2
commit
9dc3f488b4
@ -168,16 +168,27 @@ define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
|
||||
"Perform Ecore Optimization") \
|
||||
\
|
||||
/* Minimum array size in bytes to use AVX512 intrinsics */ \
|
||||
/* for copy, inflate and fill which don't bail out early based on any */ \
|
||||
/* for inflate and fill which don't bail out early based on any */ \
|
||||
/* condition. When this value is set to zero compare operations like */ \
|
||||
/* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\
|
||||
product(int, AVX3Threshold, 4096, DIAGNOSTIC, \
|
||||
"Minimum array size in bytes to use AVX512 intrinsics" \
|
||||
"for copy, inflate and fill. When this value is set as zero" \
|
||||
"for inflate and fill. When this value is set as zero" \
|
||||
"compare operations can also use AVX512 intrinsics.") \
|
||||
range(0, max_jint) \
|
||||
constraint(AVX3ThresholdConstraintFunc,AfterErgo) \
|
||||
\
|
||||
/* Minimum array size in bytes to use AVX512 intrinsics */ \
|
||||
/* for copy and fill which don't bail out early based on any */ \
|
||||
/* condition. When this value is set to zero clear operations that */ \
|
||||
/* work on memory blocks can also use AVX512 intrinsics. */ \
|
||||
product(int, CopyAVX3Threshold, 4096, DIAGNOSTIC, \
|
||||
"Minimum array size in bytes to use AVX512 intrinsics" \
|
||||
"for copy and fill. When this value is set as zero" \
|
||||
"clear operations can also use AVX512 intrinsics.") \
|
||||
range(0, max_jint) \
|
||||
constraint(CopyAVX3ThresholdConstraintFunc,AfterErgo) \
|
||||
\
|
||||
product(bool, IntelJccErratumMitigation, true, DIAGNOSTIC, \
|
||||
"Turn off JVM mitigations related to Intel micro code " \
|
||||
"mitigations for the Intel JCC erratum") \
|
||||
|
||||
@ -5820,7 +5820,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
|
||||
// cnt - number of qwords (8-byte words).
|
||||
// base - start address, qword aligned.
|
||||
Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
|
||||
bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
|
||||
bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
|
||||
if (use64byteVector) {
|
||||
vpxor(xtmp, xtmp, xtmp, AVX_512bit);
|
||||
} else if (MaxVectorSize >= 32) {
|
||||
@ -5884,7 +5884,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
|
||||
// Clearing constant sized memory using YMM/ZMM registers.
|
||||
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
|
||||
assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
|
||||
|
||||
int vector64_count = (cnt & (~0x7)) >> 3;
|
||||
cnt = cnt & 0x7;
|
||||
@ -6109,8 +6109,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
// Fill 64-byte chunks
|
||||
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
|
||||
|
||||
// If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
|
||||
cmpptr(count, VM_Version::avx3_threshold());
|
||||
// If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
|
||||
cmpptr(count, CopyAVX3Threshold);
|
||||
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
|
||||
|
||||
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
|
||||
@ -9483,7 +9483,6 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
|
||||
Label L_fill_zmm_sequence;
|
||||
|
||||
int shift = -1;
|
||||
int avx3threshold = VM_Version::avx3_threshold();
|
||||
switch(type) {
|
||||
case T_BYTE: shift = 0;
|
||||
break;
|
||||
@ -9499,10 +9498,10 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
|
||||
fatal("Unhandled type: %s\n", type2name(type));
|
||||
}
|
||||
|
||||
if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
|
||||
if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
|
||||
|
||||
if (MaxVectorSize == 64) {
|
||||
cmpq(count, avx3threshold >> shift);
|
||||
cmpq(count, CopyAVX3Threshold >> shift);
|
||||
jcc(Assembler::greater, L_fill_zmm_sequence);
|
||||
}
|
||||
|
||||
|
||||
@ -166,12 +166,12 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
|
||||
// for both special cases (various small block sizes) and aligned copy loop. This is the
|
||||
// default configuration.
|
||||
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
|
||||
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
|
||||
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
|
||||
// - If user forces MaxVectorSize=32 then above 4096 bytes it is seen that REP MOVs shows a
|
||||
// better performance for disjoint copies. For conjoint/backward copy vector based
|
||||
// copy performs better.
|
||||
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
|
||||
// - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
|
||||
// 64 byte vector registers (ZMMs).
|
||||
|
||||
address generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry);
|
||||
|
||||
@ -144,7 +144,7 @@ address StubGenerator::generate_updateBytesAdler32() {
|
||||
__ align32();
|
||||
if (VM_Version::supports_avx512vl()) {
|
||||
// AVX2 performs better for smaller inputs because of leaner post loop reduction sequence..
|
||||
__ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
|
||||
__ cmpl(s, MAX2(128, CopyAVX3Threshold));
|
||||
__ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
|
||||
__ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1)));
|
||||
|
||||
|
||||
@ -511,12 +511,12 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest,
|
||||
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
|
||||
// for both special cases (various small block sizes) and aligned copy loop. This is the
|
||||
// default configuration.
|
||||
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
|
||||
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
|
||||
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
|
||||
// - If user forces MaxVectorSize=32 then above 4096 bytes it is seen that REP MOVs shows a
|
||||
// better performance for disjoint copies. For conjoint/backward copy vector based
|
||||
// copy performs better.
|
||||
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
|
||||
// - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
|
||||
// 64 byte vector registers (ZMMs).
|
||||
|
||||
// Inputs:
|
||||
@ -575,8 +575,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
|
||||
StubCodeMark mark(this, stub_id);
|
||||
address start = __ pc();
|
||||
|
||||
int avx3threshold = VM_Version::avx3_threshold();
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
|
||||
const int large_threshold = 2621440; // 2.5 MB
|
||||
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
|
||||
Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
|
||||
@ -647,7 +646,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
|
||||
__ cmpq(temp2, large_threshold);
|
||||
__ jcc(Assembler::greaterEqual, L_copy_large);
|
||||
}
|
||||
if (avx3threshold != 0) {
|
||||
if (CopyAVX3Threshold != 0) {
|
||||
__ cmpq(count, threshold[shift]);
|
||||
if (MaxVectorSize == 64) {
|
||||
// Copy using 64 byte vectors.
|
||||
@ -659,7 +658,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
|
||||
}
|
||||
}
|
||||
|
||||
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
|
||||
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
|
||||
// Partial copy to make dst address 32 byte aligned.
|
||||
__ movq(temp2, to);
|
||||
__ andq(temp2, 31);
|
||||
@ -913,8 +912,7 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
|
||||
StubCodeMark mark(this, stub_id);
|
||||
address start = __ pc();
|
||||
|
||||
int avx3threshold = VM_Version::avx3_threshold();
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
|
||||
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
|
||||
|
||||
Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
|
||||
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
|
||||
@ -979,12 +977,12 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
|
||||
// PRE-MAIN-POST loop for aligned copy.
|
||||
__ BIND(L_entry);
|
||||
|
||||
if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
|
||||
if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
|
||||
__ cmpq(temp1, threshold[shift]);
|
||||
__ jcc(Assembler::greaterEqual, L_pre_main_post_64);
|
||||
}
|
||||
|
||||
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
|
||||
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
|
||||
// Partial copy to make dst address 32 byte aligned.
|
||||
__ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
|
||||
__ andq(temp2, 31);
|
||||
@ -1199,7 +1197,7 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
|
||||
bool use64byteVector, Label& L_entry, Label& L_exit) {
|
||||
Label L_entry_64, L_entry_96, L_entry_128;
|
||||
Label L_entry_160, L_entry_192;
|
||||
bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
|
||||
bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
|
||||
|
||||
int size_mat[][6] = {
|
||||
/* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
|
||||
|
||||
@ -1967,6 +1967,18 @@ void VM_Version::get_processor_features() {
|
||||
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
|
||||
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
|
||||
}
|
||||
// CopyAVX3Threshold is the threshold at which 64-byte instructions are used
|
||||
// for implementing the array copy and clear operations.
|
||||
// The Intel platforms that support the serialize instruction
|
||||
// have improved implementation of 64-byte load/stores and so the default
|
||||
// threshold is set to 0 for these platforms.
|
||||
if (FLAG_IS_DEFAULT(CopyAVX3Threshold)) {
|
||||
if (is_intel() && is_intel_server_family() && supports_serialize()) {
|
||||
FLAG_SET_DEFAULT(CopyAVX3Threshold, 0);
|
||||
} else {
|
||||
FLAG_SET_DEFAULT(CopyAVX3Threshold, AVX3Threshold);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void VM_Version::print_platform_virtualization_info(outputStream* st) {
|
||||
@ -2122,17 +2134,6 @@ bool VM_Version::is_intel_darkmont() {
|
||||
return is_intel() && is_intel_server_family() && (_model == 0xCC || _model == 0xDD);
|
||||
}
|
||||
|
||||
// avx3_threshold() sets the threshold at which 64-byte instructions are used
|
||||
// for implementing the array copy and clear operations.
|
||||
// The Intel platforms that supports the serialize instruction
|
||||
// has improved implementation of 64-byte load/stores and so the default
|
||||
// threshold is set to 0 for these platforms.
|
||||
int VM_Version::avx3_threshold() {
|
||||
return (is_intel_server_family() &&
|
||||
supports_serialize() &&
|
||||
FLAG_IS_DEFAULT(AVX3Threshold)) ? 0 : AVX3Threshold;
|
||||
}
|
||||
|
||||
// Clears the APX test state by invoking clear_apx_test_state_stub().
void VM_Version::clear_apx_test_state() {
|
||||
clear_apx_test_state_stub();
|
||||
}
|
||||
|
||||
@ -958,8 +958,6 @@ public:
|
||||
|
||||
static bool is_intel_darkmont();
|
||||
|
||||
static int avx3_threshold();
|
||||
|
||||
static bool is_intel_tsc_synched_at_init();
|
||||
|
||||
static void insert_features_names(VM_Version::VM_Features features, stringStream& ss);
|
||||
|
||||
@ -274,6 +274,17 @@ JVMFlag::Error AVX3ThresholdConstraintFunc(int value, bool verbose) {
|
||||
return JVMFlag::SUCCESS;
|
||||
}
|
||||
|
||||
// Constraint check for the CopyAVX3Threshold flag.
//
// value   - the candidate flag value to validate.
// verbose - forwarded unchanged to JVMFlag::printError when the value is rejected.
//
// Returns JVMFlag::SUCCESS when value is 0 or a power of two; otherwise reports
// the problem through JVMFlag::printError and returns JVMFlag::VIOLATES_CONSTRAINT.
JVMFlag::Error CopyAVX3ThresholdConstraintFunc(int value, bool verbose) {
  // Zero is tested first so that is_power_of_2() is never consulted for it.
  const bool acceptable = (value == 0) || is_power_of_2(value);
  if (acceptable) {
    return JVMFlag::SUCCESS;
  }

  JVMFlag::printError(verbose,
                      "CopyAVX3Threshold ( %d ) must be 0 or "
                      "a power of two value between 0 and MAX_INT\n", value);
  return JVMFlag::VIOLATES_CONSTRAINT;
}
|
||||
|
||||
JVMFlag::Error ArraycopySrcPrefetchDistanceConstraintFunc(uintx value, bool verbose) {
|
||||
if (value >= 4032) {
|
||||
JVMFlag::printError(verbose,
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
f(uintx, ArraycopyDstPrefetchDistanceConstraintFunc) \
|
||||
f(uintx, ArraycopySrcPrefetchDistanceConstraintFunc) \
|
||||
f(int, AVX3ThresholdConstraintFunc) \
|
||||
f(int, CopyAVX3ThresholdConstraintFunc) \
|
||||
f(uint, TypeProfileLevelConstraintFunc) \
|
||||
f(uint, VerifyIterativeGVNConstraintFunc) \
|
||||
f(intx, InitArrayShortSizeConstraintFunc) \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user