8380079: Add separate flag for platforms on which copy and clear operations are faster with AVX3Threshold set to 0

Reviewed-by: kvn, asmehra, sviswanathan
Mohamed Issa, 2026-03-16 19:21:05 +00:00, committed by Sandhya Viswanathan
parent 7695b1f9c2
commit 9dc3f488b4
9 changed files with 55 additions and 36 deletions

View File

@@ -168,16 +168,27 @@ define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
"Perform Ecore Optimization") \
\
/* Minimum array size in bytes to use AVX512 intrinsics */ \
/* for copy, inflate and fill which don't bail out early based on any */ \
/* for inflate and fill which don't bail out early based on any */ \
/* condition. When this value is set to zero compare operations like */ \
/* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\
product(int, AVX3Threshold, 4096, DIAGNOSTIC, \
"Minimum array size in bytes to use AVX512 intrinsics" \
"for copy, inflate and fill. When this value is set as zero" \
"for inflate and fill. When this value is set as zero" \
"compare operations can also use AVX512 intrinsics.") \
range(0, max_jint) \
constraint(AVX3ThresholdConstraintFunc,AfterErgo) \
\
/* Minimum array size in bytes to use AVX512 intrinsics */ \
/* for copy and fill which don't bail out early based on any */ \
/* condition. When this value is set to zero clear operations that */ \
/* work on memory blocks can also use AVX512 intrinsics. */ \
product(int, CopyAVX3Threshold, 4096, DIAGNOSTIC, \
"Minimum array size in bytes to use AVX512 intrinsics" \
"for copy and fill. When this value is set as zero" \
"clear operations can also use AVX512 intrinsics.") \
range(0, max_jint) \
constraint(CopyAVX3ThresholdConstraintFunc,AfterErgo) \
\
product(bool, IntelJccErratumMitigation, true, DIAGNOSTIC, \
"Turn off JVM mitigations related to Intel micro code " \
"mitigations for the Intel JCC erratum") \

View File

@@ -5820,7 +5820,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
if (use64byteVector) {
vpxor(xtmp, xtmp, xtmp, AVX_512bit);
} else if (MaxVectorSize >= 32) {
@@ -5884,7 +5884,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
// Clearing constant sized memory using YMM/ZMM registers.
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
int vector64_count = (cnt & (~0x7)) >> 3;
cnt = cnt & 0x7;
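For readability, a minimal standalone sketch (illustrative only, not HotSpot source; the helper name is made up) of the predicate both clear routines above now compute:

// 64-byte (ZMM) clearing is used only when ZMM-wide vectors are available
// and CopyAVX3Threshold has been tuned down to 0.
static bool use_64byte_vectors_for_clear(int max_vector_size, int copy_avx3_threshold) {
  return (max_vector_size > 32) && (copy_avx3_threshold == 0);
}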
@@ -6109,8 +6109,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
// Fill 64-byte chunks
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
// If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
cmpptr(count, VM_Version::avx3_threshold());
// If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
cmpptr(count, CopyAVX3Threshold);
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
@@ -9483,7 +9483,6 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
Label L_fill_zmm_sequence;
int shift = -1;
int avx3threshold = VM_Version::avx3_threshold();
switch(type) {
case T_BYTE: shift = 0;
break;
@@ -9499,10 +9498,10 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
fatal("Unhandled type: %s\n", type2name(type));
}
if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
if (MaxVectorSize == 64) {
cmpq(count, avx3threshold >> shift);
cmpq(count, CopyAVX3Threshold >> shift);
jcc(Assembler::greater, L_fill_zmm_sequence);
}
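A simplified sketch (assumed semantics, hypothetical helper, not the stub generator itself) of the fill dispatch above when 64-byte vectors are enabled: the byte threshold is scaled to an element count by the type's shift before the comparison, and a threshold of 0 makes the ZMM path unconditional.

// shift = log2(element size in bytes): 0 for T_BYTE, 1 for T_SHORT, 2 for T_INT.
static bool fill_takes_zmm_path(int64_t element_count, int copy_avx3_threshold_bytes, int shift) {
  if (copy_avx3_threshold_bytes == 0) return true;               // no threshold: always use ZMMs
  return element_count > (copy_avx3_threshold_bytes >> shift);   // above threshold: use ZMMs
}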

View File

@@ -166,12 +166,12 @@ class StubGenerator: public StubCodeGenerator {
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
// for both special cases (various small block sizes) and aligned copy loop. This is the
// default configuration.
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
// - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
// better performance for disjoint copies. For conjoint/backward copy vector based
// copy performs better.
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
// - If user sets CopyAVX3Threshold=0, then special cases for small block sizes operate over
// 64 byte vector registers (ZMMs).
address generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry);
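The bullet points above describe the dispatch policy; below is a compact sketch (illustrative only and simplified, not part of the stub generator) of how copy length and CopyAVX3Threshold select the vector width for the main copy loop:

enum class CopyLoopWidth { YMM_32_BYTE, ZMM_64_BYTE };

static CopyLoopWidth select_copy_loop_width(size_t copy_length_bytes,
                                            int copy_avx3_threshold,
                                            int max_vector_size) {
  if (max_vector_size < 64)     return CopyLoopWidth::YMM_32_BYTE;  // no ZMMs configured
  if (copy_avx3_threshold == 0) return CopyLoopWidth::ZMM_64_BYTE;  // even small blocks use ZMMs
  return (copy_length_bytes > (size_t)copy_avx3_threshold)
             ? CopyLoopWidth::ZMM_64_BYTE    // bulk of cycles is in the main loop
             : CopyLoopWidth::YMM_32_BYTE;   // default configuration for short copies
}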

View File

@@ -144,7 +144,7 @@ address StubGenerator::generate_updateBytesAdler32() {
__ align32();
if (VM_Version::supports_avx512vl()) {
// AVX2 performs better for smaller inputs because of leaner post loop reduction sequence..
__ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
__ cmpl(s, MAX2(128, CopyAVX3Threshold));
__ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
__ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1)));
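Design note: wrapping the threshold in MAX2(128, CopyAVX3Threshold) keeps inputs of 128 bytes or fewer on the AVX2 path even when the threshold is tuned to 0, consistent with the comment above about AVX2's leaner post-loop reduction being faster for small inputs.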

View File

@@ -511,12 +511,12 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest,
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
// for both special cases (various small block sizes) and aligned copy loop. This is the
// default configuration.
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
// - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
// better performance for disjoint copies. For conjoint/backward copy vector based
// copy performs better.
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
// - If user sets CopyAVX3Threshold=0, then special cases for small block sizes operate over
// 64 byte vector registers (ZMMs).
// Inputs:
@@ -575,8 +575,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
StubCodeMark mark(this, stub_id);
address start = __ pc();
int avx3threshold = VM_Version::avx3_threshold();
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
const int large_threshold = 2621440; // 2.5 MB
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
@@ -647,7 +646,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
__ cmpq(temp2, large_threshold);
__ jcc(Assembler::greaterEqual, L_copy_large);
}
if (avx3threshold != 0) {
if (CopyAVX3Threshold != 0) {
__ cmpq(count, threshold[shift]);
if (MaxVectorSize == 64) {
// Copy using 64 byte vectors.
@@ -659,7 +658,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
}
}
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
// Partial copy to make dst address 32 byte aligned.
__ movq(temp2, to);
__ andq(temp2, 31);
@@ -913,8 +912,7 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
StubCodeMark mark(this, stub_id);
address start = __ pc();
int avx3threshold = VM_Version::avx3_threshold();
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
@@ -979,12 +977,12 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
// PRE-MAIN-POST loop for aligned copy.
__ BIND(L_entry);
if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
__ cmpq(temp1, threshold[shift]);
__ jcc(Assembler::greaterEqual, L_pre_main_post_64);
}
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
// Partial copy to make dst address 32 byte aligned.
__ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
__ andq(temp2, 31);
@@ -1199,7 +1197,7 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
bool use64byteVector, Label& L_entry, Label& L_exit) {
Label L_entry_64, L_entry_96, L_entry_128;
Label L_entry_160, L_entry_192;
bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
int size_mat[][6] = {
/* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },

View File

@@ -1967,6 +1967,18 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
}
// CopyAVX3Threshold is the threshold at which 64-byte instructions are used
// for implementing the array copy and clear operations.
// The Intel platforms that support the serialize instruction
// have an improved implementation of 64-byte load/stores and so the default
// threshold is set to 0 for these platforms.
if (FLAG_IS_DEFAULT(CopyAVX3Threshold)) {
if (is_intel() && is_intel_server_family() && supports_serialize()) {
FLAG_SET_DEFAULT(CopyAVX3Threshold, 0);
} else {
FLAG_SET_DEFAULT(CopyAVX3Threshold, AVX3Threshold);
}
}
}
void VM_Version::print_platform_virtualization_info(outputStream* st) {
@@ -2122,17 +2134,6 @@ bool VM_Version::is_intel_darkmont() {
return is_intel() && is_intel_server_family() && (_model == 0xCC || _model == 0xDD);
}
// avx3_threshold() sets the threshold at which 64-byte instructions are used
// for implementing the array copy and clear operations.
// The Intel platforms that supports the serialize instruction
// has improved implementation of 64-byte load/stores and so the default
// threshold is set to 0 for these platforms.
int VM_Version::avx3_threshold() {
return (is_intel_server_family() &&
supports_serialize() &&
FLAG_IS_DEFAULT(AVX3Threshold)) ? 0 : AVX3Threshold;
}
void VM_Version::clear_apx_test_state() {
clear_apx_test_state_stub();
}
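Net effect of the two hunks above: the avx3_threshold() helper is removed and the platform tuning it encoded moves into flag ergonomics. Intel server-family parts that support the serialize instruction now default CopyAVX3Threshold to 0, while every other platform inherits the AVX3Threshold value, so copy and clear code generation there is unchanged by the split.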

View File

@@ -958,8 +958,6 @@ public:
static bool is_intel_darkmont();
static int avx3_threshold();
static bool is_intel_tsc_synched_at_init();
static void insert_features_names(VM_Version::VM_Features features, stringStream& ss);

View File

@@ -274,6 +274,17 @@ JVMFlag::Error AVX3ThresholdConstraintFunc(int value, bool verbose) {
return JVMFlag::SUCCESS;
}
JVMFlag::Error CopyAVX3ThresholdConstraintFunc(int value, bool verbose) {
if (value != 0 && !is_power_of_2(value)) {
JVMFlag::printError(verbose,
"CopyAVX3Threshold ( %d ) must be 0 or "
"a power of two value between 0 and MAX_INT\n", value);
return JVMFlag::VIOLATES_CONSTRAINT;
}
return JVMFlag::SUCCESS;
}
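For reference, a self-contained sketch (illustrative; HotSpot's actual is_power_of_2 lives in its utility headers) of the check this constraint applies, with the accepted values:

// A value satisfies the constraint iff it is 0 or a positive power of two;
// the flag's range(0, max_jint) already rules out negative values.
static bool copy_avx3_threshold_ok(int value) {
  return value == 0 || (value > 0 && (value & (value - 1)) == 0);
}
// Accepted: 0, 1, 2, 4, ..., 1 << 30.  Rejected: 3, 100, 4095, ...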
JVMFlag::Error ArraycopySrcPrefetchDistanceConstraintFunc(uintx value, bool verbose) {
if (value >= 4032) {
JVMFlag::printError(verbose,

View File

@@ -46,6 +46,7 @@
f(uintx, ArraycopyDstPrefetchDistanceConstraintFunc) \
f(uintx, ArraycopySrcPrefetchDistanceConstraintFunc) \
f(int, AVX3ThresholdConstraintFunc) \
f(int, CopyAVX3ThresholdConstraintFunc) \
f(uint, TypeProfileLevelConstraintFunc) \
f(uint, VerifyIterativeGVNConstraintFunc) \
f(intx, InitArrayShortSizeConstraintFunc) \