diff --git a/src/hotspot/cpu/x86/globals_x86.hpp b/src/hotspot/cpu/x86/globals_x86.hpp index 4f5b6d31e75..084793dc262 100644 --- a/src/hotspot/cpu/x86/globals_x86.hpp +++ b/src/hotspot/cpu/x86/globals_x86.hpp @@ -168,16 +168,27 @@ define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); "Perform Ecore Optimization") \ \ /* Minimum array size in bytes to use AVX512 intrinsics */ \ - /* for copy, inflate and fill which don't bail out early based on any */ \ + /* for inflate and fill which don't bail out early based on any */ \ /* condition. When this value is set to zero compare operations like */ \ /* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\ product(int, AVX3Threshold, 4096, DIAGNOSTIC, \ "Minimum array size in bytes to use AVX512 intrinsics" \ - "for copy, inflate and fill. When this value is set as zero" \ + "for inflate and fill. When this value is set as zero" \ "compare operations can also use AVX512 intrinsics.") \ range(0, max_jint) \ constraint(AVX3ThresholdConstraintFunc,AfterErgo) \ \ + /* Minimum array size in bytes to use AVX512 intrinsics */ \ + /* for copy and fill which don't bail out early based on any */ \ + /* condition. When this value is set to zero clear operations that */ \ + /* work on memory blocks can also use AVX512 intrinsics. */ \ + product(int, CopyAVX3Threshold, 4096, DIAGNOSTIC, \ + "Minimum array size in bytes to use AVX512 intrinsics" \ + "for copy and fill. 
When this value is set as zero" \ + "clear operations can also use AVX512 intrinsics.") \ + range(0, max_jint) \ + constraint(CopyAVX3ThresholdConstraintFunc,AfterErgo) \ + \ product(bool, IntelJccErratumMitigation, true, DIAGNOSTIC, \ "Turn off JVM mitigations related to Intel micro code " \ "mitigations for the Intel JCC erratum") \ diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 2d46a50d426..1d77be26bd9 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -5820,7 +5820,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X // cnt - number of qwords (8-byte words). // base - start address, qword aligned. Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; - bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0); + bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0); if (use64byteVector) { vpxor(xtmp, xtmp, xtmp, AVX_512bit); } else if (MaxVectorSize >= 32) { @@ -5884,7 +5884,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X // Clearing constant sized memory using YMM/ZMM registers. 
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { assert(UseAVX > 2 && VM_Version::supports_avx512vl(), ""); - bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); + bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0); int vector64_count = (cnt & (~0x7)) >> 3; cnt = cnt & 0x7; @@ -6109,8 +6109,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, // Fill 64-byte chunks Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; - // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2 - cmpptr(count, VM_Version::avx3_threshold()); + // If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2 + cmpptr(count, CopyAVX3Threshold); jccb(Assembler::below, L_check_fill_64_bytes_avx2); vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); @@ -9483,7 +9483,6 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va Label L_fill_zmm_sequence; int shift = -1; - int avx3threshold = VM_Version::avx3_threshold(); switch(type) { case T_BYTE: shift = 0; break; @@ -9499,10 +9498,10 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va fatal("Unhandled type: %s\n", type2name(type)); } - if ((avx3threshold != 0) || (MaxVectorSize == 32)) { + if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) { if (MaxVectorSize == 64) { - cmpq(count, avx3threshold >> shift); + cmpq(count, CopyAVX3Threshold >> shift); jcc(Assembler::greater, L_fill_zmm_sequence); } diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 64b56442c90..332add6dcd4 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -166,12 +166,12 @@ class StubGenerator: public StubCodeGenerator { // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) // for both 
special cases (various small block sizes) and aligned copy loop. This is the // default configuration. - // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) + // - If copy length is above CopyAVX3Threshold, then implementation use 64 byte vectors (ZMMs) // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a // better performance for disjoint copies. For conjoint/backward copy vector based // copy performs better. - // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over + // - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over // 64 byte vector registers (ZMMs). address generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_adler.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_adler.cpp index 2799997a761..1d3e7afde1d 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_adler.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_adler.cpp @@ -144,7 +144,7 @@ address StubGenerator::generate_updateBytesAdler32() { __ align32(); if (VM_Version::supports_avx512vl()) { // AVX2 performs better for smaller inputs because of leaner post loop reduction sequence.. 
- __ cmpl(s, MAX2(128, VM_Version::avx3_threshold())); + __ cmpl(s, MAX2(128, CopyAVX3Threshold)); __ jcc(Assembler::belowEqual, SLOOP1A_AVX2); __ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1))); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp index d53fafafdb4..01e004b7b43 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp @@ -511,12 +511,12 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest, // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) // for both special cases (various small block sizes) and aligned copy loop. This is the // default configuration. -// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) +// - If copy length is above CopyAVX3Threshold, then implementation use 64 byte vectors (ZMMs) // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a // better performance for disjoint copies. For conjoint/backward copy vector based // copy performs better. -// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over +// - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over // 64 byte vector registers (ZMMs). 
// Inputs: @@ -575,8 +575,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres StubCodeMark mark(this, stub_id); address start = __ pc(); - int avx3threshold = VM_Version::avx3_threshold(); - bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0); + bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0); const int large_threshold = 2621440; // 2.5 MB Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; @@ -647,7 +646,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres __ cmpq(temp2, large_threshold); __ jcc(Assembler::greaterEqual, L_copy_large); } - if (avx3threshold != 0) { + if (CopyAVX3Threshold != 0) { __ cmpq(count, threshold[shift]); if (MaxVectorSize == 64) { // Copy using 64 byte vectors. @@ -659,7 +658,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres } } - if ((MaxVectorSize < 64) || (avx3threshold != 0)) { + if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) { // Partial copy to make dst address 32 byte aligned. __ movq(temp2, to); __ andq(temp2, 31); @@ -913,8 +912,7 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres StubCodeMark mark(this, stub_id); address start = __ pc(); - int avx3threshold = VM_Version::avx3_threshold(); - bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0); + bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0); Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; @@ -979,12 +977,12 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres // PRE-MAIN-POST loop for aligned copy. 
__ BIND(L_entry); - if ((MaxVectorSize > 32) && (avx3threshold != 0)) { + if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) { __ cmpq(temp1, threshold[shift]); __ jcc(Assembler::greaterEqual, L_pre_main_post_64); } - if ((MaxVectorSize < 64) || (avx3threshold != 0)) { + if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) { // Partial copy to make dst address 32 byte aligned. __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); __ andq(temp2, 31); @@ -1199,7 +1197,7 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi bool use64byteVector, Label& L_entry, Label& L_exit) { Label L_entry_64, L_entry_96, L_entry_128; Label L_entry_160, L_entry_192; - bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); + bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0); int size_mat[][6] = { /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index b352de77d6f..a800feea0a8 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -1967,6 +1967,18 @@ void VM_Version::get_processor_features() { if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) { FLAG_SET_DEFAULT(UseCopySignIntrinsic, true); } + // CopyAVX3Threshold is the threshold at which 64-byte instructions are used + // for implementing the array copy and clear operations. + // The Intel platforms that support the serialize instruction + // have improved implementation of 64-byte load/stores and so the default + // threshold is set to 0 for these platforms. 
+ if (FLAG_IS_DEFAULT(CopyAVX3Threshold)) { + if (is_intel() && is_intel_server_family() && supports_serialize()) { + FLAG_SET_DEFAULT(CopyAVX3Threshold, 0); + } else { + FLAG_SET_DEFAULT(CopyAVX3Threshold, AVX3Threshold); + } + } } void VM_Version::print_platform_virtualization_info(outputStream* st) { @@ -2122,17 +2134,6 @@ bool VM_Version::is_intel_darkmont() { return is_intel() && is_intel_server_family() && (_model == 0xCC || _model == 0xDD); } -// avx3_threshold() sets the threshold at which 64-byte instructions are used -// for implementing the array copy and clear operations. -// The Intel platforms that supports the serialize instruction -// has improved implementation of 64-byte load/stores and so the default -// threshold is set to 0 for these platforms. -int VM_Version::avx3_threshold() { - return (is_intel_server_family() && - supports_serialize() && - FLAG_IS_DEFAULT(AVX3Threshold)) ? 0 : AVX3Threshold; -} - void VM_Version::clear_apx_test_state() { clear_apx_test_state_stub(); } diff --git a/src/hotspot/cpu/x86/vm_version_x86.hpp b/src/hotspot/cpu/x86/vm_version_x86.hpp index 9f0446df7c6..a42558a8023 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.hpp +++ b/src/hotspot/cpu/x86/vm_version_x86.hpp @@ -958,8 +958,6 @@ public: static bool is_intel_darkmont(); - static int avx3_threshold(); - static bool is_intel_tsc_synched_at_init(); static void insert_features_names(VM_Version::VM_Features features, stringStream& ss); diff --git a/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.cpp b/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.cpp index 444ce321759..36eece6f013 100644 --- a/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.cpp +++ b/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.cpp @@ -274,6 +274,17 @@ JVMFlag::Error AVX3ThresholdConstraintFunc(int value, bool verbose) { return JVMFlag::SUCCESS; } +JVMFlag::Error CopyAVX3ThresholdConstraintFunc(int value, bool verbose) { + if (value != 0 && 
!is_power_of_2(value)) { + JVMFlag::printError(verbose, + "CopyAVX3Threshold ( %d ) must be 0 or " + "a power of two value between 0 and MAX_INT\n", value); + return JVMFlag::VIOLATES_CONSTRAINT; + } + + return JVMFlag::SUCCESS; +} + JVMFlag::Error ArraycopySrcPrefetchDistanceConstraintFunc(uintx value, bool verbose) { if (value >= 4032) { JVMFlag::printError(verbose, diff --git a/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.hpp b/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.hpp index cf785800cfc..45e91058e0b 100644 --- a/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.hpp +++ b/src/hotspot/share/runtime/flags/jvmFlagConstraintsCompiler.hpp @@ -46,6 +46,7 @@ f(uintx, ArraycopyDstPrefetchDistanceConstraintFunc) \ f(uintx, ArraycopySrcPrefetchDistanceConstraintFunc) \ f(int, AVX3ThresholdConstraintFunc) \ + f(int, CopyAVX3ThresholdConstraintFunc) \ f(uint, TypeProfileLevelConstraintFunc) \ f(uint, VerifyIterativeGVNConstraintFunc) \ f(intx, InitArrayShortSizeConstraintFunc) \