8380079: Add separate flag for platforms on which copy and clear operations are faster with AVX3Threshold set to 0

Reviewed-by: kvn, asmehra, sviswanathan
Mohamed Issa, 2026-03-16 19:21:05 +00:00, committed by Sandhya Viswanathan
parent 7695b1f9c2
commit 9dc3f488b4
9 changed files with 55 additions and 36 deletions

View File

@@ -168,16 +168,27 @@ define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
"Perform Ecore Optimization") \
\
/* Minimum array size in bytes to use AVX512 intrinsics */ \
/* for copy, inflate and fill which don't bail out early based on any */ \
/* for inflate and fill which don't bail out early based on any */ \
/* condition. When this value is set to zero compare operations like */ \
/* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\
product(int, AVX3Threshold, 4096, DIAGNOSTIC, \
"Minimum array size in bytes to use AVX512 intrinsics" \
"for copy, inflate and fill. When this value is set as zero" \
"for inflate and fill. When this value is set as zero" \
"compare operations can also use AVX512 intrinsics.") \
range(0, max_jint) \
constraint(AVX3ThresholdConstraintFunc,AfterErgo) \
\
/* Minimum array size in bytes to use AVX512 intrinsics */ \
/* for copy and fill which don't bail out early based on any */ \
/* condition. When this value is set to zero clear operations that */ \
/* work on memory blocks can also use AVX512 intrinsics. */ \
product(int, CopyAVX3Threshold, 4096, DIAGNOSTIC, \
"Minimum array size in bytes to use AVX512 intrinsics" \
"for copy and fill. When this value is set as zero" \
"clear operations can also use AVX512 intrinsics.") \
range(0, max_jint) \
constraint(CopyAVX3ThresholdConstraintFunc,AfterErgo) \
\
product(bool, IntelJccErratumMitigation, true, DIAGNOSTIC, \
"Turn off JVM mitigations related to Intel micro code " \
"mitigations for the Intel JCC erratum") \

View File

@@ -5820,7 +5820,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
bool use64byteVector = (MaxVectorSize == 64) && (CopyAVX3Threshold == 0);
if (use64byteVector) {
vpxor(xtmp, xtmp, xtmp, AVX_512bit);
} else if (MaxVectorSize >= 32) {
@@ -5884,7 +5884,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
// Clearing constant sized memory using YMM/ZMM registers.
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
int vector64_count = (cnt & (~0x7)) >> 3;
cnt = cnt & 0x7;
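For readability, a minimal standalone sketch (illustrative only, not HotSpot source; the helper name is made up) of the predicate both clear routines above now compute:

// 64-byte (ZMM) clearing is used only when ZMM-wide vectors are available
// and CopyAVX3Threshold has been tuned down to 0.
static bool use_64byte_vectors_for_clear(int max_vector_size, int copy_avx3_threshold) {
  return (max_vector_size > 32) && (copy_avx3_threshold == 0);
}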
@@ -6109,8 +6109,8 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
// Fill 64-byte chunks
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
// If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
cmpptr(count, VM_Version::avx3_threshold());
// If number of bytes to fill < CopyAVX3Threshold, perform fill using AVX2
cmpptr(count, CopyAVX3Threshold);
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
@@ -9483,7 +9483,6 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
Label L_fill_zmm_sequence;
int shift = -1;
int avx3threshold = VM_Version::avx3_threshold();
switch(type) {
case T_BYTE: shift = 0;
break;
@@ -9499,10 +9498,10 @@ void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register va
fatal("Unhandled type: %s\n", type2name(type));
}
if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
if ((CopyAVX3Threshold != 0) || (MaxVectorSize == 32)) {
if (MaxVectorSize == 64) {
cmpq(count, avx3threshold >> shift);
cmpq(count, CopyAVX3Threshold >> shift);
jcc(Assembler::greater, L_fill_zmm_sequence);
}
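A simplified sketch (assumed semantics, hypothetical helper, not the stub generator itself) of the fill dispatch above when 64-byte vectors are enabled: the byte threshold is scaled to an element count by the type's shift before the comparison, and a threshold of 0 makes the ZMM path unconditional.

// shift = log2(element size in bytes): 0 for T_BYTE, 1 for T_SHORT, 2 for T_INT.
static bool fill_takes_zmm_path(int64_t element_count, int copy_avx3_threshold_bytes, int shift) {
  if (copy_avx3_threshold_bytes == 0) return true;               // no threshold: always use ZMMs
  return element_count > (copy_avx3_threshold_bytes >> shift);   // above threshold: use ZMMs
}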

View File

@@ -166,12 +166,12 @@ class StubGenerator: public StubCodeGenerator {
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
// for both special cases (various small block sizes) and aligned copy loop. This is the
// default configuration.
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
// - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
// better performance for disjoint copies. For conjoint/backward copy vector based
// copy performs better.
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
// - If user sets CopyAVX3Threshold=0, then special cases for small block sizes operate over
// 64 byte vector registers (ZMMs).
address generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry);
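The bullet points above describe the dispatch policy; below is a compact sketch (illustrative only and simplified, not part of the stub generator) of how copy length and CopyAVX3Threshold select the vector width for the main copy loop:

enum class CopyLoopWidth { YMM_32_BYTE, ZMM_64_BYTE };

static CopyLoopWidth select_copy_loop_width(size_t copy_length_bytes,
                                            int copy_avx3_threshold,
                                            int max_vector_size) {
  if (max_vector_size < 64)     return CopyLoopWidth::YMM_32_BYTE;  // no ZMMs configured
  if (copy_avx3_threshold == 0) return CopyLoopWidth::ZMM_64_BYTE;  // even small blocks use ZMMs
  return (copy_length_bytes > (size_t)copy_avx3_threshold)
             ? CopyLoopWidth::ZMM_64_BYTE    // bulk of cycles is in the main loop
             : CopyLoopWidth::YMM_32_BYTE;   // default configuration for short copies
}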

View File

@@ -144,7 +144,7 @@ address StubGenerator::generate_updateBytesAdler32() {
__ align32();
if (VM_Version::supports_avx512vl()) {
// AVX2 performs better for smaller inputs because of leaner post loop reduction sequence..
__ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
__ cmpl(s, MAX2(128, CopyAVX3Threshold));
__ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
__ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1)));
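Design note: wrapping the threshold in MAX2(128, CopyAVX3Threshold) keeps inputs of 128 bytes or fewer on the AVX2 path even when the threshold is tuned to 0, consistent with the comment above about AVX2's leaner post-loop reduction being faster for small inputs.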

View File

@@ -511,12 +511,12 @@ void StubGenerator::copy_bytes_backward(Register from, Register dest,
// - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
// for both special cases (various small block sizes) and aligned copy loop. This is the
// default configuration.
// - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
// for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
// - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
// better performance for disjoint copies. For conjoint/backward copy vector based
// copy performs better.
// - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
// - If user sets CopyAVX3Threshold=0, then special cases for small block sizes operate over
// 64 byte vector registers (ZMMs).
// Inputs:
@@ -575,8 +575,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
StubCodeMark mark(this, stub_id);
address start = __ pc();
int avx3threshold = VM_Version::avx3_threshold();
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
const int large_threshold = 2621440; // 2.5 MB
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
@@ -647,7 +646,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
__ cmpq(temp2, large_threshold);
__ jcc(Assembler::greaterEqual, L_copy_large);
}
if (avx3threshold != 0) {
if (CopyAVX3Threshold != 0) {
__ cmpq(count, threshold[shift]);
if (MaxVectorSize == 64) {
// Copy using 64 byte vectors.
@@ -659,7 +658,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, addres
}
}
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
// Partial copy to make dst address 32 byte aligned.
__ movq(temp2, to);
__ andq(temp2, 31);
@@ -913,8 +912,7 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
StubCodeMark mark(this, stub_id);
address start = __ pc();
int avx3threshold = VM_Version::avx3_threshold();
bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
@@ -979,12 +977,12 @@ address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, addres
// PRE-MAIN-POST loop for aligned copy.
__ BIND(L_entry);
if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
__ cmpq(temp1, threshold[shift]);
__ jcc(Assembler::greaterEqual, L_pre_main_post_64);
}
if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
// Partial copy to make dst address 32 byte aligned.
__ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
__ andq(temp2, 31);
@@ -1199,7 +1197,7 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
bool use64byteVector, Label& L_entry, Label& L_exit) {
Label L_entry_64, L_entry_96, L_entry_128;
Label L_entry_160, L_entry_192;
bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
int size_mat[][6] = {
/* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },

View File

@@ -1967,6 +1967,18 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
}
// CopyAVX3Threshold is the threshold at which 64-byte instructions are used
// for implementing the array copy and clear operations.
// The Intel platforms that support the serialize instruction
// have an improved implementation of 64-byte load/stores and so the default
// threshold is set to 0 for these platforms.
if (FLAG_IS_DEFAULT(CopyAVX3Threshold)) {
if (is_intel() && is_intel_server_family() && supports_serialize()) {
FLAG_SET_DEFAULT(CopyAVX3Threshold, 0);
} else {
FLAG_SET_DEFAULT(CopyAVX3Threshold, AVX3Threshold);
}
}
}
void VM_Version::print_platform_virtualization_info(outputStream* st) {
@@ -2122,17 +2134,6 @@ bool VM_Version::is_intel_darkmont() {
return is_intel() && is_intel_server_family() && (_model == 0xCC || _model == 0xDD);
}
// avx3_threshold() sets the threshold at which 64-byte instructions are used
// for implementing the array copy and clear operations.
// The Intel platforms that supports the serialize instruction
// has improved implementation of 64-byte load/stores and so the default
// threshold is set to 0 for these platforms.
int VM_Version::avx3_threshold() {
return (is_intel_server_family() &&
supports_serialize() &&
FLAG_IS_DEFAULT(AVX3Threshold)) ? 0 : AVX3Threshold;
}
void VM_Version::clear_apx_test_state() {
clear_apx_test_state_stub();
}
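Net effect of the two hunks above: the avx3_threshold() helper is removed and the platform tuning it encoded moves into flag ergonomics. Intel server-family parts that support the serialize instruction now default CopyAVX3Threshold to 0, while every other platform inherits the AVX3Threshold value, so copy and clear code generation there is unchanged by the split.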

View File

@@ -958,8 +958,6 @@ public:
static bool is_intel_darkmont();
static int avx3_threshold();
static bool is_intel_tsc_synched_at_init();
static void insert_features_names(VM_Version::VM_Features features, stringStream& ss);

View File

@@ -274,6 +274,17 @@ JVMFlag::Error AVX3ThresholdConstraintFunc(int value, bool verbose) {
return JVMFlag::SUCCESS;
}
JVMFlag::Error CopyAVX3ThresholdConstraintFunc(int value, bool verbose) {
if (value != 0 && !is_power_of_2(value)) {
JVMFlag::printError(verbose,
"CopyAVX3Threshold ( %d ) must be 0 or "
"a power of two value between 0 and MAX_INT\n", value);
return JVMFlag::VIOLATES_CONSTRAINT;
}
return JVMFlag::SUCCESS;
}
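For reference, a self-contained sketch (illustrative; HotSpot's actual is_power_of_2 lives in its utility headers) of the check this constraint applies, with the accepted values:

// A value satisfies the constraint iff it is 0 or a positive power of two;
// the flag's range(0, max_jint) already rules out negative values.
static bool copy_avx3_threshold_ok(int value) {
  return value == 0 || (value > 0 && (value & (value - 1)) == 0);
}
// Accepted: 0, 1, 2, 4, ..., 1 << 30.  Rejected: 3, 100, 4095, ...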
JVMFlag::Error ArraycopySrcPrefetchDistanceConstraintFunc(uintx value, bool verbose) {
if (value >= 4032) {
JVMFlag::printError(verbose,

View File

@@ -46,6 +46,7 @@
f(uintx, ArraycopyDstPrefetchDistanceConstraintFunc) \
f(uintx, ArraycopySrcPrefetchDistanceConstraintFunc) \
f(int, AVX3ThresholdConstraintFunc) \
f(int, CopyAVX3ThresholdConstraintFunc) \
f(uint, TypeProfileLevelConstraintFunc) \
f(uint, VerifyIterativeGVNConstraintFunc) \
f(intx, InitArrayShortSizeConstraintFunc) \