8382482: Optimize equals scenario in x86 scalar floating point min/max reduction loops

Reviewed-by: sviswanathan, epeter, sparasa
2026-07-23 17:40:35 +00:00 · 2026-05-28 20:16:12 +00:00 · 2026-05-28 20:16:12 +00:00 · bb4d2abb0f
commit bb4d2abb0f
parent 4eb67734b7
2 changed files with 56 additions and 36 deletions
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@ -1742,14 +1742,10 @@ static inline void movfp(MacroAssembler* masm, enum FP_PREC pt,
 // ja   -> b           # a
 // jp   -> NaN         # NaN
 // jb   -> a           # b
-// je                  #
-// |-jz -> a | b       # a & b
-// |    -> a           #
+// je   -> a | b       # a & b
 static void emit_fp_min_max(MacroAssembler* masm, XMMRegister dst,
-                            XMMRegister a, XMMRegister b,
-                            XMMRegister xmmt, Register rt,
+                            XMMRegister a, XMMRegister b, Register rt,
                            bool min, enum FP_PREC pt) {
-
  Label nan, zero, below, above, done;

  emit_fp_ucom(masm, pt, a, b);
@ -1759,31 +1755,26 @@ static void emit_fp_min_max(MacroAssembler* masm, XMMRegister dst,
  } else {
    __ jccb(Assembler::above, done);
  }
-
  __ jccb(Assembler::parity, nan);  // PF=1
  __ jccb(Assembler::below, below); // CF=1

  // equal
-  __ vpxor(xmmt, xmmt, xmmt, Assembler::AVX_128bit);
-  emit_fp_ucom(masm, pt, a, xmmt);
-
-  __ jccb(Assembler::equal, zero);
-  movfp(masm, pt, dst, a, rt);
-
-  __ jmp(done);
-
-  __ bind(zero);
+  // At this point a and b compared equal and neither is NaN (NaN is handled
+  // separately), so they are either bit-identical or zeros differing only in
+  // sign. A bitwise OR/AND thus leaves exponent and mantissa unchanged and is
+  // a low cost way to select the sign bit that a minimum or maximum requires
+  // when the inputs are +0 and -0.
  if (min) {
+    // Negative sign preserved when available (e.g., min(+0, -0) -> -0)
    __ vpor(dst, a, b, Assembler::AVX_128bit);
  } else {
+    // Positive sign preserved when available (e.g., max(+0, -0) -> +0)
    __ vpand(dst, a, b, Assembler::AVX_128bit);
  }
-
  __ jmp(done);

  __ bind(above);
  movfp(masm, pt, dst, min ? b : a, rt);
-
  __ jmp(done);

  __ bind(nan);
@ -7376,18 +7367,18 @@ instruct minmaxF_reg_avx10_2(regF dst, regF a, regF b)
  ins_pipe( pipe_slow );
 %}

-instruct minmaxF_reduction_reg_avx10_2(regF dst, regF a, regF b, regF xtmp, rRegI rtmp, rFlagsReg cr)
+instruct minmaxF_reduction_reg_avx10_2(regF dst, regF a, regF b, rRegI rtmp, rFlagsReg cr)
 %{
  predicate(VM_Version::supports_avx10_2() && VLoopReductions::is_reduction(n));
  match(Set dst (MaxF a b));
  match(Set dst (MinF a b));
-  effect(USE a, USE b, TEMP xtmp, TEMP rtmp, KILL cr);
+  effect(USE a, USE b, TEMP rtmp, KILL cr);

-  format %{ "minmaxF_reduction $dst, $a, $b \t! using $xtmp and $rtmp as TEMP" %}
+  format %{ "minmaxF_reduction $dst, $a, $b \t! using $rtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool min = (opcode == Op_MinF) ? true : false;
-    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp$$XMMRegister, $rtmp$$Register,
+    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $rtmp$$Register,
                    min, fp_prec_flt /*pt*/);
  %}
  ins_pipe( pipe_slow );
@ -7412,18 +7403,18 @@ instruct minmaxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atm
  ins_pipe( pipe_slow );
 %}

-instruct minmaxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xtmp, rRegI rtmp, rFlagsReg cr)
+instruct minmaxF_reduction_reg(legRegF dst, legRegF a, legRegF b, rRegI rtmp, rFlagsReg cr)
 %{
  predicate(!VM_Version::supports_avx10_2() && UseAVX > 0 && VLoopReductions::is_reduction(n));
  match(Set dst (MaxF a b));
  match(Set dst (MinF a b));
-  effect(USE a, USE b, TEMP xtmp, TEMP rtmp, KILL cr);
+  effect(USE a, USE b, TEMP rtmp, KILL cr);

-  format %{ "minmaxF_reduction $dst, $a, $b \t!using $xtmp and $rtmp as TEMP" %}
+  format %{ "minmaxF_reduction $dst, $a, $b \t!using $rtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool min = (opcode == Op_MinF) ? true : false;
-    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp$$XMMRegister, $rtmp$$Register,
+    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $rtmp$$Register,
                    min, fp_prec_flt /*pt*/);
  %}
  ins_pipe( pipe_slow );
@ -7445,18 +7436,18 @@ instruct minmaxD_reg_avx10_2(regD dst, regD a, regD b)
  ins_pipe( pipe_slow );
 %}

-instruct minmaxD_reduction_reg_avx10_2(regD dst, regD a, regD b, regD xtmp, rRegI rtmp, rFlagsReg cr)
+instruct minmaxD_reduction_reg_avx10_2(regD dst, regD a, regD b, rRegI rtmp, rFlagsReg cr)
 %{
  predicate(VM_Version::supports_avx10_2() && VLoopReductions::is_reduction(n));
  match(Set dst (MaxD a b));
  match(Set dst (MinD a b));
-  effect(USE a, USE b, TEMP xtmp, TEMP rtmp, KILL cr);
+  effect(USE a, USE b, TEMP rtmp, KILL cr);

-  format %{ "minmaxD_reduction $dst, $a, $b \t! using $xtmp and $rtmp as TEMP" %}
+  format %{ "minmaxD_reduction $dst, $a, $b \t! using $rtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool min = (opcode == Op_MinD) ? true : false;
-    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp$$XMMRegister, $rtmp$$Register,
+    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $rtmp$$Register,
                    min, fp_prec_dbl /*pt*/);
  %}
  ins_pipe( pipe_slow );
@ -7481,18 +7472,18 @@ instruct minmaxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atm
  ins_pipe( pipe_slow );
 %}

-instruct minmaxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xtmp, rRegL rtmp, rFlagsReg cr)
+instruct minmaxD_reduction_reg(legRegD dst, legRegD a, legRegD b, rRegL rtmp, rFlagsReg cr)
 %{
  predicate(!VM_Version::supports_avx10_2() && UseAVX > 0 && VLoopReductions::is_reduction(n));
  match(Set dst (MaxD a b));
  match(Set dst (MinD a b));
-  effect(USE a, USE b, TEMP xtmp, TEMP rtmp, KILL cr);
+  effect(USE a, USE b, TEMP rtmp, KILL cr);

-  format %{ "minmaxD_reduction $dst, $a, $b \t! using $xtmp and $rtmp as TEMP" %}
+  format %{ "minmaxD_reduction $dst, $a, $b \t! using $rtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    bool min = (opcode == Op_MinD) ? true : false;
-    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp$$XMMRegister, $rtmp$$Register,
+    emit_fp_min_max(masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $rtmp$$Register,
                    min, fp_prec_dbl /*pt*/);
  %}
  ins_pipe( pipe_slow );
--- a/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java
+++ b/test/micro/org/openjdk/bench/vm/compiler/FpMinMaxIntrinsics.java
@ -54,8 +54,37 @@ public class FpMinMaxIntrinsics {
        c2 = COUNT - (s2 = step());

        for (int i = 0; i < COUNT; i++) {
-            floats[i] = r.nextFloat();
-            doubles[i] = r.nextDouble();
+            final int mappedIndex = i % 100;
+
+            if (mappedIndex >= 0 && mappedIndex < 10) {
+                // NaN
+                floats[i] = Float.NaN;
+                doubles[i] = Double.NaN;
+            } else if (mappedIndex >= 20 && mappedIndex < 30) {
+                // Equal (+0.0)
+                floats[i] = +0.0f;
+                doubles[i] = +0.0;
+            } else if (mappedIndex >= 40 && mappedIndex < 50) {
+                // Equal (-0.0)
+                floats[i] = -0.0f;
+                doubles[i] = -0.0;
+            } else if (mappedIndex >= 60 && mappedIndex < 70) {
+                // Descending
+                floats[i] = (float) (COUNT - i);
+                doubles[i] = (double) (COUNT - i);
+            } else if (mappedIndex >= 80 && mappedIndex < 90) {
+                // Ascending
+                floats[i] = (float) i;
+                doubles[i] = (double) i;
+            } else if (mappedIndex >= 90 && mappedIndex < 100) {
+                // Random (negative)
+                floats[i] = -r.nextFloat();
+                doubles[i] = -r.nextDouble();
+            } else {
+                // Random (positive)
+                floats[i] = r.nextFloat();
+                doubles[i] = r.nextDouble();
+            }
        }
    }