8320347: Emulate vblendvp[sd] on ECore

Reviewed-by: sviswanathan, jbhateja
2026-06-30 14:20:29 +00:00 · 2023-11-30 16:10:54 +00:00 · 2023-11-30 16:10:54 +00:00 · 6aba6aa6f1
commit 6aba6aa6f1
parent 693847452f
8 changed files with 294 additions and 93 deletions
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@ -1094,32 +1094,78 @@ void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

+  /* Note on 'non-obvious' assembly sequence:
+   *
+   * While there are vminps/vmaxps instructions, there are two important differences between hardware
+   * and Java on how they handle floats:
+   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
+   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
+   *
+   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
+   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
+   *                (only useful when signs differ, noop otherwise)
+   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
+
+   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
+   *   btmp = (b < +0.0) ? a : b
+   *   atmp = (b < +0.0) ? b : a
+   *   Tmp  = Max_Float(atmp , btmp)
+   *   Res  = (atmp == NaN) ? atmp : Tmp
+   */
+
+  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
+  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
+  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
+  XMMRegister mask;
+
  if (!is_double_word && is_min) {
-    vblendvps(atmp, a, b, a, vlen_enc);
-    vblendvps(btmp, b, a, a, vlen_enc);
-    vminps(tmp, atmp, btmp, vlen_enc);
-    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
-    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
+    mask = a;
+    vblend = &MacroAssembler::vblendvps;
+    vmaxmin = &MacroAssembler::vminps;
+    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
-    vblendvps(btmp, b, a, b, vlen_enc);
-    vblendvps(atmp, a, b, b, vlen_enc);
-    vmaxps(tmp, atmp, btmp, vlen_enc);
-    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
-    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
+    mask = b;
+    vblend = &MacroAssembler::vblendvps;
+    vmaxmin = &MacroAssembler::vmaxps;
+    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
-    vblendvpd(atmp, a, b, a, vlen_enc);
-    vblendvpd(btmp, b, a, a, vlen_enc);
-    vminpd(tmp, atmp, btmp, vlen_enc);
-    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
-    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+    mask = a;
+    vblend = &MacroAssembler::vblendvpd;
+    vmaxmin = &MacroAssembler::vminpd;
+    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
-    vblendvpd(btmp, b, a, b, vlen_enc);
-    vblendvpd(atmp, a, b, b, vlen_enc);
-    vmaxpd(tmp, atmp, btmp, vlen_enc);
-    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
-    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
+    mask = b;
+    vblend = &MacroAssembler::vblendvpd;
+    vmaxmin = &MacroAssembler::vmaxpd;
+    vcmp = &MacroAssembler::vcmppd;
  }
+
+  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
+  XMMRegister maxmin, scratch;
+  if (dst == btmp) {
+    maxmin = btmp;
+    scratch = tmp;
+  } else {
+    maxmin = tmp;
+    scratch = btmp;
+  }
+
+  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
+  if (precompute_mask && !is_double_word) {
+    vpsrad(tmp, mask, 32, vlen_enc);
+    mask = tmp;
+  } else if (precompute_mask && is_double_word) {
+    vpxor(tmp, tmp, tmp, vlen_enc);
+    vpcmpgtq(tmp, tmp, mask, vlen_enc);
+    mask = tmp;
+  }
+
+  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
+  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
+  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
+  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
+  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
 }

 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
@ -5318,18 +5364,18 @@ void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegist
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
-    vblendvpd(dst, one, dst, src, vec_enc);
+    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
-    vblendvpd(dst, dst, src, xtmp1, vec_enc);
+    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
-    vblendvps(dst, one, dst, src, vec_enc);
+    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
-    vblendvps(dst, dst, src, xtmp1, vec_enc);
+    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
 }

--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@ -3566,6 +3566,56 @@ void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vecto
  }
 }

+// Vector float blend: picks src2 where mask is set and src1 elsewhere; emits Assembler::vblendvps or, when allowed, a vpand/vpandn/vpor emulation.
+// vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
+void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
+  // WARN: Allow dst == (src1|src2), mask == scratch (scratch is never written before the last read of mask; vpsrad below may run in place)
+  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; // emulate only when the E-core option is on and UseAVX > 1
+  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst; // need a real temp distinct from dst and both sources
+  bool dst_available = dst != mask && (dst != src1 || dst != src2); // dst may alias at most one source and never the mask
+  if (blend_emulation && scratch_available && dst_available) {
+    if (compute_mask) { // mask is a raw value: spread each dword's sign bit over the whole dword
+      vpsrad(scratch, mask, 32, vector_len);
+      mask = scratch;
+    }
+    if (dst == src1) { // src1 is consumed by the instruction that overwrites dst
+      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
+      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
+    } else { // dst may be src2 here; src2 is consumed by the instruction that overwrites dst
+      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
+      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
+    }
+    vpor(dst, dst, scratch, vector_len); // combine the two masked halves
+  } else {
+    Assembler::vblendvps(dst, src1, src2, mask, vector_len); // preconditions not met: use the hardware blend
+  }
+}
+
+// Vector double blend: picks src2 where mask is set and src1 elsewhere. vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
+void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
+  // WARN: Allow dst == (src1|src2); mask == scratch only when !compute_mask (vpxor below would clobber the mask before it is read)
+  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; // emulate only when the E-core option is on and UseAVX > 1
+  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask); // need a real temp distinct from dst and both sources (and from mask if it must be expanded)
+  bool dst_available = dst != mask && (dst != src1 || dst != src2); // dst may alias at most one source and never the mask
+  if (blend_emulation && scratch_available && dst_available) {
+    if (compute_mask) { // mask is a raw value: build a full-width per-qword mask from it
+      vpxor(scratch, scratch, scratch, vector_len); // scratch = 0
+      vpcmpgtq(scratch, scratch, mask, vector_len); // presumably scratch = (0 > mask) per signed qword, i.e. all-ones where the sign bit is set - confirm operand order
+      mask = scratch;
+    }
+    if (dst == src1) { // src1 is consumed by the instruction that overwrites dst
+      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
+      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
+    } else { // dst may be src2 here; src2 is consumed by the instruction that overwrites dst
+      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
+      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
+    }
+    vpor(dst, dst, scratch, vector_len); // combine the two masked halves
+  } else {
+    Assembler::vblendvpd(dst, src1, src2, mask, vector_len); // preconditions not met: use the hardware blend
+  }
+}
+
 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, nds, src, vector_len);
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@ -1130,6 +1130,10 @@ public:
  using Assembler::vbroadcastss;
  void vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);

+  // Vector float blend
+  void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
+  void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg);
+
  void divsd(XMMRegister dst, XMMRegister    src) { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, Address        src) { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@ -7801,7 +7801,7 @@ instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 %}

 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
-  predicate(UseAVX > 0 &&
+  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
            n->in(2)->bottom_type()->isa_vectmask() == NULL &&
            Matcher::vector_length_in_bytes(n) <= 32 &&
            is_integral_type(Matcher::vector_element_basic_type(n)));
@ -7815,7 +7815,7 @@ instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 %}

 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
-  predicate(UseAVX > 0 &&
+  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
            n->in(2)->bottom_type()->isa_vectmask() == NULL &&
            Matcher::vector_length_in_bytes(n) <= 32 &&
            !is_integral_type(Matcher::vector_element_basic_type(n)));
@ -7828,6 +7828,22 @@ instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
  ins_pipe( pipe_slow );
 %}

+instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
+  predicate(UseAVX > 0 && EnableX86ECoreOpts &&
+            n->in(2)->bottom_type()->isa_vectmask() == NULL &&
+            Matcher::vector_length_in_bytes(n) <= 32);
+  match(Set dst (VectorBlend (Binary src1 src2) mask));
+  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
+  effect(TEMP vtmp, TEMP dst);
+  ins_encode %{
+    int vlen_enc = vector_length_encoding(this); // NOTE(review): predicate admits UseAVX == 1 while the MacroAssembler blend emulation requires UseAVX > 1 - confirm vpand/vpandn/vpor are valid for every vector length matched here
+    __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc); // vtmp = src1 where mask is clear
+    __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc); // dst  = src2 where mask is set
+    __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc); // dst  = bitwise blend of src1/src2; mask is used as-is, no sign-bit expansion
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
  predicate(Matcher::vector_length_in_bytes(n) == 64 &&
            n->in(2)->bottom_type()->isa_vectmask() == NULL);
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@ -4478,34 +4478,15 @@ instruct loadD(regD dst, memory mem)
  ins_pipe(pipe_slow); // XXX
 %}

-
-// Following pseudo code describes the algorithm for max[FD]:
-// Min algorithm is on similar lines
-//  btmp = (b < +0.0) ? a : b
-//  atmp = (b < +0.0) ? b : a
-//  Tmp  = Max_Float(atmp , btmp)
-//  Res  = (atmp == NaN) ? atmp : Tmp
-
 // max = java.lang.Math.max(float a, float b)
 instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
  match(Set dst (MaxF a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
-  format %{
-     "vblendvps        $btmp,$b,$a,$b           \n\t"
-     "vblendvps        $atmp,$a,$b,$b           \n\t"
-     "vmaxss           $tmp,$atmp,$btmp         \n\t"
-     "vcmpps.unordered $btmp,$atmp,$atmp        \n\t"
-     "vblendvps        $dst,$tmp,$atmp,$btmp    \n\t"
-  %}
+  format %{ "maxF $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
  ins_encode %{
-    int vector_len = Assembler::AVX_128bit;
-    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
-    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
-    __ vmaxss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
-    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
-    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
- %}
+    __ vminmax_fp(Op_MaxV, T_FLOAT, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, Assembler::AVX_128bit);
+  %}
  ins_pipe( pipe_slow );
 %}

@ -4527,20 +4508,9 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
  match(Set dst (MaxD a b));
  effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
-  format %{
-     "vblendvpd        $btmp,$b,$a,$b            \n\t"
-     "vblendvpd        $atmp,$a,$b,$b            \n\t"
-     "vmaxsd           $tmp,$atmp,$btmp          \n\t"
-     "vcmppd.unordered $btmp,$atmp,$atmp         \n\t"
-     "vblendvpd        $dst,$tmp,$atmp,$btmp     \n\t"
-  %}
+  format %{ "maxD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
  ins_encode %{
-    int vector_len = Assembler::AVX_128bit;
-    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
-    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
-    __ vmaxsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
-    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
-    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vminmax_fp(Op_MaxV, T_DOUBLE, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
 %}
@ -4563,20 +4533,9 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
  match(Set dst (MinF a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
-  format %{
-     "vblendvps        $atmp,$a,$b,$a             \n\t"
-     "vblendvps        $btmp,$b,$a,$a             \n\t"
-     "vminss           $tmp,$atmp,$btmp           \n\t"
-     "vcmpps.unordered $btmp,$atmp,$atmp          \n\t"
-     "vblendvps        $dst,$tmp,$atmp,$btmp      \n\t"
-  %}
+  format %{ "minF $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
  ins_encode %{
-    int vector_len = Assembler::AVX_128bit;
-    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
-    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
-    __ vminss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
-    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
-    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vminmax_fp(Op_MinV, T_FLOAT, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
 %}
@ -4599,20 +4558,9 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
  match(Set dst (MinD a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
-  format %{
-     "vblendvpd        $atmp,$a,$b,$a           \n\t"
-     "vblendvpd        $btmp,$b,$a,$a           \n\t"
-     "vminsd           $tmp,$atmp,$btmp         \n\t"
-     "vcmppd.unordered $btmp,$atmp,$atmp        \n\t"
-     "vblendvpd        $dst,$tmp,$atmp,$btmp    \n\t"
-  %}
+    format %{ "minD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
  ins_encode %{
-    int vector_len = Assembler::AVX_128bit;
-    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
-    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
-    __ vminsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
-    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
-    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
+    __ vminmax_fp(Op_MinV, T_DOUBLE, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
 %}
--- a/test/hotspot/jtreg/compiler/vectorization/TestSignumVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestSignumVector.java
@ -33,6 +33,8 @@

 package compiler.vectorization;

+import java.util.Random;
+
 import compiler.lib.ir_framework.*;

 public class TestSignumVector {
@ -62,12 +64,22 @@ public class TestSignumVector {
  public void kernel_test_signum_double() {
      dinp = new double[ARRLEN];
      dout = new double[ARRLEN];
+      Random rnd = new Random(20);
      for(int i = 0 ; i < ARRLEN; i++) {
-          dinp[i] = (double)i*1.4;
+          dinp[i] = (i-ARRLEN/2)*rnd.nextDouble();
      }
      for (int i = 0; i < ITERS; i++) {
          test_signum_double(dout , dinp);
      }
+      for(int i = 0 ; i < ARRLEN; i++) {
+        if (i-ARRLEN/2<0) {
+            if (dout[i] != -1.0)  throw new RuntimeException("Expected negative numbers in first half of array: " + java.util.Arrays.toString(dout));
+        } else if (i-ARRLEN/2==0) {
+            if (dout[i] != 0)     throw new RuntimeException("Expected zero in the middle of array: " + java.util.Arrays.toString(dout));
+        } else {
+            if (dout[i] != 1.0)   throw new RuntimeException("Expected positive numbers in second half of array: " + java.util.Arrays.toString(dout));
+        }
+    }
  }

  @Test
@ -82,11 +94,21 @@ public class TestSignumVector {
  public void kernel_test_round() {
      finp = new float[ARRLEN];
      fout = new float[ARRLEN];
+      Random rnd = new Random(20);
      for(int i = 0 ; i < ARRLEN; i++) {
-          finp[i] = (float)i*1.4f;
+          finp[i] = (i-ARRLEN/2)*rnd.nextFloat();
      }
      for (int i = 0; i < ITERS; i++) {
          test_signum_float(fout , finp);
      }
+      for(int i = 0 ; i < ARRLEN; i++) {
+        if (i-ARRLEN/2<0) {
+            if (fout[i] != -1.0)  throw new RuntimeException("Expected negative numbers in first half of array: " + java.util.Arrays.toString(fout));
+        } else if (i-ARRLEN/2==0) {
+            if (fout[i] != 0)     throw new RuntimeException("Expected zero in the middle of array: " + java.util.Arrays.toString(fout));
+        } else {
+            if (fout[i] != 1.0)   throw new RuntimeException("Expected positive numbers in second half of array: " + java.util.Arrays.toString(fout));
+        }
+    }
  }
 }
--- a/test/hotspot/jtreg/compiler/vectorization/runner/BasicDoubleOpTest.java
+++ b/test/hotspot/jtreg/compiler/vectorization/runner/BasicDoubleOpTest.java
@ -42,6 +42,7 @@
 package compiler.vectorization.runner;

 import compiler.lib.ir_framework.*;
+import java.util.Random;

 public class BasicDoubleOpTest extends VectorizationTestRunner {

@ -50,11 +51,63 @@ public class BasicDoubleOpTest extends VectorizationTestRunner {
    private double[] a;
    private double[] b;
    private double[] c;
+    private double[] d;
+    private double[] e;

    public BasicDoubleOpTest() {
+        // Positive test values                       sign |   exponent | mantissa
+        double smallPositive   = Double.longBitsToDouble(0<<63 | 0x03f << 52 | 0x30000f);
+        double positive        = Double.longBitsToDouble(0<<63 | 0x07f << 52 | 0x30000f);
+        double bigPositive     = Double.longBitsToDouble(0<<63 | 0x07f << 52 | 0x30100f);
+        double biggerPositive  = Double.longBitsToDouble(0<<63 | 0x7fe << 52 | 0x30000f);
+        double maxPositive     = Double.MAX_VALUE;
+
+        // Special positive
+        double nan1  = Double.longBitsToDouble(0<<63 | 0x7ff << 52 | 0x7fffff);
+        double nan2  = Double.longBitsToDouble(0<<63 | 0x7ff << 52 | 0x30000f);
+        double inf   = Double.longBitsToDouble(0<<63 | 0x7ff << 52);
+        double zero  = 0.0;
+
+        // Negative test values                       sign |   exponent | mantissa
+        double smallNegative   = Double.longBitsToDouble(1<<63 | 0x003 << 52 | 0x30000f);
+        double negative        = Double.longBitsToDouble(1<<63 | 0x783 << 52 | 0x30100f);
+        double bigNegative     = Double.longBitsToDouble(1<<63 | 0x783 << 52 | 0x30000f);
+        double biggerNegative  = Double.longBitsToDouble(1<<63 | 0x786 << 52 | 0x30000f);
+        double maxNegative     = Double.longBitsToDouble(1<<63 | 0x7fe << 52 | 0x7fffff);
+
+        // Special negative
+        double nNan1  = Double.longBitsToDouble(1<<63 | 0x7ff << 52 | 0x7fffff);
+        double nNan2  = Double.longBitsToDouble(1<<63 | 0x7ff << 52 | 0x30000f);
+        double nInf   = Double.longBitsToDouble(1<<63 | 0x7ff << 52);
+        double nZero  = -0.0;
+
+        double[] numberList = new double[] {
+            nInf, maxNegative, biggerNegative, bigNegative, negative, smallNegative, nZero,
+            zero, smallPositive, positive, bigPositive, biggerPositive, maxPositive, inf,
+            nan1, nan2, nNan1, nNan2
+        };
+
+        Random rnd = new Random(10);
        a = new double[SIZE];
        b = new double[SIZE];
        c = new double[SIZE];
+        d = new double[SIZE];
+        e = new double[SIZE];
+
+        for (int i = 0; i < SIZE;) {
+            for (int j = 0; j < numberList.length && i < SIZE; j++, i++) {
+                for (int k = j; k < numberList.length && i < SIZE; k++, i++) {
+                    if (rnd.nextBoolean()) {
+                        d[i] = numberList[j];
+                        e[i] = numberList[k];
+                    } else {
+                        d[i] = numberList[k];
+                        e[i] = numberList[j];
+                    }
+                }
+            }
+        }
+
        for (int i = 0; i < SIZE; i++) {
            a[i] = 850.0 * i + 22222.22;
            b[i] = -12345.678;
@ -179,7 +232,7 @@ public class BasicDoubleOpTest extends VectorizationTestRunner {
    public double[] vectorMax() {
        double[] res = new double[SIZE];
        for (int i = 0; i < SIZE; i++) {
-            res[i] = Math.max(a[i], b[i]);
+            res[i] = Math.max(d[i], e[i]);
        }
        return res;
    }
@ -190,7 +243,7 @@ public class BasicDoubleOpTest extends VectorizationTestRunner {
    public double[] vectorMin() {
        double[] res = new double[SIZE];
        for (int i = 0; i < SIZE; i++) {
-            res[i] = Math.min(a[i], b[i]);
+            res[i] = Math.min(d[i], e[i]);
        }
        return res;
    }
--- a/test/hotspot/jtreg/compiler/vectorization/runner/BasicFloatOpTest.java
+++ b/test/hotspot/jtreg/compiler/vectorization/runner/BasicFloatOpTest.java
@ -42,6 +42,7 @@
 package compiler.vectorization.runner;

 import compiler.lib.ir_framework.*;
+import java.util.Random;

 public class BasicFloatOpTest extends VectorizationTestRunner {

@ -50,11 +51,72 @@ public class BasicFloatOpTest extends VectorizationTestRunner {
    private float[] a;
    private float[] b;
    private float[] c;
+    private float[] d;
+    private float[] e;

    public BasicFloatOpTest() {
+        // Positive test values                       sign |   exponent | mantissa
+        float smallPositive   = Float.intBitsToFloat(0<<31 | 0x3f << 23 | 0x30000f);
+        float positive        = Float.intBitsToFloat(0<<31 | 0x7f << 23 | 0x30000f);
+        float bigPositive     = Float.intBitsToFloat(0<<31 | 0x7f << 23 | 0x30100f);
+        float biggerPositive  = Float.intBitsToFloat(0<<31 | 0xfe << 23 | 0x30000f);
+        float maxPositive     = Float.MAX_VALUE;
+
+        // Special positive
+        float nan1  = Float.intBitsToFloat(0<<31 | 0xff << 23 | 0x7fffff);
+        float nan2  = Float.intBitsToFloat(0<<31 | 0xff << 23 | 0x30000f);
+        float inf   = Float.intBitsToFloat(0<<31 | 0xff << 23);
+        float zero  = 0.0f;
+
+        // Negative test values                       sign |   exponent | mantissa
+        float smallNegative   = Float.intBitsToFloat(1<<31 | 0x03 << 23 | 0x30000f);
+        float negative        = Float.intBitsToFloat(1<<31 | 0x83 << 23 | 0x30100f);
+        float bigNegative     = Float.intBitsToFloat(1<<31 | 0x83 << 23 | 0x30000f);
+        float biggerNegative  = Float.intBitsToFloat(1<<31 | 0x86 << 23 | 0x30000f);
+        float maxNegative     = Float.intBitsToFloat(1<<31 | 0xfe << 23 | 0x7fffff);
+
+        // Special negative
+        float nNan1  = Float.intBitsToFloat(1<<31 | 0xff << 23 | 0x7fffff);
+        float nNan2  = Float.intBitsToFloat(1<<31 | 0xff << 23 | 0x30000f);
+        float nInf   = Float.intBitsToFloat(1<<31 | 0xff << 23);
+        float nZero  = -0.0f;
+
+        float[] orderedList = new float[] {
+            nInf, maxNegative, biggerNegative, bigNegative, negative, smallNegative, nZero,
+            zero, smallPositive, positive, bigPositive, biggerPositive, maxPositive, inf
+        };
+
+        float[] NaNs = new float[] {
+            nan1, nan2, nNan1, nNan2
+        };
+
+        float[] numberList = new float[] {
+            nInf, maxNegative, biggerNegative, bigNegative, negative, smallNegative, nZero,
+            zero, smallPositive, positive, bigPositive, biggerPositive, maxPositive, inf,
+            nan1, nan2, nNan1, nNan2
+        };
+
+        Random rnd = new Random(11);
        a = new float[SIZE];
        b = new float[SIZE];
        c = new float[SIZE];
+        d = new float[SIZE];
+        e = new float[SIZE];
+
+        for (int i = 0; i < SIZE;) {
+            for (int j = 0; j < numberList.length && i < SIZE; j++, i++) {
+                for (int k = j; k < numberList.length && i < SIZE; k++, i++) {
+                    if (rnd.nextBoolean()) {
+                        d[i] = numberList[j];
+                        e[i] = numberList[k];
+                    } else {
+                        d[i] = numberList[k];
+                        e[i] = numberList[j];
+                    }
+                }
+            }
+        }
+
        for (int i = 0; i < SIZE; i++) {
            a[i] = 850.0f * i + 22222.22f;
            b[i] = -12345.678f;
@ -146,7 +208,7 @@ public class BasicFloatOpTest extends VectorizationTestRunner {
    public float[] vectorMax() {
        float[] res = new float[SIZE];
        for (int i = 0; i < SIZE; i++) {
-            res[i] = Math.max(a[i], b[i]);
+            res[i] = Math.max(d[i], e[i]);
        }
        return res;
    }
@ -157,7 +219,7 @@ public class BasicFloatOpTest extends VectorizationTestRunner {
    public float[] vectorMin() {
        float[] res = new float[SIZE];
        for (int i = 0; i < SIZE; i++) {
-            res[i] = Math.min(a[i], b[i]);
+            res[i] = Math.min(d[i], e[i]);
        }
        return res;
    }