8346256: Optimize UMIN/UMAX reduction operations for x86 targets

Reviewed-by: qamai, sviswanathan
This commit is contained in:
Jatin Bhateja 2026-03-06 04:39:42 +00:00
parent e6406641d7
commit 310d5a1562
5 changed files with 95 additions and 18 deletions

View File

@ -5442,6 +5442,13 @@ void Assembler::pmovsxwd(XMMRegister dst, XMMRegister src) {
emit_int16(0x23, (0xC0 | encode));
}
// PMOVZXWD, register-to-register form: the zero-extending sibling of
// pmovsxwd. Emitted as a 66-prefixed instruction in the 0F 38 opcode map
// with opcode byte 0x33, followed by a ModRM byte whose top two bits are
// set (0xC0), i.e. both operands are registers. Requires SSE4.1.
void Assembler::pmovzxwd(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionAttr attrs(AVX_128bit,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ true);
  // No second source operand for this instruction, hence xnoreg.
  const int reg_bits = simd_prefix_and_encode(dst, xnoreg, src,
                                              VEX_SIMD_66, VEX_OPCODE_0F_38,
                                              &attrs);
  emit_int16(0x33, (0xC0 | reg_bits));
}
void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);

View File

@ -1965,6 +1965,7 @@ private:
void pmovsxbq(XMMRegister dst, XMMRegister src);
void pmovsxbw(XMMRegister dst, XMMRegister src);
void pmovsxwd(XMMRegister dst, XMMRegister src);
void pmovzxwd(XMMRegister dst, XMMRegister src);
void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len);
void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len);
void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);

View File

@ -1729,6 +1729,24 @@ void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegis
default: assert(false, "wrong type");
}
break;
case Op_UMinReductionV:
switch (typ) {
case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
default: assert(false, "wrong type");
}
break;
case Op_UMaxReductionV:
switch (typ) {
case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
default: assert(false, "wrong type");
}
break;
case Op_AddReductionVF: addss(dst, src); break;
case Op_AddReductionVD: addsd(dst, src); break;
case Op_AddReductionVI:
@ -1792,6 +1810,24 @@ void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegis
default: assert(false, "wrong type");
}
break;
case Op_UMinReductionV:
switch (typ) {
case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
case T_INT: vpminud(dst, src1, src2, vector_len); break;
case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
default: assert(false, "wrong type");
}
break;
case Op_UMaxReductionV:
switch (typ) {
case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
default: assert(false, "wrong type");
}
break;
case Op_AddReductionVI:
switch (typ) {
case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
@ -2058,7 +2094,11 @@ void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMReg
psrldq(vtmp2, 1);
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
movdl(vtmp2, src1);
pmovsxbd(vtmp1, vtmp1);
if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
pmovzxbd(vtmp1, vtmp1);
} else {
pmovsxbd(vtmp1, vtmp1);
}
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
pextrb(dst, vtmp1, 0x0);
movsbl(dst, dst);
@ -2135,7 +2175,11 @@ void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMReg
reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
}
movdl(vtmp2, src1);
pmovsxwd(vtmp1, vtmp1);
if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
pmovzxwd(vtmp1, vtmp1);
} else {
pmovsxwd(vtmp1, vtmp1);
}
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
pextrw(dst, vtmp1, 0x0);
movswl(dst, dst);

View File

@ -3341,6 +3341,18 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
return false;
}
break;
case Op_UMinReductionV:
case Op_UMaxReductionV:
if (UseAVX == 0) {
return false;
}
if (bt == T_LONG && !VM_Version::supports_avx512vl()) {
return false;
}
if (UseAVX > 2 && size_in_bits == 512 && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_MaxV:
case Op_MinV:
if (UseSSE < 4 && is_integral_type(bt)) {
@ -19371,6 +19383,8 @@ instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
@ -19392,6 +19406,8 @@ instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
@ -19411,6 +19427,8 @@ instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
@ -19639,6 +19657,8 @@ instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
@ -19657,6 +19677,8 @@ instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
@ -19678,6 +19700,8 @@ instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
match(Set dst (XorReductionV src1 src2));
match(Set dst (MinReductionV src1 src2));
match(Set dst (MaxReductionV src1 src2));
match(Set dst (UMinReductionV src1 src2));
match(Set dst (UMaxReductionV src1 src2));
effect(TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{

View File

@ -1,4 +1,5 @@
/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -140,7 +141,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testByteUMin() {
byte got = ByteVector.fromArray(B_SPECIES, ba, 0).reduceLanes(VectorOperators.UMIN);
verifyByte(B_SPECIES, got, BYTE_UMIN_IDENTITY, VectorMath::minUnsigned, false);
@ -148,7 +149,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testByteUMax() {
byte got = ByteVector.fromArray(B_SPECIES, ba, 0).reduceLanes(VectorOperators.UMAX);
verifyByte(B_SPECIES, got, BYTE_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
@ -156,7 +157,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testByteUMinMasked() {
byte got = ByteVector.fromArray(B_SPECIES, ba, 0)
.reduceLanes(VectorOperators.UMIN,
@ -166,7 +167,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testByteUMaxMasked() {
byte got = ByteVector.fromArray(B_SPECIES, ba, 0)
.reduceLanes(VectorOperators.UMAX,
@ -178,7 +179,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testShortUMin() {
short got = ShortVector.fromArray(S_SPECIES, sa, 0).reduceLanes(VectorOperators.UMIN);
verifyShort(S_SPECIES, got, SHORT_UMIN_IDENTITY, VectorMath::minUnsigned, false);
@ -186,7 +187,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testShortUMax() {
short got = ShortVector.fromArray(S_SPECIES, sa, 0).reduceLanes(VectorOperators.UMAX);
verifyShort(S_SPECIES, got, SHORT_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
@ -194,7 +195,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testShortUMinMasked() {
short got = ShortVector.fromArray(S_SPECIES, sa, 0)
.reduceLanes(VectorOperators.UMIN,
@ -204,7 +205,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testShortUMaxMasked() {
short got = ShortVector.fromArray(S_SPECIES, sa, 0)
.reduceLanes(VectorOperators.UMAX,
@ -216,7 +217,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testIntUMin() {
int got = IntVector.fromArray(I_SPECIES, ia, 0).reduceLanes(VectorOperators.UMIN);
verifyInt(I_SPECIES, got, INT_UMIN_IDENTITY, VectorMath::minUnsigned, false);
@ -224,7 +225,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testIntUMax() {
int got = IntVector.fromArray(I_SPECIES, ia, 0).reduceLanes(VectorOperators.UMAX);
verifyInt(I_SPECIES, got, INT_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
@ -232,7 +233,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testIntUMinMasked() {
int got = IntVector.fromArray(I_SPECIES, ia, 0)
.reduceLanes(VectorOperators.UMIN,
@ -242,7 +243,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
public static void testIntUMaxMasked() {
int got = IntVector.fromArray(I_SPECIES, ia, 0)
.reduceLanes(VectorOperators.UMAX,
@ -254,7 +255,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
public static void testLongUMin() {
long got = LongVector.fromArray(L_SPECIES, la, 0).reduceLanes(VectorOperators.UMIN);
verifyLong(L_SPECIES, got, LONG_UMIN_IDENTITY, VectorMath::minUnsigned, false);
@ -262,7 +263,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
public static void testLongUMax() {
long got = LongVector.fromArray(L_SPECIES, la, 0).reduceLanes(VectorOperators.UMAX);
verifyLong(L_SPECIES, got, LONG_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
@ -270,7 +271,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
public static void testLongUMinMasked() {
long got = LongVector.fromArray(L_SPECIES, la, 0)
.reduceLanes(VectorOperators.UMIN,
@ -280,7 +281,7 @@ public class VectorUMinMaxReductionTest {
@Test
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
applyIfCPUFeature = {"asimd", "true"})
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
public static void testLongUMaxMasked() {
long got = LongVector.fromArray(L_SPECIES, la, 0)
.reduceLanes(VectorOperators.UMAX,