mirror of
https://github.com/openjdk/jdk.git
synced 2026-06-08 03:25:05 +00:00
8346256: Optimize UMIN/UMAX reduction operations for x86 targets
Reviewed-by: qamai, sviswanathan
This commit is contained in:
parent
e6406641d7
commit
310d5a1562
@ -5442,6 +5442,13 @@ void Assembler::pmovsxwd(XMMRegister dst, XMMRegister src) {
|
||||
emit_int16(0x23, (0xC0 | encode));
|
||||
}
|
||||
|
||||
// Emits the register-to-register form of PMOVZXWD (Intel SDM encoding
// 66 0F 38 33 /r), which zero-extends packed 16-bit words from src into
// 32-bit doublewords in dst. Unsigned counterpart of pmovsxwd (opcode 0x23).
//
// dst - destination XMM register
// src - source XMM register
void Assembler::pmovzxwd(XMMRegister dst, XMMRegister src) {
|
||||
// The instruction was introduced with SSE4.1; callers must have checked for it.
assert(VM_Version::supports_sse4_1(), "");
|
||||
// 128-bit operation, no REX.W, no opmask register.
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||
// xnoreg is passed in the middle (second-source) slot: this instruction takes a single source.
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
|
||||
// Opcode byte 0x33 followed by a ModRM byte with mod=11 (register-direct operands).
emit_int16(0x33, (0xC0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
|
||||
assert(VM_Version::supports_avx(), "");
|
||||
InstructionMark im(this);
|
||||
|
||||
@ -1965,6 +1965,7 @@ private:
|
||||
void pmovsxbq(XMMRegister dst, XMMRegister src);
|
||||
void pmovsxbw(XMMRegister dst, XMMRegister src);
|
||||
void pmovsxwd(XMMRegister dst, XMMRegister src);
|
||||
void pmovzxwd(XMMRegister dst, XMMRegister src);
|
||||
void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
|
||||
@ -1729,6 +1729,24 @@ void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegis
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_UMinReductionV:
|
||||
switch (typ) {
|
||||
case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_UMaxReductionV:
|
||||
switch (typ) {
|
||||
case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
|
||||
case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_AddReductionVF: addss(dst, src); break;
|
||||
case Op_AddReductionVD: addsd(dst, src); break;
|
||||
case Op_AddReductionVI:
|
||||
@ -1792,6 +1810,24 @@ void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegis
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_UMinReductionV:
|
||||
switch (typ) {
|
||||
case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
|
||||
case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
|
||||
case T_INT: vpminud(dst, src1, src2, vector_len); break;
|
||||
case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_UMaxReductionV:
|
||||
switch (typ) {
|
||||
case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
|
||||
case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
|
||||
case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
|
||||
case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
|
||||
default: assert(false, "wrong type");
|
||||
}
|
||||
break;
|
||||
case Op_AddReductionVI:
|
||||
switch (typ) {
|
||||
case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
|
||||
@ -2058,7 +2094,11 @@ void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMReg
|
||||
psrldq(vtmp2, 1);
|
||||
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
|
||||
movdl(vtmp2, src1);
|
||||
pmovsxbd(vtmp1, vtmp1);
|
||||
if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
|
||||
pmovzxbd(vtmp1, vtmp1);
|
||||
} else {
|
||||
pmovsxbd(vtmp1, vtmp1);
|
||||
}
|
||||
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
|
||||
pextrb(dst, vtmp1, 0x0);
|
||||
movsbl(dst, dst);
|
||||
@ -2135,7 +2175,11 @@ void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMReg
|
||||
reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
|
||||
}
|
||||
movdl(vtmp2, src1);
|
||||
pmovsxwd(vtmp1, vtmp1);
|
||||
if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
|
||||
pmovzxwd(vtmp1, vtmp1);
|
||||
} else {
|
||||
pmovsxwd(vtmp1, vtmp1);
|
||||
}
|
||||
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
|
||||
pextrw(dst, vtmp1, 0x0);
|
||||
movswl(dst, dst);
|
||||
|
||||
@ -3341,6 +3341,18 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_UMinReductionV:
|
||||
case Op_UMaxReductionV:
|
||||
if (UseAVX == 0) {
|
||||
return false;
|
||||
}
|
||||
if (bt == T_LONG && !VM_Version::supports_avx512vl()) {
|
||||
return false;
|
||||
}
|
||||
if (UseAVX > 2 && size_in_bits == 512 && !VM_Version::supports_avx512vl()) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_MaxV:
|
||||
case Op_MinV:
|
||||
if (UseSSE < 4 && is_integral_type(bt)) {
|
||||
@ -19371,6 +19383,8 @@ instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
@ -19392,6 +19406,8 @@ instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
@ -19411,6 +19427,8 @@ instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
@ -19639,6 +19657,8 @@ instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
@ -19657,6 +19677,8 @@ instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
@ -19678,6 +19700,8 @@ instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtm
|
||||
match(Set dst (XorReductionV src1 src2));
|
||||
match(Set dst (MinReductionV src1 src2));
|
||||
match(Set dst (MaxReductionV src1 src2));
|
||||
match(Set dst (UMinReductionV src1 src2));
|
||||
match(Set dst (UMaxReductionV src1 src2));
|
||||
effect(TEMP vtmp1, TEMP vtmp2);
|
||||
format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -140,7 +141,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testByteUMin() {
|
||||
byte got = ByteVector.fromArray(B_SPECIES, ba, 0).reduceLanes(VectorOperators.UMIN);
|
||||
verifyByte(B_SPECIES, got, BYTE_UMIN_IDENTITY, VectorMath::minUnsigned, false);
|
||||
@ -148,7 +149,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testByteUMax() {
|
||||
byte got = ByteVector.fromArray(B_SPECIES, ba, 0).reduceLanes(VectorOperators.UMAX);
|
||||
verifyByte(B_SPECIES, got, BYTE_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
|
||||
@ -156,7 +157,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testByteUMinMasked() {
|
||||
byte got = ByteVector.fromArray(B_SPECIES, ba, 0)
|
||||
.reduceLanes(VectorOperators.UMIN,
|
||||
@ -166,7 +167,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testByteUMaxMasked() {
|
||||
byte got = ByteVector.fromArray(B_SPECIES, ba, 0)
|
||||
.reduceLanes(VectorOperators.UMAX,
|
||||
@ -178,7 +179,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testShortUMin() {
|
||||
short got = ShortVector.fromArray(S_SPECIES, sa, 0).reduceLanes(VectorOperators.UMIN);
|
||||
verifyShort(S_SPECIES, got, SHORT_UMIN_IDENTITY, VectorMath::minUnsigned, false);
|
||||
@ -186,7 +187,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testShortUMax() {
|
||||
short got = ShortVector.fromArray(S_SPECIES, sa, 0).reduceLanes(VectorOperators.UMAX);
|
||||
verifyShort(S_SPECIES, got, SHORT_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
|
||||
@ -194,7 +195,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testShortUMinMasked() {
|
||||
short got = ShortVector.fromArray(S_SPECIES, sa, 0)
|
||||
.reduceLanes(VectorOperators.UMIN,
|
||||
@ -204,7 +205,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testShortUMaxMasked() {
|
||||
short got = ShortVector.fromArray(S_SPECIES, sa, 0)
|
||||
.reduceLanes(VectorOperators.UMAX,
|
||||
@ -216,7 +217,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testIntUMin() {
|
||||
int got = IntVector.fromArray(I_SPECIES, ia, 0).reduceLanes(VectorOperators.UMIN);
|
||||
verifyInt(I_SPECIES, got, INT_UMIN_IDENTITY, VectorMath::minUnsigned, false);
|
||||
@ -224,7 +225,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testIntUMax() {
|
||||
int got = IntVector.fromArray(I_SPECIES, ia, 0).reduceLanes(VectorOperators.UMAX);
|
||||
verifyInt(I_SPECIES, got, INT_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
|
||||
@ -232,7 +233,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testIntUMinMasked() {
|
||||
int got = IntVector.fromArray(I_SPECIES, ia, 0)
|
||||
.reduceLanes(VectorOperators.UMIN,
|
||||
@ -242,7 +243,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"})
|
||||
public static void testIntUMaxMasked() {
|
||||
int got = IntVector.fromArray(I_SPECIES, ia, 0)
|
||||
.reduceLanes(VectorOperators.UMAX,
|
||||
@ -254,7 +255,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
|
||||
public static void testLongUMin() {
|
||||
long got = LongVector.fromArray(L_SPECIES, la, 0).reduceLanes(VectorOperators.UMIN);
|
||||
verifyLong(L_SPECIES, got, LONG_UMIN_IDENTITY, VectorMath::minUnsigned, false);
|
||||
@ -262,7 +263,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
|
||||
public static void testLongUMax() {
|
||||
long got = LongVector.fromArray(L_SPECIES, la, 0).reduceLanes(VectorOperators.UMAX);
|
||||
verifyLong(L_SPECIES, got, LONG_UMAX_IDENTITY, VectorMath::maxUnsigned, false);
|
||||
@ -270,7 +271,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMIN_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
|
||||
public static void testLongUMinMasked() {
|
||||
long got = LongVector.fromArray(L_SPECIES, la, 0)
|
||||
.reduceLanes(VectorOperators.UMIN,
|
||||
@ -280,7 +281,7 @@ public class VectorUMinMaxReductionTest {
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.UMAX_REDUCTION_V, "= 1"},
|
||||
applyIfCPUFeature = {"asimd", "true"})
|
||||
applyIfCPUFeatureOr = {"asimd", "true", "avx512vl", "true"})
|
||||
public static void testLongUMaxMasked() {
|
||||
long got = LongVector.fromArray(L_SPECIES, la, 0)
|
||||
.reduceLanes(VectorOperators.UMAX,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user