From e83e79460bcc47a5f6b1c5b9835dae362d688ea2 Mon Sep 17 00:00:00 2001 From: Eric Fang Date: Tue, 2 Jun 2026 07:35:53 +0000 Subject: [PATCH] 8376285: AArch64: Support fusion optimization for SVE destructive instructions Reviewed-by: aph, dlong, adinn --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 404 +++++++++++++----- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 150 ++++--- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 8 +- .../cpu/aarch64/macroAssembler_aarch64.cpp | 23 + .../cpu/aarch64/macroAssembler_aarch64.hpp | 96 +++++ .../cpu/aarch64/nativeInst_aarch64.hpp | 23 + 6 files changed, 540 insertions(+), 164 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 2ff93c9e288..b9899995531 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -1671,24 +1671,42 @@ instruct vnotL(vReg dst, vReg src, immL_M1 m1) %{ // vector not - predicated -instruct vnotI_masked(vReg dst_src, immI_M1 m1, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnotI_masked(vReg dst, vReg src, immI_M1 m1, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (XorV (Binary dst_src (Replicate m1)) pg)); - format %{ "vnotI_masked $dst_src, $pg, $dst_src" %} + match(Set dst (XorV (Binary src (Replicate m1)) pg)); + format %{ "vnotI_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_not($dst_src$$FloatRegister, get_reg_variant(this), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_not($dst$$FloatRegister, get_reg_variant(this), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vnotL_masked(vReg dst_src, immL_M1 m1, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnotL_masked(vReg dst, vReg src, immL_M1 m1, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (XorV (Binary dst_src (Replicate m1)) pg)); - format %{ "vnotL_masked $dst_src, $pg, $dst_src" %} + match(Set dst (XorV (Binary src (Replicate m1)) pg)); + format %{ "vnotL_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_not($dst_src$$FloatRegister, get_reg_variant(this), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_not($dst$$FloatRegister, get_reg_variant(this), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -1985,62 +2003,116 @@ instruct vabsD(vReg dst, vReg src) %{ // vector abs - predicated -instruct vabsB_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsB_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVB dst_src pg)); - format %{ "vabsB_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVB src pg)); + format %{ "vabsB_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_abs($dst_src$$FloatRegister, __ B, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_abs($dst$$FloatRegister, __ B, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vabsS_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsS_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVS dst_src pg)); - format %{ "vabsS_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVS src pg)); + format %{ "vabsS_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_abs($dst_src$$FloatRegister, __ H, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_abs($dst$$FloatRegister, __ H, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vabsI_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsI_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVI dst_src pg)); - format %{ "vabsI_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVI src pg)); + format %{ "vabsI_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_abs($dst_src$$FloatRegister, __ S, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_abs($dst$$FloatRegister, __ S, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vabsL_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsL_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVL dst_src pg)); - format %{ "vabsL_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVL src pg)); + format %{ "vabsL_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_abs($dst_src$$FloatRegister, __ D, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_abs($dst$$FloatRegister, __ D, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vabsF_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsF_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVF dst_src pg)); - format %{ "vabsF_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVF src pg)); + format %{ "vabsF_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fabs($dst_src$$FloatRegister, __ S, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fabs($dst$$FloatRegister, __ S, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vabsD_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vabsD_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (AbsVD dst_src pg)); - format %{ "vabsD_masked $dst_src, $pg, $dst_src" %} + match(Set dst (AbsVD src pg)); + format %{ "vabsD_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fabs($dst_src$$FloatRegister, __ D, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fabs($dst$$FloatRegister, __ D, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -2158,44 +2230,80 @@ instruct vnegD(vReg dst, vReg src) %{ // vector neg - predicated -instruct vnegI_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnegI_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (NegVI dst_src pg)); - format %{ "vnegI_masked $dst_src, $pg, $dst_src" %} + match(Set dst (NegVI src pg)); + format %{ "vnegI_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - __ sve_neg($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_neg($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vnegL_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnegL_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (NegVL dst_src pg)); - format %{ "vnegL_masked $dst_src, $pg, $dst_src" %} + match(Set dst (NegVL src pg)); + format %{ "vnegL_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_neg($dst_src$$FloatRegister, __ D, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_neg($dst$$FloatRegister, __ D, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vnegF_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnegF_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (NegVF dst_src pg)); - format %{ "vnegF_masked $dst_src, $pg, $dst_src" %} + match(Set dst (NegVF src pg)); + format %{ "vnegF_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fneg($dst_src$$FloatRegister, __ S, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fneg($dst$$FloatRegister, __ S, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vnegD_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnegD_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (NegVD dst_src pg)); - format %{ "vnegD_masked $dst_src, $pg, $dst_src" %} + match(Set dst (NegVD src pg)); + format %{ "vnegD_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fneg($dst_src$$FloatRegister, __ D, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fneg($dst$$FloatRegister, __ D, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -2251,22 +2359,40 @@ instruct vsqrtD(vReg dst, vReg src) %{ // vector sqrt - predicated -instruct vsqrtF_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vsqrtF_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (SqrtVF dst_src pg)); - format %{ "vsqrtF_masked $dst_src, $pg, $dst_src" %} + match(Set dst (SqrtVF src pg)); + format %{ "vsqrtF_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fsqrt($dst_src$$FloatRegister, __ S, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fsqrt($dst$$FloatRegister, __ S, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vsqrtD_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vsqrtD_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (SqrtVD dst_src pg)); - format %{ "vsqrtD_masked $dst_src, $pg, $dst_src" %} + match(Set dst (SqrtVD src pg)); + format %{ "vsqrtD_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_fsqrt($dst_src$$FloatRegister, __ D, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_fsqrt($dst$$FloatRegister, __ D, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -5331,9 +5457,7 @@ instruct insertI_index_lt32(vReg dst, vReg src, iRegIorL2I val, immI idx, __ sve_index($tmp$$FloatRegister, size, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue, $tmp$$FloatRegister, (int)($idx$$constant) - 16); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -5356,9 +5480,7 @@ instruct insertI_index_ge32(vReg dst, vReg src, iRegIorL2I val, immI idx, vReg t __ sve_dup($tmp2$$FloatRegister, size, (int)($idx$$constant)); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue, $tmp1$$FloatRegister, $tmp2$$FloatRegister); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -5392,9 +5514,7 @@ instruct insertL_gt128b(vReg dst, vReg src, iRegL val, immI idx, __ sve_index($tmp$$FloatRegister, __ D, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue, $tmp$$FloatRegister, (int)($idx$$constant) - 16); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -5432,7 +5552,7 @@ instruct insertF_index_lt32(vReg dst, vReg src, vRegF val, immI idx, __ sve_index($dst$$FloatRegister, __ S, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue, $dst$$FloatRegister, (int)($idx$$constant) - 16); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -5451,7 +5571,7 @@ instruct insertF_index_ge32(vReg dst, vReg src, vRegF val, immI idx, vReg tmp, __ sve_dup($dst$$FloatRegister, __ S, (int)($idx$$constant)); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue, $tmp$$FloatRegister, $dst$$FloatRegister); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -5486,7 +5606,7 @@ instruct insertD_gt128b(vReg dst, vReg src, vRegD val, immI idx, __ sve_index($dst$$FloatRegister, __ D, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue, $dst$$FloatRegister, (int)($idx$$constant) - 16); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -5656,8 +5776,12 @@ instruct extractF(vRegF dst, vReg src, immI idx) %{ __ ins($dst$$FloatRegister, __ S, $src$$FloatRegister, 0, index); } else { assert(UseSVE > 0, "must be sve"); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - __ sve_ext($dst$$FloatRegister, $dst$$FloatRegister, index << 2); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the second source of ext. The movprfx destination register + // must not appear in any source operand of the following instruction + // except as the destructive operand. + __ sve_ext($dst$$FloatRegister, $src$$FloatRegister, index << 2); } %} ins_pipe(pipe_slow); @@ -5677,8 +5801,12 @@ instruct extractD(vRegD dst, vReg src, immI idx) %{ __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, index); } else { assert(UseSVE > 0, "must be sve"); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - __ sve_ext($dst$$FloatRegister, $dst$$FloatRegister, index << 3); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the second source of ext. The movprfx destination register + // must not appear in any source operand of the following instruction + // except as the destructive operand. + __ sve_ext($dst$$FloatRegister, $src$$FloatRegister, index << 3); } %} ins_pipe(pipe_slow); @@ -6855,25 +6983,43 @@ instruct vpopcountL(vReg dst, vReg src) %{ // vector popcount - predicated -instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vpopcountI_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (PopCountVI dst_src pg)); - format %{ "vpopcountI_masked $dst_src, $pg, $dst_src" %} + match(Set dst (PopCountVI src pg)); + format %{ "vpopcountI_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - __ sve_cnt($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} -instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vpopcountL_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (PopCountVL dst_src pg)); - format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %} + match(Set dst (PopCountVL src pg)); + format %{ "vpopcountL_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_cnt($dst_src$$FloatRegister, __ D, - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_cnt($dst$$FloatRegister, __ D, + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -7240,14 +7386,23 @@ instruct vcountLeadingZeros(vReg dst, vReg src) %{ // The dst and src should use the same register to make sure the // inactive lanes in dst save the same elements as src. -instruct vcountLeadingZeros_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vcountLeadingZeros_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (CountLeadingZerosV dst_src pg)); - format %{ "vcountLeadingZeros_masked $dst_src, $pg, $dst_src" %} + match(Set dst (CountLeadingZerosV src pg)); + format %{ "vcountLeadingZeros_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - __ sve_clz($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_clz($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -7296,19 +7451,26 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{ ins_pipe(pipe_slow); %} -// The dst and src should use the same register to make sure the -// inactive lanes in dst save the same elements as src. -instruct vcountTrailingZeros_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vcountTrailingZeros_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (CountTrailingZerosV dst_src pg)); - format %{ "vcountTrailingZeros_masked $dst_src, $pg, $dst_src" %} + match(Set dst (CountTrailingZerosV src pg)); + format %{ "vcountTrailingZeros_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); - __ sve_rbit($dst_src$$FloatRegister, size, - $pg$$PRegister, $dst_src$$FloatRegister); - __ sve_clz($dst_src$$FloatRegister, size, - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_rbit($dst$$FloatRegister, size, + $pg$$PRegister, $src$$FloatRegister); + __ sve_clz($dst$$FloatRegister, size, + $pg$$PRegister, $dst$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -7347,14 +7509,23 @@ instruct vreverse(vReg dst, vReg src) %{ // The dst and src should use the same register to make sure the // inactive lanes in dst save the same elements as src. -instruct vreverse_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vreverse_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (ReverseV dst_src pg)); - format %{ "vreverse_masked $dst_src, $pg, $dst_src" %} + match(Set dst (ReverseV src pg)); + format %{ "vreverse_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - __ sve_rbit($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_rbit($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -7393,19 +7564,28 @@ instruct vreverseBytes(vReg dst, vReg src) %{ ins_pipe(pipe_slow); %} -// The dst and src should use the same register to make sure the -// inactive lanes in dst save the same elements as src. -instruct vreverseBytes_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vreverseBytes_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (ReverseBytesV dst_src pg)); - format %{ "vreverseBytes_masked $dst_src, $pg, $dst_src" %} + match(Set dst (ReverseBytesV src pg)); + format %{ "vreverseBytes_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); if (bt == T_BYTE) { - // do nothing + if ($dst$$FloatRegister != $src$$FloatRegister) { + __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + } } else { - __ sve_revb($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_revb($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); } %} ins_pipe(pipe_slow); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index c5df949dfb6..a53efd43d5d 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -899,13 +899,22 @@ dnl dnl VECTOR_NOT_PREDICATE($1 ) dnl VECTOR_NOT_PREDICATE(type) define(`VECTOR_NOT_PREDICATE', ` -instruct vnot$1_masked`'(vReg dst_src, imm$1_M1 m1, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vnot$1_masked`'(vReg dst, vReg src, imm$1_M1 m1, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (XorV (Binary dst_src (Replicate m1)) pg)); - format %{ "vnot$1_masked $dst_src, $pg, $dst_src" %} + match(Set dst (XorV (Binary src (Replicate m1)) pg)); + format %{ "vnot$1_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_not($dst_src$$FloatRegister, get_reg_variant(this), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_not($dst$$FloatRegister, get_reg_variant(this), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %}')dnl @@ -1042,14 +1051,23 @@ dnl dnl UNARY_OP_PREDICATE($1, $2, $3 ) dnl UNARY_OP_PREDICATE(rule_name, op_name, insn) define(`UNARY_OP_PREDICATE', ` -instruct $1_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct $1_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src ($2 dst_src pg)); - format %{ "$1_masked $dst_src, $pg, $dst_src" %} + match(Set dst ($2 src pg)); + format %{ "$1_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - __ $3($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ $3($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %}')dnl @@ -1057,12 +1075,21 @@ dnl dnl UNARY_OP_PREDICATE_WITH_SIZE($1, $2, $3, $4 ) dnl UNARY_OP_PREDICATE_WITH_SIZE(rule_name, op_name, insn, size) define(`UNARY_OP_PREDICATE_WITH_SIZE', ` -instruct $1_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct $1_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src ($2 dst_src pg)); - format %{ "$1_masked $dst_src, $pg, $dst_src" %} + match(Set dst ($2 src pg)); + format %{ "$1_masked $dst, $pg, $src" %} ins_encode %{ - __ $3($dst_src$$FloatRegister, __ $4, $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ $3($dst$$FloatRegister, __ $4, $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %}')dnl @@ -3368,9 +3395,7 @@ instruct insertI_index_lt32(vReg dst, vReg src, iRegIorL2I val, immI idx, __ sve_index($tmp$$FloatRegister, size, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue, $tmp$$FloatRegister, (int)($idx$$constant) - 16); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -3393,9 +3418,7 @@ instruct insertI_index_ge32(vReg dst, vReg src, iRegIorL2I val, immI idx, vReg t __ sve_dup($tmp2$$FloatRegister, size, (int)($idx$$constant)); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue, $tmp1$$FloatRegister, $tmp2$$FloatRegister); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -3429,9 +3452,7 @@ instruct insertL_gt128b(vReg dst, vReg src, iRegL val, immI idx, __ sve_index($tmp$$FloatRegister, __ D, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue, $tmp$$FloatRegister, (int)($idx$$constant) - 16); - if ($dst$$FloatRegister != $src$$FloatRegister) { - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - } + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$Register); %} ins_pipe(pipe_slow); @@ -3469,7 +3490,7 @@ instruct insertF_index_lt32(vReg dst, vReg src, vRegF val, immI idx, __ sve_index($dst$$FloatRegister, __ S, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue, $dst$$FloatRegister, (int)($idx$$constant) - 16); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -3488,7 +3509,7 @@ instruct insertF_index_ge32(vReg dst, vReg src, vRegF val, immI idx, vReg tmp, __ sve_dup($dst$$FloatRegister, __ S, (int)($idx$$constant)); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue, $tmp$$FloatRegister, $dst$$FloatRegister); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -3523,7 +3544,7 @@ instruct insertD_gt128b(vReg dst, vReg src, vRegD val, immI idx, __ sve_index($dst$$FloatRegister, __ D, -16, 1); __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue, $dst$$FloatRegister, (int)($idx$$constant) - 16); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$FloatRegister); %} ins_pipe(pipe_slow); @@ -3621,8 +3642,12 @@ instruct extract$1(vReg$1 dst, vReg src, immI idx) %{ __ ins($dst$$FloatRegister, __ $4, $src$$FloatRegister, 0, index); } else { assert(UseSVE > 0, "must be sve"); - __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); - __ sve_ext($dst$$FloatRegister, $dst$$FloatRegister, index << $5); + __ sve_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the second source of ext. The movprfx destination register + // must not appear in any source operand of the following instruction + // except as the destructive operand. + __ sve_ext($dst$$FloatRegister, $src$$FloatRegister, index << $5); } %} ins_pipe(pipe_slow); @@ -4682,13 +4707,22 @@ instruct vpopcountL(vReg dst, vReg src) %{ // vector popcount - predicated UNARY_OP_PREDICATE(vpopcountI, PopCountVI, sve_cnt) -instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vpopcountL_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (PopCountVL dst_src pg)); - format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %} + match(Set dst (PopCountVL src pg)); + format %{ "vpopcountL_masked $dst, $pg, $src" %} ins_encode %{ - __ sve_cnt($dst_src$$FloatRegister, __ D, - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_cnt($dst$$FloatRegister, __ D, + $pg$$PRegister, $src$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -5100,19 +5134,26 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{ ins_pipe(pipe_slow); %} -// The dst and src should use the same register to make sure the -// inactive lanes in dst save the same elements as src. -instruct vcountTrailingZeros_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vcountTrailingZeros_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (CountTrailingZerosV dst_src pg)); - format %{ "vcountTrailingZeros_masked $dst_src, $pg, $dst_src" %} + match(Set dst (CountTrailingZerosV src pg)); + format %{ "vcountTrailingZeros_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); - __ sve_rbit($dst_src$$FloatRegister, size, - $pg$$PRegister, $dst_src$$FloatRegister); - __ sve_clz($dst_src$$FloatRegister, size, - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_rbit($dst$$FloatRegister, size, + $pg$$PRegister, $src$$FloatRegister); + __ sve_clz($dst$$FloatRegister, size, + $pg$$PRegister, $dst$$FloatRegister); %} ins_pipe(pipe_slow); %} @@ -5186,19 +5227,28 @@ instruct vreverseBytes(vReg dst, vReg src) %{ ins_pipe(pipe_slow); %} -// The dst and src should use the same register to make sure the -// inactive lanes in dst save the same elements as src. -instruct vreverseBytes_masked(vReg dst_src, pRegGov pg) %{ +// The Java Vector API specification requires that for masked unary operations, +// suppressed lanes are filled from the first vector operand (see "Masked +// Operations" in Vector.java around line 568). So we use movprfx to copy src +// into dst before emitting the predicated instruction. +instruct vreverseBytes_masked(vReg dst, vReg src, pRegGov pg) %{ predicate(UseSVE > 0); - match(Set dst_src (ReverseBytesV dst_src pg)); - format %{ "vreverseBytes_masked $dst_src, $pg, $dst_src" %} + match(Set dst (ReverseBytesV src pg)); + format %{ "vreverseBytes_masked $dst, $pg, $src" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); if (bt == T_BYTE) { - // do nothing + if ($dst$$FloatRegister != $src$$FloatRegister) { + __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister); + } } else { - __ sve_revb($dst_src$$FloatRegister, __ elemType_to_regVariant(bt), - $pg$$PRegister, $dst_src$$FloatRegister); + __ maybe_movprfx($dst$$FloatRegister, $src$$FloatRegister); + // Although dst and src hold the same value after movprfx, we must use src + // (not dst) as the source of the following instruction. The movprfx + // destination register must not appear in any source operand of the + // following instruction except as the destructive operand. + __ sve_revb($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src$$FloatRegister); } %} ins_pipe(pipe_slow); diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index 67dc4966d64..cb9e308197e 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -2494,8 +2494,12 @@ void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRe smov(dst, src, size, idx); } } else { - sve_orr(vtmp, src, src); - sve_ext(vtmp, vtmp, idx << size); + sve_movprfx(vtmp, src); + // Although vtmp and src hold the same value after movprfx, we must use src + // (not vtmp) as the second source of ext. The movprfx destination register + // must not appear in any source operand of the following instruction except + // as the destructive operand. + sve_ext(vtmp, src, idx << size); if (bt == T_INT || bt == T_LONG) { umov(dst, vtmp, size, 0); } else { diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index a52ad112560..ac5bae22384 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -7278,3 +7278,26 @@ void MacroAssembler::neon_vector_rotate(FloatRegister dst, SIMD_Arrangement T, sli(dst, T, src, lshift); } } + +void MacroAssembler::try_to_replace_prev_vector_copy_with_movprfx(FloatRegister dst) { + if (code_section()->is_empty()) { + return; + } + + address prev = pc() - NativeInstruction::instruction_size; + uint32_t insn = nativeInstruction_at(prev)->encoding(); + if (!NativeInstruction::is_neon_vector_mov_alias(insn) && + !NativeInstruction::is_sve_vector_mov_alias(insn)) { + return; + } + + // The destructive instruction must reuse the mov alias destination. + uint32_t rd = Instruction_aarch64::extract(insn, 4, 0); + if (rd != (uint32_t)dst->encoding()) { + return; + } + + uint32_t rn = Instruction_aarch64::extract(insn, 9, 5); + Instruction_aarch64::patch(prev, 31, 0, + NativeInstruction::encode_sve_movprfx(rd, rn)); +} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index ad8827bd9c0..b1050b45731 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1734,7 +1734,103 @@ public: private: // Check the current thread doesn't need a cross modify fence. void verify_cross_modify_fence_not_required() PRODUCT_RETURN; + void try_to_replace_prev_vector_copy_with_movprfx(FloatRegister dst); +public: + void maybe_movprfx(FloatRegister dst, FloatRegister src) { + if (dst != src) { + sve_movprfx(dst, src); + } + } + +// Wrappers for SVE explicit destructive instructions, overriding the +// same-signature Assembler entry points to enable movprfx fusion optimization. +// +// Implicit destructive instructions (e.g. predicated unary ops like sve_abs/ +// sve_neg/sve_not, whose ISA encoding allows Zd != Zn but whose use as a Java +// Vector API masked operation requires pass-through of the first source) are +// not covered here. For those, the .ad file is responsible for emitting +// movprfx explicitly via maybe_movprfx() before the destructive op. +#define SVE_DESTRUCTIVE_BINARY_INS(NAME) \ + using Assembler::NAME; \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, \ + FloatRegister Zm) { \ + if (Zd != Zm) { \ + try_to_replace_prev_vector_copy_with_movprfx(Zd); \ + } \ + Assembler::NAME(Zd, T, Pg, Zm); \ + } + +#define SVE_DESTRUCTIVE_BINARY_5(I1, I2, I3, I4, I5) \ + SVE_DESTRUCTIVE_BINARY_INS(I1); SVE_DESTRUCTIVE_BINARY_INS(I2); \ + SVE_DESTRUCTIVE_BINARY_INS(I3); SVE_DESTRUCTIVE_BINARY_INS(I4); \ + SVE_DESTRUCTIVE_BINARY_INS(I5); + + SVE_DESTRUCTIVE_BINARY_5(sve_add, sve_and, sve_asr, sve_bic, sve_eor) + SVE_DESTRUCTIVE_BINARY_5(sve_fabd, sve_fadd, sve_fdiv, sve_fmax, sve_fmin) + SVE_DESTRUCTIVE_BINARY_5(sve_fmul, sve_fsub, sve_lsl, sve_lsr, sve_mul) + SVE_DESTRUCTIVE_BINARY_5(sve_orr, sve_smax, sve_smin, sve_sqadd, sve_sqsub) + SVE_DESTRUCTIVE_BINARY_5(sve_sub, sve_uqadd, sve_uqsub, sve_umax, sve_umin) + +#undef SVE_DESTRUCTIVE_BINARY_INS +#undef SVE_DESTRUCTIVE_BINARY_5 + +#define SVE_DESTRUCTIVE_SHIFT_IMM_INS(NAME) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, int shift) { \ + try_to_replace_prev_vector_copy_with_movprfx(Zd); \ + Assembler::NAME(Zd, T, Pg, shift); \ + } + + SVE_DESTRUCTIVE_SHIFT_IMM_INS(sve_asr); + SVE_DESTRUCTIVE_SHIFT_IMM_INS(sve_lsl); + SVE_DESTRUCTIVE_SHIFT_IMM_INS(sve_lsr); + +#undef SVE_DESTRUCTIVE_SHIFT_IMM_INS + +#define SVE_DESTRUCTIVE_UNPRED_IMM_INS(NAME, IMM_TYPE) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, IMM_TYPE imm) { \ + try_to_replace_prev_vector_copy_with_movprfx(Zd); \ + Assembler::NAME(Zd, T, imm); \ + } + + SVE_DESTRUCTIVE_UNPRED_IMM_INS(sve_add, unsigned); + SVE_DESTRUCTIVE_UNPRED_IMM_INS(sve_sub, unsigned); + SVE_DESTRUCTIVE_UNPRED_IMM_INS(sve_and, uint64_t); + SVE_DESTRUCTIVE_UNPRED_IMM_INS(sve_eor, uint64_t); + SVE_DESTRUCTIVE_UNPRED_IMM_INS(sve_orr, uint64_t); + +#undef SVE_DESTRUCTIVE_UNPRED_IMM_INS + +#define SVE_DESTRUCTIVE_TERNARY_INS(NAME) \ + using Assembler::NAME; \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, \ + FloatRegister Zn, FloatRegister Zm) { \ + if (Zd != Zn && Zd != Zm) { \ + try_to_replace_prev_vector_copy_with_movprfx(Zd); \ + } \ + Assembler::NAME(Zd, T, Pg, Zn, Zm); \ + } + + SVE_DESTRUCTIVE_TERNARY_INS(sve_fmad); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fmla); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fmls); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fmsb); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fnmad); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fnmla); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fnmls); + SVE_DESTRUCTIVE_TERNARY_INS(sve_fnmsb); + SVE_DESTRUCTIVE_TERNARY_INS(sve_mla); + SVE_DESTRUCTIVE_TERNARY_INS(sve_mls); + +#undef SVE_DESTRUCTIVE_TERNARY_INS + + using Assembler::sve_eor3; + void sve_eor3(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) { + if (Zd != Zm && Zd != Zk) { + try_to_replace_prev_vector_copy_with_movprfx(Zd); + } + Assembler::sve_eor3(Zd, Zm, Zk); + } }; #ifdef ASSERT diff --git a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp index 4bccbc59582..57bb9a91533 100644 --- a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp @@ -140,6 +140,29 @@ public: Instruction_aarch64::extract(insn, 23, 23) == 0b0 && Instruction_aarch64::extract(insn, 26, 25) == 0b00; } + + static bool is_neon_vector_mov_alias(uint32_t insn) { + if (Instruction_aarch64::extract(insn, 31, 31) != 0 || + Instruction_aarch64::extract(insn, 29, 21) != 0b001110101 || + Instruction_aarch64::extract(insn, 15, 10) != 0b000111) { + return false; + } + return Instruction_aarch64::extract(insn, 9, 5) == + Instruction_aarch64::extract(insn, 20, 16); + } + + static bool is_sve_vector_mov_alias(uint32_t insn) { + if (Instruction_aarch64::extract(insn, 31, 21) != 0b00000100011 || + Instruction_aarch64::extract(insn, 15, 10) != 0b001100) { + return false; + } + return Instruction_aarch64::extract(insn, 9, 5) == + Instruction_aarch64::extract(insn, 20, 16); + } + + static uint32_t encode_sve_movprfx(uint32_t dst, uint32_t src) { + return 0x1082f << 10 | (src << 5) | dst; + } }; inline NativeInstruction* nativeInstruction_at(address address) {