8361582: AArch64: Some ConH values cannot be replicated with SVE

Reviewed-by: shade, epeter, aph
This commit is contained in:
Bhavana Kilambi 2025-09-01 09:18:29 +00:00 committed by Aleksey Shipilev
parent fc77e7600f
commit 7f0cd6488b
7 changed files with 213 additions and 38 deletions

View File

@ -4412,10 +4412,9 @@ operand immI8()
%}
// 8 bit signed value (simm8), or #simm8 LSL 8.
operand immI8_shift8()
operand immIDupV()
%{
predicate((n->get_int() <= 127 && n->get_int() >= -128) ||
(n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0));
predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->get_int()));
match(ConI);
op_cost(0);
@ -4424,10 +4423,9 @@ operand immI8_shift8()
%}
// 8 bit signed value (simm8), or #simm8 LSL 8.
operand immL8_shift8()
operand immLDupV()
%{
predicate((n->get_long() <= 127 && n->get_long() >= -128) ||
(n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0));
predicate(Assembler::operand_valid_for_sve_dup_immediate(n->get_long()));
match(ConL);
op_cost(0);
@ -4435,6 +4433,17 @@ operand immL8_shift8()
interface(CONST_INTER);
%}
// 8 bit signed value (simm8), or #simm8 LSL 8.
operand immHDupV()
%{
predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->geth()));
match(ConH);
op_cost(0);
format %{ %}
interface(CONST_INTER);
%}
// 8 bit integer valid for vector add sub immediate
operand immBAddSubV()
%{
@ -7077,18 +7086,16 @@ instruct loadConD(vRegD dst, immD con) %{
%}
// Load Half Float Constant
// The "ldr" instruction loads a 32-bit word from the constant pool into a
// 32-bit register but only the bottom half will be populated and the top
// 16 bits are zero.
instruct loadConH(vRegF dst, immH con) %{
match(Set dst con);
format %{
"ldrs $dst, [$constantaddress]\t# load from constant table: half float=$con\n\t"
%}
format %{ "mov rscratch1, $con\n\t"
"fmov $dst, rscratch1"
%}
ins_encode %{
__ ldrs(as_FloatRegister($dst$$reg), $constantaddress($con));
__ movw(rscratch1, (uint32_t)$con$$constant);
__ fmovs($dst$$FloatRegister, rscratch1);
%}
ins_pipe(fp_load_constant_s);
ins_pipe(pipe_class_default);
%}
// Store Instructions

View File

@ -4875,7 +4875,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
ins_pipe(pipe_slow);
%}
instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Matcher::vector_element_basic_type(n) == T_INT));
@ -4898,7 +4898,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
ins_pipe(pipe_slow);
%}
instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16);
match(Set dst (Replicate con));
format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@ -4909,19 +4909,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
ins_pipe(pipe_slow);
%}
// Replicate a 16-bit half precision float value
instruct replicateHF_imm(vReg dst, immH con) %{
// Replicate an immediate 16-bit half precision float value
instruct replicateHF_imm_le128b(vReg dst, immH con) %{
predicate(Matcher::vector_length_in_bytes(n) <= 16);
match(Set dst (Replicate con));
format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
int imm = (int)($con$$constant) & 0xffff;
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
__ mov($dst$$FloatRegister, get_arrangement(this), imm);
} else { // length_in_bytes must be > 16 and SVE should be enabled
assert(UseSVE > 0, "must be sve");
__ sve_dup($dst$$FloatRegister, __ H, imm);
}
__ mov($dst$$FloatRegister, get_arrangement(this), imm);
%}
ins_pipe(pipe_slow);
%}
// Replicate a 16-bit half precision float which is within the limits
// for the operand - immHDupV
instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16);
match(Set dst (Replicate con));
format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
__ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
%}
ins_pipe(pipe_slow);
%}

View File

@ -3107,7 +3107,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
ins_pipe(pipe_slow);
%}
instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Matcher::vector_element_basic_type(n) == T_INT));
@ -3130,7 +3130,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
ins_pipe(pipe_slow);
%}
instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16);
match(Set dst (Replicate con));
format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@ -3141,19 +3141,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
ins_pipe(pipe_slow);
%}
// Replicate a 16-bit half precision float value
instruct replicateHF_imm(vReg dst, immH con) %{
// Replicate an immediate 16-bit half precision float value
instruct replicateHF_imm_le128b(vReg dst, immH con) %{
predicate(Matcher::vector_length_in_bytes(n) <= 16);
match(Set dst (Replicate con));
format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
int imm = (int)($con$$constant) & 0xffff;
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
__ mov($dst$$FloatRegister, get_arrangement(this), imm);
} else { // length_in_bytes must be > 16 and SVE should be enabled
assert(UseSVE > 0, "must be sve");
__ sve_dup($dst$$FloatRegister, __ H, imm);
}
__ mov($dst$$FloatRegister, get_arrangement(this), imm);
%}
ins_pipe(pipe_slow);
%}
// Replicate a 16-bit half precision float which is within the limits
// for the operand - immHDupV
instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
predicate(Matcher::vector_length_in_bytes(n) > 16);
match(Set dst (Replicate con));
format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
__ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
%}
ins_pipe(pipe_slow);
%}

View File

@ -434,6 +434,11 @@ int Assembler::operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement
return -1;
}
bool Assembler::operand_valid_for_sve_dup_immediate(int64_t imm) {
return ((imm >= -128 && imm <= 127) ||
(((imm & 0xff) == 0) && imm >= -32768 && imm <= 32512));
}
bool Assembler::operand_valid_for_sve_logical_immediate(unsigned elembits, uint64_t imm) {
return encode_sve_logical_immediate(elembits, imm) != 0xffffffff;
}

View File

@ -4324,6 +4324,7 @@ public:
static bool operand_valid_for_sve_add_sub_immediate(int64_t imm);
static bool operand_valid_for_float_immediate(double imm);
static int operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement T);
static bool operand_valid_for_sve_dup_immediate(int64_t imm);
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

View File

@ -0,0 +1,136 @@
/* Copyright (c) 2025, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8361582
* @summary Ensure the correct backend replicate node is being generated for
* half precision float constants on >16B SVE machines
* @modules jdk.incubator.vector
* @library /test/lib /
* @run main/othervm compiler.c2.aarch64.TestFloat16Replicate
*/
package compiler.c2.aarch64;
import compiler.lib.ir_framework.*;
import compiler.lib.verify.*;
import java.util.Arrays;
import java.util.Random;
import jdk.incubator.vector.Float16;
import jdk.test.lib.*;
import jdk.test.lib.Utils;
import static java.lang.Float.*;
import static jdk.incubator.vector.Float16.*;
public class TestFloat16Replicate {
private static short[] input;
private static short[] output;
private static short[] expected;
private static Random rnd;
// Choose FP16_IMM8 which is within the range of [-128 << 8, 127 << 8] and a multiple of 256
private static final Float16 FP16_IMM8;
// Choose a value in the range [-128 << 8, 127 << 8] and a non multiple of 256 for FP16_NON_IMM8
private static final Float16 FP16_NON_IMM8;
private static final int LEN = 1024;
public static void main(String args[]) {
TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation");
}
static {
rnd = Utils.getRandomInstance();
int k = rnd.nextInt(-128, 128);
int b = rnd.nextInt(1, 256);
short bits_imm8 = (short) (k << 8);
short bits_non_imm8 = (short) ((k << 8) + b);
FP16_IMM8 = Float16.shortBitsToFloat16(bits_imm8);
FP16_NON_IMM8 = Float16.shortBitsToFloat16(bits_non_imm8);
input = new short[LEN];
output = new short[LEN];
expected = new short[LEN];
for (int i = 0; i < LEN; i++) {
input[i] = (short) i;
}
}
// For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
// node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
// On SVE-capable hardware with vector length > 16B, if the FP16 immediate is a signed value within the
// range [-128, 127] or a signed multiple of 256 in the range [-32768, 32512] for element widths of
// 16 bits or higher then the backend should generate the "replicateHF_imm_gt128b" machnode.
@Test
@Warmup(5000)
@IR(counts = {IRNode.REPLICATE_HF_IMM8, ">0"},
phase = CompilePhase.FINAL_CODE,
applyIf = {"MaxVectorSize", ">16"},
applyIfCPUFeature = {"sve", "true"})
public void TestFloat16AddInRange() {
for (int i = 0; i < LEN; ++i) {
output[i] = float16ToRawShortBits(Float16.add(shortBitsToFloat16(input[i]), FP16_IMM8));
}
}
@Check(test="TestFloat16AddInRange")
public void checkResultFloat16AddInRange() {
for (int i = 0; i < LEN; ++i) {
expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_IMM8.floatValue());
}
Verify.checkEQWithRawBits(output, expected);
}
// For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
// node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
// On SVE-capable hardware with vector length > 16B, if the FP16 constant falls outside the immediate
// range accepted by the SVE "dup" instruction, the backend must:
// 1. Generate the "loadConH" machnode to load the FP16 constant from the constant pool.
// 2. Emit the "replicateHF" machnode to broadcast this loaded constant into an SVE register.
// In this case, the backend should not generate the "replicateHF_imm8_gt128b" machnode.
@Test
@Warmup(5000)
@IR(counts = {IRNode.REPLICATE_HF, ">0"},
failOn = {IRNode.REPLICATE_HF_IMM8},
phase = CompilePhase.FINAL_CODE,
applyIf = {"MaxVectorSize", ">16"},
applyIfCPUFeature = {"sve", "true"})
public void TestFloat16AddOutOfRange() {
for (int i = 0; i < LEN; ++i) {
output[i] = float16ToRawShortBits(add(shortBitsToFloat16(input[i]), FP16_NON_IMM8));
}
}
@Check(test="TestFloat16AddOutOfRange")
public void checkResultFloat16AddOutOfRange() {
for (int i = 0; i < LEN; ++i) {
expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_NON_IMM8.floatValue());
}
Verify.checkEQWithRawBits(output, expected);
}
}

View File

@ -2896,6 +2896,16 @@ public class IRNode {
vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG);
}
public static final String REPLICATE_HF = PREFIX + "REPLICATE_HF" + POSTFIX;
static {
machOnlyNameRegex(REPLICATE_HF, "replicateHF");
}
public static final String REPLICATE_HF_IMM8 = PREFIX + "REPLICATE_HF_IMM8" + POSTFIX;
static {
machOnlyNameRegex(REPLICATE_HF_IMM8, "replicateHF_imm8_gt128b");
}
/*
* Utility methods to set up IR_NODE_MAPPINGS.
*/