diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 9697ac31350..33466453b76 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -4412,10 +4412,9 @@ operand immI8() %} // 8 bit signed value (simm8), or #simm8 LSL 8. -operand immI8_shift8() +operand immIDupV() %{ - predicate((n->get_int() <= 127 && n->get_int() >= -128) || - (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0)); + predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->get_int())); match(ConI); op_cost(0); @@ -4424,10 +4423,9 @@ operand immI8_shift8() %} // 8 bit signed value (simm8), or #simm8 LSL 8. -operand immL8_shift8() +operand immLDupV() %{ - predicate((n->get_long() <= 127 && n->get_long() >= -128) || - (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0)); + predicate(Assembler::operand_valid_for_sve_dup_immediate(n->get_long())); match(ConL); op_cost(0); @@ -4435,6 +4433,17 @@ operand immL8_shift8() interface(CONST_INTER); %} +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immHDupV() +%{ + predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->geth())); + match(ConH); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // 8 bit integer valid for vector add sub immediate operand immBAddSubV() %{ @@ -7077,18 +7086,16 @@ instruct loadConD(vRegD dst, immD con) %{ %} // Load Half Float Constant -// The "ldr" instruction loads a 32-bit word from the constant pool into a -// 32-bit register but only the bottom half will be populated and the top -// 16 bits are zero. 
instruct loadConH(vRegF dst, immH con) %{ match(Set dst con); - format %{ - "ldrs $dst, [$constantaddress]\t# load from constant table: half float=$con\n\t" - %} + format %{ "mov rscratch1, $con\n\t" + "fmov $dst, rscratch1" + %} ins_encode %{ - __ ldrs(as_FloatRegister($dst$$reg), $constantaddress($con)); + __ movw(rscratch1, (uint32_t)$con$$constant); + __ fmovs($dst$$FloatRegister, rscratch1); %} - ins_pipe(fp_load_constant_s); + ins_pipe(pipe_class_default); %} // Store Instructions diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 58300992c2a..67c4dad27a7 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -4875,7 +4875,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{ ins_pipe(pipe_slow); %} -instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{ +instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{ predicate(Matcher::vector_length_in_bytes(n) > 16 && (Matcher::vector_element_basic_type(n) == T_SHORT || Matcher::vector_element_basic_type(n) == T_INT)); @@ -4898,7 +4898,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{ ins_pipe(pipe_slow); %} -instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{ +instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{ predicate(Matcher::vector_length_in_bytes(n) > 16); match(Set dst (Replicate con)); format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %} @@ -4909,19 +4909,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{ ins_pipe(pipe_slow); %} -// Replicate a 16-bit half precision float value -instruct replicateHF_imm(vReg dst, immH con) %{ +// Replicate an immediate 16-bit half precision float value +instruct replicateHF_imm_le128b(vReg dst, immH con) %{ + predicate(Matcher::vector_length_in_bytes(n) <= 16); match(Set dst (Replicate con)); - format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %} + format %{ 
"replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %} ins_encode %{ - uint length_in_bytes = Matcher::vector_length_in_bytes(this); int imm = (int)($con$$constant) & 0xffff; - if (VM_Version::use_neon_for_vector(length_in_bytes)) { - __ mov($dst$$FloatRegister, get_arrangement(this), imm); - } else { // length_in_bytes must be > 16 and SVE should be enabled - assert(UseSVE > 0, "must be sve"); - __ sve_dup($dst$$FloatRegister, __ H, imm); - } + __ mov($dst$$FloatRegister, get_arrangement(this), imm); + %} + ins_pipe(pipe_slow); +%} + +// Replicate a 16-bit half precision float which is within the limits +// for the operand - immHDupV +instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{ + predicate(Matcher::vector_length_in_bytes(n) > 16); + match(Set dst (Replicate con)); + format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 4d91e04dc21..28f91204ec3 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -3107,7 +3107,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{ ins_pipe(pipe_slow); %} -instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{ +instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{ predicate(Matcher::vector_length_in_bytes(n) > 16 && (Matcher::vector_element_basic_type(n) == T_SHORT || Matcher::vector_element_basic_type(n) == T_INT)); @@ -3130,7 +3130,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{ ins_pipe(pipe_slow); %} -instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{ +instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{ predicate(Matcher::vector_length_in_bytes(n) > 16); match(Set dst (Replicate con)); format %{ "replicateL_imm8_gt128b $dst, $con\t# 
vector > 128 bits" %} @@ -3141,19 +3141,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{ ins_pipe(pipe_slow); %} -// Replicate a 16-bit half precision float value -instruct replicateHF_imm(vReg dst, immH con) %{ +// Replicate an immediate 16-bit half precision float value +instruct replicateHF_imm_le128b(vReg dst, immH con) %{ + predicate(Matcher::vector_length_in_bytes(n) <= 16); match(Set dst (Replicate con)); - format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %} + format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %} ins_encode %{ - uint length_in_bytes = Matcher::vector_length_in_bytes(this); int imm = (int)($con$$constant) & 0xffff; - if (VM_Version::use_neon_for_vector(length_in_bytes)) { - __ mov($dst$$FloatRegister, get_arrangement(this), imm); - } else { // length_in_bytes must be > 16 and SVE should be enabled - assert(UseSVE > 0, "must be sve"); - __ sve_dup($dst$$FloatRegister, __ H, imm); - } + __ mov($dst$$FloatRegister, get_arrangement(this), imm); + %} + ins_pipe(pipe_slow); +%} + +// Replicate a 16-bit half precision float which is within the limits +// for the operand - immHDupV +instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{ + predicate(Matcher::vector_length_in_bytes(n) > 16); + match(Set dst (Replicate con)); + format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp index 5e5d6c16b45..fe1792ed1c6 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp @@ -434,6 +434,11 @@ int Assembler::operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement return -1; } +bool Assembler::operand_valid_for_sve_dup_immediate(int64_t imm) { + return ((imm 
>= -128 && imm <= 127) || + (((imm & 0xff) == 0) && imm >= -32768 && imm <= 32512)); +} + bool Assembler::operand_valid_for_sve_logical_immediate(unsigned elembits, uint64_t imm) { return encode_sve_logical_immediate(elembits, imm) != 0xffffffff; } diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 11d302e9026..4b0a0e77915 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -4324,6 +4324,7 @@ public: static bool operand_valid_for_sve_add_sub_immediate(int64_t imm); static bool operand_valid_for_float_immediate(double imm); static int operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement T); + static bool operand_valid_for_sve_dup_immediate(int64_t imm); void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0); void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java new file mode 100644 index 00000000000..ab7808a0401 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java @@ -0,0 +1,136 @@ +/* Copyright (c) 2025, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** +* @test +* @bug 8361582 +* @summary Ensure the correct backend replicate node is being generated for +* half precision float constants on >16B SVE machines +* @modules jdk.incubator.vector +* @library /test/lib / +* @run main/othervm compiler.c2.aarch64.TestFloat16Replicate +*/ + +package compiler.c2.aarch64; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import java.util.Arrays; +import java.util.Random; +import jdk.incubator.vector.Float16; +import jdk.test.lib.*; +import jdk.test.lib.Utils; + +import static java.lang.Float.*; +import static jdk.incubator.vector.Float16.*; + +public class TestFloat16Replicate { + private static short[] input; + private static short[] output; + private static short[] expected; + private static Random rnd; + + // Choose FP16_IMM8 which is within the range of [-128 << 8, 127 << 8] and a multiple of 256 + private static final Float16 FP16_IMM8; + + // Choose a value in the range [-128 << 8, 127 << 8] and a non multiple of 256 for FP16_NON_IMM8 + private static final Float16 FP16_NON_IMM8; + + private static final int LEN = 1024; + + public static void main(String args[]) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector"); + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation"); + } + + static { + rnd = Utils.getRandomInstance(); + int k = rnd.nextInt(-128, 128); + int b = rnd.nextInt(1, 256); + short bits_imm8 = (short) (k << 8); + short bits_non_imm8 = (short) ((k << 8) + b); + + FP16_IMM8 = Float16.shortBitsToFloat16(bits_imm8); + FP16_NON_IMM8 = 
Float16.shortBitsToFloat16(bits_non_imm8); + + input = new short[LEN]; + output = new short[LEN]; + expected = new short[LEN]; + + for (int i = 0; i < LEN; i++) { + input[i] = (short) i; + } + } + + // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR + // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register. + // On SVE-capable hardware with vector length > 16B, if the FP16 immediate is a signed value within the + // range [-128, 127] or a signed multiple of 256 in the range [-32768, 32512] for element widths of + // 16 bits or higher then the backend should generate the "replicateHF_imm8_gt128b" machnode. + @Test + @Warmup(5000) + @IR(counts = {IRNode.REPLICATE_HF_IMM8, ">0"}, + phase = CompilePhase.FINAL_CODE, + applyIf = {"MaxVectorSize", ">16"}, + applyIfCPUFeature = {"sve", "true"}) + public void TestFloat16AddInRange() { + for (int i = 0; i < LEN; ++i) { + output[i] = float16ToRawShortBits(Float16.add(shortBitsToFloat16(input[i]), FP16_IMM8)); + } + } + + @Check(test="TestFloat16AddInRange") + public void checkResultFloat16AddInRange() { + for (int i = 0; i < LEN; ++i) { + expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_IMM8.floatValue()); + } + Verify.checkEQWithRawBits(output, expected); + } + + // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR + // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register. + // On SVE-capable hardware with vector length > 16B, if the FP16 constant falls outside the immediate + // range accepted by the SVE "dup" instruction, the backend must: + // 1. Generate the "loadConH" machnode to move the FP16 constant into a floating-point register. + // 2. Emit the "replicateHF" machnode to broadcast this loaded constant into an SVE register. + // In this case, the backend should not generate the "replicateHF_imm8_gt128b" machnode. 
+ @Test + @Warmup(5000) + @IR(counts = {IRNode.REPLICATE_HF, ">0"}, + failOn = {IRNode.REPLICATE_HF_IMM8}, + phase = CompilePhase.FINAL_CODE, + applyIf = {"MaxVectorSize", ">16"}, + applyIfCPUFeature = {"sve", "true"}) + public void TestFloat16AddOutOfRange() { + for (int i = 0; i < LEN; ++i) { + output[i] = float16ToRawShortBits(add(shortBitsToFloat16(input[i]), FP16_NON_IMM8)); + } + } + + @Check(test="TestFloat16AddOutOfRange") + public void checkResultFloat16AddOutOfRange() { + for (int i = 0; i < LEN; ++i) { + expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_NON_IMM8.floatValue()); + } + Verify.checkEQWithRawBits(output, expected); + } +} diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 7fb1eeb800c..16c6d99a64f 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2896,6 +2896,16 @@ public class IRNode { vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG); } + public static final String REPLICATE_HF = PREFIX + "REPLICATE_HF" + POSTFIX; + static { + machOnlyNameRegex(REPLICATE_HF, "replicateHF"); + } + + public static final String REPLICATE_HF_IMM8 = PREFIX + "REPLICATE_HF_IMM8" + POSTFIX; + static { + machOnlyNameRegex(REPLICATE_HF_IMM8, "replicateHF_imm8_gt128b"); + } + /* * Utility methods to set up IR_NODE_MAPPINGS. */