8361582: AArch64: Some ConH values cannot be replicated with SVE

Reviewed-by: shade, epeter, aph
2026-01-28 12:09:14 +00:00 · 2025-09-01 09:18:29 +00:00 · 2025-09-01 09:18:29 +00:00 · 7f0cd6488b
commit 7f0cd6488b
parent fc77e7600f
7 changed files with 213 additions and 38 deletions
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@ -4412,10 +4412,9 @@ operand immI8()
 %}

 // 8 bit signed value (simm8), or #simm8 LSL 8.
-operand immI8_shift8()
+operand immIDupV()
 %{
-  predicate((n->get_int() <= 127 && n->get_int() >= -128) ||
-            (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0));
+  predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->get_int()));
  match(ConI);

  op_cost(0);
@ -4424,10 +4423,9 @@ operand immI8_shift8()
 %}

 // 8 bit signed value (simm8), or #simm8 LSL 8.
-operand immL8_shift8()
+operand immLDupV()
 %{
-  predicate((n->get_long() <= 127 && n->get_long() >= -128) ||
-            (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0));
+  predicate(Assembler::operand_valid_for_sve_dup_immediate(n->get_long()));
  match(ConL);

  op_cost(0);
@ -4435,6 +4433,17 @@ operand immL8_shift8()
  interface(CONST_INTER);
 %}

+// Half float constant (ConH) whose geth() value is an 8 bit signed value (simm8), or #simm8 LSL 8.
+operand immHDupV()
+%{
+  predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->geth()));
+  match(ConH);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // 8 bit integer valid for vector add sub immediate
 operand immBAddSubV()
 %{
@ -7077,18 +7086,16 @@ instruct loadConD(vRegD dst, immD con) %{
 %}

 // Load Half Float Constant
-// The "ldr" instruction loads a 32-bit word from the constant pool into a
-// 32-bit register but only the bottom half will be populated and the top
-// 16 bits are zero.
 instruct loadConH(vRegF dst, immH con) %{
  match(Set dst con);
-  format %{
-    "ldrs $dst, [$constantaddress]\t# load from constant table: half float=$con\n\t"
-  %}
+  format %{ "mov    rscratch1, $con\n\t"
+            "fmov   $dst, rscratch1"
+         %}
  ins_encode %{
-    __ ldrs(as_FloatRegister($dst$$reg), $constantaddress($con));
+    __ movw(rscratch1, (uint32_t)$con$$constant);
+    __ fmovs($dst$$FloatRegister, rscratch1);
  %}
-  ins_pipe(fp_load_constant_s);
+  ins_pipe(pipe_class_default);
 %}

 // Store Instructions
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@ -4875,7 +4875,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
  ins_pipe(pipe_slow);
 %}

-instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
+instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            (Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
@ -4898,7 +4898,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
  ins_pipe(pipe_slow);
 %}

-instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
+instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (Replicate con));
  format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@ -4909,19 +4909,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
  ins_pipe(pipe_slow);
 %}

-// Replicate a 16-bit half precision float value
-instruct replicateHF_imm(vReg dst, immH con) %{
+// Replicate an immediate 16-bit half precision float value
+instruct replicateHF_imm_le128b(vReg dst, immH con) %{
+  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (Replicate con));
-  format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
+  format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
  ins_encode %{
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    int imm = (int)($con$$constant) & 0xffff;
-    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      __ mov($dst$$FloatRegister, get_arrangement(this), imm);
-    } else { // length_in_bytes must be > 16 and SVE should be enabled
-      assert(UseSVE > 0, "must be sve");
-      __ sve_dup($dst$$FloatRegister, __ H, imm);
-    }
+    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Replicate a 16-bit half precision float which is within the limits
+// for the operand - immHDupV
+instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
+  predicate(Matcher::vector_length_in_bytes(n) > 16);
+  match(Set dst (Replicate con));
+  format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
 %}
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@ -3107,7 +3107,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
  ins_pipe(pipe_slow);
 %}

-instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
+instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            (Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
@ -3130,7 +3130,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
  ins_pipe(pipe_slow);
 %}

-instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
+instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (Replicate con));
  format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@ -3141,19 +3141,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
  ins_pipe(pipe_slow);
 %}

-// Replicate a 16-bit half precision float value
-instruct replicateHF_imm(vReg dst, immH con) %{
+// Replicate an immediate 16-bit half precision float value
+instruct replicateHF_imm_le128b(vReg dst, immH con) %{
+  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (Replicate con));
-  format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
+  format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
  ins_encode %{
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    int imm = (int)($con$$constant) & 0xffff;
-    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      __ mov($dst$$FloatRegister, get_arrangement(this), imm);
-    } else { // length_in_bytes must be > 16 and SVE should be enabled
-      assert(UseSVE > 0, "must be sve");
-      __ sve_dup($dst$$FloatRegister, __ H, imm);
-    }
+    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Replicate a 16-bit half precision float which is within the limits
+// for the operand - immHDupV
+instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
+  predicate(Matcher::vector_length_in_bytes(n) > 16);
+  match(Set dst (Replicate con));
+  format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
 %}
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
@ -434,6 +434,11 @@ int Assembler::operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement
  return -1;
 }

+bool Assembler::operand_valid_for_sve_dup_immediate(int64_t imm) {
+  return ((imm >= -128 && imm <= 127) ||
+          (((imm & 0xff) == 0) && imm >= -32768 && imm <= 32512));
+}
+
 bool Assembler::operand_valid_for_sve_logical_immediate(unsigned elembits, uint64_t imm) {
  return encode_sve_logical_immediate(elembits, imm) != 0xffffffff;
 }
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@ -4324,6 +4324,7 @@ public:
  static bool operand_valid_for_sve_add_sub_immediate(int64_t imm);
  static bool operand_valid_for_float_immediate(double imm);
  static int  operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement T);
+  static bool operand_valid_for_sve_dup_immediate(int64_t imm);

  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
--- a/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java
+++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java
@ -0,0 +1,136 @@
+/* Copyright (c) 2025, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+* @test
+* @bug 8361582
+* @summary Ensure the correct backend replicate node is being generated for
+*          half precision float constants on >16B SVE machines
+* @modules jdk.incubator.vector
+* @library /test/lib /
+* @run main/othervm compiler.c2.aarch64.TestFloat16Replicate
+*/
+
+package compiler.c2.aarch64;
+
+import compiler.lib.ir_framework.*;
+import compiler.lib.verify.*;
+import java.util.Arrays;
+import java.util.Random;
+import jdk.incubator.vector.Float16;
+import jdk.test.lib.*;
+import jdk.test.lib.Utils;
+
+import static java.lang.Float.*;
+import static jdk.incubator.vector.Float16.*;
+
+public class TestFloat16Replicate {
+    private static short[] input;
+    private static short[] output;
+    private static short[] expected;
+    private static Random rnd;
+
+    // Choose FP16_IMM8 which is within the range of [-128 << 8, 127 << 8] and a multiple of 256
+    private static final Float16 FP16_IMM8;
+
+    // Choose a value in the range [-128 << 8, 127 << 8] and a non multiple of 256 for FP16_NON_IMM8
+    private static final Float16 FP16_NON_IMM8;
+
+    private static final int LEN = 1024;
+
+    public static void main(String args[]) {
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation");
+    }
+
+    static {
+        rnd = Utils.getRandomInstance();
+        int k = rnd.nextInt(-128, 128);
+        int b = rnd.nextInt(1, 256);
+        short bits_imm8     = (short) (k << 8);
+        short bits_non_imm8 = (short) ((k << 8) + b);
+
+        FP16_IMM8     = Float16.shortBitsToFloat16(bits_imm8);
+        FP16_NON_IMM8 = Float16.shortBitsToFloat16(bits_non_imm8);
+
+        input  = new short[LEN];
+        output = new short[LEN];
+        expected = new short[LEN];
+
+        for (int i = 0; i < LEN; i++) {
+            input[i] = (short) i;
+        }
+    }
+
+    // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
+    // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
+    // On SVE-capable hardware with vector length > 16B, if the FP16 immediate is a signed value within the
+    // range [-128, 127] or a signed multiple of 256 in the range [-32768, 32512] for element widths of
+    // 16 bits or higher then the backend should generate the "replicateHF_imm8_gt128b" machnode.
+    @Test
+    @Warmup(5000)
+    @IR(counts = {IRNode.REPLICATE_HF_IMM8, ">0"},
+        phase = CompilePhase.FINAL_CODE,
+        applyIf = {"MaxVectorSize", ">16"},
+        applyIfCPUFeature = {"sve", "true"})
+    public void TestFloat16AddInRange() {
+        for (int i = 0; i < LEN; ++i) {
+            output[i] = float16ToRawShortBits(Float16.add(shortBitsToFloat16(input[i]), FP16_IMM8));
+        }
+    }
+
+    @Check(test="TestFloat16AddInRange")
+    public void checkResultFloat16AddInRange() {
+        for (int i = 0; i < LEN; ++i) {
+            expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_IMM8.floatValue());
+        }
+        Verify.checkEQWithRawBits(output, expected);
+    }
+
+    // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
+    // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
+    // On SVE-capable hardware with vector length > 16B, if the FP16 constant falls outside the immediate
+    // range accepted by the SVE "dup" instruction, the backend must:
+    //   1. Generate the "loadConH" machnode to materialize the FP16 constant in a register (mov + fmov).
+    //   2. Emit the "replicateHF" machnode to broadcast this loaded constant into an SVE register.
+    // In this case, the backend should not generate the "replicateHF_imm8_gt128b" machnode.
+    @Test
+    @Warmup(5000)
+    @IR(counts = {IRNode.REPLICATE_HF, ">0"},
+        failOn = {IRNode.REPLICATE_HF_IMM8},
+        phase = CompilePhase.FINAL_CODE,
+        applyIf = {"MaxVectorSize", ">16"},
+        applyIfCPUFeature = {"sve", "true"})
+    public void TestFloat16AddOutOfRange() {
+        for (int i = 0; i < LEN; ++i) {
+            output[i] = float16ToRawShortBits(add(shortBitsToFloat16(input[i]), FP16_NON_IMM8));
+        }
+    }
+
+    @Check(test="TestFloat16AddOutOfRange")
+    public void checkResultFloat16AddOutOfRange() {
+        for (int i = 0; i < LEN; ++i) {
+            expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_NON_IMM8.floatValue());
+        }
+        Verify.checkEQWithRawBits(output, expected);
+    }
+}
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@ -2896,6 +2896,16 @@ public class IRNode {
        vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG);
    }

+    public static final String REPLICATE_HF = PREFIX + "REPLICATE_HF" + POSTFIX;
+    static {
+        machOnlyNameRegex(REPLICATE_HF, "replicateHF");
+    }
+
+    public static final String REPLICATE_HF_IMM8 = PREFIX + "REPLICATE_HF_IMM8" + POSTFIX;
+    static {
+        machOnlyNameRegex(REPLICATE_HF_IMM8, "replicateHF_imm8_gt128b");
+    }
+
    /*
     * Utility methods to set up IR_NODE_MAPPINGS.
     */