diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 9697ac31350..33466453b76 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -4412,10 +4412,9 @@ operand immI8()
 %}
 
 // 8 bit signed value (simm8), or #simm8 LSL 8.
-operand immI8_shift8()
+operand immIDupV()
 %{
-  predicate((n->get_int() <= 127 && n->get_int() >= -128) ||
-            (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0));
+  predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->get_int()));
   match(ConI);
 
   op_cost(0);
@@ -4424,10 +4423,9 @@ operand immI8_shift8()
 %}
 
 // 8 bit signed value (simm8), or #simm8 LSL 8.
-operand immL8_shift8()
+operand immLDupV()
 %{
-  predicate((n->get_long() <= 127 && n->get_long() >= -128) ||
-            (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0));
+  predicate(Assembler::operand_valid_for_sve_dup_immediate(n->get_long()));
   match(ConL);
 
   op_cost(0);
@@ -4435,6 +4433,17 @@ operand immL8_shift8()
   interface(CONST_INTER);
 %}
 
+// ConH constant whose geth() value is an 8 bit signed value (simm8), or #simm8 LSL 8.
+operand immHDupV()
+%{
+  predicate(Assembler::operand_valid_for_sve_dup_immediate((int64_t)n->geth()));
+  match(ConH);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // 8 bit integer valid for vector add sub immediate
 operand immBAddSubV()
 %{
@@ -7077,18 +7086,16 @@ instruct loadConD(vRegD dst, immD con) %{
 %}
 
 // Load Half Float Constant
-// The "ldr" instruction loads a 32-bit word from the constant pool into a
-// 32-bit register but only the bottom half will be populated and the top
-// 16 bits are zero.
 instruct loadConH(vRegF dst, immH con) %{
   match(Set dst con);
-  format %{
-    "ldrs $dst, [$constantaddress]\t# load from constant table: half float=$con\n\t"
-  %}
+  format %{ "mov    rscratch1, $con\n\t"
+            "fmov   $dst, rscratch1"
+         %}
   ins_encode %{
-    __ ldrs(as_FloatRegister($dst$$reg), $constantaddress($con));
+    __ movw(rscratch1, (uint32_t)$con$$constant);
+    __ fmovs($dst$$FloatRegister, rscratch1);
   %}
-  ins_pipe(fp_load_constant_s);
+  ins_pipe(pipe_class_default);
 %}
 
 // Store Instructions
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
index 58300992c2a..67c4dad27a7 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -4875,7 +4875,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
+instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16 &&
             (Matcher::vector_element_basic_type(n) == T_SHORT ||
              Matcher::vector_element_basic_type(n) == T_INT));
@@ -4898,7 +4898,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
+instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16);
   match(Set dst (Replicate con));
   format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@@ -4909,19 +4909,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
   ins_pipe(pipe_slow);
 %}
 
-// Replicate a 16-bit half precision float value
-instruct replicateHF_imm(vReg dst, immH con) %{
+// Replicate an immediate 16-bit half precision float value
+instruct replicateHF_imm_le128b(vReg dst, immH con) %{
+  predicate(Matcher::vector_length_in_bytes(n) <= 16);
   match(Set dst (Replicate con));
-  format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
+  format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
   ins_encode %{
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     int imm = (int)($con$$constant) & 0xffff;
-    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      __ mov($dst$$FloatRegister, get_arrangement(this), imm);
-    } else { // length_in_bytes must be > 16 and SVE should be enabled
-      assert(UseSVE > 0, "must be sve");
-      __ sve_dup($dst$$FloatRegister, __ H, imm);
-    }
+    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Replicate a 16-bit half precision float which is within the limits
+// for the operand - immHDupV
+instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
+  predicate(Matcher::vector_length_in_bytes(n) > 16);
+  match(Set dst (Replicate con));
+  format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
   %}
   ins_pipe(pipe_slow);
 %}
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
index 4d91e04dc21..28f91204ec3 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -3107,7 +3107,7 @@ instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct replicateI_imm8_gt128b(vReg dst, immI8_shift8 con) %{
+instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16 &&
             (Matcher::vector_element_basic_type(n) == T_SHORT ||
              Matcher::vector_element_basic_type(n) == T_INT));
@@ -3130,7 +3130,7 @@ instruct replicateL_imm_128b(vReg dst, immL con) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
+instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16);
   match(Set dst (Replicate con));
   format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
@@ -3141,19 +3141,27 @@ instruct replicateL_imm8_gt128b(vReg dst, immL8_shift8 con) %{
   ins_pipe(pipe_slow);
 %}
 
-// Replicate a 16-bit half precision float value
-instruct replicateHF_imm(vReg dst, immH con) %{
+// Replicate an immediate 16-bit half precision float value
+instruct replicateHF_imm_le128b(vReg dst, immH con) %{
+  predicate(Matcher::vector_length_in_bytes(n) <= 16);
   match(Set dst (Replicate con));
-  format %{ "replicateHF_imm $dst, $con\t# replicate immediate half-precision float" %}
+  format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
   ins_encode %{
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     int imm = (int)($con$$constant) & 0xffff;
-    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      __ mov($dst$$FloatRegister, get_arrangement(this), imm);
-    } else { // length_in_bytes must be > 16 and SVE should be enabled
-      assert(UseSVE > 0, "must be sve");
-      __ sve_dup($dst$$FloatRegister, __ H, imm);
-    }
+    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Replicate a 16-bit half precision float which is within the limits
+// for the operand - immHDupV
+instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
+  predicate(Matcher::vector_length_in_bytes(n) > 16);
+  match(Set dst (Replicate con));
+  format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
   %}
   ins_pipe(pipe_slow);
 %}
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
index 5e5d6c16b45..fe1792ed1c6 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
@@ -434,6 +434,11 @@ int Assembler::operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement
   return -1;
 }
 
+bool Assembler::operand_valid_for_sve_dup_immediate(int64_t imm) {
+  return ((imm >= -128 && imm <= 127) ||                            // fits in a signed 8-bit value
+          (((imm & 0xff) == 0) && imm >= -32768 && imm <= 32512));  // low byte zero, within [-128 << 8, 127 << 8]
+}
+
 bool Assembler::operand_valid_for_sve_logical_immediate(unsigned elembits, uint64_t imm) {
   return encode_sve_logical_immediate(elembits, imm) != 0xffffffff;
 }
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 11d302e9026..4b0a0e77915 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -4324,6 +4324,7 @@ public:
   static bool operand_valid_for_sve_add_sub_immediate(int64_t imm);
   static bool operand_valid_for_float_immediate(double imm);
   static int  operand_valid_for_movi_immediate(uint64_t imm64, SIMD_Arrangement T);
+  static bool operand_valid_for_sve_dup_immediate(int64_t imm);
 
   void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
   void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java
new file mode 100644
index 00000000000..ab7808a0401
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestFloat16Replicate.java
@@ -0,0 +1,136 @@
+/* Copyright (c) 2025, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+* @test
+* @bug 8361582
+* @summary Ensure the correct backend replicate node is being generated for
+*          half precision float constants on >16B SVE machines
+* @modules jdk.incubator.vector
+* @library /test/lib /
+* @run main/othervm compiler.c2.aarch64.TestFloat16Replicate
+*/
+
+package compiler.c2.aarch64;
+
+import compiler.lib.ir_framework.*;
+import compiler.lib.verify.*;
+import java.util.Arrays;
+import java.util.Random;
+import jdk.incubator.vector.Float16;
+import jdk.test.lib.*;
+import jdk.test.lib.Utils;
+
+import static java.lang.Float.*;
+import static jdk.incubator.vector.Float16.*;
+
+public class TestFloat16Replicate {
+    private static short[] input;
+    private static short[] output;
+    private static short[] expected;
+    private static Random rnd;
+
+    // Choose FP16_IMM8 which is within the range of [-128 << 8, 127 << 8] and a multiple of 256
+    private static final Float16 FP16_IMM8;
+
+    // Choose a value in the range [-128 << 8, 127 << 8] and a non multiple of 256 for FP16_NON_IMM8
+    private static final Float16 FP16_NON_IMM8;
+
+    private static final int LEN = 1024;
+
+    public static void main(String args[]) {
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation");
+    }
+
+    static {
+        rnd = Utils.getRandomInstance();
+        int k = rnd.nextInt(-128, 128);
+        // (k << 8) + b must not fit the SVE dup simm8 form [-128, 127]: for k == 0 or k == -1
+        // a small offset would, so b is drawn from the half-range that stays outside it.
+        int b = (k == 0) ? rnd.nextInt(128, 256) : (k == -1) ? rnd.nextInt(1, 128) : rnd.nextInt(1, 256);
+        short bits_imm8     = (short) (k << 8);
+        short bits_non_imm8 = (short) ((k << 8) + b);
+        FP16_IMM8     = Float16.shortBitsToFloat16(bits_imm8);
+        FP16_NON_IMM8 = Float16.shortBitsToFloat16(bits_non_imm8);
+
+        input  = new short[LEN];
+        output = new short[LEN];
+        expected = new short[LEN];
+        for (int i = 0; i < LEN; i++) {
+            input[i] = (short) i;
+        }
+    }
+
+    // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
+    // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
+    // On SVE-capable hardware with vector length > 16B, if the FP16 immediate is a signed value within the
+    // range [-128, 127] or a signed multiple of 256 in the range [-32768, 32512] for element widths of
+    // 16 bits or higher then the backend should generate the "replicateHF_imm8_gt128b" machnode.
+    @Test
+    @Warmup(5000)
+    @IR(counts = {IRNode.REPLICATE_HF_IMM8, ">0"},
+        phase = CompilePhase.FINAL_CODE,
+        applyIf = {"MaxVectorSize", ">16"},
+        applyIfCPUFeature = {"sve", "true"})
+    public void TestFloat16AddInRange() {
+        for (int i = 0; i < LEN; ++i) {
+            output[i] = float16ToRawShortBits(Float16.add(shortBitsToFloat16(input[i]), FP16_IMM8));
+        }
+    }
+
+    @Check(test="TestFloat16AddInRange")
+    public void checkResultFloat16AddInRange() {
+        for (int i = 0; i < LEN; ++i) {
+            expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_IMM8.floatValue());
+        }
+        Verify.checkEQWithRawBits(output, expected);
+    }
+
+    // For vectorizable loops containing FP16 operations with an FP16 constant as one of the inputs, the IR
+    // node `(dst (Replicate con))` is generated to broadcast the constant into all lanes of an SVE register.
+    // On SVE-capable hardware with vector length > 16B, if the FP16 constant falls outside the immediate
+    // range accepted by the SVE "dup" instruction, the backend must:
+    //   1. Generate the "loadConH" machnode to materialize the FP16 constant in a floating-point register.
+    //   2. Emit the "replicateHF" machnode to broadcast this loaded constant into an SVE register.
+    // In this case, the backend should not generate the "replicateHF_imm8_gt128b" machnode.
+    @Test
+    @Warmup(5000)
+    @IR(counts = {IRNode.REPLICATE_HF, ">0"},
+        failOn = {IRNode.REPLICATE_HF_IMM8},
+        phase = CompilePhase.FINAL_CODE,
+        applyIf = {"MaxVectorSize", ">16"},
+        applyIfCPUFeature = {"sve", "true"})
+    public void TestFloat16AddOutOfRange() {
+        for (int i = 0; i < LEN; ++i) {
+            output[i] = float16ToRawShortBits(add(shortBitsToFloat16(input[i]), FP16_NON_IMM8));
+        }
+    }
+
+    @Check(test="TestFloat16AddOutOfRange")
+    public void checkResultFloat16AddOutOfRange() {
+        for (int i = 0; i < LEN; ++i) {
+            expected[i] = floatToFloat16(float16ToFloat(input[i]) + FP16_NON_IMM8.floatValue());
+        }
+        Verify.checkEQWithRawBits(output, expected);
+    }
+}
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 7fb1eeb800c..16c6d99a64f 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -2896,6 +2896,16 @@ public class IRNode {
         vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG);
     }
 
+    public static final String REPLICATE_HF = PREFIX + "REPLICATE_HF" + POSTFIX;
+    static {
+        machOnlyNameRegex(REPLICATE_HF, "replicateHF");
+    }
+
+    public static final String REPLICATE_HF_IMM8 = PREFIX + "REPLICATE_HF_IMM8" + POSTFIX;
+    static {
+        machOnlyNameRegex(REPLICATE_HF_IMM8, "replicateHF_imm8_gt128b");
+    }
+
     /*
      * Utility methods to set up IR_NODE_MAPPINGS.
      */