8359419: AArch64: Relax min vector length to 32-bit for short vectors

Reviewed-by: aph, fgao, bkilambi, dlunden
2026-01-28 12:09:14 +00:00 · 2025-07-22 09:06:02 +00:00 · 2025-07-22 09:06:02 +00:00 · ac141c2fa1
commit ac141c2fa1
parent ed70910b0f
10 changed files with 350 additions and 128 deletions
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -2362,17 +2362,34 @@ int Matcher::max_vector_size(const BasicType bt) {
 }

 int Matcher::min_vector_size(const BasicType bt) {
-  int max_size = max_vector_size(bt);
-  // Limit the min vector size to 8 bytes.
-  int size = 8 / type2aelembytes(bt);
-  if (bt == T_BYTE) {
-    // To support vector api shuffle/rearrange.
-    size = 4;
-  } else if (bt == T_BOOLEAN) {
-    // To support vector api load/store mask.
-    size = 2;
+  // Usually, the shortest vector length supported by AArch64 ISA and
+  // Vector API species is 64 bits. However, we allow 32-bit or 16-bit
+  // vectors in a few special cases.
+  int size;
+  switch(bt) {
+    case T_BOOLEAN:
+      // Load/store a vector mask with only 2 elements for vector types
+      // such as "2I/2F/2L/2D".
+      size = 2;
+      break;
+    case T_BYTE:
+      // Generate a "4B" vector, to support vector cast between "8B/16B"
+      // and "4S/4I/4L/4F/4D".
+      size = 4;
+      break;
+    case T_SHORT:
+      // Generate a "2S" vector, to support vector cast between "4S/8S"
+      // and "2I/2L/2F/2D".
+      size = 2;
+      break;
+    default:
+      // Limit the min vector length to 64-bit.
+      size = 8 / type2aelembytes(bt);
+      // The number of elements in a vector should be at least 2.
+      size = MAX2(size, 2);
  }
-  if (size < 2) size = 2;
+
+  int max_size = max_vector_size(bt);
  return MIN2(size, max_size);
 }

--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -131,7 +131,7 @@ source %{
      // These operations are not profitable to be vectorized on NEON, because no direct
      // NEON instructions support them. But the match rule support for them is profitable for
      // Vector API intrinsics.
-      if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
+      if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
          (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
          (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
          (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
@@ -189,6 +189,18 @@ source %{
          return false;
        }
        break;
+      case Op_AddReductionVI:
+      case Op_AndReductionV:
+      case Op_OrReductionV:
+      case Op_XorReductionV:
+      case Op_MinReductionV:
+      case Op_MaxReductionV:
+        // Reductions with less than 8 bytes vector length are
+        // not supported.
+        if (length_in_bytes < 8) {
+          return false;
+        }
+        break;
      case Op_MulReductionVD:
      case Op_MulReductionVF:
      case Op_MulReductionVI:
@@ -4244,8 +4256,8 @@ instruct vzeroExtStoX(vReg dst, vReg src) %{
    assert(bt == T_INT || bt == T_LONG, "must be");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
+      // 2S to 2I/2L, 4S to 4I
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                            $src$$FloatRegister, T_SHORT, /* is_unsigned */ true);
    } else {
      assert(UseSVE > 0, "must be sve");
@@ -4265,11 +4277,11 @@ instruct vzeroExtItoX(vReg dst, vReg src) %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 2I to 2L
-      __ neon_vector_extend($dst$$FloatRegister, T_LONG, length_in_bytes,
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                            $src$$FloatRegister, T_INT, /* is_unsigned */ true);
    } else {
      assert(UseSVE > 0, "must be sve");
-      __ sve_vector_extend($dst$$FloatRegister, __ D,
+      __ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                           $src$$FloatRegister, __ S, /* is_unsigned */ true);
    }
  %}
@@ -4343,11 +4355,15 @@ instruct vcvtStoX_extend(vReg dst, vReg src) %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I/4F
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
-                            $src$$FloatRegister, T_SHORT);
-      if (bt == T_FLOAT) {
-        __ scvtfv(__ T4S, $dst$$FloatRegister, $dst$$FloatRegister);
+      if (is_floating_point_type(bt)) {
+        // 2S to 2F/2D, 4S to 4F
+        __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : T_LONG,
+                              length_in_bytes, $src$$FloatRegister, T_SHORT);
+        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $dst$$FloatRegister);
+      } else {
+        // 2S to 2I/2L, 4S to 4I
+        __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                              $src$$FloatRegister, T_SHORT);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
@@ -4371,7 +4387,7 @@ instruct vcvtItoX_narrow_neon(vReg dst, vReg src) %{
  effect(TEMP_DEF dst);
  format %{ "vcvtItoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 4I to 4B/4S
+    // 2I to 2S, 4I to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
@@ -4434,28 +4450,29 @@ instruct vcvtItoX(vReg dst, vReg src) %{

 // VectorCastL2X

-instruct vcvtLtoI_neon(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT &&
+instruct vcvtLtoX_narrow_neon(vReg dst, vReg src) %{
+  predicate((Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT) &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastL2X src));
-  format %{ "vcvtLtoI_neon $dst, $src" %}
+  format %{ "vcvtLtoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 2L to 2I
+    // 2L to 2S/2I
+    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ neon_vector_narrow($dst$$FloatRegister, T_INT,
+    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $src$$FloatRegister, T_LONG, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
 %}

-instruct vcvtLtoI_sve(vReg dst, vReg src, vReg tmp) %{
-  predicate((Matcher::vector_element_basic_type(n) == T_INT &&
-             !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1)))) ||
-            Matcher::vector_element_basic_type(n) == T_BYTE ||
-            Matcher::vector_element_basic_type(n) == T_SHORT);
+instruct vcvtLtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
+            !is_floating_point_type(Matcher::vector_element_basic_type(n)) &&
+            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
  match(Set dst (VectorCastL2X src));
  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vcvtLtoI_sve $dst, $src\t# KILL $tmp" %}
+  format %{ "vcvtLtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
@@ -4521,10 +4538,11 @@ instruct vcvtFtoX_narrow_neon(vReg dst, vReg src) %{
  effect(TEMP_DEF dst);
  format %{ "vcvtFtoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 4F to 4B/4S
+    // 2F to 2S, 4F to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ fcvtzs($dst$$FloatRegister, __ T4S, $src$$FloatRegister);
+    __ fcvtzs($dst$$FloatRegister, length_in_bytes == 16 ? __ T4S : __ T2S,
+              $src$$FloatRegister);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $dst$$FloatRegister, T_INT, length_in_bytes);
  %}
@@ -4590,12 +4608,14 @@ instruct vcvtFtoX(vReg dst, vReg src) %{
 // VectorCastD2X

 instruct vcvtDtoI_neon(vReg dst, vReg src) %{
-  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_INT);
+  predicate(UseSVE == 0 &&
+            (Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT));
  match(Set dst (VectorCastD2X src));
  effect(TEMP_DEF dst);
-  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2I" %}
+  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2S/2I" %}
  ins_encode %{
-    // 2D to 2I
+    // 2D to 2S/2I
    __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, 1);
    // We can't use fcvtzs(vector, integer) instruction here because we need
    // saturation arithmetic. See JDK-8276151.
@@ -4603,6 +4623,10 @@ instruct vcvtDtoI_neon(vReg dst, vReg src) %{
    __ fcvtzdw(rscratch2, $dst$$FloatRegister);
    __ fmovs($dst$$FloatRegister, rscratch1);
    __ mov($dst$$FloatRegister, __ S, 1, rscratch2);
+    if (Matcher::vector_element_basic_type(this) == T_SHORT) {
+      __ neon_vector_narrow($dst$$FloatRegister, T_SHORT,
+                            $dst$$FloatRegister, T_INT, 8);
+    }
  %}
  ins_pipe(pipe_slow);
 %}
@@ -4676,7 +4700,7 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4HF to 4F
+      // 2HF to 2F, 4HF to 4F
      __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
    } else {
      assert(UseSVE > 0, "must be sve");
@@ -4692,9 +4716,9 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
-  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
+  format %{ "vcvtFtoHF_neon $dst, $src\t# 2F/4F to 2HF/4HF" %}
  ins_encode %{
-    // 4F to 4HF
+    // 2F to 2HF, 4F to 4HF
    __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
  %}
  ins_pipe(pipe_slow);
@@ -6396,14 +6420,12 @@ instruct vpopcountI(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT, "unsupported");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
-        __ cnt($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-               $src$$FloatRegister);
-        __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-                  $dst$$FloatRegister);
+        assert(length_in_bytes <= 16, "unsupported");
+        bool isQ = length_in_bytes == 16;
+        __ cnt($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $src$$FloatRegister);
+        __ uaddlp($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $dst$$FloatRegister);
        if (bt == T_INT) {
-          __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T8H : __ T4H,
-                    $dst$$FloatRegister);
+          __ uaddlp($dst$$FloatRegister, isQ ? __ T8H : __ T4H, $dst$$FloatRegister);
        }
      } else {
        __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt),
@@ -6465,7 +6487,7 @@ instruct vblend_neon(vReg dst, vReg src1, vReg src2) %{
  format %{ "vblend_neon $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    assert(length_in_bytes <= 16, "must be");
    __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
           $src2$$FloatRegister, $src1$$FloatRegister);
  %}
@@ -6852,7 +6874,7 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
        if (bt != T_LONG) {
@@ -6911,7 +6933,7 @@ instruct vreverse(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
      } else {
@@ -6947,7 +6969,7 @@ instruct vreverseBytes(vReg dst, vReg src) %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+      assert(length_in_bytes <= 16, "unsupported");
      if (bt == T_BYTE) {
        if ($dst$$FloatRegister != $src$$FloatRegister) {
          __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -121,7 +121,7 @@ source %{
      // These operations are not profitable to be vectorized on NEON, because no direct
      // NEON instructions support them. But the match rule support for them is profitable for
      // Vector API intrinsics.
-      if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
+      if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
          (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
          (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
          (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
@@ -179,6 +179,18 @@ source %{
          return false;
        }
        break;
+      case Op_AddReductionVI:
+      case Op_AndReductionV:
+      case Op_OrReductionV:
+      case Op_XorReductionV:
+      case Op_MinReductionV:
+      case Op_MaxReductionV:
+        // Reductions with less than 8 bytes vector length are
+        // not supported.
+        if (length_in_bytes < 8) {
+          return false;
+        }
+        break;
      case Op_MulReductionVD:
      case Op_MulReductionVF:
      case Op_MulReductionVI:
@@ -2502,31 +2514,31 @@ instruct reinterpret_resize_gt128b(vReg dst, vReg src, pReg ptmp, rFlagsReg cr)
 %}

 // ---------------------------- Vector zero extend --------------------------------
-dnl VECTOR_ZERO_EXTEND($1,      $2,     $3,      $4,       $5        $6,        $7,         )
-dnl VECTOR_ZERO_EXTEND(op_name, dst_bt, src_bt,  dst_size, src_size, assertion, neon_comment)
+dnl VECTOR_ZERO_EXTEND($1,      $2,     $3,       $4,        $5,         )
+dnl VECTOR_ZERO_EXTEND(op_name, src_bt, src_size, assertion, neon_comment)
 define(`VECTOR_ZERO_EXTEND', `
 instruct vzeroExt$1toX(vReg dst, vReg src) %{
  match(Set dst (VectorUCast`$1'2X src));
  format %{ "vzeroExt$1toX $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
-    assert($6, "must be");
+    assert($4, "must be");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // $7
-      __ neon_vector_extend($dst$$FloatRegister, $2, length_in_bytes,
-                            $src$$FloatRegister, $3, /* is_unsigned */ true);
+      // $5
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                            $src$$FloatRegister, $2, /* is_unsigned */ true);
    } else {
      assert(UseSVE > 0, "must be sve");
-      __ sve_vector_extend($dst$$FloatRegister, __ $4,
-                           $src$$FloatRegister, __ $5, /* is_unsigned */ true);
+      __ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
+                           $src$$FloatRegister, __ $3, /* is_unsigned */ true);
    }
  %}
  ins_pipe(pipe_slow);
 %}')dnl
-VECTOR_ZERO_EXTEND(B, bt,     T_BYTE,  elemType_to_regVariant(bt), B, bt == T_SHORT || bt == T_INT || bt == T_LONG, `4B to 4S/4I, 8B to 8S')
-VECTOR_ZERO_EXTEND(S, T_INT,  T_SHORT, elemType_to_regVariant(bt), H, bt == T_INT || bt == T_LONG,                  `4S to 4I')
-VECTOR_ZERO_EXTEND(I, T_LONG, T_INT,   D,                          S, bt == T_LONG,                                 `2I to 2L')
+VECTOR_ZERO_EXTEND(B, T_BYTE,  B, bt == T_SHORT || bt == T_INT || bt == T_LONG, `4B to 4S/4I, 8B to 8S')
+VECTOR_ZERO_EXTEND(S, T_SHORT, H, bt == T_INT || bt == T_LONG,                  `2S to 2I/2L, 4S to 4I')
+VECTOR_ZERO_EXTEND(I, T_INT,   S, bt == T_LONG,                                 `2I to 2L')

 // ------------------------------ Vector cast ----------------------------------

@@ -2595,11 +2607,15 @@ instruct vcvtStoX_extend(vReg dst, vReg src) %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I/4F
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
-                            $src$$FloatRegister, T_SHORT);
-      if (bt == T_FLOAT) {
-        __ scvtfv(__ T4S, $dst$$FloatRegister, $dst$$FloatRegister);
+      if (is_floating_point_type(bt)) {
+        // 2S to 2F/2D, 4S to 4F
+        __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : T_LONG,
+                              length_in_bytes, $src$$FloatRegister, T_SHORT);
+        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $dst$$FloatRegister);
+      } else {
+        // 2S to 2I/2L, 4S to 4I
+        __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                              $src$$FloatRegister, T_SHORT);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
@@ -2623,7 +2639,7 @@ instruct vcvtItoX_narrow_neon(vReg dst, vReg src) %{
  effect(TEMP_DEF dst);
  format %{ "vcvtItoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 4I to 4B/4S
+    // 2I to 2S, 4I to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
@@ -2686,28 +2702,29 @@ instruct vcvtItoX(vReg dst, vReg src) %{

 // VectorCastL2X

-instruct vcvtLtoI_neon(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT &&
+instruct vcvtLtoX_narrow_neon(vReg dst, vReg src) %{
+  predicate((Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT) &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastL2X src));
-  format %{ "vcvtLtoI_neon $dst, $src" %}
+  format %{ "vcvtLtoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 2L to 2I
+    // 2L to 2S/2I
+    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ neon_vector_narrow($dst$$FloatRegister, T_INT,
+    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $src$$FloatRegister, T_LONG, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
 %}

-instruct vcvtLtoI_sve(vReg dst, vReg src, vReg tmp) %{
-  predicate((Matcher::vector_element_basic_type(n) == T_INT &&
-             !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1)))) ||
-            Matcher::vector_element_basic_type(n) == T_BYTE ||
-            Matcher::vector_element_basic_type(n) == T_SHORT);
+instruct vcvtLtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
+            !is_floating_point_type(Matcher::vector_element_basic_type(n)) &&
+            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
  match(Set dst (VectorCastL2X src));
  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vcvtLtoI_sve $dst, $src\t# KILL $tmp" %}
+  format %{ "vcvtLtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
@@ -2773,10 +2790,11 @@ instruct vcvtFtoX_narrow_neon(vReg dst, vReg src) %{
  effect(TEMP_DEF dst);
  format %{ "vcvtFtoX_narrow_neon $dst, $src" %}
  ins_encode %{
-    // 4F to 4B/4S
+    // 2F to 2S, 4F to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ fcvtzs($dst$$FloatRegister, __ T4S, $src$$FloatRegister);
+    __ fcvtzs($dst$$FloatRegister, length_in_bytes == 16 ? __ T4S : __ T2S,
+              $src$$FloatRegister);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $dst$$FloatRegister, T_INT, length_in_bytes);
  %}
@@ -2842,12 +2860,14 @@ instruct vcvtFtoX(vReg dst, vReg src) %{
 // VectorCastD2X

 instruct vcvtDtoI_neon(vReg dst, vReg src) %{
-  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_INT);
+  predicate(UseSVE == 0 &&
+            (Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT));
  match(Set dst (VectorCastD2X src));
  effect(TEMP_DEF dst);
-  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2I" %}
+  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2S/2I" %}
  ins_encode %{
-    // 2D to 2I
+    // 2D to 2S/2I
    __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, 1);
    // We can't use fcvtzs(vector, integer) instruction here because we need
    // saturation arithmetic. See JDK-8276151.
@@ -2855,6 +2875,10 @@ instruct vcvtDtoI_neon(vReg dst, vReg src) %{
    __ fcvtzdw(rscratch2, $dst$$FloatRegister);
    __ fmovs($dst$$FloatRegister, rscratch1);
    __ mov($dst$$FloatRegister, __ S, 1, rscratch2);
+    if (Matcher::vector_element_basic_type(this) == T_SHORT) {
+      __ neon_vector_narrow($dst$$FloatRegister, T_SHORT,
+                            $dst$$FloatRegister, T_INT, 8);
+    }
  %}
  ins_pipe(pipe_slow);
 %}
@@ -2928,7 +2952,7 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4HF to 4F
+      // 2HF to 2F, 4HF to 4F
      __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
    } else {
      assert(UseSVE > 0, "must be sve");
@@ -2944,9 +2968,9 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
-  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
+  format %{ "vcvtFtoHF_neon $dst, $src\t# 2F/4F to 2HF/4HF" %}
  ins_encode %{
-    // 4F to 4HF
+    // 2F to 2HF, 4F to 4HF
    __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
  %}
  ins_pipe(pipe_slow);
@@ -4417,14 +4441,12 @@ instruct vpopcountI(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT, "unsupported");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
-        __ cnt($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-               $src$$FloatRegister);
-        __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-                  $dst$$FloatRegister);
+        assert(length_in_bytes <= 16, "unsupported");
+        bool isQ = length_in_bytes == 16;
+        __ cnt($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $src$$FloatRegister);
+        __ uaddlp($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $dst$$FloatRegister);
        if (bt == T_INT) {
-          __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T8H : __ T4H,
-                    $dst$$FloatRegister);
+          __ uaddlp($dst$$FloatRegister, isQ ? __ T8H : __ T4H, $dst$$FloatRegister);
        }
      } else {
        __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt),
@@ -4475,7 +4497,7 @@ instruct vblend_neon(vReg dst, vReg src1, vReg src2) %{
  format %{ "vblend_neon $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    assert(length_in_bytes <= 16, "must be");
    __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
           $src2$$FloatRegister, $src1$$FloatRegister);
  %}
@@ -4851,7 +4873,7 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
        if (bt != T_LONG) {
@@ -4910,7 +4932,7 @@ instruct vreverse(vReg dst, vReg src) %{
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
      } else {
@@ -4935,7 +4957,7 @@ instruct vreverseBytes(vReg dst, vReg src) %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+      assert(length_in_bytes <= 16, "unsupported");
      if (bt == T_BYTE) {
        if ($dst$$FloatRegister != $src$$FloatRegister) {
          __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -1778,19 +1778,21 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
-    if (dst_bt == T_SHORT) {
-      // 4B/8B to 4S/8S
-      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
-    } else {
-      // 4B to 4I
-      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
-      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
+    // 4B to 4S/4I, 8B to 8S
+    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
+    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
+    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
+    if (dst_bt == T_INT) {
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
-    // 4S to 4I
-    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
+    // 2S to 2I/2L, 4S to 4I
+    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
+    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
+    if (dst_bt == T_LONG) {
+      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
+    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
@@ -1810,18 +1812,21 @@ void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
-    // 4I to 4B/4S
-    assert(src_vlen_in_bytes == 16, "unsupported");
+    // 2I to 2S, 4I to 4B/4S
+    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
-    // 2L to 2I
+    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
-    assert(dst_bt == T_INT, "unsupported");
+    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
+    if (dst_bt == T_SHORT) {
+      xtn(dst, T4H, dst, T4S);
+    }
  } else {
    ShouldNotReachHere();
  }
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -597,8 +597,7 @@ public class TestDependencyOffsets {
            case "byte"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 4 )};
            case "char"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4,       4 ),
                                                     new CPUMinVectorWidth(ASIMD,      8 )};
-            case "short"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4,       4 ),
-                                                     new CPUMinVectorWidth(ASIMD,      8 )};
+            case "short"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 4 )};
            case "int"    -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 8 )};
            case "long"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 16)};
            case "float"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 8 )};
--- a/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java
+++ b/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -649,18 +649,25 @@ public class TestCastMethods {
            makePair(SSPEC128, BSPEC64),
            makePair(SSPEC256, BSPEC128),
            makePair(SSPEC512, BSPEC256),
+            makePair(SSPEC64, ISPEC64),
            makePair(SSPEC64, ISPEC128),
            makePair(SSPEC128, ISPEC256),
            makePair(SSPEC256, ISPEC512),
+            makePair(SSPEC64, LSPEC128),
            makePair(SSPEC64, LSPEC256),
+            makePair(SSPEC128, LSPEC128),
            makePair(SSPEC128, LSPEC512),
+            makePair(SSPEC64, FSPEC64),
            makePair(SSPEC64, FSPEC128),
            makePair(SSPEC128, FSPEC256),
            makePair(SSPEC256, FSPEC512),
+            makePair(SSPEC64, DSPEC128),
            makePair(SSPEC64, DSPEC256),
+            makePair(SSPEC128, DSPEC128),
            makePair(SSPEC128, DSPEC512),
            makePair(ISPEC256, BSPEC64),
            makePair(ISPEC512, BSPEC128),
+            makePair(ISPEC64,  SSPEC64),
            makePair(ISPEC128, SSPEC64),
            makePair(ISPEC256, SSPEC128),
            makePair(ISPEC512, SSPEC256),
@ -675,7 +682,9 @@ public class TestCastMethods {
            makePair(ISPEC128, DSPEC256),
            makePair(ISPEC256, DSPEC512),
            makePair(LSPEC512, BSPEC64),
+            makePair(LSPEC128, SSPEC64),
            makePair(LSPEC256, SSPEC64),
+            makePair(LSPEC128, SSPEC128),
            makePair(LSPEC512, SSPEC128),
            makePair(LSPEC128, ISPEC64),
            makePair(LSPEC256, ISPEC128),
@ -688,6 +697,7 @@ public class TestCastMethods {
            makePair(LSPEC512, DSPEC512),
            makePair(FSPEC256, BSPEC64),
            makePair(FSPEC512, BSPEC128),
+            makePair(FSPEC64,  SSPEC64),
            makePair(FSPEC128, SSPEC64),
            makePair(FSPEC256, SSPEC128),
            makePair(FSPEC512, SSPEC256),
@ -702,7 +712,9 @@ public class TestCastMethods {
            makePair(FSPEC128, DSPEC256),
            makePair(FSPEC256, DSPEC512),
            makePair(DSPEC512, BSPEC64),
+            makePair(DSPEC128, SSPEC64),
            makePair(DSPEC256, SSPEC64),
+            makePair(DSPEC128, SSPEC128),
            makePair(DSPEC512, SSPEC128),
            makePair(DSPEC128, ISPEC64),
            makePair(DSPEC256, ISPEC128),
@ -751,14 +763,17 @@ public class TestCastMethods {
            makePair(BSPEC512, LSPEC256, true),
            makePair(BSPEC512, LSPEC512, true),

+            makePair(SSPEC64, ISPEC64, true),
            makePair(SSPEC64, ISPEC128, true),
            makePair(SSPEC64, ISPEC256, true),
            makePair(SSPEC64, ISPEC512, true),
+            makePair(SSPEC64, LSPEC128, true),
            makePair(SSPEC64, LSPEC256, true),
            makePair(SSPEC64, LSPEC512, true),
            makePair(SSPEC128, ISPEC128, true),
            makePair(SSPEC128, ISPEC256, true),
            makePair(SSPEC128, ISPEC512, true),
+            makePair(SSPEC128, LSPEC128, true),
            makePair(SSPEC128, LSPEC256, true),
            makePair(SSPEC128, LSPEC512, true),
            makePair(SSPEC256, ISPEC128, true),
@ -789,23 +804,35 @@ public class TestCastMethods {
            makePair(BSPEC64, FSPEC128),
            makePair(SSPEC64, BSPEC64),
            makePair(SSPEC128, BSPEC64),
+            makePair(SSPEC64, ISPEC64),
            makePair(SSPEC64, ISPEC128),
+            makePair(SSPEC64,  LSPEC128),
+            makePair(SSPEC128, LSPEC128),
+            makePair(SSPEC64, FSPEC64),
            makePair(SSPEC64, FSPEC128),
+            makePair(SSPEC64,  DSPEC128),
+            makePair(SSPEC128, DSPEC128),
            makePair(ISPEC128, BSPEC64),
            makePair(ISPEC128, SSPEC64),
-            makePair(ISPEC64, LSPEC128),
+            makePair(ISPEC64,  SSPEC64),
+            makePair(ISPEC64,  LSPEC128),
            makePair(ISPEC64, FSPEC64),
            makePair(ISPEC128, FSPEC128),
            makePair(ISPEC64, DSPEC128),
+            makePair(LSPEC128, SSPEC64),
+            makePair(LSPEC128, SSPEC128),
            makePair(LSPEC128, ISPEC64),
            makePair(LSPEC128, FSPEC64),
            makePair(LSPEC128, DSPEC128),
            makePair(FSPEC128, BSPEC64),
+            makePair(FSPEC64, SSPEC64),
            makePair(FSPEC128, SSPEC64),
            makePair(FSPEC64, ISPEC64),
            makePair(FSPEC128, ISPEC128),
            makePair(FSPEC64, LSPEC128),
            makePair(FSPEC64, DSPEC128),
+            makePair(DSPEC128, SSPEC64),
+            makePair(DSPEC128, SSPEC128),
            makePair(DSPEC128, ISPEC64),
            makePair(DSPEC128, LSPEC128),
            makePair(DSPEC128, FSPEC64),
@ -816,8 +843,11 @@ public class TestCastMethods {
            makePair(BSPEC128, SSPEC64, true),
            makePair(BSPEC128, SSPEC128, true),
            makePair(BSPEC128, ISPEC128, true),
+            makePair(SSPEC64, ISPEC64, true),
            makePair(SSPEC64, ISPEC128, true),
+            makePair(SSPEC64, LSPEC128, true),
            makePair(SSPEC128, ISPEC128, true),
+            makePair(SSPEC128, LSPEC128, true),
            makePair(ISPEC64, LSPEC128, true)
    );
 }
--- a/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -84,10 +84,13 @@ public class TestFloatConversionsVector {
    }

    @Test
+    @IR(counts = {IRNode.VECTOR_CAST_F2HF, IRNode.VECTOR_SIZE_2, "> 0"},
+        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
+        applyIfCPUFeature = {"asimd", "true"})
    public void test_float_float16_short_vector(short[] sout, float[] finp) {
-        for (int i = 0; i < finp.length; i+= 4) {
-            sout[i+0] = Float.floatToFloat16(finp[i+0]);
-            sout[i+1] = Float.floatToFloat16(finp[i+1]);
+        for (int i = 0; i < finp.length; i += 4) {
+            sout[i] = Float.floatToFloat16(finp[i]);
+            sout[i + 1] = Float.floatToFloat16(finp[i + 1]);
        }
    }

@ -124,8 +127,9 @@ public class TestFloatConversionsVector {
        }

        // Verifying the result
-        for (int i = 0; i < ARRLEN; i++) {
+        for (int i = 0; i < ARRLEN; i += 4) {
            Asserts.assertEquals(Float.floatToFloat16(finp[i]), sout[i]);
+            Asserts.assertEquals(Float.floatToFloat16(finp[i + 1]), sout[i + 1]);
        }
    }

@ -152,7 +156,19 @@ public class TestFloatConversionsVector {
        }
    }

-    @Run(test = {"test_float16_float", "test_float16_float_strided"}, mode = RunMode.STANDALONE)
+    @Test
+    @IR(counts = {IRNode.VECTOR_CAST_HF2F, IRNode.VECTOR_SIZE_2, "> 0"},
+        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
+        applyIfCPUFeature = {"asimd", "true"})
+    public void test_float16_float_short_vector(float[] fout, short[] sinp) {
+        for (int i = 0; i < sinp.length; i += 4) {
+            fout[i] = Float.float16ToFloat(sinp[i]);
+            fout[i + 1] = Float.float16ToFloat(sinp[i + 1]);
+        }
+    }
+
+    @Run(test = {"test_float16_float", "test_float16_float_strided",
+                 "test_float16_float_short_vector"}, mode = RunMode.STANDALONE)
    public void kernel_test_float16_float() {
        sinp = new short[ARRLEN];
        fout = new float[ARRLEN];
@ -178,5 +194,15 @@ public class TestFloatConversionsVector {
        for (int i = 0; i < ARRLEN/2; i++) {
            Asserts.assertEquals(Float.float16ToFloat(sinp[i*2]), fout[i*2]);
        }
+
+        for (int i = 0; i < ITERS; i++) {
+            test_float16_float_short_vector(fout, sinp);
+        }
+
+        // Verifying the result
+        for (int i = 0; i < ARRLEN; i += 4) {
+            Asserts.assertEquals(Float.float16ToFloat(sinp[i]), fout[i]);
+            Asserts.assertEquals(Float.float16ToFloat(sinp[i + 1]), fout[i + 1]);
+        }
    }
 }
--- a/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java
+++ b/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java
@ -1,6 +1,6 @@
 /*
 * Copyright (c) 2022, 2023, Arm Limited. All rights reserved.
- * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -252,9 +252,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
    }

    @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx2", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
        applyIf = {"MaxVectorSize", ">=32"},
        counts = {IRNode.VECTOR_CAST_S2D, IRNode.VECTOR_SIZE + "min(max_short, max_double)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_S2D, IRNode.VECTOR_SIZE + "min(max_short, max_double)", ">0"})
    public double[] convertShortToDouble() {
        double[] res = new double[SIZE];
        for (int i = 0; i < SIZE; i++) {
@ -374,9 +377,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
    }

    @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
        applyIf = {"MaxVectorSize", ">=32"},
        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_short)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_short)", ">0"})
    public short[] convertDoubleToShort() {
        short[] res = new short[SIZE];
        for (int i = 0; i < SIZE; i++) {
@ -386,9 +392,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
    }

    @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
        applyIf = {"MaxVectorSize", ">=32"},
        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_char)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_char)", ">0"})
    public char[] convertDoubleToChar() {
        char[] res = new char[SIZE];
        for (int i = 0; i < SIZE; i++) {
--- a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java
+++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java
@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
+ *  Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
 *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 *  This code is free software; you can redistribute it and/or modify it
@ -120,6 +120,18 @@ public class VectorFPtoIntCastOperations {
        }
    }

+    @Benchmark
+    public void microFloat64ToShort64() {
+        VectorSpecies<Float> ISPECIES = FloatVector.SPECIES_64;
+        VectorSpecies<Short> OSPECIES = ShortVector.SPECIES_64;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) {
+            FloatVector.fromArray(ISPECIES, float_arr, i)
+                .convertShape(VectorOperators.F2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
    @Benchmark
    public void microFloat128ToShort128() {
        VectorSpecies<Float> ISPECIES = FloatVector.SPECIES_128;
--- a/test/micro/org/openjdk/bench/vm/compiler/VectorTwoShorts.java
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorTwoShorts.java
@ -0,0 +1,96 @
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+/**
+ * JMH micro-benchmarks running simple element-wise loops over {@code short}
+ * arrays, each storing its result three elements ahead of the index it loads.
+ * NOTE(review): the "Vec2S" method suffix suggests these loops are meant to
+ * exercise 2-element short vectors -- confirm against the intended C2 behavior.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 4, time = 2, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 4, time = 2, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 3)
+public class VectorTwoShorts {
+    // Length of all three arrays.
+    @Param({"64", "128", "512", "1024"})
+    public int LEN;
+
+    private short[] sA;
+    private short[] sB;
+    private short[] sC;
+
+    // Seed for the input data generator.
+    @Param("0")
+    private int seed;
+    // Constructed in init() rather than in a field initializer: JMH injects
+    // @Param values after the state object is constructed, so an initializer
+    // would always read seed as 0 and silently ignore the parameter.
+    private Random r;
+
+    /** Allocates the three arrays and fills sA and sB from a Random seeded with {@code seed}. */
+    @Setup
+    public void init() {
+        r = new Random(seed);
+        sA = new short[LEN];
+        sB = new short[LEN];
+        sC = new short[LEN];
+
+        for (int i = 0; i < LEN; i++) {
+            sA[i] = (short) r.nextInt();
+            sB[i] = (short) r.nextInt();
+        }
+    }
+
+    /** Stores {@code (short) (sA[i] + sB[i])} into {@code sC[i + 3]} for every i below LEN - 3. */
+    @Benchmark
+    public void addVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = (short) (sA[i] + sB[i]);
+        }
+    }
+
+    /** Stores {@code (short) (sA[i] * sB[i])} into {@code sC[i + 3]} for every i below LEN - 3. */
+    @Benchmark
+    public void mulVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = (short) (sA[i] * sB[i]);
+        }
+    }
+
+    /** Stores the byte-swapped value of {@code sA[i]} into {@code sC[i + 3]} for every i below LEN - 3. */
+    @Benchmark
+    public void reverseBytesVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = (short) Short.reverseBytes(sA[i]);
+        }
+    }
+}