diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 681b14ab068..404ab8d9ba4 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -2362,17 +2362,34 @@ int Matcher::max_vector_size(const BasicType bt) {
 }
 
 int Matcher::min_vector_size(const BasicType bt) {
-  int max_size = max_vector_size(bt);
-  // Limit the min vector size to 8 bytes.
-  int size = 8 / type2aelembytes(bt);
-  if (bt == T_BYTE) {
-    // To support vector api shuffle/rearrange.
-    size = 4;
-  } else if (bt == T_BOOLEAN) {
-    // To support vector api load/store mask.
-    size = 2;
+  // Returns the minimum vector length, in elements, for element type "bt".
+  // The shortest vector supported by the AArch64 ISA and Vector API species
+  // is normally 64 bits; 32-bit or 16-bit vectors are allowed in a few cases.
+  int size;
+  switch(bt) {
+    case T_BOOLEAN:
+      // Allow a vector mask with only 2 elements, so that masks for
+      // 2-element vector types such as "2I/2F/2L/2D" can be loaded/stored.
+      size = 2;
+      break;
+    case T_BYTE:
+      // Allow a 32-bit "4B" vector, to support vector casts between
+      // "8B/16B" and "4S/4I/4L/4F/4D".
+      size = 4;
+      break;
+    case T_SHORT:
+      // Allow a 32-bit "2S" vector, to support vector casts between
+      // "4S/8S" and "2I/2L/2F/2D".
+      size = 2;
+      break;
+    default:
+      // All other types: limit the min vector length to 64 bits (8 bytes).
+      size = 8 / type2aelembytes(bt);
+      // A vector needs at least 2 elements, even when one element is 8 bytes.
+      size = MAX2(size, 2);
   }
-  if (size < 2) size = 2;
+
+  int max_size = max_vector_size(bt);
   return MIN2(size, max_size);
 }
 
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
index b4e6d79347f..1b6296ddd8b 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -131,7 +131,7 @@ source %{
       // These operations are not profitable to be vectorized on NEON, because no direct
       // NEON instructions support them. But the match rule support for them is profitable for
       // Vector API intrinsics.
-      if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
+      if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
           (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
           (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
@@ -189,6 +189,18 @@ source %{
           return false;
         }
         break;
+      case Op_AddReductionVI:
+      case Op_AndReductionV:
+      case Op_OrReductionV:
+      case Op_XorReductionV:
+      case Op_MinReductionV:
+      case Op_MaxReductionV:
+        // Reductions with less than 8 bytes vector length are
+        // not supported.
+        if (length_in_bytes < 8) {
+          return false;
+        }
+        break;
       case Op_MulReductionVD:
       case Op_MulReductionVF:
       case Op_MulReductionVI:
@@ -4244,8 +4256,8 @@ instruct vzeroExtStoX(vReg dst, vReg src) %{
     assert(bt == T_INT || bt == T_LONG, "must be");
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
+      // 2S to 2I/2L, 4S to 4I
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                             $src$$FloatRegister, T_SHORT, /* is_unsigned */ true);
     } else {
       assert(UseSVE > 0, "must be sve");
@@ -4265,11 +4277,11 @@ instruct vzeroExtItoX(vReg dst, vReg src) %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
       // 2I to 2L
-      __ neon_vector_extend($dst$$FloatRegister, T_LONG, length_in_bytes,
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                             $src$$FloatRegister, T_INT, /* is_unsigned */ true);
     } else {
       assert(UseSVE > 0, "must be sve");
-      __ sve_vector_extend($dst$$FloatRegister, __ D,
+      __ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                            $src$$FloatRegister, __ S, /* is_unsigned */ true);
     }
   %}
@@ -4343,11 +4355,15 @@ instruct vcvtStoX_extend(vReg dst, vReg src) %{
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I/4F
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
-                            $src$$FloatRegister, T_SHORT);
-      if (bt == T_FLOAT) {
-        __ scvtfv(__ T4S, $dst$$FloatRegister, $dst$$FloatRegister);
+      if (is_floating_point_type(bt)) {
+        // 2S to 2F/2D, 4S to 4F
+        __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : T_LONG,
+                              length_in_bytes, $src$$FloatRegister, T_SHORT);
+        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $dst$$FloatRegister);
+      } else {
+        // 2S to 2I/2L, 4S to 4I
+        __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                              $src$$FloatRegister, T_SHORT);
       }
     } else {
       assert(UseSVE > 0, "must be sve");
@@ -4371,7 +4387,7 @@ instruct vcvtItoX_narrow_neon(vReg dst, vReg src) %{
   effect(TEMP_DEF dst);
   format %{ "vcvtItoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 4I to 4B/4S
+    // 2I to 2S, 4I to 4B/4S
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
     __ neon_vector_narrow($dst$$FloatRegister, bt,
@@ -4434,28 +4450,29 @@ instruct vcvtItoX(vReg dst, vReg src) %{
 
 // VectorCastL2X
 
-instruct vcvtLtoI_neon(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT &&
+instruct vcvtLtoX_narrow_neon(vReg dst, vReg src) %{
+  predicate((Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT) &&
             VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
   match(Set dst (VectorCastL2X src));
-  format %{ "vcvtLtoI_neon $dst, $src" %}
+  format %{ "vcvtLtoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 2L to 2I
+    // 2L to 2S/2I
+    BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ neon_vector_narrow($dst$$FloatRegister, T_INT,
+    __ neon_vector_narrow($dst$$FloatRegister, bt,
                           $src$$FloatRegister, T_LONG, length_in_bytes);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct vcvtLtoI_sve(vReg dst, vReg src, vReg tmp) %{
-  predicate((Matcher::vector_element_basic_type(n) == T_INT &&
-             !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1)))) ||
-            Matcher::vector_element_basic_type(n) == T_BYTE ||
-            Matcher::vector_element_basic_type(n) == T_SHORT);
+instruct vcvtLtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
+            !is_floating_point_type(Matcher::vector_element_basic_type(n)) &&
+            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
   match(Set dst (VectorCastL2X src));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vcvtLtoI_sve $dst, $src\t# KILL $tmp" %}
+  format %{ "vcvtLtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
   ins_encode %{
     assert(UseSVE > 0, "must be sve");
     BasicType bt = Matcher::vector_element_basic_type(this);
@@ -4521,10 +4538,11 @@ instruct vcvtFtoX_narrow_neon(vReg dst, vReg src) %{
   effect(TEMP_DEF dst);
   format %{ "vcvtFtoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 4F to 4B/4S
+    // 2F to 2S, 4F to 4B/4S
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ fcvtzs($dst$$FloatRegister, __ T4S, $src$$FloatRegister);
+    __ fcvtzs($dst$$FloatRegister, length_in_bytes == 16 ? __ T4S : __ T2S,
+              $src$$FloatRegister);
     __ neon_vector_narrow($dst$$FloatRegister, bt,
                           $dst$$FloatRegister, T_INT, length_in_bytes);
   %}
@@ -4590,12 +4608,14 @@ instruct vcvtFtoX(vReg dst, vReg src) %{
 // VectorCastD2X
 
 instruct vcvtDtoI_neon(vReg dst, vReg src) %{
-  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_INT);
+  predicate(UseSVE == 0 &&
+            (Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT));
   match(Set dst (VectorCastD2X src));
   effect(TEMP_DEF dst);
-  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2I" %}
+  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2S/2I" %}
   ins_encode %{
-    // 2D to 2I
+    // 2D to 2S/2I
     __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, 1);
     // We can't use fcvtzs(vector, integer) instruction here because we need
     // saturation arithmetic. See JDK-8276151.
@@ -4603,6 +4623,10 @@ instruct vcvtDtoI_neon(vReg dst, vReg src) %{
     __ fcvtzdw(rscratch2, $dst$$FloatRegister);
     __ fmovs($dst$$FloatRegister, rscratch1);
     __ mov($dst$$FloatRegister, __ S, 1, rscratch2);
+    if (Matcher::vector_element_basic_type(this) == T_SHORT) {
+      __ neon_vector_narrow($dst$$FloatRegister, T_SHORT,
+                            $dst$$FloatRegister, T_INT, 8);
+    }
   %}
   ins_pipe(pipe_slow);
 %}
@@ -4676,7 +4700,7 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4HF to 4F
+      // 2HF to 2F, 4HF to 4F
       __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
     } else {
       assert(UseSVE > 0, "must be sve");
@@ -4692,9 +4716,9 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
   predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
   match(Set dst (VectorCastF2HF src));
-  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
+  format %{ "vcvtFtoHF_neon $dst, $src\t# 2F/4F to 2HF/4HF" %}
   ins_encode %{
-    // 4F to 4HF
+    // 2F to 2HF, 4F to 4HF
     __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
   %}
   ins_pipe(pipe_slow);
@@ -6396,14 +6420,12 @@ instruct vpopcountI(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT, "unsupported");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
-        __ cnt($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-               $src$$FloatRegister);
-        __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-                  $dst$$FloatRegister);
+        assert(length_in_bytes <= 16, "unsupported");
+        bool isQ = length_in_bytes == 16;
+        __ cnt($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $src$$FloatRegister);
+        __ uaddlp($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $dst$$FloatRegister);
         if (bt == T_INT) {
-          __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T8H : __ T4H,
-                    $dst$$FloatRegister);
+          __ uaddlp($dst$$FloatRegister, isQ ? __ T8H : __ T4H, $dst$$FloatRegister);
         }
       } else {
         __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt),
@@ -6465,7 +6487,7 @@ instruct vblend_neon(vReg dst, vReg src1, vReg src2) %{
   format %{ "vblend_neon $dst, $src1, $src2" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    assert(length_in_bytes <= 16, "must be");
     __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
            $src2$$FloatRegister, $src1$$FloatRegister);
   %}
@@ -6852,7 +6874,7 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
         __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                              bt, /* isQ */ length_in_bytes == 16);
         if (bt != T_LONG) {
@@ -6911,7 +6933,7 @@ instruct vreverse(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
         __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                              bt, /* isQ */ length_in_bytes == 16);
       } else {
@@ -6947,7 +6969,7 @@ instruct vreverseBytes(vReg dst, vReg src) %{
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+      assert(length_in_bytes <= 16, "unsupported");
       if (bt == T_BYTE) {
         if ($dst$$FloatRegister != $src$$FloatRegister) {
           __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
index cc07e0e4076..efefbf692bd 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -121,7 +121,7 @@ source %{
       // These operations are not profitable to be vectorized on NEON, because no direct
       // NEON instructions support them. But the match rule support for them is profitable for
       // Vector API intrinsics.
-      if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
+      if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
           (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
           (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
@@ -179,6 +179,18 @@ source %{
           return false;
         }
         break;
+      case Op_AddReductionVI:
+      case Op_AndReductionV:
+      case Op_OrReductionV:
+      case Op_XorReductionV:
+      case Op_MinReductionV:
+      case Op_MaxReductionV:
+        // Reductions with less than 8 bytes vector length are
+        // not supported.
+        if (length_in_bytes < 8) {
+          return false;
+        }
+        break;
       case Op_MulReductionVD:
       case Op_MulReductionVF:
       case Op_MulReductionVI:
@@ -2502,31 +2514,31 @@ instruct reinterpret_resize_gt128b(vReg dst, vReg src, pReg ptmp, rFlagsReg cr)
 %}
 
 // ---------------------------- Vector zero extend --------------------------------
-dnl VECTOR_ZERO_EXTEND($1,      $2,     $3,      $4,       $5        $6,        $7,         )
-dnl VECTOR_ZERO_EXTEND(op_name, dst_bt, src_bt,  dst_size, src_size, assertion, neon_comment)
+dnl VECTOR_ZERO_EXTEND($1,      $2,     $3,       $4,        $5          )
+dnl VECTOR_ZERO_EXTEND(op_name, src_bt, src_size, assertion, neon_comment)
 define(`VECTOR_ZERO_EXTEND', `
 instruct vzeroExt$1toX(vReg dst, vReg src) %{
   match(Set dst (VectorUCast`$1'2X src));
   format %{ "vzeroExt$1toX $dst, $src" %}
   ins_encode %{
     BasicType bt = Matcher::vector_element_basic_type(this);
-    assert($6, "must be");
+    assert($4, "must be");
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // $7
-      __ neon_vector_extend($dst$$FloatRegister, $2, length_in_bytes,
-                            $src$$FloatRegister, $3, /* is_unsigned */ true);
+      // $5
+      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                            $src$$FloatRegister, $2, /* is_unsigned */ true);
     } else {
       assert(UseSVE > 0, "must be sve");
-      __ sve_vector_extend($dst$$FloatRegister, __ $4,
-                           $src$$FloatRegister, __ $5, /* is_unsigned */ true);
+      __ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
+                           $src$$FloatRegister, __ $3, /* is_unsigned */ true);
     }
   %}
   ins_pipe(pipe_slow);
 %}')dnl
-VECTOR_ZERO_EXTEND(B, bt,     T_BYTE,  elemType_to_regVariant(bt), B, bt == T_SHORT || bt == T_INT || bt == T_LONG, `4B to 4S/4I, 8B to 8S')
-VECTOR_ZERO_EXTEND(S, T_INT,  T_SHORT, elemType_to_regVariant(bt), H, bt == T_INT || bt == T_LONG,                  `4S to 4I')
-VECTOR_ZERO_EXTEND(I, T_LONG, T_INT,   D,                          S, bt == T_LONG,                                 `2I to 2L')
+VECTOR_ZERO_EXTEND(B, T_BYTE,  B, bt == T_SHORT || bt == T_INT || bt == T_LONG, `4B to 4S/4I, 8B to 8S')
+VECTOR_ZERO_EXTEND(S, T_SHORT, H, bt == T_INT || bt == T_LONG,                  `2S to 2I/2L, 4S to 4I')
+VECTOR_ZERO_EXTEND(I, T_INT,   S, bt == T_LONG,                                 `2I to 2L')
 
 // ------------------------------ Vector cast ----------------------------------
 
@@ -2595,11 +2607,15 @@ instruct vcvtStoX_extend(vReg dst, vReg src) %{
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4S to 4I/4F
-      __ neon_vector_extend($dst$$FloatRegister, T_INT, length_in_bytes,
-                            $src$$FloatRegister, T_SHORT);
-      if (bt == T_FLOAT) {
-        __ scvtfv(__ T4S, $dst$$FloatRegister, $dst$$FloatRegister);
+      if (is_floating_point_type(bt)) {
+        // 2S to 2F/2D, 4S to 4F
+        __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : T_LONG,
+                              length_in_bytes, $src$$FloatRegister, T_SHORT);
+        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $dst$$FloatRegister);
+      } else {
+        // 2S to 2I/2L, 4S to 4I
+        __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
+                              $src$$FloatRegister, T_SHORT);
       }
     } else {
       assert(UseSVE > 0, "must be sve");
@@ -2623,7 +2639,7 @@ instruct vcvtItoX_narrow_neon(vReg dst, vReg src) %{
   effect(TEMP_DEF dst);
   format %{ "vcvtItoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 4I to 4B/4S
+    // 2I to 2S, 4I to 4B/4S
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
     __ neon_vector_narrow($dst$$FloatRegister, bt,
@@ -2686,28 +2702,29 @@ instruct vcvtItoX(vReg dst, vReg src) %{
 
 // VectorCastL2X
 
-instruct vcvtLtoI_neon(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT &&
+instruct vcvtLtoX_narrow_neon(vReg dst, vReg src) %{
+  predicate((Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT) &&
             VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
   match(Set dst (VectorCastL2X src));
-  format %{ "vcvtLtoI_neon $dst, $src" %}
+  format %{ "vcvtLtoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 2L to 2I
+    // 2L to 2S/2I
+    BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ neon_vector_narrow($dst$$FloatRegister, T_INT,
+    __ neon_vector_narrow($dst$$FloatRegister, bt,
                           $src$$FloatRegister, T_LONG, length_in_bytes);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct vcvtLtoI_sve(vReg dst, vReg src, vReg tmp) %{
-  predicate((Matcher::vector_element_basic_type(n) == T_INT &&
-             !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1)))) ||
-            Matcher::vector_element_basic_type(n) == T_BYTE ||
-            Matcher::vector_element_basic_type(n) == T_SHORT);
+instruct vcvtLtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
+            !is_floating_point_type(Matcher::vector_element_basic_type(n)) &&
+            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
   match(Set dst (VectorCastL2X src));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vcvtLtoI_sve $dst, $src\t# KILL $tmp" %}
+  format %{ "vcvtLtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
   ins_encode %{
     assert(UseSVE > 0, "must be sve");
     BasicType bt = Matcher::vector_element_basic_type(this);
@@ -2773,10 +2790,11 @@ instruct vcvtFtoX_narrow_neon(vReg dst, vReg src) %{
   effect(TEMP_DEF dst);
   format %{ "vcvtFtoX_narrow_neon $dst, $src" %}
   ins_encode %{
-    // 4F to 4B/4S
+    // 2F to 2S, 4F to 4B/4S
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
-    __ fcvtzs($dst$$FloatRegister, __ T4S, $src$$FloatRegister);
+    __ fcvtzs($dst$$FloatRegister, length_in_bytes == 16 ? __ T4S : __ T2S,
+              $src$$FloatRegister);
     __ neon_vector_narrow($dst$$FloatRegister, bt,
                           $dst$$FloatRegister, T_INT, length_in_bytes);
   %}
@@ -2842,12 +2860,14 @@ instruct vcvtFtoX(vReg dst, vReg src) %{
 // VectorCastD2X
 
 instruct vcvtDtoI_neon(vReg dst, vReg src) %{
-  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_INT);
+  predicate(UseSVE == 0 &&
+            (Matcher::vector_element_basic_type(n) == T_INT ||
+             Matcher::vector_element_basic_type(n) == T_SHORT));
   match(Set dst (VectorCastD2X src));
   effect(TEMP_DEF dst);
-  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2I" %}
+  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2S/2I" %}
   ins_encode %{
-    // 2D to 2I
+    // 2D to 2S/2I
     __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, 1);
     // We can't use fcvtzs(vector, integer) instruction here because we need
     // saturation arithmetic. See JDK-8276151.
@@ -2855,6 +2875,10 @@ instruct vcvtDtoI_neon(vReg dst, vReg src) %{
     __ fcvtzdw(rscratch2, $dst$$FloatRegister);
     __ fmovs($dst$$FloatRegister, rscratch1);
     __ mov($dst$$FloatRegister, __ S, 1, rscratch2);
+    if (Matcher::vector_element_basic_type(this) == T_SHORT) {
+      __ neon_vector_narrow($dst$$FloatRegister, T_SHORT,
+                            $dst$$FloatRegister, T_INT, 8);
+    }
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2928,7 +2952,7 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      // 4HF to 4F
+      // 2HF to 2F, 4HF to 4F
       __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
     } else {
       assert(UseSVE > 0, "must be sve");
@@ -2944,9 +2968,9 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
   predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
   match(Set dst (VectorCastF2HF src));
-  format %{ "vcvtFtoHF_neon $dst, $src\t# 4F to 4HF" %}
+  format %{ "vcvtFtoHF_neon $dst, $src\t# 2F/4F to 2HF/4HF" %}
   ins_encode %{
-    // 4F to 4HF
+    // 2F to 2HF, 4F to 4HF
     __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
   %}
   ins_pipe(pipe_slow);
@@ -4417,14 +4441,12 @@ instruct vpopcountI(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT, "unsupported");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
-        __ cnt($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-               $src$$FloatRegister);
-        __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-                  $dst$$FloatRegister);
+        assert(length_in_bytes <= 16, "unsupported");
+        bool isQ = length_in_bytes == 16;
+        __ cnt($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $src$$FloatRegister);
+        __ uaddlp($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $dst$$FloatRegister);
         if (bt == T_INT) {
-          __ uaddlp($dst$$FloatRegister, length_in_bytes == 16 ? __ T8H : __ T4H,
-                    $dst$$FloatRegister);
+          __ uaddlp($dst$$FloatRegister, isQ ? __ T8H : __ T4H, $dst$$FloatRegister);
         }
       } else {
         __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt),
@@ -4475,7 +4497,7 @@ instruct vblend_neon(vReg dst, vReg src1, vReg src2) %{
   format %{ "vblend_neon $dst, $src1, $src2" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    assert(length_in_bytes <= 16, "must be");
     __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
            $src2$$FloatRegister, $src1$$FloatRegister);
   %}
@@ -4851,7 +4873,7 @@ instruct vcountTrailingZeros(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
         __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                              bt, /* isQ */ length_in_bytes == 16);
         if (bt != T_LONG) {
@@ -4910,7 +4932,7 @@ instruct vreverse(vReg dst, vReg src) %{
     } else {
       assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
       if (UseSVE == 0) {
-        assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+        assert(length_in_bytes <= 16, "unsupported");
         __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                              bt, /* isQ */ length_in_bytes == 16);
       } else {
@@ -4935,7 +4957,7 @@ instruct vreverseBytes(vReg dst, vReg src) %{
     BasicType bt = Matcher::vector_element_basic_type(this);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
     if (VM_Version::use_neon_for_vector(length_in_bytes)) {
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "unsupported");
+      assert(length_in_bytes <= 16, "unsupported");
       if (bt == T_BYTE) {
         if ($dst$$FloatRegister != $src$$FloatRegister) {
           __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index 914967e4009..a4ecd56af08 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -1778,19 +1778,21 @@ void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister
 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
   if (src_bt == T_BYTE) {
-    if (dst_bt == T_SHORT) {
-      // 4B/8B to 4S/8S
-      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
-    } else {
-      // 4B to 4I
-      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
-      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
+    // 4B to 4S/4I, 8B to 8S
+    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
+    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
+    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
+    if (dst_bt == T_INT) {
       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
     }
   } else if (src_bt == T_SHORT) {
-    // 4S to 4I
-    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
+    // 2S to 2I/2L, 4S to 4I
+    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
+    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
+    if (dst_bt == T_LONG) {
+      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
+    }
   } else if (src_bt == T_INT) {
     // 2I to 2L
     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
@@ -1810,18 +1812,21 @@ void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
     assert(dst_bt == T_BYTE, "unsupported");
     xtn(dst, T8B, src, T8H);
   } else if (src_bt == T_INT) {
-    // 4I to 4B/4S
-    assert(src_vlen_in_bytes == 16, "unsupported");
+    // 2I to 2S, 4I to 4B/4S
+    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
     xtn(dst, T4H, src, T4S);
     if (dst_bt == T_BYTE) {
       xtn(dst, T8B, dst, T8H);
     }
   } else if (src_bt == T_LONG) {
-    // 2L to 2I
+    // 2L to 2S/2I
     assert(src_vlen_in_bytes == 16, "unsupported");
-    assert(dst_bt == T_INT, "unsupported");
+    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
     xtn(dst, T2S, src, T2D);
+    if (dst_bt == T_SHORT) {
+      xtn(dst, T4H, dst, T4S);
+    }
   } else {
     ShouldNotReachHere();
   }
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
index cfa19ce385a..24a8581434f 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestDependencyOffsets.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -597,8 +597,7 @@ public class TestDependencyOffsets {
             case "byte"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 4 )};
             case "char"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4,       4 ),
                                                      new CPUMinVectorWidth(ASIMD,      8 )};
-            case "short"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4,       4 ),
-                                                     new CPUMinVectorWidth(ASIMD,      8 )};
+            case "short"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 4 )};
             case "int"    -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 8 )};
             case "long"   -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 16)};
             case "float"  -> new CPUMinVectorWidth[]{new CPUMinVectorWidth(SSE4_ASIMD, 8 )};
diff --git a/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java b/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java
index fac829c82e4..5a4271cc5b0 100644
--- a/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java
+++ b/test/hotspot/jtreg/compiler/vectorapi/reshape/utils/TestCastMethods.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -649,18 +649,25 @@ public class TestCastMethods {
             makePair(SSPEC128, BSPEC64),
             makePair(SSPEC256, BSPEC128),
             makePair(SSPEC512, BSPEC256),
+            makePair(SSPEC64, ISPEC64),
             makePair(SSPEC64, ISPEC128),
             makePair(SSPEC128, ISPEC256),
             makePair(SSPEC256, ISPEC512),
+            makePair(SSPEC64, LSPEC128),
             makePair(SSPEC64, LSPEC256),
+            makePair(SSPEC128, LSPEC128),
             makePair(SSPEC128, LSPEC512),
+            makePair(SSPEC64, FSPEC64),
             makePair(SSPEC64, FSPEC128),
             makePair(SSPEC128, FSPEC256),
             makePair(SSPEC256, FSPEC512),
+            makePair(SSPEC64, DSPEC128),
             makePair(SSPEC64, DSPEC256),
+            makePair(SSPEC128, DSPEC128),
             makePair(SSPEC128, DSPEC512),
             makePair(ISPEC256, BSPEC64),
             makePair(ISPEC512, BSPEC128),
+            makePair(ISPEC64,  SSPEC64),
             makePair(ISPEC128, SSPEC64),
             makePair(ISPEC256, SSPEC128),
             makePair(ISPEC512, SSPEC256),
@@ -675,7 +682,9 @@ public class TestCastMethods {
             makePair(ISPEC128, DSPEC256),
             makePair(ISPEC256, DSPEC512),
             makePair(LSPEC512, BSPEC64),
+            makePair(LSPEC128, SSPEC64),
             makePair(LSPEC256, SSPEC64),
+            makePair(LSPEC128, SSPEC128),
             makePair(LSPEC512, SSPEC128),
             makePair(LSPEC128, ISPEC64),
             makePair(LSPEC256, ISPEC128),
@@ -688,6 +697,7 @@ public class TestCastMethods {
             makePair(LSPEC512, DSPEC512),
             makePair(FSPEC256, BSPEC64),
             makePair(FSPEC512, BSPEC128),
+            makePair(FSPEC64,  SSPEC64),
             makePair(FSPEC128, SSPEC64),
             makePair(FSPEC256, SSPEC128),
             makePair(FSPEC512, SSPEC256),
@@ -702,7 +712,9 @@ public class TestCastMethods {
             makePair(FSPEC128, DSPEC256),
             makePair(FSPEC256, DSPEC512),
             makePair(DSPEC512, BSPEC64),
+            makePair(DSPEC128, SSPEC64),
             makePair(DSPEC256, SSPEC64),
+            makePair(DSPEC128, SSPEC128),
             makePair(DSPEC512, SSPEC128),
             makePair(DSPEC128, ISPEC64),
             makePair(DSPEC256, ISPEC128),
@@ -751,14 +763,17 @@ public class TestCastMethods {
             makePair(BSPEC512, LSPEC256, true),
             makePair(BSPEC512, LSPEC512, true),
 
+            makePair(SSPEC64, ISPEC64, true),
             makePair(SSPEC64, ISPEC128, true),
             makePair(SSPEC64, ISPEC256, true),
             makePair(SSPEC64, ISPEC512, true),
+            makePair(SSPEC64, LSPEC128, true),
             makePair(SSPEC64, LSPEC256, true),
             makePair(SSPEC64, LSPEC512, true),
             makePair(SSPEC128, ISPEC128, true),
             makePair(SSPEC128, ISPEC256, true),
             makePair(SSPEC128, ISPEC512, true),
+            makePair(SSPEC128, LSPEC128, true),
             makePair(SSPEC128, LSPEC256, true),
             makePair(SSPEC128, LSPEC512, true),
             makePair(SSPEC256, ISPEC128, true),
@@ -789,23 +804,35 @@ public class TestCastMethods {
             makePair(BSPEC64, FSPEC128),
             makePair(SSPEC64, BSPEC64),
             makePair(SSPEC128, BSPEC64),
+            makePair(SSPEC64, ISPEC64),
             makePair(SSPEC64, ISPEC128),
+            makePair(SSPEC64,  LSPEC128),
+            makePair(SSPEC128, LSPEC128),
+            makePair(SSPEC64, FSPEC64),
             makePair(SSPEC64, FSPEC128),
+            makePair(SSPEC64,  DSPEC128),
+            makePair(SSPEC128, DSPEC128),
             makePair(ISPEC128, BSPEC64),
             makePair(ISPEC128, SSPEC64),
-            makePair(ISPEC64, LSPEC128),
+            makePair(ISPEC64,  SSPEC64),
+            makePair(ISPEC64,  LSPEC128),
             makePair(ISPEC64, FSPEC64),
             makePair(ISPEC128, FSPEC128),
             makePair(ISPEC64, DSPEC128),
+            makePair(LSPEC128, SSPEC64),
+            makePair(LSPEC128, SSPEC128),
             makePair(LSPEC128, ISPEC64),
             makePair(LSPEC128, FSPEC64),
             makePair(LSPEC128, DSPEC128),
             makePair(FSPEC128, BSPEC64),
+            makePair(FSPEC64, SSPEC64),
             makePair(FSPEC128, SSPEC64),
             makePair(FSPEC64, ISPEC64),
             makePair(FSPEC128, ISPEC128),
             makePair(FSPEC64, LSPEC128),
             makePair(FSPEC64, DSPEC128),
+            makePair(DSPEC128, SSPEC64),
+            makePair(DSPEC128, SSPEC128),
             makePair(DSPEC128, ISPEC64),
             makePair(DSPEC128, LSPEC128),
             makePair(DSPEC128, FSPEC64),
@@ -816,8 +843,11 @@ public class TestCastMethods {
             makePair(BSPEC128, SSPEC64, true),
             makePair(BSPEC128, SSPEC128, true),
             makePair(BSPEC128, ISPEC128, true),
+            makePair(SSPEC64, ISPEC64, true),
             makePair(SSPEC64, ISPEC128, true),
+            makePair(SSPEC64, LSPEC128, true),
             makePair(SSPEC128, ISPEC128, true),
+            makePair(SSPEC128, LSPEC128, true),
             makePair(ISPEC64, LSPEC128, true)
     );
 }
diff --git a/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java b/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
index f777206bab5..482dcf934c5 100644
--- a/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestFloatConversionsVector.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -84,10 +84,13 @@ public class TestFloatConversionsVector {
     }
 
     @Test
+    @IR(counts = {IRNode.VECTOR_CAST_F2HF, IRNode.VECTOR_SIZE_2, "> 0"},
+        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
+        applyIfCPUFeature = {"asimd", "true"})
     public void test_float_float16_short_vector(short[] sout, float[] finp) {
-        for (int i = 0; i < finp.length; i+= 4) {
-            sout[i+0] = Float.floatToFloat16(finp[i+0]);
-            sout[i+1] = Float.floatToFloat16(finp[i+1]);
+        for (int i = 0; i < finp.length; i += 4) {
+            sout[i] = Float.floatToFloat16(finp[i]);
+            sout[i + 1] = Float.floatToFloat16(finp[i + 1]);
         }
     }
 
@@ -124,8 +127,9 @@ public class TestFloatConversionsVector {
         }
 
         // Verifying the result
-        for (int i = 0; i < ARRLEN; i++) {
+        for (int i = 0; i < ARRLEN; i += 4) {
             Asserts.assertEquals(Float.floatToFloat16(finp[i]), sout[i]);
+            Asserts.assertEquals(Float.floatToFloat16(finp[i + 1]), sout[i + 1]);
         }
     }
 
@@ -152,7 +156,19 @@ public class TestFloatConversionsVector {
         }
     }
 
-    @Run(test = {"test_float16_float", "test_float16_float_strided"}, mode = RunMode.STANDALONE)
+    @Test
+    @IR(counts = {IRNode.VECTOR_CAST_HF2F, IRNode.VECTOR_SIZE_2, "> 0"},
+        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
+        applyIfCPUFeature = {"asimd", "true"})
+    public void test_float16_float_short_vector(float[] fout, short[] sinp) {
+        for (int i = 0; i < sinp.length; i += 4) {
+            fout[i] = Float.float16ToFloat(sinp[i]);
+            fout[i + 1] = Float.float16ToFloat(sinp[i + 1]);
+        }
+    }
+
+    @Run(test = {"test_float16_float", "test_float16_float_strided",
+                 "test_float16_float_short_vector"}, mode = RunMode.STANDALONE)
     public void kernel_test_float16_float() {
         sinp = new short[ARRLEN];
         fout = new float[ARRLEN];
@@ -178,5 +194,15 @@ public class TestFloatConversionsVector {
         for (int i = 0; i < ARRLEN/2; i++) {
             Asserts.assertEquals(Float.float16ToFloat(sinp[i*2]), fout[i*2]);
         }
+
+        for (int i = 0; i < ITERS; i++) {
+            test_float16_float_short_vector(fout, sinp);
+        }
+
+        // Verifying the result
+        for (int i = 0; i < ARRLEN; i += 4) {
+            Asserts.assertEquals(Float.float16ToFloat(sinp[i]), fout[i]);
+            Asserts.assertEquals(Float.float16ToFloat(sinp[i + 1]), fout[i + 1]);
+        }
     }
 }
diff --git a/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java b/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java
index 11b07d57dd9..3fa636b42f7 100644
--- a/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java
+++ b/test/hotspot/jtreg/compiler/vectorization/runner/ArrayTypeConvertTest.java
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2022, 2023, Arm Limited. All rights reserved.
- * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -252,9 +252,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
     }
 
     @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx2", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
         applyIf = {"MaxVectorSize", ">=32"},
         counts = {IRNode.VECTOR_CAST_S2D, IRNode.VECTOR_SIZE + "min(max_short, max_double)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"asimd", "true", "avx2", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_S2D, IRNode.VECTOR_SIZE + "min(max_short, max_double)", ">0"})
     public double[] convertShortToDouble() {
         double[] res = new double[SIZE];
         for (int i = 0; i < SIZE; i++) {
@@ -374,9 +377,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
     }
 
     @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
         applyIf = {"MaxVectorSize", ">=32"},
         counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_short)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_short)", ">0"})
     public short[] convertDoubleToShort() {
         short[] res = new short[SIZE];
         for (int i = 0; i < SIZE; i++) {
@@ -386,9 +392,12 @@ public class ArrayTypeConvertTest extends VectorizationTestRunner {
     }
 
     @Test
-    @IR(applyIfCPUFeatureOr = {"sve", "true", "avx", "true", "rvv", "true"},
+    @IR(applyIfCPUFeature = {"rvv", "true"},
         applyIf = {"MaxVectorSize", ">=32"},
         counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_char)", ">0"})
+    @IR(applyIfCPUFeatureOr = {"asimd", "true", "avx", "true"},
+        applyIf = {"MaxVectorSize", ">=16"},
+        counts = {IRNode.VECTOR_CAST_D2S, IRNode.VECTOR_SIZE + "min(max_double, max_char)", ">0"})
     public char[] convertDoubleToChar() {
         char[] res = new char[SIZE];
         for (int i = 0; i < SIZE; i++) {
diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java
index 6c3f004dcd9..6e4a57b79e5 100644
--- a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java
+++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorFPtoIntCastOperations.java
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
+ *  Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
  *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  *  This code is free software; you can redistribute it and/or modify it
@@ -120,6 +120,18 @@ public class VectorFPtoIntCastOperations {
         }
     }
 
+    @Benchmark
+    public void microFloat64ToShort64() {
+        VectorSpecies<Float> ISPECIES = FloatVector.SPECIES_64;
+        VectorSpecies<Short> OSPECIES = ShortVector.SPECIES_64;
+        for (int i = 0, j = 0; i < ISPECIES.loopBound(SIZE / 2); i += ISPECIES.length(), j += OSPECIES.length()) {
+            FloatVector.fromArray(ISPECIES, float_arr, i)
+                .convertShape(VectorOperators.F2S, OSPECIES, 0)
+                .reinterpretAsShorts()
+                .intoArray(short_res, j);
+        }
+    }
+
     @Benchmark
     public void microFloat128ToShort128() {
         VectorSpecies<Float> ISPECIES = FloatVector.SPECIES_128;
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorTwoShorts.java b/test/micro/org/openjdk/bench/vm/compiler/VectorTwoShorts.java
new file mode 100644
index 00000000000..445f67552ab
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorTwoShorts.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+/**
+ * Micro-benchmarks over short arrays: every kernel reads element i of the
+ * input array(s) and stores the result into element i + 3 of the output array.
+ * NOTE(review): the "Vec2S" suffix presumably names the 2-element short vector
+ * shape these loops are expected to compile to; confirm with the author.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 4, time = 2, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 4, time = 2, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 3)
+public class VectorTwoShorts {
+    /** Number of elements in each of the three arrays. */
+    @Param({"64", "128", "512", "1024"})
+    public int LEN;
+
+    private short[] sA;
+    private short[] sB;
+    private short[] sC;
+
+    /** Seed of the generator that fills the input arrays. */
+    @Param("0")
+    private int seed;
+
+    /** Allocates the arrays and fills sA and sB with seeded pseudo-random values. */
+    @Setup
+    public void init() {
+        sA = new short[LEN];
+        sB = new short[LEN];
+        sC = new short[LEN];
+
+        // Create the generator here rather than in a field initializer: JMH
+        // assigns @Param fields after the state object has been constructed,
+        // so a field initializer would always be evaluated with seed == 0.
+        Random r = new Random(seed);
+        for (int i = 0; i < LEN; i++) {
+            sA[i] = (short) r.nextInt();
+            sB[i] = (short) r.nextInt();
+        }
+    }
+
+    /** Stores the sum of sA[i] and sB[i], narrowed to short, into sC[i + 3]. */
+    @Benchmark
+    public void addVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = (short) (sA[i] + sB[i]);
+        }
+    }
+
+    /** Stores the product of sA[i] and sB[i], narrowed to short, into sC[i + 3]. */
+    @Benchmark
+    public void mulVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = (short) (sA[i] * sB[i]);
+        }
+    }
+
+    /** Stores sA[i] with its two bytes swapped into sC[i + 3]. */
+    @Benchmark
+    public void reverseBytesVec2S() {
+        for (int i = 0; i < LEN - 3; i++) {
+            sC[i + 3] = Short.reverseBytes(sA[i]);
+        }
+    }
+}