From 2ba8a06f0c0a598a6ca7f74e75bab4208e6fa689 Mon Sep 17 00:00:00 2001
From: Bhavana Kilambi <bkilambi@openjdk.org>
Date: Fri, 1 Aug 2025 13:11:21 +0000
Subject: [PATCH] 8348868: AArch64: Add backend support for SelectFromTwoVector

Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Reviewed-by: haosun, aph, sviswanathan, xgong
---
 src/hotspot/cpu/aarch64/aarch64.ad            | 120 +++++
 src/hotspot/cpu/aarch64/aarch64_vector.ad     |  90 ++++
 src/hotspot/cpu/aarch64/aarch64_vector_ad.m4  |  53 ++
 src/hotspot/cpu/aarch64/assembler_aarch64.hpp |  23 +-
 .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 121 +++++
 .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp |  14 +
 src/hotspot/cpu/x86/x86.ad                    |   5 +-
 src/hotspot/share/opto/vectorIntrinsics.cpp   |   5 +-
 test/hotspot/gtest/aarch64/aarch64-asmtest.py |   4 +
 test/hotspot/gtest/aarch64/asmtest.out.h      |  47 +-
 .../compiler/lib/ir_framework/IRNode.java     |  30 ++
 .../ir_framework/test/IREncodingPrinter.java  |   1 +
 .../vectorapi/TestSelectFromTwoVectorOp.java  | 486 ++++++++++++++++++
 13 files changed, 973 insertions(+), 26 deletions(-)
 create mode 100644 test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 404ab8d9ba4..517da8066de 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -881,6 +881,46 @@ reg_class vectorx_reg(
     V31, V31_H, V31_J, V31_K
 );
 
+// Class for vector register V10
+reg_class v10_veca_reg(
+    V10, V10_H, V10_J, V10_K
+);
+
+// Class for vector register V11
+reg_class v11_veca_reg(
+    V11, V11_H, V11_J, V11_K
+);
+
+// Class for vector register V12
+reg_class v12_veca_reg(
+    V12, V12_H, V12_J, V12_K
+);
+
+// Class for vector register V13
+reg_class v13_veca_reg(
+    V13, V13_H, V13_J, V13_K
+);
+
+// Class for vector register V17
+reg_class v17_veca_reg(
+    V17, V17_H, V17_J, V17_K
+);
+
+// Class for vector register V18
+reg_class v18_veca_reg(
+    V18, V18_H, V18_J, V18_K
+);
+
+// Class for vector register V23
+reg_class v23_veca_reg(
+    V23, V23_H, V23_J, V23_K
+);
+
+// Class for vector register V24
+reg_class v24_veca_reg(
+    V24, V24_H, V24_J, V24_K
+);
+
 // Class for 128 bit register v0
 reg_class v0_reg(
     V0, V0_H
@@ -4969,6 +5009,86 @@ operand vReg()
   interface(REG_INTER);
 %}
 
+operand vReg_V10()
+%{
+  constraint(ALLOC_IN_RC(v10_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V11()
+%{
+  constraint(ALLOC_IN_RC(v11_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V12()
+%{
+  constraint(ALLOC_IN_RC(v12_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V13()
+%{
+  constraint(ALLOC_IN_RC(v13_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V17()
+%{
+  constraint(ALLOC_IN_RC(v17_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V18()
+%{
+  constraint(ALLOC_IN_RC(v18_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V23()
+%{
+  constraint(ALLOC_IN_RC(v23_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vReg_V24()
+%{
+  constraint(ALLOC_IN_RC(v24_veca_reg));
+  match(vReg);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecA()
 %{
   constraint(ALLOC_IN_RC(vectora_reg));
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
index 1b6296ddd8b..58300992c2a 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -257,6 +257,28 @@ source %{
           return false;
         }
         break;
+      case Op_SelectFromTwoVector:
+        // The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return
+        // false if vector length > 16B but supported SVE version < 2.
+        // For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else
+        // generate Neon "tbl" instruction to select from two vectors.
+        // This operation is disabled for doubles and longs on machines with SVE < 2 and instead
+        // the default VectorRearrange + VectorBlend is generated because the performance of the default
+        // implementation was better than or equal to the implementation for SelectFromTwoVector.
+        if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) {
+          return false;
+        }
+
+        // Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated
+        // using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize
+        // on that machine with the only exception of 8B vector length. This is because at the time of
+        // writing this, there is no SVE2 machine available with length_in_bytes > 8 and
+        // length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an
+        // SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16).
+        if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) {
+          return false;
+        }
+        break;
       default:
         break;
     }
@@ -7172,3 +7194,71 @@ instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{
   %}
   ins_pipe(pipe_slow);
 %}
+
+// ------------------------------------- SelectFromTwoVector ------------------------------------
+// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be
+// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers
+// for src1 and src2.
+// Four combinations of vector registers for vselect_from_two_vectors are chosen at random
+// (two from volatile and two from non-volatile set) which gives more freedom to the register
+// allocator to choose the best pair of source registers at that point.
+
+instruct vselect_from_two_vectors_10_11(vReg dst, vReg_V10 src1, vReg_V11 src2,
+                                        vReg index, vReg tmp) %{
+  effect(TEMP_DEF dst, TEMP tmp);
+  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
+  format %{ "vselect_from_two_vectors_10_11 $dst, $src1, $src2, $index\t# KILL $tmp" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
+                               $src2$$FloatRegister, $index$$FloatRegister,
+                               $tmp$$FloatRegister, bt, length_in_bytes);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vselect_from_two_vectors_12_13(vReg dst, vReg_V12 src1, vReg_V13 src2,
+                                        vReg index, vReg tmp) %{
+  effect(TEMP_DEF dst, TEMP tmp);
+  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
+  format %{ "vselect_from_two_vectors_12_13 $dst, $src1, $src2, $index\t# KILL $tmp" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
+                               $src2$$FloatRegister, $index$$FloatRegister,
+                               $tmp$$FloatRegister, bt, length_in_bytes);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vselect_from_two_vectors_17_18(vReg dst, vReg_V17 src1, vReg_V18 src2,
+                                        vReg index, vReg tmp) %{
+  effect(TEMP_DEF dst, TEMP tmp);
+  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
+  format %{ "vselect_from_two_vectors_17_18 $dst, $src1, $src2, $index\t# KILL $tmp" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
+                               $src2$$FloatRegister, $index$$FloatRegister,
+                               $tmp$$FloatRegister, bt, length_in_bytes);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vselect_from_two_vectors_23_24(vReg dst, vReg_V23 src1, vReg_V24 src2,
+                                        vReg index, vReg tmp) %{
+  effect(TEMP_DEF dst, TEMP tmp);
+  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
+  format %{ "vselect_from_two_vectors_23_24 $dst, $src1, $src2, $index\t# KILL $tmp" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
+                               $src2$$FloatRegister, $index$$FloatRegister,
+                               $tmp$$FloatRegister, bt, length_in_bytes);
+  %}
+  ins_pipe(pipe_slow);
+%}
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
index efefbf692bd..4d91e04dc21 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -247,6 +247,28 @@ source %{
           return false;
         }
         break;
+      case Op_SelectFromTwoVector:
+        // The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return
+        // false if vector length > 16B but supported SVE version < 2.
+        // For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else
+        // generate Neon "tbl" instruction to select from two vectors.
+        // This operation is disabled for doubles and longs on machines with SVE < 2 and instead
+        // the default VectorRearrange + VectorBlend is generated because the performance of the default
+        // implementation was better than or equal to the implementation for SelectFromTwoVector.
+        if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) {
+          return false;
+        }
+
+        // Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated
+        // using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize
+        // on that machine with the only exception of 8B vector length. This is because at the time of
+        // writing this, there is no SVE2 machine available with length_in_bytes > 8 and
+        // length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an
+        // SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16).
+        if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) {
+          return false;
+        }
+        break;
       default:
         break;
     }
@@ -5154,3 +5176,34 @@ BITPERM(vcompressBits, CompressBitsV, sve_bext)
 
 // ----------------------------------- ExpandBitsV ---------------------------------
 BITPERM(vexpandBits, ExpandBitsV, sve_bdep)
+
+// ------------------------------------- SelectFromTwoVector ------------------------------------
+// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be
+// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers
+// for src1 and src2.
+// Four combinations of vector registers for vselect_from_two_vectors are chosen at random
+// (two from volatile and two from non-volatile set) which gives more freedom to the register
+// allocator to choose the best pair of source registers at that point.
+dnl
+dnl SELECT_FROM_TWO_VECTORS($1,        $2        )
+dnl SELECT_FROM_TWO_VECTORS(first_reg, second_reg)
+define(`SELECT_FROM_TWO_VECTORS', `
+instruct vselect_from_two_vectors_$1_$2(vReg dst, vReg_V$1 src1, vReg_V$2 src2,
+                                        vReg index, vReg tmp) %{
+  effect(TEMP_DEF dst, TEMP tmp);
+  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
+  format %{ "vselect_from_two_vectors_$1_$2 $dst, $src1, $src2, $index\t# KILL $tmp" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
+                               $src2$$FloatRegister, $index$$FloatRegister,
+                               $tmp$$FloatRegister, bt, length_in_bytes);
+  %}
+  ins_pipe(pipe_slow);
+%}')dnl
+dnl
+SELECT_FROM_TWO_VECTORS(10, 11)
+SELECT_FROM_TWO_VECTORS(12, 13)
+SELECT_FROM_TWO_VECTORS(17, 18)
+SELECT_FROM_TWO_VECTORS(23, 24)
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 2e35763aa43..11d302e9026 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -4231,12 +4231,29 @@ public:
     sf(imm1, 9, 5), rf(Zd, 0);
   }
 
-  // SVE programmable table lookup/permute using vector of element indices
-  void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) {
+private:
+  void _sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, unsigned reg_count, FloatRegister Zm) {
     starti;
     assert(T != Q, "invalid size");
+    // Only one- or two-vector table lookups are supported (encoding bits 12:11 select the form).
+    // The one-vector lookup was introduced in SVE1 and the two-vector lookup in SVE2.
+    assert(0 < reg_count && reg_count <= 2, "invalid number of registers");
+
+    int op11 = (reg_count == 1) ? 0b10 : 0b01;
+
     f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
-    f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
+    f(0b001, 15, 13), f(op11, 12, 11), f(0b0, 10), rf(Zn, 5), rf(Zd, 0);
+  }
+
+public:
+  // SVE/SVE2 programmable table lookup using a one- or two-vector table (zeroing)
+  void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) {
+    _sve_tbl(Zd, T, Zn, 1, Zm);
+  }
+
+  void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn1, FloatRegister Zn2, FloatRegister Zm) {
+    assert(Zn1->successor() == Zn2, "invalid order of registers");
+    _sve_tbl(Zd, T, Zn1, 2, Zm);
   }
 
   // Shuffle active elements of vector to the right and fill with zero
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index a4ecd56af08..e87cb478c8f 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -2858,3 +2858,124 @@ void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
     add(rfp, sp, framesize - 2 * wordSize);
   }
 }
+
+// Selects bytes from two source vectors (src1, src2) based on the byte indices in the index register
+// using the Neon "tbl" instruction and places each one in the destination vector byte corresponding
+// to its index byte. The lookup is always byte-wise (T8B/T16B), so callers selecting wider elements
+// must pass byte-level offsets; "index" may alias "dst". With N = vector_length_in_bytes:
+// If idx < N --> selects src1[idx] (idx is a byte of the index register)
+// Otherwise, selects src2[idx - N]
+void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
+                                                     FloatRegister src2, FloatRegister index,
+                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
+  assert_different_registers(dst, src1, src2, tmp);
+  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
+
+  if (vector_length_in_bytes == 16) {
+    assert(UseSVE <= 1, "sve must be <= 1");
+    assert(src1->successor() == src2, "Source registers must be ordered");
+    // If the vector length is 16B, use the Neon "tbl" instruction with a two-vector table
+    tbl(dst, size, src1, 2, index);
+  } else { // vector length == 8
+    assert(UseSVE == 0, "must be Neon only");
+    // Pack the low 64 bits of src1 and src2 into the lower and upper halves of tmp, because
+    // the Neon "tbl" instruction only supports looking up 16B table registers. We then use
+    // the Neon "tbl" instruction with a one-vector table (tmp).
+    ins(tmp, D, src1, 0, 0);
+    ins(tmp, D, src2, 1, 0);
+    tbl(dst, size, tmp, 1, index);
+  }
+}
+
+// Selects elements from two source vectors (src1, src2) based on index values in the index register
+// using SVE/SVE2 instructions and places them in the destination vector elements corresponding to
+// the index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
+// where NUM_ELEM is the number of BasicType elements per vector.
+// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
+// Otherwise, selects src2[idx - NUM_ELEM]
+void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
+                                                    FloatRegister src2, FloatRegister index,
+                                                    FloatRegister tmp, SIMD_RegVariant T,
+                                                    unsigned vector_length_in_bytes) {
+  assert_different_registers(dst, src1, src2, index, tmp);
+
+  if (vector_length_in_bytes == 8) {
+    // We need to fit both the source vectors (src1, src2) in a single vector register because the
+    // SVE "tbl" instruction is unpredicated and works on the entire vector, which can lead to
+    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
+    // instruction with a one-vector table (tmp).
+    assert(UseSVE >= 1, "sve must be >= 1");
+    ins(tmp, D, src1, 0, 0);
+    ins(tmp, D, src2, 1, 0);
+    sve_tbl(dst, T, tmp, index);
+  } else {  // UseSVE == 2 and vector_length_in_bytes > 8
+    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two-vector table.
+    // The assertion - vector_length_in_bytes == MaxVectorSize - ensures that this operation
+    // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
+    // with the only exception of 8B vector length.
+    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
+    assert(src1->successor() == src2, "Source registers must be ordered");
+    sve_tbl(dst, T, src1, src2, index);
+  }
+}
+
+void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
+                                                FloatRegister src2, FloatRegister index,
+                                                FloatRegister tmp, BasicType bt,
+                                                unsigned vector_length_in_bytes) {
+
+  assert_different_registers(dst, src1, src2, index, tmp);
+
+  // The cases that can reach this method are -
+  // - UseSVE = 0, vector_length_in_bytes = 8 or 16
+  // - UseSVE = 1, vector_length_in_bytes = 8 or 16
+  // - UseSVE = 2, vector_length_in_bytes >= 8
+  //
+  // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
+  // and UseSVE = 2 with vector_length_in_bytes >= 8
+  //
+  // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
+  // UseSVE = 1 with vector_length_in_bytes = 16
+
+  if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
+    SIMD_RegVariant T = elemType_to_regVariant(bt);
+    select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
+    return;
+  }
+
+  // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
+  assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
+  assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
+
+  bool isQ = vector_length_in_bytes == 16;
+
+  SIMD_Arrangement size1 = isQ ? T16B : T8B;
+  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
+
+  // The Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
+  // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
+  // The index values in the "index" register are in the range [0, 2 * NUM_ELEM) where NUM_ELEM
+  // is the number of elements that can fit in a vector. E.g. for T_SHORT with 64-bit vector length,
+  // the indices are in the range [0, 8).
+  // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
+  // Move a constant 0x02 into every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
+  // Multiply the index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
+  // Move a constant 0x0100 into every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
+  // Add the multiplied result to tmp for the byte level offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
+  // Use these offsets in the "tbl" instruction to select chunks of 2B. T_INT and T_FLOAT work the
+  // same way with 0x04 as the multiplier and 0x03020100 as the per-element byte offsets.
+
+  if (bt == T_BYTE) {
+    select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
+  } else {
+    int elem_size = (bt == T_SHORT) ? 2 : 4;
+    uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
+
+    mov(tmp, size1, elem_size);
+    mulv(dst, size2, index, tmp);
+    mov(tmp, size2, tbl_offset);
+    addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
+                                // to select a set of 2B/4B
+    select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
+  }
+}
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index 70e4265c7cc..233f600cb14 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -34,6 +34,15 @@
   void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
                                   enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
 
+  void select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
+                                    FloatRegister src2, FloatRegister index,
+                                    FloatRegister tmp, unsigned vector_length_in_bytes);
+
+  void select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
+                                   FloatRegister src2, FloatRegister index,
+                                   FloatRegister tmp, SIMD_RegVariant T,
+                                   unsigned vector_length_in_bytes);
+
  public:
   // jdk.internal.util.ArraysSupport.vectorizedHashCode
   address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0,
@@ -193,4 +202,9 @@
 
   void reconstruct_frame_pointer(Register rtmp);
 
+  // Select from a table of two vectors
+  void select_from_two_vectors(FloatRegister dst, FloatRegister src1, FloatRegister src2,
+                               FloatRegister index, FloatRegister tmp, BasicType bt,
+                               unsigned vector_length_in_bytes);
+
 #endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
index 933be1667c2..2eb748e350c 100644
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -1831,7 +1831,10 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
       }
       break;
     case Op_SelectFromTwoVector:
-       if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
+       if (size_in_bits < 128) {
+         return false;
+       }
+       if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
          return false;
        }
        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp
index 97c5dbe03ef..42bf1e20e24 100644
--- a/src/hotspot/share/opto/vectorIntrinsics.cpp
+++ b/src/hotspot/share/opto/vectorIntrinsics.cpp
@@ -2706,6 +2706,9 @@ bool LibraryCallKit::inline_vector_select_from_two_vectors() {
     index_elem_bt = T_LONG;
   }
 
+  // Check if the platform requires a VectorLoadShuffle node to be generated
+  bool need_load_shuffle = Matcher::vector_rearrange_requires_load_shuffle(index_elem_bt, num_elem);
+
   bool lowerSelectFromOp = false;
   if (!arch_supports_vector(Op_SelectFromTwoVector, num_elem, elem_bt, VecMaskNotUsed)) {
     int cast_vopc = VectorCastNode::opcode(-1, elem_bt, true);
@@ -2715,7 +2718,7 @@ bool LibraryCallKit::inline_vector_select_from_two_vectors() {
         !arch_supports_vector(Op_VectorMaskCast, num_elem, elem_bt, VecMaskNotUsed)          ||
         !arch_supports_vector(Op_VectorBlend, num_elem, elem_bt, VecMaskUseLoad)             ||
         !arch_supports_vector(Op_VectorRearrange, num_elem, elem_bt, VecMaskNotUsed)         ||
-        !arch_supports_vector(Op_VectorLoadShuffle, num_elem, index_elem_bt, VecMaskNotUsed) ||
+        (need_load_shuffle && !arch_supports_vector(Op_VectorLoadShuffle, num_elem, index_elem_bt, VecMaskNotUsed)) ||
         !arch_supports_vector(Op_Replicate, num_elem, index_elem_bt, VecMaskNotUsed)) {
       log_if_needed("  ** not supported: opc=%d vlen=%d etype=%s ismask=useload",
                     Op_SelectFromTwoVector, num_elem, type2name(elem_bt));
diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
index 5b2c18b0a2b..62274e2c10f 100644
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@@ -2087,6 +2087,10 @@ generate(SpecialCases, [["ccmn",   "__ ccmn(zr, zr, 3u, Assembler::LE);",
                         ["index",    "__ sve_index(z7, __ D, r5, 5);",                     "index\tz7.d, x5, #5"],
                         ["cpy",      "__ sve_cpy(z7, __ H, p3, r5);",                      "cpy\tz7.h, p3/m, w5"],
                         ["tbl",      "__ sve_tbl(z16, __ S, z17, z18);",                   "tbl\tz16.s, {z17.s}, z18.s"],
+                        ["tbl",      "__ sve_tbl(z16, __ B, z17, z18, z16);",              "tbl\tz16.b, {z17.b, z18.b}, z16.b"],
+                        ["tbl",      "__ sve_tbl(z16, __ H, z17, z18, z16);",              "tbl\tz16.h, {z17.h, z18.h}, z16.h"],
+                        ["tbl",      "__ sve_tbl(z16, __ S, z17, z18, z16);",              "tbl\tz16.s, {z17.s, z18.s}, z16.s"],
+                        ["tbl",      "__ sve_tbl(z16, __ D, z17, z18, z16);",              "tbl\tz16.d, {z17.d, z18.d}, z16.d"],
                         ["ld1w",     "__ sve_ld1w_gather(z15, p0, r5, z16);",              "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"],
                         ["ld1d",     "__ sve_ld1d_gather(z15, p0, r5, z16);",              "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw #3]"],
                         ["st1w",     "__ sve_st1w_scatter(z15, p0, r5, z16);",             "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"],
diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h
index d90c2479995..f08c69a27dd 100644
--- a/test/hotspot/gtest/aarch64/asmtest.out.h
+++ b/test/hotspot/gtest/aarch64/asmtest.out.h
@@ -1100,6 +1100,10 @@
     __ sve_index(z7, __ D, r5, 5);                     //       index   z7.d, x5, #5
     __ sve_cpy(z7, __ H, p3, r5);                      //       cpy     z7.h, p3/m, w5
     __ sve_tbl(z16, __ S, z17, z18);                   //       tbl     z16.s, {z17.s}, z18.s
+    __ sve_tbl(z16, __ B, z17, z18, z16);              //       tbl     z16.b, {z17.b, z18.b}, z16.b
+    __ sve_tbl(z16, __ H, z17, z18, z16);              //       tbl     z16.h, {z17.h, z18.h}, z16.h
+    __ sve_tbl(z16, __ S, z17, z18, z16);              //       tbl     z16.s, {z17.s, z18.s}, z16.s
+    __ sve_tbl(z16, __ D, z17, z18, z16);              //       tbl     z16.d, {z17.d, z18.d}, z16.d
     __ sve_ld1w_gather(z15, p0, r5, z16);              //       ld1w    {z15.s}, p0/z, [x5, z16.s, uxtw #2]
     __ sve_ld1d_gather(z15, p0, r5, z16);              //       ld1d    {z15.d}, p0/z, [x5, z16.d, uxtw #3]
     __ sve_st1w_scatter(z15, p0, r5, z16);             //       st1w    {z15.s}, p0, [x5, z16.s, uxtw #2]
@@ -1438,30 +1442,30 @@
     0x9101a1a0,     0xb10a5cc8,     0xd10810aa,     0xf10fd061,
     0x120cb166,     0x321764bc,     0x52174681,     0x720c0227,
     0x9241018e,     0xb25a2969,     0xd278b411,     0xf26aad01,
-    0x14000000,     0x17ffffd7,     0x140004b0,     0x94000000,
-    0x97ffffd4,     0x940004ad,     0x3400000a,     0x34fffa2a,
-    0x3400954a,     0x35000008,     0x35fff9c8,     0x350094e8,
-    0xb400000b,     0xb4fff96b,     0xb400948b,     0xb500001d,
-    0xb5fff91d,     0xb500943d,     0x10000013,     0x10fff8b3,
-    0x100093d3,     0x90000013,     0x36300016,     0x3637f836,
-    0x36309356,     0x3758000c,     0x375ff7cc,     0x375892ec,
+    0x14000000,     0x17ffffd7,     0x140004b4,     0x94000000,
+    0x97ffffd4,     0x940004b1,     0x3400000a,     0x34fffa2a,
+    0x340095ca,     0x35000008,     0x35fff9c8,     0x35009568,
+    0xb400000b,     0xb4fff96b,     0xb400950b,     0xb500001d,
+    0xb5fff91d,     0xb50094bd,     0x10000013,     0x10fff8b3,
+    0x10009453,     0x90000013,     0x36300016,     0x3637f836,
+    0x363093d6,     0x3758000c,     0x375ff7cc,     0x3758936c,
     0x128313a0,     0x528a32c7,     0x7289173b,     0x92ab3acc,
     0xd2a0bf94,     0xf2c285e8,     0x9358722f,     0x330e652f,
     0x53067f3b,     0x93577c53,     0xb34a1aac,     0xd35a4016,
     0x13946c63,     0x93c3dbc8,     0x54000000,     0x54fff5a0,
-    0x540090c0,     0x54000001,     0x54fff541,     0x54009061,
-    0x54000002,     0x54fff4e2,     0x54009002,     0x54000002,
-    0x54fff482,     0x54008fa2,     0x54000003,     0x54fff423,
-    0x54008f43,     0x54000003,     0x54fff3c3,     0x54008ee3,
-    0x54000004,     0x54fff364,     0x54008e84,     0x54000005,
-    0x54fff305,     0x54008e25,     0x54000006,     0x54fff2a6,
-    0x54008dc6,     0x54000007,     0x54fff247,     0x54008d67,
-    0x54000008,     0x54fff1e8,     0x54008d08,     0x54000009,
-    0x54fff189,     0x54008ca9,     0x5400000a,     0x54fff12a,
-    0x54008c4a,     0x5400000b,     0x54fff0cb,     0x54008beb,
-    0x5400000c,     0x54fff06c,     0x54008b8c,     0x5400000d,
-    0x54fff00d,     0x54008b2d,     0x5400000e,     0x54ffefae,
-    0x54008ace,     0x5400000f,     0x54ffef4f,     0x54008a6f,
+    0x54009140,     0x54000001,     0x54fff541,     0x540090e1,
+    0x54000002,     0x54fff4e2,     0x54009082,     0x54000002,
+    0x54fff482,     0x54009022,     0x54000003,     0x54fff423,
+    0x54008fc3,     0x54000003,     0x54fff3c3,     0x54008f63,
+    0x54000004,     0x54fff364,     0x54008f04,     0x54000005,
+    0x54fff305,     0x54008ea5,     0x54000006,     0x54fff2a6,
+    0x54008e46,     0x54000007,     0x54fff247,     0x54008de7,
+    0x54000008,     0x54fff1e8,     0x54008d88,     0x54000009,
+    0x54fff189,     0x54008d29,     0x5400000a,     0x54fff12a,
+    0x54008cca,     0x5400000b,     0x54fff0cb,     0x54008c6b,
+    0x5400000c,     0x54fff06c,     0x54008c0c,     0x5400000d,
+    0x54fff00d,     0x54008bad,     0x5400000e,     0x54ffefae,
+    0x54008b4e,     0x5400000f,     0x54ffef4f,     0x54008aef,
     0xd40658e1,     0xd4014d22,     0xd4046543,     0xd4273f60,
     0xd44cad80,     0xd503201f,     0xd503203f,     0xd503205f,
     0xd503209f,     0xd50320bf,     0xd503219f,     0xd50323bf,
@@ -1668,7 +1672,8 @@
     0x65d8a801,     0x65dcac01,     0x655cb241,     0x0520a1e0,
     0x0521a601,     0x052281e0,     0x05238601,     0x04a14026,
     0x042244a6,     0x046344a6,     0x04a444a6,     0x04e544a7,
-    0x0568aca7,     0x05b23230,     0x853040af,     0xc5b040af,
+    0x0568aca7,     0x05b23230,     0x05302a30,     0x05702a30,
+    0x05b02a30,     0x05f02a30,     0x853040af,     0xc5b040af,
     0xe57080af,     0xe5b080af,     0x25034440,     0x254054c4,
     0x25034640,     0x25415a05,     0x25834440,     0x25c54489,
     0x250b5d3a,     0x2550dc20,     0x2518e3e1,     0x2518e021,
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index a231f04b1e8..dd285777e2c 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -2851,6 +2851,36 @@ public class IRNode {
         fromBeforeRemoveUselessToFinalCode(BLACKHOLE, "Blackhole");
     }
 
+    // IR node names for SelectFromTwoVector, one constant per vector element
+    // type (byte, short, int, float, double, long). Each constant is passed to
+    // vectorNode() together with the node name "SelectFromTwoVector" and the
+    // matching TYPE_* constant.
+    public static final String SELECT_FROM_TWO_VECTOR_VB = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VB" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VB, "SelectFromTwoVector", TYPE_BYTE);
+    }
+
+    public static final String SELECT_FROM_TWO_VECTOR_VS = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VS" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VS, "SelectFromTwoVector", TYPE_SHORT);
+    }
+
+    public static final String SELECT_FROM_TWO_VECTOR_VI = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VI" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VI, "SelectFromTwoVector", TYPE_INT);
+    }
+
+    public static final String SELECT_FROM_TWO_VECTOR_VF = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VF" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VF, "SelectFromTwoVector", TYPE_FLOAT);
+    }
+
+    public static final String SELECT_FROM_TWO_VECTOR_VD = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VD" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VD, "SelectFromTwoVector", TYPE_DOUBLE);
+    }
+
+    public static final String SELECT_FROM_TWO_VECTOR_VL = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VL" + POSTFIX;
+    static {
+        vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG);
+    }
+
     /*
      * Utility methods to set up IR_NODE_MAPPINGS.
      */
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java
index 6662acf8e9e..16b4654013a 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java
@@ -105,6 +105,7 @@ public class IREncodingPrinter {
         "avx512f",
         "avx512_fp16",
         "avx512_vnni",
+        "avx512_vbmi",
         "bmi2",
         // AArch64
         "sha3",
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java b/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java
new file mode 100644
index 00000000000..3746578266c
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2025, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.generators.*;
+import compiler.lib.ir_framework.*;
+
+import jdk.incubator.vector.*;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+import java.util.Random;
+
+import jdk.test.lib.Asserts;
+import jdk.test.lib.Utils;
+/**
+ * @test
+ * @bug 8348868
+ * @library /test/lib /
+ * @summary Verify that SelectFromTwoVector IR node is correctly being
+ *          generated on aarch64 and x86
+ * @modules jdk.incubator.vector
+ * @run driver compiler.vectorapi.TestSelectFromTwoVectorOp
+ */
+
+public class TestSelectFromTwoVectorOp {
+    // Number of elements in every source, result and index-row array.
+    private static final int SIZE = 1024;
+    private static final Generators random = Generators.G;
+
+    // For each element type: two source arrays (xa, xb), the result array (xres)
+    // the kernel stores into, and a 2-D index array (xindex) holding one row of
+    // lane indices per tested species.
+    private static byte[] ba;
+    private static byte[] bb;
+    private static byte[] bres;
+    private static byte[][] bindex;
+
+    private static short[] sa;
+    private static short[] sb;
+    private static short[] sres;
+    private static short[][] sindex;
+
+    private static int[] ia;
+    private static int[] ib;
+    private static int[] ires;
+    private static int[][] iindex;
+
+    private static float[] fa;
+    private static float[] fb;
+    private static float[] fres;
+    private static float[][] findex;
+
+    private static long[] la;
+    private static long[] lb;
+    private static long[] lres;
+    private static long[][] lindex;
+
+    private static double[] da;
+    private static double[] db;
+    private static double[] dres;
+    private static double[][] dindex;
+
+    // Stores the possible number of elements that can be
+    // held in various vector sizes/shapes. The table is only read while the
+    // index arrays are populated, so the reference is final.
+    private static final int[] nums = {2, 4, 8, 16, 32, 64};
+
+    // Allocates every buffer exactly once, then fills the index rows and the
+    // source arrays with generated values.
+    static {
+        ba   = new byte[SIZE];
+        bb   = new byte[SIZE];
+        bres = new byte[SIZE];
+        bindex = new byte[4][SIZE];
+
+        sa   = new short[SIZE];
+        sb   = new short[SIZE];
+        sres = new short[SIZE];
+        sindex = new short[4][SIZE];
+
+        ia   = new int[SIZE];
+        ib   = new int[SIZE];
+        ires = new int[SIZE];
+        iindex = new int[4][SIZE];
+
+        fa   = new float[SIZE];
+        fb   = new float[SIZE];
+        fres = new float[SIZE];
+        findex = new float[4][SIZE];
+
+        la   = new long[SIZE];
+        lb   = new long[SIZE];
+        lres = new long[SIZE];
+        lindex = new long[3][SIZE];
+
+        da   = new double[SIZE];
+        db   = new double[SIZE];
+        dres = new double[SIZE];
+        dindex = new double[3][SIZE];
+
+        // Populate the indices. The rows were already allocated by the 2-D
+        // allocations above, so they are only filled here. Row i belongs to the
+        // i-th tested species of an element type; its lane count comes from nums:
+        // bytes start at nums[2] (8 lanes), shorts at nums[1] (4 lanes) and
+        // ints, floats, longs and doubles at nums[0] (2 lanes).
+        for (int i = 0; i < bindex.length; i++) {
+            // The index array contains indices in the range of [0, vector_length * 2)
+            Generator<Integer> byteGen1 = random.uniformInts(0, (nums[i + 2] * 2) - 1);
+            Generator<Integer> shortGen1 = random.uniformInts(0, (nums[i + 1] * 2) - 1);
+
+            for (int j = 0; j < SIZE; j++) {
+                bindex[i][j] = byteGen1.next().byteValue();
+                sindex[i][j] = shortGen1.next().shortValue();
+            }
+
+            // dindex and lindex only have three rows, one fewer than the other index arrays.
+            // NOTE(review): if uniformDoubles/uniformFloats treat the upper bound as
+            // exclusive, the largest index (2 * lanes - 1) is never produced for the
+            // double and float rows - confirm against the Generators library.
+            if (i < dindex.length) {
+                random.fill(random.uniformDoubles(0, (double) ((nums[i] * 2) - 1)), dindex[i]);
+                random.fill(random.uniformLongs(0, (long) ((nums[i] * 2) - 1)), lindex[i]);
+            }
+
+            random.fill(random.uniformInts(0, (nums[i] * 2) - 1), iindex[i]);
+            random.fill(random.uniformFloats(0, (float)((nums[i] * 2) - 1)), findex[i]);
+        }
+
+        // Populate the sources
+        Generator<Integer> byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE);
+        Generator<Integer> shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE);
+
+        for (int i = 0; i < SIZE; i++) {
+            ba[i] = byteGen.next().byteValue();
+            bb[i] = byteGen.next().byteValue();
+
+            sa[i] = shortGen.next().shortValue();
+            sb[i] = shortGen.next().shortValue();
+        }
+
+        random.fill(random.ints(), ia);
+        random.fill(random.ints(), ib);
+        random.fill(random.floats(), fa);
+        random.fill(random.floats(), fb);
+        random.fill(random.longs(), la);
+        random.fill(random.longs(), lb);
+        random.fill(random.doubles(), da);
+        random.fill(random.doubles(), db);
+    }
+
+    // Test SelectFromTwoVector operation for Bytes.
+    // For every full vector of SPECIES lanes, the values loaded from bindex are
+    // used as lane indices to select from the two vectors loaded from ba and bb;
+    // the selection is stored into the static bres array. The loop stops at
+    // SPECIES.loopBound(ba.length), so a partial tail is not processed.
+    // The species parameter is typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void ByteSelectFromTwoVectorKernel(VectorSpecies<Byte> SPECIES, byte[] ba,
+                                                     byte[] bb, byte[] bindex) {
+        for (int i = 0; i < SPECIES.loopBound(ba.length); i += SPECIES.length()) {
+            ByteVector.fromArray(SPECIES, bindex, i)
+                .selectFrom(ByteVector.fromArray(SPECIES, ba, i),
+                            ByteVector.fromArray(SPECIES, bb, i))
+                .intoArray(bres, i);
+        }
+    }
+
+    // 64-bit species (8 byte lanes): the node is expected whenever asimd is
+    // available and MaxVectorSize >= 8.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=8"},
+        applyIfCPUFeature = {"asimd", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Byte64() {
+        VectorSpecies<Byte> species = ByteVector.SPECIES_64;
+        ByteSelectFromTwoVectorKernel(species, ba, bb, bindex[0]);
+    }
+
+    // 128-bit species (16 byte lanes): the node is expected with asimd, and with
+    // avx512_vbmi plus avx512vl; both rules require MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeature = {"asimd", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_16, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_16, ">0"})
+    public static void selectFromTwoVector_Byte128() {
+        VectorSpecies<Byte> species = ByteVector.SPECIES_128;
+        ByteSelectFromTwoVectorKernel(species, ba, bb, bindex[1]);
+    }
+
+    // 256-bit species (32 byte lanes): the node must be absent on asimd without
+    // sve2, and present with sve2 or with avx512_vbmi plus avx512vl; all rules
+    // require MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeature = {"sve2", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32, ">0"})
+    public static void selectFromTwoVector_Byte256() {
+        VectorSpecies<Byte> species = ByteVector.SPECIES_256;
+        ByteSelectFromTwoVectorKernel(species, ba, bb, bindex[2]);
+    }
+
+    // 512-bit species (64 byte lanes): the node must be absent on asimd without
+    // sve2, and present with sve2 or with avx512_vbmi plus avx512f; all rules
+    // require MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeature = {"sve2", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64, ">0"})
+    public static void selectFromTwoVector_Byte512() {
+        VectorSpecies<Byte> species = ByteVector.SPECIES_512;
+        ByteSelectFromTwoVectorKernel(species, ba, bb, bindex[3]);
+    }
+
+    // Test SelectFromTwoVector operation for Shorts.
+    // For every full vector of SPECIES lanes, the values loaded from sindex are
+    // used as lane indices to select from the two vectors loaded from sa and sb;
+    // the selection is stored into the static sres array. The loop stops at
+    // SPECIES.loopBound(sa.length), so a partial tail is not processed.
+    // The species parameter is typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void ShortSelectFromTwoVectorKernel(VectorSpecies<Short> SPECIES, short[] sa,
+                                                      short[] sb, short[] sindex) {
+        for (int i = 0; i < SPECIES.loopBound(sa.length); i += SPECIES.length()) {
+            ShortVector.fromArray(SPECIES, sindex, i)
+                .selectFrom(ShortVector.fromArray(SPECIES, sa, i),
+                            ShortVector.fromArray(SPECIES, sb, i))
+                .intoArray(sres, i);
+        }
+    }
+
+    // 64-bit species (4 short lanes): the node is expected whenever asimd is
+    // available and MaxVectorSize >= 8.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=8"},
+        applyIfCPUFeature = {"asimd", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_4, ">0"})
+    public static void selectFromTwoVector_Short64() {
+        VectorSpecies<Short> species = ShortVector.SPECIES_64;
+        ShortSelectFromTwoVectorKernel(species, sa, sb, sindex[0]);
+    }
+
+    // 128-bit species (8 short lanes): the node is expected on asimd without
+    // sve2, with sve2, and with avx512bw plus avx512vl; all rules require
+    // MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeature = {"sve2", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Short128() {
+        VectorSpecies<Short> species = ShortVector.SPECIES_128;
+        ShortSelectFromTwoVectorKernel(species, sa, sb, sindex[1]);
+    }
+
+    // 256-bit species (16 short lanes): the node must be absent on asimd without
+    // sve2, and present with sve2 or with avx512bw plus avx512vl; all rules
+    // require MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeature = {"sve2", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16, ">0"})
+    public static void selectFromTwoVector_Short256() {
+        VectorSpecies<Short> species = ShortVector.SPECIES_256;
+        ShortSelectFromTwoVectorKernel(species, sa, sb, sindex[2]);
+    }
+
+    // 512-bit species (32 short lanes): the node must be absent on asimd without
+    // sve2, and present with sve2 or with avx512bw plus avx512f; all rules
+    // require MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeature = {"sve2", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32, ">0"})
+    public static void selectFromTwoVector_Short512() {
+        VectorSpecies<Short> species = ShortVector.SPECIES_512;
+        ShortSelectFromTwoVectorKernel(species, sa, sb, sindex[3]);
+    }
+
+    // Test SelectFromTwoVector operation for Ints.
+    // For every full vector of SPECIES lanes, the values loaded from iindex are
+    // used as lane indices to select from the two vectors loaded from ia and ib;
+    // the selection is stored into the static ires array. The loop stops at
+    // SPECIES.loopBound(ia.length), so a partial tail is not processed.
+    // The species parameter is typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void IntSelectFromTwoVectorKernel(VectorSpecies<Integer> SPECIES, int[] ia,
+                                                    int[] ib, int[] iindex) {
+        for (int i = 0; i < SPECIES.loopBound(ia.length); i += SPECIES.length()) {
+            IntVector.fromArray(SPECIES, iindex, i)
+                .selectFrom(IntVector.fromArray(SPECIES, ia, i),
+                            IntVector.fromArray(SPECIES, ib, i))
+                .intoArray(ires, i);
+        }
+    }
+
+    // 64-bit species (2 int lanes): the node is expected whenever asimd is
+    // available and MaxVectorSize >= 8. The single CPU-feature pair is given
+    // through applyIfCPUFeature, like the Byte64 and Short64 rules in this file.
+    @Test
+    @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_2, ">0"},
+        applyIfCPUFeature = {"asimd", "true"},
+        applyIf = {"MaxVectorSize", ">=8"})
+    public static void selectFromTwoVector_Int64() {
+        IntSelectFromTwoVectorKernel(IntVector.SPECIES_64, ia, ib, iindex[0]);
+    }
+
+    // 128-bit species (4 int lanes): the node is expected on asimd without sve2,
+    // and with either sve2 or avx512vl; both rules require MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_4, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_4, ">0"})
+    public static void selectFromTwoVector_Int128() {
+        VectorSpecies<Integer> species = IntVector.SPECIES_128;
+        IntSelectFromTwoVectorKernel(species, ia, ib, iindex[1]);
+    }
+
+    // 256-bit species (8 int lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_8})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Int256() {
+        VectorSpecies<Integer> species = IntVector.SPECIES_256;
+        IntSelectFromTwoVectorKernel(species, ia, ib, iindex[2]);
+    }
+
+    // 512-bit species (16 int lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512f; both rules require
+    // MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_16})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_16, ">0"})
+    public static void selectFromTwoVector_Int512() {
+        VectorSpecies<Integer> species = IntVector.SPECIES_512;
+        IntSelectFromTwoVectorKernel(species, ia, ib, iindex[3]);
+    }
+
+    // Test SelectFromTwoVector operation for Floats.
+    // For every full vector of SPECIES lanes, the values loaded from findex are
+    // used as lane indices to select from the two vectors loaded from fa and fb;
+    // the selection is stored into the static fres array.
+    // The loop is bounded by the length of this kernel's own source array (fa)
+    // rather than the unrelated static int array, and the species parameter is
+    // typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void FloatSelectFromTwoVectorKernel(VectorSpecies<Float> SPECIES, float[] fa,
+                                                      float[] fb, float[] findex) {
+        for (int i = 0; i < SPECIES.loopBound(fa.length); i += SPECIES.length()) {
+            FloatVector.fromArray(SPECIES, findex, i)
+                .selectFrom(FloatVector.fromArray(SPECIES, fa, i),
+                            FloatVector.fromArray(SPECIES, fb, i))
+                .intoArray(fres, i);
+        }
+    }
+
+    // 64-bit species (2 float lanes): the node is expected whenever asimd is
+    // available and MaxVectorSize >= 8. The single CPU-feature pair is given
+    // through applyIfCPUFeature, like the Byte64 and Short64 rules in this file.
+    @Test
+    @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_2, ">0"},
+        applyIfCPUFeature = {"asimd", "true"},
+        applyIf = {"MaxVectorSize", ">=8"})
+    public static void selectFromTwoVector_Float64() {
+        FloatSelectFromTwoVectorKernel(FloatVector.SPECIES_64, fa, fb, findex[0]);
+    }
+
+    // 128-bit species (4 float lanes): the node is expected on asimd without
+    // sve2, and with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_4, ">0"})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_4, ">0"})
+    public static void selectFromTwoVector_Float128() {
+        VectorSpecies<Float> species = FloatVector.SPECIES_128;
+        FloatSelectFromTwoVectorKernel(species, fa, fb, findex[1]);
+    }
+
+    // 256-bit species (8 float lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_8})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Float256() {
+        VectorSpecies<Float> species = FloatVector.SPECIES_256;
+        FloatSelectFromTwoVectorKernel(species, fa, fb, findex[2]);
+    }
+
+    // 512-bit species (16 float lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512f; both rules require
+    // MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_16})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_16, ">0"})
+    public static void selectFromTwoVector_Float512() {
+        VectorSpecies<Float> species = FloatVector.SPECIES_512;
+        FloatSelectFromTwoVectorKernel(species, fa, fb, findex[3]);
+    }
+
+    // Test SelectFromTwoVector operation for Doubles.
+    // For every full vector of SPECIES lanes, the values loaded from dindex are
+    // used as lane indices to select from the two vectors loaded from da and db;
+    // the selection is stored into the static dres array.
+    // The loop is bounded by the length of this kernel's own source array (da)
+    // rather than the unrelated static int array, and the species parameter is
+    // typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void DoubleSelectFromTwoVectorKernel(VectorSpecies<Double> SPECIES, double[] da,
+                                                       double[] db, double[] dindex) {
+        for (int i = 0; i < SPECIES.loopBound(da.length); i += SPECIES.length()) {
+            DoubleVector.fromArray(SPECIES, dindex, i)
+                .selectFrom(DoubleVector.fromArray(SPECIES, da, i),
+                            DoubleVector.fromArray(SPECIES, db, i))
+                .intoArray(dres, i);
+        }
+    }
+
+    // 128-bit species (2 double lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_2})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_2, ">0"})
+    public static void selectFromTwoVector_Double128() {
+        VectorSpecies<Double> species = DoubleVector.SPECIES_128;
+        DoubleSelectFromTwoVectorKernel(species, da, db, dindex[0]);
+    }
+
+    // 256-bit species (4 double lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_4})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_4, ">0"})
+    public static void selectFromTwoVector_Double256() {
+        VectorSpecies<Double> species = DoubleVector.SPECIES_256;
+        DoubleSelectFromTwoVectorKernel(species, da, db, dindex[1]);
+    }
+
+    // 512-bit species (8 double lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512f; both rules require
+    // MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_8})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Double512() {
+        VectorSpecies<Double> species = DoubleVector.SPECIES_512;
+        DoubleSelectFromTwoVectorKernel(species, da, db, dindex[2]);
+    }
+
+    // Test SelectFromTwoVector operation for Longs.
+    // For every full vector of SPECIES lanes, the values loaded from lindex are
+    // used as lane indices to select from the two vectors loaded from la and lb;
+    // the selection is stored into the static lres array.
+    // The loop is bounded by the length of this kernel's own source array (la)
+    // rather than the unrelated static int array, and the species parameter is
+    // typed (not raw) to avoid unchecked conversions.
+    @ForceInline
+    public static void LongSelectFromTwoVectorKernel(VectorSpecies<Long> SPECIES, long[] la,
+                                                     long[] lb, long[] lindex) {
+        for (int i = 0; i < SPECIES.loopBound(la.length); i += SPECIES.length()) {
+            LongVector.fromArray(SPECIES, lindex, i)
+                .selectFrom(LongVector.fromArray(SPECIES, la, i),
+                            LongVector.fromArray(SPECIES, lb, i))
+                .intoArray(lres, i);
+        }
+    }
+
+    // 128-bit species (2 long lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 16.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_2})
+    @IR(applyIf = {"MaxVectorSize", ">=16"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_2, ">0"})
+    public static void selectFromTwoVector_Long128() {
+        VectorSpecies<Long> species = LongVector.SPECIES_128;
+        LongSelectFromTwoVectorKernel(species, la, lb, lindex[0]);
+    }
+
+    // 256-bit species (4 long lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512vl; both rules require
+    // MaxVectorSize >= 32.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_4})
+    @IR(applyIf = {"MaxVectorSize", ">=32"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_4, ">0"})
+    public static void selectFromTwoVector_Long256() {
+        VectorSpecies<Long> species = LongVector.SPECIES_256;
+        LongSelectFromTwoVectorKernel(species, la, lb, lindex[1]);
+    }
+
+    // 512-bit species (8 long lanes): the node must be absent on asimd without
+    // sve2, and present with either sve2 or avx512f; both rules require
+    // MaxVectorSize >= 64.
+    @Test
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"},
+        failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_8})
+    @IR(applyIf = {"MaxVectorSize", ">=64"},
+        applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"},
+        counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_8, ">0"})
+    public static void selectFromTwoVector_Long512() {
+        VectorSpecies<Long> species = LongVector.SPECIES_512;
+        LongSelectFromTwoVectorKernel(species, la, lb, lindex[2]);
+    }
+
+    // Entry point: runs the tests of this class through TestFramework, passing
+    // the flag that adds the incubating vector module.
+    public static void main(String[] args) {
+        final String vectorModuleFlag = "--add-modules=jdk.incubator.vector";
+        TestFramework.runWithFlags(vectorModuleFlag);
+    }
+}