From ba658a71ba4372b42a496edee55400f5014815d4 Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Tue, 25 Mar 2025 01:35:27 +0000 Subject: [PATCH] 8349522: AArch64: Add backend implementation for new unsigned and saturating vector operations Reviewed-by: epeter, haosun, bkilambi --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 285 +++++- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 153 ++- src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 29 +- test/hotspot/gtest/aarch64/aarch64-asmtest.py | 32 + test/hotspot/gtest/aarch64/asmtest.out.h | 960 ++++++++++-------- .../compiler/lib/ir_framework/IRNode.java | 15 + .../ir_framework/test/IREncodingPrinter.java | 1 + .../VectorSaturatedOperationsTest.java | 148 ++- 8 files changed, 1146 insertions(+), 477 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index d217fb7e949..ae51df2fea6 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -254,6 +254,13 @@ source %{ case Op_CompressBitsV: case Op_ExpandBitsV: return false; + case Op_SaturatingAddV: + case Op_SaturatingSubV: + // Only SVE2 supports the predicated saturating instructions. + if (UseSVE < 2) { + return false; + } + break; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. // Hence we turn to check whether Op_LoadVectorMasked is supported. The // same as vector store/gather/scatter. 
@@ -1539,6 +1546,142 @@ instruct vand_notL_masked(vReg dst_src1, vReg src2, immL_M1 m1, pRegGov pg) %{ ins_pipe(pipe_slow); %} +// ------------------------- Vector saturating add ----------------------------- + +// Signed saturating add + +instruct vsqadd(vReg dst, vReg src1, vReg src2) %{ + predicate(!n->as_SaturatingVector()->is_unsigned()); + match(Set dst (SaturatingAddV src1 src2)); + format %{ "vsqadd $dst, $src1, $src2" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + if (VM_Version::use_neon_for_vector(length_in_bytes)) { + __ sqaddv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + } else { + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_sqadd($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $src1$$FloatRegister, $src2$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + +instruct vsqadd_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE == 2 && !n->as_SaturatingVector()->is_unsigned()); + match(Set dst_src1 (SaturatingAddV (Binary dst_src1 src2) pg)); + format %{ "vsqadd_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_sqadd($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// Unsigned saturating add + +instruct vuqadd(vReg dst, vReg src1, vReg src2) %{ + predicate(n->as_SaturatingVector()->is_unsigned()); + match(Set dst (SaturatingAddV src1 src2)); + format %{ "vuqadd $dst, $src1, $src2" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + if (VM_Version::use_neon_for_vector(length_in_bytes)) { + __ uqaddv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + } else { + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + __ 
sve_uqadd($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $src1$$FloatRegister, $src2$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + +instruct vuqadd_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE == 2 && n->as_SaturatingVector()->is_unsigned()); + match(Set dst_src1 (SaturatingAddV (Binary dst_src1 src2) pg)); + format %{ "vuqadd_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_uqadd($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// ------------------------- Vector saturating sub ----------------------------- + +// Signed saturating sub + +instruct vsqsub(vReg dst, vReg src1, vReg src2) %{ + predicate(!n->as_SaturatingVector()->is_unsigned()); + match(Set dst (SaturatingSubV src1 src2)); + format %{ "vsqsub $dst, $src1, $src2" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + if (VM_Version::use_neon_for_vector(length_in_bytes)) { + __ sqsubv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + } else { + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_sqsub($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $src1$$FloatRegister, $src2$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + +instruct vsqsub_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE == 2 && !n->as_SaturatingVector()->is_unsigned()); + match(Set dst_src1 (SaturatingSubV (Binary dst_src1 src2) pg)); + format %{ "vsqsub_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_sqsub($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// Unsigned saturating sub + +instruct vuqsub(vReg dst, vReg src1, vReg 
src2) %{ + predicate(n->as_SaturatingVector()->is_unsigned()); + match(Set dst (SaturatingSubV src1 src2)); + format %{ "vuqsub $dst, $src1, $src2" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + if (VM_Version::use_neon_for_vector(length_in_bytes)) { + __ uqsubv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + } else { + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_uqsub($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $src1$$FloatRegister, $src2$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + +instruct vuqsub_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE == 2 && n->as_SaturatingVector()->is_unsigned()); + match(Set dst_src1 (SaturatingSubV (Binary dst_src1 src2) pg)); + format %{ "vuqsub_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_uqsub($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector abs ----------------------------------- // vector abs @@ -1993,6 +2136,76 @@ instruct vmin_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ ins_pipe(pipe_slow); %} +// vector unsigned min - LONG + +instruct vuminL_neon(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_LONG); + match(Set dst (UMinV src1 src2)); + effect(TEMP_DEF dst); + format %{ "vuminL_neon $dst, $src1, $src2\t# 2L" %} + ins_encode %{ + __ cm(Assembler::HI, $dst$$FloatRegister, __ T2D, $src1$$FloatRegister, $src2$$FloatRegister); + __ bsl($dst$$FloatRegister, __ T16B, $src2$$FloatRegister, $src1$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vuminL_sve(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG); + match(Set dst_src1 
(UMinV dst_src1 src2)); + format %{ "vuminL_sve $dst_src1, $dst_src1, $src2" %} + ins_encode %{ + __ sve_umin($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// vector unsigned min - B/S/I + +instruct vumin_neon(vReg dst, vReg src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (UMinV src1 src2)); + format %{ "vumin_neon $dst, $src1, $src2\t# B/S/I" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ uminv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vumin_sve(vReg dst_src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (UMinV dst_src1 src2)); + format %{ "vumin_sve $dst_src1, $dst_src1, $src2\t# B/S/I" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ sve_umin($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + ptrue, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// vector unsigned min - predicated + +instruct vumin_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE > 0); + match(Set dst_src1 (UMinV (Binary dst_src1 src2) pg)); + format %{ "vumin_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt), "unsupported type"); + __ sve_umin($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector max 
----------------------------------- // vector max - LONG @@ -2080,6 +2293,76 @@ instruct vmax_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ ins_pipe(pipe_slow); %} +// vector unsigned max - LONG + +instruct vumaxL_neon(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_LONG); + match(Set dst (UMaxV src1 src2)); + effect(TEMP_DEF dst); + format %{ "vumaxL_neon $dst, $src1, $src2\t# 2L" %} + ins_encode %{ + __ cm(Assembler::HI, $dst$$FloatRegister, __ T2D, $src1$$FloatRegister, $src2$$FloatRegister); + __ bsl($dst$$FloatRegister, __ T16B, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vumaxL_sve(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG); + match(Set dst_src1 (UMaxV dst_src1 src2)); + format %{ "vumaxL_sve $dst_src1, $dst_src1, $src2" %} + ins_encode %{ + __ sve_umax($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// vector unsigned max - B/S/I + +instruct vumax_neon(vReg dst, vReg src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (UMaxV src1 src2)); + format %{ "vumax_neon $dst, $src1, $src2\t# B/S/I" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ umaxv($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vumax_sve(vReg dst_src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (UMaxV dst_src1 src2)); + format %{ "vumax_sve $dst_src1, $dst_src1, $src2\t# B/S/I" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + BasicType bt = 
Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ sve_umax($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + ptrue, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// vector unsigned max - predicated + +instruct vumax_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE > 0); + match(Set dst_src1 (UMaxV (Binary dst_src1 src2) pg)); + format %{ "vumax_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt), "unsupported type"); + __ sve_umax($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ MLA RELATED ---------------------------------- // vector mla diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 422e98d9b68..5d999299f03 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -244,6 +244,13 @@ source %{ case Op_CompressBitsV: case Op_ExpandBitsV: return false; + case Op_SaturatingAddV: + case Op_SaturatingSubV: + // Only SVE2 supports the predicated saturating instructions. + if (UseSVE < 2) { + return false; + } + break; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. // Hence we turn to check whether Op_LoadVectorMasked is supported. The // same as vector store/gather/scatter. 
@@ -813,6 +820,65 @@ dnl VECTOR_AND_NOT_PREDICATE(I) VECTOR_AND_NOT_PREDICATE(L) +dnl +dnl VECTOR_SATURATING_OP($1, $2, $3 ) +dnl VECTOR_SATURATING_OP(prefix, op, op_name) +define(`VECTOR_SATURATING_OP', ` +instruct v$1$2(vReg dst, vReg src1, vReg src2) %{ + predicate(ifelse($1, sq, `!',`')n->as_SaturatingVector()->is_unsigned()); + match(Set dst ($3 src1 src2)); + format %{ "v$1$2 $dst, $src1, $src2" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + if (VM_Version::use_neon_for_vector(length_in_bytes)) { + __ $1$2v($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + } else { + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_$1$2($dst$$FloatRegister, __ elemType_to_regVariant(bt), + $src1$$FloatRegister, $src2$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl +dnl VECTOR_SATURATING_PREDICATE($1, $2, $3 ) +dnl VECTOR_SATURATING_PREDICATE(prefix, op, op_name) +define(`VECTOR_SATURATING_PREDICATE', ` +instruct v$1$2_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE == 2 && ifelse($1, sq, `!',`')n->as_SaturatingVector()->is_unsigned()); + match(Set dst_src1 ($3 (Binary dst_src1 src2) pg)); + format %{ "v$1$2_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_$1$2($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// ------------------------- Vector saturating add ----------------------------- + +// Signed saturating add +VECTOR_SATURATING_OP(sq, add, SaturatingAddV) +VECTOR_SATURATING_PREDICATE(sq, add, SaturatingAddV) + +// Unsigned saturating add +VECTOR_SATURATING_OP(uq, add, SaturatingAddV) +VECTOR_SATURATING_PREDICATE(uq, add, SaturatingAddV) + +// ------------------------- Vector saturating sub ----------------------------- + +// Signed 
saturating sub +VECTOR_SATURATING_OP(sq, sub, SaturatingSubV) +VECTOR_SATURATING_PREDICATE(sq, sub, SaturatingSubV) + +// Unsigned saturating sub +VECTOR_SATURATING_OP(uq, sub, SaturatingSubV) +VECTOR_SATURATING_PREDICATE(uq, sub, SaturatingSubV) + dnl dnl UNARY_OP($1, $2, $3, $4, $5 ) dnl UNARY_OP(rule_name, op_name, insn_neon, insn_sve, size) @@ -964,17 +1030,17 @@ UNARY_OP_PREDICATE_WITH_SIZE(vsqrtF, SqrtVF, sve_fsqrt, S) UNARY_OP_PREDICATE_WITH_SIZE(vsqrtD, SqrtVD, sve_fsqrt, D) dnl -dnl VMINMAX_L_NEON($1, $2 ) -dnl VMINMAX_L_NEON(type, op_name) +dnl VMINMAX_L_NEON($1, $2 , $3 ) +dnl VMINMAX_L_NEON(type, op_name, sign) define(`VMINMAX_L_NEON', ` -instruct v$1L_neon(vReg dst, vReg src1, vReg src2) %{ +instruct v$3$1L_neon(vReg dst, vReg src1, vReg src2) %{ predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_LONG); match(Set dst ($2 src1 src2)); effect(TEMP_DEF dst); - format %{ "v$1L_neon $dst, $src1, $src2\t# 2L" %} + format %{ "v$3$1L_neon $dst, $src1, $src2\t# 2L" %} ins_encode %{ - __ cm(Assembler::GT, $dst$$FloatRegister, __ T2D, $src1$$FloatRegister, $src2$$FloatRegister); - __ bsl($dst$$FloatRegister, __ T16B, ifelse(min, $1, $src2, $src1)$$FloatRegister, ifelse(min, $1, $src1, $src2)$$FloatRegister); + __ cm(Assembler::ifelse($3, u, HI, GT), $dst$$FloatRegister, __ T2D, $src1$$FloatRegister, $src2$$FloatRegister); + __ bsl($dst$$FloatRegister, __ T16B, ifelse($1, min, $src2, $src1)$$FloatRegister, ifelse(min, $1, $src1, $src2)$$FloatRegister); %} ins_pipe(pipe_slow); %}')dnl @@ -1058,6 +1124,57 @@ instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ ins_pipe(pipe_slow); %}')dnl dnl +dnl VUMINMAX_NEON($1, $2, $3 ) +dnl VUMINMAX_NEON(type, op_name, insn) +define(`VUMINMAX_NEON', ` +instruct v$1_neon(vReg dst, vReg src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst ($2 src1 src2)); + format %{ "v$1_neon $dst, $src1, 
$src2\t# B/S/I" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ $3($dst$$FloatRegister, get_arrangement(this), + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VUMINMAX_SVE($1, $2, $3 ) +dnl VUMINMAX_SVE(type, op_name, insn) +define(`VUMINMAX_SVE', ` +instruct v$1_sve(vReg dst_src1, vReg src2) %{ + predicate(Matcher::vector_element_basic_type(n) != T_LONG && + !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 ($2 dst_src1 src2)); + format %{ "v$1_sve $dst_src1, $dst_src1, $src2\t# B/S/I" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt) && bt != T_LONG, "unsupported type"); + __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + ptrue, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VUMINMAX_PREDICATE($1, $2, $3 ) +dnl VUMINMAX_PREDICATE(type, op_name, insn) +define(`VUMINMAX_PREDICATE', ` +instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{ + predicate(UseSVE > 0); + match(Set dst_src1 ($2 (Binary dst_src1 src2) pg)); + format %{ "v$1_masked $dst_src1, $pg, $dst_src1, $src2" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + assert(is_integral_type(bt), "unsupported type"); + __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), + $pg$$PRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl // ------------------------------ Vector min ----------------------------------- // vector min - LONG @@ -1071,6 +1188,17 @@ VMINMAX_SVE(min, MinV, sve_fmin, sve_smin) // vector min - predicated VMINMAX_PREDICATE(min, MinV, sve_fmin, sve_smin) +// vector unsigned min - LONG +VMINMAX_L_NEON(min, UMinV, u) +VMINMAX_L_SVE(umin, UMinV, sve_umin) + +// vector unsigned min - B/S/I +VUMINMAX_NEON(umin, 
UMinV, uminv) +VUMINMAX_SVE(umin, UMinV, sve_umin) + +// vector unsigned min - predicated +VUMINMAX_PREDICATE(umin, UMinV, sve_umin) + // ------------------------------ Vector max ----------------------------------- // vector max - LONG @@ -1084,6 +1212,17 @@ VMINMAX_SVE(max, MaxV, sve_fmax, sve_smax) // vector max - predicated VMINMAX_PREDICATE(max, MaxV, sve_fmax, sve_smax) +// vector unsigned max - LONG +VMINMAX_L_NEON(max, UMaxV, u) +VMINMAX_L_SVE(umax, UMaxV, sve_umax) + +// vector unsigned max - B/S/I +VUMINMAX_NEON(umax, UMaxV, umaxv) +VUMINMAX_SVE(umax, UMaxV, sve_umax) + +// vector unsigned max - predicated +VUMINMAX_PREDICATE(umax, UMaxV, sve_umax) + // ------------------------------ MLA RELATED ---------------------------------- // vector mla diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index c91a136e7d5..3db7d308844 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -2600,6 +2600,9 @@ template INSN(addv, 0, 0b100001, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D INSN(subv, 1, 0b100001, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(sqaddv, 0, 0b000011, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(sqsubv, 0, 0b001011, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D + INSN(uqaddv, 1, 0b000011, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D INSN(uqsubv, 1, 0b001011, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D INSN(mulv, 0, 0b100111, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(mlav, 0, 0b100101, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S @@ -2613,6 +2616,8 @@ template INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S 
INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S + INSN(umaxv, 1, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S + INSN(uminv, 1, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(smaxp, 0, 0b101001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(sminp, 0, 0b101011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S INSN(sqdmulh,0, 0b101101, false); // accepted arrangements: T4H, T8H, T2S, T4S @@ -3323,8 +3328,12 @@ public: f(0b00000100, 31, 24), f(T, 23, 22), f(1, 21), \ rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ } - INSN(sve_add, 0b000); - INSN(sve_sub, 0b001); + INSN(sve_add, 0b000); + INSN(sve_sub, 0b001); + INSN(sve_sqadd, 0b100); + INSN(sve_sqsub, 0b110); + INSN(sve_uqadd, 0b101); + INSN(sve_uqsub, 0b111); #undef INSN // SVE integer add/subtract immediate (unpredicated) @@ -3435,6 +3444,8 @@ public: INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar INSN(sve_sub, 0b00000100, 0b000001000); // vector sub INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar + INSN(sve_umax, 0b00000100, 0b001001000); // unsigned maximum vectors + INSN(sve_umin, 0b00000100, 0b001011000); // unsigned minimum vectors #undef INSN // SVE floating-point arithmetic - predicate @@ -4226,6 +4237,20 @@ public: INSN(sve_eor3, 0b001); // Bitwise exclusive OR of three vectors #undef INSN +// SVE2 saturating operations - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm) { \ + assert(T != Q, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zdn, T, Pg, Znm); \ + } + + INSN(sve_sqadd, 0b01000100, 0b011000100); // signed saturating add + INSN(sve_sqsub, 0b01000100, 0b011010100); // signed saturating sub + INSN(sve_uqadd, 0b01000100, 0b011001100); // unsigned saturating add + INSN(sve_uqsub, 
0b01000100, 0b011011100); // unsigned saturating sub + +#undef INSN + Assembler(CodeBuffer* code) : AbstractAssembler(code) { } diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index 45ecf57e971..92868e783dc 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -1792,12 +1792,28 @@ generate(ThreeRegNEONOp, ["addv", "add", "4H"], ["addv", "add", "8H"], ["addv", "add", "2S"], ["addv", "add", "4S"], ["addv", "add", "2D"], + ["sqaddv", "sqadd", "8B"], ["sqaddv", "sqadd", "16B"], + ["sqaddv", "sqadd", "4H"], ["sqaddv", "sqadd", "8H"], + ["sqaddv", "sqadd", "2S"], ["sqaddv", "sqadd", "4S"], + ["sqaddv", "sqadd", "2D"], + ["uqaddv", "uqadd", "8B"], ["uqaddv", "uqadd", "16B"], + ["uqaddv", "uqadd", "4H"], ["uqaddv", "uqadd", "8H"], + ["uqaddv", "uqadd", "2S"], ["uqaddv", "uqadd", "4S"], + ["uqaddv", "uqadd", "2D"], ["fadd", "fadd", "2S"], ["fadd", "fadd", "4S"], ["fadd", "fadd", "2D"], ["subv", "sub", "8B"], ["subv", "sub", "16B"], ["subv", "sub", "4H"], ["subv", "sub", "8H"], ["subv", "sub", "2S"], ["subv", "sub", "4S"], ["subv", "sub", "2D"], + ["sqsubv", "sqsub", "8B"], ["sqsubv", "sqsub", "16B"], + ["sqsubv", "sqsub", "4H"], ["sqsubv", "sqsub", "8H"], + ["sqsubv", "sqsub", "2S"], ["sqsubv", "sqsub", "4S"], + ["sqsubv", "sqsub", "2D"], + ["uqsubv", "uqsub", "8B"], ["uqsubv", "uqsub", "16B"], + ["uqsubv", "uqsub", "4H"], ["uqsubv", "uqsub", "8H"], + ["uqsubv", "uqsub", "2S"], ["uqsubv", "uqsub", "4S"], + ["uqsubv", "uqsub", "2D"], ["fsub", "fsub", "2S"], ["fsub", "fsub", "4S"], ["fsub", "fsub", "2D"], ["mulv", "mul", "8B"], ["mulv", "mul", "16B"], @@ -1822,6 +1838,9 @@ generate(ThreeRegNEONOp, ["maxv", "smax", "8B"], ["maxv", "smax", "16B"], ["maxv", "smax", "4H"], ["maxv", "smax", "8H"], ["maxv", "smax", "2S"], ["maxv", "smax", "4S"], + ["umaxv", "umax", "8B"], ["umaxv", "umax", "16B"], + ["umaxv", "umax", "4H"], ["umaxv", "umax", "8H"], + ["umaxv", 
"umax", "2S"], ["umaxv", "umax", "4S"], ["smaxp", "smaxp", "8B"], ["smaxp", "smaxp", "16B"], ["smaxp", "smaxp", "4H"], ["smaxp", "smaxp", "8H"], ["smaxp", "smaxp", "2S"], ["smaxp", "smaxp", "4S"], @@ -1830,6 +1849,9 @@ generate(ThreeRegNEONOp, ["minv", "smin", "8B"], ["minv", "smin", "16B"], ["minv", "smin", "4H"], ["minv", "smin", "8H"], ["minv", "smin", "2S"], ["minv", "smin", "4S"], + ["uminv", "umin", "8B"], ["uminv", "umin", "16B"], + ["uminv", "umin", "4H"], ["uminv", "umin", "8H"], + ["uminv", "umin", "2S"], ["uminv", "umin", "4S"], ["sminp", "sminp", "8B"], ["sminp", "sminp", "16B"], ["sminp", "sminp", "4H"], ["sminp", "sminp", "8H"], ["sminp", "sminp", "2S"], ["sminp", "sminp", "4S"], @@ -2131,6 +2153,10 @@ generate(SVEVectorOp, [["add", "ZZZ"], ["fadd", "ZZZ"], ["fmul", "ZZZ"], ["fsub", "ZZZ"], + ["sqadd", "ZZZ"], + ["sqsub", "ZZZ"], + ["uqadd", "ZZZ"], + ["uqsub", "ZZZ"], ["abs", "ZPZ", "m"], ["add", "ZPZ", "m", "dn"], ["and", "ZPZ", "m", "dn"], @@ -2149,6 +2175,8 @@ generate(SVEVectorOp, [["add", "ZZZ"], ["revb", "ZPZ", "m"], ["smax", "ZPZ", "m", "dn"], ["smin", "ZPZ", "m", "dn"], + ["umax", "ZPZ", "m", "dn"], + ["umin", "ZPZ", "m", "dn"], ["sub", "ZPZ", "m", "dn"], ["fabs", "ZPZ", "m"], ["fadd", "ZPZ", "m", "dn"], @@ -2183,6 +2211,10 @@ generate(SVEVectorOp, [["add", "ZZZ"], ["bext", "ZZZ"], ["bdep", "ZZZ"], ["eor3", "ZZZ"], + ["sqadd", "ZPZ", "m", "dn"], + ["sqsub", "ZPZ", "m", "dn"], + ["uqadd", "ZPZ", "m", "dn"], + ["uqsub", "ZPZ", "m", "dn"], ]) generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index 3bcd40ba2aa..0c2011592b6 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -673,175 +673,215 @@ __ addv(v28, __ T2S, v29, v30); // add v28.2S, v29.2S, v30.2S __ addv(v2, __ T4S, v3, v4); // add v2.4S, v3.4S, v4.4S __ addv(v25, __ T2D, v26, v27); // add v25.2D, 
v26.2D, v27.2D - __ fadd(v5, __ T2S, v6, v7); // fadd v5.2S, v6.2S, v7.2S - __ fadd(v3, __ T4S, v4, v5); // fadd v3.4S, v4.4S, v5.4S - __ fadd(v8, __ T2D, v9, v10); // fadd v8.2D, v9.2D, v10.2D - __ subv(v22, __ T8B, v23, v24); // sub v22.8B, v23.8B, v24.8B - __ subv(v19, __ T16B, v20, v21); // sub v19.16B, v20.16B, v21.16B - __ subv(v13, __ T4H, v14, v15); // sub v13.4H, v14.4H, v15.4H - __ subv(v5, __ T8H, v6, v7); // sub v5.8H, v6.8H, v7.8H - __ subv(v29, __ T2S, v30, v31); // sub v29.2S, v30.2S, v31.2S - __ subv(v24, __ T4S, v25, v26); // sub v24.4S, v25.4S, v26.4S - __ subv(v21, __ T2D, v22, v23); // sub v21.2D, v22.2D, v23.2D - __ fsub(v26, __ T2S, v27, v28); // fsub v26.2S, v27.2S, v28.2S - __ fsub(v24, __ T4S, v25, v26); // fsub v24.4S, v25.4S, v26.4S - __ fsub(v3, __ T2D, v4, v5); // fsub v3.2D, v4.2D, v5.2D - __ mulv(v24, __ T8B, v25, v26); // mul v24.8B, v25.8B, v26.8B - __ mulv(v26, __ T16B, v27, v28); // mul v26.16B, v27.16B, v28.16B - __ mulv(v23, __ T4H, v24, v25); // mul v23.4H, v24.4H, v25.4H - __ mulv(v15, __ T8H, v16, v17); // mul v15.8H, v16.8H, v17.8H - __ mulv(v21, __ T2S, v22, v23); // mul v21.2S, v22.2S, v23.2S - __ mulv(v3, __ T4S, v4, v5); // mul v3.4S, v4.4S, v5.4S - __ fabd(v24, __ T2S, v25, v26); // fabd v24.2S, v25.2S, v26.2S - __ fabd(v8, __ T4S, v9, v10); // fabd v8.4S, v9.4S, v10.4S - __ fabd(v25, __ T2D, v26, v27); // fabd v25.2D, v26.2D, v27.2D - __ faddp(v20, __ T2S, v21, v22); // faddp v20.2S, v21.2S, v22.2S - __ faddp(v16, __ T4S, v17, v18); // faddp v16.4S, v17.4S, v18.4S - __ faddp(v17, __ T2D, v18, v19); // faddp v17.2D, v18.2D, v19.2D - __ fmul(v2, __ T2S, v3, v4); // fmul v2.2S, v3.2S, v4.2S - __ fmul(v1, __ T4S, v2, v3); // fmul v1.4S, v2.4S, v3.4S - __ fmul(v0, __ T2D, v1, v2); // fmul v0.2D, v1.2D, v2.2D - __ mlav(v24, __ T4H, v25, v26); // mla v24.4H, v25.4H, v26.4H + __ sqaddv(v5, __ T8B, v6, v7); // sqadd v5.8B, v6.8B, v7.8B + __ sqaddv(v3, __ T16B, v4, v5); // sqadd v3.16B, v4.16B, v5.16B + __ sqaddv(v8, __ T4H, v9, 
v10); // sqadd v8.4H, v9.4H, v10.4H + __ sqaddv(v22, __ T8H, v23, v24); // sqadd v22.8H, v23.8H, v24.8H + __ sqaddv(v19, __ T2S, v20, v21); // sqadd v19.2S, v20.2S, v21.2S + __ sqaddv(v13, __ T4S, v14, v15); // sqadd v13.4S, v14.4S, v15.4S + __ sqaddv(v5, __ T2D, v6, v7); // sqadd v5.2D, v6.2D, v7.2D + __ uqaddv(v29, __ T8B, v30, v31); // uqadd v29.8B, v30.8B, v31.8B + __ uqaddv(v24, __ T16B, v25, v26); // uqadd v24.16B, v25.16B, v26.16B + __ uqaddv(v21, __ T4H, v22, v23); // uqadd v21.4H, v22.4H, v23.4H + __ uqaddv(v26, __ T8H, v27, v28); // uqadd v26.8H, v27.8H, v28.8H + __ uqaddv(v24, __ T2S, v25, v26); // uqadd v24.2S, v25.2S, v26.2S + __ uqaddv(v3, __ T4S, v4, v5); // uqadd v3.4S, v4.4S, v5.4S + __ uqaddv(v24, __ T2D, v25, v26); // uqadd v24.2D, v25.2D, v26.2D + __ fadd(v26, __ T2S, v27, v28); // fadd v26.2S, v27.2S, v28.2S + __ fadd(v23, __ T4S, v24, v25); // fadd v23.4S, v24.4S, v25.4S + __ fadd(v15, __ T2D, v16, v17); // fadd v15.2D, v16.2D, v17.2D + __ subv(v21, __ T8B, v22, v23); // sub v21.8B, v22.8B, v23.8B + __ subv(v3, __ T16B, v4, v5); // sub v3.16B, v4.16B, v5.16B + __ subv(v24, __ T4H, v25, v26); // sub v24.4H, v25.4H, v26.4H + __ subv(v8, __ T8H, v9, v10); // sub v8.8H, v9.8H, v10.8H + __ subv(v25, __ T2S, v26, v27); // sub v25.2S, v26.2S, v27.2S + __ subv(v20, __ T4S, v21, v22); // sub v20.4S, v21.4S, v22.4S + __ subv(v16, __ T2D, v17, v18); // sub v16.2D, v17.2D, v18.2D + __ sqsubv(v17, __ T8B, v18, v19); // sqsub v17.8B, v18.8B, v19.8B + __ sqsubv(v2, __ T16B, v3, v4); // sqsub v2.16B, v3.16B, v4.16B + __ sqsubv(v1, __ T4H, v2, v3); // sqsub v1.4H, v2.4H, v3.4H + __ sqsubv(v0, __ T8H, v1, v2); // sqsub v0.8H, v1.8H, v2.8H + __ sqsubv(v24, __ T2S, v25, v26); // sqsub v24.2S, v25.2S, v26.2S + __ sqsubv(v4, __ T4S, v5, v6); // sqsub v4.4S, v5.4S, v6.4S + __ sqsubv(v3, __ T2D, v4, v5); // sqsub v3.2D, v4.2D, v5.2D + __ uqsubv(v12, __ T8B, v13, v14); // uqsub v12.8B, v13.8B, v14.8B + __ uqsubv(v31, __ T16B, v0, v1); // uqsub v31.16B, v0.16B, v1.16B 
+ __ uqsubv(v28, __ T4H, v29, v30); // uqsub v28.4H, v29.4H, v30.4H + __ uqsubv(v10, __ T8H, v11, v12); // uqsub v10.8H, v11.8H, v12.8H + __ uqsubv(v26, __ T2S, v27, v28); // uqsub v26.2S, v27.2S, v28.2S + __ uqsubv(v2, __ T4S, v3, v4); // uqsub v2.4S, v3.4S, v4.4S + __ uqsubv(v12, __ T2D, v13, v14); // uqsub v12.2D, v13.2D, v14.2D + __ fsub(v18, __ T2S, v19, v20); // fsub v18.2S, v19.2S, v20.2S + __ fsub(v31, __ T4S, v0, v1); // fsub v31.4S, v0.4S, v1.4S + __ fsub(v1, __ T2D, v2, v3); // fsub v1.2D, v2.2D, v3.2D + __ mulv(v13, __ T8B, v14, v15); // mul v13.8B, v14.8B, v15.8B + __ mulv(v29, __ T16B, v30, v31); // mul v29.16B, v30.16B, v31.16B + __ mulv(v0, __ T4H, v1, v2); // mul v0.4H, v1.4H, v2.4H + __ mulv(v19, __ T8H, v20, v21); // mul v19.8H, v20.8H, v21.8H + __ mulv(v12, __ T2S, v13, v14); // mul v12.2S, v13.2S, v14.2S + __ mulv(v17, __ T4S, v18, v19); // mul v17.4S, v18.4S, v19.4S + __ fabd(v22, __ T2S, v23, v24); // fabd v22.2S, v23.2S, v24.2S + __ fabd(v13, __ T4S, v14, v15); // fabd v13.4S, v14.4S, v15.4S + __ fabd(v28, __ T2D, v29, v30); // fabd v28.2D, v29.2D, v30.2D + __ faddp(v30, __ T2S, v31, v0); // faddp v30.2S, v31.2S, v0.2S + __ faddp(v31, __ T4S, v0, v1); // faddp v31.4S, v0.4S, v1.4S + __ faddp(v1, __ T2D, v2, v3); // faddp v1.2D, v2.2D, v3.2D + __ fmul(v26, __ T2S, v27, v28); // fmul v26.2S, v27.2S, v28.2S + __ fmul(v28, __ T4S, v29, v30); // fmul v28.4S, v29.4S, v30.4S + __ fmul(v4, __ T2D, v5, v6); // fmul v4.2D, v5.2D, v6.2D + __ mlav(v30, __ T4H, v31, v0); // mla v30.4H, v31.4H, v0.4H __ mlav(v4, __ T8H, v5, v6); // mla v4.8H, v5.8H, v6.8H - __ mlav(v3, __ T2S, v4, v5); // mla v3.2S, v4.2S, v5.2S - __ mlav(v12, __ T4S, v13, v14); // mla v12.4S, v13.4S, v14.4S - __ fmla(v31, __ T2S, v0, v1); // fmla v31.2S, v0.2S, v1.2S - __ fmla(v28, __ T4S, v29, v30); // fmla v28.4S, v29.4S, v30.4S - __ fmla(v10, __ T2D, v11, v12); // fmla v10.2D, v11.2D, v12.2D - __ mlsv(v26, __ T4H, v27, v28); // mls v26.4H, v27.4H, v28.4H - __ mlsv(v2, __ T8H, v3, v4); 
// mls v2.8H, v3.8H, v4.8H - __ mlsv(v12, __ T2S, v13, v14); // mls v12.2S, v13.2S, v14.2S - __ mlsv(v18, __ T4S, v19, v20); // mls v18.4S, v19.4S, v20.4S - __ fmls(v31, __ T2S, v0, v1); // fmls v31.2S, v0.2S, v1.2S - __ fmls(v1, __ T4S, v2, v3); // fmls v1.4S, v2.4S, v3.4S - __ fmls(v13, __ T2D, v14, v15); // fmls v13.2D, v14.2D, v15.2D - __ fdiv(v29, __ T2S, v30, v31); // fdiv v29.2S, v30.2S, v31.2S - __ fdiv(v0, __ T4S, v1, v2); // fdiv v0.4S, v1.4S, v2.4S - __ fdiv(v19, __ T2D, v20, v21); // fdiv v19.2D, v20.2D, v21.2D - __ maxv(v12, __ T8B, v13, v14); // smax v12.8B, v13.8B, v14.8B - __ maxv(v17, __ T16B, v18, v19); // smax v17.16B, v18.16B, v19.16B - __ maxv(v22, __ T4H, v23, v24); // smax v22.4H, v23.4H, v24.4H - __ maxv(v13, __ T8H, v14, v15); // smax v13.8H, v14.8H, v15.8H - __ maxv(v28, __ T2S, v29, v30); // smax v28.2S, v29.2S, v30.2S - __ maxv(v30, __ T4S, v31, v0); // smax v30.4S, v31.4S, v0.4S - __ smaxp(v31, __ T8B, v0, v1); // smaxp v31.8B, v0.8B, v1.8B - __ smaxp(v1, __ T16B, v2, v3); // smaxp v1.16B, v2.16B, v3.16B - __ smaxp(v26, __ T4H, v27, v28); // smaxp v26.4H, v27.4H, v28.4H - __ smaxp(v28, __ T8H, v29, v30); // smaxp v28.8H, v29.8H, v30.8H - __ smaxp(v4, __ T2S, v5, v6); // smaxp v4.2S, v5.2S, v6.2S - __ smaxp(v30, __ T4S, v31, v0); // smaxp v30.4S, v31.4S, v0.4S - __ fmax(v4, __ T2S, v5, v6); // fmax v4.2S, v5.2S, v6.2S - __ fmax(v6, __ T4S, v7, v8); // fmax v6.4S, v7.4S, v8.4S - __ fmax(v30, __ T2D, v31, v0); // fmax v30.2D, v31.2D, v0.2D - __ minv(v26, __ T8B, v27, v28); // smin v26.8B, v27.8B, v28.8B - __ minv(v18, __ T16B, v19, v20); // smin v18.16B, v19.16B, v20.16B - __ minv(v9, __ T4H, v10, v11); // smin v9.4H, v10.4H, v11.4H - __ minv(v8, __ T8H, v9, v10); // smin v8.8H, v9.8H, v10.8H - __ minv(v12, __ T2S, v13, v14); // smin v12.2S, v13.2S, v14.2S - __ minv(v0, __ T4S, v1, v2); // smin v0.4S, v1.4S, v2.4S - __ sminp(v20, __ T8B, v21, v22); // sminp v20.8B, v21.8B, v22.8B - __ sminp(v1, __ T16B, v2, v3); // sminp v1.16B, v2.16B, 
v3.16B - __ sminp(v24, __ T4H, v25, v26); // sminp v24.4H, v25.4H, v26.4H - __ sminp(v2, __ T8H, v3, v4); // sminp v2.8H, v3.8H, v4.8H - __ sminp(v0, __ T2S, v1, v2); // sminp v0.2S, v1.2S, v2.2S - __ sminp(v9, __ T4S, v10, v11); // sminp v9.4S, v10.4S, v11.4S - __ sqdmulh(v24, __ T4H, v25, v26); // sqdmulh v24.4H, v25.4H, v26.4H - __ sqdmulh(v26, __ T8H, v27, v28); // sqdmulh v26.8H, v27.8H, v28.8H - __ sqdmulh(v16, __ T2S, v17, v18); // sqdmulh v16.2S, v17.2S, v18.2S - __ sqdmulh(v30, __ T4S, v31, v0); // sqdmulh v30.4S, v31.4S, v0.4S - __ shsubv(v3, __ T8B, v4, v5); // shsub v3.8B, v4.8B, v5.8B - __ shsubv(v10, __ T16B, v11, v12); // shsub v10.16B, v11.16B, v12.16B - __ shsubv(v23, __ T4H, v24, v25); // shsub v23.4H, v24.4H, v25.4H - __ shsubv(v10, __ T8H, v11, v12); // shsub v10.8H, v11.8H, v12.8H - __ shsubv(v4, __ T2S, v5, v6); // shsub v4.2S, v5.2S, v6.2S - __ shsubv(v18, __ T4S, v19, v20); // shsub v18.4S, v19.4S, v20.4S - __ fmin(v2, __ T2S, v3, v4); // fmin v2.2S, v3.2S, v4.2S - __ fmin(v11, __ T4S, v12, v13); // fmin v11.4S, v12.4S, v13.4S - __ fmin(v8, __ T2D, v9, v10); // fmin v8.2D, v9.2D, v10.2D - __ facgt(v10, __ T2S, v11, v12); // facgt v10.2S, v11.2S, v12.2S - __ facgt(v15, __ T4S, v16, v17); // facgt v15.4S, v16.4S, v17.4S + __ mlav(v6, __ T2S, v7, v8); // mla v6.2S, v7.2S, v8.2S + __ mlav(v30, __ T4S, v31, v0); // mla v30.4S, v31.4S, v0.4S + __ fmla(v26, __ T2S, v27, v28); // fmla v26.2S, v27.2S, v28.2S + __ fmla(v18, __ T4S, v19, v20); // fmla v18.4S, v19.4S, v20.4S + __ fmla(v9, __ T2D, v10, v11); // fmla v9.2D, v10.2D, v11.2D + __ mlsv(v8, __ T4H, v9, v10); // mls v8.4H, v9.4H, v10.4H + __ mlsv(v12, __ T8H, v13, v14); // mls v12.8H, v13.8H, v14.8H + __ mlsv(v0, __ T2S, v1, v2); // mls v0.2S, v1.2S, v2.2S + __ mlsv(v20, __ T4S, v21, v22); // mls v20.4S, v21.4S, v22.4S + __ fmls(v1, __ T2S, v2, v3); // fmls v1.2S, v2.2S, v3.2S + __ fmls(v24, __ T4S, v25, v26); // fmls v24.4S, v25.4S, v26.4S + __ fmls(v2, __ T2D, v3, v4); // fmls v2.2D, v3.2D, 
v4.2D + __ fdiv(v0, __ T2S, v1, v2); // fdiv v0.2S, v1.2S, v2.2S + __ fdiv(v9, __ T4S, v10, v11); // fdiv v9.4S, v10.4S, v11.4S + __ fdiv(v24, __ T2D, v25, v26); // fdiv v24.2D, v25.2D, v26.2D + __ maxv(v26, __ T8B, v27, v28); // smax v26.8B, v27.8B, v28.8B + __ maxv(v16, __ T16B, v17, v18); // smax v16.16B, v17.16B, v18.16B + __ maxv(v30, __ T4H, v31, v0); // smax v30.4H, v31.4H, v0.4H + __ maxv(v3, __ T8H, v4, v5); // smax v3.8H, v4.8H, v5.8H + __ maxv(v10, __ T2S, v11, v12); // smax v10.2S, v11.2S, v12.2S + __ maxv(v23, __ T4S, v24, v25); // smax v23.4S, v24.4S, v25.4S + __ umaxv(v10, __ T8B, v11, v12); // umax v10.8B, v11.8B, v12.8B + __ umaxv(v4, __ T16B, v5, v6); // umax v4.16B, v5.16B, v6.16B + __ umaxv(v18, __ T4H, v19, v20); // umax v18.4H, v19.4H, v20.4H + __ umaxv(v2, __ T8H, v3, v4); // umax v2.8H, v3.8H, v4.8H + __ umaxv(v11, __ T2S, v12, v13); // umax v11.2S, v12.2S, v13.2S + __ umaxv(v8, __ T4S, v9, v10); // umax v8.4S, v9.4S, v10.4S + __ smaxp(v10, __ T8B, v11, v12); // smaxp v10.8B, v11.8B, v12.8B + __ smaxp(v15, __ T16B, v16, v17); // smaxp v15.16B, v16.16B, v17.16B + __ smaxp(v17, __ T4H, v18, v19); // smaxp v17.4H, v18.4H, v19.4H + __ smaxp(v2, __ T8H, v3, v4); // smaxp v2.8H, v3.8H, v4.8H + __ smaxp(v10, __ T2S, v11, v12); // smaxp v10.2S, v11.2S, v12.2S + __ smaxp(v12, __ T4S, v13, v14); // smaxp v12.4S, v13.4S, v14.4S + __ fmax(v12, __ T2S, v13, v14); // fmax v12.2S, v13.2S, v14.2S + __ fmax(v15, __ T4S, v16, v17); // fmax v15.4S, v16.4S, v17.4S + __ fmax(v13, __ T2D, v14, v15); // fmax v13.2D, v14.2D, v15.2D + __ minv(v2, __ T8B, v3, v4); // smin v2.8B, v3.8B, v4.8B + __ minv(v7, __ T16B, v8, v9); // smin v7.16B, v8.16B, v9.16B + __ minv(v20, __ T4H, v21, v22); // smin v20.4H, v21.4H, v22.4H + __ minv(v26, __ T8H, v27, v28); // smin v26.8H, v27.8H, v28.8H + __ minv(v16, __ T2S, v17, v18); // smin v16.2S, v17.2S, v18.2S + __ minv(v4, __ T4S, v5, v6); // smin v4.4S, v5.4S, v6.4S + __ uminv(v2, __ T8B, v3, v4); // umin v2.8B, v3.8B, v4.8B + __ 
uminv(v4, __ T16B, v5, v6); // umin v4.16B, v5.16B, v6.16B + __ uminv(v12, __ T4H, v13, v14); // umin v12.4H, v13.4H, v14.4H + __ uminv(v18, __ T8H, v19, v20); // umin v18.8H, v19.8H, v20.8H + __ uminv(v21, __ T2S, v22, v23); // umin v21.2S, v22.2S, v23.2S + __ uminv(v16, __ T4S, v17, v18); // umin v16.4S, v17.4S, v18.4S + __ sminp(v18, __ T8B, v19, v20); // sminp v18.8B, v19.8B, v20.8B + __ sminp(v11, __ T16B, v12, v13); // sminp v11.16B, v12.16B, v13.16B + __ sminp(v21, __ T4H, v22, v23); // sminp v21.4H, v22.4H, v23.4H + __ sminp(v23, __ T8H, v24, v25); // sminp v23.8H, v24.8H, v25.8H + __ sminp(v12, __ T2S, v13, v14); // sminp v12.2S, v13.2S, v14.2S + __ sminp(v26, __ T4S, v27, v28); // sminp v26.4S, v27.4S, v28.4S + __ sqdmulh(v23, __ T4H, v24, v25); // sqdmulh v23.4H, v24.4H, v25.4H + __ sqdmulh(v28, __ T8H, v29, v30); // sqdmulh v28.8H, v29.8H, v30.8H + __ sqdmulh(v14, __ T2S, v15, v16); // sqdmulh v14.2S, v15.2S, v16.2S + __ sqdmulh(v11, __ T4S, v12, v13); // sqdmulh v11.4S, v12.4S, v13.4S + __ shsubv(v24, __ T8B, v25, v26); // shsub v24.8B, v25.8B, v26.8B + __ shsubv(v1, __ T16B, v2, v3); // shsub v1.16B, v2.16B, v3.16B + __ shsubv(v12, __ T4H, v13, v14); // shsub v12.4H, v13.4H, v14.4H + __ shsubv(v31, __ T8H, v0, v1); // shsub v31.8H, v0.8H, v1.8H + __ shsubv(v10, __ T2S, v11, v12); // shsub v10.2S, v11.2S, v12.2S + __ shsubv(v16, __ T4S, v17, v18); // shsub v16.4S, v17.4S, v18.4S + __ fmin(v7, __ T2S, v8, v9); // fmin v7.2S, v8.2S, v9.2S + __ fmin(v2, __ T4S, v3, v4); // fmin v2.4S, v3.4S, v4.4S + __ fmin(v3, __ T2D, v4, v5); // fmin v3.2D, v4.2D, v5.2D + __ facgt(v13, __ T2S, v14, v15); // facgt v13.2S, v14.2S, v15.2S + __ facgt(v19, __ T4S, v20, v21); // facgt v19.4S, v20.4S, v21.4S __ facgt(v17, __ T2D, v18, v19); // facgt v17.2D, v18.2D, v19.2D // VectorScalarNEONInstruction - __ fmlavs(v5, __ T2S, v6, v7, 0); // fmla v5.2S, v6.2S, v7.S[0] - __ mulvs(v6, __ T4S, v7, v8, 1); // mul v6.4S, v7.4S, v8.S[1] - __ fmlavs(v6, __ T2D, v7, v8, 0); // fmla 
v6.2D, v7.2D, v8.D[0] - __ fmlsvs(v3, __ T2S, v4, v5, 0); // fmls v3.2S, v4.2S, v5.S[0] - __ mulvs(v13, __ T4S, v14, v15, 2); // mul v13.4S, v14.4S, v15.S[2] - __ fmlsvs(v2, __ T2D, v3, v4, 1); // fmls v2.2D, v3.2D, v4.D[1] - __ fmulxvs(v2, __ T2S, v3, v4, 0); // fmulx v2.2S, v3.2S, v4.S[0] - __ mulvs(v9, __ T4S, v10, v11, 1); // mul v9.4S, v10.4S, v11.S[1] - __ fmulxvs(v8, __ T2D, v9, v10, 1); // fmulx v8.2D, v9.2D, v10.D[1] - __ mulvs(v5, __ T4H, v6, v7, 2); // mul v5.4H, v6.4H, v7.H[2] - __ mulvs(v11, __ T8H, v12, v13, 5); // mul v11.8H, v12.8H, v13.H[5] - __ mulvs(v13, __ T2S, v14, v15, 0); // mul v13.2S, v14.2S, v15.S[0] + __ fmlavs(v1, __ T2S, v2, v3, 1); // fmla v1.2S, v2.2S, v3.S[1] + __ mulvs(v5, __ T4S, v6, v7, 0); // mul v5.4S, v6.4S, v7.S[0] + __ fmlavs(v2, __ T2D, v3, v4, 1); // fmla v2.2D, v3.2D, v4.D[1] + __ fmlsvs(v7, __ T2S, v8, v9, 0); // fmls v7.2S, v8.2S, v9.S[0] + __ mulvs(v15, __ T4S, v0, v1, 3); // mul v15.4S, v0.4S, v1.S[3] + __ fmlsvs(v10, __ T2D, v11, v12, 0); // fmls v10.2D, v11.2D, v12.D[0] + __ fmulxvs(v10, __ T2S, v11, v12, 0); // fmulx v10.2S, v11.2S, v12.S[0] __ mulvs(v14, __ T4S, v15, v16, 2); // mul v14.4S, v15.4S, v16.S[2] + __ fmulxvs(v13, __ T2D, v14, v15, 1); // fmulx v13.2D, v14.2D, v15.D[1] + __ mulvs(v2, __ T4H, v3, v4, 3); // mul v2.4H, v3.4H, v4.H[3] + __ mulvs(v11, __ T8H, v12, v13, 0); // mul v11.8H, v12.8H, v13.H[0] + __ mulvs(v15, __ T2S, v0, v1, 1); // mul v15.2S, v0.2S, v1.S[1] + __ mulvs(v6, __ T4S, v7, v8, 0); // mul v6.4S, v7.4S, v8.S[0] // NEONVectorCompare - __ cm(Assembler::GT, v14, __ T8B, v15, v16); // cmgt v14.8B, v15.8B, v16.8B - __ cm(Assembler::GT, v11, __ T16B, v12, v13); // cmgt v11.16B, v12.16B, v13.16B - __ cm(Assembler::GT, v24, __ T4H, v25, v26); // cmgt v24.4H, v25.4H, v26.4H - __ cm(Assembler::GT, v1, __ T8H, v2, v3); // cmgt v1.8H, v2.8H, v3.8H - __ cm(Assembler::GT, v12, __ T2S, v13, v14); // cmgt v12.2S, v13.2S, v14.2S + __ cm(Assembler::GT, v9, __ T8B, v10, v11); // cmgt v9.8B, v10.8B, v11.8B 
+ __ cm(Assembler::GT, v28, __ T16B, v29, v30); // cmgt v28.16B, v29.16B, v30.16B + __ cm(Assembler::GT, v15, __ T4H, v16, v17); // cmgt v15.4H, v16.4H, v17.4H + __ cm(Assembler::GT, v29, __ T8H, v30, v31); // cmgt v29.8H, v30.8H, v31.8H + __ cm(Assembler::GT, v22, __ T2S, v23, v24); // cmgt v22.2S, v23.2S, v24.2S __ cm(Assembler::GT, v31, __ T4S, v0, v1); // cmgt v31.4S, v0.4S, v1.4S - __ cm(Assembler::GT, v10, __ T2D, v11, v12); // cmgt v10.2D, v11.2D, v12.2D - __ cm(Assembler::GE, v16, __ T8B, v17, v18); // cmge v16.8B, v17.8B, v18.8B - __ cm(Assembler::GE, v7, __ T16B, v8, v9); // cmge v7.16B, v8.16B, v9.16B - __ cm(Assembler::GE, v2, __ T4H, v3, v4); // cmge v2.4H, v3.4H, v4.4H - __ cm(Assembler::GE, v3, __ T8H, v4, v5); // cmge v3.8H, v4.8H, v5.8H - __ cm(Assembler::GE, v13, __ T2S, v14, v15); // cmge v13.2S, v14.2S, v15.2S - __ cm(Assembler::GE, v19, __ T4S, v20, v21); // cmge v19.4S, v20.4S, v21.4S - __ cm(Assembler::GE, v17, __ T2D, v18, v19); // cmge v17.2D, v18.2D, v19.2D - __ cm(Assembler::EQ, v16, __ T8B, v17, v18); // cmeq v16.8B, v17.8B, v18.8B - __ cm(Assembler::EQ, v3, __ T16B, v4, v5); // cmeq v3.16B, v4.16B, v5.16B - __ cm(Assembler::EQ, v1, __ T4H, v2, v3); // cmeq v1.4H, v2.4H, v3.4H + __ cm(Assembler::GT, v19, __ T2D, v20, v21); // cmgt v19.2D, v20.2D, v21.2D + __ cm(Assembler::GE, v31, __ T8B, v0, v1); // cmge v31.8B, v0.8B, v1.8B + __ cm(Assembler::GE, v5, __ T16B, v6, v7); // cmge v5.16B, v6.16B, v7.16B + __ cm(Assembler::GE, v14, __ T4H, v15, v16); // cmge v14.4H, v15.4H, v16.4H + __ cm(Assembler::GE, v18, __ T8H, v19, v20); // cmge v18.8H, v19.8H, v20.8H + __ cm(Assembler::GE, v31, __ T2S, v0, v1); // cmge v31.2S, v0.2S, v1.2S + __ cm(Assembler::GE, v18, __ T4S, v19, v20); // cmge v18.4S, v19.4S, v20.4S + __ cm(Assembler::GE, v27, __ T2D, v28, v29); // cmge v27.2D, v28.2D, v29.2D + __ cm(Assembler::EQ, v20, __ T8B, v21, v22); // cmeq v20.8B, v21.8B, v22.8B + __ cm(Assembler::EQ, v16, __ T16B, v17, v18); // cmeq v16.16B, v17.16B, v18.16B + 
__ cm(Assembler::EQ, v12, __ T4H, v13, v14); // cmeq v12.4H, v13.4H, v14.4H __ cm(Assembler::EQ, v11, __ T8H, v12, v13); // cmeq v11.8H, v12.8H, v13.8H - __ cm(Assembler::EQ, v30, __ T2S, v31, v0); // cmeq v30.2S, v31.2S, v0.2S - __ cm(Assembler::EQ, v5, __ T4S, v6, v7); // cmeq v5.4S, v6.4S, v7.4S - __ cm(Assembler::EQ, v8, __ T2D, v9, v10); // cmeq v8.2D, v9.2D, v10.2D - __ cm(Assembler::HI, v15, __ T8B, v16, v17); // cmhi v15.8B, v16.8B, v17.8B - __ cm(Assembler::HI, v29, __ T16B, v30, v31); // cmhi v29.16B, v30.16B, v31.16B - __ cm(Assembler::HI, v30, __ T4H, v31, v0); // cmhi v30.4H, v31.4H, v0.4H - __ cm(Assembler::HI, v0, __ T8H, v1, v2); // cmhi v0.8H, v1.8H, v2.8H - __ cm(Assembler::HI, v20, __ T2S, v21, v22); // cmhi v20.2S, v21.2S, v22.2S - __ cm(Assembler::HI, v7, __ T4S, v8, v9); // cmhi v7.4S, v8.4S, v9.4S + __ cm(Assembler::EQ, v9, __ T2S, v10, v11); // cmeq v9.2S, v10.2S, v11.2S + __ cm(Assembler::EQ, v6, __ T4S, v7, v8); // cmeq v6.4S, v7.4S, v8.4S + __ cm(Assembler::EQ, v30, __ T2D, v31, v0); // cmeq v30.2D, v31.2D, v0.2D + __ cm(Assembler::HI, v17, __ T8B, v18, v19); // cmhi v17.8B, v18.8B, v19.8B + __ cm(Assembler::HI, v27, __ T16B, v28, v29); // cmhi v27.16B, v28.16B, v29.16B + __ cm(Assembler::HI, v28, __ T4H, v29, v30); // cmhi v28.4H, v29.4H, v30.4H + __ cm(Assembler::HI, v30, __ T8H, v31, v0); // cmhi v30.8H, v31.8H, v0.8H + __ cm(Assembler::HI, v7, __ T2S, v8, v9); // cmhi v7.2S, v8.2S, v9.2S + __ cm(Assembler::HI, v10, __ T4S, v11, v12); // cmhi v10.4S, v11.4S, v12.4S __ cm(Assembler::HI, v20, __ T2D, v21, v22); // cmhi v20.2D, v21.2D, v22.2D - __ cm(Assembler::HS, v23, __ T8B, v24, v25); // cmhs v23.8B, v24.8B, v25.8B - __ cm(Assembler::HS, v28, __ T16B, v29, v30); // cmhs v28.16B, v29.16B, v30.16B - __ cm(Assembler::HS, v21, __ T4H, v22, v23); // cmhs v21.4H, v22.4H, v23.4H - __ cm(Assembler::HS, v27, __ T8H, v28, v29); // cmhs v27.8H, v28.8H, v29.8H - __ cm(Assembler::HS, v25, __ T2S, v26, v27); // cmhs v25.2S, v26.2S, v27.2S - __ 
cm(Assembler::HS, v5, __ T4S, v6, v7); // cmhs v5.4S, v6.4S, v7.4S - __ cm(Assembler::HS, v1, __ T2D, v2, v3); // cmhs v1.2D, v2.2D, v3.2D - __ fcm(Assembler::EQ, v23, __ T2S, v24, v25); // fcmeq v23.2S, v24.2S, v25.2S - __ fcm(Assembler::EQ, v16, __ T4S, v17, v18); // fcmeq v16.4S, v17.4S, v18.4S - __ fcm(Assembler::EQ, v31, __ T2D, v0, v1); // fcmeq v31.2D, v0.2D, v1.2D - __ fcm(Assembler::GT, v5, __ T2S, v6, v7); // fcmgt v5.2S, v6.2S, v7.2S - __ fcm(Assembler::GT, v12, __ T4S, v13, v14); // fcmgt v12.4S, v13.4S, v14.4S - __ fcm(Assembler::GT, v9, __ T2D, v10, v11); // fcmgt v9.2D, v10.2D, v11.2D - __ fcm(Assembler::GE, v28, __ T2S, v29, v30); // fcmge v28.2S, v29.2S, v30.2S + __ cm(Assembler::HS, v10, __ T8B, v11, v12); // cmhs v10.8B, v11.8B, v12.8B + __ cm(Assembler::HS, v4, __ T16B, v5, v6); // cmhs v4.16B, v5.16B, v6.16B + __ cm(Assembler::HS, v24, __ T4H, v25, v26); // cmhs v24.4H, v25.4H, v26.4H + __ cm(Assembler::HS, v17, __ T8H, v18, v19); // cmhs v17.8H, v18.8H, v19.8H + __ cm(Assembler::HS, v17, __ T2S, v18, v19); // cmhs v17.2S, v18.2S, v19.2S + __ cm(Assembler::HS, v22, __ T4S, v23, v24); // cmhs v22.4S, v23.4S, v24.4S + __ cm(Assembler::HS, v3, __ T2D, v4, v5); // cmhs v3.2D, v4.2D, v5.2D + __ fcm(Assembler::EQ, v29, __ T2S, v30, v31); // fcmeq v29.2S, v30.2S, v31.2S + __ fcm(Assembler::EQ, v15, __ T4S, v16, v17); // fcmeq v15.4S, v16.4S, v17.4S + __ fcm(Assembler::EQ, v22, __ T2D, v23, v24); // fcmeq v22.2D, v23.2D, v24.2D + __ fcm(Assembler::GT, v19, __ T2S, v20, v21); // fcmgt v19.2S, v20.2S, v21.2S + __ fcm(Assembler::GT, v19, __ T4S, v20, v21); // fcmgt v19.4S, v20.4S, v21.4S + __ fcm(Assembler::GT, v22, __ T2D, v23, v24); // fcmgt v22.2D, v23.2D, v24.2D + __ fcm(Assembler::GE, v2, __ T2S, v3, v4); // fcmge v2.2S, v3.2S, v4.2S __ fcm(Assembler::GE, v15, __ T4S, v16, v17); // fcmge v15.4S, v16.4S, v17.4S - __ fcm(Assembler::GE, v29, __ T2D, v30, v31); // fcmge v29.2D, v30.2D, v31.2D + __ fcm(Assembler::GE, v6, __ T2D, v7, v8); // fcmge v6.2D, 
v7.2D, v8.2D // SVEComparisonWithZero - __ sve_fcm(Assembler::EQ, p11, __ D, p7, z31, 0.0); // fcmeq p11.d, p7/z, z31.d, #0.0 - __ sve_fcm(Assembler::GT, p2, __ D, p7, z14, 0.0); // fcmgt p2.d, p7/z, z14.d, #0.0 - __ sve_fcm(Assembler::GE, p9, __ D, p4, z27, 0.0); // fcmge p9.d, p4/z, z27.d, #0.0 - __ sve_fcm(Assembler::LT, p6, __ S, p1, z11, 0.0); // fcmlt p6.s, p1/z, z11.s, #0.0 - __ sve_fcm(Assembler::LE, p15, __ D, p7, z17, 0.0); // fcmle p15.d, p7/z, z17.d, #0.0 - __ sve_fcm(Assembler::NE, p15, __ S, p5, z7, 0.0); // fcmne p15.s, p5/z, z7.s, #0.0 + __ sve_fcm(Assembler::EQ, p6, __ S, p3, z16, 0.0); // fcmeq p6.s, p3/z, z16.s, #0.0 + __ sve_fcm(Assembler::GT, p11, __ D, p4, z1, 0.0); // fcmgt p11.d, p4/z, z1.d, #0.0 + __ sve_fcm(Assembler::GE, p2, __ S, p4, z17, 0.0); // fcmge p2.s, p4/z, z17.s, #0.0 + __ sve_fcm(Assembler::LT, p11, __ S, p5, z13, 0.0); // fcmlt p11.s, p5/z, z13.s, #0.0 + __ sve_fcm(Assembler::LE, p14, __ S, p6, z27, 0.0); // fcmle p14.s, p6/z, z27.s, #0.0 + __ sve_fcm(Assembler::NE, p1, __ D, p6, z1, 0.0); // fcmne p1.d, p6/z, z1.d, #0.0 // SVEComparisonWithImm - __ sve_cmp(Assembler::EQ, p5, __ D, p4, z4, 1); // cmpeq p5.d, p4/z, z4.d, #1 - __ sve_cmp(Assembler::GT, p11, __ D, p3, z3, 6); // cmpgt p11.d, p3/z, z3.d, #6 - __ sve_cmp(Assembler::GE, p9, __ S, p0, z19, -1); // cmpge p9.s, p0/z, z19.s, #-1 - __ sve_cmp(Assembler::LT, p3, __ S, p2, z12, -3); // cmplt p3.s, p2/z, z12.s, #-3 - __ sve_cmp(Assembler::LE, p11, __ D, p4, z1, -11); // cmple p11.d, p4/z, z1.d, #-11 - __ sve_cmp(Assembler::NE, p8, __ S, p5, z2, -3); // cmpne p8.s, p5/z, z2.s, #-3 - __ sve_cmp(Assembler::HS, p5, __ D, p6, z21, 49); // cmphs p5.d, p6/z, z21.d, #49 - __ sve_cmp(Assembler::HI, p13, __ B, p7, z3, 97); // cmphi p13.b, p7/z, z3.b, #97 - __ sve_cmp(Assembler::LS, p9, __ H, p7, z17, 109); // cmpls p9.h, p7/z, z17.h, #109 - __ sve_cmp(Assembler::LO, p7, __ S, p5, z7, 127); // cmplo p7.s, p5/z, z7.s, #127 + __ sve_cmp(Assembler::EQ, p9, __ H, p7, z17, 11); // cmpeq 
p9.h, p7/z, z17.h, #11 + __ sve_cmp(Assembler::GT, p7, __ S, p5, z7, 15); // cmpgt p7.s, p5/z, z7.s, #15 + __ sve_cmp(Assembler::GE, p12, __ D, p6, z2, 2); // cmpge p12.d, p6/z, z2.d, #2 + __ sve_cmp(Assembler::LT, p5, __ S, p0, z23, 2); // cmplt p5.s, p0/z, z23.s, #2 + __ sve_cmp(Assembler::LE, p0, __ D, p5, z25, -14); // cmple p0.d, p5/z, z25.d, #-14 + __ sve_cmp(Assembler::NE, p9, __ B, p7, z12, 14); // cmpne p9.b, p7/z, z12.b, #14 + __ sve_cmp(Assembler::HS, p14, __ D, p1, z16, 37); // cmphs p14.d, p1/z, z16.d, #37 + __ sve_cmp(Assembler::HI, p14, __ B, p1, z18, 29); // cmphi p14.b, p1/z, z18.b, #29 + __ sve_cmp(Assembler::LS, p7, __ S, p2, z9, 10); // cmpls p7.s, p2/z, z9.s, #10 + __ sve_cmp(Assembler::LO, p14, __ D, p1, z21, 111); // cmplo p14.d, p1/z, z21.d, #111 // SpecialCases __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE @@ -1096,229 +1136,239 @@ __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 // LSEOp - __ swp(Assembler::xword, r25, r2, sp); // swp x25, x2, [sp] - __ ldadd(Assembler::xword, r27, r16, r10); // ldadd x27, x16, [x10] - __ ldbic(Assembler::xword, r23, r19, r3); // ldclr x23, x19, [x3] - __ ldeor(Assembler::xword, r16, r0, r25); // ldeor x16, x0, [x25] - __ ldorr(Assembler::xword, r26, r23, r2); // ldset x26, x23, [x2] - __ ldsmin(Assembler::xword, r16, r12, r4); // ldsmin x16, x12, [x4] - __ ldsmax(Assembler::xword, r28, r30, r29); // ldsmax x28, x30, [x29] - __ ldumin(Assembler::xword, r16, r27, r6); // ldumin x16, x27, [x6] - __ ldumax(Assembler::xword, r9, r29, r15); // ldumax x9, x29, [x15] + __ swp(Assembler::xword, r0, r17, r15); // swp x0, x17, [x15] + __ ldadd(Assembler::xword, r4, r26, r8); // ldadd x4, x26, [x8] + __ ldbic(Assembler::xword, r28, r22, r27); // ldclr x28, x22, [x27] + __ ldeor(Assembler::xword, r27, r25, r23); // ldeor x27, x25, [x23] + __ ldorr(Assembler::xword, r0, r4, r6); // ldset x0, x4, [x6] + __ ldsmin(Assembler::xword, r16, r0, r4); // ldsmin x16, x0, [x4] + __ ldsmax(Assembler::xword, r15, r1, 
r10); // ldsmax x15, x1, [x10] + __ ldumin(Assembler::xword, r7, r5, r10); // ldumin x7, x5, [x10] + __ ldumax(Assembler::xword, r28, r7, r20); // ldumax x28, x7, [x20] // LSEOp - __ swpa(Assembler::xword, r7, r4, r7); // swpa x7, x4, [x7] - __ ldadda(Assembler::xword, r15, r9, r23); // ldadda x15, x9, [x23] - __ ldbica(Assembler::xword, r8, r2, r28); // ldclra x8, x2, [x28] - __ ldeora(Assembler::xword, r21, zr, r5); // ldeora x21, xzr, [x5] - __ ldorra(Assembler::xword, r27, r0, r17); // ldseta x27, x0, [x17] - __ ldsmina(Assembler::xword, r15, r4, r26); // ldsmina x15, x4, [x26] - __ ldsmaxa(Assembler::xword, r8, r28, r22); // ldsmaxa x8, x28, [x22] - __ ldumina(Assembler::xword, r27, r27, r25); // ldumina x27, x27, [x25] - __ ldumaxa(Assembler::xword, r23, r0, r4); // ldumaxa x23, x0, [x4] + __ swpa(Assembler::xword, r23, r21, r6); // swpa x23, x21, [x6] + __ ldadda(Assembler::xword, r11, r8, r17); // ldadda x11, x8, [x17] + __ ldbica(Assembler::xword, zr, r6, r17); // ldclra xzr, x6, [x17] + __ ldeora(Assembler::xword, r2, r12, r30); // ldeora x2, x12, [x30] + __ ldorra(Assembler::xword, r29, r3, r27); // ldseta x29, x3, [x27] + __ ldsmina(Assembler::xword, r22, r29, r14); // ldsmina x22, x29, [x14] + __ ldsmaxa(Assembler::xword, r13, r28, r17); // ldsmaxa x13, x28, [x17] + __ ldumina(Assembler::xword, r24, r5, r2); // ldumina x24, x5, [x2] + __ ldumaxa(Assembler::xword, r14, r10, r16); // ldumaxa x14, x10, [x16] // LSEOp - __ swpal(Assembler::xword, r6, r16, r0); // swpal x6, x16, [x0] - __ ldaddal(Assembler::xword, r4, r15, r1); // ldaddal x4, x15, [x1] - __ ldbical(Assembler::xword, r10, r7, r5); // ldclral x10, x7, [x5] - __ ldeoral(Assembler::xword, r10, r28, r7); // ldeoral x10, x28, [x7] - __ ldorral(Assembler::xword, r20, r23, r21); // ldsetal x20, x23, [x21] - __ ldsminal(Assembler::xword, r6, r11, r8); // ldsminal x6, x11, [x8] - __ ldsmaxal(Assembler::xword, r17, zr, r6); // ldsmaxal x17, xzr, [x6] - __ lduminal(Assembler::xword, r17, r2, r12); // 
lduminal x17, x2, [x12] - __ ldumaxal(Assembler::xword, r30, r29, r3); // ldumaxal x30, x29, [x3] + __ swpal(Assembler::xword, r11, r27, r23); // swpal x11, x27, [x23] + __ ldaddal(Assembler::xword, r12, r4, r22); // ldaddal x12, x4, [x22] + __ ldbical(Assembler::xword, r17, r4, r1); // ldclral x17, x4, [x1] + __ ldeoral(Assembler::xword, r19, r16, r15); // ldeoral x19, x16, [x15] + __ ldorral(Assembler::xword, r13, r14, r12); // ldsetal x13, x14, [x12] + __ ldsminal(Assembler::xword, r2, r17, r3); // ldsminal x2, x17, [x3] + __ ldsmaxal(Assembler::xword, r21, r23, r5); // ldsmaxal x21, x23, [x5] + __ lduminal(Assembler::xword, r6, r7, r19); // lduminal x6, x7, [x19] + __ ldumaxal(Assembler::xword, r13, r28, r17); // ldumaxal x13, x28, [x17] // LSEOp - __ swpl(Assembler::xword, r27, r22, r29); // swpl x27, x22, [x29] - __ ldaddl(Assembler::xword, r14, r13, r28); // ldaddl x14, x13, [x28] - __ ldbicl(Assembler::xword, r17, r24, r5); // ldclrl x17, x24, [x5] - __ ldeorl(Assembler::xword, r2, r14, r10); // ldeorl x2, x14, [x10] - __ ldorrl(Assembler::xword, r16, r11, r27); // ldsetl x16, x11, [x27] - __ ldsminl(Assembler::xword, r23, r12, r4); // ldsminl x23, x12, [x4] - __ ldsmaxl(Assembler::xword, r22, r17, r4); // ldsmaxl x22, x17, [x4] - __ lduminl(Assembler::xword, r1, r19, r16); // lduminl x1, x19, [x16] - __ ldumaxl(Assembler::xword, r16, r13, r14); // ldumaxl x16, x13, [x14] + __ swpl(Assembler::xword, r16, r6, r2); // swpl x16, x6, [x2] + __ ldaddl(Assembler::xword, r29, r3, r4); // ldaddl x29, x3, [x4] + __ ldbicl(Assembler::xword, r6, r16, r20); // ldclrl x6, x16, [x20] + __ ldeorl(Assembler::xword, r13, r12, r20); // ldeorl x13, x12, [x20] + __ ldorrl(Assembler::xword, r8, r25, r20); // ldsetl x8, x25, [x20] + __ ldsminl(Assembler::xword, r19, r0, r11); // ldsminl x19, x0, [x11] + __ ldsmaxl(Assembler::xword, r24, r6, r20); // ldsmaxl x24, x6, [x20] + __ lduminl(Assembler::xword, zr, r14, r16); // lduminl xzr, x14, [x16] + __ ldumaxl(Assembler::xword, r6, 
r0, r7); // ldumaxl x6, x0, [x7] // LSEOp - __ swp(Assembler::word, r12, r2, r17); // swp w12, w2, [x17] - __ ldadd(Assembler::word, r3, r21, r23); // ldadd w3, w21, [x23] - __ ldbic(Assembler::word, r5, r6, r7); // ldclr w5, w6, [x7] - __ ldeor(Assembler::word, r19, r13, r28); // ldeor w19, w13, [x28] - __ ldorr(Assembler::word, r17, r16, r6); // ldset w17, w16, [x6] - __ ldsmin(Assembler::word, r2, r29, r3); // ldsmin w2, w29, [x3] - __ ldsmax(Assembler::word, r4, r6, r15); // ldsmax w4, w6, [x15] - __ ldumin(Assembler::word, r20, r13, r12); // ldumin w20, w13, [x12] - __ ldumax(Assembler::word, r20, r8, r25); // ldumax w20, w8, [x25] + __ swp(Assembler::word, r15, r19, r26); // swp w15, w19, [x26] + __ ldadd(Assembler::word, r9, r10, r23); // ldadd w9, w10, [x23] + __ ldbic(Assembler::word, r21, r22, r28); // ldclr w21, w22, [x28] + __ ldeor(Assembler::word, r2, r3, r15); // ldeor w2, w3, [x15] + __ ldorr(Assembler::word, r19, r20, r7); // ldset w19, w20, [x7] + __ ldsmin(Assembler::word, r4, r29, r7); // ldsmin w4, w29, [x7] + __ ldsmax(Assembler::word, r0, r9, r16); // ldsmax w0, w9, [x16] + __ ldumin(Assembler::word, r20, r23, r4); // ldumin w20, w23, [x4] + __ ldumax(Assembler::word, r16, r10, r23); // ldumax w16, w10, [x23] // LSEOp - __ swpa(Assembler::word, r20, r19, r0); // swpa w20, w19, [x0] - __ ldadda(Assembler::word, r11, r24, r6); // ldadda w11, w24, [x6] - __ ldbica(Assembler::word, r20, zr, r14); // ldclra w20, wzr, [x14] - __ ldeora(Assembler::word, r16, r6, r0); // ldeora w16, w6, [x0] - __ ldorra(Assembler::word, r7, r15, r19); // ldseta w7, w15, [x19] - __ ldsmina(Assembler::word, r26, r9, r10); // ldsmina w26, w9, [x10] - __ ldsmaxa(Assembler::word, r23, r21, r22); // ldsmaxa w23, w21, [x22] - __ ldumina(Assembler::word, r28, r2, r3); // ldumina w28, w2, [x3] - __ ldumaxa(Assembler::word, r15, r19, r20); // ldumaxa w15, w19, [x20] + __ swpa(Assembler::word, r11, r25, r6); // swpa w11, w25, [x6] + __ ldadda(Assembler::word, zr, r16, r13); // 
ldadda wzr, w16, [x13] + __ ldbica(Assembler::word, r23, r12, r1); // ldclra w23, w12, [x1] + __ ldeora(Assembler::word, r14, r9, r21); // ldeora w14, w9, [x21] + __ ldorra(Assembler::word, r16, r26, r15); // ldseta w16, w26, [x15] + __ ldsmina(Assembler::word, r4, r4, r15); // ldsmina w4, w4, [x15] + __ ldsmaxa(Assembler::word, r8, r6, r30); // ldsmaxa w8, w6, [x30] + __ ldumina(Assembler::word, r4, r29, r17); // ldumina w4, w29, [x17] + __ ldumaxa(Assembler::word, r29, r26, r9); // ldumaxa w29, w26, [x9] // LSEOp - __ swpal(Assembler::word, r7, r4, r29); // swpal w7, w4, [x29] - __ ldaddal(Assembler::word, r7, r0, r9); // ldaddal w7, w0, [x9] - __ ldbical(Assembler::word, r16, r20, r23); // ldclral w16, w20, [x23] - __ ldeoral(Assembler::word, r4, r16, r10); // ldeoral w4, w16, [x10] - __ ldorral(Assembler::word, r23, r11, r25); // ldsetal w23, w11, [x25] - __ ldsminal(Assembler::word, r6, zr, r16); // ldsminal w6, wzr, [x16] - __ ldsmaxal(Assembler::word, r13, r23, r12); // ldsmaxal w13, w23, [x12] - __ lduminal(Assembler::word, r1, r14, r9); // lduminal w1, w14, [x9] - __ ldumaxal(Assembler::word, r21, r16, r26); // ldumaxal w21, w16, [x26] + __ swpal(Assembler::word, r15, r2, r11); // swpal w15, w2, [x11] + __ ldaddal(Assembler::word, r29, r3, r7); // ldaddal w29, w3, [x7] + __ ldbical(Assembler::word, r1, r27, r21); // ldclral w1, w27, [x21] + __ ldeoral(Assembler::word, r16, r14, r8); // ldeoral w16, w14, [x8] + __ ldorral(Assembler::word, r16, r22, r25); // ldsetal w16, w22, [x25] + __ ldsminal(Assembler::word, r5, r20, r21); // ldsminal w5, w20, [x21] + __ ldsmaxal(Assembler::word, r16, r23, r16); // ldsmaxal w16, w23, [x16] + __ lduminal(Assembler::word, r30, r20, r20); // lduminal w30, w20, [x20] + __ ldumaxal(Assembler::word, r0, r4, r19); // ldumaxal w0, w4, [x19] // LSEOp - __ swpl(Assembler::word, r15, r4, r4); // swpl w15, w4, [x4] - __ ldaddl(Assembler::word, r16, r8, r6); // ldaddl w16, w8, [x6] - __ ldbicl(Assembler::word, r30, r4, r29); // 
ldclrl w30, w4, [x29] - __ ldeorl(Assembler::word, r17, r29, r26); // ldeorl w17, w29, [x26] - __ ldorrl(Assembler::word, r9, r15, r2); // ldsetl w9, w15, [x2] - __ ldsminl(Assembler::word, r11, r29, r3); // ldsminl w11, w29, [x3] - __ ldsmaxl(Assembler::word, r7, r1, r27); // ldsmaxl w7, w1, [x27] - __ lduminl(Assembler::word, r21, r16, r14); // lduminl w21, w16, [x14] - __ ldumaxl(Assembler::word, r8, r16, r22); // ldumaxl w8, w16, [x22] + __ swpl(Assembler::word, r24, r4, r20); // swpl w24, w4, [x20] + __ ldaddl(Assembler::word, r4, r24, r26); // ldaddl w4, w24, [x26] + __ ldbicl(Assembler::word, r19, r2, r8); // ldclrl w19, w2, [x8] + __ ldeorl(Assembler::word, r8, r14, r24); // ldeorl w8, w14, [x24] + __ ldorrl(Assembler::word, r16, zr, r22); // ldsetl w16, wzr, [x22] + __ ldsminl(Assembler::word, r4, zr, r1); // ldsminl w4, wzr, [x1] + __ ldsmaxl(Assembler::word, r10, r20, r12); // ldsmaxl w10, w20, [x12] + __ lduminl(Assembler::word, r0, r9, r7); // lduminl w0, w9, [x7] + __ ldumaxl(Assembler::word, r24, r16, r4); // ldumaxl w24, w16, [x4] // SHA3SIMDOp - __ bcax(v25, __ T16B, v5, v20, v21); // bcax v25.16B, v5.16B, v20.16B, v21.16B - __ eor3(v18, __ T16B, v23, v16, v30); // eor3 v18.16B, v23.16B, v16.16B, v30.16B - __ rax1(v20, __ T2D, v20, v0); // rax1 v20.2D, v20.2D, v0.2D - __ xar(v4, __ T2D, v19, v24, 9); // xar v4.2D, v19.2D, v24.2D, #9 + __ bcax(v27, __ T16B, v6, v10, v27); // bcax v27.16B, v6.16B, v10.16B, v27.16B + __ eor3(v24, __ T16B, v13, v16, v31); // eor3 v24.16B, v13.16B, v16.16B, v31.16B + __ rax1(v22, __ T2D, v22, v20); // rax1 v22.2D, v22.2D, v20.2D + __ xar(v31, __ T2D, v29, v9, 28); // xar v31.2D, v29.2D, v9.2D, #28 // SHA512SIMDOp - __ sha512h(v20, __ T2D, v4, v24); // sha512h q20, q4, v24.2D - __ sha512h2(v26, __ T2D, v19, v2); // sha512h2 q26, q19, v2.2D - __ sha512su0(v8, __ T2D, v8); // sha512su0 v8.2D, v8.2D - __ sha512su1(v14, __ T2D, v24, v18); // sha512su1 v14.2D, v24.2D, v18.2D + __ sha512h(v20, __ T2D, v7, v20); // sha512h q20, 
q7, v20.2D + __ sha512h2(v28, __ T2D, v9, v11); // sha512h2 q28, q9, v11.2D + __ sha512su0(v14, __ T2D, v12); // sha512su0 v14.2D, v12.2D + __ sha512su1(v20, __ T2D, v1, v24); // sha512su1 v20.2D, v1.2D, v24.2D // SVEBinaryImmOp - __ sve_add(z31, __ S, 36u); // add z31.s, z31.s, #0x24 - __ sve_sub(z31, __ B, 85u); // sub z31.b, z31.b, #0x55 - __ sve_and(z20, __ H, 4032u); // and z20.h, z20.h, #0xfc0 - __ sve_eor(z7, __ D, 274877904896u); // eor z7.d, z7.d, #0x3ffffff800 - __ sve_orr(z27, __ B, 243u); // orr z27.b, z27.b, #0xf3 + __ sve_add(z9, __ S, 108u); // add z9.s, z9.s, #0x6c + __ sve_sub(z19, __ S, 132u); // sub z19.s, z19.s, #0x84 + __ sve_and(z5, __ B, 124u); // and z5.b, z5.b, #0x7c + __ sve_eor(z8, __ H, 32768u); // eor z8.h, z8.h, #0x8000 + __ sve_orr(z4, __ H, 508u); // orr z4.h, z4.h, #0x1fc // SVEBinaryImmOp - __ sve_add(z24, __ H, 132u); // add z24.h, z24.h, #0x84 - __ sve_sub(z31, __ S, 183u); // sub z31.s, z31.s, #0xb7 - __ sve_and(z20, __ D, 4503599627354112u); // and z20.d, z20.d, #0xfffffffffc000 - __ sve_eor(z14, __ S, 4042322160u); // eor z14.s, z14.s, #0xf0f0f0f0 - __ sve_orr(z28, __ H, 32256u); // orr z28.h, z28.h, #0x7e00 + __ sve_add(z0, __ H, 181u); // add z0.h, z0.h, #0xb5 + __ sve_sub(z27, __ B, 7u); // sub z27.b, z27.b, #0x7 + __ sve_and(z9, __ S, 130023424u); // and z9.s, z9.s, #0x7c00000 + __ sve_eor(z24, __ B, 62u); // eor z24.b, z24.b, #0x3e + __ sve_orr(z24, __ D, 18428729675200069887u); // orr z24.d, z24.d, #0xffc00000000000ff // SVEBinaryImmOp - __ sve_add(z12, __ S, 13u); // add z12.s, z12.s, #0xd - __ sve_sub(z24, __ H, 159u); // sub z24.h, z24.h, #0x9f - __ sve_and(z13, __ S, 2151677951u); // and z13.s, z13.s, #0x803fffff - __ sve_eor(z5, __ B, 124u); // eor z5.b, z5.b, #0x7c - __ sve_orr(z8, __ H, 32768u); // orr z8.h, z8.h, #0x8000 + __ sve_add(z11, __ D, 104u); // add z11.d, z11.d, #0x68 + __ sve_sub(z18, __ D, 142u); // sub z18.d, z18.d, #0x8e + __ sve_and(z14, __ B, 131u); // and z14.b, z14.b, #0x83 + __ sve_eor(z22, __ 
S, 4042322160u); // eor z22.s, z22.s, #0xf0f0f0f0 + __ sve_orr(z3, __ B, 225u); // orr z3.b, z3.b, #0xe1 // SVEBinaryImmOp - __ sve_add(z4, __ H, 243u); // add z4.h, z4.h, #0xf3 - __ sve_sub(z5, __ B, 86u); // sub z5.b, z5.b, #0x56 - __ sve_and(z22, __ D, 8064u); // and z22.d, z22.d, #0x1f80 - __ sve_eor(z9, __ S, 130023424u); // eor z9.s, z9.s, #0x7c00000 - __ sve_orr(z24, __ B, 62u); // orr z24.b, z24.b, #0x3e + __ sve_add(z9, __ S, 142u); // add z9.s, z9.s, #0x8e + __ sve_sub(z21, __ B, 36u); // sub z21.b, z21.b, #0x24 + __ sve_and(z2, __ D, 8796093020160u); // and z2.d, z2.d, #0x7fffffff800 + __ sve_eor(z11, __ S, 3221229567u); // eor z11.s, z11.s, #0xc0000fff + __ sve_orr(z30, __ H, 126u); // orr z30.h, z30.h, #0x7e // SVEBinaryImmOp - __ sve_add(z24, __ D, 113u); // add z24.d, z24.d, #0x71 - __ sve_sub(z21, __ H, 217u); // sub z21.h, z21.h, #0xd9 - __ sve_and(z13, __ S, 3221229567u); // and z13.s, z13.s, #0xc0000fff - __ sve_eor(z14, __ B, 131u); // eor z14.b, z14.b, #0x83 - __ sve_orr(z22, __ S, 4042322160u); // orr z22.s, z22.s, #0xf0f0f0f0 + __ sve_add(z23, __ H, 29u); // add z23.h, z23.h, #0x1d + __ sve_sub(z24, __ D, 26u); // sub z24.d, z24.d, #0x1a + __ sve_and(z19, __ S, 4294049777u); // and z19.s, z19.s, #0xfff1fff1 + __ sve_eor(z21, __ H, 1008u); // eor z21.h, z21.h, #0x3f0 + __ sve_orr(z26, __ B, 131u); // orr z26.b, z26.b, #0x83 // SVEBinaryImmOp - __ sve_add(z3, __ B, 215u); // add z3.b, z3.b, #0xd7 - __ sve_sub(z19, __ H, 134u); // sub z19.h, z19.h, #0x86 - __ sve_and(z17, __ S, 491520u); // and z17.s, z17.s, #0x78000 - __ sve_eor(z2, __ D, 8796093020160u); // eor z2.d, z2.d, #0x7fffffff800 - __ sve_orr(z11, __ S, 3221229567u); // orr z11.s, z11.s, #0xc0000fff + __ sve_add(z17, __ B, 31u); // add z17.b, z17.b, #0x1f + __ sve_sub(z9, __ S, 97u); // sub z9.s, z9.s, #0x61 + __ sve_and(z8, __ H, 49155u); // and z8.h, z8.h, #0xc003 + __ sve_eor(z17, __ H, 57855u); // eor z17.h, z17.h, #0xe1ff + __ sve_orr(z18, __ D, 2251799811588096u); // orr z18.d, 
z18.d, #0x7ffffffe00000 // SVEVectorOp - __ sve_add(z30, __ B, z12, z3); // add z30.b, z12.b, z3.b - __ sve_sub(z23, __ D, z9, z3); // sub z23.d, z9.d, z3.d - __ sve_fadd(z28, __ D, z3, z19); // fadd z28.d, z3.d, z19.d - __ sve_fmul(z7, __ S, z26, z21); // fmul z7.s, z26.s, z21.s - __ sve_fsub(z5, __ S, z8, z26); // fsub z5.s, z8.s, z26.s - __ sve_abs(z22, __ B, p4, z17); // abs z22.b, p4/m, z17.b - __ sve_add(z3, __ H, p2, z20); // add z3.h, p2/m, z3.h, z20.h - __ sve_and(z8, __ S, p3, z14); // and z8.s, p3/m, z8.s, z14.s - __ sve_asr(z17, __ D, p2, z13); // asr z17.d, p2/m, z17.d, z13.d - __ sve_bic(z18, __ H, p7, z19); // bic z18.h, p7/m, z18.h, z19.h - __ sve_clz(z16, __ S, p3, z27); // clz z16.s, p3/m, z27.s - __ sve_cnt(z28, __ H, p5, z8); // cnt z28.h, p5/m, z8.h - __ sve_eor(z5, __ H, p7, z28); // eor z5.h, p7/m, z5.h, z28.h - __ sve_lsl(z0, __ S, p3, z25); // lsl z0.s, p3/m, z0.s, z25.s - __ sve_lsr(z21, __ S, p0, z3); // lsr z21.s, p0/m, z21.s, z3.s - __ sve_mul(z26, __ D, p1, z26); // mul z26.d, p1/m, z26.d, z26.d - __ sve_neg(z19, __ H, p4, z1); // neg z19.h, p4/m, z1.h - __ sve_not(z14, __ B, p7, z14); // not z14.b, p7/m, z14.b - __ sve_orr(z18, __ S, p0, z31); // orr z18.s, p0/m, z18.s, z31.s - __ sve_rbit(z23, __ H, p5, z30); // rbit z23.h, p5/m, z30.h - __ sve_revb(z8, __ S, p0, z0); // revb z8.s, p0/m, z0.s - __ sve_smax(z23, __ S, p5, z0); // smax z23.s, p5/m, z23.s, z0.s - __ sve_smin(z26, __ H, p6, z24); // smin z26.h, p6/m, z26.h, z24.h - __ sve_sub(z22, __ B, p5, z2); // sub z22.b, p5/m, z22.b, z2.b - __ sve_fabs(z11, __ D, p5, z12); // fabs z11.d, p5/m, z12.d - __ sve_fadd(z24, __ D, p6, z9); // fadd z24.d, p6/m, z24.d, z9.d - __ sve_fdiv(z17, __ D, p5, z20); // fdiv z17.d, p5/m, z17.d, z20.d - __ sve_fmax(z4, __ D, p5, z13); // fmax z4.d, p5/m, z4.d, z13.d - __ sve_fmin(z22, __ D, p7, z31); // fmin z22.d, p7/m, z22.d, z31.d - __ sve_fmul(z18, __ S, p4, z15); // fmul z18.s, p4/m, z18.s, z15.s - __ sve_fneg(z13, __ S, p7, z20); // fneg z13.s, 
p7/m, z20.s - __ sve_frintm(z1, __ S, p3, z14); // frintm z1.s, p3/m, z14.s - __ sve_frintn(z7, __ D, p2, z12); // frintn z7.d, p2/m, z12.d - __ sve_frintp(z4, __ S, p6, z15); // frintp z4.s, p6/m, z15.s - __ sve_fsqrt(z3, __ D, p7, z1); // fsqrt z3.d, p7/m, z1.d - __ sve_fsub(z5, __ D, p5, z31); // fsub z5.d, p5/m, z5.d, z31.d - __ sve_fmad(z13, __ D, p3, z9, z14); // fmad z13.d, p3/m, z9.d, z14.d - __ sve_fmla(z1, __ S, p3, z28, z3); // fmla z1.s, p3/m, z28.s, z3.s - __ sve_fmls(z26, __ S, p2, z25, z9); // fmls z26.s, p2/m, z25.s, z9.s - __ sve_fmsb(z26, __ D, p2, z14, z1); // fmsb z26.d, p2/m, z14.d, z1.d - __ sve_fnmad(z26, __ D, p1, z29, z20); // fnmad z26.d, p1/m, z29.d, z20.d - __ sve_fnmsb(z6, __ D, p7, z13, z1); // fnmsb z6.d, p7/m, z13.d, z1.d - __ sve_fnmla(z11, __ S, p2, z1, z1); // fnmla z11.s, p2/m, z1.s, z1.s - __ sve_fnmls(z27, __ S, p6, z15, z2); // fnmls z27.s, p6/m, z15.s, z2.s - __ sve_mla(z30, __ B, p4, z25, z2); // mla z30.b, p4/m, z25.b, z2.b - __ sve_mls(z24, __ H, p0, z26, z29); // mls z24.h, p0/m, z26.h, z29.h - __ sve_and(z3, z22, z14); // and z3.d, z22.d, z14.d - __ sve_eor(z28, z17, z4); // eor z28.d, z17.d, z4.d - __ sve_orr(z27, z16, z2); // orr z27.d, z16.d, z2.d - __ sve_bic(z1, z28, z7); // bic z1.d, z28.d, z7.d - __ sve_uzp1(z31, __ H, z28, z16); // uzp1 z31.h, z28.h, z16.h - __ sve_uzp2(z22, __ B, z17, z29); // uzp2 z22.b, z17.b, z29.b - __ sve_fabd(z22, __ D, p1, z12); // fabd z22.d, p1/m, z22.d, z12.d - __ sve_bext(z11, __ H, z9, z11); // bext z11.h, z9.h, z11.h - __ sve_bdep(z0, __ S, z4, z23); // bdep z0.s, z4.s, z23.s - __ sve_eor3(z20, z4, z3); // eor3 z20.d, z20.d, z4.d, z3.d + __ sve_add(z16, __ S, z15, z27); // add z16.s, z15.s, z27.s + __ sve_sub(z28, __ H, z22, z8); // sub z28.h, z22.h, z8.h + __ sve_fadd(z5, __ S, z28, z28); // fadd z5.s, z28.s, z28.s + __ sve_fmul(z0, __ D, z15, z25); // fmul z0.d, z15.d, z25.d + __ sve_fsub(z21, __ D, z0, z3); // fsub z21.d, z0.d, z3.d + __ sve_sqadd(z26, __ D, z5, z26); // sqadd 
z26.d, z5.d, z26.d + __ sve_sqsub(z19, __ H, z17, z1); // sqsub z19.h, z17.h, z1.h + __ sve_uqadd(z14, __ B, z30, z14); // uqadd z14.b, z30.b, z14.b + __ sve_uqsub(z18, __ S, z2, z31); // uqsub z18.s, z2.s, z31.s + __ sve_abs(z23, __ H, p5, z30); // abs z23.h, p5/m, z30.h + __ sve_add(z8, __ H, p0, z0); // add z8.h, p0/m, z8.h, z0.h + __ sve_and(z23, __ S, p5, z0); // and z23.s, p5/m, z23.s, z0.s + __ sve_asr(z26, __ H, p6, z24); // asr z26.h, p6/m, z26.h, z24.h + __ sve_bic(z22, __ B, p5, z2); // bic z22.b, p5/m, z22.b, z2.b + __ sve_clz(z11, __ S, p5, z12); // clz z11.s, p5/m, z12.s + __ sve_cnt(z24, __ D, p6, z9); // cnt z24.d, p6/m, z9.d + __ sve_eor(z17, __ S, p5, z20); // eor z17.s, p5/m, z17.s, z20.s + __ sve_lsl(z4, __ D, p5, z13); // lsl z4.d, p5/m, z4.d, z13.d + __ sve_lsr(z22, __ D, p7, z31); // lsr z22.d, p7/m, z22.d, z31.d + __ sve_mul(z18, __ H, p4, z15); // mul z18.h, p4/m, z18.h, z15.h + __ sve_neg(z13, __ B, p7, z20); // neg z13.b, p7/m, z20.b + __ sve_not(z1, __ B, p3, z14); // not z1.b, p3/m, z14.b + __ sve_orr(z7, __ S, p2, z12); // orr z7.s, p2/m, z7.s, z12.s + __ sve_rbit(z4, __ B, p6, z15); // rbit z4.b, p6/m, z15.b + __ sve_revb(z3, __ S, p7, z1); // revb z3.s, p7/m, z1.s + __ sve_smax(z5, __ D, p5, z31); // smax z5.d, p5/m, z5.d, z31.d + __ sve_smin(z13, __ H, p3, z9); // smin z13.h, p3/m, z13.h, z9.h + __ sve_umax(z30, __ D, p0, z15); // umax z30.d, p0/m, z30.d, z15.d + __ sve_umin(z3, __ H, p0, z26); // umin z3.h, p0/m, z3.h, z26.h + __ sve_sub(z25, __ D, p2, z1); // sub z25.d, p2/m, z25.d, z1.d + __ sve_fabs(z10, __ D, p3, z1); // fabs z10.d, p3/m, z1.d + __ sve_fadd(z26, __ D, p1, z29); // fadd z26.d, p1/m, z26.d, z29.d + __ sve_fdiv(z17, __ S, p1, z28); // fdiv z17.s, p1/m, z17.s, z28.s + __ sve_fmax(z1, __ S, p7, z11); // fmax z1.s, p7/m, z1.s, z11.s + __ sve_fmin(z1, __ D, p0, z1); // fmin z1.d, p0/m, z1.d, z1.d + __ sve_fmul(z27, __ S, p3, z2); // fmul z27.s, p3/m, z27.s, z2.s + __ sve_fneg(z30, __ S, p4, z25); // fneg z30.s, p4/m, 
z25.s + __ sve_frintm(z2, __ D, p6, z3); // frintm z2.d, p6/m, z3.d + __ sve_frintn(z29, __ D, p3, z3); // frintn z29.d, p3/m, z3.d + __ sve_frintp(z14, __ D, p4, z28); // frintp z14.d, p4/m, z28.d + __ sve_fsqrt(z4, __ D, p2, z27); // fsqrt z4.d, p2/m, z27.d + __ sve_fsub(z2, __ D, p4, z1); // fsub z2.d, p4/m, z2.d, z1.d + __ sve_fmad(z7, __ D, p5, z31, z28); // fmad z7.d, p5/m, z31.d, z28.d + __ sve_fmla(z10, __ S, p5, z17, z29); // fmla z10.s, p5/m, z17.s, z29.s + __ sve_fmls(z22, __ S, p1, z12, z24); // fmls z22.s, p1/m, z12.s, z24.s + __ sve_fmsb(z9, __ S, p2, z11, z0); // fmsb z9.s, p2/m, z11.s, z0.s + __ sve_fnmad(z23, __ S, p5, z20, z4); // fnmad z23.s, p5/m, z20.s, z4.s + __ sve_fnmsb(z15, __ D, p3, z4, z30); // fnmsb z15.d, p3/m, z4.d, z30.d + __ sve_fnmla(z27, __ S, p1, z21, z26); // fnmla z27.s, p1/m, z21.s, z26.s + __ sve_fnmls(z31, __ S, p0, z25, z4); // fnmls z31.s, p0/m, z25.s, z4.s + __ sve_mla(z6, __ D, p0, z21, z7); // mla z6.d, p0/m, z21.d, z7.d + __ sve_mls(z24, __ S, p7, z24, z31); // mls z24.s, p7/m, z24.s, z31.s + __ sve_and(z1, z10, z12); // and z1.d, z10.d, z12.d + __ sve_eor(z13, z8, z25); // eor z13.d, z8.d, z25.d + __ sve_orr(z1, z31, z23); // orr z1.d, z31.d, z23.d + __ sve_bic(z20, z0, z21); // bic z20.d, z0.d, z21.d + __ sve_uzp1(z31, __ S, z29, z27); // uzp1 z31.s, z29.s, z27.s + __ sve_uzp2(z8, __ S, z29, z26); // uzp2 z8.s, z29.s, z26.s + __ sve_fabd(z5, __ D, p1, z18); // fabd z5.d, p1/m, z5.d, z18.d + __ sve_bext(z13, __ H, z26, z21); // bext z13.h, z26.h, z21.h + __ sve_bdep(z0, __ D, z19, z10); // bdep z0.d, z19.d, z10.d + __ sve_eor3(z7, z17, z6); // eor3 z7.d, z7.d, z17.d, z6.d + __ sve_sqadd(z20, __ H, p6, z28); // sqadd z20.h, p6/m, z20.h, z28.h + __ sve_sqsub(z17, __ H, p3, z19); // sqsub z17.h, p3/m, z17.h, z19.h + __ sve_uqadd(z26, __ B, p2, z24); // uqadd z26.b, p2/m, z26.b, z24.b + __ sve_uqsub(z11, __ S, p3, z28); // uqsub z11.s, p3/m, z11.s, z28.s // SVEReductionOp - __ sve_andv(v15, __ D, p1, z30); // andv d15, p1, 
z30.d - __ sve_orv(v27, __ D, p1, z21); // orv d27, p1, z21.d - __ sve_eorv(v10, __ D, p7, z3); // eorv d10, p7, z3.d - __ sve_smaxv(v4, __ B, p2, z6); // smaxv b4, p2, z6.b - __ sve_sminv(v21, __ D, p1, z25); // sminv d21, p1, z25.d - __ sve_fminv(v30, __ D, p6, z31); // fminv d30, p6, z31.d - __ sve_fmaxv(v1, __ D, p2, z12); // fmaxv d1, p2, z12.d - __ sve_fadda(v13, __ D, p2, z25); // fadda d13, p2, d13, z25.d - __ sve_uaddv(v1, __ D, p7, z23); // uaddv d1, p7, z23.d + __ sve_andv(v23, __ S, p5, z28); // andv s23, p5, z28.s + __ sve_orv(v20, __ B, p7, z24); // orv b20, p7, z24.b + __ sve_eorv(v27, __ H, p1, z23); // eorv h27, p1, z23.h + __ sve_smaxv(v12, __ D, p1, z13); // smaxv d12, p1, z13.d + __ sve_sminv(v26, __ B, p5, z20); // sminv b26, p5, z20.b + __ sve_fminv(v2, __ S, p7, z29); // fminv s2, p7, z29.s + __ sve_fmaxv(v29, __ S, p5, z3); // fmaxv s29, p5, z3.s + __ sve_fadda(v5, __ S, p2, z28); // fadda s5, p2, s5, z28.s + __ sve_uaddv(v17, __ H, p3, z14); // uaddv d17, p3, z14.h // AddWideNEONOp - __ saddwv(v20, v21, __ T8H, v22, __ T8B); // saddw v20.8H, v21.8H, v22.8B - __ saddwv2(v0, v1, __ T8H, v2, __ T16B); // saddw2 v0.8H, v1.8H, v2.16B - __ saddwv(v21, v22, __ T4S, v23, __ T4H); // saddw v21.4S, v22.4S, v23.4H - __ saddwv2(v7, v8, __ T4S, v9, __ T8H); // saddw2 v7.4S, v8.4S, v9.8H - __ saddwv(v31, v0, __ T2D, v1, __ T2S); // saddw v31.2D, v0.2D, v1.2S + __ saddwv(v10, v11, __ T8H, v12, __ T8B); // saddw v10.8H, v11.8H, v12.8B + __ saddwv2(v8, v9, __ T8H, v10, __ T16B); // saddw2 v8.8H, v9.8H, v10.16B + __ saddwv(v9, v10, __ T4S, v11, __ T4H); // saddw v9.4S, v10.4S, v11.4H + __ saddwv2(v19, v20, __ T4S, v21, __ T8H); // saddw2 v19.4S, v20.4S, v21.8H + __ saddwv(v0, v1, __ T2D, v2, __ T2S); // saddw v0.2D, v1.2D, v2.2S __ saddwv2(v29, v30, __ T2D, v31, __ T4S); // saddw2 v29.2D, v30.2D, v31.4S - __ uaddwv(v27, v28, __ T8H, v29, __ T8B); // uaddw v27.8H, v28.8H, v29.8B - __ uaddwv2(v22, v23, __ T8H, v24, __ T16B); // uaddw2 v22.8H, v23.8H, v24.16B - 
__ uaddwv(v8, v9, __ T4S, v10, __ T4H); // uaddw v8.4S, v9.4S, v10.4H - __ uaddwv2(v29, v30, __ T4S, v31, __ T8H); // uaddw2 v29.4S, v30.4S, v31.8H - __ uaddwv(v26, v27, __ T2D, v28, __ T2S); // uaddw v26.2D, v27.2D, v28.2S - __ uaddwv2(v20, v21, __ T2D, v22, __ T4S); // uaddw2 v20.2D, v21.2D, v22.4S + __ uaddwv(v16, v17, __ T8H, v18, __ T8B); // uaddw v16.8H, v17.8H, v18.8B + __ uaddwv2(v16, v17, __ T8H, v18, __ T16B); // uaddw2 v16.8H, v17.8H, v18.16B + __ uaddwv(v13, v14, __ T4S, v15, __ T4H); // uaddw v13.4S, v14.4S, v15.4H + __ uaddwv2(v23, v24, __ T4S, v25, __ T8H); // uaddw2 v23.4S, v24.4S, v25.8H + __ uaddwv(v24, v25, __ T2D, v26, __ T2S); // uaddw v24.2D, v25.2D, v26.2S + __ uaddwv2(v23, v24, __ T2D, v25, __ T4S); // uaddw2 v23.2D, v24.2D, v25.4S __ bind(forth); @@ -1337,30 +1387,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x1400044b, 0x94000000, - 0x97ffffd4, 0x94000448, 0x3400000a, 0x34fffa2a, - 0x340088aa, 0x35000008, 0x35fff9c8, 0x35008848, - 0xb400000b, 0xb4fff96b, 0xb40087eb, 0xb500001d, - 0xb5fff91d, 0xb500879d, 0x10000013, 0x10fff8b3, - 0x10008733, 0x90000013, 0x36300016, 0x3637f836, - 0x363086b6, 0x3758000c, 0x375ff7cc, 0x3758864c, + 0x14000000, 0x17ffffd7, 0x1400047d, 0x94000000, + 0x97ffffd4, 0x9400047a, 0x3400000a, 0x34fffa2a, + 0x34008eea, 0x35000008, 0x35fff9c8, 0x35008e88, + 0xb400000b, 0xb4fff96b, 0xb4008e2b, 0xb500001d, + 0xb5fff91d, 0xb5008ddd, 0x10000013, 0x10fff8b3, + 0x10008d73, 0x90000013, 0x36300016, 0x3637f836, + 0x36308cf6, 0x3758000c, 0x375ff7cc, 0x37588c8c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54008420, 0x54000001, 0x54fff541, 0x540083c1, - 0x54000002, 0x54fff4e2, 0x54008362, 0x54000002, - 0x54fff482, 0x54008302, 0x54000003, 0x54fff423, - 
0x540082a3, 0x54000003, 0x54fff3c3, 0x54008243, - 0x54000004, 0x54fff364, 0x540081e4, 0x54000005, - 0x54fff305, 0x54008185, 0x54000006, 0x54fff2a6, - 0x54008126, 0x54000007, 0x54fff247, 0x540080c7, - 0x54000008, 0x54fff1e8, 0x54008068, 0x54000009, - 0x54fff189, 0x54008009, 0x5400000a, 0x54fff12a, - 0x54007faa, 0x5400000b, 0x54fff0cb, 0x54007f4b, - 0x5400000c, 0x54fff06c, 0x54007eec, 0x5400000d, - 0x54fff00d, 0x54007e8d, 0x5400000e, 0x54ffefae, - 0x54007e2e, 0x5400000f, 0x54ffef4f, 0x54007dcf, + 0x54008a60, 0x54000001, 0x54fff541, 0x54008a01, + 0x54000002, 0x54fff4e2, 0x540089a2, 0x54000002, + 0x54fff482, 0x54008942, 0x54000003, 0x54fff423, + 0x540088e3, 0x54000003, 0x54fff3c3, 0x54008883, + 0x54000004, 0x54fff364, 0x54008824, 0x54000005, + 0x54fff305, 0x540087c5, 0x54000006, 0x54fff2a6, + 0x54008766, 0x54000007, 0x54fff247, 0x54008707, + 0x54000008, 0x54fff1e8, 0x540086a8, 0x54000009, + 0x54fff189, 0x54008649, 0x5400000a, 0x54fff12a, + 0x540085ea, 0x5400000b, 0x54fff0cb, 0x5400858b, + 0x5400000c, 0x54fff06c, 0x5400852c, 0x5400000d, + 0x54fff00d, 0x540084cd, 0x5400000e, 0x54ffefae, + 0x5400846e, 0x5400000f, 0x54ffef4f, 0x5400840f, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1463,47 +1513,57 @@ 0x0e351e93, 0x4e381ef6, 0x0eac1d6a, 0x4ea61ca4, 0x2e211c1f, 0x6e371ed5, 0x0e2a8528, 0x4e21841f, 0x0e758693, 0x4e6c856a, 0x0ebe87bc, 0x4ea48462, - 0x4efb8759, 0x0e27d4c5, 0x4e25d483, 0x4e6ad528, - 0x2e3886f6, 0x6e358693, 0x2e6f85cd, 0x6e6784c5, - 0x2ebf87dd, 0x6eba8738, 0x6ef786d5, 0x0ebcd77a, - 0x4ebad738, 0x4ee5d483, 0x0e3a9f38, 0x4e3c9f7a, - 0x0e799f17, 0x4e719e0f, 0x0eb79ed5, 0x4ea59c83, - 0x2ebad738, 0x6eaad528, 0x6efbd759, 0x2e36d6b4, - 0x6e32d630, 0x6e73d651, 0x2e24dc62, 0x6e23dc41, - 0x6e62dc20, 0x0e7a9738, 0x4e6694a4, 0x0ea59483, - 0x4eae95ac, 0x0e21cc1f, 0x4e3ecfbc, 0x4e6ccd6a, - 0x2e7c977a, 0x6e649462, 0x2eae95ac, 0x6eb49672, - 0x0ea1cc1f, 0x4ea3cc41, 0x4eefcdcd, 
0x2e3fffdd, - 0x6e22fc20, 0x6e75fe93, 0x0e2e65ac, 0x4e336651, - 0x0e7866f6, 0x4e6f65cd, 0x0ebe67bc, 0x4ea067fe, - 0x0e21a41f, 0x4e23a441, 0x0e7ca77a, 0x4e7ea7bc, - 0x0ea6a4a4, 0x4ea0a7fe, 0x0e26f4a4, 0x4e28f4e6, - 0x4e60f7fe, 0x0e3c6f7a, 0x4e346e72, 0x0e6b6d49, - 0x4e6a6d28, 0x0eae6dac, 0x4ea26c20, 0x0e36aeb4, - 0x4e23ac41, 0x0e7aaf38, 0x4e64ac62, 0x0ea2ac20, - 0x4eabad49, 0x0e7ab738, 0x4e7cb77a, 0x0eb2b630, - 0x4ea0b7fe, 0x0e252483, 0x4e2c256a, 0x0e792717, - 0x4e6c256a, 0x0ea624a4, 0x4eb42672, 0x0ea4f462, - 0x4eadf58b, 0x4eeaf528, 0x2eaced6a, 0x6eb1ee0f, - 0x6ef3ee51, 0x0f8710c5, 0x4fa880e6, 0x4fc810e6, - 0x0f855083, 0x4f8f89cd, 0x4fc45862, 0x2f849062, - 0x4fab8149, 0x6fca9928, 0x0f6780c5, 0x4f5d898b, - 0x0f8f81cd, 0x4f9089ee, 0x0e3035ee, 0x4e2d358b, - 0x0e7a3738, 0x4e633441, 0x0eae35ac, 0x4ea1341f, - 0x4eec356a, 0x0e323e30, 0x4e293d07, 0x0e643c62, - 0x4e653c83, 0x0eaf3dcd, 0x4eb53e93, 0x4ef33e51, - 0x2e328e30, 0x6e258c83, 0x2e638c41, 0x6e6d8d8b, - 0x2ea08ffe, 0x6ea78cc5, 0x6eea8d28, 0x2e31360f, - 0x6e3f37dd, 0x2e6037fe, 0x6e623420, 0x2eb636b4, - 0x6ea93507, 0x6ef636b4, 0x2e393f17, 0x6e3e3fbc, - 0x2e773ed5, 0x6e7d3f9b, 0x2ebb3f59, 0x6ea73cc5, - 0x6ee33c41, 0x0e39e717, 0x4e32e630, 0x4e61e41f, - 0x2ea7e4c5, 0x6eaee5ac, 0x6eebe549, 0x2e3ee7bc, - 0x6e31e60f, 0x6e7fe7dd, 0x65d23feb, 0x65d03dd2, - 0x65d03369, 0x65912566, 0x65d13e3f, 0x659334ef, - 0x25c19085, 0x25c60c7b, 0x259f0269, 0x259d2983, - 0x25d5303b, 0x259d9458, 0x24ec5aa5, 0x24385c7d, - 0x247b7e39, 0x24bff4e7, 0xba5fd3e3, 0x3a5f03e5, + 0x4efb8759, 0x0e270cc5, 0x4e250c83, 0x0e6a0d28, + 0x4e780ef6, 0x0eb50e93, 0x4eaf0dcd, 0x4ee70cc5, + 0x2e3f0fdd, 0x6e3a0f38, 0x2e770ed5, 0x6e7c0f7a, + 0x2eba0f38, 0x6ea50c83, 0x6efa0f38, 0x0e3cd77a, + 0x4e39d717, 0x4e71d60f, 0x2e3786d5, 0x6e258483, + 0x2e7a8738, 0x6e6a8528, 0x2ebb8759, 0x6eb686b4, + 0x6ef28630, 0x0e332e51, 0x4e242c62, 0x0e632c41, + 0x4e622c20, 0x0eba2f38, 0x4ea62ca4, 0x4ee52c83, + 0x2e2e2dac, 0x6e212c1f, 0x2e7e2fbc, 0x6e6c2d6a, + 0x2ebc2f7a, 0x6ea42c62, 0x6eee2dac, 
0x0eb4d672, + 0x4ea1d41f, 0x4ee3d441, 0x0e2f9dcd, 0x4e3f9fdd, + 0x0e629c20, 0x4e759e93, 0x0eae9dac, 0x4eb39e51, + 0x2eb8d6f6, 0x6eafd5cd, 0x6efed7bc, 0x2e20d7fe, + 0x6e21d41f, 0x6e63d441, 0x2e3cdf7a, 0x6e3edfbc, + 0x6e66dca4, 0x0e6097fe, 0x4e6694a4, 0x0ea894e6, + 0x4ea097fe, 0x0e3ccf7a, 0x4e34ce72, 0x4e6bcd49, + 0x2e6a9528, 0x6e6e95ac, 0x2ea29420, 0x6eb696b4, + 0x0ea3cc41, 0x4ebacf38, 0x4ee4cc62, 0x2e22fc20, + 0x6e2bfd49, 0x6e7aff38, 0x0e3c677a, 0x4e326630, + 0x0e6067fe, 0x4e656483, 0x0eac656a, 0x4eb96717, + 0x2e2c656a, 0x6e2664a4, 0x2e746672, 0x6e646462, + 0x2ead658b, 0x6eaa6528, 0x0e2ca56a, 0x4e31a60f, + 0x0e73a651, 0x4e64a462, 0x0eaca56a, 0x4eaea5ac, + 0x0e2ef5ac, 0x4e31f60f, 0x4e6ff5cd, 0x0e246c62, + 0x4e296d07, 0x0e766eb4, 0x4e7c6f7a, 0x0eb26e30, + 0x4ea66ca4, 0x2e246c62, 0x6e266ca4, 0x2e6e6dac, + 0x6e746e72, 0x2eb76ed5, 0x6eb26e30, 0x0e34ae72, + 0x4e2dad8b, 0x0e77aed5, 0x4e79af17, 0x0eaeadac, + 0x4ebcaf7a, 0x0e79b717, 0x4e7eb7bc, 0x0eb0b5ee, + 0x4eadb58b, 0x0e3a2738, 0x4e232441, 0x0e6e25ac, + 0x4e61241f, 0x0eac256a, 0x4eb22630, 0x0ea9f507, + 0x4ea4f462, 0x4ee5f483, 0x2eafedcd, 0x6eb5ee93, + 0x6ef3ee51, 0x0fa31041, 0x4f8780c5, 0x4fc41862, + 0x0f895107, 0x4fa1880f, 0x4fcc516a, 0x2f8c916a, + 0x4f9089ee, 0x6fcf99cd, 0x0f748062, 0x4f4d818b, + 0x0fa1800f, 0x4f8880e6, 0x0e2b3549, 0x4e3e37bc, + 0x0e71360f, 0x4e7f37dd, 0x0eb836f6, 0x4ea1341f, + 0x4ef53693, 0x0e213c1f, 0x4e273cc5, 0x0e703dee, + 0x4e743e72, 0x0ea13c1f, 0x4eb43e72, 0x4efd3f9b, + 0x2e368eb4, 0x6e328e30, 0x2e6e8dac, 0x6e6d8d8b, + 0x2eab8d49, 0x6ea88ce6, 0x6ee08ffe, 0x2e333651, + 0x6e3d379b, 0x2e7e37bc, 0x6e6037fe, 0x2ea93507, + 0x6eac356a, 0x6ef636b4, 0x2e2c3d6a, 0x6e263ca4, + 0x2e7a3f38, 0x6e733e51, 0x2eb33e51, 0x6eb83ef6, + 0x6ee53c83, 0x0e3fe7dd, 0x4e31e60f, 0x4e78e6f6, + 0x2eb5e693, 0x6eb5e693, 0x6ef8e6f6, 0x2e24e462, + 0x6e31e60f, 0x6e68e4e6, 0x65922e06, 0x65d0303b, + 0x65903222, 0x659135ab, 0x65913b7e, 0x65d33821, + 0x254b9e29, 0x258f14f7, 0x25c2184c, 0x258222e5, + 0x25d23730, 0x250e9d99, 0x24e9460e, 
0x2427465e, + 0x24a2a937, 0x24fbe6ae, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, 0x7a42cbe2, 0x93df03ff, 0xc820ffff, 0x8822fc7f, 0xc8247cbf, 0x88267fff, 0x4e010fe0, 0x5e040420, 0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, @@ -1565,53 +1625,55 @@ 0x1e721000, 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, - 0x1e7e1000, 0x1e7e3000, 0xf83983e2, 0xf83b0150, - 0xf8371073, 0xf8302320, 0xf83a3057, 0xf830508c, - 0xf83c43be, 0xf83070db, 0xf82961fd, 0xf8a780e4, - 0xf8af02e9, 0xf8a81382, 0xf8b520bf, 0xf8bb3220, - 0xf8af5344, 0xf8a842dc, 0xf8bb733b, 0xf8b76080, - 0xf8e68010, 0xf8e4002f, 0xf8ea10a7, 0xf8ea20fc, - 0xf8f432b7, 0xf8e6510b, 0xf8f140df, 0xf8f17182, - 0xf8fe607d, 0xf87b83b6, 0xf86e038d, 0xf87110b8, - 0xf862214e, 0xf870336b, 0xf877508c, 0xf8764091, - 0xf8617213, 0xf87061cd, 0xb82c8222, 0xb82302f5, - 0xb82510e6, 0xb833238d, 0xb83130d0, 0xb822507d, - 0xb82441e6, 0xb834718d, 0xb8346328, 0xb8b48013, - 0xb8ab00d8, 0xb8b411df, 0xb8b02006, 0xb8a7326f, - 0xb8ba5149, 0xb8b742d5, 0xb8bc7062, 0xb8af6293, - 0xb8e783a4, 0xb8e70120, 0xb8f012f4, 0xb8e42150, - 0xb8f7332b, 0xb8e6521f, 0xb8ed4197, 0xb8e1712e, - 0xb8f56350, 0xb86f8084, 0xb87000c8, 0xb87e13a4, - 0xb871235d, 0xb869304f, 0xb86b507d, 0xb8674361, - 0xb87571d0, 0xb86862d0, 0xce3454b9, 0xce107af2, - 0xce608e94, 0xce982664, 0xce788094, 0xce62867a, - 0xcec08108, 0xce728b0e, 0x25a0c49f, 0x2521cabf, - 0x058054b4, 0x0543ab47, 0x050026bb, 0x2560d098, - 0x25a1d6ff, 0x058394b4, 0x0540266e, 0x05003cbc, - 0x25a0c1ac, 0x2561d3f8, 0x05800acd, 0x05403685, - 0x05000c08, 0x2560de64, 0x2521cac5, 0x0583c8b6, - 0x05405089, 0x05003e98, 0x25e0ce38, 0x2561db35, - 0x058011ad, 0x05400e4e, 0x05002676, 0x2520dae3, - 0x2561d0d3, 0x05808871, 0x0543abe2, 0x050011ab, - 0x0423019e, 0x04e30537, 0x65d3007c, 0x65950b47, - 0x659a0505, 0x0416b236, 0x04400a83, 0x049a0dc8, - 0x04d089b1, 0x045b1e72, 0x0499af70, 0x045ab51c, - 0x04591f85, 0x04938f20, 0x04918075, 0x04d0075a, - 0x0457b033, 0x041ebdce, 
0x049803f2, 0x056797d7, - 0x05a48008, 0x04881417, 0x044a1b1a, 0x04011456, - 0x04dcb58b, 0x65c09938, 0x65cd9691, 0x65c695a4, - 0x65c79ff6, 0x658291f2, 0x049dbe8d, 0x6582adc1, - 0x65c0a987, 0x6581b9e4, 0x65cdbc23, 0x65c197e5, - 0x65ee8d2d, 0x65a30f81, 0x65a92b3a, 0x65e1a9da, - 0x65f4c7ba, 0x65e1fda6, 0x65a1482b, 0x65a279fb, - 0x0402533e, 0x045d6358, 0x042e32c3, 0x04a4323c, - 0x0462321b, 0x04e73381, 0x05706b9f, 0x053d6e36, - 0x65c88596, 0x454bb12b, 0x4597b480, 0x04243874, - 0x04da27cf, 0x04d826bb, 0x04d93c6a, 0x040828c4, - 0x04ca2735, 0x65c73bfe, 0x65c62981, 0x65d82b2d, - 0x04c13ee1, 0x0e3612b4, 0x4e221020, 0x0e7712d5, - 0x4e691107, 0x0ea1101f, 0x4ebf13dd, 0x2e3d139b, - 0x6e3812f6, 0x2e6a1128, 0x6e7f13dd, 0x2ebc137a, - 0x6eb612b4, + 0x1e7e1000, 0x1e7e3000, 0xf82081f1, 0xf824011a, + 0xf83c1376, 0xf83b22f9, 0xf82030c4, 0xf8305080, + 0xf82f4141, 0xf8277145, 0xf83c6287, 0xf8b780d5, + 0xf8ab0228, 0xf8bf1226, 0xf8a223cc, 0xf8bd3363, + 0xf8b651dd, 0xf8ad423c, 0xf8b87045, 0xf8ae620a, + 0xf8eb82fb, 0xf8ec02c4, 0xf8f11024, 0xf8f321f0, + 0xf8ed318e, 0xf8e25071, 0xf8f540b7, 0xf8e67267, + 0xf8ed623c, 0xf8708046, 0xf87d0083, 0xf8661290, + 0xf86d228c, 0xf8683299, 0xf8735160, 0xf8784286, + 0xf87f720e, 0xf86660e0, 0xb82f8353, 0xb82902ea, + 0xb8351396, 0xb82221e3, 0xb83330f4, 0xb82450fd, + 0xb8204209, 0xb8347097, 0xb83062ea, 0xb8ab80d9, + 0xb8bf01b0, 0xb8b7102c, 0xb8ae22a9, 0xb8b031fa, + 0xb8a451e4, 0xb8a843c6, 0xb8a4723d, 0xb8bd613a, + 0xb8ef8162, 0xb8fd00e3, 0xb8e112bb, 0xb8f0210e, + 0xb8f03336, 0xb8e552b4, 0xb8f04217, 0xb8fe7294, + 0xb8e06264, 0xb8788284, 0xb8640358, 0xb8731102, + 0xb868230e, 0xb87032df, 0xb864503f, 0xb86a4194, + 0xb86070e9, 0xb8786090, 0xce2a6cdb, 0xce107db8, + 0xce748ed6, 0xce8973bf, 0xce7480f4, 0xce6b853c, + 0xcec0818e, 0xce788834, 0x25a0cd89, 0x25a1d093, + 0x05803685, 0x05400c08, 0x050074c4, 0x2560d6a0, + 0x2521c0fb, 0x05805089, 0x05403e98, 0x05025238, + 0x25e0cd0b, 0x25e1d1d2, 0x05800e4e, 0x05402676, + 0x05001e63, 0x25a0d1c9, 0x2521c495, 0x0583abe2, + 
0x054011ab, 0x05007cbe, 0x2560c3b7, 0x25e1c358, + 0x05806593, 0x054064b5, 0x05000e5a, 0x2520c3f1, + 0x25a1cc29, 0x05801468, 0x05401d71, 0x05035bb2, + 0x04bb01f0, 0x046806dc, 0x659c0385, 0x65d909e0, + 0x65c30415, 0x04fa10ba, 0x04611a33, 0x042e17ce, + 0x04bf1c52, 0x0456b7d7, 0x04400008, 0x049a1417, + 0x04509b1a, 0x041b1456, 0x0499b58b, 0x04dab938, + 0x04991691, 0x04d395a4, 0x04d19ff6, 0x045011f2, + 0x0417be8d, 0x041eadc1, 0x04980987, 0x052799e4, + 0x05a49c23, 0x04c817e5, 0x044a0d2d, 0x04c901fe, + 0x044b0343, 0x04c10839, 0x04dcac2a, 0x65c087ba, + 0x658d8791, 0x65869d61, 0x65c78021, 0x65828c5b, + 0x049db33e, 0x65c2b862, 0x65c0ac7d, 0x65c1b38e, + 0x65cdab64, 0x65c19022, 0x65fc97e7, 0x65bd162a, + 0x65b82596, 0x65a0a969, 0x65a4d697, 0x65feec8f, + 0x65ba46bb, 0x65a4633f, 0x04c742a6, 0x049f7f18, + 0x042c3141, 0x04b9310d, 0x047733e1, 0x04f53014, + 0x05bb6bbf, 0x05ba6fa8, 0x65c88645, 0x4555b34d, + 0x45cab660, 0x043138c7, 0x44589b94, 0x445a8e71, + 0x44198b1a, 0x449b8f8b, 0x049a3797, 0x04183f14, + 0x045926fb, 0x04c825ac, 0x040a369a, 0x65873fa2, + 0x6586347d, 0x65982b85, 0x04412dd1, 0x0e2c116a, + 0x4e2a1128, 0x0e6b1149, 0x4e751293, 0x0ea21020, + 0x4ebf13dd, 0x2e321230, 0x6e321230, 0x2e6f11cd, + 0x6e791317, 0x2eba1338, 0x6eb91317, }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 25204dc280d..b650db8344a 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2016,6 +2016,21 @@ public class IRNode { vectorNode(VECTOR_BLEND_B, "VectorBlend", TYPE_BYTE); } + public static final String VECTOR_BLEND_S = VECTOR_PREFIX + "VECTOR_BLEND_S" + POSTFIX; + static { + vectorNode(VECTOR_BLEND_S, "VectorBlend", TYPE_SHORT); + } + + public static final String VECTOR_BLEND_I = VECTOR_PREFIX + "VECTOR_BLEND_I" + POSTFIX; + static { + vectorNode(VECTOR_BLEND_I, "VectorBlend", TYPE_INT); + } + + 
public static final String VECTOR_BLEND_L = VECTOR_PREFIX + "VECTOR_BLEND_L" + POSTFIX; + static { + vectorNode(VECTOR_BLEND_L, "VectorBlend", TYPE_LONG); + } + public static final String VECTOR_BLEND_F = VECTOR_PREFIX + "VECTOR_BLEND_F" + POSTFIX; static { vectorNode(VECTOR_BLEND_F, "VectorBlend", TYPE_FLOAT); diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index 03847e8f761..c9904404e90 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -108,6 +108,7 @@ public class IREncodingPrinter { "sha3", "asimd", "sve", + "sve2", // Riscv64 "rvv", "zbkb", diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorSaturatedOperationsTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorSaturatedOperationsTest.java index f4ef254a980..84aaacb3a01 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorSaturatedOperationsTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorSaturatedOperationsTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /** * @test -* @bug 8338021 8342677 +* @bug 8338021 8342677 8349522 * @summary Add IR validation tests for newly added saturated vector add / sub operations * @modules jdk.incubator.vector * @library /test/lib / @@ -59,6 +59,8 @@ public class VectorSaturatedOperationsTest { private short[] short_out; private byte[] byte_out; + private boolean[] mask; + public static void main(String[] args) { TestFramework testFramework = new TestFramework(); testFramework.setDefaultWarmup(5000) @@ -137,6 +139,7 @@ public class VectorSaturatedOperationsTest { short_in2 = new short[COUNT]; int_in2 = new int[COUNT]; long_in2 = new long[COUNT]; + mask = new boolean[COUNT]; IntStream.range(0, COUNT-4).forEach( i -> { long_in1[i] = r.nextLong(Long.MIN_VALUE, Long.MAX_VALUE); @@ -147,6 +150,7 @@ public class VectorSaturatedOperationsTest { short_in2[i] = (short)r.nextInt(Short.MIN_VALUE, Short.MAX_VALUE); byte_in1[i] = (byte)r.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE); byte_in2[i] = (byte)r.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE); + mask[i] = r.nextBoolean(); } ); @@ -162,7 +166,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_ADD_VB, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_ADD_VB, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void sadd_byte() { for (int i = 0; i < COUNT; i += bspec.length()) { @@ -185,7 +189,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_ADD_VS, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_ADD_VS, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void sadd_short() { for (int i = 0; i < COUNT; i += sspec.length()) { @@ -208,7 +212,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_ADD_VI, " 
>0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_ADD_VI, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void sadd_int() { for (int i = 0; i < COUNT; i += ispec.length()) { @@ -231,7 +235,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_ADD_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_ADD_VL, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void sadd_long() { for (int i = 0; i < COUNT; i += lspec.length()) { @@ -256,7 +260,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_ADD_VB, " >0 " , "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void suadd_byte() { for (int i = 0; i < COUNT; i += bspec.length()) { @@ -281,7 +285,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_ADD_VS, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void suadd_short() { for (int i = 0; i < COUNT; i += sspec.length()) { @@ -306,7 +310,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_ADD_VI, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void suadd_int() { for (int i = 0; i < COUNT; i += ispec.length()) { @@ -331,7 +335,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_ADD_VL, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = 
{"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void suadd_long() { for (int i = 0; i < COUNT; i += lspec.length()) { @@ -354,7 +358,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_SUB_VB, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_SUB_VB, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void ssub_byte() { for (int i = 0; i < COUNT; i += bspec.length()) { @@ -377,7 +381,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_SUB_VS, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_SUB_VS, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void ssub_short() { for (int i = 0; i < COUNT; i += sspec.length()) { @@ -400,7 +404,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_SUB_VI, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_SUB_VI, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void ssub_int() { for (int i = 0; i < COUNT; i += ispec.length()) { @@ -423,7 +427,7 @@ public class VectorSaturatedOperationsTest { } @Test - @IR(counts = {IRNode.SATURATING_SUB_VL, " >0 "}, applyIfCPUFeature = {"avx", "true"}) + @IR(counts = {IRNode.SATURATING_SUB_VL, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void ssub_long() { for (int i = 0; i < COUNT; i += lspec.length()) { @@ -448,7 +452,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_SUB_VB, " >0 " , "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void susub_byte() { for (int i = 0; 
i < COUNT; i += bspec.length()) { @@ -473,7 +477,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_SUB_VS, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void susub_short() { for (int i = 0; i < COUNT; i += sspec.length()) { @@ -498,7 +502,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_SUB_VI, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void susub_int() { for (int i = 0; i < COUNT; i += ispec.length()) { @@ -523,7 +527,7 @@ public class VectorSaturatedOperationsTest { @Test @IR(counts = {IRNode.SATURATING_SUB_VL, " >0 ", "unsigned_vector_node", " >0 "}, phase = {CompilePhase.BEFORE_MATCHING}, - applyIfCPUFeature = {"avx", "true"}) + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) @Warmup(value = 10000) public void susub_long() { for (int i = 0; i < COUNT; i += lspec.length()) { @@ -544,4 +548,112 @@ public class VectorSaturatedOperationsTest { } } } + + @Test + @IR(counts = {IRNode.SATURATING_ADD_VB, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) + @IR(counts = {IRNode.VECTOR_BLEND_B, " >0 "}, applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}) + @IR(failOn = IRNode.VECTOR_BLEND_B, applyIfCPUFeature = {"sve2", "true"}) + @Warmup(value = 10000) + public void sadd_masked() { + for (int i = 0; i < COUNT; i += bspec.length()) { + VectorMask m = VectorMask.fromArray(bspec, mask, i); + ByteVector.fromArray(bspec, byte_in1, i) + .lanewise(VectorOperators.SADD, + ByteVector.fromArray(bspec, byte_in2, i), m) + .intoArray(byte_out, i); + } + } + + @Check(test = "sadd_masked") + public void sadd_masked_verify() { + for (int i = 0; i < COUNT; i++) 
{ + byte actual = byte_out[i]; + byte expected = mask[i] ? VectorMath.addSaturating(byte_in1[i], byte_in2[i]) : byte_in1[i]; + if (actual != expected) { + throw new AssertionError("Result Mismatch : actual (" + actual + ") != expected (" + expected + ")"); + } + } + } + + @Test + @IR(counts = {IRNode.SATURATING_SUB_VS, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) + @IR(counts = {IRNode.VECTOR_BLEND_S, " >0 "}, applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}) + @IR(failOn = IRNode.VECTOR_BLEND_S, applyIfCPUFeature = {"sve2", "true"}) + @Warmup(value = 10000) + public void ssub_masked() { + for (int i = 0; i < COUNT; i += sspec.length()) { + VectorMask<Short> m = VectorMask.fromArray(sspec, mask, i); + ShortVector.fromArray(sspec, short_in1, i) + .lanewise(VectorOperators.SSUB, + ShortVector.fromArray(sspec, short_in2, i), m) + .intoArray(short_out, i); + } + } + + @Check(test = "ssub_masked") + public void ssub_masked_verify() { + for (int i = 0; i < COUNT; i++) { + short actual = short_out[i]; + short expected = mask[i] ?
VectorMath.subSaturating(short_in1[i], short_in2[i]) : short_in1[i]; + if (actual != expected) { + throw new AssertionError("Result Mismatch : actual (" + actual + ") != expected (" + expected + ")"); + } + } + } + + @Test + @IR(counts = {IRNode.SATURATING_ADD_VI, " >0 ", "unsigned_vector_node", " >0 "}, + phase = {CompilePhase.BEFORE_MATCHING}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) + @IR(counts = {IRNode.VECTOR_BLEND_I, " >0 "}, applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}) + @IR(failOn = IRNode.VECTOR_BLEND_I, applyIfCPUFeature = {"sve2", "true"}) + @Warmup(value = 10000) + public void suadd_masked() { + for (int i = 0; i < COUNT; i += ispec.length()) { + VectorMask<Integer> m = VectorMask.fromArray(ispec, mask, i); + IntVector.fromArray(ispec, int_in1, i) + .lanewise(VectorOperators.SUADD, + IntVector.fromArray(ispec, int_in2, i), m) + .intoArray(int_out, i); + } + } + + @Check(test = "suadd_masked") + public void suadd_masked_verify() { + for (int i = 0; i < COUNT; i++) { + int actual = int_out[i]; + int expected = mask[i] ?
VectorMath.addSaturatingUnsigned(int_in1[i], int_in2[i]) : int_in1[i]; + if (actual != expected) { + throw new AssertionError("Result Mismatch : actual (" + actual + ") != expected (" + expected + ")"); + } + } + } + + @Test + @IR(counts = {IRNode.SATURATING_SUB_VL, " >0 ", "unsigned_vector_node", " >0 "}, + phase = {CompilePhase.BEFORE_MATCHING}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + @IR(counts = {IRNode.VECTOR_BLEND_L, " >0 "}, applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}) + @IR(failOn = IRNode.VECTOR_BLEND_L, applyIfCPUFeature = {"sve2", "true"}) + @Warmup(value = 10000) + public void susub_masked() { + for (int i = 0; i < COUNT; i += lspec.length()) { + VectorMask<Long> m = VectorMask.fromArray(lspec, mask, i); + LongVector.fromArray(lspec, long_in1, i) + .lanewise(VectorOperators.SUSUB, + LongVector.fromArray(lspec, long_in2, i), m) + .intoArray(long_out, i); + } + } + + @Check(test = "susub_masked") + public void susub_masked_verify() { + for (int i = 0; i < COUNT; i++) { + long actual = long_out[i]; + long expected = mask[i] ? VectorMath.subSaturatingUnsigned(long_in1[i], long_in2[i]) : long_in1[i]; + if (actual != expected) { + throw new AssertionError("Result Mismatch : actual (" + actual + ") != expected (" + expected + ")"); + } + } + } }