From 33d9a857308eed53e06b448691910bc8aa2f8fc9 Mon Sep 17 00:00:00 2001 From: Ningsheng Jian Date: Fri, 12 May 2023 02:05:18 +0000 Subject: [PATCH] 8307572: AArch64: Vector registers are clobbered by some macroassemblers Reviewed-by: aph, adinn --- src/hotspot/cpu/aarch64/aarch64.ad | 125 +++++--- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 1 + .../cpu/aarch64/macroAssembler_aarch64.cpp | 16 +- .../cpu/aarch64/macroAssembler_aarch64.hpp | 6 +- .../cpu/aarch64/stubGenerator_aarch64.cpp | 4 + .../c2/aarch64/TestIntrinsicsRegStress.java | 296 ++++++++++++++++++ 6 files changed, 391 insertions(+), 57 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index a7c6ddd792c..b31e5d0df6b 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -17105,14 +17105,17 @@ instruct string_compareUU_sve(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI %} instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, + iRegINoSp tmp3, iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 
$tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -17126,14 +17129,17 @@ instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -17147,14 +17153,17 @@ instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, 
TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, + TEMP tmp6, TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -17168,14 +17177,15 @@ instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17189,14 +17199,15 @@ instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, 
TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17210,14 +17221,15 @@ instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17334,13 +17346,17 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ 
"Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,$ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -17355,13 +17371,17 @@ instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,$ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -17391,36 +17411,39 @@ instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg // fast char[] to byte[] compression instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, - vRegD_V2 tmp3, vRegD_V3 tmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, USE_KILL src, USE_KILL dst, USE len, KILL cr); - 
format %{ "String Compress $src,$dst,$len -> $result // KILL $src,$dst" %} + format %{ "String Compress $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, - $result$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$FloatRegister); + $result$$Register, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe(pipe_slow); %} // fast byte[] to char[] inflation -instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr) +instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, iRegP_R3 tmp, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, rFlagsReg cr) %{ match(Set dummy (StrInflatedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, + TEMP vtmp4, TEMP vtmp5, TEMP vtmp6, TEMP tmp, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + format %{ "String Inflate $src,$dst # KILL $tmp $src $dst $len V0-V6 cr" %} ins_encode %{ address tpc = __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$Register); + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $tmp$$Register); if (tpc == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; @@ -17431,41 +17454,43 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len // encode char[] to byte[] in ISO_8859_1 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, 
iRegI_R3 len, - vRegD_V0 vtmp0, vRegD_V1 vtmp1, - vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ predicate(!((EncodeISOArrayNode*)n)->is_ascii()); match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE len, - KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode ISO array $src,$dst,$len -> $result" %} + format %{ "Encode ISO array $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $result$$Register, false, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, - $vtmp2$$FloatRegister, $vtmp3$$FloatRegister); + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe(pipe_class_memory); %} instruct encode_ascii_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 vtmp0, vRegD_V1 vtmp1, - vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ predicate(((EncodeISOArrayNode*)n)->is_ascii()); match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE len, - KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode ASCII array $src,$dst,$len -> $result" %} + format %{ "Encode ASCII array $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $result$$Register, true, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, - $vtmp2$$FloatRegister, 
$vtmp3$$FloatRegister); + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe(pipe_class_memory); %} diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index e96621ae2d3..dbe64f8f9ca 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -46,6 +46,7 @@ typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // Search for str1 in str2 and return index or -1 +// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. void C2_MacroAssembler::string_indexof(Register str2, Register str1, Register cnt2, Register cnt1, Register tmp1, Register tmp2, diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index ef2fe7cef8c..63745629038 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -5008,6 +5008,8 @@ address MacroAssembler::count_positives(Register ary1, Register len, Register re return pc(); } +// Clobbers: rscratch1, rscratch2, rflags +// May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals) address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, Register tmp4, Register tmp5, Register result, Register cnt1, int elem_size) { @@ -5557,10 +5559,12 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) // Using 'umaxv' in the ASCII-case comes with a small penalty but does // avoid additional bloat. 
// +// Clobbers: src, dst, res, rscratch1, rscratch2, rflags void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, Register res, bool ascii, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3) + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5) { Register cnt = res; Register max = rscratch1; @@ -5579,8 +5583,8 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, br(LT, DONE_32); ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64))); // Extract lower bytes. - FloatRegister vlo0 = v4; - FloatRegister vlo1 = v5; + FloatRegister vlo0 = vtmp4; + FloatRegister vlo1 = vtmp5; uzp1(vlo0, T16B, vtmp0, vtmp1); uzp1(vlo1, T16B, vtmp2, vtmp3); // Merge bits... @@ -5653,6 +5657,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, } // Inflate byte[] array to char[]. +// Clobbers: src, dst, len, rflags, rscratch1, v0-v6 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, Register tmp4) { @@ -5761,8 +5766,9 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register void MacroAssembler::char_array_compress(Register src, Register dst, Register len, Register res, FloatRegister tmp0, FloatRegister tmp1, - FloatRegister tmp2, FloatRegister tmp3) { - encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3); + FloatRegister tmp2, FloatRegister tmp3, + FloatRegister tmp4, FloatRegister tmp5) { + encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); // Adjust result: res == len ? 
len : 0 cmp(len, res); csel(res, res, zr, EQ); diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 7e58720727c..6b45be8ce43 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1393,12 +1393,14 @@ public: void char_array_compress(Register src, Register dst, Register len, Register res, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3); + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5); void encode_iso_array(Register src, Register dst, Register len, Register res, bool ascii, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3); + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5); void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5, diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 389bb0d7d0e..ed360289298 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -5151,6 +5151,7 @@ class StubGenerator: public StubCodeGenerator { // result = r0 - return value. 
Already contains "false" // cnt1 = r10 - amount of elements left to check, reduced by wordSize // r3-r5 are reserved temporary registers + // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 address generate_large_array_equals() { Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, @@ -5734,6 +5735,8 @@ class StubGenerator: public StubCodeGenerator { // R2 = cnt1 // R3 = str1 // R4 = cnt2 + // Clobbers: rscratch1, rscratch2, v0, v1, rflags + // // This generic linear code use few additional ideas, which makes it faster: // 1) we can safely keep at least 1st register of pattern(since length >= 8) // in order to skip initial loading(help in systems with 1 ld pipeline) @@ -6048,6 +6051,7 @@ class StubGenerator: public StubCodeGenerator { // R3 = len >> 3 // V0 = 0 // v1 = loaded 8 bytes + // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 address generate_large_byte_array_inflate() { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java new file mode 100644 index 00000000000..960661b975a --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8307572 + * @summary Verify vector register clobbering in some aarch64 intrinsics + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @run main/othervm -Xbatch -XX:CompileThreshold=100 -XX:-TieredCompilation compiler.c2.aarch64.TestIntrinsicsRegStress + */ + +package compiler.c2.aarch64; + +import java.util.Arrays; + +public class TestIntrinsicsRegStress { + + final int LENGTH = 1024; + final int ITER = 10000; + final int NUM = 32; + + byte[] ba; + char[] ca; + char[] cb; + float[] fv; + + String str; + String[] strings; + String needle = "01234567890123456789"; + + public void init() { + ca = new char[LENGTH]; + fv = new float[NUM]; + strings = new String[NUM]; + for (int i = 0; i < LENGTH; i++) { + ca[i] = (char) ('a' + i % NUM); + } + cb = ca.clone(); + str = new String(ca); + for (int i = 0; i < NUM; i++) { + fv[i] = 1; + } + for (int i = 0; i < NUM; i++) { + strings[i] = str.substring(i) + needle; + } + } + + public void checkIndexOf(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * 
fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + int result = strings[iter % NUM].indexOf(needle); + + if (result > LENGTH - NUM / 2) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testIndexOf() { + for (int i = 0; i < ITER; i++) { + checkIndexOf(i); + } + } + + public void checkArraysEquals() { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + if (Arrays.equals(ca, cb)) { + // Use fp registers as many as possible and 
try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testArraysEquals() { + for (int i = 0; i < ITER; i++) { + checkArraysEquals(); + } + } + + public void checkCompress(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + ba = Helper.compressChar(ca, 0, LENGTH, 0, LENGTH); + + if (ba[iter % LENGTH] > (byte) ('a' + 5)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testCompress() { + for (int i = 0; i < ITER; i++) { + checkCompress(i); + } + } + + public void checkInflate(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + str.getChars(0, LENGTH, ca, 0); + + if (ca[iter % LENGTH] > (byte) ('a' + NUM / 2)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testInflate() { + for (int i = 0; i < ITER; i++) { + checkInflate(i); + } + } + + public void verifyAndReset() { + if (fv[31] != 1.0) { + throw new RuntimeException("Failed with " + Float.toString(fv[31])); + } else { + System.out.println("Success!"); + } + fv[31] = 1.0f; + } + + public static void main(String[] args) { + TestIntrinsicsRegStress t = new TestIntrinsicsRegStress(); + t.init(); + + t.testIndexOf(); + t.verifyAndReset(); + + t.testArraysEquals(); + t.verifyAndReset(); + + t.testCompress(); + t.verifyAndReset(); + + t.testInflate(); + t.verifyAndReset(); + } +}