From 82967f45db3b9555be03fcabdba380852ea21e2c Mon Sep 17 00:00:00 2001
From: steveatgh
Date: Mon, 27 Nov 2023 17:35:39 +0000
Subject: [PATCH] 8310159: Bulk copy with Unsafe::arrayCopy is slower compared to memcpy

Co-authored-by: Maurizio Cimadamore
Reviewed-by: thartmann, jbhateja, sviswanathan
---
 src/hotspot/cpu/x86/assembler_x86.cpp        |  21 +++
 src/hotspot/cpu/x86/assembler_x86.hpp        |   3 +
 src/hotspot/cpu/x86/stubGenerator_x86_64.hpp |  12 ++
 .../x86/stubGenerator_x86_64_arraycopy.cpp   | 153 ++++++++++++++++++
 .../java/lang/ArrayCopyAlignedLarge.java     |  70 ++++++++
 5 files changed, 259 insertions(+)
 create mode 100644 test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index 0fc98a6ac10..cedddaed975 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -3417,6 +3417,27 @@ void Assembler::evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge
   emit_operand(dst, src, 0);
 }
 
+void Assembler::evmovntdquq(Address dst, XMMRegister src, int vector_len) {
+  // Unmasked instruction
+  evmovntdquq(dst, k0, src, /*merge*/ true, vector_len);
+}
+
+void Assembler::evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0xE7);
+  emit_operand(src, dst, 0);
+}
+
 void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
   // Unmasked instruction
   evmovdquq(dst, k0, src, /*merge*/ true, vector_len);
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index c6083d844aa..3bcfee90b35 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1615,6 +1615,9 @@ private:
   void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
   void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
 
+  void evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
+  void evmovntdquq(Address dst, XMMRegister src, int vector_len);
+
   void evmovdquq(Address dst, XMMRegister src, int vector_len);
   void evmovdquq(XMMRegister dst, Address src, int vector_len);
   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
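A note on the instruction being wired up above: opcode 0xE7 in the 66-prefixed 0F map is (V)MOVNTDQ, the non-temporal vector store, so the new evmovntdquq() entry points emit a store that bypasses the cache hierarchy (64 bytes at AVX_512bit). For experimenting outside the JVM, a minimal compiler-intrinsics sketch (not part of this patch; assumes AVX-512F and a 64-byte-aligned destination) that generates the same 512-bit streaming store:

    #include <immintrin.h>

    // Compiles to VMOVNTDQ (EVEX.512.66.0F 0xE7), the instruction the new
    // Assembler::evmovntdquq() encodes; dst must be 64-byte aligned.
    void store64_nt(void* dst, __m512i v) {
      _mm512_stream_si512(static_cast<__m512i*>(dst), v);
    }

Streaming stores are weakly ordered, which is why every code path that uses them has to end with an SFENCE before the copied data is published; the stub changes below do exactly that after their main loop.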
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 109c98f83bd..6b7da718498 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -187,11 +187,20 @@ class StubGenerator: public StubCodeGenerator {
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);
 
+  void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
+                                        Register to, Register count, int shift,
+                                        Register index, Register temp, Label& L_exit);
+
   void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                              Register to, Register start_index, Register end_index,
                                              Register count, int shift, Register temp,
                                              bool use64byteVector, Label& L_entry, Label& L_exit);
 
+  void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
+                            Register temp3, Register temp4, Register count,
+                            XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                            XMMRegister xmm4, int shift);
+
   void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                   int shift = Address::times_1, int offset = 0);
 
@@ -199,6 +208,9 @@ class StubGenerator: public StubCodeGenerator {
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);
 
+  void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
+                    XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);
+
   void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                          KRegister mask, Register length, Register index,
                          Register temp, int shift = Address::times_1, int offset = 0,
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp
index 80d9b4844ea..c802f953c90 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp
@@ -515,8 +515,10 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   const int avx3threshold = VM_Version::avx3_threshold();
   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
 
+  const int large_threshold = 2621440;  // 2.5 MB
   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
+  Label L_copy_large, L_finish;
   const Register from = rdi;   // source array address
   const Register to = rsi;     // destination array address
   const Register count = rdx;  // elements count
@@ -577,6 +579,12 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
 
   // PRE-MAIN-POST loop for aligned copy.
   __ BIND(L_entry);
+  if (MaxVectorSize == 64) {
+    __ movq(temp2, temp1);
+    __ shlq(temp2, shift);
+    __ cmpq(temp2, large_threshold);
+    __ jcc(Assembler::greaterEqual, L_copy_large);
+  }
   if (avx3threshold != 0) {
     __ cmpq(count, threshold[shift]);
     if (MaxVectorSize == 64) {
@@ -703,6 +711,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
     __ BIND(L_exit);
   }
 
+  __ BIND(L_finish);
   address ucme_exit_pc = __ pc();
   // When called from generic_arraycopy r11 contains specific values
   // used during arraycopy epilogue, re-initializing r11.
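The dispatch added at L_entry above scales the element count to a byte count before comparing: temp1 at this point holds the element count, shift is log2 of the element size, and the shlq turns the pair into bytes. A hypothetical scalar mirror of that three-instruction sequence, shown only to make the scaling explicit:

    #include <cstddef>

    // Equivalent of the movq/shlq/cmpq sequence guarding L_copy_large:
    // (count << shift) is the copy size in bytes, and 2621440 == 2.5 MB
    // is the stub's large_threshold.
    inline bool takes_large_copy_path(size_t count, int shift) {
      return (count << shift) >= 2621440;
    }

The check is only emitted for MaxVectorSize == 64 builds, since the large-copy routine introduced below relies on 512-bit registers throughout.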
@@ -717,9 +726,77 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   __ leave(); // required for proper stackwalking of RuntimeStub frame
   __ ret(0);
 
+  if (MaxVectorSize == 64) {
+    __ BIND(L_copy_large);
+    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
+    __ jmp(L_finish);
+  }
   return start;
 }
 
+void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
+                                         Register temp3, Register temp4, Register count,
+                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                                         XMMRegister xmm4, int shift) {
+
+  // Type(shift)     byte(0), short(1), int(2), long(3)
+  int loop_size[] = { 256,    128,      64,     32 };
+  int threshold[] = { 4096,   2048,     1024,   512 };
+
+  Label L_main_loop_large;
+  Label L_tail_large;
+  Label L_exit_large;
+  Label L_entry_large;
+  Label L_main_pre_loop_large;
+  Label L_pre_main_post_large;
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  __ BIND(L_entry_large);
+
+  __ BIND(L_pre_main_post_large);
+  // Partial copy to make dst address 64 byte aligned.
+  __ movq(temp2, to);
+  __ andq(temp2, 63);
+  __ jcc(Assembler::equal, L_main_pre_loop_large);
+
+  __ negptr(temp2);
+  __ addq(temp2, 64);
+  if (shift) {
+    __ shrq(temp2, shift);
+  }
+  __ movq(temp3, temp2);
+  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
+  __ movq(temp4, temp2);
+  __ movq(temp1, count);
+  __ subq(temp1, temp2);
+
+  __ cmpq(temp1, loop_size[shift]);
+  __ jcc(Assembler::less, L_tail_large);
+
+  __ BIND(L_main_pre_loop_large);
+  __ subq(temp1, loop_size[shift]);
+
+  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
+  __ align32();
+  __ BIND(L_main_loop_large);
+  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
+  __ addptr(temp4, loop_size[shift]);
+  __ subq(temp1, loop_size[shift]);
+  __ jcc(Assembler::greater, L_main_loop_large);
+  // fence needed because copy256_avx3 uses non-temporal stores
+  __ sfence();
+
+  __ addq(temp1, loop_size[shift]);
+  // Zero length check.
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  __ BIND(L_tail_large);
+  // Tail handling using 64 byte [masked] vector copy operations.
+  __ cmpq(temp1, 0);
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
+                                   temp4, temp3, L_exit_large);
+  __ BIND(L_exit_large);
+}
 // Inputs:
 //   c_rarg0   - source array address
 //   c_rarg1   - destination array address
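To make the three phases of arraycopy_avx3_large easier to follow, here is a standalone model for byte arrays (illustration only, not part of the patch; assumes AVX-512F and AVX-512BW, all names invented): a masked partial copy that 64-byte-aligns the destination, a 256-bytes-per-iteration main loop of cached loads and non-temporal stores with the stub's 0x200/0x400-byte prefetch distances, the mandatory SFENCE, and a masked tail.

    #include <immintrin.h>
    #include <cstddef>
    #include <cstdint>

    // Model of arraycopy_avx3_large for byte elements (shift == 0).
    void large_copy_sketch(uint8_t* dst, const uint8_t* src, size_t len) {
      // Phase 1: masked partial copy making 'dst' 64-byte aligned,
      // mirroring the negptr/addq arithmetic done in temp2.
      size_t peel = (64 - (reinterpret_cast<uintptr_t>(dst) & 63)) & 63;
      if (peel > len) peel = len;
      __mmask64 m = (__mmask64)((1ull << peel) - 1);  // peel < 64 here
      _mm512_mask_storeu_epi8(dst, m, _mm512_maskz_loadu_epi8(m, src));

      // Phase 2: 256 bytes per iteration; cached loads, streaming stores,
      // prefetching 0x200 and 0x400 bytes ahead as copy256_avx3 does.
      size_t i = peel;
      for (; len - i >= 256; i += 256) {
        _mm_prefetch(reinterpret_cast<const char*>(src + i) + 0x200, _MM_HINT_T0);
        _mm_prefetch(reinterpret_cast<const char*>(src + i) + 0x400, _MM_HINT_T0);
        __m512i a = _mm512_loadu_si512(src + i);
        __m512i b = _mm512_loadu_si512(src + i + 64);
        __m512i c = _mm512_loadu_si512(src + i + 128);
        __m512i d = _mm512_loadu_si512(src + i + 192);
        _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + i), a);
        _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + i + 64), b);
        _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + i + 128), c);
        _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + i + 192), d);
      }
      _mm_sfence();  // order the weakly-ordered streaming stores

      // Phase 3: 0..255 remaining bytes, 64 at a time, final chunk masked;
      // this is the job arraycopy_avx3_special_cases_256 performs in the stub.
      while (i < len) {
        size_t chunk = (len - i < 64) ? len - i : 64;
        __mmask64 t = (chunk == 64) ? ~(__mmask64)0
                                    : (__mmask64)((1ull << chunk) - 1);
        _mm512_mask_storeu_epi8(dst + i, t, _mm512_maskz_loadu_epi8(t, src + i));
        i += chunk;
      }
    }

The deliberate asymmetry is the point of the patch: loads stay cached while stores stream past the cache, avoiding the read-for-ownership traffic and cache pollution that ordinary stores incur on buffers this large.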
@@ -965,6 +1042,55 @@ void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask
   __ jmp(L_exit);
 }
 
+void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
+                                                     Register to, Register count, int shift,
+                                                     Register index, Register temp, Label& L_exit) {
+  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
+
+  int size_mat[][4] = {
+    /* T_BYTE  */ {64, 128, 192, 256},
+    /* T_SHORT */ {32,  64,  96, 128},
+    /* T_INT   */ {16,  32,  48,  64},
+    /* T_LONG  */ { 8,  16,  24,  32}
+  };
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  // Case A) Special case for length less than or equal to 64 bytes.
+  __ BIND(L_entry_64);
+  __ cmpq(count, size_mat[shift][0]);
+  __ jccb(Assembler::greater, L_entry_128);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
+  __ jmp(L_exit);
+
+  // Case B) Special case for length less than or equal to 128 bytes.
+  __ BIND(L_entry_128);
+  __ cmpq(count, size_mat[shift][1]);
+  __ jccb(Assembler::greater, L_entry_192);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  __ subq(count, 64 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
+  __ jmp(L_exit);
+
+  // Case C) Special case for length less than or equal to 192 bytes.
+  __ BIND(L_entry_192);
+  __ cmpq(count, size_mat[shift][2]);
+  __ jcc(Assembler::greater, L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  __ subq(count, 128 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
+  __ jmp(L_exit);
+
+  // Case D) Special case for length less than or equal to 256 bytes.
+  __ BIND(L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  copy64_avx(to, from, index, xmm, false, shift, 128, true);
+  __ subq(count, 192 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
+  __ jmp(L_exit);
+}
+
 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
@@ -1040,6 +1166,33 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
   __ jmp(L_exit);
 }
 
+void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
+                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
+                                 int shift, int offset) {
+  if (MaxVectorSize == 64) {
+    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
+    __ prefetcht0(Address(src, index, scale, offset + 0x200));
+    __ prefetcht0(Address(src, index, scale, offset + 0x240));
+    __ prefetcht0(Address(src, index, scale, offset + 0x280));
+    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
+
+    __ prefetcht0(Address(src, index, scale, offset + 0x400));
+    __ prefetcht0(Address(src, index, scale, offset + 0x440));
+    __ prefetcht0(Address(src, index, scale, offset + 0x480));
+    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
+
+    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
+    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
+    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
+    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
+
+    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
+  }
+}
+
 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset,
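The size_mat table above is the 64/128/192/256-byte ladder expressed in elements: size_mat[shift][j] == (64 * (j + 1)) >> shift, so comparing the element count against it is the same as comparing the byte count against 64/128/192/256. A hypothetical scalar rendering of the case selection (illustration only):

    // Which tail case A..D of arraycopy_avx3_special_cases_256 applies.
    inline char tail_case(long count, int shift) {
      long bytes = count << shift;
      if (bytes <= 64)  return 'A';  // one masked 64-byte copy
      if (bytes <= 128) return 'B';  // one full + one masked 64-byte copy
      if (bytes <= 192) return 'C';  // two full + one masked 64-byte copy
      // bytes <= 256 is guaranteed here: the main loop has already
      // consumed every full 256-byte block.
      return 'D';                    // three full + one masked 64-byte copy
    }

In each case the subq drops the already-copied elements from count, so the trailing copy64_masked_avx covers exactly the remainder.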
diff --git a/test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java b/test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java
new file mode 100644
index 00000000000..f5f6bb29490
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmark measuring aligned System.arraycopy.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 10, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(value = 3)
+public class ArrayCopyAlignedLarge {
+
+    @Param({"100000", "1000000", "2000000", "5000000", "10000000"})
+    int length;
+
+    int fromPos, toPos;
+    byte[] fromByteArr, toByteArr;
+
+    @Setup
+    public void setup() {
+        // Both positions aligned
+        fromPos = 0;
+        toPos = 0;
+
+        fromByteArr = new byte[length];
+        toByteArr = new byte[length];
+    }
+
+    @Benchmark
+    public void testByte() {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
+    }
+}
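The chosen @Param lengths straddle the new cutoff: with byte elements, 100000, 1000000 and 2000000 stay below the 2621440-byte large_threshold and keep exercising the existing path, while 5000000 and 10000000 take the new non-temporal path. With the usual OpenJDK harness the micro can be run as, for example, make test TEST="micro:java.lang.ArrayCopyAlignedLarge" (the exact invocation depends on the local build setup).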