mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
8365290: [perf] x86 ArrayFill intrinsic generates SPLIT_STORE for unaligned arrays
Reviewed-by: sviswanathan, vpaprotski, kvn
This commit is contained in:
parent
5d9f94e05e
commit
ba7bf43c76
@ -5847,7 +5847,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
orl(value, rtmp);
|
||||
}
|
||||
|
||||
cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
|
||||
cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
|
||||
jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
|
||||
if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
|
||||
Label L_skip_align2;
|
||||
@ -5910,13 +5910,36 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
BIND(L_check_fill_64_bytes_avx2);
|
||||
}
|
||||
// Fill 64-byte chunks
|
||||
Label L_fill_64_bytes_loop;
|
||||
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
|
||||
|
||||
subptr(count, 16 << shift);
|
||||
jcc(Assembler::less, L_check_fill_32_bytes);
|
||||
align(16);
|
||||
|
||||
// align data for 64-byte chunks
|
||||
Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
|
||||
if (EnableX86ECoreOpts) {
|
||||
// align 'big' arrays to cache lines to minimize split_stores
|
||||
cmpptr(count, 96 << shift);
|
||||
jcc(Assembler::below, L_fill_64_bytes_loop);
|
||||
|
||||
// Find the bytes needed for alignment
|
||||
movptr(rtmp, to);
|
||||
andptr(rtmp, 0x1c);
|
||||
jcc(Assembler::zero, L_fill_64_bytes_loop);
|
||||
negptr(rtmp); // number of bytes to fill is 32 - rtmp (negated here, 32 added next); they are filled 4 bytes at a time by the alignment loop below
|
||||
addptr(rtmp, 32);
|
||||
shrptr(rtmp, 2 - shift);// get number of elements from bytes
|
||||
subptr(count, rtmp); // adjust count by number of elements
|
||||
|
||||
align(16);
|
||||
BIND(L_align_64_bytes_loop);
|
||||
movdl(Address(to, 0), xtmp);
|
||||
addptr(to, 4);
|
||||
subptr(rtmp, 1 << shift);
|
||||
jcc(Assembler::greater, L_align_64_bytes_loop);
|
||||
}
|
||||
|
||||
align(16);
|
||||
BIND(L_fill_64_bytes_loop);
|
||||
vmovdqu(Address(to, 0), xtmp);
|
||||
vmovdqu(Address(to, 32), xtmp);
|
||||
@ -5924,6 +5947,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
subptr(count, 16 << shift);
|
||||
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
|
||||
|
||||
align(16);
|
||||
BIND(L_check_fill_32_bytes);
|
||||
addptr(count, 8 << shift);
|
||||
jccb(Assembler::less, L_check_fill_8_bytes);
|
||||
@ -5968,6 +5992,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
//
|
||||
// length is too short, just fill qwords
|
||||
//
|
||||
align(16);
|
||||
BIND(L_fill_8_bytes_loop);
|
||||
movq(Address(to, 0), xtmp);
|
||||
addptr(to, 8);
|
||||
@ -5976,14 +6001,22 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
||||
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
|
||||
}
|
||||
}
|
||||
// fill trailing 4 bytes
|
||||
BIND(L_fill_4_bytes);
|
||||
testl(count, 1<<shift);
|
||||
|
||||
Label L_fill_4_bytes_loop;
|
||||
testl(count, 1 << shift);
|
||||
jccb(Assembler::zero, L_fill_2_bytes);
|
||||
|
||||
align(16);
|
||||
BIND(L_fill_4_bytes_loop);
|
||||
movl(Address(to, 0), value);
|
||||
addptr(to, 4);
|
||||
|
||||
BIND(L_fill_4_bytes);
|
||||
subptr(count, 1 << shift);
|
||||
jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);
|
||||
|
||||
if (t == T_BYTE || t == T_SHORT) {
|
||||
Label L_fill_byte;
|
||||
addptr(to, 4);
|
||||
BIND(L_fill_2_bytes);
|
||||
// fill trailing 2 bytes
|
||||
testl(count, 1<<(shift-1));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user