8365290: [perf] x86 ArrayFill intrinsic generates SPLIT_STORE for unaligned arrays

Reviewed-by: sviswanathan, vpaprotski, kvn
This commit is contained in:
Vladimir Ivanov 2025-10-05 23:55:53 +00:00 committed by Sandhya Viswanathan
parent 5d9f94e05e
commit ba7bf43c76

View File

@ -5847,7 +5847,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
orl(value, rtmp);
}
cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
cmpptr(count, 8 << shift); // Short arrays (< 32 bytes) fill by element
jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
Label L_skip_align2;
@ -5910,13 +5910,36 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
BIND(L_check_fill_64_bytes_avx2);
}
// Fill 64-byte chunks
Label L_fill_64_bytes_loop;
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
subptr(count, 16 << shift);
jcc(Assembler::less, L_check_fill_32_bytes);
align(16);
// align data for 64-byte chunks
Label L_fill_64_bytes_loop, L_align_64_bytes_loop;
if (EnableX86ECoreOpts) {
// align 'big' arrays to cache lines to minimize split_stores
cmpptr(count, 96 << shift);
jcc(Assembler::below, L_fill_64_bytes_loop);
// Find the bytes needed for alignment
movptr(rtmp, to);
andptr(rtmp, 0x1c);
jcc(Assembler::zero, L_fill_64_bytes_loop);
negptr(rtmp); // bytes to fill = 32 - rtmp; 32-byte alignment is enough because each 64-byte chunk is filled by two 32-byte stores
addptr(rtmp, 32);
shrptr(rtmp, 2 - shift);// get number of elements from bytes
subptr(count, rtmp); // adjust count by number of elements
align(16);
BIND(L_align_64_bytes_loop);
movdl(Address(to, 0), xtmp);
addptr(to, 4);
subptr(rtmp, 1 << shift);
jcc(Assembler::greater, L_align_64_bytes_loop);
}
align(16);
BIND(L_fill_64_bytes_loop);
vmovdqu(Address(to, 0), xtmp);
vmovdqu(Address(to, 32), xtmp);
@ -5924,6 +5947,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
subptr(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
align(16);
BIND(L_check_fill_32_bytes);
addptr(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
@ -5968,6 +5992,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
//
// length is too short, just fill qwords
//
align(16);
BIND(L_fill_8_bytes_loop);
movq(Address(to, 0), xtmp);
addptr(to, 8);
@ -5976,14 +6001,22 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
}
}
// fill trailing 4 bytes
BIND(L_fill_4_bytes);
testl(count, 1<<shift);
Label L_fill_4_bytes_loop;
testl(count, 1 << shift);
jccb(Assembler::zero, L_fill_2_bytes);
align(16);
BIND(L_fill_4_bytes_loop);
movl(Address(to, 0), value);
addptr(to, 4);
BIND(L_fill_4_bytes);
subptr(count, 1 << shift);
jccb(Assembler::greaterEqual, L_fill_4_bytes_loop);
if (t == T_BYTE || t == T_SHORT) {
Label L_fill_byte;
addptr(to, 4);
BIND(L_fill_2_bytes);
// fill trailing 2 bytes
testl(count, 1<<(shift-1));