8155617: aarch64: ClearArray does not use DC ZVA

Implement block zero using DC ZVA

Co-authored-by: Long Chen <long.chen@linaro.org>
Reviewed-by: aph
This commit is contained in:
Ed Nevill 2016-04-28 13:26:29 +00:00
parent 06645bc9d5
commit 41258ea37a
10 changed files with 234 additions and 26 deletions

View File

@ -13470,9 +13470,10 @@ instruct clearArray_reg_reg(iRegL_R11 cnt, iRegP_R10 base, Universe dummy, rFlag
ins_pipe(pipe_class_memory);
%}
instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr)
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
%{
match(Set dummy (ClearArray cnt base));
effect(USE_KILL base, TEMP tmp);
ins_cost(4 * INSN_COST);
format %{ "ClearArray $cnt, $base" %}

View File

@ -1032,12 +1032,28 @@ public:
system(0b00, 0b011, 0b00011, SY, 0b110);
}
void dc(Register Rt) {
system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt);
void sys(int op1, int CRn, int CRm, int op2,
Register rt = (Register)0b11111) {
system(0b01, op1, CRn, CRm, op2, rt);
}
void ic(Register Rt) {
system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt);
// Cache maintenance operations.
// Only implement operations accessible from EL0 or higher, i.e.,
//             op1  CRn  CRm  op2
//   IC IVAU     3    7    5    1
//   DC CVAC     3    7   10    1
//   DC CVAU     3    7   11    1
//   DC CIVAC    3    7   14    1
//   DC ZVA      3    7    4    1
// op1, CRn and op2 are the same for every row, so only deal with the CRm
// field: each enumerator below is the CRm value of its operation.
// Instruction-cache operations (CRm values).
enum icache_maintenance {IVAU = 0b0101};
// Data-cache operations (CRm values); ZVA is the operation used for block zeroing.
enum dcache_maintenance {CVAC = 0b1010, CVAU = 0b1011, CIVAC = 0b1110, ZVA = 0b100};
// Emit the data-cache maintenance operation selected by 'cm'.
// cm: CRm value of the operation (see dcache_maintenance).
// Rt: register operand handed to the underlying sys() encoding.
void dc(dcache_maintenance cm, Register Rt) {
  // Fields shared by every supported DC operation; only CRm varies.
  const int op1 = 0b011;
  const int CRn = 0b0111;
  const int op2 = 0b001;
  sys(op1, CRn, cm, op2, Rt);
}
// Emit the instruction-cache maintenance operation selected by 'cm'.
// cm: CRm value of the operation (see icache_maintenance).
// Rt: register operand handed to the underlying sys() encoding.
void ic(icache_maintenance cm, Register Rt) {
  // Fields shared by every supported IC operation; only CRm varies.
  const int op1 = 0b011;
  const int CRn = 0b0111;
  const int op2 = 0b001;
  sys(op1, CRn, cm, op2, Rt);
}
// A more convenient access to dmb for our purposes

View File

@ -132,6 +132,11 @@ define_pd_global(intx, InlineSmallCode, 1000);
"Use SIMD instructions in generated memory move code") \
product(bool, UseLSE, false, \
"Use LSE instructions") \
product(bool, UseBlockZeroing, true, \
"Use DC ZVA for block zeroing") \
product(intx, BlockZeroingLowLimit, 256, \
"Minimum size in bytes when block zeroing will be used") \
range(1, max_jint) \
product(bool, TraceTraps, false, "Trace all traps the signal handler")
#endif

View File

@ -4670,24 +4670,35 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
}
// Zero a buffer whose length is only known at run time.
// base:         Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:          Count in HeapWords.
// Both registers are modified by the zeroing sequence that gets emitted.
void MacroAssembler::zero_words(Register base, Register cnt)
{
  // Emit exactly one zeroing sequence. An unconditional fill_words() ahead of
  // this dispatch would already have changed base and cnt, leaving the
  // selected sequence to run on stale register values.
  if (UseBlockZeroing) {
    // Nothing is known about the size of cnt here, so block_zero() is used
    // with its default is_large == false and checks the count itself.
    block_zero(base, cnt);
  } else {
    fill_words(base, cnt, zr);
  }
}
// base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate count in 8-byte unit.
// r10 = base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate count in HeapWords.
// r11 = tmp: For use as cnt if we need to call out
#define ShortArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
Register tmp = r11;
int i = cnt & 1; // store any odd word to start
if (i) str(zr, Address(base));
if (cnt <= ShortArraySize / BytesPerLong) {
for (; i < (int)cnt; i += 2)
stp(zr, zr, Address(base, i * wordSize));
} else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
mov(tmp, cnt);
block_zero(base, tmp, true);
} else {
const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
int remainder = cnt % (2 * unroll);
@ -4739,24 +4750,95 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
assert_different_registers(base, cnt, value, rscratch1, rscratch2);
Label entry, loop;
const int unroll = 8; // Number of str instructions we'll unroll
Label fini, skip, entry, loop;
const int unroll = 8; // Number of stp instructions we'll unroll
andr(rscratch1, cnt, unroll - 1); // tmp1 = cnt % unroll
cbz(rscratch1, entry);
sub(cnt, cnt, rscratch1); // cnt -= tmp1
// base always points to the end of the region we're about to fill
cbz(cnt, fini);
tbz(base, 3, skip);
str(value, Address(post(base, 8)));
sub(cnt, cnt, 1);
bind(skip);
andr(rscratch1, cnt, (unroll-1) * 2);
sub(cnt, cnt, rscratch1);
add(base, base, rscratch1, Assembler::LSL, 3);
adr(rscratch2, entry);
sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
br(rscratch2);
bind(loop);
add(base, base, unroll * 8);
sub(cnt, cnt, unroll);
for (int i = -unroll; i < 0; i++)
str(value, Address(base, i * 8));
stp(value, value, Address(base, i * 16));
bind(entry);
cbnz(cnt, loop);
subs(cnt, cnt, unroll * 2);
add(base, base, unroll * 16);
br(Assembler::GE, loop);
tbz(cnt, 0, fini);
str(value, Address(base, -unroll * 16));
bind(fini);
}
// Use DC ZVA to do fast zeroing.
// base:         Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:          Count in HeapWords.
// is_large:     True when 'cnt' is known to be >= BlockZeroingLowLimit.
//
// base and cnt are modified. rscratch1 and rscratch2 are used as temporaries,
// so neither argument may be one of them.
// NOTE(review): the zero_longs stub called below is generated for the fixed
// register pair (r10, r11) by the stub generator; callers presumably have to
// pass base == r10 and cnt == r11 for the stub to operate on the right
// values -- confirm at every call site.
void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
{
  Label small;
  Label done;
  Label base_aligned;

  // tmp2 (rscratch2) holds the computed branch target below, so it must be
  // checked as well as rscratch1.
  assert_different_registers(base, cnt, rscratch1, rscratch2);

  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();

  // Ensure ZVA length can be divided by 16. This is required by
  // the subsequent operations.
  assert (zva_length % 16 == 0, "Unexpected ZVA Length");

  // A count of zero is only possible when the caller made no size promise.
  if (!is_large) cbz(cnt, done);

  // base is 8-byte aligned; if bit 3 is set, store one word so that base
  // becomes 16-byte aligned for the stp sequences.
  tbz(base, 3, base_aligned);
  str(zr, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(base_aligned);

  // Ensure count >= zva_length * 2 so that it still deserves a zva after
  // alignment. The run-time check is omitted only when the caller's promise
  // (is_large) already implies it.
  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
    cmp(cnt, low_limit >> 3);       // low_limit is in bytes, cnt in words
    br(Assembler::LT, small);
  }

  // The stub zeroes whole ZVA blocks; execution then falls through to the
  // small path to handle the words the stub left over.
  far_call(StubRoutines::aarch64::get_zero_longs());

  bind(small);

  const int unroll = 8; // Number of stp instructions we'll unroll
  Label small_loop, small_table_end;

  // tmp = number of words (always even) handled by a partial pass through
  // the stp table: advance base past them and jump into the table so that
  // exactly tmp / 2 stp instructions execute (4 code bytes per 2 words).
  andr(tmp, cnt, (unroll-1) * 2);
  sub(cnt, cnt, tmp);
  add(base, base, tmp, Assembler::LSL, 3);
  adr(tmp2, small_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
  br(tmp2);

  bind(small_loop);
  for (int i = -unroll; i < 0; i++)
    stp(zr, zr, Address(base, i * 16));
  bind(small_table_end);
  // Pre-advance for the next full pass; it is taken only while at least
  // unroll * 2 words remain.
  subs(cnt, cnt, unroll * 2);
  add(base, base, unroll * 16);
  br(Assembler::GE, small_loop);

  // Bit 0 of cnt flags one odd trailing word. base was advanced by
  // unroll * 16 above, hence the negative offset.
  tbz(cnt, 0, done);
  str(zr, Address(base, -unroll * 16));

  bind(done);
}
// encode char[] to byte[] in ISO_8859_1

View File

@ -536,6 +536,15 @@ public:
msr(0b011, 0b0100, 0b0100, 0b001, zr);
}
// Read the DCZID_EL0 system register into 'reg'.
// Encoding of DCZID_EL0:
//   op1 == 011, CRn == 0000, CRm == 0000, op2 == 111
inline void get_dczid_el0(Register reg)
{
  const int op1 = 0b011;
  const int CRn = 0b0000;
  const int CRm = 0b0000;
  const int op2 = 0b111;
  mrs(op1, CRn, CRm, op2, reg);
}
// idiv variant which deals with MINLONG as dividend and -1 as divisor
int corrected_idivl(Register result, Register ra, Register rb,
bool want_remainder, Register tmp = rscratch1);
@ -1185,8 +1194,9 @@ public:
int elem_size, bool is_string);
void fill_words(Register base, Register cnt, Register value);
void zero_words(Register base, Register cnt);
void zero_words(Register base, u_int64_t cnt);
void zero_words(Register base, Register cnt);
void block_zero(Register base, Register cnt, bool is_large = false);
void encode_iso_array(Register src, Register dst,
Register len, Register result,

View File

@ -719,6 +719,43 @@ class StubGenerator: public StubCodeGenerator {
}
}
// Generate the "zero_longs" stub, which zeroes whole ZVA blocks with DC ZVA.
//
// base: register holding the address of the buffer to zero
// cnt:  register holding the count in 8-byte words
//
// On return from the generated code, base points just past the last block
// zeroed by DC ZVA and cnt holds the number of words that still have to be
// zeroed by the caller. rscratch1 and rscratch2 are used as temporaries.
// Returns the entry address of the generated stub.
address generate_zero_longs(Register base, Register cnt) {
Register tmp = rscratch1;
Register tmp2 = rscratch2;
int zva_length = VM_Version::zva_length();
Label initial_table_end, loop_zva;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "zero_longs");
address start = __ pc();
// Align base with ZVA length: tmp = (-base) & (zva_length - 1).
__ neg(tmp, base);
__ andr(tmp, tmp, zva_length - 1);
// tmp: the number of bytes to be filled to align the base with ZVA length.
// Advance base to the aligned address and take those bytes (tmp / 8 words)
// out of the count; they are zeroed by the stp table below.
__ add(base, base, tmp);
__ sub(cnt, cnt, tmp, Assembler::ASR, 3);
// Computed branch into the stp table: each stp zeroes 16 bytes and is one
// 4-byte instruction, so start tmp / 4 code bytes before the table end.
// NOTE(review): this relies on tmp being a multiple of 16, i.e. on base
// being 16-byte aligned on entry -- confirm callers guarantee that.
__ adr(tmp2, initial_table_end);
__ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
__ br(tmp2);
// Table of stores covering the (at most zva_length - 16) bytes that
// precede the aligned base.
for (int i = -zva_length + 16; i < 0; i += 16)
__ stp(zr, zr, Address(base, i));
__ bind(initial_table_end);
// Bias cnt by one block so the loop's GE test means "another whole block
// remains after this one".
__ sub(cnt, cnt, zva_length >> 3);
__ bind(loop_zva);
__ dc(Assembler::ZVA, base);
__ subs(cnt, cnt, zva_length >> 3);
__ add(base, base, zva_length);
__ br(Assembler::GE, loop_zva);
// Undo the bias applied before the loop.
__ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
__ ret(lr);
return start;
}
typedef enum {
copy_forwards = 1,
copy_backwards = -1
@ -2104,7 +2141,21 @@ class StubGenerator: public StubCodeGenerator {
__ lsrw(cnt_words, count, 3 - shift); // number of words
__ bfi(value, value, 32, 32); // 32 bit -> 64 bit
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
__ fill_words(to, cnt_words, value);
if (UseBlockZeroing) {
Label non_block_zeroing, rest;
// count >= BlockZeroingLowLimit && value == 0
__ cmp(cnt_words, BlockZeroingLowLimit >> 3);
__ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
__ br(Assembler::NE, non_block_zeroing);
__ block_zero(to, cnt_words, true);
__ b(rest);
__ bind(non_block_zeroing);
__ fill_words(to, cnt_words, value);
__ bind(rest);
}
else {
__ fill_words(to, cnt_words, value);
}
// Remaining count is less than 8 bytes. Fill it by a single store.
// Note that the total length is no less than 8 bytes.
@ -2163,6 +2214,8 @@ class StubGenerator: public StubCodeGenerator {
generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,

View File

@ -43,6 +43,7 @@ address StubRoutines::aarch64::_float_sign_mask = NULL;
address StubRoutines::aarch64::_float_sign_flip = NULL;
address StubRoutines::aarch64::_double_sign_mask = NULL;
address StubRoutines::aarch64::_double_sign_flip = NULL;
address StubRoutines::aarch64::_zero_longs = NULL;
/**
* crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h

View File

@ -61,6 +61,8 @@ class aarch64 {
static address _double_sign_mask;
static address _double_sign_flip;
static address _zero_longs;
public:
static address get_previous_fp_entry()
@ -113,6 +115,11 @@ class aarch64 {
return _double_sign_flip;
}
// Returns the entry address stored in _zero_longs, i.e. the "zero_longs"
// stub once the stub generator has assigned it (the field starts out NULL).
static address get_zero_longs()
{
return _zero_longs;
}
private:
static juint _crc_table[];

View File

@ -71,6 +71,7 @@ int VM_Version::_model2;
int VM_Version::_variant;
int VM_Version::_revision;
int VM_Version::_stepping;
VM_Version::PsrInfo VM_Version::_psr_info = { 0, };
static BufferBlob* stub_blob;
static const int stub_size = 550;
@ -95,13 +96,16 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ c_stub_prolog(1, 0, MacroAssembler::ret_type_void);
#endif
// void getPsrInfo(VM_Version::CpuidInfo* cpuid_info);
// void getPsrInfo(VM_Version::PsrInfo* psr_info);
address entry = __ pc();
// TODO : redefine fields in CpuidInfo and generate
// code to fill them in
__ enter();
__ get_dczid_el0(rscratch1);
__ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset())));
__ leave();
__ ret(lr);
# undef __
@ -118,6 +122,8 @@ void VM_Version::get_processor_features() {
_supports_atomic_getset8 = true;
_supports_atomic_getadd8 = true;
getPsrInfo_stub(&_psr_info);
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
@ -285,6 +291,18 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (is_zva_enabled()) {
if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
FLAG_SET_DEFAULT(UseBlockZeroing, true);
}
if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * VM_Version::zva_length());
}
} else if (UseBlockZeroing) {
warning("DC ZVA is not available on this CPU");
FLAG_SET_DEFAULT(UseBlockZeroing, false);
}
// This machine allows unaligned memory accesses
if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
FLAG_SET_DEFAULT(UseUnalignedAccesses, true);

View File

@ -40,6 +40,10 @@ protected:
static int _revision;
static int _stepping;
// Processor state filled in by the getPsrInfo stub.
struct PsrInfo {
// Value of the DCZID_EL0 system register as stored by the stub.
uint32_t dczid_el0;
};
static PsrInfo _psr_info;
static void get_processor_features();
public:
@ -83,6 +87,17 @@ public:
static int cpu_model2() { return _model2; }
static int cpu_variant() { return _variant; }
static int cpu_revision() { return _revision; }
// Byte offset of the dczid_el0 field inside PsrInfo; used by the generated
// getPsrInfo stub to store the register value through its PsrInfo* argument.
static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); }
// True when the cached DCZID_EL0 value permits DC ZVA: the DZP bit (bit 4)
// must be clear and the block size field (bits 0..3) must be non-zero.
static bool is_zva_enabled() {
  const uint32_t dczid = _psr_info.dczid_el0;
  const bool prohibited = (dczid & 0x10) != 0;      // DZP bit set
  const bool has_block_size = (dczid & 0xf) != 0;   // block size field
  return !prohibited && has_block_size;
}
// Size of the block zeroed by one DC ZVA, computed as 4 << (bits 0..3 of the
// cached DCZID_EL0 value). Must only be called when is_zva_enabled() holds.
static int zva_length() {
  assert(is_zva_enabled(), "ZVA not available");
  const uint32_t block_size_field = _psr_info.dczid_el0 & 0xf;
  return 4 << block_size_field;
}
};
#endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP