mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-23 14:19:56 +00:00
7079329: Adjust allocation prefetching for T4
On T4, 2 BIS instructions should be issued to prefetch 64 bytes. Reviewed-by: iveresov, phh, twisti
This commit is contained in:
parent
080f790edc
commit
90651b2666
@ -886,7 +886,11 @@ class Assembler : public AbstractAssembler {
|
||||
|
||||
enum ASIs { // page 72, v9
|
||||
ASI_PRIMARY = 0x80,
|
||||
ASI_PRIMARY_LITTLE = 0x88
|
||||
ASI_PRIMARY_LITTLE = 0x88,
|
||||
// Block initializing store
|
||||
ASI_ST_BLKINIT_PRIMARY = 0xE2,
|
||||
// Most-Recently-Used (MRU) BIS variant
|
||||
ASI_ST_BLKINIT_MRU_PRIMARY = 0xF2
|
||||
// add more from book as needed
|
||||
};
|
||||
|
||||
|
||||
@ -471,9 +471,6 @@ extern bool can_branch_register( Node *bol, Node *cmp );
|
||||
source %{
|
||||
#define __ _masm.
|
||||
|
||||
// Block initializing store
|
||||
#define ASI_BLK_INIT_QUAD_LDD_P 0xE2
|
||||
|
||||
// tertiary op of a LoadP or StoreP encoding
|
||||
#define REGP_OP true
|
||||
|
||||
@ -2819,10 +2816,10 @@ enc_class Fast_Unlock(iRegP oop, iRegP box, o7RegP scratch, iRegP scratch2) %{
|
||||
Register nof_bytes_arg = reg_to_register_object($cnt$$reg);
|
||||
Register nof_bytes_tmp = reg_to_register_object($temp$$reg);
|
||||
Register base_pointer_arg = reg_to_register_object($base$$reg);
|
||||
|
||||
|
||||
Label loop;
|
||||
__ mov(nof_bytes_arg, nof_bytes_tmp);
|
||||
|
||||
|
||||
// Loop and clear, walking backwards through the array.
|
||||
// nof_bytes_tmp (if >0) is always the number of bytes to zero
|
||||
__ bind(loop);
|
||||
@ -6269,6 +6266,7 @@ instruct loadConD(regD dst, immD con, o7RegI tmp) %{
|
||||
instruct prefetchr( memory mem ) %{
|
||||
match( PrefetchRead mem );
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
size(4);
|
||||
|
||||
format %{ "PREFETCH $mem,0\t! Prefetch read-many" %}
|
||||
opcode(Assembler::prefetch_op3);
|
||||
@ -6277,9 +6275,9 @@ instruct prefetchr( memory mem ) %{
|
||||
%}
|
||||
|
||||
instruct prefetchw( memory mem ) %{
|
||||
predicate(AllocatePrefetchStyle != 3 );
|
||||
match( PrefetchWrite mem );
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
size(4);
|
||||
|
||||
format %{ "PREFETCH $mem,2\t! Prefetch write-many (and read)" %}
|
||||
opcode(Assembler::prefetch_op3);
|
||||
@ -6287,24 +6285,62 @@ instruct prefetchw( memory mem ) %{
|
||||
ins_pipe(iload_mem);
|
||||
%}
|
||||
|
||||
// Use BIS instruction to prefetch.
|
||||
instruct prefetchw_bis( memory mem ) %{
|
||||
predicate(AllocatePrefetchStyle == 3);
|
||||
match( PrefetchWrite mem );
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
// Prefetch instructions for allocation.
|
||||
|
||||
format %{ "STXA G0,$mem\t! // Block initializing store" %}
|
||||
instruct prefetchAlloc( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr == 0);
|
||||
match( PrefetchAllocation mem );
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
size(4);
|
||||
|
||||
format %{ "PREFETCH $mem,2\t! Prefetch allocation" %}
|
||||
opcode(Assembler::prefetch_op3);
|
||||
ins_encode( form3_mem_prefetch_write( mem ) );
|
||||
ins_pipe(iload_mem);
|
||||
%}
|
||||
|
||||
// Use BIS instruction to prefetch for allocation.
|
||||
// Could fault, need space at the end of TLAB.
|
||||
instruct prefetchAlloc_bis( iRegP dst ) %{
|
||||
predicate(AllocatePrefetchInstr == 1);
|
||||
match( PrefetchAllocation dst );
|
||||
ins_cost(MEMORY_REF_COST);
|
||||
size(4);
|
||||
|
||||
format %{ "STXA [$dst]\t! // Prefetch allocation using BIS" %}
|
||||
ins_encode %{
|
||||
Register base = as_Register($mem$$base);
|
||||
int disp = $mem$$disp;
|
||||
if (disp != 0) {
|
||||
__ add(base, AllocatePrefetchStepSize, base);
|
||||
}
|
||||
__ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P);
|
||||
__ stxa(G0, $dst$$Register, G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
|
||||
%}
|
||||
ins_pipe(istore_mem_reg);
|
||||
%}
|
||||
|
||||
// Next code is used for finding next cache line address to prefetch.
// Both variants match a pointer that is cast to an integer, ANDed with an
// immediate mask and cast back to a pointer, and encode it as one and3.
// They differ only in the mask operand type and the ideal AND node.
|
||||
#ifndef _LP64
|
||||
// Variant used when _LP64 is not defined: immI13 mask, AndI node.
instruct cacheLineAdr( iRegP dst, iRegP src, immI13 mask ) %{
|
||||
match(Set dst (CastX2P (AndI (CastP2X src) mask)));
|
||||
ins_cost(DEFAULT_COST);
|
||||
// Declared encoding size in bytes (the encoding below is a single and3).
size(4);
|
||||
|
||||
format %{ "AND $src,$mask,$dst\t! next cache line address" %}
|
||||
ins_encode %{
|
||||
__ and3($src$$Register, $mask$$constant, $dst$$Register);
|
||||
%}
|
||||
ins_pipe(ialu_reg_imm);
|
||||
%}
|
||||
#else
|
||||
// Variant used when _LP64 is defined: immL13 mask, AndL node.
instruct cacheLineAdr( iRegP dst, iRegP src, immL13 mask ) %{
|
||||
match(Set dst (CastX2P (AndL (CastP2X src) mask)));
|
||||
ins_cost(DEFAULT_COST);
|
||||
// Declared encoding size in bytes (the encoding below is a single and3).
size(4);
|
||||
|
||||
format %{ "AND $src,$mask,$dst\t! next cache line address" %}
|
||||
ins_encode %{
|
||||
__ and3($src$$Register, $mask$$constant, $dst$$Register);
|
||||
%}
|
||||
ins_pipe(ialu_reg_imm);
|
||||
%}
|
||||
#endif
|
||||
|
||||
//----------Store Instructions-------------------------------------------------
|
||||
// Store Byte
|
||||
instruct storeB(memory mem, iRegI src) %{
|
||||
|
||||
@ -44,20 +44,31 @@ void VM_Version::initialize() {
|
||||
PrefetchScanIntervalInBytes = prefetch_scan_interval_in_bytes();
|
||||
PrefetchFieldsAhead = prefetch_fields_ahead();
|
||||
|
||||
assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 1, "invalid value");
|
||||
if( AllocatePrefetchInstr < 0 ) AllocatePrefetchInstr = 0;
|
||||
if( AllocatePrefetchInstr > 1 ) AllocatePrefetchInstr = 0;
|
||||
|
||||
// Allocation prefetch settings
|
||||
intx cache_line_size = L1_data_cache_line_size();
|
||||
intx cache_line_size = prefetch_data_size();
|
||||
if( cache_line_size > AllocatePrefetchStepSize )
|
||||
AllocatePrefetchStepSize = cache_line_size;
|
||||
if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
|
||||
AllocatePrefetchLines = 3; // Optimistic value
|
||||
assert( AllocatePrefetchLines > 0, "invalid value");
|
||||
if( AllocatePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocatePrefetchLines = 1; // Conservative value
|
||||
|
||||
assert(AllocatePrefetchLines > 0, "invalid value");
|
||||
if( AllocatePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocatePrefetchLines = 3;
|
||||
assert(AllocateInstancePrefetchLines > 0, "invalid value");
|
||||
if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocateInstancePrefetchLines = 1;
|
||||
|
||||
AllocatePrefetchDistance = allocate_prefetch_distance();
|
||||
AllocatePrefetchStyle = allocate_prefetch_style();
|
||||
|
||||
assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
|
||||
assert((AllocatePrefetchDistance % AllocatePrefetchStepSize) == 0 &&
|
||||
(AllocatePrefetchDistance > 0), "invalid value");
|
||||
if ((AllocatePrefetchDistance % AllocatePrefetchStepSize) != 0 ||
|
||||
(AllocatePrefetchDistance <= 0)) {
|
||||
AllocatePrefetchDistance = AllocatePrefetchStepSize;
|
||||
}
|
||||
|
||||
if (AllocatePrefetchStyle == 3 && !has_blk_init()) {
|
||||
warning("BIS instructions are not available on this CPU");
|
||||
@ -66,7 +77,7 @@ void VM_Version::initialize() {
|
||||
|
||||
UseSSE = 0; // Only on x86 and x64
|
||||
|
||||
_supports_cx8 = has_v9();
|
||||
_supports_cx8 = has_v9();
|
||||
|
||||
if (is_niagara()) {
|
||||
// Indirect branch is the same cost as direct
|
||||
@ -99,19 +110,42 @@ void VM_Version::initialize() {
|
||||
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
|
||||
}
|
||||
if (is_niagara_plus()) {
|
||||
if (has_blk_init() && AllocatePrefetchStyle > 0 &&
|
||||
FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
|
||||
// Use BIS instruction for allocation prefetch.
|
||||
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
|
||||
if (has_blk_init() && UseTLAB &&
|
||||
FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {
|
||||
// Use BIS instruction for TLAB allocation prefetch.
|
||||
FLAG_SET_ERGO(intx, AllocatePrefetchInstr, 1);
|
||||
if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
|
||||
FLAG_SET_ERGO(intx, AllocatePrefetchStyle, 3);
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
|
||||
// Use smaller prefetch distance on N2 with BIS
|
||||
// Use smaller prefetch distance with BIS
|
||||
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64);
|
||||
}
|
||||
}
|
||||
if (is_T4()) {
|
||||
// Double number of prefetched cache lines on T4
|
||||
// since L2 cache line size is smaller (32 bytes).
|
||||
if (FLAG_IS_DEFAULT(AllocatePrefetchLines)) {
|
||||
FLAG_SET_ERGO(intx, AllocatePrefetchLines, AllocatePrefetchLines*2);
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(AllocateInstancePrefetchLines)) {
|
||||
FLAG_SET_ERGO(intx, AllocateInstancePrefetchLines, AllocateInstancePrefetchLines*2);
|
||||
}
|
||||
}
|
||||
if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
|
||||
// Use different prefetch distance without BIS
|
||||
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
|
||||
}
|
||||
if (AllocatePrefetchInstr == 1) {
|
||||
// Need a space at the end of TLAB for BIS since it
|
||||
// will fault when accessing memory outside of heap.
|
||||
|
||||
// +1 for rounding up to next cache line, +1 to be safe
|
||||
int lines = AllocatePrefetchLines + 2;
|
||||
int step_size = AllocatePrefetchStepSize;
|
||||
int distance = AllocatePrefetchDistance;
|
||||
_reserve_for_allocation_prefetch = (distance + step_size*lines)/(int)HeapWordSize;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -185,14 +219,20 @@ void VM_Version::initialize() {
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (PrintMiscellaneous && Verbose) {
|
||||
tty->print("Allocation: ");
|
||||
tty->print("Allocation");
|
||||
if (AllocatePrefetchStyle <= 0) {
|
||||
tty->print_cr("no prefetching");
|
||||
tty->print_cr(": no prefetching");
|
||||
} else {
|
||||
tty->print(" prefetching: ");
|
||||
if (AllocatePrefetchInstr == 0) {
|
||||
tty->print("PREFETCH");
|
||||
} else if (AllocatePrefetchInstr == 1) {
|
||||
tty->print("BIS");
|
||||
}
|
||||
if (AllocatePrefetchLines > 1) {
|
||||
tty->print_cr("PREFETCH %d, %d lines of size %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
|
||||
tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
|
||||
} else {
|
||||
tty->print_cr("PREFETCH %d, one line", AllocatePrefetchDistance);
|
||||
tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
|
||||
}
|
||||
}
|
||||
if (PrefetchCopyIntervalInBytes > 0) {
|
||||
|
||||
@ -121,6 +121,7 @@ public:
|
||||
// Returns true if the platform is in the niagara line (T series)
|
||||
// and newer than the niagara1.
|
||||
static bool is_niagara_plus() { return is_T_family(_features) && !is_T1_model(_features); }
|
||||
static bool is_T4() { return is_T_family(_features) && has_cbcond(); }
|
||||
|
||||
// Fujitsu SPARC64
|
||||
static bool is_sparc64() { return (_features & sparc64_family_m) != 0; }
|
||||
@ -130,13 +131,17 @@ public:
|
||||
|
||||
static bool has_fast_fxtof() { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); }
|
||||
static bool has_fast_idiv() { return is_niagara_plus() || is_sparc64(); }
|
||||
|
||||
// T4 and newer Sparc have fast RDPC instruction.
|
||||
static bool has_fast_rdpc() { return is_niagara_plus() && has_cbcond(); }
|
||||
static bool has_fast_rdpc() { return is_T4(); }
|
||||
|
||||
// T4 and newer Sparc have Most-Recently-Used (MRU) BIS.
|
||||
static bool has_mru_blk_init() { return has_blk_init() && is_T4(); }
|
||||
|
||||
static const char* cpu_features() { return _features_str; }
|
||||
|
||||
static intx L1_data_cache_line_size() {
|
||||
return 64; // default prefetch block size on sparc
|
||||
static intx prefetch_data_size() {
|
||||
return is_T4() ? 32 : 64; // default prefetch block size on sparc
|
||||
}
|
||||
|
||||
// Prefetch
|
||||
|
||||
@ -2315,7 +2315,7 @@ void Assembler::prefetchnta(Address src) {
|
||||
}
|
||||
|
||||
void Assembler::prefetchr(Address src) {
|
||||
NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
|
||||
assert(VM_Version::supports_3dnow_prefetch(), "must support");
|
||||
InstructionMark im(this);
|
||||
prefetch_prefix(src);
|
||||
emit_byte(0x0D);
|
||||
@ -2347,7 +2347,7 @@ void Assembler::prefetcht2(Address src) {
|
||||
}
|
||||
|
||||
void Assembler::prefetchw(Address src) {
|
||||
NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
|
||||
assert(VM_Version::supports_3dnow_prefetch(), "must support");
|
||||
InstructionMark im(this);
|
||||
prefetch_prefix(src);
|
||||
emit_byte(0x0D);
|
||||
|
||||
@ -557,14 +557,16 @@ void VM_Version::get_processor_features() {
|
||||
if( !supports_sse() && supports_3dnow_prefetch() ) AllocatePrefetchInstr = 3;
|
||||
|
||||
// Allocation prefetch settings
|
||||
intx cache_line_size = L1_data_cache_line_size();
|
||||
intx cache_line_size = prefetch_data_size();
|
||||
if( cache_line_size > AllocatePrefetchStepSize )
|
||||
AllocatePrefetchStepSize = cache_line_size;
|
||||
if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
|
||||
AllocatePrefetchLines = 3; // Optimistic value
|
||||
|
||||
assert(AllocatePrefetchLines > 0, "invalid value");
|
||||
if( AllocatePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocatePrefetchLines = 1; // Conservative value
|
||||
if( AllocatePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocatePrefetchLines = 3;
|
||||
assert(AllocateInstancePrefetchLines > 0, "invalid value");
|
||||
if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
|
||||
AllocateInstancePrefetchLines = 1;
|
||||
|
||||
AllocatePrefetchDistance = allocate_prefetch_distance();
|
||||
AllocatePrefetchStyle = allocate_prefetch_style();
|
||||
@ -601,10 +603,11 @@ void VM_Version::get_processor_features() {
|
||||
tty->print_cr("Logical CPUs per core: %u",
|
||||
logical_processors_per_package());
|
||||
tty->print_cr("UseSSE=%d",UseSSE);
|
||||
tty->print("Allocation: ");
|
||||
tty->print("Allocation");
|
||||
if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
|
||||
tty->print_cr("no prefetching");
|
||||
tty->print_cr(": no prefetching");
|
||||
} else {
|
||||
tty->print(" prefetching: ");
|
||||
if (UseSSE == 0 && supports_3dnow_prefetch()) {
|
||||
tty->print("PREFETCHW");
|
||||
} else if (UseSSE >= 1) {
|
||||
@ -619,9 +622,9 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
}
|
||||
if (AllocatePrefetchLines > 1) {
|
||||
tty->print_cr(" %d, %d lines with step %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
|
||||
tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
|
||||
} else {
|
||||
tty->print_cr(" %d, one line", AllocatePrefetchDistance);
|
||||
tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -419,7 +419,7 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
static intx L1_data_cache_line_size() {
|
||||
static intx prefetch_data_size() {
|
||||
intx result = 0;
|
||||
if (is_intel()) {
|
||||
result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
|
||||
|
||||
@ -7325,8 +7325,9 @@ instruct prefetchr( memory mem ) %{
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
|
||||
opcode(0x0F, 0x0d); /* Opcode 0F 0d /0 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
|
||||
ins_encode %{
|
||||
__ prefetchr($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -7336,8 +7337,9 @@ instruct prefetchrNTA( memory mem ) %{
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -7347,8 +7349,9 @@ instruct prefetchrT0( memory mem ) %{
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
|
||||
ins_encode %{
|
||||
__ prefetcht0($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -7358,8 +7361,9 @@ instruct prefetchrT2( memory mem ) %{
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
|
||||
ins_encode %{
|
||||
__ prefetcht2($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -7374,46 +7378,86 @@ instruct prefetchw0( memory mem ) %{
|
||||
%}
|
||||
|
||||
instruct prefetchw( memory mem ) %{
|
||||
predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || AllocatePrefetchInstr==3);
|
||||
predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch());
|
||||
match( PrefetchWrite mem );
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
|
||||
opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
|
||||
ins_encode %{
|
||||
__ prefetchw($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwNTA( memory mem ) %{
|
||||
predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
|
||||
predicate(UseSSE>=1);
|
||||
match(PrefetchWrite mem);
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwT0( memory mem ) %{
|
||||
// Prefetch instructions for allocation.
|
||||
|
||||
// Placeholder rule for PrefetchAllocation when UseSSE==0 and
// AllocatePrefetchInstr!=3: it matches the node but emits no code
// (empty ins_encode, size 0, cost 0, empty pipeline class).
instruct prefetchAlloc0( memory mem ) %{
|
||||
predicate(UseSSE==0 && AllocatePrefetchInstr!=3);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(0);
|
||||
size(0);
|
||||
format %{ "Prefetch allocation (non-SSE is empty encoding)" %}
|
||||
ins_encode();
|
||||
ins_pipe(empty);
|
||||
%}
|
||||
|
||||
instruct prefetchAlloc( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==3);
|
||||
match( PrefetchAllocation mem );
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %}
|
||||
ins_encode %{
|
||||
__ prefetchw($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchAllocNTA( memory mem ) %{
|
||||
predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %}
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchAllocT0( memory mem ) %{
|
||||
predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
|
||||
match(PrefetchWrite mem);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
|
||||
format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %}
|
||||
ins_encode %{
|
||||
__ prefetcht0($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwT2( memory mem ) %{
|
||||
instruct prefetchAllocT2( memory mem ) %{
|
||||
predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
|
||||
match(PrefetchWrite mem);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(100);
|
||||
|
||||
format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
|
||||
ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
|
||||
format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %}
|
||||
ins_encode %{
|
||||
__ prefetcht2($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
|
||||
@ -6617,8 +6617,9 @@ instruct prefetchr( memory mem ) %{
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %}
|
||||
opcode(0x0F, 0x0D); /* Opcode 0F 0D /0 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
|
||||
ins_encode %{
|
||||
__ prefetchr($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -6628,8 +6629,9 @@ instruct prefetchrNTA( memory mem ) %{
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -6639,8 +6641,9 @@ instruct prefetchrT0( memory mem ) %{
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
|
||||
ins_encode %{
|
||||
__ prefetcht0($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
@ -6650,52 +6653,70 @@ instruct prefetchrT2( memory mem ) %{
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchw( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==3);
|
||||
match(PrefetchWrite mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHW $mem\t# Prefetch into level 1 cache and mark modified" %}
|
||||
opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
|
||||
ins_encode %{
|
||||
__ prefetcht2($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwNTA( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==0);
|
||||
match(PrefetchWrite mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwT0( memory mem ) %{
|
||||
// Prefetch instructions for allocation.
|
||||
|
||||
instruct prefetchAlloc( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==3);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %}
|
||||
ins_encode %{
|
||||
__ prefetchw($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchAllocNTA( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==0);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %}
|
||||
ins_encode %{
|
||||
__ prefetchnta($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchAllocT0( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==1);
|
||||
match(PrefetchWrite mem);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHT0 $mem\t# Prefetch to level 1 and 2 caches for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
|
||||
format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %}
|
||||
ins_encode %{
|
||||
__ prefetcht0($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
instruct prefetchwT2( memory mem ) %{
|
||||
instruct prefetchAllocT2( memory mem ) %{
|
||||
predicate(AllocatePrefetchInstr==2);
|
||||
match(PrefetchWrite mem);
|
||||
match(PrefetchAllocation mem);
|
||||
ins_cost(125);
|
||||
|
||||
format %{ "PREFETCHT2 $mem\t# Prefetch to level 2 cache for write" %}
|
||||
opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
|
||||
ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
|
||||
format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %}
|
||||
ins_encode %{
|
||||
__ prefetcht2($mem$$Address);
|
||||
%}
|
||||
ins_pipe(ialu_mem);
|
||||
%}
|
||||
|
||||
|
||||
@ -3390,7 +3390,9 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
|
||||
"ClearArray"
|
||||
};
|
||||
int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*);
|
||||
if( strcmp(_opType,"PrefetchRead")==0 || strcmp(_opType,"PrefetchWrite")==0 )
|
||||
if( strcmp(_opType,"PrefetchRead")==0 ||
|
||||
strcmp(_opType,"PrefetchWrite")==0 ||
|
||||
strcmp(_opType,"PrefetchAllocation")==0 )
|
||||
return 1;
|
||||
if( _lChild ) {
|
||||
const char *opType = _lChild->_opType;
|
||||
|
||||
@ -124,16 +124,7 @@ public:
|
||||
// Reserve space at the end of TLAB
|
||||
static size_t end_reserve() {
|
||||
int reserve_size = typeArrayOopDesc::header_size(T_INT);
|
||||
if (AllocatePrefetchStyle == 3) {
|
||||
// BIS is used to prefetch - we need a space for it.
|
||||
// +1 for rounding up to next cache line +1 to be safe
|
||||
int lines = AllocatePrefetchLines + 2;
|
||||
int step_size = AllocatePrefetchStepSize;
|
||||
int distance = AllocatePrefetchDistance;
|
||||
int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize;
|
||||
reserve_size = MAX2(reserve_size, prefetch_end);
|
||||
}
|
||||
return reserve_size;
|
||||
return MAX2(reserve_size, VM_Version::reserve_for_allocation_prefetch());
|
||||
}
|
||||
static size_t alignment_reserve() { return align_object_size(end_reserve()); }
|
||||
static size_t alignment_reserve_in_bytes() { return alignment_reserve() * HeapWordSize; }
|
||||
|
||||
@ -196,6 +196,7 @@ macro(Phi)
|
||||
macro(PopCountI)
|
||||
macro(PopCountL)
|
||||
macro(PowD)
|
||||
macro(PrefetchAllocation)
|
||||
macro(PrefetchRead)
|
||||
macro(PrefetchWrite)
|
||||
macro(Proj)
|
||||
|
||||
@ -1590,7 +1590,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
|
||||
prefetch_adr = new (C, 4) AddPNode( old_pf_wm, new_pf_wmt,
|
||||
_igvn.MakeConX(distance) );
|
||||
transform_later(prefetch_adr);
|
||||
prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
|
||||
prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
|
||||
transform_later(prefetch);
|
||||
distance += step_size;
|
||||
i_o = prefetch;
|
||||
@ -1611,13 +1611,14 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
|
||||
contended_phi_rawmem = pf_phi_rawmem;
|
||||
i_o = pf_phi_abio;
|
||||
} else if( UseTLAB && AllocatePrefetchStyle == 3 ) {
|
||||
// Insert a prefetch for each allocation only on the fast-path
|
||||
// Insert a prefetch for each allocation.
|
||||
// This code is used for Sparc with BIS.
|
||||
Node *pf_region = new (C, 3) RegionNode(3);
|
||||
Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY,
|
||||
TypeRawPtr::BOTTOM );
|
||||
|
||||
// Generate several prefetch instructions only for arrays.
|
||||
uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
|
||||
// Generate several prefetch instructions.
|
||||
uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
|
||||
uint step_size = AllocatePrefetchStepSize;
|
||||
uint distance = AllocatePrefetchDistance;
|
||||
|
||||
@ -1634,7 +1635,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
|
||||
transform_later(cache_adr);
|
||||
|
||||
// Prefetch
|
||||
Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr );
|
||||
Node *prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, cache_adr );
|
||||
prefetch->set_req(0, needgc_false);
|
||||
transform_later(prefetch);
|
||||
contended_phi_rawmem = prefetch;
|
||||
@ -1644,7 +1645,7 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
|
||||
prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr,
|
||||
_igvn.MakeConX(distance) );
|
||||
transform_later(prefetch_adr);
|
||||
prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr );
|
||||
prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, prefetch_adr );
|
||||
transform_later(prefetch);
|
||||
distance += step_size;
|
||||
contended_phi_rawmem = prefetch;
|
||||
@ -1653,15 +1654,15 @@ Node* PhaseMacroExpand::prefetch_allocation(Node* i_o, Node*& needgc_false,
|
||||
// Insert a prefetch for each allocation only on the fast-path
|
||||
Node *prefetch_adr;
|
||||
Node *prefetch;
|
||||
// Generate several prefetch instructions only for arrays.
|
||||
uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
|
||||
// Generate several prefetch instructions.
|
||||
uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
|
||||
uint step_size = AllocatePrefetchStepSize;
|
||||
uint distance = AllocatePrefetchDistance;
|
||||
for ( uint i = 0; i < lines; i++ ) {
|
||||
prefetch_adr = new (C, 4) AddPNode( old_eden_top, new_eden_top,
|
||||
_igvn.MakeConX(distance) );
|
||||
transform_later(prefetch_adr);
|
||||
prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
|
||||
prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
|
||||
// Do not let it float too high, since if eden_top == eden_end,
|
||||
// both might be null.
|
||||
if( i == 0 ) { // Set control for first prefetch, next follows it
|
||||
|
||||
@ -826,6 +826,7 @@ static void match_alias_type(Compile* C, Node* n, Node* m) {
|
||||
switch (n->Opcode()) {
|
||||
case Op_PrefetchRead:
|
||||
case Op_PrefetchWrite:
|
||||
case Op_PrefetchAllocation:
|
||||
nidx = Compile::AliasIdxRaw;
|
||||
nat = TypeRawPtr::BOTTOM;
|
||||
break;
|
||||
|
||||
@ -1278,6 +1278,16 @@ public:
|
||||
virtual int Opcode() const;
|
||||
virtual uint ideal_reg() const { return NotAMachineReg; }
|
||||
virtual uint match_edge(uint idx) const { return idx==2; }
|
||||
virtual const Type *bottom_type() const { return Type::ABIO; }
|
||||
};
|
||||
|
||||
// Allocation prefetch which may fault; the TLAB size has to be adjusted.
// The node's type depends on AllocatePrefetchStyle: Type::MEMORY when the
// style is 3, Type::ABIO otherwise (see bottom_type() below).
|
||||
class PrefetchAllocationNode : public Node {
|
||||
public:
|
||||
// Passes 0, mem and adr to the Node constructor, in that order.
PrefetchAllocationNode(Node *mem, Node *adr) : Node(0,mem,adr) {}
|
||||
virtual int Opcode() const;
|
||||
virtual uint ideal_reg() const { return NotAMachineReg; }
|
||||
// Only index 2 is a match edge (presumably the adr input, given the
// constructor argument order above).
virtual uint match_edge(uint idx) const { return idx==2; }
|
||||
virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; }
|
||||
};
|
||||
|
||||
|
||||
@ -2897,8 +2897,11 @@ class CommandLineFlags {
|
||||
product(intx, AllocatePrefetchDistance, -1, \
|
||||
"Distance to prefetch ahead of allocation pointer") \
|
||||
\
|
||||
product(intx, AllocatePrefetchLines, 1, \
|
||||
"Number of lines to prefetch ahead of allocation pointer") \
|
||||
product(intx, AllocatePrefetchLines, 3, \
|
||||
"Number of lines to prefetch ahead of array allocation pointer") \
|
||||
\
|
||||
product(intx, AllocateInstancePrefetchLines, 1, \
|
||||
"Number of lines to prefetch ahead of instance allocation pointer") \
|
||||
\
|
||||
product(intx, AllocatePrefetchStepSize, 16, \
|
||||
"Step size in bytes of sequential prefetch instructions") \
|
||||
|
||||
@ -46,6 +46,7 @@ const char* Abstract_VM_Version::_s_vm_release = Abstract_VM_Version::vm_release
|
||||
const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string();
|
||||
bool Abstract_VM_Version::_supports_cx8 = false;
|
||||
unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
|
||||
int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
|
||||
|
||||
#ifndef HOTSPOT_RELEASE_VERSION
|
||||
#error HOTSPOT_RELEASE_VERSION must be defined
|
||||
|
||||
@ -44,6 +44,7 @@ class Abstract_VM_Version: AllStatic {
|
||||
static bool _initialized;
|
||||
static int _parallel_worker_threads;
|
||||
static bool _parallel_worker_threads_initialized;
|
||||
static int _reserve_for_allocation_prefetch;
|
||||
|
||||
static unsigned int nof_parallel_worker_threads(unsigned int num,
|
||||
unsigned int dem,
|
||||
@ -77,6 +78,12 @@ class Abstract_VM_Version: AllStatic {
|
||||
return _logical_processors_per_package;
|
||||
}
|
||||
|
||||
// Space needed at the end of a TLAB for prefetch instructions
|
||||
// which may fault when accessing memory outside of the heap.
// Returns the value of _reserve_for_allocation_prefetch unchanged.
// NOTE(review): the unit looks like heap words (the SPARC setup divides a
// byte count by HeapWordSize before storing it) -- confirm for other ports.
|
||||
static int reserve_for_allocation_prefetch() {
|
||||
return _reserve_for_allocation_prefetch;
|
||||
}
|
||||
|
||||
// ARCH specific policy for the BiasedLocking
|
||||
static bool use_biased_locking() { return true; }
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user