From 1b6281d98cf0e7c5435c563bfedd6f07b79bfa62 Mon Sep 17 00:00:00 2001
From: Hamlin Li
Date: Fri, 21 Feb 2025 10:25:50 +0000
Subject: [PATCH] 8321003: RISC-V: C2 MulReductionVI 8321004: RISC-V: C2 MulReductionVL

Reviewed-by: fyang, rehn
---
 .../cpu/riscv/c2_MacroAssembler_riscv.cpp     | 45 +++++++++++
 .../cpu/riscv/c2_MacroAssembler_riscv.hpp     |  4 ++
 src/hotspot/cpu/riscv/riscv_v.ad              | 69 +++++++++++++++++++
 .../loopopts/superword/ProdRed_Int.java       |  6 +-
 .../loopopts/superword/RedTest_int.java       |  4 ++
 .../loopopts/superword/RedTest_long.java      |  4 ++
 6 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index c23a574e401..34a61177774 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -2954,6 +2954,51 @@ void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
   vmv_x_s(dst, tmp);
 }
 
+// Multiply-reduce the lanes of src2 and fold the scalar src1 into the result:
+//   dst = src1 * src2[0] * src2[1] * ... * src2[vector_length - 1]
+// The vector is folded in halves: the upper half is slid down next to the lower
+// half and the two are multiplied element-wise, halving the active length on
+// every step until one lane is left. vtmp1 and vtmp2 are overwritten.
+// When vm is not Assembler::unmasked, the mask is expected in v0 (see below).
+void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
+                                              VectorRegister vtmp1, VectorRegister vtmp2,
+                                              BasicType bt, uint vector_length, VectorMask vm) {
+  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
+  vsetvli_helper(bt, vector_length);
+
+  vector_length /= 2; // first fold: upper half times lower half
+  if (vm != Assembler::unmasked) {
+    // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
+    //  If no elements are selected, an operation-specific identity value is returned.
+    //    If the operation is MUL, then the identity value is one.
+    vmv_v_i(vtmp1, 1);
+    vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
+    vslidedown_vi(vtmp1, vtmp2, vector_length);
+
+    vsetvli_helper(bt, vector_length); // only the lower half stays active
+    vmul_vv(vtmp1, vtmp1, vtmp2);
+  } else {
+    vslidedown_vi(vtmp1, src2, vector_length);
+
+    vsetvli_helper(bt, vector_length); // only the lower half stays active
+    vmul_vv(vtmp1, vtmp1, src2);
+  }
+
+  while (vector_length > 1) { // keep folding the partial products held in vtmp1
+    vector_length /= 2;
+    vslidedown_vi(vtmp2, vtmp1, vector_length);
+    vsetvli_helper(bt, vector_length);
+    vmul_vv(vtmp1, vtmp1, vtmp2);
+  }
+
+  vmv_x_s(dst, vtmp1); // element 0 now holds the product of all lanes
+  if (bt == T_INT) {
+    mulw(dst, dst, src1);
+  } else {
+    mul(dst, dst, src1);
+  }
+}
+
 // Set vl and vtype for full and partial vector operations.
 // (vma = mu, vta = tu, vill = false)
 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
index 114ad0a101c..c79c360d2eb 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
@@ -239,6 +239,10 @@
                          int opc, BasicType bt, uint vector_length,
                          VectorMask vm = Assembler::unmasked);
 
+  void reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
+                             VectorRegister vtmp1, VectorRegister vtmp2, BasicType bt,
+                             uint vector_length, VectorMask vm = Assembler::unmasked);
+
   void vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul = Assembler::m1, Register tmp = t0);
 
   void compare_integral_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, int cond,
diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad
index 7be169ef709..9892d2b9c03 100644
--- a/src/hotspot/cpu/riscv/riscv_v.ad
+++ b/src/hotspot/cpu/riscv/riscv_v.ad
@@ -2,6 +2,7 @@
 // Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
 // Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
+// Copyright (c) 2023, 2025, Rivos Inc. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -99,6 +100,13 @@ source %{
           return false;
         }
         break;
+      case Op_MulReductionVI:
+      case Op_MulReductionVL:
+        // When vlen < 4, our log2(vlen) implementation does not help to gain performance improvement.
+        if (vlen < 4) {
+          return false;
+        }
+        break; // explicit, like the cases above; do not fall through into default
       default:
         break;
     }
@@ -2427,6 +2435,67 @@ instruct vreduce_minD_masked(fRegD dst, fRegD src1, vReg src2, vRegMask_V0 v0, v
   ins_pipe(pipe_slow);
 %}
 
+
+// ------------------------------ Vector reduction mul -------------------------
+
+instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulI $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulI_masked(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI (Binary isrc vsrc) v0));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulI_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc),
+                             Assembler::v0_t);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulL $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL (Binary isrc vsrc) v0));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_mulL_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc),
+                             Assembler::v0_t);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector replicate
 
 instruct replicate(vReg dst, iRegIorL2I src) %{
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
index 17f3a97a8e8..ebc8251e025 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -85,6 +85,10 @@ public class ProdRed_Int {
     @IR(applyIfCPUFeature = {"sse4.1", "true"},
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static int prodReductionImplement(int[] a, int[] b, int total) {
         for (int i = 0; i < a.length; i++) {
             total *= a[i] + b[i];
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
index d4b6777ded8..5cf7077cf17 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
@@ -219,6 +219,10 @@ public class RedTest_int {
     @IR(applyIfCPUFeature = {"sse4.1", "true"},
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static int mulReductionImplement(
             int[] a,
             int[] b,
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
index 63228330ed5..10cd32bbbc7 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
@@ -226,6 +226,10 @@ public class RedTest_long {
         applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
         applyIfPlatform = {"64-bit", "true"},
         counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
     public static long mulReductionImplement(
             long[] a,
             long[] b,