8321003: RISC-V: C2 MulReductionVI

8321004: RISC-V: C2 MulReductionVL

Reviewed-by: fyang, rehn
2026-03-14 18:03:44 +00:00 · 2025-02-21 10:25:50 +00:00 · 2025-02-21 10:25:50 +00:00 · 1b6281d98c
commit 1b6281d98c
parent c73fead5ca
6 changed files with 124 additions and 1 deletions
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@ -2954,6 +2954,45 @@ void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
  vmv_x_s(dst, tmp);
 }

+// Multiply-reduce the lanes of src2 and fold in the scalar src1:
+//   dst = src1 * src2[0] * src2[1] * ... * src2[vector_length - 1]
+//
+// The lanes are combined pairwise in a tree: every step slides the upper half
+// of the still-live lanes down onto the lower half, halves vl and multiplies.
+//
+//   dst           - receives the scalar result
+//   src1          - scalar input that is multiplied into the result
+//   src2          - vector input; it is only read
+//   vtmp1, vtmp2  - vector temporaries, overwritten
+//   bt            - lane type: T_BYTE, T_SHORT, T_INT or T_LONG
+//   vector_length - number of lanes of src2 to reduce.
+//                   NOTE(review): the halving scheme only covers every lane when
+//                   this is a power of two >= 2 - confirm callers guarantee it.
+//   vm            - anything other than Assembler::unmasked selects the masked
+//                   form, which takes its mask from v0
+//
+// t0 may be overwritten (large slide amounts, and vsetvli_helper's default tmp).
+void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
+                                              VectorRegister vtmp1, VectorRegister vtmp2,
+                                              BasicType bt, uint vector_length, VectorMask vm) {
+  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
+
+  // vslidedown.vi encodes the slide amount in a 5-bit unsigned immediate, so
+  // amounts above 31 (possible with long sub-word vectors) must be passed in a
+  // scalar register via vslidedown.vx instead.
+  auto slide_down = [this](VectorRegister vd, VectorRegister vs, uint amount) {
+    if (amount <= 31) {
+      vslidedown_vi(vd, vs, amount);
+    } else {
+      mv(t0, amount);
+      vslidedown_vx(vd, vs, t0);
+    }
+  };
+
+  vsetvli_helper(bt, vector_length);
+
+  // Vector holding the lanes that enter the first reduction step.
+  VectorRegister lanes = src2;
+  if (vm != Assembler::unmasked) {
+    // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
+    //  If no elements are selected, an operation-specific identity value is returned.
+    //    If the operation is MUL, then the identity value is one.
+    // So lanes that are not selected by the mask are replaced with 1 before reducing.
+    vmv_v_i(vtmp1, 1);
+    vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
+    lanes = vtmp2;
+  }
+
+  // First step: multiply the upper half of the input onto its lower half.
+  vector_length /= 2;
+  slide_down(vtmp1, lanes, vector_length);
+  vsetvli_helper(bt, vector_length);
+  vmul_vv(vtmp1, vtmp1, lanes);
+
+  // Remaining steps: keep folding the partial products in vtmp1 until one lane is left.
+  while (vector_length > 1) {
+    vector_length /= 2;
+    slide_down(vtmp2, vtmp1, vector_length);
+    vsetvli_helper(bt, vector_length);
+    vmul_vv(vtmp1, vtmp1, vtmp2);
+  }
+
+  // Move lane 0 to the scalar register and fold in the scalar input.
+  vmv_x_s(dst, vtmp1);
+  if (bt == T_INT) {
+    mulw(dst, dst, src1);
+  } else {
+    mul(dst, dst, src1);
+  }
+}
+
 // Set vl and vtype for full and partial vector operations.
 // (vma = mu, vta = tu, vill = false)
 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
@ -239,6 +239,10 @@
                        int opc, BasicType bt, uint vector_length,
                        VectorMask vm = Assembler::unmasked);

+  void reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2, // dst = src1 * (product of the lanes of src2)
+                             VectorRegister vtmp1, VectorRegister vtmp2, BasicType bt, // vtmp1/vtmp2 are overwritten; bt is the lane type (T_BYTE/T_SHORT/T_INT/T_LONG)
+                             uint vector_length, VectorMask vm = Assembler::unmasked); // masked form: lanes not selected by v0 are treated as 1
+
  void vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul = Assembler::m1, Register tmp = t0);

  void compare_integral_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, int cond,
--- a/src/hotspot/cpu/riscv/riscv_v.ad
+++ b/src/hotspot/cpu/riscv/riscv_v.ad
@ -2,6 +2,7 @@
 // Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2020, 2023, Arm Limited. All rights reserved.
 // Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
+// Copyright (c) 2023, 2025, Rivos Inc. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@ -99,6 +100,12 @@ source %{
          return false;
        }
        break;
+      case Op_MulReductionVI:
+      case Op_MulReductionVL:
+        // When vlen < 4, our log2(vlen)-step implementation brings no performance gain.
+        if (vlen < 4) {
+          return false;
+        }
      default:
        break;
    }
@ -2427,6 +2434,67 @@ instruct vreduce_minD_masked(fRegD dst, fRegD src1, vReg src2, vRegMask_V0 v0, v
  ins_pipe(pipe_slow);
 %}

+
+// ------------------------------ Vector reduction mul -------------------------
+
+instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI isrc vsrc)); // unmasked MulReductionVI: scalar isrc combined with the lanes of vsrc
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); // tmp1/tmp2 are passed on as the vector temporaries of the macro-assembler routine
+  format %{ "reduce_mulI $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc)); // lane type and lane count come from vsrc; no mask argument, so the default Assembler::unmasked applies
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulI_masked(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVI (Binary isrc vsrc) v0)); // masked MulReductionVI: the mask operand is v0
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); // tmp1/tmp2 are passed on as the vector temporaries of the macro-assembler routine
+  format %{ "reduce_mulI_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc), // lane type and lane count come from vsrc
+                             Assembler::v0_t); // selects the masked path of reduce_mul_integral_v
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                     vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL isrc vsrc)); // unmasked MulReductionVL: scalar long isrc combined with the lanes of vsrc
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); // tmp1/tmp2 are passed on as the vector temporaries of the macro-assembler routine
+  format %{ "reduce_mulL $dst, $isrc, $vsrc\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc)); // lane type and lane count come from vsrc; no mask argument, so the default Assembler::unmasked applies
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc,
+                            vRegMask_V0 v0, vReg tmp1, vReg tmp2) %{
+  match(Set dst (MulReductionVL (Binary isrc vsrc) v0)); // masked MulReductionVL: the mask operand is v0
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); // tmp1/tmp2 are passed on as the vector temporaries of the macro-assembler routine
+  format %{ "reduce_mulL_masked $dst, $isrc, $vsrc, $v0\t" %}
+
+  ins_encode %{
+    __ reduce_mul_integral_v($dst$$Register, $isrc$$Register, as_VectorRegister($vsrc$$reg),
+                             as_VectorRegister($tmp1$$reg), as_VectorRegister($tmp2$$reg),
+                             Matcher::vector_element_basic_type(this, $vsrc), Matcher::vector_length(this, $vsrc), // lane type and lane count come from vsrc
+                             Assembler::v0_t); // selects the masked path of reduce_mul_integral_v
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // vector replicate

 instruct replicate(vReg dst, iRegIorL2I src) %{
--- a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -85,6 +85,10 @@ public class ProdRed_Int {
    @IR(applyIfCPUFeature = {"sse4.1", "true"},
        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
    public static int prodReductionImplement(int[] a, int[] b, int total) {
        for (int i = 0; i < a.length; i++) {
            total *= a[i] + b[i];
--- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java
@ -219,6 +219,10 @@ public class RedTest_int {
    @IR(applyIfCPUFeature = {"sse4.1", "true"},
        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
    public static int mulReductionImplement(
            int[] a,
            int[] b,
--- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java
@ -226,6 +226,10 @@ public class RedTest_long {
        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
        applyIfPlatform = {"64-bit", "true"},
        counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
+    @IR(applyIfPlatform = {"riscv64", "true"},
+        applyIfCPUFeature = {"rvv", "true"},
+        applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
+        counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
    public static long mulReductionImplement(
            long[] a,
            long[] b,