From c8bbcaf5de6982f673504a8dc766fb80bb6f0d07 Mon Sep 17 00:00:00 2001
From: Mohamed Issa
Date: Fri, 2 May 2025 17:21:50 +0000
Subject: [PATCH] 8348638: Performance regression in Math.tanh

Reviewed-by: jbhateja, epeter, sviswanathan
---
Review notes (ignored by git am, not part of the commit message):
- L_2TAG_PACKET_1_0_1 keeps "addl(rcx, 15344)". On entry rcx holds
  (high word of |x|) - 15344, which has wrapped around as an unsigned
  32-bit value, so without the addl the "cmpl(rcx, 16) / below" test can
  never succeed and the L_2TAG_PACKET_3_0_1 path becomes unreachable.
  Keeping the addl gives the test the same register state as before.
- TanhPerf.java gains documentation comments and reads the range bounds
  once per setup call instead of once per generated element.
- Hunk headers and the diffstat are updated to match these edits; the
  "index" blob ids are the upstream ones and do not describe the edited
  post-images.
- Leading whitespace inside context lines was reconstructed; applying may
  need "git apply --ignore-whitespace".

 .../cpu/x86/stubGenerator_x86_64_tanh.cpp     |  34 ++--
 .../org/openjdk/bench/java/lang/TanhPerf.java | 182 ++++++++++++++++++
 2 files changed, 199 insertions(+), 17 deletions(-)
 create mode 100644 test/micro/org/openjdk/bench/java/lang/TanhPerf.java

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_tanh.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_tanh.cpp
index d13809bfcd9..52ce2731b1f 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_tanh.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_tanh.cpp
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2024, Intel Corporation. All rights reserved.
+* Copyright (c) 2024, 2025, Intel Corporation. All rights reserved.
 * Intel Math Library (LIBM) Source Code
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -46,7 +46,7 @@
 // for |x| in [23/64,3*2^7)
 // e^{-2*|x|}=2^{-k-f}*2^{-r} ~ 2^{-k}*(Tn+Dn)*(1+p)=(T0+D0)*(1+p)
 //
-// For |x| in [2^{-4},2^5):
+// For |x| in [2^{-4},22):
 // 2^{-r}-1 ~ p=c1*r+c2*r^2+..+c5*r^5
 // Let R=1/(1+T0+p*T0), truncated to 35 significant bits
 // R=1/(1+T0+D0+p*(T0+D0))*(1+eps), |eps|<2^{-33}
@@ -66,11 +66,11 @@
 //
 // For |x|<2^{-64}: x is returned
 //
-// For |x|>=2^32: return +/-1
+// For |x|>=22: return +/-1
 //
 // Special cases:
 // tanh(NaN) = quiet NaN, and raise invalid exception
-// tanh(INF) = that INF
+// tanh(+/-INF) = +/-1
 // tanh(+/-0) = +/-0
 //
 /******************************************************************************/
@@ -324,6 +324,12 @@ address StubGenerator::generate_libmTanh() {
   __ enter(); // required for proper stackwalking of RuntimeStub frame
 
   __ bind(B1_2);
+  __ pextrw(rcx, xmm0, 3);
+  __ movl(rdx, 32768);
+  __ andl(rdx, rcx);
+  __ andl(rcx, 32767);
+  __ cmpl(rcx, 16438);
+  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_1); // Branch only if |x| >= 22
   __ movsd(xmm3, ExternalAddress(HALFMASK), r11 /*rscratch*/);
   __ xorpd(xmm4, xmm4);
   __ movsd(xmm1, ExternalAddress(L2E), r11 /*rscratch*/);
@@ -331,16 +337,12 @@ address StubGenerator::generate_libmTanh() {
   __ movl(rax, 32768);
   __ pinsrw(xmm4, rax, 3);
   __ movsd(xmm6, ExternalAddress(Shifter), r11 /*rscratch*/);
-  __ pextrw(rcx, xmm0, 3);
   __ andpd(xmm3, xmm0);
   __ andnpd(xmm4, xmm0);
   __ pshufd(xmm5, xmm4, 68);
-  __ movl(rdx, 32768);
-  __ andl(rdx, rcx);
-  __ andl(rcx, 32767);
   __ subl(rcx, 16304);
-  __ cmpl(rcx, 144);
-  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_1);
+  __ cmpl(rcx, 134);
+  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_1); // Branch only if |x| is not in [2^{-4},22)
   __ subsd(xmm4, xmm3);
   __ mulsd(xmm3, xmm1);
   __ mulsd(xmm2, xmm5);
@@ -427,8 +429,8 @@ address StubGenerator::generate_libmTanh() {
 
   __ bind(L_2TAG_PACKET_0_0_1);
   __ addl(rcx, 960);
-  __ cmpl(rcx, 1104);
-  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_1_0_1);
+  __ cmpl(rcx, 1094);
+  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_1_0_1); // Branch only if |x| not in [2^{-64}, 2^{-4})
   __ movdqu(xmm2, ExternalAddress(pv), r11 /*rscratch*/);
   __ pshufd(xmm1, xmm0, 68);
   __ movdqu(xmm3, ExternalAddress(pv + 16), r11 /*rscratch*/);
@@ -449,11 +451,9 @@ address StubGenerator::generate_libmTanh() {
   __ jmp(B1_4);
 
   __ bind(L_2TAG_PACKET_1_0_1);
   __ addl(rcx, 15344);
-  __ cmpl(rcx, 16448);
-  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_1);
   __ cmpl(rcx, 16);
-  __ jcc(Assembler::below, L_2TAG_PACKET_3_0_1);
+  __ jcc(Assembler::below, L_2TAG_PACKET_3_0_1); // Branch only if |x| is denormalized
   __ xorpd(xmm2, xmm2);
   __ movl(rax, 17392);
   __ pinsrw(xmm2, rax, 3);
@@ -468,7 +468,7 @@ address StubGenerator::generate_libmTanh() {
 
   __ bind(L_2TAG_PACKET_2_0_1);
   __ cmpl(rcx, 32752);
-  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_4_0_1);
+  __ jcc(Assembler::aboveEqual, L_2TAG_PACKET_4_0_1); // Branch only if |x| is INF or NaN
   __ xorpd(xmm2, xmm2);
   __ movl(rcx, 15344);
   __ pinsrw(xmm2, rcx, 3);
@@ -489,7 +489,7 @@ address StubGenerator::generate_libmTanh() {
   __ movdl(rcx, xmm2);
   __ orl(rcx, rax);
   __ cmpl(rcx, 0);
-  __ jcc(Assembler::equal, L_2TAG_PACKET_5_0_1);
+  __ jcc(Assembler::equal, L_2TAG_PACKET_5_0_1); // Branch only if |x| is not NaN
   __ addsd(xmm0, xmm0);
 
   __ bind(B1_4);
diff --git a/test/micro/org/openjdk/bench/java/lang/TanhPerf.java b/test/micro/org/openjdk/bench/java/lang/TanhPerf.java
new file mode 100644
index 00000000000..6747b6b2aa3
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/TanhPerf.java
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang;
+
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.util.Random;
+
+/**
+ * JMH micro-benchmarks for {@link Math#tanh(double)}.
+ *
+ * {@link TanhPerfRanges} feeds random inputs drawn from four magnitude
+ * ranges; {@link TanhPerfConstant} feeds compile-time constant inputs.
+ */
+public class TanhPerf {
+
+    /**
+     * Throughput of Math.tanh over random inputs from one of four ranges,
+     * selected by {@link #tanhRangeIndex}; each range is measured for both
+     * positive and negative arguments.
+     */
+    @Warmup(iterations = 3, time = 5, timeUnit = TimeUnit.MILLISECONDS)
+    @Measurement(iterations = 4, time = 5, timeUnit = TimeUnit.MILLISECONDS)
+    @Fork(2)
+    @BenchmarkMode(Mode.Throughput)
+    @State(Scope.Thread)
+    @OutputTimeUnit(TimeUnit.MILLISECONDS)
+    public static class TanhPerfRanges {
+        // Inputs per invocation; must stay equal to the literal passed to
+        // @OperationsPerInvocation on the benchmark methods below.
+        public static int tanhInputCount = 2048;
+
+        @Param({"0", "1", "2", "3"})
+        public int tanhRangeIndex;
+
+        public double [] tanhPosRandInputs;
+        public double [] tanhNegRandInputs;
+        public int tanhInputIndex = 0;
+        // {lower, upper} bound of |x| for each tanhRangeIndex value.
+        public double tanhRangeInputs[][] = {{0.0, 0x1.0P-55}, {0x1.0P-55, 1.0}, {1.0, 22.0}, {22.1, 1.7976931348623157E308} };
+
+        /**
+         * Fills the positive and negative input arrays with values drawn from
+         * the selected range, using a fixed seed so that runs are repeatable.
+         */
+        @Setup
+        public void setupValues() {
+            Random random = new Random(1023);
+
+            // The selected range does not change inside the loop; read it once.
+            double tanhLowerBound = tanhRangeInputs[tanhRangeIndex][0];
+            double tanhUpperBound = tanhRangeInputs[tanhRangeIndex][1];
+
+            // Fill the positive and negative tanh vectors with random values
+            tanhPosRandInputs = new double[tanhInputCount];
+            tanhNegRandInputs = new double[tanhInputCount];
+
+            for (int i = 0; i < tanhInputCount; i++) {
+                tanhPosRandInputs[i] = random.nextDouble(tanhLowerBound, tanhUpperBound);
+                tanhNegRandInputs[i] = random.nextDouble(-tanhUpperBound, -tanhLowerBound);
+            }
+        }
+
+        /** Sums Math.tanh over every positive input and returns the sum. */
+        @Benchmark
+        @OperationsPerInvocation(2048)
+        public double tanhPosRangeDouble() {
+            double res = 0.0;
+            for (int i = 0; i < tanhInputCount; i++) {
+                res += Math.tanh(tanhPosRandInputs[i]);
+            }
+            return res;
+        }
+
+        /** Sums Math.tanh over every negative input and returns the sum. */
+        @Benchmark
+        @OperationsPerInvocation(2048)
+        public double tanhNegRangeDouble() {
+            double res = 0.0;
+            for (int i = 0; i < tanhInputCount; i++) {
+                res += Math.tanh(tanhNegRandInputs[i]);
+            }
+            return res;
+        }
+    }
+
+    /**
+     * Throughput of Math.tanh on constant arguments: 2^-57, 2^-54, 1, 21 and
+     * 23, i.e. values on either side of the 2^-55 and 22 range boundaries
+     * used by TanhPerfRanges.
+     */
+    @Warmup(iterations = 3, time = 5, timeUnit = TimeUnit.SECONDS)
+    @Measurement(iterations = 4, time = 5, timeUnit = TimeUnit.SECONDS)
+    @Fork(2)
+    @BenchmarkMode(Mode.Throughput)
+    @State(Scope.Thread)
+    @OutputTimeUnit(TimeUnit.MILLISECONDS)
+    public static class TanhPerfConstant {
+        public static final double constDoubleTiny = 0x1.0P-57;
+        public static final double constDoubleSmall = 0x1.0P-54;
+        public static final double constDouble1 = 1.0;
+        public static final double constDouble21 = 21.0;
+        public static final double constDoubleLarge = 23.0;
+
+        @Benchmark
+        public double tanhConstDoubleTiny() {
+            return Math.tanh(constDoubleTiny);
+        }
+
+        @Benchmark
+        public double tanhConstDoubleSmall() {
+            return Math.tanh(constDoubleSmall);
+        }
+
+        @Benchmark
+        public double tanhConstDouble1() {
+            return Math.tanh(constDouble1);
+        }
+
+        @Benchmark
+        public double tanhConstDouble21() {
+            return Math.tanh(constDouble21);
+        }
+
+        @Benchmark
+        public double tanhConstDoubleLarge() {
+            return Math.tanh(constDoubleLarge);
+        }
+    }
+
+    /** Runs the TanhPerfRanges benchmarks, then the TanhPerfConstant benchmarks. */
+    public static void main(String[] args) throws RunnerException {
+        Options opt = new OptionsBuilder()
+                .include(TanhPerfRanges.class.getSimpleName())
+                .build();
+
+        new Runner(opt).run();
+
+        opt = new OptionsBuilder()
+                .include(TanhPerfConstant.class.getSimpleName())
+                .build();
+
+        new Runner(opt).run();
+    }
+}