From 185d933bb9c9020dbc35daf7e664562fda538b5d Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Wed, 27 May 2026 04:56:50 +0000 Subject: [PATCH] 8376602: [Vector API] Upgrade SLEEF from 3.6.1 to 3.9.0 Reviewed-by: psandoz, fyang, erikj --- make/UpdateSleefSource.gmk | 18 +- src/jdk.incubator.vector/linux/legal/sleef.md | 4 +- .../unix/native/libsleef/README.md | 8 +- .../unix/native/libsleef/generated/misc.h | 46 +- .../libsleef/generated/sleefinline_advsimd.h | 42 +- .../libsleef/generated/sleefinline_rvvm1.h | 408 +- .../libsleef/generated/sleefinline_sve.h | 46 +- .../native/libsleef/upstream/CHANGELOG.md | 49 + .../native/libsleef/upstream/CMakeLists.txt | 104 +- .../native/libsleef/upstream/CONTRIBUTORS.md | 27 - .../native/libsleef/upstream/Configure.cmake | 290 +- .../unix/native/libsleef/upstream/Jenkinsfile | 247 + .../unix/native/libsleef/upstream/README.adoc | 129 + .../unix/native/libsleef/upstream/README.md | 221 - .../libsleef/upstream/include/sleefdft.h | 11 +- .../libsleef/upstream/sleef-config.h.in | 5 + .../libsleef/upstream/src/CMakeLists.txt | 10 +- .../libsleef/upstream/src/arch/helperavx.h | 14 +- .../libsleef/upstream/src/arch/helperavx2.h | 16 +- .../upstream/src/arch/helperavx2_128.h | 12 +- .../upstream/src/arch/helperavx512f.h | 4 +- .../libsleef/upstream/src/arch/helperneon32.h | 6 +- .../upstream/src/arch/helperpower_128.h | 28 +- .../libsleef/upstream/src/arch/helperpurec.h | 2 +- .../upstream/src/arch/helperpurec_scalar.h | 4 +- .../libsleef/upstream/src/arch/helperrvv.h | 15 +- .../upstream/src/arch/helpers390x_128.h | 34 +- .../libsleef/upstream/src/arch/helpersse2.h | 12 +- .../libsleef/upstream/src/arch/helpersve.h | 4 +- .../libsleef/upstream/src/arch/helpervecext.h | 2 +- .../upstream/src/common/CMakeLists.txt | 49 +- .../libsleef/upstream/src/common/addSuffix.c | 19 +- .../libsleef/upstream/src/common/arraymap.c | 347 -- .../libsleef/upstream/src/common/arraymap.h | 21 - .../libsleef/upstream/src/common/common.c | 2 +- 
.../libsleef/upstream/src/common/common.h | 15 +- .../upstream/src/common/commonfuncs.h | 4 +- .../native/libsleef/upstream/src/common/dd.h | 2 +- .../native/libsleef/upstream/src/common/df.h | 2 +- .../libsleef/upstream/src/common/estrin.h | 2 +- .../libsleef/upstream/src/common/f128util.h | 92 - .../upstream/src/common/main_checkfeature.c | 2 +- .../libsleef/upstream/src/common/misc.h | 24 +- .../libsleef/upstream/src/common/psha2.hpp | 182 + .../upstream/src/common/psha2_capi.cpp | 57 + .../libsleef/upstream/src/common/psha2_capi.h | 30 + .../src/{quad-tester => common}/qtesterutil.c | 66 +- .../src/{quad-tester => common}/qtesterutil.h | 33 +- .../libsleef/upstream/src/common/quaddef.h | 18 +- .../upstream/src/common/test_psha2.cpp | 58 + .../src/{libm-tester => common}/testerutil.c | 35 +- .../libsleef/upstream/src/common/testerutil.h | 144 + .../upstream/src/dft-tester/CMakeLists.txt | 56 + .../upstream/src/dft-tester/bench1d.c | 116 - .../upstream/src/dft-tester/dftbench.cpp | 404 ++ .../upstream/src/dft-tester/fftwtest1d.c | 50 +- .../upstream/src/dft-tester/fftwtest2d.c | 26 +- .../upstream/src/dft-tester/measuredft.c | 2 +- .../upstream/src/dft-tester/naivetest.c | 94 +- .../upstream/src/dft-tester/roundtriptest1d.c | 7 +- .../upstream/src/dft-tester/roundtriptest2d.c | 13 +- .../src/dft-tester/test_dftplanner.cpp | 168 + .../upstream/src/dft-tester/tutorial.c | 2 +- .../libsleef/upstream/src/dft/CMakeLists.txt | 172 +- .../native/libsleef/upstream/src/dft/compat.h | 45 + .../native/libsleef/upstream/src/dft/dft.c | 1441 ----- .../native/libsleef/upstream/src/dft/dft.cpp | 1491 +++++ .../libsleef/upstream/src/dft/dftcommon.c | 423 -- .../libsleef/upstream/src/dft/dftcommon.cpp | 517 ++ .../libsleef/upstream/src/dft/dftcommon.h | 69 - .../libsleef/upstream/src/dft/dftcommon.hpp | 237 + .../libsleef/upstream/src/dft/mkdispatch.c | 116 +- .../libsleef/upstream/src/dft/mkunroll.c | 55 +- .../libsleef/upstream/src/dft/serializer.hpp | 145 + 
.../src/dft/{unroll0.org => unroll0.cpp.in} | 46 +- .../libsleef/upstream/src/dft/unroll1.cpp.in | 4868 +++++++++++++++++ .../src/dft/{vectortype.h => vectortype.hpp} | 10 +- .../libsleef/upstream/src/gencoef/gencoef.c | 11 +- .../libsleef/upstream/src/gencoef/simplexfr.c | 17 +- .../upstream/src/libm-benchmarks/Makefile | 153 - .../src/libm-benchmarks/ProcessData.java | 193 - .../upstream/src/libm-benchmarks/bench.h | 58 - .../upstream/src/libm-benchmarks/benchsleef.c | 144 - .../src/libm-benchmarks/benchsleef128.c | 195 - .../src/libm-benchmarks/benchsleef256.c | 181 - .../src/libm-benchmarks/benchsleef512.c | 180 - .../upstream/src/libm-benchmarks/benchsvml.c | 153 - .../src/libm-benchmarks/benchsvml128.c | 144 - .../src/libm-benchmarks/benchsvml256.c | 147 - .../src/libm-benchmarks/benchsvml512.c | 144 - .../upstream/src/libm-benchmarks/measure.sh | 17 - .../upstream/src/libm-tester/CMakeLists.txt | 423 +- .../upstream/src/libm-tester/autovec.c | 2 +- .../src/libm-tester/gnuabi_compatibility.c | 106 +- .../upstream/src/libm-tester/hash_cinz.txt | 258 +- .../upstream/src/libm-tester/hash_finz.txt | 258 +- .../libsleef/upstream/src/libm-tester/iut.c | 2 +- .../upstream/src/libm-tester/iutcuda.cu | 176 +- .../upstream/src/libm-tester/iutsimd.c | 268 +- .../upstream/src/libm-tester/tester.c | 386 +- .../upstream/src/libm-tester/tester2dp.c | 38 +- .../upstream/src/libm-tester/tester2ld.c | 2 +- .../upstream/src/libm-tester/tester2qp.c | 20 +- .../upstream/src/libm-tester/tester2simddp.c | 38 +- .../upstream/src/libm-tester/tester2simdsp.c | 44 +- .../upstream/src/libm-tester/tester2sp.c | 44 +- .../upstream/src/libm-tester/tester3.c | 222 +- .../upstream/src/libm-tester/tester4simd.cpp | 2601 +++++++++ .../upstream/src/libm-tester/testerutil.h | 100 - .../libsleef/upstream/src/libm/CMakeLists.txt | 15 +- .../libsleef/upstream/src/libm/dispatcher.h | 127 +- .../libsleef/upstream/src/libm/dispavx.c.org | 9 +- .../upstream/src/libm/disppower_128.c.org | 3 +- 
.../upstream/src/libm/disps390x_128.c.org | 14 +- .../upstream/src/libm/dispscalar.c.org | 3 +- .../libsleef/upstream/src/libm/dispsse.c.org | 9 +- .../libsleef/upstream/src/libm/funcproto.h | 2 +- .../libsleef/upstream/src/libm/mkalias.c | 2 +- .../libsleef/upstream/src/libm/mkdisp.c | 2 +- .../upstream/src/libm/mkmasked_gnuabi.c | 2 +- .../libsleef/upstream/src/libm/mkrename.c | 3 +- .../upstream/src/libm/mkrename_gnuabi.c | 2 +- .../libsleef/upstream/src/libm/norename.h | 2 +- .../libsleef/upstream/src/libm/rempitab.c | 2 +- .../libsleef/upstream/src/libm/rename.h | 2 +- .../libsleef/upstream/src/libm/sleefdp.c | 9 +- .../src/libm/sleefinline_cuda_header.h.org | 2 +- ...ader.h.org => sleefinline_header.h.org.in} | 7 +- .../libsleef/upstream/src/libm/sleefld.c | 2 +- .../src/libm/sleeflibm_header.h.org.in | 28 +- .../libsleef/upstream/src/libm/sleefqp.c | 2 +- .../libsleef/upstream/src/libm/sleefsimddp.c | 22 +- .../libsleef/upstream/src/libm/sleefsimdsp.c | 18 +- .../libsleef/upstream/src/libm/sleefsp.c | 13 +- .../libsleef/upstream/src/libm/tryvxe2.c | 8 - .../upstream/src/quad-tester/CMakeLists.txt | 405 +- .../upstream/src/quad-tester/hash_printf.txt | 8 +- .../upstream/src/quad-tester/qiutcuda.cu | 324 +- .../upstream/src/quad-tester/qiutsimd.c | 538 +- .../upstream/src/quad-tester/qtester.c | 722 +-- .../upstream/src/quad-tester/qtester4simd.cpp | 1308 +++++ .../upstream/src/quad-tester/tester2printf.c | 2 +- .../upstream/src/quad-tester/tester2simdqp.c | 2 +- .../upstream/src/quad-tester/tester3printf.c | 72 +- .../libsleef/upstream/src/quad/CMakeLists.txt | 11 +- .../libsleef/upstream/src/quad/qdispatcher.h | 131 +- .../upstream/src/quad/qdispscalar.c.org | 3 +- .../libsleef/upstream/src/quad/qdispx2.c.org | 3 +- .../libsleef/upstream/src/quad/qfuncproto.h | 2 +- .../libsleef/upstream/src/quad/qmkdisp.c | 2 +- .../libsleef/upstream/src/quad/qmkrename.c | 5 +- .../src/quad/sleefquad_header.h.org.in | 14 +- ...g => sleefquadinline_cuda_header.h.org.in} | 5 
+- ....h.org => sleefquadinline_header.h.org.in} | 5 +- .../libsleef/upstream/src/quad/sleefsimdqp.c | 24 +- .../upstream/toolchains/aarch64-gcc.cmake | 13 +- .../upstream/toolchains/armhf-gcc.cmake | 13 +- .../upstream/toolchains/native-gcc.cmake | 3 +- .../upstream/toolchains/native-llvm.cmake | 3 +- .../upstream/toolchains/ppc64el-gcc.cmake | 12 +- .../upstream/toolchains/ppc64el-llvm.cmake | 15 +- .../upstream/toolchains/s390x-gcc.cmake | 10 +- .../upstream/toolchains/s390x-llvm.cmake | 11 +- .../libsleef/upstream/winbuild-clang.bat | 29 + .../libsleef/upstream/winbuild-msvc.bat | 21 + ...tVectorLibrarySleefUnaryOpAndBinaryOp.java | 161 + .../vector/VectorTranscendentalBenchmark.java | 364 ++ 167 files changed, 17283 insertions(+), 7588 deletions(-) delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/CONTRIBUTORS.md create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/README.adoc delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/README.md delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/arraymap.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/arraymap.h delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/f128util.h create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2.hpp create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.cpp create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.h rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/{quad-tester => common}/qtesterutil.c (93%) rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/{quad-tester => common}/qtesterutil.h (82%) create mode 100644 
src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/test_psha2.cpp rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/{libm-tester => common}/testerutil.c (90%) create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/testerutil.h delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/bench1d.c create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/dftbench.cpp create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/test_dftplanner.cpp create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/compat.h delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.c create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.cpp delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.c create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.cpp delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.h create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.hpp create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/serializer.hpp rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/{unroll0.org => unroll0.cpp.in} (99%) create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll1.cpp.in rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/{vectortype.h => vectortype.hpp} (97%) delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/Makefile delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/ProcessData.java delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/bench.h delete mode 100644 
src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef128.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef256.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef512.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml128.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml256.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml512.c delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/measure.sh create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester4simd.cpp delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/testerutil.h rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/{sleefinline_header.h.org => sleefinline_header.h.org.in} (99%) delete mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/tryvxe2.c create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtester4simd.cpp rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/{sleefquadinline_cuda_header.h.org => sleefquadinline_cuda_header.h.org.in} (99%) rename src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/{sleefquadinline_header.h.org => sleefquadinline_header.h.org.in} (99%) create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-clang.bat create mode 100644 src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-msvc.bat create mode 100644 
test/hotspot/jtreg/compiler/vectorapi/TestVectorLibrarySleefUnaryOpAndBinaryOp.java create mode 100644 test/micro/org/openjdk/bench/jdk/incubator/vector/VectorTranscendentalBenchmark.java diff --git a/make/UpdateSleefSource.gmk b/make/UpdateSleefSource.gmk index d7b8f8e141b..38483f0ec68 100644 --- a/make/UpdateSleefSource.gmk +++ b/make/UpdateSleefSource.gmk @@ -1,5 +1,5 @@ # -# Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved. # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. # # This code is free software; you can redistribute it and/or modify it @@ -48,7 +48,7 @@ ifneq ($(OPENJDK_BUILD_OS), linux) endif SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef -SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef +SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/unix/native/libsleef SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native @@ -82,7 +82,12 @@ $(eval $(call SetupExecute, sleef_native_config, \ INFO := Configuring native sleef build, \ OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \ WORKING_DIR := $(SLEEF_SOURCE_DIR), \ - COMMAND := $(CMAKE) -S . -B $(SLEEF_NATIVE_BUILD_DIR), \ + COMMAND := $(CMAKE) -S . 
-B $(SLEEF_NATIVE_BUILD_DIR) \ + -DCMAKE_INSTALL_PREFIX=$(SLEEF_NATIVE_BUILD_DIR) \ + -DSLEEF_BUILD_TESTS=OFF \ + -DSLEEF_DISABLE_SSL=ON \ + -DSLEEF_ENABLE_TLFLOAT=OFF \ + -DSLEEF_ENABLE_TESTER4=OFF, \ )) TARGETS := $(sleef_native_config) @@ -106,6 +111,11 @@ $(eval $(call SetupExecute, sleef_cross_config, \ -DCMAKE_C_COMPILER=$(CC) \ -DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \ -DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \ + -DCMAKE_INSTALL_PREFIX=$(SLEEF_CROSS_BUILD_DIR) \ + -DSLEEF_BUILD_TESTS=OFF \ + -DSLEEF_DISABLE_SSL=ON \ + -DSLEEF_ENABLE_TLFLOAT=OFF \ + -DSLEEF_ENABLE_TESTER4=OFF \ -DSLEEF_BUILD_INLINE_HEADERS=TRUE \ $(EXTRA_CROSS_OPTIONS), \ )) @@ -139,7 +149,7 @@ $(eval $(call SetupCopyFiles, copy_generated_sleef_source, \ DEST := $(SLEEF_TARGET_DIR), \ )) -TARGETS := $(copy_generated_sleef_source) +TARGETS := $(copy_static_sleef_source) $(copy_generated_sleef_source) ################################################################################ diff --git a/src/jdk.incubator.vector/linux/legal/sleef.md b/src/jdk.incubator.vector/linux/legal/sleef.md index ad4c4cba790..5bade94a0c8 100644 --- a/src/jdk.incubator.vector/linux/legal/sleef.md +++ b/src/jdk.incubator.vector/linux/legal/sleef.md @@ -1,8 +1,8 @@ -## SLEEF v3.6.1 +## SLEEF v3.9.0 ### Notice ``` -Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors +Copyright © 2010-2025 SLEEF Project, Naoki Shibata and contributors ------- src/arch/helpersve.h has the following copyright: diff --git a/src/jdk.incubator.vector/unix/native/libsleef/README.md b/src/jdk.incubator.vector/unix/native/libsleef/README.md index ef838c2949b..cd4c3d4d64e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/README.md +++ b/src/jdk.incubator.vector/unix/native/libsleef/README.md @@ -4,15 +4,15 @@ This directory contains the source code for the SLEEF library, the **SIMD Library for Evaluating Elementary Functions**. For more information on SLEEF, see https://sleef.org/. 
-The currently imported libsleef sources is version 3.6.1, which has -git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`. +The currently imported libsleef sources are version 3.9.0, which has +git tag `3.9.0` and git commit hash `906ca7512ee483296780a81a21b9ca715d40dfe1`. # About the libsleef integration in the JDK The upstream original source code is available in `src/jdk.incubator.vector/unix/native/libsleef/upstream`. However, this code is not directly usable in the JDK build system, but is instead used as the base for -the generation of additional souce code files. This generation is done by +the generation of additional source code files. This generation is done by the libsleef CMake files. If this should have been done at build time, it would have meant adding CMake as a required dependency to build the JDK. @@ -25,7 +25,7 @@ the JDK source tree. The generated files reside in To update the version of libsleef that is used in the JDK, clone `https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`, -`.github` and `.git` directories, into +`.github` and `.git` directories, and the `.nojekyll` file, into `src/jdk.incubator.vector/unix/native/libsleef/upstream`. The libsleef source code does not follow the JDK whitespace rules as enforced by diff --git a/src/jdk.incubator.vector/unix/native/libsleef/generated/misc.h b/src/jdk.incubator.vector/unix/native/libsleef/generated/misc.h index 472cae68bd5..8d972e24164 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/generated/misc.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/generated/misc.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -13,10 +13,15 @@ #include #endif + #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884 #endif +#ifndef M_PIf +# define M_PIf ((float)M_PI) +#endif + #ifndef M_PIl #define M_PIl 3.141592653589793238462643383279502884L #endif @@ -137,9 +142,17 @@ #define L2Lf 1.428606765330187045e-06f #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f -#ifndef M_PIf -# define M_PIf ((float)M_PI) -#endif + +// Overflow bounds + +// - exp(x) overflows for x over (also used in pow) +#define LOG_DBL_MAX 0x1.62e42fefa39efp+9 /* 709.782712893384 */ + +// Other bounds + +// - log1p(f)(x) approximation holds up to x equals +#define LOG1PF_BOUND 0x1.2ced32p+126 /* 1.0e+38 */ +#define LOG1P_BOUND 0x1.c7b1f3cac7433p+1019 /* 1.0e+307 */ // @@ -183,17 +196,13 @@ typedef struct { } Sleef_longdouble2; #endif -#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER) +#if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER) #define LIKELY(condition) __builtin_expect(!!(condition), 1) #define UNLIKELY(condition) __builtin_expect(!!(condition), 0) #define RESTRICT __restrict__ -#ifndef __arm__ #define ALIGNED(x) __attribute__((aligned(x))) -#else -#define ALIGNED(x) -#endif #if defined(SLEEF_GENHEADER) @@ -229,7 +238,7 @@ typedef struct { #define SLEEF_INFINITYf __builtin_inff() #define SLEEF_INFINITYl __builtin_infl() -#if defined(__INTEL_COMPILER) || defined (__clang__) +#if defined (__clang__) #define SLEEF_INFINITYq __builtin_inf() #define SLEEF_NANq __builtin_nan("") #else @@ -237,7 +246,7 @@ typedef struct { #define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq) #endif -#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER) +#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER) #if 
defined(SLEEF_GENHEADER) @@ -249,6 +258,9 @@ typedef struct { #else // #if defined(SLEEF_GENHEADER) #define INLINE __forceinline +#ifdef CONST +#undef CONST +#endif #define CONST #ifndef SLEEF_STATIC_LIBS #define EXPORT __declspec(dllexport) @@ -265,7 +277,7 @@ typedef struct { #define LIKELY(condition) (condition) #define UNLIKELY(condition) (condition) -#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER) +#if (defined(__GNUC__) || defined(__CLANG__)) && defined(__x86_64__) && !defined(SLEEF_GENHEADER) #include #endif @@ -294,7 +306,7 @@ typedef struct { #endif #endif -#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER) +#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER) #if !defined(__linux__) #define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf) @@ -305,15 +317,9 @@ typedef struct { #endif // #ifndef __MISC_H__ -#ifdef ENABLE_AAVPCS -#define VECTOR_CC __attribute__((aarch64_vector_pcs)) -#else -#define VECTOR_CC -#endif - // -#if defined (__GNUC__) && !defined(__INTEL_COMPILER) +#if defined (__GNUC__) #pragma GCC diagnostic ignored "-Wpragmas" #pragma GCC diagnostic ignored "-Wunknown-pragmas" #if !defined (__clang__) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_advsimd.h b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_advsimd.h index 7e02768cf1e..9b6a869a8b8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_advsimd.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_advsimd.h @@ -1,8 +1,11 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) -// This file is generated by SLEEF 3.6.1 +// This file is generated by SLEEF 3.9.0 + +/* #undef SLEEF_FLOAT128_IS_IEEEQP */ +#define SLEEF_LONGDOUBLE_IS_IEEEQP #ifndef SLEEF_ALWAYS_INLINE #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) @@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = { 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, + 0, 0, 0, 0, }; static const float Sleef_rempitabsp[] = { @@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = { 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, + 0, 0, 0, 0, }; #endif // #ifndef __SLEEF_REMPITAB__ -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -3294,7 +3291,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_expd2_u10advsimd(vdouble_ad u = vldexp2_vd_vd_vi_advsimd_sleef(u, q); - u = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(d, 
vcast_vd_d_advsimd_sleef(709.78271114955742909217217426)), vcast_vd_d_advsimd_sleef(__builtin_inf()), u); + vopmask_advsimd_sleef o = vgt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(0x1.62e42fefa39efp+9)); + u = vsel_vd_vo_vd_vd_advsimd_sleef(o, vcast_vd_d_advsimd_sleef(__builtin_inf()), u); u = vreinterpret_vd_vm_advsimd_sleef(vandnot_vm_vo64_vm_advsimd_sleef(vlt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1000)), vreinterpret_vm_vd_advsimd_sleef(u))); return u; @@ -3411,13 +3409,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_advsimd_sleef expk_advsimd_sleef( } SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_powd2_u10advsimd(vdouble_advsimd_sleef x, vdouble_advsimd_sleef y) { - vopmask_advsimd_sleef yisint = visint_vo_vd_advsimd_sleef(y); vopmask_advsimd_sleef yisodd = vand_vo_vo_vo_advsimd_sleef(visodd_vo_vd_advsimd_sleef(y), yisint); vdouble2_advsimd_sleef d = ddmul_vd2_vd2_vd_advsimd_sleef(logk_advsimd_sleef(vabs_vd_vd_advsimd_sleef(x)), y); vdouble_advsimd_sleef result = expk_advsimd_sleef(d); - result = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(709.78271114955742909217217426)), vcast_vd_d_advsimd_sleef(__builtin_inf()), result); + vopmask_advsimd_sleef o = vgt_vo_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(0x1.62e42fefa39efp+9)); + result = vsel_vd_vo_vd_vd_advsimd_sleef(o, vcast_vd_d_advsimd_sleef(__builtin_inf()), result); result = vmul_vd_vd_vd_advsimd_sleef(result, vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)), @@ -3443,7 +3441,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_powd2_u10advsimd(vdouble_ad result = vsel_vd_vo_vd_vd_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(y, vcast_vd_d_advsimd_sleef(0)), veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(1))), vcast_vd_d_advsimd_sleef(1), result); return result; - } static SLEEF_ALWAYS_INLINE 
SLEEF_CONST vdouble2_advsimd_sleef expk2_advsimd_sleef(vdouble2_advsimd_sleef d) { @@ -3931,7 +3928,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_log1pd2_u10advsimd(vdouble_ vdouble_advsimd_sleef r = vadd_vd_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(s), vd2gety_vd_vd2_advsimd_sleef(s)); - r = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(1e+307)), vcast_vd_d_advsimd_sleef(__builtin_inf()), r); + vopmask_advsimd_sleef ocore = vle_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(0x1.c7b1f3cac7433p+1019)); + if(!__builtin_expect(!!(vtestallones_i_vo64_advsimd_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_advsimd_sleef(ocore, r, Sleef_logd2_u10advsimd(d)); r = vsel_vd_vo_vd_vd_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vlt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1)), visnan_vo_vd_advsimd_sleef(d)), vcast_vd_d_advsimd_sleef(__builtin_nan("")), r); r = vsel_vd_vo_vd_vd_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1)), vcast_vd_d_advsimd_sleef(-__builtin_inf()), r); r = vsel_vd_vo_vd_vd_advsimd_sleef(visnegzero_vo_vd_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(-0.0), r); @@ -4011,7 +4009,7 @@ SLEEF_INLINE SLEEF_CONST vint_advsimd_sleef Sleef_expfrexpd2_advsimd(vdouble_adv vint_advsimd_sleef ret = vcastu_vi_vm_advsimd_sleef(vreinterpret_vm_vd_advsimd_sleef(x)); ret = vsub_vi_vi_vi_advsimd_sleef(vand_vi_vi_vi_advsimd_sleef(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(ret), 20)), vcast_vi_i_advsimd_sleef(0x7ff)), vcast_vi_i_advsimd_sleef(0x3fe)); - ret = vsel_vi_vo_vi_vi_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)), visnan_vo_vd_advsimd_sleef(x)), visinf_vo_vd_advsimd_sleef(x)), vcast_vi_i_advsimd_sleef(0), ret); + ret = vsel_vi_vo_vi_vi_advsimd_sleef(vcast_vo32_vo64_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)), 
visnan_vo_vd_advsimd_sleef(x)), visinf_vo_vd_advsimd_sleef(x))), vcast_vi_i_advsimd_sleef(0), ret); return ret; } @@ -4410,14 +4408,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_erfcd2_u15advsimd(vdouble_a return r; } -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -4934,6 +4924,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_advsimd_sleef Sleef_tanf4_u35advsimd(vfloat_advs if (__builtin_expect(!!(vtestallones_i_vo32_advsimd_sleef(vlt_vo_vf_vf_advsimd_sleef(vabs_vf_vf_advsimd_sleef(d), vcast_vf_f_advsimd_sleef(125.0f*0.5f)))), 1)) { q = vrint_vi2_vf_advsimd_sleef(vmul_vf_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef((float)(2 * 0.318309886183790671537767526745028724)))); u = vcast_vf_vi2_advsimd_sleef(q); + x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-3.1414794921875f*0.5f), x); x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-0.00011315941810607910156f*0.5f), x); x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-1.9841872589410058936e-09f*0.5f), x); @@ -6335,7 +6326,8 @@ SLEEF_INLINE SLEEF_CONST vfloat_advsimd_sleef Sleef_log1pf4_u10advsimd(vfloat_ad vfloat_advsimd_sleef r = vadd_vf_vf_vf_advsimd_sleef(vf2getx_vf_vf2_advsimd_sleef(s), vf2gety_vf_vf2_advsimd_sleef(s)); - r = vsel_vf_vo_vf_vf_advsimd_sleef(vgt_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(1e+38)), vcast_vf_f_advsimd_sleef(__builtin_inff()), r); + 
vopmask_advsimd_sleef ocore = vle_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(0x1.2ced32p+126)); + if(!__builtin_expect(!!(vtestallones_i_vo32_advsimd_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_advsimd_sleef(ocore, r, Sleef_logf4_u10advsimd(d)); r = vreinterpret_vf_vm_advsimd_sleef(vor_vm_vo32_vm_advsimd_sleef(vgt_vo_vf_vf_advsimd_sleef(vcast_vf_f_advsimd_sleef(-1), d), vreinterpret_vm_vf_advsimd_sleef(r))); r = vsel_vf_vo_vf_vf_advsimd_sleef(veq_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(-1)), vcast_vf_f_advsimd_sleef(-__builtin_inff()), r); r = vsel_vf_vo_vf_vf_advsimd_sleef(visnegzero_vo_vf_advsimd_sleef(d), vcast_vf_f_advsimd_sleef(-0.0f), r); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_rvvm1.h b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_rvvm1.h index a68f41d734d..23a8c572116 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_rvvm1.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_rvvm1.h @@ -1,8 +1,11 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) -// This file is generated by SLEEF 3.6.1 +// This file is generated by SLEEF 3.9.0 + +/* #undef SLEEF_FLOAT128_IS_IEEEQP */ +#define SLEEF_LONGDOUBLE_IS_IEEEQP #ifndef SLEEF_ALWAYS_INLINE #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) @@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = { 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, + 0, 0, 0, 0, }; static const float Sleef_rempitabsp[] = { @@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = { 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, + 0, 0, 0, 0, }; #endif // #ifndef __SLEEF_REMPITAB__ -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -1182,7 +1179,7 @@ typedef vquad_rvvm1_sleef vargquad_rvvm1_sleef; static SLEEF_ALWAYS_INLINE int vavailability_i_rvvm1_sleef(int name) { - return (__riscv_vsetvlmax_e64m1() >= __riscv_vsetvlmax_e64m1()) ? 
3 : 0; + return (((int)__riscv_vsetvlmax_e64m1()) >= ((int)__riscv_vsetvlmax_e64m1())) ? 3 : 0; } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef figetd_vf_di_rvvm1_sleef(fi_t_rvvm1_sleef d) { @@ -1239,144 +1236,144 @@ static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreinterpret_vf_vi2_rvvm1_sleef(vi } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcast_vf_f_rvvm1_sleef(float f) { - return __riscv_vfmv_v_f_f32m1(f, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmv_v_f_f32m1(f, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrint_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vd_rvvm1_sleef) { - return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1())); + return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcast_vf_vi2_rvvm1_sleef(vint2_rvvm1_sleef vi) { - return __riscv_vfcvt_f(vi, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfcvt_f(vi, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vcast_vi2_i_rvvm1_sleef(int i) { - return __riscv_vmv_v_x_i32m1(i, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmv_v_x_i32m1(i, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vrint_vi2_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) { - return __riscv_vfcvt_x_f_v_i32m1_rm(vf, __RISCV_FRM_RNE, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfcvt_x_f_v_i32m1_rm(vf, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vtruncate_vi2_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) { - return __riscv_vfcvt_rtz_x(vf, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfcvt_rtz_x(vf, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vtruncate_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) { 
return vcast_vf_vi2_rvvm1_sleef(vtruncate_vi2_vf_rvvm1_sleef(vf)); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vload_vf_p_rvvm1_sleef(const float *ptr) { - return __riscv_vle32_v_f32m1(ptr, (__riscv_vsetvlmax_e32m1())); + return __riscv_vle32_v_f32m1(ptr, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vloadu_vf_p_rvvm1_sleef(const float *ptr) { - return __riscv_vle32_v_f32m1(ptr, (__riscv_vsetvlmax_e32m1())); + return __riscv_vle32_v_f32m1(ptr, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE void vstore_v_p_vf_rvvm1_sleef(float *ptr, vfloat_rvvm1_sleef v) { - __riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1())); + __riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vf_rvvm1_sleef(float *ptr, vfloat_rvvm1_sleef v) { - __riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1())); + __riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vi2_rvvm1_sleef(int32_t *ptr, vint2_rvvm1_sleef v) { - __riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1())); + __riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vgather_vf_p_vi2_rvvm1_sleef(const float *ptr, vint2_rvvm1_sleef vi2) { - return __riscv_vluxei32(ptr, __riscv_vmul(__riscv_vreinterpret_u32m1(vi2), sizeof(float), (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1())); + return __riscv_vluxei32(ptr, __riscv_vmul(__riscv_vreinterpret_u32m1(vi2), sizeof(float), ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vadd_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfadd(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfadd(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsub_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfsub(x, y, 
(__riscv_vsetvlmax_e32m1())); + return __riscv_vfsub(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmul_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfmul(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmul(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vdiv_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfdiv(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfdiv(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmax_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfmax(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmax(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmin_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfmin(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmin(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrec_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { - return __riscv_vfdiv(vcast_vf_f_rvvm1_sleef(1.0f), d, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfdiv(vcast_vf_f_rvvm1_sleef(1.0f), d, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsqrt_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { - return __riscv_vfsqrt(d, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfsqrt(d, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmla_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return __riscv_vfmadd(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlanp_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return 
__riscv_vfnmsub(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlapn_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return __riscv_vfmsub(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfma_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return __riscv_vfmadd(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfmanp_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return __riscv_vfnmsub(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfmapn_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { - return __riscv_vfmsub(x, y, z, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmulsign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfsgnjx(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfsgnjx(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcopysign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vfsgnj(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfsgnj(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsign_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) { - return __riscv_vfsgnj(__riscv_vfmv_v_f_f32m1(1.0f, (__riscv_vsetvlmax_e32m1())), f, (__riscv_vsetvlmax_e32m1())); + return 
__riscv_vfsgnj(__riscv_vfmv_v_f_f32m1(1.0f, ((int)__riscv_vsetvlmax_e32m1())), f, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vorsign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { vint2_rvvm1_sleef xi = __riscv_vreinterpret_i32m1(x); vint2_rvvm1_sleef yi = __riscv_vreinterpret_i32m1(y); - vint2_rvvm1_sleef xioryi = __riscv_vor(xi, yi, (__riscv_vsetvlmax_e32m1())); + vint2_rvvm1_sleef xioryi = __riscv_vor(xi, yi, ((int)__riscv_vsetvlmax_e32m1())); vfloat_rvvm1_sleef xory = __riscv_vreinterpret_f32m1(xioryi); - return __riscv_vfsgnj(x, xory, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfsgnj(x, xory, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vabs_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) { - return __riscv_vfabs(f, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfabs(f, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vneg_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) { - return __riscv_vfneg(f, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfneg(f, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vadd_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vadd(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsub_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vsub(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vneg_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x) { - return __riscv_vneg(x, (__riscv_vsetvlmax_e32m1())); + return __riscv_vneg(x, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vand_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vand(x, y, (__riscv_vsetvlmax_e32m1())); + 
return __riscv_vand(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vandnot_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vand(__riscv_vnot(x, (__riscv_vsetvlmax_e32m1())), y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e32m1())), y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vor_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vor(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vxor_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vxor(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsll_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) { - return __riscv_vsll(x, c, (__riscv_vsetvlmax_e32m1())); + return __riscv_vsll(x, c, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsra_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) { - return __riscv_vsra(x, c, (__riscv_vsetvlmax_e32m1())); + return __riscv_vsra(x, c, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsrl_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) { - return __riscv_vreinterpret_i32m1(__riscv_vsrl(__riscv_vreinterpret_u32m1(x), c, (__riscv_vsetvlmax_e32m1()))); + return __riscv_vreinterpret_i32m1(__riscv_vsrl(__riscv_vreinterpret_u32m1(x), c, ((int)__riscv_vsetvlmax_e32m1()))); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreinterpret_vf_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) { @@ -1387,91 +1384,91 @@ static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vreinterpret_vm_vf_rvvm1_sleef(vflo } static SLEEF_ALWAYS_INLINE int vtestallones_i_vo32_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef g) { - return __riscv_vcpop(g, 
(__riscv_vsetvlmax_e32m1())) == (__riscv_vsetvlmax_e32m1()); + return (int)__riscv_vcpop(g, ((int)__riscv_vsetvlmax_e32m1())) == (int)((int)__riscv_vsetvlmax_e32m1()); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y); - return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, -1, x, (__riscv_vsetvlmax_e32m1()))); + return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, -1, x, ((int)__riscv_vsetvlmax_e32m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y); - return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, __riscv_vmnot(x, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1()))); + return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y); - return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, x, (__riscv_vsetvlmax_e32m1()))); + return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, x, ((int)__riscv_vsetvlmax_e32m1()))); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vand_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) { - return __riscv_vmand(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmand(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vandnot_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) { - return __riscv_vmandn(y, x, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmandn(y, x, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef 
rvv_sp_vor_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) { - return __riscv_vmor(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmor(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vxor_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) { - return __riscv_vmxor(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmxor(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef veq_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmfeq(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfeq(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vneq_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmfne(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfne(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vgt_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmfgt(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfgt(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vge_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmfge(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfge(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vlt_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmflt(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmflt(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vle_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmfle(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfle(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE 
rvv_sp_vopmask_rvvm1_sleef visnan_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { - return __riscv_vmfne(d, d, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfne(d, d, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef visinf_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { - return __riscv_vmfeq(__riscv_vfabs(d, (__riscv_vsetvlmax_e32m1())), __builtin_inff(), (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfeq(__riscv_vfabs(d, ((int)__riscv_vsetvlmax_e32m1())), __builtin_inff(), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vispinf_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { - return __riscv_vmfeq(d, __builtin_inff(), (__riscv_vsetvlmax_e32m1())); + return __riscv_vmfeq(d, __builtin_inff(), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_vf_vf_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef mask, vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) { - return __riscv_vmerge(y, x, mask, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef mask, float v1, float v0) { - return __riscv_vfmerge(vcast_vf_f_rvvm1_sleef(v0), v1, mask, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmerge(vcast_vf_f_rvvm1_sleef(v0), v1, mask, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_vo_f_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef o0, rvv_sp_vopmask_rvvm1_sleef o1, float d0, float d1, float d2) { - return __riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d2), d1, o1, (__riscv_vsetvlmax_e32m1())), d0, o0, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d2), d1, o1, ((int)__riscv_vsetvlmax_e32m1())), d0, o0, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef 
vsel_vf_vo_vo_vo_f_f_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef o0, rvv_sp_vopmask_rvvm1_sleef o1, rvv_sp_vopmask_rvvm1_sleef o2, float d0, float d1, float d2, float d3) { - return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d3), d2, o2, (__riscv_vsetvlmax_e32m1())), d1, o1, (__riscv_vsetvlmax_e32m1())), d0, o0, (__riscv_vsetvlmax_e32m1())); + return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d3), d2, o2, ((int)__riscv_vsetvlmax_e32m1())), d1, o1, ((int)__riscv_vsetvlmax_e32m1())), d0, o0, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef veq_vo_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vmseq(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vgt_vo_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vmsgt(x, y, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vgt_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { vint2_rvvm1_sleef zero = vcast_vi2_i_rvvm1_sleef(0); - return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsel_vi2_vo_vi2_vi2_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef m, vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vmerge(y, x, m, (__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(y, x, m, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vand_vi2_vo_vi2_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vint2_rvvm1_sleef y) { - return __riscv_vmerge(y, 0, __riscv_vmnot(x, (__riscv_vsetvlmax_e32m1())), 
(__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE const vdouble_rvvm1_sleef vd2getx_vd_vd2_rvvm1_sleef(vdouble2_rvvm1_sleef v) { @@ -1537,203 +1534,203 @@ static SLEEF_ALWAYS_INLINE ddi_t_rvvm1_sleef ddisetdd_ddi_ddi_vd2_rvvm1_sleef(dd } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcast_vd_d_rvvm1_sleef(double d) { - return __riscv_vfmv_v_f_f64m1(d, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmv_v_f_f64m1(d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcast_vd_vi_rvvm1_sleef(vint_rvvm1_sleef i) { - return __riscv_vfwcvt_f(i, __riscv_vsetvlmax_e64m1()); + return __riscv_vfwcvt_f(i, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcast_vi_i_rvvm1_sleef(int32_t i) { - return __riscv_vmv_v_x_i32mf2(i, __riscv_vsetvlmax_e64m1()); + return __riscv_vmv_v_x_i32mf2(i, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vrint_vi_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { - return __riscv_vfncvt_x_f_w_i32mf2_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, __riscv_vsetvlmax_e64m1()); + return __riscv_vfncvt_x_f_w_i32mf2_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrint_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { - return __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1()); + return __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vtruncate_vi_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { - return __riscv_vfncvt_rtz_x(vd_rvvm1_sleef, __riscv_vsetvlmax_e64m1()); + return __riscv_vfncvt_rtz_x(vd_rvvm1_sleef, 
((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vtruncate_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { return vcast_vd_vi_rvvm1_sleef(vtruncate_vi_vd_rvvm1_sleef(vd_rvvm1_sleef)); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vload_vd_p_rvvm1_sleef(const double *ptr) { - return __riscv_vle64_v_f64m1(ptr, __riscv_vsetvlmax_e64m1()); + return __riscv_vle64_v_f64m1(ptr, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vloadu_vd_p_rvvm1_sleef(const double *ptr) { - return __riscv_vle64_v_f64m1(ptr, __riscv_vsetvlmax_e64m1()); + return __riscv_vle64_v_f64m1(ptr, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vloadu_vi_p_rvvm1_sleef(int32_t *p) { - return __riscv_vle32_v_i32mf2(p, __riscv_vsetvlmax_e64m1()); + return __riscv_vle32_v_i32mf2(p, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE void vstore_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) { - __riscv_vse64(ptr, v, __riscv_vsetvlmax_e64m1()); + __riscv_vse64(ptr, v, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) { - __riscv_vse64(ptr, v, __riscv_vsetvlmax_e64m1()); + __riscv_vse64(ptr, v, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vi_rvvm1_sleef(int32_t *ptr, vint_rvvm1_sleef v) { - __riscv_vse32(ptr, v, __riscv_vsetvlmax_e64m1()); + __riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vgather_vd_p_vi_rvvm1_sleef(const double *ptr, vint_rvvm1_sleef vi) { - return __riscv_vluxei64(ptr, __riscv_vwmulu(__riscv_vreinterpret_u32mf2(vi), sizeof(double), __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1()); + return __riscv_vluxei64(ptr, __riscv_vwmulu(__riscv_vreinterpret_u32mf2(vi), sizeof(double), ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE 
vdouble_rvvm1_sleef vadd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfadd(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfadd(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsub_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfsub(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfsub(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrec_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vfdiv(vcast_vd_d_rvvm1_sleef(1.0), d, __riscv_vsetvlmax_e64m1()); + return __riscv_vfdiv(vcast_vd_d_rvvm1_sleef(1.0), d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vabs_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vfabs(d, __riscv_vsetvlmax_e64m1()); + return __riscv_vfabs(d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsqrt_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vfsqrt(d, __riscv_vsetvlmax_e64m1()); + return __riscv_vfsqrt(d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmul_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfmul(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmul(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vdiv_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfdiv(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfdiv(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmax_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfmax(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmax(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmin_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, 
vdouble_rvvm1_sleef y) { - return __riscv_vfmin(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmin(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmla_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfmadd(x, y, z, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlapn_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfmsub(x, y, z, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlanp_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfnmsac(z, x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfnmsac(z, x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfma_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfmadd(x, y, z, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfmanp_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfnmsub(x, y, z, __riscv_vsetvlmax_e64m1()); + return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfmapn_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) { - return __riscv_vfmsub(x, y, z, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmulsign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfsgnjx(x, y, 
__riscv_vsetvlmax_e64m1()); + return __riscv_vfsgnjx(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcopysign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfsgnj(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vfsgnj(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vorsign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vfsgnj(x, __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(__riscv_vor(__riscv_vreinterpret_u64m1(x), __riscv_vreinterpret_u64m1(y), __riscv_vsetvlmax_e64m1()))), __riscv_vsetvlmax_e64m1()); + return __riscv_vfsgnj(x, __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(__riscv_vor(__riscv_vreinterpret_u64m1(x), __riscv_vreinterpret_u64m1(y), ((int)__riscv_vsetvlmax_e64m1())))), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vneg_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vfneg(d, __riscv_vsetvlmax_e64m1()); + return __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vadd_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vadd(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsub_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vsub(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vneg_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x) { - return __riscv_vneg(x, __riscv_vsetvlmax_e64m1()); + return __riscv_vneg(x, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vand_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vand(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vand(x, y, 
((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vandnot_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vand(__riscv_vnot(x, __riscv_vsetvlmax_e64m1()), y, __riscv_vsetvlmax_e64m1()); + return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e64m1())), y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vor_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vor(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vxor_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vxor(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsll_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) { - return __riscv_vsll(x, c, __riscv_vsetvlmax_e64m1()); + return __riscv_vsll(x, c, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsra_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) { - return __riscv_vsra(x, c, __riscv_vsetvlmax_e64m1()); + return __riscv_vsra(x, c, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsrl_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) { - return __riscv_vreinterpret_i32mf2(__riscv_vsrl(__riscv_vreinterpret_u32mf2(x), c, __riscv_vsetvlmax_e64m1())); + return __riscv_vreinterpret_i32mf2(__riscv_vsrl(__riscv_vreinterpret_u32mf2(x), c, ((int)__riscv_vsetvlmax_e64m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_i64_rvvm1_sleef(int64_t c) { - return __riscv_vmv_v_x_u64m1(c, __riscv_vsetvlmax_e64m1()); + return __riscv_vmv_v_x_u64m1(c, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_u64_rvvm1_sleef(uint64_t c) { - return __riscv_vmv_v_x_u64m1(c, __riscv_vsetvlmax_e64m1()); + return __riscv_vmv_v_x_u64m1(c, 
((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_i_i_rvvm1_sleef(int64_t h, int64_t l) { - return __riscv_vmv_v_x_u64m1((((uint64_t)h) << 32) | (uint32_t) l, __riscv_vsetvlmax_e64m1()); + return __riscv_vmv_v_x_u64m1((((uint64_t)h) << 32) | (uint32_t) l, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_vi_rvvm1_sleef(vint_rvvm1_sleef vi) { - return __riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, __riscv_vsetvlmax_e64m1())); + return __riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, ((int)__riscv_vsetvlmax_e64m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcastu_vm_vi_rvvm1_sleef(vint_rvvm1_sleef vi) { - return __riscv_vsll(__riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, __riscv_vsetvlmax_e64m1())), 32, __riscv_vsetvlmax_e64m1()); + return __riscv_vsll(__riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, ((int)__riscv_vsetvlmax_e64m1()))), 32, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcastu_vi_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) { - return __riscv_vreinterpret_i32mf2(__riscv_vnsrl(vm, 32, __riscv_vsetvlmax_e64m1())); + return __riscv_vreinterpret_i32mf2(__riscv_vnsrl(vm, 32, ((int)__riscv_vsetvlmax_e64m1()))); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcast_vi_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) { - return __riscv_vreinterpret_i32mf2(__riscv_vncvt_x(vm, __riscv_vsetvlmax_e64m1())); + return __riscv_vreinterpret_i32mf2(__riscv_vncvt_x(vm, ((int)__riscv_vsetvlmax_e64m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmerge(y, 0, __riscv_vmnot(x, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return 
__riscv_vand(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vand(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vor(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vxor_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vxor(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vand(__riscv_vnot(x, __riscv_vsetvlmax_e64m1()), y, __riscv_vsetvlmax_e64m1()); + return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e64m1())), y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmerge(y, 0, x, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, 0, x, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsll64_vm_vm_i(vmask_rvvm1_sleef mask, int64_t c) { - return __riscv_vsll(mask, c, __riscv_vsetvlmax_e64m1()); + return __riscv_vsll(mask, c, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsub64_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vsub(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsrl64_vm_vm_i(vmask_rvvm1_sleef mask, int64_t c) { - return __riscv_vsrl(mask, c, __riscv_vsetvlmax_e64m1()); + return __riscv_vsrl(mask, c, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vadd64_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vadd(x, y, 
__riscv_vsetvlmax_e64m1()); + return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmerge(y, -1, x, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, -1, x, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsel_vm_vo64_vm_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmerge(y, x, mask, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vneg64_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef mask) { - return __riscv_vreinterpret_u64m1(__riscv_vneg(__riscv_vreinterpret_i64m1(mask), __riscv_vsetvlmax_e64m1())); + return __riscv_vreinterpret_u64m1(__riscv_vneg(__riscv_vreinterpret_i64m1(mask), ((int)__riscv_vsetvlmax_e64m1()))); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vreinterpret_vd_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) { return __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(vm)); @@ -1757,111 +1754,111 @@ static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vcast_vo32_vo64_rvvm1_slee return vo; } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vand_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) { - return __riscv_vmand(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmand(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vandnot_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) { - return __riscv_vmandn(y, x, __riscv_vsetvlmax_e64m1()); + return __riscv_vmandn(y, x, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vor_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) { - return __riscv_vmor(x, y, __riscv_vsetvlmax_e64m1()); + return 
__riscv_vmor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vxor_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) { - return __riscv_vmxor(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmxor(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq64_vo_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmseq(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt64_vo_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) { - return __riscv_vmsgt(__riscv_vreinterpret_i64m1(x), __riscv_vreinterpret_i64m1(y), __riscv_vsetvlmax_e64m1()); + return __riscv_vmsgt(__riscv_vreinterpret_i64m1(x), __riscv_vreinterpret_i64m1(y), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef visinf_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vmfeq(__riscv_vfabs(d, __riscv_vsetvlmax_e64m1()), __builtin_inf(), __riscv_vsetvlmax_e64m1()); + return __riscv_vmfeq(__riscv_vfabs(d, ((int)__riscv_vsetvlmax_e64m1())), __builtin_inf(), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vispinf_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vmfeq(d, __builtin_inf(), __riscv_vsetvlmax_e64m1()); + return __riscv_vmfeq(d, __builtin_inf(), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmfeq(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfeq(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vneq_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmfne(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfne(x, y, 
((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vlt_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmflt(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmflt(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vle_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmfle(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfle(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmfgt(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfgt(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vge_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmfge(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfge(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef visnan_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { - return __riscv_vmfne(d, d, __riscv_vsetvlmax_e64m1()); + return __riscv_vmfne(d, d, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - return __riscv_vmerge(y, x, mask, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, double v0, double v1) { - return __riscv_vfmerge(vcast_vd_d_rvvm1_sleef(v1), v0, mask, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmerge(vcast_vd_d_rvvm1_sleef(v1), v0, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vo_d_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef o0, 
rvv_dp_vopmask_rvvm1_sleef o1, double d0, double d1, double d2) { - return __riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d2), d1, o1, __riscv_vsetvlmax_e64m1()), d0, o0, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d2), d1, o1, ((int)__riscv_vsetvlmax_e64m1())), d0, o0, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vo_vo_d_d_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef o0, rvv_dp_vopmask_rvvm1_sleef o1, rvv_dp_vopmask_rvvm1_sleef o2, double d0, double d1, double d2, double d3) { - return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d3), d2, o2, __riscv_vsetvlmax_e64m1()), d1, o1, __riscv_vsetvlmax_e64m1()), d0, o0, __riscv_vsetvlmax_e64m1()); + return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d3), d2, o2, ((int)__riscv_vsetvlmax_e64m1())), d1, o1, ((int)__riscv_vsetvlmax_e64m1())), d0, o0, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE int vtestallones_i_vo64_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef g) { - return __riscv_vcpop(g, __riscv_vsetvlmax_e64m1()) == __riscv_vsetvlmax_e64m1(); + return (int)__riscv_vcpop(g, ((int)__riscv_vsetvlmax_e64m1())) == (int)((int)__riscv_vsetvlmax_e64m1()); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq_vo_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vmseq(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt_vo_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vmsgt(x, y, __riscv_vsetvlmax_e64m1()); + return __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vgt_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) { vint_rvvm1_sleef zero = vcast_vi_i_rvvm1_sleef(0); - return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, 
__riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsel_vi_vo_vi_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef m, vint_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vmerge(y, x, m, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, x, m, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vandnot_vi_vo_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vint_rvvm1_sleef vi) { - return __riscv_vmerge(vi, 0, mask, __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(vi, 0, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vand_vi_vo_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vint_rvvm1_sleef y) { - return __riscv_vmerge(y, 0, __riscv_vmnot(x, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1()); + return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vposneg_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { rvv_dp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b64(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1())); - vdouble_rvvm1_sleef nd = __riscv_vfneg(d, __riscv_vsetvlmax_e64m1()); - return __riscv_vmerge(nd, d, mask, __riscv_vsetvlmax_e64m1()); + vdouble_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1())); + return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vnegpos_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) { rvv_dp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b64(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1())); - vdouble_rvvm1_sleef nd = __riscv_vfneg(d, __riscv_vsetvlmax_e64m1()); - return __riscv_vmerge(nd, d, mask, __riscv_vsetvlmax_e64m1()); + vdouble_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1())); 
+ return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vposneg_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { rvv_sp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b32(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1())); - vfloat_rvvm1_sleef nd = __riscv_vfneg(d, (__riscv_vsetvlmax_e32m1())); - return __riscv_vmerge(nd, d, mask, (__riscv_vsetvlmax_e32m1())); + vfloat_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vnegpos_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) { rvv_sp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b32(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1())); - vfloat_rvvm1_sleef nd = __riscv_vfneg(d, (__riscv_vsetvlmax_e32m1())); - return __riscv_vmerge(nd, d, mask, (__riscv_vsetvlmax_e32m1())); + vfloat_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e32m1())); + return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsubadd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { return vadd_vd_vd_vd_rvvm1_sleef(x, vnegpos_vd_vd_rvvm1_sleef(y)); } @@ -1870,33 +1867,33 @@ static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlsubadd_vd_vd_vd_vd_rvvm1_sleef static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlsubadd_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { return vfma_vf_vf_vf_vf_rvvm1_sleef(x, y, vnegpos_vf_vf_rvvm1_sleef(z)); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrev21_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { - rvv_dp_vuint2 id = __riscv_vid_v_u64m1(__riscv_vsetvlmax_e64m1()); - id = __riscv_vxor(id, 1, __riscv_vsetvlmax_e64m1()); - return __riscv_vrgather(vd_rvvm1_sleef, id, __riscv_vsetvlmax_e64m1()); + rvv_dp_vuint2 id = __riscv_vid_v_u64m1(((int)__riscv_vsetvlmax_e64m1())); + id = 
__riscv_vxor(id, 1, ((int)__riscv_vsetvlmax_e64m1())); + return __riscv_vrgather(vd_rvvm1_sleef, id, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrev21_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) { - vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1((__riscv_vsetvlmax_e32m1()))); - id = __riscv_vxor(id, 1, (__riscv_vsetvlmax_e32m1())); - return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), (__riscv_vsetvlmax_e32m1())); + vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1(((int)__riscv_vsetvlmax_e32m1()))); + id = __riscv_vxor(id, 1, ((int)__riscv_vsetvlmax_e32m1())); + return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), ((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vreva2_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) { - rvv_dp_vuint2 id = __riscv_vid_v_u64m1(__riscv_vsetvlmax_e64m1()); - id = __riscv_vxor(id, __riscv_vsetvlmax_e64m1() - 2, __riscv_vsetvlmax_e64m1()); - return __riscv_vrgather(vd_rvvm1_sleef, id, __riscv_vsetvlmax_e64m1()); + rvv_dp_vuint2 id = __riscv_vid_v_u64m1(((int)__riscv_vsetvlmax_e64m1())); + id = __riscv_vxor(id, ((int)__riscv_vsetvlmax_e64m1()) - 2, ((int)__riscv_vsetvlmax_e64m1())); + return __riscv_vrgather(vd_rvvm1_sleef, id, ((int)__riscv_vsetvlmax_e64m1())); } static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreva2_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) { - vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1((__riscv_vsetvlmax_e32m1()))); - id = __riscv_vxor(id, (__riscv_vsetvlmax_e32m1()) - 2, (__riscv_vsetvlmax_e32m1())); - return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), (__riscv_vsetvlmax_e32m1())); + vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1(((int)__riscv_vsetvlmax_e32m1()))); + id = __riscv_vxor(id, ((int)__riscv_vsetvlmax_e32m1()) - 2, ((int)__riscv_vsetvlmax_e32m1())); + return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), 
((int)__riscv_vsetvlmax_e32m1())); } static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vd_rvvm1_sleef(double *ptr, int offset, int step, vdouble_rvvm1_sleef v) { ptr += offset * 2; - for (int i = 0; i < __riscv_vsetvlmax_e64m1(); i += 2) { + for (int i = 0; i < (int)((int)__riscv_vsetvlmax_e64m1()); i += 2) { vdouble_rvvm1_sleef vv = __riscv_vslidedown(v, i, 2); __riscv_vse64(ptr, vv, 2); @@ -1907,7 +1904,7 @@ static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vd_rvvm1_sleef(double *ptr, in static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vf_rvvm1_sleef(float *ptr, int offset, int step, vfloat_rvvm1_sleef v) { ptr += offset * 2; - for (int i = 0; i < (__riscv_vsetvlmax_e32m1()); i += 2) { + for (int i = 0; i < (int)((int)__riscv_vsetvlmax_e32m1()); i += 2) { vfloat_rvvm1_sleef vv = __riscv_vslidedown(v, i, 2); __riscv_vse32(ptr, vv, 2); ptr += step * 2; @@ -2007,7 +2004,7 @@ static SLEEF_ALWAYS_INLINE tdi_t_rvvm1_sleef tdisettdi_tdi_vd3_vi_rvvm1_sleef(vd } static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vcast_vo_i_rvvm1_sleef(int i) { - return __riscv_vreinterpret_b64(__riscv_vmv_v_x_u32m1(i, (__riscv_vsetvlmax_e32m1()))); + return __riscv_vreinterpret_b64(__riscv_vmv_v_x_u32m1(i, ((int)__riscv_vsetvlmax_e32m1()))); } static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vreinterpret_vm_vi64_rvvm1_sleef(vint64_rvvm1_sleef v) { return __riscv_vreinterpret_u64m1(v); @@ -2022,7 +2019,7 @@ static SLEEF_ALWAYS_INLINE vuint64_rvvm1_sleef vreinterpret_vu64_vm_rvvm1_sleef( return m; } static SLEEF_ALWAYS_INLINE int vtestallzeros_i_vo64_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef g) { - return __riscv_vcpop(g, __riscv_vsetvlmax_e64m1()) == 0; + return __riscv_vcpop(g, ((int)__riscv_vsetvlmax_e64m1())) == 0; } static SLEEF_ALWAYS_INLINE void vstream_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) { vstore_v_p_vd_rvvm1_sleef(ptr, v); } @@ -2048,7 +2045,7 @@ static int vcast_i_vi2(vint2_rvvm1_sleef v) { static vquad_rvvm1_sleef loadu_vq_p_rvvm1_sleef(const int32_t *ptr) { - 
return __riscv_vreinterpret_u64m2(__riscv_vreinterpret_u32m2(__riscv_vle32_v_i32m2(ptr, (__riscv_vsetvlmax_e32m1()) * 2))); + return __riscv_vreinterpret_u64m2(__riscv_vreinterpret_u32m2(__riscv_vle32_v_i32m2(ptr, ((int)__riscv_vsetvlmax_e32m1()) * 2))); } static SLEEF_ALWAYS_INLINE vquad_rvvm1_sleef cast_vq_aq_rvvm1_sleef(vargquad_rvvm1_sleef aq) { return aq; } @@ -3511,7 +3508,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_expdx_u10rvvm1(vdouble_rvvm1_ u = vldexp2_vd_vd_vi_rvvm1_sleef(u, q); - u = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(709.78271114955742909217217426)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), u); + rvv_dp_vopmask_rvvm1_sleef o = vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(0x1.62e42fefa39efp+9)); + u = vsel_vd_vo_vd_vd_rvvm1_sleef(o, vcast_vd_d_rvvm1_sleef(__builtin_inf()), u); u = vreinterpret_vd_vm_rvvm1_sleef(vandnot_vm_vo64_vm_rvvm1_sleef(vlt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1000)), vreinterpret_vm_vd_rvvm1_sleef(u))); return u; @@ -3628,13 +3626,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_rvvm1_sleef expk_rvvm1_sleef(vdou } SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_powdx_u10rvvm1(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { - rvv_dp_vopmask_rvvm1_sleef yisint = visint_vo_vd_rvvm1_sleef(y); rvv_dp_vopmask_rvvm1_sleef yisodd = rvv_dp_vand_vo_vo_vo(visodd_vo_vd_rvvm1_sleef(y), yisint); vdouble2_rvvm1_sleef d = ddmul_vd2_vd2_vd_rvvm1_sleef(logk_rvvm1_sleef(vabs_vd_vd_rvvm1_sleef(x)), y); vdouble_rvvm1_sleef result = expk_rvvm1_sleef(d); - result = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(709.78271114955742909217217426)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), result); + rvv_dp_vopmask_rvvm1_sleef o = vgt_vo_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(0x1.62e42fefa39efp+9)); + result = vsel_vd_vo_vd_vd_rvvm1_sleef(o, vcast_vd_d_rvvm1_sleef(__builtin_inf()), 
result); result = vmul_vd_vd_vd_rvvm1_sleef(result, vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)), @@ -3660,7 +3658,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_powdx_u10rvvm1(vdouble_rvvm1_ result = vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(y, vcast_vd_d_rvvm1_sleef(0)), veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(1))), vcast_vd_d_rvvm1_sleef(1), result); return result; - } static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble2_rvvm1_sleef expk2_rvvm1_sleef(vdouble2_rvvm1_sleef d) { @@ -4148,7 +4145,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_log1pdx_u10rvvm1(vdouble_rvvm vdouble_rvvm1_sleef r = vadd_vd_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(s), vd2gety_vd_vd2_rvvm1_sleef(s)); - r = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(1e+307)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), r); + rvv_dp_vopmask_rvvm1_sleef ocore = vle_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(0x1.c7b1f3cac7433p+1019)); + if(!__builtin_expect(!!(vtestallones_i_vo64_rvvm1_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_rvvm1_sleef(ocore, r, Sleef_logdx_u10rvvm1(d)); r = vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(vlt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1)), visnan_vo_vd_rvvm1_sleef(d)), vcast_vd_d_rvvm1_sleef(__builtin_nan("")), r); r = vsel_vd_vo_vd_vd_rvvm1_sleef(veq_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1)), vcast_vd_d_rvvm1_sleef(-__builtin_inf()), r); r = vsel_vd_vo_vd_vd_rvvm1_sleef(visnegzero_vo_vd_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(-0.0), r); @@ -4228,7 +4226,7 @@ SLEEF_INLINE SLEEF_CONST vint_rvvm1_sleef Sleef_expfrexpdx_rvvm1(vdouble_rvvm1_s vint_rvvm1_sleef ret = vcastu_vi_vm_rvvm1_sleef(vreinterpret_vm_vd_rvvm1_sleef(x)); ret = vsub_vi_vi_vi_rvvm1_sleef(vand_vi_vi_vi_rvvm1_sleef(vsrl_vi_vi_i_rvvm1_sleef(ret, 20), vcast_vi_i_rvvm1_sleef(0x7ff)), vcast_vi_i_rvvm1_sleef(0x3fe)); - ret = 
vsel_vi_vo_vi_vi_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)), visnan_vo_vd_rvvm1_sleef(x)), visinf_vo_vd_rvvm1_sleef(x)), vcast_vi_i_rvvm1_sleef(0), ret); + ret = vsel_vi_vo_vi_vi_rvvm1_sleef(vcast_vo32_vo64_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)), visnan_vo_vd_rvvm1_sleef(x)), visinf_vo_vd_rvvm1_sleef(x))), vcast_vi_i_rvvm1_sleef(0), ret); return ret; } @@ -4631,14 +4629,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_erfcdx_u15rvvm1(vdouble_rvvm1 return r; } -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -5105,6 +5095,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_rvvm1_sleef Sleef_tanfx_u35rvvm1(vfloat_rvvm1_sl if (__builtin_expect(!!(vtestallones_i_vo32_rvvm1_sleef(vlt_vo_vf_vf_rvvm1_sleef(vabs_vf_vf_rvvm1_sleef(d), vcast_vf_f_rvvm1_sleef(125.0f*0.5f)))), 1)) { q = vrint_vi2_vf_rvvm1_sleef(vmul_vf_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef((float)(2 * 0.318309886183790671537767526745028724)))); u = vcast_vf_vi2_rvvm1_sleef(q); + x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-3.1414794921875f*0.5f), x); x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-0.00011315941810607910156f*0.5f), x); x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-1.9841872589410058936e-09f*0.5f), x); @@ -6506,7 +6497,8 @@ SLEEF_INLINE 
SLEEF_CONST vfloat_rvvm1_sleef Sleef_log1pfx_u10rvvm1(vfloat_rvvm1_ vfloat_rvvm1_sleef r = vadd_vf_vf_vf_rvvm1_sleef(vf2getx_vf_vf2_rvvm1_sleef(s), vf2gety_vf_vf2_rvvm1_sleef(s)); - r = vsel_vf_vo_vf_vf_rvvm1_sleef(vgt_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(1e+38)), vcast_vf_f_rvvm1_sleef(__builtin_inff()), r); + rvv_sp_vopmask_rvvm1_sleef ocore = vle_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(0x1.2ced32p+126)); + if(!__builtin_expect(!!(vtestallones_i_vo32_rvvm1_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_rvvm1_sleef(ocore, r, Sleef_logfx_u10rvvm1(d)); r = vreinterpret_vf_vm_rvvm1_sleef(vor_vm_vo32_vm_rvvm1_sleef(vgt_vo_vf_vf_rvvm1_sleef(vcast_vf_f_rvvm1_sleef(-1), d), vreinterpret_vm_vf_rvvm1_sleef(r))); r = vsel_vf_vo_vf_vf_rvvm1_sleef(veq_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(-1)), vcast_vf_f_rvvm1_sleef(-__builtin_inff()), r); r = vsel_vf_vo_vf_vf_rvvm1_sleef(visnegzero_vo_vf_rvvm1_sleef(d), vcast_vf_f_rvvm1_sleef(-0.0f), r); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_sve.h b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_sve.h index d4b15d0f218..1fc666a6c76 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_sve.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/generated/sleefinline_sve.h @@ -1,8 +1,11 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) -// This file is generated by SLEEF 3.6.1 +// This file is generated by SLEEF 3.9.0 + +/* #undef SLEEF_FLOAT128_IS_IEEEQP */ +#define SLEEF_LONGDOUBLE_IS_IEEEQP #ifndef SLEEF_ALWAYS_INLINE #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) @@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = { 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, + 0, 0, 0, 0, }; static const float Sleef_rempitabsp[] = { @@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = { 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, + 0, 0, 0, 0, }; #endif // #ifndef __SLEEF_REMPITAB__ -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -1833,13 +1830,13 @@ static SLEEF_ALWAYS_INLINE vfloat_sve_sleef vmlsubadd_vf_vf_vf_vf_sve_sleef(vflo static SLEEF_ALWAYS_INLINE vdouble_sve_sleef vrev21_vd_vd_sve_sleef(vdouble_sve_sleef x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); } static 
SLEEF_ALWAYS_INLINE vdouble_sve_sleef vreva2_vd_vd_sve_sleef(vdouble_sve_sleef vd_sve_sleef) { - svint64_t x = svindex_s64(((svcntd())-1), -1); + svint64_t x = svindex_s64((((int)svcntd())-1), -1); x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x)); return svtbl_f64(vd_sve_sleef, svreinterpret_u64_s64(x)); } static SLEEF_ALWAYS_INLINE vfloat_sve_sleef vreva2_vf_vf_sve_sleef(vfloat_sve_sleef vf) { - svint32_t x = svindex_s32(((svcntw())-1), -1); + svint32_t x = svindex_s32((((int)svcntw())-1), -1); x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x)); return svtbl_f32(vf, svreinterpret_u32_s32(x)); } @@ -3381,7 +3378,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_expdx_u10sve(vdouble_sve_sleef u = vldexp2_vd_vd_vi_sve_sleef(u, q); - u = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(709.78271114955742909217217426)), vcast_vd_d_sve_sleef(__builtin_inf()), u); + vopmask_sve_sleef o = vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(0x1.62e42fefa39efp+9)); + u = vsel_vd_vo_vd_vd_sve_sleef(o, vcast_vd_d_sve_sleef(__builtin_inf()), u); u = vreinterpret_vd_vm_sve_sleef(vandnot_vm_vo64_vm_sve_sleef(vlt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1000)), vreinterpret_vm_vd_sve_sleef(u))); return u; @@ -3498,13 +3496,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_sve_sleef expk_sve_sleef(vdouble2 } SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_powdx_u10sve(vdouble_sve_sleef x, vdouble_sve_sleef y) { - vopmask_sve_sleef yisint = visint_vo_vd_sve_sleef(y); vopmask_sve_sleef yisodd = vand_vo_vo_vo_sve_sleef(visodd_vo_vd_sve_sleef(y), yisint); vdouble2_sve_sleef d = ddmul_vd2_vd2_vd_sve_sleef(logk_sve_sleef(vabs_vd_vd_sve_sleef(x)), y); vdouble_sve_sleef result = expk_sve_sleef(d); - result = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(d), vcast_vd_d_sve_sleef(709.78271114955742909217217426)), vcast_vd_d_sve_sleef(__builtin_inf()), result); + vopmask_sve_sleef o = 
vgt_vo_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(d), vcast_vd_d_sve_sleef(0x1.62e42fefa39efp+9)); + result = vsel_vd_vo_vd_vd_sve_sleef(o, vcast_vd_d_sve_sleef(__builtin_inf()), result); result = vmul_vd_vd_vd_sve_sleef(result, vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)), @@ -3530,7 +3528,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_powdx_u10sve(vdouble_sve_sleef result = vsel_vd_vo_vd_vd_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(y, vcast_vd_d_sve_sleef(0)), veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(1))), vcast_vd_d_sve_sleef(1), result); return result; - } static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble2_sve_sleef expk2_sve_sleef(vdouble2_sve_sleef d) { @@ -4018,7 +4015,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_log1pdx_u10sve(vdouble_sve_slee vdouble_sve_sleef r = vadd_vd_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(s), vd2gety_vd_vd2_sve_sleef(s)); - r = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(1e+307)), vcast_vd_d_sve_sleef(__builtin_inf()), r); + vopmask_sve_sleef ocore = vle_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(0x1.c7b1f3cac7433p+1019)); + if(!__builtin_expect(!!(vtestallones_i_vo64_sve_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_sve_sleef(ocore, r, Sleef_logdx_u10sve(d)); r = vsel_vd_vo_vd_vd_sve_sleef(vor_vo_vo_vo_sve_sleef(vlt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1)), visnan_vo_vd_sve_sleef(d)), vcast_vd_d_sve_sleef(__builtin_nan("")), r); r = vsel_vd_vo_vd_vd_sve_sleef(veq_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1)), vcast_vd_d_sve_sleef(-__builtin_inf()), r); r = vsel_vd_vo_vd_vd_sve_sleef(visnegzero_vo_vd_sve_sleef(d), vcast_vd_d_sve_sleef(-0.0), r); @@ -4098,7 +4096,7 @@ SLEEF_INLINE SLEEF_CONST vint_sve_sleef Sleef_expfrexpdx_sve(vdouble_sve_sleef x vint_sve_sleef ret = vcastu_vi_vm_sve_sleef(vreinterpret_vm_vd_sve_sleef(x)); ret = vsub_vi_vi_vi_sve_sleef(vand_vi_vi_vi_sve_sleef(vsrl_vi_vi_i_sve_sleef(ret, 20), 
vcast_vi_i_sve_sleef(0x7ff)), vcast_vi_i_sve_sleef(0x3fe)); - ret = vsel_vi_vo_vi_vi_sve_sleef(vor_vo_vo_vo_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)), visnan_vo_vd_sve_sleef(x)), visinf_vo_vd_sve_sleef(x)), vcast_vi_i_sve_sleef(0), ret); + ret = vsel_vi_vo_vi_vi_sve_sleef(vcast_vo32_vo64_sve_sleef(vor_vo_vo_vo_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)), visnan_vo_vd_sve_sleef(x)), visinf_vo_vd_sve_sleef(x))), vcast_vi_i_sve_sleef(0), ret); return ret; } @@ -4497,14 +4495,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_erfcdx_u15sve(vdouble_sve_sleef return r; } -#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -4983,6 +4973,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_sve_sleef Sleef_tanfx_u35sve(vfloat_sve_sleef d) if (__builtin_expect(!!(vtestallones_i_vo32_sve_sleef(vlt_vo_vf_vf_sve_sleef(vabs_vf_vf_sve_sleef(d), vcast_vf_f_sve_sleef(125.0f*0.5f)))), 1)) { q = vrint_vi2_vf_sve_sleef(vmul_vf_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef((float)(2 * 0.318309886183790671537767526745028724)))); u = vcast_vf_vi2_sve_sleef(q); + x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-3.1414794921875f*0.5f), x); x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-0.00011315941810607910156f*0.5f), x); x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-1.9841872589410058936e-09f*0.5f), x); @@ -6384,7 
+6375,8 @@ SLEEF_INLINE SLEEF_CONST vfloat_sve_sleef Sleef_log1pfx_u10sve(vfloat_sve_sleef vfloat_sve_sleef r = vadd_vf_vf_vf_sve_sleef(vf2getx_vf_vf2_sve_sleef(s), vf2gety_vf_vf2_sve_sleef(s)); - r = vsel_vf_vo_vf_vf_sve_sleef(vgt_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(1e+38)), vcast_vf_f_sve_sleef(__builtin_inff()), r); + vopmask_sve_sleef ocore = vle_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(0x1.2ced32p+126)); + if(!__builtin_expect(!!(vtestallones_i_vo32_sve_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_sve_sleef(ocore, r, Sleef_logfx_u10sve(d)); r = vreinterpret_vf_vm_sve_sleef(vor_vm_vo32_vm_sve_sleef(vgt_vo_vf_vf_sve_sleef(vcast_vf_f_sve_sleef(-1), d), vreinterpret_vm_vf_sve_sleef(r))); r = vsel_vf_vo_vf_vf_sve_sleef(veq_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(-1)), vcast_vf_f_sve_sleef(-__builtin_inff()), r); r = vsel_vf_vo_vf_vf_sve_sleef(visnegzero_vo_vf_sve_sleef(d), vcast_vf_f_sve_sleef(-0.0f), r); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CHANGELOG.md b/src/jdk.incubator.vector/unix/native/libsleef/upstream/CHANGELOG.md index 750955336f5..cdabe63eb05 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CHANGELOG.md +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/CHANGELOG.md @@ -1,3 +1,52 @@ +## 3.8 - 2025-01-27 +The focus of this release has been to facilitate benchmarking in SLEEF. +It does so by providing a benchmarking tool and a plotting tool to postprocess +the results. +AArch64 self-hosted runners have been added to CI. Following this, the Linux and +compiler version have been updated. +Fix inaccuracy issues in a few functions, failures with cpp checks and a few +bugs. +Finally, the project has been extended with a blog section and its first blog +[post](https://sleef.org/2024/10/02/new-pulse.html). + +### Added +- Add benchmark and plotting tool by @joanaxcruz in #589, #597, #608 and #609 +- Use Arm-hosted runners by @blapie in #581 +- Add blog section and first post. 
by @blapie in #582 + +### Changed +- Update GH runners to Ubuntu 24.04 and GCC14 by @blapie in #598, #599 and #601 + +### Fixed +- Fix cbrt on AArch32, and atanf(+-0) with gcc-13 by @shibatch in #592 +- Fix oflow bound in log1p(f), exp and pow by @blapie in #604 and #606 +- Work around removal of some PowerPC intrinsics in GCC 15 by @musicinmybrain in #612 +- Fix errors reported by cppcheck by @blapie in #595 + +## 3.7 - 2024-09-17 + +The focus of this release has been to meet open-source community standards. It +does so by providing Contributing Guidelines, Issues and Pull-Requests +templates. Additionally, the documentation has been reworked to improve +navigation (via search bar, side menu/panel, eased navigation on GitHub, ...) +and maintainability (reduced line count, mostly markdown sources, ...). The +website rendering is now delegated to a template customisable theme. See the +new website at [sleef.org](https://sleef.org/), and [docs/](./docs) for the +GitHub-rendered documentation. The release also provides various bug fixes on +several targets, for CPU detection and in the benchmark infrastructure. + +### Added +- Add issue and PR templates. 
by @blapie in https://github.com/shibatch/sleef/pull/565 + +### Changed +- Adjust scheduling of GHA workflows by @blapie in https://github.com/shibatch/sleef/pull/553 +- Port documentation from html to markdown by @blapie in https://github.com/shibatch/sleef/pull/564 +- Update acosh documentation by @joanaxcruz in https://github.com/shibatch/sleef/pull/572 + +### Fixed +- S/390: Use getauxval for detecting VXE2 to fix #560 by @Andreas-Krebbel in https://github.com/shibatch/sleef/pull/561 +- Revive micro-benchmarks for vector functions by @joanaxcruz in https://github.com/shibatch/sleef/pull/571 + ## 3.6.1 - 2024-06-10 This patch release provides important bug fixes, including a fix diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/CMakeLists.txt index a37f1ac1046..2a95aabc5ba 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/CMakeLists.txt @@ -1,8 +1,14 @@ cmake_minimum_required(VERSION 3.18) -project(SLEEF VERSION 3.6.1 LANGUAGES C) + +set(SLEEF_VERSION 3.9.0) + +message(STATUS "Configuring SLEEF ${SLEEF_VERSION}") +project(SLEEF VERSION ${SLEEF_VERSION} LANGUAGES C CXX) set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR}) +set(CMAKE_CXX_STANDARD 20) + # Options option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF) @@ -13,28 +19,96 @@ option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF) option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON) option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF) option(SLEEF_BUILD_TESTS "Tests will be built." ON) +option(SLEEF_BUILD_BENCH "Bench will be built." OFF) +option(SLEEF_BUILD_BENCH_REF "Benchmark script for reference (e.g. system libm) will be built." 
OFF) option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF) +option(SLEEF_ENFORCE_DFT "Build fails if DFT is not built" OFF) +option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF) + option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF) option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON) option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF) option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF) +option(SLEEF_ENABLE_TESTER "Enable testing libm with tester" OFF) option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF) + option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF) +option(SLEEF_ENABLE_TESTER4 "Enable testing with tester4" ON) +option(SLEEF_ENFORCE_TESTER4 "Build fails if tester4 is not available" OFF) + option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF) option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF) option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF) option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF) +option(SLEEF_ENABLE_TLFLOAT "Enable use of TLFloat library" ON) + option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF) +set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl") option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF) -option(SLEEF_ENABLE_CXX "Enable C++" OFF) + +option(SLEEF_BUILD_WITH_LIBM "build libsleef with libm, can turn off on Windows to solve mutiple math functions issue." 
ON) + +option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF) +option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF) + +option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF) +option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF) + +option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF) +option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF) +option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_AVX "Disable AVX" OFF) +option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF) +option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF) +option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF) +option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF) +option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF) +option(SLEEF_DISABLE_SVE "Disable SVE" OFF) +option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF) +option(SLEEF_DISABLE_VSX "Disable VSX" OFF) +option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF) +option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF) +option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_VXE "Disable VXE" OFF) +option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF) +option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF) +option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF) +option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF) +option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF) 
+option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF) + +option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF) + +option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF) +option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF) # +if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR + (NOT CMAKE_C_COMPILER_VERSION VERSION_EQUAL CMAKE_CXX_COMPILER_VERSION)) + message(FATAL_ERROR "Different versions of C compiler and C++ compiler") +endif() + +# + +if (SLEEF_BUILD_BENCH_REF) + if (NOT SLEEF_BUILD_BENCH) + message(FATAL_ERROR "SLEEF_BUILD_BENCH must be on when SLEEF_BUILD_BENCH_REF is enabled.") + endif () + if(NOT CMAKE_SYSTEM_NAME MATCHES Linux) + message(FATAL_ERROR "Libm benchmarking not supported in this OS.") + endif() +endif () + if (DEFINED SLEEF_BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS}) endif () @@ -133,13 +207,11 @@ set(COSTOVERRIDE_RVVM2NOFMA 20) # enable_testing() - -if (SLEEF_ENABLE_CXX) - enable_language(CXX) -endif() +enable_language(CXX) if (SLEEF_ENABLE_CUDA) enable_language(CUDA) + set(CMAKE_CUDA_ARCHITECTURES all-major) endif() # For specifying installation directories @@ -197,6 +269,7 @@ include(Configure.cmake) configure_file( ${PROJECT_SOURCE_DIR}/sleef-config.h.in ${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY) +include_directories(AFTER "${PROJECT_BINARY_DIR}/include") # We like to have a documented index of all targets in the project. 
The # variables listed below carry the names of the targets defined throughout @@ -228,7 +301,9 @@ set(TARGET_MKALIAS "mkalias") # Generates static library common # Defined in src/common/CMakeLists.txt via command add_library set(TARGET_LIBCOMMON_OBJ "common") -set(TARGET_LIBARRAYMAP_OBJ "arraymap") +set(TARGET_PSHA_OBJ "psha_obj") +set(TARGET_TESTERUTIL_OBJ "testerutil_obj") +set(TARGET_QTESTERUTIL_OBJ "qtesterutil_obj") # Function used to add an executable that is executed on host function(add_host_executable TARGETNAME) @@ -239,15 +314,23 @@ function(add_host_executable TARGETNAME) target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}") target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}") endif() + elseif (DEFINED ENV{SLEEF_TARGET_EXEC_USE_QEMU}) + if($ENV{SLEEF_TARGET_EXEC_USE_QEMU}) + add_executable(${TARGETNAME} ${ARGN}) + endif() else() add_executable(${TARGETNAME} IMPORTED GLOBAL) - set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME}) + if(CMAKE_HOST_WIN32) + set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME}.exe) + else() + set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME}) + endif() endif() endfunction() function(host_target_AAVPCS_definitions TARGETNAME) if (NOT CMAKE_CROSSCOMPILING) - target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1) +# target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1) endif() endfunction() @@ -303,6 +386,7 @@ if(SLEEF_SHOW_CONFIG) message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}") message(" CMake: ${CMAKE_VERSION}") message(" Make program: ${CMAKE_MAKE_PROGRAM}") + message(" CMake build type: ${CMAKE_BUILD_TYPE}") if(CMAKE_CROSSCOMPILING) message(" Crosscompiling SLEEF.") message(" Native build dir: ${NATIVE_BUILD_DIR}") @@ -317,6 +401,7 @@ if(SLEEF_SHOW_CONFIG) message(STATUS 
"GMP : " ${LIBGMP}) message(STATUS "RT : " ${LIBRT}) message(STATUS "FFTW3 : " ${LIBFFTW3}) + message(STATUS "FFTW3F : " ${LIBFFTW3F}) message(STATUS "OPENSSL : " ${OPENSSL_VERSION}) message(STATUS "SDE : " ${SDE_COMMAND}) if (SLEEF_BUILD_INLINE_HEADERS) @@ -337,3 +422,4 @@ if(SLEEF_SHOW_CONFIG) message(STATUS "Building SLEEF with AArch64 Vector PCS support") endif() endif(SLEEF_SHOW_CONFIG) + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CONTRIBUTORS.md b/src/jdk.incubator.vector/unix/native/libsleef/upstream/CONTRIBUTORS.md deleted file mode 100644 index c0f269418f3..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/CONTRIBUTORS.md +++ /dev/null @@ -1,27 +0,0 @@ -# List of contributors - -These lists are not exhaustive and only provide most relevant contact information. -For an exhausitive list of contributors please refer to the -[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors). - -## Maintainers - -| Name | Affiliation | Github profile | -| -------------------- | ----------------------- | ---------------------------------- | -| Pierre Blanchard | Arm Ltd. | https://github.com/blapie | -| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz | -| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay | -| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch | - -## Contributors - -| Name | Affiliation | Github profile | -| -------------------- | ----------------------- | ---------------------------------- | -| Anonymous | | https://github.com/friendlyanon | -| Diana Bite | Former Arm Ltd. | https://github.com/diaena | -| Ludovic Henry | Rivos Inc. | https://github.com/luhenry | -| Martin Krastev | Chaos Group | https://github.com/blu | -| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy | -| Kerry McLaughlin | Arm Ltd. 
| https://github.com/kmclaughlin-arm | -| Alexandre Mutel | Unity Technologies | https://github.com/xoofx | -| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm | diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/Configure.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/Configure.cmake index d78fc0bccaa..ea4d1669e00 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/Configure.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/Configure.cmake @@ -1,5 +1,6 @@ include(CheckCCompilerFlag) include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) include(CheckTypeSize) include(CheckLanguage) @@ -11,35 +12,39 @@ if (SLEEF_BUILD_STATIC_TEST_BINS) set(CMAKE_EXE_LINKER_FLAGS "-static") endif() -set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl") -if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL) - if (SLEEF_BUILD_STATIC_TEST_BINS) - set(OPENSSL_USE_STATIC_LIBS TRUE) - endif() - find_package(OpenSSL) - if (OPENSSL_FOUND) - set(SLEEF_OPENSSL_FOUND TRUE) - set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES}) - # Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically. - # This is a known issue https://github.com/openssl/openssl/issues/13872. +if (NOT SLEEF_DISABLE_SSL) + if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL) if (SLEEF_BUILD_STATIC_TEST_BINS) - string(REGEX REPLACE - "-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive" - SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}") + set(OPENSSL_USE_STATIC_LIBS TRUE) + endif() + find_package(OpenSSL) + if (OPENSSL_FOUND) + set(SLEEF_OPENSSL_FOUND TRUE) + set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES}) + # Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically. + # This is a known issue https://github.com/openssl/openssl/issues/13872. 
+ if (SLEEF_BUILD_STATIC_TEST_BINS) + string(REGEX REPLACE + "-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive" + SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}") + endif() + set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION}) + set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES}) + set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR}) + endif() + else() + # find_package cannot find OpenSSL when cross-compiling + find_library(LIBSSL ssl) + find_library(LIBCRYPTO crypto) + if (LIBSSL AND LIBCRYPTO) + set(SLEEF_OPENSSL_FOUND TRUE) + set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES}) + set(SLEEF_OPENSSL_VERSION ${LIBSSL}) endif() - set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION}) - set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES}) - set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR}) endif() else() - # find_package cannot find OpenSSL when cross-compiling - find_library(LIBSSL ssl) - find_library(LIBCRYPTO crypto) - if (LIBSSL AND LIBCRYPTO) - set(SLEEF_OPENSSL_FOUND TRUE) - set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES}) - set(SLEEF_OPENSSL_VERSION ${LIBSSL}) - endif() + set(SLEEF_OPENSSL_FOUND FALSE) + message(STATUS "Detection of OpenSSL is skipped since SLEEF_DISABLE_SSL is specified") endif() if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND) @@ -48,10 +53,20 @@ endif() # Some toolchains require explicit linking of the libraries following. 
find_library(LIB_MPFR mpfr) -find_library(LIBM m) +if(SLEEF_BUILD_WITH_LIBM) + find_library(LIBM m) +endif() find_library(LIBGMP gmp) find_library(LIBRT rt) + find_library(LIBFFTW3 fftw3) +find_library(LIBFFTW3F fftw3f) +find_library(LIBFFTW3_OMP fftw3_omp) +find_library(LIBFFTW3F_OMP fftw3f_omp) + +if (LIBFFTW3 AND LIBFFTW3F AND LIBFFTW3_OMP AND LIBFFTW3F_OMP) + set(SLEEF_LIBFFTW3_LIBRARIES ${LIBFFTW3} ${LIBFFTW3F} ${LIBFFTW3_OMP} ${LIBFFTW3F_OMP}) +endif() if (LIB_MPFR) find_path(MPFR_INCLUDE_DIR @@ -63,7 +78,7 @@ if (LIBFFTW3) find_path(FFTW3_INCLUDE_DIR NAMES fftw3.h ONLY_CMAKE_FIND_ROOT_PATH) -endif(LIBFFTW3) +endif() if (NOT LIBM) set(LIBM "") @@ -77,10 +92,77 @@ if (SLEEF_DISABLE_MPFR) set(LIB_MPFR "") endif() -if (SLEEF_DISABLE_SSL) - set(SLEEF_OPENSSL_FOUND FALSE) +# Include submodules + +set(SLEEF_SUBMODULE_INSTALL_DIR "${CMAKE_BINARY_DIR}/submodules") + +include(ExternalProject) +include(FindPkgConfig) + +if (NOT EXISTS "${PROJECT_SOURCE_DIR}/submodules") + file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/submodules") endif() +# Include TLFloat as a submodule + +if (SLEEF_ENABLE_TLFLOAT) + set(TLFLOAT_MINIMUM_VERSION 1.15.0) + set(TLFLOAT_GIT_TAG "fb0390157d5c8811fc2a5a6d7d8eac27261f06fb") + + set(TLFLOAT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/submodules/tlfloat") + set(TLFLOAT_INSTALL_DIR "${SLEEF_SUBMODULE_INSTALL_DIR}/tlfloat") + + set(TLFLOAT_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${TLFLOAT_INSTALL_DIR} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_LIBS=True -DBUILD_UTILS=False -DBUILD_TESTS=False) + + if (CMAKE_C_COMPILER) + list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER}) + endif() + + if (CMAKE_CXX_COMPILER) + list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_CXX_COMPILER:PATH=${CMAKE_CXX_COMPILER}) + endif() + + if (CMAKE_TOOLCHAIN_FILE) + list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) + endif() + + if (EXISTS "${TLFLOAT_SOURCE_DIR}/CMakeLists.txt") + # If the source code of tlfloat is already downloaded, use it + 
ExternalProject_Add(ext_tlfloat + SOURCE_DIR "${TLFLOAT_SOURCE_DIR}" + CMAKE_ARGS ${TLFLOAT_CMAKE_ARGS} + UPDATE_DISCONNECTED TRUE + ) + include_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/include") + link_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/lib") + set(TLFLOAT_LIBRARIES "tlfloat") + else() + pkg_search_module(TLFLOAT tlfloat) + + if (TLFLOAT_FOUND AND TLFLOAT_VERSION VERSION_GREATER_EQUAL TLFLOAT_MINIMUM_VERSION) + # If tlfloat is installed on the system + add_custom_target(ext_tlfloat ALL) + include_directories(BEFORE "${TLFLOAT_INCLUDE_DIRS}") + link_directories(BEFORE "${TLFLOAT_LIBDIR}") + message(STATUS "Found installed TLFloat " ${TLFLOAT_VERSION}) + else() + # Otherwise, download the source code + find_package(Git REQUIRED) + ExternalProject_Add(ext_tlfloat + GIT_REPOSITORY https://github.com/shibatch/tlfloat + GIT_TAG "${TLFLOAT_GIT_TAG}" + SOURCE_DIR "${TLFLOAT_SOURCE_DIR}" + CMAKE_ARGS ${TLFLOAT_CMAKE_ARGS} + UPDATE_DISCONNECTED TRUE + ) + + include_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/include") + link_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/lib") + set(TLFLOAT_LIBRARIES "tlfloat") + endif() + endif() +endif(SLEEF_ENABLE_TLFLOAT) + # Force set default build type if none was specified # Note: some sleef code requires the optimisation flags turned on if(NOT CMAKE_BUILD_TYPE) @@ -124,7 +206,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") set(COMPILER_SUPPORTS_NEON32VFPV4 1) set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.") set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx") @@ -149,7 +231,7 @@ if(NOT CLANG_EXE_PATH) set(CLANG_EXE_PATH ${CMAKE_C_COMPILER}) else() # Else we may find clang on the path? 
- find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9") + find_program(CLANG_EXE_PATH NAMES clang "clang-25" "clang-24" "clang-23" "clang-22" "clang-21" "clang-20" "clang-19" "clang-18" "clang-17") endif() endif() @@ -188,7 +270,7 @@ set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs") set(FLAGS_OTHERS "") # All variables storing compiler flags should be prefixed with FLAGS_ -if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)") +if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang|QCC)") # Always compile sleef with -ffp-contract. set(FLAGS_STRICTMATH "-ffp-contract=off") set(FLAGS_FASTMATH "-ffast-math") @@ -209,13 +291,13 @@ if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)") # Warning flags. set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result") - if(CMAKE_C_COMPILER_ID MATCHES "GNU") + if(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)") # The following compiler option is needed to suppress the warning # "AVX vector return without AVX enabled changes the ABI" at # src/arch/helpervecext.h:88 string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi") set(FLAGS_ENABLE_NEON32 "-mfpu=neon") - endif(CMAKE_C_COMPILER_ID MATCHES "GNU") + endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)") if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO) if (NOT SLEEF_LLVM_AR_COMMAND) @@ -296,7 +378,7 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "Intel") endif() set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}") -if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99) +if(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99) set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}") else() set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}") @@ -306,9 +388,17 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU") set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp") 
endif() +if(QNX AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + #set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -march=armv8-a ") + #set(DFT_C_FLAGS "${DFT_C_FLAGS} -march=armv8-a ") +endif() + if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU") set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse") set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "QCC") + set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse") + set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang") set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse") set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse") @@ -328,9 +418,6 @@ endif() # Long double -option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF) -option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF) - if(NOT SLEEF_DISABLE_LONG_DOUBLE) CHECK_TYPE_SIZE("long double" LD_SIZE) if(LD_SIZE GREATER "9") @@ -351,9 +438,6 @@ endif() # float128 -option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF) -option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF) - if(NOT SLEEF_DISABLE_FLOAT128) CHECK_C_SOURCE_COMPILES(" int main() { __float128 r = 1; @@ -373,10 +457,37 @@ if(COMPILER_SUPPORTS_FLOAT128) }" COMPILER_SUPPORTS_QUADMATH) endif() -# SSE2 +if(COMPILER_SUPPORTS_FLOAT128) + if (CMAKE_CXX_COMPILER_TARGET) + set(CMAKE_REQUIRED_FLAGS "--target=${CMAKE_CXX_COMPILER_TARGET}") + endif() + CHECK_CXX_SOURCE_COMPILES(" +#include +struct s { long long x, y; }; +int main(int argc, char **argv) { + constexpr s a = std::bit_cast(__float128(0.1234)*__float128(56.789)); + static_assert((a.x ^ a.y) == 0xc7d695c93a4e2b71LL); + __float128 i = argc; + return (int)i; +} +" SLEEF_FLOAT128_IS_IEEEQP) + set(CMAKE_REQUIRED_FLAGS) +endif() 
-option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF) -option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF) +if (CMAKE_CXX_COMPILER_TARGET) + set(CMAKE_REQUIRED_FLAGS "--target=${CMAKE_CXX_COMPILER_TARGET}") +endif() +CHECK_CXX_SOURCE_COMPILES(" +#include <bit> +struct s { long long x, y; }; +int main(void) { + constexpr s a = std::bit_cast<s>((long double)0.1234*(long double)56.789); + static_assert((a.x ^ a.y) == 0xc7d695c93a4e2b71LL); +} +" SLEEF_LONGDOUBLE_IS_IEEEQP) +set(CMAKE_REQUIRED_FLAGS) + +# SSE2 if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}") @@ -397,9 +508,6 @@ endif() # SSE 4.1 -option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF) -option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF) - if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}") CHECK_C_SOURCE_COMPILES(" @@ -419,9 +527,6 @@ endif() # AVX -option(SLEEF_ENFORCE_AVX "Disable AVX" OFF) -option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF) - if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}") CHECK_C_SOURCE_COMPILES(" @@ -441,9 +546,6 @@ endif() # FMA4 -option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF) -option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF) - if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}") CHECK_C_SOURCE_COMPILES(" @@ -463,9 +565,6 @@ endif() # AVX2 -option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF) -option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF) - if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}") CHECK_C_SOURCE_COMPILES(" @@ -490,9 +589,6 @@ endif() # AVX512F -option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF) 
-option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF) - if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}") CHECK_C_SOURCE_COMPILES(" @@ -522,9 +618,6 @@ endif() # SVE -option(SLEEF_DISABLE_SVE "Disable SVE" OFF) -option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF) - # Darwin does not support SVE yet (see issue #474), # therefore we disable SVE on Darwin systems. if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") @@ -546,15 +639,12 @@ endif() # VSX -option(SLEEF_DISABLE_VSX "Disable VSX" OFF) -option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF) - if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}") CHECK_C_SOURCE_COMPILES(" #include <altivec.h> - #ifndef __LITTLE_ENDIAN__ - #error \"Only VSX(ISA2.07) little-endian mode is supported \" + #if !defined(__LITTLE_ENDIAN__) && !defined(_AIX) + #error \"Only VSX(ISA2.07) little-endian mode and AIX is supported \" #endif int main() { vector double d; @@ -576,9 +666,6 @@ endif() # VSX3 -option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF) -option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF) - if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}") CHECK_C_SOURCE_COMPILES(" @@ -605,9 +692,6 @@ endif() # IBM Z -option(SLEEF_DISABLE_VXE "Disable VXE" OFF) -option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF) - if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}") CHECK_C_SOURCE_COMPILES(" @@ -629,9 +713,6 @@ endif() # -option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF) -option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF) - if(SLEEF_ARCH_S390X AND NOT 
SLEEF_DISABLE_VXE2) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}") CHECK_C_SOURCE_COMPILES(" @@ -653,15 +734,26 @@ endif() # RVVM1 -option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF) -option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF) - if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}") CHECK_C_SOURCE_COMPILES(" - #include <riscv_vector.h> - int main() { - vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }" + #ifdef __riscv_v + #if __riscv_v < 1000000 + #error \"RVV version 1.0 not supported\" + #endif + #else + #error \"RVV not supported\" + #endif + + #ifdef __riscv_v_intrinsic + #if __riscv_v_intrinsic < 12000 + #error \"RVV instrinsics version 0.12 not supported\" + #endif + #else + #error \"RVV intrinsics not supported\" + #endif + + int main(void) { return 0; }" COMPILER_SUPPORTS_RVVM1) if(COMPILER_SUPPORTS_RVVM1) @@ -675,15 +767,26 @@ endif() # RVVM2 -option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF) -option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF) - if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2) string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}") CHECK_C_SOURCE_COMPILES(" - #include <riscv_vector.h> - int main() { - vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }" + #ifdef __riscv_v + #if __riscv_v < 1000000 + #error \"RVV version 1.0 not supported\" + #endif + #else + #error \"RVV not supported\" + #endif + + #ifdef __riscv_v_intrinsic + #if __riscv_v_intrinsic < 12000 + #error \"RVV instrinsics version 0.12 not supported\" + #endif + #else + #error \"RVV intrinsics not supported\" + #endif + + int main(void) { return 0; }" COMPILER_SUPPORTS_RVVM2) if(COMPILER_SUPPORTS_RVVM2) @@ -697,18 +800,14 @@ endif() # CUDA -option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF) - if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER) message(FATAL_ERROR 
"SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler") endif() # OpenMP -option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF) -option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF) - if(NOT SLEEF_DISABLE_OPENMP) + set(CMAKE_REQUIRED_FLAGS) find_package(OpenMP) # Check if compilation with OpenMP really succeeds # It might not succeed even though find_package(OpenMP) succeeds. @@ -796,6 +895,7 @@ set(CMAKE_REQUIRED_LIBRARIES) # Save the default C flags set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) +set(ORG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ## @@ -838,10 +938,6 @@ if(SLEEF_SHOW_ERROR_LOG) endif() endif(SLEEF_SHOW_ERROR_LOG) -if (MSVC OR SLEEF_CLANG_ON_WINDOWS) - set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC -endif() - ## # Set common definitions diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile b/src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile new file mode 100644 index 00000000000..091303bc187 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile @@ -0,0 +1,247 @@ +pipeline { + agent { label 'jenkinsfile' } + + stages { + stage('Preamble') { + parallel { + stage('x86_64 linux clang-19-lto') { + agent { label 'x86_64 && ubuntu24 && avx512f' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "x86_64 clang-19 with LTO on" `hostname` + export CC=clang-19 + export CXX=clang++-19 + mkdir build + cd build + cmake .. 
-GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEFDFT_ENABLE_STREAM=True -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True -DSLEEF_ENABLE_LTO=True -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld-19" + cmake -E time ninja + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + + stage('x86_64 linux clang-19-asan') { + agent { label 'x86_64 && ubuntu24 && avx512f' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "x86_64 clang-19 with ASAN on" `hostname` + export CC=clang-19 + export CXX=clang++-19 + mkdir build + cd build + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEFDFT_ENABLE_STREAM=True -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True -DSLEEF_ASAN=True + cmake -E time ninja + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + + stage('x86_64 linux gcc-13') { + agent { label 'x86_64 && ubuntu24 && cuda' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "x86_64 gcc-13 on" `hostname` + export CC=gcc-13 + export CXX=g++-13 + export CUDACXX=/opt/cuda-12.6/bin/nvcc + mkdir build + cd build + cmake .. 
-GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENABLE_CUDA=True -DSLEEF_ENFORCE_CUDA=True -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True + cmake -E time ninja + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + + stage('x86_64 windows clang') { + agent { label 'windows11 && vs2022' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + bat """ + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Auxiliary\\Build\\vcvars64.bat" + if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL% + call "winbuild-clang.bat" -DCMAKE_BUILD_TYPE=Release -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=True -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENABLE_TESTER4=True -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_DISABLE_SSL=True + if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL% + ctest -j 4 --output-on-failure + exit /b %ERRORLEVEL% + """ + } + } + + stage('x86_64 windows vs2022') { + agent { label 'windows11 && vs2022' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + bat """ + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Auxiliary\\Build\\vcvars64.bat" + if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL% + call "winbuild-msvc.bat" -DCMAKE_BUILD_TYPE=Release -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=True -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True + 
if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL% + ctest -j 4 --output-on-failure + exit /b %ERRORLEVEL% + """ + } + } + + stage('riscv linux gcc-14') { + agent { label 'riscv && ubuntu23' } + options { skipDefaultCheckout() } + steps { + script { + System.setProperty("org.jenkinsci.plugins.durabletask.BourneShellScript.HEARTBEAT_CHECK_INTERVAL", "86400"); + } + cleanWs() + checkout scm + sh ''' + echo "riscv gcc-14 on" `hostname` + export CC=gcc-14.2.0 + export CXX=g++-14.2.0 + mkdir build + cd build + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=False -DSLEEF_ENFORCE_DFT=False -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_RVVM1=True -DSLEEF_ENFORCE_RVVM2=True + cmake -E time oomstaller ninja -j `nproc` + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + + stage('arm32 linux gcc-12') { + agent { label 'armv7 && debian12' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "arm32 gcc-12 on" `hostname` + export CC=gcc-12 + export CXX=g++-12 + mkdir build + cd build + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False + cmake -E time oomstaller ninja -j `nproc` + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + + stage('aarch64 linux clang-19') { + agent { label 'aarch64 && ubuntu24 && apple' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "aarch64 clang-19 on" `hostname` + export CC=clang-19 + export CXX=clang++-19 + mkdir build + cd build + cmake .. 
-GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SVE=TRUE -DEMULATOR=qemu-aarch64-static -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENABLE_LTO=True -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld-19" + cmake -E time oomstaller ninja -j `nproc` + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ''' + } + } + + stage('aarch64 linux gcc-14') { + agent { label 'aarch64 && ubuntu24 && apple' } + options { skipDefaultCheckout() } + steps { + cleanWs() + checkout scm + sh ''' + echo "aarch64 gcc-14 on" `hostname` + export CC=gcc-14 + export CXX=g++-14 + mkdir build + cd build + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SVE=TRUE -DEMULATOR=qemu-aarch64-static -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False + cmake -E time oomstaller ninja -j `nproc` + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ''' + } + } + + stage('cross-ppc64el gcc') { + agent { label 'x86_64 && ubuntu24 && cuda' } + steps { + cleanWs() + checkout scm + sh ''' + echo "Cross ppc64el gcc on" `hostname` + rm -rf build-native + mkdir build-native + cd build-native + cmake -GNinja .. -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE + cmake -E time ninja + cd .. + mkdir build + cd build + cmake -GNinja .. 
-DCMAKE_TOOLCHAIN_FILE=../toolchains/ppc64el-gcc.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENFORCE_TESTER3=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_VSX=True -DSLEEF_ENFORCE_VSX3=True + cmake -E time ninja + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + export LD_LIBRARY_PATH=/usr/powerpc64le-linux-gnu/lib + ctest -j `nproc` + ninja install + ''' + } + } + + stage('cross-s390x gcc') { + agent { label 'x86_64 && ubuntu24 && cuda' } + steps { + cleanWs() + checkout scm + sh ''' + echo "Cross s390x gcc on" `hostname` + rm -rf build-native + mkdir build-native + cd build-native + cmake -GNinja .. -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE + cmake -E time ninja + cd .. + mkdir build + cd build + cmake -GNinja .. -DCMAKE_TOOLCHAIN_FILE=../toolchains/s390x-gcc.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENFORCE_TESTER3=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_VXE=True -DSLEEF_ENFORCE_VXE2=True + cmake -E time ninja + export OMP_WAIT_POLICY=passive + export CTEST_OUTPUT_ON_FAILURE=TRUE + ctest -j `nproc` + ninja install + ''' + } + } + } + } + } +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.adoc b/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.adoc new file mode 100644 index 00000000000..38144549f0f --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.adoc @@ -0,0 +1,129 @@ +== SLEEF - SIMD Library for Evaluating Elementary Functions + +image:http://img.shields.io/badge/DOI-10.1109/TPDS.2019.2960333-blue.svg[TPDS, link=https://ieeexplore.ieee.org/document/8936472] + +SLEEF is a 
library that implements vectorized versions of C standard +math functions. This library also includes DFT subroutines. + +* *Web Page:* https://sleef.org/ +* *Sources:* https://github.com/shibatch/sleef + +== Supported environment + +=== Test matrix + +The following table summarizes currently supported OSes and compilers. + +[cols="1,1,1,1,1,1,1,1,1"] +|=== +| 2+|Linux 4+|Windows 2+|Mac +| |gcc |llvm |MSVC |Clang |MinGW |Cygwin |Clang |GCC +|x86_64 |✔ |✔ |✔ |✔ |✔ |❓ |✔ |❓ +|RISC-V 64 |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A +|AArch64 |✔ |✔ |❌ |❌ |❌ |❌ |✔ |❓ +|POWER |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A +|S390X |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A +|AArch32 |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A +|=== + +✔ : Tested on CI, ❓ : Not tested, ❌ : Not supported + + +== How to build SLEEF + +The library itself does not have any additional dependency. + +In order to build SLEEF, you need CMake 3.18+, and C and C++ compilers of the same version. +It is also recommended to have the following tools. + +* Ninja +* Git + +https://github.com/shibatch/tlfloat[TLFloat] is automatically downloaded if no suitable version is found on your system. + +Some tests require: + +* libssl and libcrypto, that can be provided by installing openssl. +* libm, libgmp and libmpfr +* libfftw. + + +The build procedure is as follows. + +[arabic] +. Check out the source code from our GitHub repository + +.... +git clone https://github.com/shibatch/sleef +.... + +[arabic, start=2] +. Make a separate directory to create an out-of-source build + +.... +cd sleef && mkdir build +.... + +[arabic, start=3] +. Run cmake to configure the project + +.... +cmake -S . -B build +.... + +By default this will generate shared libraries. In order to generate +static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`. + +For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`. + +[arabic, start=4] +. Run make to build the project + +.... +cmake --build build -j --clean-first +.... + +[arabic, start=5] +. 
Run tests using ctests + +.... +ctest --test-dir build -j +.... + +For more detailed build instructions please refer to +https://sleef.org/compile.xhtml#preliminaries[our web page]. + +== How to cross-compile SLEEF + +For more details, please refer to +https://sleef.org/compile.xhtml#cross[cross-compile SLEEF] + +== Install SLEEF + +=== From source + +Assuming the build instructions above were followed. + +[arabic, start=6] +. Install to specified directory `<prefix>` + +.... +cmake --install build --prefix=<prefix> +.... + +=== Uninstall + +In order to uninstall SLEEF library and headers run + +.... +sudo xargs rm -v < build/install_manifest.txt +.... + +== License + +The software is distributed under the Boost Software License, Version +1.0. See accompanying file link:./LICENSE.txt[LICENSE.txt] or copy at +http://www.boost.org/LICENSE_1_0.txt. Contributions to this project are +accepted under the same license. + +Copyright © 2010-2025 SLEEF Project, Naoki Shibata and contributors. diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.md b/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.md deleted file mode 100644 index 5ba06531d89..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/README.md +++ /dev/null @@ -1,221 +0,0 @@ -# SLEEF - -![Github Actions](https://github.com/shibatch/sleef/actions/workflows/build_and_test.yml/badge.svg?event=push&branch=master) -[![DOI:10.1109/TPDS.2019.2960333](http://img.shields.io/badge/DOI-10.1109/TPDS.2019.2960333-blue.svg)](https://ieeexplore.ieee.org/document/8936472) -[![License](https://img.shields.io/badge/License-Boost_1.0-lightblue.svg)](https://www.boost.org/LICENSE_1_0.txt) -![CMake](https://img.shields.io/badge/cmake-v3.18+-yellow.svg) -[![Spack](https://img.shields.io/spack/v/sleef)](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef) -[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/sleef)](https://sourceforge.net/projects/sleef/) - -SLEEF is a library 
that implements vectorized versions of C standard math functions. This library also includes DFT subroutines. - -- **Web Page:** [https://sleef.org/][webpage_url] -- **Sources:** [https://github.com/shibatch/sleef][repo_url] - -## Supported environment - -### Test matrix - -The following table summarises currently supported vector extensions, compilers and OS-es. - -:green_circle: : Tested extensively in CI. - -:yellow_circle: : Tested partially in CI. - -:x: : Currently failing some tests in CI. - -:white_circle: : Not tested in CI. Might have passed tests in previous CI framework. - -[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage. -Compilation of SLEEF on previously supported environments might still be safe, we just cannot verify it yet. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OS/Compiler
LinuxmacOSWindows
Arch.Vector Extensionsgccllvmiccgccllvmgccllvm-gnullvm-msvcmsvc
x86_64SSE2, SSE4,
AVX, AVX2, AVX512F
:green_circle::green_circle::white_circle::white_circle::green_circle::white_circle::yellow_circle::white_circle::white_circle:
x86 32bit
(i386)
SSE:white_circle::white_circle::white_circle:N/A:white_circle::white_circle::white_circle::white_circle:
AArch64
(arm)
Neon, SVE:green_circle::green_circle:N/AN/A:green_circle:N/A:white_circle::white_circle::white_circle:
AArch32
(armhf)
NEON:green_circle::green_circle:N/AN/AN/A
PowerPC
(ppc64el)
VSX, VSX3:green_circle::green_circle:N/AN/AN/A
IBM/Z
(s390x)
VXE, VXE2:green_circle::green_circle:N/AN/AN/A
RISC-V
(riscv64)
RVV1, RVV2N/A (14+):green_circle:N/AN/AN/A
- -### Component support - -The above table is valid for libm in single, double and quadruple precision, as well as fast Discrete Fourier Transform (DFT). - -Generation of inline headers is also supported for most vector extensions. - -LTO is not tested in CI yet, except on Windows. - -### Compiler support - -Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions. - -Older versions should be supported too, while newer ones are either not tested or have known issues. - -Some compiler versions simply do not support certain vector extensions, for instance SVE is only supported for gcc version 9 onwards. - -Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards. - -Toolchain files provide some information on supported compiler versions. - -### OS support - -Only Linux distributions and macOS are fully tested in CI and thus officially supported. - -Building SLEEF for Windows on x86 machines was officially supported ( :white_circle: ), as of 3.5.1, -however it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544). -As a result tests for Windows on x86 only include DFT for now (other tests are disabled in build system), -but all components are built. - -Support for iOS and Android is only preliminary on AArch64. - -SVE is not supported on Darwin-based system and therefore automatically disabled by SLEEF on Darwin. - -### More on supported environment - -Refer to our web page for [more on supported environment][supported_env_url]. - -## Install SLEEF dependencies - -The library itself does not have any additional dependency. - -However some tests require: - -- libssl and libcrypto, that can be provided by installing openssl. -- libm, libgmp and libmpfr -- libfftw. - -These tests can be disabled if necessary. 
- -## How to build SLEEF - -We recommend relying on CMake as much as possible in the build process to ensure portability. -**CMake 3.18+** is the minimum required. - -1. Check out the source code from our GitHub repository - -``` -git clone https://github.com/shibatch/sleef -``` - -2. Make a separate directory to create an out-of-source build - -``` -cd sleef && mkdir build -``` - -3. Run cmake to configure the project - -``` -cmake -S . -B build -``` - -By default this will generate shared libraries. In order to generate static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`. - -For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`. - -4. Run make to build the project - -``` -cmake --build build -j --clean-first -``` - -5. Run tests using ctests - -``` -ctest --test-dir build -j -``` - -For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url]. - -## Install SLEEF - -### From source - -Assuming following instructions were followed. - -6. Install to specified directory `<prefix>` - -``` -cmake --install build --prefix=<prefix> -``` - -### Using Spack - -SLEEF can also be directly installed using Spack. - -``` -spack install sleef@master -``` - -### Uninstall - -In order to uninstall SLEEF library and headers run - -``` -sudo xargs rm -v < build/install_manifest.txt -``` - -## License - -The software is distributed under the Boost Software License, Version 1.0. -See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url]. -Contributions to this project are accepted under the same license. - -Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors.
- - - - -[webpage_url]: https://sleef.org/ -[build_info_url]: https://sleef.org/compile.xhtml -[supported_env_url]: https://sleef.org/index.xhtml#environment -[repo_url]: https://github.com/shibatch/sleef -[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt -[license_url]: http://www.boost.org/LICENSE_1_0.txt diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/include/sleefdft.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/include/sleefdft.h index 447131cd16d..0f2c3ac7077 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/include/sleefdft.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/include/sleefdft.h @@ -6,6 +6,7 @@ extern "C" { #endif +#include <stdio.h> #include <stdlib.h> #include <stdint.h> @@ -46,20 +47,24 @@ IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode); IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out); +IMPORT void SleefDFT_execute(struct SleefDFT *ptr, const void *in, void *out); + IMPORT void SleefDFT_dispose(struct SleefDFT *ptr); IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr); +IMPORT int SleefDFT_getPath(struct SleefDFT *ptr, char *pathStr, int pathStrSize); + +IMPORT void SleefDFT_setDefaultVerboseFP(FILE *fp); // IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode); +IMPORT int SleefDFT_savePlan(const char *pathStr); -#define SLEEF_PLAN_AUTOMATIC 0 #define SLEEF_PLAN_READONLY (1 << 0) #define SLEEF_PLAN_RESET (1 << 1) -#define SLEEF_PLAN_BUILDALLPLAN (1 << 2) +#define SLEEF_PLAN_AUTOMATIC (1 << 2) #define SLEEF_PLAN_NOLOCK (1 << 3) -#define SLEEF_PLAN_MEASURE (1 << 29) #define SLEEF_PLAN_REFERTOENVVAR (1 << 30) #undef IMPORT diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/sleef-config.h.in 
b/src/jdk.incubator.vector/unix/native/libsleef/upstream/sleef-config.h.in index 53faefcf57d..ebf9b22fab4 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/sleef-config.h.in +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/sleef-config.h.in @@ -6,6 +6,11 @@ #define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@ #define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@ +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + +#ifndef SLEEF_STATIC_LIBS #cmakedefine SLEEF_STATIC_LIBS +#endif #endif // SLEEF_CONFIG_H diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/CMakeLists.txt index 0e60e5368de..a8cfc868c22 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/CMakeLists.txt @@ -7,11 +7,19 @@ if (SLEEF_BUILD_TESTS AND NOT MINGW) endif() add_subdirectory("common") -if (SLEEF_BUILD_DFT) +if (SLEEF_BUILD_BENCH) +# add_subdirectory("libm-benchmarks") +endif() + +if (SLEEF_BUILD_DFT AND COMPILER_SUPPORTS_OPENMP) add_subdirectory("dft") if (SLEEF_BUILD_TESTS) add_subdirectory("dft-tester") endif() +else() + if (SLEEF_ENFORCE_DFT) + message(FATAL_ERROR "SLEEF_ENFORCE_DFT is specified and DFT is not built") + endif() endif() if (SLEEF_BUILD_QUAD) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx.h index 1b6da17e84d..84363c92ea7 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -110,7 +110,7 @@ static INLINE int vavailability_i(int name) { #endif // #if !defined(SLEEF_GENHEADER) -static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } +static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); @@ -516,10 +516,10 @@ static INLINE float vcast_f_vf(vfloat v) { #endif // -#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 }) -#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 }) -#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f }) -#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f }) +#define PNMASK _mm256_set_pd( -0.0, +0.0, -0.0, +0.0 ) +#define NPMASK _mm256_set_pd( +0.0, -0.0, +0.0, -0.0 ) +#define PNMASKf _mm256_set_ps( -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f ) +#define NPMASKf _mm256_set_ps( +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f ) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } @@ -629,7 +629,7 @@ static INLINE vmask vcast_vm_vi(vint vi) { } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), - _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); + _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); } static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; } diff 
--git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2.h index 4c0c5422170..bd6fbdb358f 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -96,7 +96,7 @@ static INLINE int vavailability_i(int name) { #endif // #if !defined(SLEEF_GENHEADER) -static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } +static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); @@ -168,7 +168,7 @@ static INLINE vmask vcastu_vm_vi(vint vi) { static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)), - _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0))); + _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { @@ -392,10 +392,10 @@ static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm2 // -#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 }) -#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 }) -#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f }) -#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f }) 
+#define PNMASK _mm256_set_pd( -0.0, +0.0, -0.0, +0.0 ) +#define NPMASK _mm256_set_pd( +0.0, -0.0, +0.0, -0.0 ) +#define PNMASKf _mm256_set_ps( -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f ) +#define NPMASKf _mm256_set_ps( +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f ) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } @@ -476,7 +476,7 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), - _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); + _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); } static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2_128.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2_128.h index 5233db1bfd7..2d1142b996e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2_128.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx2_128.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -96,7 +96,7 @@ static INLINE int vavailability_i(int name) { #endif // #if !defined(SLEEF_GENHEADER) -static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } +static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } @@ -371,10 +371,10 @@ static INLINE float vcast_f_vf(vfloat v) { // -#define PNMASK ((vdouble) { +0.0, -0.0 }) -#define NPMASK ((vdouble) { -0.0, +0.0 }) -#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f }) -#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f }) +#define PNMASK _mm_set_pd( -0.0, +0.0 ) +#define NPMASK _mm_set_pd( +0.0, -0.0 ) +#define PNMASKf _mm_set_ps( -0.0f, +0.0f, -0.0f, +0.0f ) +#define NPMASKf _mm_set_ps( +0.0f, -0.0f, +0.0f, -0.0f ) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx512f.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx512f.h index d59379163de..d8db60e8adc 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx512f.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperavx512f.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -102,7 +102,7 @@ static INLINE int vavailability_i(int name) { #endif // #if !defined(SLEEF_GENHEADER) -static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } +static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } #ifdef __INTEL_COMPILER static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperneon32.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperneon32.h index 042cad40e1d..e7598c96139 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperneon32.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperneon32.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -257,10 +257,10 @@ static INLINE int vavailability_i(int name) { } -static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); } +static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32((const float32_t*)__builtin_assume_aligned(ptr, 16)); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); } -static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); } +static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32((float32_t*)__builtin_assume_aligned(ptr, 16), v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpower_128.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpower_128.h index f3ac298f948..ca6218c8666 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpower_128.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpower_128.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -103,16 +103,16 @@ typedef vquad vargquad; #define vset__s64(...) ((v__i64) {__VA_ARGS__}) #define vset__u64(...) 
((v__u64) {__VA_ARGS__}) -#define vsetall__vi(v) vset__vi(v, v) -#define vsetall__vi2(v) vset__vi2(v, v, v, v) +#define vsetall__vi(v) vset__vi((int)v, (int)v) +#define vsetall__vi2(v) vset__vi2((int)v, (int)v, (int)v, (int)v) #define vsetall__vm(v) vset__vm(v, v, v, v) #define vsetall__vo(v) vset__vo(v, v, v, v) -#define vsetall__vf(v) vset__vf(v, v, v, v) -#define vsetall__vd(v) vset__vd(v, v) -#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v) -#define vsetall__u32(v) vset__u32(v, v, v, v) -#define vsetall__s64(v) vset__s64(v, v) -#define vsetall__u64(v) vset__u64(v, v) +#define vsetall__vf(v) vset__vf((float)v, (float)v, (float)v, (float)v) +#define vsetall__vd(v) vset__vd((double)v, (double)v) +#define vsetall__u8(v) vset__u8((uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v) +#define vsetall__u32(v) vset__u32((uint32_t)v, (uint32_t)v, (uint32_t)v, (uint32_t)v) +#define vsetall__s64(v) vset__s64((int64_t)v, (int64_t)v) +#define vsetall__u64(v) vset__u64((uint64_t)v, (uint64_t)v) #define vzero__vi() vsetall__vi(0) #define vzero__vi2() vsetall__vi2(0) @@ -351,7 +351,7 @@ static INLINE vmask vcastu_vm_vi(vint vi) static INLINE vopmask vcast_vo_i(int i) { i = i ? 
-1 : 0; - return (vopmask) { i, i, i, i }; + return (vopmask) { (unsigned int)i, (unsigned int)i, (unsigned int)i, (unsigned int)i }; } // signed int to single-precision @@ -371,7 +371,7 @@ static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; vint swap = vec_mergeh(vi, vi); -#if defined(__clang__) || __GNUC__ >= 7 +#if defined(__clang__) || (__GNUC__ >= 7 && __GNUC__ < 15) ret = __builtin_vsx_xvcvsxwdp(swap); #else __asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap)); @@ -406,7 +406,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; -#if defined(__clang__) || __GNUC__ >= 7 +#if defined(__clang__) || (__GNUC__ >= 7 && __GNUC__ < 15) ret = __builtin_vsx_xvcvdpsxws(vd); #else __asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd)); @@ -860,11 +860,11 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { #define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c))) static INLINE vint vcast_vi_vm(vmask vm) { - return (vint) { vm[0], vm[2] }; + return (vint) { (int)vm[0], (int)vm[2] }; } static INLINE vmask vcast_vm_vi(vint vi) { - return (vmask) (__vector signed long long) { vi[0], vi[1] }; + return (vmask) (__vector signed long long) { (signed long long)vi[0], (signed long long)vi[1] }; } static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec.h index 14142a3633d..f78be42468f 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. 
// Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec_scalar.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec_scalar.h index 2826ea0f945..1dc5d3eab59 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec_scalar.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperpurec_scalar.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -426,7 +426,7 @@ static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; } static vquad loadu_vq_p(void *p) { vquad vq; memcpy(8 + (char *)&vq, p, 8); - memcpy((char *)&vq, 8 + p, 8); + memcpy((char *)&vq, 8 + (char *)p, 8); return vq; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperrvv.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperrvv.h index f304434af80..aceea44125e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperrvv.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helperrvv.h @@ -91,6 +91,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" static INLINE vfloat64m1x4_t __riscv_vcreate_v_f64m1x4(vfloat64m1_t x, vfloat64m1_t y, vfloat64m1_t z, vfloat64m1_t w) { vfloat64m1x4_t unused; return __riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(unused, 0, x), 1, y), 2, z), 3, w); @@ -158,14 +159,14 @@ typedef vfloat64m1x4_t tdi_t; #define SLEEF_RVV_SP_LMUL 1 #define SLEEF_RVV_DP_LMUL 1 -#define SLEEF_RVV_DP_RUNTIME_VL() 
__riscv_vsetvlmax_e64m1() +#define SLEEF_RVV_DP_RUNTIME_VL() ((int)__riscv_vsetvlmax_e64m1()) #if SLEEF_RVV_VLEN == 0 // The configuration didn't provide a constant vector length, meaning it'll // have to be determined at run-time. RVV offers per-data-width operations for // this so the result doesn't need to be adjusted and that operation is likely // to fold into the surrounding code for free. // -#define VECTLENSP (__riscv_vsetvlmax_e32m1()) +#define VECTLENSP ((int)__riscv_vsetvlmax_e32m1()) #define VECTLENDP SLEEF_RVV_DP_RUNTIME_VL() //@#define VECTLENSP __riscv_vsetvlmax_e32m1() //@#define VECTLENDP __riscv_vsetvlmax_e64m1() @@ -268,7 +269,7 @@ typedef vfloat64m2x4_t tdi_t; #define SLEEF_RVV_SP_LMUL 2 #define SLEEF_RVV_DP_LMUL 2 -#define SLEEF_RVV_DP_RUNTIME_VL() __riscv_vsetvlmax_e64m2() +#define SLEEF_RVV_DP_RUNTIME_VL() ((int)__riscv_vsetvlmax_e64m2()) #if SLEEF_RVV_VLEN == 0 // The configuration didn't provide a constant vector length, meaning it'll // have to be determined at run-time. RVV offers per-data-width operations for @@ -605,7 +606,7 @@ static INLINE vmask vreinterpret_vm_vf(vfloat vf) { // needed. 
// static INLINE int vtestallones_i_vo32(rvv_sp_vopmask g) { - return __riscv_vcpop(g, VECTLENSP) == VECTLENSP; + return (int)__riscv_vcpop(g, VECTLENSP) == (int)VECTLENSP; } static INLINE vmask vor_vm_vo32_vm(rvv_sp_vopmask x, vmask y) { rvv_vmask32 y32 = SLEEF_RVV_SP_VREINTERPRET_VM(y); @@ -1080,7 +1081,7 @@ static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(rvv_dp_vopmask o0, rvv_dp_vopmask return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d(d3), d2, o2, VECTLENDP), d1, o1, VECTLENDP), d0, o0, VECTLENDP); } static INLINE int vtestallones_i_vo64(rvv_dp_vopmask g) { - return __riscv_vcpop(g, VECTLENDP) == VECTLENDP; + return (int)__riscv_vcpop(g, VECTLENDP) == (int)VECTLENDP; } // integer comparison static INLINE rvv_dp_vopmask veq_vo_vi_vi(vint x, vint y) { @@ -1171,7 +1172,7 @@ static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdoub // probably only iterate 2 or 4 times. // ptr += offset * 2; - for (int i = 0; i < VECTLENDP; i += 2) { + for (int i = 0; i < (int)VECTLENDP; i += 2) { // PROTIP: Avoid modifying `v` within the loop, and just extract the useful // part directly in each iteration, because we can. This avoids a // loop-carried dependency. 
@@ -1185,7 +1186,7 @@ static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdoub static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { // as above re: looping ptr += offset * 2; - for (int i = 0; i < VECTLENSP; i += 2) { + for (int i = 0; i < (int)VECTLENSP; i += 2) { vfloat vv = __riscv_vslidedown(v, i, 2); __riscv_vse32(ptr, vv, 2); ptr += step * 2; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpers390x_128.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpers390x_128.h index 924f4eac144..0c1ed100de1 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpers390x_128.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpers390x_128.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -78,7 +78,7 @@ typedef vquad vargquad; static INLINE int vavailability_i(int n) { if (n == 1 || n == 2) { - return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0; + return vec_max((vdouble) {(double)n, (double)n}, (vdouble) {(double)n, (double)n})[0] != 0; } return 0; } @@ -127,23 +127,23 @@ static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) { return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] }); } -static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; } +static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (unsigned long long)-1 : 0, i ? 
(unsigned long long)-1 : 0 }; } static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; } static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; } static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; } static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; } -static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; } -static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; } +static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { (double)vi[0], (double)vi[1] }; } +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { (float)vi[0], (float)vi[1], (float)vi[2], (float)vi[3] }; } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); } static INLINE vint vrint_vi_vd(vdouble vd) { vd = vrint_vd_vd(vd); - return (vint) { vd[0], vd[1] }; + return (vint) { (int)vd[0], (int)vd[1] }; } -static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; } -static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { vf[0], vf[1], vf[2], vf[3] }; } +static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { (int)vd[0], (int)vd[1] }; } +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { (int)vf[0], (int)vf[1], (int)vf[2], (int)vf[3] }; } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; } @@ -202,7 +202,7 @@ static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; } static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; } static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; } -static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; } +static INLINE vint 
vcastu_vi_vm(vmask vi2) { return (vint){ (int)(vi2[0] >> 32), (int)(vi2[1] >> 32) }; } static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; } static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; } @@ -309,8 +309,8 @@ static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; } static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; } static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); } -static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); } -static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); } +static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); } @@ -364,8 +364,8 @@ static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; } -static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); } -static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); } +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) 
{ return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); } @@ -405,7 +405,7 @@ static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); } static INLINE vint2 vrint_vi2_vf(vfloat vf) { vf = vrint_vf_vf(vf); - return (vint) { vf[0], vf[1], vf[2], vf[3] }; + return (vint) { (int)vf[0], (int)vf[1], (int)vf[2], (int)vf[3] }; } // @@ -445,11 +445,11 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y); } -#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c })) -#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c })) +#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { (unsigned long long)c, (unsigned long long)c })) +#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { (unsigned long long)c, (unsigned long long)c })) static INLINE vint vcast_vi_vm(vmask vm) { - return (vint) { vm[0], vm[1] }; + return (vint) { (int)vm[0], (int)vm[1] }; } static INLINE vmask vcast_vm_vi(vint vi) { diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersse2.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersse2.h index 833f5f9b8e1..349f06efc3b 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersse2.h +++ 
b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersse2.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -124,7 +124,7 @@ static INLINE int vavailability_i(int name) { #endif // #if !defined(SLEEF_GENHEADER) -static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } +static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } @@ -420,10 +420,10 @@ static INLINE float vcast_f_vf(vfloat v) { // -#define PNMASK ((vdouble) { +0.0, -0.0 }) -#define NPMASK ((vdouble) { -0.0, +0.0 }) -#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f }) -#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f }) +#define PNMASK _mm_set_pd( -0.0, +0.0 ) +#define NPMASK _mm_set_pd( +0.0, -0.0 ) +#define PNMASKf _mm_set_ps( -0.0f, +0.0f, -0.0f, +0.0f ) +#define NPMASKf _mm_set_ps( +0.0f, -0.0f, +0.0f, -0.0f ) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersve.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersve.h index 75965dc6d92..e9406a12d7c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersve.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpersve.h @@ -22,9 +22,9 @@ #if CONFIG == 
1 || CONFIG == 2 // Vector length agnostic -#define VECTLENSP (svcntw()) +#define VECTLENSP ((int)svcntw()) //@#define VECTLENSP (svcntw()) -#define VECTLENDP (svcntd()) +#define VECTLENDP ((int)svcntd()) //@#define VECTLENDP (svcntd()) #define ISANAME "AArch64 SVE" #define ptrue svptrue_b8() diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpervecext.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpervecext.h index 3f079317f0b..a15a4252464 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpervecext.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/arch/helpervecext.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/CMakeLists.txt index 0f27a323aac..b1514c56f7c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/CMakeLists.txt @@ -16,10 +16,49 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}") add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c) set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES}) -# Target TARGET_LIBARRAYMAP_OBJ - -add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c) -set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - add_host_executable("addSuffix" addSuffix.c) set_target_properties("addSuffix" PROPERTIES C_STANDARD 99) + +if (NOT SLEEF_OPENSSL_FOUND) + add_library(${TARGET_PSHA_OBJ} OBJECT psha2_capi.cpp) +else() + # Tests for internal sha256 + 
add_executable(test_psha test_psha2.cpp) + target_link_libraries(test_psha ${SLEEF_OPENSSL_LIBRARIES}) + target_include_directories(test_psha PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR}) + add_test(NAME test_psha COMMAND test_psha) + set_tests_properties(test_psha PROPERTIES COST 2.0) + + add_executable(test_psha_capi test_psha2.cpp) + target_compile_definitions(test_psha_capi PRIVATE TEST_CAPI=1) + target_link_libraries(test_psha_capi ${SLEEF_OPENSSL_LIBRARIES}) + target_include_directories(test_psha_capi PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR}) + add_test(NAME test_psha_capi COMMAND test_psha_capi) + set_tests_properties(test_psha_capi PROPERTIES COST 2.0) +endif() + +# Target TARGET_TESTERUTIL_OBJ +add_library(${TARGET_TESTERUTIL_OBJ} OBJECT testerutil.c) +target_compile_definitions(${TARGET_TESTERUTIL_OBJ} PRIVATE ${COMMON_TARGET_DEFINITIONS}) +if(LIB_MPFR) + target_compile_definitions(${TARGET_TESTERUTIL_OBJ} PRIVATE USEMPFR=1) + target_link_libraries(${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP}) +endif() +if (MPFR_INCLUDE_DIR) + target_include_directories(${TARGET_TESTERUTIL_OBJ} PRIVATE ${MPFR_INCLUDE_DIR}) +endif() + +# Target TARGET_QTESTERUTIL_OBJ +add_library(${TARGET_QTESTERUTIL_OBJ} OBJECT qtesterutil.c) +target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE ${COMMON_TARGET_DEFINITIONS}) +if(LIB_MPFR) + target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE USEMPFR=1) + target_link_libraries(${TARGET_QTESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP}) +endif() +if (MPFR_INCLUDE_DIR) + target_include_directories(${TARGET_QTESTERUTIL_OBJ} PRIVATE ${MPFR_INCLUDE_DIR}) +endif() +if(COMPILER_SUPPORTS_QUADMATH) + target_link_libraries(${TARGET_QTESTERUTIL_OBJ} "-lquadmath") + target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE ENABLEFLOAT128=1) +endif() diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/addSuffix.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/addSuffix.c index a66f56f666f..f2ca261e9f7 
100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/addSuffix.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/addSuffix.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -207,7 +207,18 @@ int main(int argc, char **argv) { nkeywords++; if (nkeywords >= nalloc) { nalloc *= 2; - keywords = realloc(keywords, sizeof(char *) * nalloc); + char ** tmp = realloc(keywords, sizeof(char *) * nalloc); + if (tmp == NULL) { + // free keywords if realloc fails + // otherwise address is lost. + free(keywords); + fclose(fp); + fprintf(stderr, "Failed realloc!\n"); + exit(-1); + } + else { + keywords = tmp; + } } } @@ -228,6 +239,10 @@ int main(int argc, char **argv) { fclose(fp); + for(int i=0;i -#include -#include -#include -#include -#include -#include - -// - -#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)) -#include -#include -#include - -static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); } -static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); } -static void FTRUNCATE(FILE *fp, off_t z) { - if (ftruncate(fileno(fp), z)) - ; -} -static FILE *OPENTMPFILE() { return tmpfile(); } -static void CLOSETMPFILE(FILE *fp) { fclose(fp); } -#else -#include -#include - -static void FLOCK(FILE *fp) { } -static void FUNLOCK(FILE *fp) { } -static void FTRUNCATE(FILE *fp, long z) { - fseek(fp, 0, SEEK_SET); - SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp))); -} -static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); } -static void CLOSETMPFILE(FILE *fp) { - fclose(fp); - remove("tmpfile.txt"); -} -#endif - -// - -#define MAGIC_ARRAYMAPNODE 0xf73130fa -#define MAGIC_ARRAYMAP 0x8693bd21 -#define LOGNBUCKETS 8 -#define NBUCKETS (1 << LOGNBUCKETS) - -static int 
hash(uint64_t key) { - return (key ^ (key >> LOGNBUCKETS) ^ (key >> (LOGNBUCKETS*2)) ^ (key >> (LOGNBUCKETS*3))) & (NBUCKETS-1); -} - -static void String_trim(char *str) { - char *dst = str, *src = str, *pterm = src; - - while(*src != '\0' && isspace((int)*src)) src++; - - for(;*src != '\0';src++) { - *dst++ = *src; - if (!isspace((int)*src)) pterm = dst; - } - - *pterm = '\0'; -} - -typedef struct ArrayMapNode { - uint32_t magic; - uint64_t key; - void *value; -} ArrayMapNode; - -typedef struct ArrayMap { - uint32_t magic; - ArrayMapNode *array[NBUCKETS]; - int size[NBUCKETS], capacity[NBUCKETS], totalSize; -} ArrayMap; - -ArrayMap *initArrayMap() { - ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap)); - thiz->magic = MAGIC_ARRAYMAP; - - for(int i=0;icapacity[i] = 8; - thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode)); - thiz->size[i] = 0; - } - - thiz->totalSize = 0; - return thiz; -} - -void ArrayMap_dispose(ArrayMap *thiz) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - - for(int j=0;jsize[j];i++) { - assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE); - thiz->array[j][i].magic = 0; - } - free(thiz->array[j]); - } - - thiz->magic = 0; - free(thiz); -} - -int ArrayMap_size(ArrayMap *thiz) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - return thiz->totalSize; -} - -uint64_t *ArrayMap_keyArray(ArrayMap *thiz) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - uint64_t *a = (uint64_t *)malloc(sizeof(uint64_t) * thiz->totalSize); - int p = 0; - for(int j=0;jsize[j];i++) { - assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE); - a[p++] = thiz->array[j][i].key; - } - } - return a; -} - -void **ArrayMap_valueArray(ArrayMap *thiz) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - void **a = (void **)malloc(sizeof(void *) * thiz->totalSize); - int p = 0; - for(int j=0;jsize[j];i++) { - assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE); - a[p++] = thiz->array[j][i].value; - } - 
} - return a; -} - -void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - - int h = hash(key); - for(int i=0;isize[h];i++) { - assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE); - if (thiz->array[h][i].key == key) { - void *old = thiz->array[h][i].value; - thiz->array[h][i].key = thiz->array[h][thiz->size[h]-1].key; - thiz->array[h][i].value = thiz->array[h][thiz->size[h]-1].value; - thiz->array[h][thiz->size[h]-1].magic = 0; - thiz->size[h]--; - thiz->totalSize--; - return old; - } - } - - return NULL; -} - -void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) { - if (value == NULL) return ArrayMap_remove(thiz, key); - - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - - int h = hash(key); - for(int i=0;isize[h];i++) { - assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE); - if (thiz->array[h][i].key == key) { - void *old = thiz->array[h][i].value; - thiz->array[h][i].value = value; - return old; - } - } - - if (thiz->size[h] >= thiz->capacity[h]) { - thiz->capacity[h] *= 2; - thiz->array[h] = (ArrayMapNode *)realloc(thiz->array[h], thiz->capacity[h] * sizeof(ArrayMapNode)); - } - - ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]); - n->magic = MAGIC_ARRAYMAPNODE; - n->key = key; - n->value = value; - - thiz->totalSize++; - - return NULL; -} - -void *ArrayMap_get(ArrayMap *thiz, uint64_t key) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - - int h = hash(key); - for(int i=0;isize[h];i++) { - assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE); - if (thiz->array[h][i].key == key) { - return thiz->array[h][i].value; - } - } - - return NULL; -} - -#define LINELEN (1024*1024) - -ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) { - const int idstrlen = (int)strlen(idstr); - int prefixLen = (int)strlen(prefix) + 3; - - if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL; - - FILE *fp = fopen(fn, "r"); - if (fp == 
NULL) return NULL; - - if (doLock) FLOCK(fp); - - ArrayMap *thiz = initArrayMap(); - - char *prefix2 = malloc(prefixLen+10); - strcpy(prefix2, prefix); - String_trim(prefix2); - for(char *p = prefix2;*p != '\0';p++) { - if (*p == ':') *p = ';'; - if (*p == ' ') *p = '_'; - } - strcat(prefix2, " : "); - prefixLen = (int)strlen(prefix2); - - char *line = malloc(sizeof(char) * (LINELEN+10)); - line[idstrlen] = '\0'; - - if (fread(line, sizeof(char), idstrlen, fp) != idstrlen || - strcmp(idstr, line) != 0) { - if (doLock) FUNLOCK(fp); - fclose(fp); - free(prefix2); - free(line); - return NULL; - } - - for(;;) { - line[LINELEN] = '\0'; - if (fgets(line, LINELEN, fp) == NULL) break; - if (strncmp(line, prefix2, prefixLen) != 0) continue; - - uint64_t key; - char *value = malloc(sizeof(char) * LINELEN); - - if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) { - ArrayMap_put(thiz, (uint64_t)key, (void *)value); - } else { - free(value); - } - } - - if (doLock) FUNLOCK(fp); - fclose(fp); - - free(prefix2); - free(line); - - return thiz; -} - -int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) { - assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP); - - const int idstrlen = (int)strlen(idstr); - int prefixLen = (int)strlen(prefix) + 3; - - if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1; - - // Generate prefix2 - - char *prefix2 = malloc(prefixLen+10); - strcpy(prefix2, prefix); - String_trim(prefix2); - for(char *p = prefix2;*p != '\0';p++) { - if (*p == ':') *p = ';'; - if (*p == ' ') *p = '_'; - } - strcat(prefix2, " : "); - prefixLen = (int)strlen(prefix2); - - // - - FILE *fp = fopen(fn, "a+"); - if (fp == NULL) return -1; - - FLOCK(fp); - fseek(fp, 0, SEEK_SET); - - // Copy the file specified by fn to tmpfile - - FILE *tmpfp = OPENTMPFILE(); - if (tmpfp == NULL) { - FUNLOCK(fp); - fclose(fp); - return -1; - } - - char *line = malloc(sizeof(char) * (LINELEN+10)); - line[idstrlen] = '\0'; - - if 
(fread(line, sizeof(char), idstrlen, fp) == idstrlen && strcmp(idstr, line) == 0) { - for(;;) { - line[LINELEN] = '\0'; - if (fgets(line, LINELEN, fp) == NULL) break; - if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp); - } - } - - // Write the contents in the map into tmpfile - - uint64_t *keys = ArrayMap_keyArray(thiz); - int s = ArrayMap_size(thiz); - - for(int i=0;i= LINELEN-10) continue; - fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value); - } - - free(keys); - - fseek(fp, 0, SEEK_SET); - FTRUNCATE(fp, 0); - fwrite(idstr, sizeof(char), strlen(idstr), fp); - - fseek(tmpfp, 0, SEEK_SET); - - for(;;) { - size_t s = fread(line, 1, LINELEN, tmpfp); - if (s == 0) break; - fwrite(line, 1, s, fp); - } - - FUNLOCK(fp); - fclose(fp); - - CLOSETMPFILE(tmpfp); - free(prefix2); - free(line); - return 0; -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/arraymap.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/arraymap.h deleted file mode 100644 index 9d05abe783a..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/arraymap.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#ifndef __ARRAYMAP_H__ -#define __ARRAYMAP_H__ -typedef struct ArrayMap ArrayMap; - -ArrayMap *initArrayMap(); -void ArrayMap_dispose(ArrayMap *thiz); -int ArrayMap_size(ArrayMap *thiz); -void *ArrayMap_remove(ArrayMap *thiz, uint64_t key); -void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value); -void *ArrayMap_get(ArrayMap *thiz, uint64_t key); - -uint64_t *ArrayMap_keyArray(ArrayMap *thiz); -void **ArrayMap_valueArray(ArrayMap *thiz); -int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr); -ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock); -#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.c index 6ebcc3f4ffb..29f6946bc2f 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.h index ff278e0792a..800636ff5be 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/common.h @@ -1,9 +1,20 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #ifndef __COMMON_H__ #define __COMMON_H__ -char *Sleef_getCpuIdString(); + +#ifdef __cplusplus +extern "C" +{ #endif + +char *Sleef_getCpuIdString(); + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef __COMMON_H__ diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/commonfuncs.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/commonfuncs.h index 494b7b87c7a..1dd72628a6c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/commonfuncs.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/commonfuncs.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -415,7 +415,7 @@ static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) { static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; } // imm must be smaller than 64 -#define srl128_vq_vq_i(m, imm) \ +#define srl128_vq_vq_i(m, imm) \ imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm)) // This function is equivalent to : diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/dd.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/dd.h index d373bfefa90..537d858f060 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/dd.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/dd.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/df.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/df.h index b5a6462d58e..920d23c2a61 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/df.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/df.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/estrin.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/estrin.h index 1953ac6a6f2..6d226896d79 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/estrin.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/estrin.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/f128util.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/f128util.h deleted file mode 100644 index d9cef1510ee..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/f128util.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include - -static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) { - if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan(""); - - mpfr_t frr, frd; - mpfr_inits(frr, frd, NULL); - - mpfr_exp_t e; - mpfr_frexp(&e, frr, m, GMP_RNDN); - - double d0 = mpfr_get_d(frr, GMP_RNDN); - mpfr_set_d(frd, d0, GMP_RNDN); - mpfr_sub(frr, frr, frd, GMP_RNDN); - - double d1 = mpfr_get_d(frr, GMP_RNDN); - mpfr_set_d(frd, d1, GMP_RNDN); - mpfr_sub(frr, frr, frd, GMP_RNDN); - - double d2 = mpfr_get_d(frr, GMP_RNDN); - - mpfr_clears(frr, frd, NULL); - return ldexpq((__float128)d2 + (__float128)d1 + (__float128)d0, e); -} - -static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) { - char s[128]; - quadmath_snprintf(s, 120, "%.50Qg", f); - mpfr_set_str(frx, s, 10, rnd); -} - -static void printf128(__float128 f) { - char s[128]; - quadmath_snprintf(s, 120, "%.50Qg", f); - printf("%s", s); -} - -static char frstr[16][1000]; -static int frstrcnt = 0; - -static char *toBC(double d) { - union { - double d; - uint64_t u64; - int64_t i64; - } cnv; - - cnv.d = d; - - int64_t l = cnv.i64; - int e = (int)((l >> 52) & ~(-1L << 11)); - int s = (int)(l >> 63); - l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52)); - - char *ptr = frstr[(frstrcnt++) & 15]; - - sprintf(ptr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52)); - return ptr; -} - -static char *toBCq(__float128 d) { - union { - __float128 d; - __uint128_t u128; - } cnv; - - cnv.d = d; - - __uint128_t m = cnv.u128; - int e = (int)((m >> 112) & ~(-1L << 15)); - int s = (int)(m >> 127); - m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112)); - - uint64_t h = m / UINT64_C(10000000000000000000); - uint64_t l = m % UINT64_C(10000000000000000000); - - char *ptr = frstr[(frstrcnt++) & 15]; - - sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? 
"-" : "", h, l, (e-0x3fff-112)); - - return ptr; -} - -static int xisnanq(Sleef_quad x) { return x != x; } -static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); } -static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !isinfq(x); } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/main_checkfeature.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/main_checkfeature.c index b5d7b9a07f3..3c9eca64172 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/main_checkfeature.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/main_checkfeature.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/misc.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/misc.h index 472cae68bd5..fadcd1aa326 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/misc.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/misc.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -13,10 +13,15 @@ #include #endif + #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884 #endif +#ifndef M_PIf +# define M_PIf ((float)M_PI) +#endif + #ifndef M_PIl #define M_PIl 3.141592653589793238462643383279502884L #endif @@ -137,9 +142,17 @@ #define L2Lf 1.428606765330187045e-06f #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f -#ifndef M_PIf -# define M_PIf ((float)M_PI) -#endif + +// Overflow bounds + +// - exp(x) overflows for x over (also used in pow) +#define LOG_DBL_MAX 0x1.62e42fefa39efp+9 /* 709.782712893384 */ + +// Other bounds + +// - log1p(f)(x) approximation holds up to x equals +#define LOG1PF_BOUND 0x1.2ced32p+126 /* 1.0e+38 */ +#define LOG1P_BOUND 0x1.c7b1f3cac7433p+1019 /* 1.0e+307 */ // @@ -249,6 +262,9 @@ typedef struct { #else // #if defined(SLEEF_GENHEADER) #define INLINE __forceinline +#ifdef CONST +#undef CONST +#endif #define CONST #ifndef SLEEF_STATIC_LIBS #define EXPORT __declspec(dllexport) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2.hpp b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2.hpp new file mode 100644 index 00000000000..17b9e47bb18 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2.hpp @@ -0,0 +1,182 @@ +#ifndef __PSHA2_HPP_INCLUDED__ +#define __PSHA2_HPP_INCLUDED__ + +#include +#include + +struct PSHA2_256_Internal { + // https://github.com/983/SHA-256 + // This is public domain implementation of SHA256 + static inline uint32_t rotr(uint32_t x, int n) { + return (x >> n) | (x << (32 - n)); + } + + static inline uint32_t step1(uint32_t e, uint32_t f, uint32_t g) { + return (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ ((~ e) & g)); + } + + static inline uint32_t step2(uint32_t a, uint32_t b, uint32_t c) { + return (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) 
^ (b & c)); + } + + static inline void update_w(uint32_t *w, int i, const uint8_t *buffer) { + int j; + for(j = 0;j < 16;j++) { + if (i < 16) { + w[j] = + ((uint32_t)buffer[0] << 24) | + ((uint32_t)buffer[1] << 16) | + ((uint32_t)buffer[2] << 8) | + ((uint32_t)buffer[3]); + buffer += 4; + } else { + uint32_t a = w[(j + 1) & 15]; + uint32_t b = w[(j + 14) & 15]; + uint32_t s0 = (rotr(a, 7) ^ rotr(a, 18) ^ (a >> 3)); + uint32_t s1 = (rotr(b, 17) ^ rotr(b, 19) ^ (b >> 10)); + w[j] += w[(j + 9) & 15] + s0 + s1; + } + } + } + + uint32_t state[8]; + uint64_t n_bits; + uint8_t buffer_counter; + uint8_t buffer[64]; + + PSHA2_256_Internal() { + state[0] = 0x6a09e667; + state[1] = 0xbb67ae85; + state[2] = 0x3c6ef372; + state[3] = 0xa54ff53a; + state[4] = 0x510e527f; + state[5] = 0x9b05688c; + state[6] = 0x1f83d9ab; + state[7] = 0x5be0cd19; + n_bits = 0; + buffer_counter = 0; + for(int i=0;i<64;i++) buffer[i] = 0; + } + + void block() { + static const uint32_t k[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + uint32_t w[16] = 
{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + for(int i = 0;i < 64;i += 16) { + update_w(w, i, buffer); + +#if defined(__clang__) +#pragma clang loop unroll(full) +#endif + for(int j = 0;j < 16;j += 4) { + uint32_t temp; + temp = h + step1(e, f, g) + k[i + j + 0] + w[j + 0]; + h = temp + d; + d = temp + step2(a, b, c); + temp = g + step1(h, e, f) + k[i + j + 1] + w[j + 1]; + g = temp + c; + c = temp + step2(d, a, b); + temp = f + step1(g, h, e) + k[i + j + 2] + w[j + 2]; + f = temp + b; + b = temp + step2(c, d, a); + temp = e + step1(f, g, h) + k[i + j + 3] + w[j + 3]; + e = temp + a; + a = temp + step2(b, c, d); + } + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } + + void append_byte(uint8_t byte) { + buffer[buffer_counter++] = byte; + n_bits += 8; + + if (buffer_counter == 64) { + buffer_counter = 0; + block(); + } + } + + void append(const void *src, size_t n_bytes) { + for(size_t i = 0;i < n_bytes;i++) { + append_byte(((const uint8_t*)src)[i]); + } + } + + void appendWord(const void *src, size_t n_bytes) { +#if !defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) + for(size_t i = 0;i < n_bytes;i++) { + append_byte(((const uint8_t*)src)[i]); + } +#else + for(int i = int(n_bytes)-1;i >= 0;i--) { + append_byte(((const uint8_t*)src)[i]); + } +#endif + } + + void finalize() { + uint64_t nb = n_bits; + + append_byte(0x80); + + while(buffer_counter != 64 - 8) { + append_byte(0); + } + + for(int i = 7;i >= 0;i--) { + uint8_t byte = (nb >> 8 * i) & 0xff; + append_byte(byte); + } + } + + void finalize_bytes(void *dst_bytes32) { + uint8_t *ptr = (uint8_t*)dst_bytes32; + finalize(); + + for(int i = 0;i < 8;i++) { + for(int j = 3;j >= 0;j--) { + *ptr++ = (state[i] >> j * 8) & 0xff; + } + } + } +}; + +#endif // #ifndef __PSHA2_HPP_INCLUDED__ diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.cpp 
b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.cpp new file mode 100644 index 00000000000..00155d85248 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.cpp @@ -0,0 +1,57 @@ +#include "psha2.hpp" +#include "psha2_capi.h" + +#include + +const EVP_MD *EVP_sha256(void) { + static const int one[1] = { 1 }; + return &one[0]; +} + +size_t EVP_MD_size(const EVP_MD *e) { + if (*e == 1) return SHA256_DIGEST_LENGTH; + return 0; +} + +int EVP_MD_get_size(const EVP_MD *e) { + if (*e == 1) return SHA256_DIGEST_LENGTH; + return 0; +} + +EVP_MD_CTX *EVP_MD_CTX_new(void) { + return (EVP_MD_CTX *)calloc(1, sizeof(EVP_MD_CTX)); +} + +int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { + ctx->type = *type; + if (*type == 1) { + ctx->psha_256 = new PSHA2_256_Internal(); + return 1; + } + return 0; +} + +int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt) { + if (ctx->type == 1) { + ctx->psha_256->append(d, cnt); + return 1; + } + return 0; +} + +int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s) { + if (ctx->type == 1) { + ctx->psha_256->finalize_bytes(md); + if (s) *s = SHA256_DIGEST_LENGTH; + return 1; + } + return 0; +} + +void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { + if (ctx->type == 1) { + delete ctx->psha_256; + ctx->psha_256 = nullptr; + } + free(ctx); +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.h new file mode 100644 index 00000000000..5173ceff78b --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/psha2_capi.h @@ -0,0 +1,30 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + static const size_t SHA256_DIGEST_LENGTH = 32; + + typedef int EVP_MD; + typedef void ENGINE; + + typedef struct { + int type; + union { + struct PSHA2_256_Internal *psha_256; + }; + } EVP_MD_CTX; + + 
const EVP_MD *EVP_sha256(void); + int EVP_MD_get_size(const EVP_MD *); + size_t EVP_MD_size(const EVP_MD *); + EVP_MD_CTX *EVP_MD_CTX_new(void); + int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl); + int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt); + int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s); + void EVP_MD_CTX_free(EVP_MD_CTX *ctx); + +#ifdef __cplusplus +} +#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtesterutil.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/qtesterutil.c similarity index 93% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtesterutil.c rename to src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/qtesterutil.c index 90e03176614..325e48c3a02 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtesterutil.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/qtesterutil.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -22,6 +22,10 @@ #include #endif +#ifdef ENABLEFLOAT128 +#include +#endif + #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER) #define STDIN_FILENO 0 #else @@ -42,33 +46,6 @@ // -int readln(int fd, char *buf, int cnt) { - int i, rcnt = 0; - - if (cnt < 1) return -1; - - while(cnt >= 2) { - i = read(fd, buf, 1); - if (i != 1) return i; - - if (*buf == '\n') break; - - rcnt++; - buf++; - cnt--; - } - - *++buf = '\0'; - rcnt++; - return rcnt; -} - -int startsWith(char *str, char *prefix) { - return strncmp(str, prefix, strlen(prefix)) == 0; -} - -// - xuint128 xu(uint64_t h, uint64_t l) { xuint128 r = { .l = l, .h = h }; return r; @@ -150,31 +127,6 @@ int isnanf128(Sleef_quad a) { // -static uint64_t xseed; - -uint64_t xrand() { - uint64_t u = xseed; - xseed = xseed * UINT64_C(6364136223846793005) + 1; - u = (u & ((~UINT64_C(0)) << 32)) | (xseed >> 32); - xseed = xseed * UINT64_C(6364136223846793005) + 1; - return u; -} - -void xsrand(uint64_t s) { - xseed = s; - xrand(); - xrand(); - xrand(); -} - -void memrand(void *p, int size) { - uint64_t *q = (uint64_t *)p; - int i; - for(i=0;i= 8) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif +#include "sleef-config.h" #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED @@ -74,14 +68,6 @@ typedef union { #else // #if !defined(SLEEF_GENHEADER) -SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8)) -SLEEFSHARPdefine 
SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP -SLEEFSHARPendif - -SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__)) -SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP -SLEEFSHARPendif - SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED) SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/test_psha2.cpp b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/test_psha2.cpp new file mode 100644 index 00000000000..53d190708d2 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/common/test_psha2.cpp @@ -0,0 +1,58 @@ +#include "psha2.hpp" + +#if TEST_CAPI +#include "psha2_capi.h" +#else +#include +#include +#endif + +#include +#include +#include +#include +#include + +int main(int argc, char **argv) { + srand(time(NULL)); + + bool success = true; + + for(int i=0;i<10000;i++) { + int len = (rand() + ((int64_t)RAND_MAX + 1) * rand()) % (1 << (1 + (rand() % 18))); + unsigned char *plaintext = (unsigned char *)malloc(len); + for(int i=0;i> 32); + xseed = xseed * UINT64_C(6364136223846793005) + 1; + return u; +} + +void xsrand(uint64_t s) { + xseed = s; + xrand(); + xrand(); + xrand(); } // Fill memory with random bits void memrand(void *p, int size) { - uint64_t *q = (uint64_t *)p; + uint8_t *q = (uint8_t *)p; int i; - for(i=0;i>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + *q++ = (uint8_t)(u & 0xff); u >>= 8; + } + for(;i + +#ifdef __cplusplus +#include +using namespace tlfloat; +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC 
diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wvla-cxx-extension" +#pragma clang diagnostic ignored "-Wuninitialized" +#pragma clang diagnostic ignored "-Wtautological-compare" +#endif + +#define DENORMAL_DBL_MIN (4.9406564584124654418e-324) +#define POSITIVE_INFINITY INFINITY +#define NEGATIVE_INFINITY (-INFINITY) + +#define DENORMAL_FLT_MIN (1.4012984643248170709e-45f) +#define POSITIVE_INFINITYf ((float)INFINITY) +#define NEGATIVE_INFINITYf (-(float)INFINITY) + +#ifndef M_PIf +# define M_PIf ((float)M_PI) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +extern int enableFlushToZero; +double flushToZero(double y); + +int xisnumber(double x); +int isPlusZero(double x); +int isMinusZero(double x); +int xisnan(double x); +double sign(double d); + +int isnumberf(float x); +int isPlusZerof(float x); +int isMinusZerof(float x); +int xisnanf(float x); +float signf(float d); + +int readln(int fd, char *buf, int cnt); + +#define XRAND_MAX (INT64_C(0x100000000) * (double)INT64_C(0x100000000)) + +void xsrand(uint64_t s); +uint64_t xrand(); +void memrand(void *p, int size); + +// The following functions are meant to be inlined + +static double u2d(uint64_t u) { + double d = 0; + memcpy(&d, &u, sizeof(d)); + return d; +} + +static uint64_t d2u(double d) { + uint64_t u = 0; + memcpy(&u, &d, sizeof(u)); + return u; +} + +static float u2f(uint32_t u) { + float f = 0; + memcpy(&f, &u, sizeof(f)); + return f; +} + +static uint32_t f2u(float d) { + uint32_t u = 0; + memcpy(&u, &d, sizeof(u)); + return u; +} + +static int startsWith(char *str, char *prefix) { + while(*prefix != '\0') if (*str++ != *prefix++) return 0; + return *prefix == '\0'; +} + +// + +#ifdef USEMPFR +int cmpDenormdp(double x, mpfr_t fry); +double countULPdp(double d, mpfr_t c); +double countULP2dp(double d, mpfr_t c); + +int cmpDenormsp(float x, 
mpfr_t fry); +double countULPsp(float d, mpfr_t c); +double countULP2sp(float d, mpfr_t c); + +#if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0) +void mpfr_sinpi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); +void mpfr_cospi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); +#endif +void mpfr_lgamma_nosign(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); +#endif + +#ifdef __cplusplus +} + +template +static double countULP(T ot, const T& oc, + const int nbmant, const T& fltmin, const T& fltmax, + const bool checkSignedZero=false, const double abound=0.0) { + if (isnan_(oc) && isnan_(ot)) return 0; + if (isnan_(oc) || isnan_(ot)) return 10001; + if (isinf_(oc) && !isinf_(ot)) return INFINITY; + + const T halffltmin = mul_(fltmin, T(0.5)); + const bool ciszero = fabs_(oc) < halffltmin, cisinf = fabs_(oc) > fltmax; + + if (cisinf && isinf_(ot) && signbit_(oc) == signbit_(ot)) return 0; + if (ciszero && ot != 0) return 10000; + if (checkSignedZero && ciszero && ot == 0 && signbit_(oc) != signbit_(ot)) return 10002; + + double v = 0; + if (isinf_(ot) && !isinf_(oc)) { + ot = copysign_(fltmax, ot); + v = 1; + } + + const int ec = ilogb_(oc); + + auto e = fabs_(oc - ot); + if (e < abound) return 0; + + return double(div_(e, fmax_(ldexp_(T(1), ec + 1 - nbmant), fltmin))) + v; +} +#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/CMakeLists.txt index 6cbcdea18cd..1682eb8481c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/CMakeLists.txt @@ -73,8 +73,36 @@ if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS) add_test_dft(${TARGET_NAIVETESTSP}_4 $ 4) add_test_dft(${TARGET_NAIVETESTSP}_5 $ 5) add_test_dft(${TARGET_NAIVETESTSP}_10 $ 10) + + # Target executable measuredft + set(TARGET_MEASUREDFT "measuredft") + add_executable(${TARGET_MEASUREDFT} measuredft.c 
${PROJECT_SOURCE_DIR}/include/sleefdft.h) + add_dependencies(${TARGET_MEASUREDFT} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT}) + target_compile_definitions(${TARGET_MEASUREDFT} PRIVATE ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_MEASUREDFT} ${COMMON_LINK_LIBRARIES}) + set_target_properties(${TARGET_MEASUREDFT} PROPERTIES ${COMMON_TARGET_PROPERTIES}) endif() +# Target executable test_dftplanner +set(TARGET_TEST_DFTPLANNER "test_dftplanner") +add_executable(${TARGET_TEST_DFTPLANNER} test_dftplanner.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h) +add_dependencies(${TARGET_TEST_DFTPLANNER} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT}) +target_compile_definitions(${TARGET_TEST_DFTPLANNER} PRIVATE ${COMMON_TARGET_DEFINITIONS} MEASURE=1) +target_link_libraries(${TARGET_TEST_DFTPLANNER} ${COMMON_LINK_LIBRARIES}) +set_target_properties(${TARGET_TEST_DFTPLANNER} PROPERTIES ${COMMON_TARGET_PROPERTIES}) +add_test(NAME ${TARGET_TEST_DFTPLANNER} COMMAND $ ${PROJECT_BINARY_DIR}/testm1.plan ${PROJECT_BINARY_DIR}/testm2.plan) +set_tests_properties(${TARGET_TEST_DFTPLANNER} PROPERTIES COST 2) + +# Target executable test_dftplannerest +set(TARGET_TEST_DFTPLANNEREST "test_dftplannerest") +add_executable(${TARGET_TEST_DFTPLANNEREST} test_dftplanner.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h) +add_dependencies(${TARGET_TEST_DFTPLANNEREST} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT}) +target_compile_definitions(${TARGET_TEST_DFTPLANNEREST} PRIVATE ${COMMON_TARGET_DEFINITIONS}) +target_link_libraries(${TARGET_TEST_DFTPLANNEREST} ${COMMON_LINK_LIBRARIES}) +set_target_properties(${TARGET_TEST_DFTPLANNEREST} PROPERTIES ${COMMON_TARGET_PROPERTIES}) +add_test(NAME ${TARGET_TEST_DFTPLANNEREST} COMMAND $ ${PROJECT_BINARY_DIR}/teste1.plan ${PROJECT_BINARY_DIR}/teste2.plan) +set_tests_properties(${TARGET_TEST_DFTPLANNEREST} PROPERTIES COST 2) + # Target executable roundtriptest1ddp set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp") 
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h) @@ -161,6 +189,34 @@ if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW) add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $ 8 8) add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $ 10 10) add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $ 5 15) + + if (SLEEF_LIBFFTW3_LIBRARIES) + # Target executable dftbenchdp + set(TARGET_BENCH1DDP "dftbenchdp") + add_executable(${TARGET_BENCH1DDP} dftbench.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h) + add_dependencies(${TARGET_BENCH1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT}) + target_compile_definitions(${TARGET_BENCH1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1) + target_link_libraries(${TARGET_BENCH1DDP} ${COMMON_LINK_LIBRARIES} ${SLEEF_LIBFFTW3_LIBRARIES}) + set_target_properties(${TARGET_BENCH1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + + #add_test_dft("dftbenchdp1d" $ 8 0 1000 1) + #set_tests_properties("dftbenchdp1d" PROPERTIES COST 3) + add_test_dft("dftbenchdp2d" $ 8 8 1000 1) + set_tests_properties("dftbenchdp2d" PROPERTIES COST 3) + + # Target executable dftbenchsp + set(TARGET_BENCH1DSP "dftbenchsp") + add_executable(${TARGET_BENCH1DSP} dftbench.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h) + add_dependencies(${TARGET_BENCH1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT}) + target_compile_definitions(${TARGET_BENCH1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2) + target_link_libraries(${TARGET_BENCH1DSP} ${COMMON_LINK_LIBRARIES} ${SLEEF_LIBFFTW3_LIBRARIES}) + set_target_properties(${TARGET_BENCH1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + + #add_test_dft("dftbenchsp1d" $ 8 0 1000 1) + #set_tests_properties("dftbenchsp1d" PROPERTIES COST 3) + add_test_dft("dftbenchsp2d" $ 8 8 1000 1) + set_tests_properties("dftbenchsp2d" PROPERTIES COST 3) + endif() else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW) if(MSVC OR SLEEF_CLANG_ON_WINDOWS) # Test roundtriptestdp diff --git 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/bench1d.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/bench1d.c deleted file mode 100644 index a30dbcf296b..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/bench1d.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#define _DEFAULT_SOURCE -#define _XOPEN_SOURCE 700 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef USEFFTW -#include -#include -#else -#include "sleef.h" -#include "sleefdft.h" -#endif - -typedef double real; - -static uint64_t gettime() { - struct timespec tp; - clock_gettime(CLOCK_MONOTONIC, &tp); - return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec); -} - -#define REPEAT 8 - -int main(int argc, char **argv) { - if (argc == 1) { - fprintf(stderr, "%s \n", argv[0]); - exit(-1); - } - - int backward = 0; - - int log2n = atoi(argv[1]); - if (log2n < 0) { - backward = 1; - log2n = -log2n; - } - - const int n = 1 << log2n; - const int64_t niter = (int)(100000000000.0 / n / log2n); - - printf("Number of iterations = %lld\n", (long long int)niter); - -#ifdef USEFFTW - fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); - fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); - -#if 0 - int fftw_init_threads(void); - fftw_plan_with_nthreads(omp_get_max_threads()); -#endif - - fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE); - //fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? 
FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT); - - for(int i=0;i= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE; - - if (backward) mode |= SLEEF_MODE_BACKWARD; - struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode); - - if (argc >= 3) SleefDFT_setPath(p, argv[2]); - - for(int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "sleef.h" +#include "sleefdft.h" + +using namespace std; + +#if BASETYPEID == 1 +typedef double xreal; +#define FFTW_COMPLEX fftw_complex +#define FFTW_PLAN_WITH_NTHREADS fftw_plan_with_nthreads +#define FFTW_PLAN fftw_plan +#define FFTW_MALLOC fftw_malloc +#define FFTW_FREE fftw_free +#define FFTW_PLAN_DFT_1D fftw_plan_dft_1d +#define FFTW_PLAN_DFT_2D fftw_plan_dft_2d +#define FFTW_EXECUTE fftw_execute +#define FFTW_DESTROY_PLAN fftw_destroy_plan +#define FFTW_CLEANUP fftw_cleanup +#define SLEEFDFT_INIT1D SleefDFT_double_init1d +#define SLEEFDFT_INIT2D SleefDFT_double_init2d +#elif BASETYPEID == 2 +typedef float xreal; +#define FFTW_COMPLEX fftwf_complex +#define FFTW_PLAN_WITH_NTHREADS fftwf_plan_with_nthreads +#define FFTW_PLAN fftwf_plan +#define FFTW_MALLOC fftwf_malloc +#define FFTW_FREE fftwf_free +#define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d +#define FFTW_PLAN_DFT_2D fftwf_plan_dft_2d +#define FFTW_EXECUTE fftwf_execute +#define FFTW_DESTROY_PLAN fftwf_destroy_plan +#define FFTW_CLEANUP fftwf_cleanup +#define SLEEFDFT_INIT1D SleefDFT_float_init1d +#define SLEEFDFT_INIT2D SleefDFT_float_init2d +#else +#error BASETYPEID not set +#endif + +static uint64_t timens() { + return std::chrono::duration_cast + (std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::from_time_t(0)).count(); +} + +template +class FFTFramework { +public: + virtual void execute() = 0; + virtual cplx* getInPtr() = 0; + virtual cplx* getOutPtr() = 0; + virtual ~FFTFramework() {}; + + int64_t niter(int64_t ns) { + int64_t niter 
= 10, t0, t1; + + for(;;) { + t0 = timens(); + for(int64_t i=0;i 1000LL * 1000 * 10) break; + niter *= 2; + } + + return 1 + int64_t((double)niter * ns / (t1 - t0)); + } +}; + +template +class FWSleefDFT : public FFTFramework { + const int n, m; + cplx* in; + cplx* out; + SleefDFT *plan; + +public: + FWSleefDFT(int n_, int m_, bool forward, bool mt, bool check) : n(n_), m(m_) { + SleefDFT_setDefaultVerboseFP(stderr); + SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET); + in = (cplx*)Sleef_malloc(sizeof(cplx) * n * m); + out = (cplx*)Sleef_malloc(sizeof(cplx) * n * m); + + if (!in || !out) { + cerr << "Sleef_malloc failed" << endl; + exit(-1); + } + + uint64_t mode = check ? SLEEF_MODE_ESTIMATE : SLEEF_MODE_MEASURE; + mode |= forward ? SLEEF_MODE_FORWARD : SLEEF_MODE_BACKWARD; + mode |= mt ? 0 : SLEEF_MODE_NO_MT; + //mode |= SLEEF_MODE_VERBOSE; + + if (m == 1) { + plan = SLEEFDFT_INIT1D(n, (xreal*)in, (xreal*)out, mode); + } else { + plan = SLEEFDFT_INIT2D(n, m, (xreal*)in, (xreal*)out, mode); + } + } + + string getPath() { + vector pathstr(1024); + SleefDFT_getPath(plan, pathstr.data(), pathstr.size()); + return pathstr.data(); + } + + ~FWSleefDFT() { + SleefDFT_dispose(plan); + Sleef_free(out); + Sleef_free(in); + } + + cplx* getInPtr () { return in ; } + cplx* getOutPtr() { return out; } + + void execute() { SleefDFT_execute(plan, NULL, NULL); } +}; + +template +class FWFFTW3 : public FFTFramework { + const int n, m; + cplx* in; + cplx* out; + FFTW_PLAN plan; + +public: + FWFFTW3(int n_, int m_, bool forward, bool mt, bool check) : n(n_), m(m_) { + //FFTW_CLEANUP(); + FFTW_PLAN_WITH_NTHREADS(mt ? omp_get_max_threads() : 1); + in = (cplx*)FFTW_MALLOC(sizeof(FFTW_COMPLEX) * n * m); + out = (cplx*)FFTW_MALLOC(sizeof(FFTW_COMPLEX) * n * m); + unsigned flags = check ? FFTW_ESTIMATE : FFTW_MEASURE; + if (m == 1) { + plan = FFTW_PLAN_DFT_1D(n, (FFTW_COMPLEX*)in, (FFTW_COMPLEX*)out, forward ? 
FFTW_FORWARD : FFTW_BACKWARD, flags); + } else { + plan = FFTW_PLAN_DFT_2D(n, m, (FFTW_COMPLEX*)in, (FFTW_COMPLEX*)out, forward ? FFTW_FORWARD : FFTW_BACKWARD, flags); + } + } + + ~FWFFTW3() { + FFTW_DESTROY_PLAN(plan); + FFTW_FREE(out); + FFTW_FREE(in); + } + + cplx* getInPtr() { return in; } + cplx* getOutPtr() { return out; } + + void execute() { FFTW_EXECUTE(plan); } +}; + +int main(int argc, char **argv) { + if (argc == 1) { + fprintf(stderr, "%s \n", argv[0]); + exit(-1); + } + + fftw_init_threads(); + + double measureTimeMillis = 3000; + if (argc >= 4) measureTimeMillis = atof(argv[3]); + + bool forward = true; + + int log2n = atoi(argv[1]); + if (log2n < 0) { + forward = false; + log2n = -log2n; + } + + const int n = 1 << log2n; + + const int log2m = argc >= 3 ? atoi(argv[2]) : 0; + const int m = 1 << log2m; + + cerr << "n = " << n << ", m = " << m << ", " << (forward ? "forward" : "backward") << endl; + + const int nrepeat = argc >= 5 ? atoi(argv[4]) : 1; + + vector mflops_sleefdftst, mflops_fftwst, mflops_sleefdftmt, mflops_fftwmt; + + vector> v(n * m); + for(int i=0;i>>(n, m, forward, true , true); + auto fftw = make_shared>>(n, m, forward, false, true); + + complex *in0 = sleefdft->getInPtr(); + complex *out0 = sleefdft->getOutPtr(); + complex *in1 = fftw->getInPtr(); + complex *out1 = fftw->getOutPtr(); + + for(int i=0;iexecute(); + fftw ->execute(); + + for(int i=0;i 0.1) { + cerr << "NG " << i << " : " << out0[i] << ", " << out1[i] << endl; + exit(-1); + } + } + + cerr << "Check OK" << endl; + } + + for(int nr = 0;nr < nrepeat;nr++) { + cerr << endl; +#if BASETYPEID == 1 + cerr << "DP "; +#elif BASETYPEID == 2 + cerr << "SP "; +#endif + cerr << "n = 2^" << log2n << " = " << n << ", m = 2^" << log2m << " = " << m << ", nr = " << nr << endl; + + // + + { + cerr << "Planning SleefDFT ST ... 
"; + int64_t ptm0 = timens(); + auto sleefdftst = make_shared>>(n, m, forward, false, false); + int64_t ptm1 = timens(); + cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl; + + cerr << sleefdftst->getPath() << endl; + + complex *in0 = sleefdftst->getInPtr(); + for(int i=0;initer(1000LL * 1000 * measureTimeMillis); + + cerr << "SleefDFT ST niter = " << niter << endl; + + for(int64_t i=0;iexecute(); // warm up + + int64_t tm0 = timens(); + for(int64_t i=0;iexecute(); + int64_t tm1 = timens(); + + double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000)); + if (m != 1) mflops *= m * log2m; + + fprintf(stderr, "%g Mflops\n", mflops); + + mflops_sleefdftst.push_back(mflops); + } + + // + + { + cerr << "Planning FFTW ST ... "; + int64_t ptm0 = timens(); + auto fftwst = make_shared>>(n, m, forward, false, false); + int64_t ptm1 = timens(); + cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl; + + complex *in0 = fftwst->getInPtr(); + for(int i=0;initer(1000LL * 1000 * measureTimeMillis); + + cerr << "FFTW ST niter = " << niter << endl; + + for(int64_t i=0;iexecute(); // warm up + + int64_t tm0 = timens(); + for(int64_t i=0;iexecute(); + int64_t tm1 = timens(); + + double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000)); + if (m != 1) mflops *= m * log2m; + + fprintf(stderr, "%g Mflops\n", mflops); + + mflops_fftwst.push_back(mflops); + } + + // + + { + cerr << "Planning SleefDFT MT ... 
"; + int64_t ptm0 = timens(); + auto sleefdftmt = make_shared>>(n, m, forward, true, false); + int64_t ptm1 = timens(); + cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl; + + cerr << sleefdftmt->getPath() << endl; + + complex *in0 = sleefdftmt->getInPtr(); + for(int i=0;initer(1000LL * 1000 * measureTimeMillis); + + cerr << "SleefDFT MT niter = " << niter << endl; + + for(int64_t i=0;iexecute(); // warm up + + int64_t tm0 = timens(); + for(int64_t i=0;iexecute(); + int64_t tm1 = timens(); + + double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000)); + if (m != 1) mflops *= m * log2m; + + fprintf(stderr, "%g Mflops\n", mflops); + + mflops_sleefdftmt.push_back(mflops); + } + + // + + { + cerr << "Planning FFTW MT ... "; + int64_t ptm0 = timens(); + auto fftwmt = make_shared>>(n, m, forward, true, false); + int64_t ptm1 = timens(); + cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl; + + complex *in0 = fftwmt->getInPtr(); + for(int i=0;initer(1000LL * 1000 * measureTimeMillis); + + cerr << "FFTW MT niter = " << niter << endl; + + for(int64_t i=0;iexecute(); // warm up + + int64_t tm0 = timens(); + for(int64_t i=0;iexecute(); + int64_t tm1 = timens(); + + double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000)); + if (m != 1) mflops *= m * log2m; + + fprintf(stderr, "%g Mflops\n", mflops); + + mflops_fftwmt.push_back(mflops); + } + } + + cerr << endl; + + cout << log2n << ", " << log2m << ", "; + + { + double f = 0; + for(auto a : mflops_sleefdftst) { + if (a > f) f = a; + } + cout << f << ", "; + } + + { + double f = 0; + for(auto a : mflops_sleefdftmt) { + if (a > f) f = a; + } + cout << f << ", "; + } + + { + double f = 0; + for(auto a : mflops_fftwst) { + if (a > f) f = a; + } + cout << f << ", "; + } + + { + double f = 0; + for(auto a : mflops_fftwmt) { + if (a > f) f = a; + } + cout << f << endl; + } + + // + + exit(0); +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/fftwtest1d.c 
b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/fftwtest1d.c index f53951bb6e8..920847c0df2 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/fftwtest1d.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft-tester/fftwtest1d.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -40,10 +40,22 @@ static double squ(double x) { return x * x; } double check_cf(int n) { fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + + if (!in || !out) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE); real *sx = (real *)Sleef_malloc(n*2*sizeof(real)); real *sy = (real *)Sleef_malloc(n*2*sizeof(real)); + + if (!sx || !sy) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE); for(int i=0;i THRES) || - (fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) { + if ((fabs(sx[(i*2+0)] - creal(fs[i])) > THRES) || + (fabs(sx[(i*2+1)] - cimag(fs[i])) > THRES)) { success = 0; } - - double t; - t = (sy[(i*2+0)] - creal(fs[i])); - rmsn += t*t; - t = (sy[(i*2+1)] - cimag(fs[i])); - rmsn += t*t; - rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]); } // @@ -148,7 +144,6 @@ int check_cf(int n) { free(ts); Sleef_free(sx); - Sleef_free(sy); SleefDFT_dispose(p); // @@ -161,11 +156,15 @@ int check_cb(int n) { int i; real *sx = (real *)Sleef_malloc(sizeof(real)*n*2); - real *sy = (real *)Sleef_malloc(sizeof(real)*n*2); cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n); cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n); + if (!sx || !ts || !fs) { + 
fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + // for(i=0;i THRES) || - (fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) { + if ((fabs(sx[(i*2+0)] - creal(ts[i])) > THRES) || + (fabs(sx[(i*2+1)] - cimag(ts[i])) > THRES)) { success = 0; } } @@ -202,7 +201,6 @@ int check_cb(int n) { free(ts); Sleef_free(sx); - Sleef_free(sy); SleefDFT_dispose(p); // @@ -214,12 +212,16 @@ int check_cb(int n) { int check_rf(int n) { int i; - real *sx = (real *)Sleef_malloc(n * sizeof(real)); - real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2); + real *sx = (real *)Sleef_malloc((n+2) * sizeof(real)); cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n); cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n); + if (!sx || !ts || !fs) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + // for(i=0;i THRES) success = 0; - if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0; + if (fabs(sx[(2*i+0)] - creal(fs[i])) > THRES) success = 0; + if (fabs(sx[(2*i+1)] - cimag(fs[i])) > THRES) success = 0; } // @@ -255,7 +259,6 @@ int check_rf(int n) { free(ts); Sleef_free(sx); - Sleef_free(sy); SleefDFT_dispose(p); // @@ -270,6 +273,11 @@ int check_rb(int n) { cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n); cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n); + if (!ts || !fs) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + // for(i=0;i THRES)) { + if ((fabs(sx[i] - creal(ts[i])) > THRES)) { success = 0; } } @@ -323,7 +335,6 @@ int check_rb(int n) { free(ts); Sleef_free(sx); - Sleef_free(sy); SleefDFT_dispose(p); // @@ -335,11 +346,15 @@ int check_arf(int n) { int i; real *sx = (real *)Sleef_malloc(n * sizeof(real)); - real *sy = (real *)Sleef_malloc(n * sizeof(real)); cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n); cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n); + if (!sx || !ts || !fs) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + // for(i=0;i THRES) success = 0; - if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0; + if (fabs(sx[(2*0+0)] - creal(fs[0 ])) > 
THRES) success = 0; + if (fabs(sx[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0; } else { - if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0; - if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0; + if (fabs(sx[(2*i+0)] - creal(fs[i])) > THRES) success = 0; + if (fabs(sx[(2*i+1)] - cimag(fs[i])) > THRES) success = 0; } } // + free(fs); + free(ts); + Sleef_free(sx); - Sleef_free(sy); SleefDFT_dispose(p); // @@ -394,6 +411,11 @@ int check_arb(int n) { cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n); cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n); + if (!sx || !sy || !ts || !fs) { + fprintf(stderr, "Memory allocation failed"); + exit(-1); + } + // for(i=0;i +#include +#include + +#include +#include +#include +#include +#include + +#include "sleef.h" +#include "sleefdft.h" + +using namespace std; + +vector doTransform(int mode) { + SleefDFT *p; + vector v; + vector s(1024); + + double *din = (double *)Sleef_malloc(2048*64*2 * sizeof(double)); + double *dout = (double *)Sleef_malloc(2048*64*2 * sizeof(double)); + + float *fin = (float *)Sleef_malloc(2048*64*2 * sizeof(double)); + float *fout = (float *)Sleef_malloc(2048*64*2 * sizeof(double)); + + // + + p = SleefDFT_double_init1d(1024, din, dout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("1d double 1024 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_double_init1d(512, din, dout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("1d double 512 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_float_init1d(1024, fin, fout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("1d float 1024 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_float_init1d(512, fin, fout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("1d float 512 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_double_init2d(2048, 64, din, dout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("2d double 
2048x64 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_double_init2d(128, 128, din, dout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("2d double 128x128 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_float_init2d(2048, 64, fin, fout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("2d float 2048x64 : " + string(s.data())); + SleefDFT_dispose(p); + + p = SleefDFT_float_init2d(128, 128, fin, fout, mode); + SleefDFT_getPath(p, s.data(), s.size()); + v.push_back("2d float 128x128 : " + string(s.data())); + SleefDFT_dispose(p); + + Sleef_free(din); + Sleef_free(dout); + Sleef_free(fin); + Sleef_free(fout); + + return v; +} + +void compare(vector &runa, vector &runb) { + if (runa.size() != runb.size()) { + cerr << "Lengths do not match" << endl; + exit(-1); + } + for(size_t i=0;i paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h + COMMAND $ paramonly ALL ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_SP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h DEPENDS ${TARGET_MKDISPATCH} ) add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h) @@ -282,49 +285,51 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) list(GET LISTLONGTYPENAME ${T} LT) # LT is "double" list(GET LISTTYPEID ${T} ID) # ID is 1 - string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h + string(CONCAT S "dispatch" ${ST} ".hpp") # S is dispatchdp.hpp add_custom_command(OUTPUT ${S} COMMENT "Generating ${S}" - COMMAND $ ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S} + COMMAND $ ${LT} ${CST} ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_${CST}} > ${S} DEPENDS ${TARGET_MKDISPATCH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) - string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated + string(CONCAT G ${S} "_generated") # G is dispatchdp.hpp_generated add_custom_target(${G} SOURCES ${S}) 
endforeach() # Target dftcommon.o -add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h) +add_library(dftcommon_obj OBJECT dftcommon.cpp dftcommon.hpp ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h) add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated) set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE) set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES}) target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS}) -# Target dft*.o +# Target dft.o -foreach(T ${LIST_SUPPORTED_FPTYPE}) - list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example +add_library(dft_obj OBJECT dft.cpp dftcommon.hpp) +add_dependencies(dft_obj "dispatchdp.hpp_generated" "dispatchsp.hpp_generated" dispatchparam.h_generated ${TARGET_HEADERS}) +set_target_properties(dft_obj PROPERTIES ${COMMON_TARGET_PROPERTIES}) +target_compile_definitions(dft_obj PRIVATE ${COMMON_TARGET_DEFINITIONS}) - string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj" - string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h" - add_library(${G} OBJECT dft.c dftcommon.h ${S}) - string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated" - add_dependencies(${G} ${SG} ${TARGET_HEADERS}) - set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - list(GET LISTTYPEID ${T} ID) # ID is 1 - target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS}) -endforeach() +# Copy unroll*.cpp.in to ${CMAKE_CURRENT_BINARY_DIR} -# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR} +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.cpp.in + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.cpp.in ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.cpp.in) +add_custom_target(unroll0.cpp.in.copied DEPENDS 
${CMAKE_CURRENT_BINARY_DIR}/unroll0.cpp.in) -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org) -add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org) +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll1.cpp.in ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll1.cpp.in) +add_custom_target(unroll1.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in) -# Target unroll*.c +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in) +add_custom_target(unroll2.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in) + +# Target unroll*.cpp foreach(T ${LIST_SUPPORTED_FPTYPE}) list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example @@ -333,7 +338,7 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) foreach(E ${ISALIST_${CST}}) # E is "sse2dp" foreach(N ${NLIST}) - string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c" + string(CONCAT UC unroll_ ${N} _ ${E} ".cpp") # UC is "unroll_0_sse2dp.cpp" set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC}) endforeach() endforeach() @@ -342,11 +347,31 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) if(UNROLL_TARGET_${CST}) add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}} COMMENT "Generating ${UNROLL_TARGET_${CST}}" - COMMAND $ ${LT} ${ISALIST_${CST}} + COMMAND $ unroll0.cpp.in ${LT} ${CST} - ${ISALIST_${CST}} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied + DEPENDS ${TARGET_MKUNROLL} unroll0.cpp.in.copied ) add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}}) + + # + + 
foreach(I ${LISTSHIFTSTR}) + foreach(E ${ISALIST_${CST}}) # E is "sse2dp" + foreach(N ${NLIST}) + string(CONCAT UC unroll_ ${N} _ ${E} _ ${I} ".cpp") # UC is "unroll_0_sse2dp_1.cpp" + set(UNROLL_TARGET_${CST}_${I} ${UNROLL_TARGET_${CST}_${I}} ${UC}) + endforeach() + endforeach() + message(STATUS "Unroll target for ${CST}_${I} : ${UNROLL_TARGET_${CST}_${I}}") + + add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}_${I}} + COMMENT "Generating ${UNROLL_TARGET_${CST}_${I}}" + COMMAND $ unroll1.cpp.in ${LT} ${CST} ${I} ${ISALIST_${CST}} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${TARGET_MKUNROLL} unroll1.cpp.in.copied + ) + add_custom_target(unroll_target_${ST}_${I} DEPENDS ${UNROLL_TARGET_${CST}_${I}}) + endforeach() endif() endforeach() @@ -359,43 +384,38 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) foreach(E ${ISALIST_${CST}}) # E is "sse2dp" foreach(N ${NLIST}) - string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp" + string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp" string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj" - string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c" + string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp.cpp" add_library(${UG} OBJECT ${UC}) set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES}) target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}}) target_compile_options(${UG} PRIVATE ${CFLAGS_${E}}) add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST}) + list(APPEND UNROLL_OBJECTS $) + + foreach(I ${LISTSHIFTSTR}) + string(CONCAT U unroll_ ${N} _ ${E} _ ${I}) # U is "unroll_0_sse2dp_1" + string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_1_obj" + string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp_1.cpp" + add_library(${UG} OBJECT ${UC}) + set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + target_include_directories(${UG} PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}) + target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}}) + target_compile_options(${UG} PRIVATE ${CFLAGS_${E}}) + add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST}_${I}) + list(APPEND UNROLL_OBJECTS $) + endforeach() endforeach() endforeach() endforeach() # Target libdft -add_library(${TARGET_LIBDFT} $ $) +add_library(${TARGET_LIBDFT} $ $ ${UNROLL_OBJECTS}) target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM}) -foreach(T ${LIST_SUPPORTED_FPTYPE}) - list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example - - string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj" - target_sources(${TARGET_LIBDFT} PRIVATE $) -endforeach() - -foreach(T ${LIST_SUPPORTED_FPTYPE}) - list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example - string(TOUPPER ${ST} CST) # CST is "DP" - - foreach(E ${ISALIST_${CST}}) # E is "sse2dp" - foreach(N ${NLIST}) - string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # U is "unroll_0_sse2dp_obj" - target_sources(${TARGET_LIBDFT} PRIVATE $) - endforeach() - endforeach() -endforeach() - set_target_properties(${TARGET_LIBDFT} PROPERTIES VERSION ${SLEEF_VERSION} SOVERSION ${SLEEF_SOVERSION} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/compat.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/compat.h new file mode 100644 index 00000000000..d8f14187e8a --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/compat.h @@ -0,0 +1,45 @@ +#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)) + +#include +#include +#include +#include +#include + +static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); } +static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); } +static void FTRUNCATE(FILE *fp, off_t z) { + if (ftruncate(fileno(fp), z)) + ; +} +static FILE *OPENTMPFILE() { return tmpfile(); } +static void CLOSETMPFILE(FILE *fp) { fclose(fp); } + +static sigjmp_buf sigjmp; 
+#define SETJMP(x) sigsetjmp(x, 1) +#define LONGJMP siglongjmp + +#else + +#include +#include +#include +#include + +static void FLOCK(FILE *fp) { } +static void FUNLOCK(FILE *fp) { } +static void FTRUNCATE(FILE *fp, long z) { + fseek(fp, 0, SEEK_SET); + SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp))); +} +static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); } +static void CLOSETMPFILE(FILE *fp) { + fclose(fp); + remove("tmpfile.txt"); +} + +static jmp_buf sigjmp; +#define SETJMP(x) setjmp(x) +#define LONGJMP longjmp + +#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.c deleted file mode 100644 index bc47589a12c..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.c +++ /dev/null @@ -1,1441 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "sleef.h" - -#include "misc.h" -#include "common.h" -#include "arraymap.h" -#include "dftcommon.h" - -#ifdef _OPENMP -#include -#endif - -#if BASETYPEID == 1 -typedef double real; -typedef Sleef_double2 sc_t; -#define BASETYPESTRING "double" -#define MAGIC 0x27182818 -#define MAGIC2D 0x17320508 -#define INIT SleefDFT_double_init1d -#define EXECUTE SleefDFT_double_execute -#define INIT2D SleefDFT_double_init2d -#define CTBL ctbl_double -#define REALSUB0 realSub0_double -#define REALSUB1 realSub1_double -#define GETINT getInt_double -#define GETPTR getPtr_double -#define DFTF dftf_double -#define DFTB dftb_double -#define TBUTF tbutf_double -#define TBUTB tbutb_double -#define BUTF butf_double -#define BUTB butb_double -#define SINCOSPI Sleef_sincospi_u05 -#include "dispatchdp.h" -#elif BASETYPEID == 2 -typedef float 
real; -typedef Sleef_float2 sc_t; -#define BASETYPESTRING "float" -#define MAGIC 0x31415926 -#define MAGIC2D 0x22360679 -#define INIT SleefDFT_float_init1d -#define EXECUTE SleefDFT_float_execute -#define INIT2D SleefDFT_float_init2d -#define CTBL ctbl_float -#define REALSUB0 realSub0_float -#define REALSUB1 realSub1_float -#define GETINT getInt_float -#define GETPTR getPtr_float -#define DFTF dftf_float -#define DFTB dftb_float -#define TBUTF tbutf_float -#define TBUTB tbutb_float -#define BUTF butf_float -#define BUTB butb_float -#define SINCOSPI Sleef_sincospif_u05 -#include "dispatchsp.h" -#else -#error No BASETYPEID specified -#endif - -#define IMPORT_IS_EXPORT -#include "sleefdft.h" - -// - -real CTBL[] = { - 0.7071067811865475243818940365159164684883L, -0.7071067811865475243818940365159164684883L, - 0.9238795325112867561014214079495587839119L, -0.382683432365089771723257530688933059082L, - 0.382683432365089771723257530688933059082L, -0.9238795325112867561014214079495587839119L, -#if MAXBUTWIDTH >= 5 - 0.9807852804032304491190993878113602022495L, -0.1950903220161282678433729148581576851029L, - 0.5555702330196022247573058028269343822103L, -0.8314696123025452370808655033762590846891L, - 0.8314696123025452370808655033762590846891L, -0.5555702330196022247573058028269343822103L, - 0.1950903220161282678433729148581576851029L, -0.9807852804032304491190993878113602022495L, -#endif -#if MAXBUTWIDTH >= 6 - 0.9951847266721968862310254699821143731242L, -0.09801714032956060199569840382660679267701L, - 0.6343932841636454982026105398063009488396L, -0.7730104533627369607965383602188325085081L, - 0.881921264348355029715105513066220055407L, -0.4713967368259976485449225247492677226546L, - 0.2902846772544623676448431737195932100803L, -0.9569403357322088649310892760624369657307L, - 0.9569403357322088649310892760624369657307L, -0.2902846772544623676448431737195932100803L, - 0.4713967368259976485449225247492677226546L, -0.881921264348355029715105513066220055407L, - 
0.7730104533627369607965383602188325085081L, -0.6343932841636454982026105398063009488396L, - 0.09801714032956060199569840382660679267701L, -0.9951847266721968862310254699821143731242L, -#endif -#if MAXBUTWIDTH >= 7 - 0.9987954562051723927007702841240899260811L, -0.04906767432741801425355085940205324135377L, - 0.6715589548470184006194634573905233310143L, -0.7409511253549590911932944126139233276263L, - 0.9039892931234433315823215138173907234886L, -0.427555093430282094315230886905077056781L, - 0.336889853392220050702686798271834334173L, -0.9415440651830207783906830087961026265475L, - 0.9700312531945439926159106824865574481009L, -0.2429801799032638899447731489766866275204L, - 0.5141027441932217266072797923204262815489L, -0.8577286100002720698929313536407192941624L, - 0.8032075314806449097991200569701675249235L, -0.5956993044924333434615715265891822127742L, - 0.1467304744553617516588479505190711904561L, -0.9891765099647809734561415551112872890371L, - 0.9891765099647809734561415551112872890371L, -0.1467304744553617516588479505190711904561L, - 0.5956993044924333434615715265891822127742L, -0.8032075314806449097991200569701675249235L, - 0.8577286100002720698929313536407192941624L, -0.5141027441932217266072797923204262815489L, - 0.2429801799032638899447731489766866275204L, -0.9700312531945439926159106824865574481009L, - 0.9415440651830207783906830087961026265475L, -0.336889853392220050702686798271834334173L, - 0.427555093430282094315230886905077056781L, -0.9039892931234433315823215138173907234886L, - 0.7409511253549590911932944126139233276263L, -0.6715589548470184006194634573905233310143L, - 0.04906767432741801425355085940205324135377L, -0.9987954562051723927007702841240899260811L, -#endif -}; - -#ifndef ENABLE_STREAM -#error ENABLE_STREAM not defined -#endif - -static const int constK[] = { 0, 2, 6, 14, 38, 94, 230, 542, 1254 }; - -extern const char *configStr[]; - -extern int planFilePathSet; - -// Utility functions - -#if defined(_MSC_VER) || defined(__MINGW32__) || 
defined(__MINGW64__) -static jmp_buf sigjmp; -#define SETJMP(x) setjmp(x) -#define LONGJMP longjmp -#else -static sigjmp_buf sigjmp; -#define SETJMP(x) sigsetjmp(x, 1) -#define LONGJMP siglongjmp -#endif - -static void sighandler(int signum) { LONGJMP(sigjmp, 1); } - -static int checkISAAvailability(int isa) { - signal(SIGILL, sighandler); - - if (SETJMP(sigjmp) == 0) { - int ret = GETINT[isa] != NULL && (*GETINT[isa])(BASETYPEID); - signal(SIGILL, SIG_DFL); - return ret; - } - - signal(SIGILL, SIG_DFL); - return 0; -} - -#ifdef _OPENMP -static int omp_thread_count() { - int n = 0; -#pragma omp parallel reduction(+:n) - n += 1; - return n; -} -#endif - -static void startAllThreads(const int nth) { -#ifdef _OPENMP - volatile int8_t *state = calloc(nth, 1); - int th=0; -#pragma omp parallel for - for(th=0;thlog2len; - if (level == N) { - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - void (*func)(real *, const real *, const int) = DFTF[config][p->isa][N]; - (*func)(d, s, log2len-N); - } else { - void (*func)(real *, const real *, const int) = DFTB[config][p->isa][N]; - (*func)(d, s, log2len-N); - } - } else if (level == log2len) { - assert(p->vecwidth <= (1 << N)); - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTF[config][p->isa][N]; - (*func)(d, p->perm[level], s, log2len-N, p->tbl[N][level], K); - } else { - void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTB[config][p->isa][N]; - (*func)(d, p->perm[level], s, log2len-N, p->tbl[N][level], K); - } - } else { - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTF[config][p->isa][N]; - (*func)(d, p->perm[level], log2len-level, s, log2len-N, p->tbl[N][level], K); - } else { - void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTB[config][p->isa][N]; - 
(*func)(d, p->perm[level], log2len-level, s, log2len-N, p->tbl[N][level], K); - } - } -} - -// Transposer - -#if defined(__GNUC__) && __GNUC__ < 5 -// This is another workaround of a bug in gcc-4 -#define LOG2BS 3 -#else -#define LOG2BS 4 -#endif - -#define BS (1 << LOG2BS) -#define TRANSPOSE_BLOCK(y2) do { \ - for(int x2=y2+1;x2= N-1) return cnt; - const int level = levelorg - levelinc; - if (bot - top > 4) { - const int bl = 1 << (N - levelinc); - const int w = bl/4; - for(int j=0;j<(bot-top)/bl;j++) { - for(int i=0;i> 1) | ((k & 0x55555555) << 1)); - r = (((r & 0xcccccccc) >> 2) | ((r & 0x33333333) << 2)); - r = (((r & 0xf0f0f0f0) >> 4) | ((r & 0x0f0f0f0f) << 4)); - r = (((r & 0xff00ff00) >> 8) | ((r & 0x00ff00ff) << 8)); - r = ((r >> 16) | (r << 16)) >> (32-nbits); - - return (((r << s) | (k & ~(-1 << s))) & ~(-1 << d)) | - ((((k >> s) | (r & (-1 << (nbits-s)))) << d) & ~(-1 << nbits)); -} - -static real **makeTable(int sign, int vecwidth, int log2len, const int N, const int K) { - if (log2len < N) return NULL; - - int *p = (int *)malloc(sizeof(int)*((N+1)<bestTime = tm; - for(uint32_t j = 0;j < p->log2len+1;j++) { - p->bestPathConfig[j] = pathConfig[j]; - p->bestPath[j] = path[j]; - } - return nTrial; - } - - if (level < 1) return nTrial-1; - - for(int i=0;i<10;i++) { - int N; - - do { - N = 1 + rand() % MAXBUTWIDTH; - } while(p->tm[0][level*(MAXBUTWIDTH+1)+N] >= 1ULL << 60); - - if (p->vecwidth > (1 << N) || N == p->log2len) continue; - - path[level] = N; - for(;;) { - pathConfig[level] = rand() % CONFIGMAX; -#if ENABLE_STREAM == 0 - pathConfig[level] &= ~1; -#endif - if ((p->mode2 & SLEEF_MODE2_MT1D) == 0 && (pathConfig[level] & CONFIG_MT) != 0) continue; - break; - } - for(int j = level-1;j >= 0;j--) path[j] = 0; - nTrial = searchForRandomPathRecurse(p, level - N, path, pathConfig, 0, nTrial); - if (nTrial <= 0) break; - if (p->bestTime < 1ULL << 60) break; - } - - return nTrial - 1; -} - -// Planner - -#define NSHORTESTPATHS 15 -#define MAXPATHLEN 
(MAXLOG2LEN+1) -#define POSMAX (CONFIGMAX * MAXLOG2LEN * (MAXBUTWIDTH+1)) - -static int cln2pos(int config, int level, int N) { return (config * MAXLOG2LEN + level) * MAXBUTWIDTH + N; } -static int pos2config(int pos) { return pos == -1 ? -1 : ((pos - 1) / (MAXBUTWIDTH * MAXLOG2LEN)); } -static int pos2level(int pos) { return pos == -1 ? -1 : (((pos - 1) / MAXBUTWIDTH) % MAXLOG2LEN); } -static int pos2N(int pos) { return pos == -1 ? -1 : ((pos - 1) % MAXBUTWIDTH + 1); } - -typedef struct { - SleefDFT *p; - - int countu[POSMAX]; - int path[NSHORTESTPATHS][MAXPATHLEN]; - int pathLen[NSHORTESTPATHS]; - uint64_t cost[NSHORTESTPATHS]; - int nPaths; - - int *heap; - int *heapLen; - uint64_t *heapCost; - int heapSize, nPathsInHeap; -} ks_t; - -static ks_t *ksInit(SleefDFT *p) { - ks_t *q = calloc(1, sizeof(ks_t)); - q->p = p; - q->heapSize = 10; - q->heap = calloc(q->heapSize, sizeof(int)*MAXPATHLEN); - q->heapCost = calloc(q->heapSize, sizeof(uint64_t)); - q->heapLen = calloc(q->heapSize, sizeof(int)); - return q; -} - -static void ksDispose(ks_t *q) { - free(q->heapCost); - free(q->heapLen); - free(q->heap); - free(q); -} - -// returns the number of paths in the heap -static int ksSize(ks_t *q) { return q->nPathsInHeap; } - -// adds a path to the heap -static void ksAddPath(ks_t *q, int *path, int pathLen, uint64_t cost) { - assert(pathLen <= MAXPATHLEN); - - if (q->nPathsInHeap == q->heapSize) { - q->heapSize *= 2; - q->heap = realloc(q->heap, q->heapSize * sizeof(int)*MAXPATHLEN); - q->heapCost = realloc(q->heapCost, q->heapSize * sizeof(uint64_t)); - q->heapLen = realloc(q->heapLen, q->heapSize * sizeof(int)); - } - - for(int i=0;iheap[q->nPathsInHeap * MAXPATHLEN + i] = path[i]; - q->heapLen[q->nPathsInHeap] = pathLen; - q->heapCost[q->nPathsInHeap] = cost; - q->nPathsInHeap++; -} - -// returns the cost of n-th paths in the heap -static uint64_t ksCost(ks_t *q, int n) { - assert(0 <= n && n < q->nPathsInHeap); - return q->heapCost[n]; -} - -// copies the n-th paths 
in the heap to path, returns its length -static int ksGetPath(ks_t *q, int *path, int n) { - assert(0 <= n && n < q->nPathsInHeap); - int len = q->heapLen[n]; - for(int i=0;iheap[n * MAXPATHLEN + i]; - return len; -} - -// removes the n-th paths in the heap -static void ksRemove(ks_t *q, int n) { - assert(0 <= n && n < q->nPathsInHeap); - - for(int i=n;inPathsInHeap-1;i++) { - int len = q->heapLen[i+1]; - assert(len < MAXPATHLEN); - for(int j=0;jheap[i * MAXPATHLEN + j] = q->heap[(i+1) * MAXPATHLEN + j]; - q->heapLen[i] = q->heapLen[i+1]; - q->heapCost[i] = q->heapCost[i+1]; - } - q->nPathsInHeap--; -} - -// returns the countu value at pos -static int ksCountu(ks_t *q, int pos) { - assert(0 <= pos && pos < POSMAX); - return q->countu[pos]; -} - -// set the countu value at pos to n -static void ksSetCountu(ks_t *q, int pos, int n) { - assert(0 <= pos && pos < POSMAX); - q->countu[pos] = n; -} - -// adds a path as one of the best k paths, returns the number best paths -static int ksAddBestPath(ks_t *q, int *path, int pathLen, uint64_t cost) { - assert(pathLen <= MAXPATHLEN); - assert(q->nPaths < NSHORTESTPATHS); - for(int i=0;ipath[q->nPaths][i] = path[i]; - q->pathLen[q->nPaths] = pathLen; - q->cost[q->nPaths] = cost; - q->nPaths++; - return q->nPaths; -} - -// returns if pos is a destination -static int ksIsDest(ks_t *q, int pos) { return pos2level(pos) == 0; } - -// returns n-th adjacent nodes at pos. -static int ksAdjacent(ks_t *q, int pos, int n) { - if (pos != -1 && pos2level(pos) == 0) return -1; - - int NMAX = MIN(MIN(q->p->log2len, MAXBUTWIDTH+1), q->p->log2len - q->p->log2vecwidth + 1); - - if (pos == -1) { - int N = n / 2 + MAX(q->p->log2vecwidth, 1); - if (N >= NMAX) return -1; - return cln2pos((n & 1) * CONFIG_MT, q->p->log2len, N); - } - - int config = (pos2config(pos) & CONFIG_MT); - int N = n + 1; - int level = pos2level(pos) - pos2N(pos); - - if (level < 0 || N >= NMAX) return -1; - if (level == 0) return n == 0 ? 
cln2pos(0, 0, 0) : -1; - - return cln2pos(config, level, N); -} - -static uint64_t ksAdjacentCost(ks_t *q, int pos, int n) { - int nxpos = ksAdjacent(q, pos, n); - if (nxpos == -1) return 0; - int config = pos2config(nxpos), level = pos2level(nxpos), N = pos2N(nxpos); - uint64_t ret0 = q->p->tm[config | 0][level*(MAXBUTWIDTH+1) + N]; - uint64_t ret1 = q->p->tm[config | 1][level*(MAXBUTWIDTH+1) + N]; - return MIN(ret0, ret1); -} - -static void searchForBestPath(SleefDFT *p) { - ks_t *q = ksInit(p); - - for(int i=0;;i++) { - int v = ksAdjacent(q, -1, i); - if (v == -1) break; - uint64_t c = ksAdjacentCost(q, -1, i); - int path[1] = { v }; - ksAddPath(q, path, 1, c); - } - - while(ksSize(q) != 0) { - uint64_t bestCost = 1ULL << 60; - int bestPathNum = -1; - - for(int i=0;i= NSHORTESTPATHS) continue; - ksSetCountu(q, lastPos, ksCountu(q, lastPos)+1); - - if (ksIsDest(q, lastPos)) { - if (ksAddBestPath(q, path, pathLen, cost) >= NSHORTESTPATHS) break; - continue; - } - - for(int i=0;;i++) { - int v = ksAdjacent(q, lastPos, i); - if (v == -1) break; - assert(0 <= pos2N(v) && pos2N(v) <= q->p->log2len); - uint64_t c = ksAdjacentCost(q, lastPos, i); - path[pathLen] = v; - ksAddPath(q, path, pathLen+1, cost + c); - } - } - - for(int j = p->log2len;j >= 0;j--) p->bestPath[j] = 0; - - if (((p->mode & SLEEF_MODE_MEASURE) != 0 || (planFilePathSet && (p->mode & SLEEF_MODE_MEASUREBITS) == 0))) { - uint64_t besttm = 1ULL << 62; - int bestPath = -1; - const int niter = 1 + 5000000 / ((1 << p->log2len) + 1); - - real *s2 = NULL, *d2 = NULL; - const real *s = p->in == NULL ? (s2 = (real *)memset(Sleef_malloc((2 << p->log2len) * sizeof(real)), 0, sizeof(real) * (2 << p->log2len))) : p->in; - real *d = p->out == NULL ? 
(d2 = (real *)memset(Sleef_malloc((2 << p->log2len) * sizeof(real)), 0, sizeof(real) * (2 << p->log2len))) : p->out; - -#ifdef _OPENMP - const int tn = omp_get_thread_num(); -#else - const int tn = 0; -#endif - - real *t[] = { p->x1[tn], p->x0[tn], d }; - - for(int mt=0;mt<2;mt++) { - for(int i=q->nPaths-1;i>=0;i--) { - if (((pos2config(q->path[i][0]) & CONFIG_MT) != 0) != mt) continue; - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) { - for(int j=0;jpathLen[i];j++) { - int N = pos2N(q->path[i][j]); - int level = pos2level(q->path[i][j]); - int config = pos2config(q->path[i][j]) & ~1; - uint64_t t0 = q->p->tm[config | 0][level*(MAXBUTWIDTH+1) + N]; - uint64_t t1 = q->p->tm[config | 1][level*(MAXBUTWIDTH+1) + N]; - config = t0 < t1 ? config : (config | 1); - - if (N != 0) printf("%d(%s) ", N, configStr[config]); - } - } - - if (mt) startAllThreads(p->nThread); - - uint64_t tm0 = Sleef_currentTimeMicros(); - for(int k=0;kpathLen & 1) == 1) nb = -1; - for(int level = p->log2len, j=0;level >= 1;j++) { - assert(pos2level(q->path[i][j]) == level); - int N = pos2N(q->path[i][j]); - int config = pos2config(q->path[i][j]) & ~1; - uint64_t t0 = q->p->tm[config | 0][level*(MAXBUTWIDTH+1) + N]; - uint64_t t1 = q->p->tm[config | 1][level*(MAXBUTWIDTH+1) + N]; - config = t0 < t1 ? config : (config | 1); - dispatch(p, N, t[nb+1], lb, level, config); - level -= N; - lb = t[nb+1]; - nb = (nb + 1) & 1; - } - } - uint64_t tm1 = Sleef_currentTimeMicros(); - for(int k=0;kpathLen & 1) == 1) nb = -1; - for(int level = p->log2len, j=0;level >= 1;j++) { - assert(pos2level(q->path[i][j]) == level); - int N = pos2N(q->path[i][j]); - int config = pos2config(q->path[i][j]) & ~1; - uint64_t t0 = q->p->tm[config | 0][level*(MAXBUTWIDTH+1) + N]; - uint64_t t1 = q->p->tm[config | 1][level*(MAXBUTWIDTH+1) + N]; - config = t0 < t1 ? 
config : (config | 1); - dispatch(p, N, t[nb+1], lb, level, config); - level -= N; - lb = t[nb+1]; - nb = (nb + 1) & 1; - } - } - uint64_t tm2 = Sleef_currentTimeMicros(); - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf(" : %lld %lld\n", (long long int)(tm1 - tm0), (long long int)(tm2 - tm1)); - if ((tm1 - tm0) < besttm) { - bestPath = i; - besttm = tm1 - tm0; - } - if ((tm2 - tm1) < besttm) { - bestPath = i; - besttm = tm2 - tm1; - } - } - } - - for(int level = p->log2len, j=0;level >= 1;j++) { - assert(pos2level(q->path[bestPath][j]) == level); - int N = pos2N(q->path[bestPath][j]); - - int config = pos2config(q->path[bestPath][j]) & ~1; - uint64_t t0 = q->p->tm[config | 0][level*(MAXBUTWIDTH+1) + N]; - uint64_t t1 = q->p->tm[config | 1][level*(MAXBUTWIDTH+1) + N]; - config = t0 < t1 ? config : (config | 1); - - p->bestPath[level] = N; - p->bestPathConfig[level] = config; - level -= N; - } - - if (d2 != NULL) Sleef_free(d2); - if (s2 != NULL) Sleef_free(s2); - } else { - for(int level = p->log2len, j=0;level >= 1;j++) { - int bestPath = 0; - assert(pos2level(q->path[bestPath][j]) == level); - int N = pos2N(q->path[bestPath][j]); - int config = pos2config(q->path[bestPath][j]); - p->bestPath[level] = N; - p->bestPathConfig[level] = config; - level -= N; - } - } - - ksDispose(q); -} - -// - -static uint64_t estimate(int log2len, int level, int N, int config) { - uint64_t ret = N * 1000 + ABS(N-3) * 1000; - if (log2len >= 14 && (config & CONFIG_MT) != 0) ret /= 2; - return ret; -} - -static void measureBut(SleefDFT *p) { - if (p->x0 == NULL) return; - - // - -#ifdef _OPENMP - const int tn = omp_get_thread_num(); -#else - const int tn = 0; -#endif - - real *s = (real *)memset(p->x0[tn], 0, sizeof(real) * (2 << p->log2len)); - real *d = (real *)memset(p->x1[tn], 0, sizeof(real) * (2 << p->log2len)); - - const int niter = 1 + 100000 / ((1 << p->log2len) + 1); - -#define MEASURE_REPEAT 4 - - for(int rep=1;rep<=MEASURE_REPEAT;rep++) { - for(int config=0;configmode2 & 
SLEEF_MODE2_MT1D) == 0 && (config & CONFIG_MT) != 0) continue; - for(uint32_t level = p->log2len;level >= 1;level--) { - for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { - if (level < N || p->log2len <= N) continue; - if (level == N) { - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - - uint64_t tm = Sleef_currentTimeMicros(); - for(int i=0;itm[config][level*(MAXBUTWIDTH+1)+N] = MIN(p->tm[config][level*(MAXBUTWIDTH+1)+N], tm); - } else if (level == p->log2len) { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > (1 << N)) continue; - if ((config & CONFIG_MT) != 0) { - int i1=0; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for(i1=0;i1 < (1 << (p->log2len-N-p->log2vecwidth));i1++) { - int i0 = i1 << p->log2vecwidth; - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - } else { - for(int i0=0, i1=0;i0 < (1 << (p->log2len-N));i0+=p->vecwidth, i1++) { - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - } - - uint64_t tm = Sleef_currentTimeMicros(); - for(int i=0;itm[config][level*(MAXBUTWIDTH+1)+N] = MIN(p->tm[config][level*(MAXBUTWIDTH+1)+N], tm); - } else { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > 2 && p->log2len <= N+2) continue; - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - if ((config & CONFIG_MT) != 0) { - int i1=0; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for(i1=0;i1 < (1 << (p->log2len-N-p->log2vecwidth));i1++) { - int i0 = i1 << p->log2vecwidth; - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - } else { - for(int i0=0, i1=0;i0 < (1 << (p->log2len-N));i0+=p->vecwidth, i1++) { - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - } - - uint64_t tm = Sleef_currentTimeMicros(); - for(int i=0;itm[config][level*(MAXBUTWIDTH+1)+N] = MIN(p->tm[config][level*(MAXBUTWIDTH+1)+N], tm); - } - } - } 
- } - } - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) { - for(uint32_t level = p->log2len;level >= 1;level--) { - for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { - if (level < N || p->log2len <= N) continue; - if (level == N) { - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - printf("bot %d, %d, %d, ", p->log2len, level, N); - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] == 1ULL << 60) { - printf("N/A, "); - } else { - printf("%lld, ", (long long int)p->tm[config][level*(MAXBUTWIDTH+1)+N]); - } - } - printf("\n"); - } else if (level == p->log2len) { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > (1 << N)) continue; - printf("top %d, %d, %d, ", p->log2len, level, N); - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] == 1ULL << 60) { - printf("N/A, "); - } else { - printf("%lld, ", (long long int)p->tm[config][level*(MAXBUTWIDTH+1)+N]); - } - } - printf("\n"); - } else { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > 2 && p->log2len <= N+2) continue; - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - printf("mid %d, %d, %d, ", p->log2len, level, N); - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] == 1ULL << 60) { - printf("N/A, "); - } else { - printf("%lld, ", (long long int)p->tm[config][level*(MAXBUTWIDTH+1)+N]); - } - } - printf("\n"); - } - } - } - } -} - -static void estimateBut(SleefDFT *p) { - for(uint32_t level = p->log2len;level >= 1;level--) { - for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { - if (level < N || p->log2len <= N) continue; - if (level == N) { - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] = estimate(p->log2len, level, N, config); - } - } else if (level == p->log2len) { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > (1 << N)) continue; - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] = 
estimate(p->log2len, level, N, config); - } - } else { - if (p->tbl[N] == NULL || p->tbl[N][level] == NULL) continue; - if (p->vecwidth > 2 && p->log2len <= N+2) continue; - if ((int)p->log2len - (int)level < p->log2vecwidth) continue; - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] = estimate(p->log2len, level, N, config); - } - } - } - } -} - -static int measure(SleefDFT *p, int randomize) { - if (p->log2len == 1) { - p->bestTime = 1ULL << 60; - - p->pathLen = 1; - p->bestPath[1] = 1; - - return 1; - } - - if (PlanManager_loadMeasurementResultsP(p, (p->mode & SLEEF_MODE_NO_MT) != 0 ? 1 : 0)) { - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) { - printf("Path(loaded) : "); - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]); - printf("\n"); - } - - return 1; - } - - int toBeSaved = 0; - - for(uint32_t level = p->log2len;level >= 1;level--) { - for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { - for(int config=0;configtm[config][level*(MAXBUTWIDTH+1)+N] = 1ULL << 60; - } - } - } - - if (((p->mode & SLEEF_MODE_MEASURE) != 0 || (planFilePathSet && (p->mode & SLEEF_MODE_MEASUREBITS) == 0)) && !randomize) { - measureBut(p); - toBeSaved = 1; - } else { - estimateBut(p); - } - - int executable = 0; - for(int i=1;i<=MAXBUTWIDTH && !executable;i++) { - if (p->tm[0][p->log2len*(MAXBUTWIDTH+1)+i] < (1ULL << 60)) executable = 1; - } - - if (!executable) return 0; - - p->bestTime = 1ULL << 60; - - p->bestPath[p->log2len] = 0; - - if (!randomize) { - searchForBestPath(p); - } else { - int path[MAXLOG2LEN+1]; - int pathConfig[MAXLOG2LEN+1]; - for(int j = p->log2len;j >= 0;j--) path[j] = pathConfig[j] = 0; - - int nTrial = 100000; - do { - nTrial = searchForRandomPathRecurse(p, p->log2len, path, pathConfig, 0, nTrial); - } while(p->bestTime == 1ULL << 60 && nTrial >= 0); - } - - if (p->bestPath[p->log2len] == 0) return 0; - - p->pathLen = 0; - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) 
p->pathLen++; - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) { - printf("Path"); - if (randomize) printf("(random) :"); - else if (toBeSaved) printf("(measured) :"); - else printf("(estimated) :"); - - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]); - printf("\n"); - } - - if (toBeSaved) { - PlanManager_saveMeasurementResultsP(p, (p->mode & SLEEF_MODE_NO_MT) != 0 ? 1 : 0); - } - - return 1; -} - -static void measureTranspose(SleefDFT *p) { - if (PlanManager_loadMeasurementResultsT(p)) { - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose NoMT(loaded): %lld\n", (long long int)p->tmNoMT); - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose MT(loaded): %lld\n", (long long int)p->tmMT); - return; - } - - if ((p->mode & SLEEF_MODE_MEASURE) == 0 && (!planFilePathSet || (p->mode & SLEEF_MODE_MEASUREBITS) != 0)) { - if (p->log2hlen + p->log2vlen >= 14) { - p->tmNoMT = 20; - p->tmMT = 10; - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose : selected MT(estimated)\n"); - } else { - p->tmNoMT = 10; - p->tmMT = 20; - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose : selected NoMT(estimated)\n"); - } - return; - } - - real *tBuf2 = (real *)Sleef_malloc(sizeof(real)*2*p->hlen*p->vlen); - - const int niter = 1 + 5000000 / (p->hlen * p->vlen + 1); - uint64_t tm; - - tm = Sleef_currentTimeMicros(); - for(int i=0;itBuf, p->log2hlen, p->log2vlen); - transpose(tBuf2, p->tBuf, p->log2vlen, p->log2hlen); - } - p->tmNoMT = Sleef_currentTimeMicros() - tm + 1; - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose NoMT(measured): %lld\n", (long long int)p->tmNoMT); - -#ifdef _OPENMP - tm = Sleef_currentTimeMicros(); - for(int i=0;itBuf, p->log2hlen, p->log2vlen); - transposeMT(tBuf2, p->tBuf, p->log2vlen, p->log2hlen); - } - p->tmMT = Sleef_currentTimeMicros() - tm + 1; - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("transpose MT(measured): %lld\n", (long long 
int)p->tmMT); -#else - p->tmMT = p->tmNoMT*2; -#endif - - Sleef_free(tBuf2); - - PlanManager_saveMeasurementResultsT(p); -} - -// Implementation of SleefDFT_*_init1d - -EXPORT SleefDFT *INIT(uint32_t n, const real *in, real *out, uint64_t mode) { - SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); - p->magic = MAGIC; - p->baseTypeID = BASETYPEID; - p->in = (const void *)in; - p->out = (void *)out; - - // Mode - - p->mode = mode; - - if ((p->mode & SLEEF_MODE_NO_MT) == 0) { - p->mode2 |= SLEEF_MODE2_MT1D; - } - - if ((mode & SLEEF_MODE_REAL) != 0) n /= 2; - p->log2len = ilog2(n); - - if (p->log2len <= 1) return p; - - if ((mode & SLEEF_MODE_ALT) != 0) p->mode = mode = mode ^ SLEEF_MODE_BACKWARD; - -#ifdef _OPENMP - p->nThread = omp_thread_count(); -#else - p->nThread = 1; - p->mode2 &= ~SLEEF_MODE2_MT1D; -#endif - - // ISA availability - - int bestPriority = -1; - p->isa = -1; - - for(int i=0;i= (uint32_t)((*GETINT[i])(GETINT_VECWIDTH) * (*GETINT[i])(GETINT_VECWIDTH))) { - bestPriority = (*GETINT[i])(GETINT_DFTPRIORITY); - p->isa = i; - } - } - - if (p->isa == -1) { - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("ISA not available\n"); - p->magic = 0; - free(p); - return NULL; - } - - // Tables - - p->perm = (uint32_t **)calloc(sizeof(uint32_t *), p->log2len+1); - for(int level = p->log2len;level >= 1;level--) { - p->perm[level] = (uint32_t *)Sleef_malloc(sizeof(uint32_t) * ((1 << p->log2len) + 8)); - } - - p->x0 = malloc(sizeof(real *) * p->nThread); - p->x1 = malloc(sizeof(real *) * p->nThread); - - for(int i=0;inThread;i++) { - p->x0[i] = (real *)Sleef_malloc(sizeof(real) * 2 * n); - p->x1[i] = (real *)Sleef_malloc(sizeof(real) * 2 * n); - } - - if ((mode & SLEEF_MODE_REAL) != 0) { - p->rtCoef0 = (real *)Sleef_malloc(sizeof(real) * n); - p->rtCoef1 = (real *)Sleef_malloc(sizeof(real) * n); - - if ((mode & SLEEF_MODE_BACKWARD) == 0) { - for(uint32_t i=0;irtCoef0)[i*2+0] = ((real *)p->rtCoef0)[i*2+1] = (real)0.5 - (real)0.5 * sc.x; - ((real 
*)p->rtCoef1)[i*2+0] = ((real *)p->rtCoef1)[i*2+1] = (real)0.5*sc.y; - } - } else { - for(uint32_t i=0;irtCoef0)[i*2+0] = ((real *)p->rtCoef0)[i*2+1] = (real)0.5 + (real)0.5 * sc.x; - ((real *)p->rtCoef1)[i*2+0] = ((real *)p->rtCoef1)[i*2+1] = (real)0.5*sc.y; - } - } - } - - // Measure - - int sign = (mode & SLEEF_MODE_BACKWARD) != 0 ? -1 : 1; - - p->vecwidth = (*GETINT[p->isa])(GETINT_VECWIDTH); - p->log2vecwidth = ilog2(p->vecwidth); - - for(int i=1;i<=MAXBUTWIDTH;i++) { - ((real ***)p->tbl)[i] = makeTable(sign, p->vecwidth, p->log2len, i, constK[i]); - } - - if (!measure(p, (mode & SLEEF_MODE_DEBUG))) { - // Fall back to the first ISA - freeTables(p); - p->isa = 0; - - p->vecwidth = (*GETINT[p->isa])(GETINT_VECWIDTH); - p->log2vecwidth = ilog2(p->vecwidth); - - for(int i=1;i<=MAXBUTWIDTH;i++) { - ((real ***)p->tbl)[i] = makeTable(sign, p->vecwidth, p->log2len, i, constK[i]); - } - - for(int level = p->log2len;level >= 1;) { - int N = ABS(p->bestPath[level]); - if (level == N) { level -= N; continue; } - - int i1 = 0; - for(int i0=0;i0 < (1 << (p->log2len-N));i0+=p->vecwidth, i1++) { - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - for(;i1 < (1 << p->log2len) + 8;i1++) p->perm[level][i1] = 0; - - level -= N; - } - - if (!measure(p, (mode & SLEEF_MODE_DEBUG))) { - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Suitable ISA not found. 
This should not happen.\n"); - return NULL; - } - } - - for(int level = p->log2len;level >= 1;) { - int N = ABS(p->bestPath[level]); - if (level == N) { level -= N; continue; } - - int i1 = 0; - for(int i0=0;i0 < (1 << (p->log2len-N));i0+=p->vecwidth, i1++) { - p->perm[level][i1] = 2*perm(p->log2len, i0, p->log2len-level, p->log2len-(level-N)); - } - for(;i1 < (1 << p->log2len) + 8;i1++) p->perm[level][i1] = 0; - - level -= N; - } - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("ISA : %s %d bit %s\n", (char *)(*GETPTR[p->isa])(0), (int)(GETINT[p->isa](GETINT_VECWIDTH) * sizeof(real) * 16), BASETYPESTRING); - - return p; -} - -// Implementation of SleefDFT_*_init2d - -EXPORT SleefDFT *INIT2D(uint32_t vlen, uint32_t hlen, const real *in, real *out, uint64_t mode) { - SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); - p->magic = MAGIC2D; - p->mode = mode; - p->baseTypeID = BASETYPEID; - p->in = in; - p->out = out; - p->hlen = hlen; - p->log2hlen = ilog2(hlen); - p->vlen = vlen; - p->log2vlen = ilog2(vlen); - - uint64_t mode1D = mode; - mode1D |= SLEEF_MODE_NO_MT; - - if ((mode & SLEEF_MODE_NO_MT) == 0) p->mode3 |= SLEEF_MODE3_MT2D; - - p->instH = p->instV = INIT(hlen, NULL, NULL, mode1D); - if (hlen != vlen) p->instV = INIT(vlen, NULL, NULL, mode1D); - - p->tBuf = (void *)Sleef_malloc(sizeof(real)*2*hlen*vlen); - - measureTranspose(p); - - return p; -} - -// Implementation of SleefDFT_*_execute - -EXPORT void EXECUTE(SleefDFT *p, const real *s0, real *d0) { - assert(p != NULL && (p->magic == MAGIC || p->magic == MAGIC2D)); - - const real *s = s0 == NULL ? p->in : s0; - real *d = d0 == NULL ? 
p->out : d0; - - if (p->magic == MAGIC2D) { - // S -> T -> D -> T -> D - - real *tBuf = (real *)(p->tBuf); - -#ifdef _OPENMP - if ((p->mode3 & SLEEF_MODE3_MT2D) != 0 && - (((p->mode & SLEEF_MODE_DEBUG) == 0 && p->tmMT < p->tmNoMT) || - ((p->mode & SLEEF_MODE_DEBUG) != 0 && (rand() & 1)))) - { - int y=0; -#pragma omp parallel for - for(y=0;yvlen;y++) { - EXECUTE(p->instH, &s[p->hlen*2*y], &tBuf[p->hlen*2*y]); - } - - transposeMT(d, tBuf, p->log2vlen, p->log2hlen); - -#pragma omp parallel for - for(y=0;yhlen;y++) { - EXECUTE(p->instV, &d[p->vlen*2*y], &tBuf[p->vlen*2*y]); - } - - transposeMT(d, tBuf, p->log2hlen, p->log2vlen); - } else -#endif - { - for(int y=0;yvlen;y++) { - EXECUTE(p->instH, &s[p->hlen*2*y], &tBuf[p->hlen*2*y]); - } - - transpose(d, tBuf, p->log2vlen, p->log2hlen); - - for(int y=0;yhlen;y++) { - EXECUTE(p->instV, &d[p->vlen*2*y], &tBuf[p->vlen*2*y]); - } - - transpose(d, tBuf, p->log2hlen, p->log2vlen); - } - - return; - } - - if (p->log2len <= 1) { - if ((p->mode & SLEEF_MODE_REAL) == 0) { - real r0 = s[0] + s[2]; - real r1 = s[1] + s[3]; - real r2 = s[0] - s[2]; - real r3 = s[1] - s[3]; - d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; - } else { - if ((p->mode & SLEEF_MODE_ALT) == 0) { - if (p->log2len == 1) { - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - real r0 = s[0] + s[2] + (s[1] + s[3]); - real r1 = s[0] + s[2] - (s[1] + s[3]); - real r2 = s[0] - s[2]; - real r3 = s[3] - s[1]; - d[0] = r0; d[1] = 0; d[2] = r2; d[3] = r3; d[4] = r1; d[5] = 0; - } else { - real r0 = (s[0] + s[4])*(real)0.5 + s[2]; - real r1 = (s[0] - s[4])*(real)0.5 - s[3]; - real r2 = (s[0] + s[4])*(real)0.5 - s[2]; - real r3 = (s[0] - s[4])*(real)0.5 + s[3]; - d[0] = r0*2; d[1] = r1*2; d[2] = r2*2; d[3] = r3*2; - } - } else { - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - real r0 = s[0] + s[1]; - real r1 = s[0] - s[1]; - d[0] = r0; d[1] = 0; d[2] = r1; d[3] = 0; - } else { - real r0 = s[0] + s[2]; - real r1 = s[0] - s[2]; - d[0] = r0; d[1] = r1; - } - } - } else { - if 
(p->log2len == 1) { - if ((p->mode & SLEEF_MODE_BACKWARD) == 0) { - real r0 = s[0] + s[2] + (s[1] + s[3]); - real r1 = s[0] + s[2] - (s[1] + s[3]); - real r2 = s[0] - s[2]; - real r3 = s[1] - s[3]; - d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; - } else { - real r0 = (s[0] + s[1])*(real)0.5 + s[2]; - real r1 = (s[0] - s[1])*(real)0.5 + s[3]; - real r2 = (s[0] + s[1])*(real)0.5 - s[2]; - real r3 = (s[0] - s[1])*(real)0.5 - s[3]; - d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; - } - } else { - real c = ((p->mode & SLEEF_MODE_BACKWARD) != 0) ? (real)0.5 : (real)1.0; - real r0 = s[0] + s[1]; - real r1 = s[0] - s[1]; - d[0] = r0 * c; d[1] = r1 * c; - } - } - } - return; - } - - // - -#ifdef _OPENMP - const int tn = omp_get_thread_num(); - real *t[] = { p->x1[tn], p->x0[tn], d }; -#else - real *t[] = { p->x1[0], p->x0[0], d }; -#endif - - const real *lb = s; - int nb = 0; - - if ((p->mode & SLEEF_MODE_REAL) != 0 && (p->pathLen & 1) == 0 && - ((p->mode & SLEEF_MODE_BACKWARD) != 0) != ((p->mode & SLEEF_MODE_ALT) != 0)) nb = -1; - if ((p->mode & SLEEF_MODE_REAL) == 0 && (p->pathLen & 1) == 1) nb = -1; - - if ((p->mode & SLEEF_MODE_REAL) != 0 && - ((p->mode & SLEEF_MODE_BACKWARD) != 0) != ((p->mode & SLEEF_MODE_ALT) != 0)) { - (*REALSUB1[p->isa])(t[nb+1], s, p->log2len, p->rtCoef0, p->rtCoef1, (p->mode & SLEEF_MODE_ALT) == 0); - if ((p-> mode & SLEEF_MODE_ALT) == 0) t[nb+1][(1 << p->log2len)+1] = -s[(1 << p->log2len)+1] * 2; - lb = t[nb+1]; - nb = (nb + 1) & 1; - } - - for(int level = p->log2len;level >= 1;) { - int N = ABS(p->bestPath[level]), config = p->bestPathConfig[level]; - dispatch(p, N, t[nb+1], lb, level, config); - level -= N; - lb = t[nb+1]; - nb = (nb + 1) & 1; - } - - if ((p->mode & SLEEF_MODE_REAL) != 0 && - ((p->mode & SLEEF_MODE_BACKWARD) == 0) != ((p->mode & SLEEF_MODE_ALT) != 0)) { - (*REALSUB0[p->isa])(d, lb, p->log2len, p->rtCoef0, p->rtCoef1); - if ((p->mode & SLEEF_MODE_ALT) == 0) { - d[(1 << p->log2len)+1] = -d[(1 << p->log2len)+1]; - d[(2 << 
p->log2len)+0] = d[1]; - d[(2 << p->log2len)+1] = 0; - d[1] = 0; - } - } -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.cpp b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.cpp new file mode 100644 index 00000000000..b347874db0c --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dft.cpp @@ -0,0 +1,1491 @@ +// Copyright Naoki Shibata and contributors 2010 - 2025. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "compat.h" + +#include "sleef.h" +#define IMPORT_IS_EXPORT +#include "sleefdft.h" + +#include "misc.h" +#include "common.h" + +#include "dftcommon.hpp" +#include "dispatchdp.hpp" +#include "dispatchsp.hpp" + +using namespace std; + +// + +#ifndef ENABLE_STREAM +#error ENABLE_STREAM not defined +#endif + +static const int constK[] = { 0, 2, 6, 14, 38, 94, 230, 542, 1254 }; + +extern const char *configStr[]; + +static void sighandler(int signum) { LONGJMP(sigjmp, 1); } + +static int checkISAAvailability(int isa, int (*GETINT_[16])(int), int BASETYPEID_) { + static mutex mtx; + + unique_lock lock(mtx); + + signal(SIGILL, sighandler); + + if (SETJMP(sigjmp) == 0) { + int ret = GETINT_[isa] != NULL && (*GETINT_[isa])(BASETYPEID_); + signal(SIGILL, SIG_DFL); + return ret; + } + + signal(SIGILL, SIG_DFL); + return 0; +} + +static int omp_thread_count() { + int n = 0; +#pragma omp parallel reduction(+:n) + n += 1; + return n; +} + +static void startAllThreads(const int nth) { + volatile int8_t *state = (int8_t *)calloc(nth, 1); + int th=0; +#pragma omp parallel for + for(th=0;th>= r; + qq = q | (q >> 1); + qq |= (qq >> 2); + qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10); + + return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1; 
+} + +static uint32_t uperm(int nbits, uint32_t k, int s, int d) { + s = MIN(MAX(s, 0), nbits); + d = MIN(MAX(d, 0), nbits); + uint32_t r; + r = (((k & 0xaaaaaaaa) >> 1) | ((k & 0x55555555) << 1)); + r = (((r & 0xcccccccc) >> 2) | ((r & 0x33333333) << 2)); + r = (((r & 0xf0f0f0f0) >> 4) | ((r & 0x0f0f0f0f) << 4)); + r = (((r & 0xff00ff00) >> 8) | ((r & 0x00ff00ff) << 8)); + r = ((r >> 16) | (r << 16)) >> (32-nbits); + + return (((r << s) | (k & ~(-1 << s))) & ~(-1 << d)) | + ((((k >> s) | (r & (-1 << (nbits-s)))) << d) & ~(-1 << nbits)); +} + +static void showPath(ostream &os, const string &mes, const vector& path) { + os << mes; + for(auto e : path) os << e << " "; + os << endl; +} + +static void showPath(FILE *fp, const string &mes, const vector& path) { + ostringstream s; + showPath(s, mes, path); + fputs(s.str().c_str(), fp); +} + +// Dispatcher + +template +void SleefDFTXX::dispatch(const int N, real *d, const real *s, const int level, const int config) { + const int K = constK[N]; + if (level == N) { + // Last + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + void (*func)(real *, const real *, const int) = DFTF[config][isa][N]; + (*func)(d, s, log2len-N); + } else { + void (*func)(real *, const real *, const int) = DFTB[config][isa][N]; + (*func)(d, s, log2len-N); + } + } else if (level == (int)log2len) { + // First + assert(vecwidth <= (1 << N)); + const int shift = log2len-N - log2vecwidth; + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + if (minshift <= shift && shift < MAXSHIFT) { + void (*func)(real *, uint32_t *, const real *, const real *, const int) = TBUTFS[shift][config][isa][N]; + (*func)(d, perm[level], s, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTF[config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); + } + } else { + if (minshift <= shift && shift < MAXSHIFT) { + void (*func)(real *, uint32_t *, const real *, const real *, const int) = 
TBUTBS[shift][config][isa][N]; + (*func)(d, perm[level], s, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTB[config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); + } + } + } else { + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTF[config][isa][N]; + (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTB[config][isa][N]; + (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + } + } +} + +// Transposer + +#define LOG2BS 4 +#define BS (1 << LOG2BS) + +#define TRANSPOSE_BLOCK(y2) do { \ + for(int x2=y2+1;x2 +static void transpose(real *RESTRICT ALIGNED(256) d, real *RESTRICT ALIGNED(256) s, const int log2n, const int log2m) { + if (log2n < LOG2BS || log2m < LOG2BS) { + for(int y=0;y<(1 << log2n);y++) { + for(int x=0;x<(1 << log2m);x++) { + real r0 = s[((y << log2m)+x)*2+0]; + real r1 = s[((y << log2m)+x)*2+1]; + d[((x << log2n)+y)*2+0] = r0; + d[((x << log2n)+y)*2+1] = r1; + } + } + } else { +#if defined(__GNUC__) && !defined(__clang__) + typedef struct { real __attribute__((vector_size(sizeof(real)*BS*2))) r; } row_t; + typedef struct { real __attribute__((vector_size(sizeof(real)*2))) r; } element_t; +#else + typedef struct { real r[BS*2]; } row_t; + typedef struct { real r0, r1; } element_t; +#endif + for(int y=0;y<(1 << log2n);y+=BS) { + for(int x=0;x<(1 << log2m);x+=BS) { + row_t row[BS]; + for(int y2=0;y2 +static void transposeMT(real *RESTRICT ALIGNED(256) d, real *RESTRICT ALIGNED(256) s, int log2n, int log2m) { + if (log2n < LOG2BS || log2m < LOG2BS) { + for(int y=0;y<(1 << log2n);y++) { + for(int x=0;x<(1 << log2m);x++) { + real r0 = s[((y << log2m)+x)*2+0]; + real r1 = s[((y << log2m)+x)*2+1]; + d[((x << log2n)+y)*2+0] = 
r0; + d[((x << log2n)+y)*2+1] = r1; + } + } + } else { +#if defined(__GNUC__) && !defined(__clang__) + typedef struct { real __attribute__((vector_size(sizeof(real)*BS*2))) r; } row_t; + typedef struct { real __attribute__((vector_size(sizeof(real)*2))) r; } element_t; +#else + typedef struct { real r[BS*2]; } row_t; + typedef struct { real r0, r1; } element_t; +#endif + int y=0; +#pragma omp parallel for + for(y=0;y<(1 << log2n);y+=BS) { + for(int x=0;x<(1 << log2m);x+=BS) { + row_t row[BS]; + for(int y2=0;y2 +static real2 r2coefsc(int i, int log2len, int level, real2 (*SINCOSPI_)(real)) { + return (*SINCOSPI_)((i & ((-1 << (log2len - level)) & ~(-1 << log2len))) * ((real)1.0/(1 << (log2len-1)))); +} + +template +static real2 srcoefsc(int i, int log2len, int level, real2 (*SINCOSPI_)(real)) { + return (*SINCOSPI_)(((3*(i & (-1 << (log2len - level)))) & ~(-1 << log2len)) * ((real)1.0/(1 << (log2len-1)))); +} + +template +static int makeTableRecurse(real *x, int *p, const int log2len, const int levelorg, const int levelinc, const int sign, const int top, const int bot, const int N, int cnt, real2 (*SINCOSPI_)(real)) { + if (levelinc >= N-1) return cnt; + const int level = levelorg - levelinc; + if (bot - top > 4) { + const int bl = 1 << (N - levelinc); + const int w = bl/4; + for(int j=0;j<(bot-top)/bl;j++) { + for(int i=0;i(a, log2len, level, SINCOSPI_); + x[cnt++] = -sc.x; x[cnt++] = -sc.y; + sc = srcoefsc(a, log2len, level, SINCOSPI_); + x[cnt++] = -sc.x; x[cnt++] = -sc.y; + } + cnt = makeTableRecurse(x, p, log2len, levelorg, levelinc+1, sign, top+bl*j , top+bl*j + bl/2, N, cnt, SINCOSPI_); + cnt = makeTableRecurse(x, p, log2len, levelorg, levelinc+2, sign, top+bl*j + bl/2, top+bl*j + bl , N, cnt, SINCOSPI_); + } + } else if (bot - top == 4) { + int a = sign*(p[(levelinc << N) + top] & (-1 << (log2len - level))); + real2 sc; + sc = r2coefsc(a, log2len, level, SINCOSPI_); + x[cnt++] = -sc.x; x[cnt++] = -sc.y; + sc = srcoefsc(a, log2len, level, SINCOSPI_); + 
x[cnt++] = -sc.x; x[cnt++] = -sc.y; + } + + return cnt; +} + +template +static real **makeTable(int sign, int vecwidth, int log2len, const int N, const int K, real2 (*SINCOSPI_)(real)) { + if (log2len < N) return NULL; + + int *p = (int *)malloc(sizeof(int)*((N+1)<(a, log2len, level-N+1, SINCOSPI_); + tbl[level][tblOffset++] = sc.y; tbl[level][tblOffset++] = sc.x; + + tblOffset = makeTableRecurse(tbl[level], p, log2len, level, 0, sign, 0, 1 << N, N, tblOffset, SINCOSPI_); + } + + if (level == log2len) { + real *atbl = (real *)Sleef_malloc(sizeof(real)*(K << (log2len-N))*2); + tblOffset = 0; + while(tblOffset < (K << (log2len-N))) { + for(int k=0;k < K;k++) { + for(int v = 0;v < vecwidth;v++) { + assert((tblOffset + k * vecwidth + v)*2 + 1 < (K << (log2len-N))*2); + atbl[(tblOffset + k * vecwidth + v)*2 + 0] = tbl[log2len][tblOffset + v * K + k]; + atbl[(tblOffset + k * vecwidth + v)*2 + 1] = tbl[log2len][tblOffset + v * K + k]; + } + } + tblOffset += K * vecwidth; + } + Sleef_free(tbl[log2len]); + tbl[log2len] = atbl; + } + } + + free(p); + + return tbl; +} + +template +void SleefDFTXX::generatePerm(const vector &path) { + for(unsigned i=0;i +class KShortest { + vector> heap; + vector heapCost; + unordered_map reached; + + /** Remove the n-th path in the heap */ + void remove(unsigned n) { + assert(n < heap.size()); + heap.erase(heap.begin() + n); + heapCost.erase(heapCost.begin() + n); + assert(heap.size() == heapCost.size()); + } + +public: + size_t limit = 0; + + virtual ~KShortest() {} + + /** Add a path to the heap */ + size_t addPath(vector &p, double cost) { + heap.push_back(p); + heapCost.push_back(cost); + assert(heap.size() == heapCost.size()); + if (p.size()) reached[p[p.size()-1]]++; + return heap.size(); + } + + void showHeap(ostream &os) const { + os << "Heap :" << endl; + int i = 0; + for(auto a : heap) { + os << i << " : "; + for(auto e : a) os << e << " "; + os << ": " << heapCost[i] << endl; + i++; + } + os << endl; + } + + /** Return the n-th 
path in the heap */ + vector getPath(unsigned n) const { + assert(n < heap.size()); + return heap[n]; + } + + /** Return if pos is a destination */ + virtual bool isDestination(const T& pos) = 0; + + /** Return next nodes after the path */ + virtual vector next(const vector& path) = 0; + + /** Return the cost to travel the path */ + virtual double cost(const vector& path) = 0; + + /** Compute and return the next-best path */ + vector execute() { + for(;;) { +#ifdef DEBUG + showHeap(cout); +#endif + + double bestCost = INFINITY_; + unsigned bestNum = UINT_MAX; + + for(unsigned i=0;i(); + + vector best = getPath(bestNum); + + remove(bestNum); + + if (isDestination(best[best.size()-1])) return best; + + auto adj = next(best); + + for(auto a : adj) { + if (limit != 0 && reached[a] >= limit) continue; + vector p(best); + p.push_back(a); + addPath(p, cost(p)); + } + } + } +}; + +template +void SleefDFTXX::measurementRun(real *d, const real *s, const vector &path, uint64_t niter) { + const int tn = omp_get_thread_num(); + real *t[] = { x1[tn], x0[tn], d }; + + for(uint64_t i=0;i +double SleefDFTXX::measurePath(const vector &path, uint64_t minTime) { + real *s2 = NULL, *d2 = NULL; + const real *s = in == NULL ? (s2 = (real *)memset(Sleef_malloc((2 << log2len) * sizeof(real)), 0, sizeof(real) * (2 << log2len))) : in; + real *d = out == NULL ? 
(d2 = (real *)memset(Sleef_malloc((2 << log2len) * sizeof(real)), 0, sizeof(real) * (2 << log2len))) : out; + + generatePerm(path); + + uint64_t tm = UINT64_MAX, niter = 1; + + if ((path[0].config & CONFIG_MT) != 0) startAllThreads(nThread); + + for(;;) { + auto tm0 = chrono::high_resolution_clock::now(); + + measurementRun(d, s, path, niter); + + auto tm1 = chrono::high_resolution_clock::now(); + + tm = chrono::duration_cast(tm1 - tm0).count(); + + if (tm >= minTime) break; + + niter *= 2; + } + + { + auto tm0 = chrono::high_resolution_clock::now(); + + measurementRun(d, s, path, niter); + + auto tm1 = chrono::high_resolution_clock::now(); + + uint64_t tm2 = chrono::duration_cast(tm1 - tm0).count(); + if (tm2 < tm) tm = tm2; + } + + if (d2 != NULL) Sleef_free(d2); + if (s2 != NULL) Sleef_free(s2); + + return double(tm) / niter; +} + +template +double SleefDFT2DXX::measurePath(SleefDFTXX *inst, bool mt, + const vector &path, uint32_t hlen, uint32_t vlen, uint64_t minTime) { + real *s2 = NULL; + const size_t z = (2 << (log2hlen + log2vlen)) * sizeof(real); + const real *s = in == NULL ? 
(s2 = (real *)memset(Sleef_malloc(z), 0, z)) : in; + double scale = 1; + + if (mt) { + if ((int)vlen > inst->nThread * 2) { + scale = vlen / (inst->nThread * 2); + vlen = inst->nThread * 2; + } + } else { + if (vlen > 2) { + scale = vlen / 2; + vlen = 2; + } + } + + inst->generatePerm(path); + + uint64_t tm = UINT64_MAX, niter = 1; + + if (mt) startAllThreads(inst->nThread); + + for(;;) { + auto tm0 = chrono::high_resolution_clock::now(); + int y=0; + + if (mt) { +#pragma omp parallel for + for(y=0;y<(int)vlen;y++) { + inst->measurementRun(&tBuf[hlen*2*y], &s[hlen*2*y], path, niter); + } + } else { + for(y=0;y<(int)vlen;y++) { + inst->measurementRun(&tBuf[hlen*2*y], &s[hlen*2*y], path, niter); + } + } + + auto tm1 = chrono::high_resolution_clock::now(); + + tm = chrono::duration_cast(tm1 - tm0).count(); + + if (tm >= minTime) break; + + niter *= 2; + } + + { + auto tm0 = chrono::high_resolution_clock::now(); + + int y=0; + + if (mt) { +#pragma omp parallel for + for(y=0;y<(int)vlen;y++) { + inst->measurementRun(&tBuf[hlen*2*y], &s[hlen*2*y], path, niter); + } + } else { + for(y=0;y<(int)vlen;y++) { + inst->measurementRun(&tBuf[hlen*2*y], &s[hlen*2*y], path, niter); + } + } + + auto tm1 = chrono::high_resolution_clock::now(); + + uint64_t tm2 = chrono::duration_cast(tm1 - tm0).count(); + if (tm2 < tm) tm = tm2; + } + + if (s2 != NULL) Sleef_free(s2); + + return double(tm) * scale / niter; +} + +template +class QuickFinder : public KShortest { + SleefDFTXX &inst; + +public: + QuickFinder(SleefDFTXX &inst_, const vector &startPoints, size_t limit_) : + inst(inst_) { + limit = limit_; + for(auto a : startPoints) { + vector v { a }; + addPath(v, cost(v)); + } + } + + ~QuickFinder() {} + + virtual bool isDestination(const Action& pos) { + return pos.level == pos.N; + } + + virtual vector next(const vector& path) { + const int NMAX = MIN(MIN(inst.log2len, MAXBUTWIDTH+1), inst.log2len - inst.log2vecwidth + 1); + + vector v; + + Action last = path[path.size()-1]; + + int 
level = last.level - last.N; + + assert(level > 0); + + for(int config = 0;config < CONFIGMAX;config++) { + if ((config & CONFIG_MT) != (last.config & CONFIG_MT)) continue; + + for(int N=1;N& path) { + return inst.measurePath(path, 100000); + } +}; + +template +class QuickFinder2 : public KShortest { + SleefDFT2DXX &inst2d; + SleefDFTXX *inst1d; + const bool mt; + const uint32_t hlen, vlen; + +public: + QuickFinder2(SleefDFT2DXX &inst2d_, + SleefDFTXX *inst1d_, bool mt_, + const vector &startPoints, uint32_t hlen_, uint32_t vlen_, size_t limit_) : + inst2d(inst2d_), inst1d(inst1d_), mt(mt_), hlen(hlen_), vlen(vlen_) { + limit = limit_; + for(auto a : startPoints) { + vector v { a }; + addPath(v, cost(v)); + } + } + + ~QuickFinder2() {} + + virtual bool isDestination(const Action& pos) { + return pos.level == pos.N; + } + + virtual vector next(const vector& path) { + const int NMAX = MIN(MIN(inst1d->log2len, MAXBUTWIDTH+1), inst1d->log2len - inst1d->log2vecwidth + 1); + + vector v; + + Action last = path[path.size()-1]; + + int level = last.level - last.N; + + assert(level > 0); + + for(int config = 0;config < CONFIGMAX;config++) { + if ((config & CONFIG_MT) != (last.config & CONFIG_MT)) continue; + + for(int N=1;Nexecutable[config][level][N]) continue; + Action a(config, level, N); + v.push_back(a); + } + } + + return v; + } + + virtual double cost(const vector& path) { + return inst2d.measurePath(inst1d, mt, path, hlen, vlen, 100000); + } +}; + +template +class PathEstimator : public KShortest { + SleefDFTXX &inst; + +public: + PathEstimator(SleefDFTXX &inst_, const vector &startPoints) : + inst(inst_) { + limit = 1; + for(auto a : startPoints) { + vector v { a }; + addPath(v, cost(v)); + } + } + + ~PathEstimator() {} + + virtual bool isDestination(const Action& pos) { + return pos.level == pos.N; + } + + virtual vector next(const vector& path) { + const int NMAX = MIN(MIN(inst.log2len, MAXBUTWIDTH+1), inst.log2len - inst.log2vecwidth + 1); + + vector v; + + 
Action last = path[path.size()-1]; + + if (last.level == 0) return v; + + int level = last.level - last.N; + + assert(level > 0); + + for(int config = 0;config < CONFIGMAX;config++) { + if ((config & CONFIG_MT) != (last.config & CONFIG_MT)) continue; + + for(int N=1;N= 14 && (config & CONFIG_MT) != 0) ret /= 2; + return ret; + } + + virtual double cost(const vector& path) { + uint64_t t = 0; + for(auto a : path) { + if (!inst.executable[a.config][a.level][a.N]) return INFINITY_; + t += estimate(inst.log2len, a.config, a.level, a.N); + } + return t; + } +}; + +template +void SleefDFTXX::searchForBestPath(int nPaths) { + const int NMAX = MIN(MIN(log2len, MAXBUTWIDTH+1), log2len - log2vecwidth + 1); + + vector sp; + + for(int config = 0;config < CONFIGMAX;config++) { + for(int N=1;N>(*this, sp); + bestPath = pf->execute(); + return; + } + + auto pf = make_shared>(*this, sp, 1); + + double bestTime = INFINITY_; + + for(int i=0;iexecute(); + + if (p.size() == 0) break; + + double tm = measurePath(p, 1000000); + + if (tm < bestTime) { + bestPath = p; + bestTime = tm; + } + } +} + +template +void SleefDFTXX::searchForRandomPath() { + const int NMAX = MIN(MIN(log2len, MAXBUTWIDTH+1), log2len - log2vecwidth + 1); + + vector path; + + int level = log2len; + while(level > 0) { + int config = 0; + int N = rand() % MIN(level, NMAX-1) + 1; + if (!executable[config][level][N]) continue; + + path.push_back(Action(config, level, N)); + level -= N; + } + + bestPath = path; +} + +template +pair, double> SleefDFT2DXX::searchForBestPath(SleefDFTXX *inst, + bool mt, uint32_t hlen, uint32_t vlen, int nPaths) { + assert(nPaths != 0); + + const int NMAX = MIN(MIN(inst->log2len, MAXBUTWIDTH+1), inst->log2len - inst->log2vecwidth + 1); + + vector sp; + + for(int config = 0;config < CONFIGMAX;config++) { + for(int N=1;Nexecutable[config][inst->log2len][N]) continue; + sp.push_back(Action(config, inst->log2len, N)); + } + } + + auto qf2 = QuickFinder2(*this, inst, mt, sp, hlen, vlen, 1); + + 
vector bestPath; + double bestTime = INFINITY_; + + for(int i=0;i, double>(bestPath, bestTime); +} + +// + +template +bool SleefDFTXX::measure(bool randomize) { + if (log2len == 1) { + bestPath.clear(); + bestPath.push_back(Action(0, 1, 1)); + + return true; + } + + for(int config=0;config= 1;level--) { + for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { + executable[config][level][N] = false; + } + } + } + + for(int config=0;config= 1;level--) { + for(uint32_t N=1;N<=MAXBUTWIDTH;N++) { + if (level < N || log2len <= N) continue; + if (level == N) { + executable[config][level][N] = true; + } else if (level == log2len) { + if (tbl[N] == NULL || tbl[N][level] == NULL) continue; + if (vecwidth > (1 << N)) continue; + executable[config][level][N] = true; + } else { + if (tbl[N] == NULL || tbl[N][level] == NULL) continue; + if (vecwidth > 2 && log2len <= N+2) continue; + if ((int)log2len - (int)level < log2vecwidth) continue; + executable[config][level][N] = true; + } + } + } + } + + // + + { + bool executable_ = false; + for(int i=1;i<=MAXBUTWIDTH && !executable_;i++) { + if (executable[0][log2len][i]) executable_ = true; + } + + if (!executable_) return false; + } + + if (!randomize) { + searchForBestPath((mode & SLEEF_MODE_MEASURE) != 0 ? 
32 : 0); + if ((mode & SLEEF_MODE_VERBOSE) != 0) { + if ((mode & SLEEF_MODE_MEASURE) != 0) { + showPath(verboseFP, "Measure : ", bestPath); + } else if ((mode & SLEEF_MODE_INTERNAL_2D) == 0) { + showPath(verboseFP, "Estimate : ", bestPath); + } + } + + if ((mode & SLEEF_MODE_MEASURE) != 0) saveMeasurementResults(); + } else { + searchForRandomPath(); + if ((mode & SLEEF_MODE_VERBOSE) != 0) { + showPath(verboseFP, "Random path : ", bestPath); + } + } + + return true; +} + +template +pair SleefDFT2DXX::measureTranspose() { + uint64_t tmMT, tmNoMT; + + real *tBuf2 = (real *)Sleef_malloc(sizeof(real)*2*hlen*vlen); + + const int niter = 1 + 5000000 / (hlen * vlen + 1); + + auto tm0 = chrono::high_resolution_clock::now(); + for(int i=0;i(tBuf2, tBuf, log2hlen, log2vlen); + transpose(tBuf2, tBuf, log2vlen, log2hlen); + } + auto tm1 = chrono::high_resolution_clock::now(); + tmNoMT = chrono::duration_cast(tm1 - tm0).count(); + + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "transpose NoMT(measured): %lld\n", (long long int)tmNoMT); + + tm0 = chrono::high_resolution_clock::now(); + for(int i=0;i(tBuf2, tBuf, log2hlen, log2vlen); + transposeMT(tBuf2, tBuf, log2vlen, log2hlen); + } + tm1 = chrono::high_resolution_clock::now(); + tmMT = chrono::duration_cast(tm1 - tm0).count(); + + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "transpose MT(measured): %lld\n", (long long int)tmMT); + + Sleef_free(tBuf2); + + tmMT /= niter; + tmNoMT /= niter; + + return pair(tmMT, tmNoMT); +} + +// Implementation of SleefDFT_*_init1d + +template +SleefDFTXX::SleefDFTXX(uint32_t n, const real *in_, real *out_, uint64_t mode_, const char *baseTypeString, + int BASETYPEID_, int MAGIC_, int minshift_, + int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), + void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void 
(*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), + void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), + void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + ) : + magic(MAGIC_), baseTypeID(BASETYPEID_), in(in_), out(out_), nThread(omp_thread_count()), + log2len((mode_ & SLEEF_MODE_REAL) ? ilog2(n)-1 : ilog2(n)), + mode(((mode_ & SLEEF_MODE_ALT) && log2len > 1) ? 
mode_ ^ SLEEF_MODE_BACKWARD : mode_), + minshift(minshift_), + DFTF(DFTF_), DFTB(DFTB_), TBUTF(TBUTF_), TBUTB(TBUTB_), BUTF(BUTF_), BUTB(BUTB_), REALSUB0(REALSUB0_), REALSUB1(REALSUB1_), + TBUTFS(TBUTFS_), TBUTBS(TBUTBS_) { + + verboseFP = defaultVerboseFP; + + // Mode + + if ((mode & SLEEF_MODE_REAL) != 0) n /= 2; + + if ((mode & SLEEF_MODE_NO_MT) == 0) mode2 |= SLEEF_MODE2_MT1D; + + if (log2len <= 1) return; + + // ISA availability + + int bestPriority = -1; + isa = -1; + + for(int i=0;i= (uint32_t)((*GETINT_[i])(GETINT_VECWIDTH) * (*GETINT_[i])(GETINT_VECWIDTH))) { + bestPriority = (*GETINT_[i])(GETINT_DFTPRIORITY); + isa = i; + } + } + + if (isa == -1) { + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "ISA not available\n"); + magic = 0; + return; + } + + // Generate tables + + perm = (uint32_t **)calloc(sizeof(uint32_t *), log2len+1); + for(int level = log2len;level >= 1;level--) { + perm[level] = (uint32_t *)Sleef_malloc(sizeof(uint32_t) * ((1 << log2len) + 8)); + } + + x0 = (real **)malloc(sizeof(real *) * nThread); + x1 = (real **)malloc(sizeof(real *) * nThread); + + for(int i=0;i(sign, vecwidth, log2len, i, constK[i], SINCOSPI_); + } + + if (loadMeasurementResults()) { + if ((mode & SLEEF_MODE_VERBOSE) != 0) { + showPath(verboseFP, "Loaded : ", bestPath); + } + } else if (!measure(mode & SLEEF_MODE_DEBUG)) { + // Fall back to the first ISA + freeTables(); + isa = 0; + + vecwidth = (*GETINT_[isa])(GETINT_VECWIDTH); + log2vecwidth = ilog2(vecwidth); + + for(int i=1;i<=MAXBUTWIDTH;i++) { + tbl[i] = makeTable(sign, vecwidth, log2len, i, constK[i], SINCOSPI_); + } + + generatePerm(bestPath); + + if (!measure(mode & SLEEF_MODE_DEBUG)) { + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "Suitable ISA not found. 
This should not happen.\n"); + abort(); + } + } + + generatePerm(bestPath); + + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "ISA : %s %d bit %s\n", (char *)(*GETPTR_[isa])(0), (int)(GETINT_[isa](GETINT_VECWIDTH) * sizeof(real) * 16), baseTypeString); +} + +// Implementation of SleefDFT_*_init2d + +template +SleefDFT2DXX::SleefDFT2DXX(uint32_t vlen_, uint32_t hlen_, const real *in_, real *out_, uint64_t mode_, const char *baseTypeString, + int BASETYPEID_, int MAGIC_, int MAGIC2D_, int minshift_, + int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), + void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), + void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), + void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + ) { + magic = MAGIC2D_; + baseTypeID = BASETYPEID_; + in = in_; + out = out_; + hlen = hlen_; + log2hlen = ilog2(hlen_); + vlen = vlen_; + log2vlen = ilog2(vlen_); + mode = mode_ | SLEEF_MODE_INTERNAL_2D; + + mode2 = 0; + mode3 = 0; + + planMT = false; + + verboseFP = stdout; + + 
uint64_t mode1D = (mode & ~SLEEF_MODE_MEASUREBITS) | SLEEF_MODE_ESTIMATE | SLEEF_MODE_NO_MT; + + if ((mode & SLEEF_MODE_NO_MT) == 0) mode3 |= SLEEF_MODE3_MT2D; + + instH = instV = new SleefDFTXX(hlen, NULL, NULL, mode1D, baseTypeString, + BASETYPEID_, MAGIC_, minshift_, + GETINT_, GETPTR_, SINCOSPI_, + DFTF_, DFTB_, TBUTF_, TBUTB_, BUTF_, BUTB_, + REALSUB0_, REALSUB1_, TBUTFS_, TBUTBS_); + if (hlen != vlen) instV = new SleefDFTXX(vlen, NULL, NULL, mode1D, baseTypeString, + BASETYPEID_, MAGIC_, minshift_, + GETINT_, GETPTR_, SINCOSPI_, + DFTF_, DFTB_, TBUTF_, TBUTB_, BUTF_, BUTB_, + REALSUB0_, REALSUB1_, TBUTFS_, TBUTBS_); + + tBuf = (real *)Sleef_malloc(sizeof(real)*2*hlen*vlen); + + if (!loadMeasurementResults()) { + if ((mode & SLEEF_MODE_MEASURE) != 0) { + uint64_t tmMT, tmNoMT; + auto a = measureTranspose(); + tmMT = a.first; + tmNoMT = a.second; + planMT = tmMT < tmNoMT; + + pair, double> noMT_H, MT_H, noMT_V, MT_V; + + const bool mt = (mode & SLEEF_MODE_NO_MT) == 0; + + if (instH == instV) { + noMT_H = searchForBestPath(instH, false, hlen, vlen, 8); + tmNoMT += noMT_H.second * 2; + + if (mt) { + MT_H = searchForBestPath(instH, true, hlen, vlen, 8); + tmMT += MT_H.second * 2; + } + } else { + noMT_H = searchForBestPath(instH, false, hlen, vlen, 8); + noMT_V = searchForBestPath(instV, false, vlen, hlen, 8); + tmNoMT += noMT_H.second + noMT_V.second; + + if (mt) { + MT_H = searchForBestPath(instH, true, hlen, vlen, 8); + MT_V = searchForBestPath(instV, true, vlen, hlen, 8); + tmMT += MT_H.second + MT_V.second; + } + } + + if (!mt) tmMT = ULLONG_MAX; + + if (tmMT < tmNoMT) { + planMT = true; + instH->bestPath = MT_H.first; + if (instH != instV) instV->bestPath = MT_V.first; + } else { + planMT = false; + instH->bestPath = noMT_H.first; + if (instH != instV) instV->bestPath = noMT_V.first; + } + + saveMeasurementResults(); + } else { + planMT = log2hlen + log2vlen >= 14; + // When the paths are to be estimated, the paths set in the constructors are used + } + } + 
+ instH->generatePerm(instH->bestPath); + if (instH != instV) instV->generatePerm(instV->bestPath); +} + +// Implementation of SleefDFT_*_execute + +template +void SleefDFTXX::execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_) { + assert(magic == MAGIC_); + + const real *s = s0 == NULL ? in : s0; + real *d = d0 == NULL ? out : d0; + + if (log2len <= 1) { + if ((mode & SLEEF_MODE_REAL) == 0) { + real r0 = s[0] + s[2]; + real r1 = s[1] + s[3]; + real r2 = s[0] - s[2]; + real r3 = s[1] - s[3]; + d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; + } else { + if ((mode & SLEEF_MODE_ALT) == 0) { + if (log2len == 1) { + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + real r0 = s[0] + s[2] + (s[1] + s[3]); + real r1 = s[0] + s[2] - (s[1] + s[3]); + real r2 = s[0] - s[2]; + real r3 = s[3] - s[1]; + d[0] = r0; d[1] = 0; d[2] = r2; d[3] = r3; d[4] = r1; d[5] = 0; + } else { + real r0 = (s[0] + s[4])*(real)0.5 + s[2]; + real r1 = (s[0] - s[4])*(real)0.5 - s[3]; + real r2 = (s[0] + s[4])*(real)0.5 - s[2]; + real r3 = (s[0] - s[4])*(real)0.5 + s[3]; + d[0] = r0*2; d[1] = r1*2; d[2] = r2*2; d[3] = r3*2; + } + } else { + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + real r0 = s[0] + s[1]; + real r1 = s[0] - s[1]; + d[0] = r0; d[1] = 0; d[2] = r1; d[3] = 0; + } else { + real r0 = s[0] + s[2]; + real r1 = s[0] - s[2]; + d[0] = r0; d[1] = r1; + } + } + } else { + if (log2len == 1) { + if ((mode & SLEEF_MODE_BACKWARD) == 0) { + real r0 = s[0] + s[2] + (s[1] + s[3]); + real r1 = s[0] + s[2] - (s[1] + s[3]); + real r2 = s[0] - s[2]; + real r3 = s[1] - s[3]; + d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; + } else { + real r0 = (s[0] + s[1])*(real)0.5 + s[2]; + real r1 = (s[0] - s[1])*(real)0.5 + s[3]; + real r2 = (s[0] + s[1])*(real)0.5 - s[2]; + real r3 = (s[0] - s[1])*(real)0.5 - s[3]; + d[0] = r0; d[1] = r1; d[2] = r2; d[3] = r3; + } + } else { + real c = ((mode & SLEEF_MODE_BACKWARD) != 0) ? 
(real)0.5 : (real)1.0; + real r0 = s[0] + s[1]; + real r1 = s[0] - s[1]; + d[0] = r0 * c; d[1] = r1 * c; + } + } + } + return; + } + + // + + const int tn = omp_get_thread_num(); + real *t[] = { x1[tn], x0[tn], d }; + + const real *lb = s; + int nb = 0; + + if ((mode & SLEEF_MODE_REAL) != 0 && (bestPath.size() & 1) == 0 && + ((mode & SLEEF_MODE_BACKWARD) != 0) != ((mode & SLEEF_MODE_ALT) != 0)) nb = -1; + if ((mode & SLEEF_MODE_REAL) == 0 && (bestPath.size() & 1) == 1) nb = -1; + + if ((mode & SLEEF_MODE_REAL) != 0 && + ((mode & SLEEF_MODE_BACKWARD) != 0) != ((mode & SLEEF_MODE_ALT) != 0)) { + (*REALSUB1[isa])(t[nb+1], s, log2len, rtCoef0, rtCoef1, (mode & SLEEF_MODE_ALT) == 0); + if (( mode & SLEEF_MODE_ALT) == 0) t[nb+1][(1 << log2len)+1] = -s[(1 << log2len)+1] * 2; + lb = t[nb+1]; + nb = (nb + 1) & 1; + } + + int level = log2len; + for(unsigned j=0;j +void SleefDFT2DXX::execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_) { + assert(magic == MAGIC2D_); + + const real *s = s0 == NULL ? in : s0; + real *d = d0 == NULL ? 
out : d0; + + // S -> T -> D -> T -> D + + if ((mode3 & SLEEF_MODE3_MT2D) != 0 && + (((mode & SLEEF_MODE_DEBUG) == 0 && planMT) || + ((mode & SLEEF_MODE_DEBUG) != 0 && (rand() & 1)))) { + int y=0; +#pragma omp parallel for + for(y=0;yexecute(&s[hlen*2*y], &tBuf[hlen*2*y], MAGIC_, MAGIC2D_); + } + + transposeMT(d, tBuf, log2vlen, log2hlen); + +#pragma omp parallel for + for(y=0;yexecute(&d[vlen*2*y], &tBuf[vlen*2*y], MAGIC_, MAGIC2D_); + } + + transposeMT(d, tBuf, log2hlen, log2vlen); + } else { + for(int y=0;yexecute(&s[hlen*2*y], &tBuf[hlen*2*y], MAGIC_, MAGIC2D_); + } + + transpose(d, tBuf, log2vlen, log2hlen); + + for(int y=0;yexecute(&d[vlen*2*y], &tBuf[vlen*2*y], MAGIC_, MAGIC2D_); + } + + transpose(d, tBuf, log2hlen, log2vlen); + } +} + +// + +EXPORT SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode) { + SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); + p->double_ = new SleefDFTXX(n, in, out, mode, "double", + 1, MAGIC_DOUBLE, MINSHIFTDP, getInt_double, getPtr_double, Sleef_sincospi_u05, + dftf_double, dftb_double, tbutf_double, tbutb_double, butf_double, butb_double, + realSub0_double, realSub1_double, tbutfs_double, tbutbs_double + ); + p->magic = p->double_->magic; + return p; +} + +EXPORT SleefDFT *SleefDFT_double_init2d(uint32_t vlen, uint32_t hlen, const double *in, double *out, uint64_t mode) { + SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); + p->double2d_ = new SleefDFT2DXX(vlen, hlen, in, out, mode, "double", + 1, MAGIC_DOUBLE, MAGIC2D_DOUBLE, MINSHIFTDP, getInt_double, getPtr_double, Sleef_sincospi_u05, + dftf_double, dftb_double, tbutf_double, tbutb_double, butf_double, butb_double, + realSub0_double, realSub1_double, tbutfs_double, tbutbs_double + ); + p->magic = p->double2d_->magic; + return p; +} + +EXPORT void SleefDFT_double_execute(SleefDFT *p, const double *s0, double *d0) { + switch(p->magic) { + case MAGIC_DOUBLE: + p->double_->execute(s0, d0, MAGIC_DOUBLE, MAGIC2D_DOUBLE); + break; + 
case MAGIC2D_DOUBLE: + p->double2d_->execute(s0, d0, MAGIC_DOUBLE, MAGIC2D_DOUBLE); + break; + default: + abort(); + } +} + +EXPORT SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode) { + SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); + p->float_ = new SleefDFTXX(n, in, out, mode, "float", + 2, MAGIC_FLOAT, MINSHIFTSP, getInt_float, getPtr_float, Sleef_sincospif_u05, + dftf_float, dftb_float, tbutf_float, tbutb_float, butf_float, butb_float, + realSub0_float, realSub1_float, tbutfs_float, tbutbs_float + ); + p->magic = p->float_->magic; + return p; +} + +EXPORT SleefDFT *SleefDFT_float_init2d(uint32_t vlen, uint32_t hlen, const float *in, float *out, uint64_t mode) { + SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); + p->float2d_ = new SleefDFT2DXX(vlen, hlen, in, out, mode, "float", + 2, MAGIC_FLOAT, MAGIC2D_FLOAT, MINSHIFTSP, getInt_float, getPtr_float, Sleef_sincospif_u05, + dftf_float, dftb_float, tbutf_float, tbutb_float, butf_float, butb_float, + realSub0_float, realSub1_float, tbutfs_float, tbutbs_float + ); + p->magic = p->float2d_->magic; + return p; +} + +EXPORT void SleefDFT_float_execute(SleefDFT *p, const float *s0, float *d0) { + switch(p->magic) { + case MAGIC_FLOAT: + p->float_->execute(s0, d0, MAGIC_FLOAT, MAGIC2D_FLOAT); + break; + case MAGIC2D_FLOAT: + p->float2d_->execute(s0, d0, MAGIC_FLOAT, MAGIC2D_FLOAT); + break; + default: + abort(); + } +} + +EXPORT void SleefDFT_execute(SleefDFT *p, const void *s0, void *d0) { + switch(p->magic) { + case MAGIC_DOUBLE: + p->double_->execute((const double *)s0, (double *)d0, MAGIC_DOUBLE, MAGIC2D_DOUBLE); + break; + case MAGIC2D_DOUBLE: + p->double2d_->execute((const double *)s0, (double *)d0, MAGIC_DOUBLE, MAGIC2D_DOUBLE); + break; + case MAGIC_FLOAT: + p->float_->execute((const float *)s0, (float *)d0, MAGIC_FLOAT, MAGIC2D_FLOAT); + break; + case MAGIC2D_FLOAT: + p->float2d_->execute((const float *)s0, (float *)d0, MAGIC_FLOAT, MAGIC2D_FLOAT); + break; + 
default: + abort(); + } +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.c deleted file mode 100644 index 184af8f2027..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.c +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef _OPENMP -#include -#endif - -#include "misc.h" -#include "sleef.h" - -#define IMPORT_IS_EXPORT -#include "sleefdft.h" -#include "dispatchparam.h" -#include "dftcommon.h" -#include "common.h" -#include "arraymap.h" - -#define MAGIC_FLOAT 0x31415926 -#define MAGIC_DOUBLE 0x27182818 - -#define MAGIC2D_FLOAT 0x22360679 -#define MAGIC2D_DOUBLE 0x17320508 - -const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" }; - -static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) { - int pathLen = 0, l2l = 0; - - for(;;) { - while(*p == ' ') p++; - if (*p == '\0') break; - if (!isdigit((int)*p)) return -1; - - pathLen++; - if (pathLen >= pathLenMax) return -2; - - int n = 0; - while(isdigit((int)*p)) n = n * 10 + *p++ - '0'; - - if (n > MAXBUTWIDTH) return -6; - path[pathLen-1] = n; - l2l += n; - config[pathLen-1] = 0; - - if (*p != '(') continue; - - int c; - for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break; - if (c == -1) return -3; - p += strlen(configStr[c]) + 1; - if (*p != ')') return -4; - p++; - - config[pathLen-1] = c; - } - - if (l2l != log2len) return -5; - - return pathLen; -} - -EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) { - assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE)); - - int path[32], 
config[32]; - int pathLen = parsePathStr(pathStr, path, config, 31, p->log2len); - - if (pathLen < 0) { - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Error %d in parsing path string : %s\n", pathLen, pathStr); - return; - } - - for(uint32_t j = 0;j <= p->log2len;j++) p->bestPath[j] = 0; - - for(int level = p->log2len, j=0;level > 0 && j < pathLen;) { - p->bestPath[level] = path[j]; - p->bestPathConfig[level] = config[j]; - level -= path[j]; - j++; - } - - p->pathLen = 0; - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++; - - if ((p->mode & SLEEF_MODE_VERBOSE) != 0) { - printf("Set path : "); - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]); - printf("\n"); - } -} - -void freeTables(SleefDFT *p) { - for(int N=1;N<=MAXBUTWIDTH;N++) { - for(uint32_t level=N;level<=p->log2len;level++) { - Sleef_free(p->tbl[N][level]); - } - free(p->tbl[N]); - p->tbl[N] = NULL; - } -} - -EXPORT void SleefDFT_dispose(SleefDFT *p) { - if (p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE)) { - Sleef_free(p->tBuf); - SleefDFT_dispose(p->instH); - if (p->hlen != p->vlen) SleefDFT_dispose(p->instV); - - p->magic = 0; - free(p); - return; - } - - assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE)); - - if (p->log2len <= 1) { - p->magic = 0; - free(p); - return; - } - - if ((p->mode & SLEEF_MODE_REAL) != 0) { - Sleef_free(p->rtCoef1); - Sleef_free(p->rtCoef0); - p->rtCoef0 = p->rtCoef1 = NULL; - } - - for(int level = p->log2len;level >= 1;level--) { - Sleef_free(p->perm[level]); - } - free(p->perm); - p->perm = NULL; - - freeTables(p); - - p->magic = 0; - free(p); -} - -uint32_t ilog2(uint32_t q) { - static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4}; - uint32_t r = 0,qq; - - if (q & 0xffff0000) r = 16; - - q >>= r; - qq = q | (q >> 1); - qq |= (qq >> 2); - qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10); - - 
return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1; -} - -// - -char *dftPlanFilePath = NULL; -char *archID = NULL; -uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR; -ArrayMap *planMap = NULL; -int planFilePathSet = 0, planFileLoaded = 0; -#ifdef _OPENMP -omp_lock_t planMapLock; -int planMapLockInitialized = 0; -#endif - -static void initPlanMapLock() { -#ifdef _OPENMP -#pragma omp critical - { - if (!planMapLockInitialized) { - planMapLockInitialized = 1; - omp_init_lock(&planMapLock); - } - } -#endif -} - -static void planMap_clear() { - if (planMap != NULL) ArrayMap_dispose(planMap); - planMap = NULL; -} - -EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) { - initPlanMapLock(); - - if ((mode & SLEEF_PLAN_RESET) != 0) { - planMap_clear(); - planFileLoaded = 0; - planFilePathSet = 0; - } - - if (dftPlanFilePath != NULL) free(dftPlanFilePath); - if (path != NULL) { - dftPlanFilePath = malloc(strlen(path)+10); - strcpy(dftPlanFilePath, path); - } else { - dftPlanFilePath = NULL; - } - - if (archID != NULL) free(archID); - if (arch == NULL) arch = Sleef_getCpuIdString(); - archID = malloc(strlen(arch)+10); - strcpy(archID, arch); - - planMode = mode; - planFilePathSet = 1; -} - -static void loadPlanFromFile() { - if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) { - char *s = getenv(ENVVAR); - if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode); - } - - if (planMap != NULL) ArrayMap_dispose(planMap); - - if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) { - planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0); - } - - if (planMap == NULL) planMap = initArrayMap(); - - planFileLoaded = 1; -} - -static void savePlanToFile() { - assert(planFileLoaded); - if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) { - ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID); - } -} - -#define CATBIT 8 -#define BASETYPEIDBIT 2 
-#define LOG2LENBIT 8 -#define DIRBIT 1 - -#define BUTSTATBIT 16 - -static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) { - dir = (dir & SLEEF_MODE_BACKWARD) == 0; - int cat = 0; - uint64_t k = 0; - k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT)); - k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT)); - k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT)); - return k; -} - -#define LEVELBIT LOG2LENBIT -#define BUTCONFIGBIT 8 -#define TRANSCONFIGBIT 8 - -static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) { - int max = MAX(hlen, vlen), min = MIN(hlen, vlen); - int cat = 2; - uint64_t k = 0; - k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT)); - k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT)); - k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT)); - return k; -} - -static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) { - dir = (dir & SLEEF_MODE_BACKWARD) == 0; - int cat = 3; - uint64_t k = 0; - k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT)); - k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT)); - k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT)); - k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT)); - return k; -} - -static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) { - dir = (dir & SLEEF_MODE_BACKWARD) == 0; - int cat = 4; - uint64_t k = 0; - k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << 
BUTCONFIGBIT)); - k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT)); - k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT)); - k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT)); - k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT)); - return k; -} - -static uint64_t planMap_getU64(uint64_t key) { - char *s = ArrayMap_get(planMap, key); - if (s == NULL) return 0; - uint64_t ret; - if (sscanf(s, "%" SCNx64, &ret) != 1) return 0; - return ret; -} - -static void planMap_putU64(uint64_t key, uint64_t value) { - char *s = malloc(100); - sprintf(s, "%" PRIx64, value); - s = ArrayMap_put(planMap, key, s); - if (s != NULL) free(s); -} - -int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) { - assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE)); - - initPlanMapLock(); - -#ifdef _OPENMP - omp_set_lock(&planMapLock); -#endif - if (!planFileLoaded) loadPlanFromFile(); - - int stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10)); - if (stat == 0) { -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif - return 0; - } - - int ret = 1; - - for(int j = p->log2len;j >= 0;j--) { - p->bestPath[j] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat)); - p->bestPathConfig[j] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat)); - if (p->bestPath[j] > MAXBUTWIDTH) ret = 0; - } - - p->pathLen = 0; - for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++; - -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif - return ret; -} - -void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) { - assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE)); - - initPlanMapLock(); - -#ifdef _OPENMP - omp_set_lock(&planMapLock); -#endif - if (!planFileLoaded) loadPlanFromFile(); - - if (planMap_getU64(keyButStat(p->baseTypeID, 
p->log2len, p->mode, pathCat+10)) != 0) { -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif - return; - } - - for(int j = p->log2len;j >= 0;j--) { - planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPath[j]); - planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPathConfig[j]); - } - - planMap_putU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10), 1); - - if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile(); - -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif -} - -int PlanManager_loadMeasurementResultsT(SleefDFT *p) { - assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE)); - - initPlanMapLock(); - -#ifdef _OPENMP - omp_set_lock(&planMapLock); -#endif - if (!planFileLoaded) loadPlanFromFile(); - - p->tmNoMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0)); - p->tmMT = planMap_getU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1)); - -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif - return p->tmNoMT != 0; -} - -void PlanManager_saveMeasurementResultsT(SleefDFT *p) { - assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE)); - - initPlanMapLock(); - -#ifdef _OPENMP - omp_set_lock(&planMapLock); -#endif - if (!planFileLoaded) loadPlanFromFile(); - - planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0), p->tmNoMT); - planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1), p->tmMT ); - - if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile(); - -#ifdef _OPENMP - omp_unset_lock(&planMapLock); -#endif -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.cpp b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.cpp new file mode 100644 index 00000000000..54d931b1c70 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.cpp @@ -0,0 +1,517 @@ +// Copyright Naoki Shibata and 
contributors 2010 - 2025. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "compat.h" +#include "misc.h" +#include "sleef.h" + +#define IMPORT_IS_EXPORT +#include "sleefdft.h" +#include "dftcommon.hpp" +#include "common.h" +#include "serializer.hpp" + +const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" }; + +template +vector SleefDFTXX::parsePathStr(const char *p) { + vector v; + + int level = log2len; + for(;;) { + while(isspace((int)*p)) p++; + if (*p == '\0') break; + if (!isdigit((int)*p)) throw(runtime_error("Unexpected character")); + + int N = 0; + while(isdigit((int)*p)) N = N * 10 + *p++ - '0'; + + if (N > MAXBUTWIDTHALL) throw(runtime_error("N too large")); + if (N > level) throw(runtime_error("N larger than level")); + + int config = 0; + if (*p == '(') { + p++; + + for(config=3;config>=0;config--) { + if (strncmp(p, configStr[config], strlen(configStr[config])) == 0) break; + } + if (config == -1) throw(runtime_error("Unknown config")); + p += strlen(configStr[config]); + if (*p++ != ')') throw(runtime_error("No ')' after config")); + } + + v.push_back(Action(config, level, N)); + level -= N; + } + + if (level != 0) throw(runtime_error("Sum of N less than level")); + + return v; +} + +static string to_string(vector v) { + string s = ""; + for(auto e : v) { + string c = "? 
" + to_string(e.config); + if (0 <= e.config && e.config < 4) c = configStr[e.config]; + s += to_string(e.N) + "(" + c + ") "; + } + return s; +} + +template +void SleefDFTXX::setPath(const char *pathStr) { + assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE); + + try { + bestPath = parsePathStr(pathStr); + + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "Set path : %s\n", to_string(bestPath).c_str()); + } catch(exception &ex) { + if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "Parse error : %s\n", ex.what()); + } +} + +template +void SleefDFT2DXX::setPath(const char *pathStr) { + assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE); + int planMT_ = 0; + if (sscanf(pathStr, "%d", &planMT_) != 1) return; + planMT = planMT_; + + string pathH = pathStr; + size_t cpos = pathH.find_first_of(':'); + if (cpos == string::npos) return; + pathH = pathH.substr(cpos + 1); + + cpos = pathH.find_first_of(','); + if (cpos == string::npos) return; + string pathV = pathH.substr(cpos+1); + pathH = pathH.substr(0, cpos); + + instH->setPath(pathH.c_str()); + instV->setPath(pathV.c_str()); +} + +template +string SleefDFTXX::getPath() { + assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE); + return to_string(bestPath); +} + +template +string SleefDFT2DXX::getPath() { + assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE); + return to_string((int)planMT) + ":" + + instH->getPath() + "," + instV->getPath(); +} + +EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) { + assert(p != NULL); + switch(p->magic) { + case MAGIC_DOUBLE: + p->double_->setPath(pathStr); + break; + case MAGIC_FLOAT: + p->float_->setPath(pathStr); + break; + case MAGIC2D_DOUBLE: + p->double2d_->setPath(pathStr); + break; + case MAGIC2D_FLOAT: + p->float2d_->setPath(pathStr); + break; + default: abort(); + } +} + +EXPORT int SleefDFT_getPath(SleefDFT *p, char *pathStr, int pathStrSize) { + assert(p != NULL); + + string str; + switch(p->magic) { + case MAGIC_DOUBLE: + str = 
p->double_->getPath(); + break; + case MAGIC_FLOAT: + str = p->float_->getPath(); + break; + case MAGIC2D_DOUBLE: + str = p->double2d_->getPath(); + break; + case MAGIC2D_FLOAT: + str = p->float2d_->getPath(); + break; + default: abort(); + } + + strncpy(pathStr, str.c_str(), pathStrSize); + + return pathStrSize == 0 ? 0 : strlen(pathStr); +} + +template +void SleefDFTXX::freeTables() { + for(int N=1;N<=MAXBUTWIDTH;N++) { + for(uint32_t level=N;level<=log2len;level++) { + Sleef_free(tbl[N][level]); + tbl[N][level] = nullptr; + } + free(tbl[N]); + tbl[N] = NULL; + } + + for(int i=0;i +SleefDFTXX::~SleefDFTXX() { + assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE); + + if (log2len <= 1) { + magic = 0; + return; + } + + if ((mode & SLEEF_MODE_REAL) != 0) { + Sleef_free(rtCoef1); + rtCoef1 = nullptr; + Sleef_free(rtCoef0); + rtCoef0 = nullptr; + } + + for(int level = log2len;level >= 1;level--) { + Sleef_free(perm[level]); + perm[level] = nullptr; + } + free(perm); + perm = NULL; + + freeTables(); + + magic = 0; +} + +template +SleefDFT2DXX::~SleefDFT2DXX() { + assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE); + + Sleef_free(tBuf); + tBuf = nullptr; + delete instH; + instH = nullptr; + if (hlen != vlen) { + delete instV; + instV = nullptr; + } + + magic = 0; +} + +EXPORT void SleefDFT_dispose(SleefDFT *p) { + assert(p != NULL); + switch(p->magic) { + case MAGIC_DOUBLE: + delete p->double_; + p->magic = 0; + p->double_ = nullptr; + free(p); + break; + case MAGIC2D_DOUBLE: + delete p->double2d_; + p->magic = 0; + p->double_ = nullptr; + free(p); + break; + case MAGIC_FLOAT: + delete p->float_; + p->magic = 0; + p->float_ = nullptr; + free(p); + break; + case MAGIC2D_FLOAT: + delete p->float2d_; + p->magic = 0; + p->float_ = nullptr; + free(p); + break; + default: abort(); + } +} + +// PlanManager + +template +string SleefDFTXX::planKeyString(string suffix) { + string s; + s += baseTypeID == 1 ? "D" : "S"; + s += (mode & SLEEF_MODE_REAL) ? 
"r" : "c"; + s += (mode & SLEEF_MODE_BACKWARD) ? "b" : "f"; + s += (mode & SLEEF_MODE_ALT) ? "o" : "w"; + s += (mode & SLEEF_MODE_NO_MT) ? "s" : "m"; + s += to_string(log2len) + "," + "0"; + if (suffix != "") s += ":" + suffix; + return s; +} + +template +string SleefDFT2DXX::planKeyString(string suffix) { + string s; + s += baseTypeID == 1 ? "D" : "S"; + s += (mode & SLEEF_MODE_REAL) ? "r" : "c"; + s += (mode & SLEEF_MODE_BACKWARD) ? "b" : "f"; + s += (mode & SLEEF_MODE_ALT) ? "o" : "w"; + s += (mode & SLEEF_MODE_NO_MT) ? "s" : "m"; + s += to_string(log2hlen) + "," + to_string(log2vlen); + if (suffix != "") s += ":" + suffix; + return s; +} + +static string getPlanIdPrefix() { + string s; + +#ifdef ENABLE_STREAM + s += "s"; +#else + s += "n"; +#endif + s += to_string(CONFIGMAX) + ","; + s += to_string(ISAMAX) + ","; + s += to_string(MAXBUTWIDTHDP) + ","; + s += to_string(MAXBUTWIDTHSP) + ","; + s += to_string(MINSHIFTDP) + ","; + s += to_string(MAXSHIFTDP) + ","; + s += to_string(MINSHIFTSP) + ","; + s += to_string(MAXSHIFTSP) + ":"; + + return s; +} + +PlanManager::PlanManager() { + planID = getPlanIdPrefix() + Sleef_getCpuIdString(); +} + +void PlanManager::setPlanFilePath(const char *path, const char *arch, uint64_t mode) { + planMode_ = mode; + + dftPlanFilePath = ""; + if (path != NULL) dftPlanFilePath = path; + + planID = Sleef_getCpuIdString(); + if (arch != NULL) planID = arch; + planID = getPlanIdPrefix() + planID; + + if ((mode & SLEEF_PLAN_RESET) != 0) std::get<0>(thePlan)[planID].clear(); +} + +void PlanManager::loadPlanFromFile() { + if ((planMode_ & SLEEF_PLAN_REFERTOENVVAR) != 0) { + char *s = std::getenv(ENVVAR); + if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode_); + } + + if (dftPlanFilePath != "") { + FILE *fp = fopen(dftPlanFilePath.c_str(), "rb"); + if (fp) { + if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FLOCK(fp); + FileDeserializer d(fp); + tuple>, string> plan; + try { + d >> plan; + } catch(exception &ex) {} + if (!(planMode_ & 
SLEEF_PLAN_NOLOCK)) FUNLOCK(fp); + fclose(fp); + if (std::get<1>(plan) == PLANFILEID) thePlan = plan; + } + } +} + +bool PlanManager::savePlanToFile(const string &fn) { + if (fn != "") { + FILE *fp = fopen(fn.c_str(), "wb"); + if (fp) { + FLOCK(fp); + FileSerializer s(fp); + std::get<1>(thePlan) = PLANFILEID; + s << thePlan; + FUNLOCK(fp); + fclose(fp); + return true; + } + } + return false; +} + +bool PlanManager::savePlanToFile() { + if ((planMode_ & SLEEF_PLAN_READONLY) != 0) return false; + return savePlanToFile(dftPlanFilePath); +} + +bool PlanManager::loadAndPutToFile(const string& key, const string& value) { + if ((planMode_ & SLEEF_PLAN_REFERTOENVVAR) != 0) { + char *s = std::getenv(ENVVAR); + if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode_); + } + + if (dftPlanFilePath != "") { + FILE *fp = fopen(dftPlanFilePath.c_str(), "r+b"); + if (!fp) fp = fopen(dftPlanFilePath.c_str(), "w+b"); + if (fp) { + if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FLOCK(fp); + fseek(fp, 0, SEEK_END); + if (ftell(fp) != 0) { + fseek(fp, 0, SEEK_SET); + FileDeserializer d(fp); + tuple>, string> plan; + try { + d >> plan; + } catch(exception &ex) {} + if (std::get<1>(plan) == PLANFILEID) thePlan = plan; + } + + std::get<0>(thePlan)[planID][key] = value; + std::get<1>(thePlan) = PLANFILEID; + fseek(fp, 0, SEEK_SET); + FileSerializer s(fp); + s << thePlan; + if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FUNLOCK(fp); + fclose(fp); + return true; + } + } + + return false; +} + +EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) { + planManager.setPlanFilePath(path, arch, mode); +} + +EXPORT int SleefDFT_savePlan(const char *pathStr) { + return (int)planManager.savePlanToFile(pathStr); +} + +string PlanManager::get(const string& key) { + if (std::get<0>(thePlan)[planID].count(key) == 0) return ""; + + return std::get<0>(thePlan)[planID].at(key); +} + +void PlanManager::put(const string& key, const string& value) { + std::get<0>(thePlan)[planID][key] = 
value; +} + +// + +template +void SleefDFTXX::saveMeasurementResults() { + assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE); + + unique_lock lock(planManager.mtx); + + if ((planManager.planMode() & SLEEF_PLAN_AUTOMATIC) != 0) { + if (planManager.loadAndPutToFile(planKeyString(), getPath()) && (mode & SLEEF_MODE_VERBOSE) != 0) { + fprintf(verboseFP, "Saving plan to file\n"); + } + } else { + planManager.put(planKeyString(), getPath()); + } +} + +template +void SleefDFT2DXX::saveMeasurementResults() { + assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE); + + unique_lock lock(planManager.mtx); + + if ((planManager.planMode() & SLEEF_PLAN_AUTOMATIC) != 0) { + if (planManager.loadAndPutToFile(planKeyString(), getPath()) && (mode & SLEEF_MODE_VERBOSE) != 0) { + fprintf(verboseFP, "Saving plan to file\n"); + } + } else { + planManager.put(planKeyString(), getPath()); + } +} + +template +bool SleefDFTXX::loadMeasurementResults() { + assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE); + + unique_lock lock(planManager.mtx); + + planManager.loadPlanFromFile(); + + string path = planManager.get(planKeyString()); + if (path == "") return false; + + setPath(path.c_str()); + + return true; +} + +template +bool SleefDFT2DXX::loadMeasurementResults() { + assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE); + + unique_lock lock(planManager.mtx); + + planManager.loadPlanFromFile(); + + string path = planManager.get(planKeyString()); + if (path == "") return false; + + setPath(path.c_str()); + + return true; +} + +// Instantiation + +template void SleefDFTXX::freeTables(); +template void SleefDFTXX::freeTables(); +template SleefDFTXX::~SleefDFTXX(); +template SleefDFTXX::~SleefDFTXX(); +template SleefDFT2DXX::~SleefDFT2DXX(); +template SleefDFT2DXX::~SleefDFT2DXX(); + +template bool SleefDFTXX::loadMeasurementResults(); +template bool SleefDFTXX::loadMeasurementResults(); +template void SleefDFTXX::saveMeasurementResults(); +template void 
SleefDFTXX::saveMeasurementResults(); +template bool SleefDFT2DXX::loadMeasurementResults(); +template bool SleefDFT2DXX::loadMeasurementResults(); +template void SleefDFT2DXX::saveMeasurementResults(); +template void SleefDFT2DXX::saveMeasurementResults(); + +PlanManager planManager; + +FILE *defaultVerboseFP = stdout; + +EXPORT void SleefDFT_setDefaultVerboseFP(FILE *fp) { + defaultVerboseFP = fp; +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.h deleted file mode 100644 index 54a461d7d92..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#define CONFIGMAX 4 -#define CONFIG_STREAM 1 -#define CONFIG_MT 2 - -#define MAXLOG2LEN 32 - -typedef struct SleefDFT { - uint32_t magic; - uint64_t mode, mode2, mode3; - int baseTypeID; - const void *in; - void *out; - - union { - struct { - uint32_t log2len; - - void **tbl[MAXBUTWIDTH+1]; - void *rtCoef0, *rtCoef1; - uint32_t **perm; - - void **x0, **x1; - - int isa; - int planMode; - - int vecwidth, log2vecwidth; - int nThread; - - uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32]; - uint64_t bestTime; - int16_t bestPath[32], bestPathConfig[32], pathLen; - }; - - struct { - int32_t hlen, vlen; - int32_t log2hlen, log2vlen; - uint64_t tmNoMT, tmMT; - struct SleefDFT *instH, *instV; - void *tBuf; - }; - }; -} SleefDFT; - -#define SLEEF_MODE2_MT1D (1 << 0) -#define SLEEF_MODE3_MT2D (1 << 0) - -#define PLANFILEID "SLEEFDFT0\n" -#define ENVVAR "SLEEFDFTPLAN" - -#define SLEEF_MODE_MEASUREBITS (3 << 20) - -void freeTables(SleefDFT *p); -uint32_t ilog2(uint32_t q); - -//int PlanManager_loadMeasurementResultsB(SleefDFT *p); -//void 
PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat); -int PlanManager_loadMeasurementResultsT(SleefDFT *p); -void PlanManager_saveMeasurementResultsT(SleefDFT *p); -int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat); -void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat); - -#define GETINT_VECWIDTH 100 -#define GETINT_DFTPRIORITY 101 diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.hpp b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.hpp new file mode 100644 index 00000000000..282b7a6a313 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/dftcommon.hpp @@ -0,0 +1,237 @@ +// Copyright Naoki Shibata and contributors 2010 - 2025. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "dispatchparam.h" + +#define MAGIC_FLOAT 0x31415926 +#define MAGIC_DOUBLE 0x27182818 +#define MAGIC2D_FLOAT 0x53589793 +#define MAGIC2D_DOUBLE 0x28459045 + +#define CONFIG_STREAM 1 +#define CONFIG_MT 2 + +#define SLEEF_MODE2_MT1D (1 << 0) +#define SLEEF_MODE3_MT2D (1 << 0) + +#define PLANFILEID "SLEEFDFT1" +#define ENVVAR "SLEEFDFTPLAN" + +#define SLEEF_MODE_MEASUREBITS (7 << 20) +#define SLEEF_MODE_INTERNAL_2D (1ULL << 40) + +#define GETINT_VECWIDTH 100 +#define GETINT_DFTPRIORITY 101 + +#define MAXLOG2LEN 32 + +#define INFINITY_ (1e+300 * 1e+300) + +class Action { +public: + int config, level, N; + + Action(const Action& a) = default; + + Action(int config_, int level_, int N_) : config(config_), level(level_), N(N_) {} + + bool operator==(const Action& rhs) const { + return config == rhs.config && level == rhs.level && N == rhs.N; + } + bool operator!=(const Action& rhs) const { return !(*this == rhs); } + + friend ostream& 
operator<<(ostream &os, const Action &ac) { + return os << "[" << ac.config << ", " << ac.level << ", " << ac.N << "]"; + } +}; + +template <> +struct std::hash { + size_t operator()(const Action &a) const { + size_t u = 0; + u ^= a.config; + u = (u << 7) | (u >> ((sizeof(u)*8)-7)); + u ^= a.level; + u = (u << 7) | (u >> ((sizeof(u)*8)-7)); + u ^= a.N; + return u; + } +}; + +template +struct SleefDFTXX { + int magic; + const int baseTypeID; + const real * const in; + real * const out; + const int nThread; + const uint32_t log2len; + const uint64_t mode; + const int minshift; + + uint64_t mode2 = 0, mode3 = 0; + + // + + real **tbl[MAXBUTWIDTH+1]; + real *rtCoef0, *rtCoef1; + uint32_t **perm; + + real **x0, **x1; + + int isa = 0; + int planMode = 0; + + int vecwidth, log2vecwidth; + + bool executable[CONFIGMAX][MAXLOG2LEN][MAXLOG2LEN]; + vector bestPath; + + FILE *verboseFP = NULL; + + void (*(* const DFTF)[ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int); + void (*(* const DFTB)[ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int); + void (*(* const TBUTF)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int); + void (*(* const TBUTB)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int); + void (*(* const BUTF)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int); + void (*(* const BUTB)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int); + void (** const REALSUB0)(real *, const real *, const int, const real *, const real *); + void (** const REALSUB1)(real *, const real *, const int, const real *, const real *, const int); + void (*(* const TBUTFS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int); + void (*(* const TBUTBS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int); + + 
SleefDFTXX(uint32_t n, const real *in, real *out, uint64_t mode, const char *baseTypeString, int BASETYPEID_, int MAGIC_, int minshift_, + int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), + void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), + void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), + void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + ); + + ~SleefDFTXX(); + + void dispatch(const int N, real *d, const real *s, const int level, const int config); + void execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_); + void freeTables(); + void generatePerm(const vector &); + + void measurementRun(real *d, const real *s, const vector &path, uint64_t niter); + double measurePath(const vector &path, uint64_t minTime); + void searchForBestPath(int nPaths); + void searchForRandomPath(); + bool measure(bool randomize); + + vector parsePathStr(const char *); + + string planKeyString(string = ""); + bool loadMeasurementResults(); + void saveMeasurementResults(); + void setPath(const char *pathStr); 
+ string getPath(); +}; + +template +struct SleefDFT2DXX { + int magic; + uint64_t mode, mode2, mode3; + int baseTypeID; + const real *in; + real *out; + + // + + int32_t hlen, vlen; + int32_t log2hlen, log2vlen; + bool planMT; + real *tBuf; + + SleefDFTXX *instH, *instV; + + FILE *verboseFP = NULL; + + SleefDFT2DXX(uint32_t vlen, uint32_t hlen, const real *in, real *out, uint64_t mode, const char *baseTypeString, + int BASETYPEID_, int MAGIC_, int MAGIC2D_, int minshift_, + int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), + void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), + void (*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), + void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), + void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), + void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + ); + + ~SleefDFT2DXX(); + + void execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_); + pair measureTranspose(); + double measurePath(SleefDFTXX *inst, bool mt, + const vector &path, uint32_t hlen, uint32_t vlen, uint64_t minTime); + pair, double> searchForBestPath(SleefDFTXX *inst, bool mt, uint32_t hlen, 
uint32_t vlen, int nPaths); + + string planKeyString(string = ""); + bool loadMeasurementResults(); + void saveMeasurementResults(); + void setPath(const char *pathStr); + string getPath(); +}; + +struct SleefDFT { + uint32_t magic; + union { + SleefDFTXX *double_; + SleefDFTXX *float_; + SleefDFT2DXX *double2d_; + SleefDFT2DXX *float2d_; + }; +}; + +class PlanManager { + string dftPlanFilePath; + uint64_t planMode_ = SLEEF_PLAN_REFERTOENVVAR; + + string planID; + tuple>, string> thePlan; + +public: + PlanManager(); + + recursive_mutex mtx; + + uint64_t planMode() { return planMode_; } + + void setPlanFilePath(const char *path, const char *arch, uint64_t mode); + void loadPlanFromFile(); + bool savePlanToFile(const string &fn); + bool savePlanToFile(); + + bool loadAndPutToFile(const string& key, const string& value); + + string get(const string& key); + void put(const string& key, const string& value); +}; + +extern PlanManager planManager; +extern FILE *defaultVerboseFP; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/mkdispatch.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/mkdispatch.c index 76d6b72e835..0edc5423b70 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/mkdispatch.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/mkdispatch.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -14,13 +14,16 @@ int main(int argc, char **argv) { if (argc < 3) { - fprintf(stderr, "Usage : %s ...\n", argv[0]); + fprintf(stderr, "Usage : %s ...\n", argv[0]); exit(-1); } - const char *basetype = argv[1]; - const int maxbutwidth = atoi(argv[2]); - const int isastart = 3; + const char *baseType = argv[1]; + const char *baseTypeID = argv[2]; + const int maxbutwidth = atoi(argv[3]); + const int minshift = atoi(argv[4]); + const int maxshift = atoi(argv[5]); + const int isastart = 6; const int isamax = argc - isastart; #if ENABLE_STREAM == 1 @@ -29,13 +32,14 @@ int main(int argc, char **argv) { const int enable_stream = 0; #endif - printf("#define MAXBUTWIDTH %d\n", maxbutwidth); + printf("#define MAXBUTWIDTH%s %d\n", baseTypeID, maxbutwidth); + printf("#define MINSHIFT%s %d\n", baseTypeID, minshift); + printf("#define MAXSHIFT%s %d\n", baseTypeID, maxshift); + printf("#define CONFIGMAX 4\n"); + printf("#define ISAMAX %d\n", isamax); printf("\n"); - if (strcmp(basetype, "paramonly") == 0) exit(0); - - printf("#define ISAMAX %d\n", isamax); - printf("#define CONFIGMAX 4\n"); + if (strcmp(baseType, "paramonly") == 0) exit(0); for(int k=isastart;k= minshift) { + printf("tbut%df_%d_%d_%s, ", 1 << i, s, config, argv[k]); + } else { + printf("NULL, "); + } + } + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n\n"); + + printf("void (*tbutbs_%s[MAXSHIFT%s][CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const %s *, const int) = {\n", baseType, baseTypeID, baseTypeID, baseType, baseType, baseType); + for(int s=0;s= minshift) { + printf("tbut%db_%d_%d_%s, ", 1 << i, s, config, argv[k]); + } else { + printf("NULL, "); + } + } + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n\n"); + // - printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype); 
+ printf("void (*realSub0_%s[ISAMAX])(%s *, const %s *, const int, const %s *, const %s *) = {\n ", baseType, baseType, baseType, baseType, baseType); for(int k=isastart;k ...\n", argv[0]); + if (argc < 5) { + fprintf(stderr, "Usage : %s ...\n", argv[0]); exit(-1); } - const char *baseType = argv[1]; - const int isastart = 2; + const char *fn = argv[1]; + const char *baseTypeID = argv[3]; + int shift = atoi(argv[4]); + const int isastart = 5; + int mode = 1; + if (strcmp(argv[4], "-") == 0) { + mode = 0; + } else if (shift <= 0) { + mode = 2; + shift = -shift; + } + + char shiftstr[21]; + snprintf(shiftstr, 20, "%d", shift); for(int config=0;config +#include +#include +#include +#include + +using namespace std; + +class Serializer { +public: + virtual void write(const void *, size_t) = 0; + virtual void flush() {} +}; + +class Deserializer { +public: + virtual void read(void *, size_t) = 0; + + template::value), int>::type = 0> + T read() { + T t; + read(&t, sizeof(T)); + return t; + } +}; + +class FileSerializer : public Serializer { + FILE *fp; + +public: + FileSerializer(FILE *fp_) : fp(fp_) {} + + void write(const void *p, size_t z) { + fwrite(p, z, 1, fp); + } + + void flush() { fflush(fp); } +}; + +class FileDeserializer : public Deserializer { + FILE *fp; + +public: + FileDeserializer(FILE *fp_) : fp(fp_) {} + + void read(void *p, size_t z) { + if (!fread(p, z, 1, fp)) throw(runtime_error("FileDeserializer::read : could not read")); + } +}; + +template::value), int>::type = 0> +Serializer& operator<<(Serializer &s, const T& v) { + s.write((const char *)&v, sizeof(v)); + return s; +} + +template::value), int>::type = 0> +Deserializer& operator>>(Deserializer &s, T& v) { + s.read((char *)&v, sizeof(v)); + return s; +} + +template +Serializer& operator<<(Serializer &s, const vector& v) { + s << v.size(); + for(size_t i=0;i +Deserializer& operator>>(Deserializer &d, vector& v) { + size_t z = d.read(); + for(size_t i=0;i> t; + v.push_back(t); + } + return d; +} + 
+Serializer& operator<<(Serializer &s, const string& str) { + s << (str.size() + 1); + s.write(str.c_str(), str.size() + 1); + return s; +} + +Deserializer& operator>>(Deserializer &d, string& str) { + vector v; + d >> v; + str = v.data(); + return d; +} + +template +Serializer& operator<<(Serializer &s, const unordered_map& m) { + s << m.size(); + for(auto a : m) s << a.first << a.second; + return s; +} + +template +Deserializer& operator>>(Deserializer &d, unordered_map& m) { + size_t z = d.read(); + for(size_t i=0;i> key; + VT value; + d >> value; + m[key] = value; + } + return d; +} + +template +static void serialize_tuple(Serializer &s, const tupletype& t) { + if constexpr (idx < tuple_size_v) { + s << get(t); + serialize_tuple(s, t); + } +} + +template +Serializer& operator<<(Serializer &s, const tuple& t) { + serialize_tuple(s, t); + return s; +} + +template +static void deserialize_tuple(Deserializer &d, tupletype& t) { + if constexpr (idx < tuple_size_v) { + d >> get(t); + deserialize_tuple(d, t); + } +} + +template +Deserializer& operator>>(Deserializer &d, tuple &t) { + deserialize_tuple(d, t); + return d; +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.cpp.in similarity index 99% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.org rename to src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.cpp.in index 3f1f15a6c7d..d31eea62159 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll0.cpp.in @@ -1,8 +1,42 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) +static const real ctbl[] = { + 0.7071067811865475243818940365159164684883L, -0.7071067811865475243818940365159164684883L, + 0.9238795325112867561014214079495587839119L, -0.382683432365089771723257530688933059082L, + 0.382683432365089771723257530688933059082L, -0.9238795325112867561014214079495587839119L, + 0.9807852804032304491190993878113602022495L, -0.1950903220161282678433729148581576851029L, + 0.5555702330196022247573058028269343822103L, -0.8314696123025452370808655033762590846891L, + 0.8314696123025452370808655033762590846891L, -0.5555702330196022247573058028269343822103L, + 0.1950903220161282678433729148581576851029L, -0.9807852804032304491190993878113602022495L, + 0.9951847266721968862310254699821143731242L, -0.09801714032956060199569840382660679267701L, + 0.6343932841636454982026105398063009488396L, -0.7730104533627369607965383602188325085081L, + 0.881921264348355029715105513066220055407L, -0.4713967368259976485449225247492677226546L, + 0.2902846772544623676448431737195932100803L, -0.9569403357322088649310892760624369657307L, + 0.9569403357322088649310892760624369657307L, -0.2902846772544623676448431737195932100803L, + 0.4713967368259976485449225247492677226546L, -0.881921264348355029715105513066220055407L, + 0.7730104533627369607965383602188325085081L, -0.6343932841636454982026105398063009488396L, + 0.09801714032956060199569840382660679267701L, -0.9951847266721968862310254699821143731242L, + 0.9987954562051723927007702841240899260811L, -0.04906767432741801425355085940205324135377L, + 0.6715589548470184006194634573905233310143L, -0.7409511253549590911932944126139233276263L, + 0.9039892931234433315823215138173907234886L, -0.427555093430282094315230886905077056781L, + 0.336889853392220050702686798271834334173L, -0.9415440651830207783906830087961026265475L, + 0.9700312531945439926159106824865574481009L, -0.2429801799032638899447731489766866275204L, + 
0.5141027441932217266072797923204262815489L, -0.8577286100002720698929313536407192941624L, + 0.8032075314806449097991200569701675249235L, -0.5956993044924333434615715265891822127742L, + 0.1467304744553617516588479505190711904561L, -0.9891765099647809734561415551112872890371L, + 0.9891765099647809734561415551112872890371L, -0.1467304744553617516588479505190711904561L, + 0.5956993044924333434615715265891822127742L, -0.8032075314806449097991200569701675249235L, + 0.8577286100002720698929313536407192941624L, -0.5141027441932217266072797923204262815489L, + 0.2429801799032638899447731489766866275204L, -0.9700312531945439926159106824865574481009L, + 0.9415440651830207783906830087961026265475L, -0.336889853392220050702686798271834334173L, + 0.427555093430282094315230886905077056781L, -0.9039892931234433315823215138173907234886L, + 0.7409511253549590911932944126139233276263L, -0.6715589548470184006194634573905233310143L, + 0.04906767432741801425355085940205324135377L, -0.9987954562051723927007702841240899260811L, +}; + ALIGNED(8192) void dft2f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; @@ -241,7 +275,7 @@ ALIGNED(8192) void tbut4b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const } } -#if MAXBUTWIDTH >= 3 +#if MAXBUTWIDTH%TYPEID% >= 3 ALIGNED(8192) void dft8f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; @@ -551,7 +585,7 @@ ALIGNED(8192) void tbut8b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const } #endif -#if MAXBUTWIDTH >= 4 +#if MAXBUTWIDTH%TYPEID% >= 4 ALIGNED(8192) void dft16f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; @@ -1217,7 +1251,7 @@ ALIGNED(8192) void tbut16b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons } #endif -#if MAXBUTWIDTH >= 5 +#if MAXBUTWIDTH%TYPEID% >= 5 ALIGNED(8192) 
void dft32f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; @@ -2727,7 +2761,7 @@ ALIGNED(8192) void tbut32b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons } #endif -#if MAXBUTWIDTH >= 6 +#if MAXBUTWIDTH%TYPEID% >= 6 ALIGNED(8192) void dft64f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; @@ -6191,7 +6225,7 @@ ALIGNED(8192) void tbut64b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons // -#if MAXBUTWIDTH >= 7 +#if MAXBUTWIDTH%TYPEID% >= 7 ALIGNED(8192) void dft128f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) { const int k = 1 << (shift - LOG2VECWIDTH); int i=0; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll1.cpp.in b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll1.cpp.in new file mode 100644 index 00000000000..6401d5d3aee --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/unroll1.cpp.in @@ -0,0 +1,4868 @@ +// Copyright Naoki Shibata and contributors 2010 - 2025. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +ALIGNED(8192) void tbut2f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + scatter(out, 0, 2, plus(load(in, (0 << shift)), load(in, (1 << shift)))); + real2 v4 = minus(load(in, (0 << shift)), load(in, (1 << shift))); + scatter(out, 1, 2, timesminusplus(v4, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v4), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void tbut2b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + scatter(out, 0, 2, plus(load(in, (0 << shift)), load(in, (1 << shift)))); + real2 v4 = minus(load(in, (0 << shift)), load(in, (1 << shift))); + scatter(out, 1, 2, timesminusplus(v4, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v4), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void tbut4f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v5 = load(in, 3 << shift); + real2 v3 = load(in, 1 << shift); + real2 v7 = 
reverse(minus(v3, v5)); + real2 v13 = plus(v3, v5); + real2 v2 = load(in, 0 << shift); + real2 v4 = load(in, 2 << shift); + real2 v8 = minus(v4, v2); + real2 v12 = plus(v2, v4); + scatter(out, 0, 4, plus(v12, v13)); + real2 v26 = minus(v12, v13); + scatter(out, 2, 4, timesminusplus(v26, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v26), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v11 = minusplus(uminus(v7), v8); + real2 v9 = minusplus(v7, v8); + scatter(out, 1, 4, timesminusplus(reverse(v9), load(tbl, 2 * VECWIDTH + tbloffset), times(v9, load(tbl, 3 * VECWIDTH + tbloffset)))); + scatter(out, 3, 4, timesminusplus(reverse(v11), load(tbl, 4 * VECWIDTH + tbloffset), times(v11, load(tbl, 5 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void tbut4b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v5 = load(in, 3 << shift); + real2 v3 = load(in, 1 << shift); + real2 v7 = reverse(minus(v5, v3)); + real2 v13 = plus(v3, v5); + real2 v2 = load(in, 0 << shift); + real2 v4 = load(in, 2 << shift); + real2 v8 = minus(v4, v2); + real2 v12 = plus(v2, v4); + scatter(out, 0, 4, plus(v12, v13)); + real2 v26 = minus(v12, v13); + scatter(out, 2, 4, timesminusplus(v26, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v26), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v11 = minusplus(uminus(v7), v8); + real2 v9 = minusplus(v7, v8); + scatter(out, 1, 4, timesminusplus(reverse(v9), load(tbl, 2 * VECWIDTH + tbloffset), times(v9, load(tbl, 3 * VECWIDTH + tbloffset)))); + scatter(out, 3, 4, timesminusplus(reverse(v11), load(tbl, 4 * VECWIDTH + tbloffset), times(v11, load(tbl, 5 * VECWIDTH + tbloffset)))); + } +} + +#if MAXBUTWIDTH%TYPEID% 
>= 3 +ALIGNED(8192) void tbut8f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v9 = load(in, 7 << shift); + real2 v5 = load(in, 3 << shift); + real2 v37 = plus(v5, v9); + real2 v31 = reverse(minus(v5, v9)); + real2 v7 = load(in, 5 << shift); + real2 v3 = load(in, 1 << shift); + real2 v36 = plus(v3, v7); + real2 v32 = minus(v7, v3); + real2 v57 = plus(v36, v37); + real2 v51 = reverse(minus(v36, v37)); + real2 v35 = minusplus(uminus(v31), v32); + real2 v33 = minusplus(v31, v32); + real2 v43 = timesminusplus(reverse(v33), load(tbl, 6 * VECWIDTH + tbloffset), times(v33, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v6 = load(in, 4 << shift); + real2 v2 = load(in, 0 << shift); + real2 v16 = plus(v2, v6); + real2 v12 = minus(v6, v2); + real2 v8 = load(in, 6 << shift); + real2 v4 = load(in, 2 << shift); + real2 v17 = plus(v4, v8); + real2 v11 = reverse(minus(v4, v8)); + real2 v52 = minus(v17, v16); + real2 v56 = plus(v16, v17); + scatter(out, 0, 8, plus(v56, v57)); + real2 v70 = minus(v56, v57); + scatter(out, 4, 8, timesminusplus(v70, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v70), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v53 = minusplus(v51, v52); + scatter(out, 2, 8, timesminusplus(reverse(v53), load(tbl, 10 * VECWIDTH + tbloffset), times(v53, load(tbl, 11 * VECWIDTH + tbloffset)))); + real2 v55 = minusplus(uminus(v51), v52); + scatter(out, 6, 8, timesminusplus(reverse(v55), load(tbl, 12 * VECWIDTH + tbloffset), times(v55, load(tbl, 13 * VECWIDTH + tbloffset)))); + real2 v15 = minusplus(uminus(v11), v12); + real2 v13 = minusplus(v11, v12); + real2 v23 = timesminusplus(reverse(v13), load(tbl, 2 * VECWIDTH + 
tbloffset), times(v13, load(tbl, 3 * VECWIDTH + tbloffset))); + scatter(out, 1, 8, plus(v23, v43)); + real2 v78 = minus(v23, v43); + scatter(out, 5, 8, timesminusplus(v78, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v78), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v49 = timesminusplus(reverse(v35), load(tbl, 8 * VECWIDTH + tbloffset), times(v35, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v29 = timesminusplus(reverse(v15), load(tbl, 4 * VECWIDTH + tbloffset), times(v15, load(tbl, 5 * VECWIDTH + tbloffset))); + scatter(out, 3, 8, plus(v29, v49)); + real2 v84 = minus(v29, v49); + scatter(out, 7, 8, timesminusplus(v84, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v84), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void tbut8b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v9 = load(in, 7 << shift); + real2 v5 = load(in, 3 << shift); + real2 v37 = plus(v5, v9); + real2 v31 = reverse(minus(v9, v5)); + real2 v7 = load(in, 5 << shift); + real2 v3 = load(in, 1 << shift); + real2 v36 = plus(v3, v7); + real2 v32 = minus(v7, v3); + real2 v57 = plus(v36, v37); + real2 v51 = reverse(minus(v37, v36)); + real2 v35 = minusplus(uminus(v31), v32); + real2 v33 = minusplus(v31, v32); + real2 v43 = timesminusplus(reverse(v33), load(tbl, 6 * VECWIDTH + tbloffset), times(v33, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v6 = load(in, 4 << shift); + real2 v2 = load(in, 0 << shift); + real2 v16 = plus(v2, v6); + real2 v12 = minus(v6, v2); + real2 v8 = load(in, 6 << shift); + real2 v4 = load(in, 2 << shift); + real2 v17 = plus(v4, v8); + real2 v11 = reverse(minus(v8, v4)); + real2 v52 = minus(v17, v16); + real2 
v56 = plus(v16, v17); + scatter(out, 0, 8, plus(v56, v57)); + real2 v70 = minus(v56, v57); + scatter(out, 4, 8, timesminusplus(v70, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v70), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v53 = minusplus(v51, v52); + scatter(out, 2, 8, timesminusplus(reverse(v53), load(tbl, 10 * VECWIDTH + tbloffset), times(v53, load(tbl, 11 * VECWIDTH + tbloffset)))); + real2 v55 = minusplus(uminus(v51), v52); + scatter(out, 6, 8, timesminusplus(reverse(v55), load(tbl, 12 * VECWIDTH + tbloffset), times(v55, load(tbl, 13 * VECWIDTH + tbloffset)))); + real2 v15 = minusplus(uminus(v11), v12); + real2 v13 = minusplus(v11, v12); + real2 v23 = timesminusplus(reverse(v13), load(tbl, 2 * VECWIDTH + tbloffset), times(v13, load(tbl, 3 * VECWIDTH + tbloffset))); + scatter(out, 1, 8, plus(v23, v43)); + real2 v78 = minus(v23, v43); + scatter(out, 5, 8, timesminusplus(v78, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v78), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v49 = timesminusplus(reverse(v35), load(tbl, 8 * VECWIDTH + tbloffset), times(v35, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v29 = timesminusplus(reverse(v15), load(tbl, 4 * VECWIDTH + tbloffset), times(v15, load(tbl, 5 * VECWIDTH + tbloffset))); + scatter(out, 3, 8, plus(v29, v49)); + real2 v84 = minus(v29, v49); + scatter(out, 7, 8, timesminusplus(v84, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v84), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} +#endif + +#if MAXBUTWIDTH%TYPEID% >= 4 +ALIGNED(8192) void tbut16f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v15 = load(in, 13 << shift); + real2 v7 = load(in, 5 << shift); 
+ real2 v45 = plus(v7, v15); + real2 v39 = reverse(minus(v7, v15)); + real2 v3 = load(in, 1 << shift); + real2 v11 = load(in, 9 << shift); + real2 v40 = minus(v11, v3); + real2 v44 = plus(v3, v11); + real2 v124 = plus(v44, v45); + real2 v120 = minus(v45, v44); + real2 v41 = minusplus(v39, v40); + real2 v43 = minusplus(uminus(v39), v40); + real2 v57 = timesminusplus(reverse(v43), load(tbl, 8 * VECWIDTH + tbloffset), times(v43, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v13 = load(in, 11 << shift); + real2 v5 = load(in, 3 << shift); + real2 v84 = plus(v5, v13); + real2 v80 = minus(v13, v5); + real2 v17 = load(in, 15 << shift); + real2 v9 = load(in, 7 << shift); + real2 v85 = plus(v9, v17); + real2 v79 = reverse(minus(v9, v17)); + real2 v119 = reverse(minus(v84, v85)); + real2 v125 = plus(v84, v85); + real2 v145 = plus(v124, v125); + real2 v139 = reverse(minus(v124, v125)); + real2 v121 = minusplus(v119, v120); + real2 v123 = minusplus(uminus(v119), v120); + real2 v137 = timesminusplus(reverse(v123), load(tbl, 24 * VECWIDTH + tbloffset), times(v123, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v131 = timesminusplus(reverse(v121), load(tbl, 22 * VECWIDTH + tbloffset), times(v121, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v4 = load(in, 2 << shift); + real2 v12 = load(in, 10 << shift); + real2 v64 = plus(v4, v12); + real2 v60 = minus(v12, v4); + real2 v8 = load(in, 6 << shift); + real2 v16 = load(in, 14 << shift); + real2 v65 = plus(v8, v16); + real2 v59 = reverse(minus(v8, v16)); + real2 v99 = reverse(minus(v64, v65)); + real2 v105 = plus(v64, v65); + real2 v14 = load(in, 12 << shift); + real2 v6 = load(in, 4 << shift); + real2 v25 = plus(v6, v14); + real2 v19 = reverse(minus(v6, v14)); + real2 v10 = load(in, 8 << shift); + real2 v2 = load(in, 0 << shift); + real2 v20 = minus(v10, v2); + real2 v24 = plus(v2, v10); + real2 v104 = plus(v24, v25); + real2 v100 = minus(v25, v24); + real2 v140 = minus(v105, v104); + real2 v144 = plus(v104, v105); + scatter(out, 
0, 16, plus(v144, v145)); + real2 v158 = minus(v144, v145); + scatter(out, 8, 16, timesminusplus(v158, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v158), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v143 = minusplus(uminus(v139), v140); + scatter(out, 12, 16, timesminusplus(reverse(v143), load(tbl, 28 * VECWIDTH + tbloffset), times(v143, load(tbl, 29 * VECWIDTH + tbloffset)))); + real2 v141 = minusplus(v139, v140); + scatter(out, 4, 16, timesminusplus(reverse(v141), load(tbl, 26 * VECWIDTH + tbloffset), times(v141, load(tbl, 27 * VECWIDTH + tbloffset)))); + real2 v101 = minusplus(v99, v100); + real2 v103 = minusplus(uminus(v99), v100); + real2 v117 = timesminusplus(reverse(v103), load(tbl, 20 * VECWIDTH + tbloffset), times(v103, load(tbl, 21 * VECWIDTH + tbloffset))); + scatter(out, 6, 16, plus(v117, v137)); + real2 v172 = minus(v117, v137); + scatter(out, 14, 16, timesminusplus(v172, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v172), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v111 = timesminusplus(reverse(v101), load(tbl, 18 * VECWIDTH + tbloffset), times(v101, load(tbl, 19 * VECWIDTH + tbloffset))); + scatter(out, 2, 16, plus(v111, v131)); + real2 v166 = minus(v111, v131); + scatter(out, 10, 16, timesminusplus(v166, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v166), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v23 = minusplus(uminus(v19), v20); + real2 v21 = minusplus(v19, v20); + real2 v81 = minusplus(v79, v80); + real2 v83 = minusplus(uminus(v79), v80); + real2 v97 = timesminusplus(reverse(v83), load(tbl, 16 * VECWIDTH + tbloffset), times(v83, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v211 = plus(v57, v97); + real2 v205 = reverse(minus(v57, v97)); + real2 v61 = minusplus(v59, v60); + real2 v63 = minusplus(uminus(v59), v60); + real2 v77 = timesminusplus(reverse(v63), load(tbl, 12 * VECWIDTH + tbloffset), times(v63, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v37 = timesminusplus(reverse(v23), load(tbl, 4 * VECWIDTH + 
tbloffset), times(v23, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v210 = plus(v37, v77); + real2 v206 = minus(v77, v37); + scatter(out, 3, 16, plus(v210, v211)); + real2 v224 = minus(v210, v211); + scatter(out, 11, 16, timesminusplus(v224, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v224), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v207 = minusplus(v205, v206); + real2 v209 = minusplus(uminus(v205), v206); + scatter(out, 15, 16, timesminusplus(reverse(v209), load(tbl, 36 * VECWIDTH + tbloffset), times(v209, load(tbl, 37 * VECWIDTH + tbloffset)))); + scatter(out, 7, 16, timesminusplus(reverse(v207), load(tbl, 34 * VECWIDTH + tbloffset), times(v207, load(tbl, 35 * VECWIDTH + tbloffset)))); + real2 v71 = timesminusplus(reverse(v61), load(tbl, 10 * VECWIDTH + tbloffset), times(v61, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v51 = timesminusplus(reverse(v41), load(tbl, 6 * VECWIDTH + tbloffset), times(v41, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v91 = timesminusplus(reverse(v81), load(tbl, 14 * VECWIDTH + tbloffset), times(v81, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v185 = plus(v51, v91); + real2 v179 = reverse(minus(v51, v91)); + real2 v31 = timesminusplus(reverse(v21), load(tbl, 2 * VECWIDTH + tbloffset), times(v21, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v184 = plus(v31, v71); + real2 v180 = minus(v71, v31); + scatter(out, 1, 16, plus(v184, v185)); + real2 v198 = minus(v184, v185); + scatter(out, 9, 16, timesminusplus(v198, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v198), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v181 = minusplus(v179, v180); + scatter(out, 5, 16, timesminusplus(reverse(v181), load(tbl, 30 * VECWIDTH + tbloffset), times(v181, load(tbl, 31 * VECWIDTH + tbloffset)))); + real2 v183 = minusplus(uminus(v179), v180); + scatter(out, 13, 16, timesminusplus(reverse(v183), load(tbl, 32 * VECWIDTH + tbloffset), times(v183, load(tbl, 33 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void 
tbut16b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + real2 v15 = load(in, 13 << shift); + real2 v7 = load(in, 5 << shift); + real2 v45 = plus(v7, v15); + real2 v39 = reverse(minus(v15, v7)); + real2 v3 = load(in, 1 << shift); + real2 v11 = load(in, 9 << shift); + real2 v40 = minus(v11, v3); + real2 v44 = plus(v3, v11); + real2 v124 = plus(v44, v45); + real2 v120 = minus(v45, v44); + real2 v41 = minusplus(v39, v40); + real2 v43 = minusplus(uminus(v39), v40); + real2 v57 = timesminusplus(reverse(v43), load(tbl, 8 * VECWIDTH + tbloffset), times(v43, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v13 = load(in, 11 << shift); + real2 v5 = load(in, 3 << shift); + real2 v84 = plus(v5, v13); + real2 v80 = minus(v13, v5); + real2 v17 = load(in, 15 << shift); + real2 v9 = load(in, 7 << shift); + real2 v85 = plus(v9, v17); + real2 v79 = reverse(minus(v17, v9)); + real2 v119 = reverse(minus(v85, v84)); + real2 v125 = plus(v84, v85); + real2 v145 = plus(v124, v125); + real2 v139 = reverse(minus(v125, v124)); + real2 v121 = minusplus(v119, v120); + real2 v123 = minusplus(uminus(v119), v120); + real2 v137 = timesminusplus(reverse(v123), load(tbl, 24 * VECWIDTH + tbloffset), times(v123, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v131 = timesminusplus(reverse(v121), load(tbl, 22 * VECWIDTH + tbloffset), times(v121, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v4 = load(in, 2 << shift); + real2 v12 = load(in, 10 << shift); + real2 v64 = plus(v4, v12); + real2 v60 = minus(v12, v4); + real2 v8 = load(in, 6 << shift); + real2 v16 = load(in, 14 << shift); + real2 v65 = plus(v8, v16); + real2 v59 = reverse(minus(v16, v8)); + real2 v99 = 
reverse(minus(v65, v64)); + real2 v105 = plus(v64, v65); + real2 v14 = load(in, 12 << shift); + real2 v6 = load(in, 4 << shift); + real2 v25 = plus(v6, v14); + real2 v19 = reverse(minus(v14, v6)); + real2 v10 = load(in, 8 << shift); + real2 v2 = load(in, 0 << shift); + real2 v20 = minus(v10, v2); + real2 v24 = plus(v2, v10); + real2 v104 = plus(v24, v25); + real2 v100 = minus(v25, v24); + real2 v140 = minus(v105, v104); + real2 v144 = plus(v104, v105); + scatter(out, 0, 16, plus(v144, v145)); + real2 v158 = minus(v144, v145); + scatter(out, 8, 16, timesminusplus(v158, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v158), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v143 = minusplus(uminus(v139), v140); + scatter(out, 12, 16, timesminusplus(reverse(v143), load(tbl, 28 * VECWIDTH + tbloffset), times(v143, load(tbl, 29 * VECWIDTH + tbloffset)))); + real2 v141 = minusplus(v139, v140); + scatter(out, 4, 16, timesminusplus(reverse(v141), load(tbl, 26 * VECWIDTH + tbloffset), times(v141, load(tbl, 27 * VECWIDTH + tbloffset)))); + real2 v101 = minusplus(v99, v100); + real2 v103 = minusplus(uminus(v99), v100); + real2 v117 = timesminusplus(reverse(v103), load(tbl, 20 * VECWIDTH + tbloffset), times(v103, load(tbl, 21 * VECWIDTH + tbloffset))); + scatter(out, 6, 16, plus(v117, v137)); + real2 v172 = minus(v117, v137); + scatter(out, 14, 16, timesminusplus(v172, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v172), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v111 = timesminusplus(reverse(v101), load(tbl, 18 * VECWIDTH + tbloffset), times(v101, load(tbl, 19 * VECWIDTH + tbloffset))); + scatter(out, 2, 16, plus(v111, v131)); + real2 v166 = minus(v111, v131); + scatter(out, 10, 16, timesminusplus(v166, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v166), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v23 = minusplus(uminus(v19), v20); + real2 v21 = minusplus(v19, v20); + real2 v81 = minusplus(v79, v80); + real2 v83 = minusplus(uminus(v79), v80); + real2 v97 = 
timesminusplus(reverse(v83), load(tbl, 16 * VECWIDTH + tbloffset), times(v83, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v211 = plus(v57, v97); + real2 v205 = reverse(minus(v97, v57)); + real2 v61 = minusplus(v59, v60); + real2 v63 = minusplus(uminus(v59), v60); + real2 v77 = timesminusplus(reverse(v63), load(tbl, 12 * VECWIDTH + tbloffset), times(v63, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v37 = timesminusplus(reverse(v23), load(tbl, 4 * VECWIDTH + tbloffset), times(v23, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v210 = plus(v37, v77); + real2 v206 = minus(v77, v37); + scatter(out, 3, 16, plus(v210, v211)); + real2 v224 = minus(v210, v211); + scatter(out, 11, 16, timesminusplus(v224, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v224), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v207 = minusplus(v205, v206); + real2 v209 = minusplus(uminus(v205), v206); + scatter(out, 15, 16, timesminusplus(reverse(v209), load(tbl, 36 * VECWIDTH + tbloffset), times(v209, load(tbl, 37 * VECWIDTH + tbloffset)))); + scatter(out, 7, 16, timesminusplus(reverse(v207), load(tbl, 34 * VECWIDTH + tbloffset), times(v207, load(tbl, 35 * VECWIDTH + tbloffset)))); + real2 v71 = timesminusplus(reverse(v61), load(tbl, 10 * VECWIDTH + tbloffset), times(v61, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v51 = timesminusplus(reverse(v41), load(tbl, 6 * VECWIDTH + tbloffset), times(v41, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v91 = timesminusplus(reverse(v81), load(tbl, 14 * VECWIDTH + tbloffset), times(v81, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v185 = plus(v51, v91); + real2 v179 = reverse(minus(v91, v51)); + real2 v31 = timesminusplus(reverse(v21), load(tbl, 2 * VECWIDTH + tbloffset), times(v21, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v184 = plus(v31, v71); + real2 v180 = minus(v71, v31); + scatter(out, 1, 16, plus(v184, v185)); + real2 v198 = minus(v184, v185); + scatter(out, 9, 16, timesminusplus(v198, load(tbl, 0 * VECWIDTH + tbloffset), 
times(reverse(v198), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v181 = minusplus(v179, v180); + scatter(out, 5, 16, timesminusplus(reverse(v181), load(tbl, 30 * VECWIDTH + tbloffset), times(v181, load(tbl, 31 * VECWIDTH + tbloffset)))); + real2 v183 = minusplus(uminus(v179), v180); + scatter(out, 13, 16, timesminusplus(reverse(v183), load(tbl, 32 * VECWIDTH + tbloffset), times(v183, load(tbl, 33 * VECWIDTH + tbloffset)))); + } +} +#endif + +#if MAXBUTWIDTH%TYPEID% >= 5 +ALIGNED(8192) void tbut32f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { /* Unrolled 32-input butterfly kernel. Each loop iteration reads the 32 inputs load(in, j << shift) for j = 0..31, combines them with table values fetched as load(tbl, n * VECWIDTH + tbloffset) for n up to 93, and writes 32 results through scatter(out, j, 32, ...) for j = 0..31. The loop is annotated with an OpenMP parallel-for pragma. NOTE(review): presumably the forward-direction variant (the f in the name) emitted by a code generator, with the percent-delimited tokens substituted at build time; confirm before hand-editing. */ + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); /* k works out to 1 << %SHIFT% iterations */ + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; /* per-iteration output base is taken from the q[] offset array */ + const real *in = in0 + i0*2; + const int tbloffset = K * i0; /* added to every tbl index used below */ + + real2 v14 = load(in, 12 << shift); + real2 v30 = load(in, 28 << shift); + real2 v115 = reverse(minus(v14, v30)); + real2 v121 = plus(v14, v30); + real2 v6 = load(in, 4 << shift); + real2 v22 = load(in, 20 << shift); + real2 v120 = plus(v6, v22); + real2 v116 = minus(v22, v6); + real2 v201 = plus(v120, v121); + real2 v195 = reverse(minus(v120, v121)); + real2 v119 = minusplus(uminus(v115), v116); + real2 v117 = minusplus(v115, v116); + real2 v133 = timesminusplus(reverse(v119), load(tbl, 20 * VECWIDTH + tbloffset), times(v119, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v127 = timesminusplus(reverse(v117), load(tbl, 18 * VECWIDTH + tbloffset), times(v117, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v18 = load(in, 16 << shift); + real2 v2 = load(in, 0 << shift); + real2 v40 = plus(v2, v18); + real2 v36 = minus(v18, v2); + real2 v10 = load(in, 8 << shift); + real2 v26 = load(in, 24 << shift); + real2 v41 = plus(v10, v26); + real2 v35 = reverse(minus(v10, v26)); + real2 v200 = plus(v40, v41); + real2 v196 = minus(v41, v40); + real2 v37 = minusplus(v35, v36); + real2 v39 =
minusplus(uminus(v35), v36); + real2 v53 = timesminusplus(reverse(v39), load(tbl, 4 * VECWIDTH + tbloffset), times(v39, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v276 = minus(v201, v200); + real2 v280 = plus(v200, v201); + real2 v47 = timesminusplus(reverse(v37), load(tbl, 2 * VECWIDTH + tbloffset), times(v37, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v199 = minusplus(uminus(v195), v196); + real2 v197 = minusplus(v195, v196); + real2 v486 = minus(v133, v53); + real2 v490 = plus(v53, v133); + real2 v213 = timesminusplus(reverse(v199), load(tbl, 36 * VECWIDTH + tbloffset), times(v199, load(tbl, 37 * VECWIDTH + tbloffset))); + real2 v207 = timesminusplus(reverse(v197), load(tbl, 34 * VECWIDTH + tbloffset), times(v197, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v28 = load(in, 26 << shift); + real2 v12 = load(in, 10 << shift); + real2 v81 = plus(v12, v28); + real2 v75 = reverse(minus(v12, v28)); + real2 v20 = load(in, 18 << shift); + real2 v4 = load(in, 2 << shift); + real2 v80 = plus(v4, v20); + real2 v76 = minus(v20, v4); + real2 v236 = minus(v81, v80); + real2 v240 = plus(v80, v81); + real2 v77 = minusplus(v75, v76); + real2 v79 = minusplus(uminus(v75), v76); + real2 v93 = timesminusplus(reverse(v79), load(tbl, 12 * VECWIDTH + tbloffset), times(v79, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v32 = load(in, 30 << shift); + real2 v16 = load(in, 14 << shift); + real2 v155 = reverse(minus(v16, v32)); + real2 v161 = plus(v16, v32); + real2 v24 = load(in, 22 << shift); + real2 v8 = load(in, 6 << shift); + real2 v160 = plus(v8, v24); + real2 v156 = minus(v24, v8); + real2 v235 = reverse(minus(v160, v161)); + real2 v241 = plus(v160, v161); + real2 v157 = minusplus(v155, v156); + real2 v159 = minusplus(uminus(v155), v156); + real2 v173 = timesminusplus(reverse(v159), load(tbl, 28 * VECWIDTH + tbloffset), times(v159, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v485 = reverse(minus(v93, v173)); + real2 v491 = plus(v93, v173); + real2 v489 =
minusplus(uminus(v485), v486); + real2 v487 = minusplus(v485, v486); + real2 v239 = minusplus(uminus(v235), v236); + real2 v237 = minusplus(v235, v236); + real2 v253 = timesminusplus(reverse(v239), load(tbl, 44 * VECWIDTH + tbloffset), times(v239, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v497 = timesminusplus(reverse(v487), load(tbl, 82 * VECWIDTH + tbloffset), times(v487, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v530 = plus(v490, v491); + real2 v526 = minus(v491, v490); + real2 v503 = timesminusplus(reverse(v489), load(tbl, 84 * VECWIDTH + tbloffset), times(v489, load(tbl, 85 * VECWIDTH + tbloffset))); + real2 v247 = timesminusplus(reverse(v237), load(tbl, 42 * VECWIDTH + tbloffset), times(v237, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v356 = minus(v247, v207); + real2 v360 = plus(v207, v247); + real2 v386 = plus(v213, v253); + real2 v382 = minus(v253, v213); + real2 v17 = load(in, 15 << shift); + real2 v33 = load(in, 31 << shift); + real2 v175 = reverse(minus(v17, v33)); + real2 v181 = plus(v17, v33); + real2 v25 = load(in, 23 << shift); + real2 v9 = load(in, 7 << shift); + real2 v176 = minus(v25, v9); + real2 v180 = plus(v9, v25); + real2 v177 = minusplus(v175, v176); + real2 v179 = minusplus(uminus(v175), v176); + real2 v193 = timesminusplus(reverse(v179), load(tbl, 32 * VECWIDTH + tbloffset), times(v179, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v261 = plus(v180, v181); + real2 v255 = reverse(minus(v180, v181)); + real2 v29 = load(in, 27 << shift); + real2 v13 = load(in, 11 << shift); + real2 v101 = plus(v13, v29); + real2 v95 = reverse(minus(v13, v29)); + real2 v21 = load(in, 19 << shift); + real2 v5 = load(in, 3 << shift); + real2 v100 = plus(v5, v21); + real2 v96 = minus(v21, v5); + real2 v99 = minusplus(uminus(v95), v96); + real2 v97 = minusplus(v95, v96); + real2 v260 = plus(v100, v101); + real2 v256 = minus(v101, v100); + real2 v259 = minusplus(uminus(v255), v256); + real2 v257 = minusplus(v255, v256); + real2 v273 =
timesminusplus(reverse(v259), load(tbl, 48 * VECWIDTH + tbloffset), times(v259, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v267 = timesminusplus(reverse(v257), load(tbl, 46 * VECWIDTH + tbloffset), times(v257, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v3 = load(in, 1 << shift); + real2 v19 = load(in, 17 << shift); + real2 v60 = plus(v3, v19); + real2 v56 = minus(v19, v3); + real2 v27 = load(in, 25 << shift); + real2 v11 = load(in, 9 << shift); + real2 v55 = reverse(minus(v11, v27)); + real2 v61 = plus(v11, v27); + real2 v220 = plus(v60, v61); + real2 v216 = minus(v61, v60); + real2 v7 = load(in, 5 << shift); + real2 v23 = load(in, 21 << shift); + real2 v136 = minus(v23, v7); + real2 v140 = plus(v7, v23); + real2 v15 = load(in, 13 << shift); + real2 v31 = load(in, 29 << shift); + real2 v135 = reverse(minus(v15, v31)); + real2 v141 = plus(v15, v31); + real2 v215 = reverse(minus(v140, v141)); + real2 v221 = plus(v140, v141); + real2 v219 = minusplus(uminus(v215), v216); + real2 v217 = minusplus(v215, v216); + real2 v227 = timesminusplus(reverse(v217), load(tbl, 38 * VECWIDTH + tbloffset), times(v217, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v355 = reverse(minus(v227, v267)); + real2 v361 = plus(v227, v267); + scatter(out, 2, 32, plus(v360, v361)); + real2 v374 = minus(v360, v361); + scatter(out, 18, 32, timesminusplus(v374, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v374), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v357 = minusplus(v355, v356); + scatter(out, 10, 32, timesminusplus(reverse(v357), load(tbl, 62 * VECWIDTH + tbloffset), times(v357, load(tbl, 63 * VECWIDTH + tbloffset)))); + real2 v359 = minusplus(uminus(v355), v356); + scatter(out, 26, 32, timesminusplus(reverse(v359), load(tbl, 64 * VECWIDTH + tbloffset), times(v359, load(tbl, 65 * VECWIDTH + tbloffset)))); + real2 v233 = timesminusplus(reverse(v219), load(tbl, 40 * VECWIDTH + tbloffset), times(v219, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v381 =
reverse(minus(v233, v273)); + real2 v387 = plus(v233, v273); + scatter(out, 6, 32, plus(v386, v387)); + real2 v400 = minus(v386, v387); + scatter(out, 22, 32, timesminusplus(v400, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v400), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v383 = minusplus(v381, v382); + real2 v385 = minusplus(uminus(v381), v382); + scatter(out, 30, 32, timesminusplus(reverse(v385), load(tbl, 68 * VECWIDTH + tbloffset), times(v385, load(tbl, 69 * VECWIDTH + tbloffset)))); + scatter(out, 14, 32, timesminusplus(reverse(v383), load(tbl, 66 * VECWIDTH + tbloffset), times(v383, load(tbl, 67 * VECWIDTH + tbloffset)))); + real2 v137 = minusplus(v135, v136); + real2 v139 = minusplus(uminus(v135), v136); + real2 v153 = timesminusplus(reverse(v139), load(tbl, 24 * VECWIDTH + tbloffset), times(v139, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v113 = timesminusplus(reverse(v99), load(tbl, 16 * VECWIDTH + tbloffset), times(v99, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v511 = plus(v113, v193); + real2 v505 = reverse(minus(v113, v193)); + real2 v57 = minusplus(v55, v56); + real2 v59 = minusplus(uminus(v55), v56); + real2 v73 = timesminusplus(reverse(v59), load(tbl, 8 * VECWIDTH + tbloffset), times(v59, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v510 = plus(v73, v153); + real2 v506 = minus(v153, v73); + real2 v531 = plus(v510, v511); + real2 v525 = reverse(minus(v510, v511)); + scatter(out, 3, 32, plus(v530, v531)); + real2 v544 = minus(v530, v531); + scatter(out, 19, 32, timesminusplus(v544, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v544), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v527 = minusplus(v525, v526); + scatter(out, 11, 32, timesminusplus(reverse(v527), load(tbl, 90 * VECWIDTH + tbloffset), times(v527, load(tbl, 91 * VECWIDTH + tbloffset)))); + real2 v529 = minusplus(uminus(v525), v526); + scatter(out, 27, 32, timesminusplus(reverse(v529), load(tbl, 92 * VECWIDTH + tbloffset), times(v529, load(tbl, 93 * VECWIDTH
+ tbloffset)))); + real2 v509 = minusplus(uminus(v505), v506); + real2 v507 = minusplus(v505, v506); + real2 v523 = timesminusplus(reverse(v509), load(tbl, 88 * VECWIDTH + tbloffset), times(v509, load(tbl, 89 * VECWIDTH + tbloffset))); + scatter(out, 15, 32, plus(v503, v523)); + real2 v556 = minus(v503, v523); + scatter(out, 31, 32, timesminusplus(v556, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v556), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v517 = timesminusplus(reverse(v507), load(tbl, 86 * VECWIDTH + tbloffset), times(v507, load(tbl, 87 * VECWIDTH + tbloffset))); + scatter(out, 7, 32, plus(v497, v517)); + real2 v550 = minus(v497, v517); + scatter(out, 23, 32, timesminusplus(v550, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v550), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v275 = reverse(minus(v240, v241)); + real2 v281 = plus(v240, v241); + real2 v320 = plus(v280, v281); + real2 v316 = minus(v281, v280); + real2 v301 = plus(v260, v261); + real2 v295 = reverse(minus(v260, v261)); + real2 v300 = plus(v220, v221); + real2 v296 = minus(v221, v220); + real2 v315 = reverse(minus(v300, v301)); + real2 v321 = plus(v300, v301); + scatter(out, 0, 32, plus(v320, v321)); + real2 v334 = minus(v320, v321); + scatter(out, 16, 32, timesminusplus(v334, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v334), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v319 = minusplus(uminus(v315), v316); + real2 v317 = minusplus(v315, v316); + scatter(out, 8, 32, timesminusplus(reverse(v317), load(tbl, 58 * VECWIDTH + tbloffset), times(v317, load(tbl, 59 * VECWIDTH + tbloffset)))); + scatter(out, 24, 32, timesminusplus(reverse(v319), load(tbl, 60 * VECWIDTH + tbloffset), times(v319, load(tbl, 61 * VECWIDTH + tbloffset)))); + real2 v299 = minusplus(uminus(v295), v296); + real2 v297 = minusplus(v295, v296); + real2 v279 = minusplus(uminus(v275), v276); + real2 v277 = minusplus(v275, v276); + real2 v287 = timesminusplus(reverse(v277), load(tbl, 50 * VECWIDTH +
tbloffset), times(v277, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v307 = timesminusplus(reverse(v297), load(tbl, 54 * VECWIDTH + tbloffset), times(v297, load(tbl, 55 * VECWIDTH + tbloffset))); + scatter(out, 4, 32, plus(v287, v307)); + real2 v342 = minus(v287, v307); + scatter(out, 20, 32, timesminusplus(v342, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v342), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v313 = timesminusplus(reverse(v299), load(tbl, 56 * VECWIDTH + tbloffset), times(v299, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v293 = timesminusplus(reverse(v279), load(tbl, 52 * VECWIDTH + tbloffset), times(v279, load(tbl, 53 * VECWIDTH + tbloffset))); + scatter(out, 12, 32, plus(v293, v313)); + real2 v348 = minus(v293, v313); + scatter(out, 28, 32, timesminusplus(v348, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v348), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v87 = timesminusplus(reverse(v77), load(tbl, 10 * VECWIDTH + tbloffset), times(v77, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v147 = timesminusplus(reverse(v137), load(tbl, 22 * VECWIDTH + tbloffset), times(v137, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v187 = timesminusplus(reverse(v177), load(tbl, 30 * VECWIDTH + tbloffset), times(v177, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v167 = timesminusplus(reverse(v157), load(tbl, 26 * VECWIDTH + tbloffset), times(v157, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v413 = plus(v87, v167); + real2 v407 = reverse(minus(v87, v167)); + real2 v67 = timesminusplus(reverse(v57), load(tbl, 6 * VECWIDTH + tbloffset), times(v57, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v107 = timesminusplus(reverse(v97), load(tbl, 14 * VECWIDTH + tbloffset), times(v97, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v427 = reverse(minus(v107, v187)); + real2 v433 = plus(v107, v187); + real2 v432 = plus(v67, v147); + real2 v428 = minus(v147, v67); + real2 v453 = plus(v432, v433); + real2 v447 = reverse(minus(v432, v433)); +
real2 v408 = minus(v127, v47); + real2 v412 = plus(v47, v127); + real2 v452 = plus(v412, v413); + real2 v448 = minus(v413, v412); + scatter(out, 1, 32, plus(v452, v453)); + real2 v466 = minus(v452, v453); + scatter(out, 17, 32, timesminusplus(v466, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v466), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v451 = minusplus(uminus(v447), v448); + scatter(out, 25, 32, timesminusplus(reverse(v451), load(tbl, 80 * VECWIDTH + tbloffset), times(v451, load(tbl, 81 * VECWIDTH + tbloffset)))); + real2 v449 = minusplus(v447, v448); + scatter(out, 9, 32, timesminusplus(reverse(v449), load(tbl, 78 * VECWIDTH + tbloffset), times(v449, load(tbl, 79 * VECWIDTH + tbloffset)))); + real2 v429 = minusplus(v427, v428); + real2 v431 = minusplus(uminus(v427), v428); + real2 v445 = timesminusplus(reverse(v431), load(tbl, 76 * VECWIDTH + tbloffset), times(v431, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v409 = minusplus(v407, v408); + real2 v411 = minusplus(uminus(v407), v408); + real2 v425 = timesminusplus(reverse(v411), load(tbl, 72 * VECWIDTH + tbloffset), times(v411, load(tbl, 73 * VECWIDTH + tbloffset))); + scatter(out, 13, 32, plus(v425, v445)); + real2 v478 = minus(v425, v445); + scatter(out, 29, 32, timesminusplus(v478, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v478), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v439 = timesminusplus(reverse(v429), load(tbl, 74 * VECWIDTH + tbloffset), times(v429, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v419 = timesminusplus(reverse(v409), load(tbl, 70 * VECWIDTH + tbloffset), times(v409, load(tbl, 71 * VECWIDTH + tbloffset))); + scatter(out, 5, 32, plus(v419, v439)); + real2 v472 = minus(v419, v439); + scatter(out, 21, 32, timesminusplus(v472, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v472), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} + +ALIGNED(8192) void tbut32b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real
*RESTRICT tbl, const int K) { /* tbut32b: unrolled 32-input butterfly kernel with the same signature and loop shape as tbut32f. Each iteration reads load(in, j << shift) for j = 0..31, uses table values load(tbl, n * VECWIDTH + tbloffset) for n up to 93, and writes scatter(out, j, 32, ...) for j = 0..31. Relative to tbut32f, every minus() wrapped in reverse() takes its two operands in the opposite order. NOTE(review): presumably the backward-direction variant (the b in the name) emitted by a code generator; confirm before hand-editing. */ + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); /* k works out to 1 << %SHIFT% iterations */ + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; /* per-iteration output base is taken from the q[] offset array */ + const real *in = in0 + i0*2; + const int tbloffset = K * i0; /* added to every tbl index used below */ + + real2 v14 = load(in, 12 << shift); + real2 v30 = load(in, 28 << shift); + real2 v115 = reverse(minus(v30, v14)); + real2 v121 = plus(v14, v30); + real2 v6 = load(in, 4 << shift); + real2 v22 = load(in, 20 << shift); + real2 v120 = plus(v6, v22); + real2 v116 = minus(v22, v6); + real2 v201 = plus(v120, v121); + real2 v195 = reverse(minus(v121, v120)); + real2 v119 = minusplus(uminus(v115), v116); + real2 v117 = minusplus(v115, v116); + real2 v133 = timesminusplus(reverse(v119), load(tbl, 20 * VECWIDTH + tbloffset), times(v119, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v127 = timesminusplus(reverse(v117), load(tbl, 18 * VECWIDTH + tbloffset), times(v117, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v18 = load(in, 16 << shift); + real2 v2 = load(in, 0 << shift); + real2 v40 = plus(v2, v18); + real2 v36 = minus(v18, v2); + real2 v10 = load(in, 8 << shift); + real2 v26 = load(in, 24 << shift); + real2 v41 = plus(v10, v26); + real2 v35 = reverse(minus(v26, v10)); + real2 v200 = plus(v40, v41); + real2 v196 = minus(v41, v40); + real2 v37 = minusplus(v35, v36); + real2 v39 = minusplus(uminus(v35), v36); + real2 v53 = timesminusplus(reverse(v39), load(tbl, 4 * VECWIDTH + tbloffset), times(v39, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v276 = minus(v201, v200); + real2 v280 = plus(v200, v201); + real2 v47 = timesminusplus(reverse(v37), load(tbl, 2 * VECWIDTH + tbloffset), times(v37, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v199 = minusplus(uminus(v195), v196); + real2 v197 = minusplus(v195, v196); + real2 v486 = minus(v133, v53); + real2 v490 = plus(v53, v133); + real2 v213 = timesminusplus(reverse(v199), load(tbl, 36 * VECWIDTH + tbloffset), times(v199, load(tbl, 37 *
VECWIDTH + tbloffset))); + real2 v207 = timesminusplus(reverse(v197), load(tbl, 34 * VECWIDTH + tbloffset), times(v197, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v28 = load(in, 26 << shift); + real2 v12 = load(in, 10 << shift); + real2 v81 = plus(v12, v28); + real2 v75 = reverse(minus(v28, v12)); + real2 v20 = load(in, 18 << shift); + real2 v4 = load(in, 2 << shift); + real2 v80 = plus(v4, v20); + real2 v76 = minus(v20, v4); + real2 v236 = minus(v81, v80); + real2 v240 = plus(v80, v81); + real2 v77 = minusplus(v75, v76); + real2 v79 = minusplus(uminus(v75), v76); + real2 v93 = timesminusplus(reverse(v79), load(tbl, 12 * VECWIDTH + tbloffset), times(v79, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v32 = load(in, 30 << shift); + real2 v16 = load(in, 14 << shift); + real2 v155 = reverse(minus(v32, v16)); + real2 v161 = plus(v16, v32); + real2 v24 = load(in, 22 << shift); + real2 v8 = load(in, 6 << shift); + real2 v160 = plus(v8, v24); + real2 v156 = minus(v24, v8); + real2 v235 = reverse(minus(v161, v160)); + real2 v241 = plus(v160, v161); + real2 v157 = minusplus(v155, v156); + real2 v159 = minusplus(uminus(v155), v156); + real2 v173 = timesminusplus(reverse(v159), load(tbl, 28 * VECWIDTH + tbloffset), times(v159, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v485 = reverse(minus(v173, v93)); + real2 v491 = plus(v93, v173); + real2 v489 = minusplus(uminus(v485), v486); + real2 v487 = minusplus(v485, v486); + real2 v239 = minusplus(uminus(v235), v236); + real2 v237 = minusplus(v235, v236); + real2 v253 = timesminusplus(reverse(v239), load(tbl, 44 * VECWIDTH + tbloffset), times(v239, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v497 = timesminusplus(reverse(v487), load(tbl, 82 * VECWIDTH + tbloffset), times(v487, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v530 = plus(v490, v491); + real2 v526 = minus(v491, v490); + real2 v503 = timesminusplus(reverse(v489), load(tbl, 84 * VECWIDTH + tbloffset), times(v489, load(tbl, 85 * VECWIDTH + tbloffset))); +
real2 v247 = timesminusplus(reverse(v237), load(tbl, 42 * VECWIDTH + tbloffset), times(v237, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v356 = minus(v247, v207); + real2 v360 = plus(v207, v247); + real2 v386 = plus(v213, v253); + real2 v382 = minus(v253, v213); + real2 v17 = load(in, 15 << shift); + real2 v33 = load(in, 31 << shift); + real2 v175 = reverse(minus(v33, v17)); + real2 v181 = plus(v17, v33); + real2 v25 = load(in, 23 << shift); + real2 v9 = load(in, 7 << shift); + real2 v176 = minus(v25, v9); + real2 v180 = plus(v9, v25); + real2 v177 = minusplus(v175, v176); + real2 v179 = minusplus(uminus(v175), v176); + real2 v193 = timesminusplus(reverse(v179), load(tbl, 32 * VECWIDTH + tbloffset), times(v179, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v261 = plus(v180, v181); + real2 v255 = reverse(minus(v181, v180)); + real2 v29 = load(in, 27 << shift); + real2 v13 = load(in, 11 << shift); + real2 v101 = plus(v13, v29); + real2 v95 = reverse(minus(v29, v13)); + real2 v21 = load(in, 19 << shift); + real2 v5 = load(in, 3 << shift); + real2 v100 = plus(v5, v21); + real2 v96 = minus(v21, v5); + real2 v99 = minusplus(uminus(v95), v96); + real2 v97 = minusplus(v95, v96); + real2 v260 = plus(v100, v101); + real2 v256 = minus(v101, v100); + real2 v259 = minusplus(uminus(v255), v256); + real2 v257 = minusplus(v255, v256); + real2 v273 = timesminusplus(reverse(v259), load(tbl, 48 * VECWIDTH + tbloffset), times(v259, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v267 = timesminusplus(reverse(v257), load(tbl, 46 * VECWIDTH + tbloffset), times(v257, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v3 = load(in, 1 << shift); + real2 v19 = load(in, 17 << shift); + real2 v60 = plus(v3, v19); + real2 v56 = minus(v19, v3); + real2 v27 = load(in, 25 << shift); + real2 v11 = load(in, 9 << shift); + real2 v55 = reverse(minus(v27, v11)); + real2 v61 = plus(v11, v27); + real2 v220 = plus(v60, v61); + real2 v216 = minus(v61, v60); + real2 v7 = load(in, 5 << shift); + real2
v23 = load(in, 21 << shift); + real2 v136 = minus(v23, v7); + real2 v140 = plus(v7, v23); + real2 v15 = load(in, 13 << shift); + real2 v31 = load(in, 29 << shift); + real2 v135 = reverse(minus(v31, v15)); + real2 v141 = plus(v15, v31); + real2 v215 = reverse(minus(v141, v140)); + real2 v221 = plus(v140, v141); + real2 v219 = minusplus(uminus(v215), v216); + real2 v217 = minusplus(v215, v216); + real2 v227 = timesminusplus(reverse(v217), load(tbl, 38 * VECWIDTH + tbloffset), times(v217, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v355 = reverse(minus(v267, v227)); + real2 v361 = plus(v227, v267); + scatter(out, 2, 32, plus(v360, v361)); + real2 v374 = minus(v360, v361); + scatter(out, 18, 32, timesminusplus(v374, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v374), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v357 = minusplus(v355, v356); + scatter(out, 10, 32, timesminusplus(reverse(v357), load(tbl, 62 * VECWIDTH + tbloffset), times(v357, load(tbl, 63 * VECWIDTH + tbloffset)))); + real2 v359 = minusplus(uminus(v355), v356); + scatter(out, 26, 32, timesminusplus(reverse(v359), load(tbl, 64 * VECWIDTH + tbloffset), times(v359, load(tbl, 65 * VECWIDTH + tbloffset)))); + real2 v233 = timesminusplus(reverse(v219), load(tbl, 40 * VECWIDTH + tbloffset), times(v219, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v381 = reverse(minus(v273, v233)); + real2 v387 = plus(v233, v273); + scatter(out, 6, 32, plus(v386, v387)); + real2 v400 = minus(v386, v387); + scatter(out, 22, 32, timesminusplus(v400, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v400), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v383 = minusplus(v381, v382); + real2 v385 = minusplus(uminus(v381), v382); + scatter(out, 30, 32, timesminusplus(reverse(v385), load(tbl, 68 * VECWIDTH + tbloffset), times(v385, load(tbl, 69 * VECWIDTH + tbloffset)))); + scatter(out, 14, 32, timesminusplus(reverse(v383), load(tbl, 66 * VECWIDTH + tbloffset), times(v383, load(tbl, 67 * VECWIDTH + tbloffset)))); +
real2 v137 = minusplus(v135, v136); + real2 v139 = minusplus(uminus(v135), v136); + real2 v153 = timesminusplus(reverse(v139), load(tbl, 24 * VECWIDTH + tbloffset), times(v139, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v113 = timesminusplus(reverse(v99), load(tbl, 16 * VECWIDTH + tbloffset), times(v99, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v511 = plus(v113, v193); + real2 v505 = reverse(minus(v193, v113)); + real2 v57 = minusplus(v55, v56); + real2 v59 = minusplus(uminus(v55), v56); + real2 v73 = timesminusplus(reverse(v59), load(tbl, 8 * VECWIDTH + tbloffset), times(v59, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v510 = plus(v73, v153); + real2 v506 = minus(v153, v73); + real2 v531 = plus(v510, v511); + real2 v525 = reverse(minus(v511, v510)); + scatter(out, 3, 32, plus(v530, v531)); + real2 v544 = minus(v530, v531); + scatter(out, 19, 32, timesminusplus(v544, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v544), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v527 = minusplus(v525, v526); + scatter(out, 11, 32, timesminusplus(reverse(v527), load(tbl, 90 * VECWIDTH + tbloffset), times(v527, load(tbl, 91 * VECWIDTH + tbloffset)))); + real2 v529 = minusplus(uminus(v525), v526); + scatter(out, 27, 32, timesminusplus(reverse(v529), load(tbl, 92 * VECWIDTH + tbloffset), times(v529, load(tbl, 93 * VECWIDTH + tbloffset)))); + real2 v509 = minusplus(uminus(v505), v506); + real2 v507 = minusplus(v505, v506); + real2 v523 = timesminusplus(reverse(v509), load(tbl, 88 * VECWIDTH + tbloffset), times(v509, load(tbl, 89 * VECWIDTH + tbloffset))); + scatter(out, 15, 32, plus(v503, v523)); + real2 v556 = minus(v503, v523); + scatter(out, 31, 32, timesminusplus(v556, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v556), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v517 = timesminusplus(reverse(v507), load(tbl, 86 * VECWIDTH + tbloffset), times(v507, load(tbl, 87 * VECWIDTH + tbloffset))); + scatter(out, 7, 32, plus(v497, v517)); + real2 v550 =
minus(v497, v517); + scatter(out, 23, 32, timesminusplus(v550, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v550), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v275 = reverse(minus(v241, v240)); + real2 v281 = plus(v240, v241); + real2 v320 = plus(v280, v281); + real2 v316 = minus(v281, v280); + real2 v301 = plus(v260, v261); + real2 v295 = reverse(minus(v261, v260)); + real2 v300 = plus(v220, v221); + real2 v296 = minus(v221, v220); + real2 v315 = reverse(minus(v301, v300)); + real2 v321 = plus(v300, v301); + scatter(out, 0, 32, plus(v320, v321)); + real2 v334 = minus(v320, v321); + scatter(out, 16, 32, timesminusplus(v334, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v334), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v319 = minusplus(uminus(v315), v316); + real2 v317 = minusplus(v315, v316); + scatter(out, 8, 32, timesminusplus(reverse(v317), load(tbl, 58 * VECWIDTH + tbloffset), times(v317, load(tbl, 59 * VECWIDTH + tbloffset)))); + scatter(out, 24, 32, timesminusplus(reverse(v319), load(tbl, 60 * VECWIDTH + tbloffset), times(v319, load(tbl, 61 * VECWIDTH + tbloffset)))); + real2 v299 = minusplus(uminus(v295), v296); + real2 v297 = minusplus(v295, v296); + real2 v279 = minusplus(uminus(v275), v276); + real2 v277 = minusplus(v275, v276); + real2 v287 = timesminusplus(reverse(v277), load(tbl, 50 * VECWIDTH + tbloffset), times(v277, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v307 = timesminusplus(reverse(v297), load(tbl, 54 * VECWIDTH + tbloffset), times(v297, load(tbl, 55 * VECWIDTH + tbloffset))); + scatter(out, 4, 32, plus(v287, v307)); + real2 v342 = minus(v287, v307); + scatter(out, 20, 32, timesminusplus(v342, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v342), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v313 = timesminusplus(reverse(v299), load(tbl, 56 * VECWIDTH + tbloffset), times(v299, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v293 = timesminusplus(reverse(v279), load(tbl, 52 * VECWIDTH + tbloffset), times(v279,
load(tbl, 53 * VECWIDTH + tbloffset))); + scatter(out, 12, 32, plus(v293, v313)); + real2 v348 = minus(v293, v313); + scatter(out, 28, 32, timesminusplus(v348, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v348), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v87 = timesminusplus(reverse(v77), load(tbl, 10 * VECWIDTH + tbloffset), times(v77, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v147 = timesminusplus(reverse(v137), load(tbl, 22 * VECWIDTH + tbloffset), times(v137, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v187 = timesminusplus(reverse(v177), load(tbl, 30 * VECWIDTH + tbloffset), times(v177, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v167 = timesminusplus(reverse(v157), load(tbl, 26 * VECWIDTH + tbloffset), times(v157, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v413 = plus(v87, v167); + real2 v407 = reverse(minus(v167, v87)); + real2 v67 = timesminusplus(reverse(v57), load(tbl, 6 * VECWIDTH + tbloffset), times(v57, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v107 = timesminusplus(reverse(v97), load(tbl, 14 * VECWIDTH + tbloffset), times(v97, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v427 = reverse(minus(v187, v107)); + real2 v433 = plus(v107, v187); + real2 v432 = plus(v67, v147); + real2 v428 = minus(v147, v67); + real2 v453 = plus(v432, v433); + real2 v447 = reverse(minus(v433, v432)); + real2 v408 = minus(v127, v47); + real2 v412 = plus(v47, v127); + real2 v452 = plus(v412, v413); + real2 v448 = minus(v413, v412); + scatter(out, 1, 32, plus(v452, v453)); + real2 v466 = minus(v452, v453); + scatter(out, 17, 32, timesminusplus(v466, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v466), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v451 = minusplus(uminus(v447), v448); + scatter(out, 25, 32, timesminusplus(reverse(v451), load(tbl, 80 * VECWIDTH + tbloffset), times(v451, load(tbl, 81 * VECWIDTH + tbloffset)))); + real2 v449 = minusplus(v447, v448); + scatter(out, 9, 32, timesminusplus(reverse(v449), load(tbl, 78 *
VECWIDTH + tbloffset), times(v449, load(tbl, 79 * VECWIDTH + tbloffset)))); + real2 v429 = minusplus(v427, v428); + real2 v431 = minusplus(uminus(v427), v428); + real2 v445 = timesminusplus(reverse(v431), load(tbl, 76 * VECWIDTH + tbloffset), times(v431, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v409 = minusplus(v407, v408); + real2 v411 = minusplus(uminus(v407), v408); + real2 v425 = timesminusplus(reverse(v411), load(tbl, 72 * VECWIDTH + tbloffset), times(v411, load(tbl, 73 * VECWIDTH + tbloffset))); + scatter(out, 13, 32, plus(v425, v445)); + real2 v478 = minus(v425, v445); + scatter(out, 29, 32, timesminusplus(v478, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v478), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v439 = timesminusplus(reverse(v429), load(tbl, 74 * VECWIDTH + tbloffset), times(v429, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v419 = timesminusplus(reverse(v409), load(tbl, 70 * VECWIDTH + tbloffset), times(v409, load(tbl, 71 * VECWIDTH + tbloffset))); + scatter(out, 5, 32, plus(v419, v439)); + real2 v472 = minus(v419, v439); + scatter(out, 21, 32, timesminusplus(v472, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v472), load(tbl, 1 * VECWIDTH + tbloffset)))); + } +} +#endif + +#if MAXBUTWIDTH%TYPEID% >= 6 +ALIGNED(8192) void tbut64f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + // Pres : 30254 + real2 v37 = load(in, 35 << shift); + real2 v5 = load(in, 3 << shift); + real2 v132 = plus(v5, v37); + real2 v128 = minus(v37, v5); + real2 v21 = load(in, 19 << shift); + real2 v53 = load(in, 51 << shift); + real2 v133 = plus(v21, v53); + real2 v127 = reverse(minus(v21, v53)); + real2 v131 =
minusplus(uminus(v127), v128); + real2 v129 = minusplus(v127, v128); + real2 v139 = timesminusplus(reverse(v129), load(tbl, 14 * VECWIDTH + tbloffset), times(v129, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v145 = timesminusplus(reverse(v131), load(tbl, 16 * VECWIDTH + tbloffset), times(v131, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v448 = minus(v133, v132); + real2 v452 = plus(v132, v133); + real2 v45 = load(in, 43 << shift); + real2 v13 = load(in, 11 << shift); + real2 v292 = plus(v13, v45); + real2 v288 = minus(v45, v13); + real2 v29 = load(in, 27 << shift); + real2 v61 = load(in, 59 << shift); + real2 v293 = plus(v29, v61); + real2 v287 = reverse(minus(v29, v61)); + real2 v291 = minusplus(uminus(v287), v288); + real2 v289 = minusplus(v287, v288); + real2 v299 = timesminusplus(reverse(v289), load(tbl, 46 * VECWIDTH + tbloffset), times(v289, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v453 = plus(v292, v293); + real2 v447 = reverse(minus(v292, v293)); + real2 v608 = minus(v453, v452); + real2 v612 = plus(v452, v453); + real2 v980 = plus(v139, v299); + real2 v976 = minus(v299, v139); + real2 v449 = minusplus(v447, v448); + real2 v451 = minusplus(uminus(v447), v448); + real2 v465 = timesminusplus(reverse(v451), load(tbl, 80 * VECWIDTH + tbloffset), times(v451, load(tbl, 81 * VECWIDTH + tbloffset))); + real2 v305 = timesminusplus(reverse(v291), load(tbl, 48 * VECWIDTH + tbloffset), times(v291, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v1186 = minus(v305, v145); + real2 v1190 = plus(v145, v305); + real2 v459 = timesminusplus(reverse(v449), load(tbl, 78 * VECWIDTH + tbloffset), times(v449, load(tbl, 79 * VECWIDTH + tbloffset))); + real2 v25 = load(in, 23 << shift); + real2 v57 = load(in, 55 << shift); + real2 v207 = reverse(minus(v25, v57)); + real2 v213 = plus(v25, v57); + real2 v9 = load(in, 7 << shift); + real2 v41 = load(in, 39 << shift); + real2 v212 = plus(v9, v41); + real2 v208 = minus(v41, v9); + real2 v528 = minus(v213, v212); + real2 
v532 = plus(v212, v213); + real2 v209 = minusplus(v207, v208); + real2 v211 = minusplus(uminus(v207), v208); + real2 v225 = timesminusplus(reverse(v211), load(tbl, 32 * VECWIDTH + tbloffset), times(v211, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v219 = timesminusplus(reverse(v209), load(tbl, 30 * VECWIDTH + tbloffset), times(v209, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v17 = load(in, 15 << shift); + real2 v49 = load(in, 47 << shift); + real2 v368 = minus(v49, v17); + real2 v372 = plus(v17, v49); + real2 v33 = load(in, 31 << shift); + real2 v65 = load(in, 63 << shift); + real2 v367 = reverse(minus(v33, v65)); + real2 v373 = plus(v33, v65); + real2 v369 = minusplus(v367, v368); + real2 v371 = minusplus(uminus(v367), v368); + real2 v533 = plus(v372, v373); + real2 v527 = reverse(minus(v372, v373)); + real2 v607 = reverse(minus(v532, v533)); + real2 v613 = plus(v532, v533); + real2 v529 = minusplus(v527, v528); + real2 v531 = minusplus(uminus(v527), v528); + real2 v545 = timesminusplus(reverse(v531), load(tbl, 96 * VECWIDTH + tbloffset), times(v531, load(tbl, 97 * VECWIDTH + tbloffset))); + real2 v653 = plus(v612, v613); + real2 v647 = reverse(minus(v612, v613)); + real2 v609 = minusplus(v607, v608); + real2 v611 = minusplus(uminus(v607), v608); + real2 v863 = plus(v465, v545); + real2 v857 = reverse(minus(v465, v545)); + real2 v539 = timesminusplus(reverse(v529), load(tbl, 94 * VECWIDTH + tbloffset), times(v529, load(tbl, 95 * VECWIDTH + tbloffset))); + real2 v385 = timesminusplus(reverse(v371), load(tbl, 64 * VECWIDTH + tbloffset), times(v371, load(tbl, 65 * VECWIDTH + tbloffset))); + real2 v619 = timesminusplus(reverse(v609), load(tbl, 110 * VECWIDTH + tbloffset), times(v609, load(tbl, 111 * VECWIDTH + tbloffset))); + real2 v1191 = plus(v225, v385); + real2 v1185 = reverse(minus(v225, v385)); + real2 v779 = reverse(minus(v459, v539)); + real2 v785 = plus(v459, v539); + real2 v625 = timesminusplus(reverse(v611), load(tbl, 112 * VECWIDTH + tbloffset), 
times(v611, load(tbl, 113 * VECWIDTH + tbloffset))); + real2 v379 = timesminusplus(reverse(v369), load(tbl, 62 * VECWIDTH + tbloffset), times(v369, load(tbl, 63 * VECWIDTH + tbloffset))); + real2 v975 = reverse(minus(v219, v379)); + real2 v981 = plus(v219, v379); + real2 v977 = minusplus(v975, v976); + real2 v979 = minusplus(uminus(v975), v976); + real2 v987 = timesminusplus(reverse(v977), load(tbl, 170 * VECWIDTH + tbloffset), times(v977, load(tbl, 171 * VECWIDTH + tbloffset))); + real2 v993 = timesminusplus(reverse(v979), load(tbl, 172 * VECWIDTH + tbloffset), times(v979, load(tbl, 173 * VECWIDTH + tbloffset))); + real2 v1015 = reverse(minus(v980, v981)); + real2 v1021 = plus(v980, v981); + real2 v11 = load(in, 9 << shift); + real2 v43 = load(in, 41 << shift); + real2 v248 = minus(v43, v11); + real2 v252 = plus(v11, v43); + real2 v59 = load(in, 57 << shift); + real2 v27 = load(in, 25 << shift); + real2 v253 = plus(v27, v59); + real2 v247 = reverse(minus(v27, v59)); + real2 v413 = plus(v252, v253); + real2 v407 = reverse(minus(v252, v253)); + real2 v249 = minusplus(v247, v248); + real2 v251 = minusplus(uminus(v247), v248); + real2 v259 = timesminusplus(reverse(v249), load(tbl, 38 * VECWIDTH + tbloffset), times(v249, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v35 = load(in, 33 << shift); + real2 v3 = load(in, 1 << shift); + real2 v92 = plus(v3, v35); + real2 v88 = minus(v35, v3); + real2 v51 = load(in, 49 << shift); + real2 v19 = load(in, 17 << shift); + real2 v87 = reverse(minus(v19, v51)); + real2 v93 = plus(v19, v51); + real2 v412 = plus(v92, v93); + real2 v408 = minus(v93, v92); + real2 v411 = minusplus(uminus(v407), v408); + real2 v409 = minusplus(v407, v408); + real2 v91 = minusplus(uminus(v87), v88); + real2 v89 = minusplus(v87, v88); + real2 v99 = timesminusplus(reverse(v89), load(tbl, 6 * VECWIDTH + tbloffset), times(v89, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v425 = timesminusplus(reverse(v411), load(tbl, 72 * VECWIDTH + tbloffset), 
times(v411, load(tbl, 73 * VECWIDTH + tbloffset))); + real2 v568 = minus(v413, v412); + real2 v572 = plus(v412, v413); + real2 v940 = plus(v99, v259); + real2 v936 = minus(v259, v99); + real2 v419 = timesminusplus(reverse(v409), load(tbl, 70 * VECWIDTH + tbloffset), times(v409, load(tbl, 71 * VECWIDTH + tbloffset))); + real2 v47 = load(in, 45 << shift); + real2 v15 = load(in, 13 << shift); + real2 v332 = plus(v15, v47); + real2 v328 = minus(v47, v15); + real2 v63 = load(in, 61 << shift); + real2 v31 = load(in, 29 << shift); + real2 v327 = reverse(minus(v31, v63)); + real2 v333 = plus(v31, v63); + real2 v329 = minusplus(v327, v328); + real2 v331 = minusplus(uminus(v327), v328); + real2 v339 = timesminusplus(reverse(v329), load(tbl, 54 * VECWIDTH + tbloffset), times(v329, load(tbl, 55 * VECWIDTH + tbloffset))); + real2 v487 = reverse(minus(v332, v333)); + real2 v493 = plus(v332, v333); + real2 v7 = load(in, 5 << shift); + real2 v39 = load(in, 37 << shift); + real2 v172 = plus(v7, v39); + real2 v168 = minus(v39, v7); + real2 v55 = load(in, 53 << shift); + real2 v23 = load(in, 21 << shift); + real2 v173 = plus(v23, v55); + real2 v167 = reverse(minus(v23, v55)); + real2 v488 = minus(v173, v172); + real2 v492 = plus(v172, v173); + real2 v491 = minusplus(uminus(v487), v488); + real2 v489 = minusplus(v487, v488); + real2 v499 = timesminusplus(reverse(v489), load(tbl, 86 * VECWIDTH + tbloffset), times(v489, load(tbl, 87 * VECWIDTH + tbloffset))); + real2 v505 = timesminusplus(reverse(v491), load(tbl, 88 * VECWIDTH + tbloffset), times(v491, load(tbl, 89 * VECWIDTH + tbloffset))); + real2 v567 = reverse(minus(v492, v493)); + real2 v573 = plus(v492, v493); + real2 v571 = minusplus(uminus(v567), v568); + real2 v569 = minusplus(v567, v568); + real2 v579 = timesminusplus(reverse(v569), load(tbl, 102 * VECWIDTH + tbloffset), times(v569, load(tbl, 103 * VECWIDTH + tbloffset))); + real2 v585 = timesminusplus(reverse(v571), load(tbl, 104 * VECWIDTH + tbloffset), times(v571, load(tbl, 
105 * VECWIDTH + tbloffset))); + real2 v739 = plus(v585, v625); + real2 v733 = reverse(minus(v585, v625)); + real2 v707 = reverse(minus(v579, v619)); + real2 v713 = plus(v579, v619); + real2 v648 = minus(v573, v572); + real2 v652 = plus(v572, v573); + real2 v673 = plus(v652, v653); + real2 v667 = reverse(minus(v652, v653)); + real2 v651 = minusplus(uminus(v647), v648); + real2 v649 = minusplus(v647, v648); + real2 v659 = timesminusplus(reverse(v649), load(tbl, 118 * VECWIDTH + tbloffset), times(v649, load(tbl, 119 * VECWIDTH + tbloffset))); + real2 v665 = timesminusplus(reverse(v651), load(tbl, 120 * VECWIDTH + tbloffset), times(v651, load(tbl, 121 * VECWIDTH + tbloffset))); + real2 v780 = minus(v499, v419); + real2 v784 = plus(v419, v499); + real2 v781 = minusplus(v779, v780); + real2 v783 = minusplus(uminus(v779), v780); + real2 v805 = plus(v784, v785); + real2 v799 = reverse(minus(v784, v785)); + real2 v862 = plus(v425, v505); + real2 v858 = minus(v505, v425); + real2 v859 = minusplus(v857, v858); + real2 v861 = minusplus(uminus(v857), v858); + real2 v875 = timesminusplus(reverse(v861), load(tbl, 152 * VECWIDTH + tbloffset), times(v861, load(tbl, 153 * VECWIDTH + tbloffset))); + real2 v791 = timesminusplus(reverse(v781), load(tbl, 138 * VECWIDTH + tbloffset), times(v781, load(tbl, 139 * VECWIDTH + tbloffset))); + real2 v797 = timesminusplus(reverse(v783), load(tbl, 140 * VECWIDTH + tbloffset), times(v783, load(tbl, 141 * VECWIDTH + tbloffset))); + real2 v883 = plus(v862, v863); + real2 v877 = reverse(minus(v862, v863)); + real2 v869 = timesminusplus(reverse(v859), load(tbl, 150 * VECWIDTH + tbloffset), times(v859, load(tbl, 151 * VECWIDTH + tbloffset))); + real2 v36 = load(in, 34 << shift); + real2 v4 = load(in, 2 << shift); + real2 v108 = minus(v36, v4); + real2 v112 = plus(v4, v36); + real2 v52 = load(in, 50 << shift); + real2 v20 = load(in, 18 << shift); + real2 v113 = plus(v20, v52); + real2 v107 = reverse(minus(v20, v52)); + real2 v428 = minus(v113, v112); 
+ real2 v432 = plus(v112, v113); + real2 v12 = load(in, 10 << shift); + real2 v44 = load(in, 42 << shift); + real2 v268 = minus(v44, v12); + real2 v272 = plus(v12, v44); + real2 v28 = load(in, 26 << shift); + real2 v60 = load(in, 58 << shift); + real2 v267 = reverse(minus(v28, v60)); + real2 v273 = plus(v28, v60); + real2 v427 = reverse(minus(v272, v273)); + real2 v433 = plus(v272, v273); + real2 v431 = minusplus(uminus(v427), v428); + real2 v429 = minusplus(v427, v428); + real2 v439 = timesminusplus(reverse(v429), load(tbl, 74 * VECWIDTH + tbloffset), times(v429, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v588 = minus(v433, v432); + real2 v592 = plus(v432, v433); + real2 v40 = load(in, 38 << shift); + real2 v8 = load(in, 6 << shift); + real2 v188 = minus(v40, v8); + real2 v192 = plus(v8, v40); + real2 v24 = load(in, 22 << shift); + real2 v56 = load(in, 54 << shift); + real2 v187 = reverse(minus(v24, v56)); + real2 v193 = plus(v24, v56); + real2 v512 = plus(v192, v193); + real2 v508 = minus(v193, v192); + real2 v32 = load(in, 30 << shift); + real2 v64 = load(in, 62 << shift); + real2 v347 = reverse(minus(v32, v64)); + real2 v353 = plus(v32, v64); + real2 v48 = load(in, 46 << shift); + real2 v16 = load(in, 14 << shift); + real2 v348 = minus(v48, v16); + real2 v352 = plus(v16, v48); + real2 v513 = plus(v352, v353); + real2 v507 = reverse(minus(v352, v353)); + real2 v587 = reverse(minus(v512, v513)); + real2 v593 = plus(v512, v513); + real2 v633 = plus(v592, v593); + real2 v627 = reverse(minus(v592, v593)); + real2 v591 = minusplus(uminus(v587), v588); + real2 v589 = minusplus(v587, v588); + real2 v605 = timesminusplus(reverse(v591), load(tbl, 108 * VECWIDTH + tbloffset), times(v591, load(tbl, 109 * VECWIDTH + tbloffset))); + real2 v599 = timesminusplus(reverse(v589), load(tbl, 106 * VECWIDTH + tbloffset), times(v589, load(tbl, 107 * VECWIDTH + tbloffset))); + real2 v46 = load(in, 44 << shift); + real2 v14 = load(in, 12 << shift); + real2 v312 = plus(v14, v46); + 
real2 v308 = minus(v46, v14); + real2 v62 = load(in, 60 << shift); + real2 v30 = load(in, 28 << shift); + real2 v313 = plus(v30, v62); + real2 v307 = reverse(minus(v30, v62)); + real2 v467 = reverse(minus(v312, v313)); + real2 v473 = plus(v312, v313); + real2 v22 = load(in, 20 << shift); + real2 v54 = load(in, 52 << shift); + real2 v147 = reverse(minus(v22, v54)); + real2 v153 = plus(v22, v54); + real2 v6 = load(in, 4 << shift); + real2 v38 = load(in, 36 << shift); + real2 v148 = minus(v38, v6); + real2 v152 = plus(v6, v38); + real2 v472 = plus(v152, v153); + real2 v468 = minus(v153, v152); + real2 v547 = reverse(minus(v472, v473)); + real2 v553 = plus(v472, v473); + real2 v10 = load(in, 8 << shift); + real2 v42 = load(in, 40 << shift); + real2 v232 = plus(v10, v42); + real2 v228 = minus(v42, v10); + real2 v58 = load(in, 56 << shift); + real2 v26 = load(in, 24 << shift); + real2 v233 = plus(v26, v58); + real2 v227 = reverse(minus(v26, v58)); + real2 v393 = plus(v232, v233); + real2 v387 = reverse(minus(v232, v233)); + real2 v2 = load(in, 0 << shift); + real2 v34 = load(in, 32 << shift); + real2 v72 = plus(v2, v34); + real2 v68 = minus(v34, v2); + real2 v18 = load(in, 16 << shift); + real2 v50 = load(in, 48 << shift); + real2 v73 = plus(v18, v50); + real2 v67 = reverse(minus(v18, v50)); + real2 v388 = minus(v73, v72); + real2 v392 = plus(v72, v73); + real2 v548 = minus(v393, v392); + real2 v552 = plus(v392, v393); + real2 v628 = minus(v553, v552); + real2 v632 = plus(v552, v553); + real2 v672 = plus(v632, v633); + real2 v668 = minus(v633, v632); + scatter(out, 0, 64, plus(v672, v673)); + real2 v686 = minus(v672, v673); + scatter(out, 32, 64, timesminusplus(v686, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v686), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v669 = minusplus(v667, v668); + real2 v671 = minusplus(uminus(v667), v668); + scatter(out, 48, 64, timesminusplus(reverse(v671), load(tbl, 124 * VECWIDTH + tbloffset), times(v671, load(tbl, 125 * 
VECWIDTH + tbloffset)))); + scatter(out, 16, 64, timesminusplus(reverse(v669), load(tbl, 122 * VECWIDTH + tbloffset), times(v669, load(tbl, 123 * VECWIDTH + tbloffset)))); + real2 v631 = minusplus(uminus(v627), v628); + real2 v629 = minusplus(v627, v628); + real2 v639 = timesminusplus(reverse(v629), load(tbl, 114 * VECWIDTH + tbloffset), times(v629, load(tbl, 115 * VECWIDTH + tbloffset))); + scatter(out, 8, 64, plus(v639, v659)); + real2 v694 = minus(v639, v659); + scatter(out, 40, 64, timesminusplus(v694, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v694), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v645 = timesminusplus(reverse(v631), load(tbl, 116 * VECWIDTH + tbloffset), times(v631, load(tbl, 117 * VECWIDTH + tbloffset))); + scatter(out, 24, 64, plus(v645, v665)); + real2 v700 = minus(v645, v665); + scatter(out, 56, 64, timesminusplus(v700, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v700), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v549 = minusplus(v547, v548); + real2 v551 = minusplus(uminus(v547), v548); + real2 v559 = timesminusplus(reverse(v549), load(tbl, 98 * VECWIDTH + tbloffset), times(v549, load(tbl, 99 * VECWIDTH + tbloffset))); + real2 v708 = minus(v599, v559); + real2 v712 = plus(v559, v599); + scatter(out, 4, 64, plus(v712, v713)); + real2 v726 = minus(v712, v713); + scatter(out, 36, 64, timesminusplus(v726, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v726), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v711 = minusplus(uminus(v707), v708); + real2 v709 = minusplus(v707, v708); + scatter(out, 20, 64, timesminusplus(reverse(v709), load(tbl, 126 * VECWIDTH + tbloffset), times(v709, load(tbl, 127 * VECWIDTH + tbloffset)))); + scatter(out, 52, 64, timesminusplus(reverse(v711), load(tbl, 128 * VECWIDTH + tbloffset), times(v711, load(tbl, 129 * VECWIDTH + tbloffset)))); + real2 v565 = timesminusplus(reverse(v551), load(tbl, 100 * VECWIDTH + tbloffset), times(v551, load(tbl, 101 * VECWIDTH + tbloffset))); + real2 v738 = 
plus(v565, v605); + real2 v734 = minus(v605, v565); + scatter(out, 12, 64, plus(v738, v739)); + real2 v752 = minus(v738, v739); + scatter(out, 44, 64, timesminusplus(v752, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v752), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v737 = minusplus(uminus(v733), v734); + scatter(out, 60, 64, timesminusplus(reverse(v737), load(tbl, 132 * VECWIDTH + tbloffset), times(v737, load(tbl, 133 * VECWIDTH + tbloffset)))); + real2 v735 = minusplus(v733, v734); + scatter(out, 28, 64, timesminusplus(reverse(v735), load(tbl, 130 * VECWIDTH + tbloffset), times(v735, load(tbl, 131 * VECWIDTH + tbloffset)))); + real2 v471 = minusplus(uminus(v467), v468); + real2 v469 = minusplus(v467, v468); + real2 v479 = timesminusplus(reverse(v469), load(tbl, 82 * VECWIDTH + tbloffset), times(v469, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v511 = minusplus(uminus(v507), v508); + real2 v509 = minusplus(v507, v508); + real2 v519 = timesminusplus(reverse(v509), load(tbl, 90 * VECWIDTH + tbloffset), times(v509, load(tbl, 91 * VECWIDTH + tbloffset))); + real2 v765 = plus(v439, v519); + real2 v759 = reverse(minus(v439, v519)); + real2 v389 = minusplus(v387, v388); + real2 v391 = minusplus(uminus(v387), v388); + real2 v399 = timesminusplus(reverse(v389), load(tbl, 66 * VECWIDTH + tbloffset), times(v389, load(tbl, 67 * VECWIDTH + tbloffset))); + real2 v764 = plus(v399, v479); + real2 v760 = minus(v479, v399); + real2 v804 = plus(v764, v765); + real2 v800 = minus(v765, v764); + scatter(out, 2, 64, plus(v804, v805)); + real2 v818 = minus(v804, v805); + scatter(out, 34, 64, timesminusplus(v818, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v818), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v803 = minusplus(uminus(v799), v800); + scatter(out, 50, 64, timesminusplus(reverse(v803), load(tbl, 144 * VECWIDTH + tbloffset), times(v803, load(tbl, 145 * VECWIDTH + tbloffset)))); + real2 v801 = minusplus(v799, v800); + scatter(out, 18, 64, 
timesminusplus(reverse(v801), load(tbl, 142 * VECWIDTH + tbloffset), times(v801, load(tbl, 143 * VECWIDTH + tbloffset)))); + real2 v763 = minusplus(uminus(v759), v760); + real2 v761 = minusplus(v759, v760); + real2 v777 = timesminusplus(reverse(v763), load(tbl, 136 * VECWIDTH + tbloffset), times(v763, load(tbl, 137 * VECWIDTH + tbloffset))); + scatter(out, 26, 64, plus(v777, v797)); + real2 v830 = minus(v777, v797); + scatter(out, 58, 64, timesminusplus(v830, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v830), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v771 = timesminusplus(reverse(v761), load(tbl, 134 * VECWIDTH + tbloffset), times(v761, load(tbl, 135 * VECWIDTH + tbloffset))); + scatter(out, 10, 64, plus(v771, v791)); + real2 v824 = minus(v771, v791); + scatter(out, 42, 64, timesminusplus(v824, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v824), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v445 = timesminusplus(reverse(v431), load(tbl, 76 * VECWIDTH + tbloffset), times(v431, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v525 = timesminusplus(reverse(v511), load(tbl, 92 * VECWIDTH + tbloffset), times(v511, load(tbl, 93 * VECWIDTH + tbloffset))); + real2 v837 = reverse(minus(v445, v525)); + real2 v843 = plus(v445, v525); + real2 v485 = timesminusplus(reverse(v471), load(tbl, 84 * VECWIDTH + tbloffset), times(v471, load(tbl, 85 * VECWIDTH + tbloffset))); + real2 v405 = timesminusplus(reverse(v391), load(tbl, 68 * VECWIDTH + tbloffset), times(v391, load(tbl, 69 * VECWIDTH + tbloffset))); + real2 v838 = minus(v485, v405); + real2 v842 = plus(v405, v485); + real2 v878 = minus(v843, v842); + real2 v882 = plus(v842, v843); + scatter(out, 6, 64, plus(v882, v883)); + real2 v896 = minus(v882, v883); + scatter(out, 38, 64, timesminusplus(v896, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v896), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v881 = minusplus(uminus(v877), v878); + scatter(out, 54, 64, timesminusplus(reverse(v881), load(tbl, 156 * 
VECWIDTH + tbloffset), times(v881, load(tbl, 157 * VECWIDTH + tbloffset)))); + real2 v879 = minusplus(v877, v878); + scatter(out, 22, 64, timesminusplus(reverse(v879), load(tbl, 154 * VECWIDTH + tbloffset), times(v879, load(tbl, 155 * VECWIDTH + tbloffset)))); + real2 v841 = minusplus(uminus(v837), v838); + real2 v839 = minusplus(v837, v838); + real2 v855 = timesminusplus(reverse(v841), load(tbl, 148 * VECWIDTH + tbloffset), times(v841, load(tbl, 149 * VECWIDTH + tbloffset))); + scatter(out, 30, 64, plus(v855, v875)); + real2 v908 = minus(v855, v875); + scatter(out, 62, 64, timesminusplus(v908, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v908), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v849 = timesminusplus(reverse(v839), load(tbl, 146 * VECWIDTH + tbloffset), times(v839, load(tbl, 147 * VECWIDTH + tbloffset))); + scatter(out, 14, 64, plus(v849, v869)); + real2 v902 = minus(v849, v869); + scatter(out, 46, 64, timesminusplus(v902, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v902), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v151 = minusplus(uminus(v147), v148); + real2 v149 = minusplus(v147, v148); + real2 v311 = minusplus(uminus(v307), v308); + real2 v309 = minusplus(v307, v308); + real2 v109 = minusplus(v107, v108); + real2 v111 = minusplus(uminus(v107), v108); + real2 v119 = timesminusplus(reverse(v109), load(tbl, 10 * VECWIDTH + tbloffset), times(v109, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v269 = minusplus(v267, v268); + real2 v271 = minusplus(uminus(v267), v268); + real2 v279 = timesminusplus(reverse(v269), load(tbl, 42 * VECWIDTH + tbloffset), times(v269, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v960 = plus(v119, v279); + real2 v956 = minus(v279, v119); + real2 v169 = minusplus(v167, v168); + real2 v171 = minusplus(uminus(v167), v168); + real2 v159 = timesminusplus(reverse(v149), load(tbl, 18 * VECWIDTH + tbloffset), times(v149, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v319 = timesminusplus(reverse(v309), 
load(tbl, 50 * VECWIDTH + tbloffset), times(v309, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v921 = plus(v159, v319); + real2 v915 = reverse(minus(v159, v319)); + real2 v351 = minusplus(uminus(v347), v348); + real2 v349 = minusplus(v347, v348); + real2 v359 = timesminusplus(reverse(v349), load(tbl, 58 * VECWIDTH + tbloffset), times(v349, load(tbl, 59 * VECWIDTH + tbloffset))); + real2 v191 = minusplus(uminus(v187), v188); + real2 v189 = minusplus(v187, v188); + real2 v199 = timesminusplus(reverse(v189), load(tbl, 26 * VECWIDTH + tbloffset), times(v189, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v961 = plus(v199, v359); + real2 v955 = reverse(minus(v199, v359)); + real2 v995 = reverse(minus(v960, v961)); + real2 v1001 = plus(v960, v961); + real2 v179 = timesminusplus(reverse(v169), load(tbl, 22 * VECWIDTH + tbloffset), times(v169, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v941 = plus(v179, v339); + real2 v935 = reverse(minus(v179, v339)); + real2 v1016 = minus(v941, v940); + real2 v1020 = plus(v940, v941); + real2 v71 = minusplus(uminus(v67), v68); + real2 v69 = minusplus(v67, v68); + real2 v79 = timesminusplus(reverse(v69), load(tbl, 2 * VECWIDTH + tbloffset), times(v69, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v1041 = plus(v1020, v1021); + real2 v1035 = reverse(minus(v1020, v1021)); + real2 v229 = minusplus(v227, v228); + real2 v231 = minusplus(uminus(v227), v228); + real2 v239 = timesminusplus(reverse(v229), load(tbl, 34 * VECWIDTH + tbloffset), times(v229, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v920 = plus(v79, v239); + real2 v916 = minus(v239, v79); + real2 v996 = minus(v921, v920); + real2 v1000 = plus(v920, v921); + real2 v1040 = plus(v1000, v1001); + real2 v1036 = minus(v1001, v1000); + scatter(out, 1, 64, plus(v1040, v1041)); + real2 v1054 = minus(v1040, v1041); + scatter(out, 33, 64, timesminusplus(v1054, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1054), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1037 = 
minusplus(v1035, v1036); + real2 v1039 = minusplus(uminus(v1035), v1036); + scatter(out, 49, 64, timesminusplus(reverse(v1039), load(tbl, 184 * VECWIDTH + tbloffset), times(v1039, load(tbl, 185 * VECWIDTH + tbloffset)))); + scatter(out, 17, 64, timesminusplus(reverse(v1037), load(tbl, 182 * VECWIDTH + tbloffset), times(v1037, load(tbl, 183 * VECWIDTH + tbloffset)))); + real2 v1017 = minusplus(v1015, v1016); + real2 v1019 = minusplus(uminus(v1015), v1016); + real2 v1033 = timesminusplus(reverse(v1019), load(tbl, 180 * VECWIDTH + tbloffset), times(v1019, load(tbl, 181 * VECWIDTH + tbloffset))); + real2 v997 = minusplus(v995, v996); + real2 v999 = minusplus(uminus(v995), v996); + real2 v1013 = timesminusplus(reverse(v999), load(tbl, 176 * VECWIDTH + tbloffset), times(v999, load(tbl, 177 * VECWIDTH + tbloffset))); + scatter(out, 25, 64, plus(v1013, v1033)); + real2 v1066 = minus(v1013, v1033); + scatter(out, 57, 64, timesminusplus(v1066, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1066), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1027 = timesminusplus(reverse(v1017), load(tbl, 178 * VECWIDTH + tbloffset), times(v1017, load(tbl, 179 * VECWIDTH + tbloffset))); + real2 v1007 = timesminusplus(reverse(v997), load(tbl, 174 * VECWIDTH + tbloffset), times(v997, load(tbl, 175 * VECWIDTH + tbloffset))); + scatter(out, 9, 64, plus(v1007, v1027)); + real2 v1060 = minus(v1007, v1027); + scatter(out, 41, 64, timesminusplus(v1060, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1060), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v937 = minusplus(v935, v936); + real2 v939 = minusplus(uminus(v935), v936); + real2 v959 = minusplus(uminus(v955), v956); + real2 v957 = minusplus(v955, v956); + real2 v967 = timesminusplus(reverse(v957), load(tbl, 166 * VECWIDTH + tbloffset), times(v957, load(tbl, 167 * VECWIDTH + tbloffset))); + real2 v947 = timesminusplus(reverse(v937), load(tbl, 162 * VECWIDTH + tbloffset), times(v937, load(tbl, 163 * VECWIDTH + tbloffset))); + real2 
v919 = minusplus(uminus(v915), v916); + real2 v917 = minusplus(v915, v916); + real2 v1079 = plus(v947, v987); + real2 v1073 = reverse(minus(v947, v987)); + real2 v927 = timesminusplus(reverse(v917), load(tbl, 158 * VECWIDTH + tbloffset), times(v917, load(tbl, 159 * VECWIDTH + tbloffset))); + real2 v1074 = minus(v967, v927); + real2 v1078 = plus(v927, v967); + scatter(out, 5, 64, plus(v1078, v1079)); + real2 v1092 = minus(v1078, v1079); + scatter(out, 37, 64, timesminusplus(v1092, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1092), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1075 = minusplus(v1073, v1074); + scatter(out, 21, 64, timesminusplus(reverse(v1075), load(tbl, 186 * VECWIDTH + tbloffset), times(v1075, load(tbl, 187 * VECWIDTH + tbloffset)))); + real2 v1077 = minusplus(uminus(v1073), v1074); + scatter(out, 53, 64, timesminusplus(reverse(v1077), load(tbl, 188 * VECWIDTH + tbloffset), times(v1077, load(tbl, 189 * VECWIDTH + tbloffset)))); + real2 v953 = timesminusplus(reverse(v939), load(tbl, 164 * VECWIDTH + tbloffset), times(v939, load(tbl, 165 * VECWIDTH + tbloffset))); + real2 v1099 = reverse(minus(v953, v993)); + real2 v1105 = plus(v953, v993); + real2 v973 = timesminusplus(reverse(v959), load(tbl, 168 * VECWIDTH + tbloffset), times(v959, load(tbl, 169 * VECWIDTH + tbloffset))); + real2 v933 = timesminusplus(reverse(v919), load(tbl, 160 * VECWIDTH + tbloffset), times(v919, load(tbl, 161 * VECWIDTH + tbloffset))); + real2 v1104 = plus(v933, v973); + real2 v1100 = minus(v973, v933); + scatter(out, 13, 64, plus(v1104, v1105)); + real2 v1118 = minus(v1104, v1105); + scatter(out, 45, 64, timesminusplus(v1118, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1118), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1101 = minusplus(v1099, v1100); + scatter(out, 29, 64, timesminusplus(reverse(v1101), load(tbl, 190 * VECWIDTH + tbloffset), times(v1101, load(tbl, 191 * VECWIDTH + tbloffset)))); + real2 v1103 = minusplus(uminus(v1099), v1100); + 
scatter(out, 61, 64, timesminusplus(reverse(v1103), load(tbl, 192 * VECWIDTH + tbloffset), times(v1103, load(tbl, 193 * VECWIDTH + tbloffset)))); + real2 v345 = timesminusplus(reverse(v331), load(tbl, 56 * VECWIDTH + tbloffset), times(v331, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v325 = timesminusplus(reverse(v311), load(tbl, 52 * VECWIDTH + tbloffset), times(v311, load(tbl, 53 * VECWIDTH + tbloffset))); + real2 v265 = timesminusplus(reverse(v251), load(tbl, 40 * VECWIDTH + tbloffset), times(v251, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v185 = timesminusplus(reverse(v171), load(tbl, 24 * VECWIDTH + tbloffset), times(v171, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v165 = timesminusplus(reverse(v151), load(tbl, 20 * VECWIDTH + tbloffset), times(v151, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v1131 = plus(v165, v325); + real2 v1125 = reverse(minus(v165, v325)); + real2 v1151 = plus(v185, v345); + real2 v1145 = reverse(minus(v185, v345)); + real2 v105 = timesminusplus(reverse(v91), load(tbl, 8 * VECWIDTH + tbloffset), times(v91, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v1150 = plus(v105, v265); + real2 v1146 = minus(v265, v105); + real2 v1226 = minus(v1151, v1150); + real2 v1230 = plus(v1150, v1151); + real2 v1231 = plus(v1190, v1191); + real2 v1225 = reverse(minus(v1190, v1191)); + real2 v1245 = reverse(minus(v1230, v1231)); + real2 v1251 = plus(v1230, v1231); + real2 v365 = timesminusplus(reverse(v351), load(tbl, 60 * VECWIDTH + tbloffset), times(v351, load(tbl, 61 * VECWIDTH + tbloffset))); + real2 v285 = timesminusplus(reverse(v271), load(tbl, 44 * VECWIDTH + tbloffset), times(v271, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v205 = timesminusplus(reverse(v191), load(tbl, 28 * VECWIDTH + tbloffset), times(v191, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v1171 = plus(v205, v365); + real2 v1165 = reverse(minus(v205, v365)); + real2 v125 = timesminusplus(reverse(v111), load(tbl, 12 * VECWIDTH + tbloffset), times(v111, 
load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v85 = timesminusplus(reverse(v71), load(tbl, 4 * VECWIDTH + tbloffset), times(v71, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v245 = timesminusplus(reverse(v231), load(tbl, 36 * VECWIDTH + tbloffset), times(v231, load(tbl, 37 * VECWIDTH + tbloffset))); + real2 v1126 = minus(v245, v85); + real2 v1130 = plus(v85, v245); + real2 v1210 = plus(v1130, v1131); + real2 v1206 = minus(v1131, v1130); + real2 v1166 = minus(v285, v125); + real2 v1170 = plus(v125, v285); + real2 v1211 = plus(v1170, v1171); + real2 v1205 = reverse(minus(v1170, v1171)); + real2 v1246 = minus(v1211, v1210); + real2 v1250 = plus(v1210, v1211); + scatter(out, 3, 64, plus(v1250, v1251)); + real2 v1264 = minus(v1250, v1251); + scatter(out, 35, 64, timesminusplus(v1264, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1264), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1247 = minusplus(v1245, v1246); + real2 v1249 = minusplus(uminus(v1245), v1246); + scatter(out, 19, 64, timesminusplus(reverse(v1247), load(tbl, 218 * VECWIDTH + tbloffset), times(v1247, load(tbl, 219 * VECWIDTH + tbloffset)))); + scatter(out, 51, 64, timesminusplus(reverse(v1249), load(tbl, 220 * VECWIDTH + tbloffset), times(v1249, load(tbl, 221 * VECWIDTH + tbloffset)))); + real2 v1229 = minusplus(uminus(v1225), v1226); + real2 v1227 = minusplus(v1225, v1226); + real2 v1207 = minusplus(v1205, v1206); + real2 v1209 = minusplus(uminus(v1205), v1206); + real2 v1237 = timesminusplus(reverse(v1227), load(tbl, 214 * VECWIDTH + tbloffset), times(v1227, load(tbl, 215 * VECWIDTH + tbloffset))); + real2 v1217 = timesminusplus(reverse(v1207), load(tbl, 210 * VECWIDTH + tbloffset), times(v1207, load(tbl, 211 * VECWIDTH + tbloffset))); + scatter(out, 11, 64, plus(v1217, v1237)); + real2 v1270 = minus(v1217, v1237); + scatter(out, 43, 64, timesminusplus(v1270, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1270), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1223 = 
timesminusplus(reverse(v1209), load(tbl, 212 * VECWIDTH + tbloffset), times(v1209, load(tbl, 213 * VECWIDTH + tbloffset))); + real2 v1243 = timesminusplus(reverse(v1229), load(tbl, 216 * VECWIDTH + tbloffset), times(v1229, load(tbl, 217 * VECWIDTH + tbloffset))); + scatter(out, 27, 64, plus(v1223, v1243)); + real2 v1276 = minus(v1223, v1243); + scatter(out, 59, 64, timesminusplus(v1276, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1276), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1189 = minusplus(uminus(v1185), v1186); + real2 v1187 = minusplus(v1185, v1186); + real2 v1129 = minusplus(uminus(v1125), v1126); + real2 v1127 = minusplus(v1125, v1126); + real2 v1147 = minusplus(v1145, v1146); + real2 v1149 = minusplus(uminus(v1145), v1146); + real2 v1167 = minusplus(v1165, v1166); + real2 v1169 = minusplus(uminus(v1165), v1166); + real2 v1143 = timesminusplus(reverse(v1129), load(tbl, 196 * VECWIDTH + tbloffset), times(v1129, load(tbl, 197 * VECWIDTH + tbloffset))); + real2 v1163 = timesminusplus(reverse(v1149), load(tbl, 200 * VECWIDTH + tbloffset), times(v1149, load(tbl, 201 * VECWIDTH + tbloffset))); + real2 v1203 = timesminusplus(reverse(v1189), load(tbl, 208 * VECWIDTH + tbloffset), times(v1189, load(tbl, 209 * VECWIDTH + tbloffset))); + real2 v1315 = plus(v1163, v1203); + real2 v1309 = reverse(minus(v1163, v1203)); + real2 v1183 = timesminusplus(reverse(v1169), load(tbl, 204 * VECWIDTH + tbloffset), times(v1169, load(tbl, 205 * VECWIDTH + tbloffset))); + real2 v1314 = plus(v1143, v1183); + real2 v1310 = minus(v1183, v1143); + scatter(out, 15, 64, plus(v1314, v1315)); + real2 v1328 = minus(v1314, v1315); + scatter(out, 47, 64, timesminusplus(v1328, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1328), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1311 = minusplus(v1309, v1310); + scatter(out, 31, 64, timesminusplus(reverse(v1311), load(tbl, 226 * VECWIDTH + tbloffset), times(v1311, load(tbl, 227 * VECWIDTH + tbloffset)))); + real2 v1313 = 
minusplus(uminus(v1309), v1310); + scatter(out, 63, 64, timesminusplus(reverse(v1313), load(tbl, 228 * VECWIDTH + tbloffset), times(v1313, load(tbl, 229 * VECWIDTH + tbloffset)))); + real2 v1177 = timesminusplus(reverse(v1167), load(tbl, 202 * VECWIDTH + tbloffset), times(v1167, load(tbl, 203 * VECWIDTH + tbloffset))); + real2 v1137 = timesminusplus(reverse(v1127), load(tbl, 194 * VECWIDTH + tbloffset), times(v1127, load(tbl, 195 * VECWIDTH + tbloffset))); + real2 v1197 = timesminusplus(reverse(v1187), load(tbl, 206 * VECWIDTH + tbloffset), times(v1187, load(tbl, 207 * VECWIDTH + tbloffset))); + real2 v1157 = timesminusplus(reverse(v1147), load(tbl, 198 * VECWIDTH + tbloffset), times(v1147, load(tbl, 199 * VECWIDTH + tbloffset))); + real2 v1283 = reverse(minus(v1157, v1197)); + real2 v1289 = plus(v1157, v1197); + real2 v1288 = plus(v1137, v1177); + real2 v1284 = minus(v1177, v1137); + scatter(out, 7, 64, plus(v1288, v1289)); + real2 v1302 = minus(v1288, v1289); + scatter(out, 39, 64, timesminusplus(v1302, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1302), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1285 = minusplus(v1283, v1284); + real2 v1287 = minusplus(uminus(v1283), v1284); + scatter(out, 55, 64, timesminusplus(reverse(v1287), load(tbl, 224 * VECWIDTH + tbloffset), times(v1287, load(tbl, 225 * VECWIDTH + tbloffset)))); + scatter(out, 23, 64, timesminusplus(reverse(v1285), load(tbl, 222 * VECWIDTH + tbloffset), times(v1285, load(tbl, 223 * VECWIDTH + tbloffset)))); + // Pres : 17339 + } +} + +ALIGNED(8192) void tbut64b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + // Pres : 30254 + real2 v37 = load(in, 35 << shift); 
+ real2 v5 = load(in, 3 << shift); + real2 v132 = plus(v5, v37); + real2 v128 = minus(v37, v5); + real2 v21 = load(in, 19 << shift); + real2 v53 = load(in, 51 << shift); + real2 v133 = plus(v21, v53); + real2 v127 = reverse(minus(v53, v21)); + real2 v131 = minusplus(uminus(v127), v128); + real2 v129 = minusplus(v127, v128); + real2 v139 = timesminusplus(reverse(v129), load(tbl, 14 * VECWIDTH + tbloffset), times(v129, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v145 = timesminusplus(reverse(v131), load(tbl, 16 * VECWIDTH + tbloffset), times(v131, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v448 = minus(v133, v132); + real2 v452 = plus(v132, v133); + real2 v45 = load(in, 43 << shift); + real2 v13 = load(in, 11 << shift); + real2 v292 = plus(v13, v45); + real2 v288 = minus(v45, v13); + real2 v29 = load(in, 27 << shift); + real2 v61 = load(in, 59 << shift); + real2 v293 = plus(v29, v61); + real2 v287 = reverse(minus(v61, v29)); + real2 v291 = minusplus(uminus(v287), v288); + real2 v289 = minusplus(v287, v288); + real2 v299 = timesminusplus(reverse(v289), load(tbl, 46 * VECWIDTH + tbloffset), times(v289, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v453 = plus(v292, v293); + real2 v447 = reverse(minus(v293, v292)); + real2 v608 = minus(v453, v452); + real2 v612 = plus(v452, v453); + real2 v980 = plus(v139, v299); + real2 v976 = minus(v299, v139); + real2 v449 = minusplus(v447, v448); + real2 v451 = minusplus(uminus(v447), v448); + real2 v465 = timesminusplus(reverse(v451), load(tbl, 80 * VECWIDTH + tbloffset), times(v451, load(tbl, 81 * VECWIDTH + tbloffset))); + real2 v305 = timesminusplus(reverse(v291), load(tbl, 48 * VECWIDTH + tbloffset), times(v291, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v1186 = minus(v305, v145); + real2 v1190 = plus(v145, v305); + real2 v459 = timesminusplus(reverse(v449), load(tbl, 78 * VECWIDTH + tbloffset), times(v449, load(tbl, 79 * VECWIDTH + tbloffset))); + real2 v25 = load(in, 23 << shift); + real2 v57 = load(in, 55 << 
shift); + real2 v207 = reverse(minus(v57, v25)); + real2 v213 = plus(v25, v57); + real2 v9 = load(in, 7 << shift); + real2 v41 = load(in, 39 << shift); + real2 v212 = plus(v9, v41); + real2 v208 = minus(v41, v9); + real2 v528 = minus(v213, v212); + real2 v532 = plus(v212, v213); + real2 v209 = minusplus(v207, v208); + real2 v211 = minusplus(uminus(v207), v208); + real2 v225 = timesminusplus(reverse(v211), load(tbl, 32 * VECWIDTH + tbloffset), times(v211, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v219 = timesminusplus(reverse(v209), load(tbl, 30 * VECWIDTH + tbloffset), times(v209, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v17 = load(in, 15 << shift); + real2 v49 = load(in, 47 << shift); + real2 v368 = minus(v49, v17); + real2 v372 = plus(v17, v49); + real2 v33 = load(in, 31 << shift); + real2 v65 = load(in, 63 << shift); + real2 v367 = reverse(minus(v65, v33)); + real2 v373 = plus(v33, v65); + real2 v369 = minusplus(v367, v368); + real2 v371 = minusplus(uminus(v367), v368); + real2 v533 = plus(v372, v373); + real2 v527 = reverse(minus(v373, v372)); + real2 v607 = reverse(minus(v533, v532)); + real2 v613 = plus(v532, v533); + real2 v529 = minusplus(v527, v528); + real2 v531 = minusplus(uminus(v527), v528); + real2 v545 = timesminusplus(reverse(v531), load(tbl, 96 * VECWIDTH + tbloffset), times(v531, load(tbl, 97 * VECWIDTH + tbloffset))); + real2 v653 = plus(v612, v613); + real2 v647 = reverse(minus(v613, v612)); + real2 v609 = minusplus(v607, v608); + real2 v611 = minusplus(uminus(v607), v608); + real2 v863 = plus(v465, v545); + real2 v857 = reverse(minus(v545, v465)); + real2 v539 = timesminusplus(reverse(v529), load(tbl, 94 * VECWIDTH + tbloffset), times(v529, load(tbl, 95 * VECWIDTH + tbloffset))); + real2 v385 = timesminusplus(reverse(v371), load(tbl, 64 * VECWIDTH + tbloffset), times(v371, load(tbl, 65 * VECWIDTH + tbloffset))); + real2 v619 = timesminusplus(reverse(v609), load(tbl, 110 * VECWIDTH + tbloffset), times(v609, load(tbl, 111 * VECWIDTH 
+ tbloffset))); + real2 v1191 = plus(v225, v385); + real2 v1185 = reverse(minus(v385, v225)); + real2 v779 = reverse(minus(v539, v459)); + real2 v785 = plus(v459, v539); + real2 v625 = timesminusplus(reverse(v611), load(tbl, 112 * VECWIDTH + tbloffset), times(v611, load(tbl, 113 * VECWIDTH + tbloffset))); + real2 v379 = timesminusplus(reverse(v369), load(tbl, 62 * VECWIDTH + tbloffset), times(v369, load(tbl, 63 * VECWIDTH + tbloffset))); + real2 v975 = reverse(minus(v379, v219)); + real2 v981 = plus(v219, v379); + real2 v977 = minusplus(v975, v976); + real2 v979 = minusplus(uminus(v975), v976); + real2 v987 = timesminusplus(reverse(v977), load(tbl, 170 * VECWIDTH + tbloffset), times(v977, load(tbl, 171 * VECWIDTH + tbloffset))); + real2 v993 = timesminusplus(reverse(v979), load(tbl, 172 * VECWIDTH + tbloffset), times(v979, load(tbl, 173 * VECWIDTH + tbloffset))); + real2 v1015 = reverse(minus(v981, v980)); + real2 v1021 = plus(v980, v981); + real2 v11 = load(in, 9 << shift); + real2 v43 = load(in, 41 << shift); + real2 v248 = minus(v43, v11); + real2 v252 = plus(v11, v43); + real2 v59 = load(in, 57 << shift); + real2 v27 = load(in, 25 << shift); + real2 v253 = plus(v27, v59); + real2 v247 = reverse(minus(v59, v27)); + real2 v413 = plus(v252, v253); + real2 v407 = reverse(minus(v253, v252)); + real2 v249 = minusplus(v247, v248); + real2 v251 = minusplus(uminus(v247), v248); + real2 v259 = timesminusplus(reverse(v249), load(tbl, 38 * VECWIDTH + tbloffset), times(v249, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v35 = load(in, 33 << shift); + real2 v3 = load(in, 1 << shift); + real2 v92 = plus(v3, v35); + real2 v88 = minus(v35, v3); + real2 v51 = load(in, 49 << shift); + real2 v19 = load(in, 17 << shift); + real2 v87 = reverse(minus(v51, v19)); + real2 v93 = plus(v19, v51); + real2 v412 = plus(v92, v93); + real2 v408 = minus(v93, v92); + real2 v411 = minusplus(uminus(v407), v408); + real2 v409 = minusplus(v407, v408); + real2 v91 = minusplus(uminus(v87), v88); + 
real2 v89 = minusplus(v87, v88); + real2 v99 = timesminusplus(reverse(v89), load(tbl, 6 * VECWIDTH + tbloffset), times(v89, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v425 = timesminusplus(reverse(v411), load(tbl, 72 * VECWIDTH + tbloffset), times(v411, load(tbl, 73 * VECWIDTH + tbloffset))); + real2 v568 = minus(v413, v412); + real2 v572 = plus(v412, v413); + real2 v940 = plus(v99, v259); + real2 v936 = minus(v259, v99); + real2 v419 = timesminusplus(reverse(v409), load(tbl, 70 * VECWIDTH + tbloffset), times(v409, load(tbl, 71 * VECWIDTH + tbloffset))); + real2 v47 = load(in, 45 << shift); + real2 v15 = load(in, 13 << shift); + real2 v332 = plus(v15, v47); + real2 v328 = minus(v47, v15); + real2 v63 = load(in, 61 << shift); + real2 v31 = load(in, 29 << shift); + real2 v327 = reverse(minus(v63, v31)); + real2 v333 = plus(v31, v63); + real2 v329 = minusplus(v327, v328); + real2 v331 = minusplus(uminus(v327), v328); + real2 v339 = timesminusplus(reverse(v329), load(tbl, 54 * VECWIDTH + tbloffset), times(v329, load(tbl, 55 * VECWIDTH + tbloffset))); + real2 v487 = reverse(minus(v333, v332)); + real2 v493 = plus(v332, v333); + real2 v7 = load(in, 5 << shift); + real2 v39 = load(in, 37 << shift); + real2 v172 = plus(v7, v39); + real2 v168 = minus(v39, v7); + real2 v55 = load(in, 53 << shift); + real2 v23 = load(in, 21 << shift); + real2 v173 = plus(v23, v55); + real2 v167 = reverse(minus(v55, v23)); + real2 v488 = minus(v173, v172); + real2 v492 = plus(v172, v173); + real2 v491 = minusplus(uminus(v487), v488); + real2 v489 = minusplus(v487, v488); + real2 v499 = timesminusplus(reverse(v489), load(tbl, 86 * VECWIDTH + tbloffset), times(v489, load(tbl, 87 * VECWIDTH + tbloffset))); + real2 v505 = timesminusplus(reverse(v491), load(tbl, 88 * VECWIDTH + tbloffset), times(v491, load(tbl, 89 * VECWIDTH + tbloffset))); + real2 v567 = reverse(minus(v493, v492)); + real2 v573 = plus(v492, v493); + real2 v571 = minusplus(uminus(v567), v568); + real2 v569 = minusplus(v567, 
v568); + real2 v579 = timesminusplus(reverse(v569), load(tbl, 102 * VECWIDTH + tbloffset), times(v569, load(tbl, 103 * VECWIDTH + tbloffset))); + real2 v585 = timesminusplus(reverse(v571), load(tbl, 104 * VECWIDTH + tbloffset), times(v571, load(tbl, 105 * VECWIDTH + tbloffset))); + real2 v739 = plus(v585, v625); + real2 v733 = reverse(minus(v625, v585)); + real2 v707 = reverse(minus(v619, v579)); + real2 v713 = plus(v579, v619); + real2 v648 = minus(v573, v572); + real2 v652 = plus(v572, v573); + real2 v673 = plus(v652, v653); + real2 v667 = reverse(minus(v653, v652)); + real2 v651 = minusplus(uminus(v647), v648); + real2 v649 = minusplus(v647, v648); + real2 v659 = timesminusplus(reverse(v649), load(tbl, 118 * VECWIDTH + tbloffset), times(v649, load(tbl, 119 * VECWIDTH + tbloffset))); + real2 v665 = timesminusplus(reverse(v651), load(tbl, 120 * VECWIDTH + tbloffset), times(v651, load(tbl, 121 * VECWIDTH + tbloffset))); + real2 v780 = minus(v499, v419); + real2 v784 = plus(v419, v499); + real2 v781 = minusplus(v779, v780); + real2 v783 = minusplus(uminus(v779), v780); + real2 v805 = plus(v784, v785); + real2 v799 = reverse(minus(v785, v784)); + real2 v862 = plus(v425, v505); + real2 v858 = minus(v505, v425); + real2 v859 = minusplus(v857, v858); + real2 v861 = minusplus(uminus(v857), v858); + real2 v875 = timesminusplus(reverse(v861), load(tbl, 152 * VECWIDTH + tbloffset), times(v861, load(tbl, 153 * VECWIDTH + tbloffset))); + real2 v791 = timesminusplus(reverse(v781), load(tbl, 138 * VECWIDTH + tbloffset), times(v781, load(tbl, 139 * VECWIDTH + tbloffset))); + real2 v797 = timesminusplus(reverse(v783), load(tbl, 140 * VECWIDTH + tbloffset), times(v783, load(tbl, 141 * VECWIDTH + tbloffset))); + real2 v883 = plus(v862, v863); + real2 v877 = reverse(minus(v863, v862)); + real2 v869 = timesminusplus(reverse(v859), load(tbl, 150 * VECWIDTH + tbloffset), times(v859, load(tbl, 151 * VECWIDTH + tbloffset))); + real2 v36 = load(in, 34 << shift); + real2 v4 = load(in, 2 << 
shift); + real2 v108 = minus(v36, v4); + real2 v112 = plus(v4, v36); + real2 v52 = load(in, 50 << shift); + real2 v20 = load(in, 18 << shift); + real2 v113 = plus(v20, v52); + real2 v107 = reverse(minus(v52, v20)); + real2 v428 = minus(v113, v112); + real2 v432 = plus(v112, v113); + real2 v12 = load(in, 10 << shift); + real2 v44 = load(in, 42 << shift); + real2 v268 = minus(v44, v12); + real2 v272 = plus(v12, v44); + real2 v28 = load(in, 26 << shift); + real2 v60 = load(in, 58 << shift); + real2 v267 = reverse(minus(v60, v28)); + real2 v273 = plus(v28, v60); + real2 v427 = reverse(minus(v273, v272)); + real2 v433 = plus(v272, v273); + real2 v431 = minusplus(uminus(v427), v428); + real2 v429 = minusplus(v427, v428); + real2 v439 = timesminusplus(reverse(v429), load(tbl, 74 * VECWIDTH + tbloffset), times(v429, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v588 = minus(v433, v432); + real2 v592 = plus(v432, v433); + real2 v40 = load(in, 38 << shift); + real2 v8 = load(in, 6 << shift); + real2 v188 = minus(v40, v8); + real2 v192 = plus(v8, v40); + real2 v24 = load(in, 22 << shift); + real2 v56 = load(in, 54 << shift); + real2 v187 = reverse(minus(v56, v24)); + real2 v193 = plus(v24, v56); + real2 v512 = plus(v192, v193); + real2 v508 = minus(v193, v192); + real2 v32 = load(in, 30 << shift); + real2 v64 = load(in, 62 << shift); + real2 v347 = reverse(minus(v64, v32)); + real2 v353 = plus(v32, v64); + real2 v48 = load(in, 46 << shift); + real2 v16 = load(in, 14 << shift); + real2 v348 = minus(v48, v16); + real2 v352 = plus(v16, v48); + real2 v513 = plus(v352, v353); + real2 v507 = reverse(minus(v353, v352)); + real2 v587 = reverse(minus(v513, v512)); + real2 v593 = plus(v512, v513); + real2 v633 = plus(v592, v593); + real2 v627 = reverse(minus(v593, v592)); + real2 v591 = minusplus(uminus(v587), v588); + real2 v589 = minusplus(v587, v588); + real2 v605 = timesminusplus(reverse(v591), load(tbl, 108 * VECWIDTH + tbloffset), times(v591, load(tbl, 109 * VECWIDTH + 
tbloffset))); + real2 v599 = timesminusplus(reverse(v589), load(tbl, 106 * VECWIDTH + tbloffset), times(v589, load(tbl, 107 * VECWIDTH + tbloffset))); + real2 v46 = load(in, 44 << shift); + real2 v14 = load(in, 12 << shift); + real2 v312 = plus(v14, v46); + real2 v308 = minus(v46, v14); + real2 v62 = load(in, 60 << shift); + real2 v30 = load(in, 28 << shift); + real2 v313 = plus(v30, v62); + real2 v307 = reverse(minus(v62, v30)); + real2 v467 = reverse(minus(v313, v312)); + real2 v473 = plus(v312, v313); + real2 v22 = load(in, 20 << shift); + real2 v54 = load(in, 52 << shift); + real2 v147 = reverse(minus(v54, v22)); + real2 v153 = plus(v22, v54); + real2 v6 = load(in, 4 << shift); + real2 v38 = load(in, 36 << shift); + real2 v148 = minus(v38, v6); + real2 v152 = plus(v6, v38); + real2 v472 = plus(v152, v153); + real2 v468 = minus(v153, v152); + real2 v547 = reverse(minus(v473, v472)); + real2 v553 = plus(v472, v473); + real2 v10 = load(in, 8 << shift); + real2 v42 = load(in, 40 << shift); + real2 v232 = plus(v10, v42); + real2 v228 = minus(v42, v10); + real2 v58 = load(in, 56 << shift); + real2 v26 = load(in, 24 << shift); + real2 v233 = plus(v26, v58); + real2 v227 = reverse(minus(v58, v26)); + real2 v393 = plus(v232, v233); + real2 v387 = reverse(minus(v233, v232)); + real2 v2 = load(in, 0 << shift); + real2 v34 = load(in, 32 << shift); + real2 v72 = plus(v2, v34); + real2 v68 = minus(v34, v2); + real2 v18 = load(in, 16 << shift); + real2 v50 = load(in, 48 << shift); + real2 v73 = plus(v18, v50); + real2 v67 = reverse(minus(v50, v18)); + real2 v388 = minus(v73, v72); + real2 v392 = plus(v72, v73); + real2 v548 = minus(v393, v392); + real2 v552 = plus(v392, v393); + real2 v628 = minus(v553, v552); + real2 v632 = plus(v552, v553); + real2 v672 = plus(v632, v633); + real2 v668 = minus(v633, v632); + scatter(out, 0, 64, plus(v672, v673)); + real2 v686 = minus(v672, v673); + scatter(out, 32, 64, timesminusplus(v686, load(tbl, 0 * VECWIDTH + tbloffset), 
times(reverse(v686), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v669 = minusplus(v667, v668); + real2 v671 = minusplus(uminus(v667), v668); + scatter(out, 48, 64, timesminusplus(reverse(v671), load(tbl, 124 * VECWIDTH + tbloffset), times(v671, load(tbl, 125 * VECWIDTH + tbloffset)))); + scatter(out, 16, 64, timesminusplus(reverse(v669), load(tbl, 122 * VECWIDTH + tbloffset), times(v669, load(tbl, 123 * VECWIDTH + tbloffset)))); + real2 v631 = minusplus(uminus(v627), v628); + real2 v629 = minusplus(v627, v628); + real2 v639 = timesminusplus(reverse(v629), load(tbl, 114 * VECWIDTH + tbloffset), times(v629, load(tbl, 115 * VECWIDTH + tbloffset))); + scatter(out, 8, 64, plus(v639, v659)); + real2 v694 = minus(v639, v659); + scatter(out, 40, 64, timesminusplus(v694, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v694), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v645 = timesminusplus(reverse(v631), load(tbl, 116 * VECWIDTH + tbloffset), times(v631, load(tbl, 117 * VECWIDTH + tbloffset))); + scatter(out, 24, 64, plus(v645, v665)); + real2 v700 = minus(v645, v665); + scatter(out, 56, 64, timesminusplus(v700, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v700), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v549 = minusplus(v547, v548); + real2 v551 = minusplus(uminus(v547), v548); + real2 v559 = timesminusplus(reverse(v549), load(tbl, 98 * VECWIDTH + tbloffset), times(v549, load(tbl, 99 * VECWIDTH + tbloffset))); + real2 v708 = minus(v599, v559); + real2 v712 = plus(v559, v599); + scatter(out, 4, 64, plus(v712, v713)); + real2 v726 = minus(v712, v713); + scatter(out, 36, 64, timesminusplus(v726, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v726), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v711 = minusplus(uminus(v707), v708); + real2 v709 = minusplus(v707, v708); + scatter(out, 20, 64, timesminusplus(reverse(v709), load(tbl, 126 * VECWIDTH + tbloffset), times(v709, load(tbl, 127 * VECWIDTH + tbloffset)))); + scatter(out, 52, 64, 
timesminusplus(reverse(v711), load(tbl, 128 * VECWIDTH + tbloffset), times(v711, load(tbl, 129 * VECWIDTH + tbloffset)))); + real2 v565 = timesminusplus(reverse(v551), load(tbl, 100 * VECWIDTH + tbloffset), times(v551, load(tbl, 101 * VECWIDTH + tbloffset))); + real2 v738 = plus(v565, v605); + real2 v734 = minus(v605, v565); + scatter(out, 12, 64, plus(v738, v739)); + real2 v752 = minus(v738, v739); + scatter(out, 44, 64, timesminusplus(v752, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v752), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v737 = minusplus(uminus(v733), v734); + scatter(out, 60, 64, timesminusplus(reverse(v737), load(tbl, 132 * VECWIDTH + tbloffset), times(v737, load(tbl, 133 * VECWIDTH + tbloffset)))); + real2 v735 = minusplus(v733, v734); + scatter(out, 28, 64, timesminusplus(reverse(v735), load(tbl, 130 * VECWIDTH + tbloffset), times(v735, load(tbl, 131 * VECWIDTH + tbloffset)))); + real2 v471 = minusplus(uminus(v467), v468); + real2 v469 = minusplus(v467, v468); + real2 v479 = timesminusplus(reverse(v469), load(tbl, 82 * VECWIDTH + tbloffset), times(v469, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v511 = minusplus(uminus(v507), v508); + real2 v509 = minusplus(v507, v508); + real2 v519 = timesminusplus(reverse(v509), load(tbl, 90 * VECWIDTH + tbloffset), times(v509, load(tbl, 91 * VECWIDTH + tbloffset))); + real2 v765 = plus(v439, v519); + real2 v759 = reverse(minus(v519, v439)); + real2 v389 = minusplus(v387, v388); + real2 v391 = minusplus(uminus(v387), v388); + real2 v399 = timesminusplus(reverse(v389), load(tbl, 66 * VECWIDTH + tbloffset), times(v389, load(tbl, 67 * VECWIDTH + tbloffset))); + real2 v764 = plus(v399, v479); + real2 v760 = minus(v479, v399); + real2 v804 = plus(v764, v765); + real2 v800 = minus(v765, v764); + scatter(out, 2, 64, plus(v804, v805)); + real2 v818 = minus(v804, v805); + scatter(out, 34, 64, timesminusplus(v818, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v818), load(tbl, 1 * VECWIDTH + 
tbloffset)))); + real2 v803 = minusplus(uminus(v799), v800); + scatter(out, 50, 64, timesminusplus(reverse(v803), load(tbl, 144 * VECWIDTH + tbloffset), times(v803, load(tbl, 145 * VECWIDTH + tbloffset)))); + real2 v801 = minusplus(v799, v800); + scatter(out, 18, 64, timesminusplus(reverse(v801), load(tbl, 142 * VECWIDTH + tbloffset), times(v801, load(tbl, 143 * VECWIDTH + tbloffset)))); + real2 v763 = minusplus(uminus(v759), v760); + real2 v761 = minusplus(v759, v760); + real2 v777 = timesminusplus(reverse(v763), load(tbl, 136 * VECWIDTH + tbloffset), times(v763, load(tbl, 137 * VECWIDTH + tbloffset))); + scatter(out, 26, 64, plus(v777, v797)); + real2 v830 = minus(v777, v797); + scatter(out, 58, 64, timesminusplus(v830, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v830), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v771 = timesminusplus(reverse(v761), load(tbl, 134 * VECWIDTH + tbloffset), times(v761, load(tbl, 135 * VECWIDTH + tbloffset))); + scatter(out, 10, 64, plus(v771, v791)); + real2 v824 = minus(v771, v791); + scatter(out, 42, 64, timesminusplus(v824, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v824), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v445 = timesminusplus(reverse(v431), load(tbl, 76 * VECWIDTH + tbloffset), times(v431, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v525 = timesminusplus(reverse(v511), load(tbl, 92 * VECWIDTH + tbloffset), times(v511, load(tbl, 93 * VECWIDTH + tbloffset))); + real2 v837 = reverse(minus(v525, v445)); + real2 v843 = plus(v445, v525); + real2 v485 = timesminusplus(reverse(v471), load(tbl, 84 * VECWIDTH + tbloffset), times(v471, load(tbl, 85 * VECWIDTH + tbloffset))); + real2 v405 = timesminusplus(reverse(v391), load(tbl, 68 * VECWIDTH + tbloffset), times(v391, load(tbl, 69 * VECWIDTH + tbloffset))); + real2 v838 = minus(v485, v405); + real2 v842 = plus(v405, v485); + real2 v878 = minus(v843, v842); + real2 v882 = plus(v842, v843); + scatter(out, 6, 64, plus(v882, v883)); + real2 v896 = 
minus(v882, v883); + scatter(out, 38, 64, timesminusplus(v896, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v896), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v881 = minusplus(uminus(v877), v878); + scatter(out, 54, 64, timesminusplus(reverse(v881), load(tbl, 156 * VECWIDTH + tbloffset), times(v881, load(tbl, 157 * VECWIDTH + tbloffset)))); + real2 v879 = minusplus(v877, v878); + scatter(out, 22, 64, timesminusplus(reverse(v879), load(tbl, 154 * VECWIDTH + tbloffset), times(v879, load(tbl, 155 * VECWIDTH + tbloffset)))); + real2 v841 = minusplus(uminus(v837), v838); + real2 v839 = minusplus(v837, v838); + real2 v855 = timesminusplus(reverse(v841), load(tbl, 148 * VECWIDTH + tbloffset), times(v841, load(tbl, 149 * VECWIDTH + tbloffset))); + scatter(out, 30, 64, plus(v855, v875)); + real2 v908 = minus(v855, v875); + scatter(out, 62, 64, timesminusplus(v908, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v908), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v849 = timesminusplus(reverse(v839), load(tbl, 146 * VECWIDTH + tbloffset), times(v839, load(tbl, 147 * VECWIDTH + tbloffset))); + scatter(out, 14, 64, plus(v849, v869)); + real2 v902 = minus(v849, v869); + scatter(out, 46, 64, timesminusplus(v902, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v902), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v151 = minusplus(uminus(v147), v148); + real2 v149 = minusplus(v147, v148); + real2 v311 = minusplus(uminus(v307), v308); + real2 v309 = minusplus(v307, v308); + real2 v109 = minusplus(v107, v108); + real2 v111 = minusplus(uminus(v107), v108); + real2 v119 = timesminusplus(reverse(v109), load(tbl, 10 * VECWIDTH + tbloffset), times(v109, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v269 = minusplus(v267, v268); + real2 v271 = minusplus(uminus(v267), v268); + real2 v279 = timesminusplus(reverse(v269), load(tbl, 42 * VECWIDTH + tbloffset), times(v269, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v960 = plus(v119, v279); + real2 v956 = minus(v279, 
v119); + real2 v169 = minusplus(v167, v168); + real2 v171 = minusplus(uminus(v167), v168); + real2 v159 = timesminusplus(reverse(v149), load(tbl, 18 * VECWIDTH + tbloffset), times(v149, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v319 = timesminusplus(reverse(v309), load(tbl, 50 * VECWIDTH + tbloffset), times(v309, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v921 = plus(v159, v319); + real2 v915 = reverse(minus(v319, v159)); + real2 v351 = minusplus(uminus(v347), v348); + real2 v349 = minusplus(v347, v348); + real2 v359 = timesminusplus(reverse(v349), load(tbl, 58 * VECWIDTH + tbloffset), times(v349, load(tbl, 59 * VECWIDTH + tbloffset))); + real2 v191 = minusplus(uminus(v187), v188); + real2 v189 = minusplus(v187, v188); + real2 v199 = timesminusplus(reverse(v189), load(tbl, 26 * VECWIDTH + tbloffset), times(v189, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v961 = plus(v199, v359); + real2 v955 = reverse(minus(v359, v199)); + real2 v995 = reverse(minus(v961, v960)); + real2 v1001 = plus(v960, v961); + real2 v179 = timesminusplus(reverse(v169), load(tbl, 22 * VECWIDTH + tbloffset), times(v169, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v941 = plus(v179, v339); + real2 v935 = reverse(minus(v339, v179)); + real2 v1016 = minus(v941, v940); + real2 v1020 = plus(v940, v941); + real2 v71 = minusplus(uminus(v67), v68); + real2 v69 = minusplus(v67, v68); + real2 v79 = timesminusplus(reverse(v69), load(tbl, 2 * VECWIDTH + tbloffset), times(v69, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v1041 = plus(v1020, v1021); + real2 v1035 = reverse(minus(v1021, v1020)); + real2 v229 = minusplus(v227, v228); + real2 v231 = minusplus(uminus(v227), v228); + real2 v239 = timesminusplus(reverse(v229), load(tbl, 34 * VECWIDTH + tbloffset), times(v229, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v920 = plus(v79, v239); + real2 v916 = minus(v239, v79); + real2 v996 = minus(v921, v920); + real2 v1000 = plus(v920, v921); + real2 v1040 = plus(v1000, v1001); + real2 v1036 
= minus(v1001, v1000); + scatter(out, 1, 64, plus(v1040, v1041)); + real2 v1054 = minus(v1040, v1041); + scatter(out, 33, 64, timesminusplus(v1054, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1054), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1037 = minusplus(v1035, v1036); + real2 v1039 = minusplus(uminus(v1035), v1036); + scatter(out, 49, 64, timesminusplus(reverse(v1039), load(tbl, 184 * VECWIDTH + tbloffset), times(v1039, load(tbl, 185 * VECWIDTH + tbloffset)))); + scatter(out, 17, 64, timesminusplus(reverse(v1037), load(tbl, 182 * VECWIDTH + tbloffset), times(v1037, load(tbl, 183 * VECWIDTH + tbloffset)))); + real2 v1017 = minusplus(v1015, v1016); + real2 v1019 = minusplus(uminus(v1015), v1016); + real2 v1033 = timesminusplus(reverse(v1019), load(tbl, 180 * VECWIDTH + tbloffset), times(v1019, load(tbl, 181 * VECWIDTH + tbloffset))); + real2 v997 = minusplus(v995, v996); + real2 v999 = minusplus(uminus(v995), v996); + real2 v1013 = timesminusplus(reverse(v999), load(tbl, 176 * VECWIDTH + tbloffset), times(v999, load(tbl, 177 * VECWIDTH + tbloffset))); + scatter(out, 25, 64, plus(v1013, v1033)); + real2 v1066 = minus(v1013, v1033); + scatter(out, 57, 64, timesminusplus(v1066, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1066), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1027 = timesminusplus(reverse(v1017), load(tbl, 178 * VECWIDTH + tbloffset), times(v1017, load(tbl, 179 * VECWIDTH + tbloffset))); + real2 v1007 = timesminusplus(reverse(v997), load(tbl, 174 * VECWIDTH + tbloffset), times(v997, load(tbl, 175 * VECWIDTH + tbloffset))); + scatter(out, 9, 64, plus(v1007, v1027)); + real2 v1060 = minus(v1007, v1027); + scatter(out, 41, 64, timesminusplus(v1060, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1060), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v937 = minusplus(v935, v936); + real2 v939 = minusplus(uminus(v935), v936); + real2 v959 = minusplus(uminus(v955), v956); + real2 v957 = minusplus(v955, v956); + real2 v967 = 
timesminusplus(reverse(v957), load(tbl, 166 * VECWIDTH + tbloffset), times(v957, load(tbl, 167 * VECWIDTH + tbloffset))); + real2 v947 = timesminusplus(reverse(v937), load(tbl, 162 * VECWIDTH + tbloffset), times(v937, load(tbl, 163 * VECWIDTH + tbloffset))); + real2 v919 = minusplus(uminus(v915), v916); + real2 v917 = minusplus(v915, v916); + real2 v1079 = plus(v947, v987); + real2 v1073 = reverse(minus(v987, v947)); + real2 v927 = timesminusplus(reverse(v917), load(tbl, 158 * VECWIDTH + tbloffset), times(v917, load(tbl, 159 * VECWIDTH + tbloffset))); + real2 v1074 = minus(v967, v927); + real2 v1078 = plus(v927, v967); + scatter(out, 5, 64, plus(v1078, v1079)); + real2 v1092 = minus(v1078, v1079); + scatter(out, 37, 64, timesminusplus(v1092, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1092), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1075 = minusplus(v1073, v1074); + scatter(out, 21, 64, timesminusplus(reverse(v1075), load(tbl, 186 * VECWIDTH + tbloffset), times(v1075, load(tbl, 187 * VECWIDTH + tbloffset)))); + real2 v1077 = minusplus(uminus(v1073), v1074); + scatter(out, 53, 64, timesminusplus(reverse(v1077), load(tbl, 188 * VECWIDTH + tbloffset), times(v1077, load(tbl, 189 * VECWIDTH + tbloffset)))); + real2 v953 = timesminusplus(reverse(v939), load(tbl, 164 * VECWIDTH + tbloffset), times(v939, load(tbl, 165 * VECWIDTH + tbloffset))); + real2 v1099 = reverse(minus(v993, v953)); + real2 v1105 = plus(v953, v993); + real2 v973 = timesminusplus(reverse(v959), load(tbl, 168 * VECWIDTH + tbloffset), times(v959, load(tbl, 169 * VECWIDTH + tbloffset))); + real2 v933 = timesminusplus(reverse(v919), load(tbl, 160 * VECWIDTH + tbloffset), times(v919, load(tbl, 161 * VECWIDTH + tbloffset))); + real2 v1104 = plus(v933, v973); + real2 v1100 = minus(v973, v933); + scatter(out, 13, 64, plus(v1104, v1105)); + real2 v1118 = minus(v1104, v1105); + scatter(out, 45, 64, timesminusplus(v1118, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1118), load(tbl, 1 * 
VECWIDTH + tbloffset)))); + real2 v1101 = minusplus(v1099, v1100); + scatter(out, 29, 64, timesminusplus(reverse(v1101), load(tbl, 190 * VECWIDTH + tbloffset), times(v1101, load(tbl, 191 * VECWIDTH + tbloffset)))); + real2 v1103 = minusplus(uminus(v1099), v1100); + scatter(out, 61, 64, timesminusplus(reverse(v1103), load(tbl, 192 * VECWIDTH + tbloffset), times(v1103, load(tbl, 193 * VECWIDTH + tbloffset)))); + real2 v345 = timesminusplus(reverse(v331), load(tbl, 56 * VECWIDTH + tbloffset), times(v331, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v325 = timesminusplus(reverse(v311), load(tbl, 52 * VECWIDTH + tbloffset), times(v311, load(tbl, 53 * VECWIDTH + tbloffset))); + real2 v265 = timesminusplus(reverse(v251), load(tbl, 40 * VECWIDTH + tbloffset), times(v251, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v185 = timesminusplus(reverse(v171), load(tbl, 24 * VECWIDTH + tbloffset), times(v171, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v165 = timesminusplus(reverse(v151), load(tbl, 20 * VECWIDTH + tbloffset), times(v151, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v1131 = plus(v165, v325); + real2 v1125 = reverse(minus(v325, v165)); + real2 v1151 = plus(v185, v345); + real2 v1145 = reverse(minus(v345, v185)); + real2 v105 = timesminusplus(reverse(v91), load(tbl, 8 * VECWIDTH + tbloffset), times(v91, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v1150 = plus(v105, v265); + real2 v1146 = minus(v265, v105); + real2 v1226 = minus(v1151, v1150); + real2 v1230 = plus(v1150, v1151); + real2 v1231 = plus(v1190, v1191); + real2 v1225 = reverse(minus(v1191, v1190)); + real2 v1245 = reverse(minus(v1231, v1230)); + real2 v1251 = plus(v1230, v1231); + real2 v365 = timesminusplus(reverse(v351), load(tbl, 60 * VECWIDTH + tbloffset), times(v351, load(tbl, 61 * VECWIDTH + tbloffset))); + real2 v285 = timesminusplus(reverse(v271), load(tbl, 44 * VECWIDTH + tbloffset), times(v271, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v205 = timesminusplus(reverse(v191), 
load(tbl, 28 * VECWIDTH + tbloffset), times(v191, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v1171 = plus(v205, v365); + real2 v1165 = reverse(minus(v365, v205)); + real2 v125 = timesminusplus(reverse(v111), load(tbl, 12 * VECWIDTH + tbloffset), times(v111, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v85 = timesminusplus(reverse(v71), load(tbl, 4 * VECWIDTH + tbloffset), times(v71, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v245 = timesminusplus(reverse(v231), load(tbl, 36 * VECWIDTH + tbloffset), times(v231, load(tbl, 37 * VECWIDTH + tbloffset))); + real2 v1126 = minus(v245, v85); + real2 v1130 = plus(v85, v245); + real2 v1210 = plus(v1130, v1131); + real2 v1206 = minus(v1131, v1130); + real2 v1166 = minus(v285, v125); + real2 v1170 = plus(v125, v285); + real2 v1211 = plus(v1170, v1171); + real2 v1205 = reverse(minus(v1171, v1170)); + real2 v1246 = minus(v1211, v1210); + real2 v1250 = plus(v1210, v1211); + scatter(out, 3, 64, plus(v1250, v1251)); + real2 v1264 = minus(v1250, v1251); + scatter(out, 35, 64, timesminusplus(v1264, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1264), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1247 = minusplus(v1245, v1246); + real2 v1249 = minusplus(uminus(v1245), v1246); + scatter(out, 19, 64, timesminusplus(reverse(v1247), load(tbl, 218 * VECWIDTH + tbloffset), times(v1247, load(tbl, 219 * VECWIDTH + tbloffset)))); + scatter(out, 51, 64, timesminusplus(reverse(v1249), load(tbl, 220 * VECWIDTH + tbloffset), times(v1249, load(tbl, 221 * VECWIDTH + tbloffset)))); + real2 v1229 = minusplus(uminus(v1225), v1226); + real2 v1227 = minusplus(v1225, v1226); + real2 v1207 = minusplus(v1205, v1206); + real2 v1209 = minusplus(uminus(v1205), v1206); + real2 v1237 = timesminusplus(reverse(v1227), load(tbl, 214 * VECWIDTH + tbloffset), times(v1227, load(tbl, 215 * VECWIDTH + tbloffset))); + real2 v1217 = timesminusplus(reverse(v1207), load(tbl, 210 * VECWIDTH + tbloffset), times(v1207, load(tbl, 211 * VECWIDTH + 
tbloffset))); + scatter(out, 11, 64, plus(v1217, v1237)); + real2 v1270 = minus(v1217, v1237); + scatter(out, 43, 64, timesminusplus(v1270, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1270), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1223 = timesminusplus(reverse(v1209), load(tbl, 212 * VECWIDTH + tbloffset), times(v1209, load(tbl, 213 * VECWIDTH + tbloffset))); + real2 v1243 = timesminusplus(reverse(v1229), load(tbl, 216 * VECWIDTH + tbloffset), times(v1229, load(tbl, 217 * VECWIDTH + tbloffset))); + scatter(out, 27, 64, plus(v1223, v1243)); + real2 v1276 = minus(v1223, v1243); + scatter(out, 59, 64, timesminusplus(v1276, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1276), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1189 = minusplus(uminus(v1185), v1186); + real2 v1187 = minusplus(v1185, v1186); + real2 v1129 = minusplus(uminus(v1125), v1126); + real2 v1127 = minusplus(v1125, v1126); + real2 v1147 = minusplus(v1145, v1146); + real2 v1149 = minusplus(uminus(v1145), v1146); + real2 v1167 = minusplus(v1165, v1166); + real2 v1169 = minusplus(uminus(v1165), v1166); + real2 v1143 = timesminusplus(reverse(v1129), load(tbl, 196 * VECWIDTH + tbloffset), times(v1129, load(tbl, 197 * VECWIDTH + tbloffset))); + real2 v1163 = timesminusplus(reverse(v1149), load(tbl, 200 * VECWIDTH + tbloffset), times(v1149, load(tbl, 201 * VECWIDTH + tbloffset))); + real2 v1203 = timesminusplus(reverse(v1189), load(tbl, 208 * VECWIDTH + tbloffset), times(v1189, load(tbl, 209 * VECWIDTH + tbloffset))); + real2 v1315 = plus(v1163, v1203); + real2 v1309 = reverse(minus(v1203, v1163)); + real2 v1183 = timesminusplus(reverse(v1169), load(tbl, 204 * VECWIDTH + tbloffset), times(v1169, load(tbl, 205 * VECWIDTH + tbloffset))); + real2 v1314 = plus(v1143, v1183); + real2 v1310 = minus(v1183, v1143); + scatter(out, 15, 64, plus(v1314, v1315)); + real2 v1328 = minus(v1314, v1315); + scatter(out, 47, 64, timesminusplus(v1328, load(tbl, 0 * VECWIDTH + tbloffset), 
times(reverse(v1328), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1311 = minusplus(v1309, v1310); + scatter(out, 31, 64, timesminusplus(reverse(v1311), load(tbl, 226 * VECWIDTH + tbloffset), times(v1311, load(tbl, 227 * VECWIDTH + tbloffset)))); + real2 v1313 = minusplus(uminus(v1309), v1310); + scatter(out, 63, 64, timesminusplus(reverse(v1313), load(tbl, 228 * VECWIDTH + tbloffset), times(v1313, load(tbl, 229 * VECWIDTH + tbloffset)))); + real2 v1177 = timesminusplus(reverse(v1167), load(tbl, 202 * VECWIDTH + tbloffset), times(v1167, load(tbl, 203 * VECWIDTH + tbloffset))); + real2 v1137 = timesminusplus(reverse(v1127), load(tbl, 194 * VECWIDTH + tbloffset), times(v1127, load(tbl, 195 * VECWIDTH + tbloffset))); + real2 v1197 = timesminusplus(reverse(v1187), load(tbl, 206 * VECWIDTH + tbloffset), times(v1187, load(tbl, 207 * VECWIDTH + tbloffset))); + real2 v1157 = timesminusplus(reverse(v1147), load(tbl, 198 * VECWIDTH + tbloffset), times(v1147, load(tbl, 199 * VECWIDTH + tbloffset))); + real2 v1283 = reverse(minus(v1197, v1157)); + real2 v1289 = plus(v1157, v1197); + real2 v1288 = plus(v1137, v1177); + real2 v1284 = minus(v1177, v1137); + scatter(out, 7, 64, plus(v1288, v1289)); + real2 v1302 = minus(v1288, v1289); + scatter(out, 39, 64, timesminusplus(v1302, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1302), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1285 = minusplus(v1283, v1284); + real2 v1287 = minusplus(uminus(v1283), v1284); + scatter(out, 55, 64, timesminusplus(reverse(v1287), load(tbl, 224 * VECWIDTH + tbloffset), times(v1287, load(tbl, 225 * VECWIDTH + tbloffset)))); + scatter(out, 23, 64, timesminusplus(reverse(v1285), load(tbl, 222 * VECWIDTH + tbloffset), times(v1285, load(tbl, 223 * VECWIDTH + tbloffset)))); + // Pres : 17339 + } +} +#endif + +// + +#if MAXBUTWIDTH%TYPEID% >= 7 +ALIGNED(8192) void tbut128f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + 
const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + // Pres : 148586 + real2 v56 = load(in, 54 << shift); + real2 v120 = load(in, 118 << shift); + real2 v571 = reverse(minus(v56, v120)); + real2 v577 = plus(v56, v120); + real2 v24 = load(in, 22 << shift); + real2 v88 = load(in, 86 << shift); + real2 v576 = plus(v24, v88); + real2 v572 = minus(v88, v24); + real2 v573 = minusplus(v571, v572); + real2 v575 = minusplus(uminus(v571), v572); + real2 v589 = timesminusplus(reverse(v575), load(tbl, 92 * VECWIDTH + tbloffset), times(v575, load(tbl, 93 * VECWIDTH + tbloffset))); + real2 v583 = timesminusplus(reverse(v573), load(tbl, 90 * VECWIDTH + tbloffset), times(v573, load(tbl, 91 * VECWIDTH + tbloffset))); + real2 v897 = plus(v576, v577); + real2 v891 = reverse(minus(v576, v577)); + real2 v8 = load(in, 6 << shift); + real2 v72 = load(in, 70 << shift); + real2 v252 = minus(v72, v8); + real2 v256 = plus(v8, v72); + real2 v104 = load(in, 102 << shift); + real2 v40 = load(in, 38 << shift); + real2 v251 = reverse(minus(v40, v104)); + real2 v257 = plus(v40, v104); + real2 v255 = minusplus(uminus(v251), v252); + real2 v253 = minusplus(v251, v252); + real2 v263 = timesminusplus(reverse(v253), load(tbl, 26 * VECWIDTH + tbloffset), times(v253, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v896 = plus(v256, v257); + real2 v892 = minus(v257, v256); + real2 v895 = minusplus(uminus(v891), v892); + real2 v893 = minusplus(v891, v892); + real2 v909 = timesminusplus(reverse(v895), load(tbl, 156 * VECWIDTH + tbloffset), times(v895, load(tbl, 157 * VECWIDTH + tbloffset))); + real2 v903 = timesminusplus(reverse(v893), load(tbl, 154 * VECWIDTH + tbloffset), times(v893, load(tbl, 155 * VECWIDTH + tbloffset))); + real2 v269 = timesminusplus(reverse(v255), load(tbl, 28 * VECWIDTH + 
tbloffset), times(v255, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v1216 = plus(v896, v897); + real2 v1212 = minus(v897, v896); + real2 v2160 = minus(v583, v263); + real2 v2164 = plus(v263, v583); + real2 v2686 = minus(v589, v269); + real2 v2690 = plus(v269, v589); + real2 v96 = load(in, 94 << shift); + real2 v32 = load(in, 30 << shift); + real2 v736 = plus(v32, v96); + real2 v732 = minus(v96, v32); + real2 v64 = load(in, 62 << shift); + real2 v128 = load(in, 126 << shift); + real2 v737 = plus(v64, v128); + real2 v731 = reverse(minus(v64, v128)); + real2 v1057 = plus(v736, v737); + real2 v1051 = reverse(minus(v736, v737)); + real2 v733 = minusplus(v731, v732); + real2 v735 = minusplus(uminus(v731), v732); + real2 v749 = timesminusplus(reverse(v735), load(tbl, 124 * VECWIDTH + tbloffset), times(v735, load(tbl, 125 * VECWIDTH + tbloffset))); + real2 v743 = timesminusplus(reverse(v733), load(tbl, 122 * VECWIDTH + tbloffset), times(v733, load(tbl, 123 * VECWIDTH + tbloffset))); + real2 v16 = load(in, 14 << shift); + real2 v80 = load(in, 78 << shift); + real2 v412 = minus(v80, v16); + real2 v416 = plus(v16, v80); + real2 v112 = load(in, 110 << shift); + real2 v48 = load(in, 46 << shift); + real2 v417 = plus(v48, v112); + real2 v411 = reverse(minus(v48, v112)); + real2 v1056 = plus(v416, v417); + real2 v1052 = minus(v417, v416); + real2 v1055 = minusplus(uminus(v1051), v1052); + real2 v1053 = minusplus(v1051, v1052); + real2 v1063 = timesminusplus(reverse(v1053), load(tbl, 186 * VECWIDTH + tbloffset), times(v1053, load(tbl, 187 * VECWIDTH + tbloffset))); + real2 v1665 = plus(v903, v1063); + real2 v1659 = reverse(minus(v903, v1063)); + real2 v1069 = timesminusplus(reverse(v1055), load(tbl, 188 * VECWIDTH + tbloffset), times(v1055, load(tbl, 189 * VECWIDTH + tbloffset))); + real2 v1869 = reverse(minus(v909, v1069)); + real2 v1875 = plus(v909, v1069); + real2 v413 = minusplus(v411, v412); + real2 v415 = minusplus(uminus(v411), v412); + real2 v429 = 
timesminusplus(reverse(v415), load(tbl, 60 * VECWIDTH + tbloffset), times(v415, load(tbl, 61 * VECWIDTH + tbloffset))); + real2 v1217 = plus(v1056, v1057); + real2 v1211 = reverse(minus(v1056, v1057)); + real2 v1297 = plus(v1216, v1217); + real2 v1291 = reverse(minus(v1216, v1217)); + real2 v2691 = plus(v429, v749); + real2 v2685 = reverse(minus(v429, v749)); + real2 v2765 = reverse(minus(v2690, v2691)); + real2 v2771 = plus(v2690, v2691); + real2 v2689 = minusplus(uminus(v2685), v2686); + real2 v2687 = minusplus(v2685, v2686); + real2 v2703 = timesminusplus(reverse(v2689), load(tbl, 476 * VECWIDTH + tbloffset), times(v2689, load(tbl, 477 * VECWIDTH + tbloffset))); + real2 v2697 = timesminusplus(reverse(v2687), load(tbl, 474 * VECWIDTH + tbloffset), times(v2687, load(tbl, 475 * VECWIDTH + tbloffset))); + real2 v1215 = minusplus(uminus(v1211), v1212); + real2 v1213 = minusplus(v1211, v1212); + real2 v1223 = timesminusplus(reverse(v1213), load(tbl, 218 * VECWIDTH + tbloffset), times(v1213, load(tbl, 219 * VECWIDTH + tbloffset))); + real2 v1229 = timesminusplus(reverse(v1215), load(tbl, 220 * VECWIDTH + tbloffset), times(v1215, load(tbl, 221 * VECWIDTH + tbloffset))); + real2 v423 = timesminusplus(reverse(v413), load(tbl, 58 * VECWIDTH + tbloffset), times(v413, load(tbl, 59 * VECWIDTH + tbloffset))); + real2 v2165 = plus(v423, v743); + real2 v2159 = reverse(minus(v423, v743)); + real2 v2245 = plus(v2164, v2165); + real2 v2239 = reverse(minus(v2164, v2165)); + real2 v44 = load(in, 42 << shift); + real2 v108 = load(in, 106 << shift); + real2 v331 = reverse(minus(v44, v108)); + real2 v337 = plus(v44, v108); + real2 v76 = load(in, 74 << shift); + real2 v12 = load(in, 10 << shift); + real2 v336 = plus(v12, v76); + real2 v332 = minus(v76, v12); + real2 v976 = plus(v336, v337); + real2 v972 = minus(v337, v336); + real2 v335 = minusplus(uminus(v331), v332); + real2 v333 = minusplus(v331, v332); + real2 v343 = timesminusplus(reverse(v333), load(tbl, 42 * VECWIDTH + tbloffset), 
times(v333, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v349 = timesminusplus(reverse(v335), load(tbl, 44 * VECWIDTH + tbloffset), times(v335, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v124 = load(in, 122 << shift); + real2 v60 = load(in, 58 << shift); + real2 v651 = reverse(minus(v60, v124)); + real2 v657 = plus(v60, v124); + real2 v28 = load(in, 26 << shift); + real2 v92 = load(in, 90 << shift); + real2 v652 = minus(v92, v28); + real2 v656 = plus(v28, v92); + real2 v977 = plus(v656, v657); + real2 v971 = reverse(minus(v656, v657)); + real2 v973 = minusplus(v971, v972); + real2 v975 = minusplus(uminus(v971), v972); + real2 v983 = timesminusplus(reverse(v973), load(tbl, 170 * VECWIDTH + tbloffset), times(v973, load(tbl, 171 * VECWIDTH + tbloffset))); + real2 v1131 = reverse(minus(v976, v977)); + real2 v1137 = plus(v976, v977); + real2 v655 = minusplus(uminus(v651), v652); + real2 v653 = minusplus(v651, v652); + real2 v669 = timesminusplus(reverse(v655), load(tbl, 108 * VECWIDTH + tbloffset), times(v655, load(tbl, 109 * VECWIDTH + tbloffset))); + real2 v663 = timesminusplus(reverse(v653), load(tbl, 106 * VECWIDTH + tbloffset), times(v653, load(tbl, 107 * VECWIDTH + tbloffset))); + real2 v2079 = reverse(minus(v343, v663)); + real2 v2085 = plus(v343, v663); + real2 v2605 = reverse(minus(v349, v669)); + real2 v2611 = plus(v349, v669); + real2 v989 = timesminusplus(reverse(v975), load(tbl, 172 * VECWIDTH + tbloffset), times(v975, load(tbl, 173 * VECWIDTH + tbloffset))); + real2 v20 = load(in, 18 << shift); + real2 v84 = load(in, 82 << shift); + real2 v496 = plus(v20, v84); + real2 v492 = minus(v84, v20); + real2 v52 = load(in, 50 << shift); + real2 v116 = load(in, 114 << shift); + real2 v491 = reverse(minus(v52, v116)); + real2 v497 = plus(v52, v116); + real2 v817 = plus(v496, v497); + real2 v811 = reverse(minus(v496, v497)); + real2 v493 = minusplus(v491, v492); + real2 v495 = minusplus(uminus(v491), v492); + real2 v509 = timesminusplus(reverse(v495), 
load(tbl, 76 * VECWIDTH + tbloffset), times(v495, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v503 = timesminusplus(reverse(v493), load(tbl, 74 * VECWIDTH + tbloffset), times(v493, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v36 = load(in, 34 << shift); + real2 v100 = load(in, 98 << shift); + real2 v171 = reverse(minus(v36, v100)); + real2 v177 = plus(v36, v100); + real2 v68 = load(in, 66 << shift); + real2 v4 = load(in, 2 << shift); + real2 v176 = plus(v4, v68); + real2 v172 = minus(v68, v4); + real2 v816 = plus(v176, v177); + real2 v812 = minus(v177, v176); + real2 v1136 = plus(v816, v817); + real2 v1132 = minus(v817, v816); + real2 v1133 = minusplus(v1131, v1132); + real2 v1135 = minusplus(uminus(v1131), v1132); + real2 v1149 = timesminusplus(reverse(v1135), load(tbl, 204 * VECWIDTH + tbloffset), times(v1135, load(tbl, 205 * VECWIDTH + tbloffset))); + real2 v1296 = plus(v1136, v1137); + real2 v1292 = minus(v1137, v1136); + real2 v1295 = minusplus(uminus(v1291), v1292); + real2 v1293 = minusplus(v1291, v1292); + real2 v1303 = timesminusplus(reverse(v1293), load(tbl, 234 * VECWIDTH + tbloffset), times(v1293, load(tbl, 235 * VECWIDTH + tbloffset))); + real2 v1331 = reverse(minus(v1296, v1297)); + real2 v1337 = plus(v1296, v1297); + real2 v173 = minusplus(v171, v172); + real2 v175 = minusplus(uminus(v171), v172); + real2 v189 = timesminusplus(reverse(v175), load(tbl, 12 * VECWIDTH + tbloffset), times(v175, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v1309 = timesminusplus(reverse(v1295), load(tbl, 236 * VECWIDTH + tbloffset), times(v1295, load(tbl, 237 * VECWIDTH + tbloffset))); + real2 v815 = minusplus(uminus(v811), v812); + real2 v813 = minusplus(v811, v812); + real2 v1143 = timesminusplus(reverse(v1133), load(tbl, 202 * VECWIDTH + tbloffset), times(v1133, load(tbl, 203 * VECWIDTH + tbloffset))); + real2 v1541 = reverse(minus(v1149, v1229)); + real2 v1547 = plus(v1149, v1229); + real2 v2610 = plus(v189, v509); + real2 v2606 = minus(v509, v189); + real2 
v2770 = plus(v2610, v2611); + real2 v2766 = minus(v2611, v2610); + real2 v823 = timesminusplus(reverse(v813), load(tbl, 138 * VECWIDTH + tbloffset), times(v813, load(tbl, 139 * VECWIDTH + tbloffset))); + real2 v829 = timesminusplus(reverse(v815), load(tbl, 140 * VECWIDTH + tbloffset), times(v815, load(tbl, 141 * VECWIDTH + tbloffset))); + real2 v2811 = plus(v2770, v2771); + real2 v2805 = reverse(minus(v2770, v2771)); + real2 v2767 = minusplus(v2765, v2766); + real2 v2769 = minusplus(uminus(v2765), v2766); + real2 v2607 = minusplus(v2605, v2606); + real2 v2609 = minusplus(uminus(v2605), v2606); + real2 v2617 = timesminusplus(reverse(v2607), load(tbl, 458 * VECWIDTH + tbloffset), times(v2607, load(tbl, 459 * VECWIDTH + tbloffset))); + real2 v2623 = timesminusplus(reverse(v2609), load(tbl, 460 * VECWIDTH + tbloffset), times(v2609, load(tbl, 461 * VECWIDTH + tbloffset))); + real2 v3013 = reverse(minus(v2623, v2703)); + real2 v3019 = plus(v2623, v2703); + real2 v2783 = timesminusplus(reverse(v2769), load(tbl, 492 * VECWIDTH + tbloffset), times(v2769, load(tbl, 493 * VECWIDTH + tbloffset))); + real2 v2941 = plus(v2617, v2697); + real2 v2935 = reverse(minus(v2617, v2697)); + real2 v2777 = timesminusplus(reverse(v2767), load(tbl, 490 * VECWIDTH + tbloffset), times(v2767, load(tbl, 491 * VECWIDTH + tbloffset))); + real2 v1660 = minus(v983, v823); + real2 v1664 = plus(v823, v983); + real2 v1874 = plus(v829, v989); + real2 v1870 = minus(v989, v829); + real2 v1909 = reverse(minus(v1874, v1875)); + real2 v1915 = plus(v1874, v1875); + real2 v1663 = minusplus(uminus(v1659), v1660); + real2 v1661 = minusplus(v1659, v1660); + real2 v1677 = timesminusplus(reverse(v1663), load(tbl, 296 * VECWIDTH + tbloffset), times(v1663, load(tbl, 297 * VECWIDTH + tbloffset))); + real2 v1873 = minusplus(uminus(v1869), v1870); + real2 v1871 = minusplus(v1869, v1870); + real2 v1887 = timesminusplus(reverse(v1873), load(tbl, 332 * VECWIDTH + tbloffset), times(v1873, load(tbl, 333 * VECWIDTH + 
tbloffset))); + real2 v1705 = plus(v1664, v1665); + real2 v1699 = reverse(minus(v1664, v1665)); + real2 v1671 = timesminusplus(reverse(v1661), load(tbl, 294 * VECWIDTH + tbloffset), times(v1661, load(tbl, 295 * VECWIDTH + tbloffset))); + real2 v1881 = timesminusplus(reverse(v1871), load(tbl, 330 * VECWIDTH + tbloffset), times(v1871, load(tbl, 331 * VECWIDTH + tbloffset))); + real2 v1469 = plus(v1143, v1223); + real2 v1463 = reverse(minus(v1143, v1223)); + real2 v54 = load(in, 52 << shift); + real2 v118 = load(in, 116 << shift); + real2 v537 = plus(v54, v118); + real2 v531 = reverse(minus(v54, v118)); + real2 v86 = load(in, 84 << shift); + real2 v22 = load(in, 20 << shift); + real2 v536 = plus(v22, v86); + real2 v532 = minus(v86, v22); + real2 v851 = reverse(minus(v536, v537)); + real2 v857 = plus(v536, v537); + real2 v533 = minusplus(v531, v532); + real2 v535 = minusplus(uminus(v531), v532); + real2 v549 = timesminusplus(reverse(v535), load(tbl, 84 * VECWIDTH + tbloffset), times(v535, load(tbl, 85 * VECWIDTH + tbloffset))); + real2 v102 = load(in, 100 << shift); + real2 v38 = load(in, 36 << shift); + real2 v217 = plus(v38, v102); + real2 v211 = reverse(minus(v38, v102)); + real2 v70 = load(in, 68 << shift); + real2 v6 = load(in, 4 << shift); + real2 v216 = plus(v6, v70); + real2 v212 = minus(v70, v6); + real2 v213 = minusplus(v211, v212); + real2 v215 = minusplus(uminus(v211), v212); + real2 v229 = timesminusplus(reverse(v215), load(tbl, 20 * VECWIDTH + tbloffset), times(v215, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v2646 = minus(v549, v229); + real2 v2650 = plus(v229, v549); + real2 v856 = plus(v216, v217); + real2 v852 = minus(v217, v216); + real2 v853 = minusplus(v851, v852); + real2 v855 = minusplus(uminus(v851), v852); + real2 v863 = timesminusplus(reverse(v853), load(tbl, 146 * VECWIDTH + tbloffset), times(v853, load(tbl, 147 * VECWIDTH + tbloffset))); + real2 v869 = timesminusplus(reverse(v855), load(tbl, 148 * VECWIDTH + tbloffset), times(v855, 
load(tbl, 149 * VECWIDTH + tbloffset))); + real2 v1176 = plus(v856, v857); + real2 v1172 = minus(v857, v856); + real2 v110 = load(in, 108 << shift); + real2 v46 = load(in, 44 << shift); + real2 v377 = plus(v46, v110); + real2 v371 = reverse(minus(v46, v110)); + real2 v78 = load(in, 76 << shift); + real2 v14 = load(in, 12 << shift); + real2 v372 = minus(v78, v14); + real2 v376 = plus(v14, v78); + real2 v1012 = minus(v377, v376); + real2 v1016 = plus(v376, v377); + real2 v373 = minusplus(v371, v372); + real2 v375 = minusplus(uminus(v371), v372); + real2 v389 = timesminusplus(reverse(v375), load(tbl, 52 * VECWIDTH + tbloffset), times(v375, load(tbl, 53 * VECWIDTH + tbloffset))); + real2 v30 = load(in, 28 << shift); + real2 v94 = load(in, 92 << shift); + real2 v696 = plus(v30, v94); + real2 v692 = minus(v94, v30); + real2 v62 = load(in, 60 << shift); + real2 v126 = load(in, 124 << shift); + real2 v697 = plus(v62, v126); + real2 v691 = reverse(minus(v62, v126)); + real2 v1017 = plus(v696, v697); + real2 v1011 = reverse(minus(v696, v697)); + real2 v1171 = reverse(minus(v1016, v1017)); + real2 v1177 = plus(v1016, v1017); + real2 v1013 = minusplus(v1011, v1012); + real2 v1015 = minusplus(uminus(v1011), v1012); + real2 v1175 = minusplus(uminus(v1171), v1172); + real2 v1173 = minusplus(v1171, v1172); + real2 v1183 = timesminusplus(reverse(v1173), load(tbl, 210 * VECWIDTH + tbloffset), times(v1173, load(tbl, 211 * VECWIDTH + tbloffset))); + real2 v1189 = timesminusplus(reverse(v1175), load(tbl, 212 * VECWIDTH + tbloffset), times(v1175, load(tbl, 213 * VECWIDTH + tbloffset))); + real2 v1029 = timesminusplus(reverse(v1015), load(tbl, 180 * VECWIDTH + tbloffset), times(v1015, load(tbl, 181 * VECWIDTH + tbloffset))); + real2 v1023 = timesminusplus(reverse(v1013), load(tbl, 178 * VECWIDTH + tbloffset), times(v1013, load(tbl, 179 * VECWIDTH + tbloffset))); + real2 v1625 = plus(v863, v1023); + real2 v1619 = reverse(minus(v863, v1023)); + real2 v1835 = plus(v869, v1029); + real2 
v1829 = reverse(minus(v869, v1029)); + real2 v693 = minusplus(v691, v692); + real2 v695 = minusplus(uminus(v691), v692); + real2 v709 = timesminusplus(reverse(v695), load(tbl, 116 * VECWIDTH + tbloffset), times(v695, load(tbl, 117 * VECWIDTH + tbloffset))); + real2 v2645 = reverse(minus(v389, v709)); + real2 v2651 = plus(v389, v709); + real2 v1257 = plus(v1176, v1177); + real2 v1251 = reverse(minus(v1176, v1177)); + real2 v2731 = plus(v2650, v2651); + real2 v2725 = reverse(minus(v2650, v2651)); + real2 v114 = load(in, 112 << shift); + real2 v50 = load(in, 48 << shift); + real2 v457 = plus(v50, v114); + real2 v451 = reverse(minus(v50, v114)); + real2 v18 = load(in, 16 << shift); + real2 v82 = load(in, 80 << shift); + real2 v456 = plus(v18, v82); + real2 v452 = minus(v82, v18); + real2 v771 = reverse(minus(v456, v457)); + real2 v777 = plus(v456, v457); + real2 v453 = minusplus(v451, v452); + real2 v455 = minusplus(uminus(v451), v452); + real2 v469 = timesminusplus(reverse(v455), load(tbl, 68 * VECWIDTH + tbloffset), times(v455, load(tbl, 69 * VECWIDTH + tbloffset))); + real2 v66 = load(in, 64 << shift); + real2 v2 = load(in, 0 << shift); + real2 v132 = minus(v66, v2); + real2 v136 = plus(v2, v66); + real2 v98 = load(in, 96 << shift); + real2 v34 = load(in, 32 << shift); + real2 v131 = reverse(minus(v34, v98)); + real2 v137 = plus(v34, v98); + real2 v133 = minusplus(v131, v132); + real2 v135 = minusplus(uminus(v131), v132); + real2 v149 = timesminusplus(reverse(v135), load(tbl, 4 * VECWIDTH + tbloffset), times(v135, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v2566 = minus(v469, v149); + real2 v2570 = plus(v149, v469); + real2 v772 = minus(v137, v136); + real2 v776 = plus(v136, v137); + real2 v1092 = minus(v777, v776); + real2 v1096 = plus(v776, v777); + real2 v773 = minusplus(v771, v772); + real2 v775 = minusplus(uminus(v771), v772); + real2 v783 = timesminusplus(reverse(v773), load(tbl, 130 * VECWIDTH + tbloffset), times(v773, load(tbl, 131 * VECWIDTH + 
tbloffset))); + real2 v789 = timesminusplus(reverse(v775), load(tbl, 132 * VECWIDTH + tbloffset), times(v775, load(tbl, 133 * VECWIDTH + tbloffset))); + real2 v74 = load(in, 72 << shift); + real2 v10 = load(in, 8 << shift); + real2 v296 = plus(v10, v74); + real2 v292 = minus(v74, v10); + real2 v42 = load(in, 40 << shift); + real2 v106 = load(in, 104 << shift); + real2 v291 = reverse(minus(v42, v106)); + real2 v297 = plus(v42, v106); + real2 v293 = minusplus(v291, v292); + real2 v295 = minusplus(uminus(v291), v292); + real2 v309 = timesminusplus(reverse(v295), load(tbl, 36 * VECWIDTH + tbloffset), times(v295, load(tbl, 37 * VECWIDTH + tbloffset))); + real2 v932 = minus(v297, v296); + real2 v936 = plus(v296, v297); + real2 v122 = load(in, 120 << shift); + real2 v58 = load(in, 56 << shift); + real2 v617 = plus(v58, v122); + real2 v611 = reverse(minus(v58, v122)); + real2 v26 = load(in, 24 << shift); + real2 v90 = load(in, 88 << shift); + real2 v612 = minus(v90, v26); + real2 v616 = plus(v26, v90); + real2 v937 = plus(v616, v617); + real2 v931 = reverse(minus(v616, v617)); + real2 v1091 = reverse(minus(v936, v937)); + real2 v1097 = plus(v936, v937); + real2 v933 = minusplus(v931, v932); + real2 v935 = minusplus(uminus(v931), v932); + real2 v1093 = minusplus(v1091, v1092); + real2 v1095 = minusplus(uminus(v1091), v1092); + real2 v1103 = timesminusplus(reverse(v1093), load(tbl, 194 * VECWIDTH + tbloffset), times(v1093, load(tbl, 195 * VECWIDTH + tbloffset))); + real2 v1468 = plus(v1103, v1183); + real2 v1464 = minus(v1183, v1103); + real2 v1508 = plus(v1468, v1469); + real2 v1504 = minus(v1469, v1468); + real2 v1252 = minus(v1097, v1096); + real2 v1256 = plus(v1096, v1097); + real2 v1336 = plus(v1256, v1257); + real2 v1332 = minus(v1257, v1256); + real2 v1335 = minusplus(uminus(v1331), v1332); + real2 v1333 = minusplus(v1331, v1332); + real2 v1343 = timesminusplus(reverse(v1333), load(tbl, 242 * VECWIDTH + tbloffset), times(v1333, load(tbl, 243 * VECWIDTH + tbloffset))); 
+ real2 v1349 = timesminusplus(reverse(v1335), load(tbl, 244 * VECWIDTH + tbloffset), times(v1335, load(tbl, 245 * VECWIDTH + tbloffset))); + real2 v1376 = plus(v1336, v1337); + real2 v1372 = minus(v1337, v1336); + real2 v1465 = minusplus(v1463, v1464); + real2 v1467 = minusplus(uminus(v1463), v1464); + real2 v1255 = minusplus(uminus(v1251), v1252); + real2 v1253 = minusplus(v1251, v1252); + real2 v1481 = timesminusplus(reverse(v1467), load(tbl, 264 * VECWIDTH + tbloffset), times(v1467, load(tbl, 265 * VECWIDTH + tbloffset))); + real2 v1475 = timesminusplus(reverse(v1465), load(tbl, 262 * VECWIDTH + tbloffset), times(v1465, load(tbl, 263 * VECWIDTH + tbloffset))); + real2 v1109 = timesminusplus(reverse(v1095), load(tbl, 196 * VECWIDTH + tbloffset), times(v1095, load(tbl, 197 * VECWIDTH + tbloffset))); + real2 v1542 = minus(v1189, v1109); + real2 v1546 = plus(v1109, v1189); + real2 v1545 = minusplus(uminus(v1541), v1542); + real2 v1543 = minusplus(v1541, v1542); + real2 v1553 = timesminusplus(reverse(v1543), load(tbl, 274 * VECWIDTH + tbloffset), times(v1543, load(tbl, 275 * VECWIDTH + tbloffset))); + real2 v1559 = timesminusplus(reverse(v1545), load(tbl, 276 * VECWIDTH + tbloffset), times(v1545, load(tbl, 277 * VECWIDTH + tbloffset))); + real2 v1582 = minus(v1547, v1546); + real2 v1586 = plus(v1546, v1547); + real2 v1269 = timesminusplus(reverse(v1255), load(tbl, 228 * VECWIDTH + tbloffset), times(v1255, load(tbl, 229 * VECWIDTH + tbloffset))); + real2 v1438 = minus(v1309, v1269); + real2 v1442 = plus(v1269, v1309); + real2 v1263 = timesminusplus(reverse(v1253), load(tbl, 226 * VECWIDTH + tbloffset), times(v1253, load(tbl, 227 * VECWIDTH + tbloffset))); + real2 v943 = timesminusplus(reverse(v933), load(tbl, 162 * VECWIDTH + tbloffset), times(v933, load(tbl, 163 * VECWIDTH + tbloffset))); + real2 v1624 = plus(v783, v943); + real2 v1620 = minus(v943, v783); + real2 v1623 = minusplus(uminus(v1619), v1620); + real2 v1621 = minusplus(v1619, v1620); + real2 v1700 = 
minus(v1625, v1624); + real2 v1704 = plus(v1624, v1625); + real2 v1631 = timesminusplus(reverse(v1621), load(tbl, 286 * VECWIDTH + tbloffset), times(v1621, load(tbl, 287 * VECWIDTH + tbloffset))); + real2 v949 = timesminusplus(reverse(v935), load(tbl, 164 * VECWIDTH + tbloffset), times(v935, load(tbl, 165 * VECWIDTH + tbloffset))); + real2 v1830 = minus(v949, v789); + real2 v1834 = plus(v789, v949); + real2 v1782 = plus(v1631, v1671); + real2 v1778 = minus(v1671, v1631); + real2 v1910 = minus(v1835, v1834); + real2 v1914 = plus(v1834, v1835); + real2 v1950 = minus(v1915, v1914); + real2 v1954 = plus(v1914, v1915); + real2 v1913 = minusplus(uminus(v1909), v1910); + real2 v1911 = minusplus(v1909, v1910); + real2 v613 = minusplus(v611, v612); + real2 v615 = minusplus(uminus(v611), v612); + real2 v629 = timesminusplus(reverse(v615), load(tbl, 100 * VECWIDTH + tbloffset), times(v615, load(tbl, 101 * VECWIDTH + tbloffset))); + real2 v1744 = plus(v1704, v1705); + real2 v1740 = minus(v1705, v1704); + real2 v1637 = timesminusplus(reverse(v1623), load(tbl, 288 * VECWIDTH + tbloffset), times(v1623, load(tbl, 289 * VECWIDTH + tbloffset))); + real2 v1927 = timesminusplus(reverse(v1913), load(tbl, 340 * VECWIDTH + tbloffset), times(v1913, load(tbl, 341 * VECWIDTH + tbloffset))); + real2 v2571 = plus(v309, v629); + real2 v2565 = reverse(minus(v309, v629)); + real2 v1833 = minusplus(uminus(v1829), v1830); + real2 v1831 = minusplus(v1829, v1830); + real2 v1921 = timesminusplus(reverse(v1911), load(tbl, 338 * VECWIDTH + tbloffset), times(v1911, load(tbl, 339 * VECWIDTH + tbloffset))); + real2 v1804 = minus(v1677, v1637); + real2 v1808 = plus(v1637, v1677); + real2 v1847 = timesminusplus(reverse(v1833), load(tbl, 324 * VECWIDTH + tbloffset), times(v1833, load(tbl, 325 * VECWIDTH + tbloffset))); + real2 v2014 = minus(v1887, v1847); + real2 v2018 = plus(v1847, v1887); + real2 v1841 = timesminusplus(reverse(v1831), load(tbl, 322 * VECWIDTH + tbloffset), times(v1831, load(tbl, 323 * 
VECWIDTH + tbloffset))); + real2 v1988 = minus(v1881, v1841); + real2 v1992 = plus(v1841, v1881); + real2 v1703 = minusplus(uminus(v1699), v1700); + real2 v1701 = minusplus(v1699, v1700); + real2 v1717 = timesminusplus(reverse(v1703), load(tbl, 304 * VECWIDTH + tbloffset), times(v1703, load(tbl, 305 * VECWIDTH + tbloffset))); + real2 v1711 = timesminusplus(reverse(v1701), load(tbl, 302 * VECWIDTH + tbloffset), times(v1701, load(tbl, 303 * VECWIDTH + tbloffset))); + real2 v2730 = plus(v2570, v2571); + real2 v2726 = minus(v2571, v2570); + real2 v1412 = minus(v1303, v1263); + real2 v1416 = plus(v1263, v1303); + real2 v63 = load(in, 61 << shift); + real2 v127 = load(in, 125 << shift); + real2 v717 = plus(v63, v127); + real2 v711 = reverse(minus(v63, v127)); + real2 v95 = load(in, 93 << shift); + real2 v31 = load(in, 29 << shift); + real2 v712 = minus(v95, v31); + real2 v716 = plus(v31, v95); + real2 v1037 = plus(v716, v717); + real2 v1031 = reverse(minus(v716, v717)); + real2 v79 = load(in, 77 << shift); + real2 v15 = load(in, 13 << shift); + real2 v396 = plus(v15, v79); + real2 v392 = minus(v79, v15); + real2 v111 = load(in, 109 << shift); + real2 v47 = load(in, 45 << shift); + real2 v397 = plus(v47, v111); + real2 v391 = reverse(minus(v47, v111)); + real2 v1032 = minus(v397, v396); + real2 v1036 = plus(v396, v397); + real2 v1033 = minusplus(v1031, v1032); + real2 v1035 = minusplus(uminus(v1031), v1032); + real2 v1049 = timesminusplus(reverse(v1035), load(tbl, 184 * VECWIDTH + tbloffset), times(v1035, load(tbl, 185 * VECWIDTH + tbloffset))); + real2 v1043 = timesminusplus(reverse(v1033), load(tbl, 182 * VECWIDTH + tbloffset), times(v1033, load(tbl, 183 * VECWIDTH + tbloffset))); + real2 v1197 = plus(v1036, v1037); + real2 v1191 = reverse(minus(v1036, v1037)); + real2 v23 = load(in, 21 << shift); + real2 v87 = load(in, 85 << shift); + real2 v556 = plus(v23, v87); + real2 v552 = minus(v87, v23); + real2 v119 = load(in, 117 << shift); + real2 v55 = load(in, 53 << shift); 
+ real2 v557 = plus(v55, v119); + real2 v551 = reverse(minus(v55, v119)); + real2 v877 = plus(v556, v557); + real2 v871 = reverse(minus(v556, v557)); + real2 v7 = load(in, 5 << shift); + real2 v71 = load(in, 69 << shift); + real2 v232 = minus(v71, v7); + real2 v236 = plus(v7, v71); + real2 v103 = load(in, 101 << shift); + real2 v39 = load(in, 37 << shift); + real2 v237 = plus(v39, v103); + real2 v231 = reverse(minus(v39, v103)); + real2 v876 = plus(v236, v237); + real2 v872 = minus(v237, v236); + real2 v1192 = minus(v877, v876); + real2 v1196 = plus(v876, v877); + real2 v1271 = reverse(minus(v1196, v1197)); + real2 v1277 = plus(v1196, v1197); + real2 v875 = minusplus(uminus(v871), v872); + real2 v873 = minusplus(v871, v872); + real2 v883 = timesminusplus(reverse(v873), load(tbl, 150 * VECWIDTH + tbloffset), times(v873, load(tbl, 151 * VECWIDTH + tbloffset))); + real2 v1639 = reverse(minus(v883, v1043)); + real2 v1645 = plus(v883, v1043); + real2 v1195 = minusplus(uminus(v1191), v1192); + real2 v1193 = minusplus(v1191, v1192); + real2 v1209 = timesminusplus(reverse(v1195), load(tbl, 216 * VECWIDTH + tbloffset), times(v1195, load(tbl, 217 * VECWIDTH + tbloffset))); + real2 v1203 = timesminusplus(reverse(v1193), load(tbl, 214 * VECWIDTH + tbloffset), times(v1193, load(tbl, 215 * VECWIDTH + tbloffset))); + real2 v83 = load(in, 81 << shift); + real2 v19 = load(in, 17 << shift); + real2 v476 = plus(v19, v83); + real2 v472 = minus(v83, v19); + real2 v51 = load(in, 49 << shift); + real2 v115 = load(in, 113 << shift); + real2 v477 = plus(v51, v115); + real2 v471 = reverse(minus(v51, v115)); + real2 v797 = plus(v476, v477); + real2 v791 = reverse(minus(v476, v477)); + real2 v3 = load(in, 1 << shift); + real2 v67 = load(in, 65 << shift); + real2 v156 = plus(v3, v67); + real2 v152 = minus(v67, v3); + real2 v35 = load(in, 33 << shift); + real2 v99 = load(in, 97 << shift); + real2 v157 = plus(v35, v99); + real2 v151 = reverse(minus(v35, v99)); + real2 v792 = minus(v157, v156); + 
real2 v796 = plus(v156, v157); + real2 v793 = minusplus(v791, v792); + real2 v795 = minusplus(uminus(v791), v792); + real2 v803 = timesminusplus(reverse(v793), load(tbl, 134 * VECWIDTH + tbloffset), times(v793, load(tbl, 135 * VECWIDTH + tbloffset))); + real2 v1112 = minus(v797, v796); + real2 v1116 = plus(v796, v797); + real2 v107 = load(in, 105 << shift); + real2 v43 = load(in, 41 << shift); + real2 v317 = plus(v43, v107); + real2 v311 = reverse(minus(v43, v107)); + real2 v75 = load(in, 73 << shift); + real2 v11 = load(in, 9 << shift); + real2 v316 = plus(v11, v75); + real2 v312 = minus(v75, v11); + real2 v956 = plus(v316, v317); + real2 v952 = minus(v317, v316); + real2 v59 = load(in, 57 << shift); + real2 v123 = load(in, 121 << shift); + real2 v631 = reverse(minus(v59, v123)); + real2 v637 = plus(v59, v123); + real2 v27 = load(in, 25 << shift); + real2 v91 = load(in, 89 << shift); + real2 v636 = plus(v27, v91); + real2 v632 = minus(v91, v27); + real2 v957 = plus(v636, v637); + real2 v951 = reverse(minus(v636, v637)); + real2 v1111 = reverse(minus(v956, v957)); + real2 v1117 = plus(v956, v957); + real2 v1276 = plus(v1116, v1117); + real2 v1272 = minus(v1117, v1116); + real2 v1275 = minusplus(uminus(v1271), v1272); + real2 v1273 = minusplus(v1271, v1272); + real2 v1283 = timesminusplus(reverse(v1273), load(tbl, 230 * VECWIDTH + tbloffset), times(v1273, load(tbl, 231 * VECWIDTH + tbloffset))); + real2 v1352 = minus(v1277, v1276); + real2 v1356 = plus(v1276, v1277); + real2 v1289 = timesminusplus(reverse(v1275), load(tbl, 232 * VECWIDTH + tbloffset), times(v1275, load(tbl, 233 * VECWIDTH + tbloffset))); + real2 v1115 = minusplus(uminus(v1111), v1112); + real2 v1113 = minusplus(v1111, v1112); + real2 v1123 = timesminusplus(reverse(v1113), load(tbl, 198 * VECWIDTH + tbloffset), times(v1113, load(tbl, 199 * VECWIDTH + tbloffset))); + real2 v1129 = timesminusplus(reverse(v1115), load(tbl, 200 * VECWIDTH + tbloffset), times(v1115, load(tbl, 201 * VECWIDTH + 
tbloffset))); + real2 v1488 = plus(v1123, v1203); + real2 v1484 = minus(v1203, v1123); + real2 v1566 = plus(v1129, v1209); + real2 v1562 = minus(v1209, v1129); + real2 v85 = load(in, 83 << shift); + real2 v21 = load(in, 19 << shift); + real2 v512 = minus(v85, v21); + real2 v516 = plus(v21, v85); + real2 v117 = load(in, 115 << shift); + real2 v53 = load(in, 51 << shift); + real2 v517 = plus(v53, v117); + real2 v511 = reverse(minus(v53, v117)); + real2 v831 = reverse(minus(v516, v517)); + real2 v837 = plus(v516, v517); + real2 v69 = load(in, 67 << shift); + real2 v5 = load(in, 3 << shift); + real2 v192 = minus(v69, v5); + real2 v196 = plus(v5, v69); + real2 v37 = load(in, 35 << shift); + real2 v101 = load(in, 99 << shift); + real2 v197 = plus(v37, v101); + real2 v191 = reverse(minus(v37, v101)); + real2 v832 = minus(v197, v196); + real2 v836 = plus(v196, v197); + real2 v1152 = minus(v837, v836); + real2 v1156 = plus(v836, v837); + real2 v61 = load(in, 59 << shift); + real2 v125 = load(in, 123 << shift); + real2 v677 = plus(v61, v125); + real2 v671 = reverse(minus(v61, v125)); + real2 v29 = load(in, 27 << shift); + real2 v93 = load(in, 91 << shift); + real2 v672 = minus(v93, v29); + real2 v676 = plus(v29, v93); + real2 v997 = plus(v676, v677); + real2 v991 = reverse(minus(v676, v677)); + real2 v109 = load(in, 107 << shift); + real2 v45 = load(in, 43 << shift); + real2 v357 = plus(v45, v109); + real2 v351 = reverse(minus(v45, v109)); + real2 v77 = load(in, 75 << shift); + real2 v13 = load(in, 11 << shift); + real2 v352 = minus(v77, v13); + real2 v356 = plus(v13, v77); + real2 v992 = minus(v357, v356); + real2 v996 = plus(v356, v357); + real2 v1157 = plus(v996, v997); + real2 v1151 = reverse(minus(v996, v997)); + real2 v1155 = minusplus(uminus(v1151), v1152); + real2 v1153 = minusplus(v1151, v1152); + real2 v1163 = timesminusplus(reverse(v1153), load(tbl, 206 * VECWIDTH + tbloffset), times(v1153, load(tbl, 207 * VECWIDTH + tbloffset))); + real2 v1316 = plus(v1156, 
v1157); + real2 v1312 = minus(v1157, v1156); + real2 v41 = load(in, 39 << shift); + real2 v105 = load(in, 103 << shift); + real2 v277 = plus(v41, v105); + real2 v271 = reverse(minus(v41, v105)); + real2 v9 = load(in, 7 << shift); + real2 v73 = load(in, 71 << shift); + real2 v276 = plus(v9, v73); + real2 v272 = minus(v73, v9); + real2 v916 = plus(v276, v277); + real2 v912 = minus(v277, v276); + real2 v89 = load(in, 87 << shift); + real2 v25 = load(in, 23 << shift); + real2 v592 = minus(v89, v25); + real2 v596 = plus(v25, v89); + real2 v57 = load(in, 55 << shift); + real2 v121 = load(in, 119 << shift); + real2 v591 = reverse(minus(v57, v121)); + real2 v597 = plus(v57, v121); + real2 v911 = reverse(minus(v596, v597)); + real2 v917 = plus(v596, v597); + real2 v1236 = plus(v916, v917); + real2 v1232 = minus(v917, v916); + real2 v81 = load(in, 79 << shift); + real2 v17 = load(in, 15 << shift); + real2 v432 = minus(v81, v17); + real2 v436 = plus(v17, v81); + real2 v113 = load(in, 111 << shift); + real2 v49 = load(in, 47 << shift); + real2 v437 = plus(v49, v113); + real2 v431 = reverse(minus(v49, v113)); + real2 v1072 = minus(v437, v436); + real2 v1076 = plus(v436, v437); + real2 v65 = load(in, 63 << shift); + real2 v129 = load(in, 127 << shift); + real2 v757 = plus(v65, v129); + real2 v751 = reverse(minus(v65, v129)); + real2 v97 = load(in, 95 << shift); + real2 v33 = load(in, 31 << shift); + real2 v752 = minus(v97, v33); + real2 v756 = plus(v33, v97); + real2 v1077 = plus(v756, v757); + real2 v1071 = reverse(minus(v756, v757)); + real2 v1231 = reverse(minus(v1076, v1077)); + real2 v1237 = plus(v1076, v1077); + real2 v1317 = plus(v1236, v1237); + real2 v1311 = reverse(minus(v1236, v1237)); + real2 v1351 = reverse(minus(v1316, v1317)); + real2 v1357 = plus(v1316, v1317); + real2 v1371 = reverse(minus(v1356, v1357)); + real2 v1377 = plus(v1356, v1357); + scatter(out, 0, 128, plus(v1376, v1377)); + real2 v1390 = minus(v1376, v1377); + scatter(out, 64, 128, 
timesminusplus(v1390, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1390), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1353 = minusplus(v1351, v1352); + real2 v1355 = minusplus(uminus(v1351), v1352); + real2 v1369 = timesminusplus(reverse(v1355), load(tbl, 248 * VECWIDTH + tbloffset), times(v1355, load(tbl, 249 * VECWIDTH + tbloffset))); + scatter(out, 48, 128, plus(v1349, v1369)); + real2 v1404 = minus(v1349, v1369); + scatter(out, 112, 128, timesminusplus(v1404, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1404), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1363 = timesminusplus(reverse(v1353), load(tbl, 246 * VECWIDTH + tbloffset), times(v1353, load(tbl, 247 * VECWIDTH + tbloffset))); + scatter(out, 16, 128, plus(v1343, v1363)); + real2 v1398 = minus(v1343, v1363); + scatter(out, 80, 128, timesminusplus(v1398, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1398), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1373 = minusplus(v1371, v1372); + real2 v1375 = minusplus(uminus(v1371), v1372); + scatter(out, 96, 128, timesminusplus(reverse(v1375), load(tbl, 252 * VECWIDTH + tbloffset), times(v1375, load(tbl, 253 * VECWIDTH + tbloffset)))); + scatter(out, 32, 128, timesminusplus(reverse(v1373), load(tbl, 250 * VECWIDTH + tbloffset), times(v1373, load(tbl, 251 * VECWIDTH + tbloffset)))); + real2 v1313 = minusplus(v1311, v1312); + real2 v1315 = minusplus(uminus(v1311), v1312); + real2 v1323 = timesminusplus(reverse(v1313), load(tbl, 238 * VECWIDTH + tbloffset), times(v1313, load(tbl, 239 * VECWIDTH + tbloffset))); + real2 v1417 = plus(v1283, v1323); + real2 v1411 = reverse(minus(v1283, v1323)); + scatter(out, 8, 128, plus(v1416, v1417)); + real2 v1430 = minus(v1416, v1417); + scatter(out, 72, 128, timesminusplus(v1430, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1430), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1413 = minusplus(v1411, v1412); + real2 v1415 = minusplus(uminus(v1411), v1412); + scatter(out, 104, 128, 
timesminusplus(reverse(v1415), load(tbl, 256 * VECWIDTH + tbloffset), times(v1415, load(tbl, 257 * VECWIDTH + tbloffset)))); + scatter(out, 40, 128, timesminusplus(reverse(v1413), load(tbl, 254 * VECWIDTH + tbloffset), times(v1413, load(tbl, 255 * VECWIDTH + tbloffset)))); + real2 v1329 = timesminusplus(reverse(v1315), load(tbl, 240 * VECWIDTH + tbloffset), times(v1315, load(tbl, 241 * VECWIDTH + tbloffset))); + real2 v1443 = plus(v1289, v1329); + real2 v1437 = reverse(minus(v1289, v1329)); + scatter(out, 24, 128, plus(v1442, v1443)); + real2 v1456 = minus(v1442, v1443); + scatter(out, 88, 128, timesminusplus(v1456, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1456), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1441 = minusplus(uminus(v1437), v1438); + real2 v1439 = minusplus(v1437, v1438); + scatter(out, 120, 128, timesminusplus(reverse(v1441), load(tbl, 260 * VECWIDTH + tbloffset), times(v1441, load(tbl, 261 * VECWIDTH + tbloffset)))); + scatter(out, 56, 128, timesminusplus(reverse(v1439), load(tbl, 258 * VECWIDTH + tbloffset), times(v1439, load(tbl, 259 * VECWIDTH + tbloffset)))); + real2 v1235 = minusplus(uminus(v1231), v1232); + real2 v1233 = minusplus(v1231, v1232); + real2 v1243 = timesminusplus(reverse(v1233), load(tbl, 222 * VECWIDTH + tbloffset), times(v1233, load(tbl, 223 * VECWIDTH + tbloffset))); + real2 v1489 = plus(v1163, v1243); + real2 v1483 = reverse(minus(v1163, v1243)); + real2 v1509 = plus(v1488, v1489); + real2 v1503 = reverse(minus(v1488, v1489)); + scatter(out, 4, 128, plus(v1508, v1509)); + real2 v1522 = minus(v1508, v1509); + scatter(out, 68, 128, timesminusplus(v1522, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1522), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1507 = minusplus(uminus(v1503), v1504); + real2 v1505 = minusplus(v1503, v1504); + scatter(out, 36, 128, timesminusplus(reverse(v1505), load(tbl, 270 * VECWIDTH + tbloffset), times(v1505, load(tbl, 271 * VECWIDTH + tbloffset)))); + scatter(out, 100, 128, 
timesminusplus(reverse(v1507), load(tbl, 272 * VECWIDTH + tbloffset), times(v1507, load(tbl, 273 * VECWIDTH + tbloffset)))); + real2 v1485 = minusplus(v1483, v1484); + real2 v1487 = minusplus(uminus(v1483), v1484); + real2 v1501 = timesminusplus(reverse(v1487), load(tbl, 268 * VECWIDTH + tbloffset), times(v1487, load(tbl, 269 * VECWIDTH + tbloffset))); + scatter(out, 52, 128, plus(v1481, v1501)); + real2 v1534 = minus(v1481, v1501); + scatter(out, 116, 128, timesminusplus(v1534, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1534), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1495 = timesminusplus(reverse(v1485), load(tbl, 266 * VECWIDTH + tbloffset), times(v1485, load(tbl, 267 * VECWIDTH + tbloffset))); + scatter(out, 20, 128, plus(v1475, v1495)); + real2 v1528 = minus(v1475, v1495); + scatter(out, 84, 128, timesminusplus(v1528, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1528), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1249 = timesminusplus(reverse(v1235), load(tbl, 224 * VECWIDTH + tbloffset), times(v1235, load(tbl, 225 * VECWIDTH + tbloffset))); + real2 v1169 = timesminusplus(reverse(v1155), load(tbl, 208 * VECWIDTH + tbloffset), times(v1155, load(tbl, 209 * VECWIDTH + tbloffset))); + real2 v1567 = plus(v1169, v1249); + real2 v1561 = reverse(minus(v1169, v1249)); + real2 v1581 = reverse(minus(v1566, v1567)); + real2 v1587 = plus(v1566, v1567); + scatter(out, 12, 128, plus(v1586, v1587)); + real2 v1600 = minus(v1586, v1587); + scatter(out, 76, 128, timesminusplus(v1600, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1600), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1583 = minusplus(v1581, v1582); + scatter(out, 44, 128, timesminusplus(reverse(v1583), load(tbl, 282 * VECWIDTH + tbloffset), times(v1583, load(tbl, 283 * VECWIDTH + tbloffset)))); + real2 v1585 = minusplus(uminus(v1581), v1582); + scatter(out, 108, 128, timesminusplus(reverse(v1585), load(tbl, 284 * VECWIDTH + tbloffset), times(v1585, load(tbl, 285 * VECWIDTH + 
tbloffset)))); + real2 v1565 = minusplus(uminus(v1561), v1562); + real2 v1563 = minusplus(v1561, v1562); + real2 v1579 = timesminusplus(reverse(v1565), load(tbl, 280 * VECWIDTH + tbloffset), times(v1565, load(tbl, 281 * VECWIDTH + tbloffset))); + scatter(out, 60, 128, plus(v1559, v1579)); + real2 v1612 = minus(v1559, v1579); + scatter(out, 124, 128, timesminusplus(v1612, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1612), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1573 = timesminusplus(reverse(v1563), load(tbl, 278 * VECWIDTH + tbloffset), times(v1563, load(tbl, 279 * VECWIDTH + tbloffset))); + scatter(out, 28, 128, plus(v1553, v1573)); + real2 v1606 = minus(v1553, v1573); + scatter(out, 92, 128, timesminusplus(v1606, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1606), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v833 = minusplus(v831, v832); + real2 v835 = minusplus(uminus(v831), v832); + real2 v955 = minusplus(uminus(v951), v952); + real2 v953 = minusplus(v951, v952); + real2 v963 = timesminusplus(reverse(v953), load(tbl, 166 * VECWIDTH + tbloffset), times(v953, load(tbl, 167 * VECWIDTH + tbloffset))); + real2 v995 = minusplus(uminus(v991), v992); + real2 v993 = minusplus(v991, v992); + real2 v1003 = timesminusplus(reverse(v993), load(tbl, 174 * VECWIDTH + tbloffset), times(v993, load(tbl, 175 * VECWIDTH + tbloffset))); + real2 v843 = timesminusplus(reverse(v833), load(tbl, 142 * VECWIDTH + tbloffset), times(v833, load(tbl, 143 * VECWIDTH + tbloffset))); + real2 v1640 = minus(v963, v803); + real2 v1644 = plus(v803, v963); + real2 v1680 = minus(v1003, v843); + real2 v1684 = plus(v843, v1003); + real2 v1641 = minusplus(v1639, v1640); + real2 v1643 = minusplus(uminus(v1639), v1640); + real2 v1657 = timesminusplus(reverse(v1643), load(tbl, 292 * VECWIDTH + tbloffset), times(v1643, load(tbl, 293 * VECWIDTH + tbloffset))); + real2 v913 = minusplus(v911, v912); + real2 v915 = minusplus(uminus(v911), v912); + real2 v1073 = minusplus(v1071, v1072); 
+ real2 v1075 = minusplus(uminus(v1071), v1072); + real2 v923 = timesminusplus(reverse(v913), load(tbl, 158 * VECWIDTH + tbloffset), times(v913, load(tbl, 159 * VECWIDTH + tbloffset))); + real2 v1083 = timesminusplus(reverse(v1073), load(tbl, 190 * VECWIDTH + tbloffset), times(v1073, load(tbl, 191 * VECWIDTH + tbloffset))); + real2 v1685 = plus(v923, v1083); + real2 v1679 = reverse(minus(v923, v1083)); + real2 v1681 = minusplus(v1679, v1680); + real2 v1683 = minusplus(uminus(v1679), v1680); + real2 v1697 = timesminusplus(reverse(v1683), load(tbl, 300 * VECWIDTH + tbloffset), times(v1683, load(tbl, 301 * VECWIDTH + tbloffset))); + real2 v1809 = plus(v1657, v1697); + real2 v1803 = reverse(minus(v1657, v1697)); + scatter(out, 26, 128, plus(v1808, v1809)); + real2 v1822 = minus(v1808, v1809); + scatter(out, 90, 128, timesminusplus(v1822, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1822), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1807 = minusplus(uminus(v1803), v1804); + real2 v1805 = minusplus(v1803, v1804); + scatter(out, 58, 128, timesminusplus(reverse(v1805), load(tbl, 318 * VECWIDTH + tbloffset), times(v1805, load(tbl, 319 * VECWIDTH + tbloffset)))); + scatter(out, 122, 128, timesminusplus(reverse(v1807), load(tbl, 320 * VECWIDTH + tbloffset), times(v1807, load(tbl, 321 * VECWIDTH + tbloffset)))); + real2 v1651 = timesminusplus(reverse(v1641), load(tbl, 290 * VECWIDTH + tbloffset), times(v1641, load(tbl, 291 * VECWIDTH + tbloffset))); + real2 v1691 = timesminusplus(reverse(v1681), load(tbl, 298 * VECWIDTH + tbloffset), times(v1681, load(tbl, 299 * VECWIDTH + tbloffset))); + real2 v1783 = plus(v1651, v1691); + real2 v1777 = reverse(minus(v1651, v1691)); + real2 v1779 = minusplus(v1777, v1778); + real2 v1781 = minusplus(uminus(v1777), v1778); + scatter(out, 106, 128, timesminusplus(reverse(v1781), load(tbl, 316 * VECWIDTH + tbloffset), times(v1781, load(tbl, 317 * VECWIDTH + tbloffset)))); + scatter(out, 42, 128, timesminusplus(reverse(v1779), 
load(tbl, 314 * VECWIDTH + tbloffset), times(v1779, load(tbl, 315 * VECWIDTH + tbloffset)))); + scatter(out, 10, 128, plus(v1782, v1783)); + real2 v1796 = minus(v1782, v1783); + scatter(out, 74, 128, timesminusplus(v1796, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1796), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1720 = minus(v1645, v1644); + real2 v1724 = plus(v1644, v1645); + real2 v1719 = reverse(minus(v1684, v1685)); + real2 v1725 = plus(v1684, v1685); + real2 v1745 = plus(v1724, v1725); + real2 v1739 = reverse(minus(v1724, v1725)); + scatter(out, 2, 128, plus(v1744, v1745)); + real2 v1758 = minus(v1744, v1745); + scatter(out, 66, 128, timesminusplus(v1758, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1758), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1741 = minusplus(v1739, v1740); + real2 v1743 = minusplus(uminus(v1739), v1740); + scatter(out, 98, 128, timesminusplus(reverse(v1743), load(tbl, 312 * VECWIDTH + tbloffset), times(v1743, load(tbl, 313 * VECWIDTH + tbloffset)))); + scatter(out, 34, 128, timesminusplus(reverse(v1741), load(tbl, 310 * VECWIDTH + tbloffset), times(v1741, load(tbl, 311 * VECWIDTH + tbloffset)))); + real2 v1723 = minusplus(uminus(v1719), v1720); + real2 v1721 = minusplus(v1719, v1720); + real2 v1737 = timesminusplus(reverse(v1723), load(tbl, 308 * VECWIDTH + tbloffset), times(v1723, load(tbl, 309 * VECWIDTH + tbloffset))); + scatter(out, 50, 128, plus(v1717, v1737)); + real2 v1770 = minus(v1717, v1737); + scatter(out, 114, 128, timesminusplus(v1770, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1770), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1731 = timesminusplus(reverse(v1721), load(tbl, 306 * VECWIDTH + tbloffset), times(v1721, load(tbl, 307 * VECWIDTH + tbloffset))); + scatter(out, 18, 128, plus(v1711, v1731)); + real2 v1764 = minus(v1711, v1731); + scatter(out, 82, 128, timesminusplus(v1764, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1764), load(tbl, 1 * VECWIDTH + tbloffset)))); + 
real2 v809 = timesminusplus(reverse(v795), load(tbl, 136 * VECWIDTH + tbloffset), times(v795, load(tbl, 137 * VECWIDTH + tbloffset))); + real2 v969 = timesminusplus(reverse(v955), load(tbl, 168 * VECWIDTH + tbloffset), times(v955, load(tbl, 169 * VECWIDTH + tbloffset))); + real2 v1850 = minus(v969, v809); + real2 v1854 = plus(v809, v969); + real2 v849 = timesminusplus(reverse(v835), load(tbl, 144 * VECWIDTH + tbloffset), times(v835, load(tbl, 145 * VECWIDTH + tbloffset))); + real2 v929 = timesminusplus(reverse(v915), load(tbl, 160 * VECWIDTH + tbloffset), times(v915, load(tbl, 161 * VECWIDTH + tbloffset))); + real2 v889 = timesminusplus(reverse(v875), load(tbl, 152 * VECWIDTH + tbloffset), times(v875, load(tbl, 153 * VECWIDTH + tbloffset))); + real2 v1089 = timesminusplus(reverse(v1075), load(tbl, 192 * VECWIDTH + tbloffset), times(v1075, load(tbl, 193 * VECWIDTH + tbloffset))); + real2 v1009 = timesminusplus(reverse(v995), load(tbl, 176 * VECWIDTH + tbloffset), times(v995, load(tbl, 177 * VECWIDTH + tbloffset))); + real2 v1890 = minus(v1009, v849); + real2 v1894 = plus(v849, v1009); + real2 v1849 = reverse(minus(v889, v1049)); + real2 v1855 = plus(v889, v1049); + real2 v1930 = minus(v1855, v1854); + real2 v1934 = plus(v1854, v1855); + real2 v1895 = plus(v929, v1089); + real2 v1889 = reverse(minus(v929, v1089)); + real2 v1929 = reverse(minus(v1894, v1895)); + real2 v1935 = plus(v1894, v1895); + real2 v1955 = plus(v1934, v1935); + real2 v1949 = reverse(minus(v1934, v1935)); + scatter(out, 6, 128, plus(v1954, v1955)); + real2 v1968 = minus(v1954, v1955); + scatter(out, 70, 128, timesminusplus(v1968, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1968), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1951 = minusplus(v1949, v1950); + scatter(out, 38, 128, timesminusplus(reverse(v1951), load(tbl, 346 * VECWIDTH + tbloffset), times(v1951, load(tbl, 347 * VECWIDTH + tbloffset)))); + real2 v1953 = minusplus(uminus(v1949), v1950); + scatter(out, 102, 128, 
timesminusplus(reverse(v1953), load(tbl, 348 * VECWIDTH + tbloffset), times(v1953, load(tbl, 349 * VECWIDTH + tbloffset)))); + real2 v1931 = minusplus(v1929, v1930); + real2 v1933 = minusplus(uminus(v1929), v1930); + real2 v1947 = timesminusplus(reverse(v1933), load(tbl, 344 * VECWIDTH + tbloffset), times(v1933, load(tbl, 345 * VECWIDTH + tbloffset))); + scatter(out, 54, 128, plus(v1927, v1947)); + real2 v1980 = minus(v1927, v1947); + scatter(out, 118, 128, timesminusplus(v1980, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1980), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1941 = timesminusplus(reverse(v1931), load(tbl, 342 * VECWIDTH + tbloffset), times(v1931, load(tbl, 343 * VECWIDTH + tbloffset))); + scatter(out, 22, 128, plus(v1921, v1941)); + real2 v1974 = minus(v1921, v1941); + scatter(out, 86, 128, timesminusplus(v1974, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1974), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1851 = minusplus(v1849, v1850); + real2 v1853 = minusplus(uminus(v1849), v1850); + real2 v1867 = timesminusplus(reverse(v1853), load(tbl, 328 * VECWIDTH + tbloffset), times(v1853, load(tbl, 329 * VECWIDTH + tbloffset))); + real2 v1891 = minusplus(v1889, v1890); + real2 v1893 = minusplus(uminus(v1889), v1890); + real2 v1907 = timesminusplus(reverse(v1893), load(tbl, 336 * VECWIDTH + tbloffset), times(v1893, load(tbl, 337 * VECWIDTH + tbloffset))); + real2 v2019 = plus(v1867, v1907); + real2 v2013 = reverse(minus(v1867, v1907)); + scatter(out, 30, 128, plus(v2018, v2019)); + real2 v2032 = minus(v2018, v2019); + scatter(out, 94, 128, timesminusplus(v2032, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2032), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2017 = minusplus(uminus(v2013), v2014); + scatter(out, 126, 128, timesminusplus(reverse(v2017), load(tbl, 356 * VECWIDTH + tbloffset), times(v2017, load(tbl, 357 * VECWIDTH + tbloffset)))); + real2 v2015 = minusplus(v2013, v2014); + scatter(out, 62, 128, 
timesminusplus(reverse(v2015), load(tbl, 354 * VECWIDTH + tbloffset), times(v2015, load(tbl, 355 * VECWIDTH + tbloffset)))); + real2 v1861 = timesminusplus(reverse(v1851), load(tbl, 326 * VECWIDTH + tbloffset), times(v1851, load(tbl, 327 * VECWIDTH + tbloffset))); + real2 v1901 = timesminusplus(reverse(v1891), load(tbl, 334 * VECWIDTH + tbloffset), times(v1891, load(tbl, 335 * VECWIDTH + tbloffset))); + real2 v1993 = plus(v1861, v1901); + real2 v1987 = reverse(minus(v1861, v1901)); + scatter(out, 14, 128, plus(v1992, v1993)); + real2 v2006 = minus(v1992, v1993); + scatter(out, 78, 128, timesminusplus(v2006, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2006), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1991 = minusplus(uminus(v1987), v1988); + scatter(out, 110, 128, timesminusplus(reverse(v1991), load(tbl, 352 * VECWIDTH + tbloffset), times(v1991, load(tbl, 353 * VECWIDTH + tbloffset)))); + real2 v1989 = minusplus(v1987, v1988); + scatter(out, 46, 128, timesminusplus(reverse(v1989), load(tbl, 350 * VECWIDTH + tbloffset), times(v1989, load(tbl, 351 * VECWIDTH + tbloffset)))); + real2 v593 = minusplus(v591, v592); + real2 v595 = minusplus(uminus(v591), v592); + real2 v473 = minusplus(v471, v472); + real2 v475 = minusplus(uminus(v471), v472); + real2 v555 = minusplus(uminus(v551), v552); + real2 v553 = minusplus(v551, v552); + real2 v609 = timesminusplus(reverse(v595), load(tbl, 96 * VECWIDTH + tbloffset), times(v595, load(tbl, 97 * VECWIDTH + tbloffset))); + real2 v195 = minusplus(uminus(v191), v192); + real2 v193 = minusplus(v191, v192); + real2 v275 = minusplus(uminus(v271), v272); + real2 v273 = minusplus(v271, v272); + real2 v673 = minusplus(v671, v672); + real2 v675 = minusplus(uminus(v671), v672); + real2 v689 = timesminusplus(reverse(v675), load(tbl, 112 * VECWIDTH + tbloffset), times(v675, load(tbl, 113 * VECWIDTH + tbloffset))); + real2 v209 = timesminusplus(reverse(v195), load(tbl, 16 * VECWIDTH + tbloffset), times(v195, load(tbl, 17 * VECWIDTH + 
tbloffset))); + real2 v289 = timesminusplus(reverse(v275), load(tbl, 32 * VECWIDTH + tbloffset), times(v275, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v755 = minusplus(uminus(v751), v752); + real2 v753 = minusplus(v751, v752); + real2 v435 = minusplus(uminus(v431), v432); + real2 v433 = minusplus(v431, v432); + real2 v513 = minusplus(v511, v512); + real2 v515 = minusplus(uminus(v511), v512); + real2 v529 = timesminusplus(reverse(v515), load(tbl, 80 * VECWIDTH + tbloffset), times(v515, load(tbl, 81 * VECWIDTH + tbloffset))); + real2 v353 = minusplus(v351, v352); + real2 v355 = minusplus(uminus(v351), v352); + real2 v369 = timesminusplus(reverse(v355), load(tbl, 48 * VECWIDTH + tbloffset), times(v355, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v2631 = plus(v369, v689); + real2 v2625 = reverse(minus(v369, v689)); + real2 v449 = timesminusplus(reverse(v435), load(tbl, 64 * VECWIDTH + tbloffset), times(v435, load(tbl, 65 * VECWIDTH + tbloffset))); + real2 v2710 = plus(v289, v609); + real2 v2706 = minus(v609, v289); + real2 v2630 = plus(v209, v529); + real2 v2626 = minus(v529, v209); + real2 v2790 = plus(v2630, v2631); + real2 v2786 = minus(v2631, v2630); + real2 v713 = minusplus(v711, v712); + real2 v715 = minusplus(uminus(v711), v712); + real2 v769 = timesminusplus(reverse(v755), load(tbl, 128 * VECWIDTH + tbloffset), times(v755, load(tbl, 129 * VECWIDTH + tbloffset))); + real2 v2705 = reverse(minus(v449, v769)); + real2 v2711 = plus(v449, v769); + real2 v313 = minusplus(v311, v312); + real2 v315 = minusplus(uminus(v311), v312); + real2 v393 = minusplus(v391, v392); + real2 v395 = minusplus(uminus(v391), v392); + real2 v409 = timesminusplus(reverse(v395), load(tbl, 56 * VECWIDTH + tbloffset), times(v395, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v729 = timesminusplus(reverse(v715), load(tbl, 120 * VECWIDTH + tbloffset), times(v715, load(tbl, 121 * VECWIDTH + tbloffset))); + real2 v329 = timesminusplus(reverse(v315), load(tbl, 40 * VECWIDTH + 
tbloffset), times(v315, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v489 = timesminusplus(reverse(v475), load(tbl, 72 * VECWIDTH + tbloffset), times(v475, load(tbl, 73 * VECWIDTH + tbloffset))); + real2 v153 = minusplus(v151, v152); + real2 v155 = minusplus(uminus(v151), v152); + real2 v169 = timesminusplus(reverse(v155), load(tbl, 8 * VECWIDTH + tbloffset), times(v155, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v2586 = minus(v489, v169); + real2 v2590 = plus(v169, v489); + real2 v233 = minusplus(v231, v232); + real2 v235 = minusplus(uminus(v231), v232); + real2 v633 = minusplus(v631, v632); + real2 v635 = minusplus(uminus(v631), v632); + real2 v649 = timesminusplus(reverse(v635), load(tbl, 104 * VECWIDTH + tbloffset), times(v635, load(tbl, 105 * VECWIDTH + tbloffset))); + real2 v249 = timesminusplus(reverse(v235), load(tbl, 24 * VECWIDTH + tbloffset), times(v235, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v569 = timesminusplus(reverse(v555), load(tbl, 88 * VECWIDTH + tbloffset), times(v555, load(tbl, 89 * VECWIDTH + tbloffset))); + real2 v2670 = plus(v249, v569); + real2 v2666 = minus(v569, v249); + real2 v2785 = reverse(minus(v2710, v2711)); + real2 v2791 = plus(v2710, v2711); + real2 v2825 = reverse(minus(v2790, v2791)); + real2 v2831 = plus(v2790, v2791); + real2 v2671 = plus(v409, v729); + real2 v2665 = reverse(minus(v409, v729)); + real2 v2745 = reverse(minus(v2670, v2671)); + real2 v2751 = plus(v2670, v2671); + real2 v2806 = minus(v2731, v2730); + real2 v2810 = plus(v2730, v2731); + real2 v2846 = minus(v2811, v2810); + real2 v2850 = plus(v2810, v2811); + real2 v2591 = plus(v329, v649); + real2 v2585 = reverse(minus(v329, v649)); + real2 v2750 = plus(v2590, v2591); + real2 v2746 = minus(v2591, v2590); + real2 v2830 = plus(v2750, v2751); + real2 v2826 = minus(v2751, v2750); + real2 v2845 = reverse(minus(v2830, v2831)); + real2 v2851 = plus(v2830, v2831); + scatter(out, 3, 128, plus(v2850, v2851)); + real2 v2864 = minus(v2850, v2851); + 
scatter(out, 67, 128, timesminusplus(v2864, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2864), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2849 = minusplus(uminus(v2845), v2846); + real2 v2847 = minusplus(v2845, v2846); + scatter(out, 35, 128, timesminusplus(reverse(v2847), load(tbl, 506 * VECWIDTH + tbloffset), times(v2847, load(tbl, 507 * VECWIDTH + tbloffset)))); + scatter(out, 99, 128, timesminusplus(reverse(v2849), load(tbl, 508 * VECWIDTH + tbloffset), times(v2849, load(tbl, 509 * VECWIDTH + tbloffset)))); + real2 v2827 = minusplus(v2825, v2826); + real2 v2829 = minusplus(uminus(v2825), v2826); + real2 v2837 = timesminusplus(reverse(v2827), load(tbl, 502 * VECWIDTH + tbloffset), times(v2827, load(tbl, 503 * VECWIDTH + tbloffset))); + real2 v2809 = minusplus(uminus(v2805), v2806); + real2 v2807 = minusplus(v2805, v2806); + real2 v2817 = timesminusplus(reverse(v2807), load(tbl, 498 * VECWIDTH + tbloffset), times(v2807, load(tbl, 499 * VECWIDTH + tbloffset))); + scatter(out, 19, 128, plus(v2817, v2837)); + real2 v2870 = minus(v2817, v2837); + scatter(out, 83, 128, timesminusplus(v2870, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2870), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2823 = timesminusplus(reverse(v2809), load(tbl, 500 * VECWIDTH + tbloffset), times(v2809, load(tbl, 501 * VECWIDTH + tbloffset))); + real2 v2843 = timesminusplus(reverse(v2829), load(tbl, 504 * VECWIDTH + tbloffset), times(v2829, load(tbl, 505 * VECWIDTH + tbloffset))); + scatter(out, 51, 128, plus(v2823, v2843)); + real2 v2876 = minus(v2823, v2843); + scatter(out, 115, 128, timesminusplus(v2876, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2876), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2787 = minusplus(v2785, v2786); + real2 v2789 = minusplus(uminus(v2785), v2786); + real2 v2803 = timesminusplus(reverse(v2789), load(tbl, 496 * VECWIDTH + tbloffset), times(v2789, load(tbl, 497 * VECWIDTH + tbloffset))); + real2 v2727 = minusplus(v2725, v2726); + 
real2 v2729 = minusplus(uminus(v2725), v2726); + real2 v2743 = timesminusplus(reverse(v2729), load(tbl, 484 * VECWIDTH + tbloffset), times(v2729, load(tbl, 485 * VECWIDTH + tbloffset))); + real2 v2914 = plus(v2743, v2783); + real2 v2910 = minus(v2783, v2743); + real2 v2749 = minusplus(uminus(v2745), v2746); + real2 v2747 = minusplus(v2745, v2746); + real2 v2763 = timesminusplus(reverse(v2749), load(tbl, 488 * VECWIDTH + tbloffset), times(v2749, load(tbl, 489 * VECWIDTH + tbloffset))); + real2 v2909 = reverse(minus(v2763, v2803)); + real2 v2915 = plus(v2763, v2803); + scatter(out, 27, 128, plus(v2914, v2915)); + real2 v2928 = minus(v2914, v2915); + scatter(out, 91, 128, timesminusplus(v2928, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2928), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2913 = minusplus(uminus(v2909), v2910); + scatter(out, 123, 128, timesminusplus(reverse(v2913), load(tbl, 516 * VECWIDTH + tbloffset), times(v2913, load(tbl, 517 * VECWIDTH + tbloffset)))); + real2 v2911 = minusplus(v2909, v2910); + scatter(out, 59, 128, timesminusplus(reverse(v2911), load(tbl, 514 * VECWIDTH + tbloffset), times(v2911, load(tbl, 515 * VECWIDTH + tbloffset)))); + real2 v2737 = timesminusplus(reverse(v2727), load(tbl, 482 * VECWIDTH + tbloffset), times(v2727, load(tbl, 483 * VECWIDTH + tbloffset))); + real2 v2888 = plus(v2737, v2777); + real2 v2884 = minus(v2777, v2737); + real2 v2797 = timesminusplus(reverse(v2787), load(tbl, 494 * VECWIDTH + tbloffset), times(v2787, load(tbl, 495 * VECWIDTH + tbloffset))); + real2 v2757 = timesminusplus(reverse(v2747), load(tbl, 486 * VECWIDTH + tbloffset), times(v2747, load(tbl, 487 * VECWIDTH + tbloffset))); + real2 v2889 = plus(v2757, v2797); + real2 v2883 = reverse(minus(v2757, v2797)); + scatter(out, 11, 128, plus(v2888, v2889)); + real2 v2902 = minus(v2888, v2889); + scatter(out, 75, 128, timesminusplus(v2902, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2902), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 
v2887 = minusplus(uminus(v2883), v2884); + scatter(out, 107, 128, timesminusplus(reverse(v2887), load(tbl, 512 * VECWIDTH + tbloffset), times(v2887, load(tbl, 513 * VECWIDTH + tbloffset)))); + real2 v2885 = minusplus(v2883, v2884); + scatter(out, 43, 128, timesminusplus(reverse(v2885), load(tbl, 510 * VECWIDTH + tbloffset), times(v2885, load(tbl, 511 * VECWIDTH + tbloffset)))); + real2 v2669 = minusplus(uminus(v2665), v2666); + real2 v2667 = minusplus(v2665, v2666); + real2 v2707 = minusplus(v2705, v2706); + real2 v2709 = minusplus(uminus(v2705), v2706); + real2 v2717 = timesminusplus(reverse(v2707), load(tbl, 478 * VECWIDTH + tbloffset), times(v2707, load(tbl, 479 * VECWIDTH + tbloffset))); + real2 v2627 = minusplus(v2625, v2626); + real2 v2629 = minusplus(uminus(v2625), v2626); + real2 v2637 = timesminusplus(reverse(v2627), load(tbl, 462 * VECWIDTH + tbloffset), times(v2627, load(tbl, 463 * VECWIDTH + tbloffset))); + real2 v2961 = plus(v2637, v2717); + real2 v2955 = reverse(minus(v2637, v2717)); + real2 v2649 = minusplus(uminus(v2645), v2646); + real2 v2647 = minusplus(v2645, v2646); + real2 v2569 = minusplus(uminus(v2565), v2566); + real2 v2567 = minusplus(v2565, v2566); + real2 v2577 = timesminusplus(reverse(v2567), load(tbl, 450 * VECWIDTH + tbloffset), times(v2567, load(tbl, 451 * VECWIDTH + tbloffset))); + real2 v2657 = timesminusplus(reverse(v2647), load(tbl, 466 * VECWIDTH + tbloffset), times(v2647, load(tbl, 467 * VECWIDTH + tbloffset))); + real2 v2936 = minus(v2657, v2577); + real2 v2940 = plus(v2577, v2657); + real2 v2976 = minus(v2941, v2940); + real2 v2980 = plus(v2940, v2941); + real2 v2677 = timesminusplus(reverse(v2667), load(tbl, 470 * VECWIDTH + tbloffset), times(v2667, load(tbl, 471 * VECWIDTH + tbloffset))); + real2 v2587 = minusplus(v2585, v2586); + real2 v2589 = minusplus(uminus(v2585), v2586); + real2 v2597 = timesminusplus(reverse(v2587), load(tbl, 454 * VECWIDTH + tbloffset), times(v2587, load(tbl, 455 * VECWIDTH + tbloffset))); + real2 
v2956 = minus(v2677, v2597); + real2 v2960 = plus(v2597, v2677); + real2 v2975 = reverse(minus(v2960, v2961)); + real2 v2981 = plus(v2960, v2961); + scatter(out, 7, 128, plus(v2980, v2981)); + real2 v2994 = minus(v2980, v2981); + scatter(out, 71, 128, timesminusplus(v2994, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2994), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2979 = minusplus(uminus(v2975), v2976); + scatter(out, 103, 128, timesminusplus(reverse(v2979), load(tbl, 528 * VECWIDTH + tbloffset), times(v2979, load(tbl, 529 * VECWIDTH + tbloffset)))); + real2 v2977 = minusplus(v2975, v2976); + scatter(out, 39, 128, timesminusplus(reverse(v2977), load(tbl, 526 * VECWIDTH + tbloffset), times(v2977, load(tbl, 527 * VECWIDTH + tbloffset)))); + real2 v2939 = minusplus(uminus(v2935), v2936); + real2 v2937 = minusplus(v2935, v2936); + real2 v2953 = timesminusplus(reverse(v2939), load(tbl, 520 * VECWIDTH + tbloffset), times(v2939, load(tbl, 521 * VECWIDTH + tbloffset))); + real2 v2957 = minusplus(v2955, v2956); + real2 v2959 = minusplus(uminus(v2955), v2956); + real2 v2973 = timesminusplus(reverse(v2959), load(tbl, 524 * VECWIDTH + tbloffset), times(v2959, load(tbl, 525 * VECWIDTH + tbloffset))); + scatter(out, 55, 128, plus(v2953, v2973)); + real2 v3006 = minus(v2953, v2973); + scatter(out, 119, 128, timesminusplus(v3006, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3006), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2947 = timesminusplus(reverse(v2937), load(tbl, 518 * VECWIDTH + tbloffset), times(v2937, load(tbl, 519 * VECWIDTH + tbloffset))); + real2 v2967 = timesminusplus(reverse(v2957), load(tbl, 522 * VECWIDTH + tbloffset), times(v2957, load(tbl, 523 * VECWIDTH + tbloffset))); + scatter(out, 23, 128, plus(v2947, v2967)); + real2 v3000 = minus(v2947, v2967); + scatter(out, 87, 128, timesminusplus(v3000, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3000), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2663 = 
timesminusplus(reverse(v2649), load(tbl, 468 * VECWIDTH + tbloffset), times(v2649, load(tbl, 469 * VECWIDTH + tbloffset))); + real2 v2583 = timesminusplus(reverse(v2569), load(tbl, 452 * VECWIDTH + tbloffset), times(v2569, load(tbl, 453 * VECWIDTH + tbloffset))); + real2 v3014 = minus(v2663, v2583); + real2 v3018 = plus(v2583, v2663); + real2 v3015 = minusplus(v3013, v3014); + real2 v3017 = minusplus(uminus(v3013), v3014); + real2 v2643 = timesminusplus(reverse(v2629), load(tbl, 464 * VECWIDTH + tbloffset), times(v2629, load(tbl, 465 * VECWIDTH + tbloffset))); + real2 v2723 = timesminusplus(reverse(v2709), load(tbl, 480 * VECWIDTH + tbloffset), times(v2709, load(tbl, 481 * VECWIDTH + tbloffset))); + real2 v3039 = plus(v2643, v2723); + real2 v3033 = reverse(minus(v2643, v2723)); + real2 v2683 = timesminusplus(reverse(v2669), load(tbl, 472 * VECWIDTH + tbloffset), times(v2669, load(tbl, 473 * VECWIDTH + tbloffset))); + real2 v3031 = timesminusplus(reverse(v3017), load(tbl, 532 * VECWIDTH + tbloffset), times(v3017, load(tbl, 533 * VECWIDTH + tbloffset))); + real2 v2603 = timesminusplus(reverse(v2589), load(tbl, 456 * VECWIDTH + tbloffset), times(v2589, load(tbl, 457 * VECWIDTH + tbloffset))); + real2 v3034 = minus(v2683, v2603); + real2 v3038 = plus(v2603, v2683); + real2 v3037 = minusplus(uminus(v3033), v3034); + real2 v3035 = minusplus(v3033, v3034); + real2 v3051 = timesminusplus(reverse(v3037), load(tbl, 536 * VECWIDTH + tbloffset), times(v3037, load(tbl, 537 * VECWIDTH + tbloffset))); + scatter(out, 63, 128, plus(v3031, v3051)); + real2 v3084 = minus(v3031, v3051); + scatter(out, 127, 128, timesminusplus(v3084, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3084), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v3025 = timesminusplus(reverse(v3015), load(tbl, 530 * VECWIDTH + tbloffset), times(v3015, load(tbl, 531 * VECWIDTH + tbloffset))); + real2 v3045 = timesminusplus(reverse(v3035), load(tbl, 534 * VECWIDTH + tbloffset), times(v3035, load(tbl, 535 * 
VECWIDTH + tbloffset))); + scatter(out, 31, 128, plus(v3025, v3045)); + real2 v3078 = minus(v3025, v3045); + scatter(out, 95, 128, timesminusplus(v3078, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3078), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v3058 = plus(v3018, v3019); + real2 v3054 = minus(v3019, v3018); + real2 v3053 = reverse(minus(v3038, v3039)); + real2 v3059 = plus(v3038, v3039); + real2 v3055 = minusplus(v3053, v3054); + scatter(out, 47, 128, timesminusplus(reverse(v3055), load(tbl, 538 * VECWIDTH + tbloffset), times(v3055, load(tbl, 539 * VECWIDTH + tbloffset)))); + real2 v3057 = minusplus(uminus(v3053), v3054); + scatter(out, 111, 128, timesminusplus(reverse(v3057), load(tbl, 540 * VECWIDTH + tbloffset), times(v3057, load(tbl, 541 * VECWIDTH + tbloffset)))); + scatter(out, 15, 128, plus(v3058, v3059)); + real2 v3072 = minus(v3058, v3059); + scatter(out, 79, 128, timesminusplus(v3072, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3072), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v683 = timesminusplus(reverse(v673), load(tbl, 110 * VECWIDTH + tbloffset), times(v673, load(tbl, 111 * VECWIDTH + tbloffset))); + real2 v363 = timesminusplus(reverse(v353), load(tbl, 46 * VECWIDTH + tbloffset), times(v353, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v2105 = plus(v363, v683); + real2 v2099 = reverse(minus(v363, v683)); + real2 v283 = timesminusplus(reverse(v273), load(tbl, 30 * VECWIDTH + tbloffset), times(v273, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v723 = timesminusplus(reverse(v713), load(tbl, 118 * VECWIDTH + tbloffset), times(v713, load(tbl, 119 * VECWIDTH + tbloffset))); + real2 v403 = timesminusplus(reverse(v393), load(tbl, 54 * VECWIDTH + tbloffset), times(v393, load(tbl, 55 * VECWIDTH + tbloffset))); + real2 v603 = timesminusplus(reverse(v593), load(tbl, 94 * VECWIDTH + tbloffset), times(v593, load(tbl, 95 * VECWIDTH + tbloffset))); + real2 v2180 = minus(v603, v283); + real2 v2184 = plus(v283, v603); + real2 v2145 = 
plus(v403, v723); + real2 v2139 = reverse(minus(v403, v723)); + real2 v543 = timesminusplus(reverse(v533), load(tbl, 82 * VECWIDTH + tbloffset), times(v533, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v383 = timesminusplus(reverse(v373), load(tbl, 50 * VECWIDTH + tbloffset), times(v373, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v703 = timesminusplus(reverse(v693), load(tbl, 114 * VECWIDTH + tbloffset), times(v693, load(tbl, 115 * VECWIDTH + tbloffset))); + real2 v2125 = plus(v383, v703); + real2 v2119 = reverse(minus(v383, v703)); + real2 v223 = timesminusplus(reverse(v213), load(tbl, 18 * VECWIDTH + tbloffset), times(v213, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v2120 = minus(v543, v223); + real2 v2124 = plus(v223, v543); + real2 v443 = timesminusplus(reverse(v433), load(tbl, 62 * VECWIDTH + tbloffset), times(v433, load(tbl, 63 * VECWIDTH + tbloffset))); + real2 v203 = timesminusplus(reverse(v193), load(tbl, 14 * VECWIDTH + tbloffset), times(v193, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v763 = timesminusplus(reverse(v753), load(tbl, 126 * VECWIDTH + tbloffset), times(v753, load(tbl, 127 * VECWIDTH + tbloffset))); + real2 v2179 = reverse(minus(v443, v763)); + real2 v2185 = plus(v443, v763); + real2 v523 = timesminusplus(reverse(v513), load(tbl, 78 * VECWIDTH + tbloffset), times(v513, load(tbl, 79 * VECWIDTH + tbloffset))); + real2 v2100 = minus(v523, v203); + real2 v2104 = plus(v203, v523); + real2 v2264 = plus(v2104, v2105); + real2 v2260 = minus(v2105, v2104); + real2 v643 = timesminusplus(reverse(v633), load(tbl, 102 * VECWIDTH + tbloffset), times(v633, load(tbl, 103 * VECWIDTH + tbloffset))); + real2 v2265 = plus(v2184, v2185); + real2 v2259 = reverse(minus(v2184, v2185)); + real2 v563 = timesminusplus(reverse(v553), load(tbl, 86 * VECWIDTH + tbloffset), times(v553, load(tbl, 87 * VECWIDTH + tbloffset))); + real2 v243 = timesminusplus(reverse(v233), load(tbl, 22 * VECWIDTH + tbloffset), times(v233, load(tbl, 23 * VECWIDTH + 
tbloffset))); + real2 v2144 = plus(v243, v563); + real2 v2140 = minus(v563, v243); + real2 v143 = timesminusplus(reverse(v133), load(tbl, 2 * VECWIDTH + tbloffset), times(v133, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v183 = timesminusplus(reverse(v173), load(tbl, 10 * VECWIDTH + tbloffset), times(v173, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v2084 = plus(v183, v503); + real2 v2080 = minus(v503, v183); + real2 v163 = timesminusplus(reverse(v153), load(tbl, 6 * VECWIDTH + tbloffset), times(v153, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v303 = timesminusplus(reverse(v293), load(tbl, 34 * VECWIDTH + tbloffset), times(v293, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v623 = timesminusplus(reverse(v613), load(tbl, 98 * VECWIDTH + tbloffset), times(v613, load(tbl, 99 * VECWIDTH + tbloffset))); + real2 v2039 = reverse(minus(v303, v623)); + real2 v2045 = plus(v303, v623); + real2 v463 = timesminusplus(reverse(v453), load(tbl, 66 * VECWIDTH + tbloffset), times(v453, load(tbl, 67 * VECWIDTH + tbloffset))); + real2 v2044 = plus(v143, v463); + real2 v2040 = minus(v463, v143); + real2 v2204 = plus(v2044, v2045); + real2 v2200 = minus(v2045, v2044); + real2 v323 = timesminusplus(reverse(v313), load(tbl, 38 * VECWIDTH + tbloffset), times(v313, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v2205 = plus(v2124, v2125); + real2 v2199 = reverse(minus(v2124, v2125)); + real2 v2280 = minus(v2205, v2204); + real2 v2284 = plus(v2204, v2205); + real2 v2225 = plus(v2144, v2145); + real2 v2219 = reverse(minus(v2144, v2145)); + real2 v2305 = plus(v2264, v2265); + real2 v2299 = reverse(minus(v2264, v2265)); + real2 v2240 = minus(v2085, v2084); + real2 v2244 = plus(v2084, v2085); + real2 v2279 = reverse(minus(v2244, v2245)); + real2 v2285 = plus(v2244, v2245); + real2 v2281 = minusplus(v2279, v2280); + real2 v2283 = minusplus(uminus(v2279), v2280); + real2 v2291 = timesminusplus(reverse(v2281), load(tbl, 406 * VECWIDTH + tbloffset), times(v2281, load(tbl, 407 * 
VECWIDTH + tbloffset))); + real2 v483 = timesminusplus(reverse(v473), load(tbl, 70 * VECWIDTH + tbloffset), times(v473, load(tbl, 71 * VECWIDTH + tbloffset))); + real2 v2060 = minus(v483, v163); + real2 v2064 = plus(v163, v483); + real2 v2065 = plus(v323, v643); + real2 v2059 = reverse(minus(v323, v643)); + real2 v2220 = minus(v2065, v2064); + real2 v2224 = plus(v2064, v2065); + real2 v2304 = plus(v2224, v2225); + real2 v2300 = minus(v2225, v2224); + real2 v2301 = minusplus(v2299, v2300); + real2 v2303 = minusplus(uminus(v2299), v2300); + real2 v2311 = timesminusplus(reverse(v2301), load(tbl, 410 * VECWIDTH + tbloffset), times(v2301, load(tbl, 411 * VECWIDTH + tbloffset))); + scatter(out, 17, 128, plus(v2291, v2311)); + real2 v2344 = minus(v2291, v2311); + scatter(out, 81, 128, timesminusplus(v2344, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2344), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2297 = timesminusplus(reverse(v2283), load(tbl, 408 * VECWIDTH + tbloffset), times(v2283, load(tbl, 409 * VECWIDTH + tbloffset))); + real2 v2317 = timesminusplus(reverse(v2303), load(tbl, 412 * VECWIDTH + tbloffset), times(v2303, load(tbl, 413 * VECWIDTH + tbloffset))); + scatter(out, 49, 128, plus(v2297, v2317)); + real2 v2350 = minus(v2297, v2317); + scatter(out, 113, 128, timesminusplus(v2350, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2350), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2320 = minus(v2285, v2284); + real2 v2324 = plus(v2284, v2285); + real2 v2325 = plus(v2304, v2305); + real2 v2319 = reverse(minus(v2304, v2305)); + scatter(out, 1, 128, plus(v2324, v2325)); + real2 v2338 = minus(v2324, v2325); + scatter(out, 65, 128, timesminusplus(v2338, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2338), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2321 = minusplus(v2319, v2320); + scatter(out, 33, 128, timesminusplus(reverse(v2321), load(tbl, 414 * VECWIDTH + tbloffset), times(v2321, load(tbl, 415 * VECWIDTH + tbloffset)))); + real2 
v2323 = minusplus(uminus(v2319), v2320); + scatter(out, 97, 128, timesminusplus(reverse(v2323), load(tbl, 416 * VECWIDTH + tbloffset), times(v2323, load(tbl, 417 * VECWIDTH + tbloffset)))); + real2 v2201 = minusplus(v2199, v2200); + real2 v2203 = minusplus(uminus(v2199), v2200); + real2 v2263 = minusplus(uminus(v2259), v2260); + real2 v2261 = minusplus(v2259, v2260); + real2 v2243 = minusplus(uminus(v2239), v2240); + real2 v2241 = minusplus(v2239, v2240); + real2 v2257 = timesminusplus(reverse(v2243), load(tbl, 400 * VECWIDTH + tbloffset), times(v2243, load(tbl, 401 * VECWIDTH + tbloffset))); + real2 v2217 = timesminusplus(reverse(v2203), load(tbl, 392 * VECWIDTH + tbloffset), times(v2203, load(tbl, 393 * VECWIDTH + tbloffset))); + real2 v2388 = plus(v2217, v2257); + real2 v2384 = minus(v2257, v2217); + real2 v2277 = timesminusplus(reverse(v2263), load(tbl, 404 * VECWIDTH + tbloffset), times(v2263, load(tbl, 405 * VECWIDTH + tbloffset))); + real2 v2221 = minusplus(v2219, v2220); + real2 v2223 = minusplus(uminus(v2219), v2220); + real2 v2237 = timesminusplus(reverse(v2223), load(tbl, 396 * VECWIDTH + tbloffset), times(v2223, load(tbl, 397 * VECWIDTH + tbloffset))); + real2 v2389 = plus(v2237, v2277); + real2 v2383 = reverse(minus(v2237, v2277)); + scatter(out, 25, 128, plus(v2388, v2389)); + real2 v2402 = minus(v2388, v2389); + scatter(out, 89, 128, timesminusplus(v2402, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2402), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2385 = minusplus(v2383, v2384); + real2 v2387 = minusplus(uminus(v2383), v2384); + scatter(out, 121, 128, timesminusplus(reverse(v2387), load(tbl, 424 * VECWIDTH + tbloffset), times(v2387, load(tbl, 425 * VECWIDTH + tbloffset)))); + scatter(out, 57, 128, timesminusplus(reverse(v2385), load(tbl, 422 * VECWIDTH + tbloffset), times(v2385, load(tbl, 423 * VECWIDTH + tbloffset)))); + real2 v2251 = timesminusplus(reverse(v2241), load(tbl, 398 * VECWIDTH + tbloffset), times(v2241, load(tbl, 399 * 
VECWIDTH + tbloffset))); + real2 v2211 = timesminusplus(reverse(v2201), load(tbl, 390 * VECWIDTH + tbloffset), times(v2201, load(tbl, 391 * VECWIDTH + tbloffset))); + real2 v2358 = minus(v2251, v2211); + real2 v2362 = plus(v2211, v2251); + real2 v2271 = timesminusplus(reverse(v2261), load(tbl, 402 * VECWIDTH + tbloffset), times(v2261, load(tbl, 403 * VECWIDTH + tbloffset))); + real2 v2231 = timesminusplus(reverse(v2221), load(tbl, 394 * VECWIDTH + tbloffset), times(v2221, load(tbl, 395 * VECWIDTH + tbloffset))); + real2 v2357 = reverse(minus(v2231, v2271)); + real2 v2363 = plus(v2231, v2271); + scatter(out, 9, 128, plus(v2362, v2363)); + real2 v2376 = minus(v2362, v2363); + scatter(out, 73, 128, timesminusplus(v2376, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2376), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2361 = minusplus(uminus(v2357), v2358); + scatter(out, 105, 128, timesminusplus(reverse(v2361), load(tbl, 420 * VECWIDTH + tbloffset), times(v2361, load(tbl, 421 * VECWIDTH + tbloffset)))); + real2 v2359 = minusplus(v2357, v2358); + scatter(out, 41, 128, timesminusplus(reverse(v2359), load(tbl, 418 * VECWIDTH + tbloffset), times(v2359, load(tbl, 419 * VECWIDTH + tbloffset)))); + real2 v2121 = minusplus(v2119, v2120); + real2 v2123 = minusplus(uminus(v2119), v2120); + real2 v2083 = minusplus(uminus(v2079), v2080); + real2 v2081 = minusplus(v2079, v2080); + real2 v2091 = timesminusplus(reverse(v2081), load(tbl, 366 * VECWIDTH + tbloffset), times(v2081, load(tbl, 367 * VECWIDTH + tbloffset))); + real2 v2043 = minusplus(uminus(v2039), v2040); + real2 v2041 = minusplus(v2039, v2040); + real2 v2051 = timesminusplus(reverse(v2041), load(tbl, 358 * VECWIDTH + tbloffset), times(v2041, load(tbl, 359 * VECWIDTH + tbloffset))); + real2 v2131 = timesminusplus(reverse(v2121), load(tbl, 374 * VECWIDTH + tbloffset), times(v2121, load(tbl, 375 * VECWIDTH + tbloffset))); + real2 v2163 = minusplus(uminus(v2159), v2160); + real2 v2161 = minusplus(v2159, v2160); + 
real2 v2171 = timesminusplus(reverse(v2161), load(tbl, 382 * VECWIDTH + tbloffset), times(v2161, load(tbl, 383 * VECWIDTH + tbloffset))); + real2 v2409 = reverse(minus(v2091, v2171)); + real2 v2415 = plus(v2091, v2171); + real2 v2410 = minus(v2131, v2051); + real2 v2414 = plus(v2051, v2131); + real2 v2454 = plus(v2414, v2415); + real2 v2450 = minus(v2415, v2414); + real2 v2181 = minusplus(v2179, v2180); + real2 v2183 = minusplus(uminus(v2179), v2180); + real2 v2191 = timesminusplus(reverse(v2181), load(tbl, 386 * VECWIDTH + tbloffset), times(v2181, load(tbl, 387 * VECWIDTH + tbloffset))); + real2 v2103 = minusplus(uminus(v2099), v2100); + real2 v2101 = minusplus(v2099, v2100); + real2 v2111 = timesminusplus(reverse(v2101), load(tbl, 370 * VECWIDTH + tbloffset), times(v2101, load(tbl, 371 * VECWIDTH + tbloffset))); + real2 v2435 = plus(v2111, v2191); + real2 v2429 = reverse(minus(v2111, v2191)); + real2 v2141 = minusplus(v2139, v2140); + real2 v2143 = minusplus(uminus(v2139), v2140); + real2 v2151 = timesminusplus(reverse(v2141), load(tbl, 378 * VECWIDTH + tbloffset), times(v2141, load(tbl, 379 * VECWIDTH + tbloffset))); + real2 v2063 = minusplus(uminus(v2059), v2060); + real2 v2061 = minusplus(v2059, v2060); + real2 v2071 = timesminusplus(reverse(v2061), load(tbl, 362 * VECWIDTH + tbloffset), times(v2061, load(tbl, 363 * VECWIDTH + tbloffset))); + real2 v2434 = plus(v2071, v2151); + real2 v2430 = minus(v2151, v2071); + real2 v2455 = plus(v2434, v2435); + real2 v2449 = reverse(minus(v2434, v2435)); + scatter(out, 5, 128, plus(v2454, v2455)); + real2 v2468 = minus(v2454, v2455); + scatter(out, 69, 128, timesminusplus(v2468, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2468), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2451 = minusplus(v2449, v2450); + real2 v2453 = minusplus(uminus(v2449), v2450); + scatter(out, 101, 128, timesminusplus(reverse(v2453), load(tbl, 436 * VECWIDTH + tbloffset), times(v2453, load(tbl, 437 * VECWIDTH + tbloffset)))); + 
scatter(out, 37, 128, timesminusplus(reverse(v2451), load(tbl, 434 * VECWIDTH + tbloffset), times(v2451, load(tbl, 435 * VECWIDTH + tbloffset)))); + real2 v2411 = minusplus(v2409, v2410); + real2 v2413 = minusplus(uminus(v2409), v2410); + real2 v2433 = minusplus(uminus(v2429), v2430); + real2 v2431 = minusplus(v2429, v2430); + real2 v2421 = timesminusplus(reverse(v2411), load(tbl, 426 * VECWIDTH + tbloffset), times(v2411, load(tbl, 427 * VECWIDTH + tbloffset))); + real2 v2441 = timesminusplus(reverse(v2431), load(tbl, 430 * VECWIDTH + tbloffset), times(v2431, load(tbl, 431 * VECWIDTH + tbloffset))); + scatter(out, 21, 128, plus(v2421, v2441)); + real2 v2474 = minus(v2421, v2441); + scatter(out, 85, 128, timesminusplus(v2474, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2474), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2427 = timesminusplus(reverse(v2413), load(tbl, 428 * VECWIDTH + tbloffset), times(v2413, load(tbl, 429 * VECWIDTH + tbloffset))); + real2 v2447 = timesminusplus(reverse(v2433), load(tbl, 432 * VECWIDTH + tbloffset), times(v2433, load(tbl, 433 * VECWIDTH + tbloffset))); + scatter(out, 53, 128, plus(v2427, v2447)); + real2 v2480 = minus(v2427, v2447); + scatter(out, 117, 128, timesminusplus(v2480, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2480), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2057 = timesminusplus(reverse(v2043), load(tbl, 360 * VECWIDTH + tbloffset), times(v2043, load(tbl, 361 * VECWIDTH + tbloffset))); + real2 v2097 = timesminusplus(reverse(v2083), load(tbl, 368 * VECWIDTH + tbloffset), times(v2083, load(tbl, 369 * VECWIDTH + tbloffset))); + real2 v2157 = timesminusplus(reverse(v2143), load(tbl, 380 * VECWIDTH + tbloffset), times(v2143, load(tbl, 381 * VECWIDTH + tbloffset))); + real2 v2197 = timesminusplus(reverse(v2183), load(tbl, 388 * VECWIDTH + tbloffset), times(v2183, load(tbl, 389 * VECWIDTH + tbloffset))); + real2 v2117 = timesminusplus(reverse(v2103), load(tbl, 372 * VECWIDTH + tbloffset), 
times(v2103, load(tbl, 373 * VECWIDTH + tbloffset))); + real2 v2507 = reverse(minus(v2117, v2197)); + real2 v2513 = plus(v2117, v2197); + real2 v2137 = timesminusplus(reverse(v2123), load(tbl, 376 * VECWIDTH + tbloffset), times(v2123, load(tbl, 377 * VECWIDTH + tbloffset))); + real2 v2488 = minus(v2137, v2057); + real2 v2492 = plus(v2057, v2137); + real2 v2177 = timesminusplus(reverse(v2163), load(tbl, 384 * VECWIDTH + tbloffset), times(v2163, load(tbl, 385 * VECWIDTH + tbloffset))); + real2 v2493 = plus(v2097, v2177); + real2 v2487 = reverse(minus(v2097, v2177)); + real2 v2532 = plus(v2492, v2493); + real2 v2528 = minus(v2493, v2492); + real2 v2077 = timesminusplus(reverse(v2063), load(tbl, 364 * VECWIDTH + tbloffset), times(v2063, load(tbl, 365 * VECWIDTH + tbloffset))); + real2 v2512 = plus(v2077, v2157); + real2 v2508 = minus(v2157, v2077); + real2 v2527 = reverse(minus(v2512, v2513)); + real2 v2533 = plus(v2512, v2513); + real2 v2529 = minusplus(v2527, v2528); + real2 v2531 = minusplus(uminus(v2527), v2528); + scatter(out, 109, 128, timesminusplus(reverse(v2531), load(tbl, 448 * VECWIDTH + tbloffset), times(v2531, load(tbl, 449 * VECWIDTH + tbloffset)))); + scatter(out, 45, 128, timesminusplus(reverse(v2529), load(tbl, 446 * VECWIDTH + tbloffset), times(v2529, load(tbl, 447 * VECWIDTH + tbloffset)))); + scatter(out, 13, 128, plus(v2532, v2533)); + real2 v2546 = minus(v2532, v2533); + scatter(out, 77, 128, timesminusplus(v2546, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2546), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2509 = minusplus(v2507, v2508); + real2 v2511 = minusplus(uminus(v2507), v2508); + real2 v2491 = minusplus(uminus(v2487), v2488); + real2 v2489 = minusplus(v2487, v2488); + real2 v2499 = timesminusplus(reverse(v2489), load(tbl, 438 * VECWIDTH + tbloffset), times(v2489, load(tbl, 439 * VECWIDTH + tbloffset))); + real2 v2519 = timesminusplus(reverse(v2509), load(tbl, 442 * VECWIDTH + tbloffset), times(v2509, load(tbl, 443 * VECWIDTH 
+ tbloffset))); + scatter(out, 29, 128, plus(v2499, v2519)); + real2 v2552 = minus(v2499, v2519); + scatter(out, 93, 128, timesminusplus(v2552, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2552), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2505 = timesminusplus(reverse(v2491), load(tbl, 440 * VECWIDTH + tbloffset), times(v2491, load(tbl, 441 * VECWIDTH + tbloffset))); + real2 v2525 = timesminusplus(reverse(v2511), load(tbl, 444 * VECWIDTH + tbloffset), times(v2511, load(tbl, 445 * VECWIDTH + tbloffset))); + scatter(out, 61, 128, plus(v2505, v2525)); + real2 v2558 = minus(v2505, v2525); + scatter(out, 125, 128, timesminusplus(v2558, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2558), load(tbl, 1 * VECWIDTH + tbloffset)))); + // Pres : 76263 + } +} + +ALIGNED(8192) void tbut128b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const real *RESTRICT tbl, const int K) { + const int shift = %SHIFT% + LOG2VECWIDTH, k = 1 << (shift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * i0; + + // Pres : 148586 + real2 v56 = load(in, 54 << shift); + real2 v120 = load(in, 118 << shift); + real2 v571 = reverse(minus(v120, v56)); + real2 v577 = plus(v56, v120); + real2 v24 = load(in, 22 << shift); + real2 v88 = load(in, 86 << shift); + real2 v576 = plus(v24, v88); + real2 v572 = minus(v88, v24); + real2 v573 = minusplus(v571, v572); + real2 v575 = minusplus(uminus(v571), v572); + real2 v589 = timesminusplus(reverse(v575), load(tbl, 92 * VECWIDTH + tbloffset), times(v575, load(tbl, 93 * VECWIDTH + tbloffset))); + real2 v583 = timesminusplus(reverse(v573), load(tbl, 90 * VECWIDTH + tbloffset), times(v573, load(tbl, 91 * VECWIDTH + tbloffset))); + real2 v897 = plus(v576, v577); + real2 v891 = reverse(minus(v577, v576)); + real2 v8 = load(in, 6 << shift); + real2 v72 = load(in, 70 << shift); + 
real2 v252 = minus(v72, v8); + real2 v256 = plus(v8, v72); + real2 v104 = load(in, 102 << shift); + real2 v40 = load(in, 38 << shift); + real2 v251 = reverse(minus(v104, v40)); + real2 v257 = plus(v40, v104); + real2 v255 = minusplus(uminus(v251), v252); + real2 v253 = minusplus(v251, v252); + real2 v263 = timesminusplus(reverse(v253), load(tbl, 26 * VECWIDTH + tbloffset), times(v253, load(tbl, 27 * VECWIDTH + tbloffset))); + real2 v896 = plus(v256, v257); + real2 v892 = minus(v257, v256); + real2 v895 = minusplus(uminus(v891), v892); + real2 v893 = minusplus(v891, v892); + real2 v909 = timesminusplus(reverse(v895), load(tbl, 156 * VECWIDTH + tbloffset), times(v895, load(tbl, 157 * VECWIDTH + tbloffset))); + real2 v903 = timesminusplus(reverse(v893), load(tbl, 154 * VECWIDTH + tbloffset), times(v893, load(tbl, 155 * VECWIDTH + tbloffset))); + real2 v269 = timesminusplus(reverse(v255), load(tbl, 28 * VECWIDTH + tbloffset), times(v255, load(tbl, 29 * VECWIDTH + tbloffset))); + real2 v1216 = plus(v896, v897); + real2 v1212 = minus(v897, v896); + real2 v2160 = minus(v583, v263); + real2 v2164 = plus(v263, v583); + real2 v2686 = minus(v589, v269); + real2 v2690 = plus(v269, v589); + real2 v96 = load(in, 94 << shift); + real2 v32 = load(in, 30 << shift); + real2 v736 = plus(v32, v96); + real2 v732 = minus(v96, v32); + real2 v64 = load(in, 62 << shift); + real2 v128 = load(in, 126 << shift); + real2 v737 = plus(v64, v128); + real2 v731 = reverse(minus(v128, v64)); + real2 v1057 = plus(v736, v737); + real2 v1051 = reverse(minus(v737, v736)); + real2 v733 = minusplus(v731, v732); + real2 v735 = minusplus(uminus(v731), v732); + real2 v749 = timesminusplus(reverse(v735), load(tbl, 124 * VECWIDTH + tbloffset), times(v735, load(tbl, 125 * VECWIDTH + tbloffset))); + real2 v743 = timesminusplus(reverse(v733), load(tbl, 122 * VECWIDTH + tbloffset), times(v733, load(tbl, 123 * VECWIDTH + tbloffset))); + real2 v16 = load(in, 14 << shift); + real2 v80 = load(in, 78 << shift); + real2 
v412 = minus(v80, v16); + real2 v416 = plus(v16, v80); + real2 v112 = load(in, 110 << shift); + real2 v48 = load(in, 46 << shift); + real2 v417 = plus(v48, v112); + real2 v411 = reverse(minus(v112, v48)); + real2 v1056 = plus(v416, v417); + real2 v1052 = minus(v417, v416); + real2 v1055 = minusplus(uminus(v1051), v1052); + real2 v1053 = minusplus(v1051, v1052); + real2 v1063 = timesminusplus(reverse(v1053), load(tbl, 186 * VECWIDTH + tbloffset), times(v1053, load(tbl, 187 * VECWIDTH + tbloffset))); + real2 v1665 = plus(v903, v1063); + real2 v1659 = reverse(minus(v1063, v903)); + real2 v1069 = timesminusplus(reverse(v1055), load(tbl, 188 * VECWIDTH + tbloffset), times(v1055, load(tbl, 189 * VECWIDTH + tbloffset))); + real2 v1869 = reverse(minus(v1069, v909)); + real2 v1875 = plus(v909, v1069); + real2 v413 = minusplus(v411, v412); + real2 v415 = minusplus(uminus(v411), v412); + real2 v429 = timesminusplus(reverse(v415), load(tbl, 60 * VECWIDTH + tbloffset), times(v415, load(tbl, 61 * VECWIDTH + tbloffset))); + real2 v1217 = plus(v1056, v1057); + real2 v1211 = reverse(minus(v1057, v1056)); + real2 v1297 = plus(v1216, v1217); + real2 v1291 = reverse(minus(v1217, v1216)); + real2 v2691 = plus(v429, v749); + real2 v2685 = reverse(minus(v749, v429)); + real2 v2765 = reverse(minus(v2691, v2690)); + real2 v2771 = plus(v2690, v2691); + real2 v2689 = minusplus(uminus(v2685), v2686); + real2 v2687 = minusplus(v2685, v2686); + real2 v2703 = timesminusplus(reverse(v2689), load(tbl, 476 * VECWIDTH + tbloffset), times(v2689, load(tbl, 477 * VECWIDTH + tbloffset))); + real2 v2697 = timesminusplus(reverse(v2687), load(tbl, 474 * VECWIDTH + tbloffset), times(v2687, load(tbl, 475 * VECWIDTH + tbloffset))); + real2 v1215 = minusplus(uminus(v1211), v1212); + real2 v1213 = minusplus(v1211, v1212); + real2 v1223 = timesminusplus(reverse(v1213), load(tbl, 218 * VECWIDTH + tbloffset), times(v1213, load(tbl, 219 * VECWIDTH + tbloffset))); + real2 v1229 = timesminusplus(reverse(v1215), 
load(tbl, 220 * VECWIDTH + tbloffset), times(v1215, load(tbl, 221 * VECWIDTH + tbloffset))); + real2 v423 = timesminusplus(reverse(v413), load(tbl, 58 * VECWIDTH + tbloffset), times(v413, load(tbl, 59 * VECWIDTH + tbloffset))); + real2 v2165 = plus(v423, v743); + real2 v2159 = reverse(minus(v743, v423)); + real2 v2245 = plus(v2164, v2165); + real2 v2239 = reverse(minus(v2165, v2164)); + real2 v44 = load(in, 42 << shift); + real2 v108 = load(in, 106 << shift); + real2 v331 = reverse(minus(v108, v44)); + real2 v337 = plus(v44, v108); + real2 v76 = load(in, 74 << shift); + real2 v12 = load(in, 10 << shift); + real2 v336 = plus(v12, v76); + real2 v332 = minus(v76, v12); + real2 v976 = plus(v336, v337); + real2 v972 = minus(v337, v336); + real2 v335 = minusplus(uminus(v331), v332); + real2 v333 = minusplus(v331, v332); + real2 v343 = timesminusplus(reverse(v333), load(tbl, 42 * VECWIDTH + tbloffset), times(v333, load(tbl, 43 * VECWIDTH + tbloffset))); + real2 v349 = timesminusplus(reverse(v335), load(tbl, 44 * VECWIDTH + tbloffset), times(v335, load(tbl, 45 * VECWIDTH + tbloffset))); + real2 v124 = load(in, 122 << shift); + real2 v60 = load(in, 58 << shift); + real2 v651 = reverse(minus(v124, v60)); + real2 v657 = plus(v60, v124); + real2 v28 = load(in, 26 << shift); + real2 v92 = load(in, 90 << shift); + real2 v652 = minus(v92, v28); + real2 v656 = plus(v28, v92); + real2 v977 = plus(v656, v657); + real2 v971 = reverse(minus(v657, v656)); + real2 v973 = minusplus(v971, v972); + real2 v975 = minusplus(uminus(v971), v972); + real2 v983 = timesminusplus(reverse(v973), load(tbl, 170 * VECWIDTH + tbloffset), times(v973, load(tbl, 171 * VECWIDTH + tbloffset))); + real2 v1131 = reverse(minus(v977, v976)); + real2 v1137 = plus(v976, v977); + real2 v655 = minusplus(uminus(v651), v652); + real2 v653 = minusplus(v651, v652); + real2 v669 = timesminusplus(reverse(v655), load(tbl, 108 * VECWIDTH + tbloffset), times(v655, load(tbl, 109 * VECWIDTH + tbloffset))); + real2 v663 = 
timesminusplus(reverse(v653), load(tbl, 106 * VECWIDTH + tbloffset), times(v653, load(tbl, 107 * VECWIDTH + tbloffset))); + real2 v2079 = reverse(minus(v663, v343)); + real2 v2085 = plus(v343, v663); + real2 v2605 = reverse(minus(v669, v349)); + real2 v2611 = plus(v349, v669); + real2 v989 = timesminusplus(reverse(v975), load(tbl, 172 * VECWIDTH + tbloffset), times(v975, load(tbl, 173 * VECWIDTH + tbloffset))); + real2 v20 = load(in, 18 << shift); + real2 v84 = load(in, 82 << shift); + real2 v496 = plus(v20, v84); + real2 v492 = minus(v84, v20); + real2 v52 = load(in, 50 << shift); + real2 v116 = load(in, 114 << shift); + real2 v491 = reverse(minus(v116, v52)); + real2 v497 = plus(v52, v116); + real2 v817 = plus(v496, v497); + real2 v811 = reverse(minus(v497, v496)); + real2 v493 = minusplus(v491, v492); + real2 v495 = minusplus(uminus(v491), v492); + real2 v509 = timesminusplus(reverse(v495), load(tbl, 76 * VECWIDTH + tbloffset), times(v495, load(tbl, 77 * VECWIDTH + tbloffset))); + real2 v503 = timesminusplus(reverse(v493), load(tbl, 74 * VECWIDTH + tbloffset), times(v493, load(tbl, 75 * VECWIDTH + tbloffset))); + real2 v36 = load(in, 34 << shift); + real2 v100 = load(in, 98 << shift); + real2 v171 = reverse(minus(v100, v36)); + real2 v177 = plus(v36, v100); + real2 v68 = load(in, 66 << shift); + real2 v4 = load(in, 2 << shift); + real2 v176 = plus(v4, v68); + real2 v172 = minus(v68, v4); + real2 v816 = plus(v176, v177); + real2 v812 = minus(v177, v176); + real2 v1136 = plus(v816, v817); + real2 v1132 = minus(v817, v816); + real2 v1133 = minusplus(v1131, v1132); + real2 v1135 = minusplus(uminus(v1131), v1132); + real2 v1149 = timesminusplus(reverse(v1135), load(tbl, 204 * VECWIDTH + tbloffset), times(v1135, load(tbl, 205 * VECWIDTH + tbloffset))); + real2 v1296 = plus(v1136, v1137); + real2 v1292 = minus(v1137, v1136); + real2 v1295 = minusplus(uminus(v1291), v1292); + real2 v1293 = minusplus(v1291, v1292); + real2 v1303 = timesminusplus(reverse(v1293), load(tbl, 
234 * VECWIDTH + tbloffset), times(v1293, load(tbl, 235 * VECWIDTH + tbloffset))); + real2 v1331 = reverse(minus(v1297, v1296)); + real2 v1337 = plus(v1296, v1297); + real2 v173 = minusplus(v171, v172); + real2 v175 = minusplus(uminus(v171), v172); + real2 v189 = timesminusplus(reverse(v175), load(tbl, 12 * VECWIDTH + tbloffset), times(v175, load(tbl, 13 * VECWIDTH + tbloffset))); + real2 v1309 = timesminusplus(reverse(v1295), load(tbl, 236 * VECWIDTH + tbloffset), times(v1295, load(tbl, 237 * VECWIDTH + tbloffset))); + real2 v815 = minusplus(uminus(v811), v812); + real2 v813 = minusplus(v811, v812); + real2 v1143 = timesminusplus(reverse(v1133), load(tbl, 202 * VECWIDTH + tbloffset), times(v1133, load(tbl, 203 * VECWIDTH + tbloffset))); + real2 v1541 = reverse(minus(v1229, v1149)); + real2 v1547 = plus(v1149, v1229); + real2 v2610 = plus(v189, v509); + real2 v2606 = minus(v509, v189); + real2 v2770 = plus(v2610, v2611); + real2 v2766 = minus(v2611, v2610); + real2 v823 = timesminusplus(reverse(v813), load(tbl, 138 * VECWIDTH + tbloffset), times(v813, load(tbl, 139 * VECWIDTH + tbloffset))); + real2 v829 = timesminusplus(reverse(v815), load(tbl, 140 * VECWIDTH + tbloffset), times(v815, load(tbl, 141 * VECWIDTH + tbloffset))); + real2 v2811 = plus(v2770, v2771); + real2 v2805 = reverse(minus(v2771, v2770)); + real2 v2767 = minusplus(v2765, v2766); + real2 v2769 = minusplus(uminus(v2765), v2766); + real2 v2607 = minusplus(v2605, v2606); + real2 v2609 = minusplus(uminus(v2605), v2606); + real2 v2617 = timesminusplus(reverse(v2607), load(tbl, 458 * VECWIDTH + tbloffset), times(v2607, load(tbl, 459 * VECWIDTH + tbloffset))); + real2 v2623 = timesminusplus(reverse(v2609), load(tbl, 460 * VECWIDTH + tbloffset), times(v2609, load(tbl, 461 * VECWIDTH + tbloffset))); + real2 v3013 = reverse(minus(v2703, v2623)); + real2 v3019 = plus(v2623, v2703); + real2 v2783 = timesminusplus(reverse(v2769), load(tbl, 492 * VECWIDTH + tbloffset), times(v2769, load(tbl, 493 * VECWIDTH + 
tbloffset))); + real2 v2941 = plus(v2617, v2697); + real2 v2935 = reverse(minus(v2697, v2617)); + real2 v2777 = timesminusplus(reverse(v2767), load(tbl, 490 * VECWIDTH + tbloffset), times(v2767, load(tbl, 491 * VECWIDTH + tbloffset))); + real2 v1660 = minus(v983, v823); + real2 v1664 = plus(v823, v983); + real2 v1874 = plus(v829, v989); + real2 v1870 = minus(v989, v829); + real2 v1909 = reverse(minus(v1875, v1874)); + real2 v1915 = plus(v1874, v1875); + real2 v1663 = minusplus(uminus(v1659), v1660); + real2 v1661 = minusplus(v1659, v1660); + real2 v1677 = timesminusplus(reverse(v1663), load(tbl, 296 * VECWIDTH + tbloffset), times(v1663, load(tbl, 297 * VECWIDTH + tbloffset))); + real2 v1873 = minusplus(uminus(v1869), v1870); + real2 v1871 = minusplus(v1869, v1870); + real2 v1887 = timesminusplus(reverse(v1873), load(tbl, 332 * VECWIDTH + tbloffset), times(v1873, load(tbl, 333 * VECWIDTH + tbloffset))); + real2 v1705 = plus(v1664, v1665); + real2 v1699 = reverse(minus(v1665, v1664)); + real2 v1671 = timesminusplus(reverse(v1661), load(tbl, 294 * VECWIDTH + tbloffset), times(v1661, load(tbl, 295 * VECWIDTH + tbloffset))); + real2 v1881 = timesminusplus(reverse(v1871), load(tbl, 330 * VECWIDTH + tbloffset), times(v1871, load(tbl, 331 * VECWIDTH + tbloffset))); + real2 v1469 = plus(v1143, v1223); + real2 v1463 = reverse(minus(v1223, v1143)); + real2 v54 = load(in, 52 << shift); + real2 v118 = load(in, 116 << shift); + real2 v537 = plus(v54, v118); + real2 v531 = reverse(minus(v118, v54)); + real2 v86 = load(in, 84 << shift); + real2 v22 = load(in, 20 << shift); + real2 v536 = plus(v22, v86); + real2 v532 = minus(v86, v22); + real2 v851 = reverse(minus(v537, v536)); + real2 v857 = plus(v536, v537); + real2 v533 = minusplus(v531, v532); + real2 v535 = minusplus(uminus(v531), v532); + real2 v549 = timesminusplus(reverse(v535), load(tbl, 84 * VECWIDTH + tbloffset), times(v535, load(tbl, 85 * VECWIDTH + tbloffset))); + real2 v102 = load(in, 100 << shift); + real2 v38 = 
load(in, 36 << shift); + real2 v217 = plus(v38, v102); + real2 v211 = reverse(minus(v102, v38)); + real2 v70 = load(in, 68 << shift); + real2 v6 = load(in, 4 << shift); + real2 v216 = plus(v6, v70); + real2 v212 = minus(v70, v6); + real2 v213 = minusplus(v211, v212); + real2 v215 = minusplus(uminus(v211), v212); + real2 v229 = timesminusplus(reverse(v215), load(tbl, 20 * VECWIDTH + tbloffset), times(v215, load(tbl, 21 * VECWIDTH + tbloffset))); + real2 v2646 = minus(v549, v229); + real2 v2650 = plus(v229, v549); + real2 v856 = plus(v216, v217); + real2 v852 = minus(v217, v216); + real2 v853 = minusplus(v851, v852); + real2 v855 = minusplus(uminus(v851), v852); + real2 v863 = timesminusplus(reverse(v853), load(tbl, 146 * VECWIDTH + tbloffset), times(v853, load(tbl, 147 * VECWIDTH + tbloffset))); + real2 v869 = timesminusplus(reverse(v855), load(tbl, 148 * VECWIDTH + tbloffset), times(v855, load(tbl, 149 * VECWIDTH + tbloffset))); + real2 v1176 = plus(v856, v857); + real2 v1172 = minus(v857, v856); + real2 v110 = load(in, 108 << shift); + real2 v46 = load(in, 44 << shift); + real2 v377 = plus(v46, v110); + real2 v371 = reverse(minus(v110, v46)); + real2 v78 = load(in, 76 << shift); + real2 v14 = load(in, 12 << shift); + real2 v372 = minus(v78, v14); + real2 v376 = plus(v14, v78); + real2 v1012 = minus(v377, v376); + real2 v1016 = plus(v376, v377); + real2 v373 = minusplus(v371, v372); + real2 v375 = minusplus(uminus(v371), v372); + real2 v389 = timesminusplus(reverse(v375), load(tbl, 52 * VECWIDTH + tbloffset), times(v375, load(tbl, 53 * VECWIDTH + tbloffset))); + real2 v30 = load(in, 28 << shift); + real2 v94 = load(in, 92 << shift); + real2 v696 = plus(v30, v94); + real2 v692 = minus(v94, v30); + real2 v62 = load(in, 60 << shift); + real2 v126 = load(in, 124 << shift); + real2 v697 = plus(v62, v126); + real2 v691 = reverse(minus(v126, v62)); + real2 v1017 = plus(v696, v697); + real2 v1011 = reverse(minus(v697, v696)); + real2 v1171 = reverse(minus(v1017, v1016)); + 
real2 v1177 = plus(v1016, v1017); + real2 v1013 = minusplus(v1011, v1012); + real2 v1015 = minusplus(uminus(v1011), v1012); + real2 v1175 = minusplus(uminus(v1171), v1172); + real2 v1173 = minusplus(v1171, v1172); + real2 v1183 = timesminusplus(reverse(v1173), load(tbl, 210 * VECWIDTH + tbloffset), times(v1173, load(tbl, 211 * VECWIDTH + tbloffset))); + real2 v1189 = timesminusplus(reverse(v1175), load(tbl, 212 * VECWIDTH + tbloffset), times(v1175, load(tbl, 213 * VECWIDTH + tbloffset))); + real2 v1029 = timesminusplus(reverse(v1015), load(tbl, 180 * VECWIDTH + tbloffset), times(v1015, load(tbl, 181 * VECWIDTH + tbloffset))); + real2 v1023 = timesminusplus(reverse(v1013), load(tbl, 178 * VECWIDTH + tbloffset), times(v1013, load(tbl, 179 * VECWIDTH + tbloffset))); + real2 v1625 = plus(v863, v1023); + real2 v1619 = reverse(minus(v1023, v863)); + real2 v1835 = plus(v869, v1029); + real2 v1829 = reverse(minus(v1029, v869)); + real2 v693 = minusplus(v691, v692); + real2 v695 = minusplus(uminus(v691), v692); + real2 v709 = timesminusplus(reverse(v695), load(tbl, 116 * VECWIDTH + tbloffset), times(v695, load(tbl, 117 * VECWIDTH + tbloffset))); + real2 v2645 = reverse(minus(v709, v389)); + real2 v2651 = plus(v389, v709); + real2 v1257 = plus(v1176, v1177); + real2 v1251 = reverse(minus(v1177, v1176)); + real2 v2731 = plus(v2650, v2651); + real2 v2725 = reverse(minus(v2651, v2650)); + real2 v114 = load(in, 112 << shift); + real2 v50 = load(in, 48 << shift); + real2 v457 = plus(v50, v114); + real2 v451 = reverse(minus(v114, v50)); + real2 v18 = load(in, 16 << shift); + real2 v82 = load(in, 80 << shift); + real2 v456 = plus(v18, v82); + real2 v452 = minus(v82, v18); + real2 v771 = reverse(minus(v457, v456)); + real2 v777 = plus(v456, v457); + real2 v453 = minusplus(v451, v452); + real2 v455 = minusplus(uminus(v451), v452); + real2 v469 = timesminusplus(reverse(v455), load(tbl, 68 * VECWIDTH + tbloffset), times(v455, load(tbl, 69 * VECWIDTH + tbloffset))); + real2 v66 = 
load(in, 64 << shift); + real2 v2 = load(in, 0 << shift); + real2 v132 = minus(v66, v2); + real2 v136 = plus(v2, v66); + real2 v98 = load(in, 96 << shift); + real2 v34 = load(in, 32 << shift); + real2 v131 = reverse(minus(v98, v34)); + real2 v137 = plus(v34, v98); + real2 v133 = minusplus(v131, v132); + real2 v135 = minusplus(uminus(v131), v132); + real2 v149 = timesminusplus(reverse(v135), load(tbl, 4 * VECWIDTH + tbloffset), times(v135, load(tbl, 5 * VECWIDTH + tbloffset))); + real2 v2566 = minus(v469, v149); + real2 v2570 = plus(v149, v469); + real2 v772 = minus(v137, v136); + real2 v776 = plus(v136, v137); + real2 v1092 = minus(v777, v776); + real2 v1096 = plus(v776, v777); + real2 v773 = minusplus(v771, v772); + real2 v775 = minusplus(uminus(v771), v772); + real2 v783 = timesminusplus(reverse(v773), load(tbl, 130 * VECWIDTH + tbloffset), times(v773, load(tbl, 131 * VECWIDTH + tbloffset))); + real2 v789 = timesminusplus(reverse(v775), load(tbl, 132 * VECWIDTH + tbloffset), times(v775, load(tbl, 133 * VECWIDTH + tbloffset))); + real2 v74 = load(in, 72 << shift); + real2 v10 = load(in, 8 << shift); + real2 v296 = plus(v10, v74); + real2 v292 = minus(v74, v10); + real2 v42 = load(in, 40 << shift); + real2 v106 = load(in, 104 << shift); + real2 v291 = reverse(minus(v106, v42)); + real2 v297 = plus(v42, v106); + real2 v293 = minusplus(v291, v292); + real2 v295 = minusplus(uminus(v291), v292); + real2 v309 = timesminusplus(reverse(v295), load(tbl, 36 * VECWIDTH + tbloffset), times(v295, load(tbl, 37 * VECWIDTH + tbloffset))); + real2 v932 = minus(v297, v296); + real2 v936 = plus(v296, v297); + real2 v122 = load(in, 120 << shift); + real2 v58 = load(in, 56 << shift); + real2 v617 = plus(v58, v122); + real2 v611 = reverse(minus(v122, v58)); + real2 v26 = load(in, 24 << shift); + real2 v90 = load(in, 88 << shift); + real2 v612 = minus(v90, v26); + real2 v616 = plus(v26, v90); + real2 v937 = plus(v616, v617); + real2 v931 = reverse(minus(v617, v616)); + real2 v1091 = 
reverse(minus(v937, v936)); + real2 v1097 = plus(v936, v937); + real2 v933 = minusplus(v931, v932); + real2 v935 = minusplus(uminus(v931), v932); + real2 v1093 = minusplus(v1091, v1092); + real2 v1095 = minusplus(uminus(v1091), v1092); + real2 v1103 = timesminusplus(reverse(v1093), load(tbl, 194 * VECWIDTH + tbloffset), times(v1093, load(tbl, 195 * VECWIDTH + tbloffset))); + real2 v1468 = plus(v1103, v1183); + real2 v1464 = minus(v1183, v1103); + real2 v1508 = plus(v1468, v1469); + real2 v1504 = minus(v1469, v1468); + real2 v1252 = minus(v1097, v1096); + real2 v1256 = plus(v1096, v1097); + real2 v1336 = plus(v1256, v1257); + real2 v1332 = minus(v1257, v1256); + real2 v1335 = minusplus(uminus(v1331), v1332); + real2 v1333 = minusplus(v1331, v1332); + real2 v1343 = timesminusplus(reverse(v1333), load(tbl, 242 * VECWIDTH + tbloffset), times(v1333, load(tbl, 243 * VECWIDTH + tbloffset))); + real2 v1349 = timesminusplus(reverse(v1335), load(tbl, 244 * VECWIDTH + tbloffset), times(v1335, load(tbl, 245 * VECWIDTH + tbloffset))); + real2 v1376 = plus(v1336, v1337); + real2 v1372 = minus(v1337, v1336); + real2 v1465 = minusplus(v1463, v1464); + real2 v1467 = minusplus(uminus(v1463), v1464); + real2 v1255 = minusplus(uminus(v1251), v1252); + real2 v1253 = minusplus(v1251, v1252); + real2 v1481 = timesminusplus(reverse(v1467), load(tbl, 264 * VECWIDTH + tbloffset), times(v1467, load(tbl, 265 * VECWIDTH + tbloffset))); + real2 v1475 = timesminusplus(reverse(v1465), load(tbl, 262 * VECWIDTH + tbloffset), times(v1465, load(tbl, 263 * VECWIDTH + tbloffset))); + real2 v1109 = timesminusplus(reverse(v1095), load(tbl, 196 * VECWIDTH + tbloffset), times(v1095, load(tbl, 197 * VECWIDTH + tbloffset))); + real2 v1542 = minus(v1189, v1109); + real2 v1546 = plus(v1109, v1189); + real2 v1545 = minusplus(uminus(v1541), v1542); + real2 v1543 = minusplus(v1541, v1542); + real2 v1553 = timesminusplus(reverse(v1543), load(tbl, 274 * VECWIDTH + tbloffset), times(v1543, load(tbl, 275 * VECWIDTH + 
tbloffset))); + real2 v1559 = timesminusplus(reverse(v1545), load(tbl, 276 * VECWIDTH + tbloffset), times(v1545, load(tbl, 277 * VECWIDTH + tbloffset))); + real2 v1582 = minus(v1547, v1546); + real2 v1586 = plus(v1546, v1547); + real2 v1269 = timesminusplus(reverse(v1255), load(tbl, 228 * VECWIDTH + tbloffset), times(v1255, load(tbl, 229 * VECWIDTH + tbloffset))); + real2 v1438 = minus(v1309, v1269); + real2 v1442 = plus(v1269, v1309); + real2 v1263 = timesminusplus(reverse(v1253), load(tbl, 226 * VECWIDTH + tbloffset), times(v1253, load(tbl, 227 * VECWIDTH + tbloffset))); + real2 v943 = timesminusplus(reverse(v933), load(tbl, 162 * VECWIDTH + tbloffset), times(v933, load(tbl, 163 * VECWIDTH + tbloffset))); + real2 v1624 = plus(v783, v943); + real2 v1620 = minus(v943, v783); + real2 v1623 = minusplus(uminus(v1619), v1620); + real2 v1621 = minusplus(v1619, v1620); + real2 v1700 = minus(v1625, v1624); + real2 v1704 = plus(v1624, v1625); + real2 v1631 = timesminusplus(reverse(v1621), load(tbl, 286 * VECWIDTH + tbloffset), times(v1621, load(tbl, 287 * VECWIDTH + tbloffset))); + real2 v949 = timesminusplus(reverse(v935), load(tbl, 164 * VECWIDTH + tbloffset), times(v935, load(tbl, 165 * VECWIDTH + tbloffset))); + real2 v1830 = minus(v949, v789); + real2 v1834 = plus(v789, v949); + real2 v1782 = plus(v1631, v1671); + real2 v1778 = minus(v1671, v1631); + real2 v1910 = minus(v1835, v1834); + real2 v1914 = plus(v1834, v1835); + real2 v1950 = minus(v1915, v1914); + real2 v1954 = plus(v1914, v1915); + real2 v1913 = minusplus(uminus(v1909), v1910); + real2 v1911 = minusplus(v1909, v1910); + real2 v613 = minusplus(v611, v612); + real2 v615 = minusplus(uminus(v611), v612); + real2 v629 = timesminusplus(reverse(v615), load(tbl, 100 * VECWIDTH + tbloffset), times(v615, load(tbl, 101 * VECWIDTH + tbloffset))); + real2 v1744 = plus(v1704, v1705); + real2 v1740 = minus(v1705, v1704); + real2 v1637 = timesminusplus(reverse(v1623), load(tbl, 288 * VECWIDTH + tbloffset), times(v1623, 
load(tbl, 289 * VECWIDTH + tbloffset))); + real2 v1927 = timesminusplus(reverse(v1913), load(tbl, 340 * VECWIDTH + tbloffset), times(v1913, load(tbl, 341 * VECWIDTH + tbloffset))); + real2 v2571 = plus(v309, v629); + real2 v2565 = reverse(minus(v629, v309)); + real2 v1833 = minusplus(uminus(v1829), v1830); + real2 v1831 = minusplus(v1829, v1830); + real2 v1921 = timesminusplus(reverse(v1911), load(tbl, 338 * VECWIDTH + tbloffset), times(v1911, load(tbl, 339 * VECWIDTH + tbloffset))); + real2 v1804 = minus(v1677, v1637); + real2 v1808 = plus(v1637, v1677); + real2 v1847 = timesminusplus(reverse(v1833), load(tbl, 324 * VECWIDTH + tbloffset), times(v1833, load(tbl, 325 * VECWIDTH + tbloffset))); + real2 v2014 = minus(v1887, v1847); + real2 v2018 = plus(v1847, v1887); + real2 v1841 = timesminusplus(reverse(v1831), load(tbl, 322 * VECWIDTH + tbloffset), times(v1831, load(tbl, 323 * VECWIDTH + tbloffset))); + real2 v1988 = minus(v1881, v1841); + real2 v1992 = plus(v1841, v1881); + real2 v1703 = minusplus(uminus(v1699), v1700); + real2 v1701 = minusplus(v1699, v1700); + real2 v1717 = timesminusplus(reverse(v1703), load(tbl, 304 * VECWIDTH + tbloffset), times(v1703, load(tbl, 305 * VECWIDTH + tbloffset))); + real2 v1711 = timesminusplus(reverse(v1701), load(tbl, 302 * VECWIDTH + tbloffset), times(v1701, load(tbl, 303 * VECWIDTH + tbloffset))); + real2 v2730 = plus(v2570, v2571); + real2 v2726 = minus(v2571, v2570); + real2 v1412 = minus(v1303, v1263); + real2 v1416 = plus(v1263, v1303); + real2 v63 = load(in, 61 << shift); + real2 v127 = load(in, 125 << shift); + real2 v717 = plus(v63, v127); + real2 v711 = reverse(minus(v127, v63)); + real2 v95 = load(in, 93 << shift); + real2 v31 = load(in, 29 << shift); + real2 v712 = minus(v95, v31); + real2 v716 = plus(v31, v95); + real2 v1037 = plus(v716, v717); + real2 v1031 = reverse(minus(v717, v716)); + real2 v79 = load(in, 77 << shift); + real2 v15 = load(in, 13 << shift); + real2 v396 = plus(v15, v79); + real2 v392 = minus(v79, 
v15); + real2 v111 = load(in, 109 << shift); + real2 v47 = load(in, 45 << shift); + real2 v397 = plus(v47, v111); + real2 v391 = reverse(minus(v111, v47)); + real2 v1032 = minus(v397, v396); + real2 v1036 = plus(v396, v397); + real2 v1033 = minusplus(v1031, v1032); + real2 v1035 = minusplus(uminus(v1031), v1032); + real2 v1049 = timesminusplus(reverse(v1035), load(tbl, 184 * VECWIDTH + tbloffset), times(v1035, load(tbl, 185 * VECWIDTH + tbloffset))); + real2 v1043 = timesminusplus(reverse(v1033), load(tbl, 182 * VECWIDTH + tbloffset), times(v1033, load(tbl, 183 * VECWIDTH + tbloffset))); + real2 v1197 = plus(v1036, v1037); + real2 v1191 = reverse(minus(v1037, v1036)); + real2 v23 = load(in, 21 << shift); + real2 v87 = load(in, 85 << shift); + real2 v556 = plus(v23, v87); + real2 v552 = minus(v87, v23); + real2 v119 = load(in, 117 << shift); + real2 v55 = load(in, 53 << shift); + real2 v557 = plus(v55, v119); + real2 v551 = reverse(minus(v119, v55)); + real2 v877 = plus(v556, v557); + real2 v871 = reverse(minus(v557, v556)); + real2 v7 = load(in, 5 << shift); + real2 v71 = load(in, 69 << shift); + real2 v232 = minus(v71, v7); + real2 v236 = plus(v7, v71); + real2 v103 = load(in, 101 << shift); + real2 v39 = load(in, 37 << shift); + real2 v237 = plus(v39, v103); + real2 v231 = reverse(minus(v103, v39)); + real2 v876 = plus(v236, v237); + real2 v872 = minus(v237, v236); + real2 v1192 = minus(v877, v876); + real2 v1196 = plus(v876, v877); + real2 v1271 = reverse(minus(v1197, v1196)); + real2 v1277 = plus(v1196, v1197); + real2 v875 = minusplus(uminus(v871), v872); + real2 v873 = minusplus(v871, v872); + real2 v883 = timesminusplus(reverse(v873), load(tbl, 150 * VECWIDTH + tbloffset), times(v873, load(tbl, 151 * VECWIDTH + tbloffset))); + real2 v1639 = reverse(minus(v1043, v883)); + real2 v1645 = plus(v883, v1043); + real2 v1195 = minusplus(uminus(v1191), v1192); + real2 v1193 = minusplus(v1191, v1192); + real2 v1209 = timesminusplus(reverse(v1195), load(tbl, 216 * 
VECWIDTH + tbloffset), times(v1195, load(tbl, 217 * VECWIDTH + tbloffset))); + real2 v1203 = timesminusplus(reverse(v1193), load(tbl, 214 * VECWIDTH + tbloffset), times(v1193, load(tbl, 215 * VECWIDTH + tbloffset))); + real2 v83 = load(in, 81 << shift); + real2 v19 = load(in, 17 << shift); + real2 v476 = plus(v19, v83); + real2 v472 = minus(v83, v19); + real2 v51 = load(in, 49 << shift); + real2 v115 = load(in, 113 << shift); + real2 v477 = plus(v51, v115); + real2 v471 = reverse(minus(v115, v51)); + real2 v797 = plus(v476, v477); + real2 v791 = reverse(minus(v477, v476)); + real2 v3 = load(in, 1 << shift); + real2 v67 = load(in, 65 << shift); + real2 v156 = plus(v3, v67); + real2 v152 = minus(v67, v3); + real2 v35 = load(in, 33 << shift); + real2 v99 = load(in, 97 << shift); + real2 v157 = plus(v35, v99); + real2 v151 = reverse(minus(v99, v35)); + real2 v792 = minus(v157, v156); + real2 v796 = plus(v156, v157); + real2 v793 = minusplus(v791, v792); + real2 v795 = minusplus(uminus(v791), v792); + real2 v803 = timesminusplus(reverse(v793), load(tbl, 134 * VECWIDTH + tbloffset), times(v793, load(tbl, 135 * VECWIDTH + tbloffset))); + real2 v1112 = minus(v797, v796); + real2 v1116 = plus(v796, v797); + real2 v107 = load(in, 105 << shift); + real2 v43 = load(in, 41 << shift); + real2 v317 = plus(v43, v107); + real2 v311 = reverse(minus(v107, v43)); + real2 v75 = load(in, 73 << shift); + real2 v11 = load(in, 9 << shift); + real2 v316 = plus(v11, v75); + real2 v312 = minus(v75, v11); + real2 v956 = plus(v316, v317); + real2 v952 = minus(v317, v316); + real2 v59 = load(in, 57 << shift); + real2 v123 = load(in, 121 << shift); + real2 v631 = reverse(minus(v123, v59)); + real2 v637 = plus(v59, v123); + real2 v27 = load(in, 25 << shift); + real2 v91 = load(in, 89 << shift); + real2 v636 = plus(v27, v91); + real2 v632 = minus(v91, v27); + real2 v957 = plus(v636, v637); + real2 v951 = reverse(minus(v637, v636)); + real2 v1111 = reverse(minus(v957, v956)); + real2 v1117 = 
plus(v956, v957); + real2 v1276 = plus(v1116, v1117); + real2 v1272 = minus(v1117, v1116); + real2 v1275 = minusplus(uminus(v1271), v1272); + real2 v1273 = minusplus(v1271, v1272); + real2 v1283 = timesminusplus(reverse(v1273), load(tbl, 230 * VECWIDTH + tbloffset), times(v1273, load(tbl, 231 * VECWIDTH + tbloffset))); + real2 v1352 = minus(v1277, v1276); + real2 v1356 = plus(v1276, v1277); + real2 v1289 = timesminusplus(reverse(v1275), load(tbl, 232 * VECWIDTH + tbloffset), times(v1275, load(tbl, 233 * VECWIDTH + tbloffset))); + real2 v1115 = minusplus(uminus(v1111), v1112); + real2 v1113 = minusplus(v1111, v1112); + real2 v1123 = timesminusplus(reverse(v1113), load(tbl, 198 * VECWIDTH + tbloffset), times(v1113, load(tbl, 199 * VECWIDTH + tbloffset))); + real2 v1129 = timesminusplus(reverse(v1115), load(tbl, 200 * VECWIDTH + tbloffset), times(v1115, load(tbl, 201 * VECWIDTH + tbloffset))); + real2 v1488 = plus(v1123, v1203); + real2 v1484 = minus(v1203, v1123); + real2 v1566 = plus(v1129, v1209); + real2 v1562 = minus(v1209, v1129); + real2 v85 = load(in, 83 << shift); + real2 v21 = load(in, 19 << shift); + real2 v512 = minus(v85, v21); + real2 v516 = plus(v21, v85); + real2 v117 = load(in, 115 << shift); + real2 v53 = load(in, 51 << shift); + real2 v517 = plus(v53, v117); + real2 v511 = reverse(minus(v117, v53)); + real2 v831 = reverse(minus(v517, v516)); + real2 v837 = plus(v516, v517); + real2 v69 = load(in, 67 << shift); + real2 v5 = load(in, 3 << shift); + real2 v192 = minus(v69, v5); + real2 v196 = plus(v5, v69); + real2 v37 = load(in, 35 << shift); + real2 v101 = load(in, 99 << shift); + real2 v197 = plus(v37, v101); + real2 v191 = reverse(minus(v101, v37)); + real2 v832 = minus(v197, v196); + real2 v836 = plus(v196, v197); + real2 v1152 = minus(v837, v836); + real2 v1156 = plus(v836, v837); + real2 v61 = load(in, 59 << shift); + real2 v125 = load(in, 123 << shift); + real2 v677 = plus(v61, v125); + real2 v671 = reverse(minus(v125, v61)); + real2 v29 = 
load(in, 27 << shift); + real2 v93 = load(in, 91 << shift); + real2 v672 = minus(v93, v29); + real2 v676 = plus(v29, v93); + real2 v997 = plus(v676, v677); + real2 v991 = reverse(minus(v677, v676)); + real2 v109 = load(in, 107 << shift); + real2 v45 = load(in, 43 << shift); + real2 v357 = plus(v45, v109); + real2 v351 = reverse(minus(v109, v45)); + real2 v77 = load(in, 75 << shift); + real2 v13 = load(in, 11 << shift); + real2 v352 = minus(v77, v13); + real2 v356 = plus(v13, v77); + real2 v992 = minus(v357, v356); + real2 v996 = plus(v356, v357); + real2 v1157 = plus(v996, v997); + real2 v1151 = reverse(minus(v997, v996)); + real2 v1155 = minusplus(uminus(v1151), v1152); + real2 v1153 = minusplus(v1151, v1152); + real2 v1163 = timesminusplus(reverse(v1153), load(tbl, 206 * VECWIDTH + tbloffset), times(v1153, load(tbl, 207 * VECWIDTH + tbloffset))); + real2 v1316 = plus(v1156, v1157); + real2 v1312 = minus(v1157, v1156); + real2 v41 = load(in, 39 << shift); + real2 v105 = load(in, 103 << shift); + real2 v277 = plus(v41, v105); + real2 v271 = reverse(minus(v105, v41)); + real2 v9 = load(in, 7 << shift); + real2 v73 = load(in, 71 << shift); + real2 v276 = plus(v9, v73); + real2 v272 = minus(v73, v9); + real2 v916 = plus(v276, v277); + real2 v912 = minus(v277, v276); + real2 v89 = load(in, 87 << shift); + real2 v25 = load(in, 23 << shift); + real2 v592 = minus(v89, v25); + real2 v596 = plus(v25, v89); + real2 v57 = load(in, 55 << shift); + real2 v121 = load(in, 119 << shift); + real2 v591 = reverse(minus(v121, v57)); + real2 v597 = plus(v57, v121); + real2 v911 = reverse(minus(v597, v596)); + real2 v917 = plus(v596, v597); + real2 v1236 = plus(v916, v917); + real2 v1232 = minus(v917, v916); + real2 v81 = load(in, 79 << shift); + real2 v17 = load(in, 15 << shift); + real2 v432 = minus(v81, v17); + real2 v436 = plus(v17, v81); + real2 v113 = load(in, 111 << shift); + real2 v49 = load(in, 47 << shift); + real2 v437 = plus(v49, v113); + real2 v431 = reverse(minus(v113, 
v49)); + real2 v1072 = minus(v437, v436); + real2 v1076 = plus(v436, v437); + real2 v65 = load(in, 63 << shift); + real2 v129 = load(in, 127 << shift); + real2 v757 = plus(v65, v129); + real2 v751 = reverse(minus(v129, v65)); + real2 v97 = load(in, 95 << shift); + real2 v33 = load(in, 31 << shift); + real2 v752 = minus(v97, v33); + real2 v756 = plus(v33, v97); + real2 v1077 = plus(v756, v757); + real2 v1071 = reverse(minus(v757, v756)); + real2 v1231 = reverse(minus(v1077, v1076)); + real2 v1237 = plus(v1076, v1077); + real2 v1317 = plus(v1236, v1237); + real2 v1311 = reverse(minus(v1237, v1236)); + real2 v1351 = reverse(minus(v1317, v1316)); + real2 v1357 = plus(v1316, v1317); + real2 v1371 = reverse(minus(v1357, v1356)); + real2 v1377 = plus(v1356, v1357); + scatter(out, 0, 128, plus(v1376, v1377)); + real2 v1390 = minus(v1376, v1377); + scatter(out, 64, 128, timesminusplus(v1390, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1390), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1353 = minusplus(v1351, v1352); + real2 v1355 = minusplus(uminus(v1351), v1352); + real2 v1369 = timesminusplus(reverse(v1355), load(tbl, 248 * VECWIDTH + tbloffset), times(v1355, load(tbl, 249 * VECWIDTH + tbloffset))); + scatter(out, 48, 128, plus(v1349, v1369)); + real2 v1404 = minus(v1349, v1369); + scatter(out, 112, 128, timesminusplus(v1404, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1404), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1363 = timesminusplus(reverse(v1353), load(tbl, 246 * VECWIDTH + tbloffset), times(v1353, load(tbl, 247 * VECWIDTH + tbloffset))); + scatter(out, 16, 128, plus(v1343, v1363)); + real2 v1398 = minus(v1343, v1363); + scatter(out, 80, 128, timesminusplus(v1398, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1398), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1373 = minusplus(v1371, v1372); + real2 v1375 = minusplus(uminus(v1371), v1372); + scatter(out, 96, 128, timesminusplus(reverse(v1375), load(tbl, 252 * VECWIDTH + 
tbloffset), times(v1375, load(tbl, 253 * VECWIDTH + tbloffset)))); + scatter(out, 32, 128, timesminusplus(reverse(v1373), load(tbl, 250 * VECWIDTH + tbloffset), times(v1373, load(tbl, 251 * VECWIDTH + tbloffset)))); + real2 v1313 = minusplus(v1311, v1312); + real2 v1315 = minusplus(uminus(v1311), v1312); + real2 v1323 = timesminusplus(reverse(v1313), load(tbl, 238 * VECWIDTH + tbloffset), times(v1313, load(tbl, 239 * VECWIDTH + tbloffset))); + real2 v1417 = plus(v1283, v1323); + real2 v1411 = reverse(minus(v1323, v1283)); + scatter(out, 8, 128, plus(v1416, v1417)); + real2 v1430 = minus(v1416, v1417); + scatter(out, 72, 128, timesminusplus(v1430, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1430), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1413 = minusplus(v1411, v1412); + real2 v1415 = minusplus(uminus(v1411), v1412); + scatter(out, 104, 128, timesminusplus(reverse(v1415), load(tbl, 256 * VECWIDTH + tbloffset), times(v1415, load(tbl, 257 * VECWIDTH + tbloffset)))); + scatter(out, 40, 128, timesminusplus(reverse(v1413), load(tbl, 254 * VECWIDTH + tbloffset), times(v1413, load(tbl, 255 * VECWIDTH + tbloffset)))); + real2 v1329 = timesminusplus(reverse(v1315), load(tbl, 240 * VECWIDTH + tbloffset), times(v1315, load(tbl, 241 * VECWIDTH + tbloffset))); + real2 v1443 = plus(v1289, v1329); + real2 v1437 = reverse(minus(v1329, v1289)); + scatter(out, 24, 128, plus(v1442, v1443)); + real2 v1456 = minus(v1442, v1443); + scatter(out, 88, 128, timesminusplus(v1456, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1456), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1441 = minusplus(uminus(v1437), v1438); + real2 v1439 = minusplus(v1437, v1438); + scatter(out, 120, 128, timesminusplus(reverse(v1441), load(tbl, 260 * VECWIDTH + tbloffset), times(v1441, load(tbl, 261 * VECWIDTH + tbloffset)))); + scatter(out, 56, 128, timesminusplus(reverse(v1439), load(tbl, 258 * VECWIDTH + tbloffset), times(v1439, load(tbl, 259 * VECWIDTH + tbloffset)))); + real2 v1235 = 
minusplus(uminus(v1231), v1232); + real2 v1233 = minusplus(v1231, v1232); + real2 v1243 = timesminusplus(reverse(v1233), load(tbl, 222 * VECWIDTH + tbloffset), times(v1233, load(tbl, 223 * VECWIDTH + tbloffset))); + real2 v1489 = plus(v1163, v1243); + real2 v1483 = reverse(minus(v1243, v1163)); + real2 v1509 = plus(v1488, v1489); + real2 v1503 = reverse(minus(v1489, v1488)); + scatter(out, 4, 128, plus(v1508, v1509)); + real2 v1522 = minus(v1508, v1509); + scatter(out, 68, 128, timesminusplus(v1522, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1522), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1507 = minusplus(uminus(v1503), v1504); + real2 v1505 = minusplus(v1503, v1504); + scatter(out, 36, 128, timesminusplus(reverse(v1505), load(tbl, 270 * VECWIDTH + tbloffset), times(v1505, load(tbl, 271 * VECWIDTH + tbloffset)))); + scatter(out, 100, 128, timesminusplus(reverse(v1507), load(tbl, 272 * VECWIDTH + tbloffset), times(v1507, load(tbl, 273 * VECWIDTH + tbloffset)))); + real2 v1485 = minusplus(v1483, v1484); + real2 v1487 = minusplus(uminus(v1483), v1484); + real2 v1501 = timesminusplus(reverse(v1487), load(tbl, 268 * VECWIDTH + tbloffset), times(v1487, load(tbl, 269 * VECWIDTH + tbloffset))); + scatter(out, 52, 128, plus(v1481, v1501)); + real2 v1534 = minus(v1481, v1501); + scatter(out, 116, 128, timesminusplus(v1534, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1534), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1495 = timesminusplus(reverse(v1485), load(tbl, 266 * VECWIDTH + tbloffset), times(v1485, load(tbl, 267 * VECWIDTH + tbloffset))); + scatter(out, 20, 128, plus(v1475, v1495)); + real2 v1528 = minus(v1475, v1495); + scatter(out, 84, 128, timesminusplus(v1528, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1528), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1249 = timesminusplus(reverse(v1235), load(tbl, 224 * VECWIDTH + tbloffset), times(v1235, load(tbl, 225 * VECWIDTH + tbloffset))); + real2 v1169 = 
timesminusplus(reverse(v1155), load(tbl, 208 * VECWIDTH + tbloffset), times(v1155, load(tbl, 209 * VECWIDTH + tbloffset))); + real2 v1567 = plus(v1169, v1249); + real2 v1561 = reverse(minus(v1249, v1169)); + real2 v1581 = reverse(minus(v1567, v1566)); + real2 v1587 = plus(v1566, v1567); + scatter(out, 12, 128, plus(v1586, v1587)); + real2 v1600 = minus(v1586, v1587); + scatter(out, 76, 128, timesminusplus(v1600, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1600), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1583 = minusplus(v1581, v1582); + scatter(out, 44, 128, timesminusplus(reverse(v1583), load(tbl, 282 * VECWIDTH + tbloffset), times(v1583, load(tbl, 283 * VECWIDTH + tbloffset)))); + real2 v1585 = minusplus(uminus(v1581), v1582); + scatter(out, 108, 128, timesminusplus(reverse(v1585), load(tbl, 284 * VECWIDTH + tbloffset), times(v1585, load(tbl, 285 * VECWIDTH + tbloffset)))); + real2 v1565 = minusplus(uminus(v1561), v1562); + real2 v1563 = minusplus(v1561, v1562); + real2 v1579 = timesminusplus(reverse(v1565), load(tbl, 280 * VECWIDTH + tbloffset), times(v1565, load(tbl, 281 * VECWIDTH + tbloffset))); + scatter(out, 60, 128, plus(v1559, v1579)); + real2 v1612 = minus(v1559, v1579); + scatter(out, 124, 128, timesminusplus(v1612, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1612), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1573 = timesminusplus(reverse(v1563), load(tbl, 278 * VECWIDTH + tbloffset), times(v1563, load(tbl, 279 * VECWIDTH + tbloffset))); + scatter(out, 28, 128, plus(v1553, v1573)); + real2 v1606 = minus(v1553, v1573); + scatter(out, 92, 128, timesminusplus(v1606, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1606), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v833 = minusplus(v831, v832); + real2 v835 = minusplus(uminus(v831), v832); + real2 v955 = minusplus(uminus(v951), v952); + real2 v953 = minusplus(v951, v952); + real2 v963 = timesminusplus(reverse(v953), load(tbl, 166 * VECWIDTH + tbloffset), times(v953, 
load(tbl, 167 * VECWIDTH + tbloffset))); + real2 v995 = minusplus(uminus(v991), v992); + real2 v993 = minusplus(v991, v992); + real2 v1003 = timesminusplus(reverse(v993), load(tbl, 174 * VECWIDTH + tbloffset), times(v993, load(tbl, 175 * VECWIDTH + tbloffset))); + real2 v843 = timesminusplus(reverse(v833), load(tbl, 142 * VECWIDTH + tbloffset), times(v833, load(tbl, 143 * VECWIDTH + tbloffset))); + real2 v1640 = minus(v963, v803); + real2 v1644 = plus(v803, v963); + real2 v1680 = minus(v1003, v843); + real2 v1684 = plus(v843, v1003); + real2 v1641 = minusplus(v1639, v1640); + real2 v1643 = minusplus(uminus(v1639), v1640); + real2 v1657 = timesminusplus(reverse(v1643), load(tbl, 292 * VECWIDTH + tbloffset), times(v1643, load(tbl, 293 * VECWIDTH + tbloffset))); + real2 v913 = minusplus(v911, v912); + real2 v915 = minusplus(uminus(v911), v912); + real2 v1073 = minusplus(v1071, v1072); + real2 v1075 = minusplus(uminus(v1071), v1072); + real2 v923 = timesminusplus(reverse(v913), load(tbl, 158 * VECWIDTH + tbloffset), times(v913, load(tbl, 159 * VECWIDTH + tbloffset))); + real2 v1083 = timesminusplus(reverse(v1073), load(tbl, 190 * VECWIDTH + tbloffset), times(v1073, load(tbl, 191 * VECWIDTH + tbloffset))); + real2 v1685 = plus(v923, v1083); + real2 v1679 = reverse(minus(v1083, v923)); + real2 v1681 = minusplus(v1679, v1680); + real2 v1683 = minusplus(uminus(v1679), v1680); + real2 v1697 = timesminusplus(reverse(v1683), load(tbl, 300 * VECWIDTH + tbloffset), times(v1683, load(tbl, 301 * VECWIDTH + tbloffset))); + real2 v1809 = plus(v1657, v1697); + real2 v1803 = reverse(minus(v1697, v1657)); + scatter(out, 26, 128, plus(v1808, v1809)); + real2 v1822 = minus(v1808, v1809); + scatter(out, 90, 128, timesminusplus(v1822, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1822), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1807 = minusplus(uminus(v1803), v1804); + real2 v1805 = minusplus(v1803, v1804); + scatter(out, 58, 128, timesminusplus(reverse(v1805), load(tbl, 318 
* VECWIDTH + tbloffset), times(v1805, load(tbl, 319 * VECWIDTH + tbloffset)))); + scatter(out, 122, 128, timesminusplus(reverse(v1807), load(tbl, 320 * VECWIDTH + tbloffset), times(v1807, load(tbl, 321 * VECWIDTH + tbloffset)))); + real2 v1651 = timesminusplus(reverse(v1641), load(tbl, 290 * VECWIDTH + tbloffset), times(v1641, load(tbl, 291 * VECWIDTH + tbloffset))); + real2 v1691 = timesminusplus(reverse(v1681), load(tbl, 298 * VECWIDTH + tbloffset), times(v1681, load(tbl, 299 * VECWIDTH + tbloffset))); + real2 v1783 = plus(v1651, v1691); + real2 v1777 = reverse(minus(v1691, v1651)); + real2 v1779 = minusplus(v1777, v1778); + real2 v1781 = minusplus(uminus(v1777), v1778); + scatter(out, 106, 128, timesminusplus(reverse(v1781), load(tbl, 316 * VECWIDTH + tbloffset), times(v1781, load(tbl, 317 * VECWIDTH + tbloffset)))); + scatter(out, 42, 128, timesminusplus(reverse(v1779), load(tbl, 314 * VECWIDTH + tbloffset), times(v1779, load(tbl, 315 * VECWIDTH + tbloffset)))); + scatter(out, 10, 128, plus(v1782, v1783)); + real2 v1796 = minus(v1782, v1783); + scatter(out, 74, 128, timesminusplus(v1796, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1796), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1720 = minus(v1645, v1644); + real2 v1724 = plus(v1644, v1645); + real2 v1719 = reverse(minus(v1685, v1684)); + real2 v1725 = plus(v1684, v1685); + real2 v1745 = plus(v1724, v1725); + real2 v1739 = reverse(minus(v1725, v1724)); + scatter(out, 2, 128, plus(v1744, v1745)); + real2 v1758 = minus(v1744, v1745); + scatter(out, 66, 128, timesminusplus(v1758, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1758), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1741 = minusplus(v1739, v1740); + real2 v1743 = minusplus(uminus(v1739), v1740); + scatter(out, 98, 128, timesminusplus(reverse(v1743), load(tbl, 312 * VECWIDTH + tbloffset), times(v1743, load(tbl, 313 * VECWIDTH + tbloffset)))); + scatter(out, 34, 128, timesminusplus(reverse(v1741), load(tbl, 310 * VECWIDTH + 
tbloffset), times(v1741, load(tbl, 311 * VECWIDTH + tbloffset)))); + real2 v1723 = minusplus(uminus(v1719), v1720); + real2 v1721 = minusplus(v1719, v1720); + real2 v1737 = timesminusplus(reverse(v1723), load(tbl, 308 * VECWIDTH + tbloffset), times(v1723, load(tbl, 309 * VECWIDTH + tbloffset))); + scatter(out, 50, 128, plus(v1717, v1737)); + real2 v1770 = minus(v1717, v1737); + scatter(out, 114, 128, timesminusplus(v1770, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1770), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1731 = timesminusplus(reverse(v1721), load(tbl, 306 * VECWIDTH + tbloffset), times(v1721, load(tbl, 307 * VECWIDTH + tbloffset))); + scatter(out, 18, 128, plus(v1711, v1731)); + real2 v1764 = minus(v1711, v1731); + scatter(out, 82, 128, timesminusplus(v1764, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1764), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v809 = timesminusplus(reverse(v795), load(tbl, 136 * VECWIDTH + tbloffset), times(v795, load(tbl, 137 * VECWIDTH + tbloffset))); + real2 v969 = timesminusplus(reverse(v955), load(tbl, 168 * VECWIDTH + tbloffset), times(v955, load(tbl, 169 * VECWIDTH + tbloffset))); + real2 v1850 = minus(v969, v809); + real2 v1854 = plus(v809, v969); + real2 v849 = timesminusplus(reverse(v835), load(tbl, 144 * VECWIDTH + tbloffset), times(v835, load(tbl, 145 * VECWIDTH + tbloffset))); + real2 v929 = timesminusplus(reverse(v915), load(tbl, 160 * VECWIDTH + tbloffset), times(v915, load(tbl, 161 * VECWIDTH + tbloffset))); + real2 v889 = timesminusplus(reverse(v875), load(tbl, 152 * VECWIDTH + tbloffset), times(v875, load(tbl, 153 * VECWIDTH + tbloffset))); + real2 v1089 = timesminusplus(reverse(v1075), load(tbl, 192 * VECWIDTH + tbloffset), times(v1075, load(tbl, 193 * VECWIDTH + tbloffset))); + real2 v1009 = timesminusplus(reverse(v995), load(tbl, 176 * VECWIDTH + tbloffset), times(v995, load(tbl, 177 * VECWIDTH + tbloffset))); + real2 v1890 = minus(v1009, v849); + real2 v1894 = plus(v849, v1009); + 
real2 v1849 = reverse(minus(v1049, v889)); + real2 v1855 = plus(v889, v1049); + real2 v1930 = minus(v1855, v1854); + real2 v1934 = plus(v1854, v1855); + real2 v1895 = plus(v929, v1089); + real2 v1889 = reverse(minus(v1089, v929)); + real2 v1929 = reverse(minus(v1895, v1894)); + real2 v1935 = plus(v1894, v1895); + real2 v1955 = plus(v1934, v1935); + real2 v1949 = reverse(minus(v1935, v1934)); + scatter(out, 6, 128, plus(v1954, v1955)); + real2 v1968 = minus(v1954, v1955); + scatter(out, 70, 128, timesminusplus(v1968, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1968), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1951 = minusplus(v1949, v1950); + scatter(out, 38, 128, timesminusplus(reverse(v1951), load(tbl, 346 * VECWIDTH + tbloffset), times(v1951, load(tbl, 347 * VECWIDTH + tbloffset)))); + real2 v1953 = minusplus(uminus(v1949), v1950); + scatter(out, 102, 128, timesminusplus(reverse(v1953), load(tbl, 348 * VECWIDTH + tbloffset), times(v1953, load(tbl, 349 * VECWIDTH + tbloffset)))); + real2 v1931 = minusplus(v1929, v1930); + real2 v1933 = minusplus(uminus(v1929), v1930); + real2 v1947 = timesminusplus(reverse(v1933), load(tbl, 344 * VECWIDTH + tbloffset), times(v1933, load(tbl, 345 * VECWIDTH + tbloffset))); + scatter(out, 54, 128, plus(v1927, v1947)); + real2 v1980 = minus(v1927, v1947); + scatter(out, 118, 128, timesminusplus(v1980, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1980), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1941 = timesminusplus(reverse(v1931), load(tbl, 342 * VECWIDTH + tbloffset), times(v1931, load(tbl, 343 * VECWIDTH + tbloffset))); + scatter(out, 22, 128, plus(v1921, v1941)); + real2 v1974 = minus(v1921, v1941); + scatter(out, 86, 128, timesminusplus(v1974, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v1974), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1851 = minusplus(v1849, v1850); + real2 v1853 = minusplus(uminus(v1849), v1850); + real2 v1867 = timesminusplus(reverse(v1853), load(tbl, 328 * VECWIDTH + 
tbloffset), times(v1853, load(tbl, 329 * VECWIDTH + tbloffset))); + real2 v1891 = minusplus(v1889, v1890); + real2 v1893 = minusplus(uminus(v1889), v1890); + real2 v1907 = timesminusplus(reverse(v1893), load(tbl, 336 * VECWIDTH + tbloffset), times(v1893, load(tbl, 337 * VECWIDTH + tbloffset))); + real2 v2019 = plus(v1867, v1907); + real2 v2013 = reverse(minus(v1907, v1867)); + scatter(out, 30, 128, plus(v2018, v2019)); + real2 v2032 = minus(v2018, v2019); + scatter(out, 94, 128, timesminusplus(v2032, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2032), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2017 = minusplus(uminus(v2013), v2014); + scatter(out, 126, 128, timesminusplus(reverse(v2017), load(tbl, 356 * VECWIDTH + tbloffset), times(v2017, load(tbl, 357 * VECWIDTH + tbloffset)))); + real2 v2015 = minusplus(v2013, v2014); + scatter(out, 62, 128, timesminusplus(reverse(v2015), load(tbl, 354 * VECWIDTH + tbloffset), times(v2015, load(tbl, 355 * VECWIDTH + tbloffset)))); + real2 v1861 = timesminusplus(reverse(v1851), load(tbl, 326 * VECWIDTH + tbloffset), times(v1851, load(tbl, 327 * VECWIDTH + tbloffset))); + real2 v1901 = timesminusplus(reverse(v1891), load(tbl, 334 * VECWIDTH + tbloffset), times(v1891, load(tbl, 335 * VECWIDTH + tbloffset))); + real2 v1993 = plus(v1861, v1901); + real2 v1987 = reverse(minus(v1901, v1861)); + scatter(out, 14, 128, plus(v1992, v1993)); + real2 v2006 = minus(v1992, v1993); + scatter(out, 78, 128, timesminusplus(v2006, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2006), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v1991 = minusplus(uminus(v1987), v1988); + scatter(out, 110, 128, timesminusplus(reverse(v1991), load(tbl, 352 * VECWIDTH + tbloffset), times(v1991, load(tbl, 353 * VECWIDTH + tbloffset)))); + real2 v1989 = minusplus(v1987, v1988); + scatter(out, 46, 128, timesminusplus(reverse(v1989), load(tbl, 350 * VECWIDTH + tbloffset), times(v1989, load(tbl, 351 * VECWIDTH + tbloffset)))); + real2 v593 = 
minusplus(v591, v592); + real2 v595 = minusplus(uminus(v591), v592); + real2 v473 = minusplus(v471, v472); + real2 v475 = minusplus(uminus(v471), v472); + real2 v555 = minusplus(uminus(v551), v552); + real2 v553 = minusplus(v551, v552); + real2 v609 = timesminusplus(reverse(v595), load(tbl, 96 * VECWIDTH + tbloffset), times(v595, load(tbl, 97 * VECWIDTH + tbloffset))); + real2 v195 = minusplus(uminus(v191), v192); + real2 v193 = minusplus(v191, v192); + real2 v275 = minusplus(uminus(v271), v272); + real2 v273 = minusplus(v271, v272); + real2 v673 = minusplus(v671, v672); + real2 v675 = minusplus(uminus(v671), v672); + real2 v689 = timesminusplus(reverse(v675), load(tbl, 112 * VECWIDTH + tbloffset), times(v675, load(tbl, 113 * VECWIDTH + tbloffset))); + real2 v209 = timesminusplus(reverse(v195), load(tbl, 16 * VECWIDTH + tbloffset), times(v195, load(tbl, 17 * VECWIDTH + tbloffset))); + real2 v289 = timesminusplus(reverse(v275), load(tbl, 32 * VECWIDTH + tbloffset), times(v275, load(tbl, 33 * VECWIDTH + tbloffset))); + real2 v755 = minusplus(uminus(v751), v752); + real2 v753 = minusplus(v751, v752); + real2 v435 = minusplus(uminus(v431), v432); + real2 v433 = minusplus(v431, v432); + real2 v513 = minusplus(v511, v512); + real2 v515 = minusplus(uminus(v511), v512); + real2 v529 = timesminusplus(reverse(v515), load(tbl, 80 * VECWIDTH + tbloffset), times(v515, load(tbl, 81 * VECWIDTH + tbloffset))); + real2 v353 = minusplus(v351, v352); + real2 v355 = minusplus(uminus(v351), v352); + real2 v369 = timesminusplus(reverse(v355), load(tbl, 48 * VECWIDTH + tbloffset), times(v355, load(tbl, 49 * VECWIDTH + tbloffset))); + real2 v2631 = plus(v369, v689); + real2 v2625 = reverse(minus(v689, v369)); + real2 v449 = timesminusplus(reverse(v435), load(tbl, 64 * VECWIDTH + tbloffset), times(v435, load(tbl, 65 * VECWIDTH + tbloffset))); + real2 v2710 = plus(v289, v609); + real2 v2706 = minus(v609, v289); + real2 v2630 = plus(v209, v529); + real2 v2626 = minus(v529, v209); + real2 
v2790 = plus(v2630, v2631); + real2 v2786 = minus(v2631, v2630); + real2 v713 = minusplus(v711, v712); + real2 v715 = minusplus(uminus(v711), v712); + real2 v769 = timesminusplus(reverse(v755), load(tbl, 128 * VECWIDTH + tbloffset), times(v755, load(tbl, 129 * VECWIDTH + tbloffset))); + real2 v2705 = reverse(minus(v769, v449)); + real2 v2711 = plus(v449, v769); + real2 v313 = minusplus(v311, v312); + real2 v315 = minusplus(uminus(v311), v312); + real2 v393 = minusplus(v391, v392); + real2 v395 = minusplus(uminus(v391), v392); + real2 v409 = timesminusplus(reverse(v395), load(tbl, 56 * VECWIDTH + tbloffset), times(v395, load(tbl, 57 * VECWIDTH + tbloffset))); + real2 v729 = timesminusplus(reverse(v715), load(tbl, 120 * VECWIDTH + tbloffset), times(v715, load(tbl, 121 * VECWIDTH + tbloffset))); + real2 v329 = timesminusplus(reverse(v315), load(tbl, 40 * VECWIDTH + tbloffset), times(v315, load(tbl, 41 * VECWIDTH + tbloffset))); + real2 v489 = timesminusplus(reverse(v475), load(tbl, 72 * VECWIDTH + tbloffset), times(v475, load(tbl, 73 * VECWIDTH + tbloffset))); + real2 v153 = minusplus(v151, v152); + real2 v155 = minusplus(uminus(v151), v152); + real2 v169 = timesminusplus(reverse(v155), load(tbl, 8 * VECWIDTH + tbloffset), times(v155, load(tbl, 9 * VECWIDTH + tbloffset))); + real2 v2586 = minus(v489, v169); + real2 v2590 = plus(v169, v489); + real2 v233 = minusplus(v231, v232); + real2 v235 = minusplus(uminus(v231), v232); + real2 v633 = minusplus(v631, v632); + real2 v635 = minusplus(uminus(v631), v632); + real2 v649 = timesminusplus(reverse(v635), load(tbl, 104 * VECWIDTH + tbloffset), times(v635, load(tbl, 105 * VECWIDTH + tbloffset))); + real2 v249 = timesminusplus(reverse(v235), load(tbl, 24 * VECWIDTH + tbloffset), times(v235, load(tbl, 25 * VECWIDTH + tbloffset))); + real2 v569 = timesminusplus(reverse(v555), load(tbl, 88 * VECWIDTH + tbloffset), times(v555, load(tbl, 89 * VECWIDTH + tbloffset))); + real2 v2670 = plus(v249, v569); + real2 v2666 = minus(v569, 
v249); + real2 v2785 = reverse(minus(v2711, v2710)); + real2 v2791 = plus(v2710, v2711); + real2 v2825 = reverse(minus(v2791, v2790)); + real2 v2831 = plus(v2790, v2791); + real2 v2671 = plus(v409, v729); + real2 v2665 = reverse(minus(v729, v409)); + real2 v2745 = reverse(minus(v2671, v2670)); + real2 v2751 = plus(v2670, v2671); + real2 v2806 = minus(v2731, v2730); + real2 v2810 = plus(v2730, v2731); + real2 v2846 = minus(v2811, v2810); + real2 v2850 = plus(v2810, v2811); + real2 v2591 = plus(v329, v649); + real2 v2585 = reverse(minus(v649, v329)); + real2 v2750 = plus(v2590, v2591); + real2 v2746 = minus(v2591, v2590); + real2 v2830 = plus(v2750, v2751); + real2 v2826 = minus(v2751, v2750); + real2 v2845 = reverse(minus(v2831, v2830)); + real2 v2851 = plus(v2830, v2831); + scatter(out, 3, 128, plus(v2850, v2851)); + real2 v2864 = minus(v2850, v2851); + scatter(out, 67, 128, timesminusplus(v2864, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2864), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2849 = minusplus(uminus(v2845), v2846); + real2 v2847 = minusplus(v2845, v2846); + scatter(out, 35, 128, timesminusplus(reverse(v2847), load(tbl, 506 * VECWIDTH + tbloffset), times(v2847, load(tbl, 507 * VECWIDTH + tbloffset)))); + scatter(out, 99, 128, timesminusplus(reverse(v2849), load(tbl, 508 * VECWIDTH + tbloffset), times(v2849, load(tbl, 509 * VECWIDTH + tbloffset)))); + real2 v2827 = minusplus(v2825, v2826); + real2 v2829 = minusplus(uminus(v2825), v2826); + real2 v2837 = timesminusplus(reverse(v2827), load(tbl, 502 * VECWIDTH + tbloffset), times(v2827, load(tbl, 503 * VECWIDTH + tbloffset))); + real2 v2809 = minusplus(uminus(v2805), v2806); + real2 v2807 = minusplus(v2805, v2806); + real2 v2817 = timesminusplus(reverse(v2807), load(tbl, 498 * VECWIDTH + tbloffset), times(v2807, load(tbl, 499 * VECWIDTH + tbloffset))); + scatter(out, 19, 128, plus(v2817, v2837)); + real2 v2870 = minus(v2817, v2837); + scatter(out, 83, 128, timesminusplus(v2870, load(tbl, 0 * 
VECWIDTH + tbloffset), times(reverse(v2870), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2823 = timesminusplus(reverse(v2809), load(tbl, 500 * VECWIDTH + tbloffset), times(v2809, load(tbl, 501 * VECWIDTH + tbloffset))); + real2 v2843 = timesminusplus(reverse(v2829), load(tbl, 504 * VECWIDTH + tbloffset), times(v2829, load(tbl, 505 * VECWIDTH + tbloffset))); + scatter(out, 51, 128, plus(v2823, v2843)); + real2 v2876 = minus(v2823, v2843); + scatter(out, 115, 128, timesminusplus(v2876, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2876), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2787 = minusplus(v2785, v2786); + real2 v2789 = minusplus(uminus(v2785), v2786); + real2 v2803 = timesminusplus(reverse(v2789), load(tbl, 496 * VECWIDTH + tbloffset), times(v2789, load(tbl, 497 * VECWIDTH + tbloffset))); + real2 v2727 = minusplus(v2725, v2726); + real2 v2729 = minusplus(uminus(v2725), v2726); + real2 v2743 = timesminusplus(reverse(v2729), load(tbl, 484 * VECWIDTH + tbloffset), times(v2729, load(tbl, 485 * VECWIDTH + tbloffset))); + real2 v2914 = plus(v2743, v2783); + real2 v2910 = minus(v2783, v2743); + real2 v2749 = minusplus(uminus(v2745), v2746); + real2 v2747 = minusplus(v2745, v2746); + real2 v2763 = timesminusplus(reverse(v2749), load(tbl, 488 * VECWIDTH + tbloffset), times(v2749, load(tbl, 489 * VECWIDTH + tbloffset))); + real2 v2909 = reverse(minus(v2803, v2763)); + real2 v2915 = plus(v2763, v2803); + scatter(out, 27, 128, plus(v2914, v2915)); + real2 v2928 = minus(v2914, v2915); + scatter(out, 91, 128, timesminusplus(v2928, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2928), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2913 = minusplus(uminus(v2909), v2910); + scatter(out, 123, 128, timesminusplus(reverse(v2913), load(tbl, 516 * VECWIDTH + tbloffset), times(v2913, load(tbl, 517 * VECWIDTH + tbloffset)))); + real2 v2911 = minusplus(v2909, v2910); + scatter(out, 59, 128, timesminusplus(reverse(v2911), load(tbl, 514 * VECWIDTH + tbloffset), 
times(v2911, load(tbl, 515 * VECWIDTH + tbloffset)))); + real2 v2737 = timesminusplus(reverse(v2727), load(tbl, 482 * VECWIDTH + tbloffset), times(v2727, load(tbl, 483 * VECWIDTH + tbloffset))); + real2 v2888 = plus(v2737, v2777); + real2 v2884 = minus(v2777, v2737); + real2 v2797 = timesminusplus(reverse(v2787), load(tbl, 494 * VECWIDTH + tbloffset), times(v2787, load(tbl, 495 * VECWIDTH + tbloffset))); + real2 v2757 = timesminusplus(reverse(v2747), load(tbl, 486 * VECWIDTH + tbloffset), times(v2747, load(tbl, 487 * VECWIDTH + tbloffset))); + real2 v2889 = plus(v2757, v2797); + real2 v2883 = reverse(minus(v2797, v2757)); + scatter(out, 11, 128, plus(v2888, v2889)); + real2 v2902 = minus(v2888, v2889); + scatter(out, 75, 128, timesminusplus(v2902, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2902), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2887 = minusplus(uminus(v2883), v2884); + scatter(out, 107, 128, timesminusplus(reverse(v2887), load(tbl, 512 * VECWIDTH + tbloffset), times(v2887, load(tbl, 513 * VECWIDTH + tbloffset)))); + real2 v2885 = minusplus(v2883, v2884); + scatter(out, 43, 128, timesminusplus(reverse(v2885), load(tbl, 510 * VECWIDTH + tbloffset), times(v2885, load(tbl, 511 * VECWIDTH + tbloffset)))); + real2 v2669 = minusplus(uminus(v2665), v2666); + real2 v2667 = minusplus(v2665, v2666); + real2 v2707 = minusplus(v2705, v2706); + real2 v2709 = minusplus(uminus(v2705), v2706); + real2 v2717 = timesminusplus(reverse(v2707), load(tbl, 478 * VECWIDTH + tbloffset), times(v2707, load(tbl, 479 * VECWIDTH + tbloffset))); + real2 v2627 = minusplus(v2625, v2626); + real2 v2629 = minusplus(uminus(v2625), v2626); + real2 v2637 = timesminusplus(reverse(v2627), load(tbl, 462 * VECWIDTH + tbloffset), times(v2627, load(tbl, 463 * VECWIDTH + tbloffset))); + real2 v2961 = plus(v2637, v2717); + real2 v2955 = reverse(minus(v2717, v2637)); + real2 v2649 = minusplus(uminus(v2645), v2646); + real2 v2647 = minusplus(v2645, v2646); + real2 v2569 = 
minusplus(uminus(v2565), v2566); + real2 v2567 = minusplus(v2565, v2566); + real2 v2577 = timesminusplus(reverse(v2567), load(tbl, 450 * VECWIDTH + tbloffset), times(v2567, load(tbl, 451 * VECWIDTH + tbloffset))); + real2 v2657 = timesminusplus(reverse(v2647), load(tbl, 466 * VECWIDTH + tbloffset), times(v2647, load(tbl, 467 * VECWIDTH + tbloffset))); + real2 v2936 = minus(v2657, v2577); + real2 v2940 = plus(v2577, v2657); + real2 v2976 = minus(v2941, v2940); + real2 v2980 = plus(v2940, v2941); + real2 v2677 = timesminusplus(reverse(v2667), load(tbl, 470 * VECWIDTH + tbloffset), times(v2667, load(tbl, 471 * VECWIDTH + tbloffset))); + real2 v2587 = minusplus(v2585, v2586); + real2 v2589 = minusplus(uminus(v2585), v2586); + real2 v2597 = timesminusplus(reverse(v2587), load(tbl, 454 * VECWIDTH + tbloffset), times(v2587, load(tbl, 455 * VECWIDTH + tbloffset))); + real2 v2956 = minus(v2677, v2597); + real2 v2960 = plus(v2597, v2677); + real2 v2975 = reverse(minus(v2961, v2960)); + real2 v2981 = plus(v2960, v2961); + scatter(out, 7, 128, plus(v2980, v2981)); + real2 v2994 = minus(v2980, v2981); + scatter(out, 71, 128, timesminusplus(v2994, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2994), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2979 = minusplus(uminus(v2975), v2976); + scatter(out, 103, 128, timesminusplus(reverse(v2979), load(tbl, 528 * VECWIDTH + tbloffset), times(v2979, load(tbl, 529 * VECWIDTH + tbloffset)))); + real2 v2977 = minusplus(v2975, v2976); + scatter(out, 39, 128, timesminusplus(reverse(v2977), load(tbl, 526 * VECWIDTH + tbloffset), times(v2977, load(tbl, 527 * VECWIDTH + tbloffset)))); + real2 v2939 = minusplus(uminus(v2935), v2936); + real2 v2937 = minusplus(v2935, v2936); + real2 v2953 = timesminusplus(reverse(v2939), load(tbl, 520 * VECWIDTH + tbloffset), times(v2939, load(tbl, 521 * VECWIDTH + tbloffset))); + real2 v2957 = minusplus(v2955, v2956); + real2 v2959 = minusplus(uminus(v2955), v2956); + real2 v2973 = 
timesminusplus(reverse(v2959), load(tbl, 524 * VECWIDTH + tbloffset), times(v2959, load(tbl, 525 * VECWIDTH + tbloffset))); + scatter(out, 55, 128, plus(v2953, v2973)); + real2 v3006 = minus(v2953, v2973); + scatter(out, 119, 128, timesminusplus(v3006, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3006), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2947 = timesminusplus(reverse(v2937), load(tbl, 518 * VECWIDTH + tbloffset), times(v2937, load(tbl, 519 * VECWIDTH + tbloffset))); + real2 v2967 = timesminusplus(reverse(v2957), load(tbl, 522 * VECWIDTH + tbloffset), times(v2957, load(tbl, 523 * VECWIDTH + tbloffset))); + scatter(out, 23, 128, plus(v2947, v2967)); + real2 v3000 = minus(v2947, v2967); + scatter(out, 87, 128, timesminusplus(v3000, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3000), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2663 = timesminusplus(reverse(v2649), load(tbl, 468 * VECWIDTH + tbloffset), times(v2649, load(tbl, 469 * VECWIDTH + tbloffset))); + real2 v2583 = timesminusplus(reverse(v2569), load(tbl, 452 * VECWIDTH + tbloffset), times(v2569, load(tbl, 453 * VECWIDTH + tbloffset))); + real2 v3014 = minus(v2663, v2583); + real2 v3018 = plus(v2583, v2663); + real2 v3015 = minusplus(v3013, v3014); + real2 v3017 = minusplus(uminus(v3013), v3014); + real2 v2643 = timesminusplus(reverse(v2629), load(tbl, 464 * VECWIDTH + tbloffset), times(v2629, load(tbl, 465 * VECWIDTH + tbloffset))); + real2 v2723 = timesminusplus(reverse(v2709), load(tbl, 480 * VECWIDTH + tbloffset), times(v2709, load(tbl, 481 * VECWIDTH + tbloffset))); + real2 v3039 = plus(v2643, v2723); + real2 v3033 = reverse(minus(v2723, v2643)); + real2 v2683 = timesminusplus(reverse(v2669), load(tbl, 472 * VECWIDTH + tbloffset), times(v2669, load(tbl, 473 * VECWIDTH + tbloffset))); + real2 v3031 = timesminusplus(reverse(v3017), load(tbl, 532 * VECWIDTH + tbloffset), times(v3017, load(tbl, 533 * VECWIDTH + tbloffset))); + real2 v2603 = timesminusplus(reverse(v2589), 
load(tbl, 456 * VECWIDTH + tbloffset), times(v2589, load(tbl, 457 * VECWIDTH + tbloffset))); + real2 v3034 = minus(v2683, v2603); + real2 v3038 = plus(v2603, v2683); + real2 v3037 = minusplus(uminus(v3033), v3034); + real2 v3035 = minusplus(v3033, v3034); + real2 v3051 = timesminusplus(reverse(v3037), load(tbl, 536 * VECWIDTH + tbloffset), times(v3037, load(tbl, 537 * VECWIDTH + tbloffset))); + scatter(out, 63, 128, plus(v3031, v3051)); + real2 v3084 = minus(v3031, v3051); + scatter(out, 127, 128, timesminusplus(v3084, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3084), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v3025 = timesminusplus(reverse(v3015), load(tbl, 530 * VECWIDTH + tbloffset), times(v3015, load(tbl, 531 * VECWIDTH + tbloffset))); + real2 v3045 = timesminusplus(reverse(v3035), load(tbl, 534 * VECWIDTH + tbloffset), times(v3035, load(tbl, 535 * VECWIDTH + tbloffset))); + scatter(out, 31, 128, plus(v3025, v3045)); + real2 v3078 = minus(v3025, v3045); + scatter(out, 95, 128, timesminusplus(v3078, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3078), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v3058 = plus(v3018, v3019); + real2 v3054 = minus(v3019, v3018); + real2 v3053 = reverse(minus(v3039, v3038)); + real2 v3059 = plus(v3038, v3039); + real2 v3055 = minusplus(v3053, v3054); + scatter(out, 47, 128, timesminusplus(reverse(v3055), load(tbl, 538 * VECWIDTH + tbloffset), times(v3055, load(tbl, 539 * VECWIDTH + tbloffset)))); + real2 v3057 = minusplus(uminus(v3053), v3054); + scatter(out, 111, 128, timesminusplus(reverse(v3057), load(tbl, 540 * VECWIDTH + tbloffset), times(v3057, load(tbl, 541 * VECWIDTH + tbloffset)))); + scatter(out, 15, 128, plus(v3058, v3059)); + real2 v3072 = minus(v3058, v3059); + scatter(out, 79, 128, timesminusplus(v3072, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v3072), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v683 = timesminusplus(reverse(v673), load(tbl, 110 * VECWIDTH + tbloffset), times(v673, 
load(tbl, 111 * VECWIDTH + tbloffset))); + real2 v363 = timesminusplus(reverse(v353), load(tbl, 46 * VECWIDTH + tbloffset), times(v353, load(tbl, 47 * VECWIDTH + tbloffset))); + real2 v2105 = plus(v363, v683); + real2 v2099 = reverse(minus(v683, v363)); + real2 v283 = timesminusplus(reverse(v273), load(tbl, 30 * VECWIDTH + tbloffset), times(v273, load(tbl, 31 * VECWIDTH + tbloffset))); + real2 v723 = timesminusplus(reverse(v713), load(tbl, 118 * VECWIDTH + tbloffset), times(v713, load(tbl, 119 * VECWIDTH + tbloffset))); + real2 v403 = timesminusplus(reverse(v393), load(tbl, 54 * VECWIDTH + tbloffset), times(v393, load(tbl, 55 * VECWIDTH + tbloffset))); + real2 v603 = timesminusplus(reverse(v593), load(tbl, 94 * VECWIDTH + tbloffset), times(v593, load(tbl, 95 * VECWIDTH + tbloffset))); + real2 v2180 = minus(v603, v283); + real2 v2184 = plus(v283, v603); + real2 v2145 = plus(v403, v723); + real2 v2139 = reverse(minus(v723, v403)); + real2 v543 = timesminusplus(reverse(v533), load(tbl, 82 * VECWIDTH + tbloffset), times(v533, load(tbl, 83 * VECWIDTH + tbloffset))); + real2 v383 = timesminusplus(reverse(v373), load(tbl, 50 * VECWIDTH + tbloffset), times(v373, load(tbl, 51 * VECWIDTH + tbloffset))); + real2 v703 = timesminusplus(reverse(v693), load(tbl, 114 * VECWIDTH + tbloffset), times(v693, load(tbl, 115 * VECWIDTH + tbloffset))); + real2 v2125 = plus(v383, v703); + real2 v2119 = reverse(minus(v703, v383)); + real2 v223 = timesminusplus(reverse(v213), load(tbl, 18 * VECWIDTH + tbloffset), times(v213, load(tbl, 19 * VECWIDTH + tbloffset))); + real2 v2120 = minus(v543, v223); + real2 v2124 = plus(v223, v543); + real2 v443 = timesminusplus(reverse(v433), load(tbl, 62 * VECWIDTH + tbloffset), times(v433, load(tbl, 63 * VECWIDTH + tbloffset))); + real2 v203 = timesminusplus(reverse(v193), load(tbl, 14 * VECWIDTH + tbloffset), times(v193, load(tbl, 15 * VECWIDTH + tbloffset))); + real2 v763 = timesminusplus(reverse(v753), load(tbl, 126 * VECWIDTH + tbloffset), times(v753, 
load(tbl, 127 * VECWIDTH + tbloffset))); + real2 v2179 = reverse(minus(v763, v443)); + real2 v2185 = plus(v443, v763); + real2 v523 = timesminusplus(reverse(v513), load(tbl, 78 * VECWIDTH + tbloffset), times(v513, load(tbl, 79 * VECWIDTH + tbloffset))); + real2 v2100 = minus(v523, v203); + real2 v2104 = plus(v203, v523); + real2 v2264 = plus(v2104, v2105); + real2 v2260 = minus(v2105, v2104); + real2 v643 = timesminusplus(reverse(v633), load(tbl, 102 * VECWIDTH + tbloffset), times(v633, load(tbl, 103 * VECWIDTH + tbloffset))); + real2 v2265 = plus(v2184, v2185); + real2 v2259 = reverse(minus(v2185, v2184)); + real2 v563 = timesminusplus(reverse(v553), load(tbl, 86 * VECWIDTH + tbloffset), times(v553, load(tbl, 87 * VECWIDTH + tbloffset))); + real2 v243 = timesminusplus(reverse(v233), load(tbl, 22 * VECWIDTH + tbloffset), times(v233, load(tbl, 23 * VECWIDTH + tbloffset))); + real2 v2144 = plus(v243, v563); + real2 v2140 = minus(v563, v243); + real2 v143 = timesminusplus(reverse(v133), load(tbl, 2 * VECWIDTH + tbloffset), times(v133, load(tbl, 3 * VECWIDTH + tbloffset))); + real2 v183 = timesminusplus(reverse(v173), load(tbl, 10 * VECWIDTH + tbloffset), times(v173, load(tbl, 11 * VECWIDTH + tbloffset))); + real2 v2084 = plus(v183, v503); + real2 v2080 = minus(v503, v183); + real2 v163 = timesminusplus(reverse(v153), load(tbl, 6 * VECWIDTH + tbloffset), times(v153, load(tbl, 7 * VECWIDTH + tbloffset))); + real2 v303 = timesminusplus(reverse(v293), load(tbl, 34 * VECWIDTH + tbloffset), times(v293, load(tbl, 35 * VECWIDTH + tbloffset))); + real2 v623 = timesminusplus(reverse(v613), load(tbl, 98 * VECWIDTH + tbloffset), times(v613, load(tbl, 99 * VECWIDTH + tbloffset))); + real2 v2039 = reverse(minus(v623, v303)); + real2 v2045 = plus(v303, v623); + real2 v463 = timesminusplus(reverse(v453), load(tbl, 66 * VECWIDTH + tbloffset), times(v453, load(tbl, 67 * VECWIDTH + tbloffset))); + real2 v2044 = plus(v143, v463); + real2 v2040 = minus(v463, v143); + real2 v2204 = 
plus(v2044, v2045); + real2 v2200 = minus(v2045, v2044); + real2 v323 = timesminusplus(reverse(v313), load(tbl, 38 * VECWIDTH + tbloffset), times(v313, load(tbl, 39 * VECWIDTH + tbloffset))); + real2 v2205 = plus(v2124, v2125); + real2 v2199 = reverse(minus(v2125, v2124)); + real2 v2280 = minus(v2205, v2204); + real2 v2284 = plus(v2204, v2205); + real2 v2225 = plus(v2144, v2145); + real2 v2219 = reverse(minus(v2145, v2144)); + real2 v2305 = plus(v2264, v2265); + real2 v2299 = reverse(minus(v2265, v2264)); + real2 v2240 = minus(v2085, v2084); + real2 v2244 = plus(v2084, v2085); + real2 v2279 = reverse(minus(v2245, v2244)); + real2 v2285 = plus(v2244, v2245); + real2 v2281 = minusplus(v2279, v2280); + real2 v2283 = minusplus(uminus(v2279), v2280); + real2 v2291 = timesminusplus(reverse(v2281), load(tbl, 406 * VECWIDTH + tbloffset), times(v2281, load(tbl, 407 * VECWIDTH + tbloffset))); + real2 v483 = timesminusplus(reverse(v473), load(tbl, 70 * VECWIDTH + tbloffset), times(v473, load(tbl, 71 * VECWIDTH + tbloffset))); + real2 v2060 = minus(v483, v163); + real2 v2064 = plus(v163, v483); + real2 v2065 = plus(v323, v643); + real2 v2059 = reverse(minus(v643, v323)); + real2 v2220 = minus(v2065, v2064); + real2 v2224 = plus(v2064, v2065); + real2 v2304 = plus(v2224, v2225); + real2 v2300 = minus(v2225, v2224); + real2 v2301 = minusplus(v2299, v2300); + real2 v2303 = minusplus(uminus(v2299), v2300); + real2 v2311 = timesminusplus(reverse(v2301), load(tbl, 410 * VECWIDTH + tbloffset), times(v2301, load(tbl, 411 * VECWIDTH + tbloffset))); + scatter(out, 17, 128, plus(v2291, v2311)); + real2 v2344 = minus(v2291, v2311); + scatter(out, 81, 128, timesminusplus(v2344, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2344), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2297 = timesminusplus(reverse(v2283), load(tbl, 408 * VECWIDTH + tbloffset), times(v2283, load(tbl, 409 * VECWIDTH + tbloffset))); + real2 v2317 = timesminusplus(reverse(v2303), load(tbl, 412 * VECWIDTH + 
tbloffset), times(v2303, load(tbl, 413 * VECWIDTH + tbloffset))); + scatter(out, 49, 128, plus(v2297, v2317)); + real2 v2350 = minus(v2297, v2317); + scatter(out, 113, 128, timesminusplus(v2350, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2350), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2320 = minus(v2285, v2284); + real2 v2324 = plus(v2284, v2285); + real2 v2325 = plus(v2304, v2305); + real2 v2319 = reverse(minus(v2305, v2304)); + scatter(out, 1, 128, plus(v2324, v2325)); + real2 v2338 = minus(v2324, v2325); + scatter(out, 65, 128, timesminusplus(v2338, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2338), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2321 = minusplus(v2319, v2320); + scatter(out, 33, 128, timesminusplus(reverse(v2321), load(tbl, 414 * VECWIDTH + tbloffset), times(v2321, load(tbl, 415 * VECWIDTH + tbloffset)))); + real2 v2323 = minusplus(uminus(v2319), v2320); + scatter(out, 97, 128, timesminusplus(reverse(v2323), load(tbl, 416 * VECWIDTH + tbloffset), times(v2323, load(tbl, 417 * VECWIDTH + tbloffset)))); + real2 v2201 = minusplus(v2199, v2200); + real2 v2203 = minusplus(uminus(v2199), v2200); + real2 v2263 = minusplus(uminus(v2259), v2260); + real2 v2261 = minusplus(v2259, v2260); + real2 v2243 = minusplus(uminus(v2239), v2240); + real2 v2241 = minusplus(v2239, v2240); + real2 v2257 = timesminusplus(reverse(v2243), load(tbl, 400 * VECWIDTH + tbloffset), times(v2243, load(tbl, 401 * VECWIDTH + tbloffset))); + real2 v2217 = timesminusplus(reverse(v2203), load(tbl, 392 * VECWIDTH + tbloffset), times(v2203, load(tbl, 393 * VECWIDTH + tbloffset))); + real2 v2388 = plus(v2217, v2257); + real2 v2384 = minus(v2257, v2217); + real2 v2277 = timesminusplus(reverse(v2263), load(tbl, 404 * VECWIDTH + tbloffset), times(v2263, load(tbl, 405 * VECWIDTH + tbloffset))); + real2 v2221 = minusplus(v2219, v2220); + real2 v2223 = minusplus(uminus(v2219), v2220); + real2 v2237 = timesminusplus(reverse(v2223), load(tbl, 396 * VECWIDTH + 
tbloffset), times(v2223, load(tbl, 397 * VECWIDTH + tbloffset))); + real2 v2389 = plus(v2237, v2277); + real2 v2383 = reverse(minus(v2277, v2237)); + scatter(out, 25, 128, plus(v2388, v2389)); + real2 v2402 = minus(v2388, v2389); + scatter(out, 89, 128, timesminusplus(v2402, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2402), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2385 = minusplus(v2383, v2384); + real2 v2387 = minusplus(uminus(v2383), v2384); + scatter(out, 121, 128, timesminusplus(reverse(v2387), load(tbl, 424 * VECWIDTH + tbloffset), times(v2387, load(tbl, 425 * VECWIDTH + tbloffset)))); + scatter(out, 57, 128, timesminusplus(reverse(v2385), load(tbl, 422 * VECWIDTH + tbloffset), times(v2385, load(tbl, 423 * VECWIDTH + tbloffset)))); + real2 v2251 = timesminusplus(reverse(v2241), load(tbl, 398 * VECWIDTH + tbloffset), times(v2241, load(tbl, 399 * VECWIDTH + tbloffset))); + real2 v2211 = timesminusplus(reverse(v2201), load(tbl, 390 * VECWIDTH + tbloffset), times(v2201, load(tbl, 391 * VECWIDTH + tbloffset))); + real2 v2358 = minus(v2251, v2211); + real2 v2362 = plus(v2211, v2251); + real2 v2271 = timesminusplus(reverse(v2261), load(tbl, 402 * VECWIDTH + tbloffset), times(v2261, load(tbl, 403 * VECWIDTH + tbloffset))); + real2 v2231 = timesminusplus(reverse(v2221), load(tbl, 394 * VECWIDTH + tbloffset), times(v2221, load(tbl, 395 * VECWIDTH + tbloffset))); + real2 v2357 = reverse(minus(v2271, v2231)); + real2 v2363 = plus(v2231, v2271); + scatter(out, 9, 128, plus(v2362, v2363)); + real2 v2376 = minus(v2362, v2363); + scatter(out, 73, 128, timesminusplus(v2376, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2376), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2361 = minusplus(uminus(v2357), v2358); + scatter(out, 105, 128, timesminusplus(reverse(v2361), load(tbl, 420 * VECWIDTH + tbloffset), times(v2361, load(tbl, 421 * VECWIDTH + tbloffset)))); + real2 v2359 = minusplus(v2357, v2358); + scatter(out, 41, 128, timesminusplus(reverse(v2359), 
load(tbl, 418 * VECWIDTH + tbloffset), times(v2359, load(tbl, 419 * VECWIDTH + tbloffset)))); + real2 v2121 = minusplus(v2119, v2120); + real2 v2123 = minusplus(uminus(v2119), v2120); + real2 v2083 = minusplus(uminus(v2079), v2080); + real2 v2081 = minusplus(v2079, v2080); + real2 v2091 = timesminusplus(reverse(v2081), load(tbl, 366 * VECWIDTH + tbloffset), times(v2081, load(tbl, 367 * VECWIDTH + tbloffset))); + real2 v2043 = minusplus(uminus(v2039), v2040); + real2 v2041 = minusplus(v2039, v2040); + real2 v2051 = timesminusplus(reverse(v2041), load(tbl, 358 * VECWIDTH + tbloffset), times(v2041, load(tbl, 359 * VECWIDTH + tbloffset))); + real2 v2131 = timesminusplus(reverse(v2121), load(tbl, 374 * VECWIDTH + tbloffset), times(v2121, load(tbl, 375 * VECWIDTH + tbloffset))); + real2 v2163 = minusplus(uminus(v2159), v2160); + real2 v2161 = minusplus(v2159, v2160); + real2 v2171 = timesminusplus(reverse(v2161), load(tbl, 382 * VECWIDTH + tbloffset), times(v2161, load(tbl, 383 * VECWIDTH + tbloffset))); + real2 v2409 = reverse(minus(v2171, v2091)); + real2 v2415 = plus(v2091, v2171); + real2 v2410 = minus(v2131, v2051); + real2 v2414 = plus(v2051, v2131); + real2 v2454 = plus(v2414, v2415); + real2 v2450 = minus(v2415, v2414); + real2 v2181 = minusplus(v2179, v2180); + real2 v2183 = minusplus(uminus(v2179), v2180); + real2 v2191 = timesminusplus(reverse(v2181), load(tbl, 386 * VECWIDTH + tbloffset), times(v2181, load(tbl, 387 * VECWIDTH + tbloffset))); + real2 v2103 = minusplus(uminus(v2099), v2100); + real2 v2101 = minusplus(v2099, v2100); + real2 v2111 = timesminusplus(reverse(v2101), load(tbl, 370 * VECWIDTH + tbloffset), times(v2101, load(tbl, 371 * VECWIDTH + tbloffset))); + real2 v2435 = plus(v2111, v2191); + real2 v2429 = reverse(minus(v2191, v2111)); + real2 v2141 = minusplus(v2139, v2140); + real2 v2143 = minusplus(uminus(v2139), v2140); + real2 v2151 = timesminusplus(reverse(v2141), load(tbl, 378 * VECWIDTH + tbloffset), times(v2141, load(tbl, 379 * VECWIDTH + 
tbloffset))); + real2 v2063 = minusplus(uminus(v2059), v2060); + real2 v2061 = minusplus(v2059, v2060); + real2 v2071 = timesminusplus(reverse(v2061), load(tbl, 362 * VECWIDTH + tbloffset), times(v2061, load(tbl, 363 * VECWIDTH + tbloffset))); + real2 v2434 = plus(v2071, v2151); + real2 v2430 = minus(v2151, v2071); + real2 v2455 = plus(v2434, v2435); + real2 v2449 = reverse(minus(v2435, v2434)); + scatter(out, 5, 128, plus(v2454, v2455)); + real2 v2468 = minus(v2454, v2455); + scatter(out, 69, 128, timesminusplus(v2468, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2468), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2451 = minusplus(v2449, v2450); + real2 v2453 = minusplus(uminus(v2449), v2450); + scatter(out, 101, 128, timesminusplus(reverse(v2453), load(tbl, 436 * VECWIDTH + tbloffset), times(v2453, load(tbl, 437 * VECWIDTH + tbloffset)))); + scatter(out, 37, 128, timesminusplus(reverse(v2451), load(tbl, 434 * VECWIDTH + tbloffset), times(v2451, load(tbl, 435 * VECWIDTH + tbloffset)))); + real2 v2411 = minusplus(v2409, v2410); + real2 v2413 = minusplus(uminus(v2409), v2410); + real2 v2433 = minusplus(uminus(v2429), v2430); + real2 v2431 = minusplus(v2429, v2430); + real2 v2421 = timesminusplus(reverse(v2411), load(tbl, 426 * VECWIDTH + tbloffset), times(v2411, load(tbl, 427 * VECWIDTH + tbloffset))); + real2 v2441 = timesminusplus(reverse(v2431), load(tbl, 430 * VECWIDTH + tbloffset), times(v2431, load(tbl, 431 * VECWIDTH + tbloffset))); + scatter(out, 21, 128, plus(v2421, v2441)); + real2 v2474 = minus(v2421, v2441); + scatter(out, 85, 128, timesminusplus(v2474, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2474), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2427 = timesminusplus(reverse(v2413), load(tbl, 428 * VECWIDTH + tbloffset), times(v2413, load(tbl, 429 * VECWIDTH + tbloffset))); + real2 v2447 = timesminusplus(reverse(v2433), load(tbl, 432 * VECWIDTH + tbloffset), times(v2433, load(tbl, 433 * VECWIDTH + tbloffset))); + scatter(out, 53, 
128, plus(v2427, v2447)); + real2 v2480 = minus(v2427, v2447); + scatter(out, 117, 128, timesminusplus(v2480, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2480), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2057 = timesminusplus(reverse(v2043), load(tbl, 360 * VECWIDTH + tbloffset), times(v2043, load(tbl, 361 * VECWIDTH + tbloffset))); + real2 v2097 = timesminusplus(reverse(v2083), load(tbl, 368 * VECWIDTH + tbloffset), times(v2083, load(tbl, 369 * VECWIDTH + tbloffset))); + real2 v2157 = timesminusplus(reverse(v2143), load(tbl, 380 * VECWIDTH + tbloffset), times(v2143, load(tbl, 381 * VECWIDTH + tbloffset))); + real2 v2197 = timesminusplus(reverse(v2183), load(tbl, 388 * VECWIDTH + tbloffset), times(v2183, load(tbl, 389 * VECWIDTH + tbloffset))); + real2 v2117 = timesminusplus(reverse(v2103), load(tbl, 372 * VECWIDTH + tbloffset), times(v2103, load(tbl, 373 * VECWIDTH + tbloffset))); + real2 v2507 = reverse(minus(v2197, v2117)); + real2 v2513 = plus(v2117, v2197); + real2 v2137 = timesminusplus(reverse(v2123), load(tbl, 376 * VECWIDTH + tbloffset), times(v2123, load(tbl, 377 * VECWIDTH + tbloffset))); + real2 v2488 = minus(v2137, v2057); + real2 v2492 = plus(v2057, v2137); + real2 v2177 = timesminusplus(reverse(v2163), load(tbl, 384 * VECWIDTH + tbloffset), times(v2163, load(tbl, 385 * VECWIDTH + tbloffset))); + real2 v2493 = plus(v2097, v2177); + real2 v2487 = reverse(minus(v2177, v2097)); + real2 v2532 = plus(v2492, v2493); + real2 v2528 = minus(v2493, v2492); + real2 v2077 = timesminusplus(reverse(v2063), load(tbl, 364 * VECWIDTH + tbloffset), times(v2063, load(tbl, 365 * VECWIDTH + tbloffset))); + real2 v2512 = plus(v2077, v2157); + real2 v2508 = minus(v2157, v2077); + real2 v2527 = reverse(minus(v2513, v2512)); + real2 v2533 = plus(v2512, v2513); + real2 v2529 = minusplus(v2527, v2528); + real2 v2531 = minusplus(uminus(v2527), v2528); + scatter(out, 109, 128, timesminusplus(reverse(v2531), load(tbl, 448 * VECWIDTH + tbloffset), times(v2531, 
load(tbl, 449 * VECWIDTH + tbloffset)))); + scatter(out, 45, 128, timesminusplus(reverse(v2529), load(tbl, 446 * VECWIDTH + tbloffset), times(v2529, load(tbl, 447 * VECWIDTH + tbloffset)))); + scatter(out, 13, 128, plus(v2532, v2533)); + real2 v2546 = minus(v2532, v2533); + scatter(out, 77, 128, timesminusplus(v2546, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2546), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2509 = minusplus(v2507, v2508); + real2 v2511 = minusplus(uminus(v2507), v2508); + real2 v2491 = minusplus(uminus(v2487), v2488); + real2 v2489 = minusplus(v2487, v2488); + real2 v2499 = timesminusplus(reverse(v2489), load(tbl, 438 * VECWIDTH + tbloffset), times(v2489, load(tbl, 439 * VECWIDTH + tbloffset))); + real2 v2519 = timesminusplus(reverse(v2509), load(tbl, 442 * VECWIDTH + tbloffset), times(v2509, load(tbl, 443 * VECWIDTH + tbloffset))); + scatter(out, 29, 128, plus(v2499, v2519)); + real2 v2552 = minus(v2499, v2519); + scatter(out, 93, 128, timesminusplus(v2552, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2552), load(tbl, 1 * VECWIDTH + tbloffset)))); + real2 v2505 = timesminusplus(reverse(v2491), load(tbl, 440 * VECWIDTH + tbloffset), times(v2491, load(tbl, 441 * VECWIDTH + tbloffset))); + real2 v2525 = timesminusplus(reverse(v2511), load(tbl, 444 * VECWIDTH + tbloffset), times(v2511, load(tbl, 445 * VECWIDTH + tbloffset))); + scatter(out, 61, 128, plus(v2505, v2525)); + real2 v2558 = minus(v2505, v2525); + scatter(out, 125, 128, timesminusplus(v2558, load(tbl, 0 * VECWIDTH + tbloffset), times(reverse(v2558), load(tbl, 1 * VECWIDTH + tbloffset)))); + // Pres : 76263 + } +} +#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.hpp similarity index 97% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.h rename to 
src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.hpp index bfec5eeff6b..619a832eb73 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/dft/vectortype.hpp @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -6,6 +6,10 @@ #ifndef __VECTORTYPE_H__ #define __VECTORTYPE_H__ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wattributes" +#endif + #include #include "sleef.h" @@ -57,10 +61,6 @@ #include "helpers390x_128.h" #endif -#ifdef ENABLE_VECEXT -#include "helpervecext.h" -#endif - #ifdef ENABLE_PUREC #include "helperpurec.h" #endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/gencoef/gencoef.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/gencoef/gencoef.c index 62ff69d8476..1a0b7401441 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/gencoef/gencoef.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/gencoef/gencoef.c @@ -194,13 +194,12 @@ int main(int argc, char **argv) mpfr_zinit(result[i]); } - mpfr_t fra, frb, frc, frd, fre; + mpfr_t fra, frb, frc, frd; - mpfr_zinit(fra); - mpfr_zinit(frb); - mpfr_zinit(frc); - mpfr_zinit(frd); - mpfr_zinit(fre); + mpfr_init(fra); + mpfr_init(frb); + mpfr_init(frc); + mpfr_init(frd); for(i=0;i /dev/null) -ARCH := $(shell uname -p) - -all : -ifndef BUILDDIR - @echo - @echo Please set the build directory to BUILDDIR environment variable and run make once again. - @echo e.g. export BUILDDIR='`pwd`'/../../build - @echo -else - @echo - @echo You can start measurement by "'"make measure"'". -ifdef ICCAVAILABLE - @echo You can start measurement with SVML by "'"make measureSVML"'". 
-endif - @echo Then, you can plot the results of measurement by "'"make plot"'". - @echo - @echo You have to install java and gnuplot to do plotting. - @echo Stop all tasks on the computer before starting measurement. - @echo -endif - -benchsvml128_10.o : benchsvml128.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o - -benchsvml128_40.o : benchsvml128.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o - -benchsvml256_10.o : benchsvml256.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o - -benchsvml256_40.o : benchsvml256.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o - -benchsvml512_10.o : benchsvml512.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o - -benchsvml512_40.o : benchsvml512.c bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o - - -benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. 
-DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10 - -benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h - -command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40 - -# - -ifeq ($(ARCH),aarch64) - -benchsleef : benchsleef.c benchsleef128.o bench.h - $(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef - -benchsleef128.o : benchsleef128.c bench.h - $(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -else ifeq ($(ARCH),s390x) - -benchsleef : benchsleef.c benchsleef128.o bench.h - $(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef - -benchsleef128.o : benchsleef128.c bench.h - $(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -else ifeq ($(ARCH),ppc64le) - -benchsleef : benchsleef.c benchsleef128.o bench.h - $(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef - -benchsleef128.o : benchsleef128.c bench.h - $(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -else - -benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h - $(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef - -benchsleef128.o : benchsleef128.c bench.h - $(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -benchsleef256.o : benchsleef256.c 
bench.h - $(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -benchsleef512.o : benchsleef512.c bench.h - $(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c - -endif - -# - -ProcessData.class : ProcessData.java - javac ProcessData.java - -# - -ifndef BUILDDIR -measure : - @echo - @echo Please set the build directory to BUILDDIR environment variable and run make once again. - @echo e.g. export BUILDDIR='`pwd`'/../../build - @echo -else -measure : benchsleef - chmod +x ./measure.sh - LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef - @echo - @echo Now, you can plot the results of measurement by "'"make plot"'". - @echo You can do another measurement by "'"make measure"'". -ifdef ICCAVAILABLE - @echo You can start another measurement with SVML by "'"make measureSVML"'". -endif - @echo You can start over by "'"make restart"'". - @echo -endif - -measureSVML : all benchsvml_10 benchsvml_40 - chmod +x ./measure.sh - ./measure.sh ./benchsvml_10 ./benchsvml_40 - @echo - @echo Now, you can plot the results of measurement by "'"make plot"'". - @echo You can do another measurement by "'"make measure"'". -ifdef ICCAVAILABLE - @echo You can start another measurement with SVML by "'"make measureSVML"'". -endif - @echo You can start over by "'"make restart"'". - @echo - -plot : ProcessData.class counter.txt - java ProcessData *dptrig*.out - gnuplot script.out - mv output.png trigdp.png - java ProcessData *dpnontrig*.out - gnuplot script.out - mv output.png nontrigdp.png - java ProcessData *sptrig*.out - gnuplot script.out - mv output.png trigsp.png - java ProcessData *spnontrig*.out - gnuplot script.out - mv output.png nontrigsp.png - @echo - @echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png. 
- @echo - -clean : - rm -f *~ a.out *.so *.so.* *.a *.s *.o - rm -rf *.dSYM *.dylib - rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump - rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt - -restart : - rm -f *.out counter.txt diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/ProcessData.java b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/ProcessData.java deleted file mode 100644 index 7231191e990..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/ProcessData.java +++ /dev/null @@ -1,193 +0,0 @@ -import java.util.*; -import java.io.*; - -public class ProcessData { - static final int DP = 64, SP = 32; - - static LinkedHashMap funcNameOrder = new LinkedHashMap(); - - static class Key { - final String funcName; - - final int prec, bits; - final ArrayList range = new ArrayList(); - final double ulps; - - Key(String s) { - String[] a = s.split(","); - - funcName = a[0].trim(); - if (funcNameOrder.get(funcName) == null) { - funcNameOrder.put(funcName, funcNameOrder.size()); - } - - prec = - a[1].trim().equals("DP") ? DP : - a[1].trim().equals("SP") ? 
SP : - 0; - - bits = Integer.parseInt(a[2].trim()); - - int c; - - for(c = 3;;c++) { - if (a[c].trim().endsWith("ulps")) break; - range.add(Double.parseDouble(a[c])); - } - - ulps = Double.parseDouble(a[c].trim().replace("ulps", "")); - } - - public int hashCode() { - int h = funcName.hashCode(); - h ^= prec ^ bits; - return h; - } - - public boolean equals(Object o) { - if (this == o) return true; - Key k = (Key) o; - if (funcName.compareTo(k.funcName) != 0) return false; - if (prec != k.prec) return false; - if (bits != k.bits) return false; - if (range.size() != k.range.size()) return false; - for(int i=0;i { - public int compare(Key d0, Key d1) { - if (d0 == d1) return 0; - if (d0.prec < d1.prec) return 1; - if (d0.prec > d1.prec) return -1; - if (d0.ulps > d1.ulps) return 1; - if (d0.ulps < d1.ulps) return -1; - - int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName); - if (fc != 0) return fc; - - if (d0.bits > d1.bits) return 1; - if (d0.bits < d1.bits) return -1; - - if (d0.range.size() > d1.range.size()) return 1; - if (d0.range.size() < d1.range.size()) return -1; - - for(int i=0;i d1.range.get(i)) return 1; - if (d0.range.get(i) < d1.range.get(i)) return -1; - } - - return 0; - } - } - - public static void main(String[] args) throws Exception { - LinkedHashMap> allData = new LinkedHashMap>(); - TreeSet allKeys = new TreeSet(new KeyComparator()); - LinkedHashSet allColumnTitles = new LinkedHashSet(); - double maximum = 0; - - for(int i=0;i v = allData.get(key); - if (v == null) { - v = new LinkedHashMap(); - allData.put(key, v); - } - String[] a = s.split(","); - - double time = Double.parseDouble(a[a.length-1]); - v.put(columnTitle, time); - maximum = Math.max(maximum, time); - } - - lnr.close(); - } - - PrintStream ps = new PrintStream("data.out"); - - for(Key k : allKeys) { - ps.print("\"" + k + "\" "); - - LinkedHashMap v = allData.get(k); - - for(String s : allColumnTitles) { - Double d = v.get(s); - if (d != null) 
ps.print(d); - if (d == null) ps.print("0"); - ps.print("\t"); - } - ps.println(); - } - - ps.close(); - - ps = new PrintStream("script.out"); - - ps.println("set terminal pngcairo size 1280, 800 font \",10\""); - ps.println("set output \"output.png\""); - - ps.println("color00 = \"#FF5050\";"); // red - ps.println("color01 = \"#0066FF\";"); // blue - ps.println("color02 = \"#00FF00\";"); // green - ps.println("color03 = \"#FF9900\";"); // orange - ps.println("color04 = \"#CC00CC\";"); // purple - ps.println("color05 = \"#880000\";"); // brown - ps.println("color06 = \"#003300\";"); // dark green - ps.println("color07 = \"#000066\";"); // dark blue - - ps.println("set style data histogram"); - ps.println("set style histogram cluster gap 1"); - ps.println("set style fill solid 1.00"); - ps.println("set boxwidth 0.9"); - ps.println("set xtics format \"\""); - ps.println("set xtics rotate by -90"); - ps.println("set grid ytics"); - - ps.println("set ylabel \"Execution time in micro sec.\""); - ps.println("set yrange [0:*]"); - ps.println("set bmargin 24"); - - ps.println("set title \"Single execution time in micro sec.\""); - ps.print("plot"); - - int i = 0; - for(String s : allColumnTitles) { - ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s + - "\" linecolor rgb color" + String.format("%02d", i)); - if (i != allColumnTitles.size()-1) ps.print(", "); - i++; - } - ps.println(); - - ps.close(); - } -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/bench.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/bench.h deleted file mode 100644 index d4c87edf28b..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/bench.h +++ /dev/null @@ -1,58 +0,0 @@ -#define NITER1 100000 -#define NITER2 10000 -#define NITER (NITER1 * NITER2) - -#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \ - printf("%s\n", #funcName); \ - uint64_t t = 
Sleef_currentTimeMicros(); \ - for(int j=0;j -#include -#include -#include -#include -#include -#include - -#include "bench.h" - -int veclen = 16; -double *abufdp, *bbufdp; -float *abufsp, *bbufsp; -FILE *fp; - -#if defined(__i386__) || defined(__x86_64__) -void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { - uint32_t a, b, c, d; - __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx)); - out[0] = a; out[1] = b; out[2] = c; out[3] = d; -} - -int cpuSupportsAVX() { - int32_t reg[4]; - x86CpuID(reg, 1, 0); - return (reg[2] & (1 << 28)) != 0; -} - -int cpuSupportsAVX512F() { - int32_t reg[4]; - x86CpuID(reg, 7, 0); - return (reg[1] & (1 << 16)) != 0; -} -#endif - -void fillDP(double *buf, double min, double max) { - for(int i=0;i= 3) fnBase = argv[2]; - - srandom(time(NULL)); - -#if defined(__i386__) || defined(__x86_64__) - int do128bit = 1; - int do256bit = cpuSupportsAVX(); - int do512bit = cpuSupportsAVX512F(); -#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__) - int do128bit = 1; -#else -#error Unsupported architecture -#endif - - posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double)); - posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double)); - - abufsp = (float *)abufdp; - bbufsp = (float *)bbufdp; - - sprintf(fn, "%sdptrig.out", fnBase); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do128bit) benchSleef128_DPTrig(); -#if defined(__i386__) || defined(__x86_64__) - if (do256bit) benchSleef256_DPTrig(); - if (do512bit) benchSleef512_DPTrig(); -#endif - - fclose(fp); - - sprintf(fn, "%sdpnontrig.out", fnBase); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do128bit) benchSleef128_DPNontrig(); -#if defined(__i386__) || defined(__x86_64__) - if (do256bit) benchSleef256_DPNontrig(); - if (do512bit) benchSleef512_DPNontrig(); -#endif - - fclose(fp); - - sprintf(fn, "%ssptrig.out", fnBase); - fp = fopen(fn, 
"w"); - fprintf(fp, "%s\n", columnTitle); - - if (do128bit) benchSleef128_SPTrig(); -#if defined(__i386__) || defined(__x86_64__) - if (do256bit) benchSleef256_SPTrig(); - if (do512bit) benchSleef512_SPTrig(); -#endif - - fclose(fp); - - sprintf(fn, "%sspnontrig.out", fnBase); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do128bit) benchSleef128_SPNontrig(); -#if defined(__i386__) || defined(__x86_64__) - if (do256bit) benchSleef256_SPNontrig(); - if (do512bit) benchSleef512_SPNontrig(); -#endif - - fclose(fp); - - exit(0); -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef128.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef128.c deleted file mode 100644 index e5632d18fc1..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef128.c +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include - -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __SSE2__ -#if defined(_MSC_VER) -#include -#else -#include -#endif -typedef __m128d vdouble; -typedef __m128 vfloat; -#define ENABLED -#elif defined(__ARM_NEON) -#include -typedef float64x2_t vdouble; -typedef float32x4_t vfloat; -#define ENABLED -#elif defined(__VSX__) -#include -typedef __vector double vdouble; -typedef __vector float vfloat; -#define ENABLED -#elif defined(__VX__) -#include -typedef __vector double vdouble; -typedef __vector float vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void benchSleef128_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, 
vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble); -} - -void benchSleef128_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble); - - fillDP(abufdp, -700, 700); - - callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - 
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble); - - fillDP(abufdp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble); - callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble); -} - -void benchSleef128_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf4_u10, 
"sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat); -} - -void benchSleef128_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat); - - fillSP(abufsp, -100, 100); - - callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat); - - fillSP(abufsp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat); - 
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat); - - callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat); - callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void benchSleef128_DPTrig() {} -void benchSleef128_DPNontrig() {} -void benchSleef128_SPTrig() {} -void benchSleef128_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef256.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef256.c deleted file mode 100644 index 12cdf35f31b..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef256.c +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include - -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __AVX__ -#if defined(_MSC_VER) -#include -#else -#include -#endif -typedef __m256d vdouble; -typedef __m256 vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void benchSleef256_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, 
vdouble); - - callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble); -} - -void benchSleef256_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble); - 
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble); - - fillDP(abufdp, -700, 700); - - callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble); - - fillDP(abufdp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble); - callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble); -} - -void benchSleef256_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, 
SP, 256", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat); -} - -void benchSleef256_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat); - - fillSP(abufsp, -100, 100); - - callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - 
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat); - - fillSP(abufsp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat); - callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat); - - callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat); - callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void zeroupper256() {} -void benchSleef256_DPTrig() {} -void benchSleef256_DPNontrig() {} -void benchSleef256_SPTrig() {} -void benchSleef256_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef512.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef512.c deleted file mode 100644 index 296c1236931..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsleef512.c +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include - -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __AVX512F__ -#if defined(_MSC_VER) -#include -#else -#include -#endif -typedef __m512d vdouble; -typedef __m512 vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void benchSleef512_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, 
vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble); - - callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble); -} - -void benchSleef512_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble); - - fillDP(abufdp, -700, 700); - - callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble); - - fillDP(abufdp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 
4.0, abufdp, vdouble); - callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble); - callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble); - callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble); -} - -void benchSleef512_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, 
SP, 512", 0, 1e+20, 4.0, abufsp, vfloat); -} - -void benchSleef512_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat); - //callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat); - - fillSP(abufsp, -100, 100); - - callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat); - - fillSP(abufsp, -1.0, 1.0); - - callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat); - - callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat); - callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat); - callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat); - - callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat); - callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void benchSleef512_DPTrig() {} -void 
benchSleef512_DPNontrig() {} -void benchSleef512_SPTrig() {} -void benchSleef512_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml.c deleted file mode 100644 index feb9cb6263b..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml.c +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bench.h" - -int veclen = 16; -int enableLogExp; -double *abufdp, *bbufdp; -float *abufsp, *bbufsp; -FILE *fp; - -#if defined(__i386__) || defined(__x86_64__) -void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { - uint32_t a, b, c, d; - __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx)); - out[0] = a; out[1] = b; out[2] = c; out[3] = d; -} - -int cpuSupportsAVX() { - int32_t reg[4]; - x86CpuID(reg, 1, 0); - return (reg[2] & (1 << 28)) != 0; -} - -int cpuSupportsAVX512F() { - int32_t reg[4]; - x86CpuID(reg, 7, 0); - return (reg[1] & (1 << 16)) != 0; -} -#endif - -uint64_t Sleef_currentTimeMicros() { - struct timespec tp; - clock_gettime(CLOCK_MONOTONIC, &tp); - return (uint64_t)tp.tv_sec * 1000000LL + ((uint64_t)tp.tv_nsec/1000); -} - -void fillDP(double *buf, double min, double max) { - for(int i=0;i= 3) fnBase = argv[2]; - - srandom(time(NULL)); - -#if defined(__i386__) || defined(__x86_64__) - int do128bit = 1; - int do256bit = cpuSupportsAVX(); - int do512bit = cpuSupportsAVX512F(); -#elif defined(__ARM_NEON) - int do128bit = 1; - int do256bit = 0; - int do512bit = 0; -#else -#error Unsupported architecture -#endif - - 
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double)); - posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double)); - - abufsp = (float *)abufdp; - bbufsp = (float *)bbufdp; - - enableLogExp = SVMLULP < 2; - - sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do256bit) zeroupper256(); - if (do128bit) benchSVML128_DPTrig(); - if (do256bit) benchSVML256_DPTrig(); - if (do512bit) benchSVML512_DPTrig(); - - fclose(fp); - - sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do256bit) zeroupper256(); - if (do128bit) benchSVML128_DPNontrig(); - if (do256bit) benchSVML256_DPNontrig(); - if (do512bit) benchSVML512_DPNontrig(); - - fclose(fp); - - sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do256bit) zeroupper256(); - if (do128bit) benchSVML128_SPTrig(); - if (do256bit) benchSVML256_SPTrig(); - if (do512bit) benchSVML512_SPTrig(); - - fclose(fp); - - sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP); - fp = fopen(fn, "w"); - fprintf(fp, "%s\n", columnTitle); - - if (do256bit) zeroupper256(); - if (do128bit) benchSVML128_SPNontrig(); - if (do256bit) benchSVML256_SPNontrig(); - if (do512bit) benchSVML512_SPNontrig(); - - fclose(fp); - - exit(0); -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml128.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml128.c deleted file mode 100644 index 5c8c5d78546..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml128.c +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include -#include - -uint64_t Sleef_currentTimeMicros(); -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern int enableLogExp; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __SSE2__ -typedef __m128d vdouble; -typedef __m128 vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void benchSVML128_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble); - callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble); - callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble); - callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble); -} - -void benchSVML128_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble); - - if (enableLogExp) { - callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble); - callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble); - - 
fillDP(abufdp, -700, 700); - - callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble); - } - - fillDP(abufdp, -1.0, 1.0); - - callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble); - callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble); - callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble); -} - -void benchSVML128_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat); - callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat); - callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat); -} - -void benchSVML128_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat); - - if (enableLogExp) { - callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat); - //callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat); - - fillSP(abufsp, -100, 100); - - callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 
128", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat); - } - - fillSP(abufsp, -1.0, 1.0); - - callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat); - callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat); - callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void benchSVML128_DPTrig() {} -void benchSVML128_DPNontrig() {} -void benchSVML128_SPTrig() {} -void benchSVML128_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml256.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml256.c deleted file mode 100644 index 2f1e2ee5e21..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml256.c +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include -#include - -uint64_t Sleef_currentTimeMicros(); -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern int enableLogExp; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __AVX__ -typedef __m256d vdouble; -typedef __m256 vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void zeroupper256() { _mm256_zeroupper(); } - -void benchSVML256_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble); - callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble); - callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble); - callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble); -} - -void benchSVML256_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble); - - if (enableLogExp) { - callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, 
vdouble); - callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble); - - fillDP(abufdp, -700, 700); - - callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble); - } - - fillDP(abufdp, -1.0, 1.0); - - callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble); - callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble); - callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble); -} - -void benchSVML256_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat); - callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat); - callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat); -} - -void benchSVML256_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat); - - if (enableLogExp) { - callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat); - //callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat); - 
- fillSP(abufsp, -100, 100); - - callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat); - } - - fillSP(abufsp, -1.0, 1.0); - - callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat); - callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat); - callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void zeroupper256() {} -void benchSVML256_DPTrig() {} -void benchSVML256_DPNontrig() {} -void benchSVML256_SPTrig() {} -void benchSVML256_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml512.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml512.c deleted file mode 100644 index 537ee28658a..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/benchsvml512.c +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#include -#include -#include -#include -#include -#include -#include -#include - -uint64_t Sleef_currentTimeMicros(); -void fillDP(double *buf, double min, double max); -void fillSP(float *buf, double min, double max); - -extern char x86BrandString[256], versionString[1024]; -extern int veclen; -extern int enableLogExp; -extern double *abufdp, *bbufdp; -extern float *abufsp, *bbufsp; -extern FILE *fp; - -#include "bench.h" - -#ifdef __AVX512F__ -typedef __m512d vdouble; -typedef __m512 vfloat; -#define ENABLED -#endif - -#ifdef ENABLED -void benchSVML512_DPTrig() { - fillDP(abufdp, 0, 6.28); - - callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble); - callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble); - callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+6); - - callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble); - callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble); - callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble); - - fillDP(abufdp, 0, 1e+100); - - callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble); - callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble); - callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble); -} - -void benchSVML512_DPNontrig() { - fillDP(abufdp, 0, 1e+300); - - callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble); - - if (enableLogExp) { - callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble); - callFuncSVML1_1(_mm512_log1p_pd, 
"log1p, DP, 512", 0, 1e+300, abufdp, vdouble); - - fillDP(abufdp, -700, 700); - - callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble); - callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble); - - fillDP(abufdp, -30, 30); - fillDP(bbufdp, -30, 30); - - callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble); - } - - fillDP(abufdp, -1.0, 1.0); - - callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble); - callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble); - - fillDP(abufdp, -10, 10); - fillDP(bbufdp, -10, 10); - - callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble); - callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble); -} - -void benchSVML512_SPTrig() { - fillSP(abufsp, 0, 6.28); - - callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat); - callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat); - callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat); - - fillSP(abufsp, 0, 1e+20); - - callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat); - callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat); - callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat); -} - -void benchSVML512_SPNontrig() { - fillSP(abufsp, 0, 1e+38); - - callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat); - - if (enableLogExp) { - callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat); - //callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat); - - fillSP(abufsp, -100, 100); - - 
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat); - callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat); - - fillSP(abufsp, -30, 30); - fillSP(bbufsp, -30, 30); - - callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat); - } - - fillSP(abufsp, -1.0, 1.0); - - callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat); - callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat); - - fillSP(abufsp, -10, 10); - fillSP(bbufsp, -10, 10); - - callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat); - callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat); -} -#else // #ifdef ENABLED -void benchSVML512_DPTrig() {} -void benchSVML512_DPNontrig() {} -void benchSVML512_SPTrig() {} -void benchSVML512_SPNontrig() {} -#endif // #ifdef ENABLED diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/measure.sh b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/measure.sh deleted file mode 100644 index 74ccb8b5327..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-benchmarks/measure.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -echo -read -p "Enter label of measurement(e.g. My desktop PC) : " label - -if [ -f counter.txt ] -then - counter=`cat counter.txt` -else - counter=0 -fi - -echo Measurement in progress. This may take several minutes. 
-for i in $*; do - $i "$label" $counter -done -counter=$((counter+1)) -echo $counter > counter.txt diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/CMakeLists.txt index b391c1525a4..f054d4d0519 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/CMakeLists.txt @@ -65,20 +65,33 @@ include_directories(${sleef_BINARY_DIR}/include) # sleef.h include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers -if(NOT LIB_MPFR) +if (SLEEF_ENFORCE_TESTER AND NOT SLEEF_ENABLE_TESTER) + message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified but SLEEF_ENABLE_TESTER is false") +endif(SLEEF_ENFORCE_TESTER AND NOT SLEEF_ENABLE_TESTER) + +if(SLEEF_ENABLE_TESTER AND NOT LIB_MPFR) find_program(TESTER_COMMAND tester) -endif(NOT LIB_MPFR) +endif(SLEEF_ENABLE_TESTER AND NOT LIB_MPFR) if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND) message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available") endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND) +if (SLEEF_ENFORCE_TESTER4 AND NOT SLEEF_ENABLE_TESTER4) + message(FATAL_ERROR "SLEEF_ENFORCE_TESTER4 is specified but SLEEF_ENABLE_TESTER4 is false") +endif() + +if (SLEEF_ENFORCE_TESTER4 AND NOT TLFLOAT_LIBRARIES) + message(FATAL_ERROR "SLEEF_ENFORCE_TESTER4 is specified but TLFloat is not available") +endif() + find_library(LIBRT rt) if (NOT LIBRT) set(LIBRT "") endif() set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}") +set(CMAKE_CXX_FLAGS "${ORG_CMAKE_CXX_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}") set(COMMON_TARGET_PROPERTIES C_STANDARD 99 # -std=gnu99 @@ -90,6 +103,17 @@ endif() # +function(add_test_with_emu C CMD) + if 
(SDE_COMMAND) + add_test(NAME ${CMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMD}) + elseif(EMULATOR) + add_test(NAME ${CMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMD}) + else() + add_test(NAME ${CMD} COMMAND ${CMD}) + endif() + set_tests_properties(${CMD} PROPERTIES COST ${C}) +endfunction() + function(add_test_iut IUT C) if (LIB_MPFR) set(TESTER ${TARGET_TESTER}) @@ -126,14 +150,19 @@ function(add_test_iut IUT C) endif() endfunction() -# Compile executable 'iut' -add_executable(${TARGET_IUT} iut.c testerutil.c) -target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS}) -target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF} - ${LIBM} ${LIBRT}) -set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES}) -add_test_iut(${TARGET_IUT} 1.0) -set(IUT_LIST ${TARGET_IUT}) +if (SLEEF_ENABLE_TESTER) + # Compile executable 'iut' + add_executable(${TARGET_IUT} iut.c) + target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} + ${LIBM} ${LIBRT}) + set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + add_test_iut(${TARGET_IUT} 1.0) + set(IUT_LIST ${TARGET_IUT}) + + # Tests depends on the library + add_dependencies(${TARGET_IUT} ${TARGET_HEADERS}) +endif() # Compile executable 'iutcuda' if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER) @@ -145,97 +174,179 @@ if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER) list(APPEND IUT_LIST iutcuda) endif() -set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c) +set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) # Add vector extension `iut`s macro(test_extension SIMD) if(COMPILER_SUPPORTS_${SIMD}) string(TOLOWER ${SIMD} LCSIMD) - string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD}) - add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC}) - 
target_compile_options(${TARGET_IUT${SIMD}} - PRIVATE ${FLAGS_ENABLE_${SIMD}}) - target_compile_definitions(${TARGET_IUT${SIMD}} - PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF} - ${LIBM} ${LIBRT}) - if (FORCE_AAVPCS) - target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1) - endif(FORCE_AAVPCS) + if (SLEEF_ENABLE_TESTER) + string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD}) - add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS}) - add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}) - set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - if (DEFINED COSTOVERRIDE_${SIMD}) - add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}}) - else() - add_test_iut(${TARGET_IUT${SIMD}} 1.0) - endif() - list(APPEND IUT_LIST ${TARGET_IUT${SIMD}}) + add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC}) + target_compile_options(${TARGET_IUT${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_IUT${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} + ${LIBM} ${LIBRT}) + if (FORCE_AAVPCS) + target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1) + endif(FORCE_AAVPCS) - # The iut programs whose names begin with "iuty" are the iut for the - # deterministic version of functions. By checking the result of - # testing with iutysse2, for example, it can be checked that the - # corresponding deterministic functions passes the accuracy and - # nonnumber tests. 
- - string(CONCAT IUTYNAME "iuty" ${LCSIMD}) - add_executable(${IUTYNAME} ${IUT_SRC}) - target_compile_options(${IUTYNAME} - PRIVATE ${FLAGS_ENABLE_${SIMD}}) - target_compile_definitions(${IUTYNAME} - PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1) - target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF} - ${LIBM} ${LIBRT}) - add_dependencies(${IUTYNAME} ${TARGET_HEADERS}) - add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF}) - set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - if (DEFINED COSTOVERRIDE_${SIMD}) - add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}}) - else() - add_test_iut(${IUTYNAME} 1.0) - endif() - list(APPEND IUT_LIST ${IUTYNAME}) - - # The iut programs whose names begin with "iuti" are the iut for the - # inline version of functions. - - if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND) - string(CONCAT IUTINAME "iuti" ${LCSIMD}) - add_executable(${IUTINAME} ${IUT_SRC}) - target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}}) - target_compile_definitions(${IUTINAME} - PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} - USE_INLINE_HEADER="sleefinline_${LCSIMD}.h" - MACRO_ONLY_HEADER="macroonly${SIMD}.h" - SIMD_SUFFIX=_${LCSIMD}_sleef - ) - target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include) - target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT}) - add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS}) - set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99) + add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS}) + add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}) + set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) if (DEFINED COSTOVERRIDE_${SIMD}) - add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}}) + add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}}) else() - add_test_iut(${IUTINAME} 1.0) + add_test_iut(${TARGET_IUT${SIMD}} 1.0) endif() - list(APPEND IUT_LIST ${IUTINAME}) - endif(SLEEF_BUILD_INLINE_HEADERS AND 
SED_COMMAND) + list(APPEND IUT_LIST ${TARGET_IUT${SIMD}}) + + # The iut programs whose names begin with "iuty" are the iut for the + # deterministic version of functions. By checking the result of + # testing with iutysse2, for example, it can be checked that the + # corresponding deterministic functions passes the accuracy and + # nonnumber tests. + + string(CONCAT IUTYNAME "iuty" ${LCSIMD}) + add_executable(${IUTYNAME} ${IUT_SRC}) + target_compile_options(${IUTYNAME} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${IUTYNAME} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1) + target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} + ${LIBM} ${LIBRT}) + add_dependencies(${IUTYNAME} ${TARGET_HEADERS}) + add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF}) + set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}}) + else() + add_test_iut(${IUTYNAME} 1.0) + endif() + list(APPEND IUT_LIST ${IUTYNAME}) + + # The iut programs whose names begin with "iuti" are the iut for the + # inline version of functions. 
+ + if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND) + string(CONCAT IUTINAME "iuti" ${LCSIMD}) + add_executable(${IUTINAME} ${IUT_SRC}) + target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${IUTINAME} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} + USE_INLINE_HEADER="sleefinline_${LCSIMD}.h" + MACRO_ONLY_HEADER="macroonly${SIMD}.h" + SIMD_SUFFIX=_${LCSIMD}_sleef + ) + target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include) + target_link_libraries(${IUTINAME} ${TARGET_TESTERUTIL_OBJ} ${LIBM} ${LIBRT}) + add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS}) + set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}}) + else() + add_test_iut(${IUTINAME} 1.0) + endif() + list(APPEND IUT_LIST ${IUTINAME}) + endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND) + endif(SLEEF_ENABLE_TESTER) + + # + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + set(TESTER4_SRC tester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + + string(CONCAT TARGET_TESTER4_${SIMD} "tester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4_${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + if (FORCE_AAVPCS) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_AAVPCS=1) + endif(FORCE_AAVPCS) + + add_dependencies(${TARGET_TESTER4_${SIMD}} ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + if (DEFINED COSTOVERRIDE_${SIMD}) + 
add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + + # + + string(CONCAT TARGET_TESTER4Y_${SIMD} "tester4y" ${LCSIMD}) + + add_executable(${TARGET_TESTER4Y_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4Y_${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_TESTER4Y_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1) + target_link_libraries(${TARGET_TESTER4Y_${SIMD}} ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + add_dependencies(${TARGET_TESTER4Y_${SIMD}} ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4Y_${SIMD}} ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4Y_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4Y_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4Y_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4Y_${SIMD}}) + endif() + + # + + if (SLEEF_BUILD_INLINE_HEADERS) + string(CONCAT TARGET_TESTER4I_${SIMD} "tester4i" ${LCSIMD}) + + add_executable(${TARGET_TESTER4I_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4I_${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_link_libraries(${TARGET_TESTER4I_${SIMD}} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + if(CMAKE_C_COMPILER_ID MATCHES "GNU") + target_compile_options(${TARGET_TESTER4I_${SIMD}} PRIVATE "-Wno-unknown-pragmas") + endif() + target_compile_definitions(${TARGET_TESTER4I_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} + USE_INLINE_HEADER="sleefinline_${LCSIMD}.h" + MACRO_ONLY_HEADER="macroonly${SIMD}.h" + SIMD_SUFFIX=_${LCSIMD}_sleef + ) + target_include_directories(${TARGET_TESTER4I_${SIMD}} PRIVATE ${PROJECT_BINARY_DIR}/include) + add_dependencies(${TARGET_TESTER4I_${SIMD}} ${TARGET_INLINE_HEADERS}) + add_dependencies(${TARGET_TESTER4I_${SIMD}} 
ext_tlfloat) + set_target_properties(${TARGET_TESTER4I_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4I_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4I_${SIMD}}) + endif() + endif(SLEEF_BUILD_INLINE_HEADERS) + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + + # if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW) # Build tester2 SIMD string(TOLOWER ${SIMD} SCSIMD) foreach(P dp sp) set(T "tester2${SCSIMD}${P}") - add_executable(${T} tester2simd${P}.c testerutil.c) + add_executable(${T} tester2simd${P}.c) if(FORCE_AAVPCS) target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1) endif(FORCE_AAVPCS) target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}}) target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS}) set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP}) + target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM}) add_dependencies(${T} ${TARGET_HEADERS}) add_dependencies(${T} ${TARGET_LIBSLEEF}) if (MPFR_INCLUDE_DIR) @@ -246,11 +357,11 @@ macro(test_extension SIMD) # testing program for the deterministic version of functions. 
set(T "tester2y${SCSIMD}${P}") - add_executable(${T} tester2simd${P}.c testerutil.c) + add_executable(${T} tester2simd${P}.c) target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}}) target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1) set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES}) - target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP}) + target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM}) add_dependencies(${T} ${TARGET_HEADERS}) add_dependencies(${T} ${TARGET_LIBSLEEF}) if (MPFR_INCLUDE_DIR) @@ -259,13 +370,16 @@ macro(test_extension SIMD) endforeach() endif() - if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND) + if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4) # Build tester3 string(TOLOWER ${SIMD} SCSIMD) set(T "tester3${SCSIMD}") - add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c) + add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}}) target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}}) + if (NOT SLEEF_OPENSSL_FOUND) + target_compile_definitions(${T} PRIVATE SLEEF_USE_INTERNAL_SHA256=1) + endif() set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES}) # Enable Vector PCS for Advanced SIMD (if supported) @@ -273,8 +387,18 @@ macro(test_extension SIMD) host_target_AAVPCS_definitions(${T}) endif() - target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES}) - target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR}) + target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIBM}) + if(LIB_MPFR) + target_link_libraries(${T} ${LIB_MPFR} ${LIBGMP}) + endif() + if (SLEEF_OPENSSL_FOUND) + 
target_link_libraries(${T} ${SLEEF_OPENSSL_LIBRARIES}) + target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR}) + else() + target_link_libraries(${T} ${TARGET_PSHA_OBJ}) + target_include_directories(${T} PRIVATE ${sleef_SOURCE_DIR}/src/common) + endif() + add_dependencies(${T} ${TARGET_HEADERS}) add_dependencies(${T} ${TARGET_LIBSLEEF}) @@ -371,53 +495,99 @@ endif(ENABLE_GNUABI) # if (SLEEF_ARCH_X86) - # iutdsp128 - add_executable(iutdsp128 ${IUT_SRC}) - target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS}) - target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2}) - target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) - add_test_iut(iutdsp128 1.0) - list(APPEND IUT_LIST iutdsp128) + if (SLEEF_ENABLE_TESTER) + # iutdsp128 + add_executable(iutdsp128 ${IUT_SRC}) + target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2}) + target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM}) + add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) + add_test_iut(iutdsp128 1.0) + list(APPEND IUT_LIST iutdsp128) - # iutdsp256 - add_executable(iutdsp256 ${IUT_SRC}) - target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS}) - target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX}) - target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) - add_test_iut(iutdsp256 1.0) - list(APPEND IUT_LIST iutdsp256) + # iutdsp256 + add_executable(iutdsp256 ${IUT_SRC}) + target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX}) + target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBRT} 
${TARGET_TESTERUTIL_OBJ} ${LIBM}) + add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) + add_test_iut(iutdsp256 1.0) + list(APPEND IUT_LIST iutdsp256) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # tester4dsp128 + add_executable(tester4dsp128 ${TESTER4_SRC}) + target_compile_definitions(tester4dsp128 PRIVATE + ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_SSE2}) + target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat) + add_test_with_emu(1.0 tester4dsp128) + + # tester4dsp256 + add_executable(tester4dsp256 ${TESTER4_SRC}) + target_compile_definitions(tester4dsp256 PRIVATE + ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(tester4dsp256 PRIVATE ${FLAGS_ENABLE_AVX}) + target_link_libraries(tester4dsp256 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + add_dependencies(tester4dsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat) + add_test_with_emu(1.0 tester4dsp256) + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif(SLEEF_ARCH_X86) if (SLEEF_ARCH_PPC64) - add_executable(iutdsp128 ${IUT_SRC}) - target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS}) - target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX}) - target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) - add_test_iut(iutdsp128 1.0) - list(APPEND IUT_LIST iutdsp128) + if (SLEEF_ENABLE_TESTER) + add_executable(iutdsp128 ${IUT_SRC}) + target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX}) + target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM}) + 
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) + add_test_iut(iutdsp128 1.0) + list(APPEND IUT_LIST iutdsp128) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + add_executable(tester4dsp128 ${TESTER4_SRC}) + target_compile_definitions(tester4dsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_VSX}) + target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ}) + add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat) + add_test_with_emu(1.0 tester4dsp128) + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif(SLEEF_ARCH_PPC64) if (SLEEF_ARCH_S390X) - add_executable(iutdsp128 ${IUT_SRC}) - target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS}) - target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE}) - target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) - add_test_iut(iutdsp128 1.0) - list(APPEND IUT_LIST iutdsp128) + if (SLEEF_ENABLE_TESTER) + add_executable(iutdsp128 ${IUT_SRC}) + target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE}) + target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM}) + add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF}) + add_test_iut(iutdsp128 1.0) + list(APPEND IUT_LIST iutdsp128) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + add_executable(tester4dsp128 ${TESTER4_SRC}) + target_compile_definitions(tester4dsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS}) + target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_VXE}) + target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} 
${TARGET_TESTERUTIL_OBJ}) + add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat) + add_test_with_emu(1.0 tester4dsp128) + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif(SLEEF_ARCH_S390X) if(SLEEF_BUILD_SCALAR_LIB) - # Compile executable 'iutscalar' - add_executable(iutscalar iut.c testerutil.c) - target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES}) - add_test_iut(iutscalar 1.0) - list(APPEND IUT_LIST iutscalar) + if (SLEEF_ENABLE_TESTER) + # Compile executable 'iutscalar' + add_executable(iutscalar iut.c) + target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM}) + set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES}) + add_test_iut(iutscalar 1.0) + list(APPEND IUT_LIST iutscalar) + endif(SLEEF_ENABLE_TESTER) endif() if(LIB_MPFR AND NOT MINGW) @@ -433,7 +603,7 @@ if(LIB_MPFR AND NOT MINGW) endif() foreach(P ${PRECISIONS}) set(T "tester2${P}") - add_executable(${T} tester2${P}.c testerutil.c) + add_executable(${T} tester2${P}.c) target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS}) set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES}) if (FORCE_AAVPCS) @@ -442,15 +612,15 @@ if(LIB_MPFR AND NOT MINGW) if (MPFR_INCLUDE_DIR) target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR}) endif() - target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP}) + target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM}) add_dependencies(${T} ${TARGET_HEADERS}) add_dependencies(${T} ${TARGET_LIBSLEEF}) endforeach() # Compile executable 'tester' - 
add_host_executable(${TARGET_TESTER} tester.c testerutil.c) + add_host_executable(${TARGET_TESTER} tester.c) if (NOT CMAKE_CROSSCOMPILING) - target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP}) + target_link_libraries(${TARGET_TESTER} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM}) target_compile_definitions(${TARGET_TESTER} PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS}) target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result) @@ -512,6 +682,3 @@ if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_ add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2") add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2") endif() - -# Tests depends on the library -add_dependencies(${TARGET_IUT} ${TARGET_HEADERS}) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/autovec.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/autovec.c index 41a0d0f2a77..c34cbe4f8f2 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/autovec.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/autovec.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/gnuabi_compatibility.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/gnuabi_compatibility.c index f71befba906..599d3966868 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/gnuabi_compatibility.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/gnuabi_compatibility.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -118,148 +118,148 @@ typedef svint32_t vint2; #define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name -#define __DECLARE_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) -#define __CALL_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0) -#define __DECLARE_vi_vd(name, t, vl, p) \ +#define __DECLARE_vi_vd(name, t, vl, p) \ extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) -#define __CALL_vi_vd(name, t, vl, p) \ +#define __CALL_vi_vd(name, t, vl, p) \ do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0) -#define __DECLARE_vd_vd_vi(name, t, vl, p) \ +#define __DECLARE_vd_vd_vi(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint) -#define __CALL_vd_vd_vi(name, t, vl, p) \ +#define __CALL_vd_vd_vi(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0) -#define __DECLARE_vd_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble) -#define 
__CALL_vd_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0) -#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble) -#define __CALL_vd_vd_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0) -#define __DECLARE_vd_vd_pvd(name, t, vl, p) \ +#define __DECLARE_vd_vd_pvd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *) #define __CALL_vd_vd_pvd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0) -#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ +#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *) -#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ +#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0) -#define __DECLARE_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) -#define __CALL_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0) -#define __DECLARE_vf_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat) -#define __CALL_vf_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0) -#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat) -#define __CALL_vf_vf_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf_vf_vf(name, t, vl, p) \ do { vf0 = 
__MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0) -#define __DECLARE_vf_vf_pvf(name, t, vl, p) \ +#define __DECLARE_vf_vf_pvf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *) #define __CALL_vf_vf_pvf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0) -#define __DECLARE_vi_vf(name, t, vl, p) \ +#define __DECLARE_vi_vf(name, t, vl, p) \ extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) -#define __CALL_vi_vf(name, t, vl, p) \ +#define __CALL_vi_vf(name, t, vl, p) \ do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0) -#define __DECLARE_vf_vf_vi(name, t, vl, p) \ +#define __DECLARE_vf_vf_vi(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2) -#define __CALL_vf_vf_vi(name, t, vl, p) \ +#define __CALL_vf_vf_vi(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0) -#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ +#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*) -#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ +#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0) #else /******************** MASKED_GNUABI *****************************/ #define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name -#define __DECLARE_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) -#define __CALL_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0) -#define __DECLARE_vi_vd(name, t, vl, p) \ +#define __DECLARE_vi_vd(name, t, vl, p) \ extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) -#define __CALL_vi_vd(name, t, vl, p) \ +#define __CALL_vi_vd(name, t, vl, p) \ do { vi0 = __MAKE_FN_NAME(name, t, vl, 
p)(vd1, mask); } while(0) -#define __DECLARE_vd_vd_vi(name, t, vl, p) \ +#define __DECLARE_vd_vd_vi(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask) -#define __CALL_vd_vd_vi(name, t, vl, p) \ +#define __CALL_vd_vd_vi(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0) -#define __DECLARE_vd_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask) -#define __CALL_vd_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0) -#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ +#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask) -#define __CALL_vd_vd_vd_vd(name, t, vl, p) \ +#define __CALL_vd_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0) -#define __DECLARE_vd_vd_pvd(name, t, vl, p) \ +#define __DECLARE_vd_vd_pvd(name, t, vl, p) \ extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask) #define __CALL_vd_vd_pvd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0) -#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ +#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask) -#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ +#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0) -#define __DECLARE_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) -#define __CALL_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0) 
-#define __DECLARE_vf_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask) -#define __CALL_vf_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0) -#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ +#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask) -#define __CALL_vf_vf_vf_vf(name, t, vl, p) \ +#define __CALL_vf_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0) -#define __DECLARE_vf_vf_pvf(name, t, vl, p) \ +#define __DECLARE_vf_vf_pvf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask) #define __CALL_vf_vf_pvf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0) -#define __DECLARE_vi_vf(name, t, vl, p) \ +#define __DECLARE_vi_vf(name, t, vl, p) \ extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) -#define __CALL_vi_vf(name, t, vl, p) \ +#define __CALL_vi_vf(name, t, vl, p) \ do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0) -#define __DECLARE_vf_vf_vi(name, t, vl, p) \ +#define __DECLARE_vf_vf_vi(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask) -#define __CALL_vf_vf_vi(name, t, vl, p) \ +#define __CALL_vf_vf_vi(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0) -#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ +#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask) -#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ +#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0) #endif /* MASKED_GNUABI */ diff --git 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_cinz.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_cinz.txt index 2a8c1a85c9f..26b278370b5 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_cinz.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_cinz.txt @@ -1,129 +1,129 @@ -sin u35 bc50dfbcbd8ef534541d1babe90860c7 -sin u10 dbc2cf81f292ef50fa0119e222c6c9f9 -cos u35 506e34a809b80ad3603ed46ba2a574b0 -cos u10 a0f69df5937152b8f8f0e671f3676289 -tan u35 970b5cd7f0e05defa22ebb155ab61a40 -tan u10 5fd08e0552e3ab853439bf5fd2bd344d -sincos u10 7c164edcaa45988f6165b653fc76c495 -sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4 -sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d -sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c -log u10 4855b27222d900bea47a27cadba71727 -log u35 c95484de57c167da3d8d6d1baadf9ffa -log2 u10 2662df9af919680ca62e1752fb1b7539 -log2 u35 1cd6d7f194a5e8364191497adc5c5cec -log10 u10 36645e8031d873d66fd0ec2c5959f273 -log1p u10 1383924fb56cf2e7eda27de21320c591 -exp u10 13692a48edf2cf7a3e047b16ddfb7b81 -exp2 u10 436146f8d6dcaa4a754837108a9aa3e1 -exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373 -exp10 u10 9d704b310f683872a6446cfc97726a4d -exp10 u35 bc07745ebc22a7ee97679154c24b23cc -expm1 u10 cd3f0b8e86943d52c278394b60e2d22e -pow u10 a0ea63b27d33262346a35c9439741075 -cbrt u10 5d8bf28ac74624594fd1be9217817690 -cbrt u10 3c896e03746bcf1b3f70182dfec3d93b -cbrt u35 73daa306764e208aab1627ac110b10d7 -cbrt u35 c29b7bf200215425b4ba948c8cc94c42 -hypot u05 cc2f18e409e19a02cadf7b91fd869120 -hypot u35 5194e0a554174a6145511ce3df9c1f46 -asin u10 86c061caec3fa2e1bc71bda4dad29f4c -asin u35 31303b88bdc00206265002d6cc5e89e4 -acos u10 0a1a403590f2ac8364f132b334920945 -acos u35 493f960c1cce57931d95a5a22a0587a3 -atan u10 c97624a24ec034cc0c8985acb61d13cd -atan u10 0be0f550406923016cfeb5ef62c25b15 -atan u35 9d6d83e066b5a4851d44771418c9948c -atan u35 
f32c1aa4caa08c6945afd1125ba8b113 -atan2 u10 6b1d9d25fcd96053acc19d1633fab36a -atan2 u35 afb07894347062a96dab705b34eb1763 -sinh u10 61d459b1f368087f6f23ebf8e9f0ea01 -cosh u10 f77eb95f79e274c12b4e92dc0389259b -tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3 -asinh u10 01136e54e2a434839530dda54f33cfdb -acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e -atanh u10 601a77ba8c1d5175f2808b48a41260c1 -lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da -tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7 -erf u10 f4ae148b59bb7501d8f5746300850376 -erfc u15 5e116a4316dafa742769f71e18f6f9fe -fabs bef2f2ac8a4789357e580b4da4f9b9fe -copysign 3219022f267464e3704f90558e8df3bc -fmax 4e4f5220ccfef191864c316df0d18fc0 -fmin c0f8effb6c611e2b3b91b820ad943f62 -fdim e876d103931f18ceede5bfd7e3df7ab0 -fmod 618aa751e13012afdb41ec80dd35e6ba -remainder 8d692dbb44bbc9be5af0c0657d3008b8 -modf f03ce73cd4f9ea7f69c017f6e53355d5 -nextafter 9eba4e30d12d74dc4e8003fcff0f1582 -trunc 1bc7e909eba121dcef7f0e4046937ae5 -floor 2cff66b499dc8a30cec9467de659b774 -ceil b080e632dcb8f8134d8715752be12917 -round 8907e21687ca9c2a539297536e754950 -rint e49f837096bc661fe1c742801dd99a30 -sinf u35 833d845950b9cbb025629fe4c040f8f6 -sinf u10 9c21afa4d7d6af3fc666309c3cd647fe -cosf u35 74d7f871a6553cd0019087895e2052ad -cosf u10 35349e94c323c1614f22093959288010 -tanf u35 bbb7c092d017e96d2454a38a20687735 -tanf u10 227423bc04f42d76a8f68082ba696126 -sincosf u10 83ecc4e3d5295056e9d8c52bc196b666 -sincosf u35 533319caa49a961e4909bd6dcab40721 -sincospif u05 8b3762b67a661957c1414c351ec49034 -sincospif u35 cec15ed76a358091632634166fa77b66 -logf u10 c5a90119943acc4199e1cc7030b5def8 -logf u35 af2fbe4bfa2caaf59c734e3749dd15be -log2f u10 ba8acae369bbb7b6404cccbc633fe25b -log2f u35 ba32ebaa8c470899ebd433d190c00f03 -log10f u10 7e235a82d960e4434575dd39648d8bb7 -log1pf u10 350fc4f13502b36bb1107e1b1122acb1 -expf u10 ee4adaabefa3fac6c0f1925b2a948eea -exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d -exp2f u35 522cc30f722f77fceb07015830b351a3 -exp10f u10 
b0564be151965600f5744ff2e4992bc9 -exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0 -expm1f u10 ebfd6498cb40f61b609882de8a7f3c74 -powf u10 a7cba3239c87969662e8b41a4dd8b4ab -cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6 -cbrtf u10 2a245b03f83e9114644d03b40dac707b -cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5 -cbrtf u35 2aca0404626a28f7af7f60105ad6e217 -hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30 -hypotf u35 a6f0f774b346a6bba08889ff9ba3f193 -asinf u10 7f77f7453b961512c89e87e49c549cfe -asinf u35 22ed8760aa328e1f714031eec592a4d8 -acosf u10 15617dd0429b90e59d2923415934c2a6 -acosf u35 af0b132d9e263721f9296187dbf9b9bf -atanf u10 26b77fb423104b45633cf24500237d6e -atanf u10 4313d0bc2708de53f74d804aac6564d4 -atanf u35 97a1797897955643c722c7d291987331 -atanf u35 7d3f47169415058e8578f11d899bfd10 -atan2f u10 098a33f730fe95ce4774a991db4cee14 -atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363 -sinhf u10 0780a2f57df3a831718195d1ee5c19ef -coshf u10 cfbb6aed408e43a7b7f053474100ff2d -tanhf u10 d19f254d41e8726c748df87b95bc9acd -asinhf u10 260d129221468a86bbfd609c27bfea6a -acoshf u10 24ced7e5631c78b20a5716faeedbaa92 -atanhf u10 164fd77b8372b8c131baaacab1c9e650 -lgammaf u10 3bf6d824175c4f4d86f3073064e41e84 -tgammaf u10 f3a8d25c852068622bdfcae4cb813583 -erff u10 f34af3814153de040b93e573ca7d21d8 -erfcf u15 915ab9830de89a5a504b3ce7cd2fecda -fabsf a3c72220bc0ade68fe22e0a15eb730d4 -copysignf 6b35517b8e1da78d9c9b52915d9a9b19 -fmaxf 9833a60a2080e8fd9ae8de32c758966f -fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0 -fdimf c5c0fe7b095eb8ccbb19fbf934a36b24 -fmodf 77aa84a9703e202a56e5f4609bd2482b -remainderf 5a453b1217c173e4dc0b0211066750be -modff 5fa4f044f20478216aa085a01b189697 -nextafterf 517c1c8f072e9024518d3d9ead98b85b -truncf 6937050850be63c44d4b7dbd666febe6 -floorf 9341be69ee345c8554bf3ab4e9316133 -ceilf c70874771cbe9741f1f05fedd4b629e9 -roundf 0cf52f6b8015099771e9a7dfa6b090bc -rintf bed68e788e2b11543c09c9d52198abf8 -fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c -fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2 
-fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2 +sin u35 7ddf50bfc76c34f8640e1d48368a4807046ed09a7cd9f4e092364c0ece567420 +sin u10 2dec8ff3f5d3f0601ee7d5d8cda65777b3b31d86f522b1306cf50d0a7820bdba +cos u35 26a6889b13864c87e41500246afd02ec626529b122a1622ab5b4d915342fd981 +cos u10 094594b432e3f6f7695f21a9eac5f48adfc2b52729a0b7f6dcc73d56572896d4 +tan u35 9e4884d3079d52edb120d080ae609bc94dea6de36b91f9c41f7a69fb424cb7bf +tan u10 ae386240aec3b3ce4b7d5a13b1f69759f54fc57378439b9801c65de4e7c8f5c6 +sincos u10 dccd728b97586cd65da3998eb225c3b59634b360acb56ea74d1d45d61fea4f4e +sincos u35 2c16ec6ba4050808419fd5b9c995606412a0fd41f2a7e109c1a8cab5adf0b11b +sincospi u05 9fffb591dd38190f8dd61d0f9dcaf7843606d4c3f6717bfac9835471178600a4 +sincospi u35 b362c2f22c2475715d0933caa5ee1400ae1639da9e60c83eeca676e3b2be12d7 +log u10 a25704431659d3f451536556bd81a2b9c2abc82203e23539df2ecd899436a9e2 +log u35 83476779543cb9f3a038e478e8fee0d6ee0060227a2433363d221d71ddc72ac7 +log2 u10 bf2467410af2c29e30ebf509bc066759c17b31fc409120382898a6979fbbad2e +log2 u35 2d416462682e561a2bab83d5b11ea235cfb991675e3777fa50da75d755b08774 +log10 u10 1aa2fb18c8ae9a19f8f9be331f72cb3f842188b705d73e86bde47ecf661297cd +log1p u10 e21e7518e09b85f0adaf1d0d3cff362364e925fd07aa3163d77b818cb644d942 +exp u10 c21df57b84d8c9010aae562e21daf7b1c3f7df277db9cff2999d74bfb517e60d +exp2 u10 451209f52083f022f30793abcf7761eae138642bf8d5a252ca8c83489088bff3 +exp2 u35 0661d1afebb47f2755e97337d6b065cf925219aba48e192b9fbb56f696f17d84 +exp10 u10 9881cd7b6c7c2eeb7b8b5d297277d1d0f4276ea74835672a94fbcade8e604d34 +exp10 u35 5a8d99078d3ca904dad9fc3ac4ec7c90d2bcd216417022dcb38df30293e1cdf5 +expm1 u10 609ae579ed99b4c8ff7ccaead9c3a2216bfbc1d156dc05a6b401de066b0a079c +pow u10 a0034cc77ecd21a809265f76e67528217357f2ef3d2883ff017512f92bbf9360 +cbrt u10 e128b321cd05dca403a7b0633424cad82600ceb5b61966f70ff3cf425bd6b3f9 +cbrt u10 b722d767ae6dd66d3d1dfa9d5d2aedaed3c652020dab5fcfdd729b3f2c803e98 +cbrt u35 
5ecd857b96a17ecf71808a53416e0f40d0935f236e307dd5e43587b12db375cb +cbrt u35 c46da13b1a71174922de04a844b1b303ac5fd2d0da98a6352b234292cf7e42e9 +hypot u05 9f4275e06e1ce269722162c4bc521f159906a448ee05f9619037706cd3e54b72 +hypot u35 de0c1ae1ea4c9eda164e0dca28c293cc72caf3b12b2d15f757bbb4bb347f257b +asin u10 c51e0211bc0a1a422982df89d38f48ef0b0af1d90588a1715fd4ce966c701b66 +asin u35 405410e624265daa84c0837c55ccf2d45d8c4f6086b6f6a744c4c6e133cbcc1d +acos u10 8e8c6e984110c0decc1ce21bf71505195f029a935064bc3692997b400cb15edc +acos u35 bc99071767af3d4bf23c3d828284a6950ae205898a6b3773a5aca0b59d6d6a0d +atan u10 c96690351d5df7745fed2004b1c72dc7aceaa32c4d400f296c32efc9ecddab0e +atan u10 9f64e9a576084542e1fa4a4064055af79b4ae20ced35ca617c4327a30a4a70e4 +atan u35 a0852efacaa91625350cf104f8fe0dcbb5936d2b9ebbd3cf8cd6234ccaf8a0d3 +atan u35 e61f1f4917e474cbc7ca5ada17c31bdece04c6a86210a472c53cf5e8faeac882 +atan2 u10 9b6c9b875a9c841259fca8d718778a1895a5b434ab4b95d284c4345249c2f853 +atan2 u35 895dfae0dbce6c2aff81b986ebc732fb0323b267f57c7b1e0d5c8ec522da6af4 +sinh u10 d3859e3dc1ca924f11dc7b464cb0bb535d4ad71d1ec6f416a82db6e0e2390367 +cosh u10 e6fd1172e97fa9341028299dd8a00379f1313170b8444a6a3c291230e4f178b7 +tanh u10 5e2c1ce9d160d1a5dcc5ef8fd74f860751764f5dc14124075f848074ee386618 +asinh u10 37d0df9811cc871b1dde4d762cc0eb53ec6c71c7bcf13100b9b5302ba1a85b99 +acosh u10 158fb84af679aea2ab411fb84cd0b12ff876d897722ff84c54fa567c35705033 +atanh u10 32253ae4f643e56a3d25a6d96d316ed94cd3a9e5ea16ad7180ff96e68571dc34 +lgamma u10 4663f72dcb58a53bedefe071de51f0fccb9b73db12f5b53d5acea347d4de06cd +tgamma u10 87e21460a2a991b677416b39a85d391051e4327a39baa7bfb93f2e27965567af +erf u10 56488fa7013635a233d05787e9a681c1c8775b6d9aace07f0d1dd16fc34c5875 +erfc u15 0e5e1126a0eb4cce30f6cb164b33330ac4d792c21b8bfbe33cc9a828b4f9f047 +fabs ff336faed535e34a082752839c9e957ba069ffdf0b046215bd415ce9120f29a3 +copysign 67a7a162bfc2f15b76ded0470f938ab000edf8f8566d5a19fa99d4ea4d29fff3 +fmax 
57f39d5440fadb2a7387a47c00b067d5fc57ceabd7e5d64943b033acb5212063 +fmin 87e131762ec9c46badd6105ab66f09d99d65776e2719f6af9befd8d6d3f59b6b +fdim 3331d6a17f289f54d429bdda9374d7d2574e0cd173e930a57436e8e484f271e9 +fmod 89d26af516be177c55ba9fcec972416c35e229456b053271548021e9b070c193 +remainder 2db01bb12776ec14d4a15469c31b49e759d74a3c8ed30d14fe88af3b27b5c398 +modf 7780d1e6448f21bec6504e398a4e826f304da10aaec3c4e210bed86abdaecedf +nextafter 60a6c07477f6d07cd938ba6361d020175193a934a2714132615dae0bcedf785a +trunc ee43b2f9d897428885cb039f85259ea5ffe4efbfe4bf0dba16ee19829d198ac6 +floor 29f8be9b8ad5795e65ed4f34878a85f5f8a1be707489345c4ad04b36d4da54bf +ceil bf267441867b261f8dcfca61b55fdc7ac0ff7a017b150da1b532776894962208 +round 5d7d57a50d9860a7d145d428884df0341564dec7f14c24d5c319c8bce5565f9d +rint 834f8e41e3a28f43b26bc9a5836882cbc0fceeaec5774202cb6df473d995f5a1 +sinf u35 0b91688d57e650a50dff113cae51be6088e067e877baf0fc50675528432d1539 +sinf u10 d6ccd197ac5534b74a04340e62e38fc5ec9fb1cbffef80fb1782e659a1832260 +cosf u35 c5d48802983d4673bf3961453a3b02f13b894b83144f067d93b1d804de722aa2 +cosf u10 420ba2e57ee0bae63e995ffb85aac07a5f1758d76f824d24193f75af349fca8c +tanf u35 ec5bcbe8a93d2a5f59365656ba15a10af2f24375bf265663f762730674a656b9 +tanf u10 2d4c53018daf572ce2e20fc7bbe1435b04746db6b0cee9c33304cef94f14dcde +sincosf u10 b0390e1d3554fd469d53d5e45146e9e1f440d46fc0a9b8f9ea334071af369f55 +sincosf u35 c4967d888e7713ff231c3fa3372a0d89c5df220585054156256bc3d4f0917f3a +sincospif u05 66ccd831fa4c215b71cc791f3d0cb31babeadd34539867df8029cddf45539ded +sincospif u35 9fadd97cd2996c6601079869248a59772bbd5b23b625177ef0351120f0759fc2 +logf u10 a43f52f3ce728ebd9ec9e2e84c901f6012fe0d6b83029c8380036404f59cd3ea +logf u35 fd05264b52e29af9f0907b98af57f0cc0737b506a6290c259d3eff92123add86 +log2f u10 c732f1b5c7f5147d1576d4d858db46952d42ec229117dffce8b82e798799d2b6 +log2f u35 d2e637436e49d04e7747258946075b715033e925ca589696b4577a4f96632a9b +log10f u10 
c616f9465c071c42532255e9a49ba4305e0a588fc8d87ac31fceb30d2c59391e +log1pf u10 fbfce7374fd3e030b5678fa31e99bba2aa4e68e60e8eeb15a10e41fb34ed1cda +expf u10 d75ce19c93fb038cfdd8059f816a7912481b26f7d90cbd554545f21a0b873861 +exp2f u10 4a579f3f572362629acd563e55d765a7d83cbc625584f26e0a36163e80bffe87 +exp2f u35 90c3bb433051b828f081de99c3d3e1d731a718de306d0c9937478f2b57e981ce +exp10f u10 57856cab0911b80ebeeded0c30b9e978ca6d17314ca2e7522c02ff6b6e904f57 +exp10f u35 e14dfb56cd4798e675b751c6cd4ddc073e9a5e8f59a97638bc8a9b766f564a96 +expm1f u10 c0066ace0274e83dfce6b6f806ad89ef4c8b0919011477934d43c88dc42e0db2 +powf u10 b380319c0b9bad2cf717f8c31a09361b869d49c1e58ee5e1f0b987f96e3acffa +cbrtf u10 3589ce3bce26b796ddc4c6ec177cdb0ed05ece414530f4c22c77452b37432050 +cbrtf u10 cfe7b512f728e60f99e14f597d34c94279b96e6cc897fc5ad1377365afb164c8 +cbrtf u35 57902935bad6d5f45565d447e82ac2fd673442b8fb01fa178079376ff1220b27 +cbrtf u35 172785fb38220b147078c16b7b203edf4e879f853e335522074ae0103cddc472 +hypotf u05 efc46c07c1bff7caf4f1d52fbe0db4ab70100601c114acea3f4ecf7b2aeaf826 +hypotf u35 f09d3b29f563e599ea2d5e6434ff84de3e72ae277fce5055ee2bbf9ce6aa4214 +asinf u10 82e645be1e4e8216be262cf67eac586a8d8a0e962ae5d34cb14c55ad177883d7 +asinf u35 1010918bc615b794d532b8643b60a315f2bc8e2248020b4a6024ffbd593c54b1 +acosf u10 886eb790a1d46f29fe04d470a1e71ee565951d22383cfd67eca92d3f3437db6a +acosf u35 75ebefc2d532049af4234e3247b311782aa60a776c53d669956f578e5b2e76cd +atanf u10 540a69391b28afe8d067cc99ac86abbffe08bb3c24f8962be4b7aef0677562de +atanf u10 2c12f291846249ca41d6a9c4108bd93a6b30246ef776bc282ad8cbb9e6c05890 +atanf u35 d8d7c1156fd61d138ccb88d435097be739c7bf4806ff605c0d39216380b55e96 +atanf u35 6985b58dddf827aa610029c51aaa204952589175efb607e2b135a1dc666b3fca +atan2f u10 7756cae9e0b7ebe7e5180f9714e49c6403ead4182ebacbb89dc0cb3cc386e998 +atan2f u35 a645f681b04876451d8f0de0dd28958303b2b7f3b51957883b09588776111ddf +sinhf u10 d8094aaed987d20b0c4e8eccb63ed5cc00f4ad8bf46c67888f5ab87c21b15681 +coshf u10 
26d59cb9ec0a6f5965dfe66df3f89fd2bb348ce75f811ee580426df42f1ebdc3 +tanhf u10 3b715185ce7c39d70ff17dceb539380b8ac9c80303c9796e41d1ebda6f2b0ece +asinhf u10 281dfc8d6f3a9cad40276392b21e48d14ae05986d9a97ce21cf122adf5d14ce0 +acoshf u10 9a5809171d6a8c4a3e39fd32a71d5dd83d7a55ae8c2c352dc453e59b01c4a42a +atanhf u10 66540cd17454f09a95ef5adbeef6e9413ca31cb0446edc879447838f7b8c079c +lgammaf u10 b26a90f8b782f2a91132d5c12dcd56d749e301bf51e275085df2c4579639fa44 +tgammaf u10 e3ba7f95b002555d655e07e8906d29e0f867c28c3abe6513d32c20468cdce05c +erff u10 81041541f31e72a7745d6fca4b208d4e332af8fc2366df5372b6cb38755369c9 +erfcf u15 88205a29a679f22867bf078202e68f2a8f5557780f0b8366db2f0f20c1e23151 +fabsf 560d13e463bce4448d733798c5818b13e5634f893211047bc2fad9f4c613797d +copysignf 74f7af06376f4c79d7af9ce4e50bde2fb8f22b56d741bdc67624ef7d1989e76b +fmaxf 7474be750857fe400beb2bc14fd1b1113a2e365ae7b45b0acc508436b4c32a94 +fminf 4e22c453645f3c108c27e2c2fea65cd6a6b535f8236fd7382ff1082db3b31b5a +fdimf 16c17ff31778c7d63ec7f65c3b2a8561b79be62b1bc1b399ac0ffc43285b6cd0 +fmodf 9fa4ece68b16803e6c47fa5cb280f8c246a2ef5731b0609bec71e1db27906f0a +remainderf f32761a428b1336051ee773e470b74ada93a611cafbc08f6a9aff36957c84f64 +modff e976f223f2f4d380e9955392cb010920d5439665ae8eaf0fd6abbc889700a4f7 +nextafterf ac05b3fd824c3ce73eb3946c7e3dec94ce4b1ec4efd0237bcfb4578d3d422cc6 +truncf a014cd8206fb15c21b1cc773b951cf7f673e8be9e6e697ca0cf7293becb3d55c +floorf f17658ab95f73a1b4cfec0417e82b1b071cb97a3aac0110e289ba6449b2aeb55 +ceilf e173e35cc97a85629ceb5025bd5b1abad52e4e153166cbf9672396b4ca23b59a +roundf 3e67b087b019c806d87593850bf0cd106718cf34f50684784296ef040f301fc0 +rintf 827acf1e7d253c4fe9fdd4c5c9e53f35c80af5550ff6cbbb7aaac67577630c70 +fastsinf u3500 6c68502acd4bde521daad91a0947faea0bd4b15c8e1d8adf4614351eca60f7dd +fastcosf u3500 64cb4ab04eca2de35df084ac4c3c7285553301474357783f96ee6467e21f9144 +fastpowf u3500 a908509f84693183aabb532aef9c26f42e340bd0a0253d1e40cab44358c6b76a diff --git 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_finz.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_finz.txt index 6d8589f35bc..ff3dd6a844e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_finz.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/hash_finz.txt @@ -1,129 +1,129 @@ -sin u35 c163e4a7e9ccebb2181dcc8653367d8c -sin u10 0d6bf6f2c935db82588222da95659019 -cos u35 52f902bd939d751b5b544ac70181fcff -cos u10 afcdba92a75a76d56b8cf2f22d4bec9e -tan u35 906cc42b6755fe514c5e185fcb4d2f55 -tan u10 c98f29a62067fa63646d9bcc29a310c6 -sincos u10 3fe37f4eb805505152f2b14a22a9f94e -sincos u35 95a7b7f48c71febf10ec6eff796dd391 -sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d -sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c -log u10 4855b27222d900bea47a27cadba71727 -log u35 015f8ae899c9b921d48919dd12ef19a9 -log2 u10 2662df9af919680ca62e1752fb1b7539 -log2 u35 908b1949db34ea855944f00089b21e23 -log10 u10 36645e8031d873d66fd0ec2c5959f273 -log1p u10 1383924fb56cf2e7eda27de21320c591 -exp u10 084e5be89c2ad03e356078ea4f287bab -exp2 u10 6e36db9ae2cf9eca82e3d9157c622351 -exp2 u35 6e36db9ae2cf9eca82e3d9157c622351 -exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e -exp10 u35 6904d5509ca794747aa249c13886f90f -expm1 u10 cd3f0b8e86943d52c278394b60e2d22e -pow u10 7e19796027d7c1d1999be948f90e6181 -cbrt u10 5d8bf28ac74624594fd1be9217817690 -cbrt u10 3c896e03746bcf1b3f70182dfec3d93b -cbrt u35 fc7ee3e3e6c54365d708b752c242a947 -cbrt u35 2408714a56d74f8c82389ca6772cdbc1 -hypot u05 cc2f18e409e19a02cadf7b91fd869120 -hypot u35 be7bbd41dffd746b70261ee773cbd4b2 -asin u10 8a21b7c28cdaffc9d3e53f415367932e -asin u35 9c9e8107782898e9faed6924ad1b3cb1 -acos u10 28261e4eb8331865660c814676d5c6bc -acos u35 310911130bfc45b10dabe3a072939331 -atan u10 f931de72f2f6a7928f307a8a382ae255 -atan u10 453f9ef62f58f9829320baf482a1d457 -atan u35 6161b6189609f105b017d8768d0a41f1 -atan u35 
6face71d8d93c69448d49ed6140e361d -atan2 u10 469babaeee9bd30e17af2f473b3ea500 -atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f -sinh u10 61d459b1f368087f6f23ebf8e9f0ea01 -cosh u10 f77eb95f79e274c12b4e92dc0389259b -tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3 -asinh u10 01136e54e2a434839530dda54f33cfdb -acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e -atanh u10 601a77ba8c1d5175f2808b48a41260c1 -lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da -tgamma u10 cb9a93844ad1713d2ab92ff5b6398150 -erf u10 8a0bc2146a5c67b6bebc58f4b0076568 -erfc u15 3e247a54183eeddedc33e99c50118995 -fabs bef2f2ac8a4789357e580b4da4f9b9fe -copysign 3219022f267464e3704f90558e8df3bc -fmax 4e4f5220ccfef191864c316df0d18fc0 -fmin c0f8effb6c611e2b3b91b820ad943f62 -fdim e876d103931f18ceede5bfd7e3df7ab0 -fmod 618aa751e13012afdb41ec80dd35e6ba -remainder 8d692dbb44bbc9be5af0c0657d3008b8 -modf f03ce73cd4f9ea7f69c017f6e53355d5 -nextafter 9eba4e30d12d74dc4e8003fcff0f1582 -trunc 1bc7e909eba121dcef7f0e4046937ae5 -floor 2cff66b499dc8a30cec9467de659b774 -ceil b080e632dcb8f8134d8715752be12917 -round 8907e21687ca9c2a539297536e754950 -rint e49f837096bc661fe1c742801dd99a30 -sinf u35 f8f804eae1d9443103e81fec96293477 -sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe -cosf u35 f2f3d1c9f090cde9c02439608dc7066e -cosf u10 dc35f27fae65f63f0aa6ad241f8b387b -tanf u35 68d42ad1fb412e6b8be3853461e61213 -tanf u10 97df301d4f59e67d5318b5356b703f06 -sincosf u10 a97124d810ec461c135dc4fb0c059b6f -sincosf u35 0cc521e52ae1227d311012c2919c1ff2 -sincospif u05 8b3762b67a661957c1414c351ec49034 -sincospif u35 8720757f221c00cc8de24b7dc4949144 -logf u10 c5a90119943acc4199e1cc7030b5def8 -logf u35 b6234302d534d6ccd48155dd6b9a4293 -log2f u10 ba8acae369bbb7b6404cccbc633fe25b -log2f u35 74174c90717c86642b71284452a8aef6 -log10f u10 7e235a82d960e4434575dd39648d8bb7 -log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475 -expf u10 9597388315e4b3e89c4c97ce46374dcf -exp2f u10 42d66e5e4cb88feb29c5b36c632159a5 -exp2f u35 42d66e5e4cb88feb29c5b36c632159a5 -exp10f u10 
954f0824b6d949d0da03b49950dc6642 -exp10f u35 6fb0e9a829e12a06679d379d05b53ede -expm1f u10 ebfd6498cb40f61b609882de8a7f3c74 -powf u10 2ed84af40d03e307a620365f172d010d -cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6 -cbrtf u10 2a245b03f83e9114644d03b40dac707b -cbrtf u35 6c22a6dc132c5212250970f22f42256d -cbrtf u35 5ab696ae11f9637413d30e6496d5324b -hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30 -hypotf u35 2a7cd97768287084b7fffc7e9fb39072 -asinf u10 e2e571a01984c4ffb3f6e38e0328d90e -asinf u35 70df2dfc3a3569868cce60c38e7b1962 -acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb -acosf u35 72b0e2f9791f90f1c43570b9e9ba893f -atanf u10 fa672e387a204055f735b7af98dd8a35 -atanf u10 d017670c13bc221b68bc9ee5f41c4b5e -atanf u35 f592e46eaa5d29583f86d3e336f20b6b -atanf u35 e7087fe40de46921826b373d10c40954 -atan2f u10 275b2fa8ee554c45551bb142db9f8197 -atan2f u35 44b187851195d24bab2561eb8f4ff5d0 -sinhf u10 45bc228a14c3e39eeb35e9764394a23e -coshf u10 838d441e85d415ef4fb1e5c5ea966a71 -tanhf u10 d19f254d41e8726c748df87b95bc9acd -asinhf u10 927eeb621a3e2d5039f1a07fcf150901 -acoshf u10 932520013273174fcabe2be4a55f919f -atanhf u10 164fd77b8372b8c131baaacab1c9e650 -lgammaf u10 3bf6d824175c4f4d86f3073064e41e84 -tgammaf u10 c3059747811d98846f74a63d3747ac3d -erff u10 f34af3814153de040b93e573ca7d21d8 -erfcf u15 687a9c577512d349ddbc0643013d2c56 -fabsf a3c72220bc0ade68fe22e0a15eb730d4 -copysignf 6b35517b8e1da78d9c9b52915d9a9b19 -fmaxf 9833a60a2080e8fd9ae8de32c758966f -fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0 -fdimf c5c0fe7b095eb8ccbb19fbf934a36b24 -fmodf 77aa84a9703e202a56e5f4609bd2482b -remainderf 5a453b1217c173e4dc0b0211066750be -modff 5fa4f044f20478216aa085a01b189697 -nextafterf 517c1c8f072e9024518d3d9ead98b85b -truncf 6937050850be63c44d4b7dbd666febe6 -floorf 9341be69ee345c8554bf3ab4e9316133 -ceilf c70874771cbe9741f1f05fedd4b629e9 -roundf 0cf52f6b8015099771e9a7dfa6b090bc -rintf bed68e788e2b11543c09c9d52198abf8 -fastsinf u3500 5c48081c74cd0316379b580b047dbfc2 -fastcosf u3500 6f73d116f109283e5632c31f5988f55b 
-fastpowf u3500 6dbb3110412df4fed5a71f50d40def89 +sin u35 c0c8e53bd8762032e30a6e843131ee80bcb7c6acd3fb299e937be6add5a8d5aa +sin u10 6692fc59b029f7b11a511c21ff2a5e7c01c8b76bcfce80357878b0ac8dc42b29 +cos u35 5096992132d8ea8ffdf32f0193b6c6dfa5700bbb64a278ec2e7e5ddf4d0ccd51 +cos u10 bb8942ccdf1c86289f2ab560033d38f39b37bcb87d0a2f646f71a9521456e905 +tan u35 334507c35c29da824184f60c8318d3d0cab6ec91291768794936a0fd1caa08f3 +tan u10 48006a954a296162fe7232ffeb33e602ac54bbf38e2764ab65ea2717f53b7906 +sincos u10 042262aeafa5774345a43d75e0aca41d4e8e591ba86a35fb113e9f41c1b1b198 +sincos u35 628ebb6a27b6eacff75deddf301f06ec517dde8ba4566f84d765775d4d2cd8d1 +sincospi u05 9fffb591dd38190f8dd61d0f9dcaf7843606d4c3f6717bfac9835471178600a4 +sincospi u35 b362c2f22c2475715d0933caa5ee1400ae1639da9e60c83eeca676e3b2be12d7 +log u10 a25704431659d3f451536556bd81a2b9c2abc82203e23539df2ecd899436a9e2 +log u35 b47e57b1afc82b14211b9f3338f41208771b7d971774cf535e9e9bcdb6327db5 +log2 u10 bf2467410af2c29e30ebf509bc066759c17b31fc409120382898a6979fbbad2e +log2 u35 61cdc83d0e7de8d132764065fc7ba47bc18dadac441938d7bb0550c18b27956b +log10 u10 1aa2fb18c8ae9a19f8f9be331f72cb3f842188b705d73e86bde47ecf661297cd +log1p u10 e21e7518e09b85f0adaf1d0d3cff362364e925fd07aa3163d77b818cb644d942 +exp u10 c7997af9618cab09736d7736614dfe6541c6417b75894474c02849e25c5eb6a4 +exp2 u10 43ca5b299c5ef8d38c7ea3594e8925f00ff7dda62788f0ed003ffac026f4aaa4 +exp2 u35 43ca5b299c5ef8d38c7ea3594e8925f00ff7dda62788f0ed003ffac026f4aaa4 +exp10 u10 b9d8ea0a1bffa2097c84ea57752a00e71e12b0454ced6ce40a56c0d62a05c2f0 +exp10 u35 9dd4096b0f0907112a7051e4cd0f8b93f4e56403224f5cb5e0e1a3601b55fc14 +expm1 u10 609ae579ed99b4c8ff7ccaead9c3a2216bfbc1d156dc05a6b401de066b0a079c +pow u10 74772c3583d5579f1b28fd322048a40c286595057df623ec65028a9647f7bf46 +cbrt u10 e128b321cd05dca403a7b0633424cad82600ceb5b61966f70ff3cf425bd6b3f9 +cbrt u10 b722d767ae6dd66d3d1dfa9d5d2aedaed3c652020dab5fcfdd729b3f2c803e98 +cbrt u35 
96d1ef3aa862044af5cb0ee7fe62e161b61fbb9ab50549925b5f4bc8c1450106 +cbrt u35 3d648e8f0e56d75a4765d3fe4ba58578dde6576199dce8a920d4fc74f3fd2077 +hypot u05 9f4275e06e1ce269722162c4bc521f159906a448ee05f9619037706cd3e54b72 +hypot u35 0473b61c7dd7a4e6a8394bbafdc613f4e1d8eac704830dbc6257ee8f85601149 +asin u10 7c466883cd3b6055bff9f8f13e2a8eff00de053f428f88b169fcb18b85f5859e +asin u35 cf291432912ad68a37dccb92882199e11d382b402794d72bf78d467a40ba6911 +acos u10 31f80b277ac9dbedb9f4397fa058b11e3e2497adb5ad8dca3055b18bd071b2d4 +acos u35 6025e6a4a64608b06709ba1eda3da1a3a697344c27dc1be50aeecb722aed5837 +atan u10 561fe325ecfbe2ed5b3761da5f43886ba4081566e12b793f02fb105f57d74cd7 +atan u10 6f8ded4d8fba9461e3df9faf8924499424d5910b4e3d7829573efc4b088316e1 +atan u35 9408d2aa734a6b0c0bc1c80f4ad34e2b3dacb5eae623366deaa2cc2b9454499f +atan u35 c03ad6398c6992d946f89ff389fcd548be3bd9cb4fd0a1613f686a5a1ea1f0dc +atan2 u10 a3bcea5507555b07f1128585312e7772532dd414dd21588a95405188e4af6af6 +atan2 u35 4cdbd13d36484ca540eb04d8854674103107aada4deb662d49dfdae9aa3eb7ca +sinh u10 d3859e3dc1ca924f11dc7b464cb0bb535d4ad71d1ec6f416a82db6e0e2390367 +cosh u10 e6fd1172e97fa9341028299dd8a00379f1313170b8444a6a3c291230e4f178b7 +tanh u10 5e2c1ce9d160d1a5dcc5ef8fd74f860751764f5dc14124075f848074ee386618 +asinh u10 37d0df9811cc871b1dde4d762cc0eb53ec6c71c7bcf13100b9b5302ba1a85b99 +acosh u10 158fb84af679aea2ab411fb84cd0b12ff876d897722ff84c54fa567c35705033 +atanh u10 32253ae4f643e56a3d25a6d96d316ed94cd3a9e5ea16ad7180ff96e68571dc34 +lgamma u10 4663f72dcb58a53bedefe071de51f0fccb9b73db12f5b53d5acea347d4de06cd +tgamma u10 ae094d163ce1ccaf94f5146ce3b147f76a886fee2758c8735328304bbb514b42 +erf u10 73867031c0df90a5d060040cd160c7fe14fa6fc0c46104959e574ab6efdd67f7 +erfc u15 4632ba9c10e73c7bbb32adf163d48d4cd90aa0c3314de4a7878953da08433f4d +fabs ff336faed535e34a082752839c9e957ba069ffdf0b046215bd415ce9120f29a3 +copysign 67a7a162bfc2f15b76ded0470f938ab000edf8f8566d5a19fa99d4ea4d29fff3 +fmax 
57f39d5440fadb2a7387a47c00b067d5fc57ceabd7e5d64943b033acb5212063 +fmin 87e131762ec9c46badd6105ab66f09d99d65776e2719f6af9befd8d6d3f59b6b +fdim 3331d6a17f289f54d429bdda9374d7d2574e0cd173e930a57436e8e484f271e9 +fmod 89d26af516be177c55ba9fcec972416c35e229456b053271548021e9b070c193 +remainder 2db01bb12776ec14d4a15469c31b49e759d74a3c8ed30d14fe88af3b27b5c398 +modf 7780d1e6448f21bec6504e398a4e826f304da10aaec3c4e210bed86abdaecedf +nextafter 60a6c07477f6d07cd938ba6361d020175193a934a2714132615dae0bcedf785a +trunc ee43b2f9d897428885cb039f85259ea5ffe4efbfe4bf0dba16ee19829d198ac6 +floor 29f8be9b8ad5795e65ed4f34878a85f5f8a1be707489345c4ad04b36d4da54bf +ceil bf267441867b261f8dcfca61b55fdc7ac0ff7a017b150da1b532776894962208 +round 5d7d57a50d9860a7d145d428884df0341564dec7f14c24d5c319c8bce5565f9d +rint 834f8e41e3a28f43b26bc9a5836882cbc0fceeaec5774202cb6df473d995f5a1 +sinf u35 5667c75091aaa7f6cad0b8e1ff80c5470cb5bfcbeb37ca089597a42bb89d21f9 +sinf u10 4749c75d58eb24a83df44f86cfc204cd49b00a84472a592adfa5b0dc6ee5920e +cosf u35 c9aa15477ba53c5d4816a63ebca00123ebe9798374b7f93001478baf01f42393 +cosf u10 8a8cc7609d7afacff4ff1a075784ad32d891567eb6dcc6ab115b0421c3985359 +tanf u35 f7c53052860fa55f44e2fe63af8af15eade5e94951637634ebc5d0ee3c56dd6a +tanf u10 4dcccb3f2c42cf20d9cfa5b5602d86d8242d4d080cfa4f00321333e338cfb9ad +sincosf u10 3643081262b2d43ccedd509daca5d16fb66449aa1774a645a5b1343d4682c81b +sincosf u35 e02f3f1d2848c047d30ad1d89adeab6a9b0aef211fa0d8cd6613a43170e4e0fe +sincospif u05 66ccd831fa4c215b71cc791f3d0cb31babeadd34539867df8029cddf45539ded +sincospif u35 c2a92e1892c9ca12031896177e0dd898cb22b5b8305b42754b1a834485189c9b +logf u10 a43f52f3ce728ebd9ec9e2e84c901f6012fe0d6b83029c8380036404f59cd3ea +logf u35 68ef65827671b86d1fc77d8cb734c49e4c211bfb35990c84a4bbdec6026d8b4d +log2f u10 c732f1b5c7f5147d1576d4d858db46952d42ec229117dffce8b82e798799d2b6 +log2f u35 529ca0ddf923543e938ad3663ad572b9addc586e7f1398c13dcde257b3bd65d1 +log10f u10 
c616f9465c071c42532255e9a49ba4305e0a588fc8d87ac31fceb30d2c59391e +log1pf u10 384577af7f24c0ff0abf3a574bf21e348bceb60a7a26b3a7006b7f1fa7032049 +expf u10 1554f1b37125fdf5cf7e516415a04df7547be47dd89d262d24519c0a092593a7 +exp2f u10 374572349c0d64862128a5f7e27555d5f7a2768ec20d52cfc73b2dd608128542 +exp2f u35 374572349c0d64862128a5f7e27555d5f7a2768ec20d52cfc73b2dd608128542 +exp10f u10 240f4207fcca7934627f058b87b2d935a0d5733123a61efa0cee45ed38af6d7b +exp10f u35 3806645d79d1e6ce3cb56f1d1d95689d835e54061b647c8ca8d8c0cb7eb19c97 +expm1f u10 c0066ace0274e83dfce6b6f806ad89ef4c8b0919011477934d43c88dc42e0db2 +powf u10 d370c629e456bed37684cff089d3f04dbe110d8ea0ba40e5e4f49abf9d874134 +cbrtf u10 3589ce3bce26b796ddc4c6ec177cdb0ed05ece414530f4c22c77452b37432050 +cbrtf u10 cfe7b512f728e60f99e14f597d34c94279b96e6cc897fc5ad1377365afb164c8 +cbrtf u35 30fa2b571dec71ccd9f31607bc26c591036ced33e0ceaf038042e6a162b1ddba +cbrtf u35 a0ee4a56fbe28cc4c922188397c10456a0dd54bc31c54b0bd2cfffc7c5626dba +hypotf u05 efc46c07c1bff7caf4f1d52fbe0db4ab70100601c114acea3f4ecf7b2aeaf826 +hypotf u35 e2e71c42bba52629c44960938d5b9961387aff15d92126799dff5e08f351b1e4 +asinf u10 151d448af3ece5f8b2b1775b375cc3260895ac76042814d30bcf156f368d3d45 +asinf u35 2daf25858c2c889ec4b3920ac12b00d7a1494f35f2abb36a3c7daabad99b751f +acosf u10 d4ea707c8f340c6580ed68072d92065abd8942272fdc048cc0318b02e6d312a4 +acosf u35 a7a7a0a8e081e8ef26610c118afc1b7e60b8c6577ca644f49b0aca06f97beb91 +atanf u10 c5e2e79af3d422f9ac9424afda4eab64c17ab80903305b3a281580c997a86055 +atanf u10 ccea76f6a4c4a8941a5259c9c50c6899d71d0bc13948421333c14a604718c31b +atanf u35 67f3d2ab58989e4f24d6ac4f7106a58043d6a8d3a749a6308f155237d1c38eee +atanf u35 7fbc39fe8698ebd79040c51fbc31356acd27b1988435b96e4191eec8662b27d6 +atan2f u10 fa56d1cfea9cbec5de469b1768bd660c19bb079361ec861f3ac0604a0acaee64 +atan2f u35 6ed820eb372024d39c6db25a3242c7cc63c1d416fa3df8e0c68638a979c333f8 +sinhf u10 18d9bc4d115cc4fb5061fda0e1a6b3aa90bce4fd68aa3000cea10dc94cc907e1 +coshf u10 
fcbdbe1ebd51db181bad96b3aa08aec5b81858925dd676e3dfd04d679863aa2e +tanhf u10 3b715185ce7c39d70ff17dceb539380b8ac9c80303c9796e41d1ebda6f2b0ece +asinhf u10 1fb7d432a1af3a637e602c9170d73dea5da7e82b57623bfd3b37bbbce1cc9bb1 +acoshf u10 c01055933edfe7bcb45e5dea7377d2b2960ee61551a63270d9e7a28b76f3daad +atanhf u10 66540cd17454f09a95ef5adbeef6e9413ca31cb0446edc879447838f7b8c079c +lgammaf u10 b26a90f8b782f2a91132d5c12dcd56d749e301bf51e275085df2c4579639fa44 +tgammaf u10 2790e8800bd1a29f564fe35ef8463f90b8566968739026c6b04097bbfa536f57 +erff u10 81041541f31e72a7745d6fca4b208d4e332af8fc2366df5372b6cb38755369c9 +erfcf u15 e310f5ed2f0c0b32a84280832bffbefec65cc063483497861f3fb684d72f046d +fabsf 560d13e463bce4448d733798c5818b13e5634f893211047bc2fad9f4c613797d +copysignf 74f7af06376f4c79d7af9ce4e50bde2fb8f22b56d741bdc67624ef7d1989e76b +fmaxf 7474be750857fe400beb2bc14fd1b1113a2e365ae7b45b0acc508436b4c32a94 +fminf 4e22c453645f3c108c27e2c2fea65cd6a6b535f8236fd7382ff1082db3b31b5a +fdimf 16c17ff31778c7d63ec7f65c3b2a8561b79be62b1bc1b399ac0ffc43285b6cd0 +fmodf 9fa4ece68b16803e6c47fa5cb280f8c246a2ef5731b0609bec71e1db27906f0a +remainderf f32761a428b1336051ee773e470b74ada93a611cafbc08f6a9aff36957c84f64 +modff e976f223f2f4d380e9955392cb010920d5439665ae8eaf0fd6abbc889700a4f7 +nextafterf ac05b3fd824c3ce73eb3946c7e3dec94ce4b1ec4efd0237bcfb4578d3d422cc6 +truncf a014cd8206fb15c21b1cc773b951cf7f673e8be9e6e697ca0cf7293becb3d55c +floorf f17658ab95f73a1b4cfec0417e82b1b071cb97a3aac0110e289ba6449b2aeb55 +ceilf e173e35cc97a85629ceb5025bd5b1abad52e4e153166cbf9672396b4ca23b59a +roundf 3e67b087b019c806d87593850bf0cd106718cf34f50684784296ef040f301fc0 +rintf 827acf1e7d253c4fe9fdd4c5c9e53f35c80af5550ff6cbbb7aaac67577630c70 +fastsinf u3500 dbf93ee799553cfb9abf84aaccc458e26113d7d78c4f634db4469bd0d9dd0e19 +fastcosf u3500 55893f9b416b8876d022d7f960281efbb4f9241fdff0cbb059c2695d4c666d5b +fastpowf u3500 30b1aaff8eaad36907f99fd027a34bc06f39ffae218deeae10e399f133e72f8e diff --git 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iut.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iut.c index 079bbc09d85..b25a57af01a 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iut.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iut.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutcuda.cu b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutcuda.cu index fe58ba29997..f2b2b0148b7 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutcuda.cu +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutcuda.cu @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -220,113 +220,113 @@ __global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0) // -#define func_d_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - *a0 = u2d(u); \ +#define func_d_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + *a0 = u2d(u); \ funcName<<<1, 1>>>(r, a0); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 "\n", d2u(*r)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 "\n", d2u(*r)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d2_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - *a0 = u2d(u); \ - funcName<<<1, 1>>>(r2, a0); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ +#define func_d2_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + *a0 = u2d(u); \ + funcName<<<1, 1>>>(r2, a0); \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_d_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u, v; \ - sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ - *a0 = u2d(u); \ - *a1 = u2d(v); \ - funcName<<<1, 1>>>(r, a0, a1); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 "\n", d2u(*r)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ +#define func_d_d_d(funcStr, funcName) { \ + 
while (startsWith(buf, funcStr " ")) { \ + uint64_t u, v; \ + sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ + *a0 = u2d(u); \ + *a1 = u2d(v); \ + funcName<<<1, 1>>>(r, a0, a1); \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 "\n", d2u(*r)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_d_i(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u, v; \ - sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ - *a0 = u2d(u); \ +#define func_d_d_i(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u, v; \ + sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ + *a0 = u2d(u); \ *i0 = (int)u2d(v); \ - funcName<<<1, 1>>>(r, a0, i0); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 "\n", d2u(*r)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + funcName<<<1, 1>>>(r, a0, i0); \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 "\n", d2u(*r)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - *a0 = u2d(u); \ - funcName<<<1, 1>>>(i0, a0); \ - cudaDeviceSynchronize(); \ - printf("%d\n", *i0); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ +#define func_i_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + *a0 = u2d(u); \ + funcName<<<1, 1>>>(i0, a0); \ + cudaDeviceSynchronize(); \ + printf("%d\n", *i0); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } // -#define func_f_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u; \ - sscanf(buf, funcStr " %x", &u); \ - *b0 = u2f(u); \ +#define func_f_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint32_t u; \ + 
sscanf(buf, funcStr " %x", &u); \ + *b0 = u2f(u); \ funcName<<<1, 1>>>(s, b0); \ - cudaDeviceSynchronize(); \ - printf("%x\n", f2u(*s)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + cudaDeviceSynchronize(); \ + printf("%x\n", f2u(*s)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_f2_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u; \ - sscanf(buf, funcStr " %x", &u); \ - *b0 = u2f(u); \ - funcName<<<1, 1>>>(s2, b0); \ - cudaDeviceSynchronize(); \ +#define func_f2_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint32_t u; \ + sscanf(buf, funcStr " %x", &u); \ + *b0 = u2f(u); \ + funcName<<<1, 1>>>(s2, b0); \ + cudaDeviceSynchronize(); \ printf("%x %x\n", f2u(s2->x), f2u(s2->y)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_f_f_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u, v; \ - sscanf(buf, funcStr " %x %x", &u, &v); \ - *b0 = u2f(u); \ - *b1 = u2f(v); \ - funcName<<<1, 1>>>(s, b0, b1); \ - cudaDeviceSynchronize(); \ - printf("%x\n", f2u(*s)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ +#define func_f_f_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint32_t u, v; \ + sscanf(buf, funcStr " %x %x", &u, &v); \ + *b0 = u2f(u); \ + *b1 = u2f(v); \ + funcName<<<1, 1>>>(s, b0, b1); \ + cudaDeviceSynchronize(); \ + printf("%x\n", f2u(*s)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } // diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutsimd.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutsimd.c index dd8ee96ba36..0414a4101e3 100644 --- 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutsimd.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/iutsimd.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -474,172 +474,172 @@ static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } // -#define func_d_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - double s[VECTLENDP]; \ - memrand(s, sizeof(s)); \ +#define func_d_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + double s[VECTLENDP]; \ + memrand(s, sizeof(s)); \ int idx = xrand() & (VECTLENDP-1); \ - s[idx] = u2d(u); \ - vdouble a = vloadu_vd_p(s); \ - a = funcName(a); \ - vstoreu_v_p_vd(s, a); \ - u = d2u(s[idx]); \ - printf("%" PRIx64 "\n", u); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + s[idx] = u2d(u); \ + vdouble a = vloadu_vd_p(s); \ + a = funcName(a); \ + vstoreu_v_p_vd(s, a); \ + u = d2u(s[idx]); \ + printf("%" PRIx64 "\n", u); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d2_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ +#define func_d2_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ double s[VECTLENDP], t[VECTLENDP]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENDP-1); \ - s[idx] = u2d(u); \ - vdouble2 v; \ - vdouble a = vloadu_vd_p(s); \ - v = funcName(a); \ - 
vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \ - vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \ - Sleef_double2 d2; \ - d2.x = s[idx]; \ - d2.y = t[idx]; \ + s[idx] = u2d(u); \ + vdouble2 v; \ + vdouble a = vloadu_vd_p(s); \ + v = funcName(a); \ + vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \ + vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \ + Sleef_double2 d2; \ + d2.x = s[idx]; \ + d2.y = t[idx]; \ printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_d_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u, v; \ - sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ +#define func_d_d_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u, v; \ + sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ double s[VECTLENDP], t[VECTLENDP]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENDP-1); \ - s[idx] = u2d(u); \ - t[idx] = u2d(v); \ - vdouble a, b; \ - a = vloadu_vd_p(s); \ - b = vloadu_vd_p(t); \ - a = funcName(a, b); \ - vstoreu_v_p_vd(s, a); \ - u = d2u(s[idx]); \ - printf("%" PRIx64 "\n", u); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + s[idx] = u2d(u); \ + t[idx] = u2d(v); \ + vdouble a, b; \ + a = vloadu_vd_p(s); \ + b = vloadu_vd_p(t); \ + a = funcName(a, b); \ + vstoreu_v_p_vd(s, a); \ + u = d2u(s[idx]); \ + printf("%" PRIx64 "\n", u); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_d_i(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u, v; \ - sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ - double s[VECTLENDP]; \ - int t[VECTLENDP*2]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ +#define func_d_d_i(funcStr, funcName) { \ + 
while (startsWith(buf, funcStr " ")) { \ + uint64_t u, v; \ + sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \ + double s[VECTLENDP]; \ + int t[VECTLENDP*2]; \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENDP-1); \ - s[idx] = u2d(u); \ - t[idx] = (int)u2d(v); \ - vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \ - u = d2u(s[idx]); \ - printf("%" PRIx64 "\n", u); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + s[idx] = u2d(u); \ + t[idx] = (int)u2d(v); \ + vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \ + u = d2u(s[idx]); \ + printf("%" PRIx64 "\n", u); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint64_t u; \ - int i; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - double s[VECTLENDP]; \ - int t[VECTLENDP*2]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ +#define func_i_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint64_t u; \ + int i; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + double s[VECTLENDP]; \ + int t[VECTLENDP*2]; \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENDP-1); \ - s[idx] = u2d(u); \ - vdouble a = vloadu_vd_p(s); \ - vint vi = funcName(a); \ - vstoreu_v_p_vi(t, vi); \ - i = t[idx]; \ + s[idx] = u2d(u); \ + vdouble a = vloadu_vd_p(s); \ + vint vi = funcName(a); \ + vstoreu_v_p_vi(t, vi); \ + i = t[idx]; \ printf("%d\n", i); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } // -#define func_f_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u; \ - sscanf(buf, funcStr " %x", &u); \ - float s[VECTLENSP]; \ - memrand(s, sizeof(s)); \ +#define func_f_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + 
uint32_t u; \ + sscanf(buf, funcStr " %x", &u); \ + float s[VECTLENSP]; \ + memrand(s, sizeof(s)); \ int idx = xrand() & (VECTLENSP-1); \ - s[idx] = u2f(u); \ + s[idx] = u2f(u); \ vfloat a = vloadu_vf_p(s); \ - a = funcName(a); \ - vstoreu_v_p_vf(s, a); \ - u = f2u(s[idx]); \ + a = funcName(a); \ + vstoreu_v_p_vf(s, a); \ + u = f2u(s[idx]); \ printf("%x\n", u); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_f2_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u; \ - sscanf(buf, funcStr " %x", &u); \ - float s[VECTLENSP], t[VECTLENSP]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ +#define func_f2_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + uint32_t u; \ + sscanf(buf, funcStr " %x", &u); \ + float s[VECTLENSP], t[VECTLENSP]; \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENSP-1); \ - s[idx] = u2f(u); \ + s[idx] = u2f(u); \ vfloat2 v; \ vfloat a = vloadu_vf_p(s); \ - v = funcName(a); \ - vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \ - vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \ - Sleef_float2 d2; \ - d2.x = s[idx]; \ - d2.y = t[idx]; \ - printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + v = funcName(a); \ + vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \ + vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \ + Sleef_float2 d2; \ + d2.x = s[idx]; \ + d2.y = t[idx]; \ + printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_f_f_f(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - uint32_t u, v; \ - sscanf(buf, funcStr " %x %x", &u, &v); \ - float s[VECTLENSP], t[VECTLENSP]; \ - memrand(s, sizeof(s)); \ - memrand(t, sizeof(t)); \ +#define func_f_f_f(funcStr, funcName) { \ + while (startsWith(buf, funcStr 
" ")) { \ + uint32_t u, v; \ + sscanf(buf, funcStr " %x %x", &u, &v); \ + float s[VECTLENSP], t[VECTLENSP]; \ + memrand(s, sizeof(s)); \ + memrand(t, sizeof(t)); \ int idx = xrand() & (VECTLENSP-1); \ - s[idx] = u2f(u); \ - t[idx] = u2f(v); \ - vfloat a, b; \ - a = vloadu_vf_p(s); \ - b = vloadu_vf_p(t); \ - a = funcName(a, b); \ - vstoreu_v_p_vf(s, a); \ - u = f2u(s[idx]); \ + s[idx] = u2f(u); \ + t[idx] = u2f(v); \ + vfloat a, b; \ + a = vloadu_vf_p(s); \ + b = vloadu_vf_p(t); \ + a = funcName(a, b); \ + vstoreu_v_p_vf(s, a); \ + u = f2u(s[idx]); \ printf("%x\n", u); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } // diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester.c index b82400329ae..75256124a78 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -89,37 +89,37 @@ void startChild(const char *path, char *const argv[]) { // -#define child_d_d(funcStr, arg) do { \ - char str[256]; \ - uint64_t u; \ - sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ - write(ptoc[1], str, strlen(str)); \ +#define child_d_d(funcStr, arg) do { \ + char str[256]; \ + uint64_t u; \ + sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ sscanf(str, "%" PRIx64, &u); \ - return u2d(u); \ + return u2d(u); \ } while(0) -#define child_d2_d(funcStr, arg) do { \ - char str[256]; \ - uint64_t u, v; \ - sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ - write(ptoc[1], str, strlen(str)); \ +#define child_d2_d(funcStr, arg) do { \ + char str[256]; \ + uint64_t u, v; \ + sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ sscanf(str, "%" PRIx64 " %" PRIx64, &u, &v); \ - Sleef_double2 ret; \ - ret.x = u2d(u); \ - ret.y = u2d(v); \ - return ret; \ + Sleef_double2 ret; \ + ret.x = u2d(u); \ + ret.y = u2d(v); \ + return ret; \ } while(0) -#define child_d_d_d(funcStr, arg1, arg2) do { \ - char str[256]; \ - uint64_t u; \ +#define child_d_d_d(funcStr, arg1, arg2) do { \ + char str[256]; \ + uint64_t u; \ sprintf(str, funcStr " %" PRIx64 " %" PRIx64 "\n", d2u(arg1), d2u(arg2)); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ sscanf(str, "%" PRIx64, &u); \ - return u2d(u); \ + return u2d(u); \ } while(0) double child_sin(double x) { child_d_d("sin", x); } @@ -224,37 +224,37 @@ int child_ilogb(double x) { // -#define child_f_f(funcStr, arg) do { \ - char str[256]; \ - uint32_t u; \ - sprintf(str, funcStr " %x\n", f2u(arg)); \ - write(ptoc[1], str, strlen(str)); \ 
+#define child_f_f(funcStr, arg) do { \ + char str[256]; \ + uint32_t u; \ + sprintf(str, funcStr " %x\n", f2u(arg)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%x", &u); \ - return u2f(u); \ + sscanf(str, "%x", &u); \ + return u2f(u); \ } while(0) -#define child_f2_f(funcStr, arg) do { \ - char str[256]; \ - uint32_t u, v; \ - sprintf(str, funcStr " %x\n", f2u(arg)); \ - write(ptoc[1], str, strlen(str)); \ +#define child_f2_f(funcStr, arg) do { \ + char str[256]; \ + uint32_t u, v; \ + sprintf(str, funcStr " %x\n", f2u(arg)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%x %x", &u, &v); \ - Sleef_float2 ret; \ - ret.x = u2f(u); \ - ret.y = u2f(v); \ - return ret; \ + sscanf(str, "%x %x", &u, &v); \ + Sleef_float2 ret; \ + ret.x = u2f(u); \ + ret.y = u2f(v); \ + return ret; \ } while(0) -#define child_f_f_f(funcStr, arg1, arg2) do { \ - char str[256]; \ - uint32_t u; \ - sprintf(str, funcStr " %x %x\n", f2u(arg1), f2u(arg2)); \ - write(ptoc[1], str, strlen(str)); \ +#define child_f_f_f(funcStr, arg1, arg2) do { \ + char str[256]; \ + uint32_t u; \ + sprintf(str, funcStr " %x %x\n", f2u(arg1), f2u(arg2)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%x", &u); \ - return u2f(u); \ + sscanf(str, "%x", &u); \ + return u2f(u); \ } while(0) float child_sinf(float x) { child_f_f("sinf", x); } @@ -1142,62 +1142,62 @@ void do_test() { // -#define cmpDenorm_f(mpfrFunc, childFunc, argx) do { \ +#define cmpDenorm_f(mpfrFunc, childFunc, argx) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \ - fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ + mpfrFunc(frc, frx, GMP_RNDN); \ + if (!cmpDenormsp(childFunc((float)flushToZero(argx)), 
frc)) { \ + fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ (float)flushToZero(argx), childFunc((float)flushToZero(argx)), flushToZero(mpfr_get_d(frc, GMP_RNDN))); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormNR_f(mpfrFunc, childFunc, argx) do { \ +#define cmpDenormNR_f(mpfrFunc, childFunc, argx) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx); \ - if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \ - fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ + mpfrFunc(frc, frx); \ + if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \ + fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ (float)flushToZero(argx), childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_f_f(mpfrFunc, childFunc, argx, argy) do { \ +#define cmpDenorm_f_f(mpfrFunc, childFunc, argx, argy) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ mpfr_set_d(fry, (float)flushToZero(argy), GMP_RNDN); \ - mpfrFunc(frc, frx, fry, GMP_RNDN); \ + mpfrFunc(frc, frx, fry, GMP_RNDN); \ if (!cmpDenormsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc)) { \ fprintf(stderr, "arg = %.20g, %.20g, test = %.20g, correct = %.20g\n", \ (float)flushToZero(argx), (float)flushToZero(argy), childFunc((float)flushToZero(argx), (float)flushToZero(argy)), mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormX_f(mpfrFunc, childFunc, argx) do { \ +#define cmpDenormX_f(mpfrFunc, childFunc, argx) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ - if (!cmpDenormsp(d2.x, frc)) { \ - fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ + mpfrFunc(frc, 
frx, GMP_RNDN); \ + Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ + if (!cmpDenormsp(d2.x, frc)) { \ + fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ (float)flushToZero(argx), d2.x, mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormY_f(mpfrFunc, childFunc, argx) do { \ +#define cmpDenormY_f(mpfrFunc, childFunc, argx) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ - if (!cmpDenormsp(d2.y, frc)) { \ - fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ + if (!cmpDenormsp(d2.y, frc)) { \ + fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \ (float)flushToZero(argx), d2.y, mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) // @@ -2157,57 +2157,57 @@ void do_test() { // -#define cmpDenorm_d(mpfrFunc, childFunc, argx) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - if (!cmpDenormdp(childFunc(argx), frc)) { \ +#define cmpDenorm_d(mpfrFunc, childFunc, argx) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + if (!cmpDenormdp(childFunc(argx), frc)) { \ fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormNR_d(mpfrFunc, childFunc, argx) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx); \ - if (!cmpDenormdp(childFunc(argx), frc)) { \ +#define cmpDenormNR_d(mpfrFunc, childFunc, argx) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx); \ + if (!cmpDenormdp(childFunc(argx), frc)) { \ fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", 
argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_d_d(mpfrFunc, childFunc, argx, argy) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfr_set_d(fry, argy, GMP_RNDN); \ +#define cmpDenorm_d_d(mpfrFunc, childFunc, argx, argy) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfr_set_d(fry, argy, GMP_RNDN); \ mpfrFunc(frc, frx, fry, GMP_RNDN); \ - if (!cmpDenormdp(childFunc(argx, argy), frc)) { \ + if (!cmpDenormdp(childFunc(argx, argy), frc)) { \ fprintf(stderr, "arg = %.20g, %.20g, test = %.20g, correct = %.20g\n", argx, argy, childFunc(argx, argy), mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormX_d(mpfrFunc, childFunc, argx) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_double2 d2 = childFunc(argx); \ - if (!cmpDenormdp(d2.x, frc)) { \ +#define cmpDenormX_d(mpfrFunc, childFunc, argx) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_double2 d2 = childFunc(argx); \ + if (!cmpDenormdp(d2.x, frc)) { \ fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, d2.x, mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormY_d(mpfrFunc, childFunc, argx) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_double2 d2 = childFunc(argx); \ - if (!cmpDenormdp(d2.y, frc)) { \ +#define cmpDenormY_d(mpfrFunc, childFunc, argx) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_double2 d2 = childFunc(argx); \ + if (!cmpDenormdp(d2.y, frc)) { \ fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, d2.y, mpfr_get_d(frc, GMP_RNDN)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) // @@ -3435,58 +3435,58 @@ void do_test() { // -#define 
checkAccuracy_d(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - if (countULPdp(childFunc(argx), frc) > bound) { \ +#define checkAccuracy_d(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + if (countULPdp(childFunc(argx), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) #define checkAccuracyNR_d(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx); \ - if (countULPdp(childFunc(argx), frc) > bound) { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx); \ + if (countULPdp(childFunc(argx), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_d_d(mpfrFunc, childFunc, argx, argy, bound) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfr_set_d(fry, argy, GMP_RNDN); \ - mpfrFunc(frc, frx, fry, GMP_RNDN); \ - if (countULPdp(childFunc(argx, argy), frc) > bound) { \ +#define checkAccuracy_d_d(mpfrFunc, childFunc, argx, argy, bound) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfr_set_d(fry, argy, GMP_RNDN); \ + mpfrFunc(frc, frx, fry, GMP_RNDN); \ + if (countULPdp(childFunc(argx, argy), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \ argx, argy, childFunc(argx, argy), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx, argy), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracyX_d(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_d(frx, argx, 
GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_double2 d2 = childFunc(argx); \ +#define checkAccuracyX_d(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_double2 d2 = childFunc(argx); \ if (countULPdp(d2.x, frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, d2.x, mpfr_get_d(frc, GMP_RNDN), countULPdp(d2.x, frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracyY_d(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_d(frx, argx, GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_double2 d2 = childFunc(argx); \ +#define checkAccuracyY_d(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_d(frx, argx, GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_double2 d2 = childFunc(argx); \ if (countULPdp(d2.y, frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, d2.y, mpfr_get_d(frc, GMP_RNDN), countULPdp(d2.y, frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) // @@ -3903,6 +3903,8 @@ void do_test() { fprintf(stderr, "exp : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0); for(d = -1000;d < 1000 && success;d += 1.1) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0); + // Test for early or late overflow, e.g before or after x = LOG_DBL_MAX + for(d = LOG_DBL_MAX - 0.0001;(d < LOG_DBL_MAX + 0.0001) && success;d += 0.00001) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0); showResult(success); // @@ -3914,6 +3916,8 @@ void do_test() { } } for(y = -1000;y < 1000 && success;y += 0.1) checkAccuracy_d_d(mpfr_pow, child_pow, 2.1, y, 1.0); + // Test for early or late overflow (test limited to x = e) + for(d = LOG_DBL_MAX - 0.0001;(d < LOG_DBL_MAX + 0.0001) && success;d += 0.00001) checkAccuracy_d_d(mpfr_pow, child_pow, exp(1.0), d, 1.0); showResult(success); // @@ -4141,6 +4145,7 
@@ void do_test() { fprintf(stderr, "log1p : "); for(d = 0.0001;d < 10 && success;d += 0.001) checkAccuracy_d(mpfr_log1p, child_log1p, d, 1.0); + for(d = 1.0e+307;d < DBL_MAX && success;d += 1.0e+306) checkAccuracy_d(mpfr_log1p, child_log1p, d, 1.0); showResult(success); // @@ -4222,73 +4227,73 @@ void do_test() { // -#define checkAccuracy_f(mpfrFunc, childFunc, argx, bound) do { \ +#define checkAccuracy_f(mpfrFunc, childFunc, argx, bound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \ + mpfrFunc(frc, frx, GMP_RNDN); \ + if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \ (float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) #define checkAccuracyNR_f(mpfrFunc, childFunc, argx, bound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx); \ - if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \ + mpfrFunc(frc, frx); \ + if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \ (float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_f_f(mpfrFunc, childFunc, argx, argy, bound) do { \ +#define checkAccuracy_f_f(mpfrFunc, childFunc, argx, argy, bound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ mpfr_set_d(fry, (float)flushToZero(argy), GMP_RNDN); \ - mpfrFunc(frc, frx, fry, GMP_RNDN); \ - if 
(countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc) > bound) { \ + mpfrFunc(frc, frx, fry, GMP_RNDN); \ + if (countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc) > bound) { \ fprintf(stderr, "\narg = %.20g, %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \ (float)flushToZero(argx), (float)flushToZero(argy), childFunc((float)flushToZero(argx), (float)flushToZero(argy)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracyX_f(mpfrFunc, childFunc, argx, bound) do { \ +#define checkAccuracyX_f(mpfrFunc, childFunc, argx, bound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ if (countULPsp(d2.x, frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", (float)flushToZero(argx), (double)d2.x, mpfr_get_d(frc, GMP_RNDN), countULPsp(d2.x, frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracyY_f(mpfrFunc, childFunc, argx, bound) do { \ +#define checkAccuracyY_f(mpfrFunc, childFunc, argx, bound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \ if (countULPsp(d2.y, frc) > bound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", (float)flushToZero(argx), (double)d2.y, mpfr_get_d(frc, GMP_RNDN), countULPsp(d2.y, frc)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy2_f(mpfrFunc, childFunc, argx, 
bound, abound) do { \ +#define checkAccuracy2_f(mpfrFunc, childFunc, argx, bound, abound) do { \ mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ - mpfrFunc(frc, frx, GMP_RNDN); \ - double t = childFunc((float)flushToZero(argx)); \ - double ae = fabs(mpfr_get_d(frc, GMP_RNDN) - t); \ - if (countULPsp(t, frc) > bound && ae > abound) { \ + mpfrFunc(frc, frx, GMP_RNDN); \ + double t = childFunc((float)flushToZero(argx)); \ + double ae = fabs(mpfr_get_d(frc, GMP_RNDN) - t); \ + if (countULPsp(t, frc) > bound && ae > abound) { \ fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf, abserror = %g\n", \ (float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc), ae); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) // @@ -4825,6 +4830,8 @@ void do_test() { fprintf(stderr, "atanf : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_f(mpfr_atan, child_atanf, d, 3.5); for(d = -10000;d < 10000 && success;d += 2.1) checkAccuracy_f(mpfr_atan, child_atanf, d, 3.5); + checkAccuracy_f(mpfr_atan, child_atanf, +INFINITY, 3.5); + checkAccuracy_f(mpfr_atan, child_atanf, -INFINITY, 3.5); showResult(success); // @@ -5012,6 +5019,7 @@ void do_test() { fprintf(stderr, "log1pf : "); for(d = 0.0001;d < 10 && success;d += 0.001) checkAccuracy_f(mpfr_log1p, child_log1pf, d, 1.0); + for(d = 1.0e+38;d < FLT_MAX && success;d += 1.0e+37) checkAccuracy_f(mpfr_log1p, child_log1pf, d, 1.0); showResult(success); // diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2dp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2dp.c index 035c6bc14a7..1e6d740b4a8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2dp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2dp.c @@ -1,4 +1,4 @@ -// Copyright Naoki 
Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -70,7 +70,7 @@ double rnd_fr() { #else c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); #endif - } while(!isnumber(c.d)); + } while(!xisnumber(c.d)); return c.d; } @@ -82,7 +82,7 @@ double rnd_zo() { #else c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); #endif - } while(!isnumber(c.d) || c.d < -1 || 1 < c.d); + } while(!xisnumber(c.d) || c.d < -1 || 1 < c.d); return c.d; } @@ -144,21 +144,21 @@ int main(int argc,char **argv) double u0 = countULP2dp(t = sc.x, frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2dp(t = sc2.x, frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2dp(t = xsinpi_u05(d), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -172,21 +172,21 @@ int main(int argc,char **argv) double u0 = countULP2dp(t = sc.y, frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, 
u0); fflush(stdout); ecnt++; } double u1 = countULP2dp(t = sc.y, frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2dp(t = xcospi_u05(d), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cospi_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -201,7 +201,7 @@ int main(int argc,char **argv) double u0 = countULPdp(t = xsin(d), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0); printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -209,7 +209,7 @@ int main(int argc,char **argv) double u1 = countULPdp(sc.x, frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1); printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -217,7 +217,7 @@ int main(int argc,char **argv) double u2 = countULPdp(t = xsin_u1(d), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2); printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -225,7 +225,7 @@ int main(int argc,char **argv) double u3 = countULPdp(t = sc2.x, frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincos_u1 sin 
arg=%.20g ulp=%.20g\n", d, u3); printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -238,28 +238,28 @@ int main(int argc,char **argv) double u0 = countULPdp(t = xcos(d), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULPdp(t = sc.y, frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULPdp(t = xcos_u1(d), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } double u3 = countULPdp(t = sc2.y, frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -851,7 +851,7 @@ int main(int argc,char **argv) double u0 = countULPdp(t = xfrfrexp(d), frx); - if (d != 0 && isnumber(d) && u0 != 0) { + if (d != 0 && xisnumber(d) && u0 != 0) { printf("Pure C frfrexp arg=%.20g ulp=%.20g\n", d, u0); printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -864,7 +864,7 @@ int main(int argc,char **argv) int texp = xexpfrexp(d); - if (d != 0 && isnumber(d) && cexp != texp) { + if (d != 0 && xisnumber(d) && cexp != texp) { printf("Pure C expfrexp arg=%.20g\n", d); printf("correct = %d, test = %d\n", cexp, texp); fflush(stdout); ecnt++; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2ld.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2ld.c index dd75119c810..4f065137ef7 
100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2ld.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2ld.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2qp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2qp.c index 2065e2eb74a..1224aeacb3a 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2qp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2qp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -24,7 +24,7 @@ #include "sleef.h" -#include "f128util.h" +#include "qtesterutil.h" #define DORENAME #include "rename.h" @@ -165,7 +165,7 @@ int main(int argc,char **argv) mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL); conv_t cd; - Sleef_quad d, t, d2, zo; + Sleef_quad d, t; //, d2, zo; int cnt, ecnt = 0; @@ -178,26 +178,26 @@ int main(int argc,char **argv) printf("%g\n", countULP2(cd.d, frx)); #endif - const Sleef_quad rangemax = 1e+9; + //const Sleef_quad rangemax = 1e+9; for(cnt = 0;ecnt < 1000;cnt++) { switch(cnt & 7) { case 0: d = rnd(); - d2 = rnd(); - zo = rnd(); + //d2 = rnd(); + //zo = rnd(); break; case 1: cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+10) * M_PI_4; cd.u128 += (random() & 0xff) - 0x7f; d = cd.d; - d2 = rnd(); - zo = rnd(); + //d2 = rnd(); + //zo = rnd(); break; default: d = rnd_fr(); - d2 = rnd_fr(); - zo = rnd_zo(); + //d2 = rnd_fr(); + //zo = rnd_zo(); break; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simddp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simddp.c index 1ceffcb6fcf..1d8884d566d 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simddp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simddp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -293,7 +293,7 @@ double rnd_fr() { #else c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); #endif - } while(!isnumber(c.d)); + } while(!xisnumber(c.d)); return c.d; } @@ -305,7 +305,7 @@ double rnd_zo() { #else c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); #endif - } while(!isnumber(c.d) || c.d < -1 || 1 < c.d); + } while(!xisnumber(c.d) || c.d < -1 || 1 < c.d); return c.d; } @@ -427,21 +427,21 @@ int main(int argc,char **argv) double u0 = countULP2dp(t = vget(vd2getx_vd_vd2(sc), e), frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2dp(t = vget(vd2getx_vd_vd2(sc2), e), frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2dp(t = vget(xsinpi_u05(vd), e), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -454,21 +454,21 @@ int main(int argc,char **argv) double u0 = countULP2dp(t = vget(vd2gety_vd_vd2(sc), e), frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2dp(t = 
vget(vd2gety_vd_vd2(sc), e), frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2dp(t = vget(xcospi_u05(vd), e), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cospi_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -483,28 +483,28 @@ int main(int argc,char **argv) double u0 = countULPdp(t = vget(xsin(vd), e), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sin arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULPdp(t = vget(vd2getx_vd_vd2(sc), e), frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincos sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULPdp(t = vget(xsin_u1(vd), e), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sin_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } double u3 = countULPdp(t = vget(vd2getx_vd_vd2(sc2), e), frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -516,28 +516,28 @@ int main(int argc,char **argv) double u0 = countULPdp(t = vget(xcos(vd), e), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cos 
arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULPdp(t = vget(vd2gety_vd_vd2(sc), e), frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincos cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULPdp(t = vget(xcos_u1(vd), e), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cos_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } double u3 = countULPdp(t = vget(vd2gety_vd_vd2(sc2), e), frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -1159,7 +1159,7 @@ int main(int argc,char **argv) double u0 = countULPdp(t = vget(xfrfrexp(vd), e), frx); - if (d != 0 && isnumber(d) && u0 != 0) { + if (d != 0 && xisnumber(d) && u0 != 0) { printf(ISANAME " frfrexp arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } @@ -1200,7 +1200,7 @@ int main(int argc,char **argv) int texp = vgeti(xexpfrexp(vd), e); - if (isnumber(d) && cexp != texp) { + if (xisnumber(d) && cexp != texp) { printf(ISANAME " expfrexp arg=%.20g\n", d); fflush(stdout); ecnt++; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simdsp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simdsp.c index 930cadfb43d..bad7bd9e9a8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simdsp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2simdsp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -298,7 +298,7 @@ float rnd_fr() { #else c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); #endif - } while(!isnumber(c.f)); + } while(!xisnumber(c.f)); return c.f; } @@ -310,7 +310,7 @@ float rnd_zo() { #else c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); #endif - } while(!isnumber(c.f) || c.f < -1 || 1 < c.f); + } while(!xisnumber(c.f) || c.f < -1 || 1 < c.f); return c.f; } @@ -397,21 +397,21 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = vget(vf2getx_vf_vf2(sc), e), frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospif_u05 sin arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2sp(t = vget(vf2getx_vf_vf2(sc2), e), frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospif_u35 sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2sp(t = vget(xsinpif_u05(vd), e), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sinpif_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -425,21 +425,21 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = vget(vf2gety_vf_vf2(sc), e), frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospif_u05 cos arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2sp(t = vget(vf2gety_vf_vf2(sc), e), frx); - if 
(u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincospif_u35 cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2sp(t = vget(xcospif_u05(vd), e), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cospif_u05 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } @@ -454,28 +454,28 @@ int main(int argc,char **argv) float u0 = countULPsp(t = vget(xsinf(vd), e), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sinf arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } float u1 = countULPsp(t = vget(vf2getx_vf_vf2(sc), e), frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincosf sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } float u2 = countULPsp(t = vget(xsinf_u1(vd), e), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sinf_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } float u3 = countULPsp(t = vget(vf2getx_vf_vf2(sc2), e), frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -495,28 +495,28 @@ int main(int argc,char **argv) float u0 = countULPsp(t = vget(xcosf(vd), e), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cosf arg=%.20g ulp=%.20g\n", d, u0); 
fflush(stdout); ecnt++; } float u1 = countULPsp(t = vget(vf2gety_vf_vf2(sc), e), frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincosf cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } float u2 = countULPsp(t = vget(xcosf_u1(vd), e), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " cosf_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } float u3 = countULPsp(t = vget(vf2gety_vf_vf2(sc2), e), frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf(ISANAME " sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -688,10 +688,10 @@ int main(int argc,char **argv) fflush(stdout); ecnt++; } - if (isnumber(d) && isnumber(d2)) { + if (xisnumber(d) && xisnumber(d2)) { double u1 = countULPsp(t = vget(xfastpowf_u3500(vd2, vd), e), frx); - if (isnumber((float)mpfr_get_d(frx, GMP_RNDN)) && u1 > 350) { + if (xisnumber((float)mpfr_get_d(frx, GMP_RNDN)) && u1 > 350) { printf(ISANAME " fastpowf_u3500 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -1095,7 +1095,7 @@ int main(int argc,char **argv) double u0 = countULPsp(t = vget(xfrfrexpf(vd), e), frx); - if (d != 0 && isnumber(d) && u0 != 0) { + if (d != 0 && xisnumber(d) && u0 != 0) { printf(ISANAME " frfrexpf arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } @@ -1108,7 +1108,7 @@ int main(int argc,char **argv) int texp = xexpfrexpf(d); - if (d != 0 && isnumber(d) && cexp != texp) { + if (d != 0 && xisnumber(d) && cexp != texp) { printf(ISANAME " expfrexpf arg=%.20g\n", d); fflush(stdout); ecnt++; } @@ -1278,7 +1278,7 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = vget(xtgammaf_u1(vd), e), frx); double c = 
mpfr_get_d(frx, GMP_RNDN); - if (isnumber(c) || isnumber(t)) { + if (xisnumber(c) || xisnumber(t)) { if (u0 > 1.0) { printf(ISANAME " xtgammaf_u1 arg=%.20g ulp=%.20g\n", d, u0); printf("correct = %.20g, test = %.20g\n", (float)mpfr_get_d(frx, GMP_RNDN), t); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2sp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2sp.c index b5af3965168..a6507fbf21d 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2sp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester2sp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -84,7 +84,7 @@ float rnd_fr() { #else c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); #endif - } while(!isnumber(c.f)); + } while(!xisnumber(c.f)); return c.f; } @@ -96,7 +96,7 @@ float rnd_zo() { #else c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); #endif - } while(!isnumber(c.f) || c.f < -1 || 1 < c.f); + } while(!xisnumber(c.f) || c.f < -1 || 1 < c.f); return c.f; } @@ -158,21 +158,21 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = sc.x, frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospif_u05 sin arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2sp(t = sc2.x, frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospif_u35 sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); 
ecnt++; } double u2 = countULP2sp(t = xsinpif_u05(d), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sinpif_u05 arg=%.20g ulp=%.20g\n", d, u2); printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -187,21 +187,21 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = sc.y, frx); - if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.505) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospif_u05 cos arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } double u1 = countULP2sp(t = sc.y, frx); - if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 2.0) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincospif_u35 cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } double u2 = countULP2sp(t = xcospif_u05(d), frx); - if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cospif_u05 arg=%.20g ulp=%.20g\n", d, u2); printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t); fflush(stdout); ecnt++; @@ -217,28 +217,28 @@ int main(int argc,char **argv) float u0 = countULPsp(t = xsinf(d), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sinf arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } float u1 = countULPsp(t = sc.x, frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincosf sin arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } float u2 = 
countULPsp(t = xsinf_u1(d), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sinf_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } float u3 = countULPsp(t = sc2.x, frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -258,28 +258,28 @@ int main(int argc,char **argv) float u0 = countULPsp(t = xcosf(d), frx); - if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cosf arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } float u1 = countULPsp(t = sc.y, frx); - if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) { + if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincosf cos arg=%.20g ulp=%.20g\n", d, u1); fflush(stdout); ecnt++; } float u2 = countULPsp(t = xcosf_u1(d), frx); - if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C cosf_u1 arg=%.20g ulp=%.20g\n", d, u2); fflush(stdout); ecnt++; } float u3 = countULPsp(t = sc2.y, frx); - if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) { + if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !xisnumber(t))) { printf("Pure C sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3); fflush(stdout); ecnt++; } @@ -451,10 +451,10 @@ int main(int argc,char **argv) fflush(stdout); ecnt++; } - if (isnumber(d) && isnumber(d2)) { + if (xisnumber(d) && xisnumber(d2)) { double u1 = countULPsp(t = xfastpowf_u3500(d2, d), frx); - if (isnumber((float)mpfr_get_d(frx, GMP_RNDN)) && u1 > 350) { + if (xisnumber((float)mpfr_get_d(frx, GMP_RNDN)) && u1 > 350) { printf("Pure C fastpowf_u3500 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); fflush(stdout); ecnt++; } @@ -856,7 +856,7 @@ int main(int 
argc,char **argv) double u0 = countULPsp(t = xfrfrexpf(d), frx); - if (d != 0 && isnumber(d) && u0 != 0) { + if (d != 0 && xisnumber(d) && u0 != 0) { printf("Pure C frfrexpf arg=%.20g ulp=%.20g\n", d, u0); fflush(stdout); ecnt++; } @@ -868,7 +868,7 @@ int main(int argc,char **argv) int texp = xexpfrexpf(d); - if (d != 0 && isnumber(d) && cexp != texp) { + if (d != 0 && xisnumber(d) && cexp != texp) { printf("Pure C expfrexpf arg=%.20g\n", d); fflush(stdout); ecnt++; } @@ -1022,7 +1022,7 @@ int main(int argc,char **argv) double u0 = countULP2sp(t = xtgammaf_u1(d), frx); double c = mpfr_get_d(frx, GMP_RNDN); - if (isnumber(c) || isnumber(t)) { + if (xisnumber(c) || xisnumber(t)) { if (u0 > 1.0) { printf("Pure C xtgamma arg=%.20g ulp=%.20g\n", d, u0); printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester3.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester3.c index 9abeca4db5b..bca39fc417e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester3.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/tester3.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -11,7 +11,11 @@ #include #include +#ifndef SLEEF_USE_INTERNAL_SHA256 #include +#else +#include "psha2_capi.h" +#endif #include "sleef.h" #include "misc.h" @@ -157,190 +161,190 @@ static SPTYPE vf2gety_vf_vf2(TYPE2(SPTYPE) v) { return v.y; } // -#define initDigest \ - EVP_MD_CTX *ctx; ctx = EVP_MD_CTX_new(); \ - if (!ctx) { \ - fprintf(stderr, "Error creating context.\n"); \ - return 0; \ - } \ - if (!EVP_DigestInit_ex(ctx, EVP_md5(), NULL)) { \ +#define initDigest \ + EVP_MD_CTX *ctx; ctx = EVP_MD_CTX_new(); \ + if (!ctx) { \ + fprintf(stderr, "Error creating context.\n"); \ + return 0; \ + } \ + if (!EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)) { \ fprintf(stderr, "Error initializing context.\n"); \ - return 0; \ + return 0; \ } -#define checkDigest(NAME, ULP) do { \ - unsigned int md5_digest_len = EVP_MD_size(EVP_md5()); \ - unsigned char *md5_digest; \ - md5_digest = (unsigned char *)malloc(md5_digest_len); \ - if (!EVP_DigestFinal_ex(ctx, md5_digest, &md5_digest_len)) { \ - fprintf(stderr, "Error finalizing digest.\n"); \ - return 0; \ - } \ - EVP_MD_CTX_free(ctx); \ - unsigned char mes[64], buf[64]; \ - memset(mes, 0, 64); \ +#define checkDigest(NAME, ULP) do { \ + unsigned int sha256_digest_len = EVP_MD_size(EVP_sha256()); \ + unsigned char *sha256_digest; \ + sha256_digest = (unsigned char *)malloc(sha256_digest_len); \ + if (!EVP_DigestFinal_ex(ctx, sha256_digest, &sha256_digest_len)) { \ + fprintf(stderr, "Error finalizing digest.\n"); \ + return 0; \ + } \ + EVP_MD_CTX_free(ctx); \ + unsigned char mes[256], buf[256]; \ + memset(mes, 0, 256); \ sprintf((char *)mes, "%s ", #NAME " " #ULP); \ char tmp[3] = { 0 }; \ - for (int i = 0; i < md5_digest_len; i++) { \ - sprintf(tmp, "%02x", md5_digest[i]); \ - strcat((char *)mes, tmp); \ - } \ - free(md5_digest); \ - if (fp != NULL) { \ - fgets((char *)buf, 60, fp); \ + for (int i = 0; i < sha256_digest_len; i++) { \ + 
snprintf(tmp, sizeof(tmp), "%02x", sha256_digest[i]); \ + strncat((char *)mes, tmp, sizeof(mes)-1); \ + } \ + free(sha256_digest); \ + if (fp != NULL) { \ + fgets((char *)buf, 250, fp); \ if (strncmp((char *)mes, (char *)buf, strlen((char *)mes)) != 0) { \ - puts((char *)mes); \ - puts((char *)buf); \ - success = 0; \ - } \ - } else puts((char *)mes); \ + puts((char *)mes); \ + puts((char *)buf); \ + success = 0; \ + } \ + } else puts((char *)mes); \ } while(0) #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define convertEndianness(ptr, len) do { \ - for(int k=0;k +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "testerutil.h" + +using namespace std; + +// + +#if !defined(USE_INLINE_HEADER) +#include "sleef.h" +#else // #if !defined(USE_INLINE_HEADER) +#include +#include +#include +#include + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__riscv) && defined(__riscv_v) +#include +#endif + +#if defined(__VSX__) +#include +#endif + +#if defined(__VX__) +#include +#endif + +#define SLEEF_ALWAYS_INLINE inline +#define SLEEF_INLINE +#define SLEEF_CONST +#include USE_INLINE_HEADER +#include MACRO_ONLY_HEADER + +#ifndef ENABLE_PUREC_SCALAR +#include "sleefinline_purec_scalar.h" +#endif + +#endif // #if !defined(USE_INLINE_HEADER) + + +#ifdef ENABLE_SSE2 +#include "renamesse2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helpersse2.h" +typedef Sleef___m128d_2 vdouble2; +typedef Sleef___m128_2 vfloat2; +#endif 
+#endif + +#ifdef ENABLE_SSE4 +#include "renamesse4.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 4 +#include "helpersse2.h" +typedef Sleef___m128d_2 vdouble2; +typedef Sleef___m128_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_AVX +#include "renameavx.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx.h" +typedef Sleef___m256d_2 vdouble2; +typedef Sleef___m256_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_FMA4 +#include "renamefma4.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 4 +#include "helperavx.h" +typedef Sleef___m256d_2 vdouble2; +typedef Sleef___m256_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_AVX2 +#include "renameavx2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx2.h" +typedef Sleef___m256d_2 vdouble2; +typedef Sleef___m256_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_AVX2128 +#include "renameavx2128.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx2_128.h" +typedef Sleef___m128d_2 vdouble2; +typedef Sleef___m128_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_AVX512F +#include "renameavx512f.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx512f.h" +typedef Sleef___m512d_2 vdouble2; +typedef Sleef___m512_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_AVX512FNOFMA +#include "renameavx512fnofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperavx512f.h" +typedef Sleef___m512d_2 vdouble2; +typedef Sleef___m512_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VECEXT +#define CONFIG 1 +#include "helpervecext.h" +#include "norename.h" +#endif + +#ifdef ENABLE_PUREC +#define CONFIG 1 +#include "helperpurec.h" +#include "norename.h" +#endif + +#ifdef ENABLE_NEON32 +#include "renameneon32.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperneon32.h" +typedef Sleef_float32x4_t_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_NEON32VFPV4 +#include "renameneon32vfpv4.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 4 +#include 
"helperneon32.h" +typedef Sleef_float32x4_t_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_ADVSIMD +#include "renameadvsimd.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperadvsimd.h" +typedef Sleef_float64x2_t_2 vdouble2; +typedef Sleef_float32x4_t_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_ADVSIMDNOFMA +#include "renameadvsimdnofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperadvsimd.h" +typedef Sleef_float64x2_t_2 vdouble2; +typedef Sleef_float32x4_t_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_DSP128 +#define CONFIG 2 +#include "helpersse2.h" +#include "renamedsp128.h" +typedef Sleef___m128d_2 vdouble2; +typedef Sleef___m128_2 vfloat2; +#endif + +#ifdef ENABLE_SVE +#include "renamesve.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helpersve.h" +#endif +#endif + +#ifdef ENABLE_SVENOFMA +#include "renamesvenofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helpersve.h" +#endif +#endif + +#ifdef ENABLE_DSP256 +#define CONFIG 1 +#include "helperavx.h" +#include "renamedsp256.h" +typedef Sleef___m256d_2 vdouble2; +typedef Sleef___m256_2 vfloat2; +#endif + +#ifdef ENABLE_VSX +#include "renamevsx.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperpower_128.h" +#include "renamevsx.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VSXNOFMA +#include "renamevsxnofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperpower_128.h" +#include "renamevsxnofma.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VSX3 +#include "renamevsx3.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 3 +#include "helperpower_128.h" +#include "renamevsx3.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VSX3NOFMA +#include 
"renamevsx3nofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 4 +#include "helperpower_128.h" +#include "renamevsx3nofma.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VXE +#include "renamevxe.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 140 +#include "helpers390x_128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VXENOFMA +#include "renamevxenofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 141 +#include "helpers390x_128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VXE2 +#include "renamevxe2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 150 +#include "helpers390x_128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_VXE2NOFMA +#include "renamevxe2nofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 151 +#include "helpers390x_128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_DSPPOWER_128 +#define CONFIG 1 +#include "helperpower_128.h" +#include "renamedsp128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif + +#ifdef ENABLE_DSPS390X_128 +#define CONFIG 140 +#include "helpers390x_128.h" +#include "renamedsp128.h" +typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2; +typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2; +#endif + +#ifdef ENABLE_RVVM1 +#include "renamervvm1.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperrvv.h" +#endif +#endif + +#ifdef ENABLE_RVVM1NOFMA +#include "renamervvm1nofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperrvv.h" +#endif +#endif + +#ifdef ENABLE_RVVM2 +#include "renamervvm2.h" +#if !defined(USE_INLINE_HEADER) +#define 
CONFIG 1 +#include "helperrvv.h" +#endif +#endif + +#ifdef ENABLE_RVVM2NOFMA +#include "renamervvm2nofma.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperrvv.h" +#endif +#endif + +#ifdef ENABLE_PUREC_SCALAR +#include "renamepurec_scalar.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperpurec_scalar.h" +typedef Sleef_double_2 vdouble2; +typedef Sleef_float_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_PURECFMA_SCALAR +#include "renamepurecfma_scalar.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperpurec_scalar.h" +typedef Sleef_double_2 vdouble2; +typedef Sleef_float_2 vfloat2; +#endif +#endif + +#ifdef ENABLE_DSP_SCALAR +#include "renamedspscalar.h" +#define CONFIG 1 +#include "helperpurec_scalar.h" +typedef Sleef_double_2 vdouble2; +typedef Sleef_float_2 vfloat2; +#endif + +#ifdef USE_INLINE_HEADER +#ifdef vopmask +#undef vopmask +#endif + +#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix +#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix) +#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX) +#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX) +#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX) +#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX) +#define vfloat CONCAT_SIMD_SUFFIX(vfloat, SIMD_SUFFIX) +#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX) +#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX) +#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX) +#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX) +#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX) +#define vf2getx_vf_vf2 CONCAT_SIMD_SUFFIX(vf2getx_vf_vf2, SIMD_SUFFIX) +#define vf2gety_vf_vf2 CONCAT_SIMD_SUFFIX(vf2gety_vf_vf2, SIMD_SUFFIX) +#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX) +#define vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX) +#define vloadu_vf_p CONCAT_SIMD_SUFFIX(vloadu_vf_p, 
SIMD_SUFFIX) +#define vstoreu_v_p_vf CONCAT_SIMD_SUFFIX(vstoreu_v_p_vf, SIMD_SUFFIX) +#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX) +#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX) +#endif + +// + +#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) +static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; } +static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; } +#endif + +#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) +static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; } +static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } +#endif + +#ifndef SLEEF_DBL_DENORM_MIN +#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324 +#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F +#endif + +// + +extern "C" { + int check_feature(double d, float f) { +#ifdef ENABLE_DP + { + double s[VECTLENDP]; + int i; + for(i=0;i<(int)VECTLENDP;i++) { + s[i] = d; + } + vdouble a = vloadu_vd_p(s); + a = xpow(a, a); + vstoreu_v_p_vd(s, a); + if (s[0] == s[0]) return 1; + } +#endif +#ifdef ENABLE_SP + { + float s[VECTLENSP]; + int i; + for(i=0;i<(int)VECTLENSP;i++) { + s[i] = d; + } + vfloat a = vloadu_vf_p(s); + a = xpowf(a, a); + vstoreu_v_p_vf(s, a); + if (s[0] == s[0]) return 1; + } +#endif + return 0; + } +} + +// + +#if defined(ENABLE_DP) +template +static bool check_d_d(const char *msg, vdouble (*vfunc)(vdouble), T (*tlfunc)(const T), + const double *a0, size_t z, double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a0[i]), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0[i], 
a0[i], u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0[i])) << endl; + return false; + } + } + return true; +} + +template +static bool check_d_d(const char *msg, vdouble (*vfunc)(vdouble), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;start + i * step <= end;i++) { + double a0 = start + i * step; + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENDP-1); + s0[idx] = a0; + vdouble v0 = vloadu_vd_p(s0); + v0 = (*vfunc)(v0); + vstoreu_v_p_vd(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +template +static bool check_d_d_d(const char *msg, vdouble (*vfunc)(vdouble, vdouble), T (*tlfunc)(const T, const T), + const double *a0, size_t z0, const double *a1, size_t z1, + double tol, bool checkSignedZero) { + double s0[VECTLENDP], s1[VECTLENDP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a0[i], a1[j]), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg0 = %a (%g), arg1 = %a (%g), ulp = %g, t = %.16g, ", + msg, a0[i], a0[i], a1[j], a1[j], u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0[i], a1[j])) << endl; + return false; + } + } + } + return true; +} + +template +static bool check_d_d_d(const char *msg, vdouble (*vfunc)(vdouble, vdouble), T (*tlfunc)(const T, const T), + double startx, double endx, double stepx, double starty, double endy, double stepy, + double tol, bool checkSignedZero) { + double s0[VECTLENDP], s1[VECTLENDP]; + bool ret = true; + for(size_t i=0;startx + i * stepx <= endx;i++) { + double a0 = startx + i * stepx; + for(size_t j=0;starty + j * stepy <= endy;j++) { + double a1 = starty + j * stepy; + memrand(s0, 
sizeof(s0)); + memrand(s1, sizeof(s1)); + int idx = xrand() & (VECTLENDP-1); + s0[idx] = a0; + s1[idx] = a1; + vdouble v0 = vloadu_vd_p(s0); + vdouble v1 = vloadu_vd_p(s1); + v0 = (*vfunc)(v0, v1); + vstoreu_v_p_vd(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0, a1), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg0 = %a (%g), arg1 = %a (%g), ulp = %g, t = %.16g, ", + msg, a0, a0, a1, a1, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0, a1)) << endl; + ret = false; + } + } + } + return ret; +} + +template +static bool checkX_d_d(const char *msg, vdouble2 (*vfunc)(vdouble), T (*tlfunc)(const T), + const double *a0, size_t z, double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a0[i]), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0[i], a0[i], u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0[i])) << endl; + return false; + } + } + return true; +} + +template +static bool checkX_d_d(const char *msg, vdouble2 (*vfunc)(vdouble), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;start + i * step <= end;i++) { + double a0 = start + i * step; + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENDP-1); + s0[idx] = a0; + vdouble v0 = vloadu_vd_p(s0); + v0 = vd2getx_vd_vd2((*vfunc)(v0)); + vstoreu_v_p_vd(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +template +static bool checkY_d_d(const char *msg, vdouble2 (*vfunc)(vdouble), T (*tlfunc)(const T), + const double *a0, size_t z, 
double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a0[i]), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0[i], a0[i], u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0[i])) << endl; + return false; + } + } + return true; +} + +template +static bool checkY_d_d(const char *msg, vdouble2 (*vfunc)(vdouble), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero) { + double s0[VECTLENDP]; + for(size_t i=0;start + i * step <= end;i++) { + double a0 = start + i * step; + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENDP-1); + s0[idx] = a0; + vdouble v0 = vloadu_vd_p(s0); + v0 = vd2gety_vd_vd2((*vfunc)(v0)); + vstoreu_v_p_vd(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +static int32_t func_i_d(vint (*vfunc)(vdouble), double a) { + int idx = xrand() & (VECTLENDP-1); + double s0[VECTLENDP]; + memrand(s0, sizeof(s0)); + s0[idx] = a; + vdouble v0 = vloadu_vd_p(s0); + vint vi0 = (*vfunc)(v0); + int t0[VECTLENDP*2]; + vstoreu_v_p_vi(t0, vi0); + return t0[idx]; +} + +static double func_d_d_i(vdouble (*vfunc)(vdouble, vint), double a, int i) { + int idx = xrand() & (VECTLENDP-1); + double s0[VECTLENDP]; + memrand(s0, sizeof(s0)); + s0[idx] = a; + int t0[VECTLENDP*2]; + memrand(t0, sizeof(t0)); + t0[idx] = i; + vdouble v0 = vloadu_vd_p(s0); + vint vi0 = vloadu_vi_p(t0); + v0 = (*vfunc)(v0, vi0); + vstoreu_v_p_vd(s0, v0); + return s0[idx]; +} +#endif // #if defined(ENABLE_DP) + +template +static bool check_f_f(const char *msg, vfloat (*vfunc)(vfloat), T (*tlfunc)(const T), + const float *a0, size_t z, 
double tol, bool checkSignedZero) { + float s0[VECTLENSP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a), FLT_MANT_DIG, + enableFlushToZero ? FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a, a, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a)) << endl; + return false; + } + } + return true; +} + +template +static bool check_f_f(const char *msg, vfloat (*vfunc)(vfloat), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero, double abound = 0.0) { + float s0[VECTLENSP]; + for(size_t i=0;start + i * step <= end;i++) { + float a0 = flushToZero(start + i * step); + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENSP-1); + s0[idx] = a0; + vfloat v0 = vloadu_vf_p(s0); + v0 = (*vfunc)(v0); + vstoreu_v_p_vf(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), FLT_MANT_DIG, + enableFlushToZero ? FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero, abound); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +template +static bool checkX_f_f(const char *msg, vfloat2 (*vfunc)(vfloat), T (*tlfunc)(const T), + const float *a0, size_t z, double tol, bool checkSignedZero) { + float s0[VECTLENSP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a), FLT_MANT_DIG, + enableFlushToZero ? 
FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a, a, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a)) << endl; + return false; + } + } + return true; +} + +template +static bool checkX_f_f(const char *msg, vfloat2 (*vfunc)(vfloat), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero) { + float s0[VECTLENSP]; + for(size_t i=0;start + i * step <= end;i++) { + float a0 = flushToZero(start + i * step); + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENSP-1); + s0[idx] = a0; + vfloat v0 = vloadu_vf_p(s0); + v0 = vf2getx_vf_vf2((*vfunc)(v0)); + vstoreu_v_p_vf(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), FLT_MANT_DIG, + enableFlushToZero ? FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +template +static bool checkY_f_f(const char *msg, vfloat2 (*vfunc)(vfloat), T (*tlfunc)(const T), + const float *a0, size_t z, double tol, bool checkSignedZero) { + float s0[VECTLENSP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(a), FLT_MANT_DIG, + enableFlushToZero ? 
FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a, a, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a)) << endl; + return false; + } + } + return true; +} + +template +static bool checkY_f_f(const char *msg, vfloat2 (*vfunc)(vfloat), T (*tlfunc)(const T), + double start, double end, double step, double tol, bool checkSignedZero) { + float s0[VECTLENSP]; + for(size_t i=0;start + i * step <= end;i++) { + float a0 = flushToZero(start + i * step); + memrand(s0, sizeof(s0)); + int idx = xrand() & (VECTLENSP-1); + s0[idx] = a0; + vfloat v0 = vloadu_vf_p(s0); + v0 = vf2gety_vf_vf2((*vfunc)(v0)); + vstoreu_v_p_vf(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0), FLT_MANT_DIG, + enableFlushToZero ? FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg = %a (%g), ulp = %g, t = %.16g, ", msg, a0, a0, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0)) << endl; + return false; + } + } + return true; +} + +template +static bool check_f_f_f(const char *msg, vfloat (*vfunc)(vfloat, vfloat), T (*tlfunc)(const T, const T), + const float *a0, size_t z0, const float *a1, size_t z1, + double tol, bool checkSignedZero) { + float s0[VECTLENSP], s1[VECTLENSP]; + for(size_t i=0;i(s0[idx], (*tlfunc)(flushToZero(a0[i]), flushToZero(a1[j])), FLT_MANT_DIG, + enableFlushToZero ? 
FLT_MIN : SLEEF_FLT_DENORM_MIN, + FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg0 = %a (%g), arg1 = %a (%g), ulp = %g, t = %.16g, ", + msg, flushToZero(a0[i]), flushToZero(a0[i]), flushToZero(a1[j]), flushToZero(a1[j]), u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(flushToZero(a0[i]), flushToZero(a1[j]))) << endl; + return false; + } + } + } + return true; +} + +template +static bool check_f_f_f(const char *msg, vfloat (*vfunc)(vfloat, vfloat), T (*tlfunc)(const T, const T), + double startx, double endx, double stepx, double starty, double endy, double stepy, + double tol, bool checkSignedZero) { + float s0[VECTLENSP], s1[VECTLENSP]; + for(size_t i=0;startx + i * stepx <= endx;i++) { + float a0 = flushToZero(startx + i * stepx); + for(size_t j=0;starty + j * stepy <= endy;j++) { + float a1 = flushToZero(starty + j * stepy); + memrand(s0, sizeof(s0)); + memrand(s1, sizeof(s1)); + int idx = xrand() & (VECTLENSP-1); + s0[idx] = a0; + s1[idx] = a1; + vfloat v0 = vloadu_vf_p(s0); + vfloat v1 = vloadu_vf_p(s1); + v0 = (*vfunc)(v0, v1); + vstoreu_v_p_vf(s0, v0); + double u = countULP(s0[idx], (*tlfunc)(a0, a1), FLT_MANT_DIG, + SLEEF_FLT_DENORM_MIN, FLT_MAX, checkSignedZero); + if (u > tol) { + printf("%s : arg0 = %a (%g), arg1 = %a (%g), ulp = %g, t = %.16g, ", + msg, a0, a0, a1, a1, u, s0[idx]); + cout << "c = " << tlfloat::to_string((*tlfunc)(a0, a1)) << endl; + return false; + } + } + } + return true; +} + +template +void check(const double t, const double c, int nbmant, const double flmin, const double flmax, const double culp) { + double tulp = countULP(t, c, nbmant, flmin, flmax, true); + if (tulp != culp) { + cout << "NG" << endl; + printf("t = %a\n", t); + printf("c = %a\n", c); + printf("tulp = %g\n", tulp); + printf("culp = %g\n", culp); + exit(-1); + } +} + +// + +extern "C" { + int main2(int argc, char **argv); +} + +int main2(int argc, char **argv) { +#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4) + enableFlushToZero 
= true; +#warning Flush to zero +#endif + + bool success = true; + + // Tests if counting ulp numbers is correct + + check(+0.0, +0.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(-0.0, +0.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 10002); + check(+0.0, -0.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 10002); + check(-0.0, -0.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + + check(+1.0, +1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(+1.0, +INFINITY), +1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + check(nextafter(+1.0, -INFINITY), +1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0.5); + + check(-1.0, -1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(-1.0, +INFINITY), -1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0.5); + check(nextafter(-1.0, -INFINITY), -1.0, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + + check(INFINITY, INFINITY, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(INFINITY, 0), INFINITY, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, INFINITY); + check(INFINITY, nextafter(INFINITY, 0), DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + + check(-INFINITY, -INFINITY, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(-INFINITY, 0), -INFINITY, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, INFINITY); + check(-INFINITY, nextafter(-INFINITY, 0), DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + + check(DBL_MIN, DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(DBL_MIN, 0.0), DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + check(nextafter(DBL_MIN, 1.0), DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + + check(-DBL_MIN, -DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 0); + check(nextafter(-DBL_MIN, 0.0), -DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + check(nextafter(-DBL_MIN, 1.0), -DBL_MIN, DBL_MANT_DIG, SLEEF_DBL_DENORM_MIN, DBL_MAX, 1.0); + + 
check(+0.0, +0.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(-0.0, +0.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 10002); + check(+0.0, -0.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 10002); + check(-0.0, -0.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + + check(+1.0, +1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(+1.0, +INFINITY), +1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + check(nextafterf(+1.0, -INFINITY), +1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0.5); + + check(-1.0, -1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(-1.0, +INFINITY), -1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0.5); + check(nextafterf(-1.0, -INFINITY), -1.0, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + + check(INFINITY, INFINITY, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(INFINITY, 0), INFINITY, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, INFINITY); + check(INFINITY, nextafterf(INFINITY, 0), FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + + check(-INFINITY, -INFINITY, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(-INFINITY, 0), -INFINITY, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, INFINITY); + check(-INFINITY, nextafterf(-INFINITY, 0), FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + + check(FLT_MIN, FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(FLT_MIN, 0.0), FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + check(nextafterf(FLT_MIN, 1.0), FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + + check(-FLT_MIN, -FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 0); + check(nextafterf(-FLT_MIN, 0.0), -FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + check(nextafterf(-FLT_MIN, 1.0), -FLT_MIN, FLT_MANT_DIG, SLEEF_FLT_DENORM_MIN, FLT_MAX, 1.0); + + // + +#if defined(ENABLE_DP) + static const double ad[] = { NAN, + -INFINITY, -DBL_MAX, -DBL_MIN, 
-SLEEF_DBL_DENORM_MIN, -0.0, + +0.0, SLEEF_DBL_DENORM_MIN, DBL_MIN, DBL_MAX, +INFINITY, + -M_PI*2, -M_PI, -M_PI/2, -M_PI/4, M_PI/4, M_PI/2, M_PI, M_PI*2, + -1e+100, -1e+10, -100001, -100000.5, -100000, -7.0, -5.0, -4.0, -3.0, -2.5, -2.0, -1.5, -1.0, -0.999, -0.5, + +0.5, +0.999, +1.0, +1.5, +2.0, +2.5, +3.0, +4.0, +5.0, +7.0, +100000, +100000.5, +100001, +1e+10, +1e+100, + nextafter(-1, -2), nextafter(+1, +2) + }; + + // + + { + vector v; + for(int i = 0;i < 920;i++) v.push_back(pow(2.16, i)); + for(int64_t i64=(int64_t)-1e+14;i64<(int64_t)1e+14;i64+=(int64_t)1e+12) { + double start = u2d(d2u(M_PI / 4 * i64)-20), end = u2d(d2u(M_PI / 4 * i64)+20); + for(double d = start;d <= end;d = u2d(d2u(d)+1)) v.push_back(d); + } + + cout << "sin" << endl; + success = check_d_d("sin", xsin, tlfloat_sinq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("sin", xsin, tlfloat_sinq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("sin", xsin, tlfloat_sinq, + -1e+14, 1e+14, 1e+10 + 0.1, 3.5, false) && success; + success = check_d_d("sin", xsin, tlfloat_sinq, + v.data(), v.size(), 3.5, false) && success; + + cout << "sin in sincos" << endl; + success = checkX_d_d("sin in sincos", xsincos, tlfloat_sinq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = checkX_d_d("sin in sincos", xsincos, tlfloat_sinq, + -10, 10, 0.002, 3.5, false) && success; + success = checkX_d_d("sin in sincos", xsincos, tlfloat_sinq, + -1e+14, 1e+14, 1e+10 + 0.1, 3.5, false) && success; + success = checkX_d_d("sin in sincos", xsincos, tlfloat_sinq, + v.data(), v.size(), 3.5, false) && success; + + cout << "sin_u1" << endl; + success = check_d_d("sin_u1", xsin_u1, tlfloat_sinq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("sin_u1", xsin_u1, tlfloat_sinq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("sin_u1", xsin_u1, tlfloat_sinq, + -1e+14, 1e+14, 1e+10 + 0.1, 1.0, false) && success; + success = 
check_d_d("sin_u1", xsin_u1, tlfloat_sinq, + v.data(), v.size(), 1.0, false) && success; + + cout << "sin in sincos_u1" << endl; + success = checkX_d_d("sin in sincos_u1", xsincos_u1, tlfloat_sinq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = checkX_d_d("sin in sincos_u1", xsincos_u1, tlfloat_sinq, + -10, 10, 0.002, 1.0, false) && success; + success = checkX_d_d("sin in sincos_u1", xsincos_u1, tlfloat_sinq, + -1e+14, 1e+14, 1e+10 + 0.1, 1.0, false) && success; + success = checkX_d_d("sin in sincos_u1", xsincos_u1, tlfloat_sinq, + v.data(), v.size(), 1.0, false) && success; + + cout << "cos" << endl; + success = check_d_d("cos", xcos, tlfloat_cosq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("cos", xcos, tlfloat_cosq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("cos", xcos, tlfloat_cosq, + -1e+14, 1e+14, 1e+10 + 0.1, 3.5, false) && success; + success = check_d_d("cos", xcos, tlfloat_cosq, + v.data(), v.size(), 3.5, false) && success; + + cout << "cos in sincos" << endl; + success = checkY_d_d("cos in sincos", xsincos, tlfloat_cosq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = checkY_d_d("cos in sincos", xsincos, tlfloat_cosq, + -10, 10, 0.002, 3.5, false) && success; + success = checkY_d_d("cos in sincos", xsincos, tlfloat_cosq, + -1e+14, 1e+14, 1e+10 + 0.1, 3.5, false) && success; + success = checkY_d_d("cos in sincos", xsincos, tlfloat_cosq, + v.data(), v.size(), 3.5, false) && success; + + cout << "cos_u1" << endl; + success = check_d_d("cos_u1", xcos_u1, tlfloat_cosq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("cos_u1", xcos_u1, tlfloat_cosq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("cos_u1", xcos_u1, tlfloat_cosq, + -1e+14, 1e+14, 1e+10 + 0.1, 1.0, false) && success; + success = check_d_d("cos_u1", xcos_u1, tlfloat_cosq, + v.data(), v.size(), 1.0, false) && success; + + cout << "cos in sincos_u1" << endl; + 
success = checkY_d_d("cos in sincos_u1", xsincos_u1, tlfloat_cosq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = checkY_d_d("cos in sincos_u1", xsincos_u1, tlfloat_cosq, + -10, 10, 0.002, 1.0, false) && success; + success = checkY_d_d("cos in sincos_u1", xsincos_u1, tlfloat_cosq, + -1e+14, 1e+14, 1e+10 + 0.1, 1.0, false) && success; + success = checkY_d_d("cos in sincos_u1", xsincos_u1, tlfloat_cosq, + v.data(), v.size(), 1.0, false) && success; + } + + // + + { + static const double ad2[] = { +0.0, -0.0, INFINITY, -INFINITY, NAN }; + + vector v; + for(int i=1;i<10000 && success;i+=31) { + double start = u2d(d2u(i)-20), end = u2d(d2u(i)+20); + for(double d = start;d <= end;d = u2d(d2u(d)+1)) v.push_back(d); + } + for(int i=1;i<=20 && success;i++) { + double start = u2d(d2u(0.25 * i)-20), end = u2d(d2u(0.25 * i)+20); + for(double d = start;d <= end;d = u2d(d2u(d)+1)) v.push_back(d); + } + + cout << "sinpi_u05" << endl; + success = check_d_d("sinpi_u05", xsinpi_u05, tlfloat_sinpiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 0.506, true) && success; + success = check_d_d("sinpi_u05", xsinpi_u05, tlfloat_sinpiq, + -10.1, 10, 0.0021, 0.506, false) && success; + success = check_d_d("sinpi_u05", xsinpi_u05, tlfloat_sinpiq, + -1e+8-0.1, 1e+8, 1e+10 + 0.1, 0.506, false) && success; + success = check_d_d("sinpi_u05", xsinpi_u05, tlfloat_sinpiq, + v.data(), v.size(), 0.506, false) && success; + + cout << "sin in sincospi_u35" << endl; + success = checkX_d_d("sin in sincospi_u35", xsincospi_u35, tlfloat_sinpiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 3.5, true) && success; + success = checkX_d_d("sin in sincospi_u35", xsincospi_u35, tlfloat_sinpiq, + -10.1, 10, 0.0021, 3.5, false) && success; + success = checkX_d_d("sin in sincospi_u35", xsincospi_u35, tlfloat_sinpiq, + -1e+8-0.1, 1e+8, 1e+10 + 0.1, 3.5, false) && success; + success = checkX_d_d("sin in sincospi_u35", xsincospi_u35, tlfloat_sinpiq, + v.data(), v.size(), 3.5, false) && success; + + cout << "sin in 
sincospi_u05" << endl; + success = checkX_d_d("sin in sincospi_u05", xsincospi_u05, tlfloat_sinpiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 0.506, true) && success; + success = checkX_d_d("sin in sincospi_u05", xsincospi_u05, tlfloat_sinpiq, + -10.1, 10, 0.0021, 0.506, false) && success; + success = checkX_d_d("sin in sincospi_u05", xsincospi_u05, tlfloat_sinpiq, + -1e+8-0.1, 1e+8, 1e+10 + 0.1, 0.506, false) && success; + success = checkX_d_d("sin in sincospi_u05", xsincospi_u05, tlfloat_sinpiq, + v.data(), v.size(), 0.506, false) && success; + + cout << "cospi_u05" << endl; + success = check_d_d("cospi_u05", xcospi_u05, tlfloat_cospiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 0.506, true) && success; + success = check_d_d("cospi_u05", xcospi_u05, tlfloat_cospiq, + -10.1, 10, 0.0021, 0.506, false) && success; + success = check_d_d("cospi_u05", xcospi_u05, tlfloat_cospiq, + -1e+8-0.1, 1e+8, 1e+10 + 0.1, 0.506, false) && success; + success = check_d_d("cospi_u05", xcospi_u05, tlfloat_cospiq, + v.data(), v.size(), 0.506, false) && success; + + cout << "cos in sincospi_u35" << endl; + success = checkY_d_d("cos in sincospi_u35", xsincospi_u35, tlfloat_cospiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 3.5, true) && success; + success = checkY_d_d("cos in sincospi_u35", xsincospi_u35, tlfloat_cospiq, + -10.1, 10, 0.0021, 3.5, false) && success; + success = checkY_d_d("cos in sincospi_u35", xsincospi_u35, tlfloat_cospiq, + -1e+8-0.1, 1e+8, 1e+10 + 0.1, 3.5, false) && success; + success = checkY_d_d("cos in sincospi_u35", xsincospi_u35, tlfloat_cospiq, + v.data(), v.size(), 3.5, false) && success; + + cout << "cos in sincospi_u05" << endl; + success = checkY_d_d("cos in sincospi_u05", xsincospi_u05, tlfloat_cospiq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 0.506, true) && success; + success = checkY_d_d("cos in sincospi_u05", xsincospi_u05, tlfloat_cospiq, + -10.1, 10, 0.0021, 0.506, false) && success; + success = checkY_d_d("cos in sincospi_u05", xsincospi_u05, tlfloat_cospiq, + -1e+8-0.1, 1e+8, 
1e+10 + 0.1, 0.506, false) && success; + success = checkY_d_d("cos in sincospi_u05", xsincospi_u05, tlfloat_cospiq, + v.data(), v.size(), 0.506, false) && success; + } + + { + vector v; + for(int i = 0;i < 920;i++) v.push_back(pow(2.16, i)); + for(int i=1;i<10000 && success;i+=31) { + double start = u2d(d2u(M_PI / 4 * i)-20), end = u2d(d2u(M_PI / 4 * i)+20); + for(double d = start;d <= end;d = u2d(d2u(d)+1)) v.push_back(d); + } + + cout << "tan" << endl; + success = check_d_d("tan", xtan, tlfloat_tanq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("tan", xtan, tlfloat_tanq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("tan", xtan, tlfloat_tanq, + -1e+7, 1e+7, 100.1, 3.5, false) && success; + success = check_d_d("tan", xtan, tlfloat_tanq, + -1e+14, 1e+14, 1e+10 + 0.1, 3.5, false) && success; + success = check_d_d("tan", xtan, tlfloat_tanq, + v.data(), v.size(), 3.5, false) && success; + + cout << "tan_u1" << endl; + success = check_d_d("tan_u1", xtan_u1, tlfloat_tanq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("tan_u1", xtan_u1, tlfloat_tanq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("tan_u1", xtan_u1, tlfloat_tanq, + -1e+7, 1e+7, 100.1, 1.0, false) && success; + success = check_d_d("tan_u1", xtan_u1, tlfloat_tanq, + -1e+14, 1e+14, 1e+10 + 0.1, 1.0, false) && success; + success = check_d_d("tan_u1", xtan_u1, tlfloat_tanq, + v.data(), v.size(), 1.0, false) && success; + } + + { + vector v; + for(int i = -1000;i <= 1000 && success;i+=10) v.push_back(pow(2.1, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(DBL_MAX * pow(0.9314821319758632, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(pow(0.933254300796991, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(DBL_MIN * pow(0.996323, i)); + + cout << "log" << endl; + success = check_d_d("log", xlog, tlfloat_logq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = 
check_d_d("log", xlog, tlfloat_logq, + 0.0001, 10, 0.001, 3.5, false) && success; + success = check_d_d("log", xlog, tlfloat_logq, + 0.0001, 10000, 1.1, 3.5, false) && success; + success = check_d_d("log", xlog, tlfloat_logq, + v.data(), v.size(), 3.5, false) && success; + + cout << "log_u1" << endl; + success = check_d_d("log_u1", xlog_u1, tlfloat_logq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("log_u1", xlog_u1, tlfloat_logq, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_d_d("log_u1", xlog_u1, tlfloat_logq, + 0.0001, 10000, 1.1, 1.0, false) && success; + success = check_d_d("log_u1", xlog_u1, tlfloat_logq, + v.data(), v.size(), 1.0, false) && success; + + cout << "log10" << endl; + success = check_d_d("log10", xlog10, tlfloat_log10q, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("log10", xlog10, tlfloat_log10q, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_d_d("log10", xlog10, tlfloat_log10q, + 0.0001, 10000, 1.1, 1.0, false) && success; + success = check_d_d("log10", xlog10, tlfloat_log10q, + v.data(), v.size(), 1.0, false) && success; + + cout << "log2" << endl; + success = check_d_d("log2", xlog2, tlfloat_log2q, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("log2", xlog2, tlfloat_log2q, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_d_d("log2", xlog2, tlfloat_log2q, + 0.0001, 10000, 1.1, 1.0, false) && success; + success = check_d_d("log2", xlog2, tlfloat_log2q, + v.data(), v.size(), 1.0, false) && success; + + cout << "log2_u35" << endl; + success = check_d_d("log2_u35", xlog2_u35, tlfloat_log2q, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("log2_u35", xlog2_u35, tlfloat_log2q, + 0.0001, 10, 0.001, 3.5, false) && success; + success = check_d_d("log2_u35", xlog2_u35, tlfloat_log2q, + 0.0001, 10000, 1.1, 3.5, false) && success; + success = check_d_d("log2_u35", xlog2_u35, 
tlfloat_log2q, + v.data(), v.size(), 3.5, false) && success; + + static const double ad2[] = { + +0.0, -0.0, +1, -1, +1e+10, -1e+10, DBL_MIN, -DBL_MIN, + INFINITY, -INFINITY, NAN, nextafter(-1, -2), -2 }; + + cout << "log1p" << endl; + success = check_d_d("log1p", xlog1p, tlfloat_log1pq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 1.0, true) && success; + success = check_d_d("log1p", xlog1p, tlfloat_log1pq, + 0.0001, 10, 0.001, 1.0, false) && success; + } + + cout << "exp" << endl; + success = check_d_d("exp", xexp, tlfloat_expq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("exp", xexp, tlfloat_expq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("exp", xexp, tlfloat_expq, + -1000, 1000, 1.1, 1.0, false) && success; + cout << "exp2" << endl; + success = check_d_d("exp2", xexp2, tlfloat_exp2q, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("exp2", xexp2, tlfloat_exp2q, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("exp2", xexp2, tlfloat_exp2q, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "exp10" << endl; + success = check_d_d("exp10", xexp10, tlfloat_exp10q, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("exp10", xexp10, tlfloat_exp10q, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("exp10", xexp10, tlfloat_exp10q, + -300, 300, 0.1, 1.0, false) && success; + + cout << "exp2_u35" << endl; + success = check_d_d("exp2_u35", xexp2_u35, tlfloat_exp2q, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("exp2_u35", xexp2_u35, tlfloat_exp2q, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("exp2_u35", xexp2_u35, tlfloat_exp2q, + -1000, 1000, 0.2, 3.5, false) && success; + + cout << "exp10_u35" << endl; + success = check_d_d("exp10_u35", xexp10_u35, tlfloat_exp10q, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("exp10_u35", xexp10_u35, tlfloat_exp10q, + -10, 10, 
0.002, 3.5, false) && success; + success = check_d_d("exp10_u35", xexp10_u35, tlfloat_exp10q, + -300, 300, 0.1, 3.5, false) && success; + + { + vector v; + for(double d = 0;d < 300 && success;d += 0.21) { + v.push_back(+pow(10, -d)); + v.push_back(-pow(10, -d)); + } + + cout << "expm1" << endl; + success = check_d_d("expm1", xexpm1, tlfloat_expm1q, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("expm1", xexpm1, tlfloat_expm1q, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("expm1", xexpm1, tlfloat_expm1q, + -1000, 1000, 0.21, 1.0, false) && success; + success = check_d_d("expm1", xexpm1, tlfloat_expm1q, + v.data(), v.size(), 1.0, false) && success; + } + + { + vector v, w; + for(double y = -1000;y < 1000;y += 0.1) v.push_back(y); + w.push_back(2.1); + + cout << "pow" << endl; + success = check_d_d_d("pow", xpow, tlfloat_powq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 1.0, true) && success; + success = check_d_d_d("pow", xpow, tlfloat_powq, + -100, 100, 0.6, 0.1, 100, 0.6, 1.0, false) && success; + success = check_d_d_d("pow", xpow, tlfloat_powq, + w.data(), w.size(), v.data(), v.size(), 1.0, false) && success; + + static const double regx[] = { + 0x1.7fed001e5f0edp-1, 0x1.7f136e35a1af6p-1, 0x1.7e7a67798b72dp-1, 0x1.7f5c8e80a3cf7p-1, 0x1.7ff1b57d71188p-1, 0x1.7ff1b57d71188p-1 + }; + static const double regy[] = { + 0x1.1b5ce4d1fb0aep+11, 0x1.2c2f3c91cf6c5p+11, 0x1.e0157ee6672fbp+10, 0x1.235db085e49b7p+11, 0x1.2e8d51b04ab8p+11, 0x1.2e8d51b04ab8p+11 + }; + success = check_d_d_d("pow", xpow, tlfloat_powq, + regx, sizeof(regx)/sizeof(regx[0]), regy, sizeof(regy)/sizeof(regy[0]), + 1.25, true) && success; + } + + { + vector v; + for(int i = -1000;i <= 1000 && success;i+=10) v.push_back(pow(2.1, i)); + +#ifndef DETERMINISTIC + cout << "sqrt" << endl; + success = check_d_d("sqrt", xsqrt, tlfloat_sqrtq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("sqrt", xsqrt, 
tlfloat_sqrtq, + -10000, 10000, 2.1, 1.0, false) && success; + success = check_d_d("sqrt", xsqrt, tlfloat_sqrtq, + v.data(), v.size(), 1.0, false) && success; + + cout << "sqrt_u05" << endl; + success = check_d_d("sqrt", xsqrt_u05, tlfloat_sqrtq, + ad, sizeof(ad)/sizeof(ad[0]), 0.506, true) && success; + success = check_d_d("sqrt", xsqrt_u05, tlfloat_sqrtq, + -10000, 10000, 2.1, 0.506, false) && success; + success = check_d_d("sqrt", xsqrt_u05, tlfloat_sqrtq, + v.data(), v.size(), 0.506, false) && success; + + cout << "sqrt_u35" << endl; + success = check_d_d("sqrt", xsqrt_u35, tlfloat_sqrtq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("sqrt", xsqrt_u35, tlfloat_sqrtq, + -10000, 10000, 2.1, 3.5, false) && success; + success = check_d_d("sqrt", xsqrt_u35, tlfloat_sqrtq, + v.data(), v.size(), 3.5, false) && success; +#endif + + cout << "cbrt" << endl; + success = check_d_d("cbrt", xcbrt, tlfloat_cbrtq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("cbrt", xcbrt, tlfloat_cbrtq, + -10000, 10000, 2.1, 3.5, false) && success; + success = check_d_d("cbrt", xcbrt, tlfloat_cbrtq, + v.data(), v.size(), 3.5, false) && success; + + cout << "cbrt_u1" << endl; + success = check_d_d("cbrt_u1", xcbrt_u1, tlfloat_cbrtq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("cbrt_u1", xcbrt_u1, tlfloat_cbrtq, + -10000, 10000, 2.1, 1.0, false) && success; + success = check_d_d("cbrt_u1", xcbrt_u1, tlfloat_cbrtq, + v.data(), v.size(), 1.0, false) && success; + } + + cout << "hypot_u35" << endl; + success = check_d_d_d("hypot_u35", xhypot_u35, tlfloat_hypotq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 3.5, true) && success; + success = check_d_d_d("hypot_u35", xhypot_u35, tlfloat_hypotq, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_d_d_d("hypot_u35", xhypot_u35, tlfloat_hypotq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 3.5, false) && success; 
+ + cout << "hypot_u05" << endl; + success = check_d_d_d("hypot_u05", xhypot_u05, tlfloat_hypotq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 0.5, true) && success; + success = check_d_d_d("hypot_u05", xhypot_u05, tlfloat_hypotq, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_d_d_d("hypot_u05", xhypot_u05, tlfloat_hypotq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + + cout << "asin" << endl; + success = check_d_d("asin", xasin, tlfloat_asinq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("asin", xasin, tlfloat_asinq, + -1, 1, 0.0002, 3.5, false) && success; + + cout << "asin_u1" << endl; + success = check_d_d("asin_u1", xasin_u1, tlfloat_asinq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("asin_u1", xasin_u1, tlfloat_asinq, + -1, 1, 0.0002, 1.0, false) && success; + + cout << "acos" << endl; + success = check_d_d("acos", xacos, tlfloat_acosq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("acos", xacos, tlfloat_acosq, + -1, 1, 0.0002, 3.5, false) && success; + + cout << "acos_u1" << endl; + success = check_d_d("acos_u1", xacos_u1, tlfloat_acosq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("acos_u1", xacos_u1, tlfloat_acosq, + -1, 1, 0.0002, 1.0, false) && success; + + cout << "atan" << endl; + success = check_d_d("atan", xatan, tlfloat_atanq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("atan", xatan, tlfloat_atanq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("atan", xatan, tlfloat_atanq, + -10000, 10000, 2.1, 3.5, false) && success; + + cout << "atan_u1" << endl; + success = check_d_d("atan_u1", xatan_u1, tlfloat_atanq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("atan_u1", xatan_u1, tlfloat_atanq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("atan_u1", 
xatan_u1, tlfloat_atanq, + -10000, 10000, 2.1, 1.0, false) && success; + + cout << "atan2" << endl; + success = check_d_d_d("atan2", xatan2, tlfloat_atan2q, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 3.5, true) && success; + success = check_d_d_d("atan2", xatan2, tlfloat_atan2q, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_d_d_d("atan2", xatan2, tlfloat_atan2q, + -100, 100, 1.51, -100, 100, 1.51, 3.5, false) && success; + + cout << "atan2_u1" << endl; + success = check_d_d_d("atan2_u1", xatan2_u1, tlfloat_atan2q, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 1.0, true) && success; + success = check_d_d_d("atan2_u1", xatan2_u1, tlfloat_atan2q, + -10, 10, 0.15, -10, 10, 0.15, 1.0, false) && success; + success = check_d_d_d("atan2_u1", xatan2_u1, tlfloat_atan2q, + -100, 100, 1.51, -100, 100, 1.51, 1.0, false) && success; + + cout << "sinh" << endl; + success = check_d_d("sinh", xsinh, tlfloat_sinhq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("sinh", xsinh, tlfloat_sinhq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("sinh", xsinh, tlfloat_sinhq, + -709, 709, 0.2, 1.0, false) && success; + + cout << "cosh" << endl; + success = check_d_d("cosh", xcosh, tlfloat_coshq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("cosh", xcosh, tlfloat_coshq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("cosh", xcosh, tlfloat_coshq, + -709, 709, 0.2, 1.0, false) && success; + + cout << "tanh" << endl; + success = check_d_d("tanh", xtanh, tlfloat_tanhq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("tanh", xtanh, tlfloat_tanhq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("tanh", xtanh, tlfloat_tanhq, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "sinh_u35" << endl; + success = check_d_d("sinh_u35", xsinh_u35, tlfloat_sinhq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, 
true) && success; + success = check_d_d("sinh_u35", xsinh_u35, tlfloat_sinhq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("sinh_u35", xsinh_u35, tlfloat_sinhq, + -709, 709, 0.2, 3.5, false) && success; + + cout << "cosh_u35" << endl; + success = check_d_d("cosh_u35", xcosh_u35, tlfloat_coshq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("cosh_u35", xcosh_u35, tlfloat_coshq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("cosh_u35", xcosh_u35, tlfloat_coshq, + -709, 709, 0.2, 3.5, false) && success; + + cout << "tanh_u35" << endl; + success = check_d_d("tanh_u35", xtanh_u35, tlfloat_tanhq, + ad, sizeof(ad)/sizeof(ad[0]), 3.5, true) && success; + success = check_d_d("tanh_u35", xtanh_u35, tlfloat_tanhq, + -10, 10, 0.002, 3.5, false) && success; + success = check_d_d("tanh_u35", xtanh_u35, tlfloat_tanhq, + -1000, 1000, 0.2, 3.5, false) && success; + + { + static const double ad2[] = { + +0.0, -0.0, +1, -1, +1e+10, -1e+10, DBL_MIN, -DBL_MIN, INFINITY, -INFINITY, NAN + }; + + cout << "asinh" << endl; + success = check_d_d("asinh", xasinh, tlfloat_asinhq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 1.0, true) && success; + success = check_d_d("asinh", xasinh, tlfloat_asinhq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("asinh", xasinh, tlfloat_asinhq, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "acosh" << endl; + success = check_d_d("acosh", xacosh, tlfloat_acoshq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 1.0, true) && success; + success = check_d_d("acosh", xacosh, tlfloat_acoshq, + 1, 10, 0.002, 1.0, false) && success; + success = check_d_d("acosh", xacosh, tlfloat_acoshq, + 1, 1000, 0.2, 1.0, false) && success; + } + + cout << "atanh" << endl; + success = check_d_d("atanh", xatanh, tlfloat_atanhq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("atanh", xatanh, tlfloat_atanhq, + -10, 10, 0.002, 1.0, false) && success; + success = check_d_d("atanh", xatanh, 
tlfloat_atanhq, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "copysign" << endl; + success = check_d_d_d("copysign", xcopysign, tlfloat_copysign, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 0.0, true) && success; + success = check_d_d_d("copysign", xcopysign, tlfloat_copysign, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_d_d_d("copysign", xcopysign, tlfloat_copysign, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.0, false) && success; + + cout << "fmax" << endl; + success = check_d_d_d("fmax", xfmax, tlfloat_fmaxq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 0.0, true) && success; + success = check_d_d_d("fmax", xfmax, tlfloat_fmaxq, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_d_d_d("fmax", xfmax, tlfloat_fmaxq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.0, false) && success; + + cout << "fmin" << endl; + success = check_d_d_d("fmin", xfmin, tlfloat_fminq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 0.0, true) && success; + success = check_d_d_d("fmin", xfmin, tlfloat_fminq, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_d_d_d("fmin", xfmin, tlfloat_fminq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.0, false) && success; + + cout << "fdim" << endl; + success = check_d_d_d("fdim", xfdim, tlfloat_fdimq, + ad, sizeof(ad)/sizeof(ad[0]), ad, sizeof(ad)/sizeof(ad[0]), + 0.5, true) && success; + success = check_d_d_d("fdim", xfdim, tlfloat_fdimq, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_d_d_d("fdim", xfdim, tlfloat_fdimq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + + cout << "fmod" << endl; + for(int i=0;i 1e+300) continue; + success = check_d_d_d("fmod", xfmod, tlfloat_fmodq, + &ad[i], 1, &ad[j], 1, 0.5, true) && success; + } + } + success = check_d_d_d("fmod", xfmod, tlfloat_fmodq, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && 
success; + success = check_d_d_d("fmod", xfmod, tlfloat_fmodq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + + cout << "remainder" << endl; + for(int i=0;i 1e+300) continue; + success = check_d_d_d("remainder", xremainder, tlfloat_remainderq, + &ad[i], 1, &ad[j], 1, 0.5, true) && success; + } + } + success = check_d_d_d("remainder", xremainder, tlfloat_remainderq, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_d_d_d("remainder", xremainder, tlfloat_remainderq, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + + { + vector v; + for(double x = -100.5;x <= 100.5;x+=0.5) { + for(double d = u2d(d2u(x)-3);d <= u2d(d2u(x)+3) && success;d = u2d(d2u(d)+1)) v.push_back(d); + double start = u2d(d2u((double)(INT64_C(1) << 52))-20), end = u2d(d2u((double)(INT64_C(1) << 52))+20); + for(double d = start;d <= end;d = u2d(d2u(d)+1)) { v.push_back(d); v.push_back(-d); } + } + + cout << "trunc" << endl; + success = check_d_d("trunc", xtrunc, tlfloat_truncq, + ad, sizeof(ad)/sizeof(ad[0]), 0.0, true) && success; + success = check_d_d("trunc", xtrunc, tlfloat_truncq, + v.data(), v.size(), 0.0, false) && success; + success = check_d_d("trunc", xtrunc, tlfloat_truncq, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "floor" << endl; + success = check_d_d("floor", xfloor, tlfloat_floorq, + ad, sizeof(ad)/sizeof(ad[0]), 0.0, true) && success; + success = check_d_d("floor", xfloor, tlfloat_floorq, + v.data(), v.size(), 0.0, false) && success; + success = check_d_d("floor", xfloor, tlfloat_floorq, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "ceil" << endl; + success = check_d_d("ceil", xceil, tlfloat_ceilq, + ad, sizeof(ad)/sizeof(ad[0]), 0.0, true) && success; + success = check_d_d("ceil", xceil, tlfloat_ceilq, + v.data(), v.size(), 0.0, false) && success; + success = check_d_d("ceil", xceil, tlfloat_ceilq, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "round" << endl; + 
success = check_d_d("round", xround, tlfloat_roundq, + ad, sizeof(ad)/sizeof(ad[0]), 0.0, true) && success; + success = check_d_d("round", xround, tlfloat_roundq, + v.data(), v.size(), 0.0, false) && success; + success = check_d_d("round", xround, tlfloat_roundq, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "rint" << endl; + success = check_d_d("rint", xrint, tlfloat_rintq, + ad, sizeof(ad)/sizeof(ad[0]), 0.0, true) && success; + success = check_d_d("rint", xrint, tlfloat_rintq, + v.data(), v.size(), 0.0, false) && success; + success = check_d_d("rint", xrint, tlfloat_rintq, + -10000, 10000, 2.5, 0.0, false) && success; + } + + { + static const double ad2[] = { + -4, -3, -2, -1, +0.0, -0.0, +1e+10, -1e+10, INFINITY, -INFINITY, NAN + }; + + cout << "lgamma_u1" << endl; + success = check_d_d("lgamma_u1", xlgamma_u1, tlfloat_lgammaq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 1.0, true) && success; + success = check_d_d("lgamma_u1", xlgamma_u1, tlfloat_lgammaq, + -5000, 5000, 1.1, 1.0, false) && success; + + cout << "tgamma_u1" << endl; + success = check_d_d("tgamma_u1", xtgamma_u1, tlfloat_tgammaq, + ad2, sizeof(ad2)/sizeof(ad2[0]), 1.0, true) && success; + success = check_d_d("tgamma_u1", xtgamma_u1, tlfloat_tgammaq, + -10, 10, 0.002, 1.0, false) && success; + } + + cout << "erf_u1" << endl; + success = check_d_d("erf_u1", xerf_u1, tlfloat_erfq, + ad, sizeof(ad)/sizeof(ad[0]), 1.0, true) && success; + success = check_d_d("erf_u1", xerf_u1, tlfloat_erfq, + -100, 100, 0.02, 1.0, false) && success; + + cout << "erfc_u15" << endl; + success = check_d_d("erfc_u15", xerfc_u15, tlfloat_erfcq, + ad, sizeof(ad)/sizeof(ad[0]), 1.5, true) && success; + success = check_d_d("erfc_u15", xerfc_u15, tlfloat_erfcq, + -1, 100, 0.01, 1.5, false) && success; + + { + cout << "ilogb" << endl; + + static const double ad2[] = { INFINITY, -INFINITY, -1 }; + + for(int i=0;i<3;i++) { + if (func_i_d(xilogb, ad2[i]) != tlfloat_ilogb(double(ad2[i]))) { + printf("ilogb a = %g, t = %d, c = 
%d\n", ad2[i], func_i_d(xilogb, ad2[i]), tlfloat_ilogb(ad2[i])); + success = false; + } + } + + if (func_i_d(xilogb, NAN) != INT_MAX && func_i_d(xilogb, NAN) != INT_MIN) { + printf("ilogb a = %g, t = %d\n", NAN, func_i_d(xilogb, NAN)); + success = false; + } + + if (func_i_d(xilogb, 0) != INT_MIN && func_i_d(xilogb, 0) != -INT_MAX) { + printf("ilogb a = %g, t = %d\n", 0.0, func_i_d(xilogb, 0)); + success = false; + } + + for(double d = 0.0001;d < 10;d += 0.001) { + if (func_i_d(xilogb, d) != tlfloat_ilogb(double(d))) { + printf("ilogb a = %a (%g), t = %d, c = %d\n", d, d, func_i_d(xilogb, d), tlfloat_ilogb(d)); + success = false; + } + } + + for(double d = 0.0001;d < 10000;d += 1.1) { + if (func_i_d(xilogb, d) != tlfloat_ilogb(double(d))) { + printf("ilogb a = %a (%g), t = %d, c = %d\n", d, d, func_i_d(xilogb, d), tlfloat_ilogb(d)); + success = false; + } + } + + for(int i=0;i<10000;i+=10) { + double d = DBL_MIN * pow(0.996323, i); + if (func_i_d(xilogb, d) != tlfloat_ilogb(double(d))) { + printf("ilogb a = %a (%g), t = %d, c = %d\n", d, d, func_i_d(xilogb, d), tlfloat_ilogb(d)); + success = false; + } + } + + for(int i=0;i<10000;i+=10) { + double d = pow(0.933254300796991, i); + if (func_i_d(xilogb, d) != tlfloat_ilogb(double(d))) { + printf("ilogb a = %a (%g), t = %d, c = %d\n", d, d, func_i_d(xilogb, d), tlfloat_ilogb(d)); + success = false; + } + } + } + + { + cout << "ldexp" << endl; + + for(int i=-10000;i<=10000 && success;i++) { + double t = func_d_d_i(xldexp, 1.0, i); + double c = (double)ldexp_(1.0, i); + + if (c != t) { + fprintf(stderr, "ldexp args = (1.0, %d), t = %g, c = %g\n", i, t, c); + success = false; + } + } + } +#endif // #if defined(ENABLE_DP) + + // + + static const float af[] = { NAN, + -INFINITY, -FLT_MAX, -FLT_MIN, -SLEEF_FLT_DENORM_MIN, -0.0, + +0.0, SLEEF_FLT_DENORM_MIN, FLT_MIN, FLT_MAX, +INFINITY, + -M_PI*2, -M_PI, -M_PI/2, -M_PI/4, M_PI/4, M_PI/2, M_PI, M_PI*2, + -1e+30, -1e+10, -100001, -100000.5, -100000, -7.0, -5.0, -4.0, -3.0, 
-2.5, -2.0, -1.5, -1.0, -0.999, -0.5, + +0.5, +0.999, +1.0, +1.5, +2.0, +2.5, +3.0, +4.0, +5.0, +7.0, +100000, +100000.5, +100001, +1e+10, +1e+30, + nextafterf(-1, -2), nextafterf(+1, +2) + }; + + // + + { + vector v; + for(int i = 0;i < 1000;i++) v.push_back(pow(1.092, i)); + for(int64_t i64=(int64_t)-1000;i64<(int64_t)1000 && success;i64+=(int64_t)1) { + double start = u2f(f2u(M_PI / 4 * i64)-20), end = u2f(f2u(M_PI / 4 * i64)+20); + for(double d = start;d <= end;d = u2f(f2u(d)+1)) v.push_back(d); + } + + cout << "sinf" << endl; + success = check_f_f("sinf", xsinf, tlfloat_sin, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("sinf", xsinf, tlfloat_sin, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("sinf", xsinf, tlfloat_sin, + -10000, 10000, 1.1, 3.5, false) && success; + success = check_f_f("sinf", xsinf, tlfloat_sin, + v.data(), v.size(), 3.5, false) && success; + + cout << "sin in sincosf" << endl; + success = checkX_f_f("sin in sincosf", xsincosf, tlfloat_sin, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = checkX_f_f("sin in sincosf", xsincosf, tlfloat_sin, + -10, 10, 0.002, 3.5, false) && success; + success = checkX_f_f("sin in sincosf", xsincosf, tlfloat_sin, + -10000, 10000, 1.1, 3.5, false) && success; + success = checkX_f_f("sin in sincosf", xsincosf, tlfloat_sin, + v.data(), v.size(), 3.5, false) && success; + + cout << "sinf_u1" << endl; + success = check_f_f("sinf_u1", xsinf_u1, tlfloat_sin, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("sinf_u1", xsinf_u1, tlfloat_sin, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("sinf_u1", xsinf_u1, tlfloat_sin, + -10000, 10000, 1.1, 1.0, false) && success; + success = check_f_f("sinf_u1", xsinf_u1, tlfloat_sin, + v.data(), v.size(), 1.0, false) && success; + + cout << "sin in sincosf_u1" << endl; + success = checkX_f_f("sin in sincosf_u1", xsincosf_u1, tlfloat_sin, + af, sizeof(af)/sizeof(af[0]), 
1.0, true) && success; + success = checkX_f_f("sin in sincosf_u1", xsincosf_u1, tlfloat_sin, + -10, 10, 0.002, 1.0, false) && success; + success = checkX_f_f("sin in sincosf_u1", xsincosf_u1, tlfloat_sin, + -10000, 10000, 1.1, 1.0, false) && success; + success = checkX_f_f("sin in sincosf_u1", xsincosf_u1, tlfloat_sin, + v.data(), v.size(), 1.0, false) && success; + + cout << "fastsinf_u3500" << endl; + success = check_f_f("fastsinf_u3500", xfastsinf_u3500, tlfloat_sin, + -32, 32, 0.001, 350.0, false, 2e-6) && success; + + cout << "cosf" << endl; + success = check_f_f("cosf", xcosf, tlfloat_cos, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("cosf", xcosf, tlfloat_cos, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("cosf", xcosf, tlfloat_cos, + -10000, 10000, 1.1, 3.5, false) && success; + success = check_f_f("cosf", xcosf, tlfloat_cos, + v.data(), v.size(), 3.5, false) && success; + + cout << "cos in sincosf" << endl; + success = checkY_f_f("cos in sincosf", xsincosf, tlfloat_cos, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = checkY_f_f("cos in sincosf", xsincosf, tlfloat_cos, + -10, 10, 0.002, 3.5, false) && success; + success = checkY_f_f("cos in sincosf", xsincosf, tlfloat_cos, + -10000, 10000, 1.1, 3.5, false) && success; + success = checkY_f_f("cos in sincosf", xsincosf, tlfloat_cos, + v.data(), v.size(), 3.5, false) && success; + + cout << "cosf_u1" << endl; + success = check_f_f("cosf_u1", xcosf_u1, tlfloat_cos, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("cosf_u1", xcosf_u1, tlfloat_cos, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("cosf_u1", xcosf_u1, tlfloat_cos, + -10000, 10000, 1.1, 1.0, false) && success; + success = check_f_f("cosf_u1", xcosf_u1, tlfloat_cos, + v.data(), v.size(), 1.0, false) && success; + + cout << "cos in sincosf_u1" << endl; + success = checkY_f_f("cos in sincosf_u1", xsincosf_u1, tlfloat_cos, + af, 
sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = checkY_f_f("cos in sincosf_u1", xsincosf_u1, tlfloat_cos, + -10, 10, 0.002, 1.0, false) && success; + success = checkY_f_f("cos in sincosf_u1", xsincosf_u1, tlfloat_cos, + -10000, 10000, 1.1, 1.0, false) && success; + success = checkY_f_f("cos in sincosf_u1", xsincosf_u1, tlfloat_cos, + v.data(), v.size(), 1.0, false) && success; + + cout << "fastcosf_u3500" << endl; + success = check_f_f("fastcosf_u3500", xfastcosf_u3500, tlfloat_cos, + -32, 32, 0.001, 350.0, false, 2e-6) && success; + } + + { + static const float af2[] = { +0.0, -0.0, INFINITY, -INFINITY, NAN }; + + vector v; + for(int i=1;i<10000 && success;i+=31) { + double start = u2f(f2u(i)-20), end = u2f(f2u(i)+20); + for(double d = start;d <= end;d = u2f(f2u(d)+1)) v.push_back(d); + } + for(int i=1;i<=20 && success;i++) { + double start = u2f(f2u(0.25 * i)-20), end = u2f(f2u(0.25 * i)+20); + for(double d = start;d <= end;d = u2f(f2u(d)+1)) v.push_back(d); + } + + cout << "sinpif_u05" << endl; + success = check_f_f("sinpif_u05", xsinpif_u05, tlfloat_sinpi, + af2, sizeof(af2)/sizeof(af2[0]), 0.506, true) && success; + success = check_f_f("sinpif_u05", xsinpif_u05, tlfloat_sinpi, + -10.1, 10, 0.0021, 0.506, false) && success; + success = check_f_f("sinpif_u05", xsinpif_u05, tlfloat_sinpi, + -10000-0.1, 10000, 1.1, 0.506, false) && success; + success = check_f_f("sinpif_u05", xsinpif_u05, tlfloat_sinpi, + v.data(), v.size(), 0.506, false) && success; + + cout << "sin in sincospif_u35" << endl; + success = checkX_f_f("sin in sincospif_u35", xsincospif_u35, tlfloat_sinpi, + af2, sizeof(af2)/sizeof(af2[0]), 3.5, true) && success; + success = checkX_f_f("sin in sincospif_u35", xsincospif_u35, tlfloat_sinpi, + -10.1, 10, 0.0021, 3.5, false) && success; + success = checkX_f_f("sin in sincospif_u35", xsincospif_u35, tlfloat_sinpi, + -10000-0.1, 10000, 1.1, 3.5, false) && success; + success = checkX_f_f("sin in sincospif_u35", xsincospif_u35, tlfloat_sinpi, + 
v.data(), v.size(), 3.5, false) && success; + + cout << "sin in sincospif_u05" << endl; + success = checkX_f_f("sin in sincospif_u05", xsincospif_u05, tlfloat_sinpi, + af2, sizeof(af2)/sizeof(af2[0]), 0.506, true) && success; + success = checkX_f_f("sin in sincospif_u05", xsincospif_u05, tlfloat_sinpi, + -10.1, 10, 0.0021, 0.506, false) && success; + success = checkX_f_f("sin in sincospif_u05", xsincospif_u05, tlfloat_sinpi, + -10000-0.1, 10000, 1.1, 0.506, false) && success; + success = checkX_f_f("sin in sincospif_u05", xsincospif_u05, tlfloat_sinpi, + v.data(), v.size(), 0.506, false) && success; + + cout << "cospif_u05" << endl; + success = check_f_f("cospif_u05", xcospif_u05, tlfloat_cospi, + af2, sizeof(af2)/sizeof(af2[0]), 0.506, true) && success; + success = check_f_f("cospif_u05", xcospif_u05, tlfloat_cospi, + -10.1, 10, 0.0021, 0.506, false) && success; + success = check_f_f("cospif_u05", xcospif_u05, tlfloat_cospi, + -10000-0.1, 10000, 1.1, 0.506, false) && success; + success = check_f_f("cospif_u05", xcospif_u05, tlfloat_cospi, + v.data(), v.size(), 0.506, false) && success; + + cout << "cos in sincospif_u35" << endl; + success = checkY_f_f("cos in sincospif_u35", xsincospif_u35, tlfloat_cospi, + af2, sizeof(af2)/sizeof(af2[0]), 3.5, true) && success; + success = checkY_f_f("cos in sincospif_u35", xsincospif_u35, tlfloat_cospi, + -10.1, 10, 0.0021, 3.5, false) && success; + success = checkY_f_f("cos in sincospif_u35", xsincospif_u35, tlfloat_cospi, + -10000-0.1, 10000, 1.1, 3.5, false) && success; + success = checkY_f_f("cos in sincospif_u35", xsincospif_u35, tlfloat_cospi, + v.data(), v.size(), 3.5, false) && success; + + cout << "cos in sincospif_u05" << endl; + success = checkY_f_f("cos in sincospif_u05", xsincospif_u05, tlfloat_cospi, + af2, sizeof(af2)/sizeof(af2[0]), 0.506, true) && success; + success = checkY_f_f("cos in sincospif_u05", xsincospif_u05, tlfloat_cospi, + -10.1, 10, 0.0021, 0.506, false) && success; + success = checkY_f_f("cos in 
sincospif_u05", xsincospif_u05, tlfloat_cospi, + -10000-0.1, 10000, 1.1, 0.506, false) && success; + success = checkY_f_f("cos in sincospif_u05", xsincospif_u05, tlfloat_cospi, + v.data(), v.size(), 0.506, false) && success; + } + + { + vector v; + v.push_back(70.936981201171875); + for(int i = 0;i < 1000;i++) v.push_back(pow(1.092, i)); + for(int i=1;i<10000 && success;i+=31) { + double start = u2f(f2u(M_PI / 4 * i)-20), end = u2f(f2u(M_PI / 4 * i)+20); + for(double d = start;d <= end;d = u2f(f2u(d)+1)) v.push_back(d); + } + + cout << "tanf" << endl; + success = check_f_f("tanf", xtanf, tlfloat_tan, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("tanf", xtanf, tlfloat_tan, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("tanf", xtanf, tlfloat_tan, + -10000, 10000, 1.1, 3.5, false) && success; + success = check_f_f("tanf", xtanf, tlfloat_tan, + v.data(), v.size(), 3.5, false) && success; + + cout << "tanf_u1" << endl; + success = check_f_f("tanf_u1", xtanf_u1, tlfloat_tan, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("tanf_u1", xtanf_u1, tlfloat_tan, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("tanf_u1", xtanf_u1, tlfloat_tan, + -10000, 10000, 1.1, 1.0, false) && success; + success = check_f_f("tanf_u1", xtanf_u1, tlfloat_tan, + v.data(), v.size(), 1.0, false) && success; + } + + { + vector v; + for(int i = -1000;i <= 1000 && success;i+=10) v.push_back(pow(2.1, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(FLT_MAX * pow(0.9314821319758632, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(pow(0.933254300796991, i)); + for(int i=0;i<10000 && success;i+=10) v.push_back(FLT_MIN * pow(0.996323, i)); + + cout << "logf" << endl; + success = check_f_f("logf", xlogf, tlfloat_log, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("logf", xlogf, tlfloat_log, + 0.0001, 10, 0.001, 3.5, false) && success; + success = check_f_f("logf", 
xlogf, tlfloat_log, + 0.0001, 10000, 1.1, 3.5, false) && success; + success = check_f_f("logf", xlogf, tlfloat_log, + v.data(), v.size(), 3.5, false) && success; + + cout << "logf_u1" << endl; + success = check_f_f("logf_u1", xlogf_u1, tlfloat_log, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("logf_u1", xlogf_u1, tlfloat_log, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_f_f("logf_u1", xlogf_u1, tlfloat_log, + 0.0001, 10000, 1.1, 1.0, false) && success; + if (!enableFlushToZero) { + success = check_f_f("logf_u1", xlogf_u1, tlfloat_log, + v.data(), v.size(), 1.0, false) && success; + } + + cout << "log10f" << endl; + success = check_f_f("log10f", xlog10f, tlfloat_log10, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("log10f", xlog10f, tlfloat_log10, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_f_f("log10f", xlog10f, tlfloat_log10, + 0.0001, 10000, 1.1, 1.0, false) && success; + success = check_f_f("log10f", xlog10f, tlfloat_log10, + v.data(), v.size(), 1.0, false) && success; + + cout << "log2f" << endl; + success = check_f_f("log2f", xlog2f, tlfloat_log2, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("log2f", xlog2f, tlfloat_log2, + 0.0001, 10, 0.001, 1.0, false) && success; + success = check_f_f("log2f", xlog2f, tlfloat_log2, + 0.0001, 10000, 1.1, 1.0, false) && success; + success = check_f_f("log2f", xlog2f, tlfloat_log2, + v.data(), v.size(), 1.0, false) && success; + + cout << "log2f_u35" << endl; + success = check_f_f("log2f_u35", xlog2f_u35, tlfloat_log2, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("log2f_u35", xlog2f_u35, tlfloat_log2, + 0.0001, 10, 0.001, 3.5, false) && success; + success = check_f_f("log2f_u35", xlog2f_u35, tlfloat_log2, + 0.0001, 10000, 1.1, 3.5, false) && success; + success = check_f_f("log2f_u35", xlog2f_u35, tlfloat_log2, + v.data(), v.size(), 3.5, false) && success; + + static 
const float af2[] = { + +0.0, -0.0, +1, -1, +1e+10, -1e+10, FLT_MIN, -FLT_MIN, + INFINITY, -INFINITY, NAN, nextafterf(-1, -2), -2 }; + + cout << "log1pf" << endl; + success = check_f_f("log1pf", xlog1pf, tlfloat_log1p, + af2, sizeof(af2)/sizeof(af2[0]), 1.0, true) && success; + success = check_f_f("log1pf", xlog1pf, tlfloat_log1p, + 0.0001, 10, 0.001, 1.0, false) && success; + } + + cout << "expf" << endl; + success = check_f_f("expf", xexpf, tlfloat_exp, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("expf", xexpf, tlfloat_exp, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("expf", xexpf, tlfloat_exp, + -1000, 1000, 1.1, 1.0, false) && success; + + cout << "exp2f" << endl; + success = check_f_f("exp2f", xexp2f, tlfloat_exp2, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("exp2f", xexp2f, tlfloat_exp2, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("exp2f", xexp2f, tlfloat_exp2, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "exp10" << endl; + success = check_f_f("exp10", xexp10f, tlfloat_exp10, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("exp10", xexp10f, tlfloat_exp10, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("exp10", xexp10f, tlfloat_exp10, + -300, 300, 0.1, 1.0, false) && success; + + cout << "exp2f_u35" << endl; + success = check_f_f("exp2f_u35", xexp2f_u35, tlfloat_exp2, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("exp2f_u35", xexp2f_u35, tlfloat_exp2, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("exp2f_u35", xexp2f_u35, tlfloat_exp2, + -1000, 1000, 0.2, 3.5, false) && success; + + cout << "exp10f_u35" << endl; + success = check_f_f("exp10f_u35", xexp10f_u35, tlfloat_exp10, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("exp10f_u35", xexp10f_u35, tlfloat_exp10, + -10, 10, 0.002, 3.5, false) && success; + success = 
check_f_f("exp10f_u35", xexp10f_u35, tlfloat_exp10, + -300, 300, 0.1, 3.5, false) && success; + + { + vector v; + for(double d = 0;d < 300 && success;d += 0.21) { + v.push_back(+pow(10, -d)); + v.push_back(-pow(10, -d)); + } + + cout << "expm1f" << endl; + success = check_f_f("expm1f", xexpm1f, tlfloat_expm1, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("expm1f", xexpm1f, tlfloat_expm1, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("expm1f", xexpm1f, tlfloat_expm1, + -1000, 1000, 0.21, 1.0, false) && success; + success = check_f_f("expm1f", xexpm1f, tlfloat_expm1, + v.data(), v.size(), 1.0, false) && success; + } + + cout << "powf" << endl; + success = check_f_f_f("powf", xpowf, tlfloat_pow, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 1.0, true) && success; + if (!enableFlushToZero) { + success = check_f_f_f("powf", xpowf, tlfloat_pow, + -100, 100, 0.6, 0.1, 100, 0.6, 1.0, false) && success; + vector v, w; + for(double y = -1000;y < 1000;y += 0.1) v.push_back(y); + w.push_back(2.1); + success = check_f_f_f("powf", xpowf, tlfloat_pow, + w.data(), w.size(), v.data(), v.size(), 1.0, false) && success; + } else { + success = check_f_f_f("powf", xpowf, tlfloat_pow, + -100, 10, 0.06, 0.1, 10, 0.06, 1.0, false) && success; + } + + cout << "fastpowf_u3500" << endl; + success = check_f_f_f("fastpowf_u3500", xfastpowf_u3500, tlfloat_pow, + 0.1, 25, 0.251, -25, 25, 0.121, 350.0, false) && success; + + { + vector v; + for(int i = -1000;i <= 1000 && success;i+=10) v.push_back(pow(2.1, i)); + +#ifndef DETERMINISTIC + cout << "sqrtf" << endl; + success = check_f_f("sqrtf", xsqrtf, tlfloat_sqrt, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("sqrtf", xsqrtf, tlfloat_sqrt, + -10000, 10000, 2.1, 1.0, false) && success; + success = check_f_f("sqrtf", xsqrtf, tlfloat_sqrt, + v.data(), v.size(), 1.0, false) && success; + + cout << "sqrtf_u05" << endl; + success = check_f_f("sqrtf", 
xsqrtf_u05, tlfloat_sqrt, + af, sizeof(af)/sizeof(af[0]), 0.506, true) && success; + success = check_f_f("sqrtf", xsqrtf_u05, tlfloat_sqrt, + -10000, 10000, 2.1, 0.506, false) && success; + success = check_f_f("sqrtf", xsqrtf_u05, tlfloat_sqrt, + v.data(), v.size(), 0.506, false) && success; + + cout << "sqrtf_u35" << endl; + success = check_f_f("sqrtf", xsqrtf_u35, tlfloat_sqrt, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("sqrtf", xsqrtf_u35, tlfloat_sqrt, + -10000, 10000, 2.1, 3.5, false) && success; + success = check_f_f("sqrtf", xsqrtf_u35, tlfloat_sqrt, + v.data(), v.size(), 3.5, false) && success; +#endif + + cout << "cbrtf" << endl; + success = check_f_f("cbrtf", xcbrtf, tlfloat_cbrt, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("cbrtf", xcbrtf, tlfloat_cbrt, + -10000, 10000, 2.1, 3.5, false) && success; + success = check_f_f("cbrtf", xcbrtf, tlfloat_cbrt, + v.data(), v.size(), 3.5, false) && success; + + cout << "cbrtf_u1" << endl; + success = check_f_f("cbrtf_u1", xcbrtf_u1, tlfloat_cbrt, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("cbrtf_u1", xcbrtf_u1, tlfloat_cbrt, + -10000, 10000, 2.1, 1.0, false) && success; + success = check_f_f("cbrtf_u1", xcbrtf_u1, tlfloat_cbrt, + v.data(), v.size(), 1.0, false) && success; + } + + if (!enableFlushToZero) { + cout << "hypotf_u35" << endl; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 3.5, true) && success; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 3.5, false) && success; + + cout << "hypotf_u05" << endl; + success = check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 0.5, true) && success; + success = 
check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + } else { + static const float af2[] = { + +0.0, -0.0, +1, -1, +1e+10, -1e+10, INFINITY, -INFINITY, NAN + }; + + cout << "hypotf_u35" << endl; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + af2, sizeof(af2)/sizeof(af2[0]), af2, sizeof(af2)/sizeof(af2[0]), + 3.5, true) && success; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_f_f_f("hypotf_u35", xhypotf_u35, hypot, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 3.5, false) && success; + + cout << "hypotf_u05" << endl; + success = check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + af2, sizeof(af2)/sizeof(af2[0]), af2, sizeof(af2)/sizeof(af2[0]), + 0.5, true) && success; + success = check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("hypotf_u05", xhypotf_u05, hypot, + -1e+10, 1e+10, 1.51e+8, -1e+10, 1e+10, 1.51e+8, 0.5, false) && success; + } + + cout << "asinf" << endl; + success = check_f_f("asinf", xasinf, tlfloat_asin, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("asinf", xasinf, tlfloat_asin, + -1, 1, 0.0002, 3.5, false) && success; + + cout << "asinf_u1" << endl; + success = check_f_f("asinf_u1", xasinf_u1, tlfloat_asin, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("asinf_u1", xasinf_u1, tlfloat_asin, + -1, 1, 0.0002, 3.5, false) && success; + + cout << "acosf" << endl; + success = check_f_f("acosf", xacosf, tlfloat_acos, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("acosf", xacosf, tlfloat_acos, + -1, 1, 0.0002, 3.5, false) && success; + + cout << "acosf_u1" << endl; + success = check_f_f("acosf_u1", xacosf_u1, 
tlfloat_acos, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("acosf_u1", xacosf_u1, tlfloat_acos, + -1, 1, 0.0002, 1.0, false) && success; + + cout << "atanf" << endl; + success = check_f_f("atanf", xatanf, tlfloat_atan, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("atanf", xatanf, tlfloat_atan, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("atanf", xatanf, tlfloat_atan, + -10000, 10000, 2.1, 3.5, false) && success; + + cout << "atanf_u1" << endl; + success = check_f_f("atanf_u1", xatanf_u1, tlfloat_atan, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("atanf_u1", xatanf_u1, tlfloat_atan, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("atanf_u1", xatanf_u1, tlfloat_atan, + -10000, 10000, 2.1, 1.0, false) && success; + + if (!enableFlushToZero) { + cout << "atan2f" << endl; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 3.5, true) && success; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + -100, 100, 1.51, -100, 100, 1.51, 3.5, false) && success; + + cout << "atan2f_u1" << endl; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 1.0, true) && success; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + -10, 10, 0.15, -10, 10, 0.15, 1.0, false) && success; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + -100, 100, 1.51, -100, 100, 1.51, 1.0, false) && success; + } else { + static const float af2[] = { + NAN, -INFINITY, -SLEEF_FLT_DENORM_MIN, -0.0, +0.0, SLEEF_FLT_DENORM_MIN, +INFINITY, + -M_PI*2, -M_PI, -M_PI/2, -M_PI/4, M_PI/4, M_PI/2, M_PI, M_PI*2, + -1e+10, -100001, -100000.5, -100000, -7.0, -5.0, -4.0, -3.0, -2.5, -2.0, -1.5, -1.0, 
-0.999, -0.5, + +0.5, +0.999, +1.0, +1.5, +2.0, +2.5, +3.0, +4.0, +5.0, +7.0, +100000, +100000.5, +100001, +1e+10, + nextafterf(-1, -2), nextafterf(+1, +2) + }; + + cout << "atan2f" << endl; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + af2, sizeof(af2)/sizeof(af2[0]), af2, sizeof(af2)/sizeof(af2[0]), + 3.5, true) && success; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + -10, 10, 0.15, -10, 10, 0.15, 3.5, false) && success; + success = check_f_f_f("atan2f", xatan2f, tlfloat_atan2, + -100, 100, 1.51, -100, 100, 1.51, 3.5, false) && success; + + cout << "atan2f_u1" << endl; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + af2, sizeof(af2)/sizeof(af2[0]), af2, sizeof(af2)/sizeof(af2[0]), + 1.0, true) && success; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + -10, 10, 0.15, -10, 10, 0.15, 1.0, false) && success; + success = check_f_f_f("atan2f_u1", xatan2f_u1, tlfloat_atan2, + -100, 100, 1.51, -100, 100, 1.51, 1.0, false) && success; + } + + cout << "sinhf" << endl; + success = check_f_f("sinhf", xsinhf, tlfloat_sinh, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("sinhf", xsinhf, tlfloat_sinh, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("sinhf", xsinhf, tlfloat_sinh, + -88, 88, 0.2, 1.0, false) && success; + + cout << "coshf" << endl; + success = check_f_f("coshf", xcoshf, tlfloat_cosh, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("coshf", xcoshf, tlfloat_cosh, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("coshf", xcoshf, tlfloat_cosh, + -88, 88, 0.2, 1.0, false) && success; + + cout << "tanhf" << endl; + success = check_f_f("tanhf", xtanhf, tlfloat_tanh, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("tanhf", xtanhf, tlfloat_tanh, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("tanhf", xtanhf, tlfloat_tanh, + -1000, 1000, 0.2, 1.0, false) && success; + + cout 
<< "sinhf_u35" << endl; + if (!enableFlushToZero) { + success = check_f_f("sinhf_u35", xsinhf_u35, tlfloat_sinh, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + + success = check_f_f("sinhf_u35", xsinhf_u35, tlfloat_sinh, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("sinhf_u35", xsinhf_u35, tlfloat_sinh, + -88, 88, 0.2, 3.5, false) && success; + } else { + static const float af2[] = { + +0.0, -0.0, +1, -1, +1e+7, -1e+7, FLT_MAX, -FLT_MAX, INFINITY, -INFINITY, NAN + }; + success = check_f_f("sinhf_u35", xsinhf_u35, tlfloat_sinh, + af, sizeof(af2)/sizeof(af2[0]), 3.5, true) && success; + + success = check_f_f("sinhf_u35", xsinhf_u35, tlfloat_sinh, + -10, 10, 0.002, 3.5, false) && success; + } + + cout << "coshf_u35" << endl; + success = check_f_f("coshf_u35", xcoshf_u35, tlfloat_cosh, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("coshf_u35", xcoshf_u35, tlfloat_cosh, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("coshf_u35", xcoshf_u35, tlfloat_cosh, + -88, 88, 0.2, 3.5, false) && success; + + cout << "tanhf_u35" << endl; + success = check_f_f("tanhf_u35", xtanhf_u35, tlfloat_tanh, + af, sizeof(af)/sizeof(af[0]), 3.5, true) && success; + success = check_f_f("tanhf_u35", xtanhf_u35, tlfloat_tanh, + -10, 10, 0.002, 3.5, false) && success; + success = check_f_f("tanhf_u35", xtanhf_u35, tlfloat_tanh, + -1000, 1000, 0.2, 3.5, false) && success; + + { + static const float af2[] = { + +0.0, -0.0, +1, -1, +1e+10, -1e+10, DBL_MIN, -DBL_MIN, INFINITY, -INFINITY, NAN + }; + + cout << "asinhf" << endl; + success = check_f_f("asinhf", xasinhf, tlfloat_asinh, + af2, sizeof(af2)/sizeof(af2[0]), 1.0, true) && success; + success = check_f_f("asinhf", xasinhf, tlfloat_asinh, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("asinhf", xasinhf, tlfloat_asinh, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "acoshf" << endl; + success = check_f_f("acoshf", xacoshf, tlfloat_acosh, + af2, 
sizeof(af2)/sizeof(af2[0]), 1.0, true) && success; + success = check_f_f("acoshf", xacoshf, tlfloat_acosh, + 1, 10, 0.002, 1.0, false) && success; + success = check_f_f("acoshf", xacoshf, tlfloat_acosh, + 1, 1000, 0.2, 1.0, false) && success; + } + + cout << "atanhf" << endl; + success = check_f_f("atanhf", xatanhf, tlfloat_atanh, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("atanhf", xatanhf, tlfloat_atanh, + -10, 10, 0.002, 1.0, false) && success; + success = check_f_f("atanhf", xatanhf, tlfloat_atanh, + -1000, 1000, 0.2, 1.0, false) && success; + + cout << "copysignf" << endl; + success = check_f_f_f("copysignf", xcopysignf, tlfloat_copysign, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 0.0, true) && success; + success = check_f_f_f("copysignf", xcopysignf, tlfloat_copysign, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_f_f_f("copysignf", xcopysignf, tlfloat_copysign, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.0, false) && success; + + cout << "fmaxf" << endl; + success = check_f_f_f("fmaxf", xfmaxf, tlfloat_fmax, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 0.0, true) && success; + success = check_f_f_f("fmaxf", xfmaxf, tlfloat_fmax, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_f_f_f("fmaxf", xfmaxf, tlfloat_fmax, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.0, false) && success; + + cout << "fminf" << endl; + success = check_f_f_f("fminf", xfminf, tlfloat_fmin, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 0.0, true) && success; + success = check_f_f_f("fminf", xfminf, tlfloat_fmin, + -10, 10, 0.15, -10, 10, 0.15, 0.0, false) && success; + success = check_f_f_f("fminf", xfminf, tlfloat_fmin, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.0, false) && success; + + cout << "fdimf" << endl; + success = check_f_f_f("fdimf", xfdimf, tlfloat_fdim, + af, sizeof(af)/sizeof(af[0]), af, sizeof(af)/sizeof(af[0]), + 0.5, 
true) && success; + success = check_f_f_f("fdimf", xfdimf, tlfloat_fdim, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("fdimf", xfdimf, tlfloat_fdim, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.5, false) && success; + + if (!enableFlushToZero) { + cout << "fmodf" << endl; + for(int i=0;i 1e+300) continue; + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + &af[i], 1, &af[j], 1, 0.5, true) && success; + } + } + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.5, false) && success; + + cout << "remainderf" << endl; + for(int i=0;i 1e+300) continue; + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + &af[i], 1, &af[j], 1, 0.5, true) && success; + } + } + { + float af3x = 11114942644092928.0, af3y = 224544296009728.0; + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + &af3x, 1, &af3y, 1, 0.5, false) && success; + } + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.5, false) && success; + } else { + float xa[] = { +0.0, -0.0, +1, -1, +1e+30, -1e+30, FLT_MAX, -FLT_MAX, +INFINITY, -INFINITY, NAN }; + float ya[] = { +0.0, -0.0, +1, -1, +INFINITY, -INFINITY, NAN }; + + cout << "fmodf" << endl; + for(int i=0;i 1e+300) continue; + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + &xa[i], 1, &ya[j], 1, 0.5, true) && success; + } + } + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("fmodf", xfmodf, tlfloat_fmod, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.5, false) && success; + + cout << "remainderf" << endl; + for(int i=0;i 1e+300) continue; + 
success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + &xa[i], 1, &ya[j], 1, 0.5, true) && success; + } + } + { + float af3x = 11114942644092928.0, af3y = 224544296009728.0; + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + &af3x, 1, &af3y, 1, 0.5, false) && success; + } + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + -10, 10, 0.15, -10, 10, 0.15, 0.5, false) && success; + success = check_f_f_f("remainderf", xremainderf, tlfloat_remainder, + -1e+7, 1e+7, 1.51e+5, -1e+7, 1e+7, 1.51e+5, 0.5, false) && success; + } + + { + vector v; + for(double x = -100.5;x <= 100.5;x+=0.5) { + for(double d = u2f(f2u(x)-3);d <= u2f(f2u(x)+3) && success;d = u2f(f2u(d)+1)) v.push_back(d); + double start = u2f(f2u((double)(INT64_C(1) << 23))-20), end = u2f(f2u((double)(INT64_C(1) << 23))+20); + for(double d = start;d <= end;d = u2f(f2u(d)+1)) { v.push_back(d); v.push_back(-d); } + } + + cout << "truncf" << endl; + success = check_f_f("truncf", xtruncf, tlfloat_trunc, + af, sizeof(af)/sizeof(af[0]), 0.0, true) && success; + success = check_f_f("truncf", xtruncf, tlfloat_trunc, + v.data(), v.size(), 0.0, false) && success; + success = check_f_f("truncf", xtruncf, tlfloat_trunc, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "floorf" << endl; + success = check_f_f("floorf", xfloorf, tlfloat_floor, + af, sizeof(af)/sizeof(af[0]), 0.0, true) && success; + success = check_f_f("floorf", xfloorf, tlfloat_floor, + v.data(), v.size(), 0.0, false) && success; + success = check_f_f("floorf", xfloorf, tlfloat_floor, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "ceilf" << endl; + success = check_f_f("ceilf", xceilf, tlfloat_ceil, + af, sizeof(af)/sizeof(af[0]), 0.0, true) && success; + success = check_f_f("ceilf", xceilf, tlfloat_ceil, + v.data(), v.size(), 0.0, false) && success; + success = check_f_f("ceilf", xceilf, tlfloat_ceil, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "roundf" << endl; + success 
= check_f_f("roundf", xroundf, tlfloat_round, + af, sizeof(af)/sizeof(af[0]), 0.0, true) && success; + success = check_f_f("roundf", xroundf, tlfloat_round, + v.data(), v.size(), 0.0, false) && success; + success = check_f_f("roundf", xroundf, tlfloat_round, + -10000, 10000, 2.5, 0.0, false) && success; + + cout << "rintf" << endl; + success = check_f_f("rintf", xrintf, tlfloat_rint, + af, sizeof(af)/sizeof(af[0]), 0.0, true) && success; + success = check_f_f("rintf", xrintf, tlfloat_rint, + v.data(), v.size(), 0.0, false) && success; + success = check_f_f("rintf", xrintf, tlfloat_rint, + -10000, 10000, 2.5, 0.0, false) && success; + } + + { + static const float af2[] = { + -4, -3, -2, -1, +0.0, -0.0, +1e+10, -1e+10, INFINITY, -INFINITY, NAN + }; + + cout << "lgammaf_u1" << endl; + success = check_f_f("lgammaf_u1", xlgammaf_u1, tlfloat_lgamma, + af2, sizeof(af2)/sizeof(af2[0]), 1.0, true) && success; + success = check_f_f("lgammaf_u1", xlgammaf_u1, tlfloat_lgamma, + -5000, 5000, 1.1, 1.0, false) && success; + + cout << "tgammaf_u1" << endl; + success = check_f_f("tgammaf_u1", xtgammaf_u1, tlfloat_tgamma, + af2, sizeof(af2)/sizeof(af2[0]), 1.0, true) && success; + success = check_f_f("tgammaf_u1", xtgammaf_u1, tlfloat_tgamma, + -10, 10, 0.002, 1.0, false) && success; + } + + cout << "erff_u1" << endl; + success = check_f_f("erff_u1", xerff_u1, tlfloat_erf, + af, sizeof(af)/sizeof(af[0]), 1.0, true) && success; + success = check_f_f("erff_u1", xerff_u1, tlfloat_erf, + -100, 100, 0.02, 1.0, false) && success; + + cout << "erfcf_u15" << endl; + if (!enableFlushToZero) { + success = check_f_f("erfcf_u15", xerfcf_u15, tlfloat_erfc, + af, sizeof(af)/sizeof(af[0]), 1.5, true) && success; + } else { + static const float af2[] = { + -1, +0.0, -0.0, +1, +1e+10, -1e+10, INFINITY, -INFINITY, NAN + }; + success = check_f_f("erfcf_u15", xerfcf_u15, tlfloat_erfc, + af2, sizeof(af2)/sizeof(af2[0]), 1.5, true) && success; + } + + success = check_f_f("erfcf_u15", xerfcf_u15, 
tlfloat_erfc, + -1, 8, 0.001, 1.5, false) && success; + + // + + if (success) { + cout << "OK" << endl; + } else { + cout << "NG" << endl; + } + + return success ? 0 : -1; +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/testerutil.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/testerutil.h deleted file mode 100644 index 53350746c3c..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm-tester/testerutil.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#define DENORMAL_DBL_MIN (4.9406564584124654418e-324) -#define POSITIVE_INFINITY INFINITY -#define NEGATIVE_INFINITY (-INFINITY) - -#define DENORMAL_FLT_MIN (1.4012984643248170709e-45f) -#define POSITIVE_INFINITYf ((float)INFINITY) -#define NEGATIVE_INFINITYf (-(float)INFINITY) - -#ifndef M_PIf -# define M_PIf ((float)M_PI) -#endif - -extern int enableFlushToZero; -double flushToZero(double y); - -int isnumber(double x); -int isPlusZero(double x); -int isMinusZero(double x); -int xisnan(double x); -double sign(double d); - -int isnumberf(float x); -int isPlusZerof(float x); -int isMinusZerof(float x); -int xisnanf(float x); -float signf(float d); - -int readln(int fd, char *buf, int cnt); - -#define XRAND_MAX (INT64_C(0x100000000) * (double)INT64_C(0x100000000)) - -void xsrand(uint64_t s); -uint64_t xrand(); -void memrand(void *p, int size); - -// The following functions are meant to be inlined - -static double u2d(uint64_t u) { - union { - double f; - uint64_t i; - } tmp; - tmp.i = u; - return tmp.f; -} - -static uint64_t d2u(double d) { - union { - double f; - uint64_t i; - } tmp; - tmp.f = d; - return tmp.i; -} - -static float u2f(uint32_t u) { - union { - float f; - uint32_t i; - } tmp; - tmp.i = u; - return tmp.f; -} - 
-static uint32_t f2u(float d) { - union { - float f; - uint32_t i; - } tmp; - tmp.f = d; - return tmp.i; -} - -static int startsWith(char *str, char *prefix) { - while(*prefix != '\0') if (*str++ != *prefix++) return 0; - return *prefix == '\0'; -} - -// - -#ifdef USEMPFR -int cmpDenormdp(double x, mpfr_t fry); -double countULPdp(double d, mpfr_t c); -double countULP2dp(double d, mpfr_t c); - -int cmpDenormsp(float x, mpfr_t fry); -double countULPsp(float d, mpfr_t c); -double countULP2sp(float d, mpfr_t c); - -#if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0) -void mpfr_sinpi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); -void mpfr_cospi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); -#endif -void mpfr_lgamma_nosign(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd); -#endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/CMakeLists.txt index aebc3d34edc..28e61c49a50 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/CMakeLists.txt @@ -183,6 +183,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}") # -------------------------------------------------------------------- # sleef.h @@ -448,7 +449,7 @@ foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS}) sleef_concat_files( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/include/alias_${SIMDLC}.h SOURCES ${CMAKE_CURRENT_BINARY_DIR}/alias_${SIMD}_sp.h.tmp ${CMAKE_CURRENT_BINARY_DIR}/alias_${SIMD}_dp.h.tmp - ) + ) add_custom_target(alias_${SIMDLC}.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/include/alias_${SIMDLC}.h) add_dependencies(${OBJECT_${SIMD}} alias_${SIMDLC}.h_generated) add_dependencies(${OBJECTDET_${SIMD}} alias_${SIMDLC}.h_generated) @@ -504,6 +505,8 @@ 
if(SLEEF_BUILD_INLINE_HEADERS) file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/include/) set(INLINE_HEADER_FILES_GENERATED "") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/sleefinline_header.h.org.in ${CMAKE_CURRENT_BINARY_DIR}/sleefinline_header.h.org) + if (SED_COMMAND) foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS} CUDA) if(COMPILER_SUPPORTS_${SIMD} OR ${SIMD} STREQUAL "CUDA") @@ -521,10 +524,10 @@ if(SLEEF_BUILD_INLINE_HEADERS) set(INLINE_HEADER_ORG ${CMAKE_CURRENT_SOURCE_DIR}/sleefinline_cuda_header.h.org) # Remove redundant __device__ set(TARGET_REPLACEMENTS -e "s/__device__ __device__/__device__/g" -e "s/__device__ __device__/__device__/g") - set(TARGET_ADDSUFFIX_KEYWORDS double2 double3 float2) + set(TARGET_ADDSUFFIX_KEYWORDS) else() set(TARGET_ADDSUFFIX_KEYWORDS Sleef_rempitabdp Sleef_rempitabsp) - set(INLINE_HEADER_ORG ${CMAKE_CURRENT_SOURCE_DIR}/sleefinline_header.h.org) + set(INLINE_HEADER_ORG ${CMAKE_CURRENT_BINARY_DIR}/sleefinline_header.h.org) endif() set(INLINE_HEADER_FILE ${PROJECT_BINARY_DIR}/include/sleefinline_${SIMDLC}.h) @@ -833,12 +836,6 @@ if (SLEEF_ARCH_S390X) add_dependencies(disps390x_128_obj disps390x_128.c_generated renamedsp128.h_generated ${TARGET_HEADERS}) target_sources(${TARGET_LIBSLEEF} PRIVATE $) - if(COMPILER_SUPPORTS_VXE2) - add_library(tryvxe2_obj OBJECT tryvxe2.c) - target_compile_options(tryvxe2_obj PRIVATE ${FLAGS_ENABLE_VXE2}) - set_target_properties(tryvxe2_obj PROPERTIES ${COMMON_TARGET_PROPERTIES}) - target_sources(${TARGET_LIBSLEEF} PRIVATE $) - endif() endif(SLEEF_ARCH_S390X) # -------------------------------------------------------------------- diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispatcher.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispatcher.h index 6a81892e26e..ab972e48290 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispatcher.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispatcher.h @@ -1,4 
+1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -68,105 +68,106 @@ static int cpuSupportsExt(void (*tryExt)()) { */ #define DISPATCH_vf_vf(fptype, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST VECTOR_CC fptype (*pfn)(fptype arg0); \ - static CONST VECTOR_CC fptype dfn(fptype arg0) { \ + static CONST VECTOR_CC fptype (*pfn)(fptype arg0); \ + static CONST VECTOR_CC fptype dfn(fptype arg0) { \ fptype CONST VECTOR_CC (*p)(fptype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ - static CONST VECTOR_CC fptype (*pfn)(fptype arg0) = dfn; \ + } \ + static CONST VECTOR_CC fptype (*pfn)(fptype arg0) = dfn; \ EXPORT CONST VECTOR_CC fptype funcName(fptype arg0) { return (*pfn)(arg0); } \ - VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ + VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ VECALIAS_vf_vf(fptype, funcNameS2, funcName, veclen) #define DISPATCH_vf_vf_vf(fptype, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST VECTOR_CC fptype (*pfn)(fptype arg0, fptype arg1); \ - static CONST VECTOR_CC fptype dfn(fptype arg0, fptype arg1) { \ - fptype CONST VECTOR_CC (*p)(fptype arg0, fptype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ + static CONST VECTOR_CC fptype (*pfn)(fptype arg0, fptype arg1); \ + static CONST VECTOR_CC fptype dfn(fptype arg0, fptype arg1) { \ + fptype CONST VECTOR_CC (*p)(fptype arg0, fptype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ + return 
(*pfn)(arg0, arg1); \ + } \ static CONST VECTOR_CC fptype (*pfn)(fptype arg0, fptype arg1) = dfn; \ EXPORT CONST VECTOR_CC fptype funcName(fptype arg0, fptype arg1) { return (*pfn)(arg0, arg1); } \ VECALIAS_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ VECALIAS_vf_vf_vf(fptype, funcNameS2, funcName, veclen) #define DISPATCH_vf2_vf(fptype, fptype2, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST VECTOR_CC fptype2 (*pfn)(fptype arg0); \ - static CONST VECTOR_CC fptype2 dfn(fptype arg0) { \ - fptype2 CONST VECTOR_CC (*p)(fptype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ + static CONST VECTOR_CC fptype2 (*pfn)(fptype arg0); \ + static CONST VECTOR_CC fptype2 dfn(fptype arg0) { \ + fptype2 CONST VECTOR_CC (*p)(fptype arg0) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ - static CONST VECTOR_CC fptype2 (*pfn)(fptype arg0) = dfn; \ + } \ + static CONST VECTOR_CC fptype2 (*pfn)(fptype arg0) = dfn; \ EXPORT CONST VECTOR_CC fptype2 funcName(fptype arg0) { return (*pfn)(arg0); } #define DISPATCH_vf_vf_vi(fptype, itype, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ static CONST VECTOR_CC fptype (*pfn)(fptype arg0, itype arg1); \ - static CONST VECTOR_CC fptype dfn(fptype arg0, itype arg1) { \ - fptype CONST VECTOR_CC (*p)(fptype arg0, itype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ - static CONST VECTOR_CC fptype (*pfn)(fptype arg0, itype arg1) = dfn; \ + static CONST VECTOR_CC fptype dfn(fptype arg0, itype arg1) { \ + fptype CONST VECTOR_CC (*p)(fptype arg0, itype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ + static CONST VECTOR_CC fptype (*pfn)(fptype arg0, itype arg1) = dfn; \ EXPORT CONST VECTOR_CC fptype 
funcName(fptype arg0, itype arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_vi_vf(fptype, itype, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST VECTOR_CC itype (*pfn)(fptype arg0); \ - static CONST VECTOR_CC itype dfn(fptype arg0) { \ - itype CONST VECTOR_CC (*p)(fptype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ + static CONST VECTOR_CC itype (*pfn)(fptype arg0); \ + static CONST VECTOR_CC itype dfn(fptype arg0) { \ + itype CONST VECTOR_CC (*p)(fptype arg0) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ - static CONST VECTOR_CC itype (*pfn)(fptype arg0) = dfn; \ + } \ + static CONST VECTOR_CC itype (*pfn)(fptype arg0) = dfn; \ EXPORT CONST VECTOR_CC itype funcName(fptype arg0) { return (*pfn)(arg0); } #define DISPATCH_vf_vf_vf_vf(fptype, veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ static CONST VECTOR_CC fptype (*pfn)(fptype arg0, fptype arg1, fptype arg2); \ static CONST VECTOR_CC fptype dfn(fptype arg0, fptype arg1, fptype arg2) { \ fptype CONST VECTOR_CC (*p)(fptype arg0, fptype arg1, fptype arg2) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ - return (*pfn)(arg0, arg1, arg2); \ - } \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ + return (*pfn)(arg0, arg1, arg2); \ + } \ static CONST VECTOR_CC fptype (*pfn)(fptype arg0, fptype arg1, fptype arg2) = dfn; \ EXPORT CONST VECTOR_CC fptype funcName(fptype arg0, fptype arg1, fptype arg2) { return (*pfn)(arg0, arg1, arg2); } \ - VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ + VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ VECALIAS_vf_vf_vf_vf(fptype, funcNameS2, funcName, veclen) #define DISPATCH_i_i(veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST int (*pfn)(int arg0); \ - 
static CONST int dfn(int arg0) { \ - int CONST (*p)(int) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ + static CONST int (*pfn)(int arg0); \ + static CONST int dfn(int arg0) { \ + int CONST (*p)(int) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ - static CONST int (*pfn)(int arg0) = dfn; \ + } \ + static CONST int (*pfn)(int arg0) = dfn; \ EXPORT CONST int funcName(int arg0) { return (*pfn)(arg0); } #define DISPATCH_p_i(veclen, funcNameS, funcNameS2, funcName, pfn, dfn, funcExt0, funcExt1, funcExt2) \ - static CONST void *(*pfn)(int arg0); \ - static CONST void *dfn(int arg0) { \ - CONST void *(*p)(int) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - SUBST_IF_EXT2(funcExt2); \ - pfn = p; \ + static CONST void *(*pfn)(int arg0); \ + static CONST void *dfn(int arg0) { \ + CONST void *(*p)(int) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + SUBST_IF_EXT2(funcExt2); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ - static CONST void *(*pfn)(int arg0) = dfn; \ + } \ + static CONST void *(*pfn)(int arg0) = dfn; \ EXPORT CONST void *funcName(int arg0) { return (*pfn)(arg0); } // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispavx.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispavx.c.org index 30dda0c9f19..4eeeaf8a9be 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispavx.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispavx.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -56,13 +56,13 @@ static int cpuSupportsFMA4() { #endif #ifdef ENABLE_ALIAS -#define VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVcN ## veclen ## v_ ## funcNameS(fptype) __attribute__((alias(stringify(funcName)))); \ EXPORT CONST VECTOR_CC fptype _ZGVdN ## veclen ## v_ ## funcNameS(fptype) __attribute__((alias(stringify(funcName)))); -#define VECALIAS_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVcN ## veclen ## vv_ ## funcNameS(fptype, fptype) __attribute__((alias(stringify(funcName)))); \ EXPORT CONST VECTOR_CC fptype _ZGVdN ## veclen ## vv_ ## funcNameS(fptype, fptype) __attribute__((alias(stringify(funcName)))); -#define VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVcN ## veclen ## vvv_ ## funcNameS(fptype, fptype, fptype) __attribute__((alias(stringify(funcName)))); \ EXPORT CONST VECTOR_CC fptype _ZGVdN ## veclen ## vvv_ ## funcNameS(fptype, fptype, fptype) __attribute__((alias(stringify(funcName)))); #endif @@ -70,3 +70,4 @@ static int cpuSupportsFMA4() { #include "dispatcher.h" // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disppower_128.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disppower_128.c.org index 334b2b88eef..eda4ea2a2df 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disppower_128.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disppower_128.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. 
// Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -28,3 +28,4 @@ void sleef_tryVSX3(); #define SUBST_IF_EXT2(funcExt2) // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disps390x_128.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disps390x_128.c.org index ae912145abb..62d2c8e0850 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disps390x_128.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/disps390x_128.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -7,6 +7,7 @@ #include #include #include +#include #include "misc.h" @@ -15,9 +16,15 @@ #include "dispatcher.h" +static int cpuSupportsVXE2() { + static int ret = -1; + if (ret == -1) + ret = !!(getauxval(AT_HWCAP) & HWCAP_S390_VXRS_EXT2); + return ret; +} + #ifdef ENABLE_VXE2 -void sleef_tryVXE2(); -#define SUBST_IF_EXT1(funcvxe2) if (cpuSupportsExt(sleef_tryVXE2)) p = funcvxe2; +#define SUBST_IF_EXT1(funcvxe2) if (cpuSupportsVXE2()) p = funcvxe2; #else #define SUBST_IF_EXT1(funcvxe2) #endif @@ -25,3 +32,4 @@ void sleef_tryVXE2(); #define SUBST_IF_EXT2(funcExt2) // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispscalar.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispscalar.c.org index a18f030ad7d..4102214ea27 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispscalar.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispscalar.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. 
+// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -22,3 +22,4 @@ static void tryFMA() { sleef_cpuid_SCALFMA = Sleef_sind1_u10purecfma(sleef_cpuid #define SUBST_IF_EXT2(funcExt2) // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispsse.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispsse.c.org index 1436e1030cf..aaa99709785 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispsse.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/dispsse.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -52,14 +52,15 @@ static int cpuSupportsFMA() { #endif #ifdef ENABLE_ALIAS -#define VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVbN ## veclen ## v_ ## funcNameS(fptype) __attribute__((alias(stringify(funcName)))); -#define VECALIAS_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVbN ## veclen ## vv_ ## funcNameS(fptype, fptype) __attribute__((alias(stringify(funcName)))); -#define VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ +#define VECALIAS_vf_vf_vf_vf(fptype, funcNameS, funcName, veclen) \ EXPORT CONST VECTOR_CC fptype _ZGVbN ## veclen ## vvv_ ## funcNameS(fptype, fptype, fptype) __attribute__((alias(stringify(funcName)))); #endif #include "dispatcher.h" // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/funcproto.h 
b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/funcproto.h index 1ab66b3b905..c0041fb0f21 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/funcproto.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/funcproto.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkalias.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkalias.c index 8a06c791395..591c435d8b8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkalias.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkalias.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkdisp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkdisp.c index 7579753cfd9..fcdedb9c41e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkdisp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkdisp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkmasked_gnuabi.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkmasked_gnuabi.c index fae173c344d..43829cee0d6 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkmasked_gnuabi.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkmasked_gnuabi.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename.c index 8b326436f67..12f146f4651 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -615,3 +615,4 @@ int main(int argc, char **argv) { exit(0); } + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename_gnuabi.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename_gnuabi.c index ed7ab9164dc..26a704888e8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename_gnuabi.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/mkrename_gnuabi.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. 
+// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/norename.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/norename.h index e8b3ae6babc..e2902a3ebcd 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/norename.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/norename.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rempitab.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rempitab.c index b174bc7bdc1..33c3e059fd7 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rempitab.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rempitab.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rename.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rename.h index 8810742b1a5..16fefc5d9f2 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rename.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/rename.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. 
// Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefdp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefdp.c index 0ece9233c16..ae46c577b19 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefdp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefdp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -1490,7 +1490,7 @@ EXPORT CONST double xexp(double d) { u = s * s * u + s + 1; u = ldexp2k(u, q); - if (d > 709.78271114955742909217217426) u = SLEEF_INFINITY; + if (d > LOG_DBL_MAX) u = SLEEF_INFINITY; if (d < -1000) u = 0; return u; @@ -1641,7 +1641,7 @@ EXPORT CONST double xpow(double x, double y) { Sleef_double2 d = ddmul_d2_d2_d(logk(fabsk(x)), y); double result = expk(d); - result = (d.x > 709.78271114955742909217217426 || xisnan(result)) ? SLEEF_INFINITY : result; + result = (d.x > LOG_DBL_MAX || xisnan(result)) ? SLEEF_INFINITY : result; result *= (x > 0 ? 1 : (yisint ? (yisodd ? 
-1 : 1) : SLEEF_NAN)); double efx = mulsign(fabsk(x) - 1, y); @@ -2145,6 +2145,8 @@ EXPORT CONST double xlog1p(double d) { double m, t, x2; int e; + if (d > LOG1P_BOUND) return xlog_u1(d); // ~log(d) + double dp1 = d + 1; int o = dp1 < DBL_MIN; @@ -2176,7 +2178,6 @@ EXPORT CONST double xlog1p(double d) { double r = s.x + s.y; - if (d > 1e+307) r = SLEEF_INFINITY; if (d < -1 || xisnan(d)) r = SLEEF_NAN; if (d == -1) r = -SLEEF_INFINITY; if (xisnegzero(d)) r = -0.0; diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_cuda_header.h.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_cuda_header.h.org index c129f4ec297..9a25c9f16fa 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_cuda_header.h.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_cuda_header.h.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org.in similarity index 99% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org rename to src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org.in index a2cb471e7f6..ad452fd7ea0 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefinline_header.h.org.in @@ -1,9 +1,12 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) // This file is generated by SLEEF SLEEF_VERSION_SLEEF +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + #ifndef SLEEF_ALWAYS_INLINE #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) #define SLEEF_ALWAYS_INLINE inline __attribute__((always_inline)) @@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = { 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, + 0, 0, 0, 0, }; static const float Sleef_rempitabsp[] = { @@ -1116,5 +1120,6 @@ static const float Sleef_rempitabsp[] = { 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, + 0, 0, 0, 0, }; #endif // #ifndef __SLEEF_REMPITAB__ diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefld.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefld.c index 0d9eff87b08..961ad231de6 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefld.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefld.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleeflibm_header.h.org.in b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleeflibm_header.h.org.in index 66d88db3c70..abf1d0aebdc 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleeflibm_header.h.org.in +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleeflibm_header.h.org.in @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -10,9 +10,17 @@ #define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@ #define SLEEF_VERSION_PATCHLEVEL @SLEEF_VERSION_PATCH@ +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + #include #include +#ifdef __cplusplus +extern "C" +{ +#endif + #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) #define SLEEF_CONST __attribute__((const)) #define SLEEF_INLINE __attribute__((always_inline)) @@ -63,7 +71,7 @@ #include #endif -#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#if (defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) || defined(_AIX) #include typedef __vector double SLEEF_VECTOR_DOUBLE; typedef __vector float SLEEF_VECTOR_FLOAT; @@ -164,17 +172,12 @@ typedef struct { } Sleef_longdouble2; #endif -#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && 
(defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; +#ifdef _AIX +#undef SLEEF_FLOAT128_IS_IEEEQP +#endif #if defined(SLEEF_FLOAT128_IS_IEEEQP) || defined(ENABLEFLOAT128) typedef __float128 Sleef_quad; #define SLEEF_QUAD_C(x) (x ## Q) @@ -195,11 +198,6 @@ typedef union { } Sleef_quad2; #endif -#ifdef __cplusplus -extern "C" -{ -#endif - SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_sin_u35(double); SLEEF_PRAGMA_OMP_SIMD_DP SLEEF_IMPORT SLEEF_CONST double Sleef_cos_u35(double); SLEEF_IMPORT SLEEF_CONST Sleef_double2 Sleef_sincos_u35(double); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefqp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefqp.c index f605c0eb8ad..6dc9d8b6a4f 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefqp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefqp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimddp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimddp.c index 1d64424a0ef..8bf30b49870 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimddp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimddp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -15,7 +15,7 @@ #include "quaddef.h" #include "misc.h" -#ifndef SLEEF_ENABLE_CUDA +#ifndef ENABLE_CUDA extern const double Sleef_rempitabdp[]; #endif @@ -292,7 +292,7 @@ extern const double Sleef_rempitabdp[]; #endif #endif -#ifdef SLEEF_ENABLE_CUDA +#ifdef ENABLE_CUDA #define CONFIG 3 #include "helperpurec_scalar.h" #ifdef DORENAME @@ -2170,7 +2170,8 @@ EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) { u = vldexp2_vd_vd_vi(u, q); - u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), u); + vopmask o = vgt_vo_vd_vd(d, vcast_vd_d(LOG_DBL_MAX)); + u = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), vreinterpret_vm_vd(u))); return u; @@ -2340,13 +2341,13 @@ static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { -#if 1 vopmask yisint = visint_vo_vd(y); vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), yisint); vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y); vdouble result = expk(d); - result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), result); + vopmask o = vgt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(LOG_DBL_MAX)); + result = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_INFINITY), result); result = vmul_vd_vd_vd(result, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)), @@ -2372,9 +2373,6 @@ EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(y, vcast_vd_d(0)), veq_vo_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result); return result; -#else - return expk(ddmul_vd2_vd2_vd(logk(x), y)); -#endif } #endif // #if !defined(DETERMINISTIC) @@ -2995,7 +2993,9 @@ EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) { vdouble r = 
vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); - r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(1e+307)), vcast_vd_d(SLEEF_INFINITY), r); + // Use log(d) if d too large to use core approximation. + vopmask ocore = vle_vo_vd_vd(d, vcast_vd_d(LOG1P_BOUND)); + if(!LIKELY(vtestallones_i_vo64 (ocore))) r = vsel_vd_vo_vd_vd(ocore, r, xlog_u1(d)); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r); @@ -3081,7 +3081,7 @@ EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) { vint ret = vcastu_vi_vm(vreinterpret_vm_vd(x)); ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe)); - ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), visnan_vo_vd(x)), visinf_vo_vd(x)), vcast_vi_i(0), ret); + ret = vsel_vi_vo_vi_vi(vcast_vo32_vo64(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), visnan_vo_vd(x)), visinf_vo_vd(x))), vcast_vi_i(0), ret); return ret; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimdsp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimdsp.c index efe9c0e79c4..e8a9e9b970e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimdsp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsimdsp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -15,7 +15,7 @@ #include "quaddef.h" #include "misc.h" -#ifndef SLEEF_ENABLE_CUDA +#ifndef ENABLE_CUDA extern const float Sleef_rempitabsp[]; #endif @@ -420,7 +420,7 @@ extern const float Sleef_rempitabsp[]; #endif #endif -#ifdef SLEEF_ENABLE_CUDA +#ifdef ENABLE_CUDA #define CONFIG 3 #if !defined(SLEEF_GENHEADER) #include "helperpurec_scalar.h" @@ -853,6 +853,9 @@ EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f))))) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI)))); u = vcast_vf_vi2(q); +#if (defined(ENABLE_PUREC_SCALAR) || defined(ENABLE_PURECFMA_SCALAR) || defined(ENABLE_VXE) || defined(ENABLE_VXENOFMA) || defined(ENABLE_VXE2) || defined(ENABLE_VXE2NOFMA)) && !defined(__clang__) && __GNUC__ == 13 + u = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(q, vcast_vi2_i(0)), vcast_vf_f(0), u); +#endif x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), x); x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x); x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x); @@ -908,6 +911,9 @@ EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI)))); u = vcast_vf_vi2(q); +#if (defined(ENABLE_PUREC_SCALAR) || defined(ENABLE_PURECFMA_SCALAR) || defined(ENABLE_VXE) || defined(ENABLE_VXENOFMA) || defined(ENABLE_VXE2) || defined(ENABLE_VXE2NOFMA)) && !defined(__clang__) && __GNUC__ == 13 + u = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(q, vcast_vi2_i(0)), vcast_vf_f(0), u); +#endif x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d); x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x); x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x); @@ -1770,7 +1776,7 @@ EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) { t = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), 
vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(t))); #if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4) - t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t); + t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.570796326794896557998982), d), t); #endif return t; @@ -2873,7 +2879,9 @@ EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) { vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s)); - r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(1e+38)), vcast_vf_f(SLEEF_INFINITYf), r); + // Use log(d) if d too large to use core approximation. + vopmask ocore = vle_vo_vf_vf(d, vcast_vf_f(LOG1PF_BOUND)); + if(!LIKELY(vtestallones_i_vo32 (ocore))) r = vsel_vf_vo_vf_vf(ocore, r, xlogf_u1(d)); r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r))); r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-SLEEF_INFINITYf), r); r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsp.c index 7d3370c0b02..824b4dea637 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/sleefsp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -53,11 +53,11 @@ static INLINE CONST float fabsfk(float x) { } static INLINE CONST float mulsignf(float x, float y) { - return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & 0x80000000U)); } static INLINE CONST float copysignfk(float x, float y) { - return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); + return intBitsToFloat((floatToRawIntBits(x) & ~0x80000000U) ^ (floatToRawIntBits(y) & 0x80000000U)); } static INLINE CONST float signf(float d) { return mulsignf(1, d); } @@ -1733,6 +1733,8 @@ EXPORT CONST float xlog1pf(float d) { float m, t, x2; int e; + if (d > LOG1PF_BOUND) return xlogf(d); // ~log(d) + float dp1 = d + 1; int o = dp1 < FLT_MIN; @@ -1758,7 +1760,6 @@ EXPORT CONST float xlog1pf(float d) { float r = s.x + s.y; - if (d > 1e+38) r = SLEEF_INFINITYf; if (d < -1) r = SLEEF_NANf; if (d == -1) r = -SLEEF_INFINITYf; if (xisnegzerof(d)) r = -0.0f; @@ -1920,11 +1921,11 @@ EXPORT CONST float xnextafterf(float x, float y) { cxf = x == 0 ? 
mulsignf(0, y) : x; memcpy(&cxi, &cxf, sizeof(cxi)); int c = (cxi < 0) == (y < x); - if (c) cxi = -(cxi ^ (1 << 31)); + if (c) cxi = -(cxi ^ 0x80000000U); if (x != y) cxi--; - if (c) cxi = -(cxi ^ (1 << 31)); + if (c) cxi = -(cxi ^ 0x80000000U); memcpy(&cxf, &cxi, sizeof(cxf)); if (cxf == 0 && x != 0) cxf = mulsignf(0, x); diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/tryvxe2.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/tryvxe2.c deleted file mode 100644 index b4a73a9f863..00000000000 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/libm/tryvxe2.c +++ /dev/null @@ -1,8 +0,0 @@ -#include - -__vector float sleef_cpuid_VXE2; -__vector int sleef_cpuid_VXE1; - -void sleef_tryVXE2() { - sleef_cpuid_VXE2 = vec_float(sleef_cpuid_VXE1); -} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/CMakeLists.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/CMakeLists.txt index f884f1b5b96..e3c06d39e4c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/CMakeLists.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/CMakeLists.txt @@ -13,33 +13,44 @@ if (NOT LIBRT) set(LIBRT "") endif() +if(COMPILER_SUPPORTS_QUADMATH) + set(LIBQUADMATH "-lquadmath") +else() + set(LIBQUADMATH "") +endif() + set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}") +set(CMAKE_CXX_FLAGS "${ORG_CMAKE_CXX_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}") if(COMPILER_SUPPORTS_FLOAT128) list(APPEND COMMON_TARGET_DEFINITIONS ENABLEFLOAT128=1) endif() -# +# Build tester3printf -if(SLEEF_OPENSSL_FOUND) - # Build tester3printf - add_executable(tester3printf tester3printf.c) - add_dependencies(tester3printf sleefquad sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS}) - target_compile_definitions(tester3printf PRIVATE ${COMMON_TARGET_DEFINITIONS}) - set_target_properties(tester3printf 
PROPERTIES C_STANDARD 99) +add_executable(tester3printf tester3printf.c) +add_dependencies(tester3printf sleefquad sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS}) +target_compile_definitions(tester3printf PRIVATE ${COMMON_TARGET_DEFINITIONS}) +set_target_properties(tester3printf PROPERTIES C_STANDARD 99) + +if (SLEEF_OPENSSL_FOUND) target_link_libraries(tester3printf sleefquad ${TARGET_LIBSLEEF} ${SLEEF_OPENSSL_LIBRARIES}) target_include_directories(tester3printf PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR}) - - if (SDE_COMMAND) - add_test(NAME tester3printf COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) - elseif(EMULATOR) - add_test(NAME tester3printf COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) - else() - add_test(NAME tester3printf COMMAND tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) - endif() - set_tests_properties(tester3printf PROPERTIES COST 5.0) +else() + target_link_libraries(tester3printf sleefquad ${TARGET_LIBSLEEF} ${TARGET_PSHA_OBJ}) + target_include_directories(tester3printf PRIVATE ${sleef_SOURCE_DIR}/src/common) + target_compile_definitions(tester3printf PRIVATE SLEEF_USE_INTERNAL_SHA256=1) endif() +if (SDE_COMMAND) + add_test(NAME tester3printf COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) +elseif(EMULATOR) + add_test(NAME tester3printf COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) +else() + add_test(NAME tester3printf COMMAND tester3printf ${sleef_SOURCE_DIR}/src/quad-tester/hash_printf.txt) +endif() +set_tests_properties(tester3printf PROPERTIES COST 5.0) + # function(add_test_iut IUT C) @@ -79,30 +90,57 @@ function(add_test_iut IUT C) endfunction() # Add vector extension `iut`s -set(IUT_SRC qiutsimd.c 
${sleef_SOURCE_DIR}/src/common/main_checkfeature.c qtesterutil.c) +set(IUT_SRC qiutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) macro(test_extension SIMD) if(COMPILER_SUPPORTS_${SIMD}) string(TOLOWER ${SIMD} LCSIMD) - string(CONCAT TARGET_IUT${SIMD} "qiut" ${LCSIMD}) - add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC}) - target_compile_options(${TARGET_IUT${SIMD}} - PRIVATE ${FLAGS_ENABLE_${SIMD}}) - target_compile_definitions(${TARGET_IUT${SIMD}} - PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(${TARGET_IUT${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) + if (SLEEF_ENABLE_TESTER) + string(CONCAT TARGET_IUT${SIMD} "qiut" ${LCSIMD}) - add_dependencies(${TARGET_IUT${SIMD}} sleefquad_headers ${TARGET_HEADERS}) - add_dependencies(${TARGET_IUT${SIMD}} sleefquad ${TARGET_LIBSLEEF}) - set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES C_STANDARD 99) - if (DEFINED COSTOVERRIDE_${SIMD}) - math(EXPR C "${COSTOVERRIDE_${SIMD}} + 1") - add_test_iut(${TARGET_IUT${SIMD}} ${C}) - else() - add_test_iut(${TARGET_IUT${SIMD}} 0.5) - endif() - list(APPEND IUT_LIST ${TARGET_IUT${SIMD}}) + add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC}) + target_compile_options(${TARGET_IUT${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_IUT${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_IUT${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + + add_dependencies(${TARGET_IUT${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_IUT${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + math(EXPR C "${COSTOVERRIDE_${SIMD}} + 1") + add_test_iut(${TARGET_IUT${SIMD}} ${C}) + else() + add_test_iut(${TARGET_IUT${SIMD}} 0.5) + endif() + list(APPEND IUT_LIST ${TARGET_IUT${SIMD}}) + 
endif(SLEEF_ENABLE_TESTER) + + # + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4_${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) # The iut programs whose names begin with "qiuti" are the iut for the # inline version of quad functions. 
@@ -110,40 +148,74 @@ macro(test_extension SIMD) if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND) if (MSVC AND NOT SLEEF_CLANG_ON_WINDOWS) message(STATUS "Quad inline headers are not tested with MSVC") - else() - string(CONCAT IUTINAME "qiuti" ${LCSIMD}) - add_executable(${IUTINAME} ${IUT_SRC}) - target_compile_options(${IUTINAME} - PRIVATE ${FLAGS_ENABLE_${SIMD}}) - target_compile_definitions(${IUTINAME} - PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} - USE_INLINE_HEADER="sleefquadinline_${LCSIMD}.h" - MACRO_ONLY_HEADER="qmacroonly${SIMD}.h" - SIMD_SUFFIX=_${LCSIMD}_sleefq + else(MSVC AND NOT SLEEF_CLANG_ON_WINDOWS) + if (SLEEF_ENABLE_TESTER) + string(CONCAT IUTINAME "qiuti" ${LCSIMD}) + add_executable(${IUTINAME} ${IUT_SRC}) + target_compile_options(${IUTINAME} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${IUTINAME} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} + USE_INLINE_HEADER="sleefquadinline_${LCSIMD}.h" + MACRO_ONLY_HEADER="qmacroonly${SIMD}.h" + SIMD_SUFFIX=_${LCSIMD}_sleefq ) - target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include) - target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT}) - add_dependencies(${IUTINAME} ${TARGET_QINLINE_HEADERS}) - set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99) - if (DEFINED COSTOVERRIDE_${SIMD}) - math(EXPR C "${COSTOVERRIDE_${SIMD}} + 1") - add_test_iut(${IUTINAME} ${C}) - else() - add_test_iut(${IUTINAME} 0.5) - endif() - list(APPEND IUT_LIST ${IUTINAME}) - endif() + target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include) + target_link_libraries(${IUTINAME} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + add_dependencies(${IUTINAME} ${TARGET_QINLINE_HEADERS}) + set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + math(EXPR C "${COSTOVERRIDE_${SIMD}} + 1") + add_test_iut(${IUTINAME} ${C}) + else() + add_test_iut(${IUTINAME} 0.5) + endif() + 
list(APPEND IUT_LIST ${IUTINAME}) + endif(SLEEF_ENABLE_TESTER) + + # + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + string(CONCAT TARGET_TESTER4I_${SIMD} "qtester4i" ${LCSIMD}) + add_executable(${TARGET_TESTER4I_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4I_${SIMD}} + PRIVATE ${FLAGS_ENABLE_${SIMD}}) + target_compile_definitions(${TARGET_TESTER4I_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4I_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES}) + if(CMAKE_C_COMPILER_ID MATCHES "GNU") + target_compile_options(${TARGET_TESTER4I_${SIMD}} PRIVATE "-Wno-unknown-pragmas") + endif() + target_compile_definitions(${TARGET_TESTER4I_${SIMD}} + PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} + USE_INLINE_HEADER="sleefquadinline_${LCSIMD}.h" + MACRO_ONLY_HEADER="qmacroonly${SIMD}.h" + SIMD_SUFFIX=_${LCSIMD}_sleefq + ) + target_include_directories(${TARGET_TESTER4I_${SIMD}} PRIVATE ${PROJECT_BINARY_DIR}/include) + add_dependencies(${TARGET_TESTER4I_${SIMD}} sleefquad sleefquad_headers ${TARGET_QINLINE_HEADERS} ext_tlfloat) + #set_target_properties(${TARGET_TESTER4I_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4I_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4I_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + + endif(MSVC AND NOT SLEEF_CLANG_ON_WINDOWS) endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND) + # + if(LIB_MPFR AND NOT MINGW) # Build qtester2 SIMD string(TOLOWER ${SIMD} SIMDLC) set(T "tester2${SIMDLC}qp") - add_executable(${T} tester2simdqp.c qtesterutil.c) + add_executable(${T} tester2simdqp.c) target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}}) target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS}) set_target_properties(${T} 
PROPERTIES C_STANDARD 99) - target_link_libraries(${T} sleefquad ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP}) + target_link_libraries(${T} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBQUADMATH} ${LIBM}) add_dependencies(${T} sleefquad sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS}) if (MPFR_INCLUDE_DIR) target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR}) @@ -158,63 +230,180 @@ endforeach() # Compile executable 'qiutdspscalar' -add_executable(qiutdspscalar ${IUT_SRC}) -target_compile_definitions(qiutdspscalar PRIVATE ENABLE_DSPSCALAR=1 ${COMMON_TARGET_DEFINITIONS}) -target_link_libraries(qiutdspscalar sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) -set_target_properties(qiutdspscalar PROPERTIES C_STANDARD 99) -add_dependencies(qiutdspscalar sleefquad_headers ${TARGET_HEADERS}) -add_dependencies(qiutdspscalar sleefquad ${TARGET_LIBSLEEF}) -add_test_iut(qiutdspscalar 0.5) -list(APPEND IUT_LIST qiutdspscalar) +if (SLEEF_ENABLE_TESTER) + add_executable(qiutdspscalar ${IUT_SRC}) + target_compile_definitions(qiutdspscalar PRIVATE ENABLE_DSPSCALAR=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(qiutdspscalar sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + set_target_properties(qiutdspscalar PROPERTIES C_STANDARD 99) + add_dependencies(qiutdspscalar sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(qiutdspscalar sleefquad ${TARGET_LIBSLEEF}) + add_test_iut(qiutdspscalar 0.5) + list(APPEND IUT_LIST qiutdspscalar) +endif(SLEEF_ENABLE_TESTER) + +if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # Compile executable 'qtester4dspscalar' + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + set(SIMD "DSPSCALAR") + set(LCSIMD "dspscalar") + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + 
target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() +endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) if (SLEEF_ARCH_X86) - # Compile executable 'qiutdspx2' - add_executable(qiutdspx2 ${IUT_SRC}) - target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_X86=1 ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) - add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) - add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) - add_test_iut(qiutdspx2 0.5) - list(APPEND IUT_LIST qiutdspx2) + if (SLEEF_ENABLE_TESTER) + # Compile executable 'qiutdspx2' + add_executable(qiutdspx2 ${IUT_SRC}) + target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_X86=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) + add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) + add_test_iut(qiutdspx2 0.5) + list(APPEND IUT_LIST qiutdspx2) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # Compile 
executable 'qtester4dspx2' + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + set(SIMD "DSPX2") + set(LCSIMD "dspx2") + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_DSPX2_X86=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif() if (SLEEF_ARCH_AARCH64) - # Compile executable 'qiutdspx2' - add_executable(qiutdspx2 ${IUT_SRC}) - target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_AARCH64=1 ${COMMON_TARGET_DEFINITIONS}) - set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) - target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) - add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) - add_test_iut(qiutdspx2 0.5) - list(APPEND IUT_LIST qiutdspx2) + if (SLEEF_ENABLE_TESTER) + # Compile executable 'qiutdspx2' + add_executable(qiutdspx2 ${IUT_SRC}) + target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_AARCH64=1 ${COMMON_TARGET_DEFINITIONS}) + set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) + target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} 
${LIBQUADMATH} ${LIBM}) + add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) + add_test_iut(qiutdspx2 0.5) + list(APPEND IUT_LIST qiutdspx2) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # Compile executable 'qtester4dspx2' + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + set(SIMD "DSPX2") + set(LCSIMD "dspx2") + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_DSPX2_AARCH64=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif() if (SLEEF_ARCH_PPC64) - # Compile executable 'qiutdspx2' - add_executable(qiutdspx2 ${IUT_SRC}) - target_compile_options(qiutdspx2 PRIVATE ${FLAGS_ENABLE_VSX}) - set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) - target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_PPC64=1 ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) - add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) - add_test_iut(qiutdspx2 0.5) - list(APPEND IUT_LIST qiutdspx2) + if 
(SLEEF_ENABLE_TESTER) + # Compile executable 'qiutdspx2' + add_executable(qiutdspx2 ${IUT_SRC}) + target_compile_options(qiutdspx2 PRIVATE ${FLAGS_ENABLE_VSX}) + set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) + target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_PPC64=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) + add_test_iut(qiutdspx2 0.5) + list(APPEND IUT_LIST qiutdspx2) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # Compile executable 'qtester4dspx2' + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + set(SIMD "DSPX2") + set(LCSIMD "dspx2") + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4_${SIMD}} PRIVATE ${FLAGS_ENABLE_VSX}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_DSPX2_PPC64=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif() if (SLEEF_ARCH_S390X) - # Compile executable 'qiutdspx2' - add_executable(qiutdspx2 
${IUT_SRC}) - target_compile_options(qiutdspx2 PRIVATE ${FLAGS_ENABLE_VXE}) - set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) - target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_S390X=1 ${COMMON_TARGET_DEFINITIONS}) - target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT}) - add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) - add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) - add_test_iut(qiutdspx2 0.5) - list(APPEND IUT_LIST qiutdspx2) + if (SLEEF_ENABLE_TESTER) + # Compile executable 'qiutdspx2' + add_executable(qiutdspx2 ${IUT_SRC}) + target_compile_options(qiutdspx2 PRIVATE ${FLAGS_ENABLE_VXE}) + set_target_properties(qiutdspx2 PROPERTIES C_STANDARD 99) + target_compile_definitions(qiutdspx2 PRIVATE ENABLE_DSPX2_S390X=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(qiutdspx2 sleefquad ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIBQUADMATH} ${LIBM}) + add_dependencies(qiutdspx2 sleefquad_headers ${TARGET_HEADERS}) + add_dependencies(qiutdspx2 sleefquad ${TARGET_LIBSLEEF}) + add_test_iut(qiutdspx2 0.5) + list(APPEND IUT_LIST qiutdspx2) + endif(SLEEF_ENABLE_TESTER) + + if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) + # Compile executable 'qtester4dspx2' + set(TESTER4_SRC qtester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c) + set(SIMD "DSPX2") + set(LCSIMD "dspx2") + string(CONCAT TARGET_TESTER4_${SIMD} "qtester4" ${LCSIMD}) + + add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC}) + target_compile_options(${TARGET_TESTER4_${SIMD}} PRIVATE ${FLAGS_ENABLE_VXE}) + target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_DSPX2_S390X=1 ${COMMON_TARGET_DEFINITIONS}) + target_link_libraries(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${TLFLOAT_LIBRARIES} ${LIBM}) + add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad_headers ${TARGET_HEADERS}) + 
add_dependencies(${TARGET_TESTER4_${SIMD}} sleefquad ${TARGET_LIBSLEEF}) + add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat) + set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES C_STANDARD 99) + if (DEFINED COSTOVERRIDE_${SIMD}) + add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}}) + else() + add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}}) + endif() + endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES) endif() # Compile executable 'qiutcuda' @@ -222,9 +411,9 @@ endif() if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER) add_executable(qiutcuda qiutcuda.cu) set_target_properties(qiutcuda PROPERTIES LINKER_LANGUAGE CUDA) - target_compile_options(qiutcuda PRIVATE "--fmad=false;-Xcompiler;-ffp-contract=off") + target_compile_options(qiutcuda PRIVATE "--fmad=false;-Xcompiler;-fext-numeric-literals;-Xcompiler;-ffp-contract=off") add_dependencies(qiutcuda ${TARGET_QINLINE_HEADERS}) - add_test_iut(qiutcuda 1.0) + add_test_iut(qiutcuda 20.0) list(APPEND IUT_LIST qiutcuda) endif() @@ -232,9 +421,9 @@ endif() if(LIB_MPFR AND NOT MINGW) # Compile executable 'qtester' - add_host_executable(qtester qtester.c qtesterutil.c) + add_host_executable(qtester qtester.c) if (NOT CMAKE_CROSSCOMPILING) - target_link_libraries(qtester sleefquad ${TARGET_LIBSLEEF} ${LIBM} ${LIB_MPFR} ${LIBGMP}) + target_link_libraries(qtester sleefquad ${TARGET_LIBSLEEF} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBQUADMATH} ${LIBM}) target_compile_definitions(qtester PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS}) target_compile_options(qtester PRIVATE -Wno-unused-result) set_target_properties(qtester PROPERTIES C_STANDARD 99) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/hash_printf.txt b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/hash_printf.txt index 5e6919d554c..08b040bfff8 100644 --- 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/hash_printf.txt +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/hash_printf.txt @@ -1,4 +1,4 @@ -Pe 7ff4a1686c831c7a9b1bd62faffe9b14 -Pf 84331dfc378b032877f7a07767db7cc5 -Pg 2351f96a90d34bf4dd80dd6341a47624 -Pa ad6bb18af2f2648e791098ebf87ce25d +Pe 5bac203cf4f186c11a0a80cdcce2674ae0ef62c076415a88b8d4c7a2ee0d42df +Pf a932665a1becd55826e899b63d72bbd700c3aef8981f25452c4cd3bb98239043 +Pg 2377240e7d3ae40fd1dc96a23a7c3c2e32d5d8a2eeb8917428fd4aebf14546d8 +Pa 0903bb415feeadb4bf002223540cf2e6a30dc27831141396d3200a997930e3e3 diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutcuda.cu b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutcuda.cu index 3baca09029e..9a54464233e 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutcuda.cu +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutcuda.cu @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -125,216 +125,216 @@ typedef union { #define BUFSIZE 1024 -#define func_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ funcName<<<1, 1>>>(r, a0); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - cnv128 c0, c1; \ +#define func_q_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + cnv128 c0, c1; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ - funcName<<<1, 1>>>(r, a0, a1); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ + funcName<<<1, 1>>>(r, a0, a1); \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define 
func_q_q_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_q_q_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0, c1, c2; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, \ - &c0.h, &c0.l, &c1.h, &c1.l, &c2.h, &c2.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ - *a2 = Sleef_setq1_cuda(*a2, 0, c2.q); \ + &c0.h, &c0.l, &c1.h, &c1.l, &c2.h, &c2.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ + *a2 = Sleef_setq1_cuda(*a2, 0, c2.q); \ funcName<<<1, 1>>>(r, a0, a1, a2); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_i_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(i0, a0); \ - cudaDeviceSynchronize(); \ - printf("%d\n", *i0); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(i0, a0); \ + cudaDeviceSynchronize(); \ + printf("%d\n", *i0); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - cnv128 c0, c1; \ +#define func_i_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { 
\ + sentinel = 0; \ + cnv128 c0, c1; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ - funcName<<<1, 1>>>(i0, a0, a1); \ - cudaDeviceSynchronize(); \ - printf("%d\n", *i0); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + *a1 = Sleef_setq1_cuda(*a1, 0, c1.q); \ + funcName<<<1, 1>>>(i0, a0, a1); \ + cudaDeviceSynchronize(); \ + printf("%d\n", *i0); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_q_i(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_q_q_i(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - int k; \ + int k; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %d", &c0.h, &c0.l, &k); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - *i0 = k; \ - funcName<<<1, 1>>>(r, a0, i0); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + *i0 = k; \ + funcName<<<1, 1>>>(r, a0, i0); \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_d_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(d0, a0); \ - cudaDeviceSynchronize(); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = 
Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(d0, a0); \ + cudaDeviceSynchronize(); \ printf("%" PRIx64 "\n", d2u(*d0)); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - *d0 = u2d(u); \ +#define func_q_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + *d0 = u2d(u); \ funcName<<<1, 1>>>(r, d0); \ - cudaDeviceSynchronize(); \ + cudaDeviceSynchronize(); \ cnv128 c0; \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i64_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_i64_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(i64, a0); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 "\n", *i64); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(i64, a0); \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 "\n", *i64); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_i64(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - sscanf(buf, funcStr 
" %" PRIx64, i64); \ - funcName<<<1, 1>>>(r, i64); \ - cudaDeviceSynchronize(); \ +#define func_q_i64(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + sscanf(buf, funcStr " %" PRIx64, i64); \ + funcName<<<1, 1>>>(r, i64); \ + cudaDeviceSynchronize(); \ cnv128 c0; \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_u64_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_u64_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(u64, a0); \ - cudaDeviceSynchronize(); \ - printf("%" PRIx64 "\n", *u64); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(u64, a0); \ + cudaDeviceSynchronize(); \ + printf("%" PRIx64 "\n", *u64); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_u64(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - sscanf(buf, funcStr " %" PRIx64, u64); \ - funcName<<<1, 1>>>(r, u64); \ - cudaDeviceSynchronize(); \ +#define func_q_u64(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + sscanf(buf, funcStr " %" PRIx64, u64); \ + funcName<<<1, 1>>>(r, u64); \ + cudaDeviceSynchronize(); \ cnv128 c0; \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, 
stdin) == NULL) break; \ - } \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #define func_q_q_pi(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(r, a0, i0); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - printf("%" PRIx64 ":%" PRIx64 " %d\n", c0.h, c0.l, *i0); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(r, a0, i0); \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + printf("%" PRIx64 ":%" PRIx64 " %d\n", c0.h, c0.l, *i0); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #define func_q_q_pq(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - cnv128 c0, c1; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ - funcName<<<1, 1>>>(r, a0, a1); \ - cudaDeviceSynchronize(); \ - c0.q = Sleef_getq1_cuda(*r, 0); \ - c1.q = Sleef_getq1_cuda(*a1, 0); \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + cnv128 c0, c1; \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + *a0 = Sleef_setq1_cuda(*a0, 0, c0.q); \ + funcName<<<1, 1>>>(r, a0, a1); \ + cudaDeviceSynchronize(); \ + c0.q = Sleef_getq1_cuda(*r, 0); \ + c1.q = Sleef_getq1_cuda(*a1, 0); \ printf("%" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l, c1.h, c1.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; 
\ + } \ } int main(int argc, char **argv) { diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutsimd.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutsimd.c index b15eecf8bae..b7ff4dff7cf 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutsimd.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qiutsimd.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -321,344 +321,344 @@ typedef union { #define BUFSIZE 1024 -#define func_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ a0 = funcName(a0); \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - cnv128 c0, c1; \ +#define func_q_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % 
VECTLENDP; \ + cnv128 c0, c1; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \ - VARGQUAD a0, a1; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - memrand(&a1, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - a1 = xsetq(a1, lane, c1.q); \ - a0 = funcName(a0, a1); \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + VARGQUAD a0, a1; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + memrand(&a1, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + a1 = xsetq(a1, lane, c1.q); \ + a0 = funcName(a0, a1); \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_q_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_q_q_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0, c1, c2; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, \ - &c0.h, &c0.l, &c1.h, &c1.l, &c2.h, &c2.l); \ - VARGQUAD a0, a1, a2; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - memrand(&a1, SIZEOF_VARGQUAD); \ - memrand(&a2, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - a1 = xsetq(a1, lane, c1.q); \ - a2 = xsetq(a2, lane, c2.q); \ + &c0.h, &c0.l, &c1.h, &c1.l, &c2.h, &c2.l); \ + VARGQUAD a0, a1, a2; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + memrand(&a1, SIZEOF_VARGQUAD); \ + memrand(&a2, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + a1 = xsetq(a1, lane, c1.q); \ + a2 = xsetq(a2, lane, c2.q); \ a0 = funcName(a0, a1, a2); \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" 
PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_i_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - vint vi = funcName(a0); \ - int t[VECTLENDP*2]; \ - vstoreu_v_p_vi(t, vi); \ - printf("%d\n", t[lane]); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + vint vi = funcName(a0); \ + int t[VECTLENDP*2]; \ + vstoreu_v_p_vi(t, vi); \ + printf("%d\n", t[lane]); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i_q_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - cnv128 c0, c1; \ +#define func_i_q_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ + cnv128 c0, c1; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \ - VARGQUAD a0, a1; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - memrand(&a1, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - a1 = xsetq(a1, lane, c1.q); \ - vint vi = funcName(a0, a1); \ - int t[VECTLENDP*2]; \ - vstoreu_v_p_vi(t, vi); \ - printf("%d\n", t[lane]); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + VARGQUAD a0, a1; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + memrand(&a1, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + a1 = xsetq(a1, lane, c1.q); 
\ + vint vi = funcName(a0, a1); \ + int t[VECTLENDP*2]; \ + vstoreu_v_p_vi(t, vi); \ + printf("%d\n", t[lane]); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_q_i(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_q_q_i(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - int k; \ + int k; \ sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %d", &c0.h, &c0.l, &k); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - int t[VECTLENDP*2]; \ - memrand(t, sizeof(t)); \ - t[lane] = k; \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + int t[VECTLENDP*2]; \ + memrand(t, sizeof(t)); \ + t[lane] = k; \ a0 = funcName(a0, vloadu_vi_p(t)); \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_d_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_d_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - double d[VECTLENDP]; \ - vstoreu_v_p_vd(d, funcName(a0)); \ - printf("%" PRIx64 "\n", d2u(d[lane])); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, 
lane, c0.q); \ + double d[VECTLENDP]; \ + vstoreu_v_p_vd(d, funcName(a0)); \ + printf("%" PRIx64 "\n", d2u(d[lane])); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_d(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - double s[VECTLENDP]; \ - memrand(s, sizeof(s)); \ - s[lane] = u2d(u); \ - VARGQUAD a0 = funcName(vloadu_vd_p(s)); \ +#define func_q_d(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + double s[VECTLENDP]; \ + memrand(s, sizeof(s)); \ + s[lane] = u2d(u); \ + VARGQUAD a0 = funcName(vloadu_vd_p(s)); \ cnv128 c0; \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_i64_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_i64_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - double d[VECTLENDP]; \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + double d[VECTLENDP]; \ vstoreu_v_p_vd(d, vreinterpret_vd_vm(vreinterpret_vm_vi64(funcName(a0)))); \ - printf("%" PRIx64 "\n", d2u(d[lane])); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } 
\ + printf("%" PRIx64 "\n", d2u(d[lane])); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_i64(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - double s[VECTLENDP]; \ - memrand(s, sizeof(s)); \ - s[lane] = u2d(u); \ - VARGQUAD a0 = funcName(vreinterpret_vi64_vm(vreinterpret_vm_vd(vloadu_vd_p(s)))); \ +#define func_q_i64(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + double s[VECTLENDP]; \ + memrand(s, sizeof(s)); \ + s[lane] = u2d(u); \ + VARGQUAD a0 = funcName(vreinterpret_vi64_vm(vreinterpret_vm_vd(vloadu_vd_p(s)))); \ cnv128 c0; \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_u64_q(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ +#define func_u64_q(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - double d[VECTLENDP]; \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + double d[VECTLENDP]; \ vstoreu_v_p_vd(d, vreinterpret_vd_vm(vreinterpret_vm_vu64(funcName(a0)))); \ - printf("%" PRIx64 "\n", d2u(d[lane])); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) 
break; \ - } \ + printf("%" PRIx64 "\n", d2u(d[lane])); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_q_u64(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - uint64_t u; \ - sscanf(buf, funcStr " %" PRIx64, &u); \ - double s[VECTLENDP]; \ - memrand(s, sizeof(s)); \ - s[lane] = u2d(u); \ - VARGQUAD a0 = funcName(vreinterpret_vu64_vm(vreinterpret_vm_vd(vloadu_vd_p(s)))); \ +#define func_q_u64(funcStr, funcName) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ + uint64_t u; \ + sscanf(buf, funcStr " %" PRIx64, &u); \ + double s[VECTLENDP]; \ + memrand(s, sizeof(s)); \ + s[lane] = u2d(u); \ + VARGQUAD a0 = funcName(vreinterpret_vu64_vm(vreinterpret_vm_vd(vloadu_vd_p(s)))); \ cnv128 c0; \ - c0.q = xgetq(a0, lane); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, lane); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #define func_q_q_pi(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - vint vi; \ - a0 = funcName(a0, &vi); \ - c0.q = xgetq(a0, lane); \ - int t[VECTLENDP*2]; \ - vstoreu_v_p_vi(t, vi); \ - printf("%" PRIx64 ":%" PRIx64 " %d\n", c0.h, c0.l, t[lane]); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ 
+ vint vi; \ + a0 = funcName(a0, &vi); \ + c0.q = xgetq(a0, lane); \ + int t[VECTLENDP*2]; \ + vstoreu_v_p_vi(t, vi); \ + printf("%" PRIx64 ":%" PRIx64 " %d\n", c0.h, c0.l, t[lane]); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #define func_q_q_pq(funcStr, funcName) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - int lane = xrand() % VECTLENDP; \ - cnv128 c0, c1; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0, a1; \ - memrand(&a0, SIZEOF_VARGQUAD); \ - a0 = xsetq(a0, lane, c0.q); \ - a0 = funcName(a0, &a1); \ - c0.q = xgetq(a0, lane); \ - c1.q = xgetq(a1, lane); \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + int lane = xrand() % VECTLENDP; \ + cnv128 c0, c1; \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0, a1; \ + memrand(&a0, SIZEOF_VARGQUAD); \ + a0 = xsetq(a0, lane, c0.q); \ + a0 = funcName(a0, &a1); \ + c0.q = xgetq(a0, lane); \ + c1.q = xgetq(a1, lane); \ printf("%" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l, c1.h, c1.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_strtoq(funcStr) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ - char s[64]; \ - sscanf(buf, funcStr " %63s", s); \ - VARGQUAD a0; \ - a0 = Sleef_strtoq(s, NULL); \ +#define func_strtoq(funcStr) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ + char s[64]; \ + sscanf(buf, funcStr " %63s", s); \ + VARGQUAD a0; \ + a0 = Sleef_strtoq(s, NULL); \ cnv128 c0; \ - c0.q = xgetq(a0, 0); \ - printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + c0.q = xgetq(a0, 0); \ + printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #if 
!(defined(ENABLEFLOAT128) && defined(__clang__)) -#define func_snprintf_40Qg(funcStr) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_snprintf_40Qg(funcStr) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memset(&a0, 0, sizeof(a0)); \ - a0 = xsetq(a0, 0, c0.q); \ - char s[64]; \ - Sleef_snprintf(s, 63, "%.40Qg", a0); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memset(&a0, 0, sizeof(a0)); \ + a0 = xsetq(a0, 0, c0.q); \ + char s[64]; \ + Sleef_snprintf(s, 63, "%.40Qg", a0); \ printf("%s\n", s); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_snprintf_Qa(funcStr) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_snprintf_Qa(funcStr) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memset(&a0, 0, sizeof(a0)); \ - a0 = xsetq(a0, 0, c0.q); \ - char s[64]; \ - Sleef_snprintf(s, 63, "%Qa", a0); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memset(&a0, 0, sizeof(a0)); \ + a0 = xsetq(a0, 0, c0.q); \ + char s[64]; \ + Sleef_snprintf(s, 63, "%Qa", a0); \ printf("%s\n", s); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #else -#define func_snprintf_40Qg(funcStr) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_snprintf_40Qg(funcStr) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memset(&a0, 0, sizeof(a0)); \ - a0 = xsetq(a0, 0, c0.q); \ - 
char s[64]; \ - Sleef_snprintf(s, 63, "%.40Pg", &a0); \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memset(&a0, 0, sizeof(a0)); \ + a0 = xsetq(a0, 0, c0.q); \ + char s[64]; \ + Sleef_snprintf(s, 63, "%.40Pg", &a0); \ printf("%s\n", s); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } -#define func_snprintf_Qa(funcStr) { \ - while (startsWith(buf, funcStr " ")) { \ - sentinel = 0; \ +#define func_snprintf_Qa(funcStr) { \ + while (startsWith(buf, funcStr " ")) { \ + sentinel = 0; \ cnv128 c0; \ - sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ - VARGQUAD a0; \ - memset(&a0, 0, sizeof(a0)); \ - a0 = xsetq(a0, 0, c0.q); \ - char s[64]; \ + sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + VARGQUAD a0; \ + memset(&a0, 0, sizeof(a0)); \ + a0 = xsetq(a0, 0, c0.q); \ + char s[64]; \ Sleef_snprintf(s, 63, "%Pa", &a0); \ printf("%s\n", s); \ - fflush(stdout); \ - if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ - } \ + fflush(stdout); \ + if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \ + } \ } #endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtester.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtester.c index 38097da7122..5b1e647a512 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtester.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/qtester.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -101,177 +101,177 @@ typedef union { } cnv128; #endif -#define child_q_q(funcStr, arg) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q_q(funcStr, arg) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ - return c.q; \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ + return c.q; \ } while(0) -#define child_q2_q(funcStr, arg) do { \ - char str[256]; \ - cnv128 c0, c1; \ - c0.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q2_q(funcStr, arg) do { \ + char str[256]; \ + cnv128 c0, c1; \ + c0.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ sscanf(str, "%" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 , &c0.h, &c0.l, &c1.h, &c1.l); \ - Sleef_quad2 ret = { c0.q, c1.q }; \ - return ret; \ + Sleef_quad2 ret = { c0.q, c1.q }; \ + return ret; \ } while(0) -#define child_q_q_q(funcStr, arg0, arg1) do { \ - char str[256]; \ - cnv128 c0, c1; \ +#define child_q_q_q(funcStr, arg0, arg1) do { \ + char str[256]; \ + cnv128 c0, c1; \ c0.q = arg0; \ c1.q = arg1; \ sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l, c1.h, c1.l); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ return c0.q; \ } while(0) 
-#define child_q_q_q_q(funcStr, arg0, arg1, arg2) do { \ - char str[256]; \ - cnv128 c0, c1, c2; \ +#define child_q_q_q_q(funcStr, arg0, arg1, arg2) do { \ + char str[256]; \ + cnv128 c0, c1, c2; \ c0.q = arg0; \ c1.q = arg1; \ c2.q = arg2; \ sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l, c1.h, c1.l, c2.h, c2.l); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \ return c0.q; \ } while(0) -#define child_i_q(funcStr, arg0) do { \ - char str[256]; \ - cnv128 c0; \ +#define child_i_q(funcStr, arg0) do { \ + char str[256]; \ + cnv128 c0; \ c0.q = arg0; \ sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - int i; \ - sscanf(str, "%d", &i); \ - return i; \ + int i; \ + sscanf(str, "%d", &i); \ + return i; \ } while(0) -#define child_i_q_q(funcStr, arg0, arg1) do { \ - char str[256]; \ - cnv128 c0, c1; \ +#define child_i_q_q(funcStr, arg0, arg1) do { \ + char str[256]; \ + cnv128 c0, c1; \ c0.q = arg0; \ c1.q = arg1; \ sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l, c1.h, c1.l); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - int i; \ - sscanf(str, "%d", &i); \ - return i; \ + int i; \ + sscanf(str, "%d", &i); \ + return i; \ } while(0) -#define child_q_q_i(funcStr, arg0, arg1) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg0; \ +#define child_q_q_i(funcStr, arg0, arg1) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg0; \ sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 " %d\n", c.h, c.l, arg1); \ - write(ptoc[1], str, 
strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ - return c.q; \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ + return c.q; \ } while(0) -#define child_d_q(funcStr, arg) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_d_q(funcStr, arg) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - uint64_t u; \ + uint64_t u; \ sscanf(str, "%" PRIx64, &u); \ - return u2d(u); \ + return u2d(u); \ } while(0) -#define child_q_d(funcStr, arg) do { \ - char str[256]; \ - sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q_d(funcStr, arg) do { \ + char str[256]; \ + sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - cnv128 c; \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ - return c.q; \ + cnv128 c; \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ + return c.q; \ } while(0) -#define child_m_q(funcStr, arg) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_m_q(funcStr, arg) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - uint64_t u; \ + uint64_t u; \ sscanf(str, "%" PRIx64, &u); \ - return u; \ + return u; \ } while(0) -#define child_q_m(funcStr, arg) do { \ - char str[256]; \ +#define 
child_q_m(funcStr, arg) do { \ + char str[256]; \ sprintf(str, funcStr " %" PRIx64 "\n", arg); \ - write(ptoc[1], str, strlen(str)); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - cnv128 c; \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ - return c.q; \ + cnv128 c; \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ + return c.q; \ } while(0) -#define child_q_q_pi(funcStr, arg) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q_q_pi(funcStr, arg) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - int i; \ - sscanf(str, "%" PRIx64 ":%" PRIx64 " %d", &c.h, &c.l, &i); \ - *ptr = i; \ - return c.q; \ + int i; \ + sscanf(str, "%" PRIx64 ":%" PRIx64 " %d", &c.h, &c.l, &i); \ + *ptr = i; \ + return c.q; \ } while(0) -#define child_q_q_pq(funcStr, arg) do { \ - char str[256]; \ - cnv128 c0, c1; \ - c0.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q_q_pq(funcStr, arg) do { \ + char str[256]; \ + cnv128 c0, c1; \ + c0.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ sscanf(str, "%" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \ *ptr = c1.q; \ return c0.q; \ } while(0) -#define child_q_str(funcStr, arg) do { \ - char str[256]; \ - sprintf(str, funcStr " %s\n", arg); \ - write(ptoc[1], str, strlen(str)); \ +#define child_q_str(funcStr, arg) do { \ + char str[256]; \ + sprintf(str, funcStr " %s\n", arg); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == 
NULL) stop("child " funcStr); \ - cnv128 c; \ - sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ - return c.q; \ + cnv128 c; \ + sscanf(str, "%" PRIx64 ":%" PRIx64, &c.h, &c.l); \ + return c.q; \ } while(0) -#define child_str_q(funcStr, ret, arg) do { \ - char str[256]; \ - cnv128 c; \ - c.q = arg; \ - sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ - write(ptoc[1], str, strlen(str)); \ +#define child_str_q(funcStr, ret, arg) do { \ + char str[256]; \ + cnv128 c; \ + c.q = arg; \ + sprintf(str, funcStr " %" PRIx64 ":%" PRIx64 "\n", c.h, c.l); \ + write(ptoc[1], str, strlen(str)); \ if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \ - sscanf(str, "%63s", ret); \ + sscanf(str, "%63s", ret); \ } while(0) Sleef_quad child_addq_u05(Sleef_quad x, Sleef_quad y) { child_q_q_q("addq_u05", x, y); } @@ -348,368 +348,368 @@ Sleef_quad child_rintq(Sleef_quad x) { child_q_q("rintq", x); } // -#define cmpDenorm_q(mpfrFunc, childFunc, argx) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(frz, frx, GMP_RNDN); \ - Sleef_quad t = childFunc(argx); \ - double u = countULPf128(t, frz, 1); \ - if (u >= 10) { \ - fprintf(stderr, "\narg = %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ - sprintf128(argx), sprintf128(t), sprintfr(frz), u); \ - success = 0; \ - break; \ - } \ +#define cmpDenorm_q(mpfrFunc, childFunc, argx) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(frz, frx, GMP_RNDN); \ + Sleef_quad t = childFunc(argx); \ + double u = countULPf128(t, frz, 1); \ + if (u >= 10) { \ + fprintf(stderr, "\narg = %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ + sprintf128(argx), sprintf128(t), sprintfr(frz), u); \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenormNMR_q(mpfrFunc, childFunc, argx) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(frz, frx); \ - Sleef_quad t = childFunc(argx); \ - double u = countULPf128(t, frz, 1); \ - if (u >= 10) { \ - fprintf(stderr, "\narg = %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ - 
sprintf128(argx), sprintf128(t), sprintfr(frz), u); \ - success = 0; \ - break; \ - } \ +#define cmpDenormNMR_q(mpfrFunc, childFunc, argx) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(frz, frx); \ + Sleef_quad t = childFunc(argx); \ + double u = countULPf128(t, frz, 1); \ + if (u >= 10) { \ + fprintf(stderr, "\narg = %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ + sprintf128(argx), sprintf128(t), sprintfr(frz), u); \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_q_q(mpfrFunc, childFunc, argx, argy) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_set_f128(fry, argy, GMP_RNDN); \ - mpfrFunc(frz, frx, fry, GMP_RNDN); \ - Sleef_quad t = childFunc(argx, argy); \ - double u = countULPf128(t, frz, 1); \ - if (u >= 10) { \ - Sleef_quad qz = mpfr_get_f128(frz, GMP_RNDN); \ +#define cmpDenorm_q_q(mpfrFunc, childFunc, argx, argy) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_set_f128(fry, argy, GMP_RNDN); \ + mpfrFunc(frz, frx, fry, GMP_RNDN); \ + Sleef_quad t = childFunc(argx, argy); \ + double u = countULPf128(t, frz, 1); \ + if (u >= 10) { \ + Sleef_quad qz = mpfr_get_f128(frz, GMP_RNDN); \ fprintf(stderr, "\narg = %s,\n %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ sprintf128(argx), sprintf128(argy), sprintf128(t), sprintf128(qz), u); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_q_q_q(mpfrFunc, childFunc, argw, argx, argy) do { \ - mpfr_set_f128(frw, argw, GMP_RNDN); \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_set_f128(fry, argy, GMP_RNDN); \ - mpfrFunc(frz, frw, frx, fry, GMP_RNDN); \ - Sleef_quad t = childFunc(argw, argx, argy); \ - double u = countULPf128(t, frz, 1); \ - if (u >= 10) { \ - Sleef_quad qz = mpfr_get_f128(frz, GMP_RNDN); \ +#define cmpDenorm_q_q_q(mpfrFunc, childFunc, argw, argx, argy) do { \ + mpfr_set_f128(frw, argw, GMP_RNDN); \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_set_f128(fry, argy, GMP_RNDN); \ + mpfrFunc(frz, frw, frx, fry, GMP_RNDN); \ + 
Sleef_quad t = childFunc(argw, argx, argy); \ + double u = countULPf128(t, frz, 1); \ + if (u >= 10) { \ + Sleef_quad qz = mpfr_get_f128(frz, GMP_RNDN); \ fprintf(stderr, "\narg = %s,\n %s,\n %s\ntest = %s\ncorrect = %s\nulp = %g\n", \ sprintf128(argw), sprintf128(argx), sprintf128(argy), sprintf128(t), sprintf128(qz), u); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_q_pi(mpfrFunc, childFunc, argx) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_exp_t e; \ - mpfrFunc(&e, frz, frx, GMP_RNDN); \ - int i; \ - Sleef_quad t = childFunc(argx, &i); \ - double u = countULPf128(t, frz, 1); \ - if (u >= 10 || i != (int)e) { \ - fprintf(stderr, "\narg = %s\ntest = %s, %d\ncorrect = %s, %d\nulp = %g\n", \ +#define cmpDenorm_q_pi(mpfrFunc, childFunc, argx) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_exp_t e; \ + mpfrFunc(&e, frz, frx, GMP_RNDN); \ + int i; \ + Sleef_quad t = childFunc(argx, &i); \ + double u = countULPf128(t, frz, 1); \ + if (u >= 10 || i != (int)e) { \ + fprintf(stderr, "\narg = %s\ntest = %s, %d\ncorrect = %s, %d\nulp = %g\n", \ sprintf128(argx), sprintf128(t), i, sprintfr(frz), (int)e, u); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define cmpDenorm_q_pq(mpfrFunc, childFunc, argx) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(fry, frz, frx, GMP_RNDN); \ - Sleef_quad qi, qf; \ - qf = childFunc(argx, &qi); \ +#define cmpDenorm_q_pq(mpfrFunc, childFunc, argx) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(fry, frz, frx, GMP_RNDN); \ + Sleef_quad qi, qf; \ + qf = childFunc(argx, &qi); \ double u = countULPf128(qf, frz, 1); \ double v = countULPf128(qi, fry, 1); \ - if (u >= 10 || v >= 10) { \ + if (u >= 10 || v >= 10) { \ fprintf(stderr, "\narg = %s\ntest = %s, %s\ncorrect = %s, %s\nulp = %g, %g\n", \ sprintf128(argx), sprintf128(qf), sprintf128(qi), sprintfr(frz), sprintfr(fry), u, v); \ - success = 0; \ - break; \ - } \ + success 
= 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_q(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(frz, frx, GMP_RNDN); \ - Sleef_quad t = childFunc(argx); \ - double e = countULPf128(t, frz, 0); \ - maxError = fmax(maxError, e); \ - if (e > bound) { \ +#define checkAccuracy_q(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(frz, frx, GMP_RNDN); \ + Sleef_quad t = childFunc(argx); \ + double e = countULPf128(t, frz, 0); \ + maxError = fmax(maxError, e); \ + if (e > bound) { \ fprintf(stderr, "\narg = %s, test = %s, correct = %s, ULP = %lf\n", \ sprintf128(argx), sprintf128(childFunc(argx)), sprintfr(frz), countULPf128(t, frz, 0)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracyNMR_q(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(frz, frx); \ - Sleef_quad t = childFunc(argx); \ - double e = countULPf128(t, frz, 0); \ - maxError = fmax(maxError, e); \ - if (e > bound) { \ +#define checkAccuracyNMR_q(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(frz, frx); \ + Sleef_quad t = childFunc(argx); \ + double e = countULPf128(t, frz, 0); \ + maxError = fmax(maxError, e); \ + if (e > bound) { \ fprintf(stderr, "\narg = %s, test = %s, correct = %s, ULP = %lf\n", \ sprintf128(argx), sprintf128(childFunc(argx)), sprintfr(frz), countULPf128(t, frz, 0)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_q_q(mpfrFunc, childFunc, argx, argy, bound) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_set_f128(fry, argy, GMP_RNDN); \ - mpfrFunc(frz, frx, fry, GMP_RNDN); \ - Sleef_quad t = childFunc(argx, argy); \ - double e = countULPf128(t, frz, 0); \ - maxError = fmax(maxError, e); \ - if (e > bound) { \ +#define checkAccuracy_q_q(mpfrFunc, childFunc, argx, argy, bound) do { \ + 
mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_set_f128(fry, argy, GMP_RNDN); \ + mpfrFunc(frz, frx, fry, GMP_RNDN); \ + Sleef_quad t = childFunc(argx, argy); \ + double e = countULPf128(t, frz, 0); \ + maxError = fmax(maxError, e); \ + if (e > bound) { \ fprintf(stderr, "\narg = %s, %s, test = %s, correct = %s, ULP = %lf\n", \ sprintf128(argx), sprintf128(argy), sprintf128(childFunc(argx, argy)), sprintfr(frz), countULPf128(t, frz, 0)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_q_q_q(mpfrFunc, childFunc, argw, argx, argy, bound) do { \ - mpfr_set_f128(frw, argw, GMP_RNDN); \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_set_f128(fry, argy, GMP_RNDN); \ - mpfrFunc(frz, frw, frx, fry, GMP_RNDN); \ - Sleef_quad t = childFunc(argw, argx, argy); \ - double e = countULPf128(t, frz, 0); \ - maxError = fmax(maxError, e); \ - if (e > bound) { \ +#define checkAccuracy_q_q_q(mpfrFunc, childFunc, argw, argx, argy, bound) do { \ + mpfr_set_f128(frw, argw, GMP_RNDN); \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_set_f128(fry, argy, GMP_RNDN); \ + mpfrFunc(frz, frw, frx, fry, GMP_RNDN); \ + Sleef_quad t = childFunc(argw, argx, argy); \ + double e = countULPf128(t, frz, 0); \ + maxError = fmax(maxError, e); \ + if (e > bound) { \ fprintf(stderr, "\narg = %s, %s, %s, test = %s, correct = %s, ULP = %lf\n", \ sprintf128(argw), sprintf128(argx), sprintf128(argy), sprintf128(childFunc(argw, argx, argy)), sprintfr(frz), countULPf128(t, frz, 0)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_q_pi(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_exp_t ex; \ - mpfrFunc(&ex, frz, frx, GMP_RNDN); \ - int i; \ - Sleef_quad t = childFunc(argx, &i); \ - double e = countULPf128(t, frz, 0); \ - maxError = fmax(maxError, e); \ - if (e > bound || i != (int)ex) { \ +#define checkAccuracy_q_pi(mpfrFunc, childFunc, argx, bound) do { \ + 
mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_exp_t ex; \ + mpfrFunc(&ex, frz, frx, GMP_RNDN); \ + int i; \ + Sleef_quad t = childFunc(argx, &i); \ + double e = countULPf128(t, frz, 0); \ + maxError = fmax(maxError, e); \ + if (e > bound || i != (int)ex) { \ fprintf(stderr, "\narg = %s, test = %s, %d, correct = %s, %d, ULP = %lf\n", \ sprintf128(argx), sprintf128(t), i, sprintfr(frz), (int)ex, countULPf128(t, frz, 0)); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define checkAccuracy_q_pq(mpfrFunc, childFunc, argx, bound) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfrFunc(fry, frz, frx, GMP_RNDN); \ - Sleef_quad qi, qf; \ - qf = childFunc(argx, &qi); \ - double ef = countULPf128(qf, frz, 0); \ - double ei = countULPf128(qi, fry, 0); \ - maxError = fmax(maxError, ef); \ - maxError = fmax(maxError, ei); \ - if (ef > bound || ei > bound) { \ +#define checkAccuracy_q_pq(mpfrFunc, childFunc, argx, bound) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfrFunc(fry, frz, frx, GMP_RNDN); \ + Sleef_quad qi, qf; \ + qf = childFunc(argx, &qi); \ + double ef = countULPf128(qf, frz, 0); \ + double ei = countULPf128(qi, fry, 0); \ + maxError = fmax(maxError, ef); \ + maxError = fmax(maxError, ei); \ + if (ef > bound || ei > bound) { \ fprintf(stderr, "\narg = %s, test = %s, %s, correct = %s, %s, ULP = %lf, %lf\n", \ sprintf128(argx), sprintf128(qf), sprintf128(qi), sprintfr(frz), sprintfr(fry), ef, ei); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) -#define testComparison(mpfrFunc, childFunc, argx, argy) do { \ - mpfr_set_f128(frx, argx, GMP_RNDN); \ - mpfr_set_f128(fry, argy, GMP_RNDN); \ - int c = mpfrFunc(frx, fry); \ - int t = childFunc(argx, argy); \ - if ((c != 0) != (t != 0)) { \ - fprintf(stderr, "\narg = %s, %s, test = %d, correct = %d\n", \ +#define testComparison(mpfrFunc, childFunc, argx, argy) do { \ + mpfr_set_f128(frx, argx, GMP_RNDN); \ + mpfr_set_f128(fry, argy, GMP_RNDN); \ + int c 
= mpfrFunc(frx, fry); \ + int t = childFunc(argx, argy); \ + if ((c != 0) != (t != 0)) { \ + fprintf(stderr, "\narg = %s, %s, test = %d, correct = %d\n", \ sprintf128(argx), sprintf128(argy), t, c); \ - success = 0; \ - break; \ - } \ + success = 0; \ + break; \ + } \ } while(0) // -#define cmpDenormOuterLoop_q(mpfrFunc, childFunc, checkVals) do { \ - for(int i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "misc.h" +#include "qtesterutil.h" + +using namespace std; + +// + +#if !defined(USE_INLINE_HEADER) +#include "sleef.h" +#include "sleefquad.h" +#else // #if !defined(USE_INLINE_HEADER) +#include +#include +#include +#include + +#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) +#ifndef FP_FAST_FMA +#define FP_FAST_FMA +#endif +#endif + +#if defined(_MSC_VER) && !defined(__STDC__) +#define __STDC__ 1 +#endif + +#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif + +#if (defined(_MSC_VER)) +#include +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#endif + +#if defined(__riscv) && defined(__riscv_v) +#include +#endif + +#if defined(__VSX__) +#include +#endif + +#if defined(__VX__) +#include +#endif + +#define SLEEF_ALWAYS_INLINE inline +#define SLEEF_INLINE +#define SLEEF_CONST +#include USE_INLINE_HEADER +#include MACRO_ONLY_HEADER + +#ifndef ENABLE_PUREC_SCALAR +#include "sleefquadinline_purec_scalar.h" +#endif + +#endif // #if !defined(USE_INLINE_HEADER) + +// + +#ifdef ENABLE_PUREC_SCALAR +#include "qrenamepurec_scalar.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperpurec_scalar.h" +#define VARGQUAD Sleef_quad +#endif +#endif + +#ifdef ENABLE_PURECFMA_SCALAR +#include "qrenamepurecfma_scalar.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helperpurec_scalar.h" +#define VARGQUAD Sleef_quad 
+#endif +#endif + +#ifdef ENABLE_DSPSCALAR +#include "qrenamedspscalar.h" +#define CONFIG 1 +#include "helperpurec_scalar.h" +#define VARGQUAD Sleef_quad +#endif + +#ifdef ENABLE_SSE2 +#include "qrenamesse2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 2 +#include "helpersse2.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_AVX2128 +#include "qrenameavx2128.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx2_128.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_DSPX2_X86 +#include "qrenamedspx2.h" +#define CONFIG 2 +#include "helpersse2.h" +#define VARGQUAD Sleef_quadx2 +#endif + +#ifdef ENABLE_AVX2 +#include "qrenameavx2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx2.h" +#define VARGQUAD Sleef_quadx4 +#endif +#endif + +#ifdef ENABLE_AVX512F +#include "qrenameavx512f.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperavx512f.h" +#define VARGQUAD Sleef_quadx8 +#endif +#endif + +#ifdef ENABLE_ADVSIMD +#include "qrenameadvsimd.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperadvsimd.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_DSPX2_AARCH64 +#include "qrenamedspx2.h" +#define CONFIG 2 +#include "helperadvsimd.h" +#define VARGQUAD Sleef_quadx2 +#endif + +#ifdef ENABLE_SVE +#include "qrenamesve.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helpersve.h" +#define VARGQUAD Sleef_svquad +#endif +#define SIZEOF_VARGQUAD (svcntd()*8) +#endif + +#ifdef ENABLE_VSX +#include "qrenamevsx.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#include "helperpower_128.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_VSX3 +#include "qrenamevsx3.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 3 +#include "helperpower_128.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_DSPX2_PPC64 +#include "qrenamedspx2.h" +#define CONFIG 1 +#include "helperpower_128.h" +#define 
VARGQUAD Sleef_quadx2 +#endif + +#ifdef ENABLE_VXE +#include "qrenamevxe.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 140 +#include "helpers390x_128.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_VXE2 +#include "qrenamevxe2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 150 +#include "helpers390x_128.h" +#define VARGQUAD Sleef_quadx2 +#endif +#endif + +#ifdef ENABLE_DSPX2_S390X +#include "qrenamedspx2.h" +#define CONFIG 140 +#include "helpers390x_128.h" +#define VARGQUAD Sleef_quadx2 +#endif + +#ifdef ENABLE_RVVM1 +#include "qrenamervvm1.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#define ENABLE_RVV_DP +#include "helperrvv.h" +#define VARGQUAD Sleef_rvvm1quad +#endif +#define SIZEOF_VARGQUAD (__riscv_vsetvlmax_e64m1()*8) +#endif + +#ifdef ENABLE_RVVM2 +#include "qrenamervvm2.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 1 +#define ENABLE_RVV_DP +#include "helperrvv.h" +#define VARGQUAD Sleef_rvvm2quad +#endif +#define SIZEOF_VARGQUAD (__riscv_vsetvlmax_e64m2()*8) +#endif + + +#ifndef VARGQUAD +#define VARGQUAD vargquad +#endif + +#ifndef SIZEOF_VARGQUAD +#define SIZEOF_VARGQUAD sizeof(VARGQUAD) +#endif + +#ifdef USE_INLINE_HEADER +#ifdef vopmask +#undef vopmask +#endif + +#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix +#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix) +#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX) +#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX) +#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX) +#define vargquad CONCAT_SIMD_SUFFIX(vargquad, SIMD_SUFFIX) +#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX) +#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX) +#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX) +#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX) +#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX) +#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX) +#define 
vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX) +#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX) +#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX) +#define vreinterpret_vm_vu64 CONCAT_SIMD_SUFFIX(vreinterpret_vm_vu64, SIMD_SUFFIX) +#define vreinterpret_vu64_vm CONCAT_SIMD_SUFFIX(vreinterpret_vu64_vm, SIMD_SUFFIX) +#define vreinterpret_vm_vi64 CONCAT_SIMD_SUFFIX(vreinterpret_vm_vi64, SIMD_SUFFIX) +#define vreinterpret_vi64_vm CONCAT_SIMD_SUFFIX(vreinterpret_vi64_vm, SIMD_SUFFIX) +#define vreinterpret_vm_vd CONCAT_SIMD_SUFFIX(vreinterpret_vm_vd, SIMD_SUFFIX) +#define vreinterpret_vd_vm CONCAT_SIMD_SUFFIX(vreinterpret_vd_vm, SIMD_SUFFIX) +#endif + +// + +extern "C" { + int check_feature(double d, float f) { + double s[VECTLENDP]; + for(int i=0;i<(int)VECTLENDP;i++) s[i] = d; + VARGQUAD a = xcast_from_doubleq(vloadu_vd_p(s)); + a = xpowq_u10(a, a); + vint vi = xicmpeqq(a, xsplatq(sleef_q(+0x1000000000000LL, 0x0000000000000000ULL, 0))); + int t[VECTLENDP*2]; + memset(t, 0, sizeof(t)); + vstoreu_v_p_vi(t, vi); + return t[0]; + } +} + +// + +static double maxULP = 0; + +static bool check_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD), tlfloat_octuple (*tlfunc)(const tlfloat_octuple), + const tlfloat_quad *a0, size_t z, double tol, bool checkSignedZero) { + VARGQUAD v0; + for(size_t i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + // tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg = %Qa (%.35Qg), ulp = %g, t = %.35Og, c = %.35Og\n", msg, a0[i], a0[i], u, t, c); + return false; + } + } + return true; +} + +static bool check_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD), tlfloat_octuple (*tlfunc)(const tlfloat_octuple), + const char *minStr, const char *maxStr, bool sign, int nLoop, uint64_t seed, double tol, bool checkSignedZero) { + xsrand(seed); + tlfloat_quad min = 
tlfloat_strtoq(minStr, nullptr), max = tlfloat_strtoq(maxStr, nullptr); + VARGQUAD v0; + for(int i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + // tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg = %Qa (%.35Qg), ulp = %g, t = %.35Og, c = %.35Og\n", msg, x, x, u, t, c); + return false; + } + } + return true; +} + +static bool check_q_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD, VARGQUAD), + tlfloat_octuple (*tlfunc)(const tlfloat_octuple, const tlfloat_octuple), + const tlfloat_quad *a, size_t z, double tol, bool checkSignedZero) { + VARGQUAD v0, v1; + for(size_t i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + //tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg0 = %Qa (%.35Qg), arg1 = %Qa (%.35Qg), ulp = %g, t = %Oa (%.35Og), c = %Oa (%.35Og)\n", msg, a[i], a[i], a[j], a[j], u, t, t, c, c); + tlfloat_printf("c = %Qa (%.35Qg)\n", (tlfloat_quad)c, (tlfloat_quad)c); + return false; + } + } + } + return true; +} + +static bool check_q_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD, VARGQUAD), + tlfloat_octuple (*tlfunc)(const tlfloat_octuple, const tlfloat_octuple), + const char *minStr, const char *maxStr, bool sign, int nLoop, uint64_t seed, double tol, bool checkSignedZero) { + xsrand(seed); + tlfloat_quad min = tlfloat_strtoq(minStr, nullptr), max = tlfloat_strtoq(maxStr, nullptr); + VARGQUAD v0, v1; + for(int i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + //tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg0 = %Qa (%.35Qg), arg1 = %Qa (%.35Qg), ulp = %g, t = %Oa (%.35Og), c = %Oa (%.35Og)\n", msg, x, x, y, y, u, t, t, c, c); + 
return false; + } + } + return true; +} + +static bool check_q_q_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD, VARGQUAD, VARGQUAD), + tlfloat_octuple (*tlfunc)(const tlfloat_octuple, const tlfloat_octuple, const tlfloat_octuple), + const tlfloat_quad *a, size_t z, double tol, bool checkSignedZero) { + VARGQUAD v0, v1, v2; + for(size_t i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + //tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg0 = %Qa (%.35Qg), arg1 = %Qa (%.35Qg), arg2 = %Qa (%.35Qg), ulp = %g, t = %Oa (%.35Og), c = %Oa (%.35Og)\n", msg, a[i], a[i], a[j], a[j], a[k], a[k], u, t, t, c, c); + return false; + } + } + } + } + return true; +} + +static bool check_q_q_q_q(const char *msg, VARGQUAD (*vfunc)(VARGQUAD, VARGQUAD, VARGQUAD), + tlfloat_octuple (*tlfunc)(const tlfloat_octuple, const tlfloat_octuple, const tlfloat_octuple), + const char *minStr, const char *maxStr, bool sign, int nLoop, uint64_t seed, double tol, bool checkSignedZero) { + xsrand(seed); + tlfloat_quad min = tlfloat_strtoq(minStr, nullptr), max = tlfloat_strtoq(maxStr, nullptr); + VARGQUAD v0, v1, v2; + for(int i=0;i(t, c, TLFLOAT_FLT128_MANT_DIG, + TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, checkSignedZero); + //tlfloat_printf("t = %.35Og, c = %.35Og, ulp = %g\n", t, c, u); + if (u > maxULP) maxULP = u; + if (u > tol) { + tlfloat_printf("%s : arg0 = %Qa (%.35Qg), arg1 = %Qa (%.35Qg), arg1 = %Qa (%.35Qg), ulp = %g, t = %Oa (%.35Og), c = %Oa (%.35Og)\n", msg, x, x, y, y, z, z, u, t, t, c, c); + return false; + } + } + return true; +} + +static bool check_i_q_q(const char *msg, vint (*vfunc)(VARGQUAD, VARGQUAD), int (*tlfunc)(const tlfloat_octuple, const tlfloat_octuple), + const tlfloat_quad *a, size_t z) { + VARGQUAD v0, v1; + for(size_t i=0;i(t, c, nbmant, flmin, flmax, true); + if (tulp != culp) { + cout << "NG" << endl; + tlfloat_printf("t 
= %Oa %.35Og\n", t, t); + tlfloat_printf("c = %Oa %.35Og\n", c, c); + printf("tulp = %g\n", tulp); + printf("culp = %g\n", culp); + exit(-1); + } +} + +void showULP(bool success) { + printf("%s (%g ulp)\n", success ? "OK" : "NG", maxULP); + maxULP = 0; +} + +// + +extern "C" { + int main2(int argc, char **argv); +} + +int main2(int argc, char **argv) { + bool success = true; + const int64_t NTEST = argc == 1 ? 1000 : strtoll(argv[1], NULL, 10); + + // Tests if counting ulp numbers is correct + + check(+0.0, +0.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(-0.0, +0.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 10002); + check(+0.0, -0.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 10002); + check(-0.0, -0.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + + check(+1.0, +1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(+1.0, +INFINITY), +1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + check(tlfloat_nextafterq(+1.0, -INFINITY), +1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0.5); + + check(-1.0, -1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(-1.0, +INFINITY), -1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0.5); + check(tlfloat_nextafterq(-1.0, -INFINITY), -1.0, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + + check(INFINITY, INFINITY, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(INFINITY, 0), INFINITY, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, INFINITY); + check(INFINITY, tlfloat_nextafterq(INFINITY, 0), TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + + check(-INFINITY, 
-INFINITY, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(-INFINITY, 0), -INFINITY, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, INFINITY); + check(-INFINITY, tlfloat_nextafterq(-INFINITY, 0), TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + + check(TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(TLFLOAT_FLT128_MIN, 0.0), TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + check(tlfloat_nextafterq(TLFLOAT_FLT128_MIN, 1.0), TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + + check(-TLFLOAT_FLT128_MIN, -TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 0); + check(tlfloat_nextafterq(-TLFLOAT_FLT128_MIN, 0.0), -TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + check(tlfloat_nextafterq(-TLFLOAT_FLT128_MIN, 1.0), -TLFLOAT_FLT128_MIN, TLFLOAT_FLT128_MANT_DIG, TLFLOAT_FLT128_DENORM_MIN, TLFLOAT_FLT128_MAX, 1.0); + + // + +#if !defined(ENABLE_PUREC_SCALAR) && !defined(ENABLE_PURECFMA_SCALAR) && !defined(ENABLE_DSPSCALAR) + // Do simple testing on splat, select and sleef_q + { + VARGQUAD v0 = xsplatq(sleef_q(+0x1921fb54442d1LL, 0x8469898cc51701b8ULL, 1)); + VARGQUAD v1 = xsplatq(sleef_q(+0x0000000000000LL, 0x0000000000000000ULL, 0)); + v1 = xsetq(v1, 1, sleef_q(+0x15bf0a8b14576LL, 0x95355fb8ac404e7aULL, 1)); + v1 = xmulq_u05(v0, v1); + + vint vi = xicmpeqq(v1, xsplatq(sleef_q(+0x1114580b45d47LL, 0x49e6108579a2d0caULL, 3))); + int t[VECTLENDP*2]; + memset(t, 0, sizeof(t)); + vstoreu_v_p_vi(t, vi); + + if (!(t[0] == 0 && t[1] == 1)) { + fprintf(stderr, "Testing on splat and select failed\n"); + exit(-1); + } + } +#endif + +#if defined(SLEEF_QUAD_C) + { + VARGQUAD v0 = 
xsplatq(SLEEF_QUAD_C(3.141592653589793238462643383279502884)); + VARGQUAD v1 = xsplatq(sleef_q(+0x1921fb54442d1LL, 0x8469898cc51701b8ULL, 1)); + if (Sleef_icmpneq1_purec(xgetq(v0, 0), xgetq(v1, 0))) { + fprintf(stderr, "Testing on SLEEF_QUAD_C failed\n"); + exit(-1); + } + } +#elif defined(ENABLE_PUREC_SCALAR) +#pragma message ("SLEEF_QUAD_C not defined") +#endif + + { + VARGQUAD v0 = xsplatq(SLEEF_M_PIq); + VARGQUAD v1 = xsplatq((Sleef_quad)tlfloat_strtoq("2.718281828459045235360287471352662498", NULL)); + Sleef_quad q = xgetq(xmulq_u05(v0, v1), 0); + if (Sleef_icmpneq1_purec(q, (Sleef_quad)tlfloat_strtoq("8.539734222673567065463550869546573820", NULL))) { + tlfloat_printf("Testing with xgetq failed : %.35Qg\n", q); + exit(-1); + } + } + + // + +#define STR_QUAD_MIN "3.36210314311209350626267781732175260e-4932" +#define STR_QUAD_MAX "1.18973149535723176508575932662800702e+4932" +#define STR_QUAD_DENORM_MIN "6.475175119438025110924438958227646552e-4966" + + static const char *stdCheckValsStr[] = { + "-0.0", "0.0", "+0.25", "-0.25", "+0.5", "-0.5", "+0.75", "-0.75", "+1.0", "-1.0", + "+1.25", "-1.25", "+1.5", "-1.5", "+2.0", "-2.0", "+2.5", "-2.5", "+3.0", "-3.0", + "+4.0", "-4.0", "+5.0", "-5.0", "+6.0", "-6.0", "+7.0", "-7.0", + "1.234", "-1.234", "+1.234e+100", "-1.234e+100", "+1.234e-100", "-1.234e-100", + "+1.234e+3000", "-1.234e+3000", "+1.234e-3000", "-1.234e-3000", + "3.1415926535897932384626433832795028841971693993751058209749445923078164", + "+" STR_QUAD_MIN, "-" STR_QUAD_MIN, + "+" STR_QUAD_DENORM_MIN, "-" STR_QUAD_DENORM_MIN, + "Inf", "-Inf", "NaN" + }; + + static const char *noInfCheckValsStr[] = { + "-0.0", "0.0", "+0.25", "-0.25", "+0.5", "-0.5", "+0.75", "-0.75", "+1.0", "-1.0", + "+1.25", "-1.25", "+1.5", "-1.5", "+2.0", "-2.0", "+2.5", "-2.5", "+3.0", "-3.0", + "+4.0", "-4.0", "+5.0", "-5.0", "+6.0", "-6.0", "+7.0", "-7.0", + "1.234", "-1.234", "+1.234e+100", "-1.234e+100", "+1.234e-100", "-1.234e-100", + "+1.234e+3000", "-1.234e+3000", 
"+1.234e-3000", "-1.234e-3000", + "3.1415926535897932384626433832795028841971693993751058209749445923078164", + "+" STR_QUAD_MIN, "-" STR_QUAD_MIN, + "+" STR_QUAD_DENORM_MIN, "-" STR_QUAD_DENORM_MIN, + "NaN" + }; + + static const char *trigCheckValsStr[] = { + "3.141592653589793238462643383279502884197169399375105820974944592307", + "6.283185307179586476925286766559005768394338798750211641949889184615", + "25.13274122871834590770114706623602307357735519500084656779955673846", + "402.1238596594935345232183530597763691772376831200135450847929078154", + "102943.7080728303448379438983833027505093728468787234675417069844007", + "6746518852.261009479299491324448129057382258893044021168813308929687", + "28976077832308491369.53730422794043954984410931622923280838485698255", + "534514292032483373929840186580935391650.3203828374578833308216124114", + "1.8188578844588316214011747138886493132669668866419621497938607555896e+77" + "3.141592653589793238462643383279502884197169399375105820974944592307e+1000", + "3.141592653589793238462643383279502884197169399375105820974944592307e+2000", + }; + + static const char *bigIntCheckValsStr[] = { + "+5192296858534827628530496329220094.0", + "+5192296858534827628530496329220094.25", + "+5192296858534827628530496329220094.5", + "+5192296858534827628530496329220094.75", + "+5192296858534827628530496329220095.0", + "+5192296858534827628530496329220095.25", + "+5192296858534827628530496329220095.5", + "+5192296858534827628530496329220095.75", + "+5192296858534827628530496329220096.0", + "+5192296858534827628530496329220097.0", + "+5192296858534827628530496329220098.0", + "-5192296858534827628530496329220094.0", + "-5192296858534827628530496329220094.25", + "-5192296858534827628530496329220094.5", + "-5192296858534827628530496329220094.75", + "-5192296858534827628530496329220095.0", + "-5192296858534827628530496329220095.25", + "-5192296858534827628530496329220095.5", + "-5192296858534827628530496329220095.75", + 
"-5192296858534827628530496329220096.0", + "-5192296858534827628530496329220097.0", + "-5192296858534827628530496329220098.0", + }; + + static const char *log1pCheckValsStr[] = { + "-.9", "-.99999999", "-.9999999999999999", "-.9999999999999999999999999999999999" + }; + +#define DEFCHECKVALS(ASTR, AVAL) \ + static tlfloat_quad AVAL[sizeof(ASTR)/sizeof(ASTR[0])]; \ + for(unsigned i=0;i(t, c, DBL_MANT_DIG, + SLEEF_DBL_DENORM_MIN, DBL_MAX, true); + if (!((tlfloat_isnan(t) && tlfloat_isnan(c)) || (fabs(t) <= DBL_MIN && u <= 1.0) || t == c)) { + tlfloat_printf("arg0 = %Qa (%.35Qg), t = %a (%.16g), c = %a (%.16g), u = %g\n", + a, a, t, t, c, c, u); + success = false; + break; + } + } + + printf("%s\n", success ? "OK" : "NG"); + } + + { + printf("cast_from_int64q : "); + + xsrand(1); + for(int i=0;i<10 * NTEST;i++) { + int64_t d; + switch(i) { + case 0: d = 0; break; + case 1: d = +0x7fffffffffffffffL; break; + case 2: d = -0x8000000000000000L; break; + default : memrand(&d, sizeof(d)); + } + tlfloat_quad c = tlfloat_quad(d); + tlfloat_quad t = 0; + { + int idx = xrand() % VECTLENDP; + int64_t s[VECTLENDP]; + memrand(s, sizeof(s)); + s[idx] = d; + VARGQUAD q = xcast_from_int64q(vreinterpret_vi64_vm(vreinterpret_vm_vd(vloadu_vd_p((double *)s)))); + t = (tlfloat_quad)xgetq(q, idx); + } + if (t != c) { + tlfloat_printf("arg0 = %016llx (%lld), t = %Qa (%.35Qg), c = %Qa (%.35Qg)\n", + (long long)d, (long long)d, t, t, c, c); + success = false; + break; + } + } + + printf("%s\n", success ? 
"OK" : "NG"); + } + + { + printf("cast_to_int64q : "); + + xsrand(1); + Sleef_quad min = (Sleef_quad)tlfloat_strtoq("0", nullptr), max = (Sleef_quad)tlfloat_strtoq("1e+20", nullptr); + for(int i=0;i<10 * NTEST;i++) { + Sleef_quad a; + if (i < int(sizeof(stdCheckVals)/sizeof(stdCheckVals[0])-1)) { + a = (Sleef_quad)stdCheckVals[i]; + } else { + a = rndf128(min, max, true); + } + int64_t t = 0, c = (int64_t)(tlfloat_quad)a; + { + int idx = xrand() % VECTLENDP; + VARGQUAD v0; + memrand(&v0, SIZEOF_VARGQUAD); + v0 = xsetq(v0, idx, a); + int64_t s[VECTLENDP]; + vstoreu_v_p_vd((double *)s, vreinterpret_vd_vm(vreinterpret_vm_vi64(xcast_to_int64q(v0)))); + t = s[idx]; + } + if (-ldexp(1, 63) < a && a < ldexp(1, 63) && t != c) { + tlfloat_printf("arg0 = %Qa (%.35Qg), t = %016llx (%lld), c = %016llx (%lld)\n", + a, a, (long long)t, (long long)t, (long long)c, (long long)c); + success = false; + break; + } + } + + printf("%s\n", success ? "OK" : "NG"); + } + + { + printf("cast_from_uint64q : "); + + xsrand(1); + for(int i=0;i<10 * NTEST;i++) { + uint64_t d; + switch(i) { + case 0: d = 0; break; + case 1: d = +0x7fffffffffffffffL; break; + case 2: d = -0x8000000000000000L; break; + default : memrand(&d, sizeof(d)); + } + tlfloat_quad c = tlfloat_quad(d); + tlfloat_quad t = 0; + { + int idx = xrand() % VECTLENDP; + uint64_t s[VECTLENDP]; + memrand(s, sizeof(s)); + s[idx] = d; + VARGQUAD q = xcast_from_uint64q(vreinterpret_vu64_vm(vreinterpret_vm_vd(vloadu_vd_p((double *)s)))); + t = (tlfloat_quad)xgetq(q, idx); + } + if (t != c) { + tlfloat_printf("arg0 = %016llx (%lld), t = %Qa (%.35Qg), c = %Qa (%.35Qg)\n", + (long long)d, (long long)d, t, t, c, c); + success = false; + break; + } + } + + printf("%s\n", success ? 
"OK" : "NG"); + } + + { + printf("cast_to_uint64q : "); + + xsrand(1); + Sleef_quad min = (Sleef_quad)tlfloat_strtoq("0", nullptr), max = (Sleef_quad)tlfloat_strtoq("1e+20", nullptr); + for(int i=0;i<10 * NTEST;i++) { + Sleef_quad a; + if (i < int(sizeof(stdCheckVals)/sizeof(stdCheckVals[0])-1)) { + a = (Sleef_quad)stdCheckVals[i]; + } else { + a = rndf128(min, max, true); + } + uint64_t t = 0, c = (uint64_t)(tlfloat_quad)a; + { + int idx = xrand() % VECTLENDP; + VARGQUAD v0; + memrand(&v0, SIZEOF_VARGQUAD); + v0 = xsetq(v0, idx, a); + uint64_t s[VECTLENDP]; + vstoreu_v_p_vd((double *)s, vreinterpret_vd_vm(vreinterpret_vm_vu64(xcast_to_uint64q(v0)))); + t = s[idx]; + } + if (0 <= a && a < ldexp(1, 64) && t != c) { + tlfloat_printf("arg0 = %Qa (%.35Qg), t = %016llx (%lld), c = %016llx (%lld)\n", + a, a, (long long)t, (long long)t, (long long)c, (long long)c); + success = false; + break; + } + } + + printf("%s\n", success ? "OK" : "NG"); + } + + // + + if (success) { + cout << "OK" << endl; + } else { + cout << "NG" << endl; + } + + return success ? 0 : -1; +} diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2printf.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2printf.c index 32ff6301ef0..fe51efe2dea 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2printf.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2printf.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2simdqp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2simdqp.c index 4ddcd36961e..b9133510b7c 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2simdqp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester2simdqp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester3printf.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester3printf.c index 58c5c8e20da..6545c5afb95 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester3printf.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad-tester/tester3printf.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -9,7 +9,11 @@ #include #include -#include +#ifndef SLEEF_USE_INTERNAL_SHA256 +#include +#else +#include "psha2_capi.h" +#endif #include "sleefquad.h" @@ -24,7 +28,7 @@ static void convertEndianness(void *ptr, int len) { #endif } -static void testem(MD5_CTX *ctx, Sleef_quad val, char *types) { +static void testem(EVP_MD_CTX *ctx, Sleef_quad val, char *types) { for(int alt=0;alt<2;alt++) { for(int zero=0;zero<2;zero++) { for(int left=0;left<2;left++) { @@ -43,10 +47,10 @@ static void testem(MD5_CTX *ctx, Sleef_quad val, char *types) { r = Sleef_snprintf(buf, 99, fmt, &val); assert(r < 100); - MD5_Update(ctx, buf, r < 0 ? 0 : r); + EVP_DigestUpdate(ctx, buf, r < 0 ? 0 : r); q = Sleef_strtoq(buf, NULL); convertEndianness(&q, sizeof(q)); - MD5_Update(ctx, &q, sizeof(Sleef_quad)); + EVP_DigestUpdate(ctx, &q, sizeof(Sleef_quad)); for(int width=0;width<=40;width += 2) { snprintf(fmt, 99, "%%%s%s%s%s%s%d.%s", @@ -59,10 +63,10 @@ static void testem(MD5_CTX *ctx, Sleef_quad val, char *types) { r = Sleef_snprintf(buf, 99, fmt, &val); assert(r < 100); - MD5_Update(ctx, buf, r < 0 ? 0 : r); + EVP_DigestUpdate(ctx, buf, r < 0 ? 0 : r); q = Sleef_strtoq(buf, NULL); convertEndianness(&q, sizeof(q)); - MD5_Update(ctx, &q, sizeof(Sleef_quad)); + EVP_DigestUpdate(ctx, &q, sizeof(Sleef_quad)); } for(int prec=0;prec<=40;prec += 3) { @@ -77,10 +81,10 @@ static void testem(MD5_CTX *ctx, Sleef_quad val, char *types) { r = Sleef_snprintf(buf, 99, fmt, &val); assert(r < 100); - MD5_Update(ctx, buf, r < 0 ? 0 : r); + EVP_DigestUpdate(ctx, buf, r < 0 ? 
0 : r); q = Sleef_strtoq(buf, NULL); convertEndianness(&q, sizeof(q)); - MD5_Update(ctx, &q, sizeof(Sleef_quad)); + EVP_DigestUpdate(ctx, &q, sizeof(Sleef_quad)); } snprintf(fmt, 99, "%%%s%s%s%s%s.%d%s", @@ -93,10 +97,10 @@ static void testem(MD5_CTX *ctx, Sleef_quad val, char *types) { r = Sleef_snprintf(buf, 99, fmt, &val); assert(r < 100); - MD5_Update(ctx, buf, r < 0 ? 0 : r); + EVP_DigestUpdate(ctx, buf, r < 0 ? 0 : r); q = Sleef_strtoq(buf, NULL); convertEndianness(&q, sizeof(q)); - MD5_Update(ctx, &q, sizeof(Sleef_quad)); + EVP_DigestUpdate(ctx, &q, sizeof(Sleef_quad)); } } } @@ -233,24 +237,44 @@ int main(int argc, char **argv) { // for(int j=0;j<4;j++) { - MD5_CTX ctx; - memset(&ctx, 0, sizeof(MD5_CTX)); - MD5_Init(&ctx); - - for(int i=0;i ${CMAKE_CURRENT_BINARY_DIR}/sleef${SIMD}.h.qtmp4 - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/sleefquadinline_header.h.org ${CMAKE_CURRENT_BINARY_DIR}/sleef${SIMD}.h.qtmp2 + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/sleefquadinline_header.h.org ${CMAKE_CURRENT_BINARY_DIR}/sleef${SIMD}.h.qtmp2 VERBATIM ) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispatcher.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispatcher.h index 67032ce597c..d18b9524666 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispatcher.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispatcher.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -41,114 +41,115 @@ static int cpuSupportsExt(void (*tryExt)()) { return cache; } -#define DISPATCH_vq_vq(qtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0); \ +#define DISPATCH_vq_vq(qtype, funcName, pfn, dfn, funcExt0, funcExt1) \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0); \ static CONST VECTOR_CC qtype dfn(qtype arg0) { \ - qtype CONST VECTOR_CC (*p)(qtype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ + qtype CONST VECTOR_CC (*p)(qtype arg0) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ + } \ static CONST VECTOR_CC qtype (*pfn)(qtype arg0) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(qtype arg0) { return (*pfn)(arg0); } #define DISPATCH_vq_vq_vq(qtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1); \ - static CONST VECTOR_CC qtype dfn(qtype arg0, qtype arg1) { \ - qtype CONST VECTOR_CC (*p)(qtype arg0, qtype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1) = dfn; \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1); \ + static CONST VECTOR_CC qtype dfn(qtype arg0, qtype arg1) { \ + qtype CONST VECTOR_CC (*p)(qtype arg0, qtype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(qtype arg0, qtype arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_vq_vq_vq_vq(qtype, funcName, pfn, dfn, funcExt0, funcExt1) \ static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1, qtype arg2); \ static CONST VECTOR_CC qtype dfn(qtype arg0, qtype arg1, qtype arg2) { \ qtype CONST VECTOR_CC (*p)(qtype arg0, qtype arg1, qtype arg2) = funcExt0; \ - 
SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1, arg2); \ - } \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1, arg2); \ + } \ static CONST VECTOR_CC qtype (*pfn)(qtype arg0, qtype arg1, qtype arg2) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(qtype arg0, qtype arg1, qtype arg2) { return (*pfn)(arg0, arg1, arg2); } #define DISPATCH_vq_vq_vx(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1); \ - static CONST VECTOR_CC qtype dfn(qtype arg0, xtype arg1) { \ - qtype CONST VECTOR_CC (*p)(qtype arg0, xtype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1) = dfn; \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1); \ + static CONST VECTOR_CC qtype dfn(qtype arg0, xtype arg1) { \ + qtype CONST VECTOR_CC (*p)(qtype arg0, xtype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(qtype arg0, xtype arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_vq_vq_pvx(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static VECTOR_CC qtype (*pfn)(qtype arg0, xtype *arg1); \ - static VECTOR_CC qtype dfn(qtype arg0, xtype *arg1) { \ - qtype VECTOR_CC (*p)(qtype arg0, xtype *arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ - static VECTOR_CC qtype (*pfn)(qtype arg0, xtype *arg1) = dfn; \ + static VECTOR_CC qtype (*pfn)(qtype arg0, xtype *arg1); \ + static VECTOR_CC qtype dfn(qtype arg0, xtype *arg1) { \ + qtype VECTOR_CC (*p)(qtype arg0, xtype *arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ + static VECTOR_CC qtype (*pfn)(qtype arg0, xtype *arg1) = dfn; \ EXPORT VECTOR_CC qtype funcName(qtype 
arg0, xtype *arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_vq_vx(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC qtype (*pfn)(xtype arg0); \ + static CONST VECTOR_CC qtype (*pfn)(xtype arg0); \ static CONST VECTOR_CC qtype dfn(xtype arg0) { \ - qtype CONST VECTOR_CC (*p)(xtype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ + qtype CONST VECTOR_CC (*p)(xtype arg0) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ + } \ static CONST VECTOR_CC qtype (*pfn)(xtype arg0) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(xtype arg0) { return (*pfn)(arg0); } #define DISPATCH_vx_vq(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC xtype (*pfn)(qtype arg0); \ + static CONST VECTOR_CC xtype (*pfn)(qtype arg0); \ static CONST VECTOR_CC xtype dfn(qtype arg0) { \ - xtype CONST VECTOR_CC (*p)(qtype arg0) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ + xtype CONST VECTOR_CC (*p)(qtype arg0) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ return (*pfn)(arg0); \ - } \ + } \ static CONST VECTOR_CC xtype (*pfn)(qtype arg0) = dfn; \ EXPORT CONST VECTOR_CC xtype funcName(qtype arg0) { return (*pfn)(arg0); } #define DISPATCH_vx_vq_vq(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC xtype (*pfn)(qtype arg0, qtype arg1); \ - static CONST VECTOR_CC xtype dfn(qtype arg0, qtype arg1) { \ - xtype CONST VECTOR_CC (*p)(qtype arg0, qtype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ - static CONST VECTOR_CC xtype (*pfn)(qtype arg0, qtype arg1) = dfn; \ + static CONST VECTOR_CC xtype (*pfn)(qtype arg0, qtype arg1); \ + static CONST VECTOR_CC xtype dfn(qtype arg0, qtype arg1) { \ + xtype CONST VECTOR_CC (*p)(qtype arg0, qtype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ + static CONST VECTOR_CC xtype (*pfn)(qtype arg0, qtype 
arg1) = dfn; \ EXPORT CONST VECTOR_CC xtype funcName(qtype arg0, qtype arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_q_vq_vx(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC Sleef_quad (*pfn)(qtype arg0, xtype arg1); \ - static CONST VECTOR_CC Sleef_quad dfn(qtype arg0, xtype arg1) { \ - Sleef_quad CONST VECTOR_CC (*p)(qtype arg0, xtype arg1) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1); \ - } \ + static CONST VECTOR_CC Sleef_quad (*pfn)(qtype arg0, xtype arg1); \ + static CONST VECTOR_CC Sleef_quad dfn(qtype arg0, xtype arg1) { \ + Sleef_quad CONST VECTOR_CC (*p)(qtype arg0, xtype arg1) = funcExt0; \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1); \ + } \ static CONST VECTOR_CC Sleef_quad (*pfn)(qtype arg0, xtype arg1) = dfn; \ EXPORT CONST VECTOR_CC Sleef_quad funcName(qtype arg0, xtype arg1) { return (*pfn)(arg0, arg1); } #define DISPATCH_vq_vq_vi_q(qtype, xtype, funcName, pfn, dfn, funcExt0, funcExt1) \ - static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1, Sleef_quad arg2); \ + static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1, Sleef_quad arg2); \ static CONST VECTOR_CC qtype dfn(qtype arg0, xtype arg1, Sleef_quad arg2) { \ qtype CONST VECTOR_CC (*p)(qtype arg0, xtype arg1, Sleef_quad arg2) = funcExt0; \ - SUBST_IF_EXT1(funcExt1); \ - pfn = p; \ - return (*pfn)(arg0, arg1, arg2); \ - } \ + SUBST_IF_EXT1(funcExt1); \ + pfn = p; \ + return (*pfn)(arg0, arg1, arg2); \ + } \ static CONST VECTOR_CC qtype (*pfn)(qtype arg0, xtype arg1, Sleef_quad arg2) = dfn; \ EXPORT CONST VECTOR_CC qtype funcName(qtype arg0, xtype arg1, Sleef_quad arg2) { return (*pfn)(arg0, arg1, arg2); } // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispscalar.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispscalar.c.org index cc2b89133f5..c4c129202ae 100644 --- 
a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispscalar.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispscalar.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -21,3 +21,4 @@ static void tryFMA() { sleef_cpuid_QUADFMA_0 = Sleef_sinq1_u10purecfma(sleef_cpu #define SUBST_IF_EXT1(funcExt1) if (cpuSupportsExt(tryFMA)) p = funcExt1; // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispx2.c.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispx2.c.org index 3ee64a5702d..533996a1ff4 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispx2.c.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qdispx2.c.org @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2023. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -37,3 +37,4 @@ static void tryEXT1() { sleef_cpuid_EXT = Sleef_sinq2_u10vxe2(sleef_cpuid_EXT); #endif // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qfuncproto.h b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qfuncproto.h index d52e8f8b937..7574ac375ae 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qfuncproto.h +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qfuncproto.h @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkdisp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkdisp.c index 91a9effed3d..6af823220d8 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkdisp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkdisp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkrename.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkrename.c index 17d8cea7725..d5f78c4ca25 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkrename.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/qmkrename.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -231,7 +231,7 @@ int main(int argc, char **argv) { break; case 15: assert(funcList[i].ulp == -1); - printf("SLEEF_IMPORT SLEEF_CONST void Sleef_%sq%s%s%s(Sleef_quad *, %s);\n", + printf("SLEEF_IMPORT void Sleef_%sq%s%s%s(Sleef_quad *, %s);\n", funcList[i].name, wqp, isaub, isaname, vargquadname); @@ -338,3 +338,4 @@ int main(int argc, char **argv) { exit(0); } + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquad_header.h.org.in b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquad_header.h.org.in index f38cd7eb79f..fdcdc0b6493 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquad_header.h.org.in +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquad_header.h.org.in @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -10,6 +10,9 @@ #define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@ #define SLEEF_VERSION_PATCHLEVEL @SLEEF_VERSION_PATCH@ +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + #include "sleef.h" #include @@ -43,14 +46,6 @@ extern "C" // -#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8) -#define SLEEF_FLOAT128_IS_IEEEQP -#endif - -#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__)) -#define SLEEF_LONGDOUBLE_IS_IEEEQP -#endif - #if !defined(Sleef_quad_DEFINED) #define Sleef_quad_DEFINED typedef struct { uint64_t x, y; } Sleef_uint64_2t; @@ -245,3 +240,4 @@ SLEEF_IMPORT int Sleef_registerPrintfHook(); SLEEF_IMPORT void Sleef_unregisterPrintfHook(); // + diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org.in similarity index 99% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org rename to src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org.in index e6626edc4dd..c553c11007b 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_cuda_header.h.org.in @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) @@ -7,6 +7,9 @@ // Use --fmad=false option to compile this file // Include cmath, cfloat and cstdint before including this file +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + #ifndef SLEEF_FP_ILOGB0 #define SLEEF_FP_ILOGB0 ((int)0x80000000) #endif diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org.in similarity index 99% rename from src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org rename to src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org.in index 2001d577aec..8ca63a57932 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefquadinline_header.h.org.in @@ -1,9 +1,12 @@ -// Copyright Naoki Shibata and contributors 2010 - 2021. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. 
// (See http://www.boost.org/LICENSE_1_0.txt) // This file is generated by SLEEF SLEEF_VERSION_SLEEF +#cmakedefine SLEEF_FLOAT128_IS_IEEEQP +#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP + #ifndef SLEEF_ALWAYS_INLINE #if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) #define SLEEF_ALWAYS_INLINE inline __attribute__((always_inline)) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefsimdqp.c b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefsimdqp.c index 781b55132a2..db7b36093fc 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefsimdqp.c +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/src/quad/sleefsimdqp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata and contributors 2010 - 2024. +// Copyright Naoki Shibata and contributors 2010 - 2025. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -15,7 +15,7 @@ #include "quaddef.h" #include "misc.h" -#ifndef SLEEF_ENABLE_CUDA +#ifndef ENABLE_CUDA extern const double Sleef_rempitabqp[]; #endif @@ -47,7 +47,7 @@ extern const double Sleef_rempitabqp[]; #endif #endif -#ifdef SLEEF_ENABLE_CUDA +#ifdef ENABLE_CUDA #define CONFIG 3 #include "helperpurec_scalar.h" #ifdef DORENAME @@ -737,11 +737,9 @@ static INLINE CONST VECTOR_CC vdouble2 poly27dd(vdouble2 x, vdouble c26, double2 // -#ifndef SLEEF_ENABLE_CUDA typedef struct { double x, y, z; } double3; -#endif static INLINE CONST VECTOR_CC vdouble3 cast_vd3_d3(double3 td) { return vd3setxyz_vd3_vd_vd_vd(vcast_vd_d(td.x), vcast_vd_d(td.y), vcast_vd_d(td.z)); @@ -975,6 +973,8 @@ static INLINE CONST VECTOR_CC vmask ilogb_vm_tdx(tdx t) { static INLINE CONST VECTOR_CC tdx add_tdx_tdx_tdx(tdx dd0, tdx dd1) { // finite numbers only vmask ed = vsub64_vm_vm_vm(tdxgete_vm_tdx(dd1), tdxgete_vm_tdx(dd0)); + ed = vsel_vm_vo64_vm_vm(vandnot_vo_vo_vo(iszero_vo_tdx(dd1), 
iszero_vo_tdx(dd0)), vcast_vm_i64( 1000000), ed); + ed = vsel_vm_vo64_vm_vm(vandnot_vo_vo_vo(iszero_vo_tdx(dd0), iszero_vo_tdx(dd1)), vcast_vm_i64(-1000000), ed); vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), ed); vdouble3 rd3 = scaleadd2_vd3_vd3_vd3_vd(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1), t); @@ -993,6 +993,8 @@ static INLINE CONST VECTOR_CC tdx add_tdx_tdx_tdx(tdx dd0, tdx dd1) { // finite static INLINE CONST VECTOR_CC tdx sub_tdx_tdx_tdx(tdx dd0, tdx dd1) { vmask ed = vsub64_vm_vm_vm(tdxgete_vm_tdx(dd1), tdxgete_vm_tdx(dd0)); + ed = vsel_vm_vo64_vm_vm(vandnot_vo_vo_vo(iszero_vo_tdx(dd1), iszero_vo_tdx(dd0)), vcast_vm_i64( 1000000), ed); + ed = vsel_vm_vo64_vm_vm(vandnot_vo_vo_vo(iszero_vo_tdx(dd0), iszero_vo_tdx(dd1)), vcast_vm_i64(-1000000), ed); vdouble t = vldexp3_vd_vd_vm(vcast_vd_d(1), ed); vdouble3 rd3 = scalesub2_vd3_vd3_vd3_vd(tdxgetd3_vd3_tdx(dd0), tdxgetd3_vd3_tdx(dd1), t); @@ -2786,7 +2788,7 @@ EXPORT CONST VECTOR_CC vint xicmpneq(vargquad ax, vargquad ay) { vquad y = cast_vq_aq(ay), cy = cmpcnv_vq_vq(y); vopmask o = isnan_vo_vq(x); o = vandnot_vo_vo_vo(o, vnot_vo64_vo64(vand_vo_vo_vo(veq64_vo_vm_vm(vqgety_vm_vq(cy), vqgety_vm_vq(cx)), veq64_vo_vm_vm(vqgetx_vm_vq(cx), vqgetx_vm_vq(cy))))); - o = vcast_vo32_vo64(vandnot_vo_vo_vo(isnan_vo_vq(y), o)); + o = vcast_vo32_vo64(vor_vo_vo_vo(vor_vo_vo_vo(isnan_vo_vq(x), isnan_vo_vq(y)), o)); vint vi = vsel_vi_vo_vi_vi(o, vcast_vi_i(1), vcast_vi_i(0)); return vi; } @@ -3209,7 +3211,7 @@ EXPORT CONST VECTOR_CC vargquad xldexpq(vargquad aa, vint e) { #ifndef ENABLE_SVE -#ifndef SLEEF_ENABLE_CUDA +#ifndef ENABLE_CUDA #define EXPORT2 EXPORT #define CONST2 CONST #else @@ -3554,7 +3556,7 @@ EXPORT vargquad Sleef_strtoq(const char *str, const char **endptr) { #define FLAG_UPPER (1 << 5) static int snprintquad(char *buf, size_t bufsize, vargquad argvalue, int typespec, int width, int precision, int flags) { - if (width > bufsize) width = bufsize; + if (width > (int)bufsize) width = bufsize; vquad c128 = 
cast_vq_aq(argvalue); @@ -3585,7 +3587,7 @@ static int snprintquad(char *buf, size_t bufsize, vargquad argvalue, int typespe flags &= ~FLAG_ZERO; } else { if (precision < 0) precision = 6; - if (precision > bufsize/2 - 10) precision = bufsize/2 - 10; + if (precision > (int)(bufsize/2 - 10)) precision = bufsize/2 - 10; if (typespec == 'g' && precision > 0) precision--; tdx rounder = mul_tdx_tdx_tdx(cast_tdx_d(0.5), exp10i(-precision)); @@ -3712,7 +3714,7 @@ static int snprintquad(char *buf, size_t bufsize, vargquad argvalue, int typespe } static int snprintquadhex(char *buf, size_t bufsize, vargquad argvalue, int width, int precision, int flags) { - if (width > bufsize) width = bufsize; + if (width > (int)bufsize) width = bufsize; char *bufend = buf + bufsize, *ptr = buf; vquad c128 = cast_vq_aq(argvalue); @@ -3995,6 +3997,8 @@ static int xvprintf(size_t (*consumer)(const char *ptr, size_t size, void *arg), outlen += (*consumer)(xbuf, strlen(xbuf), arg); } + va_end(ap2); + fmt++; } diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/aarch64-gcc.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/aarch64-gcc.cmake index c3594551ee1..e3b12ea8513 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/aarch64-gcc.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/aarch64-gcc.cmake @@ -4,7 +4,18 @@ SET (CMAKE_SYSTEM_PROCESSOR "aarch64") SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu /usr/include/aarch64-linux-gnu /usr/lib/aarch64-linux-gnu /lib/aarch64-linux-gnu) -find_program(CMAKE_C_COMPILER NAMES aarch64-linux-gnu-gcc-11 aarch64-linux-gnu-gcc-8 aarch64-linux-gnu-gcc-7 aarch64-linux-gnu-gcc-6 aarch64-linux-gnu-gcc-5 aarch64-linux-gnu-gcc) +find_program(CMAKE_C_COMPILER + NAMES aarch64-linux-gnu-gcc-14 + aarch64-linux-gnu-gcc-13 + aarch64-linux-gnu-gcc-12 + aarch64-linux-gnu-gcc-11 + aarch64-linux-gnu-gcc-10 + aarch64-linux-gnu-gcc-9 + aarch64-linux-gnu-gcc-8 + 
aarch64-linux-gnu-gcc-7 + aarch64-linux-gnu-gcc-6 + aarch64-linux-gnu-gcc-5 + aarch64-linux-gnu-gcc) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/armhf-gcc.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/armhf-gcc.cmake index 24e160b965b..f0d91e62a38 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/armhf-gcc.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/armhf-gcc.cmake @@ -4,7 +4,18 @@ SET (CMAKE_SYSTEM_PROCESSOR "armhf") SET(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabihf /usr/include/arm-linux-gnueabihf /usr/lib/arm-linux-gnueabihf) -find_program(CMAKE_C_COMPILER NAMES arm-linux-gnueabihf-gcc-11 arm-linux-gnueabihf-gcc-8 arm-linux-gnueabihf-gcc-7 arm-linux-gnueabihf-gcc-6 arm-linux-gnueabihf-gcc-5 arm-linux-gnueabihf-gcc) +find_program(CMAKE_C_COMPILER + NAMES arm-linux-gnueabihf-gcc-14 + arm-linux-gnueabihf-gcc-13 + arm-linux-gnueabihf-gcc-12 + arm-linux-gnueabihf-gcc-11 + arm-linux-gnueabihf-gcc-10 + arm-linux-gnueabihf-gcc-9 + arm-linux-gnueabihf-gcc-8 + arm-linux-gnueabihf-gcc-7 + arm-linux-gnueabihf-gcc-6 + arm-linux-gnueabihf-gcc-5 + arm-linux-gnueabihf-gcc) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-gcc.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-gcc.cmake index 07ea294d378..88f246f983b 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-gcc.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-gcc.cmake @@ -1 +1,2 @@ -find_program(CMAKE_C_COMPILER gcc) +find_program(CMAKE_C_COMPILER NAMES gcc-15 gcc-14 gcc-13 gcc-12 gcc-11 gcc-10 gcc) +find_program(CMAKE_CXX_COMPILER NAMES g++-15 g++-14 g++-13 g++-12 g++-11 g++-10 g++) 
diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-llvm.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-llvm.cmake index 6f8e7121afa..c07c89a3263 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-llvm.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/native-llvm.cmake @@ -1 +1,2 @@ -find_program(CMAKE_C_COMPILER NAMES clang-17 clang-16 clang-15 clang-14 clang-13 clang) +find_program(CMAKE_C_COMPILER NAMES clang-19 clang-18 clang-17 clang-16 clang-15 clang) +find_program(CMAKE_CXX_COMPILER NAMES clang++-19 clang++-18 clang++-17 clang++-16 clang++-15 clang++) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-gcc.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-gcc.cmake index 7d6c96ae203..2505f4aad9a 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-gcc.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-gcc.cmake @@ -2,12 +2,16 @@ SET (CMAKE_CROSSCOMPILING TRUE) SET (CMAKE_SYSTEM_NAME "Linux") SET (CMAKE_SYSTEM_PROCESSOR "ppc64") -SET(CMAKE_FIND_ROOT_PATH /usr/powerpc64le-linux-gnu /usr/include/powerpc64le-linux-gnu /usr/lib/powerpc64le-linux-gnu) +SET(CMAKE_FIND_ROOT_PATH /usr/powerpc64le-linux-gnu /usr/include/powerpc64le-linux-gnu /usr/lib/powerpc64le-linux-gnu) -find_program(CMAKE_C_COMPILER NAMES powerpc64le-linux-gnu-gcc-11 powerpc64le-linux-gnu-gcc ppc64el-cc) +execute_process(COMMAND bash -c "compgen -c | egrep '^powerpc64le-linux-gnu-gcc(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE GCC_CANDIDATES) +string(REPLACE "\n" ";" GCC_CANDIDATES "${GCC_CANDIDATES}") +execute_process(COMMAND bash -c "compgen -c | egrep '^powerpc64le-linux-gnu-g\\+\\+(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE GXX_CANDIDATES) +string(REPLACE "\n" ";" GXX_CANDIDATES 
"${GXX_CANDIDATES}") -SET(CMAKE_AR /usr/powerpc64le-linux-gnu/bin/ar) +find_program(CMAKE_C_COMPILER NAMES ${GCC_CANDIDATES}) +find_program(CMAKE_CXX_COMPILER NAMES ${GXX_CANDIDATES}) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-llvm.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-llvm.cmake index 531b36f35f8..7d0ba02dd22 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-llvm.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/ppc64el-llvm.cmake @@ -2,13 +2,18 @@ SET (CMAKE_CROSSCOMPILING TRUE) SET (CMAKE_SYSTEM_NAME "Linux") SET (CMAKE_SYSTEM_PROCESSOR "ppc64") -SET(CMAKE_FIND_ROOT_PATH /usr/powerpc64le-linux-gnu /usr/include/powerpc64le-linux-gnu /usr/lib/powerpc64le-linux-gnu) +SET(CMAKE_FIND_ROOT_PATH /usr/powerpc64le-linux-gnu /usr/include/powerpc64le-linux-gnu /usr/lib/powerpc64le-linux-gnu) -find_program(CMAKE_C_COMPILER NAMES clang-17 clang-16 clang-15 clang-14 clang-13 clang) +execute_process(COMMAND bash -c "compgen -c | egrep '^clang(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE CLANG_CANDIDATES) +string(REPLACE "\n" ";" CLANG_CANDIDATES "${CLANG_CANDIDATES}") +execute_process(COMMAND bash -c "compgen -c | egrep '^clang\\+\\+(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE CLANGXX_CANDIDATES) +string(REPLACE "\n" ";" CLANGXX_CANDIDATES "${CLANGXX_CANDIDATES}") + +find_program(CMAKE_C_COMPILER NAMES ${CLANG_CANDIDATES}) set(CMAKE_C_COMPILER_TARGET powerpc64le-linux-gnu) - -SET(CMAKE_AR /usr/powerpc64le-linux-gnu/bin/ar) +find_program(CMAKE_CXX_COMPILER NAMES ${CLANGXX_CANDIDATES}) +set(CMAKE_CXX_COMPILER_TARGET powerpc64le-linux-gnu) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 
-SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-gcc.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-gcc.cmake index 4aa9f12cfb1..a3701f4eaa2 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-gcc.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-gcc.cmake @@ -2,9 +2,15 @@ SET (CMAKE_CROSSCOMPILING TRUE) SET (CMAKE_SYSTEM_NAME "Linux") SET (CMAKE_SYSTEM_PROCESSOR "s390x") -SET(CMAKE_FIND_ROOT_PATH /usr/s390x-linux-gnu /usr/include/s390x-linux-gnu /usr/lib/s390x-linux-gnu) +SET(CMAKE_FIND_ROOT_PATH /usr/s390x-linux-gnu /usr/include/s390x-linux-gnu /usr/lib/s390x-linux-gnu) -find_program(CMAKE_C_COMPILER NAMES s390x-linux-gnu-gcc-11 s390x-linux-gnu-gcc) +execute_process(COMMAND bash -c "compgen -c | egrep '^s390x-linux-gnu-gcc(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE GCC_CANDIDATES) +string(REPLACE "\n" ";" GCC_CANDIDATES "${GCC_CANDIDATES}") +execute_process(COMMAND bash -c "compgen -c | egrep '^s390x-linux-gnu-g\\+\\+(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE GXX_CANDIDATES) +string(REPLACE "\n" ";" GXX_CANDIDATES "${GXX_CANDIDATES}") + +find_program(CMAKE_C_COMPILER NAMES ${GCC_CANDIDATES}) +find_program(CMAKE_CXX_COMPILER NAMES ${GXX_CANDIDATES}) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-llvm.cmake b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-llvm.cmake index ca5e96878d1..9c6a359a3d7 100644 --- a/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-llvm.cmake +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/toolchains/s390x-llvm.cmake @@ -2,10 +2,17 @@ SET 
(CMAKE_CROSSCOMPILING TRUE) SET (CMAKE_SYSTEM_NAME "Linux") SET (CMAKE_SYSTEM_PROCESSOR "s390x") -SET(CMAKE_FIND_ROOT_PATH /usr/s390x-linux-gnu /usr/include/s390x-linux-gnu /usr/lib/s390x-linux-gnu) +SET(CMAKE_FIND_ROOT_PATH /usr/s390x-linux-gnu /usr/include/s390x-linux-gnu /usr/lib/s390x-linux-gnu) -find_program(CMAKE_C_COMPILER NAMES clang-17 clang-16 clang-15 clang-14 clang-13 clang) +execute_process(COMMAND bash -c "compgen -c | egrep '^clang(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE CLANG_CANDIDATES) +string(REPLACE "\n" ";" CLANG_CANDIDATES "${CLANG_CANDIDATES}") +execute_process(COMMAND bash -c "compgen -c | egrep '^clang\\+\\+(-[0-9]+(\\.[0-9]+\\.[0-9]+)?)?$' | sort -nr | uniq" OUTPUT_VARIABLE CLANGXX_CANDIDATES) +string(REPLACE "\n" ";" CLANGXX_CANDIDATES "${CLANGXX_CANDIDATES}") + +find_program(CMAKE_C_COMPILER NAMES ${CLANG_CANDIDATES}) set(CMAKE_C_COMPILER_TARGET s390x-linux-gnu) +find_program(CMAKE_CXX_COMPILER NAMES ${CLANGXX_CANDIDATES}) +set(CMAKE_CXX_COMPILER_TARGET s390x-linux-gnu) SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-clang.bat b/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-clang.bat new file mode 100644 index 00000000000..7dbba797274 --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-clang.bat @@ -0,0 +1,29 @@ +@echo off +if "%INSTALLDIR%"=="" set INSTALLDIR=..\..\sleef_install + +if NOT exist winbuild-clang.bat exit /b 255 + +if "%VSCMD_ARG_HOST_ARCH%"=="" ( +echo Run this batch file from Developer Command Prompt for VS 20XX +exit /b 255 +) + +if "%CLANGINSTALLDIR%"=="" set CLANGINSTALLDIR=%VCINSTALLDIR%Tools\Llvm\x64 + +if NOT exist "%CLANGINSTALLDIR%\bin\clang.exe" ( +echo Cannot find "%CLANGINSTALLDIR%\bin\clang.exe" +echo Please set CLANGINSTALLDIR correctly. 
+exit /b 255 +) + +if %VSCMD_ARG_HOST_ARCH%==x86 call "%VCINSTALLDIR%Auxiliary\Build\vcvars64.bat" + +if exist build\ rmdir /S /Q build +mkdir build +cd build +if exist %INSTALLDIR%\ rmdir /S /Q %INSTALLDIR% +cmake -GNinja .. -DCMAKE_C_COMPILER:PATH="%CLANGINSTALLDIR%\bin\clang.exe" -DCMAKE_CXX_COMPILER:PATH="%CLANGINSTALLDIR%\bin\clang++.exe" -DCMAKE_INSTALL_PREFIX=%INSTALLDIR% %* +if %ERRORLEVEL% neq 0 exit /b 255 +cmake -E time ninja +if %ERRORLEVEL% neq 0 exit /b 255 +ninja install diff --git a/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-msvc.bat b/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-msvc.bat new file mode 100644 index 00000000000..386aff867bb --- /dev/null +++ b/src/jdk.incubator.vector/unix/native/libsleef/upstream/winbuild-msvc.bat @@ -0,0 +1,21 @@ +@echo off +if "%INSTALLDIR%"=="" set INSTALLDIR=..\..\sleef_install + +if NOT exist winbuild-msvc.bat exit /b 255 + +if "%VSCMD_ARG_HOST_ARCH%"=="" ( +echo Run this batch file from Developer Command Prompt for VS 20XX +exit /b 255 +) + +if %VSCMD_ARG_HOST_ARCH%==x86 call "%VCINSTALLDIR%Auxiliary\Build\vcvars64.bat" + +if exist build\ rmdir /S /Q build +mkdir build +cd build +if exist %INSTALLDIR%\ rmdir /S /Q %INSTALLDIR% +cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=%INSTALLDIR% %* +if %ERRORLEVEL% neq 0 exit /b 255 +cmake -E time ninja +if %ERRORLEVEL% neq 0 exit /b 255 +ninja install diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorLibrarySleefUnaryOpAndBinaryOp.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorLibrarySleefUnaryOpAndBinaryOp.java new file mode 100644 index 00000000000..07f9acaa195 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorLibrarySleefUnaryOpAndBinaryOp.java @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import jtreg.SkippedException; + +import static jdk.incubator.vector.VectorOperators.ACOS; +import static jdk.incubator.vector.VectorOperators.ADD; +import static jdk.incubator.vector.VectorOperators.ASIN; +import static jdk.incubator.vector.VectorOperators.ATAN; +import static jdk.incubator.vector.VectorOperators.ATAN2; +import static jdk.incubator.vector.VectorOperators.CBRT; +import static jdk.incubator.vector.VectorOperators.COS; +import static jdk.incubator.vector.VectorOperators.COSH; +import static jdk.incubator.vector.VectorOperators.EXP; +import static jdk.incubator.vector.VectorOperators.EXPM1; +import static jdk.incubator.vector.VectorOperators.HYPOT; +import static jdk.incubator.vector.VectorOperators.LOG; +import static jdk.incubator.vector.VectorOperators.LOG10; +import static jdk.incubator.vector.VectorOperators.LOG1P; +import static jdk.incubator.vector.VectorOperators.POW; +import static jdk.incubator.vector.VectorOperators.SIN; +import 
static jdk.incubator.vector.VectorOperators.SINH; +import static jdk.incubator.vector.VectorOperators.TAN; + +/** + * @test + * @bug 8376602 + * @library /test/lib / + * @requires (os.arch == "aarch64" & vm.cpu.features ~= ".*asimd.*") | + * (os.arch == "riscv64" & vm.cpu.features ~= ".*rvv.*") + * @summary VectorAPI: SLEEF unary and binary math library operations should be intrinsified. + * This test is run on SVML/SLEEF supported platforms only. + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.TestVectorLibrarySleefUnaryOpAndBinaryOp + */ + +public class TestVectorLibrarySleefUnaryOpAndBinaryOp { + private static final VectorSpecies F_SPECIES = FloatVector.SPECIES_128; + private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_128; + + // TANH is not included because VectorMathLibrary.SLEEF intentionally rejects it. + private static final int SLEEF_UNARY_OP_COUNT = 14; + private static final int SLEEF_BINARY_OP_COUNT = 3; + + @Test + @IR(counts = { IRNode.CALL_LEAF_VECTOR, "= " + SLEEF_UNARY_OP_COUNT }) + public static float testFloatUnary() { + FloatVector v = FloatVector.broadcast(F_SPECIES, 3.14f); + FloatVector r = FloatVector.zero(F_SPECIES); + + r = r.add(v.lanewise(SIN)); + r = r.add(v.lanewise(COS)); + r = r.add(v.lanewise(TAN)); + r = r.add(v.lanewise(ASIN)); + r = r.add(v.lanewise(ACOS)); + r = r.add(v.lanewise(ATAN)); + r = r.add(v.lanewise(EXP)); + r = r.add(v.lanewise(LOG)); + r = r.add(v.lanewise(LOG10)); + r = r.add(v.lanewise(CBRT)); + r = r.add(v.lanewise(SINH)); + r = r.add(v.lanewise(COSH)); + r = r.add(v.lanewise(EXPM1)); + r = r.add(v.lanewise(LOG1P)); + + return r.reduceLanes(ADD); + } + + @Test + @IR(counts = { IRNode.CALL_LEAF_VECTOR, "= " + SLEEF_UNARY_OP_COUNT }) + public static double testDoubleUnary() { + DoubleVector v = DoubleVector.broadcast(D_SPECIES, 3.14d); + DoubleVector r = DoubleVector.zero(D_SPECIES); + + r = r.add(v.lanewise(SIN)); + r = r.add(v.lanewise(COS)); + r = r.add(v.lanewise(TAN)); 
+ r = r.add(v.lanewise(ASIN)); + r = r.add(v.lanewise(ACOS)); + r = r.add(v.lanewise(ATAN)); + r = r.add(v.lanewise(EXP)); + r = r.add(v.lanewise(LOG)); + r = r.add(v.lanewise(LOG10)); + r = r.add(v.lanewise(CBRT)); + r = r.add(v.lanewise(SINH)); + r = r.add(v.lanewise(COSH)); + r = r.add(v.lanewise(EXPM1)); + r = r.add(v.lanewise(LOG1P)); + + return r.reduceLanes(ADD); + } + + @Test + @IR(counts = { IRNode.CALL_LEAF_VECTOR, "= " + SLEEF_BINARY_OP_COUNT }) + public static float testFloatBinary() { + FloatVector v1 = FloatVector.broadcast(F_SPECIES, 3.14f); + FloatVector v2 = FloatVector.broadcast(F_SPECIES, 0.5f); + FloatVector r = FloatVector.zero(F_SPECIES); + + r = r.add(v1.lanewise(ATAN2, v2)); + r = r.add(v1.lanewise(POW, v2)); + r = r.add(v1.lanewise(HYPOT, v2)); + + return r.reduceLanes(ADD); + } + + @Test + @IR(counts = { IRNode.CALL_LEAF_VECTOR, "= " + SLEEF_BINARY_OP_COUNT }) + public static double testDoubleBinary() { + DoubleVector v1 = DoubleVector.broadcast(D_SPECIES, 3.14d); + DoubleVector v2 = DoubleVector.broadcast(D_SPECIES, 0.5d); + DoubleVector r = DoubleVector.zero(D_SPECIES); + + r = r.add(v1.lanewise(ATAN2, v2)); + r = r.add(v1.lanewise(POW, v2)); + r = r.add(v1.lanewise(HYPOT, v2)); + + return r.reduceLanes(ADD); + } + + private static void checkSleef() { + try { + System.loadLibrary("sleef"); + } catch (UnsatisfiedLinkError _) { + throw new SkippedException("SLEEF not found"); + } + } + + public static void main(String[] args) { + checkSleef(); + + TestFramework testFramework = new TestFramework(); + testFramework.addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorTranscendentalBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorTranscendentalBenchmark.java new file mode 100644 index 00000000000..355bbb9f8f4 --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorTranscendentalBenchmark.java @@ -0,0 +1,364 @@ +/* + * 
Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(value = 1, jvmArgs = {"--add-modules=jdk.incubator.vector"}) +@State(Scope.Thread) +public class VectorTranscendentalBenchmark { + // TANH is intentionally excluded because the SLEEF VectorMathLibrary backend rejects it. 
+ @Param({"1024", "4096", "16384"}) + private int size; + + @Param({"128", "256"}) + private int bits; + + private VectorSpecies fsp; + private VectorSpecies dsp; + + private FloatVector[] anyF; + private FloatVector[] smallF; + private FloatVector[] unitF; + private FloatVector[] positiveF; + private FloatVector[] log1pF; + private FloatVector[] powBaseF; + private FloatVector[] powExpF; + + private DoubleVector[] anyD; + private DoubleVector[] smallD; + private DoubleVector[] unitD; + private DoubleVector[] positiveD; + private DoubleVector[] log1pD; + private DoubleVector[] powBaseD; + private DoubleVector[] powExpD; + + @Setup + public void setup() { + fsp = switch (bits) { + case 128 -> FloatVector.SPECIES_128; + case 256 -> FloatVector.SPECIES_256; + default -> throw new IllegalArgumentException("Unsupported vector size: " + bits); + }; + dsp = switch (bits) { + case 128 -> DoubleVector.SPECIES_128; + case 256 -> DoubleVector.SPECIES_256; + default -> throw new IllegalArgumentException("Unsupported vector size: " + bits); + }; + + float[] anyFArr = new float[size]; + float[] smallFArr = new float[size]; + float[] unitFArr = new float[size]; + float[] positiveFArr = new float[size]; + float[] log1pFArr = new float[size]; + float[] powBaseFArr = new float[size]; + float[] powExpFArr = new float[size]; + + double[] anyDArr = new double[size]; + double[] smallDArr = new double[size]; + double[] unitDArr = new double[size]; + double[] positiveDArr = new double[size]; + double[] log1pDArr = new double[size]; + double[] powBaseDArr = new double[size]; + double[] powExpDArr = new double[size]; + + Random random = new Random(42); + for (int i = 0; i < size; i++) { + double any = (random.nextDouble() - 0.5d) * Math.PI * 4.0d; + double small = (random.nextDouble() - 0.5d) * 2.0d; + double unit = (random.nextDouble() - 0.5d) * 1.8d; + double positive = random.nextDouble() * 10.0d + 0.1d; + double log1p = random.nextDouble() * 4.5d - 0.5d; + double powBase = 
random.nextDouble() * 10.0d + 0.1d; + double powExp = (random.nextDouble() - 0.5d) * 4.0d; + + anyFArr[i] = (float) any; + smallFArr[i] = (float) small; + unitFArr[i] = (float) unit; + positiveFArr[i] = (float) positive; + log1pFArr[i] = (float) log1p; + powBaseFArr[i] = (float) powBase; + powExpFArr[i] = (float) powExp; + + anyDArr[i] = any; + smallDArr[i] = small; + unitDArr[i] = unit; + positiveDArr[i] = positive; + log1pDArr[i] = log1p; + powBaseDArr[i] = powBase; + powExpDArr[i] = powExp; + } + + anyF = loadFloatVectors(anyFArr); + smallF = loadFloatVectors(smallFArr); + unitF = loadFloatVectors(unitFArr); + positiveF = loadFloatVectors(positiveFArr); + log1pF = loadFloatVectors(log1pFArr); + powBaseF = loadFloatVectors(powBaseFArr); + powExpF = loadFloatVectors(powExpFArr); + + anyD = loadDoubleVectors(anyDArr); + smallD = loadDoubleVectors(smallDArr); + unitD = loadDoubleVectors(unitDArr); + positiveD = loadDoubleVectors(positiveDArr); + log1pD = loadDoubleVectors(log1pDArr); + powBaseD = loadDoubleVectors(powBaseDArr); + powExpD = loadDoubleVectors(powExpDArr); + } + + @Benchmark + public void floatTan(Blackhole bh) { + unaryFloat(VectorOperators.TAN, smallF, bh); + } + + @Benchmark + public void doubleTan(Blackhole bh) { + unaryDouble(VectorOperators.TAN, smallD, bh); + } + + @Benchmark + public void floatSin(Blackhole bh) { + unaryFloat(VectorOperators.SIN, anyF, bh); + } + + @Benchmark + public void doubleSin(Blackhole bh) { + unaryDouble(VectorOperators.SIN, anyD, bh); + } + + @Benchmark + public void floatSinh(Blackhole bh) { + unaryFloat(VectorOperators.SINH, smallF, bh); + } + + @Benchmark + public void doubleSinh(Blackhole bh) { + unaryDouble(VectorOperators.SINH, smallD, bh); + } + + @Benchmark + public void floatCos(Blackhole bh) { + unaryFloat(VectorOperators.COS, anyF, bh); + } + + @Benchmark + public void doubleCos(Blackhole bh) { + unaryDouble(VectorOperators.COS, anyD, bh); + } + + @Benchmark + public void floatCosh(Blackhole bh) { + 
unaryFloat(VectorOperators.COSH, smallF, bh); + } + + @Benchmark + public void doubleCosh(Blackhole bh) { + unaryDouble(VectorOperators.COSH, smallD, bh); + } + + @Benchmark + public void floatAsin(Blackhole bh) { + unaryFloat(VectorOperators.ASIN, unitF, bh); + } + + @Benchmark + public void doubleAsin(Blackhole bh) { + unaryDouble(VectorOperators.ASIN, unitD, bh); + } + + @Benchmark + public void floatAcos(Blackhole bh) { + unaryFloat(VectorOperators.ACOS, unitF, bh); + } + + @Benchmark + public void doubleAcos(Blackhole bh) { + unaryDouble(VectorOperators.ACOS, unitD, bh); + } + + @Benchmark + public void floatAtan(Blackhole bh) { + unaryFloat(VectorOperators.ATAN, anyF, bh); + } + + @Benchmark + public void doubleAtan(Blackhole bh) { + unaryDouble(VectorOperators.ATAN, anyD, bh); + } + + @Benchmark + public void floatCbrt(Blackhole bh) { + unaryFloat(VectorOperators.CBRT, anyF, bh); + } + + @Benchmark + public void doubleCbrt(Blackhole bh) { + unaryDouble(VectorOperators.CBRT, anyD, bh); + } + + @Benchmark + public void floatLog(Blackhole bh) { + unaryFloat(VectorOperators.LOG, positiveF, bh); + } + + @Benchmark + public void doubleLog(Blackhole bh) { + unaryDouble(VectorOperators.LOG, positiveD, bh); + } + + @Benchmark + public void floatLog10(Blackhole bh) { + unaryFloat(VectorOperators.LOG10, positiveF, bh); + } + + @Benchmark + public void doubleLog10(Blackhole bh) { + unaryDouble(VectorOperators.LOG10, positiveD, bh); + } + + @Benchmark + public void floatLog1p(Blackhole bh) { + unaryFloat(VectorOperators.LOG1P, log1pF, bh); + } + + @Benchmark + public void doubleLog1p(Blackhole bh) { + unaryDouble(VectorOperators.LOG1P, log1pD, bh); + } + + @Benchmark + public void floatExp(Blackhole bh) { + unaryFloat(VectorOperators.EXP, smallF, bh); + } + + @Benchmark + public void doubleExp(Blackhole bh) { + unaryDouble(VectorOperators.EXP, smallD, bh); + } + + @Benchmark + public void floatExpm1(Blackhole bh) { + unaryFloat(VectorOperators.EXPM1, smallF, bh); + } + + 
@Benchmark + public void doubleExpm1(Blackhole bh) { + unaryDouble(VectorOperators.EXPM1, smallD, bh); + } + + @Benchmark + public void floatAtan2(Blackhole bh) { + binaryFloat(VectorOperators.ATAN2, anyF, smallF, bh); + } + + @Benchmark + public void doubleAtan2(Blackhole bh) { + binaryDouble(VectorOperators.ATAN2, anyD, smallD, bh); + } + + @Benchmark + public void floatPow(Blackhole bh) { + binaryFloat(VectorOperators.POW, powBaseF, powExpF, bh); + } + + @Benchmark + public void doublePow(Blackhole bh) { + binaryDouble(VectorOperators.POW, powBaseD, powExpD, bh); + } + + @Benchmark + public void floatHypot(Blackhole bh) { + binaryFloat(VectorOperators.HYPOT, anyF, smallF, bh); + } + + @Benchmark + public void doubleHypot(Blackhole bh) { + binaryDouble(VectorOperators.HYPOT, anyD, smallD, bh); + } + + private FloatVector[] loadFloatVectors(float[] input) { + FloatVector[] vectors = new FloatVector[size / fsp.length()]; + for (int i = 0; i < vectors.length; i++) { + vectors[i] = FloatVector.fromArray(fsp, input, i * fsp.length()); + } + return vectors; + } + + private DoubleVector[] loadDoubleVectors(double[] input) { + DoubleVector[] vectors = new DoubleVector[size / dsp.length()]; + for (int i = 0; i < vectors.length; i++) { + vectors[i] = DoubleVector.fromArray(dsp, input, i * dsp.length()); + } + return vectors; + } + + private static void unaryFloat(VectorOperators.Unary op, FloatVector[] input, Blackhole bh) { + for (FloatVector v : input) { + bh.consume(v.lanewise(op)); + } + } + + private static void unaryDouble(VectorOperators.Unary op, DoubleVector[] input, Blackhole bh) { + for (DoubleVector v : input) { + bh.consume(v.lanewise(op)); + } + } + + private static void binaryFloat(VectorOperators.Binary op, FloatVector[] input1, + FloatVector[] input2, Blackhole bh) { + for (int i = 0; i < input1.length; i++) { + bh.consume(input1[i].lanewise(op, input2[i])); + } + } + + private static void binaryDouble(VectorOperators.Binary op, DoubleVector[] input1, + 
DoubleVector[] input2, Blackhole bh) { + for (int i = 0; i < input1.length; i++) { + bh.consume(input1[i].lanewise(op, input2[i])); + } + } +}