mirror of
https://github.com/openjdk/jdk.git
synced 2026-06-06 10:42:45 +00:00
8376602: [Vector API] Upgrade SLEEF from 3.6.1 to 3.9.0
Reviewed-by: psandoz, fyang, erikj
This commit is contained in:
parent
7757684450
commit
185d933bb9
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
# Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
#
|
||||
# This code is free software; you can redistribute it and/or modify it
|
||||
@ -48,7 +48,7 @@ ifneq ($(OPENJDK_BUILD_OS), linux)
|
||||
endif
|
||||
|
||||
SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef
|
||||
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef
|
||||
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/unix/native/libsleef
|
||||
SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream
|
||||
SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated
|
||||
SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native
|
||||
@ -82,7 +82,12 @@ $(eval $(call SetupExecute, sleef_native_config, \
|
||||
INFO := Configuring native sleef build, \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
WORKING_DIR := $(SLEEF_SOURCE_DIR), \
|
||||
COMMAND := $(CMAKE) -S . -B $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := $(CMAKE) -S . -B $(SLEEF_NATIVE_BUILD_DIR) \
|
||||
-DCMAKE_INSTALL_PREFIX=$(SLEEF_NATIVE_BUILD_DIR) \
|
||||
-DSLEEF_BUILD_TESTS=OFF \
|
||||
-DSLEEF_DISABLE_SSL=ON \
|
||||
-DSLEEF_ENABLE_TLFLOAT=OFF \
|
||||
-DSLEEF_ENABLE_TESTER4=OFF, \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_native_config)
|
||||
@ -106,6 +111,11 @@ $(eval $(call SetupExecute, sleef_cross_config, \
|
||||
-DCMAKE_C_COMPILER=$(CC) \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \
|
||||
-DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \
|
||||
-DCMAKE_INSTALL_PREFIX=$(SLEEF_CROSS_BUILD_DIR) \
|
||||
-DSLEEF_BUILD_TESTS=OFF \
|
||||
-DSLEEF_DISABLE_SSL=ON \
|
||||
-DSLEEF_ENABLE_TLFLOAT=OFF \
|
||||
-DSLEEF_ENABLE_TESTER4=OFF \
|
||||
-DSLEEF_BUILD_INLINE_HEADERS=TRUE \
|
||||
$(EXTRA_CROSS_OPTIONS), \
|
||||
))
|
||||
@ -139,7 +149,7 @@ $(eval $(call SetupCopyFiles, copy_generated_sleef_source, \
|
||||
DEST := $(SLEEF_TARGET_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(copy_generated_sleef_source)
|
||||
TARGETS := $(copy_static_sleef_source) $(copy_generated_sleef_source)
|
||||
|
||||
################################################################################
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
## SLEEF v3.6.1
|
||||
## SLEEF v3.9.0
|
||||
|
||||
### Notice
|
||||
```
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors
|
||||
Copyright © 2010-2025 SLEEF Project, Naoki Shibata and contributors
|
||||
|
||||
-------
|
||||
src/arch/helpersve.h has the following copyright:
|
||||
|
||||
@ -4,15 +4,15 @@ This directory contains the source code for the SLEEF library, the
|
||||
**SIMD Library for Evaluating Elementary Functions**. For more information on
|
||||
SLEEF, see https://sleef.org/.
|
||||
|
||||
The currently imported libsleef sources is version 3.6.1, which has
|
||||
git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`.
|
||||
The currently imported libsleef sources are version 3.9.0, which has
|
||||
git tag `3.9.0` and git commit hash `906ca7512ee483296780a81a21b9ca715d40dfe1`.
|
||||
|
||||
# About the libsleef integration in the JDK
|
||||
|
||||
The upstream original source code is available in
|
||||
`src/jdk.incubator.vector/unix/native/libsleef/upstream`. However, this code is
|
||||
not directly usable in the JDK build system, but is instead used as the base for
|
||||
the generation of additional souce code files. This generation is done by
|
||||
the generation of additional source code files. This generation is done by
|
||||
the libsleef CMake files. Running that generation at build time would have
|
||||
meant adding CMake as a required dependency for building the JDK.
|
||||
|
||||
@ -25,7 +25,7 @@ the JDK source tree. The generated files reside in
|
||||
|
||||
To update the version of libsleef that is used in the JDK, clone
|
||||
`https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`,
|
||||
`.github` and `.git` directories, into
|
||||
`.github` and `.git` directories, and the `.nojekyll` file, into
|
||||
`src/jdk.incubator.vector/unix/native/libsleef/upstream`.
|
||||
|
||||
The libsleef source code does not follow the JDK whitespace rules as enforced by
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -13,10 +13,15 @@
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
@ -137,9 +142,17 @@
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
// Overflow bounds
|
||||
|
||||
// - exp(x) overflows for x over (also used in pow)
|
||||
#define LOG_DBL_MAX 0x1.62e42fefa39efp+9 /* 709.782712893384 */
|
||||
|
||||
// Other bounds
|
||||
|
||||
// - log1p(f)(x) approximation holds up to x equals
|
||||
#define LOG1PF_BOUND 0x1.2ced32p+126 /* 1.0e+38 */
|
||||
#define LOG1P_BOUND 0x1.c7b1f3cac7433p+1019 /* 1.0e+307 */
|
||||
|
||||
//
|
||||
|
||||
@ -183,17 +196,13 @@ typedef struct {
|
||||
} Sleef_longdouble2;
|
||||
#endif
|
||||
|
||||
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
#if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER)
|
||||
|
||||
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
|
||||
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
|
||||
#define RESTRICT __restrict__
|
||||
|
||||
#ifndef __arm__
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define ALIGNED(x)
|
||||
#endif
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
@ -229,7 +238,7 @@ typedef struct {
|
||||
#define SLEEF_INFINITYf __builtin_inff()
|
||||
#define SLEEF_INFINITYl __builtin_infl()
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined (__clang__)
|
||||
#if defined (__clang__)
|
||||
#define SLEEF_INFINITYq __builtin_inf()
|
||||
#define SLEEF_NANq __builtin_nan("")
|
||||
#else
|
||||
@ -237,7 +246,7 @@ typedef struct {
|
||||
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
|
||||
#endif
|
||||
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER)
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
@ -249,6 +258,9 @@ typedef struct {
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#ifdef CONST
|
||||
#undef CONST
|
||||
#endif
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
@ -265,7 +277,7 @@ typedef struct {
|
||||
#define LIKELY(condition) (condition)
|
||||
#define UNLIKELY(condition) (condition)
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && defined(__x86_64__) && !defined(SLEEF_GENHEADER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
@ -294,7 +306,7 @@ typedef struct {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__)) && !defined(_MSC_VER)
|
||||
|
||||
#if !defined(__linux__)
|
||||
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
|
||||
@ -305,15 +317,9 @@ typedef struct {
|
||||
|
||||
#endif // #ifndef __MISC_H__
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#if defined (__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
||||
#if !defined (__clang__)
|
||||
|
||||
@ -1,8 +1,11 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// This file is generated by SLEEF 3.6.1
|
||||
// This file is generated by SLEEF 3.9.0
|
||||
|
||||
/* #undef SLEEF_FLOAT128_IS_IEEEQP */
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
|
||||
#ifndef SLEEF_ALWAYS_INLINE
|
||||
#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)
|
||||
@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = {
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const float Sleef_rempitabsp[] = {
|
||||
@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = {
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
#endif // #ifndef __SLEEF_REMPITAB__
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -3294,7 +3291,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_expd2_u10advsimd(vdouble_ad
|
||||
|
||||
u = vldexp2_vd_vd_vi_advsimd_sleef(u, q);
|
||||
|
||||
u = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(709.78271114955742909217217426)), vcast_vd_d_advsimd_sleef(__builtin_inf()), u);
|
||||
vopmask_advsimd_sleef o = vgt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(0x1.62e42fefa39efp+9));
|
||||
u = vsel_vd_vo_vd_vd_advsimd_sleef(o, vcast_vd_d_advsimd_sleef(__builtin_inf()), u);
|
||||
u = vreinterpret_vd_vm_advsimd_sleef(vandnot_vm_vo64_vm_advsimd_sleef(vlt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1000)), vreinterpret_vm_vd_advsimd_sleef(u)));
|
||||
|
||||
return u;
|
||||
@ -3411,13 +3409,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_advsimd_sleef expk_advsimd_sleef(
|
||||
}
|
||||
|
||||
SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_powd2_u10advsimd(vdouble_advsimd_sleef x, vdouble_advsimd_sleef y) {
|
||||
|
||||
vopmask_advsimd_sleef yisint = visint_vo_vd_advsimd_sleef(y);
|
||||
vopmask_advsimd_sleef yisodd = vand_vo_vo_vo_advsimd_sleef(visodd_vo_vd_advsimd_sleef(y), yisint);
|
||||
|
||||
vdouble2_advsimd_sleef d = ddmul_vd2_vd2_vd_advsimd_sleef(logk_advsimd_sleef(vabs_vd_vd_advsimd_sleef(x)), y);
|
||||
vdouble_advsimd_sleef result = expk_advsimd_sleef(d);
|
||||
result = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(709.78271114955742909217217426)), vcast_vd_d_advsimd_sleef(__builtin_inf()), result);
|
||||
vopmask_advsimd_sleef o = vgt_vo_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(0x1.62e42fefa39efp+9));
|
||||
result = vsel_vd_vo_vd_vd_advsimd_sleef(o, vcast_vd_d_advsimd_sleef(__builtin_inf()), result);
|
||||
|
||||
result = vmul_vd_vd_vd_advsimd_sleef(result,
|
||||
vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)),
|
||||
@ -3443,7 +3441,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_powd2_u10advsimd(vdouble_ad
|
||||
result = vsel_vd_vo_vd_vd_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(y, vcast_vd_d_advsimd_sleef(0)), veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(1))), vcast_vd_d_advsimd_sleef(1), result);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble2_advsimd_sleef expk2_advsimd_sleef(vdouble2_advsimd_sleef d) {
|
||||
@ -3931,7 +3928,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_log1pd2_u10advsimd(vdouble_
|
||||
|
||||
vdouble_advsimd_sleef r = vadd_vd_vd_vd_advsimd_sleef(vd2getx_vd_vd2_advsimd_sleef(s), vd2gety_vd_vd2_advsimd_sleef(s));
|
||||
|
||||
r = vsel_vd_vo_vd_vd_advsimd_sleef(vgt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(1e+307)), vcast_vd_d_advsimd_sleef(__builtin_inf()), r);
|
||||
vopmask_advsimd_sleef ocore = vle_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(0x1.c7b1f3cac7433p+1019));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo64_advsimd_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_advsimd_sleef(ocore, r, Sleef_logd2_u10advsimd(d));
|
||||
r = vsel_vd_vo_vd_vd_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vlt_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1)), visnan_vo_vd_advsimd_sleef(d)), vcast_vd_d_advsimd_sleef(__builtin_nan("")), r);
|
||||
r = vsel_vd_vo_vd_vd_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(d, vcast_vd_d_advsimd_sleef(-1)), vcast_vd_d_advsimd_sleef(-__builtin_inf()), r);
|
||||
r = vsel_vd_vo_vd_vd_advsimd_sleef(visnegzero_vo_vd_advsimd_sleef(d), vcast_vd_d_advsimd_sleef(-0.0), r);
|
||||
@ -4011,7 +4009,7 @@ SLEEF_INLINE SLEEF_CONST vint_advsimd_sleef Sleef_expfrexpd2_advsimd(vdouble_adv
|
||||
vint_advsimd_sleef ret = vcastu_vi_vm_advsimd_sleef(vreinterpret_vm_vd_advsimd_sleef(x));
|
||||
ret = vsub_vi_vi_vi_advsimd_sleef(vand_vi_vi_vi_advsimd_sleef(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(ret), 20)), vcast_vi_i_advsimd_sleef(0x7ff)), vcast_vi_i_advsimd_sleef(0x3fe));
|
||||
|
||||
ret = vsel_vi_vo_vi_vi_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)), visnan_vo_vd_advsimd_sleef(x)), visinf_vo_vd_advsimd_sleef(x)), vcast_vi_i_advsimd_sleef(0), ret);
|
||||
ret = vsel_vi_vo_vi_vi_advsimd_sleef(vcast_vo32_vo64_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(vor_vo_vo_vo_advsimd_sleef(veq_vo_vd_vd_advsimd_sleef(x, vcast_vd_d_advsimd_sleef(0)), visnan_vo_vd_advsimd_sleef(x)), visinf_vo_vd_advsimd_sleef(x))), vcast_vi_i_advsimd_sleef(0), ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -4410,14 +4408,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_advsimd_sleef Sleef_erfcd2_u15advsimd(vdouble_a
|
||||
return r;
|
||||
}
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -4934,6 +4924,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_advsimd_sleef Sleef_tanf4_u35advsimd(vfloat_advs
|
||||
if (__builtin_expect(!!(vtestallones_i_vo32_advsimd_sleef(vlt_vo_vf_vf_advsimd_sleef(vabs_vf_vf_advsimd_sleef(d), vcast_vf_f_advsimd_sleef(125.0f*0.5f)))), 1)) {
|
||||
q = vrint_vi2_vf_advsimd_sleef(vmul_vf_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef((float)(2 * 0.318309886183790671537767526745028724))));
|
||||
u = vcast_vf_vi2_advsimd_sleef(q);
|
||||
|
||||
x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-3.1414794921875f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-0.00011315941810607910156f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_advsimd_sleef(u, vcast_vf_f_advsimd_sleef(-1.9841872589410058936e-09f*0.5f), x);
|
||||
@ -6335,7 +6326,8 @@ SLEEF_INLINE SLEEF_CONST vfloat_advsimd_sleef Sleef_log1pf4_u10advsimd(vfloat_ad
|
||||
|
||||
vfloat_advsimd_sleef r = vadd_vf_vf_vf_advsimd_sleef(vf2getx_vf_vf2_advsimd_sleef(s), vf2gety_vf_vf2_advsimd_sleef(s));
|
||||
|
||||
r = vsel_vf_vo_vf_vf_advsimd_sleef(vgt_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(1e+38)), vcast_vf_f_advsimd_sleef(__builtin_inff()), r);
|
||||
vopmask_advsimd_sleef ocore = vle_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(0x1.2ced32p+126));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo32_advsimd_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_advsimd_sleef(ocore, r, Sleef_logf4_u10advsimd(d));
|
||||
r = vreinterpret_vf_vm_advsimd_sleef(vor_vm_vo32_vm_advsimd_sleef(vgt_vo_vf_vf_advsimd_sleef(vcast_vf_f_advsimd_sleef(-1), d), vreinterpret_vm_vf_advsimd_sleef(r)));
|
||||
r = vsel_vf_vo_vf_vf_advsimd_sleef(veq_vo_vf_vf_advsimd_sleef(d, vcast_vf_f_advsimd_sleef(-1)), vcast_vf_f_advsimd_sleef(-__builtin_inff()), r);
|
||||
r = vsel_vf_vo_vf_vf_advsimd_sleef(visnegzero_vo_vf_advsimd_sleef(d), vcast_vf_f_advsimd_sleef(-0.0f), r);
|
||||
|
||||
@ -1,8 +1,11 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// This file is generated by SLEEF 3.6.1
|
||||
// This file is generated by SLEEF 3.9.0
|
||||
|
||||
/* #undef SLEEF_FLOAT128_IS_IEEEQP */
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
|
||||
#ifndef SLEEF_ALWAYS_INLINE
|
||||
#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)
|
||||
@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = {
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const float Sleef_rempitabsp[] = {
|
||||
@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = {
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
#endif // #ifndef __SLEEF_REMPITAB__
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -1182,7 +1179,7 @@ typedef vquad_rvvm1_sleef vargquad_rvvm1_sleef;
|
||||
|
||||
static SLEEF_ALWAYS_INLINE int vavailability_i_rvvm1_sleef(int name) {
|
||||
|
||||
return (__riscv_vsetvlmax_e64m1() >= __riscv_vsetvlmax_e64m1()) ? 3 : 0;
|
||||
return (((int)__riscv_vsetvlmax_e64m1()) >= ((int)__riscv_vsetvlmax_e64m1())) ? 3 : 0;
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef figetd_vf_di_rvvm1_sleef(fi_t_rvvm1_sleef d) {
|
||||
@ -1239,144 +1236,144 @@ static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreinterpret_vf_vi2_rvvm1_sleef(vi
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcast_vf_f_rvvm1_sleef(float f) {
|
||||
return __riscv_vfmv_v_f_f32m1(f, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmv_v_f_f32m1(f, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrint_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcast_vf_vi2_rvvm1_sleef(vint2_rvvm1_sleef vi) {
|
||||
return __riscv_vfcvt_f(vi, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfcvt_f(vi, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vcast_vi2_i_rvvm1_sleef(int i) {
|
||||
return __riscv_vmv_v_x_i32m1(i, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmv_v_x_i32m1(i, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vrint_vi2_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) {
|
||||
return __riscv_vfcvt_x_f_v_i32m1_rm(vf, __RISCV_FRM_RNE, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfcvt_x_f_v_i32m1_rm(vf, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vtruncate_vi2_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) {
|
||||
return __riscv_vfcvt_rtz_x(vf, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfcvt_rtz_x(vf, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vtruncate_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) {
|
||||
return vcast_vf_vi2_rvvm1_sleef(vtruncate_vi2_vf_rvvm1_sleef(vf));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vload_vf_p_rvvm1_sleef(const float *ptr) {
|
||||
return __riscv_vle32_v_f32m1(ptr, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vle32_v_f32m1(ptr, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vloadu_vf_p_rvvm1_sleef(const float *ptr) {
|
||||
return __riscv_vle32_v_f32m1(ptr, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vle32_v_f32m1(ptr, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstore_v_p_vf_rvvm1_sleef(float *ptr, vfloat_rvvm1_sleef v) {
|
||||
__riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1()));
|
||||
__riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vf_rvvm1_sleef(float *ptr, vfloat_rvvm1_sleef v) {
|
||||
__riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1()));
|
||||
__riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vi2_rvvm1_sleef(int32_t *ptr, vint2_rvvm1_sleef v) {
|
||||
__riscv_vse32(ptr, v, (__riscv_vsetvlmax_e32m1()));
|
||||
__riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vgather_vf_p_vi2_rvvm1_sleef(const float *ptr, vint2_rvvm1_sleef vi2) {
|
||||
return __riscv_vluxei32(ptr, __riscv_vmul(__riscv_vreinterpret_u32m1(vi2), sizeof(float), (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vluxei32(ptr, __riscv_vmul(__riscv_vreinterpret_u32m1(vi2), sizeof(float), ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vadd_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfadd(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfadd(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsub_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfsub(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsub(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmul_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfmul(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmul(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vdiv_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfdiv(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfdiv(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmax_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfmax(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmax(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmin_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfmin(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmin(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrec_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
return __riscv_vfdiv(vcast_vf_f_rvvm1_sleef(1.0f), d, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfdiv(vcast_vf_f_rvvm1_sleef(1.0f), d, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsqrt_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
return __riscv_vfsqrt(d, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsqrt(d, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmla_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfmadd(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlanp_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfnmsub(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlapn_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfmsub(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfma_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfmadd(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfmanp_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfnmsub(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vfmapn_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) {
|
||||
return __riscv_vfmsub(x, y, z, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmulsign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfsgnjx(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsgnjx(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vcopysign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vfsgnj(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsgnj(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsign_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) {
|
||||
return __riscv_vfsgnj(__riscv_vfmv_v_f_f32m1(1.0f, (__riscv_vsetvlmax_e32m1())), f, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsgnj(__riscv_vfmv_v_f_f32m1(1.0f, ((int)__riscv_vsetvlmax_e32m1())), f, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vorsign_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
vint2_rvvm1_sleef xi = __riscv_vreinterpret_i32m1(x);
|
||||
vint2_rvvm1_sleef yi = __riscv_vreinterpret_i32m1(y);
|
||||
vint2_rvvm1_sleef xioryi = __riscv_vor(xi, yi, (__riscv_vsetvlmax_e32m1()));
|
||||
vint2_rvvm1_sleef xioryi = __riscv_vor(xi, yi, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
vfloat_rvvm1_sleef xory = __riscv_vreinterpret_f32m1(xioryi);
|
||||
return __riscv_vfsgnj(x, xory, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfsgnj(x, xory, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vabs_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) {
|
||||
return __riscv_vfabs(f, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfabs(f, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vneg_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef f) {
|
||||
return __riscv_vfneg(f, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfneg(f, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vadd_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vadd(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsub_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vsub(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vneg_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x) {
|
||||
return __riscv_vneg(x, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vneg(x, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vand_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vand(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vand(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vandnot_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vand(__riscv_vnot(x, (__riscv_vsetvlmax_e32m1())), y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e32m1())), y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vor_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vor(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vxor_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vxor(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsll_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) {
|
||||
return __riscv_vsll(x, c, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vsll(x, c, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsra_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) {
|
||||
return __riscv_vsra(x, c, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vsra(x, c, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsrl_vi2_vi2_i_rvvm1_sleef(vint2_rvvm1_sleef x, int c) {
|
||||
return __riscv_vreinterpret_i32m1(__riscv_vsrl(__riscv_vreinterpret_u32m1(x), c, (__riscv_vsetvlmax_e32m1())));
|
||||
return __riscv_vreinterpret_i32m1(__riscv_vsrl(__riscv_vreinterpret_u32m1(x), c, ((int)__riscv_vsetvlmax_e32m1())));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreinterpret_vf_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) {
|
||||
@ -1387,91 +1384,91 @@ static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vreinterpret_vm_vf_rvvm1_sleef(vflo
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE int vtestallones_i_vo32_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef g) {
|
||||
return __riscv_vcpop(g, (__riscv_vsetvlmax_e32m1())) == (__riscv_vsetvlmax_e32m1());
|
||||
return (int)__riscv_vcpop(g, ((int)__riscv_vsetvlmax_e32m1())) == (int)((int)__riscv_vsetvlmax_e32m1());
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y);
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, -1, x, (__riscv_vsetvlmax_e32m1())));
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, -1, x, ((int)__riscv_vsetvlmax_e32m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y);
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, __riscv_vmnot(x, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1())));
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vo32_vm_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
rvv_vmask32 y32 = __riscv_vreinterpret_u32m1(y);
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, x, (__riscv_vsetvlmax_e32m1())));
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vmerge(y32, 0, x, ((int)__riscv_vsetvlmax_e32m1())));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vand_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmand(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmand(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vandnot_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmandn(y, x, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmandn(y, x, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vor_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmor(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmor(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef rvv_sp_vxor_vo_vo_vo(rvv_sp_vopmask_rvvm1_sleef x, rvv_sp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmxor(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmxor(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef veq_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmfeq(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfeq(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vneq_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmfne(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfne(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vgt_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmfgt(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfgt(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vge_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmfge(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfge(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vlt_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmflt(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmflt(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vle_vo_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmfle(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfle(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef visnan_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
return __riscv_vmfne(d, d, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfne(d, d, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef visinf_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
return __riscv_vmfeq(__riscv_vfabs(d, (__riscv_vsetvlmax_e32m1())), __builtin_inff(), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfeq(__riscv_vfabs(d, ((int)__riscv_vsetvlmax_e32m1())), __builtin_inff(), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vispinf_vo_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
return __riscv_vmfeq(d, __builtin_inff(), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmfeq(d, __builtin_inff(), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_vf_vf_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef mask, vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, x, mask, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef mask, float v1, float v0) {
|
||||
return __riscv_vfmerge(vcast_vf_f_rvvm1_sleef(v0), v1, mask, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmerge(vcast_vf_f_rvvm1_sleef(v0), v1, mask, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_vo_f_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef o0, rvv_sp_vopmask_rvvm1_sleef o1, float d0, float d1, float d2) {
|
||||
return __riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d2), d1, o1, (__riscv_vsetvlmax_e32m1())), d0, o0, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d2), d1, o1, ((int)__riscv_vsetvlmax_e32m1())), d0, o0, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vsel_vf_vo_vo_vo_f_f_f_f_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef o0, rvv_sp_vopmask_rvvm1_sleef o1, rvv_sp_vopmask_rvvm1_sleef o2, float d0, float d1, float d2, float d3) {
|
||||
return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d3), d2, o2, (__riscv_vsetvlmax_e32m1())), d1, o1, (__riscv_vsetvlmax_e32m1())), d0, o0, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vf_f_rvvm1_sleef(d3), d2, o2, ((int)__riscv_vsetvlmax_e32m1())), d1, o1, ((int)__riscv_vsetvlmax_e32m1())), d0, o0, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef veq_vo_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vmseq(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_sp_vopmask_rvvm1_sleef vgt_vo_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vmsgt(x, y, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vgt_vi2_vi2_vi2_rvvm1_sleef(vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
vint2_rvvm1_sleef zero = vcast_vi2_i_rvvm1_sleef(0);
|
||||
return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vsel_vi2_vo_vi2_vi2_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef m, vint2_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, x, m, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(y, x, m, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint2_rvvm1_sleef vand_vi2_vo_vi2_rvvm1_sleef(rvv_sp_vopmask_rvvm1_sleef x, vint2_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, (__riscv_vsetvlmax_e32m1())), (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e32m1())), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE const vdouble_rvvm1_sleef vd2getx_vd_vd2_rvvm1_sleef(vdouble2_rvvm1_sleef v) {
|
||||
@ -1537,203 +1534,203 @@ static SLEEF_ALWAYS_INLINE ddi_t_rvvm1_sleef ddisetdd_ddi_ddi_vd2_rvvm1_sleef(dd
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcast_vd_d_rvvm1_sleef(double d) {
|
||||
return __riscv_vfmv_v_f_f64m1(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmv_v_f_f64m1(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcast_vd_vi_rvvm1_sleef(vint_rvvm1_sleef i) {
|
||||
return __riscv_vfwcvt_f(i, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfwcvt_f(i, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcast_vi_i_rvvm1_sleef(int32_t i) {
|
||||
return __riscv_vmv_v_x_i32mf2(i, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmv_v_x_i32mf2(i, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vrint_vi_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
return __riscv_vfncvt_x_f_w_i32mf2_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfncvt_x_f_w_i32mf2_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrint_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
return __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(vd_rvvm1_sleef, __RISCV_FRM_RNE, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vtruncate_vi_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
return __riscv_vfncvt_rtz_x(vd_rvvm1_sleef, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfncvt_rtz_x(vd_rvvm1_sleef, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vtruncate_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
return vcast_vd_vi_rvvm1_sleef(vtruncate_vi_vd_rvvm1_sleef(vd_rvvm1_sleef));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vload_vd_p_rvvm1_sleef(const double *ptr) {
|
||||
return __riscv_vle64_v_f64m1(ptr, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vle64_v_f64m1(ptr, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vloadu_vd_p_rvvm1_sleef(const double *ptr) {
|
||||
return __riscv_vle64_v_f64m1(ptr, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vle64_v_f64m1(ptr, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vloadu_vi_p_rvvm1_sleef(int32_t *p) {
|
||||
return __riscv_vle32_v_i32mf2(p, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vle32_v_i32mf2(p, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstore_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) {
|
||||
__riscv_vse64(ptr, v, __riscv_vsetvlmax_e64m1());
|
||||
__riscv_vse64(ptr, v, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) {
|
||||
__riscv_vse64(ptr, v, __riscv_vsetvlmax_e64m1());
|
||||
__riscv_vse64(ptr, v, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE void vstoreu_v_p_vi_rvvm1_sleef(int32_t *ptr, vint_rvvm1_sleef v) {
|
||||
__riscv_vse32(ptr, v, __riscv_vsetvlmax_e64m1());
|
||||
__riscv_vse32(ptr, v, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vgather_vd_p_vi_rvvm1_sleef(const double *ptr, vint_rvvm1_sleef vi) {
|
||||
return __riscv_vluxei64(ptr, __riscv_vwmulu(__riscv_vreinterpret_u32mf2(vi), sizeof(double), __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vluxei64(ptr, __riscv_vwmulu(__riscv_vreinterpret_u32mf2(vi), sizeof(double), ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vadd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfadd(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfadd(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsub_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfsub(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfsub(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrec_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vfdiv(vcast_vd_d_rvvm1_sleef(1.0), d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfdiv(vcast_vd_d_rvvm1_sleef(1.0), d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vabs_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vfabs(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfabs(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsqrt_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vfsqrt(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfsqrt(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmul_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfmul(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmul(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vdiv_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfdiv(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfdiv(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmax_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfmax(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmax(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmin_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfmin(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmin(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmla_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfmadd(x, y, z, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlapn_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfmsub(x, y, z, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlanp_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfnmsac(z, x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfnmsac(z, x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfma_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfmadd(x, y, z, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmadd(x, y, z, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfmanp_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfnmsub(x, y, z, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfnmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vfmapn_vd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y, vdouble_rvvm1_sleef z) {
|
||||
return __riscv_vfmsub(x, y, z, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmsub(x, y, z, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmulsign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfsgnjx(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfsgnjx(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vcopysign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfsgnj(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfsgnj(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vorsign_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vfsgnj(x, __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(__riscv_vor(__riscv_vreinterpret_u64m1(x), __riscv_vreinterpret_u64m1(y), __riscv_vsetvlmax_e64m1()))), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfsgnj(x, __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(__riscv_vor(__riscv_vreinterpret_u64m1(x), __riscv_vreinterpret_u64m1(y), ((int)__riscv_vsetvlmax_e64m1())))), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vneg_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vfneg(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vadd_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vadd(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsub_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vsub(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vneg_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x) {
|
||||
return __riscv_vneg(x, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vneg(x, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vand_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vand(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vand(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vandnot_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vand(__riscv_vnot(x, __riscv_vsetvlmax_e64m1()), y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e64m1())), y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vor_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vxor_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vxor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsll_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) {
|
||||
return __riscv_vsll(x, c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsll(x, c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsra_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) {
|
||||
return __riscv_vsra(x, c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsra(x, c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsrl_vi_vi_i_rvvm1_sleef(vint_rvvm1_sleef x, int c) {
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vsrl(__riscv_vreinterpret_u32mf2(x), c, __riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vsrl(__riscv_vreinterpret_u32mf2(x), c, ((int)__riscv_vsetvlmax_e64m1())));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_i64_rvvm1_sleef(int64_t c) {
|
||||
return __riscv_vmv_v_x_u64m1(c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmv_v_x_u64m1(c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_u64_rvvm1_sleef(uint64_t c) {
|
||||
return __riscv_vmv_v_x_u64m1(c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmv_v_x_u64m1(c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_i_i_rvvm1_sleef(int64_t h, int64_t l) {
|
||||
return __riscv_vmv_v_x_u64m1((((uint64_t)h) << 32) | (uint32_t) l, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmv_v_x_u64m1((((uint64_t)h) << 32) | (uint32_t) l, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcast_vm_vi_rvvm1_sleef(vint_rvvm1_sleef vi) {
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, __riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, ((int)__riscv_vsetvlmax_e64m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vcastu_vm_vi_rvvm1_sleef(vint_rvvm1_sleef vi) {
|
||||
return __riscv_vsll(__riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, __riscv_vsetvlmax_e64m1())), 32, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsll(__riscv_vreinterpret_u64m1(__riscv_vwcvt_x(vi, ((int)__riscv_vsetvlmax_e64m1()))), 32, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcastu_vi_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) {
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vnsrl(vm, 32, __riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vnsrl(vm, 32, ((int)__riscv_vsetvlmax_e64m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vcast_vi_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) {
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vncvt_x(vm, __riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vreinterpret_i32mf2(__riscv_vncvt_x(vm, ((int)__riscv_vsetvlmax_e64m1())));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vand_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vand(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vand(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vxor_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vxor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vxor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vand(__riscv_vnot(x, __riscv_vsetvlmax_e64m1()), y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vand(__riscv_vnot(x, ((int)__riscv_vsetvlmax_e64m1())), y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vandnot_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, 0, x, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, 0, x, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsll64_vm_vm_i(vmask_rvvm1_sleef mask, int64_t c) {
|
||||
return __riscv_vsll(mask, c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsll(mask, c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsub64_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vsub(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsub(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsrl64_vm_vm_i(vmask_rvvm1_sleef mask, int64_t c) {
|
||||
return __riscv_vsrl(mask, c, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vsrl(mask, c, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vadd64_vm_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vadd(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vadd(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vor_vm_vo64_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, -1, x, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, -1, x, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vsel_vm_vo64_vm_vm_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, x, mask, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vneg64_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef mask) {
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vneg(__riscv_vreinterpret_i64m1(mask), __riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vreinterpret_u64m1(__riscv_vneg(__riscv_vreinterpret_i64m1(mask), ((int)__riscv_vsetvlmax_e64m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vreinterpret_vd_vm_rvvm1_sleef(vmask_rvvm1_sleef vm) {
|
||||
return __riscv_vreinterpret_f64m1(__riscv_vreinterpret_i64m1(vm));
|
||||
@ -1757,111 +1754,111 @@ static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vcast_vo32_vo64_rvvm1_slee
|
||||
return vo;
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vand_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmand(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmand(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vandnot_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmandn(y, x, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmandn(y, x, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vor_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef rvv_dp_vxor_vo_vo_vo(rvv_dp_vopmask_rvvm1_sleef x, rvv_dp_vopmask_rvvm1_sleef y) {
|
||||
return __riscv_vmxor(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmxor(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq64_vo_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmseq(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt64_vo_vm_vm_rvvm1_sleef(vmask_rvvm1_sleef x, vmask_rvvm1_sleef y) {
|
||||
return __riscv_vmsgt(__riscv_vreinterpret_i64m1(x), __riscv_vreinterpret_i64m1(y), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmsgt(__riscv_vreinterpret_i64m1(x), __riscv_vreinterpret_i64m1(y), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef visinf_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vmfeq(__riscv_vfabs(d, __riscv_vsetvlmax_e64m1()), __builtin_inf(), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfeq(__riscv_vfabs(d, ((int)__riscv_vsetvlmax_e64m1())), __builtin_inf(), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vispinf_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vmfeq(d, __builtin_inf(), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfeq(d, __builtin_inf(), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmfeq(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfeq(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vneq_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmfne(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfne(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vlt_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmflt(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmflt(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vle_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmfle(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfle(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmfgt(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfgt(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vge_vo_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmfge(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfge(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef visnan_vo_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
return __riscv_vmfne(d, d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmfne(d, d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, x, mask, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, x, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, double v0, double v1) {
|
||||
return __riscv_vfmerge(vcast_vd_d_rvvm1_sleef(v1), v0, mask, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmerge(vcast_vd_d_rvvm1_sleef(v1), v0, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vo_d_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef o0, rvv_dp_vopmask_rvvm1_sleef o1, double d0, double d1, double d2) {
|
||||
return __riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d2), d1, o1, __riscv_vsetvlmax_e64m1()), d0, o0, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d2), d1, o1, ((int)__riscv_vsetvlmax_e64m1())), d0, o0, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsel_vd_vo_vo_vo_d_d_d_d_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef o0, rvv_dp_vopmask_rvvm1_sleef o1, rvv_dp_vopmask_rvvm1_sleef o2, double d0, double d1, double d2, double d3) {
|
||||
return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d3), d2, o2, __riscv_vsetvlmax_e64m1()), d1, o1, __riscv_vsetvlmax_e64m1()), d0, o0, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d_rvvm1_sleef(d3), d2, o2, ((int)__riscv_vsetvlmax_e64m1())), d1, o1, ((int)__riscv_vsetvlmax_e64m1())), d0, o0, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE int vtestallones_i_vo64_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef g) {
|
||||
return __riscv_vcpop(g, __riscv_vsetvlmax_e64m1()) == __riscv_vsetvlmax_e64m1();
|
||||
return (int)__riscv_vcpop(g, ((int)__riscv_vsetvlmax_e64m1())) == (int)((int)__riscv_vsetvlmax_e64m1());
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef veq_vo_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vmseq(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmseq(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vgt_vo_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vmsgt(x, y, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vgt_vi_vi_vi_rvvm1_sleef(vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
vint_rvvm1_sleef zero = vcast_vi_i_rvvm1_sleef(0);
|
||||
return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(zero, -1, __riscv_vmsgt(x, y, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vsel_vi_vo_vi_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef m, vint_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, x, m, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, x, m, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vandnot_vi_vo_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef mask, vint_rvvm1_sleef vi) {
|
||||
return __riscv_vmerge(vi, 0, mask, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(vi, 0, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vint_rvvm1_sleef vand_vi_vo_vi_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef x, vint_rvvm1_sleef y) {
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, __riscv_vsetvlmax_e64m1()), __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(y, 0, __riscv_vmnot(x, ((int)__riscv_vsetvlmax_e64m1())), ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vposneg_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
rvv_dp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b64(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1()));
|
||||
vdouble_rvvm1_sleef nd = __riscv_vfneg(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(nd, d, mask, __riscv_vsetvlmax_e64m1());
|
||||
vdouble_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vnegpos_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef d) {
|
||||
rvv_dp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b64(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1()));
|
||||
vdouble_rvvm1_sleef nd = __riscv_vfneg(d, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vmerge(nd, d, mask, __riscv_vsetvlmax_e64m1());
|
||||
vdouble_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vposneg_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
rvv_sp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b32(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1()));
|
||||
vfloat_rvvm1_sleef nd = __riscv_vfneg(d, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(nd, d, mask, (__riscv_vsetvlmax_e32m1()));
|
||||
vfloat_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vnegpos_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef d) {
|
||||
rvv_sp_vopmask_rvvm1_sleef mask = __riscv_vreinterpret_b32(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1()));
|
||||
vfloat_rvvm1_sleef nd = __riscv_vfneg(d, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(nd, d, mask, (__riscv_vsetvlmax_e32m1()));
|
||||
vfloat_rvvm1_sleef nd = __riscv_vfneg(d, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vmerge(nd, d, mask, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vsubadd_vd_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) { return vadd_vd_vd_vd_rvvm1_sleef(x, vnegpos_vd_vd_rvvm1_sleef(y)); }
|
||||
@ -1870,33 +1867,33 @@ static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vmlsubadd_vd_vd_vd_vd_rvvm1_sleef
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vmlsubadd_vf_vf_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef x, vfloat_rvvm1_sleef y, vfloat_rvvm1_sleef z) { return vfma_vf_vf_vf_vf_rvvm1_sleef(x, y, vnegpos_vf_vf_rvvm1_sleef(z)); }
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vrev21_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
rvv_dp_vuint2 id = __riscv_vid_v_u64m1(__riscv_vsetvlmax_e64m1());
|
||||
id = __riscv_vxor(id, 1, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vrgather(vd_rvvm1_sleef, id, __riscv_vsetvlmax_e64m1());
|
||||
rvv_dp_vuint2 id = __riscv_vid_v_u64m1(((int)__riscv_vsetvlmax_e64m1()));
|
||||
id = __riscv_vxor(id, 1, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vrgather(vd_rvvm1_sleef, id, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vrev21_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) {
|
||||
vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1((__riscv_vsetvlmax_e32m1())));
|
||||
id = __riscv_vxor(id, 1, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), (__riscv_vsetvlmax_e32m1()));
|
||||
vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1(((int)__riscv_vsetvlmax_e32m1())));
|
||||
id = __riscv_vxor(id, 1, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_rvvm1_sleef vreva2_vd_vd_rvvm1_sleef(vdouble_rvvm1_sleef vd_rvvm1_sleef) {
|
||||
rvv_dp_vuint2 id = __riscv_vid_v_u64m1(__riscv_vsetvlmax_e64m1());
|
||||
id = __riscv_vxor(id, __riscv_vsetvlmax_e64m1() - 2, __riscv_vsetvlmax_e64m1());
|
||||
return __riscv_vrgather(vd_rvvm1_sleef, id, __riscv_vsetvlmax_e64m1());
|
||||
rvv_dp_vuint2 id = __riscv_vid_v_u64m1(((int)__riscv_vsetvlmax_e64m1()));
|
||||
id = __riscv_vxor(id, ((int)__riscv_vsetvlmax_e64m1()) - 2, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
return __riscv_vrgather(vd_rvvm1_sleef, id, ((int)__riscv_vsetvlmax_e64m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_rvvm1_sleef vreva2_vf_vf_rvvm1_sleef(vfloat_rvvm1_sleef vf) {
|
||||
vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1((__riscv_vsetvlmax_e32m1())));
|
||||
id = __riscv_vxor(id, (__riscv_vsetvlmax_e32m1()) - 2, (__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), (__riscv_vsetvlmax_e32m1()));
|
||||
vint2_rvvm1_sleef id = __riscv_vreinterpret_i32m1(__riscv_vid_v_u32m1(((int)__riscv_vsetvlmax_e32m1())));
|
||||
id = __riscv_vxor(id, ((int)__riscv_vsetvlmax_e32m1()) - 2, ((int)__riscv_vsetvlmax_e32m1()));
|
||||
return __riscv_vrgather(vf, __riscv_vreinterpret_u32m1(id), ((int)__riscv_vsetvlmax_e32m1()));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vd_rvvm1_sleef(double *ptr, int offset, int step, vdouble_rvvm1_sleef v) {
|
||||
|
||||
ptr += offset * 2;
|
||||
for (int i = 0; i < __riscv_vsetvlmax_e64m1(); i += 2) {
|
||||
for (int i = 0; i < (int)((int)__riscv_vsetvlmax_e64m1()); i += 2) {
|
||||
|
||||
vdouble_rvvm1_sleef vv = __riscv_vslidedown(v, i, 2);
|
||||
__riscv_vse64(ptr, vv, 2);
|
||||
@ -1907,7 +1904,7 @@ static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vd_rvvm1_sleef(double *ptr, in
|
||||
static SLEEF_ALWAYS_INLINE void vscatter2_v_p_i_i_vf_rvvm1_sleef(float *ptr, int offset, int step, vfloat_rvvm1_sleef v) {
|
||||
|
||||
ptr += offset * 2;
|
||||
for (int i = 0; i < (__riscv_vsetvlmax_e32m1()); i += 2) {
|
||||
for (int i = 0; i < (int)((int)__riscv_vsetvlmax_e32m1()); i += 2) {
|
||||
vfloat_rvvm1_sleef vv = __riscv_vslidedown(v, i, 2);
|
||||
__riscv_vse32(ptr, vv, 2);
|
||||
ptr += step * 2;
|
||||
@ -2007,7 +2004,7 @@ static SLEEF_ALWAYS_INLINE tdi_t_rvvm1_sleef tdisettdi_tdi_vd3_vi_rvvm1_sleef(vd
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE rvv_dp_vopmask_rvvm1_sleef vcast_vo_i_rvvm1_sleef(int i) {
|
||||
return __riscv_vreinterpret_b64(__riscv_vmv_v_x_u32m1(i, (__riscv_vsetvlmax_e32m1())));
|
||||
return __riscv_vreinterpret_b64(__riscv_vmv_v_x_u32m1(i, ((int)__riscv_vsetvlmax_e32m1())));
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE vmask_rvvm1_sleef vreinterpret_vm_vi64_rvvm1_sleef(vint64_rvvm1_sleef v) {
|
||||
return __riscv_vreinterpret_u64m1(v);
|
||||
@ -2022,7 +2019,7 @@ static SLEEF_ALWAYS_INLINE vuint64_rvvm1_sleef vreinterpret_vu64_vm_rvvm1_sleef(
|
||||
return m;
|
||||
}
|
||||
static SLEEF_ALWAYS_INLINE int vtestallzeros_i_vo64_rvvm1_sleef(rvv_dp_vopmask_rvvm1_sleef g) {
|
||||
return __riscv_vcpop(g, __riscv_vsetvlmax_e64m1()) == 0;
|
||||
return __riscv_vcpop(g, ((int)__riscv_vsetvlmax_e64m1())) == 0;
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE void vstream_v_p_vd_rvvm1_sleef(double *ptr, vdouble_rvvm1_sleef v) { vstore_v_p_vd_rvvm1_sleef(ptr, v); }
|
||||
@ -2048,7 +2045,7 @@ static int vcast_i_vi2(vint2_rvvm1_sleef v) {
|
||||
|
||||
static vquad_rvvm1_sleef loadu_vq_p_rvvm1_sleef(const int32_t *ptr) {
|
||||
|
||||
return __riscv_vreinterpret_u64m2(__riscv_vreinterpret_u32m2(__riscv_vle32_v_i32m2(ptr, (__riscv_vsetvlmax_e32m1()) * 2)));
|
||||
return __riscv_vreinterpret_u64m2(__riscv_vreinterpret_u32m2(__riscv_vle32_v_i32m2(ptr, ((int)__riscv_vsetvlmax_e32m1()) * 2)));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vquad_rvvm1_sleef cast_vq_aq_rvvm1_sleef(vargquad_rvvm1_sleef aq) { return aq; }
|
||||
@ -3511,7 +3508,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_expdx_u10rvvm1(vdouble_rvvm1_
|
||||
|
||||
u = vldexp2_vd_vd_vi_rvvm1_sleef(u, q);
|
||||
|
||||
u = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(709.78271114955742909217217426)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), u);
|
||||
rvv_dp_vopmask_rvvm1_sleef o = vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(0x1.62e42fefa39efp+9));
|
||||
u = vsel_vd_vo_vd_vd_rvvm1_sleef(o, vcast_vd_d_rvvm1_sleef(__builtin_inf()), u);
|
||||
u = vreinterpret_vd_vm_rvvm1_sleef(vandnot_vm_vo64_vm_rvvm1_sleef(vlt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1000)), vreinterpret_vm_vd_rvvm1_sleef(u)));
|
||||
|
||||
return u;
|
||||
@ -3628,13 +3626,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_rvvm1_sleef expk_rvvm1_sleef(vdou
|
||||
}
|
||||
|
||||
SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_powdx_u10rvvm1(vdouble_rvvm1_sleef x, vdouble_rvvm1_sleef y) {
|
||||
|
||||
rvv_dp_vopmask_rvvm1_sleef yisint = visint_vo_vd_rvvm1_sleef(y);
|
||||
rvv_dp_vopmask_rvvm1_sleef yisodd = rvv_dp_vand_vo_vo_vo(visodd_vo_vd_rvvm1_sleef(y), yisint);
|
||||
|
||||
vdouble2_rvvm1_sleef d = ddmul_vd2_vd2_vd_rvvm1_sleef(logk_rvvm1_sleef(vabs_vd_vd_rvvm1_sleef(x)), y);
|
||||
vdouble_rvvm1_sleef result = expk_rvvm1_sleef(d);
|
||||
result = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(709.78271114955742909217217426)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), result);
|
||||
rvv_dp_vopmask_rvvm1_sleef o = vgt_vo_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(0x1.62e42fefa39efp+9));
|
||||
result = vsel_vd_vo_vd_vd_rvvm1_sleef(o, vcast_vd_d_rvvm1_sleef(__builtin_inf()), result);
|
||||
|
||||
result = vmul_vd_vd_vd_rvvm1_sleef(result,
|
||||
vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)),
|
||||
@ -3660,7 +3658,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_powdx_u10rvvm1(vdouble_rvvm1_
|
||||
result = vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(y, vcast_vd_d_rvvm1_sleef(0)), veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(1))), vcast_vd_d_rvvm1_sleef(1), result);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble2_rvvm1_sleef expk2_rvvm1_sleef(vdouble2_rvvm1_sleef d) {
|
||||
@ -4148,7 +4145,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_log1pdx_u10rvvm1(vdouble_rvvm
|
||||
|
||||
vdouble_rvvm1_sleef r = vadd_vd_vd_vd_rvvm1_sleef(vd2getx_vd_vd2_rvvm1_sleef(s), vd2gety_vd_vd2_rvvm1_sleef(s));
|
||||
|
||||
r = vsel_vd_vo_vd_vd_rvvm1_sleef(vgt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(1e+307)), vcast_vd_d_rvvm1_sleef(__builtin_inf()), r);
|
||||
rvv_dp_vopmask_rvvm1_sleef ocore = vle_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(0x1.c7b1f3cac7433p+1019));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo64_rvvm1_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_rvvm1_sleef(ocore, r, Sleef_logdx_u10rvvm1(d));
|
||||
r = vsel_vd_vo_vd_vd_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(vlt_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1)), visnan_vo_vd_rvvm1_sleef(d)), vcast_vd_d_rvvm1_sleef(__builtin_nan("")), r);
|
||||
r = vsel_vd_vo_vd_vd_rvvm1_sleef(veq_vo_vd_vd_rvvm1_sleef(d, vcast_vd_d_rvvm1_sleef(-1)), vcast_vd_d_rvvm1_sleef(-__builtin_inf()), r);
|
||||
r = vsel_vd_vo_vd_vd_rvvm1_sleef(visnegzero_vo_vd_rvvm1_sleef(d), vcast_vd_d_rvvm1_sleef(-0.0), r);
|
||||
@ -4228,7 +4226,7 @@ SLEEF_INLINE SLEEF_CONST vint_rvvm1_sleef Sleef_expfrexpdx_rvvm1(vdouble_rvvm1_s
|
||||
vint_rvvm1_sleef ret = vcastu_vi_vm_rvvm1_sleef(vreinterpret_vm_vd_rvvm1_sleef(x));
|
||||
ret = vsub_vi_vi_vi_rvvm1_sleef(vand_vi_vi_vi_rvvm1_sleef(vsrl_vi_vi_i_rvvm1_sleef(ret, 20), vcast_vi_i_rvvm1_sleef(0x7ff)), vcast_vi_i_rvvm1_sleef(0x3fe));
|
||||
|
||||
ret = vsel_vi_vo_vi_vi_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)), visnan_vo_vd_rvvm1_sleef(x)), visinf_vo_vd_rvvm1_sleef(x)), vcast_vi_i_rvvm1_sleef(0), ret);
|
||||
ret = vsel_vi_vo_vi_vi_rvvm1_sleef(vcast_vo32_vo64_rvvm1_sleef(rvv_dp_vor_vo_vo_vo(rvv_dp_vor_vo_vo_vo(veq_vo_vd_vd_rvvm1_sleef(x, vcast_vd_d_rvvm1_sleef(0)), visnan_vo_vd_rvvm1_sleef(x)), visinf_vo_vd_rvvm1_sleef(x))), vcast_vi_i_rvvm1_sleef(0), ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -4631,14 +4629,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_rvvm1_sleef Sleef_erfcdx_u15rvvm1(vdouble_rvvm1
|
||||
return r;
|
||||
}
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -5105,6 +5095,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_rvvm1_sleef Sleef_tanfx_u35rvvm1(vfloat_rvvm1_sl
|
||||
if (__builtin_expect(!!(vtestallones_i_vo32_rvvm1_sleef(vlt_vo_vf_vf_rvvm1_sleef(vabs_vf_vf_rvvm1_sleef(d), vcast_vf_f_rvvm1_sleef(125.0f*0.5f)))), 1)) {
|
||||
q = vrint_vi2_vf_rvvm1_sleef(vmul_vf_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef((float)(2 * 0.318309886183790671537767526745028724))));
|
||||
u = vcast_vf_vi2_rvvm1_sleef(q);
|
||||
|
||||
x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-3.1414794921875f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-0.00011315941810607910156f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_rvvm1_sleef(u, vcast_vf_f_rvvm1_sleef(-1.9841872589410058936e-09f*0.5f), x);
|
||||
@ -6506,7 +6497,8 @@ SLEEF_INLINE SLEEF_CONST vfloat_rvvm1_sleef Sleef_log1pfx_u10rvvm1(vfloat_rvvm1_
|
||||
|
||||
vfloat_rvvm1_sleef r = vadd_vf_vf_vf_rvvm1_sleef(vf2getx_vf_vf2_rvvm1_sleef(s), vf2gety_vf_vf2_rvvm1_sleef(s));
|
||||
|
||||
r = vsel_vf_vo_vf_vf_rvvm1_sleef(vgt_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(1e+38)), vcast_vf_f_rvvm1_sleef(__builtin_inff()), r);
|
||||
rvv_sp_vopmask_rvvm1_sleef ocore = vle_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(0x1.2ced32p+126));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo32_rvvm1_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_rvvm1_sleef(ocore, r, Sleef_logfx_u10rvvm1(d));
|
||||
r = vreinterpret_vf_vm_rvvm1_sleef(vor_vm_vo32_vm_rvvm1_sleef(vgt_vo_vf_vf_rvvm1_sleef(vcast_vf_f_rvvm1_sleef(-1), d), vreinterpret_vm_vf_rvvm1_sleef(r)));
|
||||
r = vsel_vf_vo_vf_vf_rvvm1_sleef(veq_vo_vf_vf_rvvm1_sleef(d, vcast_vf_f_rvvm1_sleef(-1)), vcast_vf_f_rvvm1_sleef(-__builtin_inff()), r);
|
||||
r = vsel_vf_vo_vf_vf_rvvm1_sleef(visnegzero_vo_vf_rvvm1_sleef(d), vcast_vf_f_rvvm1_sleef(-0.0f), r);
|
||||
|
||||
@ -1,8 +1,11 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// This file is generated by SLEEF 3.6.1
|
||||
// This file is generated by SLEEF 3.9.0
|
||||
|
||||
/* #undef SLEEF_FLOAT128_IS_IEEEQP */
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
|
||||
#ifndef SLEEF_ALWAYS_INLINE
|
||||
#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)
|
||||
@ -1010,6 +1013,7 @@ static const double Sleef_rempitabdp[] = {
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,
|
||||
2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const float Sleef_rempitabsp[] = {
|
||||
@ -1116,17 +1120,10 @@ static const float Sleef_rempitabsp[] = {
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
#endif // #ifndef __SLEEF_REMPITAB__
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -1833,13 +1830,13 @@ static SLEEF_ALWAYS_INLINE vfloat_sve_sleef vmlsubadd_vf_vf_vf_vf_sve_sleef(vflo
|
||||
static SLEEF_ALWAYS_INLINE vdouble_sve_sleef vrev21_vd_vd_sve_sleef(vdouble_sve_sleef x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); }
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vdouble_sve_sleef vreva2_vd_vd_sve_sleef(vdouble_sve_sleef vd_sve_sleef) {
|
||||
svint64_t x = svindex_s64(((svcntd())-1), -1);
|
||||
svint64_t x = svindex_s64((((int)svcntd())-1), -1);
|
||||
x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));
|
||||
return svtbl_f64(vd_sve_sleef, svreinterpret_u64_s64(x));
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE vfloat_sve_sleef vreva2_vf_vf_sve_sleef(vfloat_sve_sleef vf) {
|
||||
svint32_t x = svindex_s32(((svcntw())-1), -1);
|
||||
svint32_t x = svindex_s32((((int)svcntw())-1), -1);
|
||||
x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));
|
||||
return svtbl_f32(vf, svreinterpret_u32_s32(x));
|
||||
}
|
||||
@ -3381,7 +3378,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_expdx_u10sve(vdouble_sve_sleef
|
||||
|
||||
u = vldexp2_vd_vd_vi_sve_sleef(u, q);
|
||||
|
||||
u = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(709.78271114955742909217217426)), vcast_vd_d_sve_sleef(__builtin_inf()), u);
|
||||
vopmask_sve_sleef o = vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(0x1.62e42fefa39efp+9));
|
||||
u = vsel_vd_vo_vd_vd_sve_sleef(o, vcast_vd_d_sve_sleef(__builtin_inf()), u);
|
||||
u = vreinterpret_vd_vm_sve_sleef(vandnot_vm_vo64_vm_sve_sleef(vlt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1000)), vreinterpret_vm_vd_sve_sleef(u)));
|
||||
|
||||
return u;
|
||||
@ -3498,13 +3496,13 @@ static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble_sve_sleef expk_sve_sleef(vdouble2
|
||||
}
|
||||
|
||||
SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_powdx_u10sve(vdouble_sve_sleef x, vdouble_sve_sleef y) {
|
||||
|
||||
vopmask_sve_sleef yisint = visint_vo_vd_sve_sleef(y);
|
||||
vopmask_sve_sleef yisodd = vand_vo_vo_vo_sve_sleef(visodd_vo_vd_sve_sleef(y), yisint);
|
||||
|
||||
vdouble2_sve_sleef d = ddmul_vd2_vd2_vd_sve_sleef(logk_sve_sleef(vabs_vd_vd_sve_sleef(x)), y);
|
||||
vdouble_sve_sleef result = expk_sve_sleef(d);
|
||||
result = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(d), vcast_vd_d_sve_sleef(709.78271114955742909217217426)), vcast_vd_d_sve_sleef(__builtin_inf()), result);
|
||||
vopmask_sve_sleef o = vgt_vo_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(d), vcast_vd_d_sve_sleef(0x1.62e42fefa39efp+9));
|
||||
result = vsel_vd_vo_vd_vd_sve_sleef(o, vcast_vd_d_sve_sleef(__builtin_inf()), result);
|
||||
|
||||
result = vmul_vd_vd_vd_sve_sleef(result,
|
||||
vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)),
|
||||
@ -3530,7 +3528,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_powdx_u10sve(vdouble_sve_sleef
|
||||
result = vsel_vd_vo_vd_vd_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(y, vcast_vd_d_sve_sleef(0)), veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(1))), vcast_vd_d_sve_sleef(1), result);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
static SLEEF_ALWAYS_INLINE SLEEF_CONST vdouble2_sve_sleef expk2_sve_sleef(vdouble2_sve_sleef d) {
|
||||
@ -4018,7 +4015,8 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_log1pdx_u10sve(vdouble_sve_slee
|
||||
|
||||
vdouble_sve_sleef r = vadd_vd_vd_vd_sve_sleef(vd2getx_vd_vd2_sve_sleef(s), vd2gety_vd_vd2_sve_sleef(s));
|
||||
|
||||
r = vsel_vd_vo_vd_vd_sve_sleef(vgt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(1e+307)), vcast_vd_d_sve_sleef(__builtin_inf()), r);
|
||||
vopmask_sve_sleef ocore = vle_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(0x1.c7b1f3cac7433p+1019));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo64_sve_sleef (ocore)), 1)) r = vsel_vd_vo_vd_vd_sve_sleef(ocore, r, Sleef_logdx_u10sve(d));
|
||||
r = vsel_vd_vo_vd_vd_sve_sleef(vor_vo_vo_vo_sve_sleef(vlt_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1)), visnan_vo_vd_sve_sleef(d)), vcast_vd_d_sve_sleef(__builtin_nan("")), r);
|
||||
r = vsel_vd_vo_vd_vd_sve_sleef(veq_vo_vd_vd_sve_sleef(d, vcast_vd_d_sve_sleef(-1)), vcast_vd_d_sve_sleef(-__builtin_inf()), r);
|
||||
r = vsel_vd_vo_vd_vd_sve_sleef(visnegzero_vo_vd_sve_sleef(d), vcast_vd_d_sve_sleef(-0.0), r);
|
||||
@ -4098,7 +4096,7 @@ SLEEF_INLINE SLEEF_CONST vint_sve_sleef Sleef_expfrexpdx_sve(vdouble_sve_sleef x
|
||||
vint_sve_sleef ret = vcastu_vi_vm_sve_sleef(vreinterpret_vm_vd_sve_sleef(x));
|
||||
ret = vsub_vi_vi_vi_sve_sleef(vand_vi_vi_vi_sve_sleef(vsrl_vi_vi_i_sve_sleef(ret, 20), vcast_vi_i_sve_sleef(0x7ff)), vcast_vi_i_sve_sleef(0x3fe));
|
||||
|
||||
ret = vsel_vi_vo_vi_vi_sve_sleef(vor_vo_vo_vo_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)), visnan_vo_vd_sve_sleef(x)), visinf_vo_vd_sve_sleef(x)), vcast_vi_i_sve_sleef(0), ret);
|
||||
ret = vsel_vi_vo_vi_vi_sve_sleef(vcast_vo32_vo64_sve_sleef(vor_vo_vo_vo_sve_sleef(vor_vo_vo_vo_sve_sleef(veq_vo_vd_vd_sve_sleef(x, vcast_vd_d_sve_sleef(0)), visnan_vo_vd_sve_sleef(x)), visinf_vo_vd_sve_sleef(x))), vcast_vi_i_sve_sleef(0), ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -4497,14 +4495,6 @@ SLEEF_INLINE SLEEF_CONST vdouble_sve_sleef Sleef_erfcdx_u15sve(vdouble_sve_sleef
|
||||
return r;
|
||||
}
|
||||
|
||||
#if !defined(__NVCC__) && ((defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8))
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && !defined(__NVCC__) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
@ -4983,6 +4973,7 @@ SLEEF_INLINE SLEEF_CONST vfloat_sve_sleef Sleef_tanfx_u35sve(vfloat_sve_sleef d)
|
||||
if (__builtin_expect(!!(vtestallones_i_vo32_sve_sleef(vlt_vo_vf_vf_sve_sleef(vabs_vf_vf_sve_sleef(d), vcast_vf_f_sve_sleef(125.0f*0.5f)))), 1)) {
|
||||
q = vrint_vi2_vf_sve_sleef(vmul_vf_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef((float)(2 * 0.318309886183790671537767526745028724))));
|
||||
u = vcast_vf_vi2_sve_sleef(q);
|
||||
|
||||
x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-3.1414794921875f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-0.00011315941810607910156f*0.5f), x);
|
||||
x = vmla_vf_vf_vf_vf_sve_sleef(u, vcast_vf_f_sve_sleef(-1.9841872589410058936e-09f*0.5f), x);
|
||||
@ -6384,7 +6375,8 @@ SLEEF_INLINE SLEEF_CONST vfloat_sve_sleef Sleef_log1pfx_u10sve(vfloat_sve_sleef
|
||||
|
||||
vfloat_sve_sleef r = vadd_vf_vf_vf_sve_sleef(vf2getx_vf_vf2_sve_sleef(s), vf2gety_vf_vf2_sve_sleef(s));
|
||||
|
||||
r = vsel_vf_vo_vf_vf_sve_sleef(vgt_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(1e+38)), vcast_vf_f_sve_sleef(__builtin_inff()), r);
|
||||
vopmask_sve_sleef ocore = vle_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(0x1.2ced32p+126));
|
||||
if(!__builtin_expect(!!(vtestallones_i_vo32_sve_sleef (ocore)), 1)) r = vsel_vf_vo_vf_vf_sve_sleef(ocore, r, Sleef_logfx_u10sve(d));
|
||||
r = vreinterpret_vf_vm_sve_sleef(vor_vm_vo32_vm_sve_sleef(vgt_vo_vf_vf_sve_sleef(vcast_vf_f_sve_sleef(-1), d), vreinterpret_vm_vf_sve_sleef(r)));
|
||||
r = vsel_vf_vo_vf_vf_sve_sleef(veq_vo_vf_vf_sve_sleef(d, vcast_vf_f_sve_sleef(-1)), vcast_vf_f_sve_sleef(-__builtin_inff()), r);
|
||||
r = vsel_vf_vo_vf_vf_sve_sleef(visnegzero_vo_vf_sve_sleef(d), vcast_vf_f_sve_sleef(-0.0f), r);
|
||||
|
||||
@ -1,3 +1,52 @@
|
||||
## 3.8 - 2025-01-27
|
||||
The focus of this release has been to facilitate benchmarking in SLEEF.
|
||||
It does so by providing a benchmarking tool and a plotting tool to postprocess
|
||||
the results.
|
||||
AArch64 self-hosted runners have been added to CI. Following this, the Linux and
|
||||
compiler versions have been updated.
|
||||
This release also fixes inaccuracy issues in a few functions, failures with cppcheck, and a few
|
||||
bugs.
|
||||
Finally, the project has been extended with a blog section and its first blog
|
||||
[post](https://sleef.org/2024/10/02/new-pulse.html).
|
||||
|
||||
### Added
|
||||
- Add benchmark and plotting tool by @joanaxcruz in #589, #597, #608 and #609
|
||||
- Use Arm-hosted runners by @blapie in #581
|
||||
- Add blog section and first post by @blapie in #582
|
||||
|
||||
### Changed
|
||||
- Update GH runners to Ubuntu 24.04 and GCC14 by @blapie in #598, #599 and #601
|
||||
|
||||
### Fixed
|
||||
- Fix cbrt on AArch32, and atanf(+-0) with gcc-13 by @shibatch in #592
|
||||
- Fix overflow bound in log1p(f), exp and pow by @blapie in #604 and #606
|
||||
- Work around removal of some PowerPC intrinsics in GCC 15 by @musicinmybrain in #612
|
||||
- Fix errors reported by cppcheck by @blapie in #595
|
||||
|
||||
## 3.7 - 2024-09-17
|
||||
|
||||
The focus of this release has been to meet open-source community standards. It
|
||||
does so by providing Contributing Guidelines, plus Issue and Pull Request
|
||||
templates. Additionally, the documentation has been reworked to improve
|
||||
navigation (via search bar, side menu/panel, eased navigation on GitHub, ...)
|
||||
and maintainability (reduced line count, mostly markdown sources, ...). The
|
||||
website rendering is now delegated to a template customisable theme. See the
|
||||
new website at [sleef.org](https://sleef.org/), and [docs/](./docs) for the
|
||||
GitHub-rendered documentation. The release also provides various bug fixes on
|
||||
several targets, for CPU detection and in the benchmark infrastructure.
|
||||
|
||||
### Added
|
||||
- Add issue and PR templates by @blapie in https://github.com/shibatch/sleef/pull/565
|
||||
|
||||
### Changed
|
||||
- Adjust scheduling of GHA workflows by @blapie in https://github.com/shibatch/sleef/pull/553
|
||||
- Port documentation from html to markdown by @blapie in https://github.com/shibatch/sleef/pull/564
|
||||
- Update acosh documentation by @joanaxcruz in https://github.com/shibatch/sleef/pull/572
|
||||
|
||||
### Fixed
|
||||
- S/390: Use getauxval for detecting VXE2 to fix #560 by @Andreas-Krebbel in https://github.com/shibatch/sleef/pull/561
|
||||
- Revive micro-benchmarks for vector functions by @joanaxcruz in https://github.com/shibatch/sleef/pull/571
|
||||
|
||||
## 3.6.1 - 2024-06-10
|
||||
|
||||
This patch release provides important bug fixes, including a fix
|
||||
|
||||
@ -1,8 +1,14 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(SLEEF VERSION 3.6.1 LANGUAGES C)
|
||||
|
||||
set(SLEEF_VERSION 3.9.0)
|
||||
|
||||
message(STATUS "Configuring SLEEF ${SLEEF_VERSION}")
|
||||
project(SLEEF VERSION ${SLEEF_VERSION} LANGUAGES C CXX)
|
||||
|
||||
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
|
||||
# Options
|
||||
|
||||
option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF)
|
||||
@ -13,28 +19,96 @@ option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF)
|
||||
option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
|
||||
option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF)
|
||||
option(SLEEF_BUILD_TESTS "Tests will be built." ON)
|
||||
option(SLEEF_BUILD_BENCH "Bench will be built." OFF)
|
||||
option(SLEEF_BUILD_BENCH_REF "Benchmark script for reference (e.g. system libm) will be built." OFF)
|
||||
option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF)
|
||||
|
||||
option(SLEEF_ENFORCE_DFT "Build fails if DFT is not built" OFF)
|
||||
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
|
||||
|
||||
option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF)
|
||||
option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON)
|
||||
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
|
||||
option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF)
|
||||
|
||||
option(SLEEF_ENABLE_TESTER "Enable testing libm with tester" OFF)
|
||||
option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF)
|
||||
|
||||
option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_TESTER4 "Enable testing with tester4" ON)
|
||||
option(SLEEF_ENFORCE_TESTER4 "Build fails if tester4 is not available" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
|
||||
option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
|
||||
option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF)
|
||||
option(SLEEF_ENABLE_TLFLOAT "Enable use of TLFloat library" ON)
|
||||
|
||||
option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF)
|
||||
set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
|
||||
|
||||
option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF)
|
||||
option(SLEEF_ENABLE_CXX "Enable C++" OFF)
|
||||
|
||||
option(SLEEF_BUILD_WITH_LIBM "build libsleef with libm, can turn off on Windows to solve mutiple math functions issue." ON)
|
||||
|
||||
option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
|
||||
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
|
||||
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
|
||||
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
|
||||
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_AVX "Disable AVX" OFF)
|
||||
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
|
||||
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
|
||||
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
|
||||
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
|
||||
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
|
||||
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
|
||||
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
|
||||
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
|
||||
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
|
||||
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
|
||||
|
||||
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
|
||||
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
|
||||
|
||||
#
|
||||
|
||||
if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
|
||||
(NOT CMAKE_C_COMPILER_VERSION VERSION_EQUAL CMAKE_CXX_COMPILER_VERSION))
|
||||
message(FATAL_ERROR "Different versions of C compiler and C++ compiler")
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_BUILD_BENCH_REF)
|
||||
if (NOT SLEEF_BUILD_BENCH)
|
||||
message(FATAL_ERROR "SLEEF_BUILD_BENCH must be on when SLEEF_BUILD_BENCH_REF is enabled.")
|
||||
endif ()
|
||||
if(NOT CMAKE_SYSTEM_NAME MATCHES Linux)
|
||||
message(FATAL_ERROR "Libm benchmarking not supported in this OS.")
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
if (DEFINED SLEEF_BUILD_SHARED_LIBS)
|
||||
set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS})
|
||||
endif ()
|
||||
@ -133,13 +207,11 @@ set(COSTOVERRIDE_RVVM2NOFMA 20)
|
||||
#
|
||||
|
||||
enable_testing()
|
||||
|
||||
if (SLEEF_ENABLE_CXX)
|
||||
enable_language(CXX)
|
||||
endif()
|
||||
enable_language(CXX)
|
||||
|
||||
if (SLEEF_ENABLE_CUDA)
|
||||
enable_language(CUDA)
|
||||
set(CMAKE_CUDA_ARCHITECTURES all-major)
|
||||
endif()
|
||||
|
||||
# For specifying installation directories
|
||||
@ -197,6 +269,7 @@ include(Configure.cmake)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/sleef-config.h.in
|
||||
${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)
|
||||
include_directories(AFTER "${PROJECT_BINARY_DIR}/include")
|
||||
|
||||
# We like to have a documented index of all targets in the project. The
|
||||
# variables listed below carry the names of the targets defined throughout
|
||||
@ -228,7 +301,9 @@ set(TARGET_MKALIAS "mkalias")
|
||||
# Generates static library common
|
||||
# Defined in src/common/CMakeLists.txt via command add_library
|
||||
set(TARGET_LIBCOMMON_OBJ "common")
|
||||
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
|
||||
set(TARGET_PSHA_OBJ "psha_obj")
|
||||
set(TARGET_TESTERUTIL_OBJ "testerutil_obj")
|
||||
set(TARGET_QTESTERUTIL_OBJ "qtesterutil_obj")
|
||||
|
||||
# Function used to add an executable that is executed on host
|
||||
function(add_host_executable TARGETNAME)
|
||||
@ -239,15 +314,23 @@ function(add_host_executable TARGETNAME)
|
||||
target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
elseif (DEFINED ENV{SLEEF_TARGET_EXEC_USE_QEMU})
|
||||
if($ENV{SLEEF_TARGET_EXEC_USE_QEMU})
|
||||
add_executable(${TARGETNAME} ${ARGN})
|
||||
endif()
|
||||
else()
|
||||
add_executable(${TARGETNAME} IMPORTED GLOBAL)
|
||||
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
|
||||
if(CMAKE_HOST_WIN32)
|
||||
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME}.exe)
|
||||
else()
|
||||
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(host_target_AAVPCS_definitions TARGETNAME)
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
|
||||
# target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@ -303,6 +386,7 @@ if(SLEEF_SHOW_CONFIG)
|
||||
message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}")
|
||||
message(" CMake: ${CMAKE_VERSION}")
|
||||
message(" Make program: ${CMAKE_MAKE_PROGRAM}")
|
||||
message(" CMake build type: ${CMAKE_BUILD_TYPE}")
|
||||
if(CMAKE_CROSSCOMPILING)
|
||||
message(" Crosscompiling SLEEF.")
|
||||
message(" Native build dir: ${NATIVE_BUILD_DIR}")
|
||||
@ -317,6 +401,7 @@ if(SLEEF_SHOW_CONFIG)
|
||||
message(STATUS "GMP : " ${LIBGMP})
|
||||
message(STATUS "RT : " ${LIBRT})
|
||||
message(STATUS "FFTW3 : " ${LIBFFTW3})
|
||||
message(STATUS "FFTW3F : " ${LIBFFTW3F})
|
||||
message(STATUS "OPENSSL : " ${OPENSSL_VERSION})
|
||||
message(STATUS "SDE : " ${SDE_COMMAND})
|
||||
if (SLEEF_BUILD_INLINE_HEADERS)
|
||||
@ -337,3 +422,4 @@ if(SLEEF_SHOW_CONFIG)
|
||||
message(STATUS "Building SLEEF with AArch64 Vector PCS support")
|
||||
endif()
|
||||
endif(SLEEF_SHOW_CONFIG)
|
||||
|
||||
|
||||
@ -1,27 +0,0 @@
|
||||
# List of contributors
|
||||
|
||||
These lists are not exhaustive and only provide the most relevant contact information.
|
||||
For an exhaustive list of contributors, please refer to the
|
||||
[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors).
|
||||
|
||||
## Maintainers
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Pierre Blanchard | Arm Ltd. | https://github.com/blapie |
|
||||
| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz |
|
||||
| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay |
|
||||
| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch |
|
||||
|
||||
## Contributors
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Anonymous | | https://github.com/friendlyanon |
|
||||
| Diana Bite | Former Arm Ltd. | https://github.com/diaena |
|
||||
| Ludovic Henry | Rivos Inc. | https://github.com/luhenry |
|
||||
| Martin Krastev | Chaos Group | https://github.com/blu |
|
||||
| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy |
|
||||
| Kerry McLaughlin | Arm Ltd. | https://github.com/kmclaughlin-arm |
|
||||
| Alexandre Mutel | Unity Technologies | https://github.com/xoofx |
|
||||
| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm |
|
||||
@ -1,5 +1,6 @@
|
||||
include(CheckCCompilerFlag)
|
||||
include(CheckCSourceCompiles)
|
||||
include(CheckCXXSourceCompiles)
|
||||
include(CheckTypeSize)
|
||||
include(CheckLanguage)
|
||||
|
||||
@ -11,35 +12,39 @@ if (SLEEF_BUILD_STATIC_TEST_BINS)
|
||||
set(CMAKE_EXE_LINKER_FLAGS "-static")
|
||||
endif()
|
||||
|
||||
set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
|
||||
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
|
||||
if (SLEEF_BUILD_STATIC_TEST_BINS)
|
||||
set(OPENSSL_USE_STATIC_LIBS TRUE)
|
||||
endif()
|
||||
find_package(OpenSSL)
|
||||
if (OPENSSL_FOUND)
|
||||
set(SLEEF_OPENSSL_FOUND TRUE)
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
|
||||
# Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically.
|
||||
# This is a known issue https://github.com/openssl/openssl/issues/13872.
|
||||
if (NOT SLEEF_DISABLE_SSL)
|
||||
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
|
||||
if (SLEEF_BUILD_STATIC_TEST_BINS)
|
||||
string(REGEX REPLACE
|
||||
"-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
|
||||
SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
|
||||
set(OPENSSL_USE_STATIC_LIBS TRUE)
|
||||
endif()
|
||||
find_package(OpenSSL)
|
||||
if (OPENSSL_FOUND)
|
||||
set(SLEEF_OPENSSL_FOUND TRUE)
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
|
||||
# Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically.
|
||||
# This is a known issue https://github.com/openssl/openssl/issues/13872.
|
||||
if (SLEEF_BUILD_STATIC_TEST_BINS)
|
||||
string(REGEX REPLACE
|
||||
"-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
|
||||
SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
|
||||
endif()
|
||||
set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
|
||||
set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
|
||||
endif()
|
||||
else()
|
||||
# find_package cannot find OpenSSL when cross-compiling
|
||||
find_library(LIBSSL ssl)
|
||||
find_library(LIBCRYPTO crypto)
|
||||
if (LIBSSL AND LIBCRYPTO)
|
||||
set(SLEEF_OPENSSL_FOUND TRUE)
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
|
||||
set(SLEEF_OPENSSL_VERSION ${LIBSSL})
|
||||
endif()
|
||||
set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
|
||||
set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
|
||||
endif()
|
||||
else()
|
||||
# find_package cannot find OpenSSL when cross-compiling
|
||||
find_library(LIBSSL ssl)
|
||||
find_library(LIBCRYPTO crypto)
|
||||
if (LIBSSL AND LIBCRYPTO)
|
||||
set(SLEEF_OPENSSL_FOUND TRUE)
|
||||
set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
|
||||
set(SLEEF_OPENSSL_VERSION ${LIBSSL})
|
||||
endif()
|
||||
set(SLEEF_OPENSSL_FOUND FALSE)
|
||||
message(STATUS "Detection of OpenSSL is skipped since SLEEF_DISABLE_SSL is specified")
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND)
|
||||
@ -48,10 +53,20 @@ endif()
|
||||
|
||||
# Some toolchains require explicit linking of the libraries following.
|
||||
find_library(LIB_MPFR mpfr)
|
||||
find_library(LIBM m)
|
||||
if(SLEEF_BUILD_WITH_LIBM)
|
||||
find_library(LIBM m)
|
||||
endif()
|
||||
find_library(LIBGMP gmp)
|
||||
find_library(LIBRT rt)
|
||||
|
||||
find_library(LIBFFTW3 fftw3)
|
||||
find_library(LIBFFTW3F fftw3f)
|
||||
find_library(LIBFFTW3_OMP fftw3_omp)
|
||||
find_library(LIBFFTW3F_OMP fftw3f_omp)
|
||||
|
||||
if (LIBFFTW3 AND LIBFFTW3F AND LIBFFTW3_OMP AND LIBFFTW3F_OMP)
|
||||
set(SLEEF_LIBFFTW3_LIBRARIES ${LIBFFTW3} ${LIBFFTW3F} ${LIBFFTW3_OMP} ${LIBFFTW3F_OMP})
|
||||
endif()
|
||||
|
||||
if (LIB_MPFR)
|
||||
find_path(MPFR_INCLUDE_DIR
|
||||
@ -63,7 +78,7 @@ if (LIBFFTW3)
|
||||
find_path(FFTW3_INCLUDE_DIR
|
||||
NAMES fftw3.h
|
||||
ONLY_CMAKE_FIND_ROOT_PATH)
|
||||
endif(LIBFFTW3)
|
||||
endif()
|
||||
|
||||
if (NOT LIBM)
|
||||
set(LIBM "")
|
||||
@ -77,10 +92,77 @@ if (SLEEF_DISABLE_MPFR)
|
||||
set(LIB_MPFR "")
|
||||
endif()
|
||||
|
||||
if (SLEEF_DISABLE_SSL)
|
||||
set(SLEEF_OPENSSL_FOUND FALSE)
|
||||
# Include submodules
|
||||
|
||||
set(SLEEF_SUBMODULE_INSTALL_DIR "${CMAKE_BINARY_DIR}/submodules")
|
||||
|
||||
include(ExternalProject)
|
||||
include(FindPkgConfig)
|
||||
|
||||
if (NOT EXISTS "${PROJECT_SOURCE_DIR}/submodules")
|
||||
file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/submodules")
|
||||
endif()
|
||||
|
||||
# Include TLFloat as a submodule
|
||||
|
||||
if (SLEEF_ENABLE_TLFLOAT)
|
||||
set(TLFLOAT_MINIMUM_VERSION 1.15.0)
|
||||
set(TLFLOAT_GIT_TAG "fb0390157d5c8811fc2a5a6d7d8eac27261f06fb")
|
||||
|
||||
set(TLFLOAT_SOURCE_DIR "${PROJECT_SOURCE_DIR}/submodules/tlfloat")
|
||||
set(TLFLOAT_INSTALL_DIR "${SLEEF_SUBMODULE_INSTALL_DIR}/tlfloat")
|
||||
|
||||
set(TLFLOAT_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${TLFLOAT_INSTALL_DIR} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_LIBS=True -DBUILD_UTILS=False -DBUILD_TESTS=False)
|
||||
|
||||
if (CMAKE_C_COMPILER)
|
||||
list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER})
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER)
|
||||
list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_CXX_COMPILER:PATH=${CMAKE_CXX_COMPILER})
|
||||
endif()
|
||||
|
||||
if (CMAKE_TOOLCHAIN_FILE)
|
||||
list(APPEND TLFLOAT_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
|
||||
endif()
|
||||
|
||||
if (EXISTS "${TLFLOAT_SOURCE_DIR}/CMakeLists.txt")
|
||||
# If the source code of tlfloat is already downloaded, use it
|
||||
ExternalProject_Add(ext_tlfloat
|
||||
SOURCE_DIR "${TLFLOAT_SOURCE_DIR}"
|
||||
CMAKE_ARGS ${TLFLOAT_CMAKE_ARGS}
|
||||
UPDATE_DISCONNECTED TRUE
|
||||
)
|
||||
include_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/include")
|
||||
link_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/lib")
|
||||
set(TLFLOAT_LIBRARIES "tlfloat")
|
||||
else()
|
||||
pkg_search_module(TLFLOAT tlfloat)
|
||||
|
||||
if (TLFLOAT_FOUND AND TLFLOAT_VERSION VERSION_GREATER_EQUAL TLFLOAT_MINIMUM_VERSION)
|
||||
# If tlfloat is installed on the system
|
||||
add_custom_target(ext_tlfloat ALL)
|
||||
include_directories(BEFORE "${TLFLOAT_INCLUDE_DIRS}")
|
||||
link_directories(BEFORE "${TLFLOAT_LIBDIR}")
|
||||
message(STATUS "Found installed TLFloat " ${TLFLOAT_VERSION})
|
||||
else()
|
||||
# Otherwise, download the source code
|
||||
find_package(Git REQUIRED)
|
||||
ExternalProject_Add(ext_tlfloat
|
||||
GIT_REPOSITORY https://github.com/shibatch/tlfloat
|
||||
GIT_TAG "${TLFLOAT_GIT_TAG}"
|
||||
SOURCE_DIR "${TLFLOAT_SOURCE_DIR}"
|
||||
CMAKE_ARGS ${TLFLOAT_CMAKE_ARGS}
|
||||
UPDATE_DISCONNECTED TRUE
|
||||
)
|
||||
|
||||
include_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/include")
|
||||
link_directories(BEFORE "${TLFLOAT_INSTALL_DIR}/lib")
|
||||
set(TLFLOAT_LIBRARIES "tlfloat")
|
||||
endif()
|
||||
endif()
|
||||
endif(SLEEF_ENABLE_TLFLOAT)
|
||||
|
||||
# Force set default build type if none was specified
|
||||
# Note: some sleef code requires the optimisation flags turned on
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
@ -124,7 +206,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
set(COMPILER_SUPPORTS_NEON32VFPV4 1)
|
||||
|
||||
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
|
||||
set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.")
|
||||
|
||||
set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx")
|
||||
@ -149,7 +231,7 @@ if(NOT CLANG_EXE_PATH)
|
||||
set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
|
||||
else()
|
||||
# Else we may find clang on the path?
|
||||
find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9")
|
||||
find_program(CLANG_EXE_PATH NAMES clang "clang-25" "clang-24" "clang-23" "clang-22" "clang-21" "clang-20" "clang-19" "clang-18" "clang-17")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -188,7 +270,7 @@ set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs")
|
||||
set(FLAGS_OTHERS "")
|
||||
|
||||
# All variables storing compiler flags should be prefixed with FLAGS_
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang|QCC)")
|
||||
# Always compile sleef with -ffp-contract.
|
||||
set(FLAGS_STRICTMATH "-ffp-contract=off")
|
||||
set(FLAGS_FASTMATH "-ffast-math")
|
||||
@ -209,13 +291,13 @@ if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
|
||||
# Warning flags.
|
||||
set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)")
|
||||
# The following compiler option is needed to suppress the warning
|
||||
# "AVX vector return without AVX enabled changes the ABI" at
|
||||
# src/arch/helpervecext.h:88
|
||||
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
|
||||
set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)")
|
||||
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
|
||||
if (NOT SLEEF_LLVM_AR_COMMAND)
|
||||
@ -296,7 +378,7 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
|
||||
endif()
|
||||
|
||||
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|QCC)" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
|
||||
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}")
|
||||
else()
|
||||
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}")
|
||||
@ -306,9 +388,17 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp")
|
||||
endif()
|
||||
|
||||
if(QNX AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
#set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -march=armv8-a ")
|
||||
#set(DFT_C_FLAGS "${DFT_C_FLAGS} -march=armv8-a ")
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
|
||||
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "QCC")
|
||||
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
|
||||
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
|
||||
set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse")
|
||||
@ -328,9 +418,6 @@ endif()
|
||||
|
||||
# Long double
|
||||
|
||||
option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
|
||||
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)
|
||||
|
||||
if(NOT SLEEF_DISABLE_LONG_DOUBLE)
|
||||
CHECK_TYPE_SIZE("long double" LD_SIZE)
|
||||
if(LD_SIZE GREATER "9")
|
||||
@ -351,9 +438,6 @@ endif()
|
||||
|
||||
# float128
|
||||
|
||||
option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
|
||||
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)
|
||||
|
||||
if(NOT SLEEF_DISABLE_FLOAT128)
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
int main() { __float128 r = 1;
|
||||
@ -373,10 +457,37 @@ if(COMPILER_SUPPORTS_FLOAT128)
|
||||
}" COMPILER_SUPPORTS_QUADMATH)
|
||||
endif()
|
||||
|
||||
# SSE2
|
||||
if(COMPILER_SUPPORTS_FLOAT128)
|
||||
if (CMAKE_CXX_COMPILER_TARGET)
|
||||
set(CMAKE_REQUIRED_FLAGS "--target=${CMAKE_CXX_COMPILER_TARGET}")
|
||||
endif()
|
||||
CHECK_CXX_SOURCE_COMPILES("
|
||||
#include <bit>
|
||||
struct s { long long x, y; };
|
||||
int main(int argc, char **argv) {
|
||||
constexpr s a = std::bit_cast<s>(__float128(0.1234)*__float128(56.789));
|
||||
static_assert((a.x ^ a.y) == 0xc7d695c93a4e2b71LL);
|
||||
__float128 i = argc;
|
||||
return (int)i;
|
||||
}
|
||||
" SLEEF_FLOAT128_IS_IEEEQP)
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
endif()
|
||||
|
||||
option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
|
||||
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)
|
||||
if (CMAKE_CXX_COMPILER_TARGET)
|
||||
set(CMAKE_REQUIRED_FLAGS "--target=${CMAKE_CXX_COMPILER_TARGET}")
|
||||
endif()
|
||||
CHECK_CXX_SOURCE_COMPILES("
|
||||
#include <bit>
|
||||
struct s { long long x, y; };
|
||||
int main(void) {
|
||||
constexpr s a = std::bit_cast<s>((long double)0.1234*(long double)56.789);
|
||||
static_assert((a.x ^ a.y) == 0xc7d695c93a4e2b71LL);
|
||||
}
|
||||
" SLEEF_LONGDOUBLE_IS_IEEEQP)
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
|
||||
# SSE2
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}")
|
||||
@ -397,9 +508,6 @@ endif()
|
||||
|
||||
# SSE 4.1
|
||||
|
||||
option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
|
||||
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -419,9 +527,6 @@ endif()
|
||||
|
||||
# AVX
|
||||
|
||||
option(SLEEF_ENFORCE_AVX "Disable AVX" OFF)
|
||||
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -441,9 +546,6 @@ endif()
|
||||
|
||||
# FMA4
|
||||
|
||||
option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
|
||||
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -463,9 +565,6 @@ endif()
|
||||
|
||||
# AVX2
|
||||
|
||||
option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
|
||||
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -490,9 +589,6 @@ endif()
|
||||
|
||||
# AVX512F
|
||||
|
||||
option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
|
||||
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -522,9 +618,6 @@ endif()
|
||||
|
||||
# SVE
|
||||
|
||||
option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
|
||||
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)
|
||||
|
||||
# Darwin does not support SVE yet (see issue #474),
|
||||
# therefore we disable SVE on Darwin systems.
|
||||
if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
||||
@ -546,15 +639,12 @@ endif()
|
||||
|
||||
# VSX
|
||||
|
||||
option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
|
||||
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <altivec.h>
|
||||
#ifndef __LITTLE_ENDIAN__
|
||||
#error \"Only VSX(ISA2.07) little-endian mode is supported \"
|
||||
#if !defined(__LITTLE_ENDIAN__) && !defined(_AIX)
|
||||
#error \"Only VSX(ISA2.07) little-endian mode and AIX is supported \"
|
||||
#endif
|
||||
int main() {
|
||||
vector double d;
|
||||
@ -576,9 +666,6 @@ endif()
|
||||
|
||||
# VSX3
|
||||
|
||||
option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
|
||||
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -605,9 +692,6 @@ endif()
|
||||
|
||||
# IBM Z
|
||||
|
||||
option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
|
||||
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -629,9 +713,6 @@ endif()
|
||||
|
||||
#
|
||||
|
||||
option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
|
||||
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
@ -653,15 +734,26 @@ endif()
|
||||
|
||||
# RVVM1
|
||||
|
||||
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }"
|
||||
#ifdef __riscv_v
|
||||
#if __riscv_v < 1000000
|
||||
#error \"RVV version 1.0 not supported\"
|
||||
#endif
|
||||
#else
|
||||
#error \"RVV not supported\"
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#if __riscv_v_intrinsic < 12000
|
||||
#error \"RVV instrinsics version 0.12 not supported\"
|
||||
#endif
|
||||
#else
|
||||
#error \"RVV intrinsics not supported\"
|
||||
#endif
|
||||
|
||||
int main(void) { return 0; }"
|
||||
COMPILER_SUPPORTS_RVVM1)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM1)
|
||||
@ -675,15 +767,26 @@ endif()
|
||||
|
||||
# RVVM2
|
||||
|
||||
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }"
|
||||
#ifdef __riscv_v
|
||||
#if __riscv_v < 1000000
|
||||
#error \"RVV version 1.0 not supported\"
|
||||
#endif
|
||||
#else
|
||||
#error \"RVV not supported\"
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#if __riscv_v_intrinsic < 12000
|
||||
#error \"RVV instrinsics version 0.12 not supported\"
|
||||
#endif
|
||||
#else
|
||||
#error \"RVV intrinsics not supported\"
|
||||
#endif
|
||||
|
||||
int main(void) { return 0; }"
|
||||
COMPILER_SUPPORTS_RVVM2)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM2)
|
||||
@ -697,18 +800,14 @@ endif()
|
||||
|
||||
# CUDA
|
||||
|
||||
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
|
||||
|
||||
if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# OpenMP
|
||||
|
||||
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
|
||||
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
|
||||
|
||||
if(NOT SLEEF_DISABLE_OPENMP)
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
find_package(OpenMP)
|
||||
# Check if compilation with OpenMP really succeeds
|
||||
# It might not succeed even though find_package(OpenMP) succeeds.
|
||||
@ -796,6 +895,7 @@ set(CMAKE_REQUIRED_LIBRARIES)
|
||||
|
||||
# Save the default C flags
|
||||
set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
set(ORG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
||||
|
||||
##
|
||||
|
||||
@ -838,10 +938,6 @@ if(SLEEF_SHOW_ERROR_LOG)
|
||||
endif()
|
||||
endif(SLEEF_SHOW_ERROR_LOG)
|
||||
|
||||
if (MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC
|
||||
endif()
|
||||
|
||||
##
|
||||
|
||||
# Set common definitions
|
||||
|
||||
247
src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile
vendored
Normal file
247
src/jdk.incubator.vector/unix/native/libsleef/upstream/Jenkinsfile
vendored
Normal file
@ -0,0 +1,247 @@
|
||||
pipeline {
|
||||
agent { label 'jenkinsfile' }
|
||||
|
||||
stages {
|
||||
stage('Preamble') {
|
||||
parallel {
|
||||
stage('x86_64 linux clang-19-lto') {
|
||||
agent { label 'x86_64 && ubuntu24 && avx512f' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "x86_64 clang-19 with LTO on" `hostname`
|
||||
export CC=clang-19
|
||||
export CXX=clang++-19
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEFDFT_ENABLE_STREAM=True -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True -DSLEEF_ENABLE_LTO=True -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld-19"
|
||||
cmake -E time ninja
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('x86_64 linux clang-19-asan') {
|
||||
agent { label 'x86_64 && ubuntu24 && avx512f' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "x86_64 clang-19 with ASAN on" `hostname`
|
||||
export CC=clang-19
|
||||
export CXX=clang++-19
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEFDFT_ENABLE_STREAM=True -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True -DSLEEF_ASAN=True
|
||||
cmake -E time ninja
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('x86_64 linux gcc-13') {
|
||||
agent { label 'x86_64 && ubuntu24 && cuda' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "x86_64 gcc-13 on" `hostname`
|
||||
export CC=gcc-13
|
||||
export CXX=g++-13
|
||||
export CUDACXX=/opt/cuda-12.6/bin/nvcc
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENABLE_CUDA=True -DSLEEF_ENFORCE_CUDA=True -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=True -DSLEEF_ENFORCE_TESTER=True
|
||||
cmake -E time ninja
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('x86_64 windows clang') {
|
||||
agent { label 'windows11 && vs2022' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
bat """
|
||||
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Auxiliary\\Build\\vcvars64.bat"
|
||||
if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL%
|
||||
call "winbuild-clang.bat" -DCMAKE_BUILD_TYPE=Release -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=True -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENABLE_TESTER4=True -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_DISABLE_SSL=True
|
||||
if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL%
|
||||
ctest -j 4 --output-on-failure
|
||||
exit /b %ERRORLEVEL%
|
||||
"""
|
||||
}
|
||||
}
|
||||
|
||||
stage('x86_64 windows vs2022') {
|
||||
agent { label 'windows11 && vs2022' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
bat """
|
||||
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Auxiliary\\Build\\vcvars64.bat"
|
||||
if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL%
|
||||
call "winbuild-msvc.bat" -DCMAKE_BUILD_TYPE=Release -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=True -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_SSE2=TRUE -DSLEEF_ENFORCE_SSE4=TRUE -DSLEEF_ENFORCE_AVX=TRUE -DSLEEF_ENFORCE_AVX2=TRUE -DSLEEF_ENFORCE_AVX512F=TRUE -DSLEEF_ENFORCE_TESTER4=True
|
||||
if not %ERRORLEVEL% == 0 exit /b %ERRORLEVEL%
|
||||
ctest -j 4 --output-on-failure
|
||||
exit /b %ERRORLEVEL%
|
||||
"""
|
||||
}
|
||||
}
|
||||
|
||||
stage('riscv linux gcc-14') {
|
||||
agent { label 'riscv && ubuntu23' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
script {
|
||||
System.setProperty("org.jenkinsci.plugins.durabletask.BourneShellScript.HEARTBEAT_CHECK_INTERVAL", "86400");
|
||||
}
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "riscv gcc-14 on" `hostname`
|
||||
export CC=gcc-14.2.0
|
||||
export CXX=g++-14.2.0
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=False -DSLEEF_ENFORCE_DFT=False -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_RVVM1=True -DSLEEF_ENFORCE_RVVM2=True
|
||||
cmake -E time oomstaller ninja -j `nproc`
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('arm32 linux gcc-12') {
|
||||
agent { label 'armv7 && debian12' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "arm32 gcc-12 on" `hostname`
|
||||
export CC=gcc-12
|
||||
export CXX=g++-12
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False
|
||||
cmake -E time oomstaller ninja -j `nproc`
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('aarch64 linux clang-19') {
|
||||
agent { label 'aarch64 && ubuntu24 && apple' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "aarch64 clang-19 on" `hostname`
|
||||
export CC=clang-19
|
||||
export CXX=clang++-19
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SVE=TRUE -DEMULATOR=qemu-aarch64-static -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENABLE_LTO=True -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld-19"
|
||||
cmake -E time oomstaller ninja -j `nproc`
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('aarch64 linux gcc-14') {
|
||||
agent { label 'aarch64 && ubuntu24 && apple' }
|
||||
options { skipDefaultCheckout() }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "aarch64 gcc-14 on" `hostname`
|
||||
export CC=gcc-14
|
||||
export CXX=g++-14
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=../../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DSLEEF_ENFORCE_SVE=TRUE -DEMULATOR=qemu-aarch64-static -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False
|
||||
cmake -E time oomstaller ninja -j `nproc`
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('cross-ppc64el gcc') {
|
||||
agent { label 'x86_64 && ubuntu24 && cuda' }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "Cross ppc64el gcc on" `hostname`
|
||||
rm -rf build-native
|
||||
mkdir build-native
|
||||
cd build-native
|
||||
cmake -GNinja .. -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE
|
||||
cmake -E time ninja
|
||||
cd ..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -GNinja .. -DCMAKE_TOOLCHAIN_FILE=../toolchains/ppc64el-gcc.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENFORCE_TESTER3=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_VSX=True -DSLEEF_ENFORCE_VSX3=True
|
||||
cmake -E time ninja
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
export LD_LIBRARY_PATH=/usr/powerpc64le-linux-gnu/lib
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
|
||||
stage('cross-s390x gcc') {
|
||||
agent { label 'x86_64 && ubuntu24 && cuda' }
|
||||
steps {
|
||||
cleanWs()
|
||||
checkout scm
|
||||
sh '''
|
||||
echo "Cross s390x gcc on" `hostname`
|
||||
rm -rf build-native
|
||||
mkdir build-native
|
||||
cd build-native
|
||||
cmake -GNinja .. -DSLEEF_SHOW_CONFIG=1 -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE
|
||||
cmake -E time ninja
|
||||
cd ..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -GNinja .. -DCMAKE_TOOLCHAIN_FILE=../toolchains/s390x-gcc.cmake -DNATIVE_BUILD_DIR=`pwd`/../build-native -DCMAKE_INSTALL_PREFIX=../install -DSLEEF_SHOW_CONFIG=1 -DSLEEF_ENFORCE_TESTER3=TRUE -DSLEEF_BUILD_QUAD=TRUE -DSLEEF_BUILD_DFT=TRUE -DSLEEF_ENFORCE_DFT=TRUE -DSLEEF_ENFORCE_TESTER4=True -DSLEEF_ENABLE_TESTER=False -DSLEEF_ENFORCE_VXE=True -DSLEEF_ENFORCE_VXE2=True
|
||||
cmake -E time ninja
|
||||
export OMP_WAIT_POLICY=passive
|
||||
export CTEST_OUTPUT_ON_FAILURE=TRUE
|
||||
ctest -j `nproc`
|
||||
ninja install
|
||||
'''
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,129 @@
|
||||
== SLEEF - SIMD Library for Evaluating Elementary Functions
|
||||
|
||||
image:http://img.shields.io/badge/DOI-10.1109/TPDS.2019.2960333-blue.svg[TPDS, link=https://ieeexplore.ieee.org/document/8936472]
|
||||
|
||||
SLEEF is a library that implements vectorized versions of C standard
|
||||
math functions. This library also includes DFT subroutines.
|
||||
|
||||
* *Web Page:* https://sleef.org/
|
||||
* *Sources:* https://github.com/shibatch/sleef
|
||||
|
||||
== Supported environment
|
||||
|
||||
=== Test matrix
|
||||
|
||||
The following table summarizes currently supported OSes and compilers.
|
||||
|
||||
[cols="1,1,1,1,1,1,1,1,1"]
|
||||
|===
|
||||
| 2+|Linux 4+|Windows 2+|Mac
|
||||
| |gcc |llvm |MSVC |Clang |MinGW |Cygwin |Clang |GCC
|
||||
|x86_64 |✔ |✔ |✔ |✔ |✔ |❓ |✔ |❓
|
||||
|RISC-V 64 |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A
|
||||
|AArch64 |✔ |✔ |❌ |❌ |❌ |❌ |✔ |❓
|
||||
|POWER |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A
|
||||
|S390X |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A
|
||||
|AArch32 |✔ |❓ |N/A |N/A |N/A |N/A |N/A |N/A
|
||||
|===
|
||||
|
||||
✔ : Tested on CI, ❓ : Not tested, ❌ : Not supported
|
||||
|
||||
|
||||
== How to build SLEEF
|
||||
|
||||
The library itself does not have any additional dependency.
|
||||
|
||||
In order to build SLEEF, you need CMake 3.18+, and C and C++ compilers of the same version.
|
||||
It is also recommended to have the following tools.
|
||||
|
||||
* Ninja
|
||||
* Git
|
||||
|
||||
https://github.com/shibatch/tlfloat[TLFloat] is automatically downloaded if no suitable version is found on your system.
|
||||
|
||||
Some tests require:
|
||||
|
||||
* libssl and libcrypto, that can be provided by installing openssl.
|
||||
* libm, libgmp and libmpfr
|
||||
* libfftw.
|
||||
|
||||
|
||||
The build procedure is as follows.
|
||||
|
||||
[arabic]
|
||||
. Check out the source code from our GitHub repository
|
||||
|
||||
....
|
||||
git clone https://github.com/shibatch/sleef
|
||||
....
|
||||
|
||||
[arabic, start=2]
|
||||
. Make a separate directory to create an out-of-source build
|
||||
|
||||
....
|
||||
cd sleef && mkdir build
|
||||
....
|
||||
|
||||
[arabic, start=3]
|
||||
. Run cmake to configure the project
|
||||
|
||||
....
|
||||
cmake -S . -B build
|
||||
....
|
||||
|
||||
By default this will generate shared libraries. In order to generate
|
||||
static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`.
|
||||
|
||||
For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`.
|
||||
|
||||
[arabic, start=4]
|
||||
. Run cmake to build the project
|
||||
|
||||
....
|
||||
cmake --build build -j --clean-first
|
||||
....
|
||||
|
||||
[arabic, start=5]
|
||||
. Run tests using ctest
|
||||
|
||||
....
|
||||
ctest --test-dir build -j
|
||||
....
|
||||
|
||||
For more detailed build instructions please refer to
|
||||
https://sleef.org/compile.xhtml#preliminaries[our web page].
|
||||
|
||||
== How to cross-compile SLEEF
|
||||
|
||||
For more details, please refer to
|
||||
https://sleef.org/compile.xhtml#cross[cross-compile SLEEF]
|
||||
|
||||
== Install SLEEF
|
||||
|
||||
=== From source
|
||||
|
||||
Assuming the build instructions above were followed.
|
||||
|
||||
[arabic, start=6]
|
||||
. Install to specified directory `<prefix>`
|
||||
|
||||
....
|
||||
cmake --install build --prefix=<prefix>
|
||||
....
|
||||
|
||||
=== Uninstall
|
||||
|
||||
In order to uninstall the SLEEF library and headers, run
|
||||
|
||||
....
|
||||
sudo xargs rm -v < build/install_manifest.txt
|
||||
....
|
||||
|
||||
== License
|
||||
|
||||
The software is distributed under the Boost Software License, Version
|
||||
1.0. See accompanying file link:./LICENSE.txt[LICENSE.txt] or copy at
|
||||
http://www.boost.org/LICENSE_1_0.txt. Contributions to this project are
|
||||
accepted under the same license.
|
||||
|
||||
Copyright © 2010-2025 SLEEF Project, Naoki Shibata and contributors.
|
||||
@ -1,221 +0,0 @@
|
||||
# SLEEF
|
||||
|
||||

|
||||
[](https://ieeexplore.ieee.org/document/8936472)
|
||||
[](https://www.boost.org/LICENSE_1_0.txt)
|
||||

|
||||
[](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef)
|
||||
[](https://sourceforge.net/projects/sleef/)
|
||||
|
||||
SLEEF is a library that implements vectorized versions of C standard math functions. This library also includes DFT subroutines.
|
||||
|
||||
- **Web Page:** [https://sleef.org/][webpage_url]
|
||||
- **Sources:** [https://github.com/shibatch/sleef][repo_url]
|
||||
|
||||
## Supported environment
|
||||
|
||||
### Test matrix
|
||||
|
||||
The following table summarises currently supported vector extensions, compilers and OS-es.
|
||||
|
||||
:green_circle: : Tested extensively in CI.
|
||||
|
||||
:yellow_circle: : Tested partially in CI.
|
||||
|
||||
:x: : Currently failing some tests in CI.
|
||||
|
||||
:white_circle: : Not tested in CI. Might have passed tests in previous CI framework.
|
||||
|
||||
[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage.
|
||||
Compilation of SLEEF on previously supported environments might still be safe, we just cannot verify it yet.
|
||||
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th colspan="2" rowspan="2"></th>
|
||||
<th colspan="9">OS/Compiler</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th colspan="3">Linux</th>
|
||||
<th colspan="2">macOS</th>
|
||||
<th colspan="4">Windows</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Arch.</th>
|
||||
<th>Vector Extensions</th>
|
||||
<th>gcc</th><th>llvm</th><th>icc</th>
|
||||
<th>gcc</th><th>llvm</th>
|
||||
<th>gcc</th><th>llvm-gnu</th><th>llvm-msvc</th><th>msvc</th>
|
||||
</tr>
|
||||
<tr align="center"><th>x86_64</th><th>SSE2, SSE4,<br>AVX, AVX2, AVX512F</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>:white_circle:</td>
|
||||
<td>:white_circle:</td><td>:green_circle:</td>
|
||||
<td>:white_circle:</td><td>:yellow_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>x86 32bit<br>(i386)</th><th>SSE</th>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch64<br>(arm)</th><th>Neon, SVE</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="1">N/A</td><td>:green_circle:</td>
|
||||
<td colspan="1">N/A</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch32<br>(armhf)</th><th>NEON</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>PowerPC<br>(ppc64el)</th><th>VSX, VSX3</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>IBM/Z<br>(s390x)</th><th>VXE, VXE2</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>RISC-V<br>(riscv64)</th><th>RVV1, RVV2</th>
|
||||
<td>N/A (14+)</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Component support
|
||||
|
||||
The above table is valid for libm in single, double and quadruple precision, as well as fast Discrete Fourier Transform (DFT).
|
||||
|
||||
Generation of inline headers is also supported for most vector extensions.
|
||||
|
||||
LTO is not tested in CI yet, except on Windows.
|
||||
|
||||
### Compiler support
|
||||
|
||||
Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions.
|
||||
|
||||
Older versions should be supported too, while newer ones are either not tested or have known issues.
|
||||
|
||||
Some compiler versions simply do not support certain vector extensions, for instance SVE is only supported for gcc version 9 onwards.
|
||||
|
||||
Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards.
|
||||
|
||||
Toolchain files provide some information on supported compiler versions.
|
||||
|
||||
### OS support
|
||||
|
||||
Only Linux distributions and macOS are fully tested in CI and thus officially supported.
|
||||
|
||||
Building SLEEF for Windows on x86 machines was officially supported ( :white_circle: ), as of 3.5.1,
|
||||
however it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544).
|
||||
As a result tests for Windows on x86 only include DFT for now (other tests are disabled in build system),
|
||||
but all components are built.
|
||||
|
||||
Support for iOS and Android is only preliminary on AArch64.
|
||||
|
||||
SVE is not supported on Darwin-based system and therefore automatically disabled by SLEEF on Darwin.
|
||||
|
||||
### More on supported environment
|
||||
|
||||
Refer to our web page for [more on supported environment][supported_env_url].
|
||||
|
||||
## Install SLEEF dependencies
|
||||
|
||||
The library itself does not have any additional dependency.
|
||||
|
||||
However some tests require:
|
||||
|
||||
- libssl and libcrypto, that can be provided by installing openssl.
|
||||
- libm, libgmp and libmpfr
|
||||
- libfftw.
|
||||
|
||||
These tests can be disabled if necessary.
|
||||
|
||||
## How to build SLEEF
|
||||
|
||||
We recommend relying on CMake as much as possible in the build process to ensure portability.
|
||||
**CMake 3.18+** is the minimum required.
|
||||
|
||||
1. Check out the source code from our GitHub repository
|
||||
|
||||
```
|
||||
git clone https://github.com/shibatch/sleef
|
||||
```
|
||||
|
||||
2. Make a separate directory to create an out-of-source build
|
||||
|
||||
```
|
||||
cd sleef && mkdir build
|
||||
```
|
||||
|
||||
3. Run cmake to configure the project
|
||||
|
||||
```
|
||||
cmake -S . -B build
|
||||
```
|
||||
|
||||
By default this will generate shared libraries. In order to generate static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`.
|
||||
|
||||
For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`.
|
||||
|
||||
4. Run make to build the project
|
||||
|
||||
```
|
||||
cmake --build build -j --clean-first
|
||||
```
|
||||
|
||||
5. Run tests using ctests
|
||||
|
||||
```
|
||||
ctest --test-dir build -j
|
||||
```
|
||||
|
||||
For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url].
|
||||
|
||||
## Install SLEEF
|
||||
|
||||
### From source
|
||||
|
||||
Assuming following instructions were followed.
|
||||
|
||||
6. Install to specified directory `<prefix>`
|
||||
|
||||
```
|
||||
cmake --install build --prefix=<prefix>
|
||||
```
|
||||
|
||||
### Using Spack
|
||||
|
||||
SLEEF can also be directly installed using Spack.
|
||||
|
||||
```
|
||||
spack install sleef@master
|
||||
```
|
||||
|
||||
### Uninstall
|
||||
|
||||
In order to uninstall SLEEF library and headers run
|
||||
|
||||
```
|
||||
sudo xargs rm -v < build/install_manifest.txt
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The software is distributed under the Boost Software License, Version 1.0.
|
||||
See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url].
|
||||
Contributions to this project are accepted under the same license.
|
||||
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors.<br/>
|
||||
|
||||
|
||||
<!-- Repository links -->
|
||||
|
||||
[webpage_url]: https://sleef.org/
|
||||
[build_info_url]: https://sleef.org/compile.xhtml
|
||||
[supported_env_url]: https://sleef.org/index.xhtml#environment
|
||||
[repo_url]: https://github.com/shibatch/sleef
|
||||
[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt
|
||||
[license_url]: http://www.boost.org/LICENSE_1_0.txt
|
||||
@ -6,6 +6,7 @@ extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@ -46,20 +47,24 @@ IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float
|
||||
IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode);
|
||||
IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out);
|
||||
|
||||
IMPORT void SleefDFT_execute(struct SleefDFT *ptr, const void *in, void *out);
|
||||
|
||||
IMPORT void SleefDFT_dispose(struct SleefDFT *ptr);
|
||||
|
||||
IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr);
|
||||
IMPORT int SleefDFT_getPath(struct SleefDFT *ptr, char *pathStr, int pathStrSize);
|
||||
|
||||
IMPORT void SleefDFT_setDefaultVerboseFP(FILE *fp);
|
||||
|
||||
//
|
||||
|
||||
IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode);
|
||||
IMPORT int SleefDFT_savePlan(const char *pathStr);
|
||||
|
||||
#define SLEEF_PLAN_AUTOMATIC 0
|
||||
#define SLEEF_PLAN_READONLY (1 << 0)
|
||||
#define SLEEF_PLAN_RESET (1 << 1)
|
||||
#define SLEEF_PLAN_BUILDALLPLAN (1 << 2)
|
||||
#define SLEEF_PLAN_AUTOMATIC (1 << 2)
|
||||
#define SLEEF_PLAN_NOLOCK (1 << 3)
|
||||
#define SLEEF_PLAN_MEASURE (1 << 29)
|
||||
#define SLEEF_PLAN_REFERTOENVVAR (1 << 30)
|
||||
|
||||
#undef IMPORT
|
||||
|
||||
@ -6,6 +6,11 @@
|
||||
#define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@
|
||||
#define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@
|
||||
|
||||
#cmakedefine SLEEF_FLOAT128_IS_IEEEQP
|
||||
#cmakedefine SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#cmakedefine SLEEF_STATIC_LIBS
|
||||
#endif
|
||||
|
||||
#endif // SLEEF_CONFIG_H
|
||||
|
||||
@ -7,11 +7,19 @@ if (SLEEF_BUILD_TESTS AND NOT MINGW)
|
||||
endif()
|
||||
add_subdirectory("common")
|
||||
|
||||
if (SLEEF_BUILD_DFT)
|
||||
if (SLEEF_BUILD_BENCH)
|
||||
# add_subdirectory("libm-benchmarks")
|
||||
endif()
|
||||
|
||||
if (SLEEF_BUILD_DFT AND COMPILER_SUPPORTS_OPENMP)
|
||||
add_subdirectory("dft")
|
||||
if (SLEEF_BUILD_TESTS)
|
||||
add_subdirectory("dft-tester")
|
||||
endif()
|
||||
else()
|
||||
if (SLEEF_ENFORCE_DFT)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_DFT is specified and DFT is not built")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_BUILD_QUAD)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -110,7 +110,7 @@ static INLINE int vavailability_i(int name) {
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
|
||||
@ -516,10 +516,10 @@ static INLINE float vcast_f_vf(vfloat v) {
|
||||
#endif
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
#define PNMASK _mm256_set_pd( -0.0, +0.0, -0.0, +0.0 )
|
||||
#define NPMASK _mm256_set_pd( +0.0, -0.0, +0.0, -0.0 )
|
||||
#define PNMASKf _mm256_set_ps( -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f )
|
||||
#define NPMASKf _mm256_set_ps( +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f )
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
@ -629,7 +629,7 @@ static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
}
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -96,7 +96,7 @@ static INLINE int vavailability_i(int name) {
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
|
||||
@ -168,7 +168,7 @@ static INLINE vmask vcastu_vm_vi(vint vi) {
|
||||
|
||||
static INLINE vint vcastu_vi_vm(vmask vi) {
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
|
||||
@ -392,10 +392,10 @@ static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm2
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
#define PNMASK _mm256_set_pd( -0.0, +0.0, -0.0, +0.0 )
|
||||
#define NPMASK _mm256_set_pd( +0.0, -0.0, +0.0, -0.0 )
|
||||
#define PNMASKf _mm256_set_ps( -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f )
|
||||
#define NPMASKf _mm256_set_ps( +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f )
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
@ -476,7 +476,7 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi
|
||||
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -96,7 +96,7 @@ static INLINE int vavailability_i(int name) {
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
@ -371,10 +371,10 @@ static INLINE float vcast_f_vf(vfloat v) {
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
#define PNMASK _mm_set_pd( -0.0, +0.0 )
|
||||
#define NPMASK _mm_set_pd( +0.0, -0.0 )
|
||||
#define PNMASKf _mm_set_ps( -0.0f, +0.0f, -0.0f, +0.0f )
|
||||
#define NPMASKf _mm_set_ps( +0.0f, -0.0f, +0.0f, -0.0f )
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -102,7 +102,7 @@ static INLINE int vavailability_i(int name) {
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); }
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -257,10 +257,10 @@ static INLINE int vavailability_i(int name) {
|
||||
}
|
||||
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32((const float32_t*)__builtin_assume_aligned(ptr, 16)); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32((float32_t*)__builtin_assume_aligned(ptr, 16), v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -103,16 +103,16 @@ typedef vquad vargquad;
|
||||
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
|
||||
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
|
||||
|
||||
#define vsetall__vi(v) vset__vi(v, v)
|
||||
#define vsetall__vi2(v) vset__vi2(v, v, v, v)
|
||||
#define vsetall__vi(v) vset__vi((int)v, (int)v)
|
||||
#define vsetall__vi2(v) vset__vi2((int)v, (int)v, (int)v, (int)v)
|
||||
#define vsetall__vm(v) vset__vm(v, v, v, v)
|
||||
#define vsetall__vo(v) vset__vo(v, v, v, v)
|
||||
#define vsetall__vf(v) vset__vf(v, v, v, v)
|
||||
#define vsetall__vd(v) vset__vd(v, v)
|
||||
#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)
|
||||
#define vsetall__u32(v) vset__u32(v, v, v, v)
|
||||
#define vsetall__s64(v) vset__s64(v, v)
|
||||
#define vsetall__u64(v) vset__u64(v, v)
|
||||
#define vsetall__vf(v) vset__vf((float)v, (float)v, (float)v, (float)v)
|
||||
#define vsetall__vd(v) vset__vd((double)v, (double)v)
|
||||
#define vsetall__u8(v) vset__u8((uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v, (uint8_t)v)
|
||||
#define vsetall__u32(v) vset__u32((uint32_t)v, (uint32_t)v, (uint32_t)v, (uint32_t)v)
|
||||
#define vsetall__s64(v) vset__s64((int64_t)v, (int64_t)v)
|
||||
#define vsetall__u64(v) vset__u64((uint64_t)v, (uint64_t)v)
|
||||
|
||||
#define vzero__vi() vsetall__vi(0)
|
||||
#define vzero__vi2() vsetall__vi2(0)
|
||||
@ -351,7 +351,7 @@ static INLINE vmask vcastu_vm_vi(vint vi)
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) {
|
||||
i = i ? -1 : 0;
|
||||
return (vopmask) { i, i, i, i };
|
||||
return (vopmask) { (unsigned int)i, (unsigned int)i, (unsigned int)i, (unsigned int)i };
|
||||
}
|
||||
|
||||
// signed int to single-precision
|
||||
@ -371,7 +371,7 @@ static INLINE vdouble vcast_vd_vi(vint vi)
|
||||
{
|
||||
vdouble ret;
|
||||
vint swap = vec_mergeh(vi, vi);
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
#if defined(__clang__) || (__GNUC__ >= 7 && __GNUC__ < 15)
|
||||
ret = __builtin_vsx_xvcvsxwdp(swap);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
|
||||
@ -406,7 +406,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd)
|
||||
{
|
||||
vint ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
#if defined(__clang__) || (__GNUC__ >= 7 && __GNUC__ < 15)
|
||||
ret = __builtin_vsx_xvcvdpsxws(vd);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
|
||||
@ -860,11 +860,11 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[2] };
|
||||
return (vint) { (int)vm[0], (int)vm[2] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return (vmask) (__vector signed long long) { vi[0], vi[1] };
|
||||
return (vmask) (__vector signed long long) { (signed long long)vi[0], (signed long long)vi[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -426,7 +426,7 @@ static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(8 + (char *)&vq, p, 8);
|
||||
memcpy((char *)&vq, 8 + p, 8);
|
||||
memcpy((char *)&vq, 8 + (char *)p, 8);
|
||||
return vq;
|
||||
}
|
||||
|
||||
|
||||
@ -91,6 +91,7 @@
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wuninitialized"
|
||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||
static INLINE vfloat64m1x4_t __riscv_vcreate_v_f64m1x4(vfloat64m1_t x, vfloat64m1_t y, vfloat64m1_t z, vfloat64m1_t w) {
|
||||
vfloat64m1x4_t unused;
|
||||
return __riscv_vset(__riscv_vset(__riscv_vset(__riscv_vset(unused, 0, x), 1, y), 2, z), 3, w);
|
||||
@ -158,14 +159,14 @@ typedef vfloat64m1x4_t tdi_t;
|
||||
|
||||
#define SLEEF_RVV_SP_LMUL 1
|
||||
#define SLEEF_RVV_DP_LMUL 1
|
||||
#define SLEEF_RVV_DP_RUNTIME_VL() __riscv_vsetvlmax_e64m1()
|
||||
#define SLEEF_RVV_DP_RUNTIME_VL() ((int)__riscv_vsetvlmax_e64m1())
|
||||
#if SLEEF_RVV_VLEN == 0
|
||||
// The configuration didn't provide a constant vector length, meaning it'll
|
||||
// have to be determined at run-time. RVV offers per-data-width operations for
|
||||
// this so the result doesn't need to be adjusted and that operation is likely
|
||||
// to fold into the surrounding code for free.
|
||||
//
|
||||
#define VECTLENSP (__riscv_vsetvlmax_e32m1())
|
||||
#define VECTLENSP ((int)__riscv_vsetvlmax_e32m1())
|
||||
#define VECTLENDP SLEEF_RVV_DP_RUNTIME_VL()
|
||||
//@#define VECTLENSP __riscv_vsetvlmax_e32m1()
|
||||
//@#define VECTLENDP __riscv_vsetvlmax_e64m1()
|
||||
@ -268,7 +269,7 @@ typedef vfloat64m2x4_t tdi_t;
|
||||
|
||||
#define SLEEF_RVV_SP_LMUL 2
|
||||
#define SLEEF_RVV_DP_LMUL 2
|
||||
#define SLEEF_RVV_DP_RUNTIME_VL() __riscv_vsetvlmax_e64m2()
|
||||
#define SLEEF_RVV_DP_RUNTIME_VL() ((int)__riscv_vsetvlmax_e64m2())
|
||||
#if SLEEF_RVV_VLEN == 0
|
||||
// The configuration didn't provide a constant vector length, meaning it'll
|
||||
// have to be determined at run-time. RVV offers per-data-width operations for
|
||||
@ -605,7 +606,7 @@ static INLINE vmask vreinterpret_vm_vf(vfloat vf) {
|
||||
// needed.
|
||||
//
|
||||
static INLINE int vtestallones_i_vo32(rvv_sp_vopmask g) {
|
||||
return __riscv_vcpop(g, VECTLENSP) == VECTLENSP;
|
||||
return (int)__riscv_vcpop(g, VECTLENSP) == (int)VECTLENSP;
|
||||
}
|
||||
static INLINE vmask vor_vm_vo32_vm(rvv_sp_vopmask x, vmask y) {
|
||||
rvv_vmask32 y32 = SLEEF_RVV_SP_VREINTERPRET_VM(y);
|
||||
@ -1080,7 +1081,7 @@ static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(rvv_dp_vopmask o0, rvv_dp_vopmask
|
||||
return __riscv_vfmerge(__riscv_vfmerge(__riscv_vfmerge(vcast_vd_d(d3), d2, o2, VECTLENDP), d1, o1, VECTLENDP), d0, o0, VECTLENDP);
|
||||
}
|
||||
static INLINE int vtestallones_i_vo64(rvv_dp_vopmask g) {
|
||||
return __riscv_vcpop(g, VECTLENDP) == VECTLENDP;
|
||||
return (int)__riscv_vcpop(g, VECTLENDP) == (int)VECTLENDP;
|
||||
}
|
||||
// integer comparison
|
||||
static INLINE rvv_dp_vopmask veq_vo_vi_vi(vint x, vint y) {
|
||||
@ -1171,7 +1172,7 @@ static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdoub
|
||||
// probably only iterate 2 or 4 times.
|
||||
//
|
||||
ptr += offset * 2;
|
||||
for (int i = 0; i < VECTLENDP; i += 2) {
|
||||
for (int i = 0; i < (int)VECTLENDP; i += 2) {
|
||||
// PROTIP: Avoid modifying `v` within the loop, and just extract the useful
|
||||
// part directly in each iteration, because we can. This avoids a
|
||||
// loop-carried dependency.
|
||||
@ -1185,7 +1186,7 @@ static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdoub
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
// as above re: looping
|
||||
ptr += offset * 2;
|
||||
for (int i = 0; i < VECTLENSP; i += 2) {
|
||||
for (int i = 0; i < (int)VECTLENSP; i += 2) {
|
||||
vfloat vv = __riscv_vslidedown(v, i, 2);
|
||||
__riscv_vse32(ptr, vv, 2);
|
||||
ptr += step * 2;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -78,7 +78,7 @@ typedef vquad vargquad;
|
||||
|
||||
static INLINE int vavailability_i(int n) {
|
||||
if (n == 1 || n == 2) {
|
||||
return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0;
|
||||
return vec_max((vdouble) {(double)n, (double)n}, (vdouble) {(double)n, (double)n})[0] != 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -127,23 +127,23 @@ static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) {
|
||||
return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] });
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; }
|
||||
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (unsigned long long)-1 : 0, i ? (unsigned long long)-1 : 0 }; }
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { (double)vi[0], (double)vi[1] }; }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { (float)vi[0], (float)vi[1], (float)vi[2], (float)vi[3] }; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); }
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) {
|
||||
vd = vrint_vd_vd(vd);
|
||||
return (vint) { vd[0], vd[1] };
|
||||
return (vint) { (int)vd[0], (int)vd[1] };
|
||||
}
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { vf[0], vf[1], vf[2], vf[3] }; }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { (int)vd[0], (int)vd[1] }; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { (int)vf[0], (int)vf[1], (int)vf[2], (int)vf[3] }; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
|
||||
@ -202,7 +202,7 @@ static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ (int)(vi2[0] >> 32), (int)(vi2[1] >> 32) }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
|
||||
@ -309,8 +309,8 @@ static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
|
||||
@ -364,8 +364,8 @@ static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){(unsigned int)c, (unsigned int)c, (unsigned int)c, (unsigned int)c}); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
@ -405,7 +405,7 @@ static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
|
||||
vf = vrint_vf_vf(vf);
|
||||
return (vint) { vf[0], vf[1], vf[2], vf[3] };
|
||||
return (vint) { (int)vf[0], (int)vf[1], (int)vf[2], (int)vf[3] };
|
||||
}
|
||||
|
||||
//
|
||||
@ -445,11 +445,11 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c }))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c }))
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { (unsigned long long)c, (unsigned long long)c }))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { (unsigned long long)c, (unsigned long long)c }))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[1] };
|
||||
return (vint) { (int)vm[0], (int)vm[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -124,7 +124,7 @@ static INLINE int vavailability_i(int name) {
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch((const char *)ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
@ -420,10 +420,10 @@ static INLINE float vcast_f_vf(vfloat v) {
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
#define PNMASK _mm_set_pd( -0.0, +0.0 )
|
||||
#define NPMASK _mm_set_pd( +0.0, -0.0 )
|
||||
#define PNMASKf _mm_set_ps( -0.0f, +0.0f, -0.0f, +0.0f )
|
||||
#define NPMASKf _mm_set_ps( +0.0f, -0.0f, +0.0f, -0.0f )
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
|
||||
@ -22,9 +22,9 @@
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 2
|
||||
// Vector length agnostic
|
||||
#define VECTLENSP (svcntw())
|
||||
#define VECTLENSP ((int)svcntw())
|
||||
//@#define VECTLENSP (svcntw())
|
||||
#define VECTLENDP (svcntd())
|
||||
#define VECTLENDP ((int)svcntd())
|
||||
//@#define VECTLENDP (svcntd())
|
||||
#define ISANAME "AArch64 SVE"
|
||||
#define ptrue svptrue_b8()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -16,10 +16,49 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}")
|
||||
add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c)
|
||||
set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target TARGET_LIBARRAYMAP_OBJ
|
||||
|
||||
add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c)
|
||||
set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
add_host_executable("addSuffix" addSuffix.c)
|
||||
set_target_properties("addSuffix" PROPERTIES C_STANDARD 99)
|
||||
|
||||
if (NOT SLEEF_OPENSSL_FOUND)
|
||||
add_library(${TARGET_PSHA_OBJ} OBJECT psha2_capi.cpp)
|
||||
else()
|
||||
# Tests for internal sha256
|
||||
add_executable(test_psha test_psha2.cpp)
|
||||
target_link_libraries(test_psha ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(test_psha PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
add_test(NAME test_psha COMMAND test_psha)
|
||||
set_tests_properties(test_psha PROPERTIES COST 2.0)
|
||||
|
||||
add_executable(test_psha_capi test_psha2.cpp)
|
||||
target_compile_definitions(test_psha_capi PRIVATE TEST_CAPI=1)
|
||||
target_link_libraries(test_psha_capi ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(test_psha_capi PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
add_test(NAME test_psha_capi COMMAND test_psha_capi)
|
||||
set_tests_properties(test_psha_capi PROPERTIES COST 2.0)
|
||||
endif()
|
||||
|
||||
# Target TARGET_TESTERUTIL_OBJ
|
||||
add_library(${TARGET_TESTERUTIL_OBJ} OBJECT testerutil.c)
|
||||
target_compile_definitions(${TARGET_TESTERUTIL_OBJ} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
if(LIB_MPFR)
|
||||
target_compile_definitions(${TARGET_TESTERUTIL_OBJ} PRIVATE USEMPFR=1)
|
||||
target_link_libraries(${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP})
|
||||
endif()
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${TARGET_TESTERUTIL_OBJ} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
# Target TARGET_QTESTERUTIL_OBJ
|
||||
add_library(${TARGET_QTESTERUTIL_OBJ} OBJECT qtesterutil.c)
|
||||
target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
if(LIB_MPFR)
|
||||
target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE USEMPFR=1)
|
||||
target_link_libraries(${TARGET_QTESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP})
|
||||
endif()
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${TARGET_QTESTERUTIL_OBJ} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
if(COMPILER_SUPPORTS_QUADMATH)
|
||||
target_link_libraries(${TARGET_QTESTERUTIL_OBJ} "-lquadmath")
|
||||
target_compile_definitions(${TARGET_QTESTERUTIL_OBJ} PRIVATE ENABLEFLOAT128=1)
|
||||
endif()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -207,7 +207,18 @@ int main(int argc, char **argv) {
|
||||
nkeywords++;
|
||||
if (nkeywords >= nalloc) {
|
||||
nalloc *= 2;
|
||||
keywords = realloc(keywords, sizeof(char *) * nalloc);
|
||||
char ** tmp = realloc(keywords, sizeof(char *) * nalloc);
|
||||
if (tmp == NULL) {
|
||||
// free keywords if realloc fails
|
||||
// otherwise address is lost.
|
||||
free(keywords);
|
||||
fclose(fp);
|
||||
fprintf(stderr, "Failed realloc!\n");
|
||||
exit(-1);
|
||||
}
|
||||
else {
|
||||
keywords = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -228,6 +239,10 @@ int main(int argc, char **argv) {
|
||||
|
||||
fclose(fp);
|
||||
|
||||
for(int i=0;i<nkeywords;i++) free(keywords[i]);
|
||||
|
||||
free(keywords);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
@ -1,347 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
//
|
||||
|
||||
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/file.h>
|
||||
|
||||
// Take an exclusive flock() lock on the descriptor underlying fp.
static void FLOCK(FILE *fp) {
  const int fd = fileno(fp);
  flock(fd, LOCK_EX);
}
|
||||
// Release the flock() lock held on the descriptor underlying fp.
static void FUNLOCK(FILE *fp) {
  const int fd = fileno(fp);
  flock(fd, LOCK_UN);
}
|
||||
// Resize the file behind fp to z bytes. The result of ftruncate() is
// consumed and deliberately ignored (best effort).
static void FTRUNCATE(FILE *fp, off_t z) {
  const int rc = ftruncate(fileno(fp), z);
  (void)rc;
}
|
||||
// Open a temporary stream via tmpfile(); returns whatever tmpfile() returns.
static FILE *OPENTMPFILE() {
  FILE *tmp = tmpfile();
  return tmp;
}
|
||||
// Close a stream obtained from OPENTMPFILE().
static void CLOSETMPFILE(FILE *fp) {
  fclose(fp);
}
|
||||
#else
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
|
||||
// No-op on this platform branch: no file locking is performed.
static void FLOCK(FILE *fp) {
  (void)fp;
}
|
||||
// No-op on this platform branch: there is no lock to release.
static void FUNLOCK(FILE *fp) {
  (void)fp;
}
|
||||
static void FTRUNCATE(FILE *fp, long z) {
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
|
||||
}
|
||||
// Open (create/truncate) "tmpfile.txt" in the current directory for
// reading and writing; returns whatever fopen() returns.
static FILE *OPENTMPFILE() {
  FILE *tmp = fopen("tmpfile.txt", "w+");
  return tmp;
}
|
||||
// Close the stream, then delete the "tmpfile.txt" that OPENTMPFILE() opens.
static void CLOSETMPFILE(FILE *fp) {
  fclose(fp);
  remove("tmpfile.txt");
}
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#define MAGIC_ARRAYMAPNODE 0xf73130fa
|
||||
#define MAGIC_ARRAYMAP 0x8693bd21
|
||||
#define LOGNBUCKETS 8
|
||||
#define NBUCKETS (1 << LOGNBUCKETS)
|
||||
|
||||
static int hash(uint64_t key) {
|
||||
return (key ^ (key >> LOGNBUCKETS) ^ (key >> (LOGNBUCKETS*2)) ^ (key >> (LOGNBUCKETS*3))) & (NBUCKETS-1);
|
||||
}
|
||||
|
||||
// Strip leading and trailing whitespace from str in place.
//
// The remaining characters are shifted to the start of the buffer and the
// string is re-terminated right after the last non-space character. A string
// consisting only of whitespace becomes the empty string.
//
// str must be a writable, NUL-terminated string.
static void String_trim(char *str) {
  char *dst = str, *src = str, *pterm = str;

  // <ctype.h> classifiers require a value representable as unsigned char (or
  // EOF); passing a plain (possibly negative) char is undefined behavior.
  while (*src != '\0' && isspace((unsigned char)*src)) src++;

  for (; *src != '\0'; src++) {
    *dst++ = *src;
    // Remember the position just past the most recent non-space character.
    if (!isspace((unsigned char)*src)) pterm = dst;
  }

  *pterm = '\0';
}
|
||||
|
||||
// One key/value entry stored in a bucket of an ArrayMap.
typedef struct ArrayMapNode {
|
||||
// MAGIC_ARRAYMAPNODE while the slot holds a live entry; reset to 0 when the
// entry is removed or the map is disposed. Checked by assert() on access.
uint32_t magic;
|
||||
// Lookup key of the entry.
uint64_t key;
|
||||
// Caller-supplied payload pointer; ArrayMap_put never stores NULL here.
void *value;
|
||||
} ArrayMapNode;
|
||||
|
||||
// Hash map from uint64_t keys to void* values, organized as NBUCKETS
// independently growing arrays of ArrayMapNode.
typedef struct ArrayMap {
|
||||
// MAGIC_ARRAYMAP while the map is valid; reset to 0 by ArrayMap_dispose.
uint32_t magic;
|
||||
// Per-bucket heap-allocated node arrays, indexed by hash(key).
ArrayMapNode *array[NBUCKETS];
|
||||
// size[b]: live entries in bucket b; capacity[b]: allocated slots in bucket b;
// totalSize: number of live entries across all buckets.
int size[NBUCKETS], capacity[NBUCKETS], totalSize;
|
||||
} ArrayMap;
|
||||
|
||||
ArrayMap *initArrayMap() {
|
||||
ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap));
|
||||
thiz->magic = MAGIC_ARRAYMAP;
|
||||
|
||||
for(int i=0;i<NBUCKETS;i++) {
|
||||
thiz->capacity[i] = 8;
|
||||
thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode));
|
||||
thiz->size[i] = 0;
|
||||
}
|
||||
|
||||
thiz->totalSize = 0;
|
||||
return thiz;
|
||||
}
|
||||
|
||||
// Release a map created by initArrayMap().
//
// Clears the magic tag of every live node and of the map itself, then frees
// the bucket arrays and the map. The stored value pointers are not freed.
void ArrayMap_dispose(ArrayMap *thiz) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  for (int bucket = 0; bucket < NBUCKETS; bucket++) {
    ArrayMapNode *nodes = thiz->array[bucket];
    for (int k = 0; k < thiz->size[bucket]; k++) {
      assert(nodes[k].magic == MAGIC_ARRAYMAPNODE);
      nodes[k].magic = 0;
    }
    free(nodes);
  }

  thiz->magic = 0;
  free(thiz);
}
|
||||
|
||||
// Return the number of entries currently stored in the map.
int ArrayMap_size(ArrayMap *thiz) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
  const int count = thiz->totalSize;
  return count;
}
|
||||
|
||||
// Return a newly malloc'ed array holding every key in the map, in bucket
// order. The array has ArrayMap_size(thiz) elements and must be freed by the
// caller.
//
// Returns NULL if the allocation fails. For an empty map the result is
// whatever malloc(0) yields, which may also be NULL.
uint64_t *ArrayMap_keyArray(ArrayMap *thiz) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  uint64_t *keys = malloc(sizeof *keys * (size_t)thiz->totalSize);
  if (keys == NULL) return NULL;

  int p = 0;
  for (int j = 0; j < NBUCKETS; j++) {
    for (int i = 0; i < thiz->size[j]; i++) {
      assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
      keys[p++] = thiz->array[j][i].key;
    }
  }
  return keys;
}
|
||||
|
||||
// Return a newly malloc'ed array holding every value pointer in the map, in
// bucket order (the same order ArrayMap_keyArray uses). The array has
// ArrayMap_size(thiz) elements; the caller frees the array itself, while the
// pointed-to values remain stored in the map.
//
// Returns NULL if the allocation fails. For an empty map the result is
// whatever malloc(0) yields, which may also be NULL.
void **ArrayMap_valueArray(ArrayMap *thiz) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  void **values = malloc(sizeof *values * (size_t)thiz->totalSize);
  if (values == NULL) return NULL;

  int p = 0;
  for (int j = 0; j < NBUCKETS; j++) {
    for (int i = 0; i < thiz->size[j]; i++) {
      assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
      values[p++] = thiz->array[j][i].value;
    }
  }
  return values;
}
|
||||
|
||||
// Remove the entry for key, if present.
//
// The vacated slot is refilled with the bucket's last entry, so entry order
// within a bucket is not preserved. Returns the removed value pointer, or
// NULL when the key is not in the map.
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  const int h = hash(key);
  ArrayMapNode *bucket = thiz->array[h];

  for (int i = 0; i < thiz->size[h]; i++) {
    ArrayMapNode *slot = &bucket[i];
    assert(slot->magic == MAGIC_ARRAYMAPNODE);
    if (slot->key != key) continue;

    ArrayMapNode *last = &bucket[thiz->size[h] - 1];
    void *removed = slot->value;
    slot->key = last->key;
    slot->value = last->value;
    last->magic = 0;
    thiz->size[h]--;
    thiz->totalSize--;
    return removed;
  }

  return NULL;
}
|
||||
|
||||
// Associate value with key.
//
// A NULL value means "remove": the call is forwarded to ArrayMap_remove().
// If the key already exists its value is replaced and the previous value
// pointer is returned (the caller decides whether to free it). Otherwise a
// new entry is appended to the key's bucket and NULL is returned.
//
// If the bucket is full and cannot be grown, the map is left unchanged and
// NULL is returned; ArrayMap_get(thiz, key) can be used to tell this apart
// from a successful insertion.
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) {
  if (value == NULL) return ArrayMap_remove(thiz, key);

  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  int h = hash(key);
  for (int i = 0; i < thiz->size[h]; i++) {
    assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
    if (thiz->array[h][i].key == key) {
      void *old = thiz->array[h][i].value;
      thiz->array[h][i].value = value;
      return old;
    }
  }

  if (thiz->size[h] >= thiz->capacity[h]) {
    // Grow through a temporary so the existing bucket (and its recorded
    // capacity) stay valid if realloc fails.
    int newCapacity = thiz->capacity[h] * 2;
    ArrayMapNode *grown = realloc(thiz->array[h], newCapacity * sizeof *grown);
    if (grown == NULL) return NULL;
    thiz->array[h] = grown;
    thiz->capacity[h] = newCapacity;
  }

  ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]);
  n->magic = MAGIC_ARRAYMAPNODE;
  n->key = key;
  n->value = value;

  thiz->totalSize++;

  return NULL;
}
|
||||
|
||||
// Look up key and return its value pointer, or NULL when the key is absent.
void *ArrayMap_get(ArrayMap *thiz, uint64_t key) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  const int h = hash(key);
  const ArrayMapNode *bucket = thiz->array[h];

  for (int i = 0; i < thiz->size[h]; i++) {
    assert(bucket[i].magic == MAGIC_ARRAYMAPNODE);
    if (bucket[i].key != key) continue;
    return bucket[i].value;
  }

  return NULL;
}
|
||||
|
||||
#define LINELEN (1024*1024)
|
||||
|
||||
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) {
|
||||
const int idstrlen = (int)strlen(idstr);
|
||||
int prefixLen = (int)strlen(prefix) + 3;
|
||||
|
||||
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL;
|
||||
|
||||
FILE *fp = fopen(fn, "r");
|
||||
if (fp == NULL) return NULL;
|
||||
|
||||
if (doLock) FLOCK(fp);
|
||||
|
||||
ArrayMap *thiz = initArrayMap();
|
||||
|
||||
char *prefix2 = malloc(prefixLen+10);
|
||||
strcpy(prefix2, prefix);
|
||||
String_trim(prefix2);
|
||||
for(char *p = prefix2;*p != '\0';p++) {
|
||||
if (*p == ':') *p = ';';
|
||||
if (*p == ' ') *p = '_';
|
||||
}
|
||||
strcat(prefix2, " : ");
|
||||
prefixLen = (int)strlen(prefix2);
|
||||
|
||||
char *line = malloc(sizeof(char) * (LINELEN+10));
|
||||
line[idstrlen] = '\0';
|
||||
|
||||
if (fread(line, sizeof(char), idstrlen, fp) != idstrlen ||
|
||||
strcmp(idstr, line) != 0) {
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
free(prefix2);
|
||||
free(line);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
line[LINELEN] = '\0';
|
||||
if (fgets(line, LINELEN, fp) == NULL) break;
|
||||
if (strncmp(line, prefix2, prefixLen) != 0) continue;
|
||||
|
||||
uint64_t key;
|
||||
char *value = malloc(sizeof(char) * LINELEN);
|
||||
|
||||
if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) {
|
||||
ArrayMap_put(thiz, (uint64_t)key, (void *)value);
|
||||
} else {
|
||||
free(value);
|
||||
}
|
||||
}
|
||||
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
|
||||
free(prefix2);
|
||||
free(line);
|
||||
|
||||
return thiz;
|
||||
}
|
||||
|
||||
// Writes the contents of the map into the file `fn` under the tag `prefix`.
//
// Lines already in the file that belong to other prefixes are kept (provided
// the file starts with `idstr`); lines belonging to this prefix are replaced by
// the current contents of the map. Values are written with "%s", so they are
// treated as NUL-terminated strings. The file is locked with FLOCK for the
// duration of the update and is rebuilt through a temporary file.
//
// Returns 0 on success, -1 if the arguments are too long, a file cannot be
// opened, or a work buffer cannot be allocated.
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  const int idstrlen = (int)strlen(idstr);
  int prefixLen = (int)strlen(prefix) + 3;

  if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1;

  // Generate prefix2

  char *prefix2 = malloc(prefixLen+10);
  if (prefix2 == NULL) return -1;
  strcpy(prefix2, prefix);
  String_trim(prefix2);
  for(char *p = prefix2;*p != '\0';p++) {
    if (*p == ':') *p = ';';
    if (*p == ' ') *p = '_';
  }
  strcat(prefix2, " : ");
  prefixLen = (int)strlen(prefix2);

  //

  FILE *fp = fopen(fn, "a+");
  if (fp == NULL) {
    free(prefix2);
    return -1;
  }

  FLOCK(fp);
  fseek(fp, 0, SEEK_SET);

  // Copy the file specified by fn to tmpfile

  FILE *tmpfp = OPENTMPFILE();
  if (tmpfp == NULL) {
    FUNLOCK(fp);
    fclose(fp);
    free(prefix2);
    return -1;
  }

  char *line = malloc(sizeof(char) * (LINELEN+10));
  if (line == NULL) {
    CLOSETMPFILE(tmpfp);
    FUNLOCK(fp);
    fclose(fp);
    free(prefix2);
    return -1;
  }
  line[idstrlen] = '\0';

  if (fread(line, sizeof(char), idstrlen, fp) == (size_t)idstrlen && strcmp(idstr, line) == 0) {
    for(;;) {
      line[LINELEN] = '\0';
      if (fgets(line, LINELEN, fp) == NULL) break;
      // Keep only the lines that belong to other prefixes
      if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp);
    }
  }

  // Write the contents in the map into tmpfile

  uint64_t *keys = ArrayMap_keyArray(thiz);
  int s = ArrayMap_size(thiz);

  for(int i=0;i<s;i++) {
    char *value = ArrayMap_get(thiz, keys[i]);
    // Entries that would not fit into one line buffer when read back are skipped
    if (strlen(value) + prefixLen >= LINELEN-10) continue;
    fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value);
  }

  free(keys);

  // Rewrite the target file: header first, then everything gathered in tmpfile

  fseek(fp, 0, SEEK_SET);
  FTRUNCATE(fp, 0);
  fwrite(idstr, sizeof(char), strlen(idstr), fp);

  fseek(tmpfp, 0, SEEK_SET);

  for(;;) {
    size_t nread = fread(line, 1, LINELEN, tmpfp);
    if (nread == 0) break;
    fwrite(line, 1, nread, fp);
  }

  FUNLOCK(fp);
  fclose(fp);

  CLOSETMPFILE(tmpfp);
  free(prefix2);
  free(line);
  return 0;
}
|
||||
@ -1,21 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __ARRAYMAP_H__
|
||||
#define __ARRAYMAP_H__
|
||||
typedef struct ArrayMap ArrayMap;
|
||||
|
||||
ArrayMap *initArrayMap();
|
||||
void ArrayMap_dispose(ArrayMap *thiz);
|
||||
int ArrayMap_size(ArrayMap *thiz);
|
||||
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key);
|
||||
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value);
|
||||
void *ArrayMap_get(ArrayMap *thiz, uint64_t key);
|
||||
|
||||
uint64_t *ArrayMap_keyArray(ArrayMap *thiz);
|
||||
void **ArrayMap_valueArray(ArrayMap *thiz);
|
||||
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr);
|
||||
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock);
|
||||
#endif
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,9 +1,20 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __COMMON_H__
|
||||
#define __COMMON_H__
|
||||
char *Sleef_getCpuIdString();
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
char *Sleef_getCpuIdString();
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __COMMON_H__
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -415,7 +415,7 @@ static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) {
|
||||
static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; }
|
||||
|
||||
// imm must be smaller than 64
|
||||
#define srl128_vq_vq_i(m, imm) \
|
||||
#define srl128_vq_vq_i(m, imm) \
|
||||
imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm))
|
||||
|
||||
// This function is equivalent to :
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,92 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <quadmath.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// Converts an MPFR number to __float128.
//
// The value is split by mpfr_frexp into a fraction and an exponent. The
// fraction is then peeled into three doubles (each one is the double nearest
// to what is left after subtracting the previous ones), the three pieces are
// summed in __float128, and the exponent is re-applied with ldexpq.
// A NaN input yields NaN. The `rnd` argument is not used.
static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) {
  if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan("");

  mpfr_t rest, chunk;
  mpfr_inits(rest, chunk, NULL);

  mpfr_exp_t expo;
  mpfr_frexp(&expo, rest, m, GMP_RNDN);

  double part[3];
  for(int k = 0;k < 3;k++) {
    part[k] = mpfr_get_d(rest, GMP_RNDN);
    if (k == 2) break; // nothing is subtracted after the last piece

    mpfr_set_d(chunk, part[k], GMP_RNDN);
    mpfr_sub(rest, rest, chunk, GMP_RNDN);
  }

  mpfr_clears(rest, chunk, NULL);

  // Smallest piece first, as in the original summation order
  return ldexpq((__float128)part[2] + (__float128)part[1] + (__float128)part[0], expo);
}
|
||||
|
||||
// Stores the __float128 value `f` into `frx`, going through a decimal string
// with 50 significant digits; `rnd` is the rounding mode passed to mpfr_set_str.
static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) {
  char digits[128];

  quadmath_snprintf(digits, 120, "%.50Qg", f);
  mpfr_set_str(frx, digits, 10, rnd);
}
|
||||
|
||||
// Prints `f` to stdout with 50 significant digits, without a trailing newline.
static void printf128(__float128 f) {
  char text[128];

  quadmath_snprintf(text, 120, "%.50Qg", f);
  fputs(text, stdout);
}
|
||||
|
||||
// Ring of 16 scratch buffers for the strings returned by the toBC family.
// A returned string is overwritten once 16 more strings have been produced.
static char frstr[16][1000];
static int frstrcnt = 0;

// Formats a double as "<sign><integer>*2^<exponent>".
//
// The integer is the 52-bit fraction field with bit 52 set (0 when d == 0) and
// the exponent is the biased exponent field minus 0x3ff + 52. The sign is
// taken from the sign bit, so -0.0 is printed as "-0*2^-1075".
//
// Returns a pointer into the frstr ring; the caller must not free it.
static char *toBC(double d) {
  union {
    double d;
    uint64_t u64;
  } cnv;

  cnv.d = d;

  // All bit manipulation is done on an unsigned 64-bit value: shifting the
  // negative constant -1L, or shifting a 32-bit long by 52, is undefined.
  const uint64_t bits = cnv.u64;
  const int e = (int)((bits >> 52) & UINT64_C(0x7ff));
  const int s = (int)(bits >> 63);
  const uint64_t frac = bits & ((UINT64_C(1) << 52) - 1);
  const uint64_t l = d == 0 ? 0 : (frac | (UINT64_C(1) << 52));

  char *ptr = frstr[(frstrcnt++) & 15];

  snprintf(ptr, sizeof(frstr[0]), "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
  return ptr;
}
|
||||
|
||||
static char *toBCq(__float128 d) {
|
||||
union {
|
||||
__float128 d;
|
||||
__uint128_t u128;
|
||||
} cnv;
|
||||
|
||||
cnv.d = d;
|
||||
|
||||
__uint128_t m = cnv.u128;
|
||||
int e = (int)((m >> 112) & ~(-1L << 15));
|
||||
int s = (int)(m >> 127);
|
||||
m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112));
|
||||
|
||||
uint64_t h = m / UINT64_C(10000000000000000000);
|
||||
uint64_t l = m % UINT64_C(10000000000000000000);
|
||||
|
||||
char *ptr = frstr[(frstrcnt++) & 15];
|
||||
|
||||
sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? "-" : "", h, l, (e-0x3fff-112));
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Non-zero when x is NaN (NaN is the only value that differs from itself).
static int xisnanq(Sleef_quad x) { return x != x; }
// Non-zero when x is +infinity or -infinity.
static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); }
// Non-zero when x is neither NaN nor infinite. Uses the local xisinfq, like
// the other two helpers, rather than the libquadmath function isinfq.
static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !xisinfq(x); }
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -13,10 +13,15 @@
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
@ -137,9 +142,17 @@
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
// Overflow bounds
|
||||
|
||||
// - exp(x) overflows for x over (also used in pow)
|
||||
#define LOG_DBL_MAX 0x1.62e42fefa39efp+9 /* 709.782712893384 */
|
||||
|
||||
// Other bounds
|
||||
|
||||
// - log1p(f)(x) approximation holds up to x equals
|
||||
#define LOG1PF_BOUND 0x1.2ced32p+126 /* 1.0e+38 */
|
||||
#define LOG1P_BOUND 0x1.c7b1f3cac7433p+1019 /* 1.0e+307 */
|
||||
|
||||
//
|
||||
|
||||
@ -249,6 +262,9 @@ typedef struct {
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#ifdef CONST
|
||||
#undef CONST
|
||||
#endif
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
|
||||
@ -0,0 +1,182 @@
|
||||
#ifndef __PSHA2_HPP_INCLUDED__
|
||||
#define __PSHA2_HPP_INCLUDED__
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
// Incremental SHA-256 hasher.
//
// Usage: construct, feed data with append()/appendWord(), then call
// finalize_bytes() once to obtain the 32-byte digest.
struct PSHA2_256_Internal {
  // https://github.com/983/SHA-256
  // This is public domain implementation of SHA256

  // Rotates x right by n bits. Every call in this struct uses 0 < n < 32.
  static inline uint32_t rotr(uint32_t x, int n) {
    return (x << (32 - n)) | (x >> n);
  }

  // Per-round term built from the working variables e, f, g.
  static inline uint32_t step1(uint32_t e, uint32_t f, uint32_t g) {
    const uint32_t mixed = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);
    const uint32_t chosen = (e & f) ^ ((~ e) & g);
    return mixed + chosen;
  }

  // Per-round term built from the working variables a, b, c.
  static inline uint32_t step2(uint32_t a, uint32_t b, uint32_t c) {
    const uint32_t mixed = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);
    const uint32_t majority = (a & b) ^ (a & c) ^ (b & c);
    return mixed + majority;
  }

  // Produces the next 16 schedule words in w. For the first group (i < 16)
  // they are read big-endian from the 64-byte block; later groups are
  // derived in place from the previous 16 words.
  static inline void update_w(uint32_t *w, int i, const uint8_t *buffer) {
    if (i < 16) {
      for(int j = 0;j < 16;j++, buffer += 4) {
        w[j] =
          ((uint32_t)buffer[0] << 24) |
          ((uint32_t)buffer[1] << 16) |
          ((uint32_t)buffer[2] <<  8) |
          ((uint32_t)buffer[3]);
      }
      return;
    }

    for(int j = 0;j < 16;j++) {
      const uint32_t lo = w[(j + 1) & 15];
      const uint32_t hi = w[(j + 14) & 15];
      const uint32_t s0 = (rotr(lo,  7) ^ rotr(lo, 18) ^ (lo >>  3));
      const uint32_t s1 = (rotr(hi, 17) ^ rotr(hi, 19) ^ (hi >> 10));
      w[j] += w[(j + 9) & 15] + s0 + s1;
    }
  }

  uint32_t state[8];        // running hash value
  uint64_t n_bits;          // number of bits appended so far
  uint8_t buffer_counter;   // number of bytes currently held in buffer
  uint8_t buffer[64];       // block being filled

  PSHA2_256_Internal() {
    static const uint32_t initial[8] = {
      0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    for(int i = 0;i < 8;i++) state[i] = initial[i];
    n_bits = 0;
    buffer_counter = 0;
    for(int i = 0;i < 64;i++) buffer[i] = 0;
  }

  // Folds the 64 bytes held in buffer into state.
  void block() {
    static const uint32_t k[] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };

    uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
    uint32_t e = state[4], f = state[5], g = state[6], h = state[7];

    uint32_t w[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };

    for(int i = 0;i < 64;i += 16) {
      update_w(w, i, buffer);

#if defined(__clang__)
#pragma clang loop unroll(full)
#endif
      for(int r = 0;r < 16;r++) {
        // One round: compute the two sums, then shift the working variables
        const uint32_t t1 = h + step1(e, f, g) + k[i + r] + w[r];
        const uint32_t t2 = step2(a, b, c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
      }
    }

    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
  }

  // Appends one byte; processes the block as soon as 64 bytes are buffered.
  void append_byte(uint8_t byte) {
    buffer[buffer_counter] = byte;
    buffer_counter++;
    n_bits += 8;

    if (buffer_counter != 64) return;

    buffer_counter = 0;
    block();
  }

  // Appends n_bytes bytes starting at src, in memory order.
  void append(const void *src, size_t n_bytes) {
    const uint8_t *bytes = (const uint8_t*)src;
    for(size_t pos = 0;pos < n_bytes;pos++) append_byte(bytes[pos]);
  }

  // Appends an n_bytes-wide value so that its least significant byte comes
  // first regardless of host byte order: bytes are taken in memory order
  // unless the compiler reports a big-endian target, in which case they are
  // taken in reverse.
  void appendWord(const void *src, size_t n_bytes) {
    const uint8_t *bytes = (const uint8_t*)src;
#if !defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
    for(size_t pos = 0;pos < n_bytes;pos++) append_byte(bytes[pos]);
#else
    for(int pos = int(n_bytes)-1;pos >= 0;pos--) append_byte(bytes[pos]);
#endif
  }

  // Appends the padding: a 0x80 byte, zero bytes up to 8 bytes before the end
  // of a block, and the message length in bits as a big-endian 64-bit value.
  void finalize() {
    const uint64_t total_bits = n_bits; // append_byte below keeps counting; use the snapshot

    append_byte(0x80);

    while(buffer_counter != 64 - 8) append_byte(0);

    for(int shift = 56;shift >= 0;shift -= 8) {
      append_byte((uint8_t)((total_bits >> shift) & 0xff));
    }
  }

  // Finishes the hash and writes the 32-byte digest (each state word
  // big-endian) to dst_bytes32.
  void finalize_bytes(void *dst_bytes32) {
    finalize();

    uint8_t *out = (uint8_t*)dst_bytes32;
    for(int word = 0;word < 8;word++) {
      const uint32_t v = state[word];
      *out++ = (uint8_t)((v >> 24) & 0xff);
      *out++ = (uint8_t)((v >> 16) & 0xff);
      *out++ = (uint8_t)((v >>  8) & 0xff);
      *out++ = (uint8_t)(v & 0xff);
    }
  }
};
|
||||
|
||||
#endif // #ifndef __PSHA2_HPP_INCLUDED__
|
||||
@ -0,0 +1,57 @@
|
||||
#include "psha2.hpp"
|
||||
#include "psha2_capi.h"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
// Returns the descriptor of the SHA-256 digest. In this shim a descriptor is
// a pointer to an int holding the algorithm id; SHA-256 has id 1.
const EVP_MD *EVP_sha256(void) {
  static const EVP_MD sha256_id = 1;
  return &sha256_id;
}
|
||||
|
||||
size_t EVP_MD_size(const EVP_MD *e) {
|
||||
if (*e == 1) return SHA256_DIGEST_LENGTH;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int EVP_MD_get_size(const EVP_MD *e) {
|
||||
if (*e == 1) return SHA256_DIGEST_LENGTH;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Allocates a zero-filled digest context; returns whatever calloc returns,
// i.e. a null pointer when the allocation fails. Release it with
// EVP_MD_CTX_free.
EVP_MD_CTX *EVP_MD_CTX_new(void) {
  void *zeroed = calloc(1, sizeof(EVP_MD_CTX));
  return static_cast<EVP_MD_CTX *>(zeroed);
}
|
||||
|
||||
// Prepares ctx for computing a digest of the given type.
//
// ctx  : context obtained from EVP_MD_CTX_new (it may have been used before)
// type : algorithm descriptor; only the one returned by EVP_sha256 is supported
// impl : ignored
//
// Returns 1 on success, 0 when the algorithm is not supported.
int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) {
  (void)impl;

  // A context that was already initialised for SHA-256 still owns a hasher;
  // release it so that reusing the context does not leak.
  if (ctx->type == 1) {
    delete ctx->psha_256;
    ctx->psha_256 = nullptr;
  }

  ctx->type = *type;
  if (*type == 1) {
    ctx->psha_256 = new PSHA2_256_Internal();
    return 1;
  }
  return 0;
}
|
||||
|
||||
// Feeds cnt bytes starting at d into the digest held by ctx.
// Returns 1 on success, 0 when ctx is not set up for SHA-256.
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt) {
  if (ctx->type != 1) return 0;

  ctx->psha_256->append(d, cnt);
  return 1;
}
|
||||
|
||||
// Finishes the digest and writes it to md (SHA256_DIGEST_LENGTH bytes).
// When s is not null it receives the number of bytes written.
// Returns 1 on success, 0 when ctx is not set up for SHA-256.
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s) {
  if (ctx->type != 1) return 0;

  ctx->psha_256->finalize_bytes(md);
  if (s != nullptr) *s = SHA256_DIGEST_LENGTH;
  return 1;
}
|
||||
|
||||
// Releases a context created by EVP_MD_CTX_new together with the hasher it
// owns. Passing a null pointer is allowed and does nothing, as with free().
void EVP_MD_CTX_free(EVP_MD_CTX *ctx) {
  if (ctx == nullptr) return;

  if (ctx->type == 1) {
    delete ctx->psha_256;
    ctx->psha_256 = nullptr;
  }
  free(ctx);
}
|
||||
@ -0,0 +1,30 @@
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static const size_t SHA256_DIGEST_LENGTH = 32;
|
||||
|
||||
typedef int EVP_MD;
|
||||
typedef void ENGINE;
|
||||
|
||||
typedef struct {
|
||||
int type;
|
||||
union {
|
||||
struct PSHA2_256_Internal *psha_256;
|
||||
};
|
||||
} EVP_MD_CTX;
|
||||
|
||||
const EVP_MD *EVP_sha256(void);
|
||||
int EVP_MD_get_size(const EVP_MD *);
|
||||
size_t EVP_MD_size(const EVP_MD *);
|
||||
EVP_MD_CTX *EVP_MD_CTX_new(void);
|
||||
int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl);
|
||||
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt);
|
||||
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s);
|
||||
void EVP_MD_CTX_free(EVP_MD_CTX *ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -22,6 +22,10 @@
|
||||
#include <mpfr.h>
|
||||
#endif
|
||||
|
||||
#ifdef ENABLEFLOAT128
|
||||
#include <quadmath.h>
|
||||
#endif
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
|
||||
#define STDIN_FILENO 0
|
||||
#else
|
||||
@ -42,33 +46,6 @@
|
||||
|
||||
//
|
||||
|
||||
// Reads one line from fd into buf, one byte at a time.
//
// Reading stops after a newline has been stored or when cnt-1 bytes have been
// stored, whichever comes first; the result is always NUL-terminated within
// the cnt bytes of buf.
//
// Returns
//   -1            if cnt < 1,
//   the result of read() (0 or -1) if a read does not deliver a byte; buf is
//                 not terminated in that case,
//   otherwise     the number of bytes stored including the newline, or cnt
//                 when the line had to be cut because it did not fit.
int readln(int fd, char *buf, int cnt) {
  int i, rcnt = 0;

  if (cnt < 1) return -1;

  while(cnt >= 2) {
    i = read(fd, buf, 1);
    if (i != 1) return i;

    rcnt++;
    cnt--;

    if (*buf++ == '\n') {
      // Room for the terminator is guaranteed: cnt was at least 2 before this byte
      *buf = '\0';
      return rcnt;
    }
  }

  // The buffer is full (or cnt was 1): terminate in the last available slot
  // instead of one past it. The return value is kept as it always was.
  *buf = '\0';
  return rcnt + 1;
}
|
||||
|
||||
// Returns 1 when str begins with prefix (an empty prefix always matches),
// 0 otherwise.
int startsWith(char *str, char *prefix) {
  const size_t n = strlen(prefix);
  if (strncmp(str, prefix, n) != 0) return 0;
  return 1;
}
|
||||
|
||||
//
|
||||
|
||||
xuint128 xu(uint64_t h, uint64_t l) {
|
||||
xuint128 r = { .l = l, .h = h };
|
||||
return r;
|
||||
@ -150,31 +127,6 @@ int isnanf128(Sleef_quad a) {
|
||||
|
||||
//
|
||||
|
||||
// State of the pseudo random number generator below
static uint64_t xseed;

// Returns the next 64-bit pseudo random number. The state is advanced twice
// per call; the result combines the upper 32 bits of the state before the
// call with the upper 32 bits of the state after the first step.
uint64_t xrand() {
  uint64_t u = xseed;
  xseed = xseed * UINT64_C(6364136223846793005) + 1;
  u = (u & ((~UINT64_C(0)) << 32)) | (xseed >> 32);
  xseed = xseed * UINT64_C(6364136223846793005) + 1;
  return u;
}

// Seeds the generator and discards the first three outputs.
void xsrand(uint64_t s) {
  xseed = s;
  xrand();
  xrand();
  xrand();
}

// Fills the `size` bytes at p with pseudo random data.
//
// Whole 8-byte groups receive one xrand() value each, in the host's byte
// order; each of the remaining (size % 8) bytes receives the low byte of a
// further xrand() value. Nothing outside the `size` bytes is written, and p
// does not have to be aligned. A size of zero or less writes nothing.
void memrand(void *p, int size) {
  uint8_t *out = (uint8_t *)p;
  int i = 0;

  for(;size - i >= 8;i += 8) {
    const uint64_t u = xrand();
    memcpy(out + i, &u, sizeof(u));
  }

  for(;i < size;i++) out[i] = (uint8_t)(xrand() & 0xff);
}
|
||||
|
||||
Sleef_quad rndf128(Sleef_quad min, Sleef_quad max, int setSignRandomly) {
|
||||
cnv_t cmin = { .q = min }, cmax = { .q = max }, c;
|
||||
do {
|
||||
@ -581,6 +533,14 @@ char *sprintf128(Sleef_quad q) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef QUADMATH_H
|
||||
// Prints f to stdout with 50 significant digits, without a trailing newline.
void printf128(Sleef_quad f) {
  char text[128];

  quadmath_snprintf(text, 120, "%.50Qg", f);
  fputs(text, stdout);
}
|
||||
#endif
|
||||
|
||||
double cast_d_q(Sleef_quad q) {
|
||||
mpfr_t fr;
|
||||
mpfr_inits(fr, NULL);
|
||||
@ -1,9 +1,14 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include "quaddef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
@ -33,24 +38,6 @@ int isinff128(Sleef_quad a);
|
||||
int isnonnumberf128(Sleef_quad a);
|
||||
int isnanf128(Sleef_quad a);
|
||||
|
||||
// Reinterprets the 64 bits of u as a double (no numeric conversion).
static double u2d(uint64_t u) {
  union {
    uint64_t bits;
    double value;
  } pun;

  pun.bits = u;
  return pun.value;
}
|
||||
|
||||
// Returns the 64 bits that make up the double d (no numeric conversion).
static uint64_t d2u(double d) {
  union {
    uint64_t bits;
    double value;
  } pun;

  pun.value = d;
  return pun.bits;
}
|
||||
|
||||
#ifdef USEMPFR
|
||||
void mpfr_set_f128(mpfr_t frx, Sleef_quad a, mpfr_rnd_t rnd);
|
||||
Sleef_quad mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd);
|
||||
@ -59,8 +46,16 @@ double countULPf128(Sleef_quad d, mpfr_t c, int checkNegZero);
|
||||
char *sprintfr(mpfr_t fr);
|
||||
char *sprintf128(Sleef_quad x);
|
||||
|
||||
#ifdef QUADMATH_H
|
||||
void printf128(Sleef_quad f);
|
||||
#endif
|
||||
|
||||
double cast_d_q(Sleef_quad q);
|
||||
Sleef_quad cast_q_str(const char *s);
|
||||
Sleef_quad cast_q_str_hex(const char *s);
|
||||
Sleef_quad add_q_d(Sleef_quad q, double d);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -1,17 +1,11 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
#include "sleef-config.h"
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
@ -74,14 +68,6 @@ typedef union {
|
||||
|
||||
#else // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED)
|
||||
SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
#include "psha2.hpp"
|
||||
|
||||
#if TEST_CAPI
|
||||
#include "psha2_capi.h"
|
||||
#else
|
||||
#include <openssl/sha.h>
|
||||
#include <openssl/evp.h>
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
srand(time(NULL));
|
||||
|
||||
bool success = true;
|
||||
|
||||
for(int i=0;i<10000;i++) {
|
||||
int len = (rand() + ((int64_t)RAND_MAX + 1) * rand()) % (1 << (1 + (rand() % 18)));
|
||||
unsigned char *plaintext = (unsigned char *)malloc(len);
|
||||
for(int i=0;i<len;i++) plaintext[i] = rand() & 0xff;
|
||||
|
||||
//
|
||||
|
||||
PSHA2_256_Internal psha;
|
||||
unsigned char dgst0[SHA256_DIGEST_LENGTH];
|
||||
|
||||
psha.append(plaintext, len);
|
||||
psha.finalize_bytes(dgst0);
|
||||
|
||||
//
|
||||
|
||||
unsigned char dgst1[SHA256_DIGEST_LENGTH];
|
||||
|
||||
EVP_MD_CTX *ctx = EVP_MD_CTX_new();
|
||||
EVP_DigestInit_ex(ctx, EVP_sha256(), NULL);
|
||||
EVP_DigestUpdate(ctx, plaintext, len);
|
||||
EVP_DigestFinal_ex(ctx, dgst1, NULL);
|
||||
EVP_MD_CTX_free(ctx);
|
||||
|
||||
//
|
||||
|
||||
if (memcmp(dgst0, dgst1, SHA256_DIGEST_LENGTH) != 0) success = false;
|
||||
|
||||
free(plaintext);
|
||||
}
|
||||
|
||||
if (success) {
|
||||
printf("OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("NG\n");
|
||||
return -1;
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -38,7 +38,7 @@
|
||||
#define POSITIVE_INFINITYf ((float)INFINITY)
|
||||
#define NEGATIVE_INFINITYf (-(float)INFINITY)
|
||||
|
||||
int isnumber(double x) { return !isinf(x) && !isnan(x); }
|
||||
int xisnumber(double x) { return !isinf(x) && !isnan(x); }
|
||||
int isPlusZero(double x) { return x == 0 && copysign(1, x) == 1; }
|
||||
int isMinusZero(double x) { return x == 0 && copysign(1, x) == -1; }
|
||||
double sign(double d) { return d < 0 ? -1 : 1; }
|
||||
@ -83,21 +83,38 @@ int readln(int fd, char *buf, int cnt) {
|
||||
static uint64_t xseed;
|
||||
|
||||
uint64_t xrand() {
|
||||
uint64_t u = xseed;
|
||||
xseed = xseed * UINT64_C(6364136223846793005) + 1;
|
||||
return xseed;
|
||||
u = (u & ((~UINT64_C(0)) << 32)) | (xseed >> 32);
|
||||
xseed = xseed * UINT64_C(6364136223846793005) + 1;
|
||||
return u;
|
||||
}
|
||||
|
||||
void xsrand(uint64_t s) {
|
||||
xseed = s;
|
||||
xrand();
|
||||
xrand();
|
||||
xrand();
|
||||
}
|
||||
|
||||
// Fill memory with random bits
|
||||
void memrand(void *p, int size) {
|
||||
uint64_t *q = (uint64_t *)p;
|
||||
uint8_t *q = (uint8_t *)p;
|
||||
int i;
|
||||
for(i=0;i<size/8;i++) *q++ = xrand();
|
||||
uint8_t *r = (uint8_t *)q;
|
||||
for(i *= 8;i<size;i++) *r++ = xrand() & 0xff;
|
||||
for(i=0;i<(size & ~7);i+=8) {
|
||||
uint64_t u = xrand();
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
*q++ = (uint8_t)(u & 0xff); u >>= 8;
|
||||
}
|
||||
for(;i<size;i++) *q++ = xrand() & 0xff;
|
||||
}
|
||||
|
||||
void xsrand(uint64_t s) { xseed = s; }
|
||||
|
||||
//
|
||||
|
||||
#ifdef USEMPFR
|
||||
@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <tlfloat/tlfloat.h>
|
||||
using namespace tlfloat;
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic ignored "-Wuninitialized"
|
||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic ignored "-Wvla-cxx-extension"
|
||||
#pragma clang diagnostic ignored "-Wuninitialized"
|
||||
#pragma clang diagnostic ignored "-Wtautological-compare"
|
||||
#endif
|
||||
|
||||
#define DENORMAL_DBL_MIN (4.9406564584124654418e-324)
|
||||
#define POSITIVE_INFINITY INFINITY
|
||||
#define NEGATIVE_INFINITY (-INFINITY)
|
||||
|
||||
#define DENORMAL_FLT_MIN (1.4012984643248170709e-45f)
|
||||
#define POSITIVE_INFINITYf ((float)INFINITY)
|
||||
#define NEGATIVE_INFINITYf (-(float)INFINITY)
|
||||
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern int enableFlushToZero;
|
||||
double flushToZero(double y);
|
||||
|
||||
int xisnumber(double x);
|
||||
int isPlusZero(double x);
|
||||
int isMinusZero(double x);
|
||||
int xisnan(double x);
|
||||
double sign(double d);
|
||||
|
||||
int isnumberf(float x);
|
||||
int isPlusZerof(float x);
|
||||
int isMinusZerof(float x);
|
||||
int xisnanf(float x);
|
||||
float signf(float d);
|
||||
|
||||
int readln(int fd, char *buf, int cnt);
|
||||
|
||||
#define XRAND_MAX (INT64_C(0x100000000) * (double)INT64_C(0x100000000))
|
||||
|
||||
void xsrand(uint64_t s);
|
||||
uint64_t xrand();
|
||||
void memrand(void *p, int size);
|
||||
|
||||
// The following functions are meant to be inlined
|
||||
|
||||
// Returns the double whose object representation equals the 64 bits of u.
// The bytes are copied with memcpy, so no numeric conversion takes place.
static double u2d(uint64_t u) {
  double result = 0;
  memcpy(&result, &u, sizeof result);
  return result;
}
|
||||
|
||||
// Returns the 64-bit object representation of d as an unsigned integer.
// The bytes are copied with memcpy, so no numeric conversion takes place.
static uint64_t d2u(double d) {
  uint64_t bits = 0;
  memcpy(&bits, &d, sizeof bits);
  return bits;
}
|
||||
|
||||
// Returns the float whose object representation equals the 32 bits of u.
// The bytes are copied with memcpy, so no numeric conversion takes place.
static float u2f(uint32_t u) {
  float result = 0;
  memcpy(&result, &u, sizeof result);
  return result;
}
|
||||
|
||||
// Returns the 32-bit object representation of d as an unsigned integer.
// The bytes are copied with memcpy, so no numeric conversion takes place.
static uint32_t f2u(float d) {
  uint32_t bits = 0;
  memcpy(&bits, &d, sizeof bits);
  return bits;
}
|
||||
|
||||
// Returns 1 when the NUL-terminated string str begins with prefix, 0 otherwise.
// An empty prefix matches every string. If str is shorter than prefix, its
// terminating NUL differs from the next prefix character and 0 is returned.
static int startsWith(char *str, char *prefix) {
  for (; *prefix != '\0'; str++, prefix++) {
    if (*str != *prefix) return 0;
  }
  // Every character of prefix was matched.
  return 1;
}
|
||||
|
||||
//
|
||||
|
||||
#ifdef USEMPFR
|
||||
int cmpDenormdp(double x, mpfr_t fry);
|
||||
double countULPdp(double d, mpfr_t c);
|
||||
double countULP2dp(double d, mpfr_t c);
|
||||
|
||||
int cmpDenormsp(float x, mpfr_t fry);
|
||||
double countULPsp(float d, mpfr_t c);
|
||||
double countULP2sp(float d, mpfr_t c);
|
||||
|
||||
#if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0)
|
||||
void mpfr_sinpi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd);
|
||||
void mpfr_cospi(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd);
|
||||
#endif
|
||||
void mpfr_lgamma_nosign(mpfr_t ret, mpfr_t arg, mpfr_rnd_t rnd);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
// Measures how far ot is from oc, expressed in units of the last place of oc
// for a format with nbmant mantissa bits, smallest magnitude fltmin and
// largest finite magnitude fltmax.
// NOTE(review): ot is presumably the value under test and oc the reference
// ("correct") value - confirm against callers.
//
// Special results produced by the checks below:
//   0        both NaN; or oc beyond fltmax and ot an infinity of the same
//            sign; or the absolute error is below abound
//   10001    exactly one of the two values is NaN
//   INFINITY oc is infinite but ot is not
//   10000    |oc| is below fltmin/2 but ot is not zero
//   10002    checkSignedZero is set, |oc| is below fltmin/2, ot is zero and
//            the signs of oc and ot differ
// Otherwise the result is |oc - ot| divided by the larger of
// 2^(ilogb(oc) + 1 - nbmant) and fltmin, plus 1 if ot was infinite and had
// to be clamped to +/-fltmax.
template<typename T>
static double countULP(T ot, const T& oc,
                       const int nbmant, const T& fltmin, const T& fltmax,
                       const bool checkSignedZero=false, const double abound=0.0) {
  // NaN handling comes first: two NaNs agree, a single NaN is an error code.
  if (isnan_(oc) && isnan_(ot)) return 0;
  if (isnan_(oc) || isnan_(ot)) return 10001;

  if (isinf_(oc) && !isinf_(ot)) return INFINITY;

  const T halfMin = mul_(fltmin, T(0.5));
  const bool refIsZero = fabs_(oc) < halfMin;
  const bool refIsInf = fabs_(oc) > fltmax;

  if (refIsInf && isinf_(ot) && signbit_(oc) == signbit_(ot)) return 0;

  if (refIsZero) {
    if (ot != 0) return 10000;
    if (checkSignedZero && ot == 0 && signbit_(oc) != signbit_(ot)) return 10002;
  }

  // An infinite ot against a finite oc is clamped to the largest finite
  // magnitude and charged one extra unit.
  double penalty = 0;
  if (isinf_(ot) && !isinf_(oc)) {
    ot = copysign_(fltmax, ot);
    penalty = 1;
  }

  const int refExponent = ilogb_(oc);

  auto absError = fabs_(oc - ot);
  if (absError < abound) return 0;

  // Size of one ULP at oc, never smaller than fltmin.
  const auto ulp = fmax_(ldexp_(T(1), refExponent + 1 - nbmant), fltmin);
  return double(div_(absError, ulp)) + penalty;
}
|
||||
#endif
|
||||
@ -73,8 +73,36 @@ if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_4 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 4)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_5 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 5)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_10 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 10)
|
||||
|
||||
# Target executable measuredft
|
||||
set(TARGET_MEASUREDFT "measuredft")
|
||||
add_executable(${TARGET_MEASUREDFT} measuredft.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_MEASUREDFT} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_MEASUREDFT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_MEASUREDFT} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_MEASUREDFT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
endif()
|
||||
|
||||
# Target executable test_dftplanner
|
||||
set(TARGET_TEST_DFTPLANNER "test_dftplanner")
|
||||
add_executable(${TARGET_TEST_DFTPLANNER} test_dftplanner.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_TEST_DFTPLANNER} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_TEST_DFTPLANNER} PRIVATE ${COMMON_TARGET_DEFINITIONS} MEASURE=1)
|
||||
target_link_libraries(${TARGET_TEST_DFTPLANNER} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_TEST_DFTPLANNER} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test(NAME ${TARGET_TEST_DFTPLANNER} COMMAND $<TARGET_FILE:${TARGET_TEST_DFTPLANNER}> ${PROJECT_BINARY_DIR}/testm1.plan ${PROJECT_BINARY_DIR}/testm2.plan)
|
||||
set_tests_properties(${TARGET_TEST_DFTPLANNER} PROPERTIES COST 2)
|
||||
|
||||
# Target executable test_dftplannerest
|
||||
set(TARGET_TEST_DFTPLANNEREST "test_dftplannerest")
|
||||
add_executable(${TARGET_TEST_DFTPLANNEREST} test_dftplanner.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_TEST_DFTPLANNEREST} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_TEST_DFTPLANNEREST} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_TEST_DFTPLANNEREST} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_TEST_DFTPLANNEREST} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test(NAME ${TARGET_TEST_DFTPLANNEREST} COMMAND $<TARGET_FILE:${TARGET_TEST_DFTPLANNEREST}> ${PROJECT_BINARY_DIR}/teste1.plan ${PROJECT_BINARY_DIR}/teste2.plan)
|
||||
set_tests_properties(${TARGET_TEST_DFTPLANNEREST} PROPERTIES COST 2)
|
||||
|
||||
# Target executable roundtriptest1ddp
|
||||
set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
@ -161,6 +189,34 @@ if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 8 8)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 10 10)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 5 15)
|
||||
|
||||
if (SLEEF_LIBFFTW3_LIBRARIES)
|
||||
# Target executable dftbenchdp
|
||||
set(TARGET_BENCH1DDP "dftbenchdp")
|
||||
add_executable(${TARGET_BENCH1DDP} dftbench.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_BENCH1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_BENCH1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_BENCH1DDP} ${COMMON_LINK_LIBRARIES} ${SLEEF_LIBFFTW3_LIBRARIES})
|
||||
set_target_properties(${TARGET_BENCH1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
#add_test_dft("dftbenchdp1d" $<TARGET_FILE:${TARGET_BENCH1DDP}> 8 0 1000 1)
|
||||
#set_tests_properties("dftbenchdp1d" PROPERTIES COST 3)
|
||||
add_test_dft("dftbenchdp2d" $<TARGET_FILE:${TARGET_BENCH1DDP}> 8 8 1000 1)
|
||||
set_tests_properties("dftbenchdp2d" PROPERTIES COST 3)
|
||||
|
||||
# Target executable dftbenchsp
|
||||
set(TARGET_BENCH1DSP "dftbenchsp")
|
||||
add_executable(${TARGET_BENCH1DSP} dftbench.cpp ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_BENCH1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_BENCH1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_BENCH1DSP} ${COMMON_LINK_LIBRARIES} ${SLEEF_LIBFFTW3_LIBRARIES})
|
||||
set_target_properties(${TARGET_BENCH1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
#add_test_dft("dftbenchsp1d" $<TARGET_FILE:${TARGET_BENCH1DSP}> 8 0 1000 1)
|
||||
#set_tests_properties("dftbenchsp1d" PROPERTIES COST 3)
|
||||
add_test_dft("dftbenchsp2d" $<TARGET_FILE:${TARGET_BENCH1DSP}> 8 8 1000 1)
|
||||
set_tests_properties("dftbenchsp2d" PROPERTIES COST 3)
|
||||
endif()
|
||||
else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
if(MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
# Test roundtriptestdp
|
||||
|
||||
@ -1,116 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define _DEFAULT_SOURCE
|
||||
#define _XOPEN_SOURCE 700
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef USEFFTW
|
||||
#include <fftw3.h>
|
||||
#include <omp.h>
|
||||
#else
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
#endif
|
||||
|
||||
typedef double real;
|
||||
|
||||
static uint64_t gettime() {
|
||||
struct timespec tp;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tp);
|
||||
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
|
||||
}
|
||||
|
||||
#define REPEAT 8
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
fprintf(stderr, "%s <log2n>\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int backward = 0;
|
||||
|
||||
int log2n = atoi(argv[1]);
|
||||
if (log2n < 0) {
|
||||
backward = 1;
|
||||
log2n = -log2n;
|
||||
}
|
||||
|
||||
const int n = 1 << log2n;
|
||||
const int64_t niter = (int)(100000000000.0 / n / log2n);
|
||||
|
||||
printf("Number of iterations = %lld\n", (long long int)niter);
|
||||
|
||||
#ifdef USEFFTW
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
|
||||
#if 0
|
||||
int fftw_init_threads(void);
|
||||
fftw_plan_with_nthreads(omp_get_max_threads());
|
||||
#endif
|
||||
|
||||
fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE);
|
||||
//fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
|
||||
real *in = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *out = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE; // | SLEEF_MODE_NO_MT;
|
||||
if (argc >= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE;
|
||||
|
||||
if (backward) mode |= SLEEF_MODE_BACKWARD;
|
||||
struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode);
|
||||
|
||||
if (argc >= 3) SleefDFT_setPath(p, argv[2]);
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
#ifdef USEFFTW
|
||||
fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
|
||||
printf("Actual time = %g ns\n", (double)(tm1 - tm0) / niter);
|
||||
double timeus = (tm1 - tm0) / ((double)niter * 1000);
|
||||
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
|
||||
printf("%g Mflops\n", mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@ -0,0 +1,404 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
#include <ctime>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <fftw3.h>
|
||||
#include <omp.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#if BASETYPEID == 1
|
||||
typedef double xreal;
|
||||
#define FFTW_COMPLEX fftw_complex
|
||||
#define FFTW_PLAN_WITH_NTHREADS fftw_plan_with_nthreads
|
||||
#define FFTW_PLAN fftw_plan
|
||||
#define FFTW_MALLOC fftw_malloc
|
||||
#define FFTW_FREE fftw_free
|
||||
#define FFTW_PLAN_DFT_1D fftw_plan_dft_1d
|
||||
#define FFTW_PLAN_DFT_2D fftw_plan_dft_2d
|
||||
#define FFTW_EXECUTE fftw_execute
|
||||
#define FFTW_DESTROY_PLAN fftw_destroy_plan
|
||||
#define FFTW_CLEANUP fftw_cleanup
|
||||
#define SLEEFDFT_INIT1D SleefDFT_double_init1d
|
||||
#define SLEEFDFT_INIT2D SleefDFT_double_init2d
|
||||
#elif BASETYPEID == 2
|
||||
typedef float xreal;
|
||||
#define FFTW_COMPLEX fftwf_complex
|
||||
#define FFTW_PLAN_WITH_NTHREADS fftwf_plan_with_nthreads
|
||||
#define FFTW_PLAN fftwf_plan
|
||||
#define FFTW_MALLOC fftwf_malloc
|
||||
#define FFTW_FREE fftwf_free
|
||||
#define FFTW_PLAN_DFT_1D fftwf_plan_dft_1d
|
||||
#define FFTW_PLAN_DFT_2D fftwf_plan_dft_2d
|
||||
#define FFTW_EXECUTE fftwf_execute
|
||||
#define FFTW_DESTROY_PLAN fftwf_destroy_plan
|
||||
#define FFTW_CLEANUP fftwf_cleanup
|
||||
#define SLEEFDFT_INIT1D SleefDFT_float_init1d
|
||||
#define SLEEFDFT_INIT2D SleefDFT_float_init2d
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
// Returns the current reading of std::chrono::high_resolution_clock as
// nanoseconds since that clock's epoch.
//
// time_since_epoch() is used instead of subtracting from_time_t(0):
// from_time_t is a member of system_clock only, so the subtraction compiles
// only where high_resolution_clock is an alias of system_clock. Where it is,
// both forms yield the same value.
static uint64_t timens() {
  const auto sinceEpoch = std::chrono::high_resolution_clock::now().time_since_epoch();
  return std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch).count();
}
|
||||
|
||||
template<typename cplx>
|
||||
class FFTFramework {
|
||||
public:
|
||||
virtual void execute() = 0;
|
||||
virtual cplx* getInPtr() = 0;
|
||||
virtual cplx* getOutPtr() = 0;
|
||||
virtual ~FFTFramework() {};
|
||||
|
||||
int64_t niter(int64_t ns) {
|
||||
int64_t niter = 10, t0, t1;
|
||||
|
||||
for(;;) {
|
||||
t0 = timens();
|
||||
for(int64_t i=0;i<niter;i++) execute();
|
||||
t1 = timens();
|
||||
if (t1 - t0 > 1000LL * 1000 * 10) break;
|
||||
niter *= 2;
|
||||
}
|
||||
|
||||
return 1 + int64_t((double)niter * ns / (t1 - t0));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename cplx>
|
||||
class FWSleefDFT : public FFTFramework<cplx> {
|
||||
const int n, m;
|
||||
cplx* in;
|
||||
cplx* out;
|
||||
SleefDFT *plan;
|
||||
|
||||
public:
|
||||
FWSleefDFT(int n_, int m_, bool forward, bool mt, bool check) : n(n_), m(m_) {
|
||||
SleefDFT_setDefaultVerboseFP(stderr);
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
in = (cplx*)Sleef_malloc(sizeof(cplx) * n * m);
|
||||
out = (cplx*)Sleef_malloc(sizeof(cplx) * n * m);
|
||||
|
||||
if (!in || !out) {
|
||||
cerr << "Sleef_malloc failed" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
uint64_t mode = check ? SLEEF_MODE_ESTIMATE : SLEEF_MODE_MEASURE;
|
||||
mode |= forward ? SLEEF_MODE_FORWARD : SLEEF_MODE_BACKWARD;
|
||||
mode |= mt ? 0 : SLEEF_MODE_NO_MT;
|
||||
//mode |= SLEEF_MODE_VERBOSE;
|
||||
|
||||
if (m == 1) {
|
||||
plan = SLEEFDFT_INIT1D(n, (xreal*)in, (xreal*)out, mode);
|
||||
} else {
|
||||
plan = SLEEFDFT_INIT2D(n, m, (xreal*)in, (xreal*)out, mode);
|
||||
}
|
||||
}
|
||||
|
||||
string getPath() {
|
||||
vector<char> pathstr(1024);
|
||||
SleefDFT_getPath(plan, pathstr.data(), pathstr.size());
|
||||
return pathstr.data();
|
||||
}
|
||||
|
||||
~FWSleefDFT() {
|
||||
SleefDFT_dispose(plan);
|
||||
Sleef_free(out);
|
||||
Sleef_free(in);
|
||||
}
|
||||
|
||||
cplx* getInPtr () { return in ; }
|
||||
cplx* getOutPtr() { return out; }
|
||||
|
||||
void execute() { SleefDFT_execute(plan, NULL, NULL); }
|
||||
};
|
||||
|
||||
template<typename cplx>
|
||||
class FWFFTW3 : public FFTFramework<cplx> {
|
||||
const int n, m;
|
||||
cplx* in;
|
||||
cplx* out;
|
||||
FFTW_PLAN plan;
|
||||
|
||||
public:
|
||||
FWFFTW3(int n_, int m_, bool forward, bool mt, bool check) : n(n_), m(m_) {
|
||||
//FFTW_CLEANUP();
|
||||
FFTW_PLAN_WITH_NTHREADS(mt ? omp_get_max_threads() : 1);
|
||||
in = (cplx*)FFTW_MALLOC(sizeof(FFTW_COMPLEX) * n * m);
|
||||
out = (cplx*)FFTW_MALLOC(sizeof(FFTW_COMPLEX) * n * m);
|
||||
unsigned flags = check ? FFTW_ESTIMATE : FFTW_MEASURE;
|
||||
if (m == 1) {
|
||||
plan = FFTW_PLAN_DFT_1D(n, (FFTW_COMPLEX*)in, (FFTW_COMPLEX*)out, forward ? FFTW_FORWARD : FFTW_BACKWARD, flags);
|
||||
} else {
|
||||
plan = FFTW_PLAN_DFT_2D(n, m, (FFTW_COMPLEX*)in, (FFTW_COMPLEX*)out, forward ? FFTW_FORWARD : FFTW_BACKWARD, flags);
|
||||
}
|
||||
}
|
||||
|
||||
~FWFFTW3() {
|
||||
FFTW_DESTROY_PLAN(plan);
|
||||
FFTW_FREE(out);
|
||||
FFTW_FREE(in);
|
||||
}
|
||||
|
||||
cplx* getInPtr() { return in; }
|
||||
cplx* getOutPtr() { return out; }
|
||||
|
||||
void execute() { FFTW_EXECUTE(plan); }
|
||||
};
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
fprintf(stderr, "%s <log2n> <log2m> <measurement time in ms> <nrepeat>\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_init_threads();
|
||||
|
||||
double measureTimeMillis = 3000;
|
||||
if (argc >= 4) measureTimeMillis = atof(argv[3]);
|
||||
|
||||
bool forward = true;
|
||||
|
||||
int log2n = atoi(argv[1]);
|
||||
if (log2n < 0) {
|
||||
forward = false;
|
||||
log2n = -log2n;
|
||||
}
|
||||
|
||||
const int n = 1 << log2n;
|
||||
|
||||
const int log2m = argc >= 3 ? atoi(argv[2]) : 0;
|
||||
const int m = 1 << log2m;
|
||||
|
||||
cerr << "n = " << n << ", m = " << m << ", " << (forward ? "forward" : "backward") << endl;
|
||||
|
||||
const int nrepeat = argc >= 5 ? atoi(argv[4]) : 1;
|
||||
|
||||
vector<double> mflops_sleefdftst, mflops_fftwst, mflops_sleefdftmt, mflops_fftwmt;
|
||||
|
||||
vector<complex<xreal>> v(n * m);
|
||||
for(int i=0;i<n * m;i++) {
|
||||
v[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * 1i;
|
||||
}
|
||||
|
||||
{
|
||||
// Check if we are really computing the same values
|
||||
|
||||
auto sleefdft = make_shared<FWSleefDFT<complex<xreal>>>(n, m, forward, true , true);
|
||||
auto fftw = make_shared<FWFFTW3 <complex<xreal>>>(n, m, forward, false, true);
|
||||
|
||||
complex<xreal> *in0 = sleefdft->getInPtr();
|
||||
complex<xreal> *out0 = sleefdft->getOutPtr();
|
||||
complex<xreal> *in1 = fftw->getInPtr();
|
||||
complex<xreal> *out1 = fftw->getOutPtr();
|
||||
|
||||
for(int i=0;i<n * m;i++) in0[i] = in1[i] = v[i];
|
||||
|
||||
sleefdft->execute();
|
||||
fftw ->execute();
|
||||
|
||||
for(int i=0;i<n * m;i++) {
|
||||
if (std::real(abs((out0[i] - out1[i]) * (out0[i] - out1[i]))) > 0.1) {
|
||||
cerr << "NG " << i << " : " << out0[i] << ", " << out1[i] << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
cerr << "Check OK" << endl;
|
||||
}
|
||||
|
||||
for(int nr = 0;nr < nrepeat;nr++) {
|
||||
cerr << endl;
|
||||
#if BASETYPEID == 1
|
||||
cerr << "DP ";
|
||||
#elif BASETYPEID == 2
|
||||
cerr << "SP ";
|
||||
#endif
|
||||
cerr << "n = 2^" << log2n << " = " << n << ", m = 2^" << log2m << " = " << m << ", nr = " << nr << endl;
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
cerr << "Planning SleefDFT ST ... ";
|
||||
int64_t ptm0 = timens();
|
||||
auto sleefdftst = make_shared<FWSleefDFT<complex<xreal>>>(n, m, forward, false, false);
|
||||
int64_t ptm1 = timens();
|
||||
cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl;
|
||||
|
||||
cerr << sleefdftst->getPath() << endl;
|
||||
|
||||
complex<xreal> *in0 = sleefdftst->getInPtr();
|
||||
for(int i=0;i<n * m;i++) in0[i] = v[i];
|
||||
|
||||
auto niter = sleefdftst->niter(1000LL * 1000 * measureTimeMillis);
|
||||
|
||||
cerr << "SleefDFT ST niter = " << niter << endl;
|
||||
|
||||
for(int64_t i=0;i<niter/10;i++) sleefdftst->execute(); // warm up
|
||||
|
||||
int64_t tm0 = timens();
|
||||
for(int64_t i=0;i<niter;i++) sleefdftst->execute();
|
||||
int64_t tm1 = timens();
|
||||
|
||||
double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000));
|
||||
if (m != 1) mflops *= m * log2m;
|
||||
|
||||
fprintf(stderr, "%g Mflops\n", mflops);
|
||||
|
||||
mflops_sleefdftst.push_back(mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
cerr << "Planning FFTW ST ... ";
|
||||
int64_t ptm0 = timens();
|
||||
auto fftwst = make_shared<FWFFTW3<complex<xreal>>>(n, m, forward, false, false);
|
||||
int64_t ptm1 = timens();
|
||||
cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl;
|
||||
|
||||
complex<xreal> *in0 = fftwst->getInPtr();
|
||||
for(int i=0;i<n * m;i++) in0[i] = v[i];
|
||||
|
||||
auto niter = fftwst->niter(1000LL * 1000 * measureTimeMillis);
|
||||
|
||||
cerr << "FFTW ST niter = " << niter << endl;
|
||||
|
||||
for(int64_t i=0;i<niter/10;i++) fftwst->execute(); // warm up
|
||||
|
||||
int64_t tm0 = timens();
|
||||
for(int64_t i=0;i<niter;i++) fftwst->execute();
|
||||
int64_t tm1 = timens();
|
||||
|
||||
double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000));
|
||||
if (m != 1) mflops *= m * log2m;
|
||||
|
||||
fprintf(stderr, "%g Mflops\n", mflops);
|
||||
|
||||
mflops_fftwst.push_back(mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
cerr << "Planning SleefDFT MT ... ";
|
||||
int64_t ptm0 = timens();
|
||||
auto sleefdftmt = make_shared<FWSleefDFT<complex<xreal>>>(n, m, forward, true, false);
|
||||
int64_t ptm1 = timens();
|
||||
cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl;
|
||||
|
||||
cerr << sleefdftmt->getPath() << endl;
|
||||
|
||||
complex<xreal> *in0 = sleefdftmt->getInPtr();
|
||||
for(int i=0;i<n * m;i++) in0[i] = v[i];
|
||||
|
||||
auto niter = sleefdftmt->niter(1000LL * 1000 * measureTimeMillis);
|
||||
|
||||
cerr << "SleefDFT MT niter = " << niter << endl;
|
||||
|
||||
for(int64_t i=0;i<niter/10;i++) sleefdftmt->execute(); // warm up
|
||||
|
||||
int64_t tm0 = timens();
|
||||
for(int64_t i=0;i<niter;i++) sleefdftmt->execute();
|
||||
int64_t tm1 = timens();
|
||||
|
||||
double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000));
|
||||
if (m != 1) mflops *= m * log2m;
|
||||
|
||||
fprintf(stderr, "%g Mflops\n", mflops);
|
||||
|
||||
mflops_sleefdftmt.push_back(mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
cerr << "Planning FFTW MT ... ";
|
||||
int64_t ptm0 = timens();
|
||||
auto fftwmt = make_shared<FWFFTW3<complex<xreal>>>(n, m, forward, true, false);
|
||||
int64_t ptm1 = timens();
|
||||
cerr << ((ptm1 - ptm0) / 1000.0 / 1000.0) << "ms" << endl;
|
||||
|
||||
complex<xreal> *in0 = fftwmt->getInPtr();
|
||||
for(int i=0;i<n * m;i++) in0[i] = v[i];
|
||||
|
||||
auto niter = fftwmt->niter(1000LL * 1000 * measureTimeMillis);
|
||||
|
||||
cerr << "FFTW MT niter = " << niter << endl;
|
||||
|
||||
for(int64_t i=0;i<niter/10;i++) fftwmt->execute(); // warm up
|
||||
|
||||
int64_t tm0 = timens();
|
||||
for(int64_t i=0;i<niter;i++) fftwmt->execute();
|
||||
int64_t tm1 = timens();
|
||||
|
||||
double mflops = 5 * n * log2n / ((tm1 - tm0) / (double(niter)*1000));
|
||||
if (m != 1) mflops *= m * log2m;
|
||||
|
||||
fprintf(stderr, "%g Mflops\n", mflops);
|
||||
|
||||
mflops_fftwmt.push_back(mflops);
|
||||
}
|
||||
}
|
||||
|
||||
cerr << endl;
|
||||
|
||||
cout << log2n << ", " << log2m << ", ";
|
||||
|
||||
{
|
||||
double f = 0;
|
||||
for(auto a : mflops_sleefdftst) {
|
||||
if (a > f) f = a;
|
||||
}
|
||||
cout << f << ", ";
|
||||
}
|
||||
|
||||
{
|
||||
double f = 0;
|
||||
for(auto a : mflops_sleefdftmt) {
|
||||
if (a > f) f = a;
|
||||
}
|
||||
cout << f << ", ";
|
||||
}
|
||||
|
||||
{
|
||||
double f = 0;
|
||||
for(auto a : mflops_fftwst) {
|
||||
if (a > f) f = a;
|
||||
}
|
||||
cout << f << ", ";
|
||||
}
|
||||
|
||||
{
|
||||
double f = 0;
|
||||
for(auto a : mflops_fftwmt) {
|
||||
if (a > f) f = a;
|
||||
}
|
||||
cout << f << endl;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -40,10 +40,22 @@ static double squ(double x) { return x * x; }
|
||||
double check_cf(int n) {
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
@ -79,10 +91,22 @@ double check_cf(int n) {
|
||||
double check_cb(int n) {
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_BACKWARD | MODE);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
@ -118,10 +142,22 @@ double check_cb(int n) {
|
||||
double check_rf(int n) {
|
||||
double *in = (double *) fftw_malloc(sizeof(double) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | MODE);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
@ -155,10 +191,22 @@ double check_rf(int n) {
|
||||
double check_rb(int n) {
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
|
||||
double *out = (double *) fftw_malloc(sizeof(double) * n);
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_c2r_1d(n, in, out, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
|
||||
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
|
||||
|
||||
for(int i=0;i<n/2;i++) {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -40,10 +40,22 @@ static double squ(double x) { return x * x; }
|
||||
double check_cf(int n, int m) {
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, MODE);
|
||||
|
||||
for(int i=0;i<n*m;i++) {
|
||||
@ -79,10 +91,22 @@ double check_cf(int n, int m) {
|
||||
double check_cb(int n, int m) {
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
|
||||
|
||||
if (!in || !out) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, SLEEF_MODE_BACKWARD | MODE);
|
||||
|
||||
for(int i=0;i<n*m;i++) {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -97,11 +97,15 @@ int check_cf(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!sx || !ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
@ -121,25 +125,17 @@ int check_cf(int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_execute(p, sx, sx);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
double rmsn = 0, rmsd = 0;
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
|
||||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
|
||||
if ((fabs(sx[(i*2+0)] - creal(fs[i])) > THRES) ||
|
||||
(fabs(sx[(i*2+1)] - cimag(fs[i])) > THRES)) {
|
||||
success = 0;
|
||||
}
|
||||
|
||||
double t;
|
||||
t = (sy[(i*2+0)] - creal(fs[i]));
|
||||
rmsn += t*t;
|
||||
t = (sy[(i*2+1)] - cimag(fs[i]));
|
||||
rmsn += t*t;
|
||||
rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]);
|
||||
}
|
||||
|
||||
//
|
||||
@ -148,7 +144,6 @@ int check_cf(int n) {
|
||||
free(ts);
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -161,11 +156,15 @@ int check_cb(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(sizeof(real)*n*2);
|
||||
real *sy = (real *)Sleef_malloc(sizeof(real)*n*2);
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!sx || !ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
@ -183,15 +182,15 @@ int check_cb(int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_execute(p, sx, sx);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
if ((fabs(sy[(i*2+0)] - creal(ts[i])) > THRES) ||
|
||||
(fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) {
|
||||
if ((fabs(sx[(i*2+0)] - creal(ts[i])) > THRES) ||
|
||||
(fabs(sx[(i*2+1)] - cimag(ts[i])) > THRES)) {
|
||||
success = 0;
|
||||
}
|
||||
}
|
||||
@ -202,7 +201,6 @@ int check_cb(int n) {
|
||||
free(ts);
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -214,12 +212,16 @@ int check_cb(int n) {
|
||||
int check_rf(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
|
||||
real *sx = (real *)Sleef_malloc((n+2) * sizeof(real));
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!sx || !ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
@ -227,6 +229,8 @@ int check_rf(int n) {
|
||||
sx[i] = creal(ts[i]);
|
||||
}
|
||||
|
||||
sx[n] = sx[n+1] = 0;
|
||||
|
||||
//
|
||||
|
||||
forward(ts, fs, n);
|
||||
@ -238,15 +242,15 @@ int check_rf(int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_execute(p, sx, sx);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(i=0;i<n/2+1;i++) {
|
||||
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
}
|
||||
|
||||
//
|
||||
@ -255,7 +259,6 @@ int check_rf(int n) {
|
||||
free(ts);
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -270,6 +273,11 @@ int check_rb(int n) {
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n/2;i++) {
|
||||
@ -283,7 +291,11 @@ int check_rb(int n) {
|
||||
}
|
||||
|
||||
real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
|
||||
real *sy = (real *)Sleef_malloc(sizeof(real)*n);
|
||||
|
||||
if (!sx) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(i=0;i<n/2+1;i++) {
|
||||
sx[2*i+0] = creal(fs[i]);
|
||||
@ -301,7 +313,7 @@ int check_rb(int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_execute(p, sx, sx);
|
||||
|
||||
//
|
||||
|
||||
@ -312,7 +324,7 @@ int check_rb(int n) {
|
||||
success = 0;
|
||||
}
|
||||
|
||||
if ((fabs(sy[i] - creal(ts[i])) > THRES)) {
|
||||
if ((fabs(sx[i] - creal(ts[i])) > THRES)) {
|
||||
success = 0;
|
||||
}
|
||||
}
|
||||
@ -323,7 +335,6 @@ int check_rb(int n) {
|
||||
free(ts);
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -335,11 +346,15 @@ int check_arf(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n * sizeof(real));
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!sx || !ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
@ -358,7 +373,7 @@ int check_arf(int n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_execute(p, sx, sx);
|
||||
|
||||
//
|
||||
|
||||
@ -366,18 +381,20 @@ int check_arf(int n) {
|
||||
|
||||
for(i=0;i<n/2;i++) {
|
||||
if (i == 0) {
|
||||
if (fabs(sy[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
|
||||
} else {
|
||||
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sx[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
free(fs);
|
||||
free(ts);
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -394,6 +411,11 @@ int check_arb(int n) {
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
if (!sx || !sy || !ts || !fs) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n/2;i++) {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -43,6 +43,11 @@ double check_c(int n) {
|
||||
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
if (!sx || !sy || !sz) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(int i=0;i<n*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -41,7 +41,11 @@ double check_c(int n, int m) {
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
|
||||
if (!sx || !sy) {
|
||||
fprintf(stderr, "Memory allocation failed");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(int i=0;i<n*m*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
@ -66,7 +70,7 @@ double check_c(int n, int m) {
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_execute(p, sy, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
@ -74,7 +78,7 @@ double check_c(int n, int m) {
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (n*(double)m);
|
||||
|
||||
for(int i=0;i<n*m;i++) {
|
||||
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
|
||||
rmsn += squ(scale * sy[i*2+0] - sx[i*2+0]) + squ(scale * sy[i*2+1] - sx[i*2+1]);
|
||||
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
|
||||
}
|
||||
|
||||
@ -82,7 +86,6 @@ double check_c(int n, int m) {
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
|
||||
@ -0,0 +1,168 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
vector<string> doTransform(int mode) {
|
||||
SleefDFT *p;
|
||||
vector<string> v;
|
||||
vector<char> s(1024);
|
||||
|
||||
double *din = (double *)Sleef_malloc(2048*64*2 * sizeof(double));
|
||||
double *dout = (double *)Sleef_malloc(2048*64*2 * sizeof(double));
|
||||
|
||||
float *fin = (float *)Sleef_malloc(2048*64*2 * sizeof(double));
|
||||
float *fout = (float *)Sleef_malloc(2048*64*2 * sizeof(double));
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_double_init1d(1024, din, dout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("1d double 1024 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_double_init1d(512, din, dout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("1d double 512 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_float_init1d(1024, fin, fout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("1d float 1024 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_float_init1d(512, fin, fout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("1d float 512 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_double_init2d(2048, 64, din, dout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("2d double 2048x64 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_double_init2d(128, 128, din, dout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("2d double 128x128 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_float_init2d(2048, 64, fin, fout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("2d float 2048x64 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
p = SleefDFT_float_init2d(128, 128, fin, fout, mode);
|
||||
SleefDFT_getPath(p, s.data(), s.size());
|
||||
v.push_back("2d float 128x128 : " + string(s.data()));
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
Sleef_free(din);
|
||||
Sleef_free(dout);
|
||||
Sleef_free(fin);
|
||||
Sleef_free(fout);
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
void compare(vector<string> &runa, vector<string> &runb) {
|
||||
if (runa.size() != runb.size()) {
|
||||
cerr << "Lengths do not match" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
for(size_t i=0;i<runa.size();i++) {
|
||||
if (runa[i] != runb[i]) {
|
||||
cerr << "Paths do not match" << endl;
|
||||
cerr << runa[i] << endl;
|
||||
cerr << runb[i] << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 3) exit(-1);
|
||||
|
||||
string fn1 = argv[1], fn2 = argv[2];
|
||||
|
||||
#ifdef MEASURE
|
||||
#ifdef MULTITHREAD
|
||||
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE;
|
||||
#else
|
||||
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE | SLEEF_MODE_NO_MT;
|
||||
#endif
|
||||
#else
|
||||
#ifdef MULTITHREAD
|
||||
int mode = SLEEF_MODE_ESTIMATE | SLEEF_MODE_VERBOSE;
|
||||
#else
|
||||
int mode = SLEEF_MODE_ESTIMATE | SLEEF_MODE_VERBOSE | SLEEF_MODE_NO_MT;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int planMode = argc == 1 ? 0 : SLEEF_PLAN_AUTOMATIC;
|
||||
|
||||
//
|
||||
|
||||
cerr << "Run 0" << endl;
|
||||
|
||||
SleefDFT_setPlanFilePath(fn1.c_str(), NULL, planMode);
|
||||
|
||||
auto run0 = doTransform(mode);
|
||||
|
||||
cerr << endl << "Run 1" << endl;
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
SleefDFT_setPlanFilePath(fn2.c_str(), NULL, planMode);
|
||||
|
||||
auto run1 = doTransform(mode);
|
||||
|
||||
cerr << endl << "Run 2" << endl;
|
||||
|
||||
SleefDFT_setPlanFilePath(fn1.c_str(), NULL, planMode);
|
||||
|
||||
auto run2 = doTransform(mode);
|
||||
|
||||
compare(run0, run2);
|
||||
|
||||
#ifdef MEASURE
|
||||
SleefDFT_savePlan("manual.plan");
|
||||
#endif
|
||||
|
||||
cerr << endl << "Run 3" << endl;
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
SleefDFT_setPlanFilePath(fn2.c_str(), NULL, planMode);
|
||||
|
||||
auto run3 = doTransform(mode);
|
||||
|
||||
compare(run1, run3);
|
||||
|
||||
#ifdef MEASURE
|
||||
cerr << endl << "Run 4" << endl;
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
SleefDFT_setPlanFilePath("manual.plan", NULL, planMode);
|
||||
|
||||
auto run4 = doTransform(mode);
|
||||
|
||||
compare(run0, run4);
|
||||
#endif
|
||||
|
||||
cerr << "OK" << endl;
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -11,7 +11,21 @@ if (SLEEFDFT_MAXBUTWIDTH GREATER 7)
|
||||
message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." )
|
||||
endif()
|
||||
|
||||
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
|
||||
set(SLEEFDFT_MINSHIFT 1 CACHE STRING "Min hardcoded shift")
|
||||
set(SLEEFDFT_MAXSHIFT 1 CACHE STRING "Max hardcoded shift")
|
||||
|
||||
if ((${SLEEFDFT_MINSHIFT} GREATER ${SLEEFDFT_MAXSHIFT}) OR (${SLEEFDFT_MINSHIFT} LESS 1))
|
||||
message(FATAL_ERROR "SLEEFDFT_MINSHIFT, SLEEFDFT_MAXSHIFT range error")
|
||||
endif()
|
||||
|
||||
math(EXPR SLEEFDFT_MAXSHIFT_MINUS_1 "${SLEEFDFT_MAXSHIFT} - 1")
|
||||
if (${SLEEFDFT_MINSHIFT} LESS ${SLEEFDFT_MAXSHIFT})
|
||||
foreach(J RANGE ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT_MINUS_1})
|
||||
list(APPEND LISTSHIFTSTR ${J})
|
||||
endforeach()
|
||||
else()
|
||||
set(LISTSHIFTSTR)
|
||||
endif()
|
||||
|
||||
# Settings
|
||||
|
||||
@ -21,18 +35,14 @@ set(LISTSHORTTYPENAME "dp" "sp")
|
||||
set(LISTLONGTYPENAME "double" "float")
|
||||
set(LISTTYPEID "1" "2")
|
||||
|
||||
set(MACRODEF_vecextdp BASETYPEID=1 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextdp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextsp BASETYPEID=2 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextsp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextld BASETYPEID=3 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextld ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextqp BASETYPEID=4 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextqp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC})
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC} -O0)
|
||||
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC} -O0)
|
||||
endif()
|
||||
set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1)
|
||||
@ -41,10 +51,6 @@ set(MACRODEF_sse2dp BASETYPEID=1 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2dp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_sse2sp BASETYPEID=2 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2sp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_avxdp BASETYPEID=1 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxdp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avxsp BASETYPEID=2 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxsp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avx2dp BASETYPEID=1 ENABLE_AVX2 CONFIG=1)
|
||||
set(CFLAGS_avx2dp ${FLAGS_ENABLE_AVX2})
|
||||
set(MACRODEF_avx2sp BASETYPEID=2 ENABLE_AVX2 CONFIG=1)
|
||||
@ -138,10 +144,6 @@ set(ISALIST_SP purecsp)
|
||||
set(ISALIST_DP purecdp)
|
||||
|
||||
set(LIST_SUPPORTED_FPTYPE 0 1)
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
set(ISALIST_SP vecextsp)
|
||||
set(ISALIST_DP vecextdp)
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
|
||||
# List all available vector data types
|
||||
|
||||
@ -150,11 +152,6 @@ if (COMPILER_SUPPORTS_SSE4)
|
||||
set(ISALIST_DP ${ISALIST_DP} sse2dp)
|
||||
endif(COMPILER_SUPPORTS_SSE4)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX)
|
||||
set(ISALIST_SP ${ISALIST_SP} avxsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avxdp)
|
||||
endif(COMPILER_SUPPORTS_AVX)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX2)
|
||||
set(ISALIST_SP ${ISALIST_SP} avx2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avx2dp)
|
||||
@ -219,7 +216,13 @@ endif()
|
||||
|
||||
# Compiler properties
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
|
||||
if(MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
|
||||
endif()
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
@ -228,7 +231,11 @@ if (BUILD_SHARED_LIBS)
|
||||
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
|
||||
endif()
|
||||
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTH=${SLEEFDFT_MAXBUTWIDTH})
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS}
|
||||
MAXBUTWIDTHDP=${SLEEFDFT_MAXBUTWIDTH} MAXBUTWIDTHSP=${SLEEFDFT_MAXBUTWIDTH}
|
||||
MINSHIFTDP=${SLEEFDFT_MINSHIFT} MAXSHIFTDP=${SLEEFDFT_MAXSHIFT}
|
||||
MINSHIFTSP=${SLEEFDFT_MINSHIFT} MAXSHIFTSP=${SLEEFDFT_MAXSHIFT}
|
||||
)
|
||||
|
||||
if (SLEEFDFT_ENABLE_STREAM)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=1)
|
||||
@ -236,10 +243,6 @@ else()
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=0)
|
||||
endif()
|
||||
|
||||
if(COMPILER_SUPPORTS_OPENMP)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
endif(COMPILER_SUPPORTS_OPENMP)
|
||||
|
||||
|
||||
# Include directories
|
||||
|
||||
@ -269,7 +272,7 @@ endif()
|
||||
|
||||
add_custom_command(OUTPUT dispatchparam.h
|
||||
COMMENT "Generating dispatchparam.h"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ALL ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_SP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
)
|
||||
add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h)
|
||||
@ -282,49 +285,51 @@ foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h
|
||||
string(CONCAT S "dispatch" ${ST} ".hpp") # S is dispatchdp.hpp
|
||||
add_custom_command(OUTPUT ${S}
|
||||
COMMENT "Generating ${S}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S}
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${CST} ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_${CST}} > ${S}
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
)
|
||||
|
||||
string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated
|
||||
string(CONCAT G ${S} "_generated") # G is dispatchdp.hpp_generated
|
||||
add_custom_target(${G} SOURCES ${S})
|
||||
endforeach()
|
||||
|
||||
# Target dftcommon.o
|
||||
|
||||
add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
|
||||
add_library(dftcommon_obj OBJECT dftcommon.cpp dftcommon.hpp ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
|
||||
add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated)
|
||||
set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE)
|
||||
set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
|
||||
# Target dft*.o
|
||||
# Target dft.o
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
add_library(dft_obj OBJECT dft.cpp dftcommon.hpp)
|
||||
add_dependencies(dft_obj "dispatchdp.hpp_generated" "dispatchsp.hpp_generated" dispatchparam.h_generated ${TARGET_HEADERS})
|
||||
set_target_properties(dft_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_definitions(dft_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h"
|
||||
add_library(${G} OBJECT dft.c dftcommon.h ${S})
|
||||
string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated"
|
||||
add_dependencies(${G} ${SG} ${TARGET_HEADERS})
|
||||
set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS})
|
||||
endforeach()
|
||||
# Copy unroll*.cpp.in to ${CMAKE_CURRENT_BINARY_DIR}
|
||||
|
||||
# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR}
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.cpp.in
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.cpp.in ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.cpp.in)
|
||||
add_custom_target(unroll0.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.cpp.in)
|
||||
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org)
|
||||
add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org)
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll1.cpp.in ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll1.cpp.in)
|
||||
add_custom_target(unroll1.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in)
|
||||
|
||||
# Target unroll*.c
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in)
|
||||
add_custom_target(unroll2.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in)
|
||||
|
||||
# Target unroll*.cpp
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
@ -333,7 +338,7 @@ foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
string(CONCAT UC unroll_ ${N} _ ${E} ".cpp") # UC is "unroll_0_sse2dp.cpp"
|
||||
set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC})
|
||||
endforeach()
|
||||
endforeach()
|
||||
@ -342,11 +347,31 @@ foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
if(UNROLL_TARGET_${CST})
|
||||
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}}
|
||||
COMMENT "Generating ${UNROLL_TARGET_${CST}}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> ${LT} ${ISALIST_${CST}}
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> unroll0.cpp.in ${LT} ${CST} - ${ISALIST_${CST}}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied
|
||||
DEPENDS ${TARGET_MKUNROLL} unroll0.cpp.in.copied
|
||||
)
|
||||
add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}})
|
||||
|
||||
#
|
||||
|
||||
foreach(I ${LISTSHIFTSTR})
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UC unroll_ ${N} _ ${E} _ ${I} ".cpp") # UC is "unroll_0_sse2dp_1.cpp"
|
||||
set(UNROLL_TARGET_${CST}_${I} ${UNROLL_TARGET_${CST}_${I}} ${UC})
|
||||
endforeach()
|
||||
endforeach()
|
||||
message(STATUS "Unroll target for ${CST}_${I} : ${UNROLL_TARGET_${CST}_${I}}")
|
||||
|
||||
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}_${I}}
|
||||
COMMENT "Generating ${UNROLL_TARGET_${CST}_${I}}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> unroll1.cpp.in ${LT} ${CST} ${I} ${ISALIST_${CST}}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${TARGET_MKUNROLL} unroll1.cpp.in.copied
|
||||
)
|
||||
add_custom_target(unroll_target_${ST}_${I} DEPENDS ${UNROLL_TARGET_${CST}_${I}})
|
||||
endforeach()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
@ -359,43 +384,38 @@ foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
|
||||
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
|
||||
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj"
|
||||
string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp.cpp"
|
||||
add_library(${UG} OBJECT ${UC})
|
||||
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
|
||||
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
|
||||
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST})
|
||||
list(APPEND UNROLL_OBJECTS $<TARGET_OBJECTS:${UG}>)
|
||||
|
||||
foreach(I ${LISTSHIFTSTR})
|
||||
string(CONCAT U unroll_ ${N} _ ${E} _ ${I}) # U is "unroll_0_sse2dp_1"
|
||||
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_1_obj"
|
||||
string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp_1.cpp"
|
||||
add_library(${UG} OBJECT ${UC})
|
||||
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
|
||||
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
|
||||
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST}_${I})
|
||||
list(APPEND UNROLL_OBJECTS $<TARGET_OBJECTS:${UG}>)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
# Target libdft
|
||||
|
||||
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:${TARGET_LIBARRAYMAP_OBJ}>)
|
||||
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:dft_obj> ${UNROLL_OBJECTS})
|
||||
target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM})
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${G}>)
|
||||
endforeach()
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # U is "unroll_0_sse2dp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${UG}>)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
set_target_properties(${TARGET_LIBDFT} PROPERTIES
|
||||
VERSION ${SLEEF_VERSION}
|
||||
SOVERSION ${SLEEF_SOVERSION}
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/file.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); }
|
||||
static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); }
|
||||
static void FTRUNCATE(FILE *fp, off_t z) {
|
||||
if (ftruncate(fileno(fp), z))
|
||||
;
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return tmpfile(); }
|
||||
static void CLOSETMPFILE(FILE *fp) { fclose(fp); }
|
||||
|
||||
static sigjmp_buf sigjmp;
|
||||
#define SETJMP(x) sigsetjmp(x, 1)
|
||||
#define LONGJMP siglongjmp
|
||||
|
||||
#else
|
||||
|
||||
#include <Windows.h>
|
||||
#include <io.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { }
|
||||
static void FUNLOCK(FILE *fp) { }
|
||||
static void FTRUNCATE(FILE *fp, long z) {
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); }
|
||||
static void CLOSETMPFILE(FILE *fp) {
|
||||
fclose(fp);
|
||||
remove("tmpfile.txt");
|
||||
}
|
||||
|
||||
static jmp_buf sigjmp;
|
||||
#define SETJMP(x) setjmp(x)
|
||||
#define LONGJMP longjmp
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,423 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "misc.h"
|
||||
#include "sleef.h"
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
#include "dispatchparam.h"
|
||||
#include "dftcommon.h"
|
||||
#include "common.h"
|
||||
#include "arraymap.h"
|
||||
|
||||
#define MAGIC_FLOAT 0x31415926
|
||||
#define MAGIC_DOUBLE 0x27182818
|
||||
|
||||
#define MAGIC2D_FLOAT 0x22360679
|
||||
#define MAGIC2D_DOUBLE 0x17320508
|
||||
|
||||
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
|
||||
|
||||
static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) {
|
||||
int pathLen = 0, l2l = 0;
|
||||
|
||||
for(;;) {
|
||||
while(*p == ' ') p++;
|
||||
if (*p == '\0') break;
|
||||
if (!isdigit((int)*p)) return -1;
|
||||
|
||||
pathLen++;
|
||||
if (pathLen >= pathLenMax) return -2;
|
||||
|
||||
int n = 0;
|
||||
while(isdigit((int)*p)) n = n * 10 + *p++ - '0';
|
||||
|
||||
if (n > MAXBUTWIDTH) return -6;
|
||||
path[pathLen-1] = n;
|
||||
l2l += n;
|
||||
config[pathLen-1] = 0;
|
||||
|
||||
if (*p != '(') continue;
|
||||
|
||||
int c;
|
||||
for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break;
|
||||
if (c == -1) return -3;
|
||||
p += strlen(configStr[c]) + 1;
|
||||
if (*p != ')') return -4;
|
||||
p++;
|
||||
|
||||
config[pathLen-1] = c;
|
||||
}
|
||||
|
||||
if (l2l != log2len) return -5;
|
||||
|
||||
return pathLen;
|
||||
}
|
||||
|
||||
// Installs a user-supplied execution path on a 1D plan.
//
// pathStr is parsed by parsePathStr(); on a parse error the plan is left
// untouched (the error is printed only in verbose mode).
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  const int verbose = (p->mode & SLEEF_MODE_VERBOSE) != 0;

  int widths[32], configs[32];
  const int nEntries = parsePathStr(pathStr, widths, configs, 31, p->log2len);

  if (nEntries < 0) {
    if (verbose) printf("Error %d in parsing path string : %s\n", nEntries, pathStr);
    return;
  }

  // Clear the previous path
  for(uint32_t lv = 0;lv <= p->log2len;lv++) p->bestPath[lv] = 0;

  // Each entry is stored at the level it starts from; the level then drops
  // by the width of that entry.
  int level = p->log2len;
  for(int k = 0;k < nEntries && level > 0;k++) {
    p->bestPath[level] = widths[k];
    p->bestPathConfig[level] = configs[k];
    level -= widths[k];
  }

  // pathLen is the number of levels that carry an entry
  p->pathLen = 0;
  for(int lv = p->log2len;lv >= 0;lv--) {
    if (p->bestPath[lv] != 0) p->pathLen++;
  }

  if (verbose) {
    printf("Set path : ");
    for(int lv = p->log2len;lv >= 0;lv--) {
      if (p->bestPath[lv] == 0) continue;
      printf("%d(%s) ", p->bestPath[lv], configStr[p->bestPathConfig[lv]]);
    }
    printf("\n");
  }
}
|
||||
|
||||
// Releases every table reachable through p->tbl[N][level] for
// N = 1..MAXBUTWIDTH and level = N..log2len, then the per-N pointer arrays.
void freeTables(SleefDFT *p) {
  int N = 1;
  while (N <= MAXBUTWIDTH) {
    uint32_t level = N;
    while (level <= p->log2len) {
      Sleef_free(p->tbl[N][level]);
      level++;
    }

    free(p->tbl[N]);
    p->tbl[N] = NULL;
    N++;
  }
}
|
||||
|
||||
// Destroys a plan created by this library, 1D or 2D, including everything
// it owns. The magic field is cleared before the struct is freed.
EXPORT void SleefDFT_dispose(SleefDFT *p) {
  const int is2D = p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE);

  if (is2D) {
    Sleef_free(p->tBuf);
    SleefDFT_dispose(p->instH);
    // instV is disposed only when the two lengths differ
    // (presumably it aliases instH otherwise -- confirm at the creation site)
    if (p->hlen != p->vlen) SleefDFT_dispose(p->instV);

    p->magic = 0;
    free(p);
    return;
  }

  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  // Plans with log2len <= 1 own nothing besides the struct itself
  if (p->log2len > 1) {
    if ((p->mode & SLEEF_MODE_REAL) != 0) {
      Sleef_free(p->rtCoef1);
      Sleef_free(p->rtCoef0);
      p->rtCoef0 = p->rtCoef1 = NULL;
    }

    int level = p->log2len;
    while (level >= 1) {
      Sleef_free(p->perm[level]);
      level--;
    }
    free(p->perm);
    p->perm = NULL;

    freeTables(p);
  }

  p->magic = 0;
  free(p);
}
|
||||
|
||||
// Returns floor(log2(q)) for q >= 1, i.e. the index of the highest set bit.
// For q == 0 the unsigned arithmetic wraps and the result is 0xffffffff.
uint32_t ilog2(uint32_t q) {
  // bitLen[v] = number of significant bits of the 4-bit value v
  static const uint32_t bitLen[16] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};

  // Step 1: pick the 16-bit half that holds the highest set bit
  const uint32_t base = (q & 0xffff0000) ? 16 : 0;
  q >>= base;

  // Step 2: smear every bit over the three bits below it, so that bit 4*i
  // of 'smear' tells whether nibble i of q is non-zero (i = 1, 2, 3)
  uint32_t smear = q | (q >> 1);
  smear |= smear >> 2;

  uint32_t nibbleMask = 0;
  if (smear & 0x10)   nibbleMask |= 1;
  if (smear & 0x100)  nibbleMask |= 2;
  if (smear & 0x1000) nibbleMask |= 4;

  // Step 3: bitLen[nibbleMask] is the index of the highest non-zero nibble;
  // finish with a lookup of that nibble's own bit length
  const uint32_t shift = bitLen[nibbleMask] * 4;

  return base + shift + bitLen[q >> shift] - 1;
}
|
||||
|
||||
//
|
||||
|
||||
char *dftPlanFilePath = NULL;
|
||||
char *archID = NULL;
|
||||
uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR;
|
||||
ArrayMap *planMap = NULL;
|
||||
int planFilePathSet = 0, planFileLoaded = 0;
|
||||
#ifdef _OPENMP
|
||||
omp_lock_t planMapLock;
|
||||
int planMapLockInitialized = 0;
|
||||
#endif
|
||||
|
||||
// Initialises planMapLock exactly once. The check-and-set runs inside an
// OpenMP critical section; without OpenMP this function does nothing.
static void initPlanMapLock() {
#ifdef _OPENMP
#pragma omp critical
  {
    if (planMapLockInitialized == 0) {
      planMapLockInitialized = 1;
      omp_init_lock(&planMapLock);
    }
  }
#endif
}
|
||||
|
||||
static void planMap_clear() {
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
planMap = NULL;
|
||||
}
|
||||
|
||||
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
|
||||
initPlanMapLock();
|
||||
|
||||
if ((mode & SLEEF_PLAN_RESET) != 0) {
|
||||
planMap_clear();
|
||||
planFileLoaded = 0;
|
||||
planFilePathSet = 0;
|
||||
}
|
||||
|
||||
if (dftPlanFilePath != NULL) free(dftPlanFilePath);
|
||||
if (path != NULL) {
|
||||
dftPlanFilePath = malloc(strlen(path)+10);
|
||||
strcpy(dftPlanFilePath, path);
|
||||
} else {
|
||||
dftPlanFilePath = NULL;
|
||||
}
|
||||
|
||||
if (archID != NULL) free(archID);
|
||||
if (arch == NULL) arch = Sleef_getCpuIdString();
|
||||
archID = malloc(strlen(arch)+10);
|
||||
strcpy(archID, arch);
|
||||
|
||||
planMode = mode;
|
||||
planFilePathSet = 1;
|
||||
}
|
||||
|
||||
static void loadPlanFromFile() {
|
||||
if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) {
|
||||
char *s = getenv(ENVVAR);
|
||||
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode);
|
||||
}
|
||||
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
|
||||
if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) {
|
||||
planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0);
|
||||
}
|
||||
|
||||
if (planMap == NULL) planMap = initArrayMap();
|
||||
|
||||
planFileLoaded = 1;
|
||||
}
|
||||
|
||||
static void savePlanToFile() {
|
||||
assert(planFileLoaded);
|
||||
if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) {
|
||||
ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID);
|
||||
}
|
||||
}
|
||||
|
||||
// Bit widths of the fields packed into a plan-map key by the key*() helpers
#define CATBIT        8   // key category
#define BASETYPEIDBIT 2   // base type ID
#define LOG2LENBIT    8   // log2 of the transform length
#define DIRBIT        1   // direction flag

#define BUTSTATBIT    16  // butStat argument of keyButStat()
|
||||
|
||||
static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 0;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
// Further key field widths used by keyTrans(), keyPath() and keyPathConfig()
#define LEVELBIT       LOG2LENBIT  // level within a path
#define BUTCONFIGBIT   8           // 'config' argument of keyPath*/()
#define TRANSCONFIGBIT 8           // 'transConfig' argument of keyTrans()
|
||||
|
||||
static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) {
|
||||
int max = MAX(hlen, vlen), min = MIN(hlen, vlen);
|
||||
int cat = 2;
|
||||
uint64_t k = 0;
|
||||
k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT));
|
||||
k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 3;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 4;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t planMap_getU64(uint64_t key) {
|
||||
char *s = ArrayMap_get(planMap, key);
|
||||
if (s == NULL) return 0;
|
||||
uint64_t ret;
|
||||
if (sscanf(s, "%" SCNx64, &ret) != 1) return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void planMap_putU64(uint64_t key, uint64_t value) {
|
||||
char *s = malloc(100);
|
||||
sprintf(s, "%" PRIx64, value);
|
||||
s = ArrayMap_put(planMap, key, s);
|
||||
if (s != NULL) free(s);
|
||||
}
|
||||
|
||||
// Loads the stored path of category 'pathCat' for a 1D plan into
// p->bestPath / p->bestPathConfig and recomputes p->pathLen.
//
// Returns 0 when no result is stored (p is left untouched) or when a stored
// width exceeds MAXBUTWIDTH, 1 otherwise. planMap is accessed under
// planMapLock when built with OpenMP.
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  int ret = 0;
  int stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10));

  if (stat != 0) {
    ret = 1;

    for(int lv = p->log2len;lv >= 0;lv--) {
      p->bestPath[lv] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, lv, pathCat));
      p->bestPathConfig[lv] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, lv, pathCat));
      if (p->bestPath[lv] > MAXBUTWIDTH) ret = 0;
    }

    p->pathLen = 0;
    for(int lv = p->log2len;lv >= 0;lv--) {
      if (p->bestPath[lv] != 0) p->pathLen++;
    }
  }

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return ret;
}
|
||||
|
||||
// Stores the current path of a 1D plan under category 'pathCat', unless a
// result for this plan shape and category is already recorded. The plan
// file is rewritten afterwards unless the plan mode is read-only.
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // A non-zero value under this key marks the path as already stored
  const uint64_t storedKey = keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10);

  if (planMap_getU64(storedKey) == 0) {
    for(int lv = p->log2len;lv >= 0;lv--) {
      planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, lv, pathCat), p->bestPath[lv]);
      planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, lv, pathCat), p->bestPathConfig[lv]);
    }

    planMap_putU64(storedKey, 1);

    if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();
  }

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
|
||||
// Loads the two stored transpose measurements (transConfig 0 and 1) of a
// 2D plan into p->tmNoMT and p->tmMT. Returns non-zero when the first one
// is non-zero.
int PlanManager_loadMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  const uint64_t keyNoMT = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0);
  const uint64_t keyMT   = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1);

  p->tmNoMT = planMap_getU64(keyNoMT);
  p->tmMT   = planMap_getU64(keyMT);

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return p->tmNoMT != 0;
}
|
||||
|
||||
// Stores p->tmNoMT and p->tmMT of a 2D plan (transConfig 0 and 1) and
// rewrites the plan file unless the plan mode is read-only.
void PlanManager_saveMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  const uint64_t keyNoMT = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0);
  planMap_putU64(keyNoMT, p->tmNoMT);

  const uint64_t keyMT = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1);
  planMap_putU64(keyMT, p->tmMT);

  if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
@ -0,0 +1,517 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cctype>
|
||||
#include <cinttypes>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
#include "compat.h"
|
||||
#include "misc.h"
|
||||
#include "sleef.h"
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
#include "dftcommon.hpp"
|
||||
#include "common.h"
|
||||
#include "serializer.hpp"
|
||||
|
||||
// Printable names of the four butterfly configurations, indexed by the
// 'config' value of an Action (used when parsing and printing paths).
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
vector<Action> SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::parsePathStr(const char *p) {
|
||||
vector<Action> v;
|
||||
|
||||
int level = log2len;
|
||||
for(;;) {
|
||||
while(isspace((int)*p)) p++;
|
||||
if (*p == '\0') break;
|
||||
if (!isdigit((int)*p)) throw(runtime_error("Unexpected character"));
|
||||
|
||||
int N = 0;
|
||||
while(isdigit((int)*p)) N = N * 10 + *p++ - '0';
|
||||
|
||||
if (N > MAXBUTWIDTHALL) throw(runtime_error("N too large"));
|
||||
if (N > level) throw(runtime_error("N larger than level"));
|
||||
|
||||
int config = 0;
|
||||
if (*p == '(') {
|
||||
p++;
|
||||
|
||||
for(config=3;config>=0;config--) {
|
||||
if (strncmp(p, configStr[config], strlen(configStr[config])) == 0) break;
|
||||
}
|
||||
if (config == -1) throw(runtime_error("Unknown config"));
|
||||
p += strlen(configStr[config]);
|
||||
if (*p++ != ')') throw(runtime_error("No ')' after config"));
|
||||
}
|
||||
|
||||
v.push_back(Action(config, level, N));
|
||||
level -= N;
|
||||
}
|
||||
|
||||
if (level != 0) throw(runtime_error("Sum of N less than level"));
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Formats a path as "N(config) N(config) ..." with a trailing space after
// every entry. A config outside 0..3 is printed as "? <number>".
static string to_string(vector<Action> v) {
  string text;

  for(const Action &action : v) {
    const bool known = 0 <= action.config && action.config < 4;
    const string configName =
      known ? string(configStr[action.config]) : "? " + to_string(action.config);

    text += to_string(action.N);
    text += "(";
    text += configName;
    text += ") ";
  }

  return text;
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
void SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::setPath(const char *pathStr) {
|
||||
assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE);
|
||||
|
||||
try {
|
||||
bestPath = parsePathStr(pathStr);
|
||||
|
||||
if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "Set path : %s\n", to_string(bestPath).c_str());
|
||||
} catch(exception &ex) {
|
||||
if ((mode & SLEEF_MODE_VERBOSE) != 0) fprintf(verboseFP, "Parse error : %s\n", ex.what());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
void SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::setPath(const char *pathStr) {
|
||||
assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE);
|
||||
int planMT_ = 0;
|
||||
if (sscanf(pathStr, "%d", &planMT_) != 1) return;
|
||||
planMT = planMT_;
|
||||
|
||||
string pathH = pathStr;
|
||||
size_t cpos = pathH.find_first_of(':');
|
||||
if (cpos == string::npos) return;
|
||||
pathH = pathH.substr(cpos + 1);
|
||||
|
||||
cpos = pathH.find_first_of(',');
|
||||
if (cpos == string::npos) return;
|
||||
string pathV = pathH.substr(cpos+1);
|
||||
pathH = pathH.substr(0, cpos);
|
||||
|
||||
instH->setPath(pathH.c_str());
|
||||
instV->setPath(pathV.c_str());
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
string SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::getPath() {
|
||||
assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE);
|
||||
return to_string(bestPath);
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
string SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::getPath() {
|
||||
assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE);
|
||||
return to_string((int)planMT) + ":" +
|
||||
instH->getPath() + "," + instV->getPath();
|
||||
}
|
||||
|
||||
// C entry point: forwards pathStr to setPath() of the plan object selected
// by p->magic. Aborts when the magic number is not recognised.
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
  assert(p != NULL);

  const auto kind = p->magic;

  if (kind == MAGIC_DOUBLE) {
    p->double_->setPath(pathStr);
  } else if (kind == MAGIC_FLOAT) {
    p->float_->setPath(pathStr);
  } else if (kind == MAGIC2D_DOUBLE) {
    p->double2d_->setPath(pathStr);
  } else if (kind == MAGIC2D_FLOAT) {
    p->float2d_->setPath(pathStr);
  } else {
    abort();
  }
}
|
||||
|
||||
// C entry point: copies the textual path of the plan into pathStr.
//
//   p           : plan (1D or 2D); aborts on an unknown magic number
//   pathStr     : destination buffer
//   pathStrSize : size of the destination buffer in bytes
//
// The result is truncated to fit and is always NUL-terminated when
// pathStrSize > 0. Returns the length of the string stored in pathStr, or 0
// when nothing can be stored (NULL buffer or non-positive size).
EXPORT int SleefDFT_getPath(SleefDFT *p, char *pathStr, int pathStrSize) {
  assert(p != NULL);

  string str;
  switch(p->magic) {
  case MAGIC_DOUBLE:
    str = p->double_->getPath();
    break;
  case MAGIC_FLOAT:
    str = p->float_->getPath();
    break;
  case MAGIC2D_DOUBLE:
    str = p->double2d_->getPath();
    break;
  case MAGIC2D_FLOAT:
    str = p->float2d_->getPath();
    break;
  default: abort();
  }

  if (pathStr == NULL || pathStrSize <= 0) return 0;

  // strncpy does not terminate the destination when the source fills it, so
  // copy one byte less and terminate explicitly; otherwise the strlen below
  // would read past the end of the buffer.
  strncpy(pathStr, str.c_str(), (size_t)pathStrSize - 1);
  pathStr[pathStrSize - 1] = '\0';

  return (int)strlen(pathStr);
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
void SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::freeTables() {
|
||||
for(int N=1;N<=MAXBUTWIDTH;N++) {
|
||||
for(uint32_t level=N;level<=log2len;level++) {
|
||||
Sleef_free(tbl[N][level]);
|
||||
tbl[N][level] = nullptr;
|
||||
}
|
||||
free(tbl[N]);
|
||||
tbl[N] = NULL;
|
||||
}
|
||||
|
||||
for(int i=0;i<nThread;i++) {
|
||||
Sleef_free(x1[i]);
|
||||
x1[i] = nullptr;
|
||||
Sleef_free(x0[i]);
|
||||
x0[i] = nullptr;
|
||||
}
|
||||
|
||||
free(x1);
|
||||
x1 = nullptr;
|
||||
free(x0);
|
||||
x0 = nullptr;
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::~SleefDFTXX() {
|
||||
assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE);
|
||||
|
||||
if (log2len <= 1) {
|
||||
magic = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if ((mode & SLEEF_MODE_REAL) != 0) {
|
||||
Sleef_free(rtCoef1);
|
||||
rtCoef1 = nullptr;
|
||||
Sleef_free(rtCoef0);
|
||||
rtCoef0 = nullptr;
|
||||
}
|
||||
|
||||
for(int level = log2len;level >= 1;level--) {
|
||||
Sleef_free(perm[level]);
|
||||
perm[level] = nullptr;
|
||||
}
|
||||
free(perm);
|
||||
perm = NULL;
|
||||
|
||||
freeTables();
|
||||
|
||||
magic = 0;
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::~SleefDFT2DXX() {
|
||||
assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE);
|
||||
|
||||
Sleef_free(tBuf);
|
||||
tBuf = nullptr;
|
||||
delete instH;
|
||||
instH = nullptr;
|
||||
if (hlen != vlen) {
|
||||
delete instV;
|
||||
instV = nullptr;
|
||||
}
|
||||
|
||||
magic = 0;
|
||||
}
|
||||
|
||||
// C entry point: destroys the plan object selected by p->magic, clears the
// handle and frees it. Aborts when the magic number is not recognised.
EXPORT void SleefDFT_dispose(SleefDFT *p) {
  assert(p != NULL);

  // Each case nulls the same member it deletes (the 2D cases used to null
  // double_ / float_ instead).
  switch(p->magic) {
  case MAGIC_DOUBLE:
    delete p->double_;
    p->double_ = nullptr;
    break;
  case MAGIC2D_DOUBLE:
    delete p->double2d_;
    p->double2d_ = nullptr;
    break;
  case MAGIC_FLOAT:
    delete p->float_;
    p->float_ = nullptr;
    break;
  case MAGIC2D_FLOAT:
    delete p->float2d_;
    p->float2d_ = nullptr;
    break;
  default: abort();
  }

  p->magic = 0;
  free(p);
}
|
||||
|
||||
// PlanManager
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
string SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::planKeyString(string suffix) {
|
||||
string s;
|
||||
s += baseTypeID == 1 ? "D" : "S";
|
||||
s += (mode & SLEEF_MODE_REAL) ? "r" : "c";
|
||||
s += (mode & SLEEF_MODE_BACKWARD) ? "b" : "f";
|
||||
s += (mode & SLEEF_MODE_ALT) ? "o" : "w";
|
||||
s += (mode & SLEEF_MODE_NO_MT) ? "s" : "m";
|
||||
s += to_string(log2len) + "," + "0";
|
||||
if (suffix != "") s += ":" + suffix;
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
string SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::planKeyString(string suffix) {
|
||||
string s;
|
||||
s += baseTypeID == 1 ? "D" : "S";
|
||||
s += (mode & SLEEF_MODE_REAL) ? "r" : "c";
|
||||
s += (mode & SLEEF_MODE_BACKWARD) ? "b" : "f";
|
||||
s += (mode & SLEEF_MODE_ALT) ? "o" : "w";
|
||||
s += (mode & SLEEF_MODE_NO_MT) ? "s" : "m";
|
||||
s += to_string(log2hlen) + "," + to_string(log2vlen);
|
||||
if (suffix != "") s += ":" + suffix;
|
||||
return s;
|
||||
}
|
||||
|
||||
// Returns the build-dependent prefix of a plan ID: 's' or 'n' (ENABLE_STREAM
// defined or not) followed by eight comma-separated build constants and a
// closing ':'.
static string getPlanIdPrefix() {
#ifdef ENABLE_STREAM
  const char streamTag = 's';
#else
  const char streamTag = 'n';
#endif

  return string(1, streamTag)
    + to_string(CONFIGMAX) + ","
    + to_string(ISAMAX) + ","
    + to_string(MAXBUTWIDTHDP) + ","
    + to_string(MAXBUTWIDTHSP) + ","
    + to_string(MINSHIFTDP) + ","
    + to_string(MAXSHIFTDP) + ","
    + to_string(MINSHIFTSP) + ","
    + to_string(MAXSHIFTSP) + ":";
}
|
||||
|
||||
// The default plan ID is the build prefix followed by the CPU ID string.
PlanManager::PlanManager() {
  planID = getPlanIdPrefix();
  planID += Sleef_getCpuIdString();
}
|
||||
|
||||
void PlanManager::setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
|
||||
planMode_ = mode;
|
||||
|
||||
dftPlanFilePath = "";
|
||||
if (path != NULL) dftPlanFilePath = path;
|
||||
|
||||
planID = Sleef_getCpuIdString();
|
||||
if (arch != NULL) planID = arch;
|
||||
planID = getPlanIdPrefix() + planID;
|
||||
|
||||
if ((mode & SLEEF_PLAN_RESET) != 0) std::get<0>(thePlan)[planID].clear();
|
||||
}
|
||||
|
||||
void PlanManager::loadPlanFromFile() {
|
||||
if ((planMode_ & SLEEF_PLAN_REFERTOENVVAR) != 0) {
|
||||
char *s = std::getenv(ENVVAR);
|
||||
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode_);
|
||||
}
|
||||
|
||||
if (dftPlanFilePath != "") {
|
||||
FILE *fp = fopen(dftPlanFilePath.c_str(), "rb");
|
||||
if (fp) {
|
||||
if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FLOCK(fp);
|
||||
FileDeserializer d(fp);
|
||||
tuple<unordered_map<string, unordered_map<string, string>>, string> plan;
|
||||
try {
|
||||
d >> plan;
|
||||
} catch(exception &ex) {}
|
||||
if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
if (std::get<1>(plan) == PLANFILEID) thePlan = plan;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool PlanManager::savePlanToFile(const string &fn) {
|
||||
if (fn != "") {
|
||||
FILE *fp = fopen(fn.c_str(), "wb");
|
||||
if (fp) {
|
||||
FLOCK(fp);
|
||||
FileSerializer s(fp);
|
||||
std::get<1>(thePlan) = PLANFILEID;
|
||||
s << thePlan;
|
||||
FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool PlanManager::savePlanToFile() {
|
||||
if ((planMode_ & SLEEF_PLAN_READONLY) != 0) return false;
|
||||
return savePlanToFile(dftPlanFilePath);
|
||||
}
|
||||
|
||||
bool PlanManager::loadAndPutToFile(const string& key, const string& value) {
|
||||
if ((planMode_ & SLEEF_PLAN_REFERTOENVVAR) != 0) {
|
||||
char *s = std::getenv(ENVVAR);
|
||||
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode_);
|
||||
}
|
||||
|
||||
if (dftPlanFilePath != "") {
|
||||
FILE *fp = fopen(dftPlanFilePath.c_str(), "r+b");
|
||||
if (!fp) fp = fopen(dftPlanFilePath.c_str(), "w+b");
|
||||
if (fp) {
|
||||
if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FLOCK(fp);
|
||||
fseek(fp, 0, SEEK_END);
|
||||
if (ftell(fp) != 0) {
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
FileDeserializer d(fp);
|
||||
tuple<unordered_map<string, unordered_map<string, string>>, string> plan;
|
||||
try {
|
||||
d >> plan;
|
||||
} catch(exception &ex) {}
|
||||
if (std::get<1>(plan) == PLANFILEID) thePlan = plan;
|
||||
}
|
||||
|
||||
std::get<0>(thePlan)[planID][key] = value;
|
||||
std::get<1>(thePlan) = PLANFILEID;
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
FileSerializer s(fp);
|
||||
s << thePlan;
|
||||
if (!(planMode_ & SLEEF_PLAN_NOLOCK)) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// C entry point: forwards to PlanManager::setPlanFilePath() of the global
// plan manager.
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
  PlanManager &manager = planManager;
  manager.setPlanFilePath(path, arch, mode);
}
|
||||
|
||||
// C entry point: saves the in-memory plan to the file pathStr.
// Returns 1 on success, 0 when pathStr is NULL or empty or the file cannot
// be opened.
EXPORT int SleefDFT_savePlan(const char *pathStr) {
  // std::string must not be constructed from a null pointer
  if (pathStr == NULL) return 0;

  return (int)planManager.savePlanToFile(pathStr);
}
|
||||
|
||||
// Returns the value stored under 'key' for the current plan ID, or "" when
// there is none. The per-ID map is created if it does not exist yet.
string PlanManager::get(const string& key) {
  const auto &entries = std::get<0>(thePlan)[planID];

  const auto found = entries.find(key);
  if (found == entries.end()) return "";

  return found->second;
}
|
||||
|
||||
// Stores key -> value for the current plan ID (in memory only).
void PlanManager::put(const string& key, const string& value) {
  auto &entries = std::get<0>(thePlan)[planID];
  entries[key] = value;
}
|
||||
|
||||
//
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
void SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::saveMeasurementResults() {
|
||||
assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE);
|
||||
|
||||
unique_lock<recursive_mutex> lock(planManager.mtx);
|
||||
|
||||
if ((planManager.planMode() & SLEEF_PLAN_AUTOMATIC) != 0) {
|
||||
if (planManager.loadAndPutToFile(planKeyString(), getPath()) && (mode & SLEEF_MODE_VERBOSE) != 0) {
|
||||
fprintf(verboseFP, "Saving plan to file\n");
|
||||
}
|
||||
} else {
|
||||
planManager.put(planKeyString(), getPath());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
void SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::saveMeasurementResults() {
|
||||
assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE);
|
||||
|
||||
unique_lock<recursive_mutex> lock(planManager.mtx);
|
||||
|
||||
if ((planManager.planMode() & SLEEF_PLAN_AUTOMATIC) != 0) {
|
||||
if (planManager.loadAndPutToFile(planKeyString(), getPath()) && (mode & SLEEF_MODE_VERBOSE) != 0) {
|
||||
fprintf(verboseFP, "Saving plan to file\n");
|
||||
}
|
||||
} else {
|
||||
planManager.put(planKeyString(), getPath());
|
||||
}
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
bool SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::loadMeasurementResults() {
|
||||
assert(magic == MAGIC_FLOAT || magic == MAGIC_DOUBLE);
|
||||
|
||||
unique_lock<recursive_mutex> lock(planManager.mtx);
|
||||
|
||||
planManager.loadPlanFromFile();
|
||||
|
||||
string path = planManager.get(planKeyString());
|
||||
if (path == "") return false;
|
||||
|
||||
setPath(path.c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
bool SleefDFT2DXX<real, real2, MAXSHIFT, MAXBUTWIDTH>::loadMeasurementResults() {
|
||||
assert(magic == MAGIC2D_FLOAT || magic == MAGIC2D_DOUBLE);
|
||||
|
||||
unique_lock<recursive_mutex> lock(planManager.mtx);
|
||||
|
||||
planManager.loadPlanFromFile();
|
||||
|
||||
string path = planManager.get(planKeyString());
|
||||
if (path == "") return false;
|
||||
|
||||
setPath(path.c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Instantiation
|
||||
|
||||
template void SleefDFTXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::freeTables();
|
||||
template void SleefDFTXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::freeTables();
|
||||
template SleefDFTXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::~SleefDFTXX();
|
||||
template SleefDFTXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::~SleefDFTXX();
|
||||
template SleefDFT2DXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::~SleefDFT2DXX();
|
||||
template SleefDFT2DXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::~SleefDFT2DXX();
|
||||
|
||||
template bool SleefDFTXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::loadMeasurementResults();
|
||||
template bool SleefDFTXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::loadMeasurementResults();
|
||||
template void SleefDFTXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::saveMeasurementResults();
|
||||
template void SleefDFTXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::saveMeasurementResults();
|
||||
template bool SleefDFT2DXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::loadMeasurementResults();
|
||||
template bool SleefDFT2DXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::loadMeasurementResults();
|
||||
template void SleefDFT2DXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP>::saveMeasurementResults();
|
||||
template void SleefDFT2DXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP>::saveMeasurementResults();
|
||||
|
||||
// The single plan manager used by the functions in this file; accesses from
// load/saveMeasurementResults() are serialized through planManager.mtx.
PlanManager planManager;
|
||||
|
||||
// Default stream for verbose output; replaced by SleefDFT_setDefaultVerboseFP().
FILE *defaultVerboseFP = stdout;
|
||||
|
||||
// C entry point: replaces the default stream for verbose output.
// The pointer is stored as given; no check is made on fp.
EXPORT void SleefDFT_setDefaultVerboseFP(FILE *fp) {
  FILE *const newStream = fp;
  defaultVerboseFP = newStream;
}
|
||||
@ -1,69 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Butterfly configurations: a configuration number is a combination of the
// two bits below, giving CONFIGMAX distinct values (0..3).
#define CONFIGMAX     4
#define CONFIG_STREAM 1
#define CONFIG_MT     2

#define MAXLOG2LEN    32
|
||||
|
||||
typedef struct SleefDFT {
|
||||
uint32_t magic;
|
||||
uint64_t mode, mode2, mode3;
|
||||
int baseTypeID;
|
||||
const void *in;
|
||||
void *out;
|
||||
|
||||
union {
|
||||
struct {
|
||||
uint32_t log2len;
|
||||
|
||||
void **tbl[MAXBUTWIDTH+1];
|
||||
void *rtCoef0, *rtCoef1;
|
||||
uint32_t **perm;
|
||||
|
||||
void **x0, **x1;
|
||||
|
||||
int isa;
|
||||
int planMode;
|
||||
|
||||
int vecwidth, log2vecwidth;
|
||||
int nThread;
|
||||
|
||||
uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32];
|
||||
uint64_t bestTime;
|
||||
int16_t bestPath[32], bestPathConfig[32], pathLen;
|
||||
};
|
||||
|
||||
struct {
|
||||
int32_t hlen, vlen;
|
||||
int32_t log2hlen, log2vlen;
|
||||
uint64_t tmNoMT, tmMT;
|
||||
struct SleefDFT *instH, *instV;
|
||||
void *tBuf;
|
||||
};
|
||||
};
|
||||
} SleefDFT;
|
||||
|
||||
#define SLEEF_MODE2_MT1D (1 << 0)
|
||||
#define SLEEF_MODE3_MT2D (1 << 0)
|
||||
|
||||
#define PLANFILEID "SLEEFDFT0\n"
|
||||
#define ENVVAR "SLEEFDFTPLAN"
|
||||
|
||||
#define SLEEF_MODE_MEASUREBITS (3 << 20)
|
||||
|
||||
void freeTables(SleefDFT *p);
|
||||
uint32_t ilog2(uint32_t q);
|
||||
|
||||
//int PlanManager_loadMeasurementResultsB(SleefDFT *p);
|
||||
//void PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat);
|
||||
int PlanManager_loadMeasurementResultsT(SleefDFT *p);
|
||||
void PlanManager_saveMeasurementResultsT(SleefDFT *p);
|
||||
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
|
||||
#define GETINT_VECWIDTH 100
|
||||
#define GETINT_DFTPRIORITY 101
|
||||
@ -0,0 +1,237 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <mutex>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "dispatchparam.h"
|
||||
|
||||
// Magic numbers identifying the kind of plan object (1D / 2D, float / double)
#define MAGIC_FLOAT    0x31415926
#define MAGIC_DOUBLE   0x27182818
#define MAGIC2D_FLOAT  0x53589793
#define MAGIC2D_DOUBLE 0x28459045

// Bits of a butterfly configuration number
#define CONFIG_STREAM 1
#define CONFIG_MT     2

#define SLEEF_MODE2_MT1D (1 << 0)
#define SLEEF_MODE3_MT2D (1 << 0)

// ID string stored in and expected from a plan file, and the environment
// variable consulted for the plan file path
#define PLANFILEID "SLEEFDFT1"
#define ENVVAR "SLEEFDFTPLAN"

#define SLEEF_MODE_MEASUREBITS (7 << 20)
#define SLEEF_MODE_INTERNAL_2D (1ULL << 40)

#define GETINT_VECWIDTH    100
#define GETINT_DFTPRIORITY 101

#define MAXLOG2LEN 32

// The product exceeds the range of double
#define INFINITY_ (1e+300 * 1e+300)
|
||||
|
||||
// One step of an execution path: a butterfly of width N applied at 'level'
// with configuration 'config' (an index into configStr[]).
class Action {
public:
  int config;
  int level;
  int N;

  Action(const Action& a) = default;

  Action(int config_, int level_, int N_)
    : config(config_), level(level_), N(N_) {}

  // Two actions are equal when all three fields match
  bool operator==(const Action& rhs) const {
    if (config != rhs.config) return false;
    if (level != rhs.level) return false;
    return N == rhs.N;
  }

  bool operator!=(const Action& rhs) const {
    return !operator==(rhs);
  }

  // Prints the action as "[config, level, N]"
  friend std::ostream& operator<<(std::ostream &os, const Action &ac) {
    os << "[" << ac.config << ", " << ac.level << ", " << ac.N << "]";
    return os;
  }
};
|
||||
|
||||
template <>
|
||||
struct std::hash<Action> {
|
||||
size_t operator()(const Action &a) const {
|
||||
size_t u = 0;
|
||||
u ^= a.config;
|
||||
u = (u << 7) | (u >> ((sizeof(u)*8)-7));
|
||||
u ^= a.level;
|
||||
u = (u << 7) | (u >> ((sizeof(u)*8)-7));
|
||||
u ^= a.N;
|
||||
return u;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
struct SleefDFTXX {
|
||||
int magic;
|
||||
const int baseTypeID;
|
||||
const real * const in;
|
||||
real * const out;
|
||||
const int nThread;
|
||||
const uint32_t log2len;
|
||||
const uint64_t mode;
|
||||
const int minshift;
|
||||
|
||||
uint64_t mode2 = 0, mode3 = 0;
|
||||
|
||||
//
|
||||
|
||||
real **tbl[MAXBUTWIDTH+1];
|
||||
real *rtCoef0, *rtCoef1;
|
||||
uint32_t **perm;
|
||||
|
||||
real **x0, **x1;
|
||||
|
||||
int isa = 0;
|
||||
int planMode = 0;
|
||||
|
||||
int vecwidth, log2vecwidth;
|
||||
|
||||
bool executable[CONFIGMAX][MAXLOG2LEN][MAXLOG2LEN];
|
||||
vector<Action> bestPath;
|
||||
|
||||
FILE *verboseFP = NULL;
|
||||
|
||||
void (*(* const DFTF)[ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int);
|
||||
void (*(* const DFTB)[ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int);
|
||||
void (*(* const TBUTF)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int);
|
||||
void (*(* const TBUTB)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int);
|
||||
void (*(* const BUTF)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int);
|
||||
void (*(* const BUTB)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int);
|
||||
void (** const REALSUB0)(real *, const real *, const int, const real *, const real *);
|
||||
void (** const REALSUB1)(real *, const real *, const int, const real *, const real *, const int);
|
||||
void (*(* const TBUTFS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int);
|
||||
void (*(* const TBUTBS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int);
|
||||
|
||||
SleefDFTXX(uint32_t n, const real *in, real *out, uint64_t mode, const char *baseTypeString, int BASETYPEID_, int MAGIC_, int minshift_,
|
||||
int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real),
|
||||
void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int),
|
||||
void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int),
|
||||
void (*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int),
|
||||
void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int),
|
||||
void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int),
|
||||
void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int),
|
||||
void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *),
|
||||
void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int),
|
||||
void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int),
|
||||
void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int)
|
||||
);
|
||||
|
||||
~SleefDFTXX();
|
||||
|
||||
void dispatch(const int N, real *d, const real *s, const int level, const int config);
|
||||
void execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_);
|
||||
void freeTables();
|
||||
void generatePerm(const vector<Action> &);
|
||||
|
||||
void measurementRun(real *d, const real *s, const vector<Action> &path, uint64_t niter);
|
||||
double measurePath(const vector<Action> &path, uint64_t minTime);
|
||||
void searchForBestPath(int nPaths);
|
||||
void searchForRandomPath();
|
||||
bool measure(bool randomize);
|
||||
|
||||
vector<Action> parsePathStr(const char *);
|
||||
|
||||
string planKeyString(string = "");
|
||||
bool loadMeasurementResults();
|
||||
void saveMeasurementResults();
|
||||
void setPath(const char *pathStr);
|
||||
string getPath();
|
||||
};
|
||||
|
||||
template<typename real, typename real2, int MAXSHIFT, int MAXBUTWIDTH>
|
||||
struct SleefDFT2DXX {
|
||||
int magic;
|
||||
uint64_t mode, mode2, mode3;
|
||||
int baseTypeID;
|
||||
const real *in;
|
||||
real *out;
|
||||
|
||||
//
|
||||
|
||||
int32_t hlen, vlen;
|
||||
int32_t log2hlen, log2vlen;
|
||||
bool planMT;
|
||||
real *tBuf;
|
||||
|
||||
SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH> *instH, *instV;
|
||||
|
||||
FILE *verboseFP = NULL;
|
||||
|
||||
SleefDFT2DXX(uint32_t vlen, uint32_t hlen, const real *in, real *out, uint64_t mode, const char *baseTypeString,
|
||||
int BASETYPEID_, int MAGIC_, int MAGIC2D_, int minshift_,
|
||||
int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real),
|
||||
void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int),
|
||||
void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int),
|
||||
void (*TBUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int),
|
||||
void (*TBUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int),
|
||||
void (*BUTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int),
|
||||
void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int),
|
||||
void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *),
|
||||
void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int),
|
||||
void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int),
|
||||
void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int)
|
||||
);
|
||||
|
||||
~SleefDFT2DXX();
|
||||
|
||||
void execute(const real *s0, real *d0, int MAGIC_, int MAGIC2D_);
|
||||
pair<uint64_t, uint64_t> measureTranspose();
|
||||
double measurePath(SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH> *inst, bool mt,
|
||||
const vector<Action> &path, uint32_t hlen, uint32_t vlen, uint64_t minTime);
|
||||
pair<vector<Action>, double> searchForBestPath(SleefDFTXX<real, real2, MAXSHIFT, MAXBUTWIDTH> *inst, bool mt, uint32_t hlen, uint32_t vlen, int nPaths);
|
||||
|
||||
string planKeyString(string = "");
|
||||
bool loadMeasurementResults();
|
||||
void saveMeasurementResults();
|
||||
void setPath(const char *pathStr);
|
||||
string getPath();
|
||||
};
|
||||
|
||||
struct SleefDFT {
|
||||
uint32_t magic;
|
||||
union {
|
||||
SleefDFTXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP> *double_;
|
||||
SleefDFTXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP> *float_;
|
||||
SleefDFT2DXX<double, Sleef_double2, MAXSHIFTDP, MAXBUTWIDTHDP> *double2d_;
|
||||
SleefDFT2DXX<float, Sleef_float2, MAXSHIFTSP, MAXBUTWIDTHSP> *float2d_;
|
||||
};
|
||||
};
|
||||
|
||||
class PlanManager {
|
||||
string dftPlanFilePath;
|
||||
uint64_t planMode_ = SLEEF_PLAN_REFERTOENVVAR;
|
||||
|
||||
string planID;
|
||||
tuple<unordered_map<string, unordered_map<string, string>>, string> thePlan;
|
||||
|
||||
public:
|
||||
PlanManager();
|
||||
|
||||
recursive_mutex mtx;
|
||||
|
||||
uint64_t planMode() { return planMode_; }
|
||||
|
||||
void setPlanFilePath(const char *path, const char *arch, uint64_t mode);
|
||||
void loadPlanFromFile();
|
||||
bool savePlanToFile(const string &fn);
|
||||
bool savePlanToFile();
|
||||
|
||||
bool loadAndPutToFile(const string& key, const string& value);
|
||||
|
||||
string get(const string& key);
|
||||
void put(const string& key, const string& value);
|
||||
};
|
||||
|
||||
extern PlanManager planManager;
|
||||
extern FILE *defaultVerboseFP;
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -14,13 +14,16 @@
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage : %s <basetype> <unrollmax> <unrollmax2> <maxbutwidth> <isa> ...\n", argv[0]);
|
||||
fprintf(stderr, "Usage : %s <base type> <base type ID> <maxbutwidth> <minshift> <maxshift> <isa> ...\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *basetype = argv[1];
|
||||
const int maxbutwidth = atoi(argv[2]);
|
||||
const int isastart = 3;
|
||||
const char *baseType = argv[1];
|
||||
const char *baseTypeID = argv[2];
|
||||
const int maxbutwidth = atoi(argv[3]);
|
||||
const int minshift = atoi(argv[4]);
|
||||
const int maxshift = atoi(argv[5]);
|
||||
const int isastart = 6;
|
||||
const int isamax = argc - isastart;
|
||||
|
||||
#if ENABLE_STREAM == 1
|
||||
@ -29,13 +32,14 @@ int main(int argc, char **argv) {
|
||||
const int enable_stream = 0;
|
||||
#endif
|
||||
|
||||
printf("#define MAXBUTWIDTH %d\n", maxbutwidth);
|
||||
printf("#define MAXBUTWIDTH%s %d\n", baseTypeID, maxbutwidth);
|
||||
printf("#define MINSHIFT%s %d\n", baseTypeID, minshift);
|
||||
printf("#define MAXSHIFT%s %d\n", baseTypeID, maxshift);
|
||||
printf("#define CONFIGMAX 4\n");
|
||||
printf("#define ISAMAX %d\n", isamax);
|
||||
printf("\n");
|
||||
|
||||
if (strcmp(basetype, "paramonly") == 0) exit(0);
|
||||
|
||||
printf("#define ISAMAX %d\n", isamax);
|
||||
printf("#define CONFIGMAX 4\n");
|
||||
if (strcmp(baseType, "paramonly") == 0) exit(0);
|
||||
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
for(int config=0;config<4;config++) {
|
||||
@ -43,23 +47,35 @@ int main(int argc, char **argv) {
|
||||
if ((config & 1) != 0) continue;
|
||||
#endif
|
||||
for(int j=1;j<=maxbutwidth;j++) {
|
||||
printf("void dft%df_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void dft%db_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void tbut%df_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void tbut%db_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void but%df_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void but%db_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
|
||||
printf("void dft%df_%d_%s(%s *, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType);
|
||||
printf("void dft%db_%d_%s(%s *, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType);
|
||||
printf("void tbut%df_%d_%s(%s *, uint32_t *, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType);
|
||||
printf("void tbut%db_%d_%s(%s *, uint32_t *, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType);
|
||||
printf("void but%df_%d_%s(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType);
|
||||
printf("void but%db_%d_%s(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType);
|
||||
|
||||
for(int s=minshift;s<maxshift;s++) {
|
||||
printf("void dft%df_%d_%d_%s(%s *, const %s *);\n", 1 << j, s, config, argv[k], baseType, baseType);
|
||||
printf("void dft%db_%d_%d_%s(%s *, const %s *);\n", 1 << j, s, config, argv[k], baseType, baseType);
|
||||
printf("void tbut%df_%d_%d_%s(%s *, uint32_t *, const %s *, const %s *, const int);\n", 1 << j, s, config, argv[k], baseType, baseType, baseType);
|
||||
printf("void tbut%db_%d_%d_%s(%s *, uint32_t *, const %s *, const %s *, const int);\n", 1 << j, s, config, argv[k], baseType, baseType, baseType);
|
||||
}
|
||||
|
||||
for(int s=0;s<maxshift;s++) {
|
||||
printf("void but%df_%d_%d_%s(%s *, uint32_t *, const %s *, const int, const %s *, const int);\n", 1 << j, s, config, argv[k], baseType, baseType, baseType);
|
||||
printf("void but%db_%d_%d_%s(%s *, uint32_t *, const %s *, const int, const %s *, const int);\n", 1 << j, s, config, argv[k], baseType, baseType, baseType);
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("void realSub0_%s(real *, const real *, const int, const real *, const real *);\n", argv[k]);
|
||||
printf("void realSub1_%s(real *, const real *, const int, const real *, const real *, const int);\n", argv[k]);
|
||||
printf("void realSub0_%s(%s *, const %s *, const int, const %s *, const %s *);\n", argv[k], baseType, baseType, baseType, baseType);
|
||||
printf("void realSub1_%s(%s *, const %s *, const int, const %s *, const %s *, const int);\n", argv[k], baseType, baseType, baseType, baseType);
|
||||
printf("int getInt_%s(int);\n", argv[k]);
|
||||
printf("const void *getPtr_%s(int);\n", argv[k]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("void (*dftf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
|
||||
printf("void (*dftf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -77,7 +93,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*dftb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int) = {\n", basetype);
|
||||
printf("void (*dftb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -99,7 +115,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*tbutf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
|
||||
printf("void (*tbutf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -117,7 +133,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*tbutb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) = {\n", basetype);
|
||||
printf("void (*tbutb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -135,7 +151,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*butf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
|
||||
printf("void (*butf_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -153,7 +169,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*butb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int) = {\n", basetype);
|
||||
printf("void (*butb_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseType, baseType, baseType);
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
@ -171,22 +187,66 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*tbutfs_%s[MAXSHIFT%s][CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const %s *, const int) = {\n", baseType, baseTypeID, baseTypeID, baseType, baseType, baseType);
|
||||
for(int s=0;s<maxshift;s++) {
|
||||
printf(" {\n");
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
printf(" {NULL, ");
|
||||
for(int i=1;i<=maxbutwidth;i++) {
|
||||
if ((enable_stream || (config & 1) == 0) && s >= minshift) {
|
||||
printf("tbut%df_%d_%d_%s, ", 1 << i, s, config, argv[k]);
|
||||
} else {
|
||||
printf("NULL, ");
|
||||
}
|
||||
}
|
||||
printf("},\n");
|
||||
}
|
||||
printf(" },\n");
|
||||
}
|
||||
printf(" },\n");
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("void (*tbutbs_%s[MAXSHIFT%s][CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const %s *, const int) = {\n", baseType, baseTypeID, baseTypeID, baseType, baseType, baseType);
|
||||
for(int s=0;s<maxshift;s++) {
|
||||
printf(" {\n");
|
||||
for(int config=0;config<4;config++) {
|
||||
printf(" {\n");
|
||||
for(int k=isastart;k<argc;k++) {
|
||||
printf(" {NULL, ");
|
||||
for(int i=1;i<=maxbutwidth;i++) {
|
||||
if ((enable_stream || (config & 1) == 0) && s >= minshift) {
|
||||
printf("tbut%db_%d_%d_%s, ", 1 << i, s, config, argv[k]);
|
||||
} else {
|
||||
printf("NULL, ");
|
||||
}
|
||||
}
|
||||
printf("},\n");
|
||||
}
|
||||
printf(" },\n");
|
||||
}
|
||||
printf(" },\n");
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
//
|
||||
|
||||
printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype);
|
||||
printf("void (*realSub0_%s[ISAMAX])(%s *, const %s *, const int, const %s *, const %s *) = {\n ", baseType, baseType, baseType, baseType, baseType);
|
||||
for(int k=isastart;k<argc;k++) printf("realSub0_%s, ", argv[k]);
|
||||
printf("\n};\n\n");
|
||||
|
||||
printf("void (*realSub1_%s[ISAMAX])(real *, const real *, const int, const real *, const real *, const int) = {\n ", basetype);
|
||||
printf("void (*realSub1_%s[ISAMAX])(%s *, const %s *, const int, const %s *, const %s *, const int) = {\n ", baseType, baseType, baseType, baseType, baseType);
|
||||
for(int k=isastart;k<argc;k++) printf("realSub1_%s, ", argv[k]);
|
||||
printf("\n};\n\n");
|
||||
|
||||
printf("int (*getInt_%s[16])(int) = {\n ", basetype);
|
||||
printf("int (*getInt_%s[16])(int) = {\n ", baseType);
|
||||
for(int k=isastart;k<argc;k++) printf("getInt_%s, ", argv[k]);
|
||||
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
|
||||
printf("\n};\n\n");
|
||||
|
||||
printf("const void *(*getPtr_%s[16])(int) = {\n ", basetype);
|
||||
printf("const void *(*getPtr_%s[16])(int) = {\n ", baseType);
|
||||
for(int k=isastart;k<argc;k++) printf("getPtr_%s, ", argv[k]);
|
||||
for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
|
||||
printf("\n};\n\n");
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -41,13 +41,25 @@ char *replaceAll(const char *in, const char *pat, const char *replace) {
|
||||
char line[LEN+10];
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage : %s <Base type> <ISA> ...\n", argv[0]);
|
||||
if (argc < 5) {
|
||||
fprintf(stderr, "Usage : %s <file name> <Base type> <Base type ID> <shift> <ISA> ...\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *baseType = argv[1];
|
||||
const int isastart = 2;
|
||||
const char *fn = argv[1];
|
||||
const char *baseTypeID = argv[3];
|
||||
int shift = atoi(argv[4]);
|
||||
const int isastart = 5;
|
||||
int mode = 1;
|
||||
if (strcmp(argv[4], "-") == 0) {
|
||||
mode = 0;
|
||||
} else if (shift <= 0) {
|
||||
mode = 2;
|
||||
shift = -shift;
|
||||
}
|
||||
|
||||
char shiftstr[21];
|
||||
snprintf(shiftstr, 20, "%d", shift);
|
||||
|
||||
for(int config=0;config<CONFIGMAX;config++) {
|
||||
#if ENABLE_STREAM == 0
|
||||
@ -58,13 +70,22 @@ int main(int argc, char **argv) {
|
||||
char configString[100];
|
||||
sprintf(configString, "%d", config);
|
||||
|
||||
FILE *fpin = fopen("unroll0.org", "r");
|
||||
FILE *fpin = fopen(fn, "r");
|
||||
|
||||
switch(mode) {
|
||||
case 0:
|
||||
sprintf(line, "unroll_%d_%s.cpp", config, isaString);
|
||||
break;
|
||||
case 1:
|
||||
sprintf(line, "unroll_%d_%s_%d.cpp", config, isaString, shift);
|
||||
break;
|
||||
case 2:
|
||||
sprintf(line, "unroll2_%d_%s_%d.cpp", config, isaString, shift);
|
||||
break;
|
||||
}
|
||||
|
||||
sprintf(line, "unroll_%d_%s.c", config, isaString);
|
||||
FILE *fpout = fopen(line, "w");
|
||||
fputs("#include \"vectortype.h\"\n\n", fpout);
|
||||
fprintf(fpout, "extern %s ctbl_%s[];\n", baseType, baseType);
|
||||
fprintf(fpout, "#define ctbl ctbl_%s\n\n", baseType);
|
||||
fputs("#include \"vectortype.hpp\"\n\n", fpout);
|
||||
|
||||
for(;;) {
|
||||
if (fgets(line, LEN, fpin) == NULL) break;
|
||||
@ -82,7 +103,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
if ((config & 2) == 0) {
|
||||
char *s0 = replaceAll(s, "#pragma", "//");
|
||||
char *s0 = replaceAll(s, "#pragma", "//pragma");
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
@ -93,6 +114,18 @@ int main(int argc, char **argv) {
|
||||
s = s0;
|
||||
}
|
||||
|
||||
{
|
||||
char *s0 = replaceAll(s, "%TYPEID%", baseTypeID);
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
{
|
||||
char *s0 = replaceAll(s, "%SHIFT%", shiftstr);
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
fputs(s, fpout);
|
||||
free(s);
|
||||
}
|
||||
|
||||
@ -0,0 +1,145 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Abstract byte sink used by the operator<< overloads in this header.
// Implementations must provide write(); flush() defaults to a no-op.
class Serializer {
public:
  // Polymorphic base class: the virtual destructor makes destroying a derived
  // serializer through a Serializer pointer well defined.
  virtual ~Serializer() = default;

  // Writes the given number of bytes starting at the given address.
  virtual void write(const void *, size_t) = 0;

  virtual void flush() {}
};
|
||||
|
||||
// Abstract byte source used by the operator>> overloads in this header.
class Deserializer {
public:
  // Polymorphic base class: the virtual destructor makes destroying a derived
  // deserializer through a Deserializer pointer well defined.
  virtual ~Deserializer() = default;

  // Fills the given buffer with the given number of bytes.
  virtual void read(void *, size_t) = 0;

  // Convenience overload: reads sizeof(T) raw bytes into a default-constructed
  // T and returns it. Only available for trivially copyable T.
  template<typename T, typename std::enable_if<(std::is_trivially_copyable<T>::value), int>::type = 0>
  T read() {
    T t;
    read(&t, sizeof(T));
    return t;
  }
};
|
||||
|
||||
class FileSerializer : public Serializer {
|
||||
FILE *fp;
|
||||
|
||||
public:
|
||||
FileSerializer(FILE *fp_) : fp(fp_) {}
|
||||
|
||||
void write(const void *p, size_t z) {
|
||||
fwrite(p, z, 1, fp);
|
||||
}
|
||||
|
||||
void flush() { fflush(fp); }
|
||||
};
|
||||
|
||||
class FileDeserializer : public Deserializer {
|
||||
FILE *fp;
|
||||
|
||||
public:
|
||||
FileDeserializer(FILE *fp_) : fp(fp_) {}
|
||||
|
||||
void read(void *p, size_t z) {
|
||||
if (!fread(p, z, 1, fp)) throw(runtime_error("FileDeserializer::read : could not read"));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename enable_if<(is_trivially_copyable<T>::value), int>::type = 0>
|
||||
Serializer& operator<<(Serializer &s, const T& v) {
|
||||
s.write((const char *)&v, sizeof(v));
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename T, typename enable_if<(is_trivially_copyable<T>::value), int>::type = 0>
|
||||
Deserializer& operator>>(Deserializer &s, T& v) {
|
||||
s.read((char *)&v, sizeof(v));
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Serializer& operator<<(Serializer &s, const vector<T>& v) {
|
||||
s << v.size();
|
||||
for(size_t i=0;i<v.size();i++) s << v.data()[i];
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Deserializer& operator>>(Deserializer &d, vector<T>& v) {
|
||||
size_t z = d.read<size_t>();
|
||||
for(size_t i=0;i<z;i++) {
|
||||
T t;
|
||||
d >> t;
|
||||
v.push_back(t);
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
// Serializes a string as (length + 1) followed by its characters and the
// terminating NUL, i.e. the same wire format as a vector<char> that includes
// the terminator.
//
// Declared inline: this is a non-template function defined in a header, so
// without inline every translation unit that includes the header would emit
// its own definition and linking would fail with duplicate symbols.
inline Serializer& operator<<(Serializer &s, const std::string& str) {
  const size_t bytes = str.size() + 1;
  s << bytes;
  s.write(str.c_str(), bytes);
  return s;
}
|
||||
|
||||
// Reads a string written by the matching operator<<: a vector<char> payload
// whose characters up to (not including) the first NUL form the string.
//
// The payload is scanned with an explicit bound, so an empty payload or one
// without a terminating NUL (e.g. a damaged file) yields the available
// characters instead of reading through a null pointer or past the buffer.
//
// Declared inline because this is a non-template function defined in a header.
inline Deserializer& operator>>(Deserializer &d, std::string& str) {
  std::vector<char> buf;
  d >> buf;

  std::vector<char>::const_iterator stop = buf.cbegin();
  while (stop != buf.cend() && *stop != '\0') ++stop;

  str.assign(buf.cbegin(), stop);
  return d;
}
|
||||
|
||||
template<typename KT, typename VT>
|
||||
Serializer& operator<<(Serializer &s, const unordered_map<KT, VT>& m) {
|
||||
s << m.size();
|
||||
for(auto a : m) s << a.first << a.second;
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename KT, typename VT>
|
||||
Deserializer& operator>>(Deserializer &d, unordered_map<KT, VT>& m) {
|
||||
size_t z = d.read<size_t>();
|
||||
for(size_t i=0;i<z;i++) {
|
||||
KT key;
|
||||
d >> key;
|
||||
VT value;
|
||||
d >> value;
|
||||
m[key] = value;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
template<class tupletype, uint32_t idx=0>
|
||||
static void serialize_tuple(Serializer &s, const tupletype& t) {
|
||||
if constexpr (idx < tuple_size_v<tupletype>) {
|
||||
s << get<idx>(t);
|
||||
serialize_tuple<tupletype, idx + 1>(s, t);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ...Ts>
|
||||
Serializer& operator<<(Serializer &s, const tuple<Ts...>& t) {
|
||||
serialize_tuple(s, t);
|
||||
return s;
|
||||
}
|
||||
|
||||
template<class tupletype, uint32_t idx=0>
|
||||
static void deserialize_tuple(Deserializer &d, tupletype& t) {
|
||||
if constexpr (idx < tuple_size_v<tupletype>) {
|
||||
d >> get<idx>(t);
|
||||
deserialize_tuple<tupletype, idx + 1>(d, t);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ...Ts>
|
||||
Deserializer& operator>>(Deserializer &d, tuple<Ts...> &t) {
|
||||
deserialize_tuple(d, t);
|
||||
return d;
|
||||
}
|
||||
@ -1,8 +1,42 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
static const real ctbl[] = {
|
||||
0.7071067811865475243818940365159164684883L, -0.7071067811865475243818940365159164684883L,
|
||||
0.9238795325112867561014214079495587839119L, -0.382683432365089771723257530688933059082L,
|
||||
0.382683432365089771723257530688933059082L, -0.9238795325112867561014214079495587839119L,
|
||||
0.9807852804032304491190993878113602022495L, -0.1950903220161282678433729148581576851029L,
|
||||
0.5555702330196022247573058028269343822103L, -0.8314696123025452370808655033762590846891L,
|
||||
0.8314696123025452370808655033762590846891L, -0.5555702330196022247573058028269343822103L,
|
||||
0.1950903220161282678433729148581576851029L, -0.9807852804032304491190993878113602022495L,
|
||||
0.9951847266721968862310254699821143731242L, -0.09801714032956060199569840382660679267701L,
|
||||
0.6343932841636454982026105398063009488396L, -0.7730104533627369607965383602188325085081L,
|
||||
0.881921264348355029715105513066220055407L, -0.4713967368259976485449225247492677226546L,
|
||||
0.2902846772544623676448431737195932100803L, -0.9569403357322088649310892760624369657307L,
|
||||
0.9569403357322088649310892760624369657307L, -0.2902846772544623676448431737195932100803L,
|
||||
0.4713967368259976485449225247492677226546L, -0.881921264348355029715105513066220055407L,
|
||||
0.7730104533627369607965383602188325085081L, -0.6343932841636454982026105398063009488396L,
|
||||
0.09801714032956060199569840382660679267701L, -0.9951847266721968862310254699821143731242L,
|
||||
0.9987954562051723927007702841240899260811L, -0.04906767432741801425355085940205324135377L,
|
||||
0.6715589548470184006194634573905233310143L, -0.7409511253549590911932944126139233276263L,
|
||||
0.9039892931234433315823215138173907234886L, -0.427555093430282094315230886905077056781L,
|
||||
0.336889853392220050702686798271834334173L, -0.9415440651830207783906830087961026265475L,
|
||||
0.9700312531945439926159106824865574481009L, -0.2429801799032638899447731489766866275204L,
|
||||
0.5141027441932217266072797923204262815489L, -0.8577286100002720698929313536407192941624L,
|
||||
0.8032075314806449097991200569701675249235L, -0.5956993044924333434615715265891822127742L,
|
||||
0.1467304744553617516588479505190711904561L, -0.9891765099647809734561415551112872890371L,
|
||||
0.9891765099647809734561415551112872890371L, -0.1467304744553617516588479505190711904561L,
|
||||
0.5956993044924333434615715265891822127742L, -0.8032075314806449097991200569701675249235L,
|
||||
0.8577286100002720698929313536407192941624L, -0.5141027441932217266072797923204262815489L,
|
||||
0.2429801799032638899447731489766866275204L, -0.9700312531945439926159106824865574481009L,
|
||||
0.9415440651830207783906830087961026265475L, -0.336889853392220050702686798271834334173L,
|
||||
0.427555093430282094315230886905077056781L, -0.9039892931234433315823215138173907234886L,
|
||||
0.7409511253549590911932944126139233276263L, -0.6715589548470184006194634573905233310143L,
|
||||
0.04906767432741801425355085940205324135377L, -0.9987954562051723927007702841240899260811L,
|
||||
};
|
||||
|
||||
ALIGNED(8192) void dft2f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
@ -241,7 +275,7 @@ ALIGNED(8192) void tbut4b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const
|
||||
}
|
||||
}
|
||||
|
||||
#if MAXBUTWIDTH >= 3
|
||||
#if MAXBUTWIDTH%TYPEID% >= 3
|
||||
ALIGNED(8192) void dft8f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
@ -551,7 +585,7 @@ ALIGNED(8192) void tbut8b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const
|
||||
}
|
||||
#endif
|
||||
|
||||
#if MAXBUTWIDTH >= 4
|
||||
#if MAXBUTWIDTH%TYPEID% >= 4
|
||||
ALIGNED(8192) void dft16f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
@ -1217,7 +1251,7 @@ ALIGNED(8192) void tbut16b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons
|
||||
}
|
||||
#endif
|
||||
|
||||
#if MAXBUTWIDTH >= 5
|
||||
#if MAXBUTWIDTH%TYPEID% >= 5
|
||||
ALIGNED(8192) void dft32f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
@ -2727,7 +2761,7 @@ ALIGNED(8192) void tbut32b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons
|
||||
}
|
||||
#endif
|
||||
|
||||
#if MAXBUTWIDTH >= 6
|
||||
#if MAXBUTWIDTH%TYPEID% >= 6
|
||||
ALIGNED(8192) void dft64f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
@ -6191,7 +6225,7 @@ ALIGNED(8192) void tbut64b_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, cons
|
||||
|
||||
//
|
||||
|
||||
#if MAXBUTWIDTH >= 7
|
||||
#if MAXBUTWIDTH%TYPEID% >= 7
|
||||
ALIGNED(8192) void dft128f_%CONFIG%_%ISA%(real *RESTRICT out0, const real *RESTRICT in0, const int shift) {
|
||||
const int k = 1 << (shift - LOG2VECWIDTH);
|
||||
int i=0;
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -6,6 +6,10 @@
|
||||
#ifndef __VECTORTYPE_H__
|
||||
#define __VECTORTYPE_H__
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#include "sleef.h"
|
||||
|
||||
@ -57,10 +61,6 @@
|
||||
#include "helpers390x_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VECEXT
|
||||
#include "helpervecext.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC
|
||||
#include "helperpurec.h"
|
||||
#endif
|
||||
@ -194,13 +194,12 @@ int main(int argc, char **argv)
|
||||
mpfr_zinit(result[i]);
|
||||
}
|
||||
|
||||
mpfr_t fra, frb, frc, frd, fre;
|
||||
mpfr_t fra, frb, frc, frd;
|
||||
|
||||
mpfr_zinit(fra);
|
||||
mpfr_zinit(frb);
|
||||
mpfr_zinit(frc);
|
||||
mpfr_zinit(frd);
|
||||
mpfr_zinit(fre);
|
||||
mpfr_init(fra);
|
||||
mpfr_init(frb);
|
||||
mpfr_init(frc);
|
||||
mpfr_init(frd);
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
double b = 1.0 - pow((double)i / (n-1), p);
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
|
||||
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
|
||||
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
|
||||
// The code is distributed under the Creative Commons Attribution 4.0 International License.
|
||||
// https://creativecommons.org/licenses/by/4.0/
|
||||
@ -156,7 +156,7 @@ static void tableau(mpfr_t ret, int i, int j) {
|
||||
|
||||
if (j <= n) {
|
||||
mpfr_t s;
|
||||
mpfr_zinit(s);
|
||||
mpfr_init(s);
|
||||
mpfr_set_d(s, 0, GMP_RNDN);
|
||||
|
||||
mpfr_t *tab = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
@ -190,9 +190,7 @@ static void tableau(mpfr_t ret, int i, int j) {
|
||||
static void pivot(int ipivot, int jpivot) {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
|
||||
mpfr_zinit(u);
|
||||
|
||||
mpfr_init(u);
|
||||
mpfr_set(u, pivotcolumn[ipivot], GMP_RNDN);
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
@ -254,7 +252,8 @@ static int minimize() {
|
||||
static int phase1() {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
mpfr_zinit(u);
|
||||
mpfr_init(u);
|
||||
mpfr_set_d(u, 0, GMP_RNDN);
|
||||
|
||||
jmax = n3;
|
||||
for (i = 0; i <= m; i++) {
|
||||
@ -309,7 +308,8 @@ int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0
|
||||
init(n, m);
|
||||
|
||||
mpfr_t csum;
|
||||
mpfr_zinit(csum);
|
||||
mpfr_init(csum);
|
||||
mpfr_set_d(csum, 0, GMP_RNDN);
|
||||
|
||||
for(j=0;j<n0+1;j++) {
|
||||
mpfr_set(c[j], c0[j], GMP_RNDN);
|
||||
@ -370,7 +370,8 @@ int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0
|
||||
}
|
||||
|
||||
mpfr_t cs;
|
||||
mpfr_zinit(cs);
|
||||
mpfr_init(cs);
|
||||
mpfr_set_d(cs, 0, GMP_RNDN);
|
||||
if (row[n] != 0) tableau(cs, row[n], 0);
|
||||
|
||||
for (j = 1; j < n; j++) {
|
||||
|
||||
@ -1,153 +0,0 @@
|
||||
ICCAVAILABLE := $(shell command -v icc 2> /dev/null)
|
||||
ARCH := $(shell uname -p)
|
||||
|
||||
all :
|
||||
ifndef BUILDDIR
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
@echo
|
||||
@echo You can start measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo Then, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo
|
||||
@echo You have to install java and gnuplot to do plotting.
|
||||
@echo Stop all tasks on the computer before starting measurement.
|
||||
@echo
|
||||
endif
|
||||
|
||||
benchsvml128_10.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o
|
||||
|
||||
benchsvml128_40.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o
|
||||
|
||||
benchsvml256_10.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o
|
||||
|
||||
benchsvml256_40.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o
|
||||
|
||||
benchsvml512_10.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o
|
||||
|
||||
benchsvml512_40.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o
|
||||
|
||||
|
||||
benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10
|
||||
|
||||
benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40
|
||||
|
||||
#
|
||||
|
||||
ifeq ($(ARCH),aarch64)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),s390x)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),ppc64le)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef256.o : benchsleef256.c bench.h
|
||||
$(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef512.o : benchsleef512.c bench.h
|
||||
$(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
endif
|
||||
|
||||
#
|
||||
|
||||
ProcessData.class : ProcessData.java
|
||||
javac ProcessData.java
|
||||
|
||||
#
|
||||
|
||||
ifndef BUILDDIR
|
||||
measure :
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
measure : benchsleef
|
||||
chmod +x ./measure.sh
|
||||
LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
endif
|
||||
|
||||
measureSVML : all benchsvml_10 benchsvml_40
|
||||
chmod +x ./measure.sh
|
||||
./measure.sh ./benchsvml_10 ./benchsvml_40
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
|
||||
plot : ProcessData.class counter.txt
|
||||
java ProcessData *dptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigdp.png
|
||||
java ProcessData *dpnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigdp.png
|
||||
java ProcessData *sptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigsp.png
|
||||
java ProcessData *spnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigsp.png
|
||||
@echo
|
||||
@echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png.
|
||||
@echo
|
||||
|
||||
clean :
|
||||
rm -f *~ a.out *.so *.so.* *.a *.s *.o
|
||||
rm -rf *.dSYM *.dylib
|
||||
rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump
|
||||
rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt
|
||||
|
||||
restart :
|
||||
rm -f *.out counter.txt
|
||||
@ -1,193 +0,0 @@
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public class ProcessData {
|
||||
static final int DP = 64, SP = 32;
|
||||
|
||||
static LinkedHashMap<String, Integer> funcNameOrder = new LinkedHashMap<String, Integer>();
|
||||
|
||||
static class Key {
|
||||
final String funcName;
|
||||
|
||||
final int prec, bits;
|
||||
final ArrayList<Double> range = new ArrayList<Double>();
|
||||
final double ulps;
|
||||
|
||||
Key(String s) {
|
||||
String[] a = s.split(",");
|
||||
|
||||
funcName = a[0].trim();
|
||||
if (funcNameOrder.get(funcName) == null) {
|
||||
funcNameOrder.put(funcName, funcNameOrder.size());
|
||||
}
|
||||
|
||||
prec =
|
||||
a[1].trim().equals("DP") ? DP :
|
||||
a[1].trim().equals("SP") ? SP :
|
||||
0;
|
||||
|
||||
bits = Integer.parseInt(a[2].trim());
|
||||
|
||||
int c;
|
||||
|
||||
for(c = 3;;c++) {
|
||||
if (a[c].trim().endsWith("ulps")) break;
|
||||
range.add(Double.parseDouble(a[c]));
|
||||
}
|
||||
|
||||
ulps = Double.parseDouble(a[c].trim().replace("ulps", ""));
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int h = funcName.hashCode();
|
||||
h ^= prec ^ bits;
|
||||
return h;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
Key k = (Key) o;
|
||||
if (funcName.compareTo(k.funcName) != 0) return false;
|
||||
if (prec != k.prec) return false;
|
||||
if (bits != k.bits) return false;
|
||||
if (range.size() != k.range.size()) return false;
|
||||
for(int i=0;i<range.size();i++) {
|
||||
if ((double)range.get(i) != (double)k.range.get(i)) return false;
|
||||
}
|
||||
|
||||
if (ulps != k.ulps) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String s = funcName + " ";
|
||||
s += prec == DP ? "DP " : "SP ";
|
||||
s += bits + "bit ";
|
||||
s += String.format(" %.0fulp ", ulps);
|
||||
for(int i=0;i<range.size();i+=2) {
|
||||
s += "[" + String.format("%.3g", range.get(i)) + ", " + String.format("%.3g", range.get(i+1)) + "]";
|
||||
if (i + 2 < range.size()) s += " ";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
static class KeyComparator implements Comparator<Key> {
|
||||
public int compare(Key d0, Key d1) {
|
||||
if (d0 == d1) return 0;
|
||||
if (d0.prec < d1.prec) return 1;
|
||||
if (d0.prec > d1.prec) return -1;
|
||||
if (d0.ulps > d1.ulps) return 1;
|
||||
if (d0.ulps < d1.ulps) return -1;
|
||||
|
||||
int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName);
|
||||
if (fc != 0) return fc;
|
||||
|
||||
if (d0.bits > d1.bits) return 1;
|
||||
if (d0.bits < d1.bits) return -1;
|
||||
|
||||
if (d0.range.size() > d1.range.size()) return 1;
|
||||
if (d0.range.size() < d1.range.size()) return -1;
|
||||
|
||||
for(int i=0;i<d0.range.size();i++) {
|
||||
if (d0.range.get(i) > d1.range.get(i)) return 1;
|
||||
if (d0.range.get(i) < d1.range.get(i)) return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
LinkedHashMap<Key, LinkedHashMap<String, Double>> allData = new LinkedHashMap<Key, LinkedHashMap<String, Double>>();
|
||||
TreeSet<Key> allKeys = new TreeSet<Key>(new KeyComparator());
|
||||
LinkedHashSet<String> allColumnTitles = new LinkedHashSet<String>();
|
||||
double maximum = 0;
|
||||
|
||||
for(int i=0;i<args.length;i++) {
|
||||
LineNumberReader lnr = new LineNumberReader(new FileReader(args[i]));
|
||||
|
||||
String columnTitle = lnr.readLine();
|
||||
allColumnTitles.add(columnTitle);
|
||||
|
||||
for(;;) {
|
||||
String s = lnr.readLine();
|
||||
if (s == null) break;
|
||||
|
||||
Key key = new Key(s);
|
||||
allKeys.add(key);
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(key);
|
||||
if (v == null) {
|
||||
v = new LinkedHashMap<String, Double>();
|
||||
allData.put(key, v);
|
||||
}
|
||||
String[] a = s.split(",");
|
||||
|
||||
double time = Double.parseDouble(a[a.length-1]);
|
||||
v.put(columnTitle, time);
|
||||
maximum = Math.max(maximum, time);
|
||||
}
|
||||
|
||||
lnr.close();
|
||||
}
|
||||
|
||||
PrintStream ps = new PrintStream("data.out");
|
||||
|
||||
for(Key k : allKeys) {
|
||||
ps.print("\"" + k + "\" ");
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(k);
|
||||
|
||||
for(String s : allColumnTitles) {
|
||||
Double d = v.get(s);
|
||||
if (d != null) ps.print(d);
|
||||
if (d == null) ps.print("0");
|
||||
ps.print("\t");
|
||||
}
|
||||
ps.println();
|
||||
}
|
||||
|
||||
ps.close();
|
||||
|
||||
ps = new PrintStream("script.out");
|
||||
|
||||
ps.println("set terminal pngcairo size 1280, 800 font \",10\"");
|
||||
ps.println("set output \"output.png\"");
|
||||
|
||||
ps.println("color00 = \"#FF5050\";"); // red
|
||||
ps.println("color01 = \"#0066FF\";"); // blue
|
||||
ps.println("color02 = \"#00FF00\";"); // green
|
||||
ps.println("color03 = \"#FF9900\";"); // orange
|
||||
ps.println("color04 = \"#CC00CC\";"); // purple
|
||||
ps.println("color05 = \"#880000\";"); // brown
|
||||
ps.println("color06 = \"#003300\";"); // dark green
|
||||
ps.println("color07 = \"#000066\";"); // dark blue
|
||||
|
||||
ps.println("set style data histogram");
|
||||
ps.println("set style histogram cluster gap 1");
|
||||
ps.println("set style fill solid 1.00");
|
||||
ps.println("set boxwidth 0.9");
|
||||
ps.println("set xtics format \"\"");
|
||||
ps.println("set xtics rotate by -90");
|
||||
ps.println("set grid ytics");
|
||||
|
||||
ps.println("set ylabel \"Execution time in micro sec.\"");
|
||||
ps.println("set yrange [0:*]");
|
||||
ps.println("set bmargin 24");
|
||||
|
||||
ps.println("set title \"Single execution time in micro sec.\"");
|
||||
ps.print("plot");
|
||||
|
||||
int i = 0;
|
||||
for(String s : allColumnTitles) {
|
||||
ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s +
|
||||
"\" linecolor rgb color" + String.format("%02d", i));
|
||||
if (i != allColumnTitles.size()-1) ps.print(", ");
|
||||
i++;
|
||||
}
|
||||
ps.println();
|
||||
|
||||
ps.close();
|
||||
}
|
||||
}
|
||||
@ -1,58 +0,0 @@
|
||||
#define NITER1 100000
|
||||
#define NITER2 10000
|
||||
#define NITER (NITER1 * NITER2)
|
||||
|
||||
#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSLEEF1_2(funcName, name, xmin, xmax, ymin, ymax, ulp, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML2_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg), c; \
|
||||
for(int i=0;i<NITER1;i++) funcName(&c, *p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_2(funcName, name, xmin, xmax, ymin, ymax, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
@ -1,144 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
int veclen = 16;
|
||||
double *abufdp, *bbufdp;
|
||||
float *abufsp, *bbufsp;
|
||||
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX512F() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 16)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void benchSleef128_DPTrig();
|
||||
void benchSleef256_DPTrig();
|
||||
void benchSleef512_DPTrig();
|
||||
void benchSleef128_DPNontrig();
|
||||
void benchSleef256_DPNontrig();
|
||||
void benchSleef512_DPNontrig();
|
||||
void benchSleef128_SPTrig();
|
||||
void benchSleef256_SPTrig();
|
||||
void benchSleef512_SPTrig();
|
||||
void benchSleef128_SPNontrig();
|
||||
void benchSleef256_SPNontrig();
|
||||
void benchSleef512_SPNontrig();
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SLEEF", *fnBase = "sleef";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__)
|
||||
int do128bit = 1;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
sprintf(fn, "%sdptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPTrig();
|
||||
if (do512bit) benchSleef512_DPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPNontrig();
|
||||
if (do512bit) benchSleef512_DPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPTrig();
|
||||
if (do512bit) benchSleef512_SPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPNontrig();
|
||||
if (do512bit) benchSleef512_SPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@ -1,195 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
typedef float64x2_t vdouble;
|
||||
typedef float32x4_t vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VSX__)
|
||||
#include <altivec.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VX__)
|
||||
#include <vecintrin.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
// Empty stand-ins compiled when ENABLED is not defined for the 128-bit SLEEF
// benchmarks, so the four entry points exist regardless of target features.
// Declared with (void) so they are real prototypes rather than old-style
// definitions that accept any argument list.
void benchSleef128_DPTrig(void) {}
void benchSleef128_DPNontrig(void) {}
void benchSleef128_SPTrig(void) {}
void benchSleef128_SPNontrig(void) {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,181 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
// Empty stand-ins compiled when ENABLED is not defined for the 256-bit SLEEF
// benchmarks (no AVX at compile time), so zeroupper256 and the four entry
// points exist regardless of target features.
// Declared with (void) so they are real prototypes rather than old-style
// definitions that accept any argument list.
void zeroupper256(void) {}
void benchSleef256_DPTrig(void) {}
void benchSleef256_DPNontrig(void) {}
void benchSleef256_SPTrig(void) {}
void benchSleef256_SPNontrig(void) {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,180 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
// Empty stand-ins compiled when ENABLED is not defined for the 512-bit SLEEF
// benchmarks (no AVX-512F at compile time), so the four entry points exist
// regardless of target features.
// Declared with (void) so they are real prototypes rather than old-style
// definitions that accept any argument list.
void benchSleef512_DPTrig(void) {}
void benchSleef512_DPNontrig(void) {}
void benchSleef512_SPTrig(void) {}
void benchSleef512_SPNontrig(void) {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,153 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
// Shared benchmark state; the per-width benchmark files reference these
// through extern declarations.

// Multiplier applied to NITER1 when sizing and filling the input buffers, and
// to sizeof(double) when choosing their alignment in main().
int veclen = 16;

// Set in main() from SVMLULP; gates the log10/log1p/exp/pow SVML rows.
int enableLogExp;

// Double-precision input buffers, allocated in main().
double *abufdp;
double *bbufdp;

// Single-precision views; main() points them at the same storage as the
// double-precision buffers.
float *abufsp;
float *bbufsp;

// Output file currently being written; opened and closed by main().
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
// Executes the x86 CPUID instruction with the given leaf (eax) and sub-leaf
// (ecx) and stores the resulting EAX, EBX, ECX, EDX values in out[0..3].
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  uint32_t regA, regB, regC, regD;

  __asm__ __volatile__ ("cpuid"
                        : "=a" (regA), "=b" (regB), "=c" (regC), "=d" (regD)
                        : "a" (eax), "c" (ecx));

  out[0] = regA;
  out[1] = regB;
  out[2] = regC;
  out[3] = regD;
}
|
||||
|
||||
// Returns 1 when CPUID leaf 1 reports bit 28 of ECX set (the AVX feature
// flag), 0 otherwise.
// NOTE(review): only the CPUID flag is consulted; OS support for saving the
// YMM state is not checked here.
int cpuSupportsAVX() {
  enum { LEAF_FEATURES = 1, ECX_INDEX = 2, AVX_BIT = 28 };
  int32_t regs[4];

  x86CpuID(regs, LEAF_FEATURES, 0);

  if (regs[ECX_INDEX] & (1 << AVX_BIT)) {
    return 1;
  }
  return 0;
}
|
||||
|
||||
// Returns 1 when CPUID leaf 7, sub-leaf 0 reports bit 16 of EBX set (the
// AVX-512 Foundation feature flag), 0 otherwise.
// NOTE(review): only the CPUID flag is consulted; OS support for saving the
// ZMM state is not checked here.
int cpuSupportsAVX512F() {
  enum { LEAF_EXT_FEATURES = 7, EBX_INDEX = 1, AVX512F_BIT = 16 };
  int32_t regs[4];

  x86CpuID(regs, LEAF_EXT_FEATURES, 0);

  if (regs[EBX_INDEX] & (1 << AVX512F_BIT)) {
    return 1;
  }
  return 0;
}
|
||||
#endif
|
||||
|
||||
// Returns the current CLOCK_MONOTONIC reading converted to microseconds.
// The result of clock_gettime() is not checked.
uint64_t Sleef_currentTimeMicros() {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  uint64_t fromSeconds = (uint64_t)now.tv_sec * 1000000LL;
  uint64_t fromNanos = (uint64_t)now.tv_nsec / 1000;
  return fromSeconds + fromNanos;
}
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
// Entry points defined in the per-width SVML benchmark files. Declared with
// (void) so the compiler rejects calls that pass arguments; an empty
// parameter list would leave the calls unchecked.
void zeroupper256(void);

void benchSVML128_DPTrig(void);
void benchSVML256_DPTrig(void);
void benchSVML512_DPTrig(void);

void benchSVML128_DPNontrig(void);
void benchSVML256_DPNontrig(void);
void benchSVML512_DPNontrig(void);

void benchSVML128_SPTrig(void);
void benchSVML256_SPTrig(void);
void benchSVML512_SPTrig(void);

void benchSVML128_SPNontrig(void);
void benchSVML256_SPNontrig(void);
void benchSVML512_SPNontrig(void);
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SVML", *fnBase = "svml";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON)
|
||||
int do128bit = 1;
|
||||
int do256bit = 0;
|
||||
int do512bit = 0;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
enableLogExp = SVMLULP < 2;
|
||||
|
||||
sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPTrig();
|
||||
if (do256bit) benchSVML256_DPTrig();
|
||||
if (do512bit) benchSVML512_DPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPNontrig();
|
||||
if (do256bit) benchSVML256_DPNontrig();
|
||||
if (do512bit) benchSVML512_DPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPTrig();
|
||||
if (do256bit) benchSVML256_SPTrig();
|
||||
if (do512bit) benchSVML512_SPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPNontrig();
|
||||
if (do256bit) benchSVML256_SPNontrig();
|
||||
if (do512bit) benchSVML512_SPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@ -1,144 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
// Empty stand-ins compiled when ENABLED is not defined for the 128-bit SVML
// benchmarks (no SSE2 at compile time), so the four entry points exist
// regardless of target features.
// Declared with (void) so they are real prototypes rather than old-style
// definitions that accept any argument list.
void benchSVML128_DPTrig(void) {}
void benchSVML128_DPNontrig(void) {}
void benchSVML128_SPTrig(void) {}
void benchSVML128_SPNontrig(void) {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,147 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void zeroupper256() { _mm256_zeroupper(); }
|
||||
|
||||
void benchSVML256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void zeroupper256() {}
|
||||
void benchSVML256_DPTrig() {}
|
||||
void benchSVML256_DPNontrig() {}
|
||||
void benchSVML256_SPTrig() {}
|
||||
void benchSVML256_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,144 +0,0 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_log1p_pd, "log1p, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {}
|
||||
void benchSVML512_DPNontrig() {}
|
||||
void benchSVML512_SPTrig() {}
|
||||
void benchSVML512_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@ -1,17 +0,0 @@
|
||||
#!/bin/sh
|
||||
echo
|
||||
read -p "Enter label of measurement(e.g. My desktop PC) : " label
|
||||
|
||||
if [ -f counter.txt ]
|
||||
then
|
||||
counter=`cat counter.txt`
|
||||
else
|
||||
counter=0
|
||||
fi
|
||||
|
||||
echo Measurement in progress. This may take several minutes.
|
||||
for i in $*; do
|
||||
$i "$label" $counter
|
||||
done
|
||||
counter=$((counter+1))
|
||||
echo $counter > counter.txt
|
||||
@ -65,20 +65,33 @@ include_directories(${sleef_BINARY_DIR}/include) # sleef.h
|
||||
include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h
|
||||
include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers
|
||||
|
||||
if(NOT LIB_MPFR)
|
||||
if (SLEEF_ENFORCE_TESTER AND NOT SLEEF_ENABLE_TESTER)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified but SLEEF_ENABLE_TESTER is false")
|
||||
endif(SLEEF_ENFORCE_TESTER AND NOT SLEEF_ENABLE_TESTER)
|
||||
|
||||
if(SLEEF_ENABLE_TESTER AND NOT LIB_MPFR)
|
||||
find_program(TESTER_COMMAND tester)
|
||||
endif(NOT LIB_MPFR)
|
||||
endif(SLEEF_ENABLE_TESTER AND NOT LIB_MPFR)
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available")
|
||||
endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER4 AND NOT SLEEF_ENABLE_TESTER4)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER4 is specified but SLEEF_ENABLE_TESTER4 is false")
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER4 AND NOT TLFLOAT_LIBRARIES)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER4 is specified but TLFloat is not available")
|
||||
endif()
|
||||
|
||||
find_library(LIBRT rt)
|
||||
if (NOT LIBRT)
|
||||
set(LIBRT "")
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
|
||||
set(CMAKE_CXX_FLAGS "${ORG_CMAKE_CXX_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
@ -90,6 +103,17 @@ endif()
|
||||
|
||||
#
|
||||
|
||||
function(add_test_with_emu C CMD)
|
||||
if (SDE_COMMAND)
|
||||
add_test(NAME ${CMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMD})
|
||||
elseif(EMULATOR)
|
||||
add_test(NAME ${CMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMD})
|
||||
else()
|
||||
add_test(NAME ${CMD} COMMAND ${CMD})
|
||||
endif()
|
||||
set_tests_properties(${CMD} PROPERTIES COST ${C})
|
||||
endfunction()
|
||||
|
||||
function(add_test_iut IUT C)
|
||||
if (LIB_MPFR)
|
||||
set(TESTER ${TARGET_TESTER})
|
||||
@ -126,14 +150,19 @@ function(add_test_iut IUT C)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Compile executable 'iut'
|
||||
add_executable(${TARGET_IUT} iut.c testerutil.c)
|
||||
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(${TARGET_IUT} 1.0)
|
||||
set(IUT_LIST ${TARGET_IUT})
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
# Compile executable 'iut'
|
||||
add_executable(${TARGET_IUT} iut.c)
|
||||
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ}
|
||||
${LIBM} ${LIBRT})
|
||||
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(${TARGET_IUT} 1.0)
|
||||
set(IUT_LIST ${TARGET_IUT})
|
||||
|
||||
# Tests depends on the library
|
||||
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})
|
||||
endif()
|
||||
|
||||
# Compile executable 'iutcuda'
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
|
||||
@ -145,97 +174,179 @@ if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
|
||||
list(APPEND IUT_LIST iutcuda)
|
||||
endif()
|
||||
|
||||
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
|
||||
# Add vector extension `iut`s
|
||||
macro(test_extension SIMD)
|
||||
if(COMPILER_SUPPORTS_${SIMD})
|
||||
string(TOLOWER ${SIMD} LCSIMD)
|
||||
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
|
||||
target_compile_options(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
|
||||
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
|
||||
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
|
||||
target_compile_options(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ}
|
||||
${LIBM} ${LIBRT})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
|
||||
# The iut programs whose names begin with "iuty" are the iut for the
|
||||
# deterministic version of functions. By checking the result of
|
||||
# testing with iutysse2, for example, it can be checked that the
|
||||
# corresponding deterministic functions passes the accuracy and
|
||||
# nonnumber tests.
|
||||
|
||||
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
|
||||
add_executable(${IUTYNAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTYNAME}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTYNAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTYNAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTYNAME})
|
||||
|
||||
# The iut programs whose names begin with "iuti" are the iut for the
|
||||
# inline version of functions.
|
||||
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
string(CONCAT IUTINAME "iuti" ${LCSIMD})
|
||||
add_executable(${IUTINAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTINAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
|
||||
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
|
||||
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
|
||||
SIMD_SUFFIX=_${LCSIMD}_sleef
|
||||
)
|
||||
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
|
||||
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
|
||||
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
|
||||
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTINAME} 1.0)
|
||||
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTINAME})
|
||||
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
|
||||
|
||||
# The iut programs whose names begin with "iuty" are the iut for the
|
||||
# deterministic version of functions. By checking the result of
|
||||
# testing with iutysse2, for example, it can be checked that the
|
||||
# corresponding deterministic functions passes the accuracy and
|
||||
# nonnumber tests.
|
||||
|
||||
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
|
||||
add_executable(${IUTYNAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTYNAME}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTYNAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ}
|
||||
${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTYNAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTYNAME})
|
||||
|
||||
# The iut programs whose names begin with "iuti" are the iut for the
|
||||
# inline version of functions.
|
||||
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
string(CONCAT IUTINAME "iuti" ${LCSIMD})
|
||||
add_executable(${IUTINAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTINAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
|
||||
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
|
||||
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
|
||||
SIMD_SUFFIX=_${LCSIMD}_sleef
|
||||
)
|
||||
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
|
||||
target_link_libraries(${IUTINAME} ${TARGET_TESTERUTIL_OBJ} ${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
|
||||
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTINAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTINAME})
|
||||
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
endif(SLEEF_ENABLE_TESTER)
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
set(TESTER4_SRC tester4simd.cpp ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
|
||||
string(CONCAT TARGET_TESTER4_${SIMD} "tester4" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_TESTER4_${SIMD}} ${TESTER4_SRC})
|
||||
target_compile_options(${TARGET_TESTER4_${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_TESTER4_${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_TESTER4_${SIMD}} ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${TARGET_TESTER4_${SIMD}} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
|
||||
add_dependencies(${TARGET_TESTER4_${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_TESTER4_${SIMD}} ${TARGET_LIBSLEEF})
|
||||
add_dependencies(${TARGET_TESTER4_${SIMD}} ext_tlfloat)
|
||||
set_target_properties(${TARGET_TESTER4_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4_${SIMD}})
|
||||
else()
|
||||
add_test_with_emu(1.0 ${TARGET_TESTER4_${SIMD}})
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
string(CONCAT TARGET_TESTER4Y_${SIMD} "tester4y" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_TESTER4Y_${SIMD}} ${TESTER4_SRC})
|
||||
target_compile_options(${TARGET_TESTER4Y_${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_TESTER4Y_${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
target_link_libraries(${TARGET_TESTER4Y_${SIMD}} ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
add_dependencies(${TARGET_TESTER4Y_${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_TESTER4Y_${SIMD}} ${TARGET_LIBSLEEF})
|
||||
add_dependencies(${TARGET_TESTER4Y_${SIMD}} ext_tlfloat)
|
||||
set_target_properties(${TARGET_TESTER4Y_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4Y_${SIMD}})
|
||||
else()
|
||||
add_test_with_emu(1.0 ${TARGET_TESTER4Y_${SIMD}})
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_BUILD_INLINE_HEADERS)
|
||||
string(CONCAT TARGET_TESTER4I_${SIMD} "tester4i" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_TESTER4I_${SIMD}} ${TESTER4_SRC})
|
||||
target_compile_options(${TARGET_TESTER4I_${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_link_libraries(${TARGET_TESTER4I_${SIMD}} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
target_compile_options(${TARGET_TESTER4I_${SIMD}} PRIVATE "-Wno-unknown-pragmas")
|
||||
endif()
|
||||
target_compile_definitions(${TARGET_TESTER4I_${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
|
||||
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
|
||||
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
|
||||
SIMD_SUFFIX=_${LCSIMD}_sleef
|
||||
)
|
||||
target_include_directories(${TARGET_TESTER4I_${SIMD}} PRIVATE ${PROJECT_BINARY_DIR}/include)
|
||||
add_dependencies(${TARGET_TESTER4I_${SIMD}} ${TARGET_INLINE_HEADERS})
|
||||
add_dependencies(${TARGET_TESTER4I_${SIMD}} ext_tlfloat)
|
||||
set_target_properties(${TARGET_TESTER4I_${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_with_emu(${COSTOVERRIDE_${SIMD}} ${TARGET_TESTER4I_${SIMD}})
|
||||
else()
|
||||
add_test_with_emu(1.0 ${TARGET_TESTER4I_${SIMD}})
|
||||
endif()
|
||||
endif(SLEEF_BUILD_INLINE_HEADERS)
|
||||
endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
|
||||
#
|
||||
|
||||
if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW)
|
||||
# Build tester2 SIMD
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
foreach(P dp sp)
|
||||
set(T "tester2${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
add_executable(${T} tester2simd${P}.c)
|
||||
if(FORCE_AAVPCS)
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
@ -246,11 +357,11 @@ macro(test_extension SIMD)
|
||||
# testing program for the deterministic version of functions.
|
||||
|
||||
set(T "tester2y${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
add_executable(${T} tester2simd${P}.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
@ -259,13 +370,16 @@ macro(test_extension SIMD)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND)
|
||||
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4)
|
||||
# Build tester3
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
set(T "tester3${SCSIMD}")
|
||||
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}})
|
||||
if (NOT SLEEF_OPENSSL_FOUND)
|
||||
target_compile_definitions(${T} PRIVATE SLEEF_USE_INTERNAL_SHA256=1)
|
||||
endif()
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Enable Vector PCS for Advanced SIMD (if supported)
|
||||
@ -273,8 +387,18 @@ macro(test_extension SIMD)
|
||||
host_target_AAVPCS_definitions(${T})
|
||||
endif()
|
||||
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
if(LIB_MPFR)
|
||||
target_link_libraries(${T} ${LIB_MPFR} ${LIBGMP})
|
||||
endif()
|
||||
if (SLEEF_OPENSSL_FOUND)
|
||||
target_link_libraries(${T} ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
else()
|
||||
target_link_libraries(${T} ${TARGET_PSHA_OBJ})
|
||||
target_include_directories(${T} PRIVATE ${sleef_SOURCE_DIR}/src/common)
|
||||
endif()
|
||||
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
|
||||
@ -371,53 +495,99 @@ endif(ENABLE_GNUABI)
|
||||
#
|
||||
|
||||
if (SLEEF_ARCH_X86)
|
||||
# iutdsp128
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
# iutdsp128
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
|
||||
# iutdsp256
|
||||
add_executable(iutdsp256 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
|
||||
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp256 1.0)
|
||||
list(APPEND IUT_LIST iutdsp256)
|
||||
# iutdsp256
|
||||
add_executable(iutdsp256 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
|
||||
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp256 1.0)
|
||||
list(APPEND IUT_LIST iutdsp256)
|
||||
endif(SLEEF_ENABLE_TESTER)
|
||||
|
||||
if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
# tester4dsp128
|
||||
add_executable(tester4dsp128 ${TESTER4_SRC})
|
||||
target_compile_definitions(tester4dsp128 PRIVATE
|
||||
ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
|
||||
target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat)
|
||||
add_test_with_emu(1.0 tester4dsp128)
|
||||
|
||||
# tester4dsp256
|
||||
add_executable(tester4dsp256 ${TESTER4_SRC})
|
||||
target_compile_definitions(tester4dsp256 PRIVATE
|
||||
ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(tester4dsp256 PRIVATE ${FLAGS_ENABLE_AVX})
|
||||
target_link_libraries(tester4dsp256 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
add_dependencies(tester4dsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat)
|
||||
add_test_with_emu(1.0 tester4dsp256)
|
||||
endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
endif(SLEEF_ARCH_X86)
|
||||
|
||||
if (SLEEF_ARCH_PPC64)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ENABLE_TESTER)
|
||||
|
||||
if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
add_executable(tester4dsp128 ${TESTER4_SRC})
|
||||
target_compile_definitions(tester4dsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_VSX})
|
||||
target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat)
|
||||
add_test_with_emu(1.0 tester4dsp128)
|
||||
endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
endif(SLEEF_ARCH_PPC64)
|
||||
|
||||
if (SLEEF_ARCH_S390X)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ENABLE_TESTER)
|
||||
|
||||
if (SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
add_executable(tester4dsp128 ${TESTER4_SRC})
|
||||
target_compile_definitions(tester4dsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(tester4dsp128 PRIVATE ${FLAGS_ENABLE_VXE})
|
||||
target_link_libraries(tester4dsp128 ${TARGET_LIBSLEEF} ${TLFLOAT_LIBRARIES} ${TARGET_TESTERUTIL_OBJ})
|
||||
add_dependencies(tester4dsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ext_tlfloat)
|
||||
add_test_with_emu(1.0 tester4dsp128)
|
||||
endif(SLEEF_ENABLE_TESTER4 AND TLFLOAT_LIBRARIES)
|
||||
endif(SLEEF_ARCH_S390X)
|
||||
|
||||
if(SLEEF_BUILD_SCALAR_LIB)
|
||||
# Compile executable 'iutscalar'
|
||||
add_executable(iutscalar iut.c testerutil.c)
|
||||
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(iutscalar 1.0)
|
||||
list(APPEND IUT_LIST iutscalar)
|
||||
if (SLEEF_ENABLE_TESTER)
|
||||
# Compile executable 'iutscalar'
|
||||
add_executable(iutscalar iut.c)
|
||||
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBRT} ${TARGET_TESTERUTIL_OBJ} ${LIBM})
|
||||
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(iutscalar 1.0)
|
||||
list(APPEND IUT_LIST iutscalar)
|
||||
endif(SLEEF_ENABLE_TESTER)
|
||||
endif()
|
||||
|
||||
if(LIB_MPFR AND NOT MINGW)
|
||||
@ -433,7 +603,7 @@ if(LIB_MPFR AND NOT MINGW)
|
||||
endif()
|
||||
foreach(P ${PRECISIONS})
|
||||
set(T "tester2${P}")
|
||||
add_executable(${T} tester2${P}.c testerutil.c)
|
||||
add_executable(${T} tester2${P}.c)
|
||||
target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (FORCE_AAVPCS)
|
||||
@ -442,15 +612,15 @@ if(LIB_MPFR AND NOT MINGW)
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${TARGET_QTESTERUTIL_OBJ} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
endforeach()
|
||||
|
||||
# Compile executable 'tester'
|
||||
add_host_executable(${TARGET_TESTER} tester.c testerutil.c)
|
||||
add_host_executable(${TARGET_TESTER} tester.c)
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP})
|
||||
target_link_libraries(${TARGET_TESTER} ${TARGET_LIBSLEEF} ${TARGET_TESTERUTIL_OBJ} ${LIB_MPFR} ${LIBGMP} ${LIBM})
|
||||
target_compile_definitions(${TARGET_TESTER}
|
||||
PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result)
|
||||
@ -512,6 +682,3 @@ if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_
|
||||
add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2")
|
||||
add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2")
|
||||
endif()
|
||||
|
||||
# Tests depends on the library
|
||||
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -118,148 +118,148 @@ typedef svint32_t vint2;
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0)
|
||||
|
||||
#else /******************** MASKED_GNUABI *****************************/
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0)
|
||||
|
||||
#endif /* MASKED_GNUABI */
|
||||
|
||||
@ -1,129 +1,129 @@
|
||||
sin u35 bc50dfbcbd8ef534541d1babe90860c7
|
||||
sin u10 dbc2cf81f292ef50fa0119e222c6c9f9
|
||||
cos u35 506e34a809b80ad3603ed46ba2a574b0
|
||||
cos u10 a0f69df5937152b8f8f0e671f3676289
|
||||
tan u35 970b5cd7f0e05defa22ebb155ab61a40
|
||||
tan u10 5fd08e0552e3ab853439bf5fd2bd344d
|
||||
sincos u10 7c164edcaa45988f6165b653fc76c495
|
||||
sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 c95484de57c167da3d8d6d1baadf9ffa
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 1cd6d7f194a5e8364191497adc5c5cec
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 13692a48edf2cf7a3e047b16ddfb7b81
|
||||
exp2 u10 436146f8d6dcaa4a754837108a9aa3e1
|
||||
exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373
|
||||
exp10 u10 9d704b310f683872a6446cfc97726a4d
|
||||
exp10 u35 bc07745ebc22a7ee97679154c24b23cc
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 a0ea63b27d33262346a35c9439741075
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 73daa306764e208aab1627ac110b10d7
|
||||
cbrt u35 c29b7bf200215425b4ba948c8cc94c42
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 5194e0a554174a6145511ce3df9c1f46
|
||||
asin u10 86c061caec3fa2e1bc71bda4dad29f4c
|
||||
asin u35 31303b88bdc00206265002d6cc5e89e4
|
||||
acos u10 0a1a403590f2ac8364f132b334920945
|
||||
acos u35 493f960c1cce57931d95a5a22a0587a3
|
||||
atan u10 c97624a24ec034cc0c8985acb61d13cd
|
||||
atan u10 0be0f550406923016cfeb5ef62c25b15
|
||||
atan u35 9d6d83e066b5a4851d44771418c9948c
|
||||
atan u35 f32c1aa4caa08c6945afd1125ba8b113
|
||||
atan2 u10 6b1d9d25fcd96053acc19d1633fab36a
|
||||
atan2 u35 afb07894347062a96dab705b34eb1763
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7
|
||||
erf u10 f4ae148b59bb7501d8f5746300850376
|
||||
erfc u15 5e116a4316dafa742769f71e18f6f9fe
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 833d845950b9cbb025629fe4c040f8f6
|
||||
sinf u10 9c21afa4d7d6af3fc666309c3cd647fe
|
||||
cosf u35 74d7f871a6553cd0019087895e2052ad
|
||||
cosf u10 35349e94c323c1614f22093959288010
|
||||
tanf u35 bbb7c092d017e96d2454a38a20687735
|
||||
tanf u10 227423bc04f42d76a8f68082ba696126
|
||||
sincosf u10 83ecc4e3d5295056e9d8c52bc196b666
|
||||
sincosf u35 533319caa49a961e4909bd6dcab40721
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 cec15ed76a358091632634166fa77b66
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 af2fbe4bfa2caaf59c734e3749dd15be
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 ba32ebaa8c470899ebd433d190c00f03
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 350fc4f13502b36bb1107e1b1122acb1
|
||||
expf u10 ee4adaabefa3fac6c0f1925b2a948eea
|
||||
exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d
|
||||
exp2f u35 522cc30f722f77fceb07015830b351a3
|
||||
exp10f u10 b0564be151965600f5744ff2e4992bc9
|
||||
exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 a7cba3239c87969662e8b41a4dd8b4ab
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5
|
||||
cbrtf u35 2aca0404626a28f7af7f60105ad6e217
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 a6f0f774b346a6bba08889ff9ba3f193
|
||||
asinf u10 7f77f7453b961512c89e87e49c549cfe
|
||||
asinf u35 22ed8760aa328e1f714031eec592a4d8
|
||||
acosf u10 15617dd0429b90e59d2923415934c2a6
|
||||
acosf u35 af0b132d9e263721f9296187dbf9b9bf
|
||||
atanf u10 26b77fb423104b45633cf24500237d6e
|
||||
atanf u10 4313d0bc2708de53f74d804aac6564d4
|
||||
atanf u35 97a1797897955643c722c7d291987331
|
||||
atanf u35 7d3f47169415058e8578f11d899bfd10
|
||||
atan2f u10 098a33f730fe95ce4774a991db4cee14
|
||||
atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363
|
||||
sinhf u10 0780a2f57df3a831718195d1ee5c19ef
|
||||
coshf u10 cfbb6aed408e43a7b7f053474100ff2d
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 260d129221468a86bbfd609c27bfea6a
|
||||
acoshf u10 24ced7e5631c78b20a5716faeedbaa92
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 f3a8d25c852068622bdfcae4cb813583
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 915ab9830de89a5a504b3ce7cd2fecda
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c
|
||||
fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2
|
||||
fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2
|
||||
sin u35 7ddf50bfc76c34f8640e1d48368a4807046ed09a7cd9f4e092364c0ece567420
|
||||
sin u10 2dec8ff3f5d3f0601ee7d5d8cda65777b3b31d86f522b1306cf50d0a7820bdba
|
||||
cos u35 26a6889b13864c87e41500246afd02ec626529b122a1622ab5b4d915342fd981
|
||||
cos u10 094594b432e3f6f7695f21a9eac5f48adfc2b52729a0b7f6dcc73d56572896d4
|
||||
tan u35 9e4884d3079d52edb120d080ae609bc94dea6de36b91f9c41f7a69fb424cb7bf
|
||||
tan u10 ae386240aec3b3ce4b7d5a13b1f69759f54fc57378439b9801c65de4e7c8f5c6
|
||||
sincos u10 dccd728b97586cd65da3998eb225c3b59634b360acb56ea74d1d45d61fea4f4e
|
||||
sincos u35 2c16ec6ba4050808419fd5b9c995606412a0fd41f2a7e109c1a8cab5adf0b11b
|
||||
sincospi u05 9fffb591dd38190f8dd61d0f9dcaf7843606d4c3f6717bfac9835471178600a4
|
||||
sincospi u35 b362c2f22c2475715d0933caa5ee1400ae1639da9e60c83eeca676e3b2be12d7
|
||||
log u10 a25704431659d3f451536556bd81a2b9c2abc82203e23539df2ecd899436a9e2
|
||||
log u35 83476779543cb9f3a038e478e8fee0d6ee0060227a2433363d221d71ddc72ac7
|
||||
log2 u10 bf2467410af2c29e30ebf509bc066759c17b31fc409120382898a6979fbbad2e
|
||||
log2 u35 2d416462682e561a2bab83d5b11ea235cfb991675e3777fa50da75d755b08774
|
||||
log10 u10 1aa2fb18c8ae9a19f8f9be331f72cb3f842188b705d73e86bde47ecf661297cd
|
||||
log1p u10 e21e7518e09b85f0adaf1d0d3cff362364e925fd07aa3163d77b818cb644d942
|
||||
exp u10 c21df57b84d8c9010aae562e21daf7b1c3f7df277db9cff2999d74bfb517e60d
|
||||
exp2 u10 451209f52083f022f30793abcf7761eae138642bf8d5a252ca8c83489088bff3
|
||||
exp2 u35 0661d1afebb47f2755e97337d6b065cf925219aba48e192b9fbb56f696f17d84
|
||||
exp10 u10 9881cd7b6c7c2eeb7b8b5d297277d1d0f4276ea74835672a94fbcade8e604d34
|
||||
exp10 u35 5a8d99078d3ca904dad9fc3ac4ec7c90d2bcd216417022dcb38df30293e1cdf5
|
||||
expm1 u10 609ae579ed99b4c8ff7ccaead9c3a2216bfbc1d156dc05a6b401de066b0a079c
|
||||
pow u10 a0034cc77ecd21a809265f76e67528217357f2ef3d2883ff017512f92bbf9360
|
||||
cbrt u10 e128b321cd05dca403a7b0633424cad82600ceb5b61966f70ff3cf425bd6b3f9
|
||||
cbrt u10 b722d767ae6dd66d3d1dfa9d5d2aedaed3c652020dab5fcfdd729b3f2c803e98
|
||||
cbrt u35 5ecd857b96a17ecf71808a53416e0f40d0935f236e307dd5e43587b12db375cb
|
||||
cbrt u35 c46da13b1a71174922de04a844b1b303ac5fd2d0da98a6352b234292cf7e42e9
|
||||
hypot u05 9f4275e06e1ce269722162c4bc521f159906a448ee05f9619037706cd3e54b72
|
||||
hypot u35 de0c1ae1ea4c9eda164e0dca28c293cc72caf3b12b2d15f757bbb4bb347f257b
|
||||
asin u10 c51e0211bc0a1a422982df89d38f48ef0b0af1d90588a1715fd4ce966c701b66
|
||||
asin u35 405410e624265daa84c0837c55ccf2d45d8c4f6086b6f6a744c4c6e133cbcc1d
|
||||
acos u10 8e8c6e984110c0decc1ce21bf71505195f029a935064bc3692997b400cb15edc
|
||||
acos u35 bc99071767af3d4bf23c3d828284a6950ae205898a6b3773a5aca0b59d6d6a0d
|
||||
atan u10 c96690351d5df7745fed2004b1c72dc7aceaa32c4d400f296c32efc9ecddab0e
|
||||
atan u10 9f64e9a576084542e1fa4a4064055af79b4ae20ced35ca617c4327a30a4a70e4
|
||||
atan u35 a0852efacaa91625350cf104f8fe0dcbb5936d2b9ebbd3cf8cd6234ccaf8a0d3
|
||||
atan u35 e61f1f4917e474cbc7ca5ada17c31bdece04c6a86210a472c53cf5e8faeac882
|
||||
atan2 u10 9b6c9b875a9c841259fca8d718778a1895a5b434ab4b95d284c4345249c2f853
|
||||
atan2 u35 895dfae0dbce6c2aff81b986ebc732fb0323b267f57c7b1e0d5c8ec522da6af4
|
||||
sinh u10 d3859e3dc1ca924f11dc7b464cb0bb535d4ad71d1ec6f416a82db6e0e2390367
|
||||
cosh u10 e6fd1172e97fa9341028299dd8a00379f1313170b8444a6a3c291230e4f178b7
|
||||
tanh u10 5e2c1ce9d160d1a5dcc5ef8fd74f860751764f5dc14124075f848074ee386618
|
||||
asinh u10 37d0df9811cc871b1dde4d762cc0eb53ec6c71c7bcf13100b9b5302ba1a85b99
|
||||
acosh u10 158fb84af679aea2ab411fb84cd0b12ff876d897722ff84c54fa567c35705033
|
||||
atanh u10 32253ae4f643e56a3d25a6d96d316ed94cd3a9e5ea16ad7180ff96e68571dc34
|
||||
lgamma u10 4663f72dcb58a53bedefe071de51f0fccb9b73db12f5b53d5acea347d4de06cd
|
||||
tgamma u10 87e21460a2a991b677416b39a85d391051e4327a39baa7bfb93f2e27965567af
|
||||
erf u10 56488fa7013635a233d05787e9a681c1c8775b6d9aace07f0d1dd16fc34c5875
|
||||
erfc u15 0e5e1126a0eb4cce30f6cb164b33330ac4d792c21b8bfbe33cc9a828b4f9f047
|
||||
fabs ff336faed535e34a082752839c9e957ba069ffdf0b046215bd415ce9120f29a3
|
||||
copysign 67a7a162bfc2f15b76ded0470f938ab000edf8f8566d5a19fa99d4ea4d29fff3
|
||||
fmax 57f39d5440fadb2a7387a47c00b067d5fc57ceabd7e5d64943b033acb5212063
|
||||
fmin 87e131762ec9c46badd6105ab66f09d99d65776e2719f6af9befd8d6d3f59b6b
|
||||
fdim 3331d6a17f289f54d429bdda9374d7d2574e0cd173e930a57436e8e484f271e9
|
||||
fmod 89d26af516be177c55ba9fcec972416c35e229456b053271548021e9b070c193
|
||||
remainder 2db01bb12776ec14d4a15469c31b49e759d74a3c8ed30d14fe88af3b27b5c398
|
||||
modf 7780d1e6448f21bec6504e398a4e826f304da10aaec3c4e210bed86abdaecedf
|
||||
nextafter 60a6c07477f6d07cd938ba6361d020175193a934a2714132615dae0bcedf785a
|
||||
trunc ee43b2f9d897428885cb039f85259ea5ffe4efbfe4bf0dba16ee19829d198ac6
|
||||
floor 29f8be9b8ad5795e65ed4f34878a85f5f8a1be707489345c4ad04b36d4da54bf
|
||||
ceil bf267441867b261f8dcfca61b55fdc7ac0ff7a017b150da1b532776894962208
|
||||
round 5d7d57a50d9860a7d145d428884df0341564dec7f14c24d5c319c8bce5565f9d
|
||||
rint 834f8e41e3a28f43b26bc9a5836882cbc0fceeaec5774202cb6df473d995f5a1
|
||||
sinf u35 0b91688d57e650a50dff113cae51be6088e067e877baf0fc50675528432d1539
|
||||
sinf u10 d6ccd197ac5534b74a04340e62e38fc5ec9fb1cbffef80fb1782e659a1832260
|
||||
cosf u35 c5d48802983d4673bf3961453a3b02f13b894b83144f067d93b1d804de722aa2
|
||||
cosf u10 420ba2e57ee0bae63e995ffb85aac07a5f1758d76f824d24193f75af349fca8c
|
||||
tanf u35 ec5bcbe8a93d2a5f59365656ba15a10af2f24375bf265663f762730674a656b9
|
||||
tanf u10 2d4c53018daf572ce2e20fc7bbe1435b04746db6b0cee9c33304cef94f14dcde
|
||||
sincosf u10 b0390e1d3554fd469d53d5e45146e9e1f440d46fc0a9b8f9ea334071af369f55
|
||||
sincosf u35 c4967d888e7713ff231c3fa3372a0d89c5df220585054156256bc3d4f0917f3a
|
||||
sincospif u05 66ccd831fa4c215b71cc791f3d0cb31babeadd34539867df8029cddf45539ded
|
||||
sincospif u35 9fadd97cd2996c6601079869248a59772bbd5b23b625177ef0351120f0759fc2
|
||||
logf u10 a43f52f3ce728ebd9ec9e2e84c901f6012fe0d6b83029c8380036404f59cd3ea
|
||||
logf u35 fd05264b52e29af9f0907b98af57f0cc0737b506a6290c259d3eff92123add86
|
||||
log2f u10 c732f1b5c7f5147d1576d4d858db46952d42ec229117dffce8b82e798799d2b6
|
||||
log2f u35 d2e637436e49d04e7747258946075b715033e925ca589696b4577a4f96632a9b
|
||||
log10f u10 c616f9465c071c42532255e9a49ba4305e0a588fc8d87ac31fceb30d2c59391e
|
||||
log1pf u10 fbfce7374fd3e030b5678fa31e99bba2aa4e68e60e8eeb15a10e41fb34ed1cda
|
||||
expf u10 d75ce19c93fb038cfdd8059f816a7912481b26f7d90cbd554545f21a0b873861
|
||||
exp2f u10 4a579f3f572362629acd563e55d765a7d83cbc625584f26e0a36163e80bffe87
|
||||
exp2f u35 90c3bb433051b828f081de99c3d3e1d731a718de306d0c9937478f2b57e981ce
|
||||
exp10f u10 57856cab0911b80ebeeded0c30b9e978ca6d17314ca2e7522c02ff6b6e904f57
|
||||
exp10f u35 e14dfb56cd4798e675b751c6cd4ddc073e9a5e8f59a97638bc8a9b766f564a96
|
||||
expm1f u10 c0066ace0274e83dfce6b6f806ad89ef4c8b0919011477934d43c88dc42e0db2
|
||||
powf u10 b380319c0b9bad2cf717f8c31a09361b869d49c1e58ee5e1f0b987f96e3acffa
|
||||
cbrtf u10 3589ce3bce26b796ddc4c6ec177cdb0ed05ece414530f4c22c77452b37432050
|
||||
cbrtf u10 cfe7b512f728e60f99e14f597d34c94279b96e6cc897fc5ad1377365afb164c8
|
||||
cbrtf u35 57902935bad6d5f45565d447e82ac2fd673442b8fb01fa178079376ff1220b27
|
||||
cbrtf u35 172785fb38220b147078c16b7b203edf4e879f853e335522074ae0103cddc472
|
||||
hypotf u05 efc46c07c1bff7caf4f1d52fbe0db4ab70100601c114acea3f4ecf7b2aeaf826
|
||||
hypotf u35 f09d3b29f563e599ea2d5e6434ff84de3e72ae277fce5055ee2bbf9ce6aa4214
|
||||
asinf u10 82e645be1e4e8216be262cf67eac586a8d8a0e962ae5d34cb14c55ad177883d7
|
||||
asinf u35 1010918bc615b794d532b8643b60a315f2bc8e2248020b4a6024ffbd593c54b1
|
||||
acosf u10 886eb790a1d46f29fe04d470a1e71ee565951d22383cfd67eca92d3f3437db6a
|
||||
acosf u35 75ebefc2d532049af4234e3247b311782aa60a776c53d669956f578e5b2e76cd
|
||||
atanf u10 540a69391b28afe8d067cc99ac86abbffe08bb3c24f8962be4b7aef0677562de
|
||||
atanf u10 2c12f291846249ca41d6a9c4108bd93a6b30246ef776bc282ad8cbb9e6c05890
|
||||
atanf u35 d8d7c1156fd61d138ccb88d435097be739c7bf4806ff605c0d39216380b55e96
|
||||
atanf u35 6985b58dddf827aa610029c51aaa204952589175efb607e2b135a1dc666b3fca
|
||||
atan2f u10 7756cae9e0b7ebe7e5180f9714e49c6403ead4182ebacbb89dc0cb3cc386e998
|
||||
atan2f u35 a645f681b04876451d8f0de0dd28958303b2b7f3b51957883b09588776111ddf
|
||||
sinhf u10 d8094aaed987d20b0c4e8eccb63ed5cc00f4ad8bf46c67888f5ab87c21b15681
|
||||
coshf u10 26d59cb9ec0a6f5965dfe66df3f89fd2bb348ce75f811ee580426df42f1ebdc3
|
||||
tanhf u10 3b715185ce7c39d70ff17dceb539380b8ac9c80303c9796e41d1ebda6f2b0ece
|
||||
asinhf u10 281dfc8d6f3a9cad40276392b21e48d14ae05986d9a97ce21cf122adf5d14ce0
|
||||
acoshf u10 9a5809171d6a8c4a3e39fd32a71d5dd83d7a55ae8c2c352dc453e59b01c4a42a
|
||||
atanhf u10 66540cd17454f09a95ef5adbeef6e9413ca31cb0446edc879447838f7b8c079c
|
||||
lgammaf u10 b26a90f8b782f2a91132d5c12dcd56d749e301bf51e275085df2c4579639fa44
|
||||
tgammaf u10 e3ba7f95b002555d655e07e8906d29e0f867c28c3abe6513d32c20468cdce05c
|
||||
erff u10 81041541f31e72a7745d6fca4b208d4e332af8fc2366df5372b6cb38755369c9
|
||||
erfcf u15 88205a29a679f22867bf078202e68f2a8f5557780f0b8366db2f0f20c1e23151
|
||||
fabsf 560d13e463bce4448d733798c5818b13e5634f893211047bc2fad9f4c613797d
|
||||
copysignf 74f7af06376f4c79d7af9ce4e50bde2fb8f22b56d741bdc67624ef7d1989e76b
|
||||
fmaxf 7474be750857fe400beb2bc14fd1b1113a2e365ae7b45b0acc508436b4c32a94
|
||||
fminf 4e22c453645f3c108c27e2c2fea65cd6a6b535f8236fd7382ff1082db3b31b5a
|
||||
fdimf 16c17ff31778c7d63ec7f65c3b2a8561b79be62b1bc1b399ac0ffc43285b6cd0
|
||||
fmodf 9fa4ece68b16803e6c47fa5cb280f8c246a2ef5731b0609bec71e1db27906f0a
|
||||
remainderf f32761a428b1336051ee773e470b74ada93a611cafbc08f6a9aff36957c84f64
|
||||
modff e976f223f2f4d380e9955392cb010920d5439665ae8eaf0fd6abbc889700a4f7
|
||||
nextafterf ac05b3fd824c3ce73eb3946c7e3dec94ce4b1ec4efd0237bcfb4578d3d422cc6
|
||||
truncf a014cd8206fb15c21b1cc773b951cf7f673e8be9e6e697ca0cf7293becb3d55c
|
||||
floorf f17658ab95f73a1b4cfec0417e82b1b071cb97a3aac0110e289ba6449b2aeb55
|
||||
ceilf e173e35cc97a85629ceb5025bd5b1abad52e4e153166cbf9672396b4ca23b59a
|
||||
roundf 3e67b087b019c806d87593850bf0cd106718cf34f50684784296ef040f301fc0
|
||||
rintf 827acf1e7d253c4fe9fdd4c5c9e53f35c80af5550ff6cbbb7aaac67577630c70
|
||||
fastsinf u3500 6c68502acd4bde521daad91a0947faea0bd4b15c8e1d8adf4614351eca60f7dd
|
||||
fastcosf u3500 64cb4ab04eca2de35df084ac4c3c7285553301474357783f96ee6467e21f9144
|
||||
fastpowf u3500 a908509f84693183aabb532aef9c26f42e340bd0a0253d1e40cab44358c6b76a
|
||||
|
||||
@ -1,129 +1,129 @@
|
||||
sin u35 c163e4a7e9ccebb2181dcc8653367d8c
|
||||
sin u10 0d6bf6f2c935db82588222da95659019
|
||||
cos u35 52f902bd939d751b5b544ac70181fcff
|
||||
cos u10 afcdba92a75a76d56b8cf2f22d4bec9e
|
||||
tan u35 906cc42b6755fe514c5e185fcb4d2f55
|
||||
tan u10 c98f29a62067fa63646d9bcc29a310c6
|
||||
sincos u10 3fe37f4eb805505152f2b14a22a9f94e
|
||||
sincos u35 95a7b7f48c71febf10ec6eff796dd391
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 015f8ae899c9b921d48919dd12ef19a9
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 908b1949db34ea855944f00089b21e23
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 084e5be89c2ad03e356078ea4f287bab
|
||||
exp2 u10 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp2 u35 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e
|
||||
exp10 u35 6904d5509ca794747aa249c13886f90f
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 7e19796027d7c1d1999be948f90e6181
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 fc7ee3e3e6c54365d708b752c242a947
|
||||
cbrt u35 2408714a56d74f8c82389ca6772cdbc1
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 be7bbd41dffd746b70261ee773cbd4b2
|
||||
asin u10 8a21b7c28cdaffc9d3e53f415367932e
|
||||
asin u35 9c9e8107782898e9faed6924ad1b3cb1
|
||||
acos u10 28261e4eb8331865660c814676d5c6bc
|
||||
acos u35 310911130bfc45b10dabe3a072939331
|
||||
atan u10 f931de72f2f6a7928f307a8a382ae255
|
||||
atan u10 453f9ef62f58f9829320baf482a1d457
|
||||
atan u35 6161b6189609f105b017d8768d0a41f1
|
||||
atan u35 6face71d8d93c69448d49ed6140e361d
|
||||
atan2 u10 469babaeee9bd30e17af2f473b3ea500
|
||||
atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 cb9a93844ad1713d2ab92ff5b6398150
|
||||
erf u10 8a0bc2146a5c67b6bebc58f4b0076568
|
||||
erfc u15 3e247a54183eeddedc33e99c50118995
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 f8f804eae1d9443103e81fec96293477
|
||||
sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe
|
||||
cosf u35 f2f3d1c9f090cde9c02439608dc7066e
|
||||
cosf u10 dc35f27fae65f63f0aa6ad241f8b387b
|
||||
tanf u35 68d42ad1fb412e6b8be3853461e61213
|
||||
tanf u10 97df301d4f59e67d5318b5356b703f06
|
||||
sincosf u10 a97124d810ec461c135dc4fb0c059b6f
|
||||
sincosf u35 0cc521e52ae1227d311012c2919c1ff2
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 8720757f221c00cc8de24b7dc4949144
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 b6234302d534d6ccd48155dd6b9a4293
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 74174c90717c86642b71284452a8aef6
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475
|
||||
expf u10 9597388315e4b3e89c4c97ce46374dcf
|
||||
exp2f u10 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp2f u35 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp10f u10 954f0824b6d949d0da03b49950dc6642
|
||||
exp10f u35 6fb0e9a829e12a06679d379d05b53ede
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 2ed84af40d03e307a620365f172d010d
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 6c22a6dc132c5212250970f22f42256d
|
||||
cbrtf u35 5ab696ae11f9637413d30e6496d5324b
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 2a7cd97768287084b7fffc7e9fb39072
|
||||
asinf u10 e2e571a01984c4ffb3f6e38e0328d90e
|
||||
asinf u35 70df2dfc3a3569868cce60c38e7b1962
|
||||
acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb
|
||||
acosf u35 72b0e2f9791f90f1c43570b9e9ba893f
|
||||
atanf u10 fa672e387a204055f735b7af98dd8a35
|
||||
atanf u10 d017670c13bc221b68bc9ee5f41c4b5e
|
||||
atanf u35 f592e46eaa5d29583f86d3e336f20b6b
|
||||
atanf u35 e7087fe40de46921826b373d10c40954
|
||||
atan2f u10 275b2fa8ee554c45551bb142db9f8197
|
||||
atan2f u35 44b187851195d24bab2561eb8f4ff5d0
|
||||
sinhf u10 45bc228a14c3e39eeb35e9764394a23e
|
||||
coshf u10 838d441e85d415ef4fb1e5c5ea966a71
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 927eeb621a3e2d5039f1a07fcf150901
|
||||
acoshf u10 932520013273174fcabe2be4a55f919f
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 c3059747811d98846f74a63d3747ac3d
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 687a9c577512d349ddbc0643013d2c56
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 5c48081c74cd0316379b580b047dbfc2
|
||||
fastcosf u3500 6f73d116f109283e5632c31f5988f55b
|
||||
fastpowf u3500 6dbb3110412df4fed5a71f50d40def89
|
||||
sin u35 c0c8e53bd8762032e30a6e843131ee80bcb7c6acd3fb299e937be6add5a8d5aa
|
||||
sin u10 6692fc59b029f7b11a511c21ff2a5e7c01c8b76bcfce80357878b0ac8dc42b29
|
||||
cos u35 5096992132d8ea8ffdf32f0193b6c6dfa5700bbb64a278ec2e7e5ddf4d0ccd51
|
||||
cos u10 bb8942ccdf1c86289f2ab560033d38f39b37bcb87d0a2f646f71a9521456e905
|
||||
tan u35 334507c35c29da824184f60c8318d3d0cab6ec91291768794936a0fd1caa08f3
|
||||
tan u10 48006a954a296162fe7232ffeb33e602ac54bbf38e2764ab65ea2717f53b7906
|
||||
sincos u10 042262aeafa5774345a43d75e0aca41d4e8e591ba86a35fb113e9f41c1b1b198
|
||||
sincos u35 628ebb6a27b6eacff75deddf301f06ec517dde8ba4566f84d765775d4d2cd8d1
|
||||
sincospi u05 9fffb591dd38190f8dd61d0f9dcaf7843606d4c3f6717bfac9835471178600a4
|
||||
sincospi u35 b362c2f22c2475715d0933caa5ee1400ae1639da9e60c83eeca676e3b2be12d7
|
||||
log u10 a25704431659d3f451536556bd81a2b9c2abc82203e23539df2ecd899436a9e2
|
||||
log u35 b47e57b1afc82b14211b9f3338f41208771b7d971774cf535e9e9bcdb6327db5
|
||||
log2 u10 bf2467410af2c29e30ebf509bc066759c17b31fc409120382898a6979fbbad2e
|
||||
log2 u35 61cdc83d0e7de8d132764065fc7ba47bc18dadac441938d7bb0550c18b27956b
|
||||
log10 u10 1aa2fb18c8ae9a19f8f9be331f72cb3f842188b705d73e86bde47ecf661297cd
|
||||
log1p u10 e21e7518e09b85f0adaf1d0d3cff362364e925fd07aa3163d77b818cb644d942
|
||||
exp u10 c7997af9618cab09736d7736614dfe6541c6417b75894474c02849e25c5eb6a4
|
||||
exp2 u10 43ca5b299c5ef8d38c7ea3594e8925f00ff7dda62788f0ed003ffac026f4aaa4
|
||||
exp2 u35 43ca5b299c5ef8d38c7ea3594e8925f00ff7dda62788f0ed003ffac026f4aaa4
|
||||
exp10 u10 b9d8ea0a1bffa2097c84ea57752a00e71e12b0454ced6ce40a56c0d62a05c2f0
|
||||
exp10 u35 9dd4096b0f0907112a7051e4cd0f8b93f4e56403224f5cb5e0e1a3601b55fc14
|
||||
expm1 u10 609ae579ed99b4c8ff7ccaead9c3a2216bfbc1d156dc05a6b401de066b0a079c
|
||||
pow u10 74772c3583d5579f1b28fd322048a40c286595057df623ec65028a9647f7bf46
|
||||
cbrt u10 e128b321cd05dca403a7b0633424cad82600ceb5b61966f70ff3cf425bd6b3f9
|
||||
cbrt u10 b722d767ae6dd66d3d1dfa9d5d2aedaed3c652020dab5fcfdd729b3f2c803e98
|
||||
cbrt u35 96d1ef3aa862044af5cb0ee7fe62e161b61fbb9ab50549925b5f4bc8c1450106
|
||||
cbrt u35 3d648e8f0e56d75a4765d3fe4ba58578dde6576199dce8a920d4fc74f3fd2077
|
||||
hypot u05 9f4275e06e1ce269722162c4bc521f159906a448ee05f9619037706cd3e54b72
|
||||
hypot u35 0473b61c7dd7a4e6a8394bbafdc613f4e1d8eac704830dbc6257ee8f85601149
|
||||
asin u10 7c466883cd3b6055bff9f8f13e2a8eff00de053f428f88b169fcb18b85f5859e
|
||||
asin u35 cf291432912ad68a37dccb92882199e11d382b402794d72bf78d467a40ba6911
|
||||
acos u10 31f80b277ac9dbedb9f4397fa058b11e3e2497adb5ad8dca3055b18bd071b2d4
|
||||
acos u35 6025e6a4a64608b06709ba1eda3da1a3a697344c27dc1be50aeecb722aed5837
|
||||
atan u10 561fe325ecfbe2ed5b3761da5f43886ba4081566e12b793f02fb105f57d74cd7
|
||||
atan u10 6f8ded4d8fba9461e3df9faf8924499424d5910b4e3d7829573efc4b088316e1
|
||||
atan u35 9408d2aa734a6b0c0bc1c80f4ad34e2b3dacb5eae623366deaa2cc2b9454499f
|
||||
atan u35 c03ad6398c6992d946f89ff389fcd548be3bd9cb4fd0a1613f686a5a1ea1f0dc
|
||||
atan2 u10 a3bcea5507555b07f1128585312e7772532dd414dd21588a95405188e4af6af6
|
||||
atan2 u35 4cdbd13d36484ca540eb04d8854674103107aada4deb662d49dfdae9aa3eb7ca
|
||||
sinh u10 d3859e3dc1ca924f11dc7b464cb0bb535d4ad71d1ec6f416a82db6e0e2390367
|
||||
cosh u10 e6fd1172e97fa9341028299dd8a00379f1313170b8444a6a3c291230e4f178b7
|
||||
tanh u10 5e2c1ce9d160d1a5dcc5ef8fd74f860751764f5dc14124075f848074ee386618
|
||||
asinh u10 37d0df9811cc871b1dde4d762cc0eb53ec6c71c7bcf13100b9b5302ba1a85b99
|
||||
acosh u10 158fb84af679aea2ab411fb84cd0b12ff876d897722ff84c54fa567c35705033
|
||||
atanh u10 32253ae4f643e56a3d25a6d96d316ed94cd3a9e5ea16ad7180ff96e68571dc34
|
||||
lgamma u10 4663f72dcb58a53bedefe071de51f0fccb9b73db12f5b53d5acea347d4de06cd
|
||||
tgamma u10 ae094d163ce1ccaf94f5146ce3b147f76a886fee2758c8735328304bbb514b42
|
||||
erf u10 73867031c0df90a5d060040cd160c7fe14fa6fc0c46104959e574ab6efdd67f7
|
||||
erfc u15 4632ba9c10e73c7bbb32adf163d48d4cd90aa0c3314de4a7878953da08433f4d
|
||||
fabs ff336faed535e34a082752839c9e957ba069ffdf0b046215bd415ce9120f29a3
|
||||
copysign 67a7a162bfc2f15b76ded0470f938ab000edf8f8566d5a19fa99d4ea4d29fff3
|
||||
fmax 57f39d5440fadb2a7387a47c00b067d5fc57ceabd7e5d64943b033acb5212063
|
||||
fmin 87e131762ec9c46badd6105ab66f09d99d65776e2719f6af9befd8d6d3f59b6b
|
||||
fdim 3331d6a17f289f54d429bdda9374d7d2574e0cd173e930a57436e8e484f271e9
|
||||
fmod 89d26af516be177c55ba9fcec972416c35e229456b053271548021e9b070c193
|
||||
remainder 2db01bb12776ec14d4a15469c31b49e759d74a3c8ed30d14fe88af3b27b5c398
|
||||
modf 7780d1e6448f21bec6504e398a4e826f304da10aaec3c4e210bed86abdaecedf
|
||||
nextafter 60a6c07477f6d07cd938ba6361d020175193a934a2714132615dae0bcedf785a
|
||||
trunc ee43b2f9d897428885cb039f85259ea5ffe4efbfe4bf0dba16ee19829d198ac6
|
||||
floor 29f8be9b8ad5795e65ed4f34878a85f5f8a1be707489345c4ad04b36d4da54bf
|
||||
ceil bf267441867b261f8dcfca61b55fdc7ac0ff7a017b150da1b532776894962208
|
||||
round 5d7d57a50d9860a7d145d428884df0341564dec7f14c24d5c319c8bce5565f9d
|
||||
rint 834f8e41e3a28f43b26bc9a5836882cbc0fceeaec5774202cb6df473d995f5a1
|
||||
sinf u35 5667c75091aaa7f6cad0b8e1ff80c5470cb5bfcbeb37ca089597a42bb89d21f9
|
||||
sinf u10 4749c75d58eb24a83df44f86cfc204cd49b00a84472a592adfa5b0dc6ee5920e
|
||||
cosf u35 c9aa15477ba53c5d4816a63ebca00123ebe9798374b7f93001478baf01f42393
|
||||
cosf u10 8a8cc7609d7afacff4ff1a075784ad32d891567eb6dcc6ab115b0421c3985359
|
||||
tanf u35 f7c53052860fa55f44e2fe63af8af15eade5e94951637634ebc5d0ee3c56dd6a
|
||||
tanf u10 4dcccb3f2c42cf20d9cfa5b5602d86d8242d4d080cfa4f00321333e338cfb9ad
|
||||
sincosf u10 3643081262b2d43ccedd509daca5d16fb66449aa1774a645a5b1343d4682c81b
|
||||
sincosf u35 e02f3f1d2848c047d30ad1d89adeab6a9b0aef211fa0d8cd6613a43170e4e0fe
|
||||
sincospif u05 66ccd831fa4c215b71cc791f3d0cb31babeadd34539867df8029cddf45539ded
|
||||
sincospif u35 c2a92e1892c9ca12031896177e0dd898cb22b5b8305b42754b1a834485189c9b
|
||||
logf u10 a43f52f3ce728ebd9ec9e2e84c901f6012fe0d6b83029c8380036404f59cd3ea
|
||||
logf u35 68ef65827671b86d1fc77d8cb734c49e4c211bfb35990c84a4bbdec6026d8b4d
|
||||
log2f u10 c732f1b5c7f5147d1576d4d858db46952d42ec229117dffce8b82e798799d2b6
|
||||
log2f u35 529ca0ddf923543e938ad3663ad572b9addc586e7f1398c13dcde257b3bd65d1
|
||||
log10f u10 c616f9465c071c42532255e9a49ba4305e0a588fc8d87ac31fceb30d2c59391e
|
||||
log1pf u10 384577af7f24c0ff0abf3a574bf21e348bceb60a7a26b3a7006b7f1fa7032049
|
||||
expf u10 1554f1b37125fdf5cf7e516415a04df7547be47dd89d262d24519c0a092593a7
|
||||
exp2f u10 374572349c0d64862128a5f7e27555d5f7a2768ec20d52cfc73b2dd608128542
|
||||
exp2f u35 374572349c0d64862128a5f7e27555d5f7a2768ec20d52cfc73b2dd608128542
|
||||
exp10f u10 240f4207fcca7934627f058b87b2d935a0d5733123a61efa0cee45ed38af6d7b
|
||||
exp10f u35 3806645d79d1e6ce3cb56f1d1d95689d835e54061b647c8ca8d8c0cb7eb19c97
|
||||
expm1f u10 c0066ace0274e83dfce6b6f806ad89ef4c8b0919011477934d43c88dc42e0db2
|
||||
powf u10 d370c629e456bed37684cff089d3f04dbe110d8ea0ba40e5e4f49abf9d874134
|
||||
cbrtf u10 3589ce3bce26b796ddc4c6ec177cdb0ed05ece414530f4c22c77452b37432050
|
||||
cbrtf u10 cfe7b512f728e60f99e14f597d34c94279b96e6cc897fc5ad1377365afb164c8
|
||||
cbrtf u35 30fa2b571dec71ccd9f31607bc26c591036ced33e0ceaf038042e6a162b1ddba
|
||||
cbrtf u35 a0ee4a56fbe28cc4c922188397c10456a0dd54bc31c54b0bd2cfffc7c5626dba
|
||||
hypotf u05 efc46c07c1bff7caf4f1d52fbe0db4ab70100601c114acea3f4ecf7b2aeaf826
|
||||
hypotf u35 e2e71c42bba52629c44960938d5b9961387aff15d92126799dff5e08f351b1e4
|
||||
asinf u10 151d448af3ece5f8b2b1775b375cc3260895ac76042814d30bcf156f368d3d45
|
||||
asinf u35 2daf25858c2c889ec4b3920ac12b00d7a1494f35f2abb36a3c7daabad99b751f
|
||||
acosf u10 d4ea707c8f340c6580ed68072d92065abd8942272fdc048cc0318b02e6d312a4
|
||||
acosf u35 a7a7a0a8e081e8ef26610c118afc1b7e60b8c6577ca644f49b0aca06f97beb91
|
||||
atanf u10 c5e2e79af3d422f9ac9424afda4eab64c17ab80903305b3a281580c997a86055
|
||||
atanf u10 ccea76f6a4c4a8941a5259c9c50c6899d71d0bc13948421333c14a604718c31b
|
||||
atanf u35 67f3d2ab58989e4f24d6ac4f7106a58043d6a8d3a749a6308f155237d1c38eee
|
||||
atanf u35 7fbc39fe8698ebd79040c51fbc31356acd27b1988435b96e4191eec8662b27d6
|
||||
atan2f u10 fa56d1cfea9cbec5de469b1768bd660c19bb079361ec861f3ac0604a0acaee64
|
||||
atan2f u35 6ed820eb372024d39c6db25a3242c7cc63c1d416fa3df8e0c68638a979c333f8
|
||||
sinhf u10 18d9bc4d115cc4fb5061fda0e1a6b3aa90bce4fd68aa3000cea10dc94cc907e1
|
||||
coshf u10 fcbdbe1ebd51db181bad96b3aa08aec5b81858925dd676e3dfd04d679863aa2e
|
||||
tanhf u10 3b715185ce7c39d70ff17dceb539380b8ac9c80303c9796e41d1ebda6f2b0ece
|
||||
asinhf u10 1fb7d432a1af3a637e602c9170d73dea5da7e82b57623bfd3b37bbbce1cc9bb1
|
||||
acoshf u10 c01055933edfe7bcb45e5dea7377d2b2960ee61551a63270d9e7a28b76f3daad
|
||||
atanhf u10 66540cd17454f09a95ef5adbeef6e9413ca31cb0446edc879447838f7b8c079c
|
||||
lgammaf u10 b26a90f8b782f2a91132d5c12dcd56d749e301bf51e275085df2c4579639fa44
|
||||
tgammaf u10 2790e8800bd1a29f564fe35ef8463f90b8566968739026c6b04097bbfa536f57
|
||||
erff u10 81041541f31e72a7745d6fca4b208d4e332af8fc2366df5372b6cb38755369c9
|
||||
erfcf u15 e310f5ed2f0c0b32a84280832bffbefec65cc063483497861f3fb684d72f046d
|
||||
fabsf 560d13e463bce4448d733798c5818b13e5634f893211047bc2fad9f4c613797d
|
||||
copysignf 74f7af06376f4c79d7af9ce4e50bde2fb8f22b56d741bdc67624ef7d1989e76b
|
||||
fmaxf 7474be750857fe400beb2bc14fd1b1113a2e365ae7b45b0acc508436b4c32a94
|
||||
fminf 4e22c453645f3c108c27e2c2fea65cd6a6b535f8236fd7382ff1082db3b31b5a
|
||||
fdimf 16c17ff31778c7d63ec7f65c3b2a8561b79be62b1bc1b399ac0ffc43285b6cd0
|
||||
fmodf 9fa4ece68b16803e6c47fa5cb280f8c246a2ef5731b0609bec71e1db27906f0a
|
||||
remainderf f32761a428b1336051ee773e470b74ada93a611cafbc08f6a9aff36957c84f64
|
||||
modff e976f223f2f4d380e9955392cb010920d5439665ae8eaf0fd6abbc889700a4f7
|
||||
nextafterf ac05b3fd824c3ce73eb3946c7e3dec94ce4b1ec4efd0237bcfb4578d3d422cc6
|
||||
truncf a014cd8206fb15c21b1cc773b951cf7f673e8be9e6e697ca0cf7293becb3d55c
|
||||
floorf f17658ab95f73a1b4cfec0417e82b1b071cb97a3aac0110e289ba6449b2aeb55
|
||||
ceilf e173e35cc97a85629ceb5025bd5b1abad52e4e153166cbf9672396b4ca23b59a
|
||||
roundf 3e67b087b019c806d87593850bf0cd106718cf34f50684784296ef040f301fc0
|
||||
rintf 827acf1e7d253c4fe9fdd4c5c9e53f35c80af5550ff6cbbb7aaac67577630c70
|
||||
fastsinf u3500 dbf93ee799553cfb9abf84aaccc458e26113d7d78c4f634db4469bd0d9dd0e19
|
||||
fastcosf u3500 55893f9b416b8876d022d7f960281efbb4f9241fdff0cbb059c2695d4c666d5b
|
||||
fastpowf u3500 30b1aaff8eaad36907f99fd027a34bc06f39ffae218deeae10e399f133e72f8e
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -220,113 +220,113 @@ __global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0)
|
||||
|
||||
//
|
||||
|
||||
#define func_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
#define func_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
funcName<<<1, 1>>>(r, a0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d2_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
funcName<<<1, 1>>>(r2, a0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
#define func_d2_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
funcName<<<1, 1>>>(r2, a0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
*a0 = u2d(u); \
|
||||
*a1 = u2d(v); \
|
||||
funcName<<<1, 1>>>(r, a0, a1); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
#define func_d_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
*a0 = u2d(u); \
|
||||
*a1 = u2d(v); \
|
||||
funcName<<<1, 1>>>(r, a0, a1); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d_d_i(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
*a0 = u2d(u); \
|
||||
#define func_d_d_i(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
*a0 = u2d(u); \
|
||||
*i0 = (int)u2d(v); \
|
||||
funcName<<<1, 1>>>(r, a0, i0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
funcName<<<1, 1>>>(r, a0, i0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%" PRIx64 "\n", d2u(*r)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_i_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
funcName<<<1, 1>>>(i0, a0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%d\n", *i0); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
#define func_i_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
*a0 = u2d(u); \
|
||||
funcName<<<1, 1>>>(i0, a0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%d\n", *i0); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define func_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
*b0 = u2f(u); \
|
||||
#define func_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
*b0 = u2f(u); \
|
||||
funcName<<<1, 1>>>(s, b0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%x\n", f2u(*s)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%x\n", f2u(*s)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_f2_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
*b0 = u2f(u); \
|
||||
funcName<<<1, 1>>>(s2, b0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
#define func_f2_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
*b0 = u2f(u); \
|
||||
funcName<<<1, 1>>>(s2, b0); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%x %x\n", f2u(s2->x), f2u(s2->y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_f_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u, v; \
|
||||
sscanf(buf, funcStr " %x %x", &u, &v); \
|
||||
*b0 = u2f(u); \
|
||||
*b1 = u2f(v); \
|
||||
funcName<<<1, 1>>>(s, b0, b1); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%x\n", f2u(*s)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
#define func_f_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u, v; \
|
||||
sscanf(buf, funcStr " %x %x", &u, &v); \
|
||||
*b0 = u2f(u); \
|
||||
*b1 = u2f(v); \
|
||||
funcName<<<1, 1>>>(s, b0, b1); \
|
||||
cudaDeviceSynchronize(); \
|
||||
printf("%x\n", f2u(*s)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -474,172 +474,172 @@ static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
|
||||
|
||||
//
|
||||
|
||||
#define func_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
double s[VECTLENDP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
#define func_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
double s[VECTLENDP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
int idx = xrand() & (VECTLENDP-1); \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
a = funcName(a); \
|
||||
vstoreu_v_p_vd(s, a); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
a = funcName(a); \
|
||||
vstoreu_v_p_vd(s, a); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d2_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
#define func_d2_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
double s[VECTLENDP], t[VECTLENDP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENDP-1); \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble2 v; \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
v = funcName(a); \
|
||||
vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \
|
||||
vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \
|
||||
Sleef_double2 d2; \
|
||||
d2.x = s[idx]; \
|
||||
d2.y = t[idx]; \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble2 v; \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
v = funcName(a); \
|
||||
vstoreu_v_p_vd(s, vd2getx_vd_vd2(v)); \
|
||||
vstoreu_v_p_vd(t, vd2gety_vd_vd2(v)); \
|
||||
Sleef_double2 d2; \
|
||||
d2.x = s[idx]; \
|
||||
d2.y = t[idx]; \
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
#define func_d_d_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
double s[VECTLENDP], t[VECTLENDP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENDP-1); \
|
||||
s[idx] = u2d(u); \
|
||||
t[idx] = u2d(v); \
|
||||
vdouble a, b; \
|
||||
a = vloadu_vd_p(s); \
|
||||
b = vloadu_vd_p(t); \
|
||||
a = funcName(a, b); \
|
||||
vstoreu_v_p_vd(s, a); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
s[idx] = u2d(u); \
|
||||
t[idx] = u2d(v); \
|
||||
vdouble a, b; \
|
||||
a = vloadu_vd_p(s); \
|
||||
b = vloadu_vd_p(t); \
|
||||
a = funcName(a, b); \
|
||||
vstoreu_v_p_vd(s, a); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_d_d_i(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
double s[VECTLENDP]; \
|
||||
int t[VECTLENDP*2]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
#define func_d_d_i(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u, v; \
|
||||
sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
|
||||
double s[VECTLENDP]; \
|
||||
int t[VECTLENDP*2]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENDP-1); \
|
||||
s[idx] = u2d(u); \
|
||||
t[idx] = (int)u2d(v); \
|
||||
vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
s[idx] = u2d(u); \
|
||||
t[idx] = (int)u2d(v); \
|
||||
vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t))); \
|
||||
u = d2u(s[idx]); \
|
||||
printf("%" PRIx64 "\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_i_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
int i; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
double s[VECTLENDP]; \
|
||||
int t[VECTLENDP*2]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
#define func_i_d(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint64_t u; \
|
||||
int i; \
|
||||
sscanf(buf, funcStr " %" PRIx64, &u); \
|
||||
double s[VECTLENDP]; \
|
||||
int t[VECTLENDP*2]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENDP-1); \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
vint vi = funcName(a); \
|
||||
vstoreu_v_p_vi(t, vi); \
|
||||
i = t[idx]; \
|
||||
s[idx] = u2d(u); \
|
||||
vdouble a = vloadu_vd_p(s); \
|
||||
vint vi = funcName(a); \
|
||||
vstoreu_v_p_vi(t, vi); \
|
||||
i = t[idx]; \
|
||||
printf("%d\n", i); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define func_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
float s[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
#define func_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
float s[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
int idx = xrand() & (VECTLENSP-1); \
|
||||
s[idx] = u2f(u); \
|
||||
s[idx] = u2f(u); \
|
||||
vfloat a = vloadu_vf_p(s); \
|
||||
a = funcName(a); \
|
||||
vstoreu_v_p_vf(s, a); \
|
||||
u = f2u(s[idx]); \
|
||||
a = funcName(a); \
|
||||
vstoreu_v_p_vf(s, a); \
|
||||
u = f2u(s[idx]); \
|
||||
printf("%x\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_f2_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
float s[VECTLENSP], t[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
#define func_f2_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u; \
|
||||
sscanf(buf, funcStr " %x", &u); \
|
||||
float s[VECTLENSP], t[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENSP-1); \
|
||||
s[idx] = u2f(u); \
|
||||
s[idx] = u2f(u); \
|
||||
vfloat2 v; \
|
||||
vfloat a = vloadu_vf_p(s); \
|
||||
v = funcName(a); \
|
||||
vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \
|
||||
vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \
|
||||
Sleef_float2 d2; \
|
||||
d2.x = s[idx]; \
|
||||
d2.y = t[idx]; \
|
||||
printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
v = funcName(a); \
|
||||
vstoreu_v_p_vf(s, vf2getx_vf_vf2(v)); \
|
||||
vstoreu_v_p_vf(t, vf2gety_vf_vf2(v)); \
|
||||
Sleef_float2 d2; \
|
||||
d2.x = s[idx]; \
|
||||
d2.y = t[idx]; \
|
||||
printf("%x %x\n", f2u(d2.x), f2u(d2.y)); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define func_f_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u, v; \
|
||||
sscanf(buf, funcStr " %x %x", &u, &v); \
|
||||
float s[VECTLENSP], t[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
#define func_f_f_f(funcStr, funcName) { \
|
||||
while (startsWith(buf, funcStr " ")) { \
|
||||
uint32_t u, v; \
|
||||
sscanf(buf, funcStr " %x %x", &u, &v); \
|
||||
float s[VECTLENSP], t[VECTLENSP]; \
|
||||
memrand(s, sizeof(s)); \
|
||||
memrand(t, sizeof(t)); \
|
||||
int idx = xrand() & (VECTLENSP-1); \
|
||||
s[idx] = u2f(u); \
|
||||
t[idx] = u2f(v); \
|
||||
vfloat a, b; \
|
||||
a = vloadu_vf_p(s); \
|
||||
b = vloadu_vf_p(t); \
|
||||
a = funcName(a, b); \
|
||||
vstoreu_v_p_vf(s, a); \
|
||||
u = f2u(s[idx]); \
|
||||
s[idx] = u2f(u); \
|
||||
t[idx] = u2f(v); \
|
||||
vfloat a, b; \
|
||||
a = vloadu_vf_p(s); \
|
||||
b = vloadu_vf_p(t); \
|
||||
a = funcName(a, b); \
|
||||
vstoreu_v_p_vf(s, a); \
|
||||
u = f2u(s[idx]); \
|
||||
printf("%x\n", u); \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
fflush(stdout); \
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
|
||||
} \
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2025.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
@ -89,37 +89,37 @@ void startChild(const char *path, char *const argv[]) {
|
||||
|
||||
//
|
||||
|
||||
#define child_d_d(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint64_t u; \
|
||||
sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
#define child_d_d(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint64_t u; \
|
||||
sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%" PRIx64, &u); \
|
||||
return u2d(u); \
|
||||
return u2d(u); \
|
||||
} while(0)
|
||||
|
||||
#define child_d2_d(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint64_t u, v; \
|
||||
sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
#define child_d2_d(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint64_t u, v; \
|
||||
sprintf(str, funcStr " %" PRIx64 "\n", d2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%" PRIx64 " %" PRIx64, &u, &v); \
|
||||
Sleef_double2 ret; \
|
||||
ret.x = u2d(u); \
|
||||
ret.y = u2d(v); \
|
||||
return ret; \
|
||||
Sleef_double2 ret; \
|
||||
ret.x = u2d(u); \
|
||||
ret.y = u2d(v); \
|
||||
return ret; \
|
||||
} while(0)
|
||||
|
||||
#define child_d_d_d(funcStr, arg1, arg2) do { \
|
||||
char str[256]; \
|
||||
uint64_t u; \
|
||||
#define child_d_d_d(funcStr, arg1, arg2) do { \
|
||||
char str[256]; \
|
||||
uint64_t u; \
|
||||
sprintf(str, funcStr " %" PRIx64 " %" PRIx64 "\n", d2u(arg1), d2u(arg2)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%" PRIx64, &u); \
|
||||
return u2d(u); \
|
||||
return u2d(u); \
|
||||
} while(0)
|
||||
|
||||
double child_sin(double x) { child_d_d("sin", x); }
|
||||
@ -224,37 +224,37 @@ int child_ilogb(double x) {
|
||||
|
||||
//
|
||||
|
||||
#define child_f_f(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint32_t u; \
|
||||
sprintf(str, funcStr " %x\n", f2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
#define child_f_f(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint32_t u; \
|
||||
sprintf(str, funcStr " %x\n", f2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%x", &u); \
|
||||
return u2f(u); \
|
||||
sscanf(str, "%x", &u); \
|
||||
return u2f(u); \
|
||||
} while(0)
|
||||
|
||||
#define child_f2_f(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint32_t u, v; \
|
||||
sprintf(str, funcStr " %x\n", f2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
#define child_f2_f(funcStr, arg) do { \
|
||||
char str[256]; \
|
||||
uint32_t u, v; \
|
||||
sprintf(str, funcStr " %x\n", f2u(arg)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%x %x", &u, &v); \
|
||||
Sleef_float2 ret; \
|
||||
ret.x = u2f(u); \
|
||||
ret.y = u2f(v); \
|
||||
return ret; \
|
||||
sscanf(str, "%x %x", &u, &v); \
|
||||
Sleef_float2 ret; \
|
||||
ret.x = u2f(u); \
|
||||
ret.y = u2f(v); \
|
||||
return ret; \
|
||||
} while(0)
|
||||
|
||||
#define child_f_f_f(funcStr, arg1, arg2) do { \
|
||||
char str[256]; \
|
||||
uint32_t u; \
|
||||
sprintf(str, funcStr " %x %x\n", f2u(arg1), f2u(arg2)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
#define child_f_f_f(funcStr, arg1, arg2) do { \
|
||||
char str[256]; \
|
||||
uint32_t u; \
|
||||
sprintf(str, funcStr " %x %x\n", f2u(arg1), f2u(arg2)); \
|
||||
write(ptoc[1], str, strlen(str)); \
|
||||
if (fgets(str, 255, fpctop) == NULL) stop("child " funcStr); \
|
||||
sscanf(str, "%x", &u); \
|
||||
return u2f(u); \
|
||||
sscanf(str, "%x", &u); \
|
||||
return u2f(u); \
|
||||
} while(0)
|
||||
|
||||
float child_sinf(float x) { child_f_f("sinf", x); }
|
||||
@ -1142,62 +1142,62 @@ void do_test() {
|
||||
|
||||
//
|
||||
|
||||
#define cmpDenorm_f(mpfrFunc, childFunc, argx) do { \
|
||||
#define cmpDenorm_f(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
(float)flushToZero(argx), childFunc((float)flushToZero(argx)), flushToZero(mpfr_get_d(frc, GMP_RNDN))); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormNR_f(mpfrFunc, childFunc, argx) do { \
|
||||
#define cmpDenormNR_f(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (!cmpDenormsp(childFunc((float)flushToZero(argx)), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
(float)flushToZero(argx), childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenorm_f_f(mpfrFunc, childFunc, argx, argy) do { \
|
||||
#define cmpDenorm_f_f(mpfrFunc, childFunc, argx, argy) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfr_set_d(fry, (float)flushToZero(argy), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (!cmpDenormsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, %.20g, test = %.20g, correct = %.20g\n", \
|
||||
(float)flushToZero(argx), (float)flushToZero(argy), childFunc((float)flushToZero(argx), (float)flushToZero(argy)), mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormX_f(mpfrFunc, childFunc, argx) do { \
|
||||
#define cmpDenormX_f(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (!cmpDenormsp(d2.x, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (!cmpDenormsp(d2.x, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
(float)flushToZero(argx), d2.x, mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormY_f(mpfrFunc, childFunc, argx) do { \
|
||||
#define cmpDenormY_f(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (!cmpDenormsp(d2.y, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (!cmpDenormsp(d2.y, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", \
|
||||
(float)flushToZero(argx), d2.y, mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
@ -2157,57 +2157,57 @@ void do_test() {
|
||||
|
||||
//
|
||||
|
||||
#define cmpDenorm_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (!cmpDenormdp(childFunc(argx), frc)) { \
|
||||
#define cmpDenorm_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (!cmpDenormdp(childFunc(argx), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormNR_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (!cmpDenormdp(childFunc(argx), frc)) { \
|
||||
#define cmpDenormNR_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (!cmpDenormdp(childFunc(argx), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenorm_d_d(mpfrFunc, childFunc, argx, argy) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfr_set_d(fry, argy, GMP_RNDN); \
|
||||
#define cmpDenorm_d_d(mpfrFunc, childFunc, argx, argy) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfr_set_d(fry, argy, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (!cmpDenormdp(childFunc(argx, argy), frc)) { \
|
||||
if (!cmpDenormdp(childFunc(argx, argy), frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, %.20g, test = %.20g, correct = %.20g\n", argx, argy, childFunc(argx, argy), mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormX_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (!cmpDenormdp(d2.x, frc)) { \
|
||||
#define cmpDenormX_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (!cmpDenormdp(d2.x, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, d2.x, mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define cmpDenormY_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (!cmpDenormdp(d2.y, frc)) { \
|
||||
#define cmpDenormY_d(mpfrFunc, childFunc, argx) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (!cmpDenormdp(d2.y, frc)) { \
|
||||
fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", argx, d2.y, mpfr_get_d(frc, GMP_RNDN)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
@ -3435,58 +3435,58 @@ void do_test() {
|
||||
|
||||
//
|
||||
|
||||
#define checkAccuracy_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (countULPdp(childFunc(argx), frc) > bound) { \
|
||||
#define checkAccuracy_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (countULPdp(childFunc(argx), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyNR_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (countULPdp(childFunc(argx), frc) > bound) { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (countULPdp(childFunc(argx), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, childFunc(argx), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracy_d_d(mpfrFunc, childFunc, argx, argy, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfr_set_d(fry, argy, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (countULPdp(childFunc(argx, argy), frc) > bound) { \
|
||||
#define checkAccuracy_d_d(mpfrFunc, childFunc, argx, argy, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfr_set_d(fry, argy, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (countULPdp(childFunc(argx, argy), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \
|
||||
argx, argy, childFunc(argx, argy), mpfr_get_d(frc, GMP_RNDN), countULPdp(childFunc(argx, argy), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyX_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
#define checkAccuracyX_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (countULPdp(d2.x, frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, d2.x, mpfr_get_d(frc, GMP_RNDN), countULPdp(d2.x, frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyY_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
#define checkAccuracyY_d(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, argx, GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_double2 d2 = childFunc(argx); \
|
||||
if (countULPdp(d2.y, frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", argx, d2.y, mpfr_get_d(frc, GMP_RNDN), countULPdp(d2.y, frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
@ -3903,6 +3903,8 @@ void do_test() {
|
||||
fprintf(stderr, "exp : ");
|
||||
for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0);
|
||||
for(d = -1000;d < 1000 && success;d += 1.1) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0);
|
||||
// Test for early or late overflow, e.g before or after x = LOG_DBL_MAX
|
||||
for(d = LOG_DBL_MAX - 0.0001;(d < LOG_DBL_MAX + 0.0001) && success;d += 0.00001) checkAccuracy_d(mpfr_exp, child_exp, d, 1.0);
|
||||
showResult(success);
|
||||
|
||||
//
|
||||
@ -3914,6 +3916,8 @@ void do_test() {
|
||||
}
|
||||
}
|
||||
for(y = -1000;y < 1000 && success;y += 0.1) checkAccuracy_d_d(mpfr_pow, child_pow, 2.1, y, 1.0);
|
||||
// Test for early or late overflow (test limited to x = e)
|
||||
for(d = LOG_DBL_MAX - 0.0001;(d < LOG_DBL_MAX + 0.0001) && success;d += 0.00001) checkAccuracy_d_d(mpfr_pow, child_pow, exp(1.0), d, 1.0);
|
||||
showResult(success);
|
||||
|
||||
//
|
||||
@ -4141,6 +4145,7 @@ void do_test() {
|
||||
|
||||
fprintf(stderr, "log1p : ");
|
||||
for(d = 0.0001;d < 10 && success;d += 0.001) checkAccuracy_d(mpfr_log1p, child_log1p, d, 1.0);
|
||||
for(d = 1.0e+307;d < DBL_MAX && success;d += 1.0e+306) checkAccuracy_d(mpfr_log1p, child_log1p, d, 1.0);
|
||||
showResult(success);
|
||||
|
||||
//
|
||||
@ -4222,73 +4227,73 @@ void do_test() {
|
||||
|
||||
//
|
||||
|
||||
#define checkAccuracy_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
#define checkAccuracy_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \
|
||||
(float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyNR_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \
|
||||
mpfrFunc(frc, frx); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx)), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \
|
||||
(float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracy_f_f(mpfrFunc, childFunc, argx, argy, bound) do { \
|
||||
#define checkAccuracy_f_f(mpfrFunc, childFunc, argx, argy, bound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfr_set_d(fry, (float)flushToZero(argy), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc) > bound) { \
|
||||
mpfrFunc(frc, frx, fry, GMP_RNDN); \
|
||||
if (countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", \
|
||||
(float)flushToZero(argx), (float)flushToZero(argy), childFunc((float)flushToZero(argx), (float)flushToZero(argy)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx), (float)flushToZero(argy)), frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyX_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
#define checkAccuracyX_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (countULPsp(d2.x, frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", (float)flushToZero(argx), (double)d2.x, mpfr_get_d(frc, GMP_RNDN), countULPsp(d2.x, frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracyY_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
#define checkAccuracyY_f(mpfrFunc, childFunc, argx, bound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
Sleef_float2 d2 = childFunc((float)flushToZero(argx)); \
|
||||
if (countULPsp(d2.y, frc) > bound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf\n", (float)flushToZero(argx), (double)d2.y, mpfr_get_d(frc, GMP_RNDN), countULPsp(d2.y, frc)); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define checkAccuracy2_f(mpfrFunc, childFunc, argx, bound, abound) do { \
|
||||
#define checkAccuracy2_f(mpfrFunc, childFunc, argx, bound, abound) do { \
|
||||
mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
double t = childFunc((float)flushToZero(argx)); \
|
||||
double ae = fabs(mpfr_get_d(frc, GMP_RNDN) - t); \
|
||||
if (countULPsp(t, frc) > bound && ae > abound) { \
|
||||
mpfrFunc(frc, frx, GMP_RNDN); \
|
||||
double t = childFunc((float)flushToZero(argx)); \
|
||||
double ae = fabs(mpfr_get_d(frc, GMP_RNDN) - t); \
|
||||
if (countULPsp(t, frc) > bound && ae > abound) { \
|
||||
fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf, abserror = %g\n", \
|
||||
(float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc), ae); \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
success = 0; \
|
||||
break; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
@ -4825,6 +4830,8 @@ void do_test() {
|
||||
fprintf(stderr, "atanf : ");
|
||||
for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_f(mpfr_atan, child_atanf, d, 3.5);
|
||||
for(d = -10000;d < 10000 && success;d += 2.1) checkAccuracy_f(mpfr_atan, child_atanf, d, 3.5);
|
||||
checkAccuracy_f(mpfr_atan, child_atanf, +INFINITY, 3.5);
|
||||
checkAccuracy_f(mpfr_atan, child_atanf, -INFINITY, 3.5);
|
||||
showResult(success);
|
||||
|
||||
//
|
||||
@ -5012,6 +5019,7 @@ void do_test() {
|
||||
|
||||
fprintf(stderr, "log1pf : ");
|
||||
for(d = 0.0001;d < 10 && success;d += 0.001) checkAccuracy_f(mpfr_log1p, child_log1pf, d, 1.0);
|
||||
for(d = 1.0e+38;d < FLT_MAX && success;d += 1.0e+37) checkAccuracy_f(mpfr_log1p, child_log1pf, d, 1.0);
|
||||
showResult(success);
|
||||
|
||||
//
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user