Bhavana Kilambi 513b1f1d82 AArch64 SVE implementation for Arrays.sort
This patch adds an SVE implementation of primitive array sorting
(Arrays.sort()) on AArch64 systems that support SVE. On non-SVE machines,
we fall back to the existing Java implementation.

For smaller arrays (length <= 64), we use insertion sort;
for larger arrays we use an SVE-vectorized quicksort partitioner
followed by an odd-even transposition cleanup pass.

The SVE path is enabled by default for the int type.
For the float type, it is available through the experimental flag:

-XX:+UnlockExperimentalVMOptions -XX:+UseSVELibSimdSortForFP

When this flag is not enabled (it is disabled by default), floats are
sorted by the default Java implementation.

The float path is gated because of regressions observed at some small and
medium sizes. On larger arrays, the SVE float path shows up to 1.47x speedup
on Neoverse V2 and 2.12x on Neoverse V1.

Following are the performance numbers for ArraysSort JMH benchmark -

Case A: Ratio between the score of the master branch and the score of this
patch with the UseSVELibSimdSortForFP flag disabled (which is the default).
Case B: Ratio between the score of the master branch and the score of this
patch with the UseSVELibSimdSortForFP flag enabled (the int numbers will be
the same, but this now enables SVE vectorized sorting for floats).

We want the ratios to be >= 1, i.e. on par with or better than the
default Java implementation (master branch).

On Neoverse V1:

Benchmark                       (size)   Mode    Cnt    A       B
ArraysSort.floatParallelSort    10       avgt    3      0.98    0.98
ArraysSort.floatParallelSort    25       avgt    3      1.01    0.83
ArraysSort.floatParallelSort    50       avgt    3      0.99    0.55
ArraysSort.floatParallelSort    75       avgt    3      0.99    0.66
ArraysSort.floatParallelSort    100      avgt    3      0.98    0.66
ArraysSort.floatParallelSort    1000     avgt    3      1.00    0.84
ArraysSort.floatParallelSort    10000    avgt    3      1.03    1.52
ArraysSort.floatParallelSort    100000   avgt    3      1.03    1.46
ArraysSort.floatParallelSort    1000000  avgt    3      0.98    1.81
ArraysSort.floatSort            10       avgt    3      1.00    0.98
ArraysSort.floatSort            25       avgt    3      1.00    0.81
ArraysSort.floatSort            50       avgt    3      0.99    0.56
ArraysSort.floatSort            75       avgt    3      0.99    0.65
ArraysSort.floatSort            100      avgt    3      0.98    0.70
ArraysSort.floatSort            1000     avgt    3      0.99    0.84
ArraysSort.floatSort            10000    avgt    3      0.99    1.72
ArraysSort.floatSort            100000   avgt    3      1.00    1.94
ArraysSort.floatSort            1000000  avgt    3      1.00    2.13
ArraysSort.intParallelSort      10       avgt    3      1.08    1.08
ArraysSort.intParallelSort      25       avgt    3      1.04    1.05
ArraysSort.intParallelSort      50       avgt    3      1.29    1.30
ArraysSort.intParallelSort      75       avgt    3      1.16    1.16
ArraysSort.intParallelSort      100      avgt    3      1.07    1.07
ArraysSort.intParallelSort      1000     avgt    3      1.13    1.13
ArraysSort.intParallelSort      10000    avgt    3      1.49    1.38
ArraysSort.intParallelSort      100000   avgt    3      1.64    1.62
ArraysSort.intParallelSort      1000000  avgt    3      2.26    2.27
ArraysSort.intSort              10       avgt    3      1.08    1.08
ArraysSort.intSort              25       avgt    3      1.02    1.02
ArraysSort.intSort              50       avgt    3      1.25    1.25
ArraysSort.intSort              75       avgt    3      1.16    1.20
ArraysSort.intSort              100      avgt    3      1.07    1.07
ArraysSort.intSort              1000     avgt    3      1.12    1.13
ArraysSort.intSort              10000    avgt    3      1.94    1.95
ArraysSort.intSort              100000   avgt    3      1.86    1.86
ArraysSort.intSort              1000000  avgt    3      2.09    2.09
On Neoverse V2:

Benchmark                       (size)   Mode    Cnt    A       B
ArraysSort.floatParallelSort    10       avgt    3      1.02    1.02
ArraysSort.floatParallelSort    25       avgt    3      0.97    0.71
ArraysSort.floatParallelSort    50       avgt    3      0.94    0.65
ArraysSort.floatParallelSort    75       avgt    3      0.96    0.82
ArraysSort.floatParallelSort    100      avgt    3      0.95    0.84
ArraysSort.floatParallelSort    1000     avgt    3      1.01    0.94
ArraysSort.floatParallelSort    10000    avgt    3      1.01    1.25
ArraysSort.floatParallelSort    100000   avgt    3      1.01    1.09
ArraysSort.floatParallelSort    1000000  avgt    3      1.00    1.10
ArraysSort.floatSort            10       avgt    3      1.02    1.00
ArraysSort.floatSort            25       avgt    3      0.99    0.76
ArraysSort.floatSort            50       avgt    3      0.97    0.66
ArraysSort.floatSort            75       avgt    3      1.01    0.83
ArraysSort.floatSort            100      avgt    3      1.00    0.85
ArraysSort.floatSort            1000     avgt    3      0.99    0.93
ArraysSort.floatSort            10000    avgt    3      1.00    1.28
ArraysSort.floatSort            100000   avgt    3      1.00    1.37
ArraysSort.floatSort            1000000  avgt    3      1.00    1.48
ArraysSort.intParallelSort      10       avgt    3      1.05    1.05
ArraysSort.intParallelSort      25       avgt    3      0.99    0.84
ArraysSort.intParallelSort      50       avgt    3      1.03    1.14
ArraysSort.intParallelSort      75       avgt    3      0.91    0.99
ArraysSort.intParallelSort      100      avgt    3      0.98    0.96
ArraysSort.intParallelSort      1000     avgt    3      1.32    1.30
ArraysSort.intParallelSort      10000    avgt    3      1.40    1.40
ArraysSort.intParallelSort      100000   avgt    3      1.00    1.04
ArraysSort.intParallelSort      1000000  avgt    3      1.15    1.14
ArraysSort.intSort              10       avgt    3      1.05    1.05
ArraysSort.intSort              25       avgt    3      1.03    1.03
ArraysSort.intSort              50       avgt    3      1.08    1.14
ArraysSort.intSort              75       avgt    3      0.88    0.98
ArraysSort.intSort              100      avgt    3      1.01    0.99
ArraysSort.intSort              1000     avgt    3      1.3     1.32
ArraysSort.intSort              10000    avgt    3      1.43    1.43
ArraysSort.intSort              100000   avgt    3      1.30    1.30
ArraysSort.intSort              1000000  avgt    3      1.37    1.37
2025-12-05 14:23:12 +00:00

227 lines
8.4 KiB
Plaintext

#
# Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
################################################################################
# Build-system helper makefiles pulled in before anything else in this file.
# NOTE(review): presumably these provide the macros called below
# (FillFindCache, SetupJdkLibrary, ...) -- their definitions are not visible
# here, confirm before reordering.
include CopyFiles.gmk
include LibCommon.gmk
# Prepare the find cache.
# The wildcard expands to the "native" directory found under each immediate
# subdirectory of src/java.base; that list is passed to FillFindCache.
$(call FillFindCache, $(wildcard $(TOPDIR)/src/java.base/*/native))
################################################################################
# Create all the core libraries
# (their definitions live in the included lib/CoreLibraries.gmk, not here)
include lib/CoreLibraries.gmk
################################################################################
## Build libnet
##
## Library "net", compiled at LOW optimization and depending on the JDK
## libraries libjava and libjvm. The remaining parameters are warning
## suppressions keyed by compiler (and, for clang, optionally by OS) and source
## file, plus the per-OS linker flags and system libraries.
################################################################################

$(eval $(call SetupJdkLibrary, BUILD_LIBNET, \
    NAME := net, \
    OPTIMIZATION := LOW, \
    DISABLED_WARNINGS_gcc_NetworkInterface.c := unused-function, \
    DISABLED_WARNINGS_gcc_net_util_md.c := \
        format-nonliteral unused-variable, \
    DISABLED_WARNINGS_clang_NetworkInterface.c := unused-function, \
    DISABLED_WARNINGS_clang_net_util_md.c := \
        format-nonliteral unused-variable, \
    DISABLED_WARNINGS_clang_aix_DefaultProxySelector.c := \
        deprecated-non-prototype, \
    DISABLED_WARNINGS_clang_aix_NetworkInterface.c := gnu-pointer-arith, \
    DISABLED_WARNINGS_microsoft_InetAddress.c := 4244, \
    DISABLED_WARNINGS_microsoft_ResolverConfigurationImpl.c := 4996, \
    LDFLAGS_windows := \
        -delayload:iphlpapi.dll \
        -delayload:secur32.dll \
        -delayload:winhttp.dll, \
    JDK_LIBS := libjava libjvm, \
    LIBS_aix := $(LIBDL), \
    LIBS_linux := $(LIBDL), \
    LIBS_macosx := -framework CoreFoundation -framework CoreServices, \
    LIBS_windows := \
        advapi32.lib delayimp.lib iphlpapi.lib secur32.lib \
        winhttp.lib ws2_32.lib, \
))

# Register the library with this makefile's list of targets.
TARGETS += $(BUILD_LIBNET)
################################################################################
## Build libnio
##
## Library "nio", compiled at HIGH optimization. Headers are additionally
## searched in libnio/ch, libnio/fs and libjvm, and the library depends on the
## JDK libraries libjava and libnet.
################################################################################

$(eval $(call SetupJdkLibrary, BUILD_LIBNIO, \
    NAME := nio, \
    OPTIMIZATION := HIGH, \
    EXTRA_HEADER_DIRS := libnio/ch libnio/fs libjvm, \
    DISABLED_WARNINGS_clang_Net.c := unused-function unused-variable, \
    DISABLED_WARNINGS_clang_UnixNativeDispatcher.c := unused-variable, \
    JDK_LIBS := libjava libnet, \
    LIBS_aix := $(LIBDL), \
    LIBS_linux := $(LIBDL) $(LIBPTHREAD), \
    LIBS_macosx := -framework CoreFoundation -framework CoreServices, \
    LIBS_windows := advapi32.lib mswsock.lib ws2_32.lib, \
))

# Register the library with this makefile's list of targets.
TARGETS += $(BUILD_LIBNIO)
ifeq ($(call isTargetOs, macosx), true)
  ##############################################################################
  ## Build libosxsecurity
  ##
  ## Only defined when the target OS is macosx. Depends on libjava and links
  ## the Objective-C runtime plus the CoreServices, Foundation and Security
  ## frameworks.
  ##############################################################################

  $(eval $(call SetupJdkLibrary, BUILD_LIBOSXSECURITY, \
      NAME := osxsecurity, \
      OPTIMIZATION := LOW, \
      DISABLED_WARNINGS_clang_KeystoreImpl.m := \
          deprecated-declarations \
          unused-but-set-variable \
          unused-variable, \
      JDK_LIBS := libjava, \
      LIBS_macosx := \
          -lobjc \
          -framework CoreServices \
          -framework Foundation \
          -framework Security, \
  ))

  TARGETS += $(BUILD_LIBOSXSECURITY)
endif
ifeq ($(call isTargetOsType, unix), true)
  ##############################################################################
  ## Build libjsig
  ##
  ## Only defined for unix-type target OSes. jsig.c is compiled with
  ## HOTSPOT_VM_DISTRO passed in as a C string literal.
  ##############################################################################

  $(eval $(call SetupJdkLibrary, BUILD_LIBJSIG, \
      NAME := jsig, \
      OPTIMIZATION := LOW, \
      jsig.c_CFLAGS := -DHOTSPOT_VM_DISTRO='"$(HOTSPOT_VM_DISTRO)"', \
      DISABLED_WARNINGS_gcc_jsig.c := unused-but-set-variable, \
      DISABLED_WARNINGS_clang_jsig.c := unused-but-set-variable, \
      LIBS_linux := $(LIBDL), \
      LIBS_aix := $(LIBDL), \
      ONLY_EXPORTED := true, \
  ))

  TARGETS += $(BUILD_LIBJSIG)

  ##############################################################################
  # Create symlinks to libjsig in each JVM variant sub dir
  # (skipped when STATIC_LIBS is true).
  ifneq ($(STATIC_LIBS), true)
    LIB_OUTPUTDIR := $(call FindLibDirForModule, java.base)
    LIBJSIG_NAME := $(LIBRARY_PREFIX)jsig$(SHARED_LIBRARY_SUFFIX)

    # Rule template, instantiated once per JVM variant through $(eval $(call ...)).
    # $1 variant subdir
    #
    # Expansion timing: $1, $(LIB_OUTPUTDIR), $(LIBJSIG_NAME), $(RM) and $(LN)
    # are expanded by $(call); everything written with $$ (MakeDir, $$@, $$(@D),
    # $$(@F)) survives as a single $ and is only expanded when the generated
    # rule's recipe runs. The three recipe lines MUST start with a hard TAB,
    # otherwise the evaluated text is not a valid rule.
    define CreateSymlinks
      # Always symlink from libdir/variant/libjsig.so -> ../libjsig.so.
      $(LIB_OUTPUTDIR)/$1/$(LIBJSIG_NAME): \
          $(LIB_OUTPUTDIR)/$(LIBJSIG_NAME)
	$$(call MakeDir, $$(@D))
	$(RM) $$@
	$(LN) -s ../$$(@F) $$@

      TARGETS += $(LIB_OUTPUTDIR)/$1/$(LIBJSIG_NAME)
    endef

    # The subdir is the same as the variant
    $(foreach v, $(JVM_VARIANTS), $(eval $(call CreateSymlinks,$v)))
  endif
endif
################################################################################
## Build libsyslookup
##
## On Linux the library is linked against LIBDL so that libdl symbols can be
## accessed dynamically; they may be required while resolving some standard
## symbols.
################################################################################

$(eval $(call SetupJdkLibrary, BUILD_LIBSYSLOOKUP, \
    NAME := syslookup, \
    EXTRA_HEADER_DIRS := java.base:libjava, \
    LD_SET_ORIGIN := false, \
    LDFLAGS_aix := -brtl -bexpfull, \
    LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
    LIBS_aix := -ldecNumber $(LIBM), \
    LIBS_linux := $(LIBDL) $(LIBM), \
))

# Register the library with this makefile's list of targets.
TARGETS += $(BUILD_LIBSYSLOOKUP)
ifeq ($(ENABLE_FALLBACK_LINKER), true)
  ##############################################################################
  ## Build libfallbackLinker
  ##
  ## Only defined when ENABLE_FALLBACK_LINKER is true. Compiled with
  ## LIBFFI_CFLAGS and linked with LIBFFI_LIBS (plus ws2_32.lib on Windows).
  ##############################################################################

  $(eval $(call SetupJdkLibrary, BUILD_LIBFALLBACKLINKER, \
      NAME := fallbackLinker, \
      EXTRA_HEADER_DIRS := java.base:libjava, \
      CFLAGS := $(LIBFFI_CFLAGS), \
      DISABLED_WARNINGS_gcc := \
          implicit-function-declaration \
          unused-variable, \
      LIBS := $(LIBFFI_LIBS), \
      LIBS_windows := ws2_32.lib, \
  ))

  TARGETS += $(BUILD_LIBFALLBACKLINKER)
endif
# Parent directory of the libsimdsort sources; each architecture-specific
# definition selects its own subdirectory beneath it via SRC.
SIMDSORT_BASE_DIR := $(TOPDIR)/src/java.base/linux/native/libsimdsort

# All four conditions must hold: target OS is linux, target CPU is x86_64,
# INCLUDE_COMPILER2 is true, and the toolchain type is gcc.
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
  ##############################################################################
  ## Build libsimdsort (x86_64 sources, C++17, linked as C++)
  ##############################################################################

  $(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \
      NAME := simdsort, \
      SRC := $(SIMDSORT_BASE_DIR)/x86, \
      LINK_TYPE := C++, \
      OPTIMIZATION := HIGH, \
      CXXFLAGS := -std=c++17, \
      DISABLED_WARNINGS_gcc := unused-variable, \
      LIBS_linux := $(LIBM), \
  ))

  TARGETS += $(BUILD_LIBSIMD_SORT)
endif
# All four conditions must hold: target OS is linux, target CPU is aarch64,
# INCLUDE_COMPILER2 is true, and the toolchain type is gcc.
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
  ##############################################################################
  ## Build libsimdsort (AArch64 sources)
  ##
  ## Both C and C++ sources are compiled with -march=armv8.2-a+sve so the
  ## compiler accepts SVE code; C++ sources additionally use C++17.
  ##
  ## The parameters follow the same conventions as the other SetupJdkLibrary
  ## calls in this file:
  ##   - LINK_TYPE := C++ selects C++ linking (as the x86_64 libsimdsort does)
  ##     rather than a hand-picked toolchain.
  ##   - Only library-specific flags are listed; the JDK-wide default compiler
  ##     and linker flags and the shared-library origin are not repeated here.
  ##     NOTE(review): this relies on SetupJdkLibrary supplying those defaults,
  ##     as it evidently does for every other library in this file -- confirm.
  ##   - System libraries are named through $(LIBM) / $(LIBDL) instead of
  ##     literal -lm / -ldl; libc needs no explicit -lc.
  ##############################################################################

  $(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \
      NAME := simdsort, \
      LINK_TYPE := C++, \
      OPTIMIZATION := HIGH, \
      SRC := $(SIMDSORT_BASE_DIR)/aarch64, \
      CFLAGS := -march=armv8.2-a+sve, \
      CXXFLAGS := -march=armv8.2-a+sve -std=c++17, \
      DISABLED_WARNINGS_gcc := unused-variable, \
      LIBS_linux := $(LIBM) $(LIBDL), \
  ))

  TARGETS += $(BUILD_LIBSIMD_SORT)
endif
################################################################################