8349582: APX NDD code generation for OpenJDK

Reviewed-by: epeter, jbhateja, sviswanathan
2026-01-28 12:09:14 +00:00 · 2025-03-24 16:44:26 +00:00 · 2025-03-24 16:44:26 +00:00 · c87e1be052
commit c87e1be052
parent 7d1fe0e03f
5 changed files with 3575 additions and 1609 deletions
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@ -332,6 +332,21 @@ void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
  }
 }

+void Assembler::emit_arith_ndd(int op1, int op2, Register dst, int32_t imm32) {
+  assert(isByte(op1) && isByte(op2), "wrong opcode");
+  assert(op1 == 0x81, "Unexpected opcode");
+  // This code cache friendly optimization saves 3 bytes per encoding, which offsets the EVEX encoding penalty.
+  if (is8bit(imm32)) {
+    emit_int24(op1 | 0x02,        // set sign bit
+               op2 | encode(dst),
+               imm32 & 0xFF);
+  }
+  else {
+    emit_int16(op1, (op2 | encode(dst)));
+    emit_int32(imm32);
+  }
+}
+
 // Force generation of a 4 byte immediate value even if it fits into 8bit
 void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
@ -1461,7 +1476,7 @@ void Assembler::addl(Register dst, int32_t imm32) {
 void Assembler::eaddl(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  (void) evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xC0, src, imm32);
+  emit_arith_ndd(0x81, 0xC0, src, imm32);
 }

 void Assembler::addl(Register dst, Address src) {
@ -1695,7 +1710,7 @@ void Assembler::andl(Register dst, int32_t imm32) {
 void Assembler::eandl(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  (void) evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xE0, src, imm32);
+  emit_arith_ndd(0x81, 0xE0, src, imm32);
 }

 void Assembler::andl(Address dst, Register src) {
@ -4532,7 +4547,7 @@ void Assembler::orl(Register dst, int32_t imm32) {
 void Assembler::eorl(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xC8, src, imm32);
+  emit_arith_ndd(0x81, 0xC8, src, imm32);
 }

 void Assembler::orl(Register dst, Address src) {
@ -7171,7 +7186,7 @@ void Assembler::subl(Register dst, int32_t imm32) {
 void Assembler::esubl(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  (void) evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xE8, src, imm32);
+  emit_arith_ndd(0x81, 0xE8, src, imm32);
 }

 // Force generation of a 4 byte immediate value even if it fits into 8bit
@ -7512,7 +7527,7 @@ void Assembler::xorl(Register dst, int32_t imm32) {
 void Assembler::exorl(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xF0, src, imm32);
+  emit_arith_ndd(0x81, 0xF0, src, imm32);
 }

 void Assembler::xorl(Register dst, Address src) {
@ -15158,7 +15173,7 @@ void Assembler::addq(Register dst, int32_t imm32) {
 void Assembler::eaddq(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  (void) evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xC0, src, imm32);
+  emit_arith_ndd(0x81, 0xC0, src, imm32);
 }

 void Assembler::addq(Register dst, Address src) {
@ -15255,7 +15270,7 @@ void Assembler::andq(Register dst, int32_t imm32) {
 void Assembler::eandq(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xE0, src, imm32);
+  emit_arith_ndd(0x81, 0xE0, src, imm32);
 }

 void Assembler::andq(Register dst, Address src) {
@ -16142,7 +16157,7 @@ void Assembler::orq(Register dst, int32_t imm32) {
 void Assembler::eorq(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xC8, src, imm32);
+  emit_arith_ndd(0x81, 0xC8, src, imm32);
 }

 void Assembler::orq_imm32(Register dst, int32_t imm32) {
@ -16830,7 +16845,7 @@ void Assembler::subq(Register dst, int32_t imm32) {
 void Assembler::esubq(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  (void) evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xE8, src, imm32);
+  emit_arith_ndd(0x81, 0xE8, src, imm32);
 }

 // Force generation of a 4 byte immediate value even if it fits into 8bit
@ -16961,7 +16976,7 @@ void Assembler::xorq(Register dst, int32_t imm32) {
 void Assembler::exorq(Register dst, Register src, int32_t imm32, bool no_flags) {
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  evex_prefix_and_encode_ndd(0, dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C, &attributes, no_flags);
-  emit_arith(0x81, 0xF0, src, imm32);
+  emit_arith_ndd(0x81, 0xF0, src, imm32);
 }

 void Assembler::xorq(Address dst, int32_t imm32) {
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@ -823,6 +823,7 @@ private:
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
+  void emit_arith_ndd(int op1, int op2, Register dst, int32_t imm32);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
  void emit_arith(int op1, int op2, Register dst, Register src);
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
--- a/test/hotspot/gtest/x86/asmtest.out.h
+++ b/test/hotspot/gtest/x86/asmtest.out.h
--- a/test/hotspot/gtest/x86/x86-asmtest.py
+++ b/test/hotspot/gtest/x86/x86-asmtest.py
@ -1,4 +1,4 @@
-# Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@ -57,7 +57,7 @@ shift_rot_ops = {'sarl', 'sarq', 'sall', 'salq', 'shll', 'shlq', 'shrl', 'shrq',

 registers_mapping = {
    # skip rax, rsi, rdi, rsp, rbp as they have special encodings
-    # 'rax': {64: 'rax', 32: 'eax', 16: 'ax', 8: 'al'},
+    'rax': {64: 'rax', 32: 'eax', 16: 'ax', 8: 'al'},
    'rcx': {64: 'rcx', 32: 'ecx', 16: 'cx', 8: 'cl'},
    'rdx': {64: 'rdx', 32: 'edx', 16: 'dx', 8: 'dl'},
    'rbx': {64: 'rbx', 32: 'ebx', 16: 'bx', 8: 'bl'},
@ -415,7 +415,7 @@ class RegRegRegImmNddInstruction(NFInstruction):
        self.imm = Immediate().generate(imm)
        self.generate_operands(self.reg1, self.reg2, self.reg3, self.imm)

-test_regs = list(registers_mapping.keys())
+test_regs = [key for key in registers_mapping.keys() if key != 'rax']

 immediates32 = [2 ** i for i in range(0, 32, 4)]
 immediates16 = [2 ** i for i in range(0, 16, 2)]
@ -624,6 +624,14 @@ def generate(RegOp, ops, print_lp64_flag=True, full_set=False):
                instr = RegOp(*op, reg1=test_reg1, reg2=test_reg2, imm=imm)
                print_instruction(instr, lp64_flag, print_lp64_flag)

+                # additional tests with rax as destination
+                if RegOp in [RegRegImmNddInstruction]:
+                    test_reg1 = 'rax'
+                    test_reg2 = random.choice(test_regs)
+                    lp64_flag = handle_lp64_flag(lp64_flag, print_lp64_flag, test_reg1, test_reg2)
+                    instr = RegOp(*op, reg1=test_reg1, reg2=test_reg2, imm=imm)
+                    print_instruction(instr, lp64_flag, print_lp64_flag)
+
        elif RegOp in [RegMemImmInstruction, RegMemImmNddInstruction]:
            if full_set:
                imm_list = get_immediate_list(op_name, width)
@ -1466,4 +1474,4 @@ if __name__ == "__main__":
    print("// END  Generated code -- do not edit")

    for f in ["x86ops.s", "x86ops.o", "x86ops.bin"]:
-        os.remove(f)
+        os.remove(f)