8223347: Integration of Vector API (Incubator)

Co-authored-by: Vivek Deshpande <vdeshpande@openjdk.org>
Co-authored-by: Qi Feng <qfeng@openjdk.org>
Co-authored-by: Ian Graves <igraves@openjdk.org>
Co-authored-by: Jean-Philippe Halimi <jphalimi@openjdk.org>
Co-authored-by: Vladimir Ivanov <vlivanov@openjdk.org>
Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Co-authored-by: Razvan Lupusoru <rlupusoru@openjdk.org>
Co-authored-by: Smita Kamath <svkamath@openjdk.org>
Co-authored-by: Rahul Kandu <rkandu@openjdk.org>
Co-authored-by: Kishor Kharbas <kkharbas@openjdk.org>
Co-authored-by: Eric Liu <Eric.Liu2@arm.com>
Co-authored-by: Aaloan Miftah <someusername3@gmail.com>
Co-authored-by: John R Rose <jrose@openjdk.org>
Co-authored-by: Shravya Rukmannagari <srukmannagar@openjdk.org>
Co-authored-by: Paul Sandoz <psandoz@openjdk.org>
Co-authored-by: Sandhya Viswanathan <sviswanathan@openjdk.org>
Co-authored-by: Lauren Walkowski <lauren.walkowski@arm.com>
Co-authored-by: Yang Zhang <Yang.Zhang@arm.com>
Co-authored-by: Joshua Zhu <jzhu@openjdk.org>
Co-authored-by: Wang Zhuo <wzhuo@openjdk.org>
Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Reviewed-by: erikj, chegar, kvn, darcy, forax, briangoetz, aph, epavlova, coleenp

Parent: 386e7e8b73
Commit: 0c99b19258
@@ -59,6 +59,7 @@ BOOT_MODULES += \
     java.security.sasl \
     java.xml \
     jdk.incubator.foreign \
+    jdk.incubator.vector \
     jdk.internal.vm.ci \
     jdk.jfr \
     jdk.management \
@@ -145,6 +146,7 @@ DOCS_MODULES += \
     jdk.hotspot.agent \
     jdk.httpserver \
     jdk.incubator.jpackage \
+    jdk.incubator.vector \
     jdk.jartool \
     jdk.javadoc \
     jdk.jcmd \
@@ -138,6 +138,7 @@ ifeq ($(call check-jvm-feature, compiler2), true)

  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
    AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
+      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_neon.ad \
      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
    )))
  endif
@@ -106,7 +106,7 @@ public class Spp {
    static final String LNSEP = System.getProperty("line.separator");
    static final String KEY = "([a-zA-Z0-9]+)";
    static final String VAR = "([a-zA-Z0-9_\\-]+)";
-   static final String TEXT = "([a-zA-Z0-9&;,.<>/#() \\?\\[\\]\\$]+)"; // $ -- hack embedded $var$
+   static final String TEXT = "([\\p{Print}&&[^{#:}]]+)";

    static final int GN_NOT = 1;
    static final int GN_KEY = 2;

@@ -140,6 +140,10 @@ public class Spp {
                }
            }
        }
+       if (repl == null) {
+           System.err.println("Error: undefined variable in line " + ln);
+           System.exit(-1);
+       }
        vardef.appendReplacement(buf, repl);
    }
    vardef.appendTail(buf);
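The guard added above turns a silent null substitution into a hard error. A minimal Python sketch of Spp-style $var$ substitution with that guard (the helper name and dictionary are illustrative, not part of the real tool):

    import re, sys

    def substitute(line, ln, variables):
        # Replace each $var$ with its definition; abort on unknown names,
        # mirroring the new repl == null check in Spp.java.
        def repl(m):
            value = variables.get(m.group(1))
            if value is None:
                print("Error: undefined variable in line %d" % ln, file=sys.stderr)
                sys.exit(-1)
            return value
        return re.sub(r"\$([a-zA-Z0-9_\-]+)\$", repl, line)

    print(substitute("abstract class $Type$Vector", 1, {"Type": "Int"}))
    # abstract class IntVector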
@@ -1,4 +1,7 @@
+import os
import random
+import subprocess
+import sys

AARCH64_AS = "as"
AARCH64_OBJDUMP = "objdump"
@@ -129,6 +132,8 @@ class OperandFactory:

    _modes = {'x' : GeneralRegister,
              'w' : GeneralRegister,
+             'b' : FloatRegister,
+             'h' : FloatRegister,
              's' : FloatRegister,
              'd' : FloatRegister,
              'z' : FloatZero,
@@ -198,16 +203,16 @@ class InstructionWithModes(Instruction):
        self.isFloat = (mode == 'd') | (mode == 's')
        if self.isFloat:
            self.isWord = mode != 'd'
            self.asmRegPrefix = ["d", "s"][self.isWord]
        else:
            self.isWord = mode != 'x'
            self.asmRegPrefix = ["x", "w"][self.isWord]

    def name(self):
        return self._name + (self.mode if self.mode != 'x' else '')

    def aname(self):
        return (self._name+mode if (mode == 'b' or mode == 'h')
                else self._name)

class ThreeRegInstruction(Instruction):
@@ -220,17 +225,17 @@ class ThreeRegInstruction(Instruction):

    def cstr(self):
        return (super(ThreeRegInstruction, self).cstr()
                + ('%s, %s, %s'
                   % (self.reg[0],
                      self.reg[1], self.reg[2])))

    def astr(self):
        prefix = self.asmRegPrefix
        return (super(ThreeRegInstruction, self).astr()
                + ('%s, %s, %s'
                   % (self.reg[0].astr(prefix),
                      self.reg[1].astr(prefix), self.reg[2].astr(prefix))))

class FourRegInstruction(ThreeRegInstruction):

    def generate(self):
@@ -241,12 +246,12 @@ class FourRegInstruction(ThreeRegInstruction):

    def cstr(self):
        return (super(FourRegInstruction, self).cstr()
                + (', %s' % self.reg[3]))

    def astr(self):
        prefix = self.asmRegPrefix
        return (super(FourRegInstruction, self).astr()
                + (', %s' % self.reg[3].astr(prefix)))

class TwoRegInstruction(Instruction):

    def generate(self):
@@ -261,17 +266,17 @@ class TwoRegInstruction(Instruction):

    def astr(self):
        prefix = self.asmRegPrefix
        return (super(TwoRegInstruction, self).astr()
                + ('%s, %s'
                   % (self.reg[0].astr(prefix),
                      self.reg[1].astr(prefix))))

class TwoRegImmedInstruction(TwoRegInstruction):

    def generate(self):
        super(TwoRegImmedInstruction, self).generate()
        self.immed = random.randint(0, 1<<11 -1)
        return self

    def cstr(self):
        return (super(TwoRegImmedInstruction, self).cstr()
                + ', %su' % self.immed)
@@ -301,9 +306,9 @@ class ArithOp(ThreeRegInstruction):
        self.kind = ShiftKind().generate()
        self.distance = random.randint(0, (1<<5)-1 if self.isWord else (1<<6)-1)
        return self

    def cstr(self):
        return ('%s, Assembler::%s, %s);'
                % (ThreeRegInstruction.cstr(self),
                   self.kind.cstr(), self.distance))
@@ -314,9 +319,9 @@ class ArithOp(ThreeRegInstruction):
                   self.distance))

class AddSubCarryOp(ThreeRegInstruction):

    def cstr(self):
        return ('%s);'
                % (ThreeRegInstruction.cstr(self)))

class AddSubExtendedOp(ThreeRegInstruction):
@@ -332,76 +337,75 @@ class AddSubExtendedOp(ThreeRegInstruction):

    def cstr(self):
        return (super(AddSubExtendedOp, self).cstr()
                + (", ext::" + AddSubExtendedOp.optNames[self.option]
                   + ", " + str(self.amount) + ");"))

    def astr(self):
        return (super(AddSubExtendedOp, self).astr()
                + (", " + AddSubExtendedOp.optNames[self.option]
                   + " #" + str(self.amount)))

class AddSubImmOp(TwoRegImmedInstruction):

    def cstr(self):
        return super(AddSubImmOp, self).cstr() + ");"

class LogicalImmOp(AddSubImmOp):

    # These tables are legal immediate logical operands
    immediates32 \
        = [0x1, 0x3f, 0x1f0, 0x7e0,
           0x1c00, 0x3ff0, 0x8000, 0x1e000,
           0x3e000, 0x78000, 0xe0000, 0x100000,
           0x1fffe0, 0x3fe000, 0x780000, 0x7ffff8,
           0xff8000, 0x1800180, 0x1fffc00, 0x3c003c0,
           0x3ffff00, 0x7c00000, 0x7fffe00, 0xf000f00,
           0xfffe000, 0x18181818, 0x1ffc0000, 0x1ffffffe,
           0x3f003f00, 0x3fffe000, 0x60006000, 0x7f807f80,
           0x7ffffc00, 0x800001ff, 0x803fffff, 0x9f9f9f9f,
           0xc0000fff, 0xc0c0c0c0, 0xe0000000, 0xe003e003,
           0xe3ffffff, 0xf0000fff, 0xf0f0f0f0, 0xf80000ff,
           0xf83ff83f, 0xfc00007f, 0xfc1fffff, 0xfe0001ff,
           0xfe3fffff, 0xff003fff, 0xff800003, 0xff87ff87,
           0xffc00fff, 0xffe0000f, 0xffefffef, 0xfff1fff1,
           0xfff83fff, 0xfffc0fff, 0xfffe0fff, 0xffff3fff,
           0xffffc007, 0xffffe1ff, 0xfffff80f, 0xfffffe07,
           0xffffffbf, 0xfffffffd]

    immediates \
        = [0x1, 0x1f80, 0x3fff0, 0x3ffffc,
           0x3fe0000, 0x1ffc0000, 0xf8000000, 0x3ffffc000,
           0xffffffe00, 0x3ffffff800, 0xffffc00000, 0x3f000000000,
           0x7fffffff800, 0x1fe000001fe0, 0x3ffffff80000, 0xc00000000000,
           0x1ffc000000000, 0x3ffff0003ffff, 0x7ffffffe00000, 0xfffffffffc000,
           0x1ffffffffffc00, 0x3fffffffffff00, 0x7ffffffffffc00, 0xffffffffff8000,
           0x1ffffffff800000, 0x3fffffc03fffffc, 0x7fffc0000000000, 0xff80ff80ff80ff8,
           0x1c00000000000000, 0x1fffffffffff0000, 0x3fffff803fffff80, 0x7fc000007fc00000,
           0x8000000000000000, 0x803fffff803fffff, 0xc000007fc000007f, 0xe00000000000ffff,
           0xe3ffffffffffffff, 0xf007f007f007f007, 0xf80003ffffffffff, 0xfc000003fc000003,
           0xfe000000007fffff, 0xff00000000007fff, 0xff800000000003ff, 0xffc00000000000ff,
           0xffe00000000003ff, 0xfff0000000003fff, 0xfff80000001fffff, 0xfffc0000fffc0000,
           0xfffe003fffffffff, 0xffff3fffffffffff, 0xffffc0000007ffff, 0xffffe01fffffe01f,
           0xfffff800000007ff, 0xfffffc0fffffffff, 0xffffff00003fffff, 0xffffffc0000007ff,
           0xfffffff0000001ff, 0xfffffffc00003fff, 0xffffffff07ffffff, 0xffffffffe003ffff,
           0xfffffffffc01ffff, 0xffffffffffc00003, 0xfffffffffffc000f, 0xffffffffffffe07f]

    def generate(self):
        AddSubImmOp.generate(self)
        self.immed = \
            self.immediates32[random.randint(0, len(self.immediates32)-1)] \
            if self.isWord else \
            self.immediates[random.randint(0, len(self.immediates)-1)]

        return self

    def astr(self):
        return (super(TwoRegImmedInstruction, self).astr()
                + ', #0x%x' % self.immed)

    def cstr(self):
        return super(AddSubImmOp, self).cstr() + "ll);"

class MultiOp():

    def multipleForms(self):
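The immediates32 and immediates tables above enumerate encodable AArch64 "logical immediates": a value qualifies only if it is a whole number of repetitions of a 2/4/8/16/32/64-bit element that is itself a rotation of a contiguous run of ones (all-zeros and all-ones are excluded). A quick Python checker of that rule (illustrative, not part of the generator):

    def is_logical_immediate(imm, regsize=64):
        # True if imm is an encodable AArch64 logical immediate.
        mask = (1 << regsize) - 1
        imm &= mask
        if imm == 0 or imm == mask:
            return False  # all-zeros / all-ones are reserved
        e = regsize
        while e >= 2:
            emask = (1 << e) - 1
            elt = imm & emask
            # the element must repeat across the whole register ...
            if all(((imm >> i) & emask) == elt for i in range(0, regsize, e)):
                # ... and be a rotation of a contiguous run of ones
                for r in range(e):
                    rot = ((elt >> r) | (elt << (e - r))) & emask
                    if rot == (1 << bin(elt).count("1")) - 1:
                        return True
            e >>= 1
        return False

    print(is_logical_immediate(0x9f9f9f9f, 32))  # True  (listed in immediates32)
    print(is_logical_immediate(0x12345678, 32))  # False (not encodable)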
@@ -422,9 +426,9 @@ class AbsOp(MultiOp, Instruction):
        return Instruction.astr(self) + "%s"

class RegAndAbsOp(MultiOp, Instruction):

    def multipleForms(self):
        if self.name() == "adrp":
            # We can only test one form of adrp because anything other
            # than "adrp ." requires relocs in the assembler output
            return 1
@@ -434,11 +438,11 @@ class RegAndAbsOp(MultiOp, Instruction):
        Instruction.generate(self)
        self.reg = GeneralRegister().generate()
        return self

    def cstr(self):
        if self.name() == "adrp":
            return "__ _adrp(" + "%s, %s);" % (self.reg, "%s")
        return (super(RegAndAbsOp, self).cstr()
                + "%s, %s);" % (self.reg, "%s"))

    def astr(self):
@@ -446,14 +450,14 @@ class RegAndAbsOp(MultiOp, Instruction):
                + self.reg.astr(self.asmRegPrefix) + ", %s")

class RegImmAbsOp(RegAndAbsOp):

    def cstr(self):
        return (Instruction.cstr(self)
                + "%s, %s, %s);" % (self.reg, self.immed, "%s"))

    def astr(self):
        return (Instruction.astr(self)
                + ("%s, #%s, %s"
                   % (self.reg.astr(self.asmRegPrefix), self.immed, "%s")))

    def generate(self):
@@ -462,7 +466,7 @@ class RegImmAbsOp(RegAndAbsOp):
        return self

class MoveWideImmOp(RegImmAbsOp):

    def multipleForms(self):
        return 0
@@ -472,8 +476,8 @@ class MoveWideImmOp(RegImmAbsOp):

    def astr(self):
        return (Instruction.astr(self)
                + ("%s, #%s, lsl %s"
                   % (self.reg.astr(self.asmRegPrefix),
                      self.immed, self.shift)))

    def generate(self):
@@ -486,7 +490,7 @@ class MoveWideImmOp(RegImmAbsOp):
        return self

class BitfieldOp(TwoRegInstruction):

    def cstr(self):
        return (Instruction.cstr(self)
                + ("%s, %s, %s, %s);"
@@ -513,16 +517,16 @@ class ExtractOp(ThreeRegInstruction):

    def cstr(self):
        return (ThreeRegInstruction.cstr(self)
                + (", %s);" % self.lsb))

    def astr(self):
        return (ThreeRegInstruction.astr(self)
                + (", #%s" % self.lsb))

class CondBranchOp(MultiOp, Instruction):

    def cstr(self):
        return "__ br(Assembler::" + self.name() + ", %s);"

    def astr(self):
        return "b." + self.name() + "\t%s"
@@ -530,10 +534,10 @@ class ImmOp(Instruction):

    def cstr(self):
        return "%s%s);" % (Instruction.cstr(self), self.immed)

    def astr(self):
        return Instruction.astr(self) + "#" + str(self.immed)

    def generate(self):
        self.immed = random.randint(0, 1<<16 -1)
        return self
@@ -542,6 +546,8 @@ class Op(Instruction):

    def cstr(self):
        return Instruction.cstr(self) + ");"
+    def astr(self):
+        return self.aname();

class SystemOp(Instruction):
@@ -573,11 +579,11 @@ class ConditionalCompareOp(TwoRegImmedInstruction):
        return self

    def cstr(self):
        return (super(ConditionalCompareOp, self).cstr() + ", "
                + "Assembler::" + conditionCodes[self.cond] + ");")

    def astr(self):
        return (super(ConditionalCompareOp, self).astr() +
                ", " + conditionCodes[self.cond])

class ConditionalCompareImmedOp(Instruction):
@@ -596,33 +602,33 @@ class ConditionalCompareImmedOp(Instruction):
                + "Assembler::" + conditionCodes[self.cond] + ");")

    def astr(self):
        return (Instruction.astr(self)
                + self.reg.astr(self.asmRegPrefix)
                + ", #" + str(self.immed)
                + ", #" + str(self.immed2)
                + ", " + conditionCodes[self.cond])

class TwoRegOp(TwoRegInstruction):

    def cstr(self):
        return TwoRegInstruction.cstr(self) + ");"

class ThreeRegOp(ThreeRegInstruction):

    def cstr(self):
        return ThreeRegInstruction.cstr(self) + ");"

class FourRegMulOp(FourRegInstruction):

    def cstr(self):
        return FourRegInstruction.cstr(self) + ");"

    def astr(self):
        isMaddsub = self.name().startswith("madd") | self.name().startswith("msub")
        midPrefix = self.asmRegPrefix if isMaddsub else "w"
        return (Instruction.astr(self)
                + self.reg[0].astr(self.asmRegPrefix)
                + ", " + self.reg[1].astr(midPrefix)
                + ", " + self.reg[2].astr(midPrefix)
                + ", " + self.reg[3].astr(self.asmRegPrefix))
@@ -638,8 +644,8 @@ class ConditionalSelectOp(ThreeRegInstruction):
                + "Assembler::" + conditionCodes[self.cond] + ");")

    def astr(self):
        return (ThreeRegInstruction.astr(self)
                + ", " + conditionCodes[self.cond])

class LoadStoreExclusiveOp(InstructionWithModes):
@@ -651,7 +657,7 @@ class LoadStoreExclusiveOp(InstructionWithModes):
        result = self.aname() + '\t'
        regs = list(self.regs)
        index = regs.pop() # The last reg is the index register
        prefix = ('x' if (self.mode == 'x')
                       & ((self.name().startswith("ld"))
                          | (self.name().startswith("stlr"))) # Ewww :-(
                  else 'w')
@@ -698,17 +704,17 @@ class LoadStoreExclusiveOp(InstructionWithModes):
        return self._name

class Address(object):

    base_plus_unscaled_offset, pre, post, base_plus_reg, \
        base_plus_scaled_offset, pcrel, post_reg, base_only = range(8)
    kinds = ["base_plus_unscaled_offset", "pre", "post", "base_plus_reg",
             "base_plus_scaled_offset", "pcrel", "post_reg", "base_only"]
    extend_kinds = ["uxtw", "lsl", "sxtw", "sxtx"]

    @classmethod
    def kindToStr(cls, i):
        return cls.kinds[i]

    def generate(self, kind, shift_distance):
        self.kind = kind
        self.base = GeneralRegister().generate()
@@ -738,7 +744,7 @@ class Address(object):
            Address.pcrel: "",
            Address.base_plus_reg: "Address(%s, %s, Address::%s(%s))" \
                % (self.base, self.index, self.extend_kind, self.shift_distance),
            Address.base_plus_scaled_offset:
                "Address(%s, %s)" % (self.base, self.offset) } [self.kind]
        if (self.kind == Address.pcrel):
            result = ["__ pc()", "back", "forth"][self.offset]
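For concreteness, the base_plus_reg arm of Address.cstr() above renders its fields into a C++ Address constructor call; a standalone rendering with made-up register names:

    # Illustrative values only: base=r0, index=r1, extend_kind="uxtw", shift=2.
    base, index, extend_kind, shift_distance = "r0", "r1", "uxtw", 2
    print("Address(%s, %s, Address::%s(%s))"
          % (base, index, extend_kind, shift_distance))
    # Address(r0, r1, Address::uxtw(2))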
@@ -758,7 +764,7 @@ class Address(object):
            Address.base_only: "[%s]" % (self.base.astr(prefix)),
            Address.pcrel: "",
            Address.base_plus_reg: "[%s, %s, %s #%s]" \
                % (self.base.astr(prefix), self.index.astr(extend_prefix),
                   self.extend_kind, self.shift_distance),
            Address.base_plus_scaled_offset: \
                "[%s, %s]" \
@@ -767,7 +773,7 @@ class Address(object):
        if (self.kind == Address.pcrel):
            result = [".", "back", "forth"][self.offset]
        return result

class LoadStoreOp(InstructionWithModes):

    def __init__(self, args):
|
||||
class LoadStorePairOp(InstructionWithModes):
|
||||
|
||||
numRegs = 2
|
||||
|
||||
|
||||
def __init__(self, args):
|
||||
name, self.asmname, self.kind, mode = args
|
||||
InstructionWithModes.__init__(self, name, mode)
|
||||
self.offset = random.randint(-1<<4, 1<<4-1) << 4
|
||||
|
||||
|
||||
def generate(self):
|
||||
self.reg = [OperandFactory.create(self.mode).generate()
|
||||
self.reg = [OperandFactory.create(self.mode).generate()
|
||||
for i in range(self.numRegs)]
|
||||
self.base = OperandFactory.create('x').generate()
|
||||
kindStr = Address.kindToStr(self.kind);
|
||||
@@ -846,8 +852,8 @@ class LoadStorePairOp(InstructionWithModes):
        address = ["[%s, #%s]", "[%s, #%s]!", "[%s], #%s"][self.kind]
        address = address % (self.base.astr('x'), self.offset)
        result = "%s\t%s, %s, %s" \
            % (self.asmname,
               self.reg[0].astr(self.asmRegPrefix),
               self.reg[1].astr(self.asmRegPrefix), address)
        return result
@@ -875,7 +881,7 @@ class FloatInstruction(Instruction):
        Instruction.__init__(self, name)

    def generate(self):
        self.reg = [OperandFactory.create(self.modes[i]).generate()
                    for i in range(self.numRegs)]
        return self
@@ -884,7 +890,7 @@ class FloatInstruction(Instruction):
        return (formatStr
                % tuple([Instruction.cstr(self)] +
                        [str(self.reg[i]) for i in range(self.numRegs)])) # Yowza

    def astr(self):
        formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)])
        return (formatStr
@@ -985,7 +991,7 @@ class SVEReductionOp(Instruction):
                moreReg +
                [str(self.reg[2]) + self._width.astr()])

-class LdStSIMDOp(Instruction):
+class LdStNEONOp(Instruction):
    def __init__(self, args):
        self._name, self.regnum, self.arrangement, self.addresskind = args
@@ -1004,7 +1010,7 @@ class LdStNEONOp(Instruction):
        return self

    def cstr(self):
-       buf = super(LdStSIMDOp, self).cstr() + str(self._firstSIMDreg)
+       buf = super(LdStNEONOp, self).cstr() + str(self._firstSIMDreg)
        current = self._firstSIMDreg
        for cnt in range(1, self.regnum):
            buf = '%s, %s' % (buf, current.nextReg())
@@ -1022,6 +1028,57 @@ class LdStNEONOp(Instruction):

    def aname(self):
        return self._name

+class NEONReduceInstruction(Instruction):
+    def __init__(self, args):
+        self._name, self.insname, self.arrangement = args
+
+    def generate(self):
+        current = FloatRegister().generate()
+        self.dstSIMDreg = current
+        self.srcSIMDreg = current.nextReg()
+        return self
+
+    def cstr(self):
+        buf = Instruction.cstr(self) + str(self.dstSIMDreg)
+        buf = '%s, __ T%s, %s);' % (buf, self.arrangement, self.srcSIMDreg)
+        return buf
+
+    def astr(self):
+        buf = '%s\t%s' % (self.insname, self.dstSIMDreg.astr(self.arrangement[-1].lower()))
+        buf = '%s, %s.%s' % (buf, self.srcSIMDreg, self.arrangement)
+        return buf
+
+    def aname(self):
+        return self._name
+
+class CommonNEONInstruction(Instruction):
+    def __init__(self, args):
+        self._name, self.insname, self.arrangement = args
+
+    def generate(self):
+        self._firstSIMDreg = FloatRegister().generate()
+        return self
+
+    def cstr(self):
+        buf = Instruction.cstr(self) + str(self._firstSIMDreg)
+        buf = '%s, __ T%s' % (buf, self.arrangement)
+        current = self._firstSIMDreg
+        for cnt in range(1, self.numRegs):
+            buf = '%s, %s' % (buf, current.nextReg())
+            current = current.nextReg()
+        return '%s);' % (buf)
+
+    def astr(self):
+        buf = '%s\t%s.%s' % (self.insname, self._firstSIMDreg, self.arrangement)
+        current = self._firstSIMDreg
+        for cnt in range(1, self.numRegs):
+            buf = '%s, %s.%s' % (buf, current.nextReg(), self.arrangement)
+            current = current.nextReg()
+        return buf
+
+    def aname(self):
+        return self._name

class SHA512SIMDOp(Instruction):

    def generate(self):
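To make the new NEONReduceInstruction emitters concrete: for the entry ["addv", "addv", "8B"], assuming the generator picks v0/v1 as destination and source and FloatRegister.astr(prefix) returns the prefix plus the register number, cstr() and astr() produce strings of this shape (a standalone sketch, not the real harness):

    name, insname, arrangement = "addv", "addv", "8B"
    # cstr(): the C++ MacroAssembler call compiled into the test.
    print('__ %s(v0, __ T%s, v1);' % (name, arrangement))
    # astr(): the matching GNU assembler line; the scalar result register
    # uses the element-size prefix ('b' for an 8B reduction).
    print('%s\tb0, v1.%s' % (insname, arrangement))
    # __ addv(v0, __ T8B, v1);
    # addv	b0, v1.8B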
@@ -1097,6 +1154,12 @@ class FloatConvertOp(TwoRegFloatOp):
    def cname(self):
        return self._cname

+class TwoRegNEONOp(CommonNEONInstruction):
+    numRegs = 2
+
+class ThreeRegNEONOp(TwoRegNEONOp):
+    numRegs = 3
+
class SpecialCases(Instruction):
    def __init__(self, data):
        self._name = data[0]
@@ -1129,6 +1192,7 @@ def generate(kind, names):

outfile = open("aarch64ops.s", "w")

+# To minimize the changes of assembler test code
random.seed(0)

print "// BEGIN Generated code -- do not edit"
@@ -1139,18 +1203,18 @@ print " __ bind(back);"

outfile.write("back:\n")

generate (ArithOp,
          [ "add", "sub", "adds", "subs",
            "addw", "subw", "addsw", "subsw",
            "and", "orr", "eor", "ands",
            "andw", "orrw", "eorw", "andsw",
            "bic", "orn", "eon", "bics",
            "bicw", "ornw", "eonw", "bicsw" ])

generate (AddSubImmOp,
          [ "addw", "addsw", "subw", "subsw",
            "add", "adds", "sub", "subs"])
generate (LogicalImmOp,
          [ "andw", "orrw", "eorw", "andsw",
            "and", "orr", "eor", "ands"])
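Each generate() call above expands into pairs of outputs, one C++ MacroAssembler call and one assembler line. The rough shape for a shifted-register ArithOp, with register numbers and shift distance made up for illustration (the real values come from the seeded RNG above):

    name, regs, kind, distance = "add", ("r0", "r1", "r2"), "LSL", 3
    print("__ %s(%s, %s, %s, Assembler::%s, %s);"
          % ((name,) + regs + (kind, distance)))    # emitted into the C++ test
    print("%s\tx0, x1, x2, %s #%s" % (name, kind.lower(), distance))  # emitted into aarch64ops.s
    # __ add(r0, r1, r2, Assembler::LSL, 3);
    # add	x0, x1, x2, lsl #3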
@@ -1191,26 +1255,26 @@ for mode in 'xw':
                              ["stxp", mode, 4], ["stlxp", mode, 4]])

for kind in range(6):
-   print "\n// " + Address.kindToStr(kind),
+   sys.stdout.write("\n// " + Address.kindToStr(kind))
    if kind != Address.pcrel:
        generate (LoadStoreOp,
                  [["str", "str", kind, "x"], ["str", "str", kind, "w"],
                   ["str", "strb", kind, "b"], ["str", "strh", kind, "h"],
                   ["ldr", "ldr", kind, "x"], ["ldr", "ldr", kind, "w"],
                   ["ldr", "ldrb", kind, "b"], ["ldr", "ldrh", kind, "h"],
                   ["ldrsb", "ldrsb", kind, "x"], ["ldrsh", "ldrsh", kind, "x"],
                   ["ldrsh", "ldrsh", kind, "w"], ["ldrsw", "ldrsw", kind, "x"],
                   ["ldr", "ldr", kind, "d"], ["ldr", "ldr", kind, "s"],
                   ["str", "str", kind, "d"], ["str", "str", kind, "s"],
                   ])
    else:
        generate (LoadStoreOp,
                  [["ldr", "ldr", kind, "x"], ["ldr", "ldr", kind, "w"]])


for kind in (Address.base_plus_unscaled_offset, Address.pcrel, Address.base_plus_reg, \
             Address.base_plus_scaled_offset):
    generate (LoadStoreOp,
              [["prfm", "prfm\tPLDL1KEEP,", kind, "x"]])

generate(AddSubCarryOp, ["adcw", "adcsw", "sbcw", "sbcsw", "adc", "adcs", "sbc", "sbcs"])
@@ -1219,32 +1283,32 @@ generate(AddSubExtendedOp, ["addw", "addsw", "sub", "subsw", "add", "adds", "sub

generate(ConditionalCompareOp, ["ccmnw", "ccmpw", "ccmn", "ccmp"])
generate(ConditionalCompareImmedOp, ["ccmnw", "ccmpw", "ccmn", "ccmp"])
generate(ConditionalSelectOp,
         ["cselw", "csincw", "csinvw", "csnegw", "csel", "csinc", "csinv", "csneg"])

generate(TwoRegOp,
         ["rbitw", "rev16w", "revw", "clzw", "clsw", "rbit",
          "rev16", "rev32", "rev", "clz", "cls"])
generate(ThreeRegOp,
         ["udivw", "sdivw", "lslvw", "lsrvw", "asrvw", "rorvw", "udiv", "sdiv",
          "lslv", "lsrv", "asrv", "rorv", "umulh", "smulh"])
generate(FourRegMulOp,
         ["maddw", "msubw", "madd", "msub", "smaddl", "smsubl", "umaddl", "umsubl"])

generate(ThreeRegFloatOp,
         [["fmuls", "sss"], ["fdivs", "sss"], ["fadds", "sss"], ["fsubs", "sss"],
          ["fmuls", "sss"],
          ["fmuld", "ddd"], ["fdivd", "ddd"], ["faddd", "ddd"], ["fsubd", "ddd"],
          ["fmuld", "ddd"]])

generate(FourRegFloatOp,
         [["fmadds", "ssss"], ["fmsubs", "ssss"], ["fnmadds", "ssss"], ["fnmadds", "ssss"],
          ["fmaddd", "dddd"], ["fmsubd", "dddd"], ["fnmaddd", "dddd"], ["fnmaddd", "dddd"],])

generate(TwoRegFloatOp,
         [["fmovs", "ss"], ["fabss", "ss"], ["fnegs", "ss"], ["fsqrts", "ss"],
          ["fcvts", "ds"],
          ["fmovd", "dd"], ["fabsd", "dd"], ["fnegd", "dd"], ["fsqrtd", "dd"],
          ["fcvtd", "sd"],
          ])
@@ -1255,18 +1319,18 @@ generate(FloatConvertOp, [["fcvtzsw", "fcvtzs", "ws"], ["fcvtzs", "fcvtzs", "xs"
                          ["fmovs", "fmov", "ws"], ["fmovd", "fmov", "xd"],
                          ["fmovs", "fmov", "sw"], ["fmovd", "fmov", "dx"]])

generate(TwoRegFloatOp, [["fcmps", "ss"], ["fcmpd", "dd"],
                         ["fcmps", "sz"], ["fcmpd", "dz"]])

for kind in range(3):
    generate(LoadStorePairOp, [["stp", "stp", kind, "w"], ["ldp", "ldp", kind, "w"],
                               ["ldpsw", "ldpsw", kind, "x"],
                               ["stp", "stp", kind, "x"], ["ldp", "ldp", kind, "x"]
                               ])
generate(LoadStorePairOp, [["stnp", "stnp", 0, "w"], ["ldnp", "ldnp", 0, "w"],
                           ["stnp", "stnp", 0, "x"], ["ldnp", "ldnp", 0, "x"]])

-generate(LdStSIMDOp, [["ld1", 1, "8B", Address.base_only],
+generate(LdStNEONOp, [["ld1", 1, "8B", Address.base_only],
                      ["ld1", 2, "16B", Address.post],
                      ["ld1", 3, "1D", Address.post_reg],
                      ["ld1", 4, "8H", Address.post],
@@ -1290,6 +1354,93 @@ generate(LdStSIMDOp, [["ld1", 1, "8B", Address.base_only],
                      ["ld4r", 4, "2S", Address.post_reg],
                      ])

+generate(NEONReduceInstruction,
+         [["addv", "addv", "8B"], ["addv", "addv", "16B"],
+          ["addv", "addv", "4H"], ["addv", "addv", "8H"],
+          ["addv", "addv", "4S"],
+          ["smaxv", "smaxv", "8B"], ["smaxv", "smaxv", "16B"],
+          ["smaxv", "smaxv", "4H"], ["smaxv", "smaxv", "8H"],
+          ["smaxv", "smaxv", "4S"], ["fmaxv", "fmaxv", "4S"],
+          ["sminv", "sminv", "8B"], ["sminv", "sminv", "16B"],
+          ["sminv", "sminv", "4H"], ["sminv", "sminv", "8H"],
+          ["sminv", "sminv", "4S"], ["fminv", "fminv", "4S"],
+          ])
+
+generate(TwoRegNEONOp,
+         [["absr", "abs", "8B"], ["absr", "abs", "16B"],
+          ["absr", "abs", "4H"], ["absr", "abs", "8H"],
+          ["absr", "abs", "2S"], ["absr", "abs", "4S"],
+          ["absr", "abs", "2D"],
+          ["fabs", "fabs", "2S"], ["fabs", "fabs", "4S"],
+          ["fabs", "fabs", "2D"],
+          ["fneg", "fneg", "2S"], ["fneg", "fneg", "4S"],
+          ["fneg", "fneg", "2D"],
+          ["fsqrt", "fsqrt", "2S"], ["fsqrt", "fsqrt", "4S"],
+          ["fsqrt", "fsqrt", "2D"],
+          ["notr", "not", "8B"], ["notr", "not", "16B"],
+          ])
+
+generate(ThreeRegNEONOp,
+         [["andr", "and", "8B"], ["andr", "and", "16B"],
+          ["orr", "orr", "8B"], ["orr", "orr", "16B"],
+          ["eor", "eor", "8B"], ["eor", "eor", "16B"],
+          ["addv", "add", "8B"], ["addv", "add", "16B"],
+          ["addv", "add", "4H"], ["addv", "add", "8H"],
+          ["addv", "add", "2S"], ["addv", "add", "4S"],
+          ["addv", "add", "2D"],
+          ["fadd", "fadd", "2S"], ["fadd", "fadd", "4S"],
+          ["fadd", "fadd", "2D"],
+          ["subv", "sub", "8B"], ["subv", "sub", "16B"],
+          ["subv", "sub", "4H"], ["subv", "sub", "8H"],
+          ["subv", "sub", "2S"], ["subv", "sub", "4S"],
+          ["subv", "sub", "2D"],
+          ["fsub", "fsub", "2S"], ["fsub", "fsub", "4S"],
+          ["fsub", "fsub", "2D"],
+          ["mulv", "mul", "8B"], ["mulv", "mul", "16B"],
+          ["mulv", "mul", "4H"], ["mulv", "mul", "8H"],
+          ["mulv", "mul", "2S"], ["mulv", "mul", "4S"],
+          ["fmul", "fmul", "2S"], ["fmul", "fmul", "4S"],
+          ["fmul", "fmul", "2D"],
+          ["mlav", "mla", "4H"], ["mlav", "mla", "8H"],
+          ["mlav", "mla", "2S"], ["mlav", "mla", "4S"],
+          ["fmla", "fmla", "2S"], ["fmla", "fmla", "4S"],
+          ["fmla", "fmla", "2D"],
+          ["mlsv", "mls", "4H"], ["mlsv", "mls", "8H"],
+          ["mlsv", "mls", "2S"], ["mlsv", "mls", "4S"],
+          ["fmls", "fmls", "2S"], ["fmls", "fmls", "4S"],
+          ["fmls", "fmls", "2D"],
+          ["fdiv", "fdiv", "2S"], ["fdiv", "fdiv", "4S"],
+          ["fdiv", "fdiv", "2D"],
+          ["maxv", "smax", "8B"], ["maxv", "smax", "16B"],
+          ["maxv", "smax", "4H"], ["maxv", "smax", "8H"],
+          ["maxv", "smax", "2S"], ["maxv", "smax", "4S"],
+          ["fmax", "fmax", "2S"], ["fmax", "fmax", "4S"],
+          ["fmax", "fmax", "2D"],
+          ["minv", "smin", "8B"], ["minv", "smin", "16B"],
+          ["minv", "smin", "4H"], ["minv", "smin", "8H"],
+          ["minv", "smin", "2S"], ["minv", "smin", "4S"],
+          ["fmin", "fmin", "2S"], ["fmin", "fmin", "4S"],
+          ["fmin", "fmin", "2D"],
+          ["cmeq", "cmeq", "8B"], ["cmeq", "cmeq", "16B"],
+          ["cmeq", "cmeq", "4H"], ["cmeq", "cmeq", "8H"],
+          ["cmeq", "cmeq", "2S"], ["cmeq", "cmeq", "4S"],
+          ["cmeq", "cmeq", "2D"],
+          ["fcmeq", "fcmeq", "2S"], ["fcmeq", "fcmeq", "4S"],
+          ["fcmeq", "fcmeq", "2D"],
+          ["cmgt", "cmgt", "8B"], ["cmgt", "cmgt", "16B"],
+          ["cmgt", "cmgt", "4H"], ["cmgt", "cmgt", "8H"],
+          ["cmgt", "cmgt", "2S"], ["cmgt", "cmgt", "4S"],
+          ["cmgt", "cmgt", "2D"],
+          ["fcmgt", "fcmgt", "2S"], ["fcmgt", "fcmgt", "4S"],
+          ["fcmgt", "fcmgt", "2D"],
+          ["cmge", "cmge", "8B"], ["cmge", "cmge", "16B"],
+          ["cmge", "cmge", "4H"], ["cmge", "cmge", "8H"],
+          ["cmge", "cmge", "2S"], ["cmge", "cmge", "4S"],
+          ["cmge", "cmge", "2D"],
+          ["fcmge", "fcmge", "2S"], ["fcmge", "fcmge", "4S"],
+          ["fcmge", "fcmge", "2D"],
+          ])

generate(SHA512SIMDOp, ["sha512h", "sha512h2", "sha512su0", "sha512su1"])

generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);", "ccmn\txzr, xzr, #3, LE"],
@@ -1344,9 +1495,9 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
                        ])

print "\n// FloatImmediateOp"
for float in ("2.0", "2.125", "4.0", "4.25", "8.0", "8.5", "16.0", "17.0", "0.125",
              "0.1328125", "0.25", "0.265625", "0.5", "0.53125", "1.0", "1.0625",
              "-2.0", "-2.125", "-4.0", "-4.25", "-8.0", "-8.5", "-16.0", "-17.0",
              "-0.125", "-0.1328125", "-0.25", "-0.265625", "-0.5", "-0.53125", "-1.0", "-1.0625"):
    astr = "fmov d0, #" + float
    cstr = "__ fmovd(v0, " + float + ");"
@@ -1414,16 +1565,11 @@ outfile.write("forth:\n")

outfile.close()

-import subprocess
-import sys
-
# compile for sve with 8.1 and sha2 because of lse atomics and sha512 crypto extension.
subprocess.check_call([AARCH64_AS, "-march=armv8.1-a+sha2+sve", "aarch64ops.s", "-o", "aarch64ops.o"])

print
-print "/*"
+print "/*",
+sys.stdout.flush()
subprocess.check_call([AARCH64_OBJDUMP, "-d", "aarch64ops.o"])
print "*/"

subprocess.check_call([AARCH64_OBJCOPY, "-O", "binary", "-j", ".text", "aarch64ops.o", "aarch64ops.bin"])
@@ -1444,4 +1590,7 @@ while i < len(bytes):
print "\n };"
print "// END Generated code -- do not edit"

infile.close()

+for f in ["aarch64ops.s", "aarch64ops.o", "aarch64ops.bin"]:
+    os.remove(f)
@@ -2410,6 +2410,12 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
      break;
    case Op_MulVL:
      return false;
+   case Op_VectorLoadShuffle:
+   case Op_VectorRearrange:
+     if (vlen < 4) {
+       return false;
+     }
+     break;
    default:
      break;
  }
@@ -2421,6 +2427,10 @@ const bool Matcher::has_predicated_vectors(void) {
  return UseSVE > 0;
}

+bool Matcher::supports_vector_variable_shifts(void) {
+  return true;
+}
+
const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
}
@@ -2466,11 +2476,18 @@ const int Matcher::min_vector_size(const BasicType bt) {
  if ((UseSVE > 0) && (MaxVectorSize >= 16)) {
    // Currently vector length less than SVE vector register size is not supported.
    return max_size;
- } else {
-   // For the moment limit the vector size to 8 bytes with NEON.
+ } else { // NEON
+   // Limit the vector size to 8 bytes
    int size = 8 / type2aelembytes(bt);
+   if (bt == T_BYTE) {
+     // To support vector api shuffle/rearrange.
+     size = 4;
+   } else if (bt == T_BOOLEAN) {
+     // To support vector api load/store mask.
+     size = 2;
+   }
    if (size < 2) size = 2;
-   return size;
+   return MIN2(size,max_size);
  }
}
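A quick model of what the NEON branch above now returns, with element sizes hard-coded the way type2aelembytes would report them (a sketch, not HotSpot code):

    # Minimal model of the NEON arm of Matcher::min_vector_size().
    def min_vector_size_neon(bt, elem_bytes, max_size):
        size = 8 // elem_bytes
        if bt == "T_BYTE":
            size = 4          # vector api shuffle/rearrange needs at least 4 lanes
        elif bt == "T_BOOLEAN":
            size = 2          # vector api load/store mask needs at least 2 lanes
        size = max(size, 2)
        return min(size, max_size)

    print(min_vector_size_neon("T_BYTE", 1, 16))  # 4 (was 8 before this change)
    print(min_vector_size_neon("T_INT", 4, 16))   # 2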
@@ -2489,6 +2506,9 @@ const uint Matcher::vector_ideal_reg(int len) {
    return Op_VecA;
  }
  switch(len) {
+   // For 16-bit/32-bit mask vector, reuse VecD.
+   case 2:
+   case 4:
    case 8: return Op_VecD;
    case 16: return Op_VecX;
  }
@@ -3131,6 +3151,12 @@ encode %{
  // END Non-volatile memory access

  // Vector loads and stores
+ enc_class aarch64_enc_ldrvH(vecD dst, memory mem) %{
+   FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+   loadStore(C2_MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::H,
+      $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+ %}
+
  enc_class aarch64_enc_ldrvS(vecD dst, memory mem) %{
    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
    loadStore(C2_MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::S,
@@ -3149,6 +3175,12 @@ encode %{
      $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}

+ enc_class aarch64_enc_strvH(vecD src, memory mem) %{
+   FloatRegister src_reg = as_FloatRegister($src$$reg);
+   loadStore(C2_MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::H,
+      $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+ %}
+
  enc_class aarch64_enc_strvS(vecD src, memory mem) %{
    FloatRegister src_reg = as_FloatRegister($src$$reg);
    loadStore(C2_MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::S,
@@ -4252,6 +4284,26 @@ operand immI_31()
  interface(CONST_INTER);
%}

+operand immI_2()
+%{
+  predicate(n->get_int() == 2);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immI_4()
+%{
+  predicate(n->get_int() == 4);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
operand immI_8()
%{
  predicate(n->get_int() == 8);
@@ -11222,6 +11274,7 @@ instruct rShiftL_reg_imm(iRegLNoSp dst, iRegL src1, immI src2) %{
%}

// BEGIN This section of the file is automatically generated. Do not edit --------------
+// This section is generated from aarch64_ad.m4

// This pattern is automatically generated from aarch64_ad.m4
@@ -16848,6 +16901,7 @@ instruct replicate2D(vecX dst, vRegD src)

instruct reduce_add2I(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, iRegINoSp tmp, iRegINoSp tmp2)
%{
+  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI isrc vsrc));
  ins_cost(INSN_COST);
  effect(TEMP tmp, TEMP tmp2);
@@ -16867,6 +16921,7 @@ instruct reduce_add2I(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, iRegINoSp tmp,

instruct reduce_add4I(iRegINoSp dst, iRegIorL2I isrc, vecX vsrc, vecX vtmp, iRegINoSp itmp)
%{
+  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI isrc vsrc));
  ins_cost(INSN_COST);
  effect(TEMP vtmp, TEMP itmp);
@@ -16885,6 +16940,7 @@ instruct reduce_add4I(iRegINoSp dst, iRegIorL2I isrc, vecX vsrc, vecX vtmp, iReg

instruct reduce_mul2I(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, iRegINoSp tmp)
%{
+  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MulReductionVI isrc vsrc));
  ins_cost(INSN_COST);
  effect(TEMP tmp, TEMP dst);
@@ -16904,6 +16960,7 @@ instruct reduce_mul2I(iRegINoSp dst, iRegIorL2I isrc, vecD vsrc, iRegINoSp tmp)

instruct reduce_mul4I(iRegINoSp dst, iRegIorL2I isrc, vecX vsrc, vecX vtmp, iRegINoSp itmp)
%{
+  predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MulReductionVI isrc vsrc));
  ins_cost(INSN_COST);
  effect(TEMP vtmp, TEMP itmp, TEMP dst);
@@ -17985,8 +18042,7 @@ instruct vabs2F(vecD dst, vecD src)
  ins_cost(INSN_COST * 3);
  format %{ "fabs $dst,$src\t# vector (2S)" %}
  ins_encode %{
-   __ fabs(as_FloatRegister($dst$$reg), __ T2S,
-           as_FloatRegister($src$$reg));
+   __ fabs(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg));
  %}
  ins_pipe(vunop_fp64);
%}
@@ -17998,8 +18054,7 @@ instruct vabs4F(vecX dst, vecX src)
  ins_cost(INSN_COST * 3);
  format %{ "fabs $dst,$src\t# vector (4S)" %}
  ins_encode %{
-   __ fabs(as_FloatRegister($dst$$reg), __ T4S,
-           as_FloatRegister($src$$reg));
+   __ fabs(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg));
  %}
  ins_pipe(vunop_fp128);
%}
@@ -18011,8 +18066,7 @@ instruct vabs2D(vecX dst, vecX src)
  ins_cost(INSN_COST * 3);
  format %{ "fabs $dst,$src\t# vector (2D)" %}
  ins_encode %{
-   __ fabs(as_FloatRegister($dst$$reg), __ T2D,
-           as_FloatRegister($src$$reg));
+   __ fabs(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg));
  %}
  ins_pipe(vunop_fp128);
%}
@@ -18153,7 +18207,8 @@ instruct vxor16B(vecX dst, vecX src1, vecX src2)

// ------------------------------ Shift ---------------------------------------
instruct vshiftcnt8B(vecD dst, iRegIorL2I cnt) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (LShiftCntV cnt));
  match(Set dst (RShiftCntV cnt));
  format %{ "dup $dst, $cnt\t# shift count vector (8B)" %}
@@ -18977,12 +19032,12 @@ instruct vpopcount4I(vecX dst, vecX src) %{
            "uaddlp $dst, $dst\t# vector (8H)"
  %}
  ins_encode %{
    __ cnt(as_FloatRegister($dst$$reg), __ T16B,
           as_FloatRegister($src$$reg));
    __ uaddlp(as_FloatRegister($dst$$reg), __ T16B,
              as_FloatRegister($dst$$reg));
    __ uaddlp(as_FloatRegister($dst$$reg), __ T8H,
              as_FloatRegister($dst$$reg));
  %}
  ins_pipe(pipe_class_default);
%}
@@ -18996,12 +19051,12 @@ instruct vpopcount2I(vecD dst, vecD src) %{
            "uaddlp $dst, $dst\t# vector (4H)"
  %}
  ins_encode %{
    __ cnt(as_FloatRegister($dst$$reg), __ T8B,
           as_FloatRegister($src$$reg));
    __ uaddlp(as_FloatRegister($dst$$reg), __ T8B,
              as_FloatRegister($dst$$reg));
    __ uaddlp(as_FloatRegister($dst$$reg), __ T4H,
              as_FloatRegister($dst$$reg));
  %}
  ins_pipe(pipe_class_default);
%}
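The three-instruction sequence above computes a per-lane population count without a dedicated vector popcount instruction: cnt produces per-byte bit counts, and each uaddlp pairwise-adds adjacent lanes into the next wider element size. The same arithmetic in plain Python:

    # Model of cnt + uaddlp + uaddlp for 32-bit lanes.
    def popcount_lanes(words):
        byts = [b for w in words for b in w.to_bytes(4, "little")]
        cnt = [bin(b).count("1") for b in byts]                    # cnt: per byte
        hw  = [cnt[i] + cnt[i+1] for i in range(0, len(cnt), 2)]   # uaddlp: bytes -> halfwords
        return [hw[i] + hw[i+1] for i in range(0, len(hw), 2)]     # uaddlp: halfwords -> words

    print(popcount_lanes([0xDEADBEEF, 0x0F0F0F0F]))  # [24, 16]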
@@ -1,4 +1,4 @@
-dnl Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
+dnl Copyright (c) 2019, 2020, Red Hat Inc. All rights reserved.
dnl DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
dnl
dnl This code is free software; you can redistribute it and/or modify it

@@ -19,10 +19,14 @@ dnl Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
dnl or visit www.oracle.com if you need additional information or have any
dnl questions.
dnl
dnl
-dnl Process this file with m4 aarch64_ad.m4 to generate the arithmetic
-dnl and shift patterns patterns used in aarch64.ad.
-dnl
+dnl Process this file with m4 aarch64_ad.m4 to generate instructions used in
+dnl aarch64.ad:
+dnl 1. the arithmetic
+dnl 2. shift patterns
+dnl
+// BEGIN This section of the file is automatically generated. Do not edit --------------
+// This section is generated from aarch64_ad.m4
+dnl
define(`ORL2I', `ifelse($1,I,orL2I)')
dnl
src/hotspot/cpu/aarch64/aarch64_neon.ad    | 3456 lines | new file (diff suppressed because it is too large)
src/hotspot/cpu/aarch64/aarch64_neon_ad.m4 | 1424 lines | new file (diff suppressed because it is too large)
(one further file diff suppressed because it is too large)
@@ -1371,6 +1371,21 @@ public:

#undef INSN

+#define INSN(NAME, size, opc)                                           \
+  void NAME(FloatRegister Rt, Register Rn) {                            \
+    starti;                                                             \
+    f(size, 31, 30), f(0b111100, 29, 24), f(opc, 23, 22), f(0, 21);     \
+    f(0, 20, 12), f(0b01, 11, 10);                                      \
+    rf(Rn, 5), rf((Register)Rt, 0);                                     \
+  }
+
+  INSN(ldrs, 0b10, 0b01);
+  INSN(ldrd, 0b11, 0b01);
+  INSN(ldrq, 0b00, 0b11);
+
+#undef INSN
+
#define INSN(NAME, opc, V)                                              \
  void NAME(address dest, prfop op = PLDL1KEEP) {                       \
    int64_t offset = (dest - pc()) >> 2;                                \
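A quick way to sanity-check a field layout like the new ldrs/ldrd/ldrq one above is to fold the f() calls into plain shifts. A hypothetical Python helper mirroring those calls for ldrq, assuming rf(R, lsb) places the 5-bit register number at bit lsb (here Rt = q0, Rn = x1):

    def encode_ldrq(rt, rn):
        word = 0
        def f(val, msb, lsb):
            # pack val into bits msb..lsb, as the assembler's f() does
            nonlocal word
            word |= (val & ((1 << (msb - lsb + 1)) - 1)) << lsb
        f(0b00, 31, 30); f(0b111100, 29, 24); f(0b11, 23, 22); f(0, 21)
        f(0, 20, 12); f(0b01, 11, 10)
        f(rn, 9, 5); f(rt, 4, 0)
        return word

    print(hex(encode_ldrq(0, 1)))  # 0x3cc00420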
@@ -1508,6 +1523,21 @@ public:

#undef INSN

+  /* SIMD extensions
+   *
+   * We just use FloatRegister in the following. They are exactly the same
+   * as SIMD registers.
+   */
+public:
+
+  enum SIMD_Arrangement {
+    T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q
+  };
+
+  enum SIMD_RegVariant {
+    B, H, S, D, Q
+  };
+
  enum shift_kind { LSL, LSR, ASR, ROR };

  void op_shifted_reg(unsigned decode,
@@ -1887,6 +1917,30 @@ public:
    i_fmovs(Vd, Vn);
  }

+private:
+  void _fcvt_narrow_extend(FloatRegister Vd, SIMD_Arrangement Ta,
+                           FloatRegister Vn, SIMD_Arrangement Tb, bool do_extend) {
+    assert((do_extend && (Tb >> 1) + 1 == (Ta >> 1))
+           || (!do_extend && (Ta >> 1) + 1 == (Tb >> 1)), "Incompatible arrangement");
+    starti;
+    int op30 = (do_extend ? Tb : Ta) & 1;
+    int op22 = ((do_extend ? Ta : Tb) >> 1) & 1;
+    f(0, 31), f(op30, 30), f(0b0011100, 29, 23), f(op22, 22);
+    f(0b100001011, 21, 13), f(do_extend ? 1 : 0, 12), f(0b10, 11, 10);
+    rf(Vn, 5), rf(Vd, 0);
+  }
+
+public:
+  void fcvtl(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb) {
+    assert(Tb == T4H || Tb == T8H || Tb == T2S || Tb == T4S, "invalid arrangement");
+    _fcvt_narrow_extend(Vd, Ta, Vn, Tb, true);
+  }
+
+  void fcvtn(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb) {
+    assert(Ta == T4H || Ta == T8H || Ta == T2S || Ta == T4S, "invalid arrangement");
+    _fcvt_narrow_extend(Vd, Ta, Vn, Tb, false);
+  }
+
#undef INSN

// Floating-point data-processing (2 source)
@@ -2023,6 +2077,43 @@ public:

#undef INSN

+  enum sign_kind { SIGNED, UNSIGNED };
+
+private:
+  void _xcvtf_scalar_integer(sign_kind sign, unsigned sz,
+                             FloatRegister Rd, FloatRegister Rn) {
+    starti;
+    f(0b01, 31, 30), f(sign == SIGNED ? 0 : 1, 29);
+    f(0b111100, 28, 23), f((sz >> 1) & 1, 22), f(0b100001110110, 21, 10);
+    rf(Rn, 5), rf(Rd, 0);
+  }
+
+public:
+#define INSN(NAME, sign, sz)                        \
+  void NAME(FloatRegister Rd, FloatRegister Rn) {   \
+    _xcvtf_scalar_integer(sign, sz, Rd, Rn);        \
+  }
+
+  INSN(scvtfs, SIGNED, 0);
+  INSN(scvtfd, SIGNED, 1);
+
+#undef INSN
+
+private:
+  void _xcvtf_vector_integer(sign_kind sign, SIMD_Arrangement T,
+                             FloatRegister Rd, FloatRegister Rn) {
+    assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
+    starti;
+    f(0, 31), f(T & 1, 30), f(sign == SIGNED ? 0 : 1, 29);
+    f(0b011100, 28, 23), f((T >> 1) & 1, 22), f(0b100001110110, 21, 10);
+    rf(Rn, 5), rf(Rd, 0);
+  }
+
+public:
+  void scvtfv(SIMD_Arrangement T, FloatRegister Rd, FloatRegister Rn) {
+    _xcvtf_vector_integer(SIGNED, T, Rd, Rn);
+  }
+
// Floating-point compare
void float_compare(unsigned op31, unsigned type,
                   unsigned op, unsigned op2,
@@ -2152,21 +2243,6 @@ public:
  INSN(frintzd, 0b01, 0b011);
#undef INSN

-  /* SIMD extensions
-   *
-   * We just use FloatRegister in the following. They are exactly the same
-   * as SIMD registers.
-   */
-public:
-
-  enum SIMD_Arrangement {
-    T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D, T1Q
-  };
-
-  enum SIMD_RegVariant {
-    B, H, S, D, Q
-  };
-
private:
  static short SIMD_Size_in_bytes[];
@@ -2324,6 +2400,11 @@ public:
  INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
  INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
  INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
+  INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
+  INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
+  INSN(cmeq, 1, 0b100011, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
+  INSN(cmgt, 0, 0b001101, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
+  INSN(cmge, 0, 0b001111, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D

#undef INSN
@@ -2343,6 +2424,8 @@ public:
  INSN(negr, 1, 0b100000101110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
  INSN(notr, 1, 0b100000010110, 0); // accepted arrangements: T8B, T16B
  INSN(addv, 0, 0b110001101110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
+  INSN(smaxv, 0, 0b110000101010, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
+  INSN(sminv, 0, 0b110001101010, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
  INSN(cls, 0, 0b100000010010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
  INSN(clz, 1, 0b100000010010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
  INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B
@@ -2407,6 +2490,9 @@ public:
  INSN(fmls, 0, 1, 0b110011);
  INSN(fmax, 0, 0, 0b111101);
  INSN(fmin, 0, 1, 0b111101);
+  INSN(fcmeq, 0, 0, 0b111001);
+  INSN(fcmgt, 1, 1, 0b111001);
+  INSN(fcmge, 1, 0, 0b111001);

#undef INSN
@@ -2506,10 +2592,20 @@ public:
    rf(Vn, 5), rf(Vd, 0);
  }

- // (double) {a, b} -> (a + b)
- void faddpd(FloatRegister Vd, FloatRegister Vn) {
+ // (long) {a, b} -> (a + b)
+ void addpd(FloatRegister Vd, FloatRegister Vn) {
    starti;
-   f(0b0111111001110000110110, 31, 10);
+   f(0b0101111011110001101110, 31, 10);
    rf(Vn, 5), rf(Vd, 0);
  }

+ // (Floating-point) {a, b} -> (a + b)
+ void faddp(FloatRegister Vd, FloatRegister Vn, SIMD_RegVariant type) {
+   assert(type == D || type == S, "Wrong type for faddp");
+   starti;
+   f(0b011111100, 31, 23);
+   f(type == D ? 1 : 0, 22);
+   f(0b110000110110, 21, 10);
+   rf(Vn, 5), rf(Vd, 0);
+ }
+
@@ -2576,29 +2672,48 @@ public:
#undef INSN

private:
- void _ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ void _xshll(sign_kind sign, FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
    starti;
    /* The encodings for the immh:immb fields (bits 22:16) are
-    * 0001 xxx 8H, 8B/16b shift = xxx
+    * 0001 xxx 8H, 8B/16B shift = xxx
     * 001x xxx 4S, 4H/8H shift = xxxx
     * 01xx xxx 2D, 2S/4S shift = xxxxx
     * 1xxx xxx RESERVED
     */
    assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
    assert((1 << ((Tb>>1)+3)) > shift, "Invalid shift value");
-   f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
+   f(0, 31), f(Tb & 1, 30), f(sign == SIGNED ? 0 : 1, 29), f(0b011110, 28, 23);
+   f((1 << ((Tb>>1)+3))|shift, 22, 16);
    f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
  }

public:
  void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
    assert(Tb == T8B || Tb == T4H || Tb == T2S, "invalid arrangement");
-   _ushll(Vd, Ta, Vn, Tb, shift);
+   _xshll(UNSIGNED, Vd, Ta, Vn, Tb, shift);
  }

  void ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
    assert(Tb == T16B || Tb == T8H || Tb == T4S, "invalid arrangement");
-   _ushll(Vd, Ta, Vn, Tb, shift);
+   _xshll(UNSIGNED, Vd, Ta, Vn, Tb, shift);
  }

  void uxtl(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb) {
    ushll(Vd, Ta, Vn, Tb, 0);
  }

+ void sshll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+   assert(Tb == T8B || Tb == T4H || Tb == T2S, "invalid arrangement");
+   _xshll(SIGNED, Vd, Ta, Vn, Tb, shift);
+ }
+
+ void sshll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+   assert(Tb == T16B || Tb == T8H || Tb == T4S, "invalid arrangement");
+   _xshll(SIGNED, Vd, Ta, Vn, Tb, shift);
+ }
+
+ void sxtl(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb) {
+   sshll(Vd, Ta, Vn, Tb, 0);
+ }
+
// Move from general purpose register
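The immh:immb table in the comment above is easy to check numerically: the 7-bit field is (1 << ((Tb >> 1) + 3)) | shift, with the arrangement ordinals T8B=0, T16B=1, T4H=2, T8H=3, T2S=4, T4S=5 from SIMD_Arrangement. A worked check in Python:

    T8B, T16B, T4H, T8H, T2S, T4S = range(6)

    def immh_immb(Tb, shift):
        assert (1 << ((Tb >> 1) + 3)) > shift, "Invalid shift value"
        return (1 << ((Tb >> 1) + 3)) | shift

    print(format(immh_immb(T8B, 3), "07b"))   # 0001011 -> row "0001 xxx", shift = 3
    print(format(immh_immb(T2S, 17), "07b"))  # 0110001 -> row "01xx xxx", shift = 17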
@@ -2649,6 +2764,15 @@ public:
    f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
  }

+ void xtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
+   starti;
+   int size_b = (int)Tb >> 1;
+   int size_a = (int)Ta >> 1;
+   assert(size_b < 3 && size_b == size_a - 1, "Invalid size specifier");
+   f(0, 31), f(Tb & 1, 30), f(0b001110, 29, 24), f(size_b, 23, 22);
+   f(0b100001001010, 21, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
  void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs)
  {
    starti;
@ -611,6 +611,16 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
|
||||
|
||||
// Generate indices for iota vector.
|
||||
address generate_iota_indices(const char *stub_name) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
address start = __ pc();
|
||||
__ emit_data64(0x0706050403020100, relocInfo::none);
|
||||
__ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
|
||||
return start;
|
||||
}
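
On a little-endian host (true of all HotSpot aarch64 targets), the two constants above lay down the byte sequence 0..15 in ascending memory order, i.e. one lane index per byte. A minimal standalone check (plain C++, independent of HotSpot):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const uint64_t lo = 0x0706050403020100ULL;
      const uint64_t hi = 0x0F0E0D0C0B0A0908ULL;
      uint8_t bytes[16];
      std::memcpy(bytes, &lo, 8);      // little-endian: least significant byte first
      std::memcpy(bytes + 8, &hi, 8);
      for (int i = 0; i < 16; i++) {
        std::printf("%d ", bytes[i]);  // prints: 0 1 2 ... 15
      }
      std::printf("\n");
      return 0;
    }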

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
@ -5958,6 +5968,8 @@ class StubGenerator: public StubCodeGenerator {
                               SharedRuntime::
                               throw_NullPointerException_at_call));

    StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();


@ -40,6 +40,7 @@ address StubRoutines::aarch64::_f2i_fixup = NULL;
address StubRoutines::aarch64::_f2l_fixup = NULL;
address StubRoutines::aarch64::_d2i_fixup = NULL;
address StubRoutines::aarch64::_d2l_fixup = NULL;
address StubRoutines::aarch64::_vector_iota_indices = NULL;
address StubRoutines::aarch64::_float_sign_mask = NULL;
address StubRoutines::aarch64::_float_sign_flip = NULL;
address StubRoutines::aarch64::_double_sign_mask = NULL;

@ -51,6 +51,7 @@ class aarch64 {
  static address _d2i_fixup;
  static address _d2l_fixup;

  static address _vector_iota_indices;
  static address _float_sign_mask;
  static address _float_sign_flip;
  static address _double_sign_mask;
@ -106,6 +107,10 @@ class aarch64 {
    return _d2l_fixup;
  }

  static address vector_iota_indices() {
    return _vector_iota_indices;
  }

  static address float_sign_mask()
  {
    return _float_sign_mask;

@ -993,6 +993,10 @@ const bool Matcher::has_predicated_vectors(void) {
  return false;
}

bool Matcher::supports_vector_variable_shifts(void) {
  return VM_Version::has_simd();
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
}

@ -2161,6 +2161,10 @@ const bool Matcher::has_predicated_vectors(void) {
  return false;
}

bool Matcher::supports_vector_variable_shifts(void) {
  return false; // not supported
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
}

@ -1573,6 +1573,10 @@ const bool Matcher::has_predicated_vectors(void) {
  return false;
}

bool Matcher::supports_vector_variable_shifts(void) {
  return false; // not supported
}

const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
}

File diff suppressed because it is too large
@ -588,6 +588,7 @@ class Assembler : public AbstractAssembler {
#endif
  };

  // Comparison predicates for integral types & FP types when using SSE
  enum ComparisonPredicate {
    eq = 0,
    lt = 1,
@ -599,6 +600,51 @@ class Assembler : public AbstractAssembler {
    _true = 7
  };

  // Comparison predicates for FP types when using AVX
  // O means ordered, U means unordered: with an ordered predicate, any comparison involving NaN is false; with an unordered one, it is true.
  // S means signaling, Q means non-signaling: a signaling predicate makes the instruction signal #IA on NaN.
  enum ComparisonPredicateFP {
    EQ_OQ = 0,
    LT_OS = 1,
    LE_OS = 2,
    UNORD_Q = 3,
    NEQ_UQ = 4,
    NLT_US = 5,
    NLE_US = 6,
    ORD_Q = 7,
    EQ_UQ = 8,
    NGE_US = 9,
    NGT_US = 0xA,
    FALSE_OQ = 0xB,
    NEQ_OQ = 0xC,
    GE_OS = 0xD,
    GT_OS = 0xE,
    TRUE_UQ = 0xF,
    EQ_OS = 0x10,
    LT_OQ = 0x11,
    LE_OQ = 0x12,
    UNORD_S = 0x13,
    NEQ_US = 0x14,
    NLT_UQ = 0x15,
    NLE_UQ = 0x16,
    ORD_S = 0x17,
    EQ_US = 0x18,
    NGE_UQ = 0x19,
    NGT_UQ = 0x1A,
    FALSE_OS = 0x1B,
    NEQ_OS = 0x1C,
    GE_OQ = 0x1D,
    GT_OQ = 0x1E,
    TRUE_US = 0x1F
};
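
The ordered/unordered split only matters when a NaN is involved: ordered predicates compare false on NaN, unordered ones compare true. A scalar model of two of the predicates above (plain C++, illustrative only, not HotSpot API):

    #include <cmath>
    #include <cstdio>

    static bool eq_oq(double a, double b) {   // ordered equal: false on NaN
      // isnan checks shown for clarity; '==' is already false on NaN.
      return !std::isnan(a) && !std::isnan(b) && a == b;
    }
    static bool neq_uq(double a, double b) {  // unordered not-equal: true on NaN
      return std::isnan(a) || std::isnan(b) || a != b;
    }

    int main() {
      const double nan = std::nan("");
      std::printf("%d %d\n", eq_oq(nan, 1.0), neq_uq(nan, 1.0));  // prints: 0 1
      std::printf("%d %d\n", eq_oq(1.0, 1.0), neq_uq(1.0, 2.0));  // prints: 1 1
      return 0;
    }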

  enum Width {
    B = 0,
    W = 1,
    D = 2,
    Q = 3
  };

  //---< calculate length of instruction >---
  // As instruction size can't be found out easily on x86/x64,
  // we just use '4' for len and maxlen.
@ -918,6 +964,7 @@ private:
  void adcq(Register dst, Register src);

  void addb(Address dst, int imm8);
  void addw(Register dst, Register src);
  void addw(Address dst, int imm16);

  void addl(Address dst, int32_t imm32);
@ -968,6 +1015,8 @@ private:
  void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void andw(Register dst, Register src);

  void andl(Address dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
@ -1093,9 +1142,11 @@ private:

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);
  void vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
  void cvtdq2ps(XMMRegister dst, XMMRegister src);
  void vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  void cvtss2sd(XMMRegister dst, XMMRegister src);
@ -1111,8 +1162,25 @@ private:
  void cvttss2sil(Register dst, XMMRegister src);
  void cvttss2siq(Register dst, XMMRegister src);

  // Convert vector double to int
  void cvttpd2dq(XMMRegister dst, XMMRegister src);

  // Convert vector float and double
  void vcvtps2pd(XMMRegister dst, XMMRegister src, int vector_len);
  void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len);

  // Convert vector long to vector FP
  void evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len);
  void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len);

  // Evex casts with truncation
  void evpmovwb(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovdw(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovdb(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovqd(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovqb(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovqw(XMMRegister dst, XMMRegister src, int vector_len);

  //Abs of packed Integer values
  void pabsb(XMMRegister dst, XMMRegister src);
  void pabsw(XMMRegister dst, XMMRegister src);
@ -1472,20 +1540,26 @@ private:
  void vmovdqu(XMMRegister dst, XMMRegister src);

  // Move Unaligned 512bit Vector
  void evmovdqub(Address dst, XMMRegister src, int vector_len);
  void evmovdqub(XMMRegister dst, Address src, int vector_len);
  void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evmovdquw(Address dst, XMMRegister src, int vector_len);
  void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len);
  void evmovdquw(XMMRegister dst, Address src, int vector_len);
  void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len);
  void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
  void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
  void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
  void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
  void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdqul(Address dst, XMMRegister src, int vector_len);
  void evmovdqul(XMMRegister dst, Address src, int vector_len);
  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovdquq(Address dst, XMMRegister src, int vector_len);
  void evmovdquq(XMMRegister dst, Address src, int vector_len);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
  void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
  void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
  void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);

  // Generic move instructions.
  void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
@ -1521,6 +1595,9 @@ private:
  // Move Quadword
  void movq(Address dst, XMMRegister src);
  void movq(XMMRegister dst, Address src);
  void movq(XMMRegister dst, XMMRegister src);
  void movq(Register dst, XMMRegister src);
  void movq(XMMRegister dst, Register src);

  void movsbl(Register dst, Address src);
  void movsbl(Register dst, Register src);
@ -1601,6 +1678,8 @@ private:
  void btrq(Address dst, int imm8);
#endif

  void orw(Register dst, Register src);

  void orl(Address dst, int32_t imm32);
  void orl(Register dst, int32_t imm32);
  void orl(Register dst, Address src);
@ -1614,17 +1693,32 @@ private:
  void orq(Register dst, Address src);
  void orq(Register dst, Register src);

  // Pack with signed saturation
  void packsswb(XMMRegister dst, XMMRegister src);
  void vpacksswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void packssdw(XMMRegister dst, XMMRegister src);
  void vpackssdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  // Pack with unsigned saturation
  void packuswb(XMMRegister dst, XMMRegister src);
  void packuswb(XMMRegister dst, Address src);
  void packusdw(XMMRegister dst, XMMRegister src);
  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpackusdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  // Pemutation of 64bit words
  // Permutations
  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
  void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpermb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
  void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
  void vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void pause();
@ -1637,11 +1731,14 @@ private:
  void pcmpestri(XMMRegister xmm1, Address src, int imm8);

  void pcmpeqb(XMMRegister dst, XMMRegister src);
  void vpcmpCCbwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);

  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
  void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);

  void vpcmpgtb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
  void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);

@ -1654,16 +1751,22 @@ private:
  void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);

  void vpcmpgtw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void pcmpeqd(XMMRegister dst, XMMRegister src);
  void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
  void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);

  void pcmpeqq(XMMRegister dst, XMMRegister src);
  void vpcmpCCq(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len);
  void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);

  void pcmpgtq(XMMRegister dst, XMMRegister src);
  void vpcmpgtq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void pmovmskb(Register dst, XMMRegister src);
  void vpmovmskb(Register dst, XMMRegister src);

@ -1672,6 +1775,7 @@ private:
  void pextrq(Register dst, XMMRegister src, int imm8);
  void pextrd(Address dst, XMMRegister src, int imm8);
  void pextrq(Address dst, XMMRegister src, int imm8);
  void pextrb(Register dst, XMMRegister src, int imm8);
  void pextrb(Address dst, XMMRegister src, int imm8);
  // SSE 2 extract
  void pextrw(Register dst, XMMRegister src, int imm8);
@ -1680,21 +1784,46 @@ private:
  // SSE 4.1 insert
  void pinsrd(XMMRegister dst, Register src, int imm8);
  void pinsrq(XMMRegister dst, Register src, int imm8);
  void pinsrb(XMMRegister dst, Register src, int imm8);
  void pinsrd(XMMRegister dst, Address src, int imm8);
  void pinsrq(XMMRegister dst, Address src, int imm8);
  void pinsrb(XMMRegister dst, Address src, int imm8);
  void insertps(XMMRegister dst, XMMRegister src, int imm8);
  // SSE 2 insert
  void pinsrw(XMMRegister dst, Register src, int imm8);
  void pinsrw(XMMRegister dst, Address src, int imm8);

  // SSE4.1 packed move
  // AVX insert
  void vpinsrd(XMMRegister dst, XMMRegister nds, Register src, int imm8);
  void vpinsrb(XMMRegister dst, XMMRegister nds, Register src, int imm8);
  void vpinsrq(XMMRegister dst, XMMRegister nds, Register src, int imm8);
  void vpinsrw(XMMRegister dst, XMMRegister nds, Register src, int imm8);
  void vinsertps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);

  // Zero extend moves
  void pmovzxbw(XMMRegister dst, XMMRegister src);
  void pmovzxbw(XMMRegister dst, Address src);

  void pmovzxbd(XMMRegister dst, XMMRegister src);
  void vpmovzxbw( XMMRegister dst, Address src, int vector_len);
  void pmovzxdq(XMMRegister dst, XMMRegister src);
  void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovzxbd(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovzxbq(XMMRegister dst, XMMRegister src, int vector_len);
  void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);

  // Sign extend moves
  void pmovsxbd(XMMRegister dst, XMMRegister src);
  void pmovsxbq(XMMRegister dst, XMMRegister src);
  void pmovsxbw(XMMRegister dst, XMMRegister src);
  void pmovsxwd(XMMRegister dst, XMMRegister src);
  void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovsxwd(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovsxwq(XMMRegister dst, XMMRegister src, int vector_len);
  void vpmovsxdq(XMMRegister dst, XMMRegister src, int vector_len);

  void evpmovwb(Address dst, XMMRegister src, int vector_len);
  void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);

@ -1702,10 +1831,6 @@ private:

  void evpmovdb(Address dst, XMMRegister src, int vector_len);

  // Sign extend moves
  void pmovsxbw(XMMRegister dst, XMMRegister src);
  void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len);

  // Multiply add
  void pmaddwd(XMMRegister dst, XMMRegister src);
  void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@ -1749,10 +1874,17 @@ private:
  void pshufd(XMMRegister dst, Address src, int mode);
  void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len);

  // Shuffle Packed Low Words
  // Shuffle Packed High/Low Words
  void pshufhw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, Address src, int mode);

  //shuffle floats and doubles
  void pshufps(XMMRegister, XMMRegister, int);
  void pshufpd(XMMRegister, XMMRegister, int);
  void vpshufps(XMMRegister, XMMRegister, XMMRegister, int, int);
  void vpshufpd(XMMRegister, XMMRegister, XMMRegister, int, int);

  // Shuffle packed values at 128 bit granularity
  void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);

@ -1768,6 +1900,9 @@ private:
  void vptest(XMMRegister dst, XMMRegister src);
  void vptest(XMMRegister dst, Address src);

  // Vector compare
  void vptest(XMMRegister dst, XMMRegister src, int vector_len);

  // Interleave Low Bytes
  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src);
@ -1841,6 +1976,7 @@ private:
  void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);

  void pblendw(XMMRegister dst, XMMRegister src, int imm8);
  void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);

  void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
  void sha1nexte(XMMRegister dst, XMMRegister src);
@ -1959,6 +2095,7 @@ private:
  void xorl(Register dst, Register src);

  void xorb(Register dst, Address src);
  void xorw(Register dst, Register src);

  void xorq(Register dst, Address src);
  void xorq(Register dst, Register src);
@ -1997,6 +2134,8 @@ private:


  //====================VECTOR ARITHMETIC=====================================
  void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
  void evpmovq2m(KRegister kdst, XMMRegister src, int vector_len);

  // Add Packed Floating-Point Values
  void addpd(XMMRegister dst, XMMRegister src);
@ -2106,13 +2245,41 @@ private:
  // Multiply packed integers (only shorts and ints)
  void pmullw(XMMRegister dst, XMMRegister src);
  void pmulld(XMMRegister dst, XMMRegister src);
  void pmuludq(XMMRegister dst, XMMRegister src);
  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Minimum of packed integers
  void pminsb(XMMRegister dst, XMMRegister src);
  void vpminsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void pminsw(XMMRegister dst, XMMRegister src);
  void vpminsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void pminsd(XMMRegister dst, XMMRegister src);
  void vpminsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void vpminsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void minps(XMMRegister dst, XMMRegister src);
  void vminps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void minpd(XMMRegister dst, XMMRegister src);
  void vminpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);

  // Maximum of packed integers
  void pmaxsb(XMMRegister dst, XMMRegister src);
  void vpmaxsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void pmaxsw(XMMRegister dst, XMMRegister src);
  void vpmaxsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void pmaxsd(XMMRegister dst, XMMRegister src);
  void vpmaxsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void vpmaxsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void maxps(XMMRegister dst, XMMRegister src);
  void vmaxps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
  void maxpd(XMMRegister dst, XMMRegister src);
  void vmaxpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);

  // Shift left packed integers
  void psllw(XMMRegister dst, int shift);
  void pslld(XMMRegister dst, int shift);
@ -2154,9 +2321,22 @@ private:
  void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void evpsravw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Variable shift left packed integers
  void vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsllvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Variable shift right packed integers
  void vpsrlvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrlvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Variable shift right arithmetic packed integers
  void vpsravd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void evpsravq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

@ -2164,6 +2344,7 @@ private:
  void pand(XMMRegister dst, XMMRegister src);
  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  // Andn packed integers
@ -2176,10 +2357,15 @@ private:
  void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);

  // Xor packed integers
  void pxor(XMMRegister dst, XMMRegister src);
  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

@ -2257,7 +2443,21 @@ private:
  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);

  void evpgatherdd(XMMRegister dst, KRegister k1, Address src, int vector_len);
  // Gather AVX2 and AVX3
  void vpgatherdd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
  void vpgatherdq(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
  void vgatherdpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
  void vgatherdps(XMMRegister dst, Address src, XMMRegister mask, int vector_len);
  void evpgatherdd(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evpgatherdq(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evgatherdpd(XMMRegister dst, KRegister mask, Address src, int vector_len);
  void evgatherdps(XMMRegister dst, KRegister mask, Address src, int vector_len);

  //Scatter AVX3 only
  void evpscatterdd(Address dst, KRegister mask, XMMRegister src, int vector_len);
  void evpscatterdq(Address dst, KRegister mask, XMMRegister src, int vector_len);
  void evscatterdps(Address dst, KRegister mask, XMMRegister src, int vector_len);
  void evscatterdpd(Address dst, KRegister mask, XMMRegister src, int vector_len);

  // Carry-Less Multiplication Quadword
  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
@ -2270,14 +2470,56 @@ private:
  // runtime code and native libraries.
  void vzeroupper();

  // AVX support for vectorized conditional move (float/double). The following two instructions are used only as a coupled pair.
  void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
  // Vector double compares
  void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               ComparisonPredicateFP comparison, int vector_len);

  // Vector float compares
  void vcmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int comparison, int vector_len);
  void evcmpps(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               ComparisonPredicateFP comparison, int vector_len);

  // Vector integer compares
  void vpcmpgtd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len);
  void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
               int comparison, int vector_len);

  // Vector long compares
  void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len);
  void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
               int comparison, int vector_len);

  // Vector byte compares
  void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len);
  void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
               int comparison, int vector_len);

  // Vector short compares
  void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len);
  void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, Address src,
               int comparison, int vector_len);

  // Vector blends
  void blendvps(XMMRegister dst, XMMRegister src);
  void blendvpd(XMMRegister dst, XMMRegister src);
  void pblendvb(XMMRegister dst, XMMRegister src);
  void blendvpb(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
  void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
  void vpblendvb(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len);
  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
  void evblendmpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evblendmps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpblendmb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpblendmw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpblendmd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
  void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
 protected:
  // Next instructions require address alignment 16 bytes SSE mode.
  // They should be called only from corresponding MacroAssembler instructions.
@ -2373,7 +2615,8 @@ public:
  // Internal encoding data used in compressed immediate offset programming
  void set_evex_encoding(int value) { _evex_encoding = value; }

  // Set the Evex.Z field to be used to clear all non directed XMM/YMM/ZMM components
  // When the Evex.Z field is set (true), it is used to clear all non directed XMM/YMM/ZMM components.
  // This method unsets it so that merge semantics are used instead.
void reset_is_clear_context(void) { _is_clear_context = false; }
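
In other words, with the Z bit set, lanes whose mask bit is clear are zeroed; after reset_is_clear_context() they keep the previous destination value instead. A scalar model of the two masking modes (plain C++, illustrative only, not HotSpot API):

    #include <cstdio>

    // One lane of an EVEX masked move; 'zero' models the Evex.Z bit.
    static int masked_lane(bool mask_bit, bool zero, int dst, int src) {
      if (mask_bit) return src;   // selected lane: always written
      return zero ? 0 : dst;      // masked-off lane: zeroed or merged
    }

    int main() {
      std::printf("%d\n", masked_lane(false, true,  7, 5));  // 0 (zeroing)
      std::printf("%d\n", masked_lane(false, false, 7, 5));  // 7 (merge)
      std::printf("%d\n", masked_lane(true,  true,  7, 5));  // 5
      return 0;
    }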

  // Map back to current assembler so that we can manage object level association

File diff suppressed because it is too large
@ -28,6 +28,8 @@
// C2_MacroAssembler contains high-level macros for C2

 public:
  Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);

  // special instructions for EVEX
  void setvectmask(Register dst, Register src);
  void restorevectmask();
@ -71,25 +73,69 @@ public:
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);

  void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src,
               XMMRegister tmp = xnoreg);
  void vpminmax(int opcode, BasicType elem_bt,
                XMMRegister dst, XMMRegister src1, XMMRegister src2,
                int vlen_enc);

  void vminmax_fp(int opcode, BasicType elem_bt,
                  XMMRegister dst, XMMRegister a, XMMRegister b,
                  XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                  int vlen_enc);
  void evminmax_fp(int opcode, BasicType elem_bt,
                   XMMRegister dst, XMMRegister a, XMMRegister b,
                   KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                   int vlen_enc);

  void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister src);
  void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);

  void vshiftd(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftd_imm(int opcode, XMMRegister dst, int shift);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister src);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftq_imm(int opcode, XMMRegister dst, int shift);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Reductions for vectors of ints, longs, floats, and doubles.
  void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg);
  void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);
  void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);

  // dst = src1 + reduce(op, src2) using vtmp as temps
  void insert(BasicType typ, XMMRegister dst, Register val, int idx);
  void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx);
  void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len);
  void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
  void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);

  // extract
  void extract(BasicType typ, Register dst, XMMRegister src, int idx);
  XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
  void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex);
  void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp = noreg, XMMRegister vtmp = xnoreg);

  // blend
  void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
  void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);

  void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt);
  void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);

  // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

  // dst = src1 reduce(op, src2) using vtmp as temps
  void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
  void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
@ -99,32 +145,62 @@ public:
  void reduce_fp(int opcode, int vlen,
                 XMMRegister dst, XMMRegister src,
                 XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
  void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                         XMMRegister dst, XMMRegister src,
                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
  void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid,
                          XMMRegister dst, XMMRegister src,
                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
 private:
  void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  // Int Reduction
  void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Byte Reduction
  void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Short Reduction
  void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Long Reduction
#ifdef _LP64
  void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64

  // Float Reduction
  void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  // Double Reduction
  void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src);
  void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
  // Base reduction instruction
  void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
  void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);

 public:


@ -112,6 +112,7 @@ void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}


void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}
@ -2495,6 +2496,7 @@ void MacroAssembler::movdqu(XMMRegister dst, Address src) {

void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  if (dst->encoding() == src->encoding()) return;
  Assembler::movdqu(dst, src);
}

@ -2519,6 +2521,7 @@ void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {

void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  if (dst->encoding() == src->encoding()) return;
  Assembler::vmovdqu(dst, src);
}

@ -2532,6 +2535,64 @@ void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scrat
  }
}


void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
  if (reachable(src)) {
    kmovwl(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    kmovwl(dst, Address(scratch_reg, 0));
  }
}

void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    if (mask == k0) {
      Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
    } else {
      Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
    }
  } else {
    lea(scratch_reg, src);
    if (mask == k0) {
      Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
    } else {
      Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
    }
  }
}

void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
                               int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
  }
}

void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (reachable(src)) {
    Assembler::evmovdquq(dst, as_Address(src), vector_len);
@ -3019,6 +3080,98 @@ void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src,
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
}

void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
                               AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
  }
}

void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                             int comparison, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
  }
}

void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                             int comparison, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
  }
}

void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                             int comparison, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
  }
}

void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                             int comparison, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
  }
}

void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
  if (width == Assembler::Q) {
    Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
  } else {
    Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
  }
}

void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
  int eq_cond_enc = 0x29;
  int gt_cond_enc = 0x37;
  if (width != Assembler::Q) {
    eq_cond_enc = 0x74 + width;
    gt_cond_enc = 0x64 + width;
  }
  switch (cond) {
  case eq:
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    break;
  case neq:
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case le:
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case nlt:
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
    break;
  case lt:
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
    break;
  case nle:
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
    break;
  default:
    assert(false, "Should not reach here");
  }
}
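
Only eq and gt exist as native SSE/AVX integer compare encodings, so the remaining predicates are synthesized by swapping operands and/or XOR-ing the result mask with all-ones. The identities, checked on scalar lanes (plain C++, illustrative only, not HotSpot API):

    #include <cassert>

    static bool gt(int a, int b) { return a > b; }  // the only native compare besides eq

    int main() {
      for (int a = -2; a <= 2; a++) {
        for (int b = -2; b <= 2; b++) {
          assert((a != b) == !(a == b));  // neq = eq, then XOR with all-ones
          assert((a <= b) == !gt(a, b));  // le  = gt, then XOR with all-ones
          assert((a <  b) ==  gt(b, a));  // lt  = gt with operands swapped
          assert((a >= b) == !gt(b, a));  // nlt = swapped gt, then XOR with all-ones
        }
      }
      return 0;
    }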
|
||||
|
||||
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
|
||||
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
|
||||
Assembler::vpmovzxbw(dst, src, vector_len);
|
||||
@ -3143,6 +3296,16 @@ void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
|
||||
bool merge, int vector_len, Register scratch_reg) {
|
||||
if (reachable(src)) {
|
||||
Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
|
||||
} else {
|
||||
lea(scratch_reg, src);
|
||||
Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
|
||||
if (reachable(src)) {
|
||||
vdivsd(dst, nds, as_Address(src));
|
||||
@ -3239,7 +3402,14 @@ void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src,
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------------
|
||||
void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
|
||||
if (reachable(src)) {
|
||||
Assembler::vpermd(dst, nds, as_Address(src), vector_len);
|
||||
} else {
|
||||
lea(scratch_reg, src);
|
||||
Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
|
||||
const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
|
||||
@ -5765,7 +5935,7 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
|
||||
|
||||
bind(VECTOR64_LOOP);
|
||||
// AVX512 code to compare 64 byte vectors.
|
||||
evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
|
||||
evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
|
||||
evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
|
||||
kortestql(k7, k7);
|
||||
jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
|
||||
@ -5784,7 +5954,7 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
|
||||
notq(tmp2);
|
||||
kmovql(k3, tmp2);
|
||||
|
||||
evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
|
||||
evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
|
||||
evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
|
||||
|
||||
ktestql(k7, k3);
@ -7579,7 +7749,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
  notl(result);
  kmovdl(k3, result);

  evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
  evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
  evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
  ktestd(k2, k3);
  jcc(Assembler::carryClear, return_zero);
@ -7604,7 +7774,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
  negptr(len);

  bind(copy_32_loop);
  evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
  evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
  evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
  kortestdl(k2, k2);
  jcc(Assembler::carryClear, return_zero);
@ -7629,7 +7799,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le

  kmovdl(k3, result);

  evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
  evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
  evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
  ktestd(k2, k3);
  jcc(Assembler::carryClear, return_zero);
@ -7774,7 +7944,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
  // inflate 32 chars per iter
  bind(copy_32_loop);
  vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
  evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
  evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
  addptr(len, 32);
  jcc(Assembler::notZero, copy_32_loop);

@ -7789,7 +7959,7 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
  notl(tmp3_aliased);
  kmovdl(k2, tmp3_aliased);
  evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
  evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
  evmovdquw(Address(dst, 0), k2, tmp1, /*merge*/ true, Assembler::AVX_512bit);
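  // A masked store has to merge: EVEX stores to memory leave elements
  // outside the mask untouched (zero-masking is not available for a memory
  // destination), so merge == true is the only valid choice here.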

  jmp(done);
  bind(avx3_threshold);

@ -1076,15 +1076,59 @@ public:
  void movdqu(XMMRegister dst, Address src);
  void movdqu(XMMRegister dst, XMMRegister src);
  void movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg = rscratch1);

  void kmovwl(KRegister dst, Register src) { Assembler::kmovwl(dst, src); }
  void kmovwl(Register dst, KRegister src) { Assembler::kmovwl(dst, src); }
  void kmovwl(KRegister dst, Address src) { Assembler::kmovwl(dst, src); }
  void kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  // AVX Unaligned forms
  void vmovdqu(Address dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, Address src);
  void vmovdqu(XMMRegister dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  // AVX512 Unaligned
  void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
  void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
  void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
  void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
  void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);

  void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }
  void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
  void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }
  void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); }
  void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);

  void evmovdqul(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
  void evmovdqul(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); }
  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
    if (dst->encoding() == src->encoding()) return;
    Assembler::evmovdqul(dst, src, vector_len);
  }
  void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
  void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); }
  void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
    if (dst->encoding() == src->encoding() && mask == k0) return;
    Assembler::evmovdqul(dst, mask, src, merge, vector_len);
  }
  void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);

  void evmovdquq(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch);
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
    if (dst->encoding() == src->encoding()) return;
    Assembler::evmovdquq(dst, src, vector_len);
  }
  void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
  void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); }
  void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
    if (dst->encoding() == src->encoding() && mask == k0) return;
    Assembler::evmovdquq(dst, mask, src, merge, vector_len);
  }
  void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
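  // Note: the XMM-to-XMM overloads above elide the move when source and
  // destination are the same register and no real masking is in effect
  // (k0 selects all lanes), so redundant self-moves cost nothing.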

  // Move Aligned Double Quadword
  void movdqa(XMMRegister dst, Address src) { Assembler::movdqa(dst, src); }
@ -1206,6 +1250,30 @@ public:
  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);

  // Vector compares
  void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len) { Assembler::evpcmpd(kdst, mask, nds, src, comparison, vector_len); }
  void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
               int comparison, int vector_len, Register scratch_reg);
  void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len) { Assembler::evpcmpq(kdst, mask, nds, src, comparison, vector_len); }
  void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
               int comparison, int vector_len, Register scratch_reg);
  void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len) { Assembler::evpcmpb(kdst, mask, nds, src, comparison, vector_len); }
  void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
               int comparison, int vector_len, Register scratch_reg);
  void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src,
               int comparison, int vector_len) { Assembler::evpcmpw(kdst, mask, nds, src, comparison, vector_len); }
  void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
               int comparison, int vector_len, Register scratch_reg);


  // Emit comparison instruction for the specified comparison predicate.
  void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg);
  void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);

  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
  void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }
@ -1234,6 +1302,7 @@ public:
  void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vptest(XMMRegister dst, XMMRegister src);
  void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); }

  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
@ -1252,6 +1321,8 @@ public:
  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);

  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
@ -1307,6 +1378,9 @@ public:
  void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
  void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }

  void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); }
  void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);

  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2 && VM_Version::supports_avx512novl()) {
      Assembler::vinserti32x4(dst, dst, src, imm8);

@ -587,6 +587,29 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x03020100, relocInfo::none, 0);
    __ emit_data(0x07060504, relocInfo::none, 0);
    __ emit_data(0x0B0A0908, relocInfo::none, 0);
    __ emit_data(0x0F0E0D0C, relocInfo::none, 0);
    __ emit_data(0x13121110, relocInfo::none, 0);
    __ emit_data(0x17161514, relocInfo::none, 0);
    __ emit_data(0x1B1A1918, relocInfo::none, 0);
    __ emit_data(0x1F1E1D1C, relocInfo::none, 0);
    __ emit_data(0x23222120, relocInfo::none, 0);
    __ emit_data(0x27262524, relocInfo::none, 0);
    __ emit_data(0x2B2A2928, relocInfo::none, 0);
    __ emit_data(0x2F2E2D2C, relocInfo::none, 0);
    __ emit_data(0x33323130, relocInfo::none, 0);
    __ emit_data(0x37363534, relocInfo::none, 0);
    __ emit_data(0x3B3A3938, relocInfo::none, 0);
    __ emit_data(0x3F3E3D3C, relocInfo::none, 0);
    return start;
  }
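  // The 64 bytes emitted above are the identity byte permutation
  // 0x00..0x3F ("iota"); loaded as a vector it provides ascending lane
  // indices for shuffles and index computations.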

  address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
@ -627,6 +650,40 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    assert(len != Assembler::AVX_NoVec, "vector len must be specified");
    __ emit_data(val0, relocInfo::none, 0);
    __ emit_data(val1, relocInfo::none, 0);
    __ emit_data(val2, relocInfo::none, 0);
    __ emit_data(val3, relocInfo::none, 0);
    if (len >= Assembler::AVX_256bit) {
      __ emit_data(val4, relocInfo::none, 0);
      __ emit_data(val5, relocInfo::none, 0);
      __ emit_data(val6, relocInfo::none, 0);
      __ emit_data(val7, relocInfo::none, 0);
      if (len >= Assembler::AVX_512bit) {
        __ emit_data(val8, relocInfo::none, 0);
        __ emit_data(val9, relocInfo::none, 0);
        __ emit_data(val10, relocInfo::none, 0);
        __ emit_data(val11, relocInfo::none, 0);
        __ emit_data(val12, relocInfo::none, 0);
        __ emit_data(val13, relocInfo::none, 0);
        __ emit_data(val14, relocInfo::none, 0);
        __ emit_data(val15, relocInfo::none, 0);
      }
    }

    return start;
  }
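  // The nested guards above emit only as many 32-bit words as the requested
  // vector width needs: 4 for 128-bit, 8 for 256-bit, 16 for 512-bit.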

  //----------------------------------------------------------------------------------------------------
  // Non-destructive plausibility checks for oops

@ -3902,8 +3959,19 @@ class StubGenerator: public StubCodeGenerator {
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
    StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff);
    StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff);
    StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0, 0, 0);
    StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
    StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
    StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
    StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
    StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
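    // vector_32_bit_mask and vector_64_bit_mask are 512-bit constants with
    // only the low 32 (resp. 64) bits set, emitted via the custom-i32 stub.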

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

@ -809,6 +809,21 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    __ emit_data64(0x1716151413121110, relocInfo::none);
    __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
    __ emit_data64(0x2726252423222120, relocInfo::none);
    __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
    __ emit_data64(0x3736353433323130, relocInfo::none);
    __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
    return start;
  }
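  // Same identity byte table (0x00..0x3F) as the 32-bit stub generator,
  // emitted here in 64-bit chunks.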

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
@ -854,6 +869,57 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }
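  // Eight 64-bit copies broadcast the mask across a full 512-bit constant
  // area, so the same stub serves 128-, 256-, and 512-bit vector loads.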

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    assert(len != Assembler::AVX_NoVec, "vector len must be specified");
    __ emit_data(val0, relocInfo::none, 0);
    __ emit_data(val1, relocInfo::none, 0);
    __ emit_data(val2, relocInfo::none, 0);
    __ emit_data(val3, relocInfo::none, 0);
    if (len >= Assembler::AVX_256bit) {
      __ emit_data(val4, relocInfo::none, 0);
      __ emit_data(val5, relocInfo::none, 0);
      __ emit_data(val6, relocInfo::none, 0);
      __ emit_data(val7, relocInfo::none, 0);
      if (len >= Assembler::AVX_512bit) {
        __ emit_data(val8, relocInfo::none, 0);
        __ emit_data(val9, relocInfo::none, 0);
        __ emit_data(val10, relocInfo::none, 0);
        __ emit_data(val11, relocInfo::none, 0);
        __ emit_data(val12, relocInfo::none, 0);
        __ emit_data(val13, relocInfo::none, 0);
        __ emit_data(val14, relocInfo::none, 0);
        __ emit_data(val15, relocInfo::none, 0);
      }
    }

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
@ -6769,9 +6835,20 @@ address generate_avx_ghash_processBlocks() {
    StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
    StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
    StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
    StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0, 0, 0);
    StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
    StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
    StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
    StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
    StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

@ -44,12 +44,21 @@ address StubRoutines::x86::_upper_word_mask_addr = NULL;
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;
address StubRoutines::x86::_vector_short_to_byte_mask = NULL;
address StubRoutines::x86::_vector_int_to_byte_mask = NULL;
address StubRoutines::x86::_vector_int_to_short_mask = NULL;
address StubRoutines::x86::_vector_all_bits_set = NULL;
address StubRoutines::x86::_vector_short_shuffle_mask = NULL;
address StubRoutines::x86::_vector_int_shuffle_mask = NULL;
address StubRoutines::x86::_vector_long_shuffle_mask = NULL;
address StubRoutines::x86::_vector_float_sign_mask = NULL;
address StubRoutines::x86::_vector_float_sign_flip = NULL;
address StubRoutines::x86::_vector_double_sign_mask = NULL;
address StubRoutines::x86::_vector_double_sign_flip = NULL;
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
address StubRoutines::x86::_vector_long_sign_mask = NULL;
address StubRoutines::x86::_vector_iota_indices = NULL;
address StubRoutines::x86::_vector_32_bit_mask = NULL;
address StubRoutines::x86::_vector_64_bit_mask = NULL;
#ifdef _LP64
address StubRoutines::x86::_k256_W_adr = NULL;
address StubRoutines::x86::_k512_W_addr = NULL;

@ -146,8 +146,17 @@ class x86 {
  static address _vector_float_sign_flip;
  static address _vector_double_sign_mask;
  static address _vector_double_sign_flip;
  static address _vector_byte_perm_mask;
  static address _vector_long_sign_mask;
  static address _vector_all_bits_set;
  static address _vector_byte_perm_mask;
  static address _vector_int_to_byte_mask;
  static address _vector_int_to_short_mask;
  static address _vector_32_bit_mask;
  static address _vector_64_bit_mask;
  static address _vector_int_shuffle_mask;
  static address _vector_short_shuffle_mask;
  static address _vector_long_shuffle_mask;
  static address _vector_iota_indices;
#ifdef _LP64
  static juint _k256_W[];
  static address _k256_W_adr;
@ -248,13 +257,50 @@ class x86 {
    return _vector_double_sign_flip;
  }

  static address vector_all_bits_set() {
    return _vector_all_bits_set;
  }

  static address vector_byte_perm_mask() {
    return _vector_byte_perm_mask;
  }

  static address vector_int_to_byte_mask() {
    return _vector_int_to_byte_mask;
  }

  static address vector_int_to_short_mask() {
    return _vector_int_to_short_mask;
  }

  static address vector_32_bit_mask() {
    return _vector_32_bit_mask;
  }

  static address vector_64_bit_mask() {
    return _vector_64_bit_mask;
  }

  static address vector_int_shuffle_mask() {
    return _vector_int_shuffle_mask;
  }

  static address vector_short_shuffle_mask() {
    return _vector_short_shuffle_mask;
  }

  static address vector_long_shuffle_mask() {
    return _vector_long_shuffle_mask;
  }

  static address vector_long_sign_mask() {
    return _vector_long_sign_mask;
  }

  static address vector_iota_indices() {
    return _vector_iota_indices;
  }

#ifdef _LP64
  static address k256_W_addr() { return _k256_W_adr; }
  static address k512_W_addr() { return _k512_W_addr; }

File diff suppressed because it is too large
@ -3315,7 +3315,7 @@ operand immI() %{
%}

// Constant for test vs zero
operand immI0() %{
operand immI_0() %{
  predicate(n->get_int() == 0);
  match(ConI);

@ -3325,7 +3325,7 @@ operand immI0() %{
%}

// Constant for increment
operand immI1() %{
operand immI_1() %{
  predicate(n->get_int() == 1);
  match(ConI);

@ -3419,15 +3419,6 @@ operand immI_32_63() %{
  interface(CONST_INTER);
%}

operand immI_1() %{
  predicate( n->get_int() == 1 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_2() %{
  predicate( n->get_int() == 2 );
  match(ConI);
@ -3446,6 +3437,26 @@ operand immI_3() %{
  interface(CONST_INTER);
%}

operand immI_4()
%{
  predicate(n->get_int() == 4);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_8()
%{
  predicate(n->get_int() == 8);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}
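// immI_4 and immI_8 match those exact constants; patterns elsewhere in the
// matcher can key on them where a specific element size or scale factor is
// required (their consumers live in other parts of this file).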

// Pointer Immediate
operand immP() %{
  match(ConP);
@ -3815,6 +3826,18 @@ operand eRegP() %{
  interface(REG_INTER);
%}

operand rRegP() %{
  constraint(ALLOC_IN_RC(int_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}
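// rRegP mirrors the operand name used by x86_64.ad, so instruct patterns
// shared between the 32- and 64-bit ports can refer to a pointer register
// uniformly.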

// On windows95, EBP is not safe to use for implicit null tests.
operand eRegP_no_EBP() %{
  constraint(ALLOC_IN_RC(int_reg_no_ebp));
@ -3947,6 +3970,15 @@ operand eADXRegL_low_only() %{
  interface(REG_INTER);
%}

// Flags register, used as output of compare instructions
operand rFlagsReg() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS" %}
  interface(REG_INTER);
%}
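// Like rRegP above, rFlagsReg adopts the 64-bit port's name for the flags
// register, so shared patterns need not distinguish it from eFlagsReg.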

// Flags register, used as output of compare instructions
operand eFlagsReg() %{
  constraint(ALLOC_IN_RC(int_flags));
@ -4077,6 +4109,14 @@ operand regF() %{
  interface(REG_INTER);
%}

operand legRegF() %{
  predicate( UseSSE>=1 );
  constraint(ALLOC_IN_RC(float_reg_legacy));
  match(RegF);
  format %{ %}
  interface(REG_INTER);
%}

// Float register operands
operand vlRegF() %{
  constraint(ALLOC_IN_RC(float_reg_vl));
@ -4096,6 +4136,14 @@ operand regD() %{
%}

// Double register operands
operand legRegD() %{
  predicate( UseSSE>=2 );
  constraint(ALLOC_IN_RC(double_reg_legacy));
  match(RegD);
  format %{ %}
  interface(REG_INTER);
%}
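// The float_reg_legacy/double_reg_legacy classes restrict allocation to the
// legacy XMM registers (xmm0-xmm15), for instructions that have no EVEX
// encoding and therefore cannot address xmm16-xmm31.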

operand vlRegD() %{
  constraint(ALLOC_IN_RC(double_reg_vl));
  match(RegD);
@ -5846,6 +5894,46 @@ instruct loadKlass(eRegP dst, memory mem) %{
  ins_pipe( ialu_reg_mem );
%}

// Load Float
instruct MoveF2LEG(legRegF dst, regF src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Load Float
instruct MoveLEG2F(regF dst, legRegF src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Load Double
instruct MoveD2LEG(legRegD dst, regD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Load Double
instruct MoveLEG2D(regD dst, legRegD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
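// The four Move*LEG instructs above let the register allocator shuttle
// float and double values between the general and legacy XMM register
// classes (see the "# if src != dst" format comments).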

// Load Double
instruct loadDPR(regDPR dst, memory mem) %{
  predicate(UseSSE<=1);
@ -5971,7 +6059,7 @@ instruct loadConI(rRegI dst, immI src) %{
%}

// Load Constant zero
instruct loadConI0(rRegI dst, immI0 src, eFlagsReg cr) %{
instruct loadConI0(rRegI dst, immI_0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

@ -7083,7 +7171,7 @@ instruct addI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  ins_pipe( ialu_reg );
%}

instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
instruct incI_eReg(rRegI dst, immI_1 src, eFlagsReg cr) %{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
  effect(KILL cr);
@ -7183,7 +7271,7 @@ instruct addI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  ins_pipe( ialu_mem_imm );
%}

instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{
instruct incI_mem(memory dst, immI_1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

@ -7552,7 +7640,7 @@ instruct subI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
%}

// Subtract from a pointer
instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{
instruct subP_eReg(eRegP dst, rRegI src, immI_0 zero, eFlagsReg cr) %{
  match(Set dst (AddP dst (SubI zero src)));
  effect(KILL cr);

@ -7563,7 +7651,7 @@ instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{
  ins_pipe( ialu_reg_reg );
%}

instruct negI_eReg(rRegI dst, immI0 zero, eFlagsReg cr) %{
instruct negI_eReg(rRegI dst, immI_0 zero, eFlagsReg cr) %{
  match(Set dst (SubI zero dst));
  effect(KILL cr);

@ -8017,7 +8105,7 @@ instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlag

// Integer Shift Instructions
// Shift Left by one
instruct shlI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
instruct shlI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

@ -8053,7 +8141,7 @@ instruct salI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
%}

// Arithmetic shift right by one
instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
instruct sarI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

@ -8065,7 +8153,7 @@ instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
%}

// Arithmetic shift right by one
instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{
instruct sarI_mem_1(memory dst, immI_1 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);
  format %{ "SAR    $dst,$shift" %}
@ -8110,7 +8198,7 @@ instruct sarI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
%}

// Logical shift right by one
instruct shrI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
instruct shrI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

@ -8266,7 +8354,7 @@ instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1
  ins_pipe(ialu_reg_mem);
%}

instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{
instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI_0 imm_zero, eFlagsReg cr) %{
  match(Set dst (AndI (SubI imm_zero src) src));
  predicate(UseBMI1Instructions);
  effect(KILL cr);
@ -8279,7 +8367,7 @@ instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{
  ins_pipe(ialu_reg);
%}

instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{
instruct blsiI_rReg_mem(rRegI dst, memory src, immI_0 imm_zero, eFlagsReg cr) %{
  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ));
  predicate(UseBMI1Instructions);
  effect(KILL cr);
@ -8431,7 +8519,7 @@ instruct orI_mem_imm(memory dst, immI src, eFlagsReg cr) %{

// ROL/ROR
// ROL expand
instruct rolI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
instruct rolI_eReg_imm1(rRegI dst, immI_1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
@ -8460,7 +8548,7 @@ instruct rolI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr) %{
// end of ROL expand

// ROL 32bit by one once
instruct rolI_eReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
instruct rolI_eReg_i1(rRegI dst, immI_1 lshift, immI_M1 rshift, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));

  expand %{
@ -8479,7 +8567,7 @@ instruct rolI_eReg_i8(rRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
%}

// ROL 32bit var by var once
instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI_0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift))));

  expand %{
@ -8497,7 +8585,7 @@ instruct rolI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr
%}

// ROR expand
instruct rorI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
instruct rorI_eReg_imm1(rRegI dst, immI_1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
@ -8526,7 +8614,7 @@ instruct rorI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr)%{
// end of ROR expand

// ROR right once
instruct rorI_eReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
instruct rorI_eReg_i1(rRegI dst, immI_1 rshift, immI_M1 lshift, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));

  expand %{
@ -8545,7 +8633,7 @@ instruct rorI_eReg_i8(rRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
%}

// ROR 32bit var by var once
instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI_0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift))));

  expand %{
@ -8713,7 +8801,7 @@ instruct cmpLTMask(eCXRegI dst, ncxRegI p, ncxRegI q, eFlagsReg cr) %{
  ins_pipe(pipe_slow);
%}

instruct cmpLTMask0(rRegI dst, immI0 zero, eFlagsReg cr) %{
instruct cmpLTMask0(rRegI dst, immI_0 zero, eFlagsReg cr) %{
  match(Set dst (CmpLTMask dst zero));
  effect(DEF dst, KILL cr);
  ins_cost(100);
@ -8827,7 +8915,7 @@ instruct overflowSubI_rReg_imm(eFlagsReg cr, rRegI op1, immI op2)
  ins_pipe(ialu_reg_reg);
%}

instruct overflowNegI_rReg(eFlagsReg cr, immI0 zero, eAXRegI op2)
instruct overflowNegI_rReg(eFlagsReg cr, immI_0 zero, eAXRegI op2)
%{
  match(Set cr (OverflowSubI zero op2));
  effect(DEF cr, USE_KILL op2);
@ -11979,7 +12067,7 @@ instruct compI_eReg_mem(eFlagsReg cr, rRegI op1, memory op2) %{
  ins_pipe( ialu_cr_reg_mem );
%}

instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{
instruct testI_reg( eFlagsReg cr, rRegI src, immI_0 zero ) %{
  match(Set cr (CmpI src zero));
  effect( DEF cr, USE src );

@ -11989,7 +12077,7 @@ instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 zero ) %{
instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI_0 zero ) %{
  match(Set cr (CmpI (AndI src con) zero));

  format %{ "TEST   $src,$con" %}
@ -11998,7 +12086,7 @@ instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 zero ) %{
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI0 zero ) %{
instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI_0 zero ) %{
  match(Set cr (CmpI (AndI src mem) zero));

  format %{ "TEST   $src,$mem" %}
@ -12048,7 +12136,7 @@ instruct compU_eReg_mem(eFlagsRegU cr, rRegI op1, memory op2) %{
//   ins_encode( OpcP, RegMem( op1, op2) );
//%}

instruct testU_reg( eFlagsRegU cr, rRegI src, immI0 zero ) %{
instruct testU_reg( eFlagsRegU cr, rRegI src, immI_0 zero ) %{
  match(Set cr (CmpU src zero));

  format %{ "TESTu  $src,$src" %}
@ -12125,7 +12213,7 @@ instruct testP_reg( eFlagsReg cr, eRegP src, immP0 zero ) %{
// Cisc-spilled version of testP_reg
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{
instruct testP_Reg_mem( eFlagsReg cr, memory op, immI_0 zero ) %{
  match(Set cr (CmpP (LoadP op) zero));

  format %{ "TEST   $op,0xFFFFFFFF" %}
@ -13496,7 +13584,7 @@ instruct tlsLoadP(eRegP dst, eFlagsReg cr) %{
//   match(Set dst (CopyI src));
// %}
//
// instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
// instruct incI_eReg(rRegI dst, immI_1 src, eFlagsReg cr) %{
//   match(Set dst (AddI dst src));
//   effect(KILL cr);
// %}

@ -2871,7 +2871,7 @@ operand immI()
%}

// Constant for test vs zero
operand immI0()
operand immI_0()
%{
  predicate(n->get_int() == 0);
  match(ConI);
@ -2882,7 +2882,7 @@ operand immI0()
%}

// Constant for increment
operand immI1()
operand immI_1()
%{
  predicate(n->get_int() == 1);
  match(ConI);
@ -2903,6 +2903,36 @@ operand immI_M1()
  interface(CONST_INTER);
%}

operand immI_2()
%{
  predicate(n->get_int() == 2);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_4()
%{
  predicate(n->get_int() == 4);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_8()
%{
  predicate(n->get_int() == 8);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Valid scale values for addressing modes
operand immI2()
%{
@ -5217,19 +5247,19 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
  match(Set dst (MaxF a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
  format %{
     "blendvps         $btmp,$b,$a,$b        \n\t"
     "blendvps         $atmp,$a,$b,$b        \n\t"
     "vblendvps        $btmp,$b,$a,$b        \n\t"
     "vblendvps        $atmp,$a,$b,$b        \n\t"
     "vmaxss           $tmp,$atmp,$btmp      \n\t"
     "cmpps.unordered  $btmp,$atmp,$atmp     \n\t"
     "blendvps         $dst,$tmp,$atmp,$btmp \n\t"
     "vcmpps.unordered $btmp,$atmp,$atmp     \n\t"
     "vblendvps        $dst,$tmp,$atmp,$btmp \n\t"
  %}
  ins_encode %{
    int vector_len = Assembler::AVX_128bit;
    __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
    __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vmaxss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
    __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
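// The sequence above implements Java Math.max semantics for floats: the
// sign-based blends order -0.0 below +0.0, vmaxss picks the larger value,
// and the final unordered compare + blend propagates NaN operands
// unchanged. The max/min variants below follow the same scheme.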
@ -5253,19 +5283,19 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
  match(Set dst (MaxD a b));
  effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
  format %{
     "blendvpd         $btmp,$b,$a,$b        \n\t"
     "blendvpd         $atmp,$a,$b,$b        \n\t"
     "vblendvpd        $btmp,$b,$a,$b        \n\t"
     "vblendvpd        $atmp,$a,$b,$b        \n\t"
     "vmaxsd           $tmp,$atmp,$btmp      \n\t"
     "cmppd.unordered  $btmp,$atmp,$atmp     \n\t"
     "blendvpd         $dst,$tmp,$atmp,$btmp \n\t"
     "vcmppd.unordered $btmp,$atmp,$atmp     \n\t"
     "vblendvpd        $dst,$tmp,$atmp,$btmp \n\t"
  %}
  ins_encode %{
    int vector_len = Assembler::AVX_128bit;
    __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
    __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
    __ vmaxsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
    __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
@ -5289,19 +5319,19 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
  match(Set dst (MinF a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
  format %{
     "blendvps         $atmp,$a,$b,$a        \n\t"
     "blendvps         $btmp,$b,$a,$a        \n\t"
     "vblendvps        $atmp,$a,$b,$a        \n\t"
     "vblendvps        $btmp,$b,$a,$a        \n\t"
     "vminss           $tmp,$atmp,$btmp      \n\t"
     "cmpps.unordered  $btmp,$atmp,$atmp     \n\t"
     "blendvps         $dst,$tmp,$atmp,$btmp \n\t"
     "vcmpps.unordered $btmp,$atmp,$atmp     \n\t"
     "vblendvps        $dst,$tmp,$atmp,$btmp \n\t"
  %}
  ins_encode %{
    int vector_len = Assembler::AVX_128bit;
    __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
    __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vminss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
    __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
    __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
@ -5325,19 +5355,19 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
  match(Set dst (MinD a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
  format %{
     "blendvpd         $atmp,$a,$b,$a        \n\t"
     "blendvpd         $btmp,$b,$a,$a        \n\t"
     "vblendvpd        $atmp,$a,$b,$a        \n\t"
     "vblendvpd        $btmp,$b,$a,$a        \n\t"
     "vminsd           $tmp,$atmp,$btmp      \n\t"
     "cmppd.unordered  $btmp,$atmp,$atmp     \n\t"
     "blendvpd         $dst,$tmp,$atmp,$btmp \n\t"
     "vcmppd.unordered $btmp,$atmp,$atmp     \n\t"
     "vblendvpd        $dst,$tmp,$atmp,$btmp \n\t"
  %}
  ins_encode %{
    int vector_len = Assembler::AVX_128bit;
    __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
    __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
    __ vminsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
    __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
    __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
    __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
@ -5561,7 +5591,7 @@ instruct loadConI(rRegI dst, immI src)
  ins_pipe(ialu_reg_fat); // XXX
%}

instruct loadConI0(rRegI dst, immI0 src, rFlagsReg cr)
instruct loadConI0(rRegI dst, immI_0 src, rFlagsReg cr)
%{
  match(Set dst src);
  effect(KILL cr);
@ -5997,7 +6027,7 @@ instruct storeImmNKlass(memory mem, immNKlass src)
%}

// Store Integer Immediate
instruct storeImmI0(memory mem, immI0 zero)
instruct storeImmI0(memory mem, immI_0 zero)
%{
  predicate(UseCompressedOops && (CompressedOops::base() == NULL));
  match(Set mem (StoreI mem zero));
@ -6047,7 +6077,7 @@ instruct storeImmL(memory mem, immL32 src)
%}

// Store Short/Char Immediate
instruct storeImmC0(memory mem, immI0 zero)
instruct storeImmC0(memory mem, immI_0 zero)
%{
  predicate(UseCompressedOops && (CompressedOops::base() == NULL));
  match(Set mem (StoreC mem zero));
@ -6073,7 +6103,7 @@ instruct storeImmI16(memory mem, immI16 src)
%}

// Store Byte Immediate
instruct storeImmB0(memory mem, immI0 zero)
instruct storeImmB0(memory mem, immI_0 zero)
%{
  predicate(UseCompressedOops && (CompressedOops::base() == NULL));
  match(Set mem (StoreB mem zero));
@ -6098,7 +6128,7 @@ instruct storeImmB(memory mem, immI8 src)
%}

// Store CMS card-mark Immediate
instruct storeImmCM0_reg(memory mem, immI0 zero)
instruct storeImmCM0_reg(memory mem, immI_0 zero)
%{
  predicate(UseCompressedOops && (CompressedOops::base() == NULL));
  match(Set mem (StoreCM mem zero));
@ -6111,7 +6141,7 @@ instruct storeImmCM0_reg(memory mem, immI0 zero)
  ins_pipe(ialu_mem_reg);
%}

instruct storeImmCM0(memory mem, immI0 src)
instruct storeImmCM0(memory mem, immI_0 src)
%{
  match(Set mem (StoreCM mem src));

@ -7196,7 +7226,7 @@ instruct addI_mem_imm(memory dst, immI src, rFlagsReg cr)
  ins_pipe(ialu_mem_imm);
%}

instruct incI_rReg(rRegI dst, immI1 src, rFlagsReg cr)
instruct incI_rReg(rRegI dst, immI_1 src, rFlagsReg cr)
%{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
@ -7208,7 +7238,7 @@ instruct incI_rReg(rRegI dst, immI1 src, rFlagsReg cr)
  ins_pipe(ialu_reg);
%}

instruct incI_mem(memory dst, immI1 src, rFlagsReg cr)
instruct incI_mem(memory dst, immI_1 src, rFlagsReg cr)
%{
  predicate(UseIncDec);
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
@ -8091,7 +8121,7 @@ instruct subL_mem_imm(memory dst, immL32 src, rFlagsReg cr)

// Subtract from a pointer
// XXX hmpf???
instruct subP_rReg(rRegP dst, rRegI src, immI0 zero, rFlagsReg cr)
instruct subP_rReg(rRegP dst, rRegI src, immI_0 zero, rFlagsReg cr)
%{
  match(Set dst (AddP dst (SubI zero src)));
  effect(KILL cr);
@ -8102,7 +8132,7 @@ instruct subP_rReg(rRegP dst, rRegI src, immI0 zero, rFlagsReg cr)
  ins_pipe(ialu_reg_reg);
%}

instruct negI_rReg(rRegI dst, immI0 zero, rFlagsReg cr)
instruct negI_rReg(rRegI dst, immI_0 zero, rFlagsReg cr)
%{
  match(Set dst (SubI zero dst));
  effect(KILL cr);
@ -8113,7 +8143,19 @@ instruct negI_rReg(rRegI dst, immI_0 zero, rFlagsReg cr)
  ins_pipe(ialu_reg);
%}

instruct negI_mem(memory dst, immI0 zero, rFlagsReg cr)
instruct negI_rReg_2(rRegI dst, rFlagsReg cr)
%{
  match(Set dst (NegI dst));
  effect(KILL cr);

  format %{ "negl    $dst\t# int" %}
  ins_encode %{
    __ negl($dst$$Register);
  %}
  ins_pipe(ialu_reg);
%}

instruct negI_mem(memory dst, immI_0 zero, rFlagsReg cr)
%{
  match(Set dst (StoreI dst (SubI zero (LoadI dst))));
  effect(KILL cr);
@ -8135,6 +8177,18 @@ instruct negL_rReg(rRegL dst, immL0 zero, rFlagsReg cr)
  ins_pipe(ialu_reg);
%}

instruct negL_rReg_2(rRegL dst, rFlagsReg cr)
%{
  match(Set dst (NegL dst));
  effect(KILL cr);

  format %{ "negq    $dst\t# long" %}
  ins_encode %{
    __ negq($dst$$Register);
  %}
  ins_pipe(ialu_reg);
%}
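// negl/negq negate in place and set the condition codes, which is why the
// NegI/NegL forms above kill the flags register.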
|
||||
|
||||
instruct negL_mem(memory dst, immL0 zero, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (StoreL dst (SubL zero (LoadL dst))));
|
||||
@ -8460,7 +8514,7 @@ instruct modL_rReg(rdx_RegL rdx, rax_RegL rax, no_rax_rdx_RegL div,
|
||||
|
||||
// Integer Shift Instructions
|
||||
// Shift Left by one
|
||||
instruct salI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
instruct salI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (LShiftI dst shift));
|
||||
effect(KILL cr);
|
||||
@ -8472,7 +8526,7 @@ instruct salI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Shift Left by one
|
||||
instruct salI_mem_1(memory dst, immI1 shift, rFlagsReg cr)
|
||||
instruct salI_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (StoreI dst (LShiftI (LoadI dst) shift)));
|
||||
effect(KILL cr);
|
||||
@ -8532,7 +8586,7 @@ instruct salI_mem_CL(memory dst, rcx_RegI shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Arithmetic shift right by one
|
||||
instruct sarI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
instruct sarI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (RShiftI dst shift));
|
||||
effect(KILL cr);
|
||||
@ -8544,7 +8598,7 @@ instruct sarI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Arithmetic shift right by one
|
||||
instruct sarI_mem_1(memory dst, immI1 shift, rFlagsReg cr)
|
||||
instruct sarI_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
|
||||
effect(KILL cr);
|
||||
@ -8604,7 +8658,7 @@ instruct sarI_mem_CL(memory dst, rcx_RegI shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Logical shift right by one
|
||||
instruct shrI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
instruct shrI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (URShiftI dst shift));
|
||||
effect(KILL cr);
|
||||
@ -8616,7 +8670,7 @@ instruct shrI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Logical shift right by one
|
||||
instruct shrI_mem_1(memory dst, immI1 shift, rFlagsReg cr)
|
||||
instruct shrI_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (StoreI dst (URShiftI (LoadI dst) shift)));
|
||||
effect(KILL cr);
|
||||
@ -8677,7 +8731,7 @@ instruct shrI_mem_CL(memory dst, rcx_RegI shift, rFlagsReg cr)
|
||||
|
||||
// Long Shift Instructions
|
||||
// Shift Left by one
|
||||
instruct salL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
|
||||
instruct salL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dst (LShiftL dst shift));
|
||||
effect(KILL cr);
|
||||
@ -8689,7 +8743,7 @@ instruct salL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
|
||||
%}
|
||||
|
||||
// Shift Left by one
instruct salL_mem_1(memory dst, immI1 shift, rFlagsReg cr)
instruct salL_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
%{
  match(Set dst (StoreL dst (LShiftL (LoadL dst) shift)));
  effect(KILL cr);
@ -8750,7 +8804,7 @@ instruct salL_mem_CL(memory dst, rcx_RegI shift, rFlagsReg cr)
%}

// Arithmetic shift right by one
instruct sarL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
instruct sarL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr)
%{
  match(Set dst (RShiftL dst shift));
  effect(KILL cr);
@ -8762,7 +8816,7 @@ instruct sarL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
%}

// Arithmetic shift right by one
instruct sarL_mem_1(memory dst, immI1 shift, rFlagsReg cr)
instruct sarL_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
%{
  match(Set dst (StoreL dst (RShiftL (LoadL dst) shift)));
  effect(KILL cr);
@ -8823,7 +8877,7 @@ instruct sarL_mem_CL(memory dst, rcx_RegI shift, rFlagsReg cr)
%}

// Logical shift right by one
instruct shrL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
instruct shrL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr)
%{
  match(Set dst (URShiftL dst shift));
  effect(KILL cr);
@ -8835,7 +8889,7 @@ instruct shrL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr)
%}

// Logical shift right by one
instruct shrL_mem_1(memory dst, immI1 shift, rFlagsReg cr)
instruct shrL_mem_1(memory dst, immI_1 shift, rFlagsReg cr)
%{
  match(Set dst (StoreL dst (URShiftL (LoadL dst) shift)));
  effect(KILL cr);
@ -9207,7 +9261,7 @@ instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1
  ins_pipe(ialu_reg);
%}

instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, rFlagsReg cr) %{
instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI_0 imm_zero, rFlagsReg cr) %{
  match(Set dst (AndI (SubI imm_zero src) src));
  predicate(UseBMI1Instructions);
  effect(KILL cr);
@ -9220,7 +9274,7 @@ instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, rFlagsReg cr) %{
  ins_pipe(ialu_reg);
%}

instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, rFlagsReg cr) %{
instruct blsiI_rReg_mem(rRegI dst, memory src, immI_0 imm_zero, rFlagsReg cr) %{
  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ));
  predicate(UseBMI1Instructions);
  effect(KILL cr);
@ -9903,7 +9957,7 @@ instruct cmpLTMask(rRegI dst, rRegI p, rRegI q, rFlagsReg cr)
  ins_pipe(pipe_slow);
%}

instruct cmpLTMask0(rRegI dst, immI0 zero, rFlagsReg cr)
instruct cmpLTMask0(rRegI dst, immI_0 zero, rFlagsReg cr)
%{
  match(Set dst (CmpLTMask dst zero));
  effect(KILL cr);
@ -11250,7 +11304,7 @@ instruct overflowSubL_rReg_imm(rFlagsReg cr, rRegL op1, immL32 op2)
  ins_pipe(ialu_reg_reg);
%}

instruct overflowNegI_rReg(rFlagsReg cr, immI0 zero, rax_RegI op2)
instruct overflowNegI_rReg(rFlagsReg cr, immI_0 zero, rax_RegI op2)
%{
  match(Set cr (OverflowSubI zero op2));
  effect(DEF cr, USE_KILL op2);
@ -11359,7 +11413,7 @@ instruct compI_rReg_mem(rFlagsReg cr, rRegI op1, memory op2)
  ins_pipe(ialu_cr_reg_mem);
%}

instruct testI_reg(rFlagsReg cr, rRegI src, immI0 zero)
instruct testI_reg(rFlagsReg cr, rRegI src, immI_0 zero)
%{
  match(Set cr (CmpI src zero));

@ -11369,7 +11423,7 @@ instruct testI_reg(rFlagsReg cr, rRegI src, immI0 zero)
  ins_pipe(ialu_cr_reg_imm);
%}

instruct testI_reg_imm(rFlagsReg cr, rRegI src, immI con, immI0 zero)
instruct testI_reg_imm(rFlagsReg cr, rRegI src, immI con, immI_0 zero)
%{
  match(Set cr (CmpI (AndI src con) zero));

@ -11379,7 +11433,7 @@ instruct testI_reg_imm(rFlagsReg cr, rRegI src, immI con, immI0 zero)
  ins_pipe(ialu_cr_reg_imm);
%}

instruct testI_reg_mem(rFlagsReg cr, rRegI src, memory mem, immI0 zero)
instruct testI_reg_mem(rFlagsReg cr, rRegI src, memory mem, immI_0 zero)
%{
  match(Set cr (CmpI (AndI src (LoadI mem)) zero));

@ -11433,7 +11487,7 @@ instruct compU_rReg_mem(rFlagsRegU cr, rRegI op1, memory op2)
// //   ins_encode( OpcP, reg_mem( op1, op2) );
// //%}

instruct testU_reg(rFlagsRegU cr, rRegI src, immI0 zero)
instruct testU_reg(rFlagsRegU cr, rRegI src, immI_0 zero)
%{
  match(Set cr (CmpU src zero));

@ -11771,7 +11825,7 @@ instruct compB_mem_imm(rFlagsReg cr, memory mem, immI8 imm)
  ins_pipe(ialu_cr_reg_mem);
%}

instruct testUB_mem_imm(rFlagsReg cr, memory mem, immU7 imm, immI0 zero)
instruct testUB_mem_imm(rFlagsReg cr, memory mem, immU7 imm, immI_0 zero)
%{
  match(Set cr (CmpI (AndI (LoadUB mem) imm) zero));

@ -11781,7 +11835,7 @@ instruct testUB_mem_imm(rFlagsReg cr, memory mem, immU7 imm, immI0 zero)
  ins_pipe(ialu_cr_reg_mem);
%}

instruct testB_mem_imm(rFlagsReg cr, memory mem, immI8 imm, immI0 zero)
instruct testB_mem_imm(rFlagsReg cr, memory mem, immI8 imm, immI_0 zero)
%{
  match(Set cr (CmpI (AndI (LoadB mem) imm) zero));

@ -12504,7 +12558,7 @@ instruct tlsLoadP(r15_RegP dst) %{
//   match(Set dst (CopyI src));
// %}
//
// instruct incI_rReg(rRegI dst, immI1 src, rFlagsReg cr)
// instruct incI_rReg(rRegI dst, immI_1 src, rFlagsReg cr)
// %{
//   match(Set dst (AddI dst src));
//   effect(KILL cr);

@ -268,6 +268,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
  if( strcmp(opType,"LoadRange")==0 )  return Form::idealI;
  if( strcmp(opType,"LoadS")==0 )  return Form::idealS;
  if( strcmp(opType,"LoadVector")==0 )  return Form::idealV;
  if( strcmp(opType,"LoadVectorGather")==0 )  return Form::idealV;
  assert( strcmp(opType,"Load") != 0, "Must type Loads" );
  return Form::none;
}
@ -284,6 +285,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const {
  if( strcmp(opType,"StoreN")==0) return Form::idealN;
  if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass;
  if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
  if( strcmp(opType,"StoreVectorScatter")==0 ) return Form::idealV;
  assert( strcmp(opType,"Store") != 0, "Must type Stores" );
  return Form::none;
}

@ -3484,7 +3484,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
    "StoreB","StoreC","Store" ,"StoreFP",
    "LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF"  ,
    "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load"   ,
    "StoreVector", "LoadVector",
    "StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter",
    "LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
    "LoadPLocked",
    "StorePConditional", "StoreIConditional", "StoreLConditional",
@ -3801,6 +3801,7 @@ void MatchNode::count_commutative_op(int& count) {
    "MaxV", "MinV",
    "MulI","MulL","MulF","MulD",
    "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD",
    "MinV","MaxV",
    "OrI","OrL",
    "OrV",
    "XorI","XorL",
@ -4151,8 +4152,9 @@ bool MatchRule::is_vector() const {
    "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD",
    "CMoveVD", "CMoveVF",
    "DivVF","DivVD",
    "MinV","MaxV",
    "AbsVB","AbsVS","AbsVI","AbsVL","AbsVF","AbsVD",
    "NegVF","NegVD",
    "NegVF","NegVD","NegVI",
    "SqrtVD","SqrtVF",
    "AndV" ,"XorV" ,"OrV",
    "MaxV", "MinV",
@ -4169,6 +4171,12 @@ bool MatchRule::is_vector() const {
    "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
    "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
    "RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
    "LoadVectorGather", "StoreVectorScatter",
    "VectorTest", "VectorLoadMask", "VectorStoreMask", "VectorBlend", "VectorInsert",
    "VectorRearrange","VectorLoadShuffle", "VectorLoadConst",
    "VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
    "VectorCastL2X", "VectorCastF2X", "VectorCastD2X",
    "VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret",
    "FmaVD", "FmaVF","PopCountVI",
    // Next are not supported currently.
    "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",

@ -222,6 +222,7 @@ static bool trust_final_non_static_fields(ciInstanceKlass* holder) {
  // Even if general trusting is disabled, trust system-built closures in these packages.
  if (holder->is_in_package("java/lang/invoke") || holder->is_in_package("sun/invoke") ||
      holder->is_in_package("jdk/internal/foreign") || holder->is_in_package("jdk/incubator/foreign") ||
      holder->is_in_package("jdk/internal/vm/vector") || holder->is_in_package("jdk/incubator/vector") ||
      holder->is_in_package("java/lang"))
    return true;
  // Trust hidden classes and VM unsafe anonymous classes. They are created via

@ -1353,6 +1353,11 @@ bool ciMethod::is_unboxing_method() const {
  return false;
}

bool ciMethod::is_vector_method() const {
  return (holder() == ciEnv::current()->vector_VectorSupport_klass()) &&
         (intrinsic_id() != vmIntrinsics::_none);
}

BCEscapeAnalyzer* ciMethod::get_bcea() {
#ifdef COMPILER2
  if (_bcea == NULL) {

@ -356,6 +356,7 @@ class ciMethod : public ciMetadata {
  bool has_reserved_stack_access() const { return _has_reserved_stack_access; }
  bool is_boxing_method() const;
  bool is_unboxing_method() const;
  bool is_vector_method() const;
  bool is_object_initializer() const;

  bool can_be_statically_bound(ciInstanceKlass* context) const;

@ -4539,6 +4539,30 @@ void java_util_concurrent_locks_AbstractOwnableSynchronizer::serialize_offsets(S
}
#endif

int vector_VectorPayload::_payload_offset;

#define VECTORPAYLOAD_FIELDS_DO(macro) \
  macro(_payload_offset, k, "payload", object_signature, false)

void vector_VectorPayload::compute_offsets() {
  InstanceKlass* k = SystemDictionary::vector_VectorPayload_klass();
  VECTORPAYLOAD_FIELDS_DO(FIELD_COMPUTE_OFFSET);
}

#if INCLUDE_CDS
void vector_VectorPayload::serialize_offsets(SerializeClosure* f) {
  VECTORPAYLOAD_FIELDS_DO(FIELD_SERIALIZE_OFFSET);
}
#endif

void vector_VectorPayload::set_payload(oop o, oop val) {
  o->obj_field_put(_payload_offset, val);
}

bool vector_VectorPayload::is_instance(oop obj) {
  return obj != NULL && is_subclass(obj->klass());
}

int java_lang_Integer_IntegerCache::_static_cache_offset;
int java_lang_Long_LongCache::_static_cache_offset;
int java_lang_Character_CharacterCache::_static_cache_offset;

@ -76,6 +76,7 @@ class RecordComponent;
  f(java_util_concurrent_locks_AbstractOwnableSynchronizer) \
  f(jdk_internal_misc_UnsafeConstants) \
  f(java_lang_boxing_object) \
  f(vector_VectorPayload) \
  //end

#define BASIC_JAVA_CLASSES_DO(f) \
@ -1564,6 +1565,24 @@ class jdk_internal_misc_UnsafeConstants : AllStatic {
  static void serialize_offsets(SerializeClosure* f) { }
};

// Interface to jdk.internal.vm.vector.VectorSupport.VectorPayload objects

class vector_VectorPayload : AllStatic {
 private:
  static int _payload_offset;
 public:
  static void set_payload(oop o, oop val);

  static void compute_offsets();
  static void serialize_offsets(SerializeClosure* f) NOT_CDS_RETURN;

  // Testers
  static bool is_subclass(Klass* klass) {
    return klass->is_subclass_of(SystemDictionary::vector_VectorPayload_klass());
  }
  static bool is_instance(oop obj);
};

class java_lang_Integer : AllStatic {
 public:
  static jint value(oop obj);

@ -43,6 +43,7 @@
#include "memory/metaspaceShared.hpp"
#include "memory/resourceArea.hpp"
#include "prims/jvmtiExport.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/jniHandles.inline.hpp"
@ -452,6 +453,24 @@ void Modules::define_module(jobject module, jboolean is_open, jstring version,
  if (h_loader.is_null() && !ClassLoader::has_jrt_entry()) {
    ClassLoader::add_to_exploded_build_list(module_symbol, CHECK);
  }

#ifdef COMPILER2
  // Special handling of jdk.incubator.vector
  if (strcmp(module_name, "jdk.incubator.vector") == 0) {
    if (FLAG_IS_DEFAULT(EnableVectorSupport)) {
      FLAG_SET_DEFAULT(EnableVectorSupport, true);
    }
    if (EnableVectorSupport && FLAG_IS_DEFAULT(EnableVectorReboxing)) {
      FLAG_SET_DEFAULT(EnableVectorReboxing, true);
    }
    if (EnableVectorSupport && EnableVectorReboxing && FLAG_IS_DEFAULT(EnableVectorAggressiveReboxing)) {
      FLAG_SET_DEFAULT(EnableVectorAggressiveReboxing, true);
    }
    log_info(compilation)("EnableVectorSupport=%s", (EnableVectorSupport ? "true" : "false"));
    log_info(compilation)("EnableVectorReboxing=%s", (EnableVectorReboxing ? "true" : "false"));
    log_info(compilation)("EnableVectorAggressiveReboxing=%s", (EnableVectorAggressiveReboxing ? "true" : "false"));
  }
#endif // COMPILER2
}

#if INCLUDE_CDS_JAVA_HEAP

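The hunk above means that simply resolving jdk.incubator.vector at startup flips the experimental C2 flags on, unless they were set explicitly on the command line. A minimal sketch of a program that exercises this path (the class name and array size are illustrative, not part of this commit; run with --add-modules jdk.incubator.vector):

    import jdk.incubator.vector.FloatVector;
    import jdk.incubator.vector.VectorSpecies;

    public class VectorHello {
        static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;

        public static void main(String[] args) {
            float[] a = new float[1024], b = new float[1024], c = new float[1024];
            // Defining jdk.incubator.vector on the boot layer is what makes
            // Modules::define_module above default EnableVectorSupport to true.
            for (int i = 0; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                FloatVector va = FloatVector.fromArray(SPECIES, a, i);
                FloatVector vb = FloatVector.fromArray(SPECIES, b, i);
                va.add(vb).intoArray(c, i);
            }
        }
    }

The log_info(compilation) lines above should then report the effective flag values when the VM runs with compilation logging enabled (e.g. -Xlog:compilation).
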
@ -226,6 +226,13 @@ class TableStatistics;
  /* support for records */ \
  do_klass(RecordComponent_klass, java_lang_reflect_RecordComponent ) \
  \
  /* support for vectors */ \
  do_klass(vector_VectorSupport_klass, jdk_internal_vm_vector_VectorSupport ) \
  do_klass(vector_VectorPayload_klass, jdk_internal_vm_vector_VectorPayload ) \
  do_klass(vector_Vector_klass, jdk_internal_vm_vector_Vector ) \
  do_klass(vector_VectorMask_klass, jdk_internal_vm_vector_VectorMask ) \
  do_klass(vector_VectorShuffle_klass, jdk_internal_vm_vector_VectorShuffle ) \
  \
  /*end*/

class SystemDictionary : AllStatic {

@ -778,6 +778,122 @@
  do_intrinsic(_getAndSetReference, jdk_internal_misc_Unsafe, getAndSetReference_name, getAndSetReference_signature, F_R) \
   do_name( getAndSetReference_name, "getAndSetReference") \
   do_signature(getAndSetReference_signature, "(Ljava/lang/Object;JLjava/lang/Object;)Ljava/lang/Object;" ) \
  \
  /* Vector API intrinsification support */ \
  \
  do_intrinsic(_VectorUnaryOp, jdk_internal_vm_vector_VectorSupport, vector_unary_op_name, vector_unary_op_sig, F_S) \
   do_signature(vector_unary_op_sig, "(ILjava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;Ljava/util/function/Function;)Ljava/lang/Object;") \
   do_name(vector_unary_op_name, "unaryOp") \
  \
  do_intrinsic(_VectorBinaryOp, jdk_internal_vm_vector_VectorSupport, vector_binary_op_name, vector_binary_op_sig, F_S) \
   do_signature(vector_binary_op_sig, "(ILjava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;Ljava/lang/Object;" \
                                      "Ljava/util/function/BiFunction;)Ljava/lang/Object;") \
   do_name(vector_binary_op_name, "binaryOp") \
  \
  do_intrinsic(_VectorTernaryOp, jdk_internal_vm_vector_VectorSupport, vector_ternary_op_name, vector_ternary_op_sig, F_S) \
   do_signature(vector_ternary_op_sig, "(ILjava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;Ljava/lang/Object;" \
                                       "Ljava/lang/Object;Ljdk/internal/vm/vector/VectorSupport$TernaryOperation;)Ljava/lang/Object;") \
   do_name(vector_ternary_op_name, "ternaryOp") \
  \
  do_intrinsic(_VectorBroadcastCoerced, jdk_internal_vm_vector_VectorSupport, vector_broadcast_coerced_name, vector_broadcast_coerced_sig, F_S)\
   do_signature(vector_broadcast_coerced_sig, "(Ljava/lang/Class;Ljava/lang/Class;IJLjdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
                                              "Ljdk/internal/vm/vector/VectorSupport$BroadcastOperation;)Ljava/lang/Object;") \
   do_name(vector_broadcast_coerced_name, "broadcastCoerced") \
  \
  do_intrinsic(_VectorShuffleIota, jdk_internal_vm_vector_VectorSupport, vector_shuffle_step_iota_name, vector_shuffle_step_iota_sig, F_S) \
   do_signature(vector_shuffle_step_iota_sig, "(Ljava/lang/Class;Ljava/lang/Class;Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
                                              "IIIILjdk/internal/vm/vector/VectorSupport$ShuffleIotaOperation;)Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;") \
   do_name(vector_shuffle_step_iota_name, "shuffleIota") \
  \
  do_intrinsic(_VectorShuffleToVector, jdk_internal_vm_vector_VectorSupport, vector_shuffle_to_vector_name, vector_shuffle_to_vector_sig, F_S) \
   do_signature(vector_shuffle_to_vector_sig, "(Ljava/lang/Class;Ljava/lang/Class;Ljava/lang/Class;Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;" \
                                              "ILjdk/internal/vm/vector/VectorSupport$ShuffleToVectorOperation;)Ljava/lang/Object;") \
   do_name(vector_shuffle_to_vector_name, "shuffleToVector") \
  \
  do_intrinsic(_VectorLoadOp, jdk_internal_vm_vector_VectorSupport, vector_load_op_name, vector_load_op_sig, F_S) \
   do_signature(vector_load_op_sig, "(Ljava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;JLjava/lang/Object;" \
                                    "ILjdk/internal/vm/vector/VectorSupport$VectorSpecies;Ljdk/internal/vm/vector/VectorSupport$LoadOperation;)Ljava/lang/Object;") \
   do_name(vector_load_op_name, "load") \
  \
  do_intrinsic(_VectorStoreOp, jdk_internal_vm_vector_VectorSupport, vector_store_op_name, vector_store_op_sig, F_S) \
   do_signature(vector_store_op_sig, "(Ljava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;JLjdk/internal/vm/vector/VectorSupport$Vector;" \
                                     "Ljava/lang/Object;ILjdk/internal/vm/vector/VectorSupport$StoreVectorOperation;)V") \
   do_name(vector_store_op_name, "store") \
  \
  do_intrinsic(_VectorReductionCoerced, jdk_internal_vm_vector_VectorSupport, vector_reduction_coerced_name, vector_reduction_coerced_sig, F_S) \
   do_signature(vector_reduction_coerced_sig, "(ILjava/lang/Class;Ljava/lang/Class;ILjdk/internal/vm/vector/VectorSupport$Vector;Ljava/util/function/Function;)J") \
   do_name(vector_reduction_coerced_name, "reductionCoerced") \
  \
  do_intrinsic(_VectorTest, jdk_internal_vm_vector_VectorSupport, vector_test_name, vector_test_sig, F_S) \
   do_signature(vector_test_sig, "(ILjava/lang/Class;Ljava/lang/Class;ILjava/lang/Object;Ljava/lang/Object;Ljava/util/function/BiFunction;)Z") \
   do_name(vector_test_name, "test") \
  \
  do_intrinsic(_VectorBlend, jdk_internal_vm_vector_VectorSupport, vector_blend_name, vector_blend_sig, F_S) \
   do_signature(vector_blend_sig, "(Ljava/lang/Class;Ljava/lang/Class;Ljava/lang/Class;I" \
                                  "Ljdk/internal/vm/vector/VectorSupport$Vector;Ljdk/internal/vm/vector/VectorSupport$Vector;Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \
                                  "Ljdk/internal/vm/vector/VectorSupport$VectorBlendOp;)Ljdk/internal/vm/vector/VectorSupport$Vector;") \
   do_name(vector_blend_name, "blend") \
  \
  do_intrinsic(_VectorCompare, jdk_internal_vm_vector_VectorSupport, vector_compare_name, vector_compare_sig, F_S) \
   do_signature(vector_compare_sig, "(ILjava/lang/Class;Ljava/lang/Class;Ljava/lang/Class;I" \
                                    "Ljdk/internal/vm/vector/VectorSupport$Vector;" "Ljdk/internal/vm/vector/VectorSupport$Vector;" \
                                    "Ljdk/internal/vm/vector/VectorSupport$VectorCompareOp;" ")" "Ljdk/internal/vm/vector/VectorSupport$VectorMask;") \
   do_name(vector_compare_name, "compare") \
  \
  do_intrinsic(_VectorRearrange, jdk_internal_vm_vector_VectorSupport, vector_rearrange_name, vector_rearrange_sig, F_S) \
   do_signature(vector_rearrange_sig, "(Ljava/lang/Class;Ljava/lang/Class;Ljava/lang/Class;I" \
                                      "Ljdk/internal/vm/vector/VectorSupport$Vector;Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;" \
                                      "Ljdk/internal/vm/vector/VectorSupport$VectorRearrangeOp;)Ljdk/internal/vm/vector/VectorSupport$Vector;") \
   do_name(vector_rearrange_name, "rearrangeOp") \
  \
  do_intrinsic(_VectorExtract, jdk_internal_vm_vector_VectorSupport, vector_extract_name, vector_extract_sig, F_S) \
   do_signature(vector_extract_sig, "(Ljava/lang/Class;Ljava/lang/Class;I" \
                                    "Ljdk/internal/vm/vector/VectorSupport$Vector;I" \
                                    "Ljdk/internal/vm/vector/VectorSupport$VecExtractOp;)J") \
   do_name(vector_extract_name, "extract") \
  \
  do_intrinsic(_VectorInsert, jdk_internal_vm_vector_VectorSupport, vector_insert_name, vector_insert_sig, F_S) \
   do_signature(vector_insert_sig, "(Ljava/lang/Class;Ljava/lang/Class;I" \
                                   "Ljdk/internal/vm/vector/VectorSupport$Vector;IJ" \
                                   "Ljdk/internal/vm/vector/VectorSupport$VecInsertOp;)Ljdk/internal/vm/vector/VectorSupport$Vector;") \
   do_name(vector_insert_name, "insert") \
  \
  do_intrinsic(_VectorBroadcastInt, jdk_internal_vm_vector_VectorSupport, vector_broadcast_int_name, vector_broadcast_int_sig, F_S) \
   do_signature(vector_broadcast_int_sig, "(ILjava/lang/Class;Ljava/lang/Class;I" \
                                          "Ljdk/internal/vm/vector/VectorSupport$Vector;I" \
                                          "Ljdk/internal/vm/vector/VectorSupport$VectorBroadcastIntOp;)Ljdk/internal/vm/vector/VectorSupport$Vector;") \
   do_name(vector_broadcast_int_name, "broadcastInt") \
  \
  do_intrinsic(_VectorConvert, jdk_internal_vm_vector_VectorSupport, vector_convert_name, vector_convert_sig, F_S) \
   do_signature(vector_convert_sig, "(ILjava/lang/Class;Ljava/lang/Class;I" \
                                    "Ljava/lang/Class;Ljava/lang/Class;I" \
                                    "Ljdk/internal/vm/vector/VectorSupport$VectorPayload;" \
                                    "Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
                                    "Ljdk/internal/vm/vector/VectorSupport$VectorConvertOp;)Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
   do_name(vector_convert_name, "convert") \
  \
  do_intrinsic(_VectorGatherOp, jdk_internal_vm_vector_VectorSupport, vector_gather_name, vector_gather_sig, F_S) \
   do_signature(vector_gather_sig, "(Ljava/lang/Class;Ljava/lang/Class;ILjava/lang/Class;" \
                                   "Ljava/lang/Object;J" \
                                   "Ljdk/internal/vm/vector/VectorSupport$Vector;" \
                                   "Ljava/lang/Object;I[II" \
                                   "Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
                                   "Ljdk/internal/vm/vector/VectorSupport$LoadVectorOperationWithMap;)" \
                                   "Ljdk/internal/vm/vector/VectorSupport$Vector;") \
   do_name(vector_gather_name, "loadWithMap") \
  \
  do_intrinsic(_VectorScatterOp, jdk_internal_vm_vector_VectorSupport, vector_scatter_name, vector_scatter_sig, F_S) \
   do_signature(vector_scatter_sig, "(Ljava/lang/Class;Ljava/lang/Class;ILjava/lang/Class;" \
                                    "Ljava/lang/Object;J" \
                                    "Ljdk/internal/vm/vector/VectorSupport$Vector;Ljdk/internal/vm/vector/VectorSupport$Vector;" \
                                    "Ljava/lang/Object;I[II" \
                                    "Ljdk/internal/vm/vector/VectorSupport$StoreVectorOperationWithMap;)V") \
   do_name(vector_scatter_name, "storeWithMap") \
  \
  do_intrinsic(_VectorRebox, jdk_internal_vm_vector_VectorSupport, vector_rebox_name, vector_rebox_sig, F_S) \
   do_alias(vector_rebox_sig, object_object_signature) \
   do_name(vector_rebox_name, "maybeRebox") \
  \
  \
  /* (2) Bytecode intrinsics */ \
  \

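Each do_intrinsic entry above names a static entry point on jdk.internal.vm.vector.VectorSupport that the public jdk.incubator.vector classes funnel into. A minimal sketch of user code whose operations are intended to reach these intrinsics (the mapping in the comments is the expected one when C2 compiles the loop with EnableVectorSupport; the names come from the public API, not this commit):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorSpecies;

    public class LanewiseAdd {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;

        static void add(int[] a, int[] b, int[] r) {
            for (int i = 0; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                IntVector va = IntVector.fromArray(SPECIES, a, i); // VectorSupport.load  -> _VectorLoadOp
                IntVector vb = IntVector.fromArray(SPECIES, b, i); // VectorSupport.load  -> _VectorLoadOp
                va.add(vb)                                         // VectorSupport.binaryOp -> _VectorBinaryOp
                  .intoArray(r, i);                                // VectorSupport.store -> _VectorStoreOp
            }
        }
    }
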
@ -81,6 +81,16 @@
  template(java_lang_Integer_IntegerCache, "java/lang/Integer$IntegerCache") \
  template(java_lang_Long, "java/lang/Long") \
  template(java_lang_Long_LongCache, "java/lang/Long$LongCache") \
  \
  template(jdk_internal_vm_vector_VectorSupport, "jdk/internal/vm/vector/VectorSupport") \
  template(jdk_internal_vm_vector_VectorPayload, "jdk/internal/vm/vector/VectorSupport$VectorPayload") \
  template(jdk_internal_vm_vector_Vector, "jdk/internal/vm/vector/VectorSupport$Vector") \
  template(jdk_internal_vm_vector_VectorMask, "jdk/internal/vm/vector/VectorSupport$VectorMask") \
  template(jdk_internal_vm_vector_VectorShuffle, "jdk/internal/vm/vector/VectorSupport$VectorShuffle") \
  template(payload_name, "payload") \
  template(ETYPE_name, "ETYPE") \
  template(VLENGTH_name, "VLENGTH") \
  \
  template(java_lang_Shutdown, "java/lang/Shutdown") \
  template(java_lang_ref_Reference, "java/lang/ref/Reference") \
  template(java_lang_ref_SoftReference, "java/lang/ref/SoftReference") \
@ -768,7 +778,7 @@ class vmIntrinsics: AllStatic {
#undef VM_INTRINSIC_ENUM

    ID_LIMIT,
    LAST_COMPILER_INLINE = _getAndSetReference,
    LAST_COMPILER_INLINE = _VectorScatterOp,
    FIRST_MH_SIG_POLY = _invokeGeneric,
    FIRST_MH_STATIC = _linkToVirtual,
    LAST_MH_SIG_POLY = _linkToInterface,

@ -42,6 +42,7 @@
// - ConstantValue describes a constant

class ConstantOopReadValue;
class LocationValue;
class ObjectValue;

class ScopeValue: public ResourceObj {
@ -67,6 +68,11 @@ class ScopeValue: public ResourceObj {
    return (ObjectValue*)this;
  }

  LocationValue* as_LocationValue() {
    assert(is_location(), "must be");
    return (LocationValue*)this;
  }

  // Serialization of debugging information
  virtual void write_on(DebugInfoWriteStream* stream) = 0;
  static ScopeValue* read_from(DebugInfoReadStream* stream);

@ -58,6 +58,7 @@ class Location {
    lng,           // Long held in one register
    float_in_dbl,  // Float held in double register
    dbl,           // Double held in one register
    vector,        // Vector in one register
    addr,          // JSR return address
    narrowoop      // Narrow Oop (please GC me!)
  };

@ -281,6 +281,30 @@ public:
  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
};

//------------------------------MaxLNode---------------------------------------
// MAXimum of 2 longs.
class MaxLNode : public MaxNode {
public:
  MaxLNode(Node *in1, Node *in2) : MaxNode(in1, in2) {}
  virtual int Opcode() const;
  virtual const Type *add_ring(const Type*, const Type*) const { return TypeLong::LONG; }
  virtual const Type *add_id() const { return TypeLong::make(min_jlong); }
  virtual const Type *bottom_type() const { return TypeLong::LONG; }
  virtual uint ideal_reg() const { return Op_RegL; }
};

//------------------------------MinLNode---------------------------------------
// MINimum of 2 longs.
class MinLNode : public MaxNode {
public:
  MinLNode(Node *in1, Node *in2) : MaxNode(in1, in2) {}
  virtual int Opcode() const;
  virtual const Type *add_ring(const Type*, const Type*) const { return TypeLong::LONG; }
  virtual const Type *add_id() const { return TypeLong::make(max_jlong); }
  virtual const Type *bottom_type() const { return TypeLong::LONG; }
  virtual uint ideal_reg() const { return Op_RegL; }
};

//------------------------------MaxFNode---------------------------------------
// Maximum of 2 floats.
class MaxFNode : public MaxNode {

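add_id() supplies the identity element used when these commutative nodes are reassociated: min_jlong for MaxL and max_jlong for MinL, mirroring the algebra at the Java level. A one-line sanity check of those identities (illustrative only, not from this commit):

    // max(x, Long.MIN_VALUE) == x and min(x, Long.MAX_VALUE) == x for every x.
    static boolean identitiesHold(long x) {
        return Math.max(x, Long.MIN_VALUE) == x && Math.min(x, Long.MAX_VALUE) == x;
    }
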
@ -743,6 +743,15 @@
  product(bool, UseMontgomerySquareIntrinsic, false, DIAGNOSTIC, \
          "Enables intrinsification of BigInteger.montgomerySquare()") \
  \
  product(bool, EnableVectorSupport, false, EXPERIMENTAL, \
          "Enables VectorSupport intrinsics") \
  \
  product(bool, EnableVectorReboxing, false, EXPERIMENTAL, \
          "Enables reboxing of vectors") \
  \
  product(bool, EnableVectorAggressiveReboxing, false, EXPERIMENTAL, \
          "Enables aggressive reboxing of vectors") \
  \
  product(bool, UseTypeSpeculation, true, \
          "Speculatively propagate types from profiles") \
  \

@ -649,6 +649,28 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
  case vmIntrinsics::_isCompileConstant:
  case vmIntrinsics::_Preconditions_checkIndex:
    break;

  case vmIntrinsics::_VectorUnaryOp:
  case vmIntrinsics::_VectorBinaryOp:
  case vmIntrinsics::_VectorTernaryOp:
  case vmIntrinsics::_VectorBroadcastCoerced:
  case vmIntrinsics::_VectorShuffleIota:
  case vmIntrinsics::_VectorShuffleToVector:
  case vmIntrinsics::_VectorLoadOp:
  case vmIntrinsics::_VectorStoreOp:
  case vmIntrinsics::_VectorGatherOp:
  case vmIntrinsics::_VectorScatterOp:
  case vmIntrinsics::_VectorReductionCoerced:
  case vmIntrinsics::_VectorTest:
  case vmIntrinsics::_VectorBlend:
  case vmIntrinsics::_VectorRearrange:
  case vmIntrinsics::_VectorCompare:
  case vmIntrinsics::_VectorBroadcastInt:
  case vmIntrinsics::_VectorConvert:
  case vmIntrinsics::_VectorInsert:
  case vmIntrinsics::_VectorExtract:
    return EnableVectorSupport;

  default:
    return false;
  }

@ -536,7 +536,7 @@ class LateInlineStringCallGenerator : public LateInlineCallGenerator {

    C->add_string_late_inline(this);

    JVMState* new_jvms = DirectCallGenerator::generate(jvms);
    JVMState* new_jvms = DirectCallGenerator::generate(jvms);
    return new_jvms;
  }

@ -560,7 +560,7 @@ class LateInlineBoxingCallGenerator : public LateInlineCallGenerator {

    C->add_boxing_late_inline(this);

    JVMState* new_jvms = DirectCallGenerator::generate(jvms);
    JVMState* new_jvms = DirectCallGenerator::generate(jvms);
    return new_jvms;
  }
};
@ -569,6 +569,28 @@ CallGenerator* CallGenerator::for_boxing_late_inline(ciMethod* method, CallGener
  return new LateInlineBoxingCallGenerator(method, inline_cg);
}

class LateInlineVectorReboxingCallGenerator : public LateInlineCallGenerator {

 public:
  LateInlineVectorReboxingCallGenerator(ciMethod* method, CallGenerator* inline_cg) :
    LateInlineCallGenerator(method, inline_cg, /*is_pure=*/true) {}

  virtual JVMState* generate(JVMState* jvms) {
    Compile *C = Compile::current();

    C->log_inline_id(this);

    C->add_vector_reboxing_late_inline(this);

    JVMState* new_jvms = DirectCallGenerator::generate(jvms);
    return new_jvms;
  }
};

//   static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
CallGenerator* CallGenerator::for_vector_reboxing_late_inline(ciMethod* method, CallGenerator* inline_cg) {
  return new LateInlineVectorReboxingCallGenerator(method, inline_cg);
}
//---------------------------WarmCallGenerator--------------------------------
// Internal class which handles initial deferral of inlining decisions.
class WarmCallGenerator : public CallGenerator {

@ -127,6 +127,7 @@ class CallGenerator : public ResourceObj {
  static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
  static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);
  static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
  static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);

  // How to make a call but defer the decision whether to inline or not.
  static CallGenerator* for_warm_call(WarmCallInfo* ci,

@ -290,9 +290,17 @@ Node* CheckCastPPNode::Identity(PhaseGVN* phase) {
  if (_carry_dependency) {
    return this;
  }
  // Toned down to rescue meeting at a Phi 3 different oops all implementing
  // the same interface.
  return (phase->type(in(1)) == phase->type(this)) ? in(1) : this;
  const Type* t = phase->type(in(1));
  if (EnableVectorReboxing && in(1)->Opcode() == Op_VectorBox) {
    if (t->higher_equal_speculative(phase->type(this))) {
      return in(1);
    }
  } else if (t == phase->type(this)) {
    // Toned down to rescue meeting at a Phi 3 different oops all implementing
    // the same interface.
    return in(1);
  }
  return this;
}

//------------------------------Value------------------------------------------

@ -43,6 +43,7 @@
#include "opto/regmask.hpp"
#include "opto/runtime.hpp"
#include "opto/subnode.hpp"
#include "opto/vectornode.hpp"
#include "utilities/vmError.hpp"

// Portions of code courtesy of Clifford Click
@ -2387,6 +2388,47 @@ Node *PhiNode::Ideal(PhaseGVN *phase, bool can_reshape) {
  }
#endif

  // Phi (VB ... VB) => VB (Phi ...) (Phi ...)
  if (EnableVectorReboxing && can_reshape && progress == NULL) {
    PhaseIterGVN* igvn = phase->is_IterGVN();

    bool all_inputs_are_equiv_vboxes = true;
    for (uint i = 1; i < req(); ++i) {
      Node* n = in(i);
      if (in(i)->Opcode() != Op_VectorBox) {
        all_inputs_are_equiv_vboxes = false;
        break;
      }
      // Check that vector type of vboxes is equivalent
      if (i != 1) {
        if (Type::cmp(in(i-0)->in(VectorBoxNode::Value)->bottom_type(),
                      in(i-1)->in(VectorBoxNode::Value)->bottom_type()) != 0) {
          all_inputs_are_equiv_vboxes = false;
          break;
        }
        if (Type::cmp(in(i-0)->in(VectorBoxNode::Box)->bottom_type(),
                      in(i-1)->in(VectorBoxNode::Box)->bottom_type()) != 0) {
          all_inputs_are_equiv_vboxes = false;
          break;
        }
      }
    }

    if (all_inputs_are_equiv_vboxes) {
      VectorBoxNode* vbox = static_cast<VectorBoxNode*>(in(1));
      PhiNode* new_vbox_phi = new PhiNode(r, vbox->box_type());
      PhiNode* new_vect_phi = new PhiNode(r, vbox->vec_type());
      for (uint i = 1; i < req(); ++i) {
        VectorBoxNode* old_vbox = static_cast<VectorBoxNode*>(in(i));
        new_vbox_phi->set_req(i, old_vbox->in(VectorBoxNode::Box));
        new_vect_phi->set_req(i, old_vbox->in(VectorBoxNode::Value));
      }
      igvn->register_new_node_with_optimizer(new_vbox_phi, this);
      igvn->register_new_node_with_optimizer(new_vect_phi, this);
      progress = new VectorBoxNode(igvn->C, new_vbox_phi, new_vect_phi, vbox->box_type(), vbox->vec_type());
    }
  }

  return progress;              // Return any progress
}

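At the Java level the Phi-of-VectorBox pattern arises whenever two branches each produce a vector and the results merge. A minimal sketch of code that produces it (class and method names are illustrative, not from this commit):

    import jdk.incubator.vector.FloatVector;

    public class PhiRebox {
        // Both arms box a vector; the boxes meet at a Phi in the ideal graph.
        // With EnableVectorReboxing the transform above rewrites
        // Phi(VectorBox, VectorBox) into VectorBox(Phi, Phi), keeping the
        // payload in a vector register instead of materializing two boxes.
        static FloatVector pick(FloatVector a, FloatVector b, boolean flag) {
            return flag ? a.add(b) : a.mul(b);
        }
    }
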
@ -198,9 +198,10 @@ macro(LoopLimit)
macro(Mach)
macro(MachProj)
macro(MulAddS2I)
macro(MaxI)
macro(MaxL)
macro(MaxD)
macro(MaxF)
macro(MaxI)
macro(MemBarAcquire)
macro(LoadFence)
macro(SetVectMaskI)
@ -212,9 +213,10 @@ macro(MemBarReleaseLock)
macro(MemBarVolatile)
macro(MemBarStoreStore)
macro(MergeMem)
macro(MinD)
macro(MinF)
macro(MinI)
macro(MinL)
macro(MinF)
macro(MinD)
macro(ModD)
macro(ModF)
macro(ModI)
@ -229,6 +231,8 @@ macro(MulHiL)
macro(MulI)
macro(MulL)
macro(Multi)
macro(NegI)
macro(NegL)
macro(NegD)
macro(NegF)
macro(NeverBranch)
@ -324,6 +328,8 @@ macro(TailJump)
macro(MacroLogicV)
macro(ThreadLocal)
macro(Unlock)
macro(URShiftB)
macro(URShiftS)
macro(URShiftI)
macro(URShiftL)
macro(XorI)
@ -366,6 +372,7 @@ macro(AbsVI)
macro(AbsVL)
macro(AbsVF)
macro(AbsVD)
macro(NegVI)
macro(NegVF)
macro(NegVD)
macro(SqrtVD)
@ -395,7 +402,9 @@ macro(MaxV)
macro(MinReductionV)
macro(MaxReductionV)
macro(LoadVector)
macro(LoadVectorGather)
macro(StoreVector)
macro(StoreVectorScatter)
macro(Pack)
macro(PackB)
macro(PackS)
@ -424,3 +433,24 @@ macro(Digit)
macro(LowerCase)
macro(UpperCase)
macro(Whitespace)
macro(VectorBox)
macro(VectorBoxAllocate)
macro(VectorUnbox)
macro(VectorMaskWrapper)
macro(VectorMaskCmp)
macro(VectorTest)
macro(VectorBlend)
macro(VectorRearrange)
macro(VectorLoadMask)
macro(VectorLoadShuffle)
macro(VectorLoadConst)
macro(VectorStoreMask)
macro(VectorReinterpret)
macro(VectorCast)
macro(VectorCastB2X)
macro(VectorCastS2X)
macro(VectorCastI2X)
macro(VectorCastL2X)
macro(VectorCastF2X)
macro(VectorCastD2X)
macro(VectorInsert)

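A single source-level expression can fan out into several of the new opcodes. A sketch of a masked operation and the nodes it is expected to lower to (assuming C2 intrinsifies both calls; the mapping in the comments is the intended one, not verified per platform):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorMask;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    public class BlendExample {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;

        static IntVector clampToZero(IntVector v) {
            VectorMask<Integer> neg = v.compare(VectorOperators.LT, 0); // VectorMaskCmp
            return v.blend(IntVector.zero(SPECIES), neg);               // VectorBlend
        }
    }
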
@ -68,6 +68,7 @@
#include "opto/runtime.hpp"
#include "opto/stringopts.hpp"
#include "opto/type.hpp"
#include "opto/vector.hpp"
#include "opto/vectornode.hpp"
#include "runtime/arguments.hpp"
#include "runtime/globals_extension.hpp"
@ -412,6 +413,7 @@ void Compile::remove_useless_nodes(Unique_Node_List &useful) {
  remove_useless_late_inlines(&_string_late_inlines, useful);
  remove_useless_late_inlines(&_boxing_late_inlines, useful);
  remove_useless_late_inlines(&_late_inlines, useful);
  remove_useless_late_inlines(&_vector_reboxing_late_inlines, useful);
  debug_only(verify_graph_edges(true/*check for no_dead_code*/);)
}
@ -545,6 +547,7 @@ Compile::Compile( ciEnv* ci_env, ciMethod* target, int osr_bci,
  _late_inlines(comp_arena(), 2, 0, NULL),
  _string_late_inlines(comp_arena(), 2, 0, NULL),
  _boxing_late_inlines(comp_arena(), 2, 0, NULL),
  _vector_reboxing_late_inlines(comp_arena(), 2, 0, NULL),
  _late_inlines_pos(0),
  _number_of_mh_late_inlines(0),
  _print_inlining_stream(NULL),
@ -1962,6 +1965,8 @@ void Compile::inline_incrementally(PhaseIterGVN& igvn) {

    inline_incrementally_cleanup(igvn);

    print_method(PHASE_INCREMENTAL_INLINE_STEP, 3);

    if (failing())  return;
  }
  assert( igvn._worklist.size() == 0, "should be done with igvn" );
@ -2096,6 +2101,16 @@ void Compile::Optimize() {
  // so keep only the actual candidates for optimizations.
  cleanup_expensive_nodes(igvn);

  assert(EnableVectorSupport || !has_vbox_nodes(), "sanity");
  if (EnableVectorSupport && has_vbox_nodes()) {
    TracePhase tp("", &timers[_t_vector]);
    PhaseVector pv(igvn);
    pv.optimize_vector_boxes();

    print_method(PHASE_ITER_GVN_AFTER_VECTOR, 2);
  }
  assert(!has_vbox_nodes(), "sanity");

  if (!failing() && RenumberLiveNodes && live_nodes() + NodeLimitFudgeFactor < unique()) {
    Compile::TracePhase tp("", &timers[_t_renumberLive]);
    initial_gvn()->replace_with(&igvn);
@ -2272,6 +2287,35 @@ void Compile::Optimize() {
  DEBUG_ONLY(set_phase_optimize_finished();)
}

void Compile::inline_vector_reboxing_calls() {
  if (C->_vector_reboxing_late_inlines.length() > 0) {
    PhaseGVN* gvn = C->initial_gvn();

    _late_inlines_pos = C->_late_inlines.length();
    while (_vector_reboxing_late_inlines.length() > 0) {
      CallGenerator* cg = _vector_reboxing_late_inlines.pop();
      cg->do_late_inline();
      if (failing())  return;
      print_method(PHASE_INLINE_VECTOR_REBOX, cg->call_node());
    }
    _vector_reboxing_late_inlines.trunc_to(0);
  }
}

bool Compile::has_vbox_nodes() {
  if (C->_vector_reboxing_late_inlines.length() > 0) {
    return true;
  }
  for (int macro_idx = C->macro_count() - 1; macro_idx >= 0; macro_idx--) {
    Node * n = C->macro_node(macro_idx);
    assert(n->is_macro(), "only macro nodes expected here");
    if (n->Opcode() == Op_VectorUnbox || n->Opcode() == Op_VectorBox || n->Opcode() == Op_VectorBoxAllocate) {
      return true;
    }
  }
  return false;
}

//---------------------------- Bitwise operation packing optimization ---------------------------

static bool is_vector_unary_bitwise_op(Node* n) {
@ -2618,8 +2662,8 @@ void Compile::Code_Gen() {
    if (failing()) {
      return;
    }
    print_method(PHASE_AFTER_MATCHING, 3);
  }

  // In debug mode can dump m._nodes.dump() for mapping of ideal to machine
  // nodes.  Mapping is only valid at the root of each matched subtree.
  NOT_PRODUCT( verify_graph_edges(); )
@ -2798,7 +2842,8 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
  // Check for commutative opcode
  switch( nop ) {
  case Op_AddI:  case Op_AddF:  case Op_AddD:  case Op_AddL:
  case Op_MaxI:  case Op_MinI:
  case Op_MaxI:  case Op_MaxL:  case Op_MaxF:  case Op_MaxD:
  case Op_MinI:  case Op_MinL:  case Op_MinF:  case Op_MinD:
  case Op_MulI:  case Op_MulF:  case Op_MulD:  case Op_MulL:
  case Op_AndL:  case Op_XorL:  case Op_OrL:
  case Op_AndI:  case Op_XorI:  case Op_OrI: {
@ -3348,6 +3393,8 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f

  case Op_LoadVector:
  case Op_StoreVector:
  case Op_LoadVectorGather:
  case Op_StoreVectorScatter:
    break;

  case Op_AddReductionVI:
@ -4568,26 +4615,43 @@ void Compile::sort_macro_nodes() {
  }
}

void Compile::print_method(CompilerPhaseType cpt, int level, int idx) {
void Compile::print_method(CompilerPhaseType cpt, const char *name, int level, int idx) {
  EventCompilerPhase event;
  if (event.should_commit()) {
    CompilerEvent::PhaseEvent::post(event, C->_latest_stage_start_counter, cpt, C->_compile_id, level);
  }

#ifndef PRODUCT
  if (should_print(level)) {
    char output[1024];
    if (idx != 0) {
      jio_snprintf(output, sizeof(output), "%s:%d", CompilerPhaseTypeHelper::to_string(cpt), idx);
    } else {
      jio_snprintf(output, sizeof(output), "%s", CompilerPhaseTypeHelper::to_string(cpt));
    }
    _printer->print_method(output, level);
    _printer->print_method(name, level);
  }
#endif
  C->_latest_stage_start_counter.stamp();
}

void Compile::print_method(CompilerPhaseType cpt, int level, int idx) {
  char output[1024];
#ifndef PRODUCT
  if (idx != 0) {
    jio_snprintf(output, sizeof(output), "%s:%d", CompilerPhaseTypeHelper::to_string(cpt), idx);
  } else {
    jio_snprintf(output, sizeof(output), "%s", CompilerPhaseTypeHelper::to_string(cpt));
  }
#endif
  print_method(cpt, output, level, idx);
}

void Compile::print_method(CompilerPhaseType cpt, Node* n, int level) {
  ResourceMark rm;
  stringStream ss;
  ss.print_raw(CompilerPhaseTypeHelper::to_string(cpt));
  if (n != NULL) {
    ss.print(": %d %s ", n->_idx, NodeClassNames[n->Opcode()]);
  } else {
    ss.print_raw(": NULL");
  }
  C->print_method(cpt, ss.as_string(), level);
}

void Compile::end_method(int level) {
  EventCompilerPhase event;
  if (event.should_commit()) {

@ -382,6 +382,8 @@ class Compile : public Phase {

  GrowableArray<CallGenerator*> _boxing_late_inlines; // same but for boxing operations

  GrowableArray<CallGenerator*> _vector_reboxing_late_inlines; // same but for vector reboxing operations

  int _late_inlines_pos;       // Where in the queue should the next late inlining candidate go (emulate depth first inlining)
  uint _number_of_mh_late_inlines; // number of method handle late inlining still pending

@ -644,7 +646,9 @@ class Compile : public Phase {
#endif
  }

  void print_method(CompilerPhaseType cpt, const char *name, int level = 1, int idx = 0);
  void print_method(CompilerPhaseType cpt, int level = 1, int idx = 0);
  void print_method(CompilerPhaseType cpt, Node* n, int level = 3);

#ifndef PRODUCT
  void igv_print_method_to_file(const char* phase_name = "Debug", bool append = false);
@ -865,10 +869,13 @@ class Compile : public Phase {
                          bool allow_intrinsics = true);
  bool should_delay_inlining(ciMethod* call_method, JVMState* jvms) {
    return should_delay_string_inlining(call_method, jvms) ||
           should_delay_boxing_inlining(call_method, jvms);
           should_delay_boxing_inlining(call_method, jvms) ||
           should_delay_vector_inlining(call_method, jvms);
  }
  bool should_delay_string_inlining(ciMethod* call_method, JVMState* jvms);
  bool should_delay_boxing_inlining(ciMethod* call_method, JVMState* jvms);
  bool should_delay_vector_inlining(ciMethod* call_method, JVMState* jvms);
  bool should_delay_vector_reboxing_inlining(ciMethod* call_method, JVMState* jvms);

  // Helper functions to identify inlining potential at call-site
  ciMethod* optimize_virtual_call(ciMethod* caller, int bci, ciInstanceKlass* klass,
@ -940,6 +947,10 @@ class Compile : public Phase {
    _boxing_late_inlines.push(cg);
  }

  void add_vector_reboxing_late_inline(CallGenerator* cg) {
    _vector_reboxing_late_inlines.push(cg);
  }

  void remove_useless_late_inlines(GrowableArray<CallGenerator*>* inlines, Unique_Node_List &useful);

  void process_print_inlining();
@ -969,6 +980,9 @@ class Compile : public Phase {
  bool optimize_loops(PhaseIterGVN& igvn, LoopOptsMode mode);
  void remove_root_to_sfpts_edges(PhaseIterGVN& igvn);

  void inline_vector_reboxing_calls();
  bool has_vbox_nodes();

  // Matching, CFG layout, allocation, code generation
  PhaseCFG* cfg() { return _cfg; }
  bool has_java_calls() const { return _java_calls > 0; }

@ -135,6 +135,8 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
      if (cg->does_virtual_dispatch()) {
        cg_intrinsic = cg;
        cg = NULL;
      } else if (should_delay_vector_inlining(callee, jvms)) {
        return CallGenerator::for_late_inline(callee, cg);
      } else {
        return cg;
      }
@ -185,6 +187,8 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
        return CallGenerator::for_string_late_inline(callee, cg);
      } else if (should_delay_boxing_inlining(callee, jvms)) {
        return CallGenerator::for_boxing_late_inline(callee, cg);
      } else if (should_delay_vector_reboxing_inlining(callee, jvms)) {
        return CallGenerator::for_vector_reboxing_late_inline(callee, cg);
      } else if ((should_delay || AlwaysIncrementalInline)) {
        return CallGenerator::for_late_inline(callee, cg);
      }
@ -422,6 +426,14 @@ bool Compile::should_delay_boxing_inlining(ciMethod* call_method, JVMState* jvms
  return false;
}

bool Compile::should_delay_vector_inlining(ciMethod* call_method, JVMState* jvms) {
  return EnableVectorSupport && call_method->is_vector_method();
}

bool Compile::should_delay_vector_reboxing_inlining(ciMethod* call_method, JVMState* jvms) {
  return EnableVectorSupport && (call_method->intrinsic_id() == vmIntrinsics::_VectorRebox);
}

// uncommon-trap call-sites where callee is unloaded, uninitialized or will not link
bool Parse::can_not_compile_call_site(ciMethod *dest_method, ciInstanceKlass* klass) {
  // Additional inputs to consider...

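should_delay_vector_inlining defers every call into VectorSupport so it stays a call node until the box-elimination pass has seen the whole graph. For orientation, a hypothetical sketch of what such an entry point looks like on the Java side, reconstructed from the vector_binary_op_sig descriptor earlier in this commit (parameter names and the fallback body are illustrative, not the JDK source; the annotation marking it as an intrinsic candidate is elided):

    import java.util.function.BiFunction;

    public class VectorSupportSketch {
        // C2 replaces this call with vector IR when the intrinsic applies;
        // otherwise the interpreter or baseline compiler runs the fallback.
        public static Object binaryOp(int oprId, Class<?> vectorClass, Class<?> elementType,
                                      int length, Object v1, Object v2,
                                      BiFunction<Object, Object, Object> defaultImpl) {
            return defaultImpl.apply(v1, v2);
        }
    }
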
@ -686,6 +686,7 @@ void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_
|
||||
case Op_StoreP:
|
||||
case Op_StoreN:
|
||||
case Op_StoreVector:
|
||||
case Op_StoreVectorScatter:
|
||||
case Op_StoreNKlass:
|
||||
for (uint k = 1; k < m->req(); k++) {
|
||||
Node *in = m->in(k);
|
||||
|
||||
@ -37,15 +37,13 @@
|
||||
#include "opto/addnode.hpp"
|
||||
#include "opto/arraycopynode.hpp"
|
||||
#include "opto/c2compiler.hpp"
|
||||
#include "opto/callGenerator.hpp"
|
||||
#include "opto/castnode.hpp"
|
||||
#include "opto/cfgnode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
#include "opto/countbitsnode.hpp"
|
||||
#include "opto/intrinsicnode.hpp"
|
||||
#include "opto/idealKit.hpp"
|
||||
#include "opto/library_call.hpp"
|
||||
#include "opto/mathexactnode.hpp"
|
||||
#include "opto/movenode.hpp"
|
||||
#include "opto/mulnode.hpp"
|
||||
#include "opto/narrowptrnode.hpp"
|
||||
#include "opto/opaquenode.hpp"
|
||||
@ -60,291 +58,6 @@
|
||||
#include "utilities/macros.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
|
||||
class LibraryIntrinsic : public InlineCallGenerator {
|
||||
// Extend the set of intrinsics known to the runtime:
|
||||
public:
|
||||
private:
|
||||
bool _is_virtual;
|
||||
bool _does_virtual_dispatch;
|
||||
int8_t _predicates_count; // Intrinsic is predicated by several conditions
|
||||
int8_t _last_predicate; // Last generated predicate
|
||||
vmIntrinsics::ID _intrinsic_id;
|
||||
|
||||
public:
|
||||
LibraryIntrinsic(ciMethod* m, bool is_virtual, int predicates_count, bool does_virtual_dispatch, vmIntrinsics::ID id)
|
||||
: InlineCallGenerator(m),
|
||||
_is_virtual(is_virtual),
|
||||
_does_virtual_dispatch(does_virtual_dispatch),
|
||||
_predicates_count((int8_t)predicates_count),
|
||||
_last_predicate((int8_t)-1),
|
||||
_intrinsic_id(id)
|
||||
{
|
||||
}
|
||||
virtual bool is_intrinsic() const { return true; }
|
||||
virtual bool is_virtual() const { return _is_virtual; }
|
||||
virtual bool is_predicated() const { return _predicates_count > 0; }
|
||||
virtual int predicates_count() const { return _predicates_count; }
|
||||
virtual bool does_virtual_dispatch() const { return _does_virtual_dispatch; }
|
||||
virtual JVMState* generate(JVMState* jvms);
|
||||
virtual Node* generate_predicate(JVMState* jvms, int predicate);
|
||||
vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; }
|
||||
};
|
||||
|
||||
|
||||
// Local helper class for LibraryIntrinsic:
|
||||
class LibraryCallKit : public GraphKit {
|
||||
private:
|
||||
LibraryIntrinsic* _intrinsic; // the library intrinsic being called
|
||||
Node* _result; // the result node, if any
|
||||
int _reexecute_sp; // the stack pointer when bytecode needs to be reexecuted
|
||||
|
||||
const TypeOopPtr* sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type);
|
||||
|
||||
public:
|
||||
LibraryCallKit(JVMState* jvms, LibraryIntrinsic* intrinsic)
|
||||
: GraphKit(jvms),
|
||||
_intrinsic(intrinsic),
|
||||
_result(NULL)
|
||||
{
|
||||
// Check if this is a root compile. In that case we don't have a caller.
|
||||
if (!jvms->has_method()) {
|
||||
_reexecute_sp = sp();
|
||||
} else {
|
||||
// Find out how many arguments the interpreter needs when deoptimizing
|
||||
// and save the stack pointer value so it can used by uncommon_trap.
|
||||
// We find the argument count by looking at the declared signature.
|
||||
bool ignored_will_link;
|
||||
ciSignature* declared_signature = NULL;
|
||||
ciMethod* ignored_callee = caller()->get_method_at_bci(bci(), ignored_will_link, &declared_signature);
|
||||
const int nargs = declared_signature->arg_size_for_bc(caller()->java_code_at_bci(bci()));
|
||||
_reexecute_sp = sp() + nargs; // "push" arguments back on stack
|
||||
}
|
||||
}
|
||||
|
||||
virtual LibraryCallKit* is_LibraryCallKit() const { return (LibraryCallKit*)this; }
|
||||
|
||||
ciMethod* caller() const { return jvms()->method(); }
|
||||
int bci() const { return jvms()->bci(); }
|
||||
LibraryIntrinsic* intrinsic() const { return _intrinsic; }
|
||||
vmIntrinsics::ID intrinsic_id() const { return _intrinsic->intrinsic_id(); }
|
||||
ciMethod* callee() const { return _intrinsic->method(); }
|
||||
|
||||
bool try_to_inline(int predicate);
|
||||
Node* try_to_predicate(int predicate);
|
||||
|
||||
void push_result() {
|
||||
// Push the result onto the stack.
|
||||
if (!stopped() && result() != NULL) {
|
||||
BasicType bt = result()->bottom_type()->basic_type();
|
||||
push_node(bt, result());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void fatal_unexpected_iid(vmIntrinsics::ID iid) {
|
||||
fatal("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid));
|
||||
}
|
||||
|
||||
void set_result(Node* n) { assert(_result == NULL, "only set once"); _result = n; }
|
||||
void set_result(RegionNode* region, PhiNode* value);
|
||||
Node* result() { return _result; }
|
||||
|
||||
virtual int reexecute_sp() { return _reexecute_sp; }
|
||||
|
||||
// Helper functions to inline natives
|
||||
Node* generate_guard(Node* test, RegionNode* region, float true_prob);
|
||||
Node* generate_slow_guard(Node* test, RegionNode* region);
|
||||
Node* generate_fair_guard(Node* test, RegionNode* region);
|
||||
Node* generate_negative_guard(Node* index, RegionNode* region,
|
||||
// resulting CastII of index:
|
||||
Node* *pos_index = NULL);
|
||||
Node* generate_limit_guard(Node* offset, Node* subseq_length,
|
||||
Node* array_length,
|
||||
RegionNode* region);
|
||||
void generate_string_range_check(Node* array, Node* offset,
|
||||
Node* length, bool char_count);
|
||||
Node* generate_current_thread(Node* &tls_output);
|
||||
Node* load_mirror_from_klass(Node* klass);
|
||||
Node* load_klass_from_mirror_common(Node* mirror, bool never_see_null,
|
||||
RegionNode* region, int null_path,
|
||||
int offset);
|
||||
Node* load_klass_from_mirror(Node* mirror, bool never_see_null,
|
||||
RegionNode* region, int null_path) {
|
||||
int offset = java_lang_Class::klass_offset();
|
||||
return load_klass_from_mirror_common(mirror, never_see_null,
|
||||
region, null_path,
|
||||
offset);
|
||||
}
|
||||
Node* load_array_klass_from_mirror(Node* mirror, bool never_see_null,
|
||||
RegionNode* region, int null_path) {
|
||||
int offset = java_lang_Class::array_klass_offset();
|
||||
return load_klass_from_mirror_common(mirror, never_see_null,
|
||||
region, null_path,
|
||||
offset);
|
||||
}
|
||||
Node* generate_access_flags_guard(Node* kls,
|
||||
int modifier_mask, int modifier_bits,
|
||||
RegionNode* region);
|
||||
Node* generate_interface_guard(Node* kls, RegionNode* region);
|
||||
Node* generate_hidden_class_guard(Node* kls, RegionNode* region);
|
||||
Node* generate_array_guard(Node* kls, RegionNode* region) {
|
||||
return generate_array_guard_common(kls, region, false, false);
|
||||
}
|
||||
Node* generate_non_array_guard(Node* kls, RegionNode* region) {
|
||||
return generate_array_guard_common(kls, region, false, true);
|
||||
}
|
||||
Node* generate_objArray_guard(Node* kls, RegionNode* region) {
|
||||
return generate_array_guard_common(kls, region, true, false);
|
||||
}
|
||||
Node* generate_non_objArray_guard(Node* kls, RegionNode* region) {
|
||||
return generate_array_guard_common(kls, region, true, true);
|
||||
}
|
||||
Node* generate_array_guard_common(Node* kls, RegionNode* region,
|
||||
bool obj_array, bool not_array);
|
||||
Node* generate_virtual_guard(Node* obj_klass, RegionNode* slow_region);
|
||||
CallJavaNode* generate_method_call(vmIntrinsics::ID method_id,
|
||||
bool is_virtual = false, bool is_static = false);
|
||||
CallJavaNode* generate_method_call_static(vmIntrinsics::ID method_id) {
|
||||
return generate_method_call(method_id, false, true);
|
||||
}
|
||||
CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
|
||||
return generate_method_call(method_id, true, false);
|
||||
}
|
||||
Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
|
||||
Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2, StrIntrinsicNode::ArgEnc ae);
bool inline_string_compareTo(StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOf(StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOfI(StrIntrinsicNode::ArgEnc ae);
Node* make_indexOf_node(Node* src_start, Node* src_count, Node* tgt_start, Node* tgt_count,
RegionNode* region, Node* phi, StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOfChar(StrIntrinsicNode::ArgEnc ae);
bool inline_string_equals(StrIntrinsicNode::ArgEnc ae);
bool inline_string_toBytesU();
bool inline_string_getCharsU();
bool inline_string_copy(bool compress);
bool inline_string_char_access(bool is_store);
Node* round_double_node(Node* n);
bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName);
bool inline_math_native(vmIntrinsics::ID id);
bool inline_math(vmIntrinsics::ID id);
bool inline_double_math(vmIntrinsics::ID id);
template <typename OverflowOp>
bool inline_math_overflow(Node* arg1, Node* arg2);
void inline_math_mathExact(Node* math, Node* test);
bool inline_math_addExactI(bool is_increment);
bool inline_math_addExactL(bool is_increment);
bool inline_math_multiplyExactI();
bool inline_math_multiplyExactL();
bool inline_math_multiplyHigh();
bool inline_math_negateExactI();
bool inline_math_negateExactL();
bool inline_math_subtractExactI(bool is_decrement);
bool inline_math_subtractExactL(bool is_decrement);
bool inline_min_max(vmIntrinsics::ID id);
bool inline_notify(vmIntrinsics::ID id);
Node* generate_min_max(vmIntrinsics::ID id, Node* x, Node* y);
// This returns Type::AnyPtr, RawPtr, or OopPtr.
int classify_unsafe_addr(Node* &base, Node* &offset, BasicType type);
Node* make_unsafe_address(Node*& base, Node* offset, DecoratorSet decorators, BasicType type = T_ILLEGAL, bool can_cast = false);

typedef enum { Relaxed, Opaque, Volatile, Acquire, Release } AccessKind;
DecoratorSet mo_decorator_for_access_kind(AccessKind kind);
bool inline_unsafe_access(bool is_store, BasicType type, AccessKind kind, bool is_unaligned);
static bool klass_needs_init_guard(Node* kls);
bool inline_unsafe_allocate();
bool inline_unsafe_newArray(bool uninitialized);
bool inline_unsafe_writeback0();
bool inline_unsafe_writebackSync0(bool is_pre);
bool inline_unsafe_copyMemory();
bool inline_native_currentThread();

bool inline_native_time_funcs(address method, const char* funcName);
#ifdef JFR_HAVE_INTRINSICS
bool inline_native_classID();
bool inline_native_getEventWriter();
#endif
bool inline_native_Class_query(vmIntrinsics::ID id);
bool inline_native_subtype_check();
bool inline_native_getLength();
bool inline_array_copyOf(bool is_copyOfRange);
bool inline_array_equals(StrIntrinsicNode::ArgEnc ae);
bool inline_preconditions_checkIndex();
void copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, bool is_array);
bool inline_native_clone(bool is_virtual);
bool inline_native_Reflection_getCallerClass();
// Helper function for inlining native object hash method
bool inline_native_hashcode(bool is_virtual, bool is_static);
bool inline_native_getClass();

// Helper functions for inlining arraycopy
bool inline_arraycopy();
AllocateArrayNode* tightly_coupled_allocation(Node* ptr,
RegionNode* slow_region);
JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp);
void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms, int saved_reexecute_sp,
uint new_idx);

typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind);
bool inline_unsafe_fence(vmIntrinsics::ID id);
bool inline_onspinwait();
bool inline_fp_conversions(vmIntrinsics::ID id);
bool inline_number_methods(vmIntrinsics::ID id);
bool inline_reference_get();
bool inline_Class_cast();
bool inline_aescrypt_Block(vmIntrinsics::ID id);
bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
bool inline_electronicCodeBook_AESCrypt(vmIntrinsics::ID id);
bool inline_counterMode_AESCrypt(vmIntrinsics::ID id);
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
Node* inline_electronicCodeBook_AESCrypt_predicate(bool decrypting);
Node* inline_counterMode_AESCrypt_predicate();
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
bool inline_ghash_processBlocks();
bool inline_base64_encodeBlock();
bool inline_digestBase_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass,
bool long_state, address stubAddr, const char *stubName,
Node* src_start, Node* ofs, Node* limit);
Node* get_state_from_digest_object(Node *digestBase_object);
Node* get_long_state_from_digest_object(Node *digestBase_object);
Node* inline_digestBase_implCompressMB_predicate(int predicate);
bool inline_encodeISOArray();
bool inline_updateCRC32();
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
Node* get_table_from_crc32c_class(ciInstanceKlass *crc32c_class);
bool inline_updateBytesCRC32C();
bool inline_updateDirectByteBufferCRC32C();
bool inline_updateBytesAdler32();
bool inline_updateByteBufferAdler32();
bool inline_multiplyToLen();
bool inline_hasNegatives();
bool inline_squareToLen();
bool inline_mulAdd();
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_bigIntegerShift(bool isRightShift);
bool inline_vectorizedMismatch();
bool inline_fma(vmIntrinsics::ID id);
bool inline_character_compare(vmIntrinsics::ID id);
bool inline_fp_min_max(vmIntrinsics::ID id);

bool inline_profileBoolean();
bool inline_isCompileConstant();
void clear_upper_avx() {
#ifdef X86
if (UseAVX >= 2) {
C->set_clear_upper_avx(true);
}
#endif
}
};

//---------------------------make_vm_intrinsic----------------------------
CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
vmIntrinsics::ID id = m->intrinsic_id();

@ -453,6 +166,7 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) {
}
C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
C->print_inlining_update(this);

return NULL;
}

@ -530,7 +244,6 @@ bool LibraryCallKit::try_to_inline(int predicate) {
}
assert(merged_memory(), "");

switch (intrinsic_id()) {
case vmIntrinsics::_hashCode: return inline_native_hashcode(intrinsic()->is_virtual(), !is_static);
case vmIntrinsics::_identityHashCode: return inline_native_hashcode(/*!virtual*/ false, is_static);

@ -912,6 +625,45 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_minD:
return inline_fp_min_max(intrinsic_id());

case vmIntrinsics::_VectorUnaryOp:
return inline_vector_nary_operation(1);
case vmIntrinsics::_VectorBinaryOp:
return inline_vector_nary_operation(2);
case vmIntrinsics::_VectorTernaryOp:
return inline_vector_nary_operation(3);
case vmIntrinsics::_VectorBroadcastCoerced:
return inline_vector_broadcast_coerced();
case vmIntrinsics::_VectorShuffleIota:
return inline_vector_shuffle_iota();
case vmIntrinsics::_VectorShuffleToVector:
return inline_vector_shuffle_to_vector();
case vmIntrinsics::_VectorLoadOp:
return inline_vector_mem_operation(/*is_store=*/false);
case vmIntrinsics::_VectorStoreOp:
return inline_vector_mem_operation(/*is_store=*/true);
case vmIntrinsics::_VectorGatherOp:
return inline_vector_gather_scatter(/*is_scatter*/ false);
case vmIntrinsics::_VectorScatterOp:
return inline_vector_gather_scatter(/*is_scatter*/ true);
case vmIntrinsics::_VectorReductionCoerced:
return inline_vector_reduction();
case vmIntrinsics::_VectorTest:
return inline_vector_test();
case vmIntrinsics::_VectorBlend:
return inline_vector_blend();
case vmIntrinsics::_VectorRearrange:
return inline_vector_rearrange();
case vmIntrinsics::_VectorCompare:
return inline_vector_compare();
case vmIntrinsics::_VectorBroadcastInt:
return inline_vector_broadcast_int();
case vmIntrinsics::_VectorConvert:
return inline_vector_convert();
case vmIntrinsics::_VectorInsert:
return inline_vector_insert();
case vmIntrinsics::_VectorExtract:
return inline_vector_extract();

default:
// If you get here, it may be that someone has added a new intrinsic
// to the list in vmSymbols.hpp without implementing it here.
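For context, the new _Vector* intrinsic IDs dispatched above are reached from Java code written against the jdk.incubator.vector API that this commit introduces. A minimal sketch of such code (the class and array names are illustrative, not from the commit):

    import jdk.incubator.vector.FloatVector;
    import jdk.incubator.vector.VectorSpecies;

    class VectorAddSketch {
        // Species matching the widest vector shape the CPU supports.
        static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;

        // c[i] = a[i] + b[i]: fromArray/intoArray are backed by _VectorLoadOp
        // and _VectorStoreOp, and the lanewise add by _VectorBinaryOp, i.e.
        // inline_vector_nary_operation(2) above.
        static void add(float[] a, float[] b, float[] c) {
            int i = 0;
            for (; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                FloatVector va = FloatVector.fromArray(SPECIES, a, i);
                FloatVector vb = FloatVector.fromArray(SPECIES, b, i);
                va.add(vb).intoArray(c, i);
            }
            for (; i < a.length; i++) { // scalar tail
                c[i] = a[i] + b[i];
            }
        }
    }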
@ -2255,7 +2007,7 @@ LibraryCallKit::classify_unsafe_addr(Node* &base, Node* &offset, BasicType type)
}
}

inline Node* LibraryCallKit::make_unsafe_address(Node*& base, Node* offset, DecoratorSet decorators, BasicType type, bool can_cast) {
Node* LibraryCallKit::make_unsafe_address(Node*& base, Node* offset, DecoratorSet decorators, BasicType type, bool can_cast) {
Node* uncasted_base = base;
int kind = classify_unsafe_addr(uncasted_base, offset, type);
if (kind == Type::RawPtr) {

src/hotspot/share/opto/library_call.hpp (new file, 348 lines)
@ -0,0 +1,348 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "ci/ciMethod.hpp"
#include "classfile/javaClasses.hpp"
#include "opto/callGenerator.hpp"
#include "opto/graphKit.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/movenode.hpp"

class LibraryIntrinsic : public InlineCallGenerator {
// Extend the set of intrinsics known to the runtime:
public:
private:
bool _is_virtual;
bool _does_virtual_dispatch;
int8_t _predicates_count; // Intrinsic is predicated by several conditions
int8_t _last_predicate; // Last generated predicate
vmIntrinsics::ID _intrinsic_id;

public:
LibraryIntrinsic(ciMethod* m, bool is_virtual, int predicates_count, bool does_virtual_dispatch, vmIntrinsics::ID id)
: InlineCallGenerator(m),
_is_virtual(is_virtual),
_does_virtual_dispatch(does_virtual_dispatch),
_predicates_count((int8_t)predicates_count),
_last_predicate((int8_t)-1),
_intrinsic_id(id)
{
}
virtual bool is_intrinsic() const { return true; }
virtual bool is_virtual() const { return _is_virtual; }
virtual bool is_predicated() const { return _predicates_count > 0; }
virtual int predicates_count() const { return _predicates_count; }
virtual bool does_virtual_dispatch() const { return _does_virtual_dispatch; }
virtual JVMState* generate(JVMState* jvms);
virtual Node* generate_predicate(JVMState* jvms, int predicate);
vmIntrinsics::ID intrinsic_id() const { return _intrinsic_id; }
};


// Local helper class for LibraryIntrinsic:
class LibraryCallKit : public GraphKit {
private:
LibraryIntrinsic* _intrinsic; // the library intrinsic being called
Node* _result; // the result node, if any
int _reexecute_sp; // the stack pointer when bytecode needs to be reexecuted

const TypeOopPtr* sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type);

public:
LibraryCallKit(JVMState* jvms, LibraryIntrinsic* intrinsic)
: GraphKit(jvms),
_intrinsic(intrinsic),
_result(NULL)
{
// Check if this is a root compile. In that case we don't have a caller.
if (!jvms->has_method()) {
_reexecute_sp = sp();
} else {
// Find out how many arguments the interpreter needs when deoptimizing
// and save the stack pointer value so it can used by uncommon_trap.
// We find the argument count by looking at the declared signature.
bool ignored_will_link;
ciSignature* declared_signature = NULL;
ciMethod* ignored_callee = caller()->get_method_at_bci(bci(), ignored_will_link, &declared_signature);
const int nargs = declared_signature->arg_size_for_bc(caller()->java_code_at_bci(bci()));
_reexecute_sp = sp() + nargs; // "push" arguments back on stack
}
}

virtual LibraryCallKit* is_LibraryCallKit() const { return (LibraryCallKit*)this; }

ciMethod* caller() const { return jvms()->method(); }
int bci() const { return jvms()->bci(); }
LibraryIntrinsic* intrinsic() const { return _intrinsic; }
vmIntrinsics::ID intrinsic_id() const { return _intrinsic->intrinsic_id(); }
ciMethod* callee() const { return _intrinsic->method(); }

bool try_to_inline(int predicate);
Node* try_to_predicate(int predicate);

void push_result() {
// Push the result onto the stack.
if (!stopped() && result() != NULL) {
BasicType bt = result()->bottom_type()->basic_type();
push_node(bt, result());
}
}

private:
void fatal_unexpected_iid(vmIntrinsics::ID iid) {
fatal("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid));
}

void set_result(Node* n) { assert(_result == NULL, "only set once"); _result = n; }
void set_result(RegionNode* region, PhiNode* value);
Node* result() { return _result; }

virtual int reexecute_sp() { return _reexecute_sp; }

// Helper functions to inline natives
Node* generate_guard(Node* test, RegionNode* region, float true_prob);
Node* generate_slow_guard(Node* test, RegionNode* region);
Node* generate_fair_guard(Node* test, RegionNode* region);
Node* generate_negative_guard(Node* index, RegionNode* region,
// resulting CastII of index:
Node* *pos_index = NULL);
Node* generate_limit_guard(Node* offset, Node* subseq_length,
Node* array_length,
RegionNode* region);
void generate_string_range_check(Node* array, Node* offset,
Node* length, bool char_count);
Node* generate_current_thread(Node* &tls_output);
Node* load_mirror_from_klass(Node* klass);
Node* load_klass_from_mirror_common(Node* mirror, bool never_see_null,
RegionNode* region, int null_path,
int offset);
Node* load_klass_from_mirror(Node* mirror, bool never_see_null,
RegionNode* region, int null_path) {
int offset = java_lang_Class::klass_offset();
return load_klass_from_mirror_common(mirror, never_see_null,
region, null_path,
offset);
}
Node* load_array_klass_from_mirror(Node* mirror, bool never_see_null,
RegionNode* region, int null_path) {
int offset = java_lang_Class::array_klass_offset();
return load_klass_from_mirror_common(mirror, never_see_null,
region, null_path,
offset);
}
Node* generate_access_flags_guard(Node* kls,
int modifier_mask, int modifier_bits,
RegionNode* region);
Node* generate_interface_guard(Node* kls, RegionNode* region);
Node* generate_hidden_class_guard(Node* kls, RegionNode* region);
Node* generate_array_guard(Node* kls, RegionNode* region) {
return generate_array_guard_common(kls, region, false, false);
}
Node* generate_non_array_guard(Node* kls, RegionNode* region) {
return generate_array_guard_common(kls, region, false, true);
}
Node* generate_objArray_guard(Node* kls, RegionNode* region) {
return generate_array_guard_common(kls, region, true, false);
}
Node* generate_non_objArray_guard(Node* kls, RegionNode* region) {
return generate_array_guard_common(kls, region, true, true);
}
Node* generate_array_guard_common(Node* kls, RegionNode* region,
bool obj_array, bool not_array);
Node* generate_virtual_guard(Node* obj_klass, RegionNode* slow_region);
CallJavaNode* generate_method_call(vmIntrinsics::ID method_id,
bool is_virtual = false, bool is_static = false);
CallJavaNode* generate_method_call_static(vmIntrinsics::ID method_id) {
return generate_method_call(method_id, false, true);
}
CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
return generate_method_call(method_id, true, false);
}
Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);

Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2, StrIntrinsicNode::ArgEnc ae);
bool inline_string_compareTo(StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOf(StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOfI(StrIntrinsicNode::ArgEnc ae);
Node* make_indexOf_node(Node* src_start, Node* src_count, Node* tgt_start, Node* tgt_count,
RegionNode* region, Node* phi, StrIntrinsicNode::ArgEnc ae);
bool inline_string_indexOfChar(StrIntrinsicNode::ArgEnc ae);
bool inline_string_equals(StrIntrinsicNode::ArgEnc ae);
bool inline_string_toBytesU();
bool inline_string_getCharsU();
bool inline_string_copy(bool compress);
bool inline_string_char_access(bool is_store);
Node* round_double_node(Node* n);
bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName);
bool inline_math_native(vmIntrinsics::ID id);
bool inline_math(vmIntrinsics::ID id);
bool inline_double_math(vmIntrinsics::ID id);
template <typename OverflowOp>
bool inline_math_overflow(Node* arg1, Node* arg2);
void inline_math_mathExact(Node* math, Node* test);
bool inline_math_addExactI(bool is_increment);
bool inline_math_addExactL(bool is_increment);
bool inline_math_multiplyExactI();
bool inline_math_multiplyExactL();
bool inline_math_multiplyHigh();
bool inline_math_negateExactI();
bool inline_math_negateExactL();
bool inline_math_subtractExactI(bool is_decrement);
bool inline_math_subtractExactL(bool is_decrement);
bool inline_min_max(vmIntrinsics::ID id);
bool inline_notify(vmIntrinsics::ID id);
Node* generate_min_max(vmIntrinsics::ID id, Node* x, Node* y);
// This returns Type::AnyPtr, RawPtr, or OopPtr.
int classify_unsafe_addr(Node* &base, Node* &offset, BasicType type);
Node* make_unsafe_address(Node*& base, Node* offset, DecoratorSet decorators, BasicType type = T_ILLEGAL, bool can_cast = false);

typedef enum { Relaxed, Opaque, Volatile, Acquire, Release } AccessKind;
DecoratorSet mo_decorator_for_access_kind(AccessKind kind);
bool inline_unsafe_access(bool is_store, BasicType type, AccessKind kind, bool is_unaligned);
static bool klass_needs_init_guard(Node* kls);
bool inline_unsafe_allocate();
bool inline_unsafe_newArray(bool uninitialized);
bool inline_unsafe_writeback0();
bool inline_unsafe_writebackSync0(bool is_pre);
bool inline_unsafe_copyMemory();
bool inline_native_currentThread();

bool inline_native_time_funcs(address method, const char* funcName);
#ifdef JFR_HAVE_INTRINSICS
bool inline_native_classID();
bool inline_native_getEventWriter();
#endif
bool inline_native_Class_query(vmIntrinsics::ID id);
bool inline_native_subtype_check();
bool inline_native_getLength();
bool inline_array_copyOf(bool is_copyOfRange);
bool inline_array_equals(StrIntrinsicNode::ArgEnc ae);
bool inline_preconditions_checkIndex();
void copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, bool is_array);
bool inline_native_clone(bool is_virtual);
bool inline_native_Reflection_getCallerClass();
// Helper function for inlining native object hash method
bool inline_native_hashcode(bool is_virtual, bool is_static);
bool inline_native_getClass();

// Helper functions for inlining arraycopy
bool inline_arraycopy();
AllocateArrayNode* tightly_coupled_allocation(Node* ptr,
RegionNode* slow_region);
JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp);
void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms, int saved_reexecute_sp,
uint new_idx);

typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind);
bool inline_unsafe_fence(vmIntrinsics::ID id);
bool inline_onspinwait();
bool inline_fp_conversions(vmIntrinsics::ID id);
bool inline_number_methods(vmIntrinsics::ID id);
bool inline_reference_get();
bool inline_Class_cast();
bool inline_aescrypt_Block(vmIntrinsics::ID id);
bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
bool inline_electronicCodeBook_AESCrypt(vmIntrinsics::ID id);
bool inline_counterMode_AESCrypt(vmIntrinsics::ID id);
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
Node* inline_electronicCodeBook_AESCrypt_predicate(bool decrypting);
Node* inline_counterMode_AESCrypt_predicate();
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
bool inline_ghash_processBlocks();
bool inline_base64_encodeBlock();
bool inline_digestBase_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass,
bool long_state, address stubAddr, const char *stubName,
Node* src_start, Node* ofs, Node* limit);
Node* get_state_from_digest_object(Node *digestBase_object);
Node* get_long_state_from_digest_object(Node *digestBase_object);
Node* inline_digestBase_implCompressMB_predicate(int predicate);
bool inline_encodeISOArray();
bool inline_updateCRC32();
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
Node* get_table_from_crc32c_class(ciInstanceKlass *crc32c_class);
bool inline_updateBytesCRC32C();
bool inline_updateDirectByteBufferCRC32C();
bool inline_updateBytesAdler32();
bool inline_updateByteBufferAdler32();
bool inline_multiplyToLen();
bool inline_hasNegatives();
bool inline_squareToLen();
bool inline_mulAdd();
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_bigIntegerShift(bool isRightShift);
bool inline_vectorizedMismatch();
bool inline_fma(vmIntrinsics::ID id);
bool inline_character_compare(vmIntrinsics::ID id);
bool inline_fp_min_max(vmIntrinsics::ID id);

bool inline_profileBoolean();
bool inline_isCompileConstant();

// Vector API support
bool inline_vector_nary_operation(int n);
bool inline_vector_broadcast_coerced();
bool inline_vector_shuffle_to_vector();
bool inline_vector_shuffle_iota();
bool inline_vector_mem_operation(bool is_store);
bool inline_vector_gather_scatter(bool is_scatter);
bool inline_vector_reduction();
bool inline_vector_test();
bool inline_vector_blend();
bool inline_vector_rearrange();
bool inline_vector_compare();
bool inline_vector_broadcast_int();
bool inline_vector_convert();
bool inline_vector_extract();
bool inline_vector_insert();
Node* box_vector(Node* in, const TypeInstPtr* vbox_type, BasicType bt, int num_elem);
Node* unbox_vector(Node* in, const TypeInstPtr* vbox_type, BasicType bt, int num_elem, bool shuffle_to_vector = false);
Node* shift_count(Node* cnt, int shift_op, BasicType bt, int num_elem);

enum VectorMaskUseType {
VecMaskUseLoad,
VecMaskUseStore,
VecMaskUseAll,
VecMaskNotUsed
};

bool arch_supports_vector(int op, int num_elem, BasicType type, VectorMaskUseType mask_use_type, bool has_scalar_args = false);

void clear_upper_avx() {
#ifdef X86
if (UseAVX >= 2) {
C->set_clear_upper_avx(true);
}
#endif
}
};
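As a companion to the "Vector API support" declarations above, a hedged Java sketch of code that eventually reaches the compare/blend/reduction inliners (the class and method names below are illustrative, not part of the commit):

    import jdk.incubator.vector.FloatVector;
    import jdk.incubator.vector.VectorMask;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    class VectorClampSumSketch {
        static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;

        // Sums max(a[i], 0): compare() is backed by inline_vector_compare(),
        // blend() by inline_vector_blend(), and reduceLanes() by
        // inline_vector_reduction().
        static float sumOfNonNegative(float[] a) {
            float sum = 0.0f;
            FloatVector zero = FloatVector.zero(SPECIES);
            int i = 0;
            for (; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                FloatVector va = FloatVector.fromArray(SPECIES, a, i);
                VectorMask<Float> neg = va.compare(VectorOperators.LT, zero);
                sum += va.blend(zero, neg).reduceLanes(VectorOperators.ADD);
            }
            for (; i < a.length; i++) { // scalar tail
                sum += Math.max(a[i], 0.0f);
            }
            return sum;
        }
    }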
@ -430,7 +430,7 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) {
return rms;
}

#define NOF_STACK_MASKS (3*6+6)
#define NOF_STACK_MASKS (3*12)

// Create the initial stack mask used by values spilling to the stack.
// Disallow any debug info in outgoing argument areas by setting the

@ -473,6 +473,20 @@ void Matcher::init_first_stack_mask() {
idealreg2spillmask [Op_VecY] = &rms[22];
idealreg2spillmask [Op_VecZ] = &rms[23];

idealreg2debugmask [Op_VecA] = &rms[24];
idealreg2debugmask [Op_VecS] = &rms[25];
idealreg2debugmask [Op_VecD] = &rms[26];
idealreg2debugmask [Op_VecX] = &rms[27];
idealreg2debugmask [Op_VecY] = &rms[28];
idealreg2debugmask [Op_VecZ] = &rms[29];

idealreg2mhdebugmask[Op_VecA] = &rms[30];
idealreg2mhdebugmask[Op_VecS] = &rms[31];
idealreg2mhdebugmask[Op_VecD] = &rms[32];
idealreg2mhdebugmask[Op_VecX] = &rms[33];
idealreg2mhdebugmask[Op_VecY] = &rms[34];
idealreg2mhdebugmask[Op_VecZ] = &rms[35];

OptoReg::Name i;

// At first, start with the empty mask

@ -520,13 +534,19 @@ void Matcher::init_first_stack_mask() {
if (Matcher::vector_size_supported(T_BYTE,4)) {
*idealreg2spillmask[Op_VecS] = *idealreg2regmask[Op_VecS];
idealreg2spillmask[Op_VecS]->OR(C->FIRST_STACK_mask());
} else {
*idealreg2spillmask[Op_VecS] = RegMask::Empty;
}

if (Matcher::vector_size_supported(T_FLOAT,2)) {
// For VecD we need dual alignment and 8 bytes (2 slots) for spills.
// RA guarantees such alignment since it is needed for Double and Long values.
*idealreg2spillmask[Op_VecD] = *idealreg2regmask[Op_VecD];
idealreg2spillmask[Op_VecD]->OR(aligned_stack_mask);
} else {
*idealreg2spillmask[Op_VecD] = RegMask::Empty;
}

if (Matcher::vector_size_supported(T_FLOAT,4)) {
// For VecX we need quadro alignment and 16 bytes (4 slots) for spills.
//

@ -544,7 +564,10 @@ void Matcher::init_first_stack_mask() {
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecX] = *idealreg2regmask[Op_VecX];
idealreg2spillmask[Op_VecX]->OR(aligned_stack_mask);
} else {
*idealreg2spillmask[Op_VecX] = RegMask::Empty;
}

if (Matcher::vector_size_supported(T_FLOAT,8)) {
// For VecY we need octo alignment and 32 bytes (8 slots) for spills.
OptoReg::Name in = OptoReg::add(_in_arg_limit, -1);

@ -556,7 +579,10 @@ void Matcher::init_first_stack_mask() {
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
} else {
*idealreg2spillmask[Op_VecY] = RegMask::Empty;
}

if (Matcher::vector_size_supported(T_FLOAT,16)) {
// For VecZ we need enough alignment and 64 bytes (16 slots) for spills.
OptoReg::Name in = OptoReg::add(_in_arg_limit, -1);

@ -568,6 +594,8 @@ void Matcher::init_first_stack_mask() {
assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
*idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ];
idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask);
} else {
*idealreg2spillmask[Op_VecZ] = RegMask::Empty;
}

if (Matcher::supports_scalable_vector()) {

@ -622,6 +650,13 @@ void Matcher::init_first_stack_mask() {
*idealreg2debugmask [Op_RegD] = *idealreg2spillmask[Op_RegD];
*idealreg2debugmask [Op_RegP] = *idealreg2spillmask[Op_RegP];

*idealreg2debugmask [Op_VecA] = *idealreg2spillmask[Op_VecA];
*idealreg2debugmask [Op_VecS] = *idealreg2spillmask[Op_VecS];
*idealreg2debugmask [Op_VecD] = *idealreg2spillmask[Op_VecD];
*idealreg2debugmask [Op_VecX] = *idealreg2spillmask[Op_VecX];
*idealreg2debugmask [Op_VecY] = *idealreg2spillmask[Op_VecY];
*idealreg2debugmask [Op_VecZ] = *idealreg2spillmask[Op_VecZ];

*idealreg2mhdebugmask[Op_RegN] = *idealreg2spillmask[Op_RegN];
*idealreg2mhdebugmask[Op_RegI] = *idealreg2spillmask[Op_RegI];
*idealreg2mhdebugmask[Op_RegL] = *idealreg2spillmask[Op_RegL];

@ -629,6 +664,13 @@ void Matcher::init_first_stack_mask() {
*idealreg2mhdebugmask[Op_RegD] = *idealreg2spillmask[Op_RegD];
*idealreg2mhdebugmask[Op_RegP] = *idealreg2spillmask[Op_RegP];

*idealreg2mhdebugmask[Op_VecA] = *idealreg2spillmask[Op_VecA];
*idealreg2mhdebugmask[Op_VecS] = *idealreg2spillmask[Op_VecS];
*idealreg2mhdebugmask[Op_VecD] = *idealreg2spillmask[Op_VecD];
*idealreg2mhdebugmask[Op_VecX] = *idealreg2spillmask[Op_VecX];
*idealreg2mhdebugmask[Op_VecY] = *idealreg2spillmask[Op_VecY];
*idealreg2mhdebugmask[Op_VecZ] = *idealreg2spillmask[Op_VecZ];

// Prevent stub compilations from attempting to reference
// callee-saved (SOE) registers from debug info
bool exclude_soe = !Compile::current()->is_method_compilation();

@ -642,12 +684,26 @@ void Matcher::init_first_stack_mask() {
idealreg2debugmask[Op_RegD]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_RegP]->SUBTRACT(*caller_save_mask);

idealreg2debugmask[Op_VecA]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_VecS]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_VecD]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_VecX]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_VecY]->SUBTRACT(*caller_save_mask);
idealreg2debugmask[Op_VecZ]->SUBTRACT(*caller_save_mask);

idealreg2mhdebugmask[Op_RegN]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_RegI]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_RegL]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_RegF]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_RegD]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_RegP]->SUBTRACT(*mh_caller_save_mask);

idealreg2mhdebugmask[Op_VecA]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_VecS]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_VecD]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_VecX]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_VecY]->SUBTRACT(*mh_caller_save_mask);
idealreg2mhdebugmask[Op_VecZ]->SUBTRACT(*mh_caller_save_mask);
}

//---------------------------is_save_on_entry----------------------------------

@ -1953,7 +2009,6 @@ bool Matcher::is_vshift_con_pattern(Node *n, Node *m) {
return false;
}


bool Matcher::clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
// Must clone all producers of flags, or we will not match correctly.
// Suppose a compare setting int-flags is shared (e.g., a switch-tree)

@ -2308,8 +2363,28 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
n->del_req(3);
break;
}
case Op_VectorBlend:
case Op_VectorInsert: {
Node* pair = new BinaryNode(n->in(1), n->in(2));
n->set_req(1, pair);
n->set_req(2, n->in(3));
n->del_req(3);
break;
}
case Op_StoreVectorScatter: {
Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1));
n->set_req(MemNode::ValueIn, pair);
n->del_req(MemNode::ValueIn+1);
break;
}
case Op_VectorMaskCmp: {
n->set_req(1, new BinaryNode(n->in(1), n->in(2)));
n->set_req(2, n->in(3));
n->del_req(3);
break;
}
default:
break;
}
}

@ -345,6 +345,9 @@ public:
// Vector ideal reg
static const uint vector_ideal_reg(int len);

// Does the CPU supports vector variable shift instructions?
static bool supports_vector_variable_shifts(void);

// CPU supports misaligned vectors store/load.
static const bool misaligned_vectors_ok();

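The supports_vector_variable_shifts() query above distinguishes shifts by one scalar count, which go through _VectorBroadcastInt (inline_vector_broadcast_int()), from per-lane shift counts, which need hardware support. A hedged Java sketch of the two shapes (the class and method names are illustrative, not from the commit):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorOperators;

    class VectorShiftSketch {
        // Shift every lane by the same scalar count; lowered via
        // _VectorBroadcastInt.
        static IntVector shiftAll(IntVector v, int n) {
            return v.lanewise(VectorOperators.LSHL, n);
        }

        // Per-lane shift counts; only vectorized when the matcher reports
        // supports_vector_variable_shifts().
        static IntVector shiftPerLane(IntVector v, IntVector counts) {
            return v.lanewise(VectorOperators.LSHL, counts);
        }
    }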
@ -641,7 +641,8 @@ Node* MemNode::find_previous_store(PhaseTransform* phase) {
}

if (st_offset != offset && st_offset != Type::OffsetBot) {
const int MAX_STORE = BytesPerLong;
const int MAX_STORE = MAX2(BytesPerLong, (int)MaxVectorSize);
assert(mem->as_Store()->memory_size() <= MAX_STORE, "");
if (st_offset >= offset + size_in_bytes ||
st_offset <= offset - MAX_STORE ||
st_offset <= offset - mem->as_Store()->memory_size()) {

@ -1111,11 +1112,16 @@ Node* MemNode::can_see_stored_value(Node* st, PhaseTransform* phase) const {
// (This is one of the few places where a generic PhaseTransform
// can create new nodes. Think of it as lazily manifesting
// virtually pre-existing constants.)
if (ReduceBulkZeroing || find_array_copy_clone(phase, ld_alloc, in(MemNode::Memory)) == NULL) {
// If ReduceBulkZeroing is disabled, we need to check if the allocation does not belong to an
// ArrayCopyNode clone. If it does, then we cannot assume zero since the initialization is done
// by the ArrayCopyNode.
return phase->zerocon(memory_type());
if (memory_type() != T_VOID) {
if (ReduceBulkZeroing || find_array_copy_clone(phase, ld_alloc, in(MemNode::Memory)) == NULL) {
// If ReduceBulkZeroing is disabled, we need to check if the allocation does not belong to an
// ArrayCopyNode clone. If it does, then we cannot assume zero since the initialization is done
// by the ArrayCopyNode.
return phase->zerocon(memory_type());
}
} else {
// TODO: materialize all-zero vector constant
assert(!isa_Load() || as_Load()->type()->isa_vect(), "");
}
}

@ -2561,6 +2567,8 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) {
assert(Opcode() == st->Opcode() ||
st->Opcode() == Op_StoreVector ||
Opcode() == Op_StoreVector ||
st->Opcode() == Op_StoreVectorScatter ||
Opcode() == Op_StoreVectorScatter ||
phase->C->get_alias_index(adr_type()) == Compile::AliasIdxRaw ||
(Opcode() == Op_StoreL && st->Opcode() == Op_StoreI) || // expanded ClearArrayNode
(Opcode() == Op_StoreI && st->Opcode() == Op_StoreL) || // initialization by arraycopy

@ -3744,7 +3752,7 @@ intptr_t InitializeNode::can_capture_store(StoreNode* st, PhaseGVN* phase, bool
int InitializeNode::captured_store_insertion_point(intptr_t start,
int size_in_bytes,
PhaseTransform* phase) {
const int FAIL = 0, MAX_STORE = BytesPerLong;
const int FAIL = 0, MAX_STORE = MAX2(BytesPerLong, (int)MaxVectorSize);

if (is_complete())
return FAIL; // arraycopy got here first; punt

@ -3774,6 +3782,7 @@ int InitializeNode::captured_store_insertion_point(intptr_t start,
}
return -(int)i; // not found; here is where to put it
} else if (st_off < start) {
assert(st->as_Store()->memory_size() <= MAX_STORE, "");
if (size_in_bytes != 0 &&
start < st_off + MAX_STORE &&
start < st_off + st->as_Store()->memory_size()) {

@ -363,6 +363,14 @@ const Type* MoveL2DNode::Value(PhaseGVN* phase) const {
return TypeD::make( v.get_jdouble() );
}

//------------------------------Identity----------------------------------------
Node* MoveL2DNode::Identity(PhaseGVN* phase) {
if (in(1)->Opcode() == Op_MoveD2L) {
return in(1)->in(1);
}
return this;
}

//------------------------------Value------------------------------------------
const Type* MoveI2FNode::Value(PhaseGVN* phase) const {
const Type *t = phase->type( in(1) );

@ -374,6 +382,14 @@ const Type* MoveI2FNode::Value(PhaseGVN* phase) const {
return TypeF::make( v.get_jfloat() );
}

//------------------------------Identity----------------------------------------
Node* MoveI2FNode::Identity(PhaseGVN* phase) {
if (in(1)->Opcode() == Op_MoveF2I) {
return in(1)->in(1);
}
return this;
}

//------------------------------Value------------------------------------------
const Type* MoveF2INode::Value(PhaseGVN* phase) const {
const Type *t = phase->type( in(1) );

@ -385,6 +401,14 @@ const Type* MoveF2INode::Value(PhaseGVN* phase) const {
return TypeInt::make( v.get_jint() );
}

//------------------------------Identity----------------------------------------
Node* MoveF2INode::Identity(PhaseGVN* phase) {
if (in(1)->Opcode() == Op_MoveI2F) {
return in(1)->in(1);
}
return this;
}

//------------------------------Value------------------------------------------
const Type* MoveD2LNode::Value(PhaseGVN* phase) const {
const Type *t = phase->type( in(1) );

@ -396,6 +420,14 @@ const Type* MoveD2LNode::Value(PhaseGVN* phase) const {
return TypeLong::make( v.get_jlong() );
}

//------------------------------Identity----------------------------------------
Node* MoveD2LNode::Identity(PhaseGVN* phase) {
if (in(1)->Opcode() == Op_MoveL2D) {
return in(1)->in(1);
}
return this;
}

#ifndef PRODUCT
//----------------------------BinaryNode---------------------------------------
// The set of related nodes for a BinaryNode is all data inputs and all outputs

@ -105,6 +105,7 @@ class MoveI2FNode : public Node {
virtual const Type *bottom_type() const { return Type::FLOAT; }
virtual uint ideal_reg() const { return Op_RegF; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual Node* Identity(PhaseGVN* phase);
};

class MoveL2DNode : public Node {

@ -114,6 +115,7 @@ class MoveL2DNode : public Node {
virtual const Type *bottom_type() const { return Type::DOUBLE; }
virtual uint ideal_reg() const { return Op_RegD; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual Node* Identity(PhaseGVN* phase);
};

class MoveF2INode : public Node {

@ -123,6 +125,7 @@ class MoveF2INode : public Node {
virtual const Type *bottom_type() const { return TypeInt::INT; }
virtual uint ideal_reg() const { return Op_RegI; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual Node* Identity(PhaseGVN* phase);
};

class MoveD2LNode : public Node {

@ -132,6 +135,7 @@ class MoveD2LNode : public Node {
virtual const Type *bottom_type() const { return TypeLong::LONG; }
virtual uint ideal_reg() const { return Op_RegL; }
virtual const Type* Value(PhaseGVN* phase) const;
virtual Node* Identity(PhaseGVN* phase);
};

//------------------------------BinaryNode-------------------------------------

@ -259,6 +259,25 @@ public:
virtual uint ideal_reg() const { return Op_RegL; }
};

//------------------------------URShiftBNode-----------------------------------
// Logical shift right
class URShiftBNode : public Node {
public:
URShiftBNode( Node *in1, Node *in2 ) : Node(0,in1,in2) {
ShouldNotReachHere(); // only vector variant is used
}
virtual int Opcode() const;
};

//------------------------------URShiftSNode-----------------------------------
// Logical shift right
class URShiftSNode : public Node {
public:
URShiftSNode( Node *in1, Node *in2 ) : Node(0,in1,in2) {
ShouldNotReachHere(); // only vector variant is used
}
virtual int Opcode() const;
};

//------------------------------URShiftINode-----------------------------------
// Logical shift right

@ -152,7 +152,10 @@ class TypeNode;
class UnlockNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorGatherNode;
class StoreVectorNode;
class StoreVectorScatterNode;
class VectorMaskCmpNode;
class VectorSet;
typedef void (*NFunc)(Node&,void*);
extern "C" {

@ -688,8 +691,10 @@ public:
DEFINE_CLASS_ID(Mem, Node, 4)
DEFINE_CLASS_ID(Load, Mem, 0)
DEFINE_CLASS_ID(LoadVector, Load, 0)
DEFINE_CLASS_ID(LoadVectorGather, LoadVector, 0)
DEFINE_CLASS_ID(Store, Mem, 1)
DEFINE_CLASS_ID(StoreVector, Store, 0)
DEFINE_CLASS_ID(StoreVectorScatter, StoreVector, 0)
DEFINE_CLASS_ID(LoadStore, Mem, 2)
DEFINE_CLASS_ID(LoadStoreConditional, LoadStore, 0)
DEFINE_CLASS_ID(CompareAndSwap, LoadStoreConditional, 0)

@ -714,6 +719,7 @@ public:
DEFINE_CLASS_ID(Add, Node, 11)
DEFINE_CLASS_ID(Mul, Node, 12)
DEFINE_CLASS_ID(Vector, Node, 13)
DEFINE_CLASS_ID(VectorMaskCmp, Vector, 0)
DEFINE_CLASS_ID(ClearArray, Node, 14)
DEFINE_CLASS_ID(Halt, Node, 15)
DEFINE_CLASS_ID(Opaque1, Node, 16)

@ -884,7 +890,10 @@ public:
DEFINE_CLASS_QUERY(Type)
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(LoadVector)
DEFINE_CLASS_QUERY(LoadVectorGather)
DEFINE_CLASS_QUERY(StoreVector)
DEFINE_CLASS_QUERY(StoreVectorScatter)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(Unlock)

#undef DEFINE_CLASS_QUERY

@ -892,6 +892,10 @@ void PhaseOutput::FillLocArray( int idx, MachSafePointNode* sfpt, Node *local,
? Location::int_in_long : Location::normal ));
} else if( t->base() == Type::NarrowOop ) {
array->append(new_loc_value( C->regalloc(), regnum, Location::narrowoop ));
} else if ( t->base() == Type::VectorS || t->base() == Type::VectorD ||
t->base() == Type::VectorX || t->base() == Type::VectorY ||
t->base() == Type::VectorZ) {
array->append(new_loc_value( C->regalloc(), regnum, Location::vector ));
} else {
array->append(new_loc_value( C->regalloc(), regnum, C->regalloc()->is_oop(local) ? Location::oop : Location::normal ));
}

@ -78,6 +78,10 @@ void Phase::print_timers() {
}
}
tty->print_cr (" Renumber Live: %7.3f s", timers[_t_renumberLive].seconds());
tty->print_cr (" Vector: %7.3f s", timers[_t_vector].seconds());
tty->print_cr (" Box elimination: %7.3f s", timers[_t_vector_elimination].seconds());
tty->print_cr (" IGVN: %7.3f s", timers[_t_vector_igvn].seconds());
tty->print_cr (" Prune Useless: %7.3f s", timers[_t_vector_pru].seconds());
tty->print_cr (" IdealLoop: %7.3f s", timers[_t_idealLoop].seconds());
tty->print_cr (" IdealLoop Verify: %7.3f s", timers[_t_idealLoopVerify].seconds());
tty->print_cr (" Cond Const Prop: %7.3f s", timers[_t_ccp].seconds());

@ -59,6 +59,7 @@ public:
Ideal_Loop, // Find idealized trip-counted loops
Macro_Expand, // Expand macro nodes
Peephole, // Apply peephole optimizations
Vector,
Output,
last_phase
};

@ -75,6 +76,10 @@ public:
_t_incrInline_igvn,
_t_incrInline_pru,
_t_incrInline_inline,
_t_vector,
_t_vector_elimination,
_t_vector_igvn,
_t_vector_pru,
_t_renumberLive,
_t_idealLoop,
_t_idealLoopVerify,

@ -31,7 +31,14 @@ enum CompilerPhaseType {
PHASE_BEFORE_REMOVEUSELESS,
PHASE_AFTER_PARSING,
PHASE_ITER_GVN1,
PHASE_EXPAND_VUNBOX,
PHASE_SCALARIZE_VBOX,
PHASE_INLINE_VECTOR_REBOX,
PHASE_EXPAND_VBOX,
PHASE_ELIMINATE_VBOX_ALLOC,
PHASE_PHASEIDEAL_BEFORE_EA,
PHASE_ITER_GVN_AFTER_VECTOR,
PHASE_ITER_GVN_BEFORE_EA,
PHASE_ITER_GVN_AFTER_EA,
PHASE_ITER_GVN_AFTER_ELIMINATION,
PHASE_PHASEIDEALLOOP1,

@ -41,6 +48,7 @@ enum CompilerPhaseType {
PHASE_ITER_GVN2,
PHASE_PHASEIDEALLOOP_ITERATIONS,
PHASE_OPTIMIZE_FINISHED,
PHASE_AFTER_MATCHING,
PHASE_GLOBAL_CODE_MOTION,
PHASE_FINAL_CODE,
PHASE_AFTER_EA,

@ -51,6 +59,7 @@ enum CompilerPhaseType {
PHASE_BEFORE_MATCHING,
PHASE_MATCHING,
PHASE_INCREMENTAL_INLINE,
PHASE_INCREMENTAL_INLINE_STEP,
PHASE_INCREMENTAL_BOXING_INLINE,
PHASE_CALL_CATCH_CLEANUP,
PHASE_INSERT_BARRIER,

@ -73,7 +82,14 @@ class CompilerPhaseTypeHelper {
case PHASE_BEFORE_REMOVEUSELESS: return "Before RemoveUseless";
case PHASE_AFTER_PARSING: return "After Parsing";
case PHASE_ITER_GVN1: return "Iter GVN 1";
case PHASE_EXPAND_VUNBOX: return "Expand VectorUnbox";
case PHASE_SCALARIZE_VBOX: return "Scalarize VectorBox";
case PHASE_INLINE_VECTOR_REBOX: return "Inline Vector Rebox Calls";
case PHASE_EXPAND_VBOX: return "Expand VectorBox";
case PHASE_ELIMINATE_VBOX_ALLOC: return "Eliminate VectorBoxAllocate";
case PHASE_PHASEIDEAL_BEFORE_EA: return "PhaseIdealLoop before EA";
case PHASE_ITER_GVN_AFTER_VECTOR: return "Iter GVN after vector box elimination";
case PHASE_ITER_GVN_BEFORE_EA: return "Iter GVN before EA";
case PHASE_ITER_GVN_AFTER_EA: return "Iter GVN after EA";
case PHASE_ITER_GVN_AFTER_ELIMINATION: return "Iter GVN after eliminating allocations and locks";
case PHASE_PHASEIDEALLOOP1: return "PhaseIdealLoop 1";

@ -83,6 +99,7 @@ class CompilerPhaseTypeHelper {
case PHASE_ITER_GVN2: return "Iter GVN 2";
case PHASE_PHASEIDEALLOOP_ITERATIONS: return "PhaseIdealLoop iterations";
case PHASE_OPTIMIZE_FINISHED: return "Optimize finished";
case PHASE_AFTER_MATCHING: return "After Matching";
case PHASE_GLOBAL_CODE_MOTION: return "Global code motion";
case PHASE_FINAL_CODE: return "Final Code";
case PHASE_AFTER_EA: return "After Escape Analysis";

@ -93,6 +110,7 @@ class CompilerPhaseTypeHelper {
case PHASE_BEFORE_MATCHING: return "Before matching";
case PHASE_MATCHING: return "After matching";
case PHASE_INCREMENTAL_INLINE: return "Incremental Inline";
case PHASE_INCREMENTAL_INLINE_STEP: return "Incremental Inline Step";
case PHASE_INCREMENTAL_BOXING_INLINE: return "Incremental Boxing Inline";
case PHASE_CALL_CATCH_CLEANUP: return "Call catch cleanup";
case PHASE_INSERT_BARRIER: return "Insert barrier";

@ -404,6 +404,28 @@ public:
NegNode( Node *in1 ) : Node(0,in1) {}
};

//------------------------------NegINode---------------------------------------
// Negate value an int. For int values, negation is the same as subtraction
// from zero
class NegINode : public NegNode {
public:
NegINode(Node *in1) : NegNode(in1) {}
virtual int Opcode() const;
const Type *bottom_type() const { return TypeInt::INT; }
virtual uint ideal_reg() const { return Op_RegI; }
};

//------------------------------NegLNode---------------------------------------
// Negate value an int. For int values, negation is the same as subtraction
// from zero
class NegLNode : public NegNode {
public:
NegLNode(Node *in1) : NegNode(in1) {}
virtual int Opcode() const;
const Type *bottom_type() const { return TypeLong::LONG; }
virtual uint ideal_reg() const { return Op_RegL; }
};

//------------------------------NegFNode---------------------------------------
// Negate value a float. Negating 0.0 returns -0.0, but subtracting from
// zero returns +0.0 (per JVM spec on 'fneg' bytecode). As subtraction

@ -2767,7 +2767,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
}
// Move shift count into vector register.
cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
_igvn.register_new_node_with_optimizer(cnt);
_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
return cnt;

@ -439,16 +439,22 @@ void Type::Initialize_shared(Compile* current) {
BOTTOM = make(Bottom); // Everything
HALF = make(Half); // Placeholder half of doublewide type

TypeF::MAX = TypeF::make(max_jfloat); // Float MAX
TypeF::MIN = TypeF::make(min_jfloat); // Float MIN
TypeF::ZERO = TypeF::make(0.0); // Float 0 (positive zero)
TypeF::ONE = TypeF::make(1.0); // Float 1
TypeF::POS_INF = TypeF::make(jfloat_cast(POSITIVE_INFINITE_F));
TypeF::NEG_INF = TypeF::make(-jfloat_cast(POSITIVE_INFINITE_F));

TypeD::MAX = TypeD::make(max_jdouble); // Double MAX
TypeD::MIN = TypeD::make(min_jdouble); // Double MIN
TypeD::ZERO = TypeD::make(0.0); // Double 0 (positive zero)
TypeD::ONE = TypeD::make(1.0); // Double 1
TypeD::POS_INF = TypeD::make(jdouble_cast(POSITIVE_INFINITE_D));
TypeD::NEG_INF = TypeD::make(-jdouble_cast(POSITIVE_INFINITE_D));

TypeInt::MAX = TypeInt::make(max_jint); // Int MAX
TypeInt::MIN = TypeInt::make(min_jint); // Int MIN
TypeInt::MINUS_1 = TypeInt::make(-1); // -1
TypeInt::ZERO = TypeInt::make( 0); // 0
TypeInt::ONE = TypeInt::make( 1); // 1

@ -477,6 +483,8 @@ void Type::Initialize_shared(Compile* current) {
assert( TypeInt::CC_GE == TypeInt::BOOL, "types must match for CmpL to work" );
assert( (juint)(TypeInt::CC->_hi - TypeInt::CC->_lo) <= SMALLINT, "CC is truly small");

TypeLong::MAX = TypeLong::make(max_jlong); // Long MAX
TypeLong::MIN = TypeLong::make(min_jlong); // Long MIN
TypeLong::MINUS_1 = TypeLong::make(-1); // -1
TypeLong::ZERO = TypeLong::make( 0); // 0
TypeLong::ONE = TypeLong::make( 1); // 1

@ -1119,6 +1127,8 @@ void Type::typerr( const Type *t ) const {

//=============================================================================
// Convenience common pre-built types.
const TypeF *TypeF::MAX; // Floating point max
const TypeF *TypeF::MIN; // Floating point min
const TypeF *TypeF::ZERO; // Floating point zero
const TypeF *TypeF::ONE; // Floating point one
const TypeF *TypeF::POS_INF; // Floating point positive infinity

@ -1229,6 +1239,8 @@ bool TypeF::empty(void) const {

//=============================================================================
// Convenience common pre-built types.
const TypeD *TypeD::MAX; // Floating point max
const TypeD *TypeD::MIN; // Floating point min
const TypeD *TypeD::ZERO; // Floating point zero
const TypeD *TypeD::ONE; // Floating point one
const TypeD *TypeD::POS_INF; // Floating point positive infinity

@ -1335,6 +1347,8 @@ bool TypeD::empty(void) const {

//=============================================================================
// Convenience common pre-built types.
const TypeInt *TypeInt::MAX; // INT_MAX
const TypeInt *TypeInt::MIN; // INT_MIN
const TypeInt *TypeInt::MINUS_1;// -1
const TypeInt *TypeInt::ZERO; // 0
const TypeInt *TypeInt::ONE; // 1

@ -1604,6 +1618,8 @@ bool TypeInt::empty(void) const {

//=============================================================================
// Convenience common pre-built types.
const TypeLong *TypeLong::MAX;
const TypeLong *TypeLong::MIN;
const TypeLong *TypeLong::MINUS_1;// -1
const TypeLong *TypeLong::ZERO; // 0
const TypeLong *TypeLong::ONE; // 1

@ -483,6 +483,8 @@ public:
virtual const Type *xmeet( const Type *t ) const;
virtual const Type *xdual() const; // Compute dual right now.
// Convenience common pre-built types.
static const TypeF *MAX;
static const TypeF *MIN;
static const TypeF *ZERO; // positive zero only
static const TypeF *ONE;
static const TypeF *POS_INF;

@ -512,6 +514,8 @@ public:
virtual const Type *xmeet( const Type *t ) const;
virtual const Type *xdual() const; // Compute dual right now.
// Convenience common pre-built types.
static const TypeD *MAX;
static const TypeD *MIN;
static const TypeD *ZERO; // positive zero only
static const TypeD *ONE;
static const TypeD *POS_INF;

@ -555,6 +559,8 @@ public:
virtual const Type *narrow( const Type *t ) const;
// Do not kill _widen bits.
// Convenience common pre-built types.
static const TypeInt *MAX;
static const TypeInt *MIN;
static const TypeInt *MINUS_1;
static const TypeInt *ZERO;
static const TypeInt *ONE;

@ -620,6 +626,8 @@ public:
virtual const Type *widen( const Type *t, const Type* limit_type ) const;
virtual const Type *narrow( const Type *t ) const;
// Convenience common pre-built types.
static const TypeLong *MAX;
static const TypeLong *MIN;
static const TypeLong *MINUS_1;
static const TypeLong *ZERO;
static const TypeLong *ONE;

src/hotspot/share/opto/vector.cpp (new file, 466 lines)
@ -0,0 +1,466 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "opto/castnode.hpp"
#include "opto/graphKit.hpp"
#include "opto/phaseX.hpp"
#include "opto/rootnode.hpp"
#include "opto/vector.hpp"
#include "utilities/macros.hpp"

void PhaseVector::optimize_vector_boxes() {
  Compile::TracePhase tp("vector_elimination", &timers[_t_vector_elimination]);

  // Signal GraphKit it's post-parse phase.
  assert(C->inlining_incrementally() == false, "sanity");
  C->set_inlining_incrementally(true);

  C->for_igvn()->clear();
  C->initial_gvn()->replace_with(&_igvn);

  expand_vunbox_nodes();
  scalarize_vbox_nodes();

  C->inline_vector_reboxing_calls();

  expand_vbox_nodes();
  eliminate_vbox_alloc_nodes();

  C->set_inlining_incrementally(false);

  do_cleanup();
}

void PhaseVector::do_cleanup() {
  if (C->failing()) return;
  {
    Compile::TracePhase tp("vector_pru", &timers[_t_vector_pru]);
    ResourceMark rm;
    PhaseRemoveUseless pru(C->initial_gvn(), C->for_igvn());
    if (C->failing()) return;
  }
  {
    Compile::TracePhase tp("incrementalInline_igvn", &timers[_t_vector_igvn]);
    _igvn = PhaseIterGVN(C->initial_gvn());
    _igvn.optimize();
    if (C->failing()) return;
  }
  C->print_method(PHASE_ITER_GVN_BEFORE_EA, 3);
}

void PhaseVector::scalarize_vbox_nodes() {
  if (C->failing()) return;

  if (!EnableVectorReboxing) {
    return; // don't scalarize vector boxes
  }

  int macro_idx = C->macro_count() - 1;
  while (macro_idx >= 0) {
    Node * n = C->macro_node(macro_idx);
    assert(n->is_macro(), "only macro nodes expected here");
    if (n->Opcode() == Op_VectorBox) {
      VectorBoxNode* vbox = static_cast<VectorBoxNode*>(n);
      scalarize_vbox_node(vbox);
      if (C->failing()) return;
      C->print_method(PHASE_SCALARIZE_VBOX, vbox, 3);
    }
    if (C->failing()) return;
    macro_idx = MIN2(macro_idx - 1, C->macro_count() - 1);
  }
}

void PhaseVector::expand_vbox_nodes() {
  if (C->failing()) return;

  int macro_idx = C->macro_count() - 1;
  while (macro_idx >= 0) {
    Node * n = C->macro_node(macro_idx);
    assert(n->is_macro(), "only macro nodes expected here");
    if (n->Opcode() == Op_VectorBox) {
      VectorBoxNode* vbox = static_cast<VectorBoxNode*>(n);
      expand_vbox_node(vbox);
      if (C->failing()) return;
    }
    if (C->failing()) return;
    macro_idx = MIN2(macro_idx - 1, C->macro_count() - 1);
  }
}

void PhaseVector::expand_vunbox_nodes() {
  if (C->failing()) return;

  int macro_idx = C->macro_count() - 1;
  while (macro_idx >= 0) {
    Node * n = C->macro_node(macro_idx);
    assert(n->is_macro(), "only macro nodes expected here");
    if (n->Opcode() == Op_VectorUnbox) {
      VectorUnboxNode* vec_unbox = static_cast<VectorUnboxNode*>(n);
      expand_vunbox_node(vec_unbox);
      if (C->failing()) return;
      C->print_method(PHASE_EXPAND_VUNBOX, vec_unbox, 3);
    }
    if (C->failing()) return;
    macro_idx = MIN2(macro_idx - 1, C->macro_count() - 1);
  }
}

void PhaseVector::eliminate_vbox_alloc_nodes() {
  if (C->failing()) return;

  int macro_idx = C->macro_count() - 1;
  while (macro_idx >= 0) {
    Node * n = C->macro_node(macro_idx);
    assert(n->is_macro(), "only macro nodes expected here");
    if (n->Opcode() == Op_VectorBoxAllocate) {
      VectorBoxAllocateNode* vbox_alloc = static_cast<VectorBoxAllocateNode*>(n);
      eliminate_vbox_alloc_node(vbox_alloc);
      if (C->failing()) return;
      C->print_method(PHASE_ELIMINATE_VBOX_ALLOC, vbox_alloc, 3);
    }
    if (C->failing()) return;
    macro_idx = MIN2(macro_idx - 1, C->macro_count() - 1);
  }
}

static JVMState* clone_jvms(Compile* C, SafePointNode* sfpt) {
  JVMState* new_jvms = sfpt->jvms()->clone_shallow(C);
  uint size = sfpt->req();
  SafePointNode* map = new SafePointNode(size, new_jvms);
  for (uint i = 0; i < size; i++) {
    map->init_req(i, sfpt->in(i));
  }
  new_jvms->set_map(map);
  return new_jvms;
}

void PhaseVector::scalarize_vbox_node(VectorBoxNode* vec_box) {
  Node* vec_value = vec_box->in(VectorBoxNode::Value);
  PhaseGVN& gvn = *C->initial_gvn();

  // Process merged VBAs

  if (EnableVectorAggressiveReboxing) {
    Unique_Node_List calls(C->comp_arena());
    for (DUIterator_Fast imax, i = vec_box->fast_outs(imax); i < imax; i++) {
      Node* use = vec_box->fast_out(i);
      if (use->is_CallJava()) {
        CallJavaNode* call = use->as_CallJava();
        if (call->has_non_debug_use(vec_box) && vec_box->in(VectorBoxNode::Box)->is_Phi()) {
          calls.push(call);
        }
      }
    }

    while (calls.size() > 0) {
      CallJavaNode* call = calls.pop()->as_CallJava();
      // Attach new VBA to the call and use it instead of Phi (VBA ... VBA).

      JVMState* jvms = clone_jvms(C, call);
      GraphKit kit(jvms);
      PhaseGVN& gvn = kit.gvn();

      // Adjust JVMS from post-call to pre-call state: put args on stack
      uint nargs = call->method()->arg_size();
      kit.ensure_stack(kit.sp() + nargs);
      for (uint i = TypeFunc::Parms; i < call->tf()->domain()->cnt(); i++) {
        kit.push(call->in(i));
      }
      jvms = kit.sync_jvms();

      Node* new_vbox = NULL;
      {
        PreserveReexecuteState prs(&kit);

        kit.jvms()->set_should_reexecute(true);

        const TypeInstPtr* vbox_type = vec_box->box_type();
        const TypeVect* vect_type = vec_box->vec_type();
        Node* vect = vec_box->in(VectorBoxNode::Value);

        VectorBoxAllocateNode* alloc = new VectorBoxAllocateNode(C, vbox_type);
        kit.set_edges_for_java_call(alloc, /*must_throw=*/false, /*separate_io_proj=*/true);
        kit.make_slow_call_ex(alloc, C->env()->Throwable_klass(), /*separate_io_proj=*/true, /*deoptimize=*/true);
        kit.set_i_o(gvn.transform( new ProjNode(alloc, TypeFunc::I_O) ));
        kit.set_all_memory(gvn.transform( new ProjNode(alloc, TypeFunc::Memory) ));
        Node* ret = gvn.transform(new ProjNode(alloc, TypeFunc::Parms));

        new_vbox = gvn.transform(new VectorBoxNode(C, ret, vect, vbox_type, vect_type));

        kit.replace_in_map(vec_box, new_vbox);
      }

      kit.dec_sp(nargs);
      jvms = kit.sync_jvms();

      call->set_req(TypeFunc::Control , kit.control());
      call->set_req(TypeFunc::I_O     , kit.i_o());
      call->set_req(TypeFunc::Memory  , kit.reset_memory());
      call->set_req(TypeFunc::FramePtr, kit.frameptr());
      call->replace_edge(vec_box, new_vbox);

      C->record_for_igvn(call);
    }
  }

  // Process debug uses at safepoints
  Unique_Node_List safepoints(C->comp_arena());

  for (DUIterator_Fast imax, i = vec_box->fast_outs(imax); i < imax; i++) {
    Node* use = vec_box->fast_out(i);
    if (use->is_SafePoint()) {
      SafePointNode* sfpt = use->as_SafePoint();
      if (!sfpt->is_Call() || !sfpt->as_Call()->has_non_debug_use(vec_box)) {
        safepoints.push(sfpt);
      }
    }
  }

  while (safepoints.size() > 0) {
    SafePointNode* sfpt = safepoints.pop()->as_SafePoint();

    uint first_ind = (sfpt->req() - sfpt->jvms()->scloff());
    Node* sobj = new SafePointScalarObjectNode(vec_box->box_type(),
#ifdef ASSERT
                                               NULL,
#endif // ASSERT
                                               first_ind, /*n_fields=*/1);
    sobj->init_req(0, C->root());
    sfpt->add_req(vec_value);

    sobj = gvn.transform(sobj);

    JVMState *jvms = sfpt->jvms();

    jvms->set_endoff(sfpt->req());
    // Now make a pass over the debug information replacing any references
    // to the allocated object with "sobj"
    int start = jvms->debug_start();
    int end   = jvms->debug_end();
    sfpt->replace_edges_in_range(vec_box, sobj, start, end);

    C->record_for_igvn(sfpt);
  }
}

void PhaseVector::expand_vbox_node(VectorBoxNode* vec_box) {
  if (vec_box->outcnt() > 0) {
    Node* vbox = vec_box->in(VectorBoxNode::Box);
    Node* vect = vec_box->in(VectorBoxNode::Value);
    Node* result = expand_vbox_node_helper(vbox, vect, vec_box->box_type(), vec_box->vec_type());
    C->gvn_replace_by(vec_box, result);
    C->print_method(PHASE_EXPAND_VBOX, vec_box, 3);
  }
  C->remove_macro_node(vec_box);
}

Node* PhaseVector::expand_vbox_node_helper(Node* vbox,
                                           Node* vect,
                                           const TypeInstPtr* box_type,
                                           const TypeVect* vect_type) {
  if (vbox->is_Phi() && vect->is_Phi()) {
    assert(vbox->as_Phi()->region() == vect->as_Phi()->region(), "");
    Node* new_phi = new PhiNode(vbox->as_Phi()->region(), box_type);
    for (uint i = 1; i < vbox->req(); i++) {
      Node* new_box = expand_vbox_node_helper(vbox->in(i), vect->in(i), box_type, vect_type);
      new_phi->set_req(i, new_box);
    }
    new_phi = C->initial_gvn()->transform(new_phi);
    return new_phi;
  } else if (vbox->is_Proj() && vbox->in(0)->Opcode() == Op_VectorBoxAllocate) {
    VectorBoxAllocateNode* vbox_alloc = static_cast<VectorBoxAllocateNode*>(vbox->in(0));
    return expand_vbox_alloc_node(vbox_alloc, vect, box_type, vect_type);
  } else {
    assert(!vbox->is_Phi(), "");
    // TODO: assert that expanded vbox is initialized with the same value (vect).
    return vbox; // already expanded
  }
}

static bool is_vector_mask(ciKlass* klass) {
  return klass->is_subclass_of(ciEnv::current()->vector_VectorMask_klass());
}

static bool is_vector_shuffle(ciKlass* klass) {
  return klass->is_subclass_of(ciEnv::current()->vector_VectorShuffle_klass());
}

Node* PhaseVector::expand_vbox_alloc_node(VectorBoxAllocateNode* vbox_alloc,
                                          Node* value,
                                          const TypeInstPtr* box_type,
                                          const TypeVect* vect_type) {
  JVMState* jvms = clone_jvms(C, vbox_alloc);
  GraphKit kit(jvms);
  PhaseGVN& gvn = kit.gvn();

  ciInstanceKlass* box_klass = box_type->klass()->as_instance_klass();
  BasicType bt = vect_type->element_basic_type();
  int num_elem = vect_type->length();

  bool is_mask = is_vector_mask(box_klass);
  if (is_mask && bt != T_BOOLEAN) {
    value = gvn.transform(VectorStoreMaskNode::make(gvn, value, bt, num_elem));
    // Although type of mask depends on its definition, in terms of storage everything is stored in boolean array.
    bt = T_BOOLEAN;
    assert(value->as_Vector()->bottom_type()->is_vect()->element_basic_type() == bt,
           "must be consistent with mask representation");
  }

  // Generate array allocation for the field which holds the values.
  const TypeKlassPtr* array_klass = TypeKlassPtr::make(ciTypeArrayKlass::make(bt));
  Node* arr = kit.new_array(kit.makecon(array_klass), kit.intcon(num_elem), 1);

  // Store the vector value into the array.
  // (The store should be captured by InitializeNode and turned into initialized store later.)
  Node* arr_adr = kit.array_element_address(arr, kit.intcon(0), bt);
  const TypePtr* arr_adr_type = arr_adr->bottom_type()->is_ptr();
  Node* arr_mem = kit.memory(arr_adr);
  Node* vstore = gvn.transform(StoreVectorNode::make(0,
                                                     kit.control(),
                                                     arr_mem,
                                                     arr_adr,
                                                     arr_adr_type,
                                                     value,
                                                     num_elem));
  kit.set_memory(vstore, arr_adr_type);

  C->set_max_vector_size(MAX2(C->max_vector_size(), vect_type->length_in_bytes()));

  // Generate the allocate for the Vector object.
  const TypeKlassPtr* klass_type = box_type->as_klass_type();
  Node* klass_node = kit.makecon(klass_type);
  Node* vec_obj = kit.new_instance(klass_node);

  // Store the allocated array into object.
  ciField* field = ciEnv::current()->vector_VectorPayload_klass()->get_field_by_name(ciSymbol::payload_name(),
                                                                                     ciSymbol::object_signature(),
                                                                                     false);
  assert(field != NULL, "");
  Node* vec_field = kit.basic_plus_adr(vec_obj, field->offset_in_bytes());
  const TypePtr* vec_adr_type = vec_field->bottom_type()->is_ptr();

  // The store should be captured by InitializeNode and turned into initialized store later.
  Node* field_store = gvn.transform(kit.access_store_at(vec_obj,
                                                        vec_field,
                                                        vec_adr_type,
                                                        arr,
                                                        TypeOopPtr::make_from_klass(field->type()->as_klass()),
                                                        T_OBJECT,
                                                        IN_HEAP));
  kit.set_memory(field_store, vec_adr_type);

  kit.replace_call(vbox_alloc, vec_obj, true);
  C->remove_macro_node(vbox_alloc);

  return vec_obj;
}

void PhaseVector::expand_vunbox_node(VectorUnboxNode* vec_unbox) {
  if (vec_unbox->outcnt() > 0) {
    GraphKit kit;
    PhaseGVN& gvn = kit.gvn();

    Node* obj = vec_unbox->obj();
    const TypeInstPtr* tinst = gvn.type(obj)->isa_instptr();
    ciInstanceKlass* from_kls = tinst->klass()->as_instance_klass();
    BasicType bt = vec_unbox->vect_type()->element_basic_type();
    BasicType masktype = bt;
    BasicType elem_bt;

    if (is_vector_mask(from_kls)) {
      bt = T_BOOLEAN;
    } else if (is_vector_shuffle(from_kls)) {
      if (vec_unbox->is_shuffle_to_vector() == true) {
        elem_bt = bt;
      }
      bt = T_BYTE;
    }

    ciField* field = ciEnv::current()->vector_VectorPayload_klass()->get_field_by_name(ciSymbol::payload_name(),
                                                                                       ciSymbol::object_signature(),
                                                                                       false);
    assert(field != NULL, "");
    int offset = field->offset_in_bytes();
    Node* vec_adr = kit.basic_plus_adr(obj, offset);

    Node* mem = vec_unbox->mem();
    Node* ctrl = vec_unbox->in(0);
    Node* vec_field_ld = LoadNode::make(gvn,
                                        ctrl,
                                        mem,
                                        vec_adr,
                                        vec_adr->bottom_type()->is_ptr(),
                                        TypeOopPtr::make_from_klass(field->type()->as_klass()),
                                        T_OBJECT,
                                        MemNode::unordered);
    vec_field_ld = gvn.transform(vec_field_ld);

    // For proper aliasing, attach concrete payload type.
    ciKlass* payload_klass = ciTypeArrayKlass::make(bt);
    const Type* payload_type = TypeAryPtr::make_from_klass(payload_klass)->cast_to_ptr_type(TypePtr::NotNull);
    vec_field_ld = gvn.transform(new CastPPNode(vec_field_ld, payload_type));

    Node* adr = kit.array_element_address(vec_field_ld, gvn.intcon(0), bt);
    const TypePtr* adr_type = adr->bottom_type()->is_ptr();
    const TypeVect* vt = vec_unbox->bottom_type()->is_vect();
    int num_elem = vt->length();
    Node* vec_val_load = LoadVectorNode::make(0,
                                              ctrl,
                                              mem,
                                              adr,
                                              adr_type,
                                              num_elem,
                                              bt);
    vec_val_load = gvn.transform(vec_val_load);

    C->set_max_vector_size(MAX2(C->max_vector_size(), vt->length_in_bytes()));

    if (is_vector_mask(from_kls) && masktype != T_BOOLEAN) {
      assert(vec_unbox->bottom_type()->is_vect()->element_basic_type() == masktype, "expect mask type consistency");
      vec_val_load = gvn.transform(new VectorLoadMaskNode(vec_val_load, TypeVect::make(masktype, num_elem)));
    } else if (is_vector_shuffle(from_kls)) {
      if (vec_unbox->is_shuffle_to_vector() == false) {
        assert(vec_unbox->bottom_type()->is_vect()->element_basic_type() == masktype, "expect shuffle type consistency");
        vec_val_load = gvn.transform(new VectorLoadShuffleNode(vec_val_load, TypeVect::make(masktype, num_elem)));
      } else if (elem_bt != T_BYTE) {
        vec_val_load = gvn.transform(VectorCastNode::make(Op_VectorCastB2X, vec_val_load, elem_bt, num_elem));
      }
    }

    gvn.hash_delete(vec_unbox);
    vec_unbox->disconnect_inputs(C);
    C->gvn_replace_by(vec_unbox, vec_val_load);
  }
  C->remove_macro_node(vec_unbox);
}

void PhaseVector::eliminate_vbox_alloc_node(VectorBoxAllocateNode* vbox_alloc) {
  JVMState* jvms = clone_jvms(C, vbox_alloc);
  GraphKit kit(jvms);
  // Remove VBA, but leave a safepoint behind.
  // Otherwise, it may end up with a loop without any safepoint polls.
  kit.replace_call(vbox_alloc, kit.map(), true);
  C->remove_macro_node(vbox_alloc);
}
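The new PhaseVector pass above removes the VectorBox/VectorUnbox macro nodes left behind once the jdk.incubator.vector intrinsics are inlined. For context, a minimal Java sketch (species choice and names are illustrative, not part of this patch) of the kind of loop whose intermediate vector boxes the phase eliminates:

    import jdk.incubator.vector.FloatVector;
    import jdk.incubator.vector.VectorSpecies;

    public class SaxpyDemo {
        static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;

        // At the Java level every intermediate FloatVector is a heap object;
        // after inlining, VectorUnbox(VectorBox(v)) pairs cancel and the loop
        // body stays in vector registers.
        static void saxpy(float a, float[] x, float[] y) {
            int i = 0;
            for (; i < SPECIES.loopBound(x.length); i += SPECIES.length()) {
                FloatVector vx = FloatVector.fromArray(SPECIES, x, i);
                FloatVector vy = FloatVector.fromArray(SPECIES, y, i);
                vx.fma(FloatVector.broadcast(SPECIES, a), vy).intoArray(y, i);
            }
            for (; i < x.length; i++) {   // scalar tail
                y[i] = a * x[i] + y[i];
            }
        }
    }

Boxes that still escape (to a safepoint, or to a non-inlined call under EnableVectorAggressiveReboxing) are scalarized or re-allocated at the escape point instead, which is what scalarize_vbox_node above handles.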
62 src/hotspot/share/opto/vector.hpp Normal file
@ -0,0 +1,62 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_OPTO_VECTOR_HPP
#define SHARE_OPTO_VECTOR_HPP

#include "opto/node.hpp"
#include "opto/phaseX.hpp"
#include "opto/type.hpp"
#include "opto/vectornode.hpp"

class PhaseVector : public Phase {
 private:
  PhaseIterGVN& _igvn;

  void expand_vbox_nodes();
  void expand_vbox_node(VectorBoxNode* vec_box);
  Node* expand_vbox_node_helper(Node* vbox,
                                Node* vect,
                                const TypeInstPtr* box_type,
                                const TypeVect* vect_type);
  Node* expand_vbox_alloc_node(VectorBoxAllocateNode* vbox_alloc,
                               Node* value,
                               const TypeInstPtr* box_type,
                               const TypeVect* vect_type);
  void scalarize_vbox_nodes();
  void scalarize_vbox_node(VectorBoxNode* vec_box);
  void expand_vunbox_nodes();
  void expand_vunbox_node(VectorUnboxNode* vec_box);
  void eliminate_vbox_alloc_nodes();
  void eliminate_vbox_alloc_node(VectorBoxAllocateNode* vbox_alloc);
  void do_cleanup();
  void scalarize_vector_boxes();
  void expand_vector_boxes();

 public:
  PhaseVector(PhaseIterGVN& igvn) : Phase(Vector), _igvn(igvn) {}
  void optimize_vector_boxes();
};

#endif // SHARE_OPTO_VECTOR_HPP
1594 src/hotspot/share/opto/vectorIntrinsics.cpp Normal file
File diff suppressed because it is too large
@ -120,12 +120,51 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  case Op_AbsL:
    assert(bt == T_LONG, "must be");
    return Op_AbsVL;
  case Op_MinI:
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT: return Op_MinV;
    default: ShouldNotReachHere(); return 0;
    }
  case Op_MinL:
    assert(bt == T_LONG, "must be");
    return Op_MinV;
  case Op_MinF:
    assert(bt == T_FLOAT, "must be");
    return Op_MinV;
  case Op_MinD:
    assert(bt == T_DOUBLE, "must be");
    return Op_MinV;
  case Op_MaxI:
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT: return Op_MaxV;
    default: ShouldNotReachHere(); return 0;
    }
  case Op_MaxL:
    assert(bt == T_LONG, "must be");
    return Op_MaxV;
  case Op_MaxF:
    assert(bt == T_FLOAT, "must be");
    return Op_MaxV;
  case Op_MaxD:
    assert(bt == T_DOUBLE, "must be");
    return Op_MaxV;
  case Op_AbsF:
    assert(bt == T_FLOAT, "must be");
    return Op_AbsVF;
  case Op_AbsD:
    assert(bt == T_DOUBLE, "must be");
    return Op_AbsVD;
  case Op_NegI:
    assert(bt == T_INT, "must be");
    return Op_NegVI;
  case Op_NegF:
    assert(bt == T_FLOAT, "must be");
    return Op_NegVF;
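The Op_MinI/Op_MaxI mappings above deliberately return one Op_MinV/Op_MaxV for every subword integer type, so SuperWord and the Vector API intrinsics share a single node family. A hedged Java-level sketch of code that reaches these nodes (API usage only, not part of this diff; species choice illustrative):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorSpecies;

    public class ClampDemo {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_256;

        // Clamps each element of a[] into [lo, hi]; the broadcast min/max
        // lanewise operations compile down to MinV/MaxV nodes.
        static void clamp(int[] a, int lo, int hi, int[] out) {
            int i = 0;
            for (; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                IntVector v = IntVector.fromArray(SPECIES, a, i);
                v.max(lo).min(hi).intoArray(out, i);
            }
            for (; i < a.length; i++) {   // scalar tail
                out[i] = Math.min(Math.max(a[i], lo), hi);
            }
        }
    }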
@ -178,6 +217,12 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  case Op_RShiftL:
    assert(bt == T_LONG, "must be");
    return Op_RShiftVL;
  case Op_URShiftB:
    assert(bt == T_BYTE, "must be");
    return Op_URShiftVB;
  case Op_URShiftS:
    assert(bt == T_SHORT, "must be");
    return Op_URShiftVS;
  case Op_URShiftI:
    switch (bt) {
    case T_BOOLEAN:return Op_URShiftVB;
@ -203,18 +248,6 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  case Op_XorI:
  case Op_XorL:
    return Op_XorV;
  case Op_MinF:
    assert(bt == T_FLOAT, "must be");
    return Op_MinV;
  case Op_MinD:
    assert(bt == T_DOUBLE, "must be");
    return Op_MinV;
  case Op_MaxF:
    assert(bt == T_FLOAT, "must be");
    return Op_MaxV;
  case Op_MaxD:
    assert(bt == T_DOUBLE, "must be");
    return Op_MaxV;

  case Op_LoadB:
  case Op_LoadUB:
@ -241,6 +274,28 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  }
}

int VectorNode::replicate_opcode(BasicType bt) {
  switch(bt) {
  case T_BOOLEAN:
  case T_BYTE:
    return Op_ReplicateB;
  case T_SHORT:
  case T_CHAR:
    return Op_ReplicateS;
  case T_INT:
    return Op_ReplicateI;
  case T_LONG:
    return Op_ReplicateL;
  case T_FLOAT:
    return Op_ReplicateF;
  case T_DOUBLE:
    return Op_ReplicateD;
  default:
    assert(false, "wrong type: %s", type2name(bt));
    return 0;
  }
}

// Also used to check if the code generator
// supports the vector operation.
bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
@ -331,6 +386,16 @@ bool VectorNode::is_shift(Node* n) {
  }
}

bool VectorNode::is_vshift_cnt(Node* n) {
  switch (n->Opcode()) {
  case Op_LShiftCntV:
  case Op_RShiftCntV:
    return true;
  default:
    return false;
  }
}

// Check if input is loop invariant vector.
bool VectorNode::is_invariant_vector(Node* n) {
  // Only Replicate vector nodes are loop invariant for now.
@ -397,12 +462,10 @@ void VectorNode::vector_operands(Node* n, uint* start, uint* end) {
  }
}

// Return the vector version of a scalar operation node.
VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
  const TypeVect* vt = TypeVect::make(bt, vlen);
  int vopc = VectorNode::opcode(opc, bt);
// Make a vector node for binary operation
VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt) {
  // This method should not be called for unimplemented vectors.
  guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]);
  guarantee(vopc > 0, "vopc must be > 0");
  switch (vopc) {
  case Op_AddVB: return new AddVBNode(n1, n2, vt);
  case Op_AddVS: return new AddVSNode(n1, n2, vt);
@ -428,13 +491,17 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
  case Op_DivVF: return new DivVFNode(n1, n2, vt);
  case Op_DivVD: return new DivVDNode(n1, n2, vt);

  case Op_MinV: return new MinVNode(n1, n2, vt);
  case Op_MaxV: return new MaxVNode(n1, n2, vt);

  case Op_AbsVF: return new AbsVFNode(n1, vt);
  case Op_AbsVD: return new AbsVDNode(n1, vt);
  case Op_AbsVB: return new AbsVBNode(n1, vt);
  case Op_AbsVS: return new AbsVSNode(n1, vt);
  case Op_AbsVI: return new AbsVINode(n1, vt);
  case Op_AbsVL: return new AbsVLNode(n1, vt);
  case Op_AbsVF: return new AbsVFNode(n1, vt);
  case Op_AbsVD: return new AbsVDNode(n1, vt);

  case Op_NegVI: return new NegVINode(n1, vt);
  case Op_NegVF: return new NegVFNode(n1, vt);
  case Op_NegVD: return new NegVDNode(n1, vt);

@ -464,9 +531,6 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
  case Op_OrV: return new OrVNode (n1, n2, vt);
  case Op_XorV: return new XorVNode(n1, n2, vt);

  case Op_MinV: return new MinVNode(n1, n2, vt);
  case Op_MaxV: return new MaxVNode(n1, n2, vt);

  case Op_RoundDoubleModeV: return new RoundDoubleModeVNode(n1, n2, vt);

  case Op_MulAddVS2VI: return new MulAddVS2VINode(n1, n2, vt);
@ -476,11 +540,19 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
  }
}

VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt) {
// Return the vector version of a scalar binary operation node.
VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
  const TypeVect* vt = TypeVect::make(bt, vlen);
  int vopc = VectorNode::opcode(opc, bt);
  // This method should not be called for unimplemented vectors.
  guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]);
  return make(vopc, n1, n2, vt);
}

// Make a vector node for ternary operation
VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, Node* n3, const TypeVect* vt) {
  // This method should not be called for unimplemented vectors.
  guarantee(vopc > 0, "vopc must be > 0");
  switch (vopc) {
  case Op_FmaVD: return new FmaVDNode(n1, n2, n3, vt);
  case Op_FmaVF: return new FmaVFNode(n1, n2, n3, vt);
@ -490,6 +562,15 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, B
  }
}

// Return the vector version of a scalar ternary operation node.
VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt) {
  const TypeVect* vt = TypeVect::make(bt, vlen);
  int vopc = VectorNode::opcode(opc, bt);
  // This method should not be called for unimplemented vectors.
  guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]);
  return make(vopc, n1, n2, n3, vt);
}

// Scalar promotion
VectorNode* VectorNode::scalar2vector(Node* s, uint vlen, const Type* opd_t) {
  BasicType bt = opd_t->array_element_basic_type();
@ -516,21 +597,22 @@ VectorNode* VectorNode::scalar2vector(Node* s, uint vlen, const Type* opd_t) {
  }
}

VectorNode* VectorNode::shift_count(Node* shift, Node* cnt, uint vlen, BasicType bt) {
  assert(VectorNode::is_shift(shift), "sanity");
VectorNode* VectorNode::shift_count(int opc, Node* cnt, uint vlen, BasicType bt) {
  // Match shift count type with shift vector type.
  const TypeVect* vt = TypeVect::make(bt, vlen);
  switch (shift->Opcode()) {
  switch (opc) {
  case Op_LShiftI:
  case Op_LShiftL:
    return new LShiftCntVNode(cnt, vt);
  case Op_RShiftI:
  case Op_RShiftL:
  case Op_URShiftB:
  case Op_URShiftS:
  case Op_URShiftI:
  case Op_URShiftL:
    return new RShiftCntVNode(cnt, vt);
  default:
    fatal("Missed vector creation for '%s'", NodeClassNames[shift->Opcode()]);
    fatal("Missed vector creation for '%s'", NodeClassNames[opc]);
    return NULL;
  }
}
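shift_count now takes the scalar opcode directly, so callers no longer need the original shift node; it materializes a single LShiftCntV/RShiftCntV broadcast of the count that all lanes share. A hedged Java sketch of code that produces such a shared count (illustrative only, not part of this diff):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    public class ShiftDemo {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;

        // Every lane is shifted by the same scalar count, so C2 can emit one
        // shift-count node feeding the vector shift. Scalar tail omitted.
        static void shiftAll(int[] a, int n, int[] out) {
            for (int i = 0; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                IntVector.fromArray(SPECIES, a, i)
                         .lanewise(VectorOperators.LSHL, n)
                         .intoArray(out, i);
            }
        }
    }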
@ -677,29 +759,37 @@ StoreVectorNode* StoreVectorNode::make(int opc, Node* ctl, Node* mem,
  return new StoreVectorNode(ctl, mem, adr, atyp, val);
}

int ExtractNode::opcode(BasicType bt) {
  switch (bt) {
  case T_BOOLEAN: return Op_ExtractUB;
  case T_BYTE:    return Op_ExtractB;
  case T_CHAR:    return Op_ExtractC;
  case T_SHORT:   return Op_ExtractS;
  case T_INT:     return Op_ExtractI;
  case T_LONG:    return Op_ExtractL;
  case T_FLOAT:   return Op_ExtractF;
  case T_DOUBLE:  return Op_ExtractD;
  default:
    assert(false, "wrong type: %s", type2name(bt));
    return 0;
  }
}

// Extract a scalar element of vector.
Node* ExtractNode::make(Node* v, uint position, BasicType bt) {
  assert((int)position < Matcher::max_vector_size(bt), "pos in range");
  ConINode* pos = ConINode::make((int)position);
  switch (bt) {
  case T_BOOLEAN:
    return new ExtractUBNode(v, pos);
  case T_BYTE:
    return new ExtractBNode(v, pos);
  case T_CHAR:
    return new ExtractCNode(v, pos);
  case T_SHORT:
    return new ExtractSNode(v, pos);
  case T_INT:
    return new ExtractINode(v, pos);
  case T_LONG:
    return new ExtractLNode(v, pos);
  case T_FLOAT:
    return new ExtractFNode(v, pos);
  case T_DOUBLE:
    return new ExtractDNode(v, pos);
  case T_BOOLEAN: return new ExtractUBNode(v, pos);
  case T_BYTE:    return new ExtractBNode(v, pos);
  case T_CHAR:    return new ExtractCNode(v, pos);
  case T_SHORT:   return new ExtractSNode(v, pos);
  case T_INT:     return new ExtractINode(v, pos);
  case T_LONG:    return new ExtractLNode(v, pos);
  case T_FLOAT:   return new ExtractFNode(v, pos);
  case T_DOUBLE:  return new ExtractDNode(v, pos);
  default:
    fatal("Type '%s' is not supported for vectors", type2name(bt));
    assert(false, "wrong type: %s", type2name(bt));
    return NULL;
  }
}
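ExtractNode::make backs single-lane reads such as Vector.lane(i). A small illustrative sketch (not part of this diff; species and data are arbitrary):

    import jdk.incubator.vector.IntVector;

    public class LaneDemo {
        public static void main(String[] args) {
            int[] data = {1, 2, 3, 4, 5, 6, 7, 8};
            IntVector v = IntVector.fromArray(IntVector.SPECIES_256, data, 0);
            // v.lane(i) becomes an ExtractI node when intrinsified.
            System.out.println(v.lane(3));   // prints 4
        }
    }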
@ -708,8 +798,16 @@ int ReductionNode::opcode(int opc, BasicType bt) {
  int vopc = opc;
  switch (opc) {
  case Op_AddI:
    assert(bt == T_INT, "must be");
    vopc = Op_AddReductionVI;
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_AddReductionVI;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_AddL:
    assert(bt == T_LONG, "must be");
@ -724,8 +822,16 @@ int ReductionNode::opcode(int opc, BasicType bt) {
    vopc = Op_AddReductionVD;
    break;
  case Op_MulI:
    assert(bt == T_INT, "must be");
    vopc = Op_MulReductionVI;
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_MulReductionVI;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_MulL:
    assert(bt == T_LONG, "must be");
@ -739,6 +845,22 @@ int ReductionNode::opcode(int opc, BasicType bt) {
    assert(bt == T_DOUBLE, "must be");
    vopc = Op_MulReductionVD;
    break;
  case Op_MinI:
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_MinReductionV;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_MinL:
    assert(bt == T_LONG, "must be");
    vopc = Op_MinReductionV;
    break;
  case Op_MinF:
    assert(bt == T_FLOAT, "must be");
    vopc = Op_MinReductionV;
@ -747,6 +869,22 @@ int ReductionNode::opcode(int opc, BasicType bt) {
    assert(bt == T_DOUBLE, "must be");
    vopc = Op_MinReductionV;
    break;
  case Op_MaxI:
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_MaxReductionV;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_MaxL:
    assert(bt == T_LONG, "must be");
    vopc = Op_MaxReductionV;
    break;
  case Op_MaxF:
    assert(bt == T_FLOAT, "must be");
    vopc = Op_MaxReductionV;
@ -756,24 +894,48 @@ int ReductionNode::opcode(int opc, BasicType bt) {
    vopc = Op_MaxReductionV;
    break;
  case Op_AndI:
    assert(bt == T_INT, "must be");
    vopc = Op_AndReductionV;
    switch (bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_AndReductionV;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_AndL:
    assert(bt == T_LONG, "must be");
    vopc = Op_AndReductionV;
    break;
  case Op_OrI:
    assert(bt == T_INT, "must be");
    vopc = Op_OrReductionV;
    switch(bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_OrReductionV;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_OrL:
    assert(bt == T_LONG, "must be");
    vopc = Op_OrReductionV;
    break;
  case Op_XorI:
    assert(bt == T_INT, "must be");
    vopc = Op_XorReductionV;
    switch(bt) {
    case T_BOOLEAN:
    case T_CHAR: return 0;
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      vopc = Op_XorReductionV;
      break;
    default: ShouldNotReachHere(); return 0;
    }
    break;
  case Op_XorL:
    assert(bt == T_LONG, "must be");
@ -808,11 +970,116 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
  case Op_OrReductionV: return new OrReductionVNode(ctrl, n1, n2);
  case Op_XorReductionV: return new XorReductionVNode(ctrl, n1, n2);
  default:
    fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
    assert(false, "unknown node: %s", NodeClassNames[vopc]);
    return NULL;
  }
}

VectorStoreMaskNode* VectorStoreMaskNode::make(PhaseGVN& gvn, Node* in, BasicType in_type, uint num_elem) {
  assert(in->bottom_type()->isa_vect(), "sanity");
  const TypeVect* vt = TypeVect::make(T_BOOLEAN, num_elem);
  int elem_size = type2aelembytes(in_type);
  return new VectorStoreMaskNode(in, gvn.intcon(elem_size), vt);
}

VectorCastNode* VectorCastNode::make(int vopc, Node* n1, BasicType bt, uint vlen) {
  const TypeVect* vt = TypeVect::make(bt, vlen);
  switch (vopc) {
  case Op_VectorCastB2X: return new VectorCastB2XNode(n1, vt);
  case Op_VectorCastS2X: return new VectorCastS2XNode(n1, vt);
  case Op_VectorCastI2X: return new VectorCastI2XNode(n1, vt);
  case Op_VectorCastL2X: return new VectorCastL2XNode(n1, vt);
  case Op_VectorCastF2X: return new VectorCastF2XNode(n1, vt);
  case Op_VectorCastD2X: return new VectorCastD2XNode(n1, vt);
  default:
    assert(false, "unknown node: %s", NodeClassNames[vopc]);
    return NULL;
  }
}

int VectorCastNode::opcode(BasicType bt) {
  switch (bt) {
  case T_BYTE:   return Op_VectorCastB2X;
  case T_SHORT:  return Op_VectorCastS2X;
  case T_INT:    return Op_VectorCastI2X;
  case T_LONG:   return Op_VectorCastL2X;
  case T_FLOAT:  return Op_VectorCastF2X;
  case T_DOUBLE: return Op_VectorCastD2X;
  default:
    assert(false, "unknown type: %s", type2name(bt));
    return 0;
  }
}

Node* ReductionNode::make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt) {
  int vopc = opcode(opc, bt);
  guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]);

  switch (vopc) {
  case Op_AndReductionV:
    switch (bt) {
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      return gvn.makecon(TypeInt::MINUS_1);
    case T_LONG:
      return gvn.makecon(TypeLong::MINUS_1);
    default:
      fatal("Missed vector creation for '%s' as the basic type is not correct.", NodeClassNames[vopc]);
      return NULL;
    }
    break;
  case Op_AddReductionVI: // fallthrough
  case Op_AddReductionVL: // fallthrough
  case Op_AddReductionVF: // fallthrough
  case Op_AddReductionVD:
  case Op_OrReductionV:
  case Op_XorReductionV:
    return gvn.zerocon(bt);
  case Op_MulReductionVI:
    return gvn.makecon(TypeInt::ONE);
  case Op_MulReductionVL:
    return gvn.makecon(TypeLong::ONE);
  case Op_MulReductionVF:
    return gvn.makecon(TypeF::ONE);
  case Op_MulReductionVD:
    return gvn.makecon(TypeD::ONE);
  case Op_MinReductionV:
    switch (bt) {
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      return gvn.makecon(TypeInt::MAX);
    case T_LONG:
      return gvn.makecon(TypeLong::MAX);
    case T_FLOAT:
      return gvn.makecon(TypeF::POS_INF);
    case T_DOUBLE:
      return gvn.makecon(TypeD::POS_INF);
    default: Unimplemented(); return NULL;
    }
    break;
  case Op_MaxReductionV:
    switch (bt) {
    case T_BYTE:
    case T_SHORT:
    case T_INT:
      return gvn.makecon(TypeInt::MIN);
    case T_LONG:
      return gvn.makecon(TypeLong::MIN);
    case T_FLOAT:
      return gvn.makecon(TypeF::NEG_INF);
    case T_DOUBLE:
      return gvn.makecon(TypeD::NEG_INF);
    default: Unimplemented(); return NULL;
    }
    break;
  default:
    fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
    return NULL;
  }
}
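make_reduction_input supplies the identity element for each reduction (0 for add/or/xor, 1 for mul, all-ones for and, the type's extreme values or infinities for min/max), so seeding a reduction with it never changes the result. A hedged Java sketch of a user-level reduction this feeds (illustrative only, not part of this diff):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    public class ReduceDemo {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;

        // Min-reduction seeded with the MIN identity, Integer.MAX_VALUE,
        // mirroring the Op_MinReductionV case above; scalar tail for the rest.
        static int minAll(int[] a) {
            int min = Integer.MAX_VALUE;
            int i = 0;
            for (; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
                min = Math.min(min, IntVector.fromArray(SPECIES, a, i)
                                             .reduceLanes(VectorOperators.MIN));
            }
            for (; i < a.length; i++) {
                min = Math.min(min, a[i]);
            }
            return min;
        }
    }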
bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
  if (is_java_primitive(bt) &&
      (vlen > 1) && is_power_of_2(vlen) &&
@ -824,7 +1091,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
}

MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,
                                       uint truth_table, const TypeVect* vt) {
  assert(truth_table <= 0xFF, "invalid");
  assert(in1->bottom_type()->is_vect()->length_in_bytes() == vt->length_in_bytes(), "mismatch");
  assert(in2->bottom_type()->is_vect()->length_in_bytes() == vt->length_in_bytes(), "mismatch");
@ -895,3 +1162,51 @@ Node* RotateRightVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  return NULL;
}

#ifndef PRODUCT
void VectorMaskCmpNode::dump_spec(outputStream *st) const {
  st->print(" %d #", _predicate); _type->dump_on(st);
}
#endif // PRODUCT

Node* VectorReinterpretNode::Identity(PhaseGVN *phase) {
  Node* n = in(1);
  if (n->Opcode() == Op_VectorReinterpret) {
    if (Type::cmp(bottom_type(), n->in(1)->bottom_type()) == 0) {
      return n->in(1);
    }
  }
  return this;
}

Node* VectorInsertNode::make(Node* vec, Node* new_val, int position) {
  assert(position < (int)vec->bottom_type()->is_vect()->length(), "pos in range");
  ConINode* pos = ConINode::make(position);
  return new VectorInsertNode(vec, new_val, pos, vec->bottom_type()->is_vect());
}

Node* VectorUnboxNode::Identity(PhaseGVN *phase) {
  Node* n = obj()->uncast();
  if (EnableVectorReboxing && n->Opcode() == Op_VectorBox) {
    if (Type::cmp(bottom_type(), n->in(VectorBoxNode::Value)->bottom_type()) == 0) {
      return n->in(VectorBoxNode::Value);
    }
  }
  return this;
}

const TypeFunc* VectorBoxNode::vec_box_type(const TypeInstPtr* box_type) {
  const Type** fields = TypeTuple::fields(0);
  const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms, fields);

  fields = TypeTuple::fields(1);
  fields[TypeFunc::Parms+0] = box_type;
  const TypeTuple *range = TypeTuple::make(TypeFunc::Parms+1, fields);

  return TypeFunc::make(domain, range);
}

#ifndef PRODUCT
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
  CallStaticJavaNode::dump_spec(st);
}
#endif // !PRODUCT
@ -24,6 +24,7 @@
#ifndef SHARE_OPTO_VECTORNODE_HPP
#define SHARE_OPTO_VECTORNODE_HPP

#include "opto/callnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/node.hpp"
@ -68,13 +69,17 @@ class VectorNode : public TypeNode {
  virtual uint ideal_reg() const { return Matcher::vector_ideal_reg(vect_type()->length_in_bytes()); }

  static VectorNode* scalar2vector(Node* s, uint vlen, const Type* opd_t);
  static VectorNode* shift_count(Node* shift, Node* cnt, uint vlen, BasicType bt);
  static VectorNode* shift_count(int opc, Node* cnt, uint vlen, BasicType bt);
  static VectorNode* make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt);
  static VectorNode* make(int vopc, Node* n1, Node* n2, const TypeVect* vt);
  static VectorNode* make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt);
  static VectorNode* make(int vopc, Node* n1, Node* n2, Node* n3, const TypeVect* vt);

  static int opcode(int opc, BasicType bt);
  static int replicate_opcode(BasicType bt);
  static bool implemented(int opc, uint vlen, BasicType bt);
  static bool is_shift(Node* n);
  static bool is_vshift_cnt(Node* n);
  static bool is_type_transition_short_to_int(Node* n);
  static bool is_type_transition_to_int(Node* n);
  static bool is_muladds2i(Node* n);
@ -160,9 +165,10 @@ class ReductionNode : public Node {
  static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
  static int opcode(int opc, BasicType bt);
  static bool implemented(int opc, uint vlen, BasicType bt);
  static Node* make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt);

  virtual const Type* bottom_type() const {
    BasicType vbt = in(2)->bottom_type()->is_vect()->element_basic_type();
    BasicType vbt = in(1)->bottom_type()->basic_type();
    return Type::get_const_basic_type(vbt);
  }

@ -172,13 +178,11 @@
};

//------------------------------AddReductionVINode--------------------------------------
// Vector add int as a reduction
// Vector add byte, short and int as a reduction
class AddReductionVINode : public ReductionNode {
 public:
  AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return TypeInt::INT; }
  virtual uint ideal_reg() const { return Op_RegI; }
};

//------------------------------AddReductionVLNode--------------------------------------
@ -187,8 +191,6 @@ class AddReductionVLNode : public ReductionNode {
 public:
  AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return TypeLong::LONG; }
  virtual uint ideal_reg() const { return Op_RegL; }
};

//------------------------------AddReductionVFNode--------------------------------------
@ -197,8 +199,6 @@ class AddReductionVFNode : public ReductionNode {
 public:
  AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return Type::FLOAT; }
  virtual uint ideal_reg() const { return Op_RegF; }
};

//------------------------------AddReductionVDNode--------------------------------------
@ -207,8 +207,6 @@ class AddReductionVDNode : public ReductionNode {
 public:
  AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return Type::DOUBLE; }
  virtual uint ideal_reg() const { return Op_RegD; }
};

//------------------------------SubVBNode--------------------------------------
@ -348,13 +346,11 @@ public:
};

//------------------------------MulReductionVINode--------------------------------------
// Vector multiply int as a reduction
// Vector multiply byte, short and int as a reduction
class MulReductionVINode : public ReductionNode {
 public:
  MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return TypeInt::INT; }
  virtual uint ideal_reg() const { return Op_RegI; }
};

//------------------------------MulReductionVLNode--------------------------------------
@ -363,8 +359,6 @@ class MulReductionVLNode : public ReductionNode {
 public:
  MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return TypeLong::LONG; }
  virtual uint ideal_reg() const { return Op_RegI; }
};

//------------------------------MulReductionVFNode--------------------------------------
@ -373,8 +367,6 @@ class MulReductionVFNode : public ReductionNode {
 public:
  MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return Type::FLOAT; }
  virtual uint ideal_reg() const { return Op_RegF; }
};

//------------------------------MulReductionVDNode--------------------------------------
@ -383,8 +375,6 @@ class MulReductionVDNode : public ReductionNode {
 public:
  MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
  virtual const Type* bottom_type() const { return Type::DOUBLE; }
  virtual uint ideal_reg() const { return Op_RegD; }
};

//------------------------------DivVFNode--------------------------------------
@ -419,10 +409,26 @@ public:
  virtual int Opcode() const;
};

//------------------------------MinVNode--------------------------------------
// Vector Min
class MinVNode : public VectorNode {
 public:
  MinVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
  virtual int Opcode() const;
};

//------------------------------MaxVNode--------------------------------------
// Vector Max
class MaxVNode : public VectorNode {
 public:
  MaxVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
  virtual int Opcode() const;
};

//------------------------------AbsVINode--------------------------------------
// Vector Abs int
class AbsVINode : public VectorNode {
 public:
  AbsVINode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {}
  virtual int Opcode() const;
};
@ -451,6 +457,14 @@ class AbsVDNode : public VectorNode {
  virtual int Opcode() const;
};

//------------------------------NegVINode--------------------------------------
// Vector Neg int
class NegVINode : public VectorNode {
 public:
  NegVINode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {}
  virtual int Opcode() const;
};

//------------------------------NegVFNode--------------------------------------
// Vector Neg float
class NegVFNode : public VectorNode {
@ -618,14 +632,38 @@ class AndVNode : public VectorNode {
  virtual int Opcode() const;
};
//------------------------------AndReductionVNode--------------------------------------
// Vector and byte, short, int, long as a reduction
class AndReductionVNode : public ReductionNode {
 public:
  AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------OrVNode---------------------------------------
// Vector or integer
// Vector or byte, short, int, long
class OrVNode : public VectorNode {
 public:
  OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
  virtual int Opcode() const;
};

//------------------------------OrReductionVNode--------------------------------------
// Vector or byte, short, int, long as a reduction
class OrReductionVNode : public ReductionNode {
 public:
  OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------XorReductionVNode--------------------------------------
// Vector xor byte, short, int, long as a reduction
class XorReductionVNode : public ReductionNode {
 public:
  XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------XorVNode---------------------------------------
// Vector xor integer
class XorVNode : public VectorNode {
@ -634,48 +672,8 @@ class XorVNode : public VectorNode {
  virtual int Opcode() const;
};

//------------------------------AndReductionVNode--------------------------------------
// Vector and int, long as a reduction
class AndReductionVNode : public ReductionNode {
 public:
  AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------OrReductionVNode--------------------------------------
// Vector or int, long as a reduction
class OrReductionVNode : public ReductionNode {
 public:
  OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------XorReductionVNode--------------------------------------
// Vector xor int, long as a reduction
class XorReductionVNode : public ReductionNode {
 public:
  XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
  virtual int Opcode() const;
};

//------------------------------MinVNode--------------------------------------
// Vector min
class MinVNode : public VectorNode {
 public:
  MinVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
  virtual int Opcode() const;
};

//------------------------------MaxVNode--------------------------------------
// Vector max
class MaxVNode : public VectorNode {
 public:
  MaxVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
  virtual int Opcode() const;
};

//------------------------------MinReductionVNode--------------------------------------
// Vector min as a reduction
// Vector min byte, short, int, long, float, double as a reduction
class MinReductionVNode : public ReductionNode {
 public:
  MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
@ -683,7 +681,7 @@ public:
};

//------------------------------MaxReductionVNode--------------------------------------
// Vector max as a reduction
// Vector max byte, short, int, long, float, double as a reduction
class MaxReductionVNode : public ReductionNode {
 public:
  MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
@ -720,13 +718,28 @@ class LoadVectorNode : public LoadNode {
  uint element_size(void) { return type2aelembytes(vect_type()->element_basic_type()); }
};

//------------------------------LoadVectorGatherNode------------------------------
// Load Vector from memory via index map
class LoadVectorGatherNode : public LoadVectorNode {
 public:
  LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices)
    : LoadVectorNode(c, mem, adr, at, vt) {
    init_class_id(Class_LoadVectorGather);
    assert(indices->bottom_type()->is_vect(), "indices must be in vector");
    add_req(indices);
    assert(req() == MemNode::ValueIn + 1, "match_edge expects that last input is in MemNode::ValueIn");
  }

  virtual int Opcode() const;
  virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
};

//------------------------------StoreVectorNode--------------------------------
// Store Vector to memory
class StoreVectorNode : public StoreNode {
 public:
  StoreVectorNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
    : StoreNode(c, mem, adr, at, val, MemNode::unordered) {
    assert(val->is_Vector() || val->is_LoadVector(), "sanity");
    init_class_id(Class_StoreVector);
    set_mismatched_access();
  }
@ -747,6 +760,23 @@ class StoreVectorNode : public StoreNode {
  uint element_size(void) { return type2aelembytes(vect_type()->element_basic_type()); }
};

//------------------------------StoreVectorScatterNode------------------------------
// Store Vector into memory via index map

class StoreVectorScatterNode : public StoreVectorNode {
 public:
  StoreVectorScatterNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val, Node* indices)
    : StoreVectorNode(c, mem, adr, at, val) {
    init_class_id(Class_StoreVectorScatter);
    assert(indices->bottom_type()->is_vect(), "indices must be in vector");
    add_req(indices);
    assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1");
  }
  virtual int Opcode() const;
  virtual uint match_edge(uint idx) const { return idx == MemNode::Address ||
                                                   idx == MemNode::ValueIn ||
                                                   idx == MemNode::ValueIn + 1; }
};

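The gather/scatter nodes keep the index map as a trailing input so match_edge can treat it like a value operand. An illustrative Java-level gather and scatter that these nodes implement (not part of this diff; index values arbitrary):

    import jdk.incubator.vector.IntVector;
    import jdk.incubator.vector.VectorSpecies;

    public class GatherDemo {
        static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_256;

        public static void main(String[] args) {
            int[] table = {10, 20, 30, 40, 50, 60, 70, 80};
            int[] index = { 7,  0,  3,  3,  1,  6,  2,  5};
            // Gather: lane i loads table[0 + index[i]]; the store below is
            // the scatter dual through the same index map.
            IntVector v = IntVector.fromArray(SPECIES, table, 0, index, 0);
            v.intoArray(table, 0, index, 0);
            System.out.println(v);
        }
    }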
//=========================Promote_Scalar_to_Vector============================

@ -888,6 +918,12 @@ class Pack2DNode : public PackNode {
};


class VectorLoadConstNode : public VectorNode {
 public:
  VectorLoadConstNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
  virtual int Opcode() const;
};

//========================Extract_Scalar_from_Vector===========================

//------------------------------ExtractNode------------------------------------
@ -901,6 +937,7 @@ class ExtractNode : public Node {
  uint pos() const { return in(2)->get_int(); }

  static Node* make(Node* v, uint position, BasicType bt);
  static int opcode(BasicType bt);
};

//------------------------------ExtractBNode-----------------------------------
@ -929,7 +966,7 @@ class ExtractCNode : public ExtractNode {
 public:
  ExtractCNode(Node* src, ConINode* pos) : ExtractNode(src, pos) {}
  virtual int Opcode() const;
  virtual const Type *bottom_type() const { return TypeInt::INT; }
  virtual const Type *bottom_type() const { return TypeInt::CHAR; }
  virtual uint ideal_reg() const { return Op_RegI; }
};

@ -939,7 +976,7 @@ class ExtractSNode : public ExtractNode {
 public:
  ExtractSNode(Node* src, ConINode* pos) : ExtractNode(src, pos) {}
  virtual int Opcode() const;
  virtual const Type *bottom_type() const { return TypeInt::INT; }
  virtual const Type *bottom_type() const { return TypeInt::SHORT; }
  virtual uint ideal_reg() const { return Op_RegI; }
};

@ -1007,6 +1044,286 @@ public:
  static MacroLogicVNode* make(PhaseGVN& igvn, Node* in1, Node* in2, Node* in3, uint truth_table, const TypeVect* vt);
};

class VectorMaskCmpNode : public VectorNode {
 private:
  BoolTest::mask _predicate;

 protected:
  uint size_of() const { return sizeof(*this); }

 public:
  VectorMaskCmpNode(BoolTest::mask predicate, Node* in1, Node* in2, ConINode* predicate_node, const TypeVect* vt) :
      VectorNode(in1, in2, predicate_node, vt),
      _predicate(predicate) {
    assert(in1->bottom_type()->is_vect()->element_basic_type() == in2->bottom_type()->is_vect()->element_basic_type(),
           "VectorMaskCmp inputs must have same type for elements");
    assert(in1->bottom_type()->is_vect()->length() == in2->bottom_type()->is_vect()->length(),
           "VectorMaskCmp inputs must have same number of elements");
    init_class_id(Class_VectorMaskCmp);
  }

  virtual int Opcode() const;
  virtual uint hash() const { return VectorNode::hash() + _predicate; }
  virtual bool cmp( const Node &n ) const {
    return VectorNode::cmp(n) && _predicate == ((VectorMaskCmpNode&)n)._predicate;
  }
  BoolTest::mask get_predicate() { return _predicate; }
#ifndef PRODUCT
  virtual void dump_spec(outputStream *st) const;
#endif // !PRODUCT
};

// Used to wrap other vector nodes in order to add masking functionality.
class VectorMaskWrapperNode : public VectorNode {
 public:
  VectorMaskWrapperNode(Node* vector, Node* mask)
    : VectorNode(vector, mask, vector->bottom_type()->is_vect()) {
    assert(mask->is_VectorMaskCmp(), "VectorMaskWrapper requires that second argument be a mask");
  }

  virtual int Opcode() const;
  Node* vector_val()  const { return in(1); }
  Node* vector_mask() const { return in(2); }
};

class VectorTestNode : public Node {
 private:
  BoolTest::mask _predicate;

 protected:
  uint size_of() const { return sizeof(*this); }

 public:
  VectorTestNode( Node *in1, Node *in2, BoolTest::mask predicate) : Node(NULL, in1, in2), _predicate(predicate) {
    assert(in1->is_Vector() || in1->is_LoadVector(), "must be vector");
    assert(in2->is_Vector() || in2->is_LoadVector(), "must be vector");
    assert(in1->bottom_type()->is_vect()->element_basic_type() == in2->bottom_type()->is_vect()->element_basic_type(),
           "same type elements are needed");
    assert(in1->bottom_type()->is_vect()->length() == in2->bottom_type()->is_vect()->length(),
           "same number of elements is needed");
  }
  virtual int Opcode() const;
  virtual uint hash() const { return Node::hash() + _predicate; }
  virtual bool cmp( const Node &n ) const {
    return Node::cmp(n) && _predicate == ((VectorTestNode&)n)._predicate;
  }
  virtual const Type *bottom_type() const { return TypeInt::BOOL; }
  virtual uint ideal_reg() const { return Op_RegI; }  // TODO Should be RegFlags but due to missing comparison flags for BoolTest
                                                      // in middle-end, we make it boolean result directly.
  BoolTest::mask get_predicate() const { return _predicate; }
};

class VectorBlendNode : public VectorNode {
 public:
  VectorBlendNode(Node* vec1, Node* vec2, Node* mask)
    : VectorNode(vec1, vec2, mask, vec1->bottom_type()->is_vect()) {
    // assert(mask->is_VectorMask(), "VectorBlendNode requires that third argument be a mask");
  }

  virtual int Opcode() const;
  Node* vec1() const { return in(1); }
  Node* vec2() const { return in(2); }
  Node* vec_mask() const { return in(3); }
};

class VectorRearrangeNode : public VectorNode {
 public:
  VectorRearrangeNode(Node* vec1, Node* shuffle)
    : VectorNode(vec1, shuffle, vec1->bottom_type()->is_vect()) {
    // assert(mask->is_VectorMask(), "VectorBlendNode requires that third argument be a mask");
  }

  virtual int Opcode() const;
  Node* vec1() const { return in(1); }
  Node* vec_shuffle() const { return in(2); }
};

class VectorLoadMaskNode : public VectorNode {
|
||||
public:
|
||||
VectorLoadMaskNode(Node* in, const TypeVect* vt)
|
||||
: VectorNode(in, vt) {
|
||||
assert(in->is_LoadVector(), "expected load vector");
|
||||
assert(in->as_LoadVector()->vect_type()->element_basic_type() == T_BOOLEAN, "must be boolean");
|
||||
}
|
||||
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorLoadShuffleNode : public VectorNode {
|
||||
public:
|
||||
VectorLoadShuffleNode(Node* in, const TypeVect* vt)
|
||||
: VectorNode(in, vt) {
|
||||
assert(in->is_LoadVector(), "expected load vector");
|
||||
assert(in->as_LoadVector()->vect_type()->element_basic_type() == T_BYTE, "must be BYTE");
|
||||
}
|
||||
|
||||
int GetOutShuffleSize() const { return type2aelembytes(vect_type()->element_basic_type()); }
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorStoreMaskNode : public VectorNode {
|
||||
protected:
|
||||
VectorStoreMaskNode(Node* in1, ConINode* in2, const TypeVect* vt)
|
||||
: VectorNode(in1, in2, vt) { }
|
||||
|
||||
public:
|
||||
virtual int Opcode() const;
|
||||
|
||||
static VectorStoreMaskNode* make(PhaseGVN& gvn, Node* in, BasicType in_type, uint num_elem);
|
||||
};
|
||||
|
||||
// This is intended for use as a simple reinterpret node that has no cast.
|
||||
class VectorReinterpretNode : public VectorNode {
|
||||
private:
|
||||
const TypeVect* _src_vt;
|
||||
protected:
|
||||
uint size_of() const { return sizeof(*this); }
|
||||
public:
|
||||
VectorReinterpretNode(Node* in, const TypeVect* src_vt, const TypeVect* dst_vt)
|
||||
: VectorNode(in, dst_vt), _src_vt(src_vt) { }
|
||||
|
||||
virtual uint hash() const { return VectorNode::hash() + _src_vt->hash(); }
|
||||
virtual bool cmp( const Node &n ) const {
|
||||
return VectorNode::cmp(n) && !Type::cmp(_src_vt,((VectorReinterpretNode&)n)._src_vt);
|
||||
}
|
||||
virtual Node *Identity(PhaseGVN *phase);
|
||||
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastNode : public VectorNode {
|
||||
public:
|
||||
VectorCastNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {}
|
||||
virtual int Opcode() const;
|
||||
|
||||
static VectorCastNode* make(int vopc, Node* n1, BasicType bt, uint vlen);
|
||||
static int opcode(BasicType bt);
|
||||
static bool implemented(BasicType bt, uint vlen);
|
||||
};
|
||||
|
||||
class VectorCastB2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastB2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_BYTE, "must be byte");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastS2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastS2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_SHORT, "must be short");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastI2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastI2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_INT, "must be int");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastL2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastL2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_LONG, "must be long");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastF2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastF2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_FLOAT, "must be float");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorCastD2XNode : public VectorCastNode {
|
||||
public:
|
||||
VectorCastD2XNode(Node* in, const TypeVect* vt) : VectorCastNode(in, vt) {
|
||||
assert(in->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE, "must be double");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
class VectorInsertNode : public VectorNode {
|
||||
public:
|
||||
VectorInsertNode(Node* vsrc, Node* new_val, ConINode* pos, const TypeVect* vt) : VectorNode(vsrc, new_val, (Node*)pos, vt) {
|
||||
assert(pos->get_int() >= 0, "positive constants");
|
||||
assert(pos->get_int() < (int)vt->length(), "index must be less than vector length");
|
||||
assert(Type::cmp(vt, vsrc->bottom_type()) == 0, "input and output must be same type");
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
uint pos() const { return in(3)->get_int(); }
|
||||
|
||||
static Node* make(Node* vec, Node* new_val, int position);
|
||||
};
|
||||
|
||||
class VectorBoxNode : public Node {
|
||||
private:
|
||||
const TypeInstPtr* const _box_type;
|
||||
const TypeVect* const _vec_type;
|
||||
public:
|
||||
enum {
|
||||
Box = 1,
|
||||
Value = 2
|
||||
};
|
||||
VectorBoxNode(Compile* C, Node* box, Node* val,
|
||||
const TypeInstPtr* box_type, const TypeVect* vt)
|
||||
: Node(NULL, box, val), _box_type(box_type), _vec_type(vt) {
|
||||
init_flags(Flag_is_macro);
|
||||
C->add_macro_node(this);
|
||||
}
|
||||
|
||||
const TypeInstPtr* box_type() const { assert(_box_type != NULL, ""); return _box_type; };
|
||||
const TypeVect* vec_type() const { assert(_vec_type != NULL, ""); return _vec_type; };
|
||||
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return _box_type; }
|
||||
virtual uint ideal_reg() const { return box_type()->ideal_reg(); }
|
||||
virtual uint size_of() const { return sizeof(*this); }
|
||||
|
||||
static const TypeFunc* vec_box_type(const TypeInstPtr* box_type);
|
||||
};
|
||||
|
||||
class VectorBoxAllocateNode : public CallStaticJavaNode {
|
||||
public:
|
||||
VectorBoxAllocateNode(Compile* C, const TypeInstPtr* vbox_type)
|
||||
: CallStaticJavaNode(C, VectorBoxNode::vec_box_type(vbox_type), NULL, NULL, -1) {
|
||||
init_flags(Flag_is_macro);
|
||||
C->add_macro_node(this);
|
||||
}
|
||||
|
||||
virtual int Opcode() const;
|
||||
#ifndef PRODUCT
|
||||
virtual void dump_spec(outputStream *st) const;
|
||||
#endif // !PRODUCT
|
||||
};
|
||||
|
||||
class VectorUnboxNode : public VectorNode {
|
||||
private:
|
||||
bool _shuffle_to_vector;
|
||||
protected:
|
||||
uint size_of() const { return sizeof(*this); }
|
||||
public:
|
||||
VectorUnboxNode(Compile* C, const TypeVect* vec_type, Node* obj, Node* mem, bool shuffle_to_vector)
|
||||
: VectorNode(mem, obj, vec_type) {
|
||||
_shuffle_to_vector = shuffle_to_vector;
|
||||
init_flags(Flag_is_macro);
|
||||
C->add_macro_node(this);
|
||||
}
|
||||
|
||||
virtual int Opcode() const;
|
||||
Node* obj() const { return in(2); }
|
||||
Node* mem() const { return in(1); }
|
||||
virtual Node *Identity(PhaseGVN *phase);
|
||||
bool is_shuffle_to_vector() { return _shuffle_to_vector; }
|
||||
};
|
||||
|
||||
class RotateRightVNode : public VectorNode {
|
||||
public:
|
||||
RotateRightVNode(Node* in1, Node* in2, const TypeVect* vt)
|
||||
|
||||
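The node classes above are the C2 IR counterparts of the user-visible mask, compare, blend and rearrange operations. A hedged end-to-end sketch using the incubator API these nodes serve (class name BlendDemo is illustrative; run with --add-modules jdk.incubator.vector):

import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class BlendDemo {
    static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_256;

    public static void main(String[] args) {
        FloatVector a = FloatVector.broadcast(SPECIES, 1.0f);
        FloatVector b = FloatVector.broadcast(SPECIES, 2.0f);
        VectorMask<Float> m = a.compare(VectorOperators.LT, b); // lowers to VectorMaskCmp
        FloatVector c = a.blend(b, m);                          // lowers to VectorBlend
        System.out.println(c);                                  // all lanes 2.0
    }
}
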
@ -118,6 +118,7 @@ extern "C" {
  void JNICALL JVM_RegisterMethodHandleMethods(JNIEnv *env, jclass unsafecls);
  void JNICALL JVM_RegisterPerfMethods(JNIEnv *env, jclass perfclass);
  void JNICALL JVM_RegisterWhiteBoxMethods(JNIEnv *env, jclass wbclass);
  void JNICALL JVM_RegisterVectorSupportMethods(JNIEnv *env, jclass vsclass);
#if INCLUDE_JVMCI
  jobject JNICALL JVM_GetJVMCIRuntime(JNIEnv *env, jclass c);
  void JNICALL JVM_RegisterJVMCINatives(JNIEnv *env, jclass compilerToVMClass);
@ -132,6 +133,7 @@ static JNINativeMethod lookup_special_native_methods[] = {
  { CC"Java_java_lang_invoke_MethodHandleNatives_registerNatives", NULL, FN_PTR(JVM_RegisterMethodHandleMethods) },
  { CC"Java_jdk_internal_perf_Perf_registerNatives",               NULL, FN_PTR(JVM_RegisterPerfMethods)          },
  { CC"Java_sun_hotspot_WhiteBox_registerNatives",                 NULL, FN_PTR(JVM_RegisterWhiteBoxMethods)      },
  { CC"Java_jdk_internal_vm_vector_VectorSupport_registerNatives", NULL, FN_PTR(JVM_RegisterVectorSupportMethods) },
#if INCLUDE_JVMCI
  { CC"Java_jdk_vm_ci_runtime_JVMCI_initializeRuntime",            NULL, FN_PTR(JVM_GetJVMCIRuntime)              },
  { CC"Java_jdk_vm_ci_hotspot_CompilerToVM_registerNatives",       NULL, FN_PTR(JVM_RegisterJVMCINatives)         },

src/hotspot/share/prims/vectorSupport.cpp (new file, 429 lines)
@ -0,0 +1,429 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "jni.h"
#include "jvm.h"
#include "classfile/javaClasses.inline.hpp"
#include "code/location.hpp"
#include "prims/vectorSupport.hpp"
#include "runtime/fieldDescriptor.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/stackValue.hpp"

#ifdef COMPILER2
#include "opto/matcher.hpp" // Matcher::max_vector_size(BasicType)
#endif // COMPILER2

bool VectorSupport::is_vector(Klass* klass) {
  return klass->is_subclass_of(SystemDictionary::vector_VectorPayload_klass());
}

bool VectorSupport::is_vector_mask(Klass* klass) {
  return klass->is_subclass_of(SystemDictionary::vector_VectorMask_klass());
}

bool VectorSupport::is_vector_shuffle(Klass* klass) {
  return klass->is_subclass_of(SystemDictionary::vector_VectorShuffle_klass());
}

BasicType VectorSupport::klass2bt(InstanceKlass* ik) {
  assert(ik->is_subclass_of(SystemDictionary::vector_VectorPayload_klass()), "%s not a VectorPayload", ik->name()->as_C_string());
  fieldDescriptor fd; // find_field initializes fd if found
  // static final Class<?> ETYPE;
  Klass* holder = ik->find_field(vmSymbols::ETYPE_name(), vmSymbols::class_signature(), &fd);

  assert(holder != NULL, "sanity");
  assert(fd.is_static(), "");
  assert(fd.offset() > 0, "");

  if (is_vector_shuffle(ik)) {
    return T_BYTE;
  } else { // vector and mask
    oop value = ik->java_mirror()->obj_field(fd.offset());
    BasicType elem_bt = java_lang_Class::as_BasicType(value);
    return elem_bt;
  }
}

jint VectorSupport::klass2length(InstanceKlass* ik) {
  fieldDescriptor fd; // find_field initializes fd if found
  // static final int VLENGTH;
  Klass* holder = ik->find_field(vmSymbols::VLENGTH_name(), vmSymbols::int_signature(), &fd);

  assert(holder != NULL, "sanity");
  assert(fd.is_static(), "");
  assert(fd.offset() > 0, "");

  jint vlen = ik->java_mirror()->int_field(fd.offset());
  assert(vlen > 0, "");
  return vlen;
}

void VectorSupport::init_vector_array(typeArrayOop arr, BasicType elem_bt, int num_elem, address value_addr) {
  int elem_size = type2aelembytes(elem_bt);
  for (int i = 0; i < num_elem; i++) {
    switch (elem_bt) {
      case T_BYTE: {
        jbyte elem_value = *(jbyte*) (value_addr + i * elem_size);
        arr->byte_at_put(i, elem_value);
        break;
      }
      case T_SHORT: {
        jshort elem_value = *(jshort*) (value_addr + i * elem_size);
        arr->short_at_put(i, elem_value);
        break;
      }
      case T_INT: {
        jint elem_value = *(jint*) (value_addr + i * elem_size);
        arr->int_at_put(i, elem_value);
        break;
      }
      case T_LONG: {
        jlong elem_value = *(jlong*) (value_addr + i * elem_size);
        arr->long_at_put(i, elem_value);
        break;
      }
      case T_FLOAT: {
        jfloat elem_value = *(jfloat*) (value_addr + i * elem_size);
        arr->float_at_put(i, elem_value);
        break;
      }
      case T_DOUBLE: {
        jdouble elem_value = *(jdouble*) (value_addr + i * elem_size);
        arr->double_at_put(i, elem_value);
        break;
      }
      default:
        fatal("unsupported: %s", type2name(elem_bt));
    }
  }
}

void VectorSupport::init_mask_array(typeArrayOop arr, BasicType elem_bt, int num_elem, address value_addr) {
  int elem_size = type2aelembytes(elem_bt);

  for (int i = 0; i < num_elem; i++) {
    switch (elem_bt) {
      case T_BYTE: {
        jbyte elem_value = *(jbyte*) (value_addr + i * elem_size);
        arr->bool_at_put(i, elem_value != 0);
        break;
      }
      case T_SHORT: {
        jshort elem_value = *(jshort*) (value_addr + i * elem_size);
        arr->bool_at_put(i, elem_value != 0);
        break;
      }
      case T_INT:   // fall-through
      case T_FLOAT: {
        jint elem_value = *(jint*) (value_addr + i * elem_size);
        arr->bool_at_put(i, elem_value != 0);
        break;
      }
      case T_LONG:   // fall-through
      case T_DOUBLE: {
        jlong elem_value = *(jlong*) (value_addr + i * elem_size);
        arr->bool_at_put(i, elem_value != 0);
        break;
      }
      default:
        fatal("unsupported: %s", type2name(elem_bt));
    }
  }
}

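init_mask_array above fixes the boundary representation: a mask payload becomes a boolean[] in which any non-zero lane reads as true. A minimal plain-Java sketch of that convention (maskFromLanes is a hypothetical helper, not part of this change):

public class MaskLayoutSketch {
    // Mirrors init_mask_array: lane != 0 selects the lane.
    static boolean[] maskFromLanes(long[] lanes) {
        boolean[] mask = new boolean[lanes.length];
        for (int i = 0; i < lanes.length; i++) {
            mask[i] = lanes[i] != 0; // same test as elem_value != 0 above
        }
        return mask;
    }

    public static void main(String[] args) {
        boolean[] m = maskFromLanes(new long[] { -1, 0, 1 });
        System.out.println(m[0] + " " + m[1] + " " + m[2]); // true false true
    }
}
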
oop VectorSupport::allocate_vector_payload_helper(InstanceKlass* ik, BasicType elem_bt, int num_elem, address value_addr, TRAPS) {

  bool is_mask = is_vector_mask(ik);

  // On-heap vector values are represented as primitive arrays.
  TypeArrayKlass* tak = TypeArrayKlass::cast(Universe::typeArrayKlassObj(is_mask ? T_BOOLEAN : elem_bt));

  typeArrayOop arr = tak->allocate(num_elem, CHECK_NULL); // safepoint

  if (is_mask) {
    init_mask_array(arr, elem_bt, num_elem, value_addr);
  } else {
    init_vector_array(arr, elem_bt, num_elem, value_addr);
  }
  return arr;
}

oop VectorSupport::allocate_vector(InstanceKlass* ik, frame* fr, RegisterMap* reg_map, ObjectValue* ov, TRAPS) {
  assert(is_vector(ik), "%s not a vector", ik->name()->as_C_string());
  assert(ov->field_size() == 1, "%s not a vector", ik->name()->as_C_string());

  // Vector value in an aligned adjacent tuple (1, 2, 4, 8, or 16 slots).
  LocationValue* loc_value = ov->field_at(0)->as_LocationValue();

  BasicType elem_bt = klass2bt(ik);
  int num_elem = klass2length(ik);

  Handle vbox = ik->allocate_instance_handle(CHECK_NULL);

  Location loc = loc_value->location();

  oop payload = NULL;
  if (loc.type() == Location::vector) {
    address value_addr = loc.is_register()
        // Value was in a callee-save register
        ? reg_map->location(VMRegImpl::as_VMReg(loc.register_number()))
        // Else value was directly saved on the stack. The frame's original stack pointer,
        // before any extension by its callee (due to Compiler1 linkage on SPARC), must be used.
        : ((address)fr->unextended_sp()) + loc.stack_offset();
    payload = allocate_vector_payload_helper(ik, elem_bt, num_elem, value_addr, CHECK_NULL); // safepoint
  } else {
    // assert(false, "interesting");
    StackValue* value = StackValue::create_stack_value(fr, reg_map, loc_value);
    payload = value->get_obj()();
  }
  vector_VectorPayload::set_payload(vbox(), payload);
  return vbox();
}

#ifdef COMPILER2
int VectorSupport::vop2ideal(jint id, BasicType bt) {
  VectorOperation vop = (VectorOperation)id;
  switch (vop) {
    case VECTOR_OP_ADD: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_AddI;
        case T_LONG:   return Op_AddL;
        case T_FLOAT:  return Op_AddF;
        case T_DOUBLE: return Op_AddD;
        default: fatal("ADD: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_SUB: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_SubI;
        case T_LONG:   return Op_SubL;
        case T_FLOAT:  return Op_SubF;
        case T_DOUBLE: return Op_SubD;
        default: fatal("SUB: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_MUL: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_MulI;
        case T_LONG:   return Op_MulL;
        case T_FLOAT:  return Op_MulF;
        case T_DOUBLE: return Op_MulD;
        default: fatal("MUL: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_DIV: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_DivI;
        case T_LONG:   return Op_DivL;
        case T_FLOAT:  return Op_DivF;
        case T_DOUBLE: return Op_DivD;
        default: fatal("DIV: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_MIN: {
      switch (bt) {
        case T_BYTE:
        case T_SHORT:
        case T_INT:    return Op_MinI;
        case T_LONG:   return Op_MinL;
        case T_FLOAT:  return Op_MinF;
        case T_DOUBLE: return Op_MinD;
        default: fatal("MIN: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_MAX: {
      switch (bt) {
        case T_BYTE:
        case T_SHORT:
        case T_INT:    return Op_MaxI;
        case T_LONG:   return Op_MaxL;
        case T_FLOAT:  return Op_MaxF;
        case T_DOUBLE: return Op_MaxD;
        default: fatal("MAX: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_ABS: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_AbsI;
        case T_LONG:   return Op_AbsL;
        case T_FLOAT:  return Op_AbsF;
        case T_DOUBLE: return Op_AbsD;
        default: fatal("ABS: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_NEG: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_NegI;
        case T_FLOAT:  return Op_NegF;
        case T_DOUBLE: return Op_NegD;
        default: fatal("NEG: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_AND: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_AndI;
        case T_LONG:   return Op_AndL;
        default: fatal("AND: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_OR: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_OrI;
        case T_LONG:   return Op_OrL;
        default: fatal("OR: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_XOR: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_XorI;
        case T_LONG:   return Op_XorL;
        default: fatal("XOR: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_SQRT: {
      switch (bt) {
        case T_FLOAT:  return Op_SqrtF;
        case T_DOUBLE: return Op_SqrtD;
        default: fatal("SQRT: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_FMA: {
      switch (bt) {
        case T_FLOAT:  return Op_FmaF;
        case T_DOUBLE: return Op_FmaD;
        default: fatal("FMA: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_LSHIFT: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_LShiftI;
        case T_LONG:   return Op_LShiftL;
        default: fatal("LSHIFT: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_RSHIFT: {
      switch (bt) {
        case T_BYTE:   // fall-through
        case T_SHORT:  // fall-through
        case T_INT:    return Op_RShiftI;
        case T_LONG:   return Op_RShiftL;
        default: fatal("RSHIFT: %s", type2name(bt));
      }
      break;
    }
    case VECTOR_OP_URSHIFT: {
      switch (bt) {
        case T_BYTE:   return Op_URShiftB;
        case T_SHORT:  return Op_URShiftS;
        case T_INT:    return Op_URShiftI;
        case T_LONG:   return Op_URShiftL;
        default: fatal("URSHIFT: %s", type2name(bt));
      }
      break;
    }
    default: fatal("unknown op: %d", vop);
  }
  return 0; // Unimplemented
}
#endif // COMPILER2

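vop2ideal maps the VECTOR_OP_* ids passed down from Java (see jdk.internal.vm.vector.VectorSupport below) to one scalar ideal opcode per element type, with subword types sharing the int opcode. A rough Java-side mirror of the ADD row, for illustration only (idealOpForAdd is hypothetical):

public class Vop2IdealSketch {
    // One ideal opcode name per element type, as in vop2ideal's ADD case.
    static String idealOpForAdd(Class<?> elem) {
        if (elem == byte.class || elem == short.class || elem == int.class) return "AddI";
        if (elem == long.class)   return "AddL";
        if (elem == float.class)  return "AddF";
        if (elem == double.class) return "AddD";
        throw new IllegalArgumentException("ADD: " + elem);
    }

    public static void main(String[] args) {
        System.out.println(idealOpForAdd(short.class)); // AddI (subword ops widen to int)
    }
}
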
/**
 * Implementation of the jdk.internal.vm.vector.VectorSupport class
 */

JVM_ENTRY(jint, VectorSupport_GetMaxLaneCount(JNIEnv *env, jclass vsclazz, jobject clazz)) {
#ifdef COMPILER2
  oop mirror = JNIHandles::resolve_non_null(clazz);
  if (java_lang_Class::is_primitive(mirror)) {
    BasicType bt = java_lang_Class::primitive_type(mirror);
    return Matcher::max_vector_size(bt);
  }
#endif // COMPILER2
  return -1;
} JVM_END

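This entry backs the native getMaxLaneCount declared in the Java class below. A hedged caller-side sketch, noting that -1 signals a build without C2 or a non-primitive class (running it outside java.base would need --add-exports java.base/jdk.internal.vm.vector=ALL-UNNAMED):

public class LaneCountProbe {
    public static void main(String[] args) {
        // Non-positive results mean "no vectorizing JIT"; callers fall back to scalar code.
        int lanes = jdk.internal.vm.vector.VectorSupport.getMaxLaneCount(float.class);
        System.out.println(lanes > 0 ? lanes + " float lanes" : "no vector support");
    }
}
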
// JVM_RegisterVectorSupportMethods

#define LANG "Ljava/lang/"
#define CLS LANG "Class;"

#define CC (char*)  /*cast a literal from (const char*)*/
#define FN_PTR(f) CAST_FROM_FN_PTR(void*, &f)

static JNINativeMethod jdk_internal_vm_vector_VectorSupport_methods[] = {
    {CC "getMaxLaneCount", CC "(" CLS ")I", FN_PTR(VectorSupport_GetMaxLaneCount)}
};

#undef CC
#undef FN_PTR

#undef LANG
#undef CLS

// This function is exported, used by NativeLookup.

JVM_ENTRY(void, JVM_RegisterVectorSupportMethods(JNIEnv* env, jclass vsclass)) {
  ThreadToNativeFromVM ttnfv(thread);

  int ok = env->RegisterNatives(vsclass, jdk_internal_vm_vector_VectorSupport_methods, sizeof(jdk_internal_vm_vector_VectorSupport_methods)/sizeof(JNINativeMethod));
  guarantee(ok == 0, "register jdk.internal.vm.vector.VectorSupport natives");
} JVM_END

src/hotspot/share/prims/vectorSupport.hpp (new file, 90 lines)
@ -0,0 +1,90 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_PRIMS_VECTORSUPPORT_HPP
#define SHARE_PRIMS_VECTORSUPPORT_HPP

#include "jni.h"
#include "code/debugInfo.hpp"
#include "memory/allocation.hpp"
#include "oops/typeArrayOop.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/registerMap.hpp"
#include "utilities/exceptions.hpp"

extern "C" {
  void JNICALL JVM_RegisterVectorSupportMethods(JNIEnv* env, jclass vsclass);
}

class VectorSupport : AllStatic {
 private:
  static void init_mask_array(typeArrayOop arr, BasicType elem_bt, int num_elem, address value_addr);
  static void init_vector_array(typeArrayOop arr, BasicType elem_bt, int num_elem, address value_addr);
  static oop  allocate_vector_payload_helper(InstanceKlass* ik, BasicType elem_bt, int num_elem, address value_addr, TRAPS);

  static BasicType klass2bt(InstanceKlass* ik);
  static jint      klass2length(InstanceKlass* ik);

 public:

  // Should be aligned with constants in jdk.internal.vm.vector.VectorSupport
  enum VectorOperation {
    // Unary
    VECTOR_OP_ABS  = 0,
    VECTOR_OP_NEG  = 1,
    VECTOR_OP_SQRT = 2,

    // Binary
    VECTOR_OP_ADD  = 4,
    VECTOR_OP_SUB  = 5,
    VECTOR_OP_MUL  = 6,
    VECTOR_OP_DIV  = 7,
    VECTOR_OP_MIN  = 8,
    VECTOR_OP_MAX  = 9,
    VECTOR_OP_AND  = 10,
    VECTOR_OP_OR   = 11,
    VECTOR_OP_XOR  = 12,

    // Ternary
    VECTOR_OP_FMA  = 13,

    // Broadcast int
    VECTOR_OP_LSHIFT  = 14,
    VECTOR_OP_RSHIFT  = 15,
    VECTOR_OP_URSHIFT = 16,

    // Convert
    VECTOR_OP_CAST        = 17,
    VECTOR_OP_REINTERPRET = 18
  };

  static int vop2ideal(jint vop, BasicType bt);

  static oop allocate_vector(InstanceKlass* holder, frame* fr, RegisterMap* reg_map, ObjectValue* sv, TRAPS);

  static bool is_vector(Klass* klass);
  static bool is_vector_mask(Klass* klass);
  static bool is_vector_shuffle(Klass* klass);
};
#endif // SHARE_PRIMS_VECTORSUPPORT_HPP

@ -4195,7 +4195,23 @@ jint Arguments::apply_ergo() {
  if (!UseBiasedLocking) {
    UseOptoBiasInlining = false;
  }
#endif

  if (!EnableVectorSupport) {
    if (!FLAG_IS_DEFAULT(EnableVectorReboxing) && EnableVectorReboxing) {
      warning("Disabling EnableVectorReboxing since EnableVectorSupport is turned off.");
    }
    FLAG_SET_DEFAULT(EnableVectorReboxing, false);

    if (!FLAG_IS_DEFAULT(EnableVectorAggressiveReboxing) && EnableVectorAggressiveReboxing) {
      if (!EnableVectorReboxing) {
        warning("Disabling EnableVectorAggressiveReboxing since EnableVectorReboxing is turned off.");
      } else {
        warning("Disabling EnableVectorAggressiveReboxing since EnableVectorSupport is turned off.");
      }
    }
    FLAG_SET_DEFAULT(EnableVectorAggressiveReboxing, false);
  }
#endif // COMPILER2

  if (FLAG_IS_CMDLINE(DiagnoseSyncOnPrimitiveWrappers)) {
    if (DiagnoseSyncOnPrimitiveWrappers == ObjectSynchronizer::LOG_WARNING && !log_is_enabled(Info, primitivewrappers)) {

@ -49,6 +49,7 @@
#include "oops/typeArrayOop.inline.hpp"
#include "oops/verifyOopClosure.hpp"
#include "prims/jvmtiThreadState.hpp"
#include "prims/vectorSupport.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/biasedLocking.hpp"
@ -1015,7 +1016,15 @@ bool Deoptimization::realloc_objects(JavaThread* thread, frame* fr, RegisterMap*
#endif // INCLUDE_JVMCI || INCLUDE_AOT
      InstanceKlass* ik = InstanceKlass::cast(k);
      if (obj == NULL) {
#ifdef COMPILER2
        if (EnableVectorSupport && VectorSupport::is_vector(ik)) {
          obj = VectorSupport::allocate_vector(ik, fr, reg_map, sv, THREAD);
        } else {
          obj = ik->allocate_instance(THREAD);
        }
#else
        obj = ik->allocate_instance(THREAD);
#endif // COMPILER2
      }
    } else if (k->is_typeArray_klass()) {
      TypeArrayKlass* ak = TypeArrayKlass::cast(k);
@ -1352,6 +1361,11 @@ void Deoptimization::reassign_fields(frame* fr, RegisterMap* reg_map, GrowableAr
      continue;
    }
#endif // INCLUDE_JVMCI || INCLUDE_AOT
#ifdef COMPILER2
    if (EnableVectorSupport && VectorSupport::is_vector(k)) {
      continue; // skip field reassignment for vectors
    }
#endif
    if (k->is_instance_klass()) {
      InstanceKlass* ik = InstanceKlass::cast(k);
      reassign_fields_by_klass(ik, fr, reg_map, sv, 0, obj(), skip_internal);

@ -150,8 +150,12 @@ StackValue* StackValue::create_stack_value(const frame* fr, const RegisterMap* r
      value.ji = *(jint*)value_addr;
      return new StackValue(value.p);
    }
    case Location::invalid:
    case Location::invalid: {
      return new StackValue();
    }
    case Location::vector: {
      ShouldNotReachHere(); // should be handled by Deoptimization::realloc_objects()
    }
    default:
      ShouldNotReachHere();
  }
@ -222,7 +226,7 @@ void StackValue::print_on(outputStream* st) const {
        st->print("NULL");
      }
      st->print(" <" INTPTR_FORMAT ">", p2i(_handle_value()));
      break;
      break;

    case T_CONFLICT:
      st->print("conflict");

@ -1502,6 +1502,8 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(MaxNode, AddNode)                \
  declare_c2_type(MaxINode, MaxNode)               \
  declare_c2_type(MinINode, MaxNode)               \
  declare_c2_type(MaxLNode, MaxNode)               \
  declare_c2_type(MinLNode, MaxNode)               \
  declare_c2_type(MaxFNode, MaxNode)               \
  declare_c2_type(MinFNode, MaxNode)               \
  declare_c2_type(MaxDNode, MaxNode)               \
@ -1736,6 +1738,8 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(AbsDNode, AbsNode)               \
  declare_c2_type(CmpLTMaskNode, Node)             \
  declare_c2_type(NegNode, Node)                   \
  declare_c2_type(NegINode, NegNode)               \
  declare_c2_type(NegLNode, NegNode)               \
  declare_c2_type(NegFNode, NegNode)               \
  declare_c2_type(NegDNode, NegNode)               \
  declare_c2_type(AtanDNode, Node)                 \
@ -1745,10 +1749,12 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(ReverseBytesLNode, Node)         \
  declare_c2_type(ReductionNode, Node)             \
  declare_c2_type(VectorNode, Node)                \
  declare_c2_type(AbsVBNode, VectorNode)           \
  declare_c2_type(AbsVSNode, VectorNode)           \
  declare_c2_type(AbsVINode, VectorNode)           \
  declare_c2_type(AbsVLNode, VectorNode)           \
  declare_c2_type(AbsVFNode, VectorNode)           \
  declare_c2_type(AbsVDNode, VectorNode)           \
  declare_c2_type(AbsVBNode, VectorNode)           \
  declare_c2_type(AbsVSNode, VectorNode)           \
  declare_c2_type(AbsVINode, VectorNode)           \
  declare_c2_type(AbsVLNode, VectorNode)           \
  declare_c2_type(AddVBNode, VectorNode)           \
  declare_c2_type(AddVSNode, VectorNode)           \
  declare_c2_type(AddVINode, VectorNode)           \
@ -1774,6 +1780,7 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(MulVFNode, VectorNode)           \
  declare_c2_type(MulReductionVFNode, ReductionNode) \
  declare_c2_type(MulVDNode, VectorNode)           \
  declare_c2_type(NegVINode, VectorNode)           \
  declare_c2_type(NegVFNode, VectorNode)           \
  declare_c2_type(NegVDNode, VectorNode)           \
  declare_c2_type(FmaVDNode, VectorNode)           \
@ -1796,6 +1803,8 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(URShiftVSNode, VectorNode)       \
  declare_c2_type(URShiftVINode, VectorNode)       \
  declare_c2_type(URShiftVLNode, VectorNode)       \
  declare_c2_type(MinReductionVNode, ReductionNode) \
  declare_c2_type(MaxReductionVNode, ReductionNode) \
  declare_c2_type(AndVNode, VectorNode)            \
  declare_c2_type(AndReductionVNode, ReductionNode) \
  declare_c2_type(OrVNode, VectorNode)             \
@ -1804,8 +1813,6 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(XorReductionVNode, ReductionNode) \
  declare_c2_type(MaxVNode, VectorNode)            \
  declare_c2_type(MinVNode, VectorNode)            \
  declare_c2_type(MaxReductionVNode, ReductionNode) \
  declare_c2_type(MinReductionVNode, ReductionNode) \
  declare_c2_type(LoadVectorNode, LoadNode)        \
  declare_c2_type(StoreVectorNode, StoreNode)      \
  declare_c2_type(ReplicateBNode, VectorNode)      \
@ -1847,6 +1854,27 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
  declare_c2_type(CopySignFNode, Node)             \
  declare_c2_type(SignumDNode, Node)               \
  declare_c2_type(SignumFNode, Node)               \
  declare_c2_type(LoadVectorGatherNode, LoadVectorNode) \
  declare_c2_type(StoreVectorScatterNode, StoreVectorNode) \
  declare_c2_type(VectorLoadMaskNode, VectorNode)  \
  declare_c2_type(VectorLoadShuffleNode, VectorNode) \
  declare_c2_type(VectorStoreMaskNode, VectorNode) \
  declare_c2_type(VectorBlendNode, VectorNode)     \
  declare_c2_type(VectorRearrangeNode, VectorNode) \
  declare_c2_type(VectorMaskWrapperNode, VectorNode) \
  declare_c2_type(VectorMaskCmpNode, VectorNode)   \
  declare_c2_type(VectorCastB2XNode, VectorNode)   \
  declare_c2_type(VectorCastS2XNode, VectorNode)   \
  declare_c2_type(VectorCastI2XNode, VectorNode)   \
  declare_c2_type(VectorCastL2XNode, VectorNode)   \
  declare_c2_type(VectorCastF2XNode, VectorNode)   \
  declare_c2_type(VectorCastD2XNode, VectorNode)   \
  declare_c2_type(VectorInsertNode, VectorNode)    \
  declare_c2_type(VectorUnboxNode, VectorNode)     \
  declare_c2_type(VectorReinterpretNode, VectorNode) \
  declare_c2_type(VectorBoxNode, Node)             \
  declare_c2_type(VectorBoxAllocateNode, CallStaticJavaNode) \
  declare_c2_type(VectorTestNode, Node)            \
                                                   \
  /*********************/                          \
  /* Adapter Blob Entries */                       \

@ -237,6 +237,9 @@ inline size_t heap_word_size(size_t byte_size) {
  return (byte_size + (HeapWordSize-1)) >> LogHeapWordSize;
}

inline jfloat  jfloat_cast(jint x);
inline jdouble jdouble_cast(jlong x);

//-------------------------------------------
// Constant for jlong (standardized by C++11)

@ -247,6 +250,13 @@ inline size_t heap_word_size(size_t byte_size) {
const jlong min_jlong = CONST64(0x8000000000000000);
const jlong max_jlong = CONST64(0x7fffffffffffffff);

//-------------------------------------------
// Constant for jdouble
const jlong min_jlongDouble = CONST64(0x0000000000000001);
const jdouble min_jdouble = jdouble_cast(min_jlongDouble);
const jlong max_jlongDouble = CONST64(0x7fefffffffffffff);
const jdouble max_jdouble = jdouble_cast(max_jlongDouble);

const size_t K = 1024;
const size_t M = K*K;
const size_t G = M*K;
@ -469,6 +479,11 @@ const jshort max_jshort = (1 << 15) - 1; // largest jshort
const jint min_jint = (jint)1 << (sizeof(jint)*BitsPerByte-1); // 0x80000000 == smallest jint
const jint max_jint = (juint)min_jint - 1;                     // 0x7FFFFFFF == largest jint

const jint min_jintFloat = (jint)(0x00000001);
const jfloat min_jfloat = jfloat_cast(min_jintFloat);
const jint max_jintFloat = (jint)(0x7f7fffff);
const jfloat max_jfloat = jfloat_cast(max_jintFloat);

//----------------------------------------------------------------------------------------------------
// JVM spec restrictions

@ -673,6 +688,14 @@ inline bool is_reference_type(BasicType t) {
  return (t == T_OBJECT || t == T_ARRAY);
}

inline bool is_integral_type(BasicType t) {
  return is_subword_type(t) || t == T_INT || t == T_LONG;
}

inline bool is_floating_point_type(BasicType t) {
  return (t == T_FLOAT || t == T_DOUBLE);
}

extern char type2char_tab[T_CONFLICT+1]; // Map a BasicType to a jchar
inline char type2char(BasicType t) { return (uint)t < T_CONFLICT+1 ? type2char_tab[t] : 0; }
extern int type2size[T_CONFLICT+1]; // Map BasicType to result stack elements

@ -0,0 +1,468 @@
/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.internal.vm.vector;

import jdk.internal.vm.annotation.IntrinsicCandidate;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;

import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.util.Objects;
import java.util.function.*;

public class VectorSupport {
    static {
        registerNatives();
    }

    private static final Unsafe U = Unsafe.getUnsafe();

    // Unary
    public static final int VECTOR_OP_ABS  = 0;
    public static final int VECTOR_OP_NEG  = 1;
    public static final int VECTOR_OP_SQRT = 2;

    // Binary
    public static final int VECTOR_OP_ADD  = 4;
    public static final int VECTOR_OP_SUB  = 5;
    public static final int VECTOR_OP_MUL  = 6;
    public static final int VECTOR_OP_DIV  = 7;
    public static final int VECTOR_OP_MIN  = 8;
    public static final int VECTOR_OP_MAX  = 9;

    public static final int VECTOR_OP_AND  = 10;
    public static final int VECTOR_OP_OR   = 11;
    public static final int VECTOR_OP_XOR  = 12;

    // Ternary
    public static final int VECTOR_OP_FMA  = 13;

    // Broadcast int
    public static final int VECTOR_OP_LSHIFT  = 14;
    public static final int VECTOR_OP_RSHIFT  = 15;
    public static final int VECTOR_OP_URSHIFT = 16;

    public static final int VECTOR_OP_CAST        = 17;
    public static final int VECTOR_OP_REINTERPRET = 18;

    // enum BoolTest
    public static final int BT_eq = 0;
    public static final int BT_ne = 4;
    public static final int BT_le = 5;
    public static final int BT_ge = 7;
    public static final int BT_lt = 3;
    public static final int BT_gt = 1;
    public static final int BT_overflow = 2;
    public static final int BT_no_overflow = 6;

    // BasicType codes, for primitives only:
    public static final int
        T_FLOAT  = 6,
        T_DOUBLE = 7,
        T_BYTE   = 8,
        T_SHORT  = 9,
        T_INT    = 10,
        T_LONG   = 11;

    /* ============================================================================ */

    public static class VectorSpecies<E> {}

    public static class VectorPayload {
        private final Object payload; // array of primitives

        public VectorPayload(Object payload) {
            this.payload = payload;
        }

        protected final Object getPayload() {
            return VectorSupport.maybeRebox(this).payload;
        }
    }

    public static class Vector<E> extends VectorPayload {
        public Vector(Object payload) {
            super(payload);
        }
    }

    public static class VectorShuffle<E> extends VectorPayload {
        public VectorShuffle(Object payload) {
            super(payload);
        }
    }
    public static class VectorMask<E> extends VectorPayload {
        public VectorMask(Object payload) {
            super(payload);
        }
    }

    /* ============================================================================ */
    public interface BroadcastOperation<VM, E, S extends VectorSpecies<E>> {
        VM broadcast(long l, S s);
    }

    @IntrinsicCandidate
    public static
    <VM, E, S extends VectorSpecies<E>>
    VM broadcastCoerced(Class<? extends VM> vmClass, Class<E> E, int length,
                        long bits, S s,
                        BroadcastOperation<VM, E, S> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.broadcast(bits, s);
    }

    /* ============================================================================ */
    public interface ShuffleIotaOperation<E, S extends VectorSpecies<E>> {
        VectorShuffle<E> apply(int length, int start, int step, S s);
    }

    @IntrinsicCandidate
    public static
    <E, S extends VectorSpecies<E>>
    VectorShuffle<E> shuffleIota(Class<?> E, Class<?> ShuffleClass, S s, int length,
                                 int start, int step, int wrap, ShuffleIotaOperation<E, S> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(length, start, step, s);
    }

    public interface ShuffleToVectorOperation<VM, Sh, E> {
        VM apply(Sh s);
    }

    @IntrinsicCandidate
    public static
    <VM, Sh extends VectorShuffle<E>, E>
    VM shuffleToVector(Class<?> VM, Class<?> E, Class<?> ShuffleClass, Sh s, int length,
                       ShuffleToVectorOperation<VM, Sh, E> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(s);
    }

    /* ============================================================================ */
    public interface IndexOperation<V extends Vector<E>, E, S extends VectorSpecies<E>> {
        V index(V v, int step, S s);
    }

    //FIXME @IntrinsicCandidate
    public static
    <V extends Vector<E>, E, S extends VectorSpecies<E>>
    V indexVector(Class<? extends V> vClass, Class<E> E, int length,
                  V v, int step, S s,
                  IndexOperation<V, E, S> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.index(v, step, s);
    }

    /* ============================================================================ */

    @IntrinsicCandidate
    public static
    <V extends Vector<?>>
    long reductionCoerced(int oprId, Class<?> vectorClass, Class<?> elementType, int length,
                          V v,
                          Function<V, Long> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(v);
    }

    /* ============================================================================ */

    public interface VecExtractOp<V> {
        long apply(V v1, int idx);
    }

    @IntrinsicCandidate
    public static
    <V extends Vector<?>>
    long extract(Class<?> vectorClass, Class<?> elementType, int vlen,
                 V vec, int ix,
                 VecExtractOp<V> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(vec, ix);
    }

    /* ============================================================================ */

    public interface VecInsertOp<V> {
        V apply(V v1, int idx, long val);
    }

    @IntrinsicCandidate
    public static
    <V extends Vector<?>>
    V insert(Class<? extends V> vectorClass, Class<?> elementType, int vlen,
             V vec, int ix, long val,
             VecInsertOp<V> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(vec, ix, val);
    }

    /* ============================================================================ */

    @IntrinsicCandidate
    public static
    <VM>
    VM unaryOp(int oprId, Class<? extends VM> vmClass, Class<?> elementType, int length,
               VM vm,
               Function<VM, VM> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(vm);
    }

    /* ============================================================================ */

    @IntrinsicCandidate
    public static
    <VM>
    VM binaryOp(int oprId, Class<? extends VM> vmClass, Class<?> elementType, int length,
                VM vm1, VM vm2,
                BiFunction<VM, VM, VM> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(vm1, vm2);
    }

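Every intrinsic in this class follows the same shape: C2 may replace the entire call, and when it does not, the trailing non-capturing defaultImpl runs instead. A self-contained sketch of that pattern (FallbackPatternDemo and its int[] "vectors" are illustrative, not part of this API):

import java.util.function.BiFunction;

public class FallbackPatternDemo {
    // Same shape as binaryOp above: the last parameter is the fallback the JIT may bypass.
    static <V> V binaryOp(V a, V b, BiFunction<V, V, V> defaultImpl) {
        return defaultImpl.apply(a, b); // interpreter/C1 path; an intrinsic would not get here
    }

    public static void main(String[] args) {
        int[] r = binaryOp(new int[] {1, 2}, new int[] {3, 4}, (x, y) -> {
            int[] out = new int[x.length];
            for (int i = 0; i < x.length; i++) out[i] = x[i] + y[i]; // element-wise add (VECTOR_OP_ADD)
            return out;
        });
        System.out.println(r[0] + ", " + r[1]); // prints: 4, 6
    }
}
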
/* ============================================================================ */
|
||||
|
||||
public interface TernaryOperation<V> {
|
||||
V apply(V v1, V v2, V v3);
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
public static
|
||||
<VM>
|
||||
VM ternaryOp(int oprId, Class<? extends VM> vmClass, Class<?> elementType, int length,
|
||||
VM vm1, VM vm2, VM vm3,
|
||||
TernaryOperation<VM> defaultImpl) {
|
||||
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
|
||||
return defaultImpl.apply(vm1, vm2, vm3);
|
||||
}
|
||||
|
||||
/* ============================================================================ */
|
||||
|
||||
// Memory operations
|
||||
|
||||
public interface LoadOperation<C, V, E, S extends VectorSpecies<E>> {
|
||||
V load(C container, int index, S s);
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
public static
|
||||
<C, VM, E, S extends VectorSpecies<E>>
|
||||
VM load(Class<? extends VM> vmClass, Class<E> E, int length,
|
||||
Object base, long offset, // Unsafe addressing
|
||||
C container, int index, S s, // Arguments for default implementation
|
||||
LoadOperation<C, VM, E, S> defaultImpl) {
|
||||
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
|
||||
return defaultImpl.load(container, index, s);
|
||||
}
|
||||
|
||||
/* ============================================================================ */
|
||||
|
||||
public interface LoadVectorOperationWithMap<C, V extends Vector<?>, E, S extends VectorSpecies<E>> {
|
||||
V loadWithMap(C container, int index, int[] indexMap, int indexM, S s);
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
public static
|
||||
<C, V extends Vector<?>, W extends Vector<Integer>, E, S extends VectorSpecies<E>>
|
||||
V loadWithMap(Class<?> vectorClass, Class<E> E, int length, Class<?> vectorIndexClass,
|
||||
Object base, long offset, // Unsafe addressing
|
||||
W index_vector,
|
||||
C container, int index, int[] indexMap, int indexM, S s, // Arguments for default implementation
|
||||
LoadVectorOperationWithMap<C, V, E, S> defaultImpl) {
|
||||
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
|
||||
return defaultImpl.loadWithMap(container, index, indexMap, indexM, s);
|
||||
}
|
||||
|
||||
/* ============================================================================ */
|
||||
|
||||
public interface StoreVectorOperation<C, V extends Vector<?>> {
|
||||
void store(C container, int index, V v);
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
public static
|
||||
<C, V extends Vector<?>>
|
||||
void store(Class<?> vectorClass, Class<?> elementType, int length,
|
||||
Object base, long offset, // Unsafe addressing
|
||||
V v,
|
||||
C container, int index, // Arguments for default implementation
|
||||
StoreVectorOperation<C, V> defaultImpl) {
|
||||
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
|
||||
defaultImpl.store(container, index, v);
|
||||
}
|
||||
|
||||
/* ============================================================================ */
|
||||
|
||||
public interface StoreVectorOperationWithMap<C, V extends Vector<?>> {
|
||||
void storeWithMap(C container, int index, V v, int[] indexMap, int indexM);
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
public static
|
||||
<C, V extends Vector<?>, W extends Vector<Integer>>
|
||||
void storeWithMap(Class<?> vectorClass, Class<?> elementType, int length, Class<?> vectorIndexClass,
|
||||
Object base, long offset, // Unsafe addressing
|
||||
W index_vector, V v,
|
||||
C container, int index, int[] indexMap, int indexM, // Arguments for default implementation
|
||||
StoreVectorOperationWithMap<C, V> defaultImpl) {
|
||||
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
|
||||
defaultImpl.storeWithMap(container, index, v, indexMap, indexM);
|
||||
}
|
||||
|
||||
/* ============================================================================ */
|
||||
|
||||
@IntrinsicCandidate
|
    public static
    <VM>
    boolean test(int cond, Class<?> vmClass, Class<?> elementType, int length,
                 VM vm1, VM vm2,
                 BiFunction<VM, VM, Boolean> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(vm1, vm2);
    }

    /* ============================================================================ */

    public interface VectorCompareOp<V,M> {
        M apply(int cond, V v1, V v2);
    }

    @IntrinsicCandidate
    public static <V extends Vector<E>,
                   M extends VectorMask<E>,
                   E>
    M compare(int cond, Class<? extends V> vectorClass, Class<M> maskClass, Class<?> elementType, int length,
              V v1, V v2,
              VectorCompareOp<V,M> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(cond, v1, v2);
    }

    /* ============================================================================ */

    public interface VectorRearrangeOp<V extends Vector<E>,
                                       Sh extends VectorShuffle<E>,
                                       E> {
        V apply(V v1, Sh shuffle);
    }

    @IntrinsicCandidate
    public static
    <V extends Vector<E>,
     Sh extends VectorShuffle<E>,
     E>
    V rearrangeOp(Class<? extends V> vectorClass, Class<Sh> shuffleClass, Class<?> elementType, int vlen,
                  V v1, Sh sh,
                  VectorRearrangeOp<V,Sh, E> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(v1, sh);
    }

    /* ============================================================================ */

    public interface VectorBlendOp<V extends Vector<E>,
                                   M extends VectorMask<E>,
                                   E> {
        V apply(V v1, V v2, M mask);
    }

    @IntrinsicCandidate
    public static
    <V extends Vector<E>,
     M extends VectorMask<E>,
     E>
    V blend(Class<? extends V> vectorClass, Class<M> maskClass, Class<?> elementType, int length,
            V v1, V v2, M m,
            VectorBlendOp<V,M, E> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(v1, v2, m);
    }

    /* ============================================================================ */

    public interface VectorBroadcastIntOp<V extends Vector<?>> {
        V apply(V v, int n);
    }

    @IntrinsicCandidate
    public static
    <V extends Vector<?>>
    V broadcastInt(int opr, Class<? extends V> vectorClass, Class<?> elementType, int length,
                   V v, int n,
                   VectorBroadcastIntOp<V> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(v, n);
    }

    /* ============================================================================ */

    public interface VectorConvertOp<VOUT, VIN, S> {
        VOUT apply(VIN v, S species);
    }

    // Users of this intrinsic assume that it respects
    // REGISTER_ENDIAN, which is currently ByteOrder.LITTLE_ENDIAN.
    // See javadoc for REGISTER_ENDIAN.

    @IntrinsicCandidate
    public static <VOUT extends VectorPayload,
                   VIN extends VectorPayload,
                   S extends VectorSpecies<?>>
    VOUT convert(int oprId,
                 Class<?> fromVectorClass, Class<?> fromElementType, int fromVLen,
                 Class<?> toVectorClass, Class<?> toElementType, int toVLen,
                 VIN v, S s,
                 VectorConvertOp<VOUT, VIN, S> defaultImpl) {
        assert isNonCapturingLambda(defaultImpl) : defaultImpl;
        return defaultImpl.apply(v, s);
    }

    /* ============================================================================ */

    @IntrinsicCandidate
    public static <V> V maybeRebox(V v) {
        // The fence is added here to avoid memory aliasing problems in C2 between scalar & vector accesses.
        // TODO: move the fence generation into C2. Generate only when reboxing is taking place.
        U.loadFence();
        return v;
    }

    /* ============================================================================ */

    // query the JVM's supported vector sizes and types
    public static native int getMaxLaneCount(Class<?> etype);

    /* ============================================================================ */

    public static boolean isNonCapturingLambda(Object o) {
        return o.getClass().getDeclaredFields().length == 0;
    }
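
The zero-field test relies on a detail of the lambda translation strategy: a capturing lambda is spun into a class that stores each captured value in a synthetic instance field, while a non-capturing lambda yields a fieldless class (under the default metafactory behavior). A quick illustration, with names of our own choosing:

int bound = 42;
java.util.function.IntUnaryOperator capturing = x -> x + bound;
java.util.function.IntUnaryOperator nonCapturing = x -> x + 1;
// capturing.getClass().getDeclaredFields().length > 0   (holds 'bound')
// nonCapturing.getClass().getDeclaredFields().length == 0

The assert guards the intrinsic contract: if a default implementation captured state, the JIT replacement could not safely ignore it.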

    /* ============================================================================ */

    private static native int registerNatives();
}

@ -138,9 +138,10 @@ module java.base {
        jdk.incubator.foreign;
    exports com.sun.security.ntlm to
        java.security.sasl;
    exports jdk.internal to
    exports jdk.internal to // for @HotSpotIntrinsicCandidate
        java.compiler,
        jdk.compiler,
        jdk.incubator.vector,
        jdk.jshell;
    exports jdk.internal.access to
        java.desktop,
@ -195,6 +196,7 @@ module java.base {
        jdk.attach,
        jdk.charsets,
        jdk.compiler,
        jdk.incubator.vector,
        jdk.jfr,
        jdk.jshell,
        jdk.nio.mapmode,
@ -228,9 +230,12 @@ module java.base {
        jdk.management.agent;
    exports jdk.internal.vm.annotation to
        jdk.internal.vm.ci,
        jdk.incubator.vector,
        jdk.incubator.foreign,
        jdk.jfr,
        jdk.unsupported;
    exports jdk.internal.vm.vector to
        jdk.incubator.vector;
    exports jdk.internal.util to
        jdk.incubator.foreign;
    exports jdk.internal.util.jar to

@ -0,0 +1,290 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import jdk.internal.vm.annotation.ForceInline;

import static jdk.incubator.vector.VectorOperators.*;

abstract class AbstractMask<E> extends VectorMask<E> {
    AbstractMask(boolean[] bits) {
        super(bits);
    }

    /*package-private*/
    abstract boolean[] getBits();

    // Unary operator

    interface MUnOp {
        boolean apply(int i, boolean a);
    }

    abstract AbstractMask<E> uOp(MUnOp f);

    // Binary operator

    interface MBinOp {
        boolean apply(int i, boolean a, boolean b);
    }

    abstract AbstractMask<E> bOp(VectorMask<E> o, MBinOp f);

    /*package-private*/
    abstract AbstractSpecies<E> vspecies();

    @Override
    @ForceInline
    public final VectorSpecies<E> vectorSpecies() {
        return vspecies();
    }

    @Override
    public boolean laneIsSet(int i) {
        return getBits()[i];
    }

    @Override
    public long toLong() {
        // FIXME: This should be an intrinsic.
        if (length() > Long.SIZE) {
            throw new UnsupportedOperationException("too many lanes for one long");
        }
        long res = 0;
        long set = 1;
        boolean[] bits = getBits();
        for (int i = 0; i < bits.length; i++) {
            res = bits[i] ? res | set : res;
            set = set << 1;
        }
        return res;
    }
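
Lane 0 lands in bit 0, so the returned long reads as a little-endian bitmap of the mask. A small usage sketch (the species and lane values are our example):

VectorMask<Integer> m =
    VectorMask.fromValues(IntVector.SPECIES_128, true, false, true, true);
long bits = m.toLong();   // bits 0, 2, 3 set => 0b1101 == 13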

    @Override
    public void intoArray(boolean[] bits, int i) {
        System.arraycopy(getBits(), 0, bits, i, length());
    }

    @Override
    public boolean[] toArray() {
        return getBits().clone();
    }

    @Override
    @ForceInline
    @SuppressWarnings("unchecked")
    public
    <F> VectorMask<F> check(Class<F> elementType) {
        if (vectorSpecies().elementType() != elementType) {
            throw AbstractSpecies.checkFailed(this, elementType);
        }
        return (VectorMask<F>) this;
    }

    @Override
    @ForceInline
    @SuppressWarnings("unchecked")
    public
    <F> VectorMask<F> check(VectorSpecies<F> species) {
        if (species != vectorSpecies()) {
            throw AbstractSpecies.checkFailed(this, species);
        }
        return (VectorMask<F>) this;
    }

    @Override
    public int trueCount() {
        //FIXME: use a population count intrinsic here
        int c = 0;
        for (boolean i : getBits()) {
            if (i) c++;
        }
        return c;
    }

    @Override
    public int firstTrue() {
        //FIXME: use a count trailing zeros intrinsic here
        boolean[] bits = getBits();
        for (int i = 0; i < bits.length; i++) {
            if (bits[i]) return i;
        }
        return bits.length;
    }

    @Override
    public int lastTrue() {
        //FIXME: use a count leading zeros intrinsic here
        boolean[] bits = getBits();
        for (int i = bits.length-1; i >= 0; i--) {
            if (bits[i]) return i;
        }
        return -1;
    }

    @Override
    public VectorMask<E> eq(VectorMask<E> m) {
        // FIXME: Generate good code here.
        return bOp(m, (i, a, b) -> a == b);
    }

    @Override
    public VectorMask<E> andNot(VectorMask<E> m) {
        // FIXME: Generate good code here.
        return bOp(m, (i, a, b) -> a && !b);
    }

    /*package-private*/
    static boolean anyTrueHelper(boolean[] bits) {
        // FIXME: Maybe use toLong() != 0 here.
        for (boolean i : bits) {
            if (i) return true;
        }
        return false;
    }

    /*package-private*/
    static boolean allTrueHelper(boolean[] bits) {
        // FIXME: Maybe use not().toLong() == 0 here.
        for (boolean i : bits) {
            if (!i) return false;
        }
        return true;
    }

    @Override
    @ForceInline
    public VectorMask<E> indexInRange(int offset, int limit) {
        int vlength = length();
        Vector<E> iota = vectorSpecies().zero().addIndex(1);
        VectorMask<E> badMask = checkIndex0(offset, limit, iota, vlength);
        return this.andNot(badMask);
    }
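
This is the method behind the standard masked-tail loop idiom. A sketch of the intended use from client code (the array a and factor are our placeholders):

var s = FloatVector.SPECIES_PREFERRED;
for (int i = 0; i < a.length; i += s.length()) {
    var m = s.indexInRange(i, a.length);       // all-true except past-the-end lanes
    FloatVector v = FloatVector.fromArray(s, a, i, m);
    v.mul(factor).intoArray(a, i, m);          // out-of-range lanes stay untouched
}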

    /*package-private*/
    @ForceInline
    AbstractVector<E>
    toVectorTemplate() {
        AbstractSpecies<E> vsp = vspecies();
        Vector<E> zero = vsp.broadcast(0);
        Vector<E> mone = vsp.broadcast(-1);
        // -1 will result in the most significant bit being set in
        // addition to some or all other lane bits.
        // For integral types, *all* lane bits will be set.
        // The bits for -1.0 are like {0b10111*0000*}.
        // FIXME: Use a conversion intrinsic for this operation.
        // https://bugs.openjdk.java.net/browse/JDK-8225740
        return (AbstractVector<E>) zero.blend(mone, this);
    }
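
The {0b10111*0000*} remark can be checked directly: the bit pattern of -1.0f sets the sign bit and most of the exponent, never the mantissa, so the most-significant-bit convention still holds for floating lanes. For example:

int bits = Float.floatToIntBits(-1.0f);
// Integer.toBinaryString(bits) is "10111111100000000000000000000000" (0xBF800000)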

    /**
     * Test if a masked memory access at a given offset into an array
     * of the given length will stay within the array.
     * The per-lane offsets are iota*esize.
     */
    /*package-private*/
    @ForceInline
    void checkIndexByLane(int offset, int alength,
                          Vector<E> iota,
                          int esize) {
        if (VectorIntrinsics.VECTOR_ACCESS_OOB_CHECK == 0) {
            return;
        }
        // Although the specification is simple, the implementation is
        // tricky, because the value iota*esize might possibly
        // overflow. So we calculate our test values as scalars,
        // clipping to the range [-1..VLENGTH], and test them against
        // the unscaled iota vector, whose values are in [0..VLENGTH-1].
        int vlength = length();
        VectorMask<E> badMask;
        if (esize == 1) {
            badMask = checkIndex0(offset, alength, iota, vlength);
        } else if (offset >= 0) {
            // Masked access to multi-byte lanes in byte array.
            // It could be aligned anywhere.
            int elemCount = Math.min(vlength, (alength - offset) / esize);
            badMask = checkIndex0(0, elemCount, iota, vlength);
        } else {
            // This requires a split test.
            int clipOffset = Math.max(offset, -(vlength * esize));
            int elemCount = Math.min(vlength, (alength - clipOffset) / esize);
            badMask = checkIndex0(0, elemCount, iota, vlength);
            clipOffset &= (esize - 1); // power of two, so OK
            VectorMask<E> badMask2 = checkIndex0(clipOffset / esize, vlength,
                                                 iota, vlength);
            badMask = badMask.or(badMask2);
        }
        badMask = badMask.and(this);
        if (badMask.anyTrue()) {
            int badLane = badMask.firstTrue();
            throw ((AbstractMask<E>)badMask)
                   .checkIndexFailed(offset, badLane, alength, esize);
        }
    }

    private
    @ForceInline
    VectorMask<E> checkIndex0(int offset, int alength,
                              Vector<E> iota, int vlength) {
        // An active lane is bad if its number is greater than
        // alength-offset, since when added to offset it will step off
        // of the end of the array. To avoid overflow when
        // converting, clip the comparison value to [0..vlength]
        // inclusive.
        int indexLimit = Math.max(0, Math.min(alength - offset, vlength));
        VectorMask<E> badMask =
            iota.compare(GE, iota.broadcast(indexLimit));
        if (offset < 0) {
            // An active lane is bad if its number is less than
            // -offset, because when added to offset it will then
            // address an array element at a negative index. To avoid
            // overflow when converting, clip the comparison value at
            // vlength. This specific expression works correctly even
            // when offset is Integer.MIN_VALUE.
            int firstGoodIndex = -Math.max(offset, -vlength);
            VectorMask<E> badMask2 =
                iota.compare(LT, iota.broadcast(firstGoodIndex));
            if (indexLimit >= vlength) {
                badMask = badMask2; // 1st badMask is all true
            } else {
                badMask = badMask.or(badMask2);
            }
        }
        return badMask;
    }
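
Concrete numbers make the clipping visible. With offset=6, alength=10, vlength=8 (our example values):

int indexLimit = Math.max(0, Math.min(10 - 6, 8));  // == 4
// badMask marks lanes 4..7: lane 4 would touch a[6+4] == a[10],
// one past the end of a 10-element array; offset >= 0, so badMask2 is skipped.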

    private IndexOutOfBoundsException checkIndexFailed(int offset, int lane,
                                                       int alength, int esize) {
        String msg = String.format("Masked range check failed: "+
                                   "vector mask %s out of bounds at "+
                                   "index %d+%d in array of length %d",
                                   this, offset, lane * esize, alength);
        if (esize != 1) {
            msg += String.format(" (each lane spans %d array elements)", esize);
        }
        throw new IndexOutOfBoundsException(msg);
    }

}

@ -0,0 +1,246 @@
/*
 * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.util.function.IntUnaryOperator;
import jdk.internal.vm.annotation.ForceInline;

abstract class AbstractShuffle<E> extends VectorShuffle<E> {
    static final IntUnaryOperator IDENTITY = i -> i;

    // Internal representation allows for a maximum index of 256
    // Values are clipped to [-VLENGTH..VLENGTH-1].

    AbstractShuffle(int length, byte[] reorder) {
        super(reorder);
        assert(length == reorder.length);
        assert(indexesInRange(reorder));
    }

    AbstractShuffle(int length, int[] reorder) {
        this(length, reorder, 0);
    }

    AbstractShuffle(int length, int[] reorder, int offset) {
        super(prepare(length, reorder, offset));
    }

    AbstractShuffle(int length, IntUnaryOperator f) {
        super(prepare(length, f));
    }

    private static byte[] prepare(int length, int[] reorder, int offset) {
        byte[] a = new byte[length];
        for (int i = 0; i < length; i++) {
            int si = reorder[offset + i];
            si = partiallyWrapIndex(si, length);
            a[i] = (byte) si;
        }
        return a;
    }

    private static byte[] prepare(int length, IntUnaryOperator f) {
        byte[] a = new byte[length];
        for (int i = 0; i < a.length; i++) {
            int si = f.applyAsInt(i);
            si = partiallyWrapIndex(si, length);
            a[i] = (byte) si;
        }
        return a;
    }

    byte[] reorder() {
        return (byte[])getPayload();
    }

    /*package-private*/
    abstract AbstractSpecies<E> vspecies();

    @Override
    @ForceInline
    public final VectorSpecies<E> vectorSpecies() {
        return vspecies();
    }

    @Override
    @ForceInline
    public void intoArray(int[] a, int offset) {
        byte[] reorder = reorder();
        int vlen = reorder.length;
        for (int i = 0; i < vlen; i++) {
            int sourceIndex = reorder[i];
            assert(sourceIndex >= -vlen && sourceIndex < vlen);
            a[offset + i] = sourceIndex;
        }
    }

    @Override
    @ForceInline
    public int[] toArray() {
        byte[] reorder = reorder();
        int[] a = new int[reorder.length];
        intoArray(a, 0);
        return a;
    }

    /*package-private*/
    @ForceInline
    final
    AbstractVector<E>
    toVectorTemplate() {
        // Note that the values produced by laneSource
        // are already clipped. At this point we convert
        // them from internal ints (or bytes) into the ETYPE.
        // FIXME: Use a conversion intrinsic for this operation.
        // https://bugs.openjdk.java.net/browse/JDK-8225740
        return (AbstractVector<E>) vspecies().fromIntValues(toArray());
    }

    @ForceInline
    public final VectorShuffle<E> checkIndexes() {
        // FIXME: vectorize this
        for (int index : reorder()) {
            if (index < 0) {
                throw checkIndexFailed(index, length());
            }
        }
        return this;
    }

    @ForceInline
    public final VectorShuffle<E> wrapIndexes() {
        // FIXME: vectorize this
        byte[] reorder = reorder();
        for (int index : reorder) {
            if (index < 0) {
                return wrapAndRebuild(reorder);
            }
        }
        return this;
    }

    @ForceInline
    public final VectorShuffle<E> wrapAndRebuild(byte[] oldReorder) {
        int length = oldReorder.length;
        byte[] reorder = new byte[length];
        for (int i = 0; i < length; i++) {
            int si = oldReorder[i];
            // FIXME: This does not work unless it's a power of 2.
            if ((length & (length - 1)) == 0) {
                si += si & length; // power-of-two optimization
            } else if (si < 0) {
                // non-POT code requires a conditional add
                si += length;
            }
            assert(si >= 0 && si < length);
            reorder[i] = (byte) si;
        }
        return vspecies().dummyVector().shuffleFromBytes(reorder);
    }
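
The power-of-two branch folds the conditional add into pure bit math. For VLENGTH 8 (our worked values):

int si = -3;
si += si & 8;        // -3 & 8 == 8, so si becomes 5 == (-3 mod 8)
int ok = 5;
ok += ok & 8;        // 5 & 8 == 0; a valid index passes through unchanged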

    @ForceInline
    public final VectorMask<E> laneIsValid() {
        // FIXME: vectorize this
        byte[] reorder = reorder();
        int length = reorder.length;
        boolean[] bits = new boolean[length];
        for (int i = 0; i < length; i++) {
            if (reorder[i] >= 0) {
                bits[i] = true;
            }
        }
        return vspecies().dummyVector().maskFromArray(bits);
    }

    @Override
    @ForceInline
    @SuppressWarnings("unchecked")
    public final
    <F> VectorShuffle<F> check(VectorSpecies<F> species) {
        if (species != vectorSpecies()) {
            throw AbstractSpecies.checkFailed(this, species);
        }
        return (VectorShuffle<F>) this;
    }

    @Override
    @ForceInline
    public final int checkIndex(int index) {
        return checkIndex0(index, length(), (byte)1);
    }

    @Override
    @ForceInline
    public final int wrapIndex(int index) {
        return checkIndex0(index, length(), (byte)0);
    }

    /** Return invalid indexes partially wrapped
     *  mod VLENGTH to negative values.
     */
    /*package-private*/
    @ForceInline
    static
    int partiallyWrapIndex(int index, int laneCount) {
        return checkIndex0(index, laneCount, (byte)-1);
    }

    /*package-private*/
    @ForceInline
    static int checkIndex0(int index, int laneCount, byte mode) {
        int wrapped = VectorIntrinsics.wrapToRange(index, laneCount);
        if (mode == 0 || wrapped == index) {
            return wrapped;
        }
        if (mode < 0) {
            return wrapped - laneCount; // special mode for internal storage
        }
        throw checkIndexFailed(index, laneCount);
    }
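
The three public entry points differ only in mode. For laneCount 8 (our worked values):

// checkIndex(9)              mode  1: wraps to 1 != 9, so it throws IndexOutOfBoundsException
// wrapIndex(9)               mode  0: returns the wrapped value 1
// partiallyWrapIndex(9, 8)   mode -1: wraps to 1, then returns 1 - 8 == -7 (internal form)
// any in-range index, e.g. 3, is returned unchanged by all three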

    private static IndexOutOfBoundsException checkIndexFailed(int index, int laneCount) {
        int max = laneCount - 1;
        String msg = "required an index in [0.."+max+"] but found "+index;
        return new IndexOutOfBoundsException(msg);
    }

    static boolean indexesInRange(byte[] reorder) {
        int length = reorder.length;
        for (byte si : reorder) {
            if (si >= length || si < -length) {
                boolean assertsEnabled = false;
                assert(assertsEnabled = true);
                if (assertsEnabled) {
                    String msg = ("index "+si+" out of range ["+length+"] in "+
                                  java.util.Arrays.toString(reorder));
                    throw new AssertionError(msg);
                }
                return false;
            }
        }
        return true;
    }
}

@ -0,0 +1,658 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.annotation.Stable;
import java.nio.ByteOrder;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.function.Function;
import java.util.function.IntUnaryOperator;

abstract class AbstractSpecies<E> extends jdk.internal.vm.vector.VectorSupport.VectorSpecies<E>
                                  implements VectorSpecies<E> {
    @Stable
    final VectorShape vectorShape;
    @Stable
    final LaneType laneType;
    @Stable
    final int laneCount;
    @Stable
    final int laneCountLog2P1;
    @Stable
    final Class<? extends AbstractVector<E>> vectorType;
    @Stable
    final Class<? extends AbstractMask<E>> maskType;
    @Stable
    final Function<Object, ? extends AbstractVector<E>> vectorFactory;

    @Stable
    final VectorShape indexShape;
    @Stable
    final int maxScale, minScale;
    @Stable
    final int vectorBitSize, vectorByteSize;

    AbstractSpecies(VectorShape vectorShape,
                    LaneType laneType,
                    Class<? extends AbstractVector<E>> vectorType,
                    Class<? extends AbstractMask<E>> maskType,
                    Function<Object, ? extends AbstractVector<E>> vectorFactory) {
        this.vectorShape = vectorShape;
        this.laneType = laneType;
        this.vectorType = vectorType;
        this.maskType = maskType;
        this.vectorFactory = vectorFactory;

        // derived values:
        int bitSize = vectorShape.vectorBitSize();
        int byteSize = bitSize / Byte.SIZE;
        assert(byteSize * 8 == bitSize);
        this.vectorBitSize = bitSize;
        this.vectorByteSize = byteSize;
        int elementSize = laneType.elementSize;
        this.laneCount = bitSize / elementSize;
        assert(laneCount > 0); // could be 1 for mono-vector (double in v64)
        this.laneCountLog2P1 = Integer.numberOfTrailingZeros(laneCount) + 1;

        // Note: The shape might be the max-shape,
        // if there is no vector this large.
        int indexBitSize = Integer.SIZE * laneCount;
        this.indexShape = VectorShape.forIndexBitSize(indexBitSize, elementSize);

        // What are the largest and smallest scale factors that,
        // when multiplied times the elements in [0..VLENGTH],
        // inclusive, do not overflow the ETYPE?
        int precision = laneType.elementPrecision;
        if (precision >= Integer.SIZE) {
            // No overflow possible from int*int.
            this.maxScale = Integer.MAX_VALUE;
            this.minScale = Integer.MIN_VALUE;
        } else {
            boolean isfp = (laneType.elementKind == 'F');
            long x = laneCount;
            long maxScale = ((1L << precision)-(isfp?0:1)) / x;
            long minScale = (-1L << precision) / x;
            this.maxScale = (int) maxScale;
            this.minScale = (int) minScale;
        }
    }
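
A worked case for the scale bounds, assuming elementPrecision is 7 for byte lanes (the magnitude bits of a signed 8-bit value) and a 16-lane species:

long maxScale = ((1L << 7) - 1) / 16;   // 127 / 16 == 7;  7*16 == 112 still fits a byte
long minScale = (-1L << 7) / 16;        // -128 / 16 == -8; -8*16 == -128 fits exactly
// a requested scale of 8 would overflow: 8*16 == 128 > Byte.MAX_VALUE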

    @Stable //lazy JIT constant
    AbstractSpecies<Integer> indexSpecies;

    @Stable //lazy JIT constant
    AbstractShuffle<Byte> swapBytesShuffle;

    @Stable //lazy JIT constant
    AbstractVector<E> dummyVector;

    @Override
    @ForceInline
    public final int length() {
        return laneCount;
    }

    // Inside the implementation we use the more descriptive
    // term laneCount:

    /*package-private*/
    @ForceInline
    final int laneCount() {
        return laneCount;
    }

    /*package-private*/
    @ForceInline
    final int laneCountLog2() {
        return laneCountLog2P1 - 1; // subtract one from stable value
    }

    @Override
    @ForceInline
    @SuppressWarnings("unchecked")
    //NOT FINAL: SPECIALIZED
    public Class<E> elementType() {
        return (Class<E>) laneType.elementType;
    }

    // FIXME: appeal to general method (see https://bugs.openjdk.java.net/browse/JDK-6176992)
    // replace usages of this method and remove
    @ForceInline
    @SuppressWarnings("unchecked")
    //NOT FINAL: SPECIALIZED
    Class<E> genericElementType() {
        return (Class<E>) laneType.genericElementType;
    }

    @Override
    @ForceInline
    //NOT FINAL: SPECIALIZED
    public Class<? extends AbstractVector<E>> vectorType() {
        return vectorType;
    }

    @Override
    @ForceInline
    public final Class<? extends AbstractMask<E>> maskType() {
        return maskType;
    }

    @Override
    @ForceInline
    public final int elementSize() {
        return laneType.elementSize;
    }

    /*package-private*/
    @ForceInline
    final int elementByteSize() {
        return laneType.elementSize / Byte.SIZE;
    }

    @Override
    @ForceInline
    public final VectorShape vectorShape() {
        return vectorShape;
    }

    @ForceInline
    /*package-private*/
    final VectorShape indexShape() {
        return indexShape;
    }

    @Override
    @ForceInline
    public final int vectorBitSize() {
        return vectorBitSize;
    }

    @Override
    @ForceInline
    public final int vectorByteSize() {
        return vectorByteSize;
    }

    @Override
    @ForceInline
    public final int loopBound(int length) {
        return VectorIntrinsics.roundDown(length, laneCount);
    }

    @Override
    @ForceInline
    public final VectorMask<E> indexInRange(int offset, int limit) {
        return maskAll(true).indexInRange(offset, limit);
    }

    @Override
    @ForceInline
    public final <F> VectorSpecies<F> withLanes(Class<F> newType) {
        return withLanes(LaneType.of(newType)).check(newType);
    }

    @ForceInline
    /*package-private*/
    final
    AbstractSpecies<?> withLanes(LaneType newType) {
        if (newType == laneType) return this;
        return findSpecies(newType, vectorShape);
    }

    @ForceInline
    /*package-private*/
    AbstractSpecies<?> asIntegral() {
        return withLanes(laneType.asIntegral());
    }

    @ForceInline
    /*package-private*/
    AbstractSpecies<?> asFloating() {
        return withLanes(laneType.asFloating());
    }

    @Override
    @ForceInline
    @SuppressWarnings("unchecked")
    public final VectorSpecies<E> withShape(VectorShape newShape) {
        if (newShape == vectorShape) return this;
        return (VectorSpecies<E>) findSpecies(laneType, newShape);
    }

    @ForceInline
    /*package-private*/
    AbstractSpecies<Integer> indexSpecies() {
        // This JITs to a constant value:
        AbstractSpecies<Integer> sp = indexSpecies;
        if (sp != null) return sp;
        return indexSpecies = findSpecies(LaneType.INT, indexShape).check0(int.class);
    }

    @ForceInline
    /*package-private*/
    @SuppressWarnings("unchecked")
    AbstractSpecies<Byte> byteSpecies() {
        // This JITs to a constant value:
        return (AbstractSpecies<Byte>) withLanes(LaneType.BYTE);
    }

    @ForceInline
    /*package-private*/
    AbstractShuffle<Byte> swapBytesShuffle() {
        // This JITs to a constant value:
        AbstractShuffle<Byte> sh = swapBytesShuffle;
        if (sh != null) return sh;
        return swapBytesShuffle = makeSwapBytesShuffle();
    }

    private AbstractShuffle<Byte> makeSwapBytesShuffle() {
        int vbytes = vectorByteSize();
        int lbytes = elementByteSize();
        int[] sourceIndexes = new int[vbytes];
        for (int i = 0; i < vbytes; i++) {
            sourceIndexes[i] = i ^ (lbytes-1);
        }
        return (AbstractShuffle<Byte>)
            VectorShuffle.fromValues(byteSpecies(), sourceIndexes);
    }
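
The XOR against lbytes-1 reverses byte order within each lane without any division. For 4-byte lanes in an 8-byte vector (our sizes):

for (int i = 0; i < 8; i++) {
    System.out.print((i ^ 3) + " ");   // prints: 3 2 1 0 7 6 5 4
}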

    /*package-private*/
    abstract Vector<E> fromIntValues(int[] values);

    /**
     * Do not use a dummy except to call methods on it when you don't
     * care about the lane values. The main benefit of it is to
     * populate the type profile, which then allows the JIT to derive
     * constant values for dummy.species(), the current species, and
     * then for all of its attributes: ETYPE, VLENGTH, VSHAPE, etc.
     */
    @ForceInline
    /*package-private*/
    AbstractVector<E> dummyVector() {
        // This JITs to a constant value:
        AbstractVector<E> dummy = dummyVector;
        if (dummy != null) return dummy;
        // The rest of this computation is probably not JIT-ted.
        return makeDummyVector();
    }

    private AbstractVector<E> makeDummyVector() {
        Object za = Array.newInstance(elementType(), laneCount);
        return dummyVector = vectorFactory.apply(za);
        // This is the only use of vectorFactory.
        // All other factory requests are routed
        // through the dummy vector.
    }

    /**
     * Build a mask by directly calling its constructor.
     * It is an error if the array is aliased elsewhere.
     */
    @ForceInline
    /*package-private*/
    AbstractMask<E> maskFactory(boolean[] bits) {
        return dummyVector().maskFromArray(bits);
    }

    public final
    @Override
    @ForceInline
    VectorShuffle<E> shuffleFromArray(int[] sourceIndexes, int offset) {
        return dummyVector().shuffleFromArray(sourceIndexes, offset);
    }

    public final
    @Override
    @ForceInline
    VectorShuffle<E> shuffleFromValues(int... sourceIndexes) {
        return dummyVector().shuffleFromArray(sourceIndexes, 0);
    }

    public final
    @Override
    @ForceInline
    VectorShuffle<E> shuffleFromOp(IntUnaryOperator fn) {
        return dummyVector().shuffleFromOp(fn);
    }

    public final
    @Override
    @ForceInline
    VectorShuffle<E> iotaShuffle(int start, int step, boolean wrap) {
        if (start == 0 && step == 1)
            return dummyVector().iotaShuffle();
        else
            return dummyVector().iotaShuffle(start, step, wrap);
    }

    @ForceInline
    @Override
    public final Vector<E> fromByteArray(byte[] a, int offset, ByteOrder bo) {
        return dummyVector()
            .fromByteArray0(a, offset)
            .maybeSwap(bo);
    }

    @Override
    public VectorMask<E> loadMask(boolean[] bits, int offset) {
        return VectorMask.fromArray(this, bits, offset);
    }

    // Define zero and iota when we know the ETYPE and VSHAPE.
    public abstract AbstractVector<E> zero();
    /*package-private*/ abstract AbstractVector<E> iota();

    // Constructing vectors from raw bits.

    /*package-private*/
    abstract long longToElementBits(long e);

    /*package-private*/
    abstract AbstractVector<E> broadcastBits(long bits);

    /*package-private*/
    final IllegalArgumentException badElementBits(long iv, Object cv) {
        String msg = String.format("Vector creation failed: "+
                                   "value %s cannot be represented in ETYPE %s"+
                                   "; result of cast is %s",
                                   iv,
                                   elementType(),
                                   cv);
        return new IllegalArgumentException(msg);
    }

    /*package-private*/
    static
    final IllegalArgumentException badArrayBits(Object iv,
                                                boolean isInt,
                                                long cv) {
        String msg = String.format("Array creation failed: "+
                                   "lane value %s cannot be represented in %s"+
                                   "; result of cast is %s",
                                   iv,
                                   (isInt ? "int" : "long"),
                                   cv);
        return new IllegalArgumentException(msg);
    }

    /*package-private*/
    Object iotaArray() {
        // Create an iota array. It's OK if this is really slow,
        // because it happens only once per species.
        Object ia = Array.newInstance(laneType.elementType,
                                      laneCount);
        assert(ia.getClass() == laneType.arrayType);
        checkValue(laneCount-1); // worst case
        for (int i = 0; i < laneCount; i++) {
            if ((byte)i == i)
                Array.setByte(ia, i, (byte)i);
            else if ((short)i == i)
                Array.setShort(ia, i, (short)i);
            else
                Array.setInt(ia, i, i);
            assert(Array.getDouble(ia, i) == i);
        }
        return ia;
    }

    @ForceInline
    /*package-private*/
    void checkScale(int scale) {
        if (scale > 0) {
            if (scale <= maxScale) return;
        } else { // scale <= 0
            if (scale >= minScale) return;
        }
        throw checkScaleFailed(scale);
    }

    private IllegalArgumentException checkScaleFailed(int scale) {
        String msg = String.format("%s: cannot represent VLENGTH*%d",
                                   this, scale);
        return new IllegalArgumentException(msg);
    }

    /*package-private*/
    interface RVOp {
        long apply(int i); // supply raw element bits
    }

    /*package-private*/
    abstract AbstractVector<E> rvOp(RVOp f);

    /*package-private*/
    interface FOpm {
        boolean apply(int i);
    }

    AbstractMask<E> opm(FOpm f) {
        boolean[] res = new boolean[laneCount];
        for (int i = 0; i < res.length; i++) {
            res[i] = f.apply(i);
        }
        return dummyVector().maskFromArray(res);
    }

    @Override
    @ForceInline
    public final
    <F> VectorSpecies<F> check(Class<F> elementType) {
        return check0(elementType);
    }

    @ForceInline
    @SuppressWarnings("unchecked")
    /*package-private*/ final
    <F> AbstractSpecies<F> check0(Class<F> elementType) {
        if (elementType != this.elementType()) {
            throw AbstractSpecies.checkFailed(this, elementType);
        }
        return (AbstractSpecies<F>) this;
    }

    @ForceInline
    /*package-private*/
    AbstractSpecies<E> check(LaneType laneType) {
        if (laneType != this.laneType) {
            throw AbstractSpecies.checkFailed(this, laneType);
        }
        return this;
    }

    @Override
    @ForceInline
    public int partLimit(VectorSpecies<?> toSpecies, boolean lanewise) {
        AbstractSpecies<?> rsp = (AbstractSpecies<?>) toSpecies;
        int inSizeLog2 = this.vectorShape.vectorBitSizeLog2;
        int outSizeLog2 = rsp.vectorShape.vectorBitSizeLog2;
        if (lanewise) {
            inSizeLog2 += (rsp.laneType.elementSizeLog2 -
                           this.laneType.elementSizeLog2);
        }
        int diff = (inSizeLog2 - outSizeLog2);
        // Let's try a branch-free version of this.
        int sign = (diff >> -1);
        //d = Math.abs(diff);
        //d = (sign == 0 ? diff : 1 + ~diff);
        int d = (diff ^ sign) - sign;
        // Compute sgn(diff) << abs(diff), but replace 1 by 0.
        return ((sign | 1) << d) & ~1;
    }
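
Tracing the branch-free arithmetic with diff = -2, a contraction by 4 (our example):

int diff = -2;
int sign = diff >> -1;                // shift count masks to 31, so sign == -1
int d = (diff ^ sign) - sign;         // two's-complement negate: d == 2
int limit = ((sign | 1) << d) & ~1;   // (-1 << 2) & ~1 == -4
// partLimit -4 advertises the four part numbers 0, -1, -2, -3;
// diff == 0 gives (1 << 0) & ~1 == 0, the promised "replace 1 by 0"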

    /**
     * Helper for throwing CheckCastExceptions,
     * used by the various Vector*.check(*) methods.
     */
    /*package-private*/
    static ClassCastException checkFailed(Object what, Object required) {
        // Find a species for the thing that's failing.
        AbstractSpecies<?> whatSpecies = null;
        String where;
        if (what instanceof VectorSpecies) {
            whatSpecies = (AbstractSpecies<?>) what;
            where = whatSpecies.toString();
        } else if (what instanceof Vector) {
            whatSpecies = (AbstractSpecies<?>) ((Vector<?>) what).species();
            where = "a Vector<"+whatSpecies.genericElementType()+">";
        } else if (what instanceof VectorMask) {
            whatSpecies = (AbstractSpecies<?>) ((VectorMask<?>) what).vectorSpecies();
            where = "a VectorMask<"+whatSpecies.genericElementType()+">";
        } else if (what instanceof VectorShuffle) {
            whatSpecies = (AbstractSpecies<?>) ((VectorShuffle<?>) what).vectorSpecies();
            where = "a VectorShuffle<"+whatSpecies.genericElementType()+">";
        } else {
            where = what.toString();
        }

        Object found = null;
        if (whatSpecies != null) {
            if (required instanceof VectorSpecies) {
                // required is a VectorSpecies; found the wrong species
                found = whatSpecies;
            } else if (required instanceof Vector) {
                // same VectorSpecies required; found the wrong species
                found = whatSpecies;
                required = ((Vector<?>)required).species();
            } else if (required instanceof Class) {
                // required is a Class; found the wrong ETYPE
                Class<?> requiredClass = (Class<?>) required;
                LaneType requiredType = LaneType.forClassOrNull(requiredClass);
                found = whatSpecies.elementType();
                if (requiredType == null) {
                    required = required + " (not a valid lane type)";
                } else if (!requiredClass.isPrimitive()) {
                    required = required + " (should be " + requiredType + ")";
                }
            } else if (required instanceof LaneType) {
                // required is a LaneType; found the wrong ETYPE
                required = ((LaneType) required).elementType;
                found = whatSpecies.elementType();
            } else if (required instanceof Integer) {
                // required is a length; species has wrong VLENGTH
                required = required + " lanes";
                found = whatSpecies.length();
            }
        }
        if (found == null) found = "bad value";

        String msg = where+": required "+required+" but found "+found;
        return new ClassCastException(msg);
    }

    private static final @Stable AbstractSpecies<?>[][] CACHES
        = new AbstractSpecies<?>[LaneType.SK_LIMIT][VectorShape.SK_LIMIT];

    // Helper functions for finding species:

    /*package-private*/
    @ForceInline
    static <E>
    AbstractSpecies<E> findSpecies(Class<E> elementType,
                                   LaneType laneType,
                                   VectorShape shape) {
        assert(elementType == laneType.elementType);
        return findSpecies(laneType, shape).check0(elementType);
    }

    /*package-private*/
    @ForceInline
    static
    AbstractSpecies<?> findSpecies(LaneType laneType,
                                   VectorShape shape) {
        // The JIT can see into this cache.
        // Therefore it is useful to arrange for constant
        // arguments to this method. If the cache
        // is full when the JIT runs, the cache item becomes
        // a compile-time constant. And then all the @Stable
        // fields of the AbstractSpecies are also constants.
        AbstractSpecies<?> s = CACHES[laneType.switchKey][shape.switchKey];
        if (s != null) return s;
        return computeSpecies(laneType, shape);
    }

    private static
    AbstractSpecies<?> computeSpecies(LaneType laneType,
                                      VectorShape shape) {
        AbstractSpecies<?> s = null;
        // enum-switches don't optimize properly JDK-8161245
        switch (laneType.switchKey) {
        case LaneType.SK_FLOAT:
            s = FloatVector.species(shape); break;
        case LaneType.SK_DOUBLE:
            s = DoubleVector.species(shape); break;
        case LaneType.SK_BYTE:
            s = ByteVector.species(shape); break;
        case LaneType.SK_SHORT:
            s = ShortVector.species(shape); break;
        case LaneType.SK_INT:
            s = IntVector.species(shape); break;
        case LaneType.SK_LONG:
            s = LongVector.species(shape); break;
        }
        if (s == null) {
            // NOTE: The result of this method is guaranteed to be
            // non-null. Later calls to ".check" also ensure this.
            // If this method hits a NPE, it is because a helper
            // method EVector.species() has returned a null value, and
            // that is because a SPECIES_X static constant has not yet
            // been initialized. And that, in turn, is because
            // somebody is calling this method way too early during
            // bootstrapping.
            throw new AssertionError("bootstrap problem");
        }
        assert(s.laneType == laneType) : s + "!=" + laneType;
        assert(s.vectorShape == shape) : s + "!=" + shape;
        CACHES[laneType.switchKey][shape.switchKey] = s;
        return s;
    }

    @Override
    public final String toString() {
        return "Species["+laneType+", "+laneCount+", "+vectorShape+"]";
    }

    @Override
    public final boolean equals(Object obj) {
        if (obj instanceof AbstractSpecies) {
            AbstractSpecies<?> that = (AbstractSpecies<?>) obj;
            return (this.laneType == that.laneType &&
                    this.laneCount == that.laneCount &&
                    this.vectorShape == that.vectorShape);
        }
        return this == obj;
    }

    /**
     * Returns a hash code value for the species,
     * based on its lane type, lane count, and vector shape.
     *
     * @return a hash code value for this species
     */
    @Override
    public final int hashCode() {
        int[] a = { laneType.ordinal(), laneCount, vectorShape.ordinal() };
        return Arrays.hashCode(a);
    }
}

@ -0,0 +1,726 @@
/*
 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.function.IntUnaryOperator;

import static jdk.incubator.vector.VectorOperators.*;

@SuppressWarnings("cast")
abstract class AbstractVector<E> extends Vector<E> {
    /**
     * The order of vector bytes when stored in natural
     * array elements of the same lane type.
     * This is also the behavior of the
     * VectorSupport load/store instructions.
     * If these instructions gain the capability to do
     * byte swapping on the fly, add a bit to those
     * instructions, but let this polarity be the
     * "neutral" or "default" setting of the bit.
     */
    /*package-private*/
    static final ByteOrder NATIVE_ENDIAN = ByteOrder.nativeOrder();

    /**
     * The order of vector bytes as stored in the register
     * file. This becomes visible with the asRaw[Type]Vector
     * operations, which convert between the internal byte-wise
     * representation and the typed lane-wise representation.
     * It is very possible for a platform to have big-endian
     * memory layout and little-endian register layout,
     * so this is a different setting from NATIVE_ENDIAN.
     * In fact, both Intel and ARM use LE conventions here.
     * Future work may be needed for resolutely BE platforms.
     */
    /*package-private*/
    static final ByteOrder REGISTER_ENDIAN = ByteOrder.LITTLE_ENDIAN;

    /*package-private*/
    AbstractVector(Object bits) {
        super(bits);
    }

    // Extractors

    /*package-private*/
    abstract AbstractSpecies<E> vspecies();

    @Override
    @ForceInline
    public final VectorSpecies<E> species() {
        return vspecies();
    }

    // Something to make types match up better:

    @Override
    @ForceInline
    public final
    <F> Vector<F> check(VectorSpecies<F> species) {
        return check0(species);
    }

    @ForceInline
    @SuppressWarnings("unchecked")
    /*package-private*/ final
    <F> AbstractVector<F> check0(VectorSpecies<F> species) {
        if (!sameSpecies(species)) {
            throw AbstractSpecies.checkFailed(this, species);
        }
        return (AbstractVector<F>) this;
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public final
    <F> Vector<F> check(Class<F> elementType) {
        return check0(elementType);
    }

    @ForceInline
    @SuppressWarnings("unchecked")
    /*package-private*/ final
    <F> AbstractVector<F> check0(Class<F> elementType) {
        if (this.elementType() != elementType) {
            throw AbstractSpecies.checkFailed(this, elementType);
        }
        return (AbstractVector<F>) this;
    }

    @ForceInline
    @SuppressWarnings("unchecked")
    /*package-private*/ final
    <F> AbstractVector<F> check(Vector<F> other) {
        if (!sameSpecies(other)) {
            throw AbstractSpecies.checkFailed(this, other);
        }
        return (AbstractVector<F>) this;
    }

    @ForceInline
    private boolean sameSpecies(Vector<?> other) {
        // It's simpler and faster to do a class check.
        boolean same = (this.getClass() == other.getClass());
        // Make sure it works, too!
        assert(same == (this.species() == other.species())) : same;
        return same;
    }

    @ForceInline
    private boolean sameSpecies(VectorSpecies<?> species) {
        // It's simpler and faster to do a class check,
        // even if you have to load a dummy vector.
        AbstractVector<?> other = ((AbstractSpecies<?>)species).dummyVector();
        boolean same = (this.getClass() == other.getClass());
        // Make sure it works, too!
        assert(same == (this.species() == species)) : same;
        return same;
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public final VectorMask<E> maskAll(boolean bit) {
        return species().maskAll(bit);
    }

    // Make myself into a vector of the same shape
    // and same information content but different lane type
    /*package-private*/
    abstract AbstractVector<?> asVectorRaw(LaneType laneType);

    // Make myself into a byte vector of the same shape
    /*package-private*/
    abstract ByteVector asByteVectorRaw();

    /*package-private*/
    @ForceInline
    final AbstractVector<?> asVectorRawTemplate(LaneType laneType) {
        // NOTE: This assumes that convert0('X')
        // respects REGISTER_ENDIAN order.
        return convert0('X', vspecies().withLanes(laneType));
    }

    /*package-private*/
    @ForceInline
    ByteVector asByteVectorRawTemplate() {
        return (ByteVector) asVectorRawTemplate(LaneType.BYTE);
    }

    abstract AbstractMask<E> maskFromArray(boolean[] bits);

    abstract AbstractShuffle<E> iotaShuffle();

    abstract AbstractShuffle<E> iotaShuffle(int start, int step, boolean wrap);

    /*do not alias this byte array*/
    abstract AbstractShuffle<E> shuffleFromBytes(byte[] reorder);

    abstract AbstractShuffle<E> shuffleFromArray(int[] indexes, int i);

    abstract AbstractShuffle<E> shuffleFromOp(IntUnaryOperator fn);

    /*package-private*/
    abstract AbstractVector<E> fromByteArray0(byte[] a, int offset);

    /*package-private*/
    abstract AbstractVector<E> maybeSwap(ByteOrder bo);

    /*package-private*/
    @ForceInline
    VectorShuffle<Byte> swapBytesShuffle() {
        return vspecies().swapBytesShuffle();
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public ShortVector reinterpretAsShorts() {
        return (ShortVector) asVectorRaw(LaneType.SHORT);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public IntVector reinterpretAsInts() {
        return (IntVector) asVectorRaw(LaneType.INT);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public LongVector reinterpretAsLongs() {
        return (LongVector) asVectorRaw(LaneType.LONG);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public FloatVector reinterpretAsFloats() {
        return (FloatVector) asVectorRaw(LaneType.FLOAT);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public DoubleVector reinterpretAsDoubles() {
        return (DoubleVector) asVectorRaw(LaneType.DOUBLE);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public final <F>
    Vector<F> convert(Conversion<E,F> conv, int part) {
        // Shape invariance is simple to implement.
        // It's part of the API because shape invariance
        // is the default mode of operation, and shape
        // shifting operations must advertise themselves.
        ConversionImpl<E,F> c = (ConversionImpl<E,F>) conv;
        @SuppressWarnings("unchecked")
        VectorSpecies<F> rsp = (VectorSpecies<F>)
            vspecies().withLanes(c.range());
        return convertShape(conv, rsp, part);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public final <F>
    Vector<F> castShape(VectorSpecies<F> toSpecies, int part) {
        // This is an odd mix of shape conversion plus
        // lanewise conversions. It seems to be useful
        // sometimes as a shorthand, though maybe we
        // can drop it.
        AbstractSpecies<E> vsp = vspecies();
        AbstractSpecies<F> rsp = (AbstractSpecies<F>) toSpecies;
        @SuppressWarnings("unchecked")
        ConversionImpl<E,F> c = (ConversionImpl<E,F>)
            ConversionImpl.ofCast(vsp.laneType, rsp.laneType);
        return convertShape(c, rsp, part);
    }

    /**
     * {@inheritDoc} <!--workaround-->
     */
    @Override
    @ForceInline
    public abstract <F>
    Vector<F> convertShape(Conversion<E,F> conv, VectorSpecies<F> rsp, int part);

    /**
     * This is the template for Vector::reinterpretShape, to be
     * specialized by each distinct vector class.
     */
    /*package-private*/
    @ForceInline
    final <F>
    AbstractVector<F> reinterpretShapeTemplate(VectorSpecies<F> toSpecies, int part) {
        AbstractSpecies<F> rsp = (AbstractSpecies<F>) toSpecies;
        AbstractSpecies<E> vsp = vspecies();
        if (part == 0) {
            // Works the same for in-place, expand, or contract.
            return convert0('X', rsp);
        } else {
            int origin = shapeChangeOrigin(vsp, rsp, false, part);
            //System.out.println("*** origin = "+origin+", part = "+part+", reinterpret");
            if (part > 0) { // Expansion: slice first then cast.
                return slice(origin, vsp.zero()).convert0('X', rsp);
            } else { // Contraction: cast first then unslice.
                return rsp.zero().slice(rsp.laneCount() - origin,
                                        convert0('X', rsp));
            }
        }
    }

    @Override
    public abstract AbstractVector<E> slice(int origin, Vector<E> v1);

    /**
     * This is the template for Vector::convertShape, to be
     * specialized by each distinct vector class.
     */
    /*package-private*/
    @ForceInline
    final <F>
    AbstractVector<F> convertShapeTemplate(Conversion<E,F> conv, VectorSpecies<F> toSpecies, int part) {
        ConversionImpl<E,F> c = (ConversionImpl<E,F>) conv;
        AbstractSpecies<F> rsp = (AbstractSpecies<F>) toSpecies;
        AbstractSpecies<E> vsp = vspecies();
        char kind = c.kind();
        switch (kind) {
        case 'C': // Regular cast conversion, known to the JIT.
            break;
        case 'I': // Identity conversion => reinterpret.
            assert(c.sizeChangeLog2() == 0);
            kind = 'X';
            break;
        case 'Z': // Lane-wise expansion with zero padding.
            assert(c.sizeChangeLog2() > 0);
            assert(c.range().elementKind == 'I');
            break;
        case 'R': // Lane-wise reinterpret conversion.
            if (c.sizeChangeLog2() != 0) {
                kind = 'Z'; // some goofy stuff here
                break;
            }
            kind = 'X'; // No size change => reinterpret whole vector
            break;
        default:
            throw new AssertionError(c);
        }
        vsp.check(c.domain()); // apply dynamic check to conv
        rsp.check(c.range()); // apply dynamic check to conv
        if (part == 0) {
            // Works the same for in-place, expand, or contract.
            return convert0(kind, rsp);
        } else {
            int origin = shapeChangeOrigin(vsp, rsp, true, part);
            //System.out.println("*** origin = "+origin+", part = "+part+", lanewise");
            if (part > 0) { // Expansion: slice first then cast.
                return slice(origin, vsp.zero()).convert0(kind, rsp);
            } else { // Contraction: cast first then unslice.
                return rsp.zero().slice(rsp.laneCount() - origin,
                                        convert0(kind, rsp));
            }
        }
    }

    /**
     * Check a part number and return it multiplied by the appropriate
     * block factor to yield the origin of the operand block, as a
     * lane number. For expansions the origin is reckoned in the
     * domain vector, since the domain vector has too much information
     * and must be sliced. For contractions the origin is reckoned in
     * the range vector, since the range vector has too many lanes and
     * the result must be unsliced at the same position as the inverse
     * expansion. If the conversion is lanewise, then lane sizes may
     * be changing as well. This affects the logical size of the
     * result, and so the domain size is multiplied or divided by the
     * lane size change.
     */
    /*package-private*/
    @ForceInline
    static
    int shapeChangeOrigin(AbstractSpecies<?> dsp,
                          AbstractSpecies<?> rsp,
                          boolean lanewise,
                          int part) {
        int domSizeLog2 = dsp.vectorShape.vectorBitSizeLog2;
        int phySizeLog2 = rsp.vectorShape.vectorBitSizeLog2;
        int laneChangeLog2 = 0;
        if (lanewise) {
            laneChangeLog2 = (rsp.laneType.elementSizeLog2 -
                              dsp.laneType.elementSizeLog2);
        }
        int resSizeLog2 = domSizeLog2 + laneChangeLog2;
        // resSizeLog2 = 0 => 1-lane vector shrinking to 1-byte lane-size
        // resSizeLog2 < 0 => small vector shrinking by more than a lane-size
        assert(resSizeLog2 >= 0);
        // Expansion ratio: expansionLog2 = resSizeLog2 - phySizeLog2;
        if (!partInRange(resSizeLog2, phySizeLog2, part)) {
            // fall through...
        } else if (resSizeLog2 > phySizeLog2) {
            // Expansion by M means we must slice a block from the domain.
            // What is that block size? It is 1/M of the domain.
            // Let's compute the log2 of that block size, as 's'.
            //s = (dsp.laneCountLog2() - expansionLog2);
            //s = ((domSizeLog2 - dsp.laneType.elementSizeLog2) - expansionLog2);
            //s = (domSizeLog2 - expansionLog2 - dsp.laneType.elementSizeLog2);
            int s = phySizeLog2 - laneChangeLog2 - dsp.laneType.elementSizeLog2;
            // Scale the part number by the input block size, in input lanes.
            if ((s & 31) == s) // sanity check
                return part << s;
        } else {
            // Contraction by M means we must drop a block into the range.
            // What is that block size? It is 1/M of the range.
            // Let's compute the log2 of that block size, as 's'.
            //s = (rsp.laneCountLog2() + expansionLog2);
            //s = ((phySizeLog2 - rsp.laneType.elementSizeLog2) + expansionLog2);
            //s = (phySizeLog2 + expansionLog2 - rsp.laneType.elementSizeLog2);
            int s = resSizeLog2 - rsp.laneType.elementSizeLog2;
            // Scale the part number by the output block size, in output lanes.
            if ((s & 31) == s) // sanity check
                return -part << s;
        }
        throw wrongPart(dsp, rsp, lanewise, part);
    }
|
||||
|
||||
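    // Worked example (added for illustration): for a lane-wise B2S
    // conversion from byte128 to short128, domSizeLog2 = 7 and
    // laneChangeLog2 = +1, so resSizeLog2 = 8 against phySizeLog2 = 7:
    // a 2-way expansion. The block size is s = 7 - 1 - 3 = 3, i.e. 8
    // input byte lanes, so part 1 selects the block at origin 1 << 3 = 8.
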
    @ForceInline
    private static boolean partInRange(int resSizeLog2, int phySizeLog2, int part) {
        // Let's try a branch-free version of this.
        int diff = (resSizeLog2 - phySizeLog2);
        int sign = (diff >> -1);
        //d = Math.abs(diff);
        //d = (sign == 0 ? diff : sign == -1 ? 1 + ~diff);
        int d = (diff ^ sign) - sign;
        assert(d == Math.abs(diff) && d <= 16); // let's not go crazy here
        //p = part * sign;
        int p = (part ^ sign) - sign;
        // z = sign == 0 ? 0<=part<(1<<d), == (part & (-1 << d)) == 0
        // z = sign == -1 ? 0<=-part<(1<<d), == (-part & (-1 << d)) == 0
        boolean z = (p & (-1 << d)) == 0;
        assert(z == partInRangeSlow(resSizeLog2, phySizeLog2, part)) : z;
        return z;
    }

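    // Notes added for illustration: since int shift counts are masked to
    // five bits, (diff >> -1) is just (diff >> 31), the sign of diff.
    // Example: resSizeLog2 = 8, phySizeLog2 = 7, part = 1 gives diff = 1,
    // sign = 0, d = 1, p = 1, and (1 & (-1 << 1)) == 0, so the part is in
    // range; part = 2 gives (2 & ~1) != 0 and is rejected.
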
    private static boolean partInRangeSlow(int resSizeLog2, int phySizeLog2, int part) {
        if (resSizeLog2 > phySizeLog2) { // expansion
            int limit = 1 << (resSizeLog2 - phySizeLog2);
            return part >= 0 && part < limit;
        } else if (resSizeLog2 < phySizeLog2) { // contraction
            int limit = 1 << (phySizeLog2 - resSizeLog2);
            return part > -limit && part <= 0;
        } else {
            return (part == 0);
        }
    }

    private static
    ArrayIndexOutOfBoundsException
    wrongPart(AbstractSpecies<?> dsp,
              AbstractSpecies<?> rsp,
              boolean lanewise,
              int part) {
        String laneChange = "";
        String converting = "converting";
        int dsize = dsp.elementSize(), rsize = rsp.elementSize();
        if (!lanewise) {
            converting = "reinterpreting";
        } else if (dsize < rsize) {
            laneChange = String.format(" (lanes are expanding by %d)",
                                       rsize / dsize);
        } else if (dsize > rsize) {
            laneChange = String.format(" (lanes are contracting by %d)",
                                       dsize / rsize);
        }
        String msg = String.format("bad part number %d %s %s -> %s%s",
                                   part, converting, dsp, rsp, laneChange);
        return new ArrayIndexOutOfBoundsException(msg);
    }

    /*package-private*/
    ArithmeticException divZeroException() {
        throw new ArithmeticException("zero vector lane in dividend "+this);
    }

    /**
     * Helper function for all sorts of byte-wise reinterpretation casts.
     * This function kicks in after intrinsic failure.
     */
    /*package-private*/
    @ForceInline
    final <F>
    AbstractVector<F> defaultReinterpret(AbstractSpecies<F> rsp) {
        int blen = Math.max(this.bitSize(), rsp.vectorBitSize()) / Byte.SIZE;
        ByteOrder bo = ByteOrder.LITTLE_ENDIAN;
        ByteBuffer bb = ByteBuffer.allocate(blen);
        this.intoByteBuffer(bb, 0, bo);
        VectorMask<F> m = rsp.maskAll(true);
        // enum-switches don't optimize properly JDK-8161245
        switch (rsp.laneType.switchKey) {
        case LaneType.SK_BYTE:
            return ByteVector.fromByteBuffer(rsp.check(byte.class), bb, 0, bo, m.check(byte.class)).check0(rsp);
        case LaneType.SK_SHORT:
            return ShortVector.fromByteBuffer(rsp.check(short.class), bb, 0, bo, m.check(short.class)).check0(rsp);
        case LaneType.SK_INT:
            return IntVector.fromByteBuffer(rsp.check(int.class), bb, 0, bo, m.check(int.class)).check0(rsp);
        case LaneType.SK_LONG:
            return LongVector.fromByteBuffer(rsp.check(long.class), bb, 0, bo, m.check(long.class)).check0(rsp);
        case LaneType.SK_FLOAT:
            return FloatVector.fromByteBuffer(rsp.check(float.class), bb, 0, bo, m.check(float.class)).check0(rsp);
        case LaneType.SK_DOUBLE:
            return DoubleVector.fromByteBuffer(rsp.check(double.class), bb, 0, bo, m.check(double.class)).check0(rsp);
        default:
            throw new AssertionError(rsp.toString());
        }
    }

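    // Illustration (added; an assumption about the observable effect):
    // because the buffer order is fixed at LITTLE_ENDIAN, reinterpreting
    // an int lane holding 0x01020304 as bytes yields the lane sequence
    // {0x04, 0x03, 0x02, 0x01}, i.e. least significant byte first.
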
    /**
     * Helper function for all sorts of lane-wise conversions.
     * This function kicks in after intrinsic failure.
     */
    /*package-private*/
    @ForceInline
    final <F>
    AbstractVector<F> defaultCast(AbstractSpecies<F> dsp) {
        int rlength = dsp.laneCount;
        if (vspecies().laneType.elementKind == 'F') {
            // Buffer input values in a double array.
            double[] lanes = toDoubleArray();
            int limit = Math.min(lanes.length, rlength);
            // enum-switches don't optimize properly JDK-8161245
            switch (dsp.laneType.switchKey) {
            case LaneType.SK_BYTE: {
                byte[] a = new byte[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (byte) lanes[i];
                }
                return ByteVector.fromArray(dsp.check(byte.class), a, 0).check0(dsp);
            }
            case LaneType.SK_SHORT: {
                short[] a = new short[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (short) lanes[i];
                }
                return ShortVector.fromArray(dsp.check(short.class), a, 0).check0(dsp);
            }
            case LaneType.SK_INT: {
                int[] a = new int[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (int) lanes[i];
                }
                return IntVector.fromArray(dsp.check(int.class), a, 0).check0(dsp);
            }
            case LaneType.SK_LONG: {
                long[] a = new long[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (long) lanes[i];
                }
                return LongVector.fromArray(dsp.check(long.class), a, 0).check0(dsp);
            }
            case LaneType.SK_FLOAT: {
                float[] a = new float[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (float) lanes[i];
                }
                return FloatVector.fromArray(dsp.check(float.class), a, 0).check0(dsp);
            }
            case LaneType.SK_DOUBLE: {
                double[] a = new double[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (double) lanes[i];
                }
                return DoubleVector.fromArray(dsp.check(double.class), a, 0).check0(dsp);
            }
            default: break;
            }
        } else {
            // Buffer input values in a long array.
            long[] lanes = toLongArray();
            int limit = Math.min(lanes.length, rlength);
            // enum-switches don't optimize properly JDK-8161245
            switch (dsp.laneType.switchKey) {
            case LaneType.SK_BYTE: {
                byte[] a = new byte[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (byte) lanes[i];
                }
                return ByteVector.fromArray(dsp.check(byte.class), a, 0).check0(dsp);
            }
            case LaneType.SK_SHORT: {
                short[] a = new short[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (short) lanes[i];
                }
                return ShortVector.fromArray(dsp.check(short.class), a, 0).check0(dsp);
            }
            case LaneType.SK_INT: {
                int[] a = new int[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (int) lanes[i];
                }
                return IntVector.fromArray(dsp.check(int.class), a, 0).check0(dsp);
            }
            case LaneType.SK_LONG: {
                long[] a = new long[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (long) lanes[i];
                }
                return LongVector.fromArray(dsp.check(long.class), a, 0).check0(dsp);
            }
            case LaneType.SK_FLOAT: {
                float[] a = new float[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (float) lanes[i];
                }
                return FloatVector.fromArray(dsp.check(float.class), a, 0).check0(dsp);
            }
            case LaneType.SK_DOUBLE: {
                double[] a = new double[rlength];
                for (int i = 0; i < limit; i++) {
                    a[i] = (double) lanes[i];
                }
                return DoubleVector.fromArray(dsp.check(double.class), a, 0).check0(dsp);
            }
            default: break;
            }
        }
        throw new AssertionError();
    }

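    // Illustration (added): defaultCast applies Java scalar cast semantics
    // lane by lane, so casting a double lane holding 260.7 to a byte lane
    // yields (byte) 260.7 == 4, and when the target species has more lanes
    // than the source, the extra lanes default to zero.
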
    // Constant-folded access to conversion intrinsics:

    /**
     * Dispatch on conversion kind and target species.
     * The code of this is arranged to fold up if the
     * vector class is constant and the target species
     * is also constant. This is often the case.
     * Residual non-folded code may also perform acceptably
     * in some cases due to type profiling, especially
     * of rvtype. If only one shape is being used,
     * the profiling of rvtype should help speculatively
     * fold the code even when the target species is
     * not a constant.
     */
    /*package-private*/
    @ForceInline
    final <F>
    AbstractVector<F> convert0(char kind, AbstractSpecies<F> rsp) {
        // Derive some JIT-time constants:
        Class<?> etype; // fill in after switch (constant)
        int vlength; // fill in after switch (mark type profile?)
        Class<?> rvtype; // fill in after switch (mark type profile)
        Class<?> rtype;
        int rlength;
        switch (kind) {
        case 'Z': // lane-wise size change, maybe with sign clip
            // Maybe this should be an intrinsic also.
            AbstractSpecies<?> rspi = rsp.asIntegral();
            AbstractVector<?> bitv = resizeLanes0(this, rspi);
            return (rspi == rsp ? bitv.check0(rsp) : bitv.convert0('X', rsp));
        case 'C': // lane-wise cast (but not identity)
            rtype = rsp.elementType();
            rlength = rsp.laneCount();
            etype = this.elementType(); // (profile)
            vlength = this.length(); // (profile)
            rvtype = rsp.dummyVector().getClass(); // (profile)
            return VectorSupport.convert(VectorSupport.VECTOR_OP_CAST,
                    this.getClass(), etype, vlength,
                    rvtype, rtype, rlength,
                    this, rsp,
                    AbstractVector::defaultCast);
        case 'X': // reinterpret cast, not lane-wise if lane sizes differ
            rtype = rsp.elementType();
            rlength = rsp.laneCount();
            etype = this.elementType(); // (profile)
            vlength = this.length(); // (profile)
            rvtype = rsp.dummyVector().getClass(); // (profile)
            return VectorSupport.convert(VectorSupport.VECTOR_OP_REINTERPRET,
                    this.getClass(), etype, vlength,
                    rvtype, rtype, rlength,
                    this, rsp,
                    AbstractVector::defaultReinterpret);
        }
        throw new AssertionError();
    }

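    // Illustration (added; a sketch of the intended JIT behavior): at a
    // call site where the receiver's concrete vector class and the target
    // species are both constants, `kind`, `etype`, `vlength`, `rvtype` and
    // `rlength` fold to constants as well, so the switch above collapses
    // into a single VectorSupport.convert intrinsic; the trailing lambda
    // is only reached if the intrinsic fails.
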
    @ForceInline
    private static <F>
    AbstractVector<F>
    resizeLanes0(AbstractVector<?> v, AbstractSpecies<F> rspi) {
        AbstractSpecies<?> dsp = v.vspecies();
        int sizeChange = rspi.elementSize() - dsp.elementSize();
        AbstractSpecies<?> dspi = dsp.asIntegral();
        if (dspi != dsp) v = v.convert0('R', dspi);
        if (sizeChange <= 0) { // clip in place
            return v.convert0('C', rspi);
        }
        // extend in place, but remove unwanted sign extension
        long mask = -1L >>> sizeChange;
        return (AbstractVector<F>)
            v.convert0('C', rspi)
            .lanewise(AND, rspi.broadcast(mask));
    }

    // Byte buffer wrappers.
    static ByteBuffer wrapper(ByteBuffer bb, ByteOrder bo) {
        return bb.duplicate().order(bo);
    }

    static ByteBuffer wrapper(byte[] a, ByteOrder bo) {
        return ByteBuffer.wrap(a).order(bo);
    }

    static {
        // Recode uses of VectorSupport.reinterpret if this assertion fails:
        assert(REGISTER_ENDIAN == ByteOrder.LITTLE_ENDIAN);
    }
}
@ -0,0 +1,840 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast") // warning: redundant cast
final class Byte128Vector extends ByteVector {
    static final ByteSpecies VSPECIES =
        (ByteSpecies) ByteVector.SPECIES_128;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Byte128Vector> VCLASS = Byte128Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Byte> ETYPE = byte.class; // used by the JVM

    Byte128Vector(byte[] v) {
        super(v);
    }

    // For compatibility as Byte128Vector::new,
    // stored into species.vectorFactory.
    Byte128Vector(Object v) {
        this((byte[]) v);
    }

    static final Byte128Vector ZERO = new Byte128Vector(new byte[VLENGTH]);
    static final Byte128Vector IOTA = new Byte128Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public ByteSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Byte> elementType() { return byte.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Byte.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    byte[] vec() {
        return (byte[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Byte128Vector broadcast(byte e) {
        return (Byte128Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    public final Byte128Vector broadcast(long e) {
        return (Byte128Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    Byte128Mask maskFromArray(boolean[] bits) {
        return new Byte128Mask(bits);
    }

    @Override
    @ForceInline
    Byte128Shuffle iotaShuffle() { return Byte128Shuffle.IOTA; }

    @ForceInline
    Byte128Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Byte128Shuffle)VectorSupport.shuffleIota(ETYPE, Byte128Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Byte128Shuffle)VectorSupport.shuffleIota(ETYPE, Byte128Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

    @Override
    @ForceInline
    Byte128Shuffle shuffleFromBytes(byte[] reorder) { return new Byte128Shuffle(reorder); }

    @Override
    @ForceInline
    Byte128Shuffle shuffleFromArray(int[] indexes, int i) { return new Byte128Shuffle(indexes, i); }

    @Override
    @ForceInline
    Byte128Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Byte128Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Byte128Vector vectorFactory(byte[] vec) {
        return new Byte128Vector(vec);
    }

    @ForceInline
    final @Override
    Byte128Vector asByteVectorRaw() {
        return (Byte128Vector) super.asByteVectorRawTemplate(); // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType); // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Byte128Vector uOp(FUnOp f) {
        return (Byte128Vector) super.uOpTemplate(f); // specialize
    }

    @ForceInline
    final @Override
    Byte128Vector uOp(VectorMask<Byte> m, FUnOp f) {
        return (Byte128Vector)
            super.uOpTemplate((Byte128Mask)m, f); // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Byte128Vector bOp(Vector<Byte> v, FBinOp f) {
        return (Byte128Vector) super.bOpTemplate((Byte128Vector)v, f); // specialize
    }

    @ForceInline
    final @Override
    Byte128Vector bOp(Vector<Byte> v,
                      VectorMask<Byte> m, FBinOp f) {
        return (Byte128Vector)
            super.bOpTemplate((Byte128Vector)v, (Byte128Mask)m,
                              f); // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Byte128Vector tOp(Vector<Byte> v1, Vector<Byte> v2, FTriOp f) {
        return (Byte128Vector)
            super.tOpTemplate((Byte128Vector)v1, (Byte128Vector)v2,
                              f); // specialize
    }

    @ForceInline
    final @Override
    Byte128Vector tOp(Vector<Byte> v1, Vector<Byte> v2,
                      VectorMask<Byte> m, FTriOp f) {
        return (Byte128Vector)
            super.tOpTemplate((Byte128Vector)v1, (Byte128Vector)v2,
                              (Byte128Mask)m, f); // specialize
    }

    @ForceInline
    final @Override
    byte rOp(byte v, FBinOp f) {
        return super.rOpTemplate(v, f); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Byte,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part); // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD,), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Byte128Vector lanewise(Unary op) {
        return (Byte128Vector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector lanewise(Binary op, Vector<Byte> v) {
        return (Byte128Vector) super.lanewiseTemplate(op, v); // specialize
    }

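    // Illustration (added; a sketch, not part of the original source):
    // after inlining, a call such as va.add(vb) on two Byte128Vectors
    // becomes va.lanewise(VectorOperators.ADD, vb), which lands on the
    // specialized override above with a constant opcode and constant
    // Byte128Vector metadata for the JIT intrinsic.
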
    /*package-private*/
    @Override
    @ForceInline Byte128Vector
    lanewiseShift(VectorOperators.Binary op, int e) {
        return (Byte128Vector) super.lanewiseShiftTemplate(op, e); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline
    public final
    Byte128Vector
    lanewise(VectorOperators.Ternary op, Vector<Byte> v1, Vector<Byte> v2) {
        return (Byte128Vector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    Byte128Vector addIndex(int scale) {
        return (Byte128Vector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op,
                                  VectorMask<Byte> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Byte> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Byte> toShuffle() {
        byte[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Byte128Mask test(Test op) {
        return super.testTemplate(Byte128Mask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Byte128Mask compare(Comparison op, Vector<Byte> v) {
        return super.compareTemplate(Byte128Mask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final Byte128Mask compare(Comparison op, byte s) {
        return super.compareTemplate(Byte128Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final Byte128Mask compare(Comparison op, long s) {
        return super.compareTemplate(Byte128Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector blend(Vector<Byte> v, VectorMask<Byte> m) {
        return (Byte128Vector)
            super.blendTemplate(Byte128Mask.class,
                                (Byte128Vector) v,
                                (Byte128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector slice(int origin, Vector<Byte> v) {
        return (Byte128Vector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte128Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

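    // Worked example (added for illustration): with VLENGTH = 16,
    // slice(3) rotates the lanes left by 3 via the wrapped iota shuffle
    // and blends with ZERO on lanes 0..12, producing lanes 3..15 of this
    // vector followed by three zero lanes.
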
    @Override
    @ForceInline
    public Byte128Vector unslice(int origin, Vector<Byte> w, int part) {
        return (Byte128Vector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<Byte> m) {
        return (Byte128Vector)
            super.unsliceTemplate(Byte128Mask.class,
                                  origin, w, part,
                                  (Byte128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte128Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte128Vector rearrange(VectorShuffle<Byte> s) {
        return (Byte128Vector)
            super.rearrangeTemplate(Byte128Shuffle.class,
                                    (Byte128Shuffle) s); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector rearrange(VectorShuffle<Byte> shuffle,
                                   VectorMask<Byte> m) {
        return (Byte128Vector)
            super.rearrangeTemplate(Byte128Shuffle.class,
                                    (Byte128Shuffle) shuffle,
                                    (Byte128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector rearrange(VectorShuffle<Byte> s,
                                   Vector<Byte> v) {
        return (Byte128Vector)
            super.rearrangeTemplate(Byte128Shuffle.class,
                                    (Byte128Shuffle) s,
                                    (Byte128Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector selectFrom(Vector<Byte> v) {
        return (Byte128Vector)
            super.selectFromTemplate((Byte128Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte128Vector selectFrom(Vector<Byte> v,
                                    VectorMask<Byte> m) {
        return (Byte128Vector)
            super.selectFromTemplate((Byte128Vector) v,
                                     (Byte128Mask) m); // specialize
    }


    @ForceInline
    @Override
    public byte lane(int i) {
        switch(i) {
            case 0: return laneHelper(0);
            case 1: return laneHelper(1);
            case 2: return laneHelper(2);
            case 3: return laneHelper(3);
            case 4: return laneHelper(4);
            case 5: return laneHelper(5);
            case 6: return laneHelper(6);
            case 7: return laneHelper(7);
            case 8: return laneHelper(8);
            case 9: return laneHelper(9);
            case 10: return laneHelper(10);
            case 11: return laneHelper(11);
            case 12: return laneHelper(12);
            case 13: return laneHelper(13);
            case 14: return laneHelper(14);
            case 15: return laneHelper(15);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public byte laneHelper(int i) {
        return (byte) VectorSupport.extract(
                      VCLASS, ETYPE, VLENGTH,
                      this, i,
                      (vec, ix) -> {
                          byte[] vecarr = vec.vec();
                          return (long)vecarr[ix];
                      });
    }

    @ForceInline
    @Override
    public Byte128Vector withLane(int i, byte e) {
        switch (i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            case 2: return withLaneHelper(2, e);
            case 3: return withLaneHelper(3, e);
            case 4: return withLaneHelper(4, e);
            case 5: return withLaneHelper(5, e);
            case 6: return withLaneHelper(6, e);
            case 7: return withLaneHelper(7, e);
            case 8: return withLaneHelper(8, e);
            case 9: return withLaneHelper(9, e);
            case 10: return withLaneHelper(10, e);
            case 11: return withLaneHelper(11, e);
            case 12: return withLaneHelper(12, e);
            case 13: return withLaneHelper(13, e);
            case 14: return withLaneHelper(14, e);
            case 15: return withLaneHelper(15, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Byte128Vector withLaneHelper(int i, byte e) {
        return VectorSupport.insert(
                             VCLASS, ETYPE, VLENGTH,
                             this, i, (long)e,
                             (v, ix, bits) -> {
                                 byte[] res = v.vec().clone();
                                 res[ix] = (byte)bits;
                                 return v.vectorFactory(res);
                             });
    }

    // Mask

    static final class Byte128Mask extends AbstractMask<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte128Mask(boolean[] bits) {
            this(bits, 0);
        }

        Byte128Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Byte128Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public ByteSpecies vspecies() {
            // ISSUE: This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Byte128Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Byte128Mask(res);
        }

        @Override
        Byte128Mask bOp(VectorMask<Byte> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Byte128Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Byte128Mask(res);
        }

        @ForceInline
        @Override
        public final
        Byte128Vector toVector() {
            return (Byte128Vector) super.toVectorTemplate(); // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte128Vector.Byte128Mask(maskArray).check(species);
            case LaneType.SK_SHORT:
                return new Short128Vector.Short128Mask(maskArray).check(species);
            case LaneType.SK_INT:
                return new Int128Vector.Int128Mask(maskArray).check(species);
            case LaneType.SK_LONG:
                return new Long128Vector.Long128Mask(maskArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float128Vector.Float128Mask(maskArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double128Vector.Double128Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Byte128Mask not() {
            return xor(maskAll(true));
        }
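        // Note added for illustration: there is no direct intrinsic for
        // not() here, so it is expressed as xor(maskAll(true)); lane-wise,
        // !a == (a ^ true).
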
        // Binary operations

        @Override
        @ForceInline
        public Byte128Mask and(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte128Mask m = (Byte128Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Byte128Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Byte128Mask or(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte128Mask m = (Byte128Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Byte128Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Byte128Mask xor(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte128Mask m = (Byte128Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Byte128Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Byte128Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Byte128Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Byte128Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Byte128Mask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static Byte128Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Byte128Mask.class, byte.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Byte128Mask TRUE_MASK = new Byte128Mask(true);
        private static final Byte128Mask FALSE_MASK = new Byte128Mask(false);

    }

    // Shuffle

    static final class Byte128Shuffle extends AbstractShuffle<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte128Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte128Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte128Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Byte128Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public ByteSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Byte128Shuffle IOTA = new Byte128Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Byte128Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Byte128Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Byte128Vector)(((AbstractShuffle<Byte>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte128Vector.Byte128Shuffle(shuffleArray).check(species);
            case LaneType.SK_SHORT:
                return new Short128Vector.Short128Shuffle(shuffleArray).check(species);
            case LaneType.SK_INT:
                return new Int128Vector.Int128Shuffle(shuffleArray).check(species);
            case LaneType.SK_LONG:
                return new Long128Vector.Long128Shuffle(shuffleArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float128Vector.Float128Shuffle(shuffleArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double128Vector.Double128Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Byte128Shuffle rearrange(VectorShuffle<Byte> shuffle) {
            Byte128Shuffle s = (Byte128Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi]; // throws on exceptional index
            }
            return new Byte128Shuffle(r);
        }
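        // Worked example (added for illustration): rearrange composes
        // shuffles as r[i] = this[shuffle[i]]; e.g. if this = {2,3,...}
        // and shuffle = {1,0,...}, the result starts {3,2,...}.
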
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    ByteVector fromArray0(byte[] a, int offset) {
        return super.fromArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(byte[] a, int offset) {
        super.intoArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset); // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
@ -0,0 +1,872 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast") // warning: redundant cast
final class Byte256Vector extends ByteVector {
    static final ByteSpecies VSPECIES =
        (ByteSpecies) ByteVector.SPECIES_256;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Byte256Vector> VCLASS = Byte256Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Byte> ETYPE = byte.class; // used by the JVM

    Byte256Vector(byte[] v) {
        super(v);
    }

    // For compatibility as Byte256Vector::new,
    // stored into species.vectorFactory.
    Byte256Vector(Object v) {
        this((byte[]) v);
    }

    static final Byte256Vector ZERO = new Byte256Vector(new byte[VLENGTH]);
    static final Byte256Vector IOTA = new Byte256Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public ByteSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Byte> elementType() { return byte.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Byte.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    byte[] vec() {
        return (byte[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Byte256Vector broadcast(byte e) {
        return (Byte256Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    public final Byte256Vector broadcast(long e) {
        return (Byte256Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    Byte256Mask maskFromArray(boolean[] bits) {
        return new Byte256Mask(bits);
    }

    @Override
    @ForceInline
    Byte256Shuffle iotaShuffle() { return Byte256Shuffle.IOTA; }

    @ForceInline
    Byte256Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Byte256Shuffle)VectorSupport.shuffleIota(ETYPE, Byte256Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Byte256Shuffle)VectorSupport.shuffleIota(ETYPE, Byte256Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

    @Override
    @ForceInline
    Byte256Shuffle shuffleFromBytes(byte[] reorder) { return new Byte256Shuffle(reorder); }

    @Override
    @ForceInline
    Byte256Shuffle shuffleFromArray(int[] indexes, int i) { return new Byte256Shuffle(indexes, i); }

    @Override
    @ForceInline
    Byte256Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Byte256Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Byte256Vector vectorFactory(byte[] vec) {
        return new Byte256Vector(vec);
    }

    @ForceInline
    final @Override
    Byte256Vector asByteVectorRaw() {
        return (Byte256Vector) super.asByteVectorRawTemplate(); // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType); // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Byte256Vector uOp(FUnOp f) {
        return (Byte256Vector) super.uOpTemplate(f); // specialize
    }

    @ForceInline
    final @Override
    Byte256Vector uOp(VectorMask<Byte> m, FUnOp f) {
        return (Byte256Vector)
            super.uOpTemplate((Byte256Mask)m, f); // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Byte256Vector bOp(Vector<Byte> v, FBinOp f) {
        return (Byte256Vector) super.bOpTemplate((Byte256Vector)v, f); // specialize
    }

    @ForceInline
    final @Override
    Byte256Vector bOp(Vector<Byte> v,
                      VectorMask<Byte> m, FBinOp f) {
        return (Byte256Vector)
            super.bOpTemplate((Byte256Vector)v, (Byte256Mask)m,
                              f); // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Byte256Vector tOp(Vector<Byte> v1, Vector<Byte> v2, FTriOp f) {
        return (Byte256Vector)
            super.tOpTemplate((Byte256Vector)v1, (Byte256Vector)v2,
                              f); // specialize
    }

    @ForceInline
    final @Override
    Byte256Vector tOp(Vector<Byte> v1, Vector<Byte> v2,
                      VectorMask<Byte> m, FTriOp f) {
        return (Byte256Vector)
            super.tOpTemplate((Byte256Vector)v1, (Byte256Vector)v2,
                              (Byte256Mask)m, f); // specialize
    }

    @ForceInline
    final @Override
    byte rOp(byte v, FBinOp f) {
        return super.rOpTemplate(v, f); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Byte,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part); // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD,), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Byte256Vector lanewise(Unary op) {
        return (Byte256Vector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector lanewise(Binary op, Vector<Byte> v) {
        return (Byte256Vector) super.lanewiseTemplate(op, v); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline Byte256Vector
    lanewiseShift(VectorOperators.Binary op, int e) {
        return (Byte256Vector) super.lanewiseShiftTemplate(op, e); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline
    public final
    Byte256Vector
    lanewise(VectorOperators.Ternary op, Vector<Byte> v1, Vector<Byte> v2) {
        return (Byte256Vector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    Byte256Vector addIndex(int scale) {
        return (Byte256Vector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op,
                                  VectorMask<Byte> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Byte> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Byte> toShuffle() {
        byte[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Byte256Mask test(Test op) {
        return super.testTemplate(Byte256Mask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Byte256Mask compare(Comparison op, Vector<Byte> v) {
        return super.compareTemplate(Byte256Mask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final Byte256Mask compare(Comparison op, byte s) {
        return super.compareTemplate(Byte256Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final Byte256Mask compare(Comparison op, long s) {
        return super.compareTemplate(Byte256Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector blend(Vector<Byte> v, VectorMask<Byte> m) {
        return (Byte256Vector)
            super.blendTemplate(Byte256Mask.class,
                                (Byte256Vector) v,
                                (Byte256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector slice(int origin, Vector<Byte> v) {
        return (Byte256Vector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte256Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte256Vector unslice(int origin, Vector<Byte> w, int part) {
        return (Byte256Vector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<Byte> m) {
        return (Byte256Vector)
            super.unsliceTemplate(Byte256Mask.class,
                                  origin, w, part,
                                  (Byte256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte256Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte256Vector rearrange(VectorShuffle<Byte> s) {
        return (Byte256Vector)
            super.rearrangeTemplate(Byte256Shuffle.class,
                                    (Byte256Shuffle) s); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector rearrange(VectorShuffle<Byte> shuffle,
                                   VectorMask<Byte> m) {
        return (Byte256Vector)
            super.rearrangeTemplate(Byte256Shuffle.class,
                                    (Byte256Shuffle) shuffle,
                                    (Byte256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector rearrange(VectorShuffle<Byte> s,
                                   Vector<Byte> v) {
        return (Byte256Vector)
            super.rearrangeTemplate(Byte256Shuffle.class,
                                    (Byte256Shuffle) s,
                                    (Byte256Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector selectFrom(Vector<Byte> v) {
        return (Byte256Vector)
            super.selectFromTemplate((Byte256Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte256Vector selectFrom(Vector<Byte> v,
                                    VectorMask<Byte> m) {
        return (Byte256Vector)
            super.selectFromTemplate((Byte256Vector) v,
                                     (Byte256Mask) m); // specialize
    }


    @ForceInline
    @Override
    public byte lane(int i) {
        switch(i) {
            case 0: return laneHelper(0);
            case 1: return laneHelper(1);
            case 2: return laneHelper(2);
            case 3: return laneHelper(3);
            case 4: return laneHelper(4);
            case 5: return laneHelper(5);
            case 6: return laneHelper(6);
            case 7: return laneHelper(7);
            case 8: return laneHelper(8);
            case 9: return laneHelper(9);
            case 10: return laneHelper(10);
            case 11: return laneHelper(11);
            case 12: return laneHelper(12);
            case 13: return laneHelper(13);
            case 14: return laneHelper(14);
            case 15: return laneHelper(15);
            case 16: return laneHelper(16);
            case 17: return laneHelper(17);
            case 18: return laneHelper(18);
            case 19: return laneHelper(19);
            case 20: return laneHelper(20);
            case 21: return laneHelper(21);
            case 22: return laneHelper(22);
            case 23: return laneHelper(23);
            case 24: return laneHelper(24);
            case 25: return laneHelper(25);
            case 26: return laneHelper(26);
            case 27: return laneHelper(27);
            case 28: return laneHelper(28);
            case 29: return laneHelper(29);
            case 30: return laneHelper(30);
            case 31: return laneHelper(31);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public byte laneHelper(int i) {
        return (byte) VectorSupport.extract(
                      VCLASS, ETYPE, VLENGTH,
                      this, i,
                      (vec, ix) -> {
                          byte[] vecarr = vec.vec();
                          return (long)vecarr[ix];
                      });
    }

    @ForceInline
    @Override
    public Byte256Vector withLane(int i, byte e) {
        switch (i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            case 2: return withLaneHelper(2, e);
            case 3: return withLaneHelper(3, e);
            case 4: return withLaneHelper(4, e);
            case 5: return withLaneHelper(5, e);
            case 6: return withLaneHelper(6, e);
            case 7: return withLaneHelper(7, e);
            case 8: return withLaneHelper(8, e);
            case 9: return withLaneHelper(9, e);
            case 10: return withLaneHelper(10, e);
            case 11: return withLaneHelper(11, e);
            case 12: return withLaneHelper(12, e);
            case 13: return withLaneHelper(13, e);
            case 14: return withLaneHelper(14, e);
            case 15: return withLaneHelper(15, e);
            case 16: return withLaneHelper(16, e);
            case 17: return withLaneHelper(17, e);
            case 18: return withLaneHelper(18, e);
            case 19: return withLaneHelper(19, e);
            case 20: return withLaneHelper(20, e);
            case 21: return withLaneHelper(21, e);
            case 22: return withLaneHelper(22, e);
            case 23: return withLaneHelper(23, e);
            case 24: return withLaneHelper(24, e);
            case 25: return withLaneHelper(25, e);
            case 26: return withLaneHelper(26, e);
            case 27: return withLaneHelper(27, e);
            case 28: return withLaneHelper(28, e);
            case 29: return withLaneHelper(29, e);
            case 30: return withLaneHelper(30, e);
            case 31: return withLaneHelper(31, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Byte256Vector withLaneHelper(int i, byte e) {
        return VectorSupport.insert(
                             VCLASS, ETYPE, VLENGTH,
                             this, i, (long)e,
                             (v, ix, bits) -> {
                                 byte[] res = v.vec().clone();
                                 res[ix] = (byte)bits;
                                 return v.vectorFactory(res);
                             });
    }

    // Mask

    static final class Byte256Mask extends AbstractMask<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte256Mask(boolean[] bits) {
            this(bits, 0);
        }

        Byte256Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Byte256Mask(boolean val) {
|
||||
super(prepare(val));
|
||||
}
|
||||
|
||||
private static boolean[] prepare(boolean[] bits, int offset) {
|
||||
boolean[] newBits = new boolean[VSPECIES.laneCount()];
|
||||
for (int i = 0; i < newBits.length; i++) {
|
||||
newBits[i] = bits[offset + i];
|
||||
}
|
||||
return newBits;
|
||||
}
|
||||
|
||||
private static boolean[] prepare(boolean val) {
|
||||
boolean[] bits = new boolean[VSPECIES.laneCount()];
|
||||
Arrays.fill(bits, val);
|
||||
return bits;
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
public ByteSpecies vspecies() {
|
||||
// ISSUE: This should probably be a @Stable
|
||||
// field inside AbstractMask, rather than
|
||||
// a megamorphic method.
|
||||
return VSPECIES;
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
boolean[] getBits() {
|
||||
return (boolean[])getPayload();
|
||||
}
|
||||
|
||||
@Override
|
||||
Byte256Mask uOp(MUnOp f) {
|
||||
boolean[] res = new boolean[vspecies().laneCount()];
|
||||
boolean[] bits = getBits();
|
||||
for (int i = 0; i < res.length; i++) {
|
||||
res[i] = f.apply(i, bits[i]);
|
||||
}
|
||||
return new Byte256Mask(res);
|
||||
}
|
||||
|
||||
@Override
|
||||
Byte256Mask bOp(VectorMask<Byte> m, MBinOp f) {
|
||||
boolean[] res = new boolean[vspecies().laneCount()];
|
||||
boolean[] bits = getBits();
|
||||
boolean[] mbits = ((Byte256Mask)m).getBits();
|
||||
for (int i = 0; i < res.length; i++) {
|
||||
res[i] = f.apply(i, bits[i], mbits[i]);
|
||||
}
|
||||
return new Byte256Mask(res);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final
|
||||
Byte256Vector toVector() {
|
||||
return (Byte256Vector) super.toVectorTemplate(); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public <E> VectorMask<E> cast(VectorSpecies<E> s) {
|
||||
AbstractSpecies<E> species = (AbstractSpecies<E>) s;
|
||||
if (length() != species.laneCount())
|
||||
throw new IllegalArgumentException("VectorMask length and species length differ");
|
||||
boolean[] maskArray = toArray();
|
||||
// enum-switches don't optimize properly JDK-8161245
|
||||
switch (species.laneType.switchKey) {
|
||||
case LaneType.SK_BYTE:
|
||||
return new Byte256Vector.Byte256Mask(maskArray).check(species);
|
||||
case LaneType.SK_SHORT:
|
||||
return new Short256Vector.Short256Mask(maskArray).check(species);
|
||||
case LaneType.SK_INT:
|
||||
return new Int256Vector.Int256Mask(maskArray).check(species);
|
||||
case LaneType.SK_LONG:
|
||||
return new Long256Vector.Long256Mask(maskArray).check(species);
|
||||
case LaneType.SK_FLOAT:
|
||||
return new Float256Vector.Float256Mask(maskArray).check(species);
|
||||
case LaneType.SK_DOUBLE:
|
||||
return new Double256Vector.Double256Mask(maskArray).check(species);
|
||||
}
|
||||
|
||||
// Should not reach here.
|
||||
throw new AssertionError(species);
|
||||
}
|
||||
|
||||
// Unary operations
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Byte256Mask not() {
|
||||
return xor(maskAll(true));
|
||||
}
|
||||
|
||||
// Binary operations
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Byte256Mask and(VectorMask<Byte> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Byte256Mask m = (Byte256Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_AND, Byte256Mask.class, byte.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Byte256Mask or(VectorMask<Byte> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Byte256Mask m = (Byte256Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_OR, Byte256Mask.class, byte.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
/* package-private */
|
||||
Byte256Mask xor(VectorMask<Byte> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Byte256Mask m = (Byte256Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_XOR, Byte256Mask.class, byte.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
|
||||
}
|
||||
|
||||
// Reductions
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public boolean anyTrue() {
|
||||
return VectorSupport.test(BT_ne, Byte256Mask.class, byte.class, VLENGTH,
|
||||
this, vspecies().maskAll(true),
|
||||
(m, __) -> anyTrueHelper(((Byte256Mask)m).getBits()));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public boolean allTrue() {
|
||||
return VectorSupport.test(BT_overflow, Byte256Mask.class, byte.class, VLENGTH,
|
||||
this, vspecies().maskAll(true),
|
||||
(m, __) -> allTrueHelper(((Byte256Mask)m).getBits()));
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
/*package-private*/
|
||||
static Byte256Mask maskAll(boolean bit) {
|
||||
return VectorSupport.broadcastCoerced(Byte256Mask.class, byte.class, VLENGTH,
|
||||
(bit ? -1 : 0), null,
|
||||
(v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
|
||||
}
|
||||
private static final Byte256Mask TRUE_MASK = new Byte256Mask(true);
|
||||
private static final Byte256Mask FALSE_MASK = new Byte256Mask(false);
|
||||
|
||||
}
|
||||
|
||||
// Shuffle
|
||||
|
||||
static final class Byte256Shuffle extends AbstractShuffle<Byte> {
|
||||
static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
|
||||
static final Class<Byte> ETYPE = byte.class; // used by the JVM
|
||||
|
||||
Byte256Shuffle(byte[] reorder) {
|
||||
super(VLENGTH, reorder);
|
||||
}
|
||||
|
||||
public Byte256Shuffle(int[] reorder) {
|
||||
super(VLENGTH, reorder);
|
||||
}
|
||||
|
||||
public Byte256Shuffle(int[] reorder, int i) {
|
||||
super(VLENGTH, reorder, i);
|
||||
}
|
||||
|
||||
public Byte256Shuffle(IntUnaryOperator fn) {
|
||||
super(VLENGTH, fn);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteSpecies vspecies() {
|
||||
return VSPECIES;
|
||||
}
|
||||
|
||||
static {
|
||||
// There must be enough bits in the shuffle lanes to encode
|
||||
// VLENGTH valid indexes and VLENGTH exceptional ones.
|
||||
assert(VLENGTH < Byte.MAX_VALUE);
|
||||
assert(Byte.MIN_VALUE <= -VLENGTH);
|
||||
}
|
||||
static final Byte256Shuffle IOTA = new Byte256Shuffle(IDENTITY);
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Byte256Vector toVector() {
|
||||
return VectorSupport.shuffleToVector(VCLASS, ETYPE, Byte256Shuffle.class, this, VLENGTH,
|
||||
(s) -> ((Byte256Vector)(((AbstractShuffle<Byte>)(s)).toVectorTemplate())));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
|
||||
AbstractSpecies<F> species = (AbstractSpecies<F>) s;
|
||||
if (length() != species.laneCount())
|
||||
throw new IllegalArgumentException("VectorShuffle length and species length differ");
|
||||
int[] shuffleArray = toArray();
|
||||
// enum-switches don't optimize properly JDK-8161245
|
||||
switch (species.laneType.switchKey) {
|
||||
case LaneType.SK_BYTE:
|
||||
return new Byte256Vector.Byte256Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_SHORT:
|
||||
return new Short256Vector.Short256Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_INT:
|
||||
return new Int256Vector.Int256Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_LONG:
|
||||
return new Long256Vector.Long256Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_FLOAT:
|
||||
return new Float256Vector.Float256Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_DOUBLE:
|
||||
return new Double256Vector.Double256Shuffle(shuffleArray).check(species);
|
||||
}
|
||||
|
||||
// Should not reach here.
|
||||
throw new AssertionError(species);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public Byte256Shuffle rearrange(VectorShuffle<Byte> shuffle) {
|
||||
Byte256Shuffle s = (Byte256Shuffle) shuffle;
|
||||
byte[] reorder1 = reorder();
|
||||
byte[] reorder2 = s.reorder();
|
||||
byte[] r = new byte[reorder1.length];
|
||||
for (int i = 0; i < reorder1.length; i++) {
|
||||
int ssi = reorder2[i];
|
||||
r[i] = reorder1[ssi]; // throws on exceptional index
|
||||
}
|
||||
return new Byte256Shuffle(r);
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================
|
||||
|
||||
// Specialized low-level memory operations.
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset) {
|
||||
return super.fromArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromByteArray0(byte[] a, int offset) {
|
||||
return super.fromByteArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
|
||||
return super.fromByteBuffer0Template(bb, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
void intoArray0(byte[] a, int offset) {
|
||||
super.intoArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
void intoByteArray0(byte[] a, int offset) {
|
||||
super.intoByteArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
// End of specialized low-level memory operations.
|
||||
|
||||
// ================================================
|
||||
|
||||
}
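
The generated slice and unslice bodies above all follow one pattern: build an iota shuffle, compare it against the origin to form a blend mask, then rearrange and blend with ZERO. A minimal user-level sketch of the same behavior (illustration only, not part of this commit; the class and variable names are hypothetical, and running it requires --add-modules jdk.incubator.vector):

    import jdk.incubator.vector.ByteVector;
    import jdk.incubator.vector.VectorSpecies;

    public class SliceDemo {
        static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_256;

        public static void main(String[] args) {
            byte[] data = new byte[SPECIES.length()];
            for (int i = 0; i < data.length; i++) data[i] = (byte) i;
            ByteVector v = ByteVector.fromArray(SPECIES, data, 0);
            // slice(3) shifts lanes down by three and zero-fills the tail,
            // matching the iota/compare/blend sequence in the generated code.
            System.out.println(v.slice(3));  // lanes: 3, 4, ..., 31, 0, 0, 0
        }
    }
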
@ -0,0 +1,936 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast")  // warning: redundant cast
final class Byte512Vector extends ByteVector {
    static final ByteSpecies VSPECIES =
        (ByteSpecies) ByteVector.SPECIES_512;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Byte512Vector> VCLASS = Byte512Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Byte> ETYPE = byte.class; // used by the JVM

    Byte512Vector(byte[] v) {
        super(v);
    }

    // For compatibility as Byte512Vector::new,
    // stored into species.vectorFactory.
    Byte512Vector(Object v) {
        this((byte[]) v);
    }

    static final Byte512Vector ZERO = new Byte512Vector(new byte[VLENGTH]);
    static final Byte512Vector IOTA = new Byte512Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public ByteSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Byte> elementType() { return byte.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Byte.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    byte[] vec() {
        return (byte[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Byte512Vector broadcast(byte e) {
        return (Byte512Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    public final Byte512Vector broadcast(long e) {
        return (Byte512Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    Byte512Mask maskFromArray(boolean[] bits) {
        return new Byte512Mask(bits);
    }

    @Override
    @ForceInline
    Byte512Shuffle iotaShuffle() { return Byte512Shuffle.IOTA; }

    @ForceInline
    Byte512Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Byte512Shuffle)VectorSupport.shuffleIota(ETYPE, Byte512Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Byte512Shuffle)VectorSupport.shuffleIota(ETYPE, Byte512Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

    @Override
    @ForceInline
    Byte512Shuffle shuffleFromBytes(byte[] reorder) { return new Byte512Shuffle(reorder); }

    @Override
    @ForceInline
    Byte512Shuffle shuffleFromArray(int[] indexes, int i) { return new Byte512Shuffle(indexes, i); }

    @Override
    @ForceInline
    Byte512Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Byte512Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Byte512Vector vectorFactory(byte[] vec) {
        return new Byte512Vector(vec);
    }

    @ForceInline
    final @Override
    Byte512Vector asByteVectorRaw() {
        return (Byte512Vector) super.asByteVectorRawTemplate();  // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType);  // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Byte512Vector uOp(FUnOp f) {
        return (Byte512Vector) super.uOpTemplate(f);  // specialize
    }

    @ForceInline
    final @Override
    Byte512Vector uOp(VectorMask<Byte> m, FUnOp f) {
        return (Byte512Vector)
            super.uOpTemplate((Byte512Mask)m, f);  // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Byte512Vector bOp(Vector<Byte> v, FBinOp f) {
        return (Byte512Vector) super.bOpTemplate((Byte512Vector)v, f);  // specialize
    }

    @ForceInline
    final @Override
    Byte512Vector bOp(Vector<Byte> v,
                      VectorMask<Byte> m, FBinOp f) {
        return (Byte512Vector)
            super.bOpTemplate((Byte512Vector)v, (Byte512Mask)m,
                              f);  // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Byte512Vector tOp(Vector<Byte> v1, Vector<Byte> v2, FTriOp f) {
        return (Byte512Vector)
            super.tOpTemplate((Byte512Vector)v1, (Byte512Vector)v2,
                              f);  // specialize
    }

    @ForceInline
    final @Override
    Byte512Vector tOp(Vector<Byte> v1, Vector<Byte> v2,
                      VectorMask<Byte> m, FTriOp f) {
        return (Byte512Vector)
            super.tOpTemplate((Byte512Vector)v1, (Byte512Vector)v2,
                              (Byte512Mask)m, f);  // specialize
    }

    @ForceInline
    final @Override
    byte rOp(byte v, FBinOp f) {
        return super.rOpTemplate(v, f);  // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Byte,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part);  // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part);  // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD,), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Byte512Vector lanewise(Unary op) {
        return (Byte512Vector) super.lanewiseTemplate(op);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector lanewise(Binary op, Vector<Byte> v) {
        return (Byte512Vector) super.lanewiseTemplate(op, v);  // specialize
    }
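
    // Illustration only, not part of the generated file: given the v-table
    // note above, a user-level call such as
    //
    //     ByteVector r = a.lanewise(VectorOperators.ADD, b);  // what a.add(b) inlines to
    //
    // binds to the override just above, so the JIT intrinsic sees the ADD
    // opcode together with this concrete class's metadata.  (The names a,
    // b, and r are hypothetical.)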

    /*package-private*/
    @Override
    @ForceInline Byte512Vector
    lanewiseShift(VectorOperators.Binary op, int e) {
        return (Byte512Vector) super.lanewiseShiftTemplate(op, e);  // specialize
    }

    /*package-private*/
    @Override
    @ForceInline
    public final
    Byte512Vector
    lanewise(VectorOperators.Ternary op, Vector<Byte> v1, Vector<Byte> v2) {
        return (Byte512Vector) super.lanewiseTemplate(op, v1, v2);  // specialize
    }

    @Override
    @ForceInline
    public final
    Byte512Vector addIndex(int scale) {
        return (Byte512Vector) super.addIndexTemplate(scale);  // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op);  // specialized
    }

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op,
                                  VectorMask<Byte> m) {
        return super.reduceLanesTemplate(op, m);  // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op);  // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Byte> m) {
        return (long) super.reduceLanesTemplate(op, m);  // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Byte> toShuffle() {
        byte[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }
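
    // Illustration only, not part of the generated file: toShuffle() widens
    // the byte lanes into an int[] and rebuilds through
    // VectorShuffle.fromArray, so lane values are reinterpreted as shuffle
    // indexes.  For example (hypothetical values), a vector whose lanes run
    // 1, 0, 3, 2, 5, 4, ... becomes a shuffle that swaps adjacent lanes
    // when fed to rearrange.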

    // Specialized unary testing

    @Override
    @ForceInline
    public final Byte512Mask test(Test op) {
        return super.testTemplate(Byte512Mask.class, op);  // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Byte512Mask compare(Comparison op, Vector<Byte> v) {
        return super.compareTemplate(Byte512Mask.class, op, v);  // specialize
    }

    @Override
    @ForceInline
    public final Byte512Mask compare(Comparison op, byte s) {
        return super.compareTemplate(Byte512Mask.class, op, s);  // specialize
    }

    @Override
    @ForceInline
    public final Byte512Mask compare(Comparison op, long s) {
        return super.compareTemplate(Byte512Mask.class, op, s);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector blend(Vector<Byte> v, VectorMask<Byte> m) {
        return (Byte512Vector)
            super.blendTemplate(Byte512Mask.class,
                                (Byte512Vector) v,
                                (Byte512Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector slice(int origin, Vector<Byte> v) {
        return (Byte512Vector) super.sliceTemplate(origin, v);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte512Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte512Vector unslice(int origin, Vector<Byte> w, int part) {
        return (Byte512Vector) super.unsliceTemplate(origin, w, part);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<Byte> m) {
        return (Byte512Vector)
            super.unsliceTemplate(Byte512Mask.class,
                                  origin, w, part,
                                  (Byte512Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte512Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte512Vector rearrange(VectorShuffle<Byte> s) {
        return (Byte512Vector)
            super.rearrangeTemplate(Byte512Shuffle.class,
                                    (Byte512Shuffle) s);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector rearrange(VectorShuffle<Byte> shuffle,
                                   VectorMask<Byte> m) {
        return (Byte512Vector)
            super.rearrangeTemplate(Byte512Shuffle.class,
                                    (Byte512Shuffle) shuffle,
                                    (Byte512Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector rearrange(VectorShuffle<Byte> s,
                                   Vector<Byte> v) {
        return (Byte512Vector)
            super.rearrangeTemplate(Byte512Shuffle.class,
                                    (Byte512Shuffle) s,
                                    (Byte512Vector) v);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector selectFrom(Vector<Byte> v) {
        return (Byte512Vector)
            super.selectFromTemplate((Byte512Vector) v);  // specialize
    }

    @Override
    @ForceInline
    public Byte512Vector selectFrom(Vector<Byte> v,
                                    VectorMask<Byte> m) {
        return (Byte512Vector)
            super.selectFromTemplate((Byte512Vector) v,
                                     (Byte512Mask) m);  // specialize
    }


    @ForceInline
    @Override
    public byte lane(int i) {
        switch(i) {
            case 0: return laneHelper(0);
            case 1: return laneHelper(1);
            case 2: return laneHelper(2);
            case 3: return laneHelper(3);
            case 4: return laneHelper(4);
            case 5: return laneHelper(5);
            case 6: return laneHelper(6);
            case 7: return laneHelper(7);
            case 8: return laneHelper(8);
            case 9: return laneHelper(9);
            case 10: return laneHelper(10);
            case 11: return laneHelper(11);
            case 12: return laneHelper(12);
            case 13: return laneHelper(13);
            case 14: return laneHelper(14);
            case 15: return laneHelper(15);
            case 16: return laneHelper(16);
            case 17: return laneHelper(17);
            case 18: return laneHelper(18);
            case 19: return laneHelper(19);
            case 20: return laneHelper(20);
            case 21: return laneHelper(21);
            case 22: return laneHelper(22);
            case 23: return laneHelper(23);
            case 24: return laneHelper(24);
            case 25: return laneHelper(25);
            case 26: return laneHelper(26);
            case 27: return laneHelper(27);
            case 28: return laneHelper(28);
            case 29: return laneHelper(29);
            case 30: return laneHelper(30);
            case 31: return laneHelper(31);
            case 32: return laneHelper(32);
            case 33: return laneHelper(33);
            case 34: return laneHelper(34);
            case 35: return laneHelper(35);
            case 36: return laneHelper(36);
            case 37: return laneHelper(37);
            case 38: return laneHelper(38);
            case 39: return laneHelper(39);
            case 40: return laneHelper(40);
            case 41: return laneHelper(41);
            case 42: return laneHelper(42);
            case 43: return laneHelper(43);
            case 44: return laneHelper(44);
            case 45: return laneHelper(45);
            case 46: return laneHelper(46);
            case 47: return laneHelper(47);
            case 48: return laneHelper(48);
            case 49: return laneHelper(49);
            case 50: return laneHelper(50);
            case 51: return laneHelper(51);
            case 52: return laneHelper(52);
            case 53: return laneHelper(53);
            case 54: return laneHelper(54);
            case 55: return laneHelper(55);
            case 56: return laneHelper(56);
            case 57: return laneHelper(57);
            case 58: return laneHelper(58);
            case 59: return laneHelper(59);
            case 60: return laneHelper(60);
            case 61: return laneHelper(61);
            case 62: return laneHelper(62);
            case 63: return laneHelper(63);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public byte laneHelper(int i) {
        return (byte) VectorSupport.extract(
                        VCLASS, ETYPE, VLENGTH,
                        this, i,
                        (vec, ix) -> {
                            byte[] vecarr = vec.vec();
                            return (long)vecarr[ix];
                        });
    }

    @ForceInline
    @Override
    public Byte512Vector withLane(int i, byte e) {
        switch (i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            case 2: return withLaneHelper(2, e);
            case 3: return withLaneHelper(3, e);
            case 4: return withLaneHelper(4, e);
            case 5: return withLaneHelper(5, e);
            case 6: return withLaneHelper(6, e);
            case 7: return withLaneHelper(7, e);
            case 8: return withLaneHelper(8, e);
            case 9: return withLaneHelper(9, e);
            case 10: return withLaneHelper(10, e);
            case 11: return withLaneHelper(11, e);
            case 12: return withLaneHelper(12, e);
            case 13: return withLaneHelper(13, e);
            case 14: return withLaneHelper(14, e);
            case 15: return withLaneHelper(15, e);
            case 16: return withLaneHelper(16, e);
            case 17: return withLaneHelper(17, e);
            case 18: return withLaneHelper(18, e);
            case 19: return withLaneHelper(19, e);
            case 20: return withLaneHelper(20, e);
            case 21: return withLaneHelper(21, e);
            case 22: return withLaneHelper(22, e);
            case 23: return withLaneHelper(23, e);
            case 24: return withLaneHelper(24, e);
            case 25: return withLaneHelper(25, e);
            case 26: return withLaneHelper(26, e);
            case 27: return withLaneHelper(27, e);
            case 28: return withLaneHelper(28, e);
            case 29: return withLaneHelper(29, e);
            case 30: return withLaneHelper(30, e);
            case 31: return withLaneHelper(31, e);
            case 32: return withLaneHelper(32, e);
            case 33: return withLaneHelper(33, e);
            case 34: return withLaneHelper(34, e);
            case 35: return withLaneHelper(35, e);
            case 36: return withLaneHelper(36, e);
            case 37: return withLaneHelper(37, e);
            case 38: return withLaneHelper(38, e);
            case 39: return withLaneHelper(39, e);
            case 40: return withLaneHelper(40, e);
            case 41: return withLaneHelper(41, e);
            case 42: return withLaneHelper(42, e);
            case 43: return withLaneHelper(43, e);
            case 44: return withLaneHelper(44, e);
            case 45: return withLaneHelper(45, e);
            case 46: return withLaneHelper(46, e);
            case 47: return withLaneHelper(47, e);
            case 48: return withLaneHelper(48, e);
            case 49: return withLaneHelper(49, e);
            case 50: return withLaneHelper(50, e);
            case 51: return withLaneHelper(51, e);
            case 52: return withLaneHelper(52, e);
            case 53: return withLaneHelper(53, e);
            case 54: return withLaneHelper(54, e);
            case 55: return withLaneHelper(55, e);
            case 56: return withLaneHelper(56, e);
            case 57: return withLaneHelper(57, e);
            case 58: return withLaneHelper(58, e);
            case 59: return withLaneHelper(59, e);
            case 60: return withLaneHelper(60, e);
            case 61: return withLaneHelper(61, e);
            case 62: return withLaneHelper(62, e);
            case 63: return withLaneHelper(63, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Byte512Vector withLaneHelper(int i, byte e) {
        return VectorSupport.insert(
                        VCLASS, ETYPE, VLENGTH,
                        this, i, (long)e,
                        (v, ix, bits) -> {
                            byte[] res = v.vec().clone();
                            res[ix] = (byte)bits;
                            return v.vectorFactory(res);
                        });
    }

    // Mask

    static final class Byte512Mask extends AbstractMask<Byte> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte512Mask(boolean[] bits) {
            this(bits, 0);
        }

        Byte512Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Byte512Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public ByteSpecies vspecies() {
            // ISSUE: This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Byte512Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Byte512Mask(res);
        }

        @Override
        Byte512Mask bOp(VectorMask<Byte> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Byte512Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Byte512Mask(res);
        }

        @ForceInline
        @Override
        public final
        Byte512Vector toVector() {
            return (Byte512Vector) super.toVectorTemplate();  // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte512Vector.Byte512Mask(maskArray).check(species);
            case LaneType.SK_SHORT:
                return new Short512Vector.Short512Mask(maskArray).check(species);
            case LaneType.SK_INT:
                return new Int512Vector.Int512Mask(maskArray).check(species);
            case LaneType.SK_LONG:
                return new Long512Vector.Long512Mask(maskArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float512Vector.Float512Mask(maskArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double512Vector.Double512Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Byte512Mask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public Byte512Mask and(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte512Mask m = (Byte512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Byte512Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Byte512Mask or(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte512Mask m = (Byte512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Byte512Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Byte512Mask xor(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte512Mask m = (Byte512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Byte512Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Byte512Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Byte512Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Byte512Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Byte512Mask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static Byte512Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Byte512Mask.class, byte.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Byte512Mask  TRUE_MASK = new Byte512Mask(true);
        private static final Byte512Mask FALSE_MASK = new Byte512Mask(false);

    }

    // Shuffle

    static final class Byte512Shuffle extends AbstractShuffle<Byte> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte512Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte512Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte512Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Byte512Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public ByteSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Byte512Shuffle IOTA = new Byte512Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Byte512Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Byte512Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Byte512Vector)(((AbstractShuffle<Byte>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte512Vector.Byte512Shuffle(shuffleArray).check(species);
            case LaneType.SK_SHORT:
                return new Short512Vector.Short512Shuffle(shuffleArray).check(species);
            case LaneType.SK_INT:
                return new Int512Vector.Int512Shuffle(shuffleArray).check(species);
            case LaneType.SK_LONG:
                return new Long512Vector.Long512Shuffle(shuffleArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float512Vector.Float512Shuffle(shuffleArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double512Vector.Double512Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Byte512Shuffle rearrange(VectorShuffle<Byte> shuffle) {
            Byte512Shuffle s = (Byte512Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi];  // throws on exceptional index
            }
            return new Byte512Shuffle(r);
        }
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    ByteVector fromArray0(byte[] a, int offset) {
        return super.fromArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(byte[] a, int offset) {
        super.intoArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset);  // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
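
A short usage sketch of the specialized comparisons, mask queries, and lane reductions defined in this file (illustration only, not part of this commit; the names are hypothetical, and SPECIES_512 is always defined but only hardware-accelerated on CPUs with 512-bit vector registers):

    import jdk.incubator.vector.ByteVector;
    import jdk.incubator.vector.VectorMask;
    import jdk.incubator.vector.VectorOperators;
    import jdk.incubator.vector.VectorSpecies;

    public class MaskReduceDemo {
        static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_512;

        public static void main(String[] args) {
            ByteVector v = ByteVector.broadcast(SPECIES, (byte) 2);
            // 64 lanes of 2 sum to 128, which wraps to -128 in a byte.
            byte sum = v.reduceLanes(VectorOperators.ADD);
            VectorMask<Byte> m = v.compare(VectorOperators.GT, (byte) 0);
            System.out.println(sum + " " + m.anyTrue() + " " + m.allTrue());
        }
    }
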
@ -0,0 +1,824 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast")  // warning: redundant cast
final class Byte64Vector extends ByteVector {
    static final ByteSpecies VSPECIES =
        (ByteSpecies) ByteVector.SPECIES_64;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Byte64Vector> VCLASS = Byte64Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Byte> ETYPE = byte.class; // used by the JVM

    Byte64Vector(byte[] v) {
        super(v);
    }

    // For compatibility as Byte64Vector::new,
    // stored into species.vectorFactory.
    Byte64Vector(Object v) {
        this((byte[]) v);
    }

    static final Byte64Vector ZERO = new Byte64Vector(new byte[VLENGTH]);
    static final Byte64Vector IOTA = new Byte64Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public ByteSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Byte> elementType() { return byte.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Byte.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    byte[] vec() {
        return (byte[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Byte64Vector broadcast(byte e) {
        return (Byte64Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    public final Byte64Vector broadcast(long e) {
        return (Byte64Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    Byte64Mask maskFromArray(boolean[] bits) {
        return new Byte64Mask(bits);
    }

    @Override
    @ForceInline
    Byte64Shuffle iotaShuffle() { return Byte64Shuffle.IOTA; }

    @ForceInline
    Byte64Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Byte64Shuffle)VectorSupport.shuffleIota(ETYPE, Byte64Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Byte64Shuffle)VectorSupport.shuffleIota(ETYPE, Byte64Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }
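
    // Illustration only, not part of the generated file: with wrap == true
    // the lane indexes are reduced into [0, VLENGTH) via
    // wrapToRange(i*step + start, VLENGTH).  For this 8-lane species,
    // start = 3 and step = 1 yield the shuffle (3, 4, 5, 6, 7, 0, 1, 2),
    // which is the rotation that slice(3) applies before blending in zeros.
    // (The example values are hypothetical.)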
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Byte64Shuffle shuffleFromBytes(byte[] reorder) { return new Byte64Shuffle(reorder); }
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Byte64Shuffle shuffleFromArray(int[] indexes, int i) { return new Byte64Shuffle(indexes, i); }
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Byte64Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Byte64Shuffle(fn); }
|
||||
|
||||
// Make a vector of the same species but the given elements:
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector vectorFactory(byte[] vec) {
|
||||
return new Byte64Vector(vec);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector asByteVectorRaw() {
|
||||
return (Byte64Vector) super.asByteVectorRawTemplate(); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
AbstractVector<?> asVectorRaw(LaneType laneType) {
|
||||
return super.asVectorRawTemplate(laneType); // specialize
|
||||
}
|
||||
|
||||
// Unary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector uOp(FUnOp f) {
|
||||
return (Byte64Vector) super.uOpTemplate(f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector uOp(VectorMask<Byte> m, FUnOp f) {
|
||||
return (Byte64Vector)
|
||||
super.uOpTemplate((Byte64Mask)m, f); // specialize
|
||||
}
|
||||
|
||||
// Binary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector bOp(Vector<Byte> v, FBinOp f) {
|
||||
return (Byte64Vector) super.bOpTemplate((Byte64Vector)v, f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector bOp(Vector<Byte> v,
|
||||
VectorMask<Byte> m, FBinOp f) {
|
||||
return (Byte64Vector)
|
||||
super.bOpTemplate((Byte64Vector)v, (Byte64Mask)m,
|
||||
f); // specialize
|
||||
}
|
||||
|
||||
// Ternary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector tOp(Vector<Byte> v1, Vector<Byte> v2, FTriOp f) {
|
||||
return (Byte64Vector)
|
||||
super.tOpTemplate((Byte64Vector)v1, (Byte64Vector)v2,
|
||||
f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte64Vector tOp(Vector<Byte> v1, Vector<Byte> v2,
|
||||
VectorMask<Byte> m, FTriOp f) {
|
||||
return (Byte64Vector)
|
||||
super.tOpTemplate((Byte64Vector)v1, (Byte64Vector)v2,
|
||||
(Byte64Mask)m, f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
byte rOp(byte v, FBinOp f) {
|
||||
return super.rOpTemplate(v, f); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final <F>
|
||||
Vector<F> convertShape(VectorOperators.Conversion<Byte,F> conv,
|
||||
VectorSpecies<F> rsp, int part) {
|
||||
return super.convertShapeTemplate(conv, rsp, part); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final <F>
|
||||
Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
|
||||
return super.reinterpretShapeTemplate(toSpecies, part); // specialize
|
||||
}
|
||||
|
||||
// Specialized algebraic operations:
|
||||
|
||||
// The following definition forces a specialized version of this
|
||||
// crucial method into the v-table of this class. A call to add()
|
||||
// will inline to a call to lanewise(ADD,), at which point the JIT
|
||||
// intrinsic will have the opcode of ADD, plus all the metadata
|
||||
// for this particular class, enabling it to generate precise
|
||||
// code.
|
||||
//
|
||||
// There is probably no benefit to the JIT to specialize the
|
||||
// masked or broadcast versions of the lanewise method.
|
||||
|
||||
    @Override
    @ForceInline
    public Byte64Vector lanewise(Unary op) {
        return (Byte64Vector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector lanewise(Binary op, Vector<Byte> v) {
        return (Byte64Vector) super.lanewiseTemplate(op, v); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline Byte64Vector
    lanewiseShift(VectorOperators.Binary op, int e) {
        return (Byte64Vector) super.lanewiseShiftTemplate(op, e); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline
    public final
    Byte64Vector
    lanewise(VectorOperators.Ternary op, Vector<Byte> v1, Vector<Byte> v2) {
        return (Byte64Vector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    Byte64Vector addIndex(int scale) {
        return (Byte64Vector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op,
                                  VectorMask<Byte> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Byte> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Byte> toShuffle() {
        byte[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Byte64Mask test(Test op) {
        return super.testTemplate(Byte64Mask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Byte64Mask compare(Comparison op, Vector<Byte> v) {
        return super.compareTemplate(Byte64Mask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final Byte64Mask compare(Comparison op, byte s) {
        return super.compareTemplate(Byte64Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final Byte64Mask compare(Comparison op, long s) {
        return super.compareTemplate(Byte64Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector blend(Vector<Byte> v, VectorMask<Byte> m) {
        return (Byte64Vector)
            super.blendTemplate(Byte64Mask.class,
                                (Byte64Vector) v,
                                (Byte64Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector slice(int origin, Vector<Byte> v) {
        return (Byte64Vector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte64Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

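    // For reference, slice(origin) shifts lanes toward lane 0 and
    // zero-fills the tail; e.g. for this 8-lane species (illustrative):
    //
    //   ByteVector v = ByteVector.fromArray(ByteVector.SPECIES_64,
    //       new byte[] {10, 11, 12, 13, 14, 15, 16, 17}, 0);
    //   v.slice(3);  // lanes: {13, 14, 15, 16, 17, 0, 0, 0}
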
    @Override
    @ForceInline
    public Byte64Vector unslice(int origin, Vector<Byte> w, int part) {
        return (Byte64Vector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<Byte> m) {
        return (Byte64Vector)
            super.unsliceTemplate(Byte64Mask.class,
                                  origin, w, part,
                                  (Byte64Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Byte64Shuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Byte64Vector rearrange(VectorShuffle<Byte> s) {
        return (Byte64Vector)
            super.rearrangeTemplate(Byte64Shuffle.class,
                                    (Byte64Shuffle) s); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector rearrange(VectorShuffle<Byte> shuffle,
                                  VectorMask<Byte> m) {
        return (Byte64Vector)
            super.rearrangeTemplate(Byte64Shuffle.class,
                                    (Byte64Shuffle) shuffle,
                                    (Byte64Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector rearrange(VectorShuffle<Byte> s,
                                  Vector<Byte> v) {
        return (Byte64Vector)
            super.rearrangeTemplate(Byte64Shuffle.class,
                                    (Byte64Shuffle) s,
                                    (Byte64Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector selectFrom(Vector<Byte> v) {
        return (Byte64Vector)
            super.selectFromTemplate((Byte64Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Byte64Vector selectFrom(Vector<Byte> v,
                                   VectorMask<Byte> m) {
        return (Byte64Vector)
            super.selectFromTemplate((Byte64Vector) v,
                                     (Byte64Mask) m); // specialize
    }


    @ForceInline
    @Override
    public byte lane(int i) {
        switch(i) {
            case 0: return laneHelper(0);
            case 1: return laneHelper(1);
            case 2: return laneHelper(2);
            case 3: return laneHelper(3);
            case 4: return laneHelper(4);
            case 5: return laneHelper(5);
            case 6: return laneHelper(6);
            case 7: return laneHelper(7);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public byte laneHelper(int i) {
        return (byte) VectorSupport.extract(
                  VCLASS, ETYPE, VLENGTH,
                  this, i,
                  (vec, ix) -> {
                      byte[] vecarr = vec.vec();
                      return (long)vecarr[ix];
                  });
    }

    @ForceInline
    @Override
    public Byte64Vector withLane(int i, byte e) {
        switch (i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            case 2: return withLaneHelper(2, e);
            case 3: return withLaneHelper(3, e);
            case 4: return withLaneHelper(4, e);
            case 5: return withLaneHelper(5, e);
            case 6: return withLaneHelper(6, e);
            case 7: return withLaneHelper(7, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Byte64Vector withLaneHelper(int i, byte e) {
        return VectorSupport.insert(
                   VCLASS, ETYPE, VLENGTH,
                   this, i, (long)e,
                   (v, ix, bits) -> {
                       byte[] res = v.vec().clone();
                       res[ix] = (byte)bits;
                       return v.vectorFactory(res);
                   });
    }

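    // Lane access round-trips each byte through the long-typed
    // extract/insert intrinsics above; at the user level this is simply
    // (illustrative):
    //
    //   byte e = v.lane(2);                        // read lane 2
    //   ByteVector v2 = v.withLane(2, (byte) 42);  // copy with lane 2 replaced
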
    // Mask

    static final class Byte64Mask extends AbstractMask<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte64Mask(boolean[] bits) {
            this(bits, 0);
        }

        Byte64Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Byte64Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public ByteSpecies vspecies() {
            // ISSUE: This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Byte64Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Byte64Mask(res);
        }

        @Override
        Byte64Mask bOp(VectorMask<Byte> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Byte64Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Byte64Mask(res);
        }

        @ForceInline
        @Override
        public final
        Byte64Vector toVector() {
            return (Byte64Vector) super.toVectorTemplate(); // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
                case LaneType.SK_BYTE:
                    return new Byte64Vector.Byte64Mask(maskArray).check(species);
                case LaneType.SK_SHORT:
                    return new Short64Vector.Short64Mask(maskArray).check(species);
                case LaneType.SK_INT:
                    return new Int64Vector.Int64Mask(maskArray).check(species);
                case LaneType.SK_LONG:
                    return new Long64Vector.Long64Mask(maskArray).check(species);
                case LaneType.SK_FLOAT:
                    return new Float64Vector.Float64Mask(maskArray).check(species);
                case LaneType.SK_DOUBLE:
                    return new Double64Vector.Double64Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Byte64Mask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public Byte64Mask and(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte64Mask m = (Byte64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Byte64Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Byte64Mask or(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte64Mask m = (Byte64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Byte64Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Byte64Mask xor(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            Byte64Mask m = (Byte64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Byte64Mask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Byte64Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Byte64Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Byte64Mask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Byte64Mask)m).getBits()));
        }

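        // These two reductions are the usual way back out of the masked
        // domain, e.g. (illustrative):
        //
        //   VectorMask<Byte> m = v.compare(VectorOperators.LT, (byte) 0);
        //   if (m.anyTrue()) { /* at least one negative lane */ }
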
        @ForceInline
        /*package-private*/
        static Byte64Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Byte64Mask.class, byte.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Byte64Mask TRUE_MASK = new Byte64Mask(true);
        private static final Byte64Mask FALSE_MASK = new Byte64Mask(false);

    }

    // Shuffle

    static final class Byte64Shuffle extends AbstractShuffle<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        Byte64Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte64Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Byte64Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Byte64Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public ByteSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Byte64Shuffle IOTA = new Byte64Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Byte64Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Byte64Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Byte64Vector)(((AbstractShuffle<Byte>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
                case LaneType.SK_BYTE:
                    return new Byte64Vector.Byte64Shuffle(shuffleArray).check(species);
                case LaneType.SK_SHORT:
                    return new Short64Vector.Short64Shuffle(shuffleArray).check(species);
                case LaneType.SK_INT:
                    return new Int64Vector.Int64Shuffle(shuffleArray).check(species);
                case LaneType.SK_LONG:
                    return new Long64Vector.Long64Shuffle(shuffleArray).check(species);
                case LaneType.SK_FLOAT:
                    return new Float64Vector.Float64Shuffle(shuffleArray).check(species);
                case LaneType.SK_DOUBLE:
                    return new Double64Vector.Double64Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Byte64Shuffle rearrange(VectorShuffle<Byte> shuffle) {
            Byte64Shuffle s = (Byte64Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi]; // throws on exceptional index
            }
            return new Byte64Shuffle(r);
        }
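
        // The loop above computes r[i] = reorder1[reorder2[i]], so for
        // any vector v the composed shuffle satisfies (illustrative):
        //
        //   v.rearrange(s1.rearrange(s2)) == v.rearrange(s1).rearrange(s2)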
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    ByteVector fromArray0(byte[] a, int offset) {
        return super.fromArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(byte[] a, int offset) {
        super.intoArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset); // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
@@ -0,0 +1,810 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast") // warning: redundant cast
final class ByteMaxVector extends ByteVector {
    static final ByteSpecies VSPECIES =
        (ByteSpecies) ByteVector.SPECIES_MAX;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<ByteMaxVector> VCLASS = ByteMaxVector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Byte> ETYPE = byte.class; // used by the JVM

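    // Note: unlike the fixed 64/128/256/512-bit specializations, this
    // class is bound to SPECIES_MAX, whose lane count is only known at
    // run time; portable code queries it (illustrative):
    //
    //   int lanes = ByteVector.SPECIES_MAX.length();  // platform-dependent
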
    ByteMaxVector(byte[] v) {
        super(v);
    }

    // For compatibility as ByteMaxVector::new,
    // stored into species.vectorFactory.
    ByteMaxVector(Object v) {
        this((byte[]) v);
    }

    static final ByteMaxVector ZERO = new ByteMaxVector(new byte[VLENGTH]);
    static final ByteMaxVector IOTA = new ByteMaxVector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public ByteSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Byte> elementType() { return byte.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Byte.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    byte[] vec() {
        return (byte[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final ByteMaxVector broadcast(byte e) {
        return (ByteMaxVector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    public final ByteMaxVector broadcast(long e) {
        return (ByteMaxVector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    ByteMaxMask maskFromArray(boolean[] bits) {
        return new ByteMaxMask(bits);
    }

    @Override
    @ForceInline
    ByteMaxShuffle iotaShuffle() { return ByteMaxShuffle.IOTA; }

    @ForceInline
    ByteMaxShuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (ByteMaxShuffle)VectorSupport.shuffleIota(ETYPE, ByteMaxShuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (ByteMaxShuffle)VectorSupport.shuffleIota(ETYPE, ByteMaxShuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

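    // With wrap == true the index i*step + start is reduced into
    // [0, VLENGTH); e.g. start = 2, step = 1 on a hypothetical 8-lane
    // species yields sources {2, 3, 4, 5, 6, 7, 0, 1} (illustrative).
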
    @Override
    @ForceInline
    ByteMaxShuffle shuffleFromBytes(byte[] reorder) { return new ByteMaxShuffle(reorder); }

    @Override
    @ForceInline
    ByteMaxShuffle shuffleFromArray(int[] indexes, int i) { return new ByteMaxShuffle(indexes, i); }

    @Override
    @ForceInline
    ByteMaxShuffle shuffleFromOp(IntUnaryOperator fn) { return new ByteMaxShuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    ByteMaxVector vectorFactory(byte[] vec) {
        return new ByteMaxVector(vec);
    }

    @ForceInline
    final @Override
    ByteMaxVector asByteVectorRaw() {
        return (ByteMaxVector) super.asByteVectorRawTemplate(); // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType); // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    ByteMaxVector uOp(FUnOp f) {
        return (ByteMaxVector) super.uOpTemplate(f); // specialize
    }

    @ForceInline
    final @Override
    ByteMaxVector uOp(VectorMask<Byte> m, FUnOp f) {
        return (ByteMaxVector)
            super.uOpTemplate((ByteMaxMask)m, f); // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    ByteMaxVector bOp(Vector<Byte> v, FBinOp f) {
        return (ByteMaxVector) super.bOpTemplate((ByteMaxVector)v, f); // specialize
    }

    @ForceInline
    final @Override
    ByteMaxVector bOp(Vector<Byte> v,
                      VectorMask<Byte> m, FBinOp f) {
        return (ByteMaxVector)
            super.bOpTemplate((ByteMaxVector)v, (ByteMaxMask)m,
                              f); // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    ByteMaxVector tOp(Vector<Byte> v1, Vector<Byte> v2, FTriOp f) {
        return (ByteMaxVector)
            super.tOpTemplate((ByteMaxVector)v1, (ByteMaxVector)v2,
                              f); // specialize
    }

    @ForceInline
    final @Override
    ByteMaxVector tOp(Vector<Byte> v1, Vector<Byte> v2,
                      VectorMask<Byte> m, FTriOp f) {
        return (ByteMaxVector)
            super.tOpTemplate((ByteMaxVector)v1, (ByteMaxVector)v2,
                              (ByteMaxMask)m, f); // specialize
    }

    @ForceInline
    final @Override
    byte rOp(byte v, FBinOp f) {
        return super.rOpTemplate(v, f); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Byte,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part); // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD, v), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public ByteMaxVector lanewise(Unary op) {
        return (ByteMaxVector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector lanewise(Binary op, Vector<Byte> v) {
        return (ByteMaxVector) super.lanewiseTemplate(op, v); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline ByteMaxVector
    lanewiseShift(VectorOperators.Binary op, int e) {
        return (ByteMaxVector) super.lanewiseShiftTemplate(op, e); // specialize
    }

    /*package-private*/
    @Override
    @ForceInline
    public final
    ByteMaxVector
    lanewise(VectorOperators.Ternary op, Vector<Byte> v1, Vector<Byte> v2) {
        return (ByteMaxVector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    ByteMaxVector addIndex(int scale) {
        return (ByteMaxVector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final byte reduceLanes(VectorOperators.Associative op,
                                  VectorMask<Byte> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Byte> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Byte> toShuffle() {
        byte[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final ByteMaxMask test(Test op) {
        return super.testTemplate(ByteMaxMask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final ByteMaxMask compare(Comparison op, Vector<Byte> v) {
        return super.compareTemplate(ByteMaxMask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final ByteMaxMask compare(Comparison op, byte s) {
        return super.compareTemplate(ByteMaxMask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final ByteMaxMask compare(Comparison op, long s) {
        return super.compareTemplate(ByteMaxMask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector blend(Vector<Byte> v, VectorMask<Byte> m) {
        return (ByteMaxVector)
            super.blendTemplate(ByteMaxMask.class,
                                (ByteMaxVector) v,
                                (ByteMaxMask) m); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector slice(int origin, Vector<Byte> v) {
        return (ByteMaxVector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            ByteMaxShuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public ByteMaxVector unslice(int origin, Vector<Byte> w, int part) {
        return (ByteMaxVector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector unslice(int origin, Vector<Byte> w, int part, VectorMask<Byte> m) {
        return (ByteMaxVector)
            super.unsliceTemplate(ByteMaxMask.class,
                                  origin, w, part,
                                  (ByteMaxMask) m); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            ByteMaxShuffle Iota = iotaShuffle();
            VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public ByteMaxVector rearrange(VectorShuffle<Byte> s) {
        return (ByteMaxVector)
            super.rearrangeTemplate(ByteMaxShuffle.class,
                                    (ByteMaxShuffle) s); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector rearrange(VectorShuffle<Byte> shuffle,
                                   VectorMask<Byte> m) {
        return (ByteMaxVector)
            super.rearrangeTemplate(ByteMaxShuffle.class,
                                    (ByteMaxShuffle) shuffle,
                                    (ByteMaxMask) m); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector rearrange(VectorShuffle<Byte> s,
                                   Vector<Byte> v) {
        return (ByteMaxVector)
            super.rearrangeTemplate(ByteMaxShuffle.class,
                                    (ByteMaxShuffle) s,
                                    (ByteMaxVector) v); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector selectFrom(Vector<Byte> v) {
        return (ByteMaxVector)
            super.selectFromTemplate((ByteMaxVector) v); // specialize
    }

    @Override
    @ForceInline
    public ByteMaxVector selectFrom(Vector<Byte> v,
                                    VectorMask<Byte> m) {
        return (ByteMaxVector)
            super.selectFromTemplate((ByteMaxVector) v,
                                     (ByteMaxMask) m); // specialize
    }


    @ForceInline
    @Override
    public byte lane(int i) {
        if (i < 0 || i >= VLENGTH) {
            throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
        return laneHelper(i);
    }
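
    // (The fixed-size specializations switch over constant lane numbers
    // instead; a plain range check is used here, presumably because
    // VLENGTH is not a compile-time constant for the max shape.)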

    public byte laneHelper(int i) {
        return (byte) VectorSupport.extract(
                  VCLASS, ETYPE, VLENGTH,
                  this, i,
                  (vec, ix) -> {
                      byte[] vecarr = vec.vec();
                      return (long)vecarr[ix];
                  });
    }

    @ForceInline
    @Override
    public ByteMaxVector withLane(int i, byte e) {
        if (i < 0 || i >= VLENGTH) {
            throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
        return withLaneHelper(i, e);
    }

    public ByteMaxVector withLaneHelper(int i, byte e) {
        return VectorSupport.insert(
                   VCLASS, ETYPE, VLENGTH,
                   this, i, (long)e,
                   (v, ix, bits) -> {
                       byte[] res = v.vec().clone();
                       res[ix] = (byte)bits;
                       return v.vectorFactory(res);
                   });
    }

    // Mask

    static final class ByteMaxMask extends AbstractMask<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        ByteMaxMask(boolean[] bits) {
            this(bits, 0);
        }

        ByteMaxMask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        ByteMaxMask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public ByteSpecies vspecies() {
            // ISSUE: This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        ByteMaxMask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new ByteMaxMask(res);
        }

        @Override
        ByteMaxMask bOp(VectorMask<Byte> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((ByteMaxMask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new ByteMaxMask(res);
        }

        @ForceInline
        @Override
        public final
        ByteMaxVector toVector() {
            return (ByteMaxVector) super.toVectorTemplate(); // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
                case LaneType.SK_BYTE:
                    return new ByteMaxVector.ByteMaxMask(maskArray).check(species);
                case LaneType.SK_SHORT:
                    return new ShortMaxVector.ShortMaxMask(maskArray).check(species);
                case LaneType.SK_INT:
                    return new IntMaxVector.IntMaxMask(maskArray).check(species);
                case LaneType.SK_LONG:
                    return new LongMaxVector.LongMaxMask(maskArray).check(species);
                case LaneType.SK_FLOAT:
                    return new FloatMaxVector.FloatMaxMask(maskArray).check(species);
                case LaneType.SK_DOUBLE:
                    return new DoubleMaxVector.DoubleMaxMask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public ByteMaxMask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public ByteMaxMask and(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            ByteMaxMask m = (ByteMaxMask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, ByteMaxMask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public ByteMaxMask or(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            ByteMaxMask m = (ByteMaxMask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, ByteMaxMask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        ByteMaxMask xor(VectorMask<Byte> mask) {
            Objects.requireNonNull(mask);
            ByteMaxMask m = (ByteMaxMask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, ByteMaxMask.class, byte.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, ByteMaxMask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((ByteMaxMask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, ByteMaxMask.class, byte.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((ByteMaxMask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static ByteMaxMask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(ByteMaxMask.class, byte.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final ByteMaxMask TRUE_MASK = new ByteMaxMask(true);
        private static final ByteMaxMask FALSE_MASK = new ByteMaxMask(false);

    }

    // Shuffle

    static final class ByteMaxShuffle extends AbstractShuffle<Byte> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Byte> ETYPE = byte.class; // used by the JVM

        ByteMaxShuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public ByteMaxShuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public ByteMaxShuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public ByteMaxShuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public ByteSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final ByteMaxShuffle IOTA = new ByteMaxShuffle(IDENTITY);

        @Override
        @ForceInline
        public ByteMaxVector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, ByteMaxShuffle.class, this, VLENGTH,
                                                 (s) -> ((ByteMaxVector)(((AbstractShuffle<Byte>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
                case LaneType.SK_BYTE:
                    return new ByteMaxVector.ByteMaxShuffle(shuffleArray).check(species);
                case LaneType.SK_SHORT:
                    return new ShortMaxVector.ShortMaxShuffle(shuffleArray).check(species);
                case LaneType.SK_INT:
                    return new IntMaxVector.IntMaxShuffle(shuffleArray).check(species);
                case LaneType.SK_LONG:
                    return new LongMaxVector.LongMaxShuffle(shuffleArray).check(species);
                case LaneType.SK_FLOAT:
                    return new FloatMaxVector.FloatMaxShuffle(shuffleArray).check(species);
                case LaneType.SK_DOUBLE:
                    return new DoubleMaxVector.DoubleMaxShuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public ByteMaxShuffle rearrange(VectorShuffle<Byte> shuffle) {
            ByteMaxShuffle s = (ByteMaxShuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi]; // throws on exceptional index
            }
            return new ByteMaxShuffle(r);
        }
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    ByteVector fromArray0(byte[] a, int offset) {
        return super.fromArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(byte[] a, int offset) {
        super.intoArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset); // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
File diff suppressed because it is too large
@@ -0,0 +1,808 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast") // warning: redundant cast
final class Double128Vector extends DoubleVector {
    static final DoubleSpecies VSPECIES =
        (DoubleSpecies) DoubleVector.SPECIES_128;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Double128Vector> VCLASS = Double128Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Double> ETYPE = double.class; // used by the JVM

    Double128Vector(double[] v) {
        super(v);
    }

    // For compatibility as Double128Vector::new,
    // stored into species.vectorFactory.
    Double128Vector(Object v) {
        this((double[]) v);
    }

    static final Double128Vector ZERO = new Double128Vector(new double[VLENGTH]);
    static final Double128Vector IOTA = new Double128Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public DoubleSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Double> elementType() { return double.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Double.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    double[] vec() {
        return (double[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Double128Vector broadcast(double e) {
        return (Double128Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    public final Double128Vector broadcast(long e) {
        return (Double128Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    Double128Mask maskFromArray(boolean[] bits) {
        return new Double128Mask(bits);
    }

    @Override
    @ForceInline
    Double128Shuffle iotaShuffle() { return Double128Shuffle.IOTA; }

    @ForceInline
    Double128Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Double128Shuffle)VectorSupport.shuffleIota(ETYPE, Double128Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Double128Shuffle)VectorSupport.shuffleIota(ETYPE, Double128Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

    @Override
    @ForceInline
    Double128Shuffle shuffleFromBytes(byte[] reorder) { return new Double128Shuffle(reorder); }

    @Override
    @ForceInline
    Double128Shuffle shuffleFromArray(int[] indexes, int i) { return new Double128Shuffle(indexes, i); }

    @Override
    @ForceInline
    Double128Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Double128Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Double128Vector vectorFactory(double[] vec) {
        return new Double128Vector(vec);
    }

    @ForceInline
    final @Override
    Byte128Vector asByteVectorRaw() {
        return (Byte128Vector) super.asByteVectorRawTemplate(); // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType); // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Double128Vector uOp(FUnOp f) {
        return (Double128Vector) super.uOpTemplate(f); // specialize
    }

    @ForceInline
    final @Override
    Double128Vector uOp(VectorMask<Double> m, FUnOp f) {
        return (Double128Vector)
            super.uOpTemplate((Double128Mask)m, f); // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Double128Vector bOp(Vector<Double> v, FBinOp f) {
        return (Double128Vector) super.bOpTemplate((Double128Vector)v, f); // specialize
    }

    @ForceInline
    final @Override
    Double128Vector bOp(Vector<Double> v,
                        VectorMask<Double> m, FBinOp f) {
        return (Double128Vector)
            super.bOpTemplate((Double128Vector)v, (Double128Mask)m,
                              f); // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Double128Vector tOp(Vector<Double> v1, Vector<Double> v2, FTriOp f) {
        return (Double128Vector)
            super.tOpTemplate((Double128Vector)v1, (Double128Vector)v2,
                              f); // specialize
    }

    @ForceInline
    final @Override
    Double128Vector tOp(Vector<Double> v1, Vector<Double> v2,
                        VectorMask<Double> m, FTriOp f) {
        return (Double128Vector)
            super.tOpTemplate((Double128Vector)v1, (Double128Vector)v2,
                              (Double128Mask)m, f); // specialize
    }

    @ForceInline
    final @Override
    double rOp(double v, FBinOp f) {
        return super.rOpTemplate(v, f); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Double,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part); // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD, v), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Double128Vector lanewise(Unary op) {
        return (Double128Vector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector lanewise(Binary op, Vector<Double> v) {
        return (Double128Vector) super.lanewiseTemplate(op, v); // specialize
    }


    /*package-private*/
    @Override
    @ForceInline
    public final
    Double128Vector
    lanewise(VectorOperators.Ternary op, Vector<Double> v1, Vector<Double> v2) {
        return (Double128Vector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    Double128Vector addIndex(int scale) {
        return (Double128Vector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op,
                                    VectorMask<Double> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Double> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Double> toShuffle() {
        double[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Double128Mask test(Test op) {
        return super.testTemplate(Double128Mask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Double128Mask compare(Comparison op, Vector<Double> v) {
        return super.compareTemplate(Double128Mask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final Double128Mask compare(Comparison op, double s) {
        return super.compareTemplate(Double128Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final Double128Mask compare(Comparison op, long s) {
        return super.compareTemplate(Double128Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector blend(Vector<Double> v, VectorMask<Double> m) {
        return (Double128Vector)
            super.blendTemplate(Double128Mask.class,
                                (Double128Vector) v,
                                (Double128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector slice(int origin, Vector<Double> v) {
        return (Double128Vector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double128Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((double)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Double128Vector unslice(int origin, Vector<Double> w, int part) {
        return (Double128Vector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector unslice(int origin, Vector<Double> w, int part, VectorMask<Double> m) {
        return (Double128Vector)
            super.unsliceTemplate(Double128Mask.class,
                                  origin, w, part,
                                  (Double128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double128Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((double)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Double128Vector rearrange(VectorShuffle<Double> s) {
        return (Double128Vector)
            super.rearrangeTemplate(Double128Shuffle.class,
                                    (Double128Shuffle) s); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector rearrange(VectorShuffle<Double> shuffle,
                                     VectorMask<Double> m) {
        return (Double128Vector)
            super.rearrangeTemplate(Double128Shuffle.class,
                                    (Double128Shuffle) shuffle,
                                    (Double128Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector rearrange(VectorShuffle<Double> s,
                                     Vector<Double> v) {
        return (Double128Vector)
            super.rearrangeTemplate(Double128Shuffle.class,
                                    (Double128Shuffle) s,
                                    (Double128Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector selectFrom(Vector<Double> v) {
        return (Double128Vector)
            super.selectFromTemplate((Double128Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Double128Vector selectFrom(Vector<Double> v,
                                      VectorMask<Double> m) {
        return (Double128Vector)
            super.selectFromTemplate((Double128Vector) v,
                                     (Double128Mask) m); // specialize
    }


    @ForceInline
    @Override
    public double lane(int i) {
        long bits;
        switch(i) {
            case 0: bits = laneHelper(0); break;
            case 1: bits = laneHelper(1); break;
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
        return Double.longBitsToDouble(bits);
    }

    public long laneHelper(int i) {
        return (long) VectorSupport.extract(
                  VCLASS, ETYPE, VLENGTH,
                  this, i,
                  (vec, ix) -> {
                      double[] vecarr = vec.vec();
                      return (long)Double.doubleToLongBits(vecarr[ix]);
                  });
    }

    @ForceInline
    @Override
    public Double128Vector withLane(int i, double e) {
        switch(i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Double128Vector withLaneHelper(int i, double e) {
        return VectorSupport.insert(
                   VCLASS, ETYPE, VLENGTH,
                   this, i, (long)Double.doubleToLongBits(e),
                   (v, ix, bits) -> {
                       double[] res = v.vec().clone();
                       res[ix] = Double.longBitsToDouble((long)bits);
                       return v.vectorFactory(res);
                   });
    }
|
||||
// Mask
|
||||
|
||||
static final class Double128Mask extends AbstractMask<Double> {
|
||||
static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
|
||||
static final Class<Double> ETYPE = double.class; // used by the JVM
|
||||
|
||||
Double128Mask(boolean[] bits) {
|
||||
this(bits, 0);
|
||||
}
|
||||
|
||||
Double128Mask(boolean[] bits, int offset) {
|
||||
super(prepare(bits, offset));
|
||||
}
|
||||
|
||||
Double128Mask(boolean val) {
|
||||
super(prepare(val));
|
||||
}
|
||||
|
||||
private static boolean[] prepare(boolean[] bits, int offset) {
|
||||
boolean[] newBits = new boolean[VSPECIES.laneCount()];
|
||||
for (int i = 0; i < newBits.length; i++) {
|
||||
newBits[i] = bits[offset + i];
|
||||
}
|
||||
return newBits;
|
||||
}
|
||||
|
||||
private static boolean[] prepare(boolean val) {
|
||||
boolean[] bits = new boolean[VSPECIES.laneCount()];
|
||||
Arrays.fill(bits, val);
|
||||
return bits;
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
public DoubleSpecies vspecies() {
|
||||
// ISSUE: This should probably be a @Stable
|
||||
// field inside AbstractMask, rather than
|
||||
// a megamorphic method.
|
||||
return VSPECIES;
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
boolean[] getBits() {
|
||||
return (boolean[])getPayload();
|
||||
}
|
||||
|
||||
@Override
|
||||
Double128Mask uOp(MUnOp f) {
|
||||
boolean[] res = new boolean[vspecies().laneCount()];
|
||||
boolean[] bits = getBits();
|
||||
for (int i = 0; i < res.length; i++) {
|
||||
res[i] = f.apply(i, bits[i]);
|
||||
}
|
||||
return new Double128Mask(res);
|
||||
}
|
||||
|
||||
@Override
|
||||
Double128Mask bOp(VectorMask<Double> m, MBinOp f) {
|
||||
boolean[] res = new boolean[vspecies().laneCount()];
|
||||
boolean[] bits = getBits();
|
||||
boolean[] mbits = ((Double128Mask)m).getBits();
|
||||
for (int i = 0; i < res.length; i++) {
|
||||
res[i] = f.apply(i, bits[i], mbits[i]);
|
||||
}
|
||||
return new Double128Mask(res);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final
|
||||
Double128Vector toVector() {
|
||||
return (Double128Vector) super.toVectorTemplate(); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public <E> VectorMask<E> cast(VectorSpecies<E> s) {
|
||||
AbstractSpecies<E> species = (AbstractSpecies<E>) s;
|
||||
if (length() != species.laneCount())
|
||||
throw new IllegalArgumentException("VectorMask length and species length differ");
|
||||
boolean[] maskArray = toArray();
|
||||
// enum-switches don't optimize properly JDK-8161245
|
||||
switch (species.laneType.switchKey) {
|
||||
case LaneType.SK_BYTE:
|
||||
return new Byte128Vector.Byte128Mask(maskArray).check(species);
|
||||
case LaneType.SK_SHORT:
|
||||
return new Short128Vector.Short128Mask(maskArray).check(species);
|
||||
case LaneType.SK_INT:
|
||||
return new Int128Vector.Int128Mask(maskArray).check(species);
|
||||
case LaneType.SK_LONG:
|
||||
return new Long128Vector.Long128Mask(maskArray).check(species);
|
||||
case LaneType.SK_FLOAT:
|
||||
return new Float128Vector.Float128Mask(maskArray).check(species);
|
||||
case LaneType.SK_DOUBLE:
|
||||
return new Double128Vector.Double128Mask(maskArray).check(species);
|
||||
}
|
||||
|
||||
// Should not reach here.
|
||||
throw new AssertionError(species);
|
||||
}
|
||||
|
||||
// Unary operations
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double128Mask not() {
|
||||
return xor(maskAll(true));
|
||||
}
|
||||
|
||||
// Binary operations
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double128Mask and(VectorMask<Double> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Double128Mask m = (Double128Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_AND, Double128Mask.class, long.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double128Mask or(VectorMask<Double> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Double128Mask m = (Double128Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_OR, Double128Mask.class, long.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
/* package-private */
|
||||
Double128Mask xor(VectorMask<Double> mask) {
|
||||
Objects.requireNonNull(mask);
|
||||
Double128Mask m = (Double128Mask)mask;
|
||||
return VectorSupport.binaryOp(VECTOR_OP_XOR, Double128Mask.class, long.class, VLENGTH,
|
||||
this, m,
|
||||
(m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
|
||||
}
|
||||
|
||||
// Reductions
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public boolean anyTrue() {
|
||||
return VectorSupport.test(BT_ne, Double128Mask.class, long.class, VLENGTH,
|
||||
this, vspecies().maskAll(true),
|
||||
(m, __) -> anyTrueHelper(((Double128Mask)m).getBits()));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public boolean allTrue() {
|
||||
return VectorSupport.test(BT_overflow, Double128Mask.class, long.class, VLENGTH,
|
||||
this, vspecies().maskAll(true),
|
||||
(m, __) -> allTrueHelper(((Double128Mask)m).getBits()));
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
/*package-private*/
|
||||
static Double128Mask maskAll(boolean bit) {
|
||||
return VectorSupport.broadcastCoerced(Double128Mask.class, long.class, VLENGTH,
|
||||
(bit ? -1 : 0), null,
|
||||
(v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
|
||||
}
|
||||
private static final Double128Mask TRUE_MASK = new Double128Mask(true);
|
||||
private static final Double128Mask FALSE_MASK = new Double128Mask(false);
|
||||
|
||||
}
|
||||
|
||||
// Shuffle
|
||||
|
||||
static final class Double128Shuffle extends AbstractShuffle<Double> {
|
||||
static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
|
||||
static final Class<Double> ETYPE = double.class; // used by the JVM
|
||||
|
||||
Double128Shuffle(byte[] reorder) {
|
||||
super(VLENGTH, reorder);
|
||||
}
|
||||
|
||||
public Double128Shuffle(int[] reorder) {
|
||||
super(VLENGTH, reorder);
|
||||
}
|
||||
|
||||
public Double128Shuffle(int[] reorder, int i) {
|
||||
super(VLENGTH, reorder, i);
|
||||
}
|
||||
|
||||
public Double128Shuffle(IntUnaryOperator fn) {
|
||||
super(VLENGTH, fn);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DoubleSpecies vspecies() {
|
||||
return VSPECIES;
|
||||
}
|
||||
|
||||
static {
|
||||
// There must be enough bits in the shuffle lanes to encode
|
||||
// VLENGTH valid indexes and VLENGTH exceptional ones.
|
||||
assert(VLENGTH < Byte.MAX_VALUE);
|
||||
assert(Byte.MIN_VALUE <= -VLENGTH);
|
||||
}
|
||||
static final Double128Shuffle IOTA = new Double128Shuffle(IDENTITY);
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double128Vector toVector() {
|
||||
return VectorSupport.shuffleToVector(VCLASS, ETYPE, Double128Shuffle.class, this, VLENGTH,
|
||||
(s) -> ((Double128Vector)(((AbstractShuffle<Double>)(s)).toVectorTemplate())));
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
|
||||
AbstractSpecies<F> species = (AbstractSpecies<F>) s;
|
||||
if (length() != species.laneCount())
|
||||
throw new IllegalArgumentException("VectorShuffle length and species length differ");
|
||||
int[] shuffleArray = toArray();
|
||||
// enum-switches don't optimize properly JDK-8161245
|
||||
switch (species.laneType.switchKey) {
|
||||
case LaneType.SK_BYTE:
|
||||
return new Byte128Vector.Byte128Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_SHORT:
|
||||
return new Short128Vector.Short128Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_INT:
|
||||
return new Int128Vector.Int128Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_LONG:
|
||||
return new Long128Vector.Long128Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_FLOAT:
|
||||
return new Float128Vector.Float128Shuffle(shuffleArray).check(species);
|
||||
case LaneType.SK_DOUBLE:
|
||||
return new Double128Vector.Double128Shuffle(shuffleArray).check(species);
|
||||
}
|
||||
|
||||
// Should not reach here.
|
||||
throw new AssertionError(species);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public Double128Shuffle rearrange(VectorShuffle<Double> shuffle) {
|
||||
Double128Shuffle s = (Double128Shuffle) shuffle;
|
||||
byte[] reorder1 = reorder();
|
||||
byte[] reorder2 = s.reorder();
|
||||
byte[] r = new byte[reorder1.length];
|
||||
for (int i = 0; i < reorder1.length; i++) {
|
||||
int ssi = reorder2[i];
|
||||
r[i] = reorder1[ssi]; // throws on exceptional index
|
||||
}
|
||||
return new Double128Shuffle(r);
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================
|
||||
|
||||
// Specialized low-level memory operations.
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
DoubleVector fromArray0(double[] a, int offset) {
|
||||
return super.fromArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
DoubleVector fromByteArray0(byte[] a, int offset) {
|
||||
return super.fromByteArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
|
||||
return super.fromByteBuffer0Template(bb, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
void intoArray0(double[] a, int offset) {
|
||||
super.intoArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
void intoByteArray0(byte[] a, int offset) {
|
||||
super.intoByteArray0Template(a, offset); // specialize
|
||||
}
|
||||
|
||||
// End of specialized low-level memory operations.
|
||||
|
||||
// ================================================
|
||||
|
||||
}
|
||||
@ -0,0 +1,812 @@
|
||||
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast") // warning: redundant cast
final class Double256Vector extends DoubleVector {
    static final DoubleSpecies VSPECIES =
        (DoubleSpecies) DoubleVector.SPECIES_256;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Double256Vector> VCLASS = Double256Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Double> ETYPE = double.class; // used by the JVM

    Double256Vector(double[] v) {
        super(v);
    }

    // For compatibility as Double256Vector::new,
    // stored into species.vectorFactory.
    Double256Vector(Object v) {
        this((double[]) v);
    }

    static final Double256Vector ZERO = new Double256Vector(new double[VLENGTH]);
    static final Double256Vector IOTA = new Double256Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public DoubleSpecies vspecies() {
        // ISSUE: This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Double> elementType() { return double.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Double.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    double[] vec() {
        return (double[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Double256Vector broadcast(double e) {
        return (Double256Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    public final Double256Vector broadcast(long e) {
        return (Double256Vector) super.broadcastTemplate(e); // specialize
    }

    @Override
    @ForceInline
    Double256Mask maskFromArray(boolean[] bits) {
        return new Double256Mask(bits);
    }

    @Override
    @ForceInline
    Double256Shuffle iotaShuffle() { return Double256Shuffle.IOTA; }

    @ForceInline
    Double256Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Double256Shuffle)VectorSupport.shuffleIota(ETYPE, Double256Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Double256Shuffle)VectorSupport.shuffleIota(ETYPE, Double256Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

    @Override
    @ForceInline
    Double256Shuffle shuffleFromBytes(byte[] reorder) { return new Double256Shuffle(reorder); }

    @Override
    @ForceInline
    Double256Shuffle shuffleFromArray(int[] indexes, int i) { return new Double256Shuffle(indexes, i); }

    @Override
    @ForceInline
    Double256Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Double256Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Double256Vector vectorFactory(double[] vec) {
        return new Double256Vector(vec);
    }

    @ForceInline
    final @Override
    Byte256Vector asByteVectorRaw() {
        return (Byte256Vector) super.asByteVectorRawTemplate(); // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType); // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Double256Vector uOp(FUnOp f) {
        return (Double256Vector) super.uOpTemplate(f); // specialize
    }

    @ForceInline
    final @Override
    Double256Vector uOp(VectorMask<Double> m, FUnOp f) {
        return (Double256Vector)
            super.uOpTemplate((Double256Mask)m, f); // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Double256Vector bOp(Vector<Double> v, FBinOp f) {
        return (Double256Vector) super.bOpTemplate((Double256Vector)v, f); // specialize
    }

    @ForceInline
    final @Override
    Double256Vector bOp(Vector<Double> v,
                        VectorMask<Double> m, FBinOp f) {
        return (Double256Vector)
            super.bOpTemplate((Double256Vector)v, (Double256Mask)m,
                              f); // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Double256Vector tOp(Vector<Double> v1, Vector<Double> v2, FTriOp f) {
        return (Double256Vector)
            super.tOpTemplate((Double256Vector)v1, (Double256Vector)v2,
                              f); // specialize
    }

    @ForceInline
    final @Override
    Double256Vector tOp(Vector<Double> v1, Vector<Double> v2,
                        VectorMask<Double> m, FTriOp f) {
        return (Double256Vector)
            super.tOpTemplate((Double256Vector)v1, (Double256Vector)v2,
                              (Double256Mask)m, f); // specialize
    }

    @ForceInline
    final @Override
    double rOp(double v, FBinOp f) {
        return super.rOpTemplate(v, f); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Double,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part); // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part); // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class. A call to add()
    // will inline to a call to lanewise(ADD,), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Double256Vector lanewise(Unary op) {
        return (Double256Vector) super.lanewiseTemplate(op); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector lanewise(Binary op, Vector<Double> v) {
        return (Double256Vector) super.lanewiseTemplate(op, v); // specialize
    }


    /*package-private*/
    @Override
    @ForceInline
    public final
    Double256Vector
    lanewise(VectorOperators.Ternary op, Vector<Double> v1, Vector<Double> v2) {
        return (Double256Vector) super.lanewiseTemplate(op, v1, v2); // specialize
    }

    @Override
    @ForceInline
    public final
    Double256Vector addIndex(int scale) {
        return (Double256Vector) super.addIndexTemplate(scale); // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op,
                                    VectorMask<Double> m) {
        return super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op); // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Double> m) {
        return (long) super.reduceLanesTemplate(op, m); // specialized
    }

    @Override
    @ForceInline
    public VectorShuffle<Double> toShuffle() {
        double[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Double256Mask test(Test op) {
        return super.testTemplate(Double256Mask.class, op); // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Double256Mask compare(Comparison op, Vector<Double> v) {
        return super.compareTemplate(Double256Mask.class, op, v); // specialize
    }

    @Override
    @ForceInline
    public final Double256Mask compare(Comparison op, double s) {
        return super.compareTemplate(Double256Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public final Double256Mask compare(Comparison op, long s) {
        return super.compareTemplate(Double256Mask.class, op, s); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector blend(Vector<Double> v, VectorMask<Double> m) {
        return (Double256Vector)
            super.blendTemplate(Double256Mask.class,
                                (Double256Vector) v,
                                (Double256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector slice(int origin, Vector<Double> v) {
        return (Double256Vector) super.sliceTemplate(origin, v); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double256Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((double)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Double256Vector unslice(int origin, Vector<Double> w, int part) {
        return (Double256Vector) super.unsliceTemplate(origin, w, part); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector unslice(int origin, Vector<Double> w, int part, VectorMask<Double> m) {
        return (Double256Vector)
            super.unsliceTemplate(Double256Mask.class,
                                  origin, w, part,
                                  (Double256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double256Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((double)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Double256Vector rearrange(VectorShuffle<Double> s) {
        return (Double256Vector)
            super.rearrangeTemplate(Double256Shuffle.class,
                                    (Double256Shuffle) s); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector rearrange(VectorShuffle<Double> shuffle,
                                     VectorMask<Double> m) {
        return (Double256Vector)
            super.rearrangeTemplate(Double256Shuffle.class,
                                    (Double256Shuffle) shuffle,
                                    (Double256Mask) m); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector rearrange(VectorShuffle<Double> s,
                                     Vector<Double> v) {
        return (Double256Vector)
            super.rearrangeTemplate(Double256Shuffle.class,
                                    (Double256Shuffle) s,
                                    (Double256Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector selectFrom(Vector<Double> v) {
        return (Double256Vector)
            super.selectFromTemplate((Double256Vector) v); // specialize
    }

    @Override
    @ForceInline
    public Double256Vector selectFrom(Vector<Double> v,
                                      VectorMask<Double> m) {
        return (Double256Vector)
            super.selectFromTemplate((Double256Vector) v,
                                     (Double256Mask) m); // specialize
    }


    @ForceInline
    @Override
    public double lane(int i) {
        long bits;
        switch(i) {
            case 0: bits = laneHelper(0); break;
            case 1: bits = laneHelper(1); break;
            case 2: bits = laneHelper(2); break;
            case 3: bits = laneHelper(3); break;
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
        return Double.longBitsToDouble(bits);
    }

    public long laneHelper(int i) {
        return (long) VectorSupport.extract(
                    VCLASS, ETYPE, VLENGTH,
                    this, i,
                    (vec, ix) -> {
                        double[] vecarr = vec.vec();
                        return (long)Double.doubleToLongBits(vecarr[ix]);
                    });
    }

    @ForceInline
    @Override
    public Double256Vector withLane(int i, double e) {
        switch(i) {
            case 0: return withLaneHelper(0, e);
            case 1: return withLaneHelper(1, e);
            case 2: return withLaneHelper(2, e);
            case 3: return withLaneHelper(3, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Double256Vector withLaneHelper(int i, double e) {
        return VectorSupport.insert(
                    VCLASS, ETYPE, VLENGTH,
                    this, i, (long)Double.doubleToLongBits(e),
                    (v, ix, bits) -> {
                        double[] res = v.vec().clone();
                        res[ix] = Double.longBitsToDouble((long)bits);
                        return v.vectorFactory(res);
                    });
    }

    // Mask

    static final class Double256Mask extends AbstractMask<Double> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double256Mask(boolean[] bits) {
            this(bits, 0);
        }

        Double256Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Double256Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public DoubleSpecies vspecies() {
            // ISSUE: This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Double256Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Double256Mask(res);
        }

        @Override
        Double256Mask bOp(VectorMask<Double> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Double256Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Double256Mask(res);
        }

        @ForceInline
        @Override
        public final
        Double256Vector toVector() {
            return (Double256Vector) super.toVectorTemplate(); // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte256Vector.Byte256Mask(maskArray).check(species);
            case LaneType.SK_SHORT:
                return new Short256Vector.Short256Mask(maskArray).check(species);
            case LaneType.SK_INT:
                return new Int256Vector.Int256Mask(maskArray).check(species);
            case LaneType.SK_LONG:
                return new Long256Vector.Long256Mask(maskArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float256Vector.Float256Mask(maskArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double256Vector.Double256Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Double256Mask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public Double256Mask and(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double256Mask m = (Double256Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Double256Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Double256Mask or(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double256Mask m = (Double256Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Double256Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Double256Mask xor(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double256Mask m = (Double256Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Double256Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Double256Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Double256Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Double256Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Double256Mask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static Double256Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Double256Mask.class, long.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Double256Mask TRUE_MASK = new Double256Mask(true);
        private static final Double256Mask FALSE_MASK = new Double256Mask(false);

    }

    // Shuffle

    static final class Double256Shuffle extends AbstractShuffle<Double> {
        static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double256Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double256Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double256Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Double256Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public DoubleSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Double256Shuffle IOTA = new Double256Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Double256Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Double256Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Double256Vector)(((AbstractShuffle<Double>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte256Vector.Byte256Shuffle(shuffleArray).check(species);
            case LaneType.SK_SHORT:
                return new Short256Vector.Short256Shuffle(shuffleArray).check(species);
            case LaneType.SK_INT:
                return new Int256Vector.Int256Shuffle(shuffleArray).check(species);
            case LaneType.SK_LONG:
                return new Long256Vector.Long256Shuffle(shuffleArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float256Vector.Float256Shuffle(shuffleArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double256Vector.Double256Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Double256Shuffle rearrange(VectorShuffle<Double> shuffle) {
            Double256Shuffle s = (Double256Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi]; // throws on exceptional index
            }
            return new Double256Shuffle(r);
        }
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    DoubleVector fromArray0(double[] a, int offset) {
        return super.fromArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(double[] a, int offset) {
        super.intoArray0Template(a, offset); // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset); // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
@ -0,0 +1,820 @@
/*
|
||||
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package jdk.incubator.vector;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntUnaryOperator;
|
||||
|
||||
import jdk.internal.vm.annotation.ForceInline;
|
||||
import jdk.internal.vm.vector.VectorSupport;
|
||||
|
||||
import static jdk.internal.vm.vector.VectorSupport.*;
|
||||
|
||||
import static jdk.incubator.vector.VectorOperators.*;
|
||||
|
||||
// -- This file was mechanically generated: Do not edit! -- //
|
||||
|
||||
@SuppressWarnings("cast") // warning: redundant cast
|
||||
final class Double512Vector extends DoubleVector {
|
||||
static final DoubleSpecies VSPECIES =
|
||||
(DoubleSpecies) DoubleVector.SPECIES_512;
|
||||
|
||||
static final VectorShape VSHAPE =
|
||||
VSPECIES.vectorShape();
|
||||
|
||||
static final Class<Double512Vector> VCLASS = Double512Vector.class;
|
||||
|
||||
static final int VSIZE = VSPECIES.vectorBitSize();
|
||||
|
||||
static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM
|
||||
|
||||
static final Class<Double> ETYPE = double.class; // used by the JVM
|
||||
|
||||
Double512Vector(double[] v) {
|
||||
super(v);
|
||||
}
|
||||
|
||||
// For compatibility as Double512Vector::new,
|
||||
// stored into species.vectorFactory.
|
||||
Double512Vector(Object v) {
|
||||
this((double[]) v);
|
||||
}
|
||||
|
||||
static final Double512Vector ZERO = new Double512Vector(new double[VLENGTH]);
|
||||
static final Double512Vector IOTA = new Double512Vector(VSPECIES.iotaArray());
|
||||
|
||||
static {
|
||||
// Warm up a few species caches.
|
||||
// If we do this too much we will
|
||||
// get NPEs from bootstrap circularity.
|
||||
VSPECIES.dummyVector();
|
||||
VSPECIES.withLanes(LaneType.BYTE);
|
||||
}
|
||||
|
||||
// Specialized extractors
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
public DoubleSpecies vspecies() {
|
||||
// ISSUE: This should probably be a @Stable
|
||||
// field inside AbstractVector, rather than
|
||||
// a megamorphic method.
|
||||
return VSPECIES;
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final Class<Double> elementType() { return double.class; }
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final int elementSize() { return Double.SIZE; }
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final VectorShape shape() { return VSHAPE; }
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final int length() { return VLENGTH; }
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final int bitSize() { return VSIZE; }
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public final int byteSize() { return VSIZE / Byte.SIZE; }
|
||||
|
||||
/*package-private*/
|
||||
@ForceInline
|
||||
final @Override
|
||||
double[] vec() {
|
||||
return (double[])getPayload();
|
||||
}
|
||||
|
||||
// Virtualized constructors
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Vector broadcast(double e) {
|
||||
return (Double512Vector) super.broadcastTemplate(e); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Vector broadcast(long e) {
|
||||
return (Double512Vector) super.broadcastTemplate(e); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Double512Mask maskFromArray(boolean[] bits) {
|
||||
return new Double512Mask(bits);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Double512Shuffle iotaShuffle() { return Double512Shuffle.IOTA; }
|
||||
|
||||
@ForceInline
|
||||
Double512Shuffle iotaShuffle(int start, int step, boolean wrap) {
|
||||
if (wrap) {
|
||||
return (Double512Shuffle)VectorSupport.shuffleIota(ETYPE, Double512Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
|
||||
(l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
|
||||
} else {
|
||||
return (Double512Shuffle)VectorSupport.shuffleIota(ETYPE, Double512Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
|
||||
(l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Double512Shuffle shuffleFromBytes(byte[] reorder) { return new Double512Shuffle(reorder); }
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Double512Shuffle shuffleFromArray(int[] indexes, int i) { return new Double512Shuffle(indexes, i); }
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
Double512Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Double512Shuffle(fn); }
|
||||
|
||||
// Make a vector of the same species but the given elements:
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector vectorFactory(double[] vec) {
|
||||
return new Double512Vector(vec);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Byte512Vector asByteVectorRaw() {
|
||||
return (Byte512Vector) super.asByteVectorRawTemplate(); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
AbstractVector<?> asVectorRaw(LaneType laneType) {
|
||||
return super.asVectorRawTemplate(laneType); // specialize
|
||||
}
|
||||
|
||||
// Unary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector uOp(FUnOp f) {
|
||||
return (Double512Vector) super.uOpTemplate(f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector uOp(VectorMask<Double> m, FUnOp f) {
|
||||
return (Double512Vector)
|
||||
super.uOpTemplate((Double512Mask)m, f); // specialize
|
||||
}
|
||||
|
||||
// Binary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector bOp(Vector<Double> v, FBinOp f) {
|
||||
return (Double512Vector) super.bOpTemplate((Double512Vector)v, f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector bOp(Vector<Double> v,
|
||||
VectorMask<Double> m, FBinOp f) {
|
||||
return (Double512Vector)
|
||||
super.bOpTemplate((Double512Vector)v, (Double512Mask)m,
|
||||
f); // specialize
|
||||
}
|
||||
|
||||
// Ternary operator
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector tOp(Vector<Double> v1, Vector<Double> v2, FTriOp f) {
|
||||
return (Double512Vector)
|
||||
super.tOpTemplate((Double512Vector)v1, (Double512Vector)v2,
|
||||
f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
Double512Vector tOp(Vector<Double> v1, Vector<Double> v2,
|
||||
VectorMask<Double> m, FTriOp f) {
|
||||
return (Double512Vector)
|
||||
super.tOpTemplate((Double512Vector)v1, (Double512Vector)v2,
|
||||
(Double512Mask)m, f); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
final @Override
|
||||
double rOp(double v, FBinOp f) {
|
||||
return super.rOpTemplate(v, f); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final <F>
|
||||
Vector<F> convertShape(VectorOperators.Conversion<Double,F> conv,
|
||||
VectorSpecies<F> rsp, int part) {
|
||||
return super.convertShapeTemplate(conv, rsp, part); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final <F>
|
||||
Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
|
||||
return super.reinterpretShapeTemplate(toSpecies, part); // specialize
|
||||
}
|
||||
|
||||
// Specialized algebraic operations:
|
||||
|
||||
// The following definition forces a specialized version of this
|
||||
// crucial method into the v-table of this class. A call to add()
|
||||
// will inline to a call to lanewise(ADD,), at which point the JIT
|
||||
// intrinsic will have the opcode of ADD, plus all the metadata
|
||||
// for this particular class, enabling it to generate precise
|
||||
// code.
|
||||
//
|
||||
// There is probably no benefit to the JIT to specialize the
|
||||
// masked or broadcast versions of the lanewise method.
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector lanewise(Unary op) {
|
||||
return (Double512Vector) super.lanewiseTemplate(op); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector lanewise(Binary op, Vector<Double> v) {
|
||||
return (Double512Vector) super.lanewiseTemplate(op, v); // specialize
|
||||
}
|
||||
|
||||
|
||||
/*package-private*/
|
||||
@Override
|
||||
@ForceInline
|
||||
public final
|
||||
Double512Vector
|
||||
lanewise(VectorOperators.Ternary op, Vector<Double> v1, Vector<Double> v2) {
|
||||
return (Double512Vector) super.lanewiseTemplate(op, v1, v2); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final
|
||||
Double512Vector addIndex(int scale) {
|
||||
return (Double512Vector) super.addIndexTemplate(scale); // specialize
|
||||
}
|
||||
|
||||
// Type specific horizontal reductions
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final double reduceLanes(VectorOperators.Associative op) {
|
||||
return super.reduceLanesTemplate(op); // specialized
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final double reduceLanes(VectorOperators.Associative op,
|
||||
VectorMask<Double> m) {
|
||||
return super.reduceLanesTemplate(op, m); // specialized
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final long reduceLanesToLong(VectorOperators.Associative op) {
|
||||
return (long) super.reduceLanesTemplate(op); // specialized
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final long reduceLanesToLong(VectorOperators.Associative op,
|
||||
VectorMask<Double> m) {
|
||||
return (long) super.reduceLanesTemplate(op, m); // specialized
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public VectorShuffle<Double> toShuffle() {
|
||||
double[] a = toArray();
|
||||
int[] sa = new int[a.length];
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
sa[i] = (int) a[i];
|
||||
}
|
||||
return VectorShuffle.fromArray(VSPECIES, sa, 0);
|
||||
}
|
||||
|
||||
// Specialized unary testing
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Mask test(Test op) {
|
||||
return super.testTemplate(Double512Mask.class, op); // specialize
|
||||
}
|
||||
|
||||
// Specialized comparisons
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Mask compare(Comparison op, Vector<Double> v) {
|
||||
return super.compareTemplate(Double512Mask.class, op, v); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Mask compare(Comparison op, double s) {
|
||||
return super.compareTemplate(Double512Mask.class, op, s); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public final Double512Mask compare(Comparison op, long s) {
|
||||
return super.compareTemplate(Double512Mask.class, op, s); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector blend(Vector<Double> v, VectorMask<Double> m) {
|
||||
return (Double512Vector)
|
||||
super.blendTemplate(Double512Mask.class,
|
||||
(Double512Vector) v,
|
||||
(Double512Mask) m); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector slice(int origin, Vector<Double> v) {
|
||||
return (Double512Vector) super.sliceTemplate(origin, v); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector slice(int origin) {
|
||||
if ((origin < 0) || (origin >= VLENGTH)) {
|
||||
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
|
||||
} else {
|
||||
Double512Shuffle Iota = iotaShuffle();
|
||||
VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((double)(VLENGTH-origin))));
|
||||
Iota = iotaShuffle(origin, 1, true);
|
||||
return ZERO.blend(this.rearrange(Iota), BlendMask);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector unslice(int origin, Vector<Double> w, int part) {
|
||||
return (Double512Vector) super.unsliceTemplate(origin, w, part); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector unslice(int origin, Vector<Double> w, int part, VectorMask<Double> m) {
|
||||
return (Double512Vector)
|
||||
super.unsliceTemplate(Double512Mask.class,
|
||||
origin, w, part,
|
||||
(Double512Mask) m); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector unslice(int origin) {
|
||||
if ((origin < 0) || (origin >= VLENGTH)) {
|
||||
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
|
||||
} else {
|
||||
Double512Shuffle Iota = iotaShuffle();
|
||||
VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((double)(origin))));
|
||||
Iota = iotaShuffle(-origin, 1, true);
|
||||
return ZERO.blend(this.rearrange(Iota), BlendMask);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector rearrange(VectorShuffle<Double> s) {
|
||||
return (Double512Vector)
|
||||
super.rearrangeTemplate(Double512Shuffle.class,
|
||||
(Double512Shuffle) s); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector rearrange(VectorShuffle<Double> shuffle,
|
||||
VectorMask<Double> m) {
|
||||
return (Double512Vector)
|
||||
super.rearrangeTemplate(Double512Shuffle.class,
|
||||
(Double512Shuffle) shuffle,
|
||||
(Double512Mask) m); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector rearrange(VectorShuffle<Double> s,
|
||||
Vector<Double> v) {
|
||||
return (Double512Vector)
|
||||
super.rearrangeTemplate(Double512Shuffle.class,
|
||||
(Double512Shuffle) s,
|
||||
(Double512Vector) v); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector selectFrom(Vector<Double> v) {
|
||||
return (Double512Vector)
|
||||
super.selectFromTemplate((Double512Vector) v); // specialize
|
||||
}
|
||||
|
||||
@Override
|
||||
@ForceInline
|
||||
public Double512Vector selectFrom(Vector<Double> v,
|
||||
VectorMask<Double> m) {
|
||||
return (Double512Vector)
|
||||
super.selectFromTemplate((Double512Vector) v,
|
||||
(Double512Mask) m); // specialize
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public double lane(int i) {
|
||||
long bits;
|
||||
switch(i) {
|
||||
case 0: bits = laneHelper(0); break;
|
||||
case 1: bits = laneHelper(1); break;
|
||||
case 2: bits = laneHelper(2); break;
|
||||
case 3: bits = laneHelper(3); break;
|
||||
case 4: bits = laneHelper(4); break;
|
||||
case 5: bits = laneHelper(5); break;
|
||||
case 6: bits = laneHelper(6); break;
|
||||
case 7: bits = laneHelper(7); break;
|
||||
default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
|
||||
}
|
||||
return Double.longBitsToDouble(bits);
|
||||
}
|
||||
|
||||
public long laneHelper(int i) {
|
||||
return (long) VectorSupport.extract(
|
||||
VCLASS, ETYPE, VLENGTH,
|
||||
this, i,
|
||||
(vec, ix) -> {
|
||||
double[] vecarr = vec.vec();
|
||||
return (long)Double.doubleToLongBits(vecarr[ix]);
|
||||
});
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
public Double512Vector withLane(int i, double e) {
|
||||
switch(i) {
|
||||
case 0: return withLaneHelper(0, e);
|
||||
case 1: return withLaneHelper(1, e);
|
||||
case 2: return withLaneHelper(2, e);
|
||||
case 3: return withLaneHelper(3, e);
|
||||
case 4: return withLaneHelper(4, e);
|
||||
case 5: return withLaneHelper(5, e);
|
||||
case 6: return withLaneHelper(6, e);
|
||||
case 7: return withLaneHelper(7, e);
|
||||
default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
|
||||
}
|
||||
}
|
||||
|
||||
public Double512Vector withLaneHelper(int i, double e) {
|
||||
return VectorSupport.insert(
|
||||
VCLASS, ETYPE, VLENGTH,
|
||||
this, i, (long)Double.doubleToLongBits(e),
|
||||
(v, ix, bits) -> {
|
||||
double[] res = v.vec().clone();
|
||||
res[ix] = Double.longBitsToDouble((long)bits);
|
||||
return v.vectorFactory(res);
|
||||
});
|
||||
}
|
||||
|
||||
    // Mask

    static final class Double512Mask extends AbstractMask<Double> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double512Mask(boolean[] bits) {
            this(bits, 0);
        }

        Double512Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Double512Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public DoubleSpecies vspecies() {
            // ISSUE:  This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Double512Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Double512Mask(res);
        }

        @Override
        Double512Mask bOp(VectorMask<Double> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Double512Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Double512Mask(res);
        }

        @ForceInline
        @Override
        public final
        Double512Vector toVector() {
            return (Double512Vector) super.toVectorTemplate();  // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte512Vector.Byte512Mask(maskArray).check(species);
            case LaneType.SK_SHORT:
                return new Short512Vector.Short512Mask(maskArray).check(species);
            case LaneType.SK_INT:
                return new Int512Vector.Int512Mask(maskArray).check(species);
            case LaneType.SK_LONG:
                return new Long512Vector.Long512Mask(maskArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float512Vector.Float512Mask(maskArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double512Vector.Double512Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Double512Mask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public Double512Mask and(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double512Mask m = (Double512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Double512Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Double512Mask or(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double512Mask m = (Double512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Double512Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Double512Mask xor(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double512Mask m = (Double512Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Double512Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Double512Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Double512Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Double512Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Double512Mask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static Double512Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Double512Mask.class, long.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Double512Mask  TRUE_MASK = new Double512Mask(true);
        private static final Double512Mask FALSE_MASK = new Double512Mask(false);

    }

    // Shuffle

    static final class Double512Shuffle extends AbstractShuffle<Double> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double512Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double512Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double512Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Double512Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public DoubleSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Double512Shuffle IOTA = new Double512Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Double512Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Double512Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Double512Vector)(((AbstractShuffle<Double>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte512Vector.Byte512Shuffle(shuffleArray).check(species);
            case LaneType.SK_SHORT:
                return new Short512Vector.Short512Shuffle(shuffleArray).check(species);
            case LaneType.SK_INT:
                return new Int512Vector.Int512Shuffle(shuffleArray).check(species);
            case LaneType.SK_LONG:
                return new Long512Vector.Long512Shuffle(shuffleArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float512Vector.Float512Shuffle(shuffleArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double512Vector.Double512Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Double512Shuffle rearrange(VectorShuffle<Double> shuffle) {
            Double512Shuffle s = (Double512Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi];  // throws on exceptional index
            }
            return new Double512Shuffle(r);
        }
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    DoubleVector fromArray0(double[] a, int offset) {
        return super.fromArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(double[] a, int offset) {
        super.intoArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset);  // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
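For orientation between the generated files, here is a minimal usage sketch of the mask machinery defined above. It is not part of the patch; the species, class name, and values are illustrative, and it assumes jdk.incubator.vector is on the module path.

import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;

public class MaskSketch {
    public static void main(String[] args) {
        var species = DoubleVector.SPECIES_512;
        DoubleVector a = DoubleVector.broadcast(species, 1.0);
        DoubleVector b = DoubleVector.broadcast(species, 2.0);
        // compare() yields a Double512Mask under the hood; anyTrue()/allTrue()
        // are the intrinsified reductions defined in the class above.
        VectorMask<Double> m = a.compare(VectorOperators.LT, b);
        System.out.println(m.allTrue());   // true: every lane of a is < b
        // blend() consumes the mask: it picks lanes of b where the mask is set.
        DoubleVector c = a.blend(b, m);
        System.out.println(c.lane(0));     // 2.0
    }
}
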
@ -0,0 +1,806 @@
/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package jdk.incubator.vector;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;

import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;

import static jdk.internal.vm.vector.VectorSupport.*;

import static jdk.incubator.vector.VectorOperators.*;

// -- This file was mechanically generated: Do not edit! -- //

@SuppressWarnings("cast")  // warning: redundant cast
final class Double64Vector extends DoubleVector {
    static final DoubleSpecies VSPECIES =
        (DoubleSpecies) DoubleVector.SPECIES_64;

    static final VectorShape VSHAPE =
        VSPECIES.vectorShape();

    static final Class<Double64Vector> VCLASS = Double64Vector.class;

    static final int VSIZE = VSPECIES.vectorBitSize();

    static final int VLENGTH = VSPECIES.laneCount(); // used by the JVM

    static final Class<Double> ETYPE = double.class; // used by the JVM

    Double64Vector(double[] v) {
        super(v);
    }

    // For compatibility as Double64Vector::new,
    // stored into species.vectorFactory.
    Double64Vector(Object v) {
        this((double[]) v);
    }

    static final Double64Vector ZERO = new Double64Vector(new double[VLENGTH]);
    static final Double64Vector IOTA = new Double64Vector(VSPECIES.iotaArray());

    static {
        // Warm up a few species caches.
        // If we do this too much we will
        // get NPEs from bootstrap circularity.
        VSPECIES.dummyVector();
        VSPECIES.withLanes(LaneType.BYTE);
    }

    // Specialized extractors

    @ForceInline
    final @Override
    public DoubleSpecies vspecies() {
        // ISSUE:  This should probably be a @Stable
        // field inside AbstractVector, rather than
        // a megamorphic method.
        return VSPECIES;
    }

    @ForceInline
    @Override
    public final Class<Double> elementType() { return double.class; }

    @ForceInline
    @Override
    public final int elementSize() { return Double.SIZE; }

    @ForceInline
    @Override
    public final VectorShape shape() { return VSHAPE; }

    @ForceInline
    @Override
    public final int length() { return VLENGTH; }

    @ForceInline
    @Override
    public final int bitSize() { return VSIZE; }

    @ForceInline
    @Override
    public final int byteSize() { return VSIZE / Byte.SIZE; }

    /*package-private*/
    @ForceInline
    final @Override
    double[] vec() {
        return (double[])getPayload();
    }

    // Virtualized constructors

    @Override
    @ForceInline
    public final Double64Vector broadcast(double e) {
        return (Double64Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    public final Double64Vector broadcast(long e) {
        return (Double64Vector) super.broadcastTemplate(e);  // specialize
    }

    @Override
    @ForceInline
    Double64Mask maskFromArray(boolean[] bits) {
        return new Double64Mask(bits);
    }

    @Override
    @ForceInline
    Double64Shuffle iotaShuffle() { return Double64Shuffle.IOTA; }

    @ForceInline
    Double64Shuffle iotaShuffle(int start, int step, boolean wrap) {
        if (wrap) {
            return (Double64Shuffle)VectorSupport.shuffleIota(ETYPE, Double64Shuffle.class, VSPECIES, VLENGTH, start, step, 1,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (VectorIntrinsics.wrapToRange(i*lstep + lstart, l))));
        } else {
            return (Double64Shuffle)VectorSupport.shuffleIota(ETYPE, Double64Shuffle.class, VSPECIES, VLENGTH, start, step, 0,
                    (l, lstart, lstep, s) -> s.shuffleFromOp(i -> (i*lstep + lstart)));
        }
    }

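    // Editorial note (not part of this patch): the two flavors above back the
    // public VectorShuffle.iota(species, start, step, wrapIndexes) factory.
    // With wrap == true each index i*step + start is reduced into
    // [0, VLENGTH), so for an illustrative two-lane species:
    //
    //   VectorShuffle<Double> s =
    //       VectorShuffle.iota(DoubleVector.SPECIES_128, 1, 1, true);
    //   // lane i reads source index (1 + i) wrapped into [0, 2): {1, 0}
    //
    // With wrap == false, out-of-range indexes are kept as "exceptional" lane
    // values and only fault when the shuffle is actually consumed.
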
    @Override
    @ForceInline
    Double64Shuffle shuffleFromBytes(byte[] reorder) { return new Double64Shuffle(reorder); }

    @Override
    @ForceInline
    Double64Shuffle shuffleFromArray(int[] indexes, int i) { return new Double64Shuffle(indexes, i); }

    @Override
    @ForceInline
    Double64Shuffle shuffleFromOp(IntUnaryOperator fn) { return new Double64Shuffle(fn); }

    // Make a vector of the same species but the given elements:
    @ForceInline
    final @Override
    Double64Vector vectorFactory(double[] vec) {
        return new Double64Vector(vec);
    }

    @ForceInline
    final @Override
    Byte64Vector asByteVectorRaw() {
        return (Byte64Vector) super.asByteVectorRawTemplate();  // specialize
    }

    @ForceInline
    final @Override
    AbstractVector<?> asVectorRaw(LaneType laneType) {
        return super.asVectorRawTemplate(laneType);  // specialize
    }

    // Unary operator

    @ForceInline
    final @Override
    Double64Vector uOp(FUnOp f) {
        return (Double64Vector) super.uOpTemplate(f);  // specialize
    }

    @ForceInline
    final @Override
    Double64Vector uOp(VectorMask<Double> m, FUnOp f) {
        return (Double64Vector)
            super.uOpTemplate((Double64Mask)m, f);  // specialize
    }

    // Binary operator

    @ForceInline
    final @Override
    Double64Vector bOp(Vector<Double> v, FBinOp f) {
        return (Double64Vector) super.bOpTemplate((Double64Vector)v, f);  // specialize
    }

    @ForceInline
    final @Override
    Double64Vector bOp(Vector<Double> v,
                       VectorMask<Double> m, FBinOp f) {
        return (Double64Vector)
            super.bOpTemplate((Double64Vector)v, (Double64Mask)m,
                              f);  // specialize
    }

    // Ternary operator

    @ForceInline
    final @Override
    Double64Vector tOp(Vector<Double> v1, Vector<Double> v2, FTriOp f) {
        return (Double64Vector)
            super.tOpTemplate((Double64Vector)v1, (Double64Vector)v2,
                              f);  // specialize
    }

    @ForceInline
    final @Override
    Double64Vector tOp(Vector<Double> v1, Vector<Double> v2,
                       VectorMask<Double> m, FTriOp f) {
        return (Double64Vector)
            super.tOpTemplate((Double64Vector)v1, (Double64Vector)v2,
                              (Double64Mask)m, f);  // specialize
    }

    @ForceInline
    final @Override
    double rOp(double v, FBinOp f) {
        return super.rOpTemplate(v, f);  // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> convertShape(VectorOperators.Conversion<Double,F> conv,
                           VectorSpecies<F> rsp, int part) {
        return super.convertShapeTemplate(conv, rsp, part);  // specialize
    }

    @Override
    @ForceInline
    public final <F>
    Vector<F> reinterpretShape(VectorSpecies<F> toSpecies, int part) {
        return super.reinterpretShapeTemplate(toSpecies, part);  // specialize
    }

    // Specialized algebraic operations:

    // The following definition forces a specialized version of this
    // crucial method into the v-table of this class.  A call to add()
    // will inline to a call to lanewise(ADD,), at which point the JIT
    // intrinsic will have the opcode of ADD, plus all the metadata
    // for this particular class, enabling it to generate precise
    // code.
    //
    // There is probably no benefit to the JIT to specialize the
    // masked or broadcast versions of the lanewise method.

    @Override
    @ForceInline
    public Double64Vector lanewise(Unary op) {
        return (Double64Vector) super.lanewiseTemplate(op);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector lanewise(Binary op, Vector<Double> v) {
        return (Double64Vector) super.lanewiseTemplate(op, v);  // specialize
    }


    /*package-private*/
    @Override
    @ForceInline
    public final
    Double64Vector
    lanewise(VectorOperators.Ternary op, Vector<Double> v1, Vector<Double> v2) {
        return (Double64Vector) super.lanewiseTemplate(op, v1, v2);  // specialize
    }

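    // Editorial note (not part of this patch): the v-table comment above can
    // be observed from user code.  A convenience call such as
    //
    //   DoubleVector sum = a.add(b);
    //
    // inlines to a.lanewise(VectorOperators.ADD, b), which lands on the
    // specialized override above, handing the JIT the ADD opcode together
    // with the concrete Double64Vector metadata it needs to emit exact code.
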
    @Override
    @ForceInline
    public final
    Double64Vector addIndex(int scale) {
        return (Double64Vector) super.addIndexTemplate(scale);  // specialize
    }

    // Type specific horizontal reductions

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op) {
        return super.reduceLanesTemplate(op);  // specialized
    }

    @Override
    @ForceInline
    public final double reduceLanes(VectorOperators.Associative op,
                                    VectorMask<Double> m) {
        return super.reduceLanesTemplate(op, m);  // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op) {
        return (long) super.reduceLanesTemplate(op);  // specialized
    }

    @Override
    @ForceInline
    public final long reduceLanesToLong(VectorOperators.Associative op,
                                        VectorMask<Double> m) {
        return (long) super.reduceLanesTemplate(op, m);  // specialized
    }

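    // Editorial note (not part of this patch): a usage sketch of the
    // reductions above, using an illustrative four-lane 256-bit species:
    //
    //   var s = DoubleVector.SPECIES_256;
    //   DoubleVector v = DoubleVector.fromArray(s, new double[]{1, 2, 3, 4}, 0);
    //   double sum = v.reduceLanes(VectorOperators.ADD);         // 10.0
    //   VectorMask<Double> even =
    //       VectorMask.fromValues(s, true, false, true, false);
    //   double part = v.reduceLanes(VectorOperators.ADD, even);  // 4.0
    //
    // The masked form simply skips the unset lanes before combining.
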
    @Override
    @ForceInline
    public VectorShuffle<Double> toShuffle() {
        double[] a = toArray();
        int[] sa = new int[a.length];
        for (int i = 0; i < a.length; i++) {
            sa[i] = (int) a[i];
        }
        return VectorShuffle.fromArray(VSPECIES, sa, 0);
    }

    // Specialized unary testing

    @Override
    @ForceInline
    public final Double64Mask test(Test op) {
        return super.testTemplate(Double64Mask.class, op);  // specialize
    }

    // Specialized comparisons

    @Override
    @ForceInline
    public final Double64Mask compare(Comparison op, Vector<Double> v) {
        return super.compareTemplate(Double64Mask.class, op, v);  // specialize
    }

    @Override
    @ForceInline
    public final Double64Mask compare(Comparison op, double s) {
        return super.compareTemplate(Double64Mask.class, op, s);  // specialize
    }

    @Override
    @ForceInline
    public final Double64Mask compare(Comparison op, long s) {
        return super.compareTemplate(Double64Mask.class, op, s);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector blend(Vector<Double> v, VectorMask<Double> m) {
        return (Double64Vector)
            super.blendTemplate(Double64Mask.class,
                                (Double64Vector) v,
                                (Double64Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector slice(int origin, Vector<Double> v) {
        return (Double64Vector) super.sliceTemplate(origin, v);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector slice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double64Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((double)(VLENGTH-origin))));
            Iota = iotaShuffle(origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

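    // Editorial note (not part of this patch): slice(origin) above is built
    // from an iota shuffle plus a blend against ZERO, so lanes shift down by
    // origin and the vacated tail fills with zeros.  For an illustrative
    // four-lane species:
    //
    //   DoubleVector v = DoubleVector.fromArray(DoubleVector.SPECIES_256,
    //                                           new double[]{1, 2, 3, 4}, 0);
    //   v.slice(1);   // lanes become [2.0, 3.0, 4.0, 0.0]
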
    @Override
    @ForceInline
    public Double64Vector unslice(int origin, Vector<Double> w, int part) {
        return (Double64Vector) super.unsliceTemplate(origin, w, part);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector unslice(int origin, Vector<Double> w, int part, VectorMask<Double> m) {
        return (Double64Vector)
            super.unsliceTemplate(Double64Mask.class,
                                  origin, w, part,
                                  (Double64Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector unslice(int origin) {
        if ((origin < 0) || (origin >= VLENGTH)) {
            throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
        } else {
            Double64Shuffle Iota = iotaShuffle();
            VectorMask<Double> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((double)(origin))));
            Iota = iotaShuffle(-origin, 1, true);
            return ZERO.blend(this.rearrange(Iota), BlendMask);
        }
    }

    @Override
    @ForceInline
    public Double64Vector rearrange(VectorShuffle<Double> s) {
        return (Double64Vector)
            super.rearrangeTemplate(Double64Shuffle.class,
                                    (Double64Shuffle) s);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector rearrange(VectorShuffle<Double> shuffle,
                                    VectorMask<Double> m) {
        return (Double64Vector)
            super.rearrangeTemplate(Double64Shuffle.class,
                                    (Double64Shuffle) shuffle,
                                    (Double64Mask) m);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector rearrange(VectorShuffle<Double> s,
                                    Vector<Double> v) {
        return (Double64Vector)
            super.rearrangeTemplate(Double64Shuffle.class,
                                    (Double64Shuffle) s,
                                    (Double64Vector) v);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector selectFrom(Vector<Double> v) {
        return (Double64Vector)
            super.selectFromTemplate((Double64Vector) v);  // specialize
    }

    @Override
    @ForceInline
    public Double64Vector selectFrom(Vector<Double> v,
                                     VectorMask<Double> m) {
        return (Double64Vector)
            super.selectFromTemplate((Double64Vector) v,
                                     (Double64Mask) m);  // specialize
    }


    @ForceInline
    @Override
    public double lane(int i) {
        long bits;
        switch(i) {
            case 0: bits = laneHelper(0); break;
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
        return Double.longBitsToDouble(bits);
    }

    public long laneHelper(int i) {
        return (long) VectorSupport.extract(
                        VCLASS, ETYPE, VLENGTH,
                        this, i,
                        (vec, ix) -> {
                            double[] vecarr = vec.vec();
                            return (long)Double.doubleToLongBits(vecarr[ix]);
                        });
    }

    @ForceInline
    @Override
    public Double64Vector withLane(int i, double e) {
        switch(i) {
            case 0: return withLaneHelper(0, e);
            default: throw new IllegalArgumentException("Index " + i + " must be zero or positive, and less than " + VLENGTH);
        }
    }

    public Double64Vector withLaneHelper(int i, double e) {
        return VectorSupport.insert(
                            VCLASS, ETYPE, VLENGTH,
                            this, i, (long)Double.doubleToLongBits(e),
                            (v, ix, bits) -> {
                                double[] res = v.vec().clone();
                                res[ix] = Double.longBitsToDouble((long)bits);
                                return v.vectorFactory(res);
                            });
    }

    // Mask

    static final class Double64Mask extends AbstractMask<Double> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double64Mask(boolean[] bits) {
            this(bits, 0);
        }

        Double64Mask(boolean[] bits, int offset) {
            super(prepare(bits, offset));
        }

        Double64Mask(boolean val) {
            super(prepare(val));
        }

        private static boolean[] prepare(boolean[] bits, int offset) {
            boolean[] newBits = new boolean[VSPECIES.laneCount()];
            for (int i = 0; i < newBits.length; i++) {
                newBits[i] = bits[offset + i];
            }
            return newBits;
        }

        private static boolean[] prepare(boolean val) {
            boolean[] bits = new boolean[VSPECIES.laneCount()];
            Arrays.fill(bits, val);
            return bits;
        }

        @ForceInline
        final @Override
        public DoubleSpecies vspecies() {
            // ISSUE:  This should probably be a @Stable
            // field inside AbstractMask, rather than
            // a megamorphic method.
            return VSPECIES;
        }

        @ForceInline
        boolean[] getBits() {
            return (boolean[])getPayload();
        }

        @Override
        Double64Mask uOp(MUnOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i]);
            }
            return new Double64Mask(res);
        }

        @Override
        Double64Mask bOp(VectorMask<Double> m, MBinOp f) {
            boolean[] res = new boolean[vspecies().laneCount()];
            boolean[] bits = getBits();
            boolean[] mbits = ((Double64Mask)m).getBits();
            for (int i = 0; i < res.length; i++) {
                res[i] = f.apply(i, bits[i], mbits[i]);
            }
            return new Double64Mask(res);
        }

        @ForceInline
        @Override
        public final
        Double64Vector toVector() {
            return (Double64Vector) super.toVectorTemplate();  // specialize
        }

        @Override
        @ForceInline
        public <E> VectorMask<E> cast(VectorSpecies<E> s) {
            AbstractSpecies<E> species = (AbstractSpecies<E>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorMask length and species length differ");
            boolean[] maskArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte64Vector.Byte64Mask(maskArray).check(species);
            case LaneType.SK_SHORT:
                return new Short64Vector.Short64Mask(maskArray).check(species);
            case LaneType.SK_INT:
                return new Int64Vector.Int64Mask(maskArray).check(species);
            case LaneType.SK_LONG:
                return new Long64Vector.Long64Mask(maskArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float64Vector.Float64Mask(maskArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double64Vector.Double64Mask(maskArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        // Unary operations

        @Override
        @ForceInline
        public Double64Mask not() {
            return xor(maskAll(true));
        }

        // Binary operations

        @Override
        @ForceInline
        public Double64Mask and(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double64Mask m = (Double64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_AND, Double64Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a & b));
        }

        @Override
        @ForceInline
        public Double64Mask or(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double64Mask m = (Double64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_OR, Double64Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a | b));
        }

        @ForceInline
        /* package-private */
        Double64Mask xor(VectorMask<Double> mask) {
            Objects.requireNonNull(mask);
            Double64Mask m = (Double64Mask)mask;
            return VectorSupport.binaryOp(VECTOR_OP_XOR, Double64Mask.class, long.class, VLENGTH,
                                          this, m,
                                          (m1, m2) -> m1.bOp(m2, (i, a, b) -> a ^ b));
        }

        // Reductions

        @Override
        @ForceInline
        public boolean anyTrue() {
            return VectorSupport.test(BT_ne, Double64Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> anyTrueHelper(((Double64Mask)m).getBits()));
        }

        @Override
        @ForceInline
        public boolean allTrue() {
            return VectorSupport.test(BT_overflow, Double64Mask.class, long.class, VLENGTH,
                                      this, vspecies().maskAll(true),
                                      (m, __) -> allTrueHelper(((Double64Mask)m).getBits()));
        }

        @ForceInline
        /*package-private*/
        static Double64Mask maskAll(boolean bit) {
            return VectorSupport.broadcastCoerced(Double64Mask.class, long.class, VLENGTH,
                                                  (bit ? -1 : 0), null,
                                                  (v, __) -> (v != 0 ? TRUE_MASK : FALSE_MASK));
        }
        private static final Double64Mask  TRUE_MASK = new Double64Mask(true);
        private static final Double64Mask FALSE_MASK = new Double64Mask(false);

    }

    // Shuffle

    static final class Double64Shuffle extends AbstractShuffle<Double> {
        static final int VLENGTH = VSPECIES.laneCount();    // used by the JVM
        static final Class<Double> ETYPE = double.class; // used by the JVM

        Double64Shuffle(byte[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double64Shuffle(int[] reorder) {
            super(VLENGTH, reorder);
        }

        public Double64Shuffle(int[] reorder, int i) {
            super(VLENGTH, reorder, i);
        }

        public Double64Shuffle(IntUnaryOperator fn) {
            super(VLENGTH, fn);
        }

        @Override
        public DoubleSpecies vspecies() {
            return VSPECIES;
        }

        static {
            // There must be enough bits in the shuffle lanes to encode
            // VLENGTH valid indexes and VLENGTH exceptional ones.
            assert(VLENGTH < Byte.MAX_VALUE);
            assert(Byte.MIN_VALUE <= -VLENGTH);
        }
        static final Double64Shuffle IOTA = new Double64Shuffle(IDENTITY);

        @Override
        @ForceInline
        public Double64Vector toVector() {
            return VectorSupport.shuffleToVector(VCLASS, ETYPE, Double64Shuffle.class, this, VLENGTH,
                                                 (s) -> ((Double64Vector)(((AbstractShuffle<Double>)(s)).toVectorTemplate())));
        }

        @Override
        @ForceInline
        public <F> VectorShuffle<F> cast(VectorSpecies<F> s) {
            AbstractSpecies<F> species = (AbstractSpecies<F>) s;
            if (length() != species.laneCount())
                throw new IllegalArgumentException("VectorShuffle length and species length differ");
            int[] shuffleArray = toArray();
            // enum-switches don't optimize properly JDK-8161245
            switch (species.laneType.switchKey) {
            case LaneType.SK_BYTE:
                return new Byte64Vector.Byte64Shuffle(shuffleArray).check(species);
            case LaneType.SK_SHORT:
                return new Short64Vector.Short64Shuffle(shuffleArray).check(species);
            case LaneType.SK_INT:
                return new Int64Vector.Int64Shuffle(shuffleArray).check(species);
            case LaneType.SK_LONG:
                return new Long64Vector.Long64Shuffle(shuffleArray).check(species);
            case LaneType.SK_FLOAT:
                return new Float64Vector.Float64Shuffle(shuffleArray).check(species);
            case LaneType.SK_DOUBLE:
                return new Double64Vector.Double64Shuffle(shuffleArray).check(species);
            }

            // Should not reach here.
            throw new AssertionError(species);
        }

        @ForceInline
        @Override
        public Double64Shuffle rearrange(VectorShuffle<Double> shuffle) {
            Double64Shuffle s = (Double64Shuffle) shuffle;
            byte[] reorder1 = reorder();
            byte[] reorder2 = s.reorder();
            byte[] r = new byte[reorder1.length];
            for (int i = 0; i < reorder1.length; i++) {
                int ssi = reorder2[i];
                r[i] = reorder1[ssi];  // throws on exceptional index
            }
            return new Double64Shuffle(r);
        }
    }

    // ================================================

    // Specialized low-level memory operations.

    @ForceInline
    @Override
    final
    DoubleVector fromArray0(double[] a, int offset) {
        return super.fromArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteArray0(byte[] a, int offset) {
        return super.fromByteArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
        return super.fromByteBuffer0Template(bb, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoArray0(double[] a, int offset) {
        super.intoArray0Template(a, offset);  // specialize
    }

    @ForceInline
    @Override
    final
    void intoByteArray0(byte[] a, int offset) {
        super.intoByteArray0Template(a, offset);  // specialize
    }

    // End of specialized low-level memory operations.

    // ================================================

}
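To close the section, an end-to-end sketch of how these shape-specialized classes are reached through the public, shape-generic API. It is not part of the patch; the class name, array contents, and the axpy helper are illustrative.

import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.VectorSpecies;

public class AxpySketch {
    // The preferred species picks the widest shape the platform supports,
    // dispatching at runtime to one of the generated classes above.
    static final VectorSpecies<Double> S = DoubleVector.SPECIES_PREFERRED;

    // y[i] += a * x[i], vectorized with a scalar tail loop.
    static void axpy(double a, double[] x, double[] y) {
        int i = 0;
        int upper = S.loopBound(x.length);
        for (; i < upper; i += S.length()) {
            DoubleVector xv = DoubleVector.fromArray(S, x, i);
            DoubleVector yv = DoubleVector.fromArray(S, y, i);
            xv.fma(DoubleVector.broadcast(S, a), yv).intoArray(y, i);
        }
        for (; i < x.length; i++) {   // scalar tail for the leftover lanes
            y[i] += a * x[i];
        }
    }

    public static void main(String[] args) {
        double[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9};
        double[] y = new double[x.length];
        axpy(2.0, x, y);
        System.out.println(java.util.Arrays.toString(y)); // 2.0, 4.0, ..., 18.0
    }
}
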
Some files were not shown because too many files have changed in this diff.