describeConstable()
+ // public Float16 resolveConstantDesc(MethodHandles.Lookup lookup)
+
+ /*
+ * Note: for the basic arithmetic operations {+, -, *, /} and
+ * square root, among binary interchange formats (binary16,
+ * binary32 a.k.a. float, binary64 a.k.a. double, etc.) the "2p + 2"
+ * property holds. That is, if one format has p bits of precision
+ * and the next larger format has at least 2p + 2 bits of
+ * precision, arithmetic on the smaller format can be implemented by:
+ *
+ * 1) converting each argument to the wider format
+ * 2) performing the operation in the wider format
+ * 3) converting the result from 2) to the narrower format
+ *
+ * For example, this property holds between the formats used for the
+ * float and double types. Therefore, the following is a valid
+ * implementation of a float addition:
+ *
+ * float add(float addend, float augend) {
+ * return (float)((double)addend + (double)augend);
+ * }
+ *
+ * The same property holds between the float16 format and
+ * float. Therefore, the software implementations of Float16 {+,
+ * -, *, /} and square root below use the technique of widening
+ * the Float16 arguments to float, performing the operation in
+ * float arithmetic, and then rounding the float result to
+ * Float16.
+ *
+ * For discussion and derivation of this property see:
+ *
+ * "When Is Double Rounding Innocuous?" by Samuel Figueroa
+ * ACM SIGNUM Newsletter, Volume 30 Issue 3, pp 21-26
+ * https://dl.acm.org/doi/pdf/10.1145/221332.221334
+ *
+ * Figueroa's write-up refers to lecture notes by W. Kahan. Those
+ * lecture notes are assumed to be these ones by Kahan and
+ * others:
+ *
+ * https://www.arithmazium.org/classroom/lib/Lecture_08_notes_slides.pdf
+ * https://www.arithmazium.org/classroom/lib/Lecture_09_notes_slides.pdf
+ */
+
+ /**
+ * Adds two {@code Float16} values together as per the {@code +}
+ * operator semantics using the round to nearest rounding policy.
+ *
+ * The handling of signed zeros, NaNs, infinities, and other
+ * special cases by this method is the same as for the handling of
+ * those cases by the built-in {@code +} operator for
+ * floating-point addition (JLS {@jls 15.18.2}).
+ *
+ * @apiNote This method corresponds to the addition operation
+ * defined in IEEE 754.
+ *
+ * @param addend the first operand
+ * @param augend the second operand
+ * @return the sum of the operands
+ *
+ * @jls 15.4 Floating-point Expressions
+ * @jls 15.18.2 Additive Operators (+ and -) for Numeric Types
+ */
+ public static Float16 add(Float16 addend, Float16 augend) {
+ return valueOf(addend.floatValue() + augend.floatValue());
+ }
+
+ /**
+ * Subtracts two {@code Float16} values as per the {@code -}
+ * operator semantics using the round to nearest rounding policy.
+ *
+ * The handling of signed zeros, NaNs, infinities, and other
+ * special cases by this method is the same as for the handling of
+ * those cases by the built-in {@code -} operator for
+ * floating-point subtraction (JLS {@jls 15.18.2}).
+ *
+ * @apiNote This method corresponds to the subtraction operation
+ * defined in IEEE 754.
+ *
+ * @param minuend the first operand
+ * @param subtrahend the second operand
+ * @return the difference of the operands
+ *
+ * @jls 15.4 Floating-point Expressions
+ * @jls 15.18.2 Additive Operators (+ and -) for Numeric Types
+ */
+ public static Float16 subtract(Float16 minuend, Float16 subtrahend) {
+ return valueOf(minuend.floatValue() - subtrahend.floatValue());
+ }
+
+ /**
+ * Multiplies two {@code Float16} values as per the {@code *}
+ * operator semantics using the round to nearest rounding policy.
+ *
+ * The handling of signed zeros, NaNs, and infinities, other
+ * special cases by this method is the same as for the handling of
+ * those cases by the built-in {@code *} operator for
+ * floating-point multiplication (JLS {@jls 15.17.1}).
+ *
+ * @apiNote This method corresponds to the multiplication
+ * operation defined in IEEE 754.
+ *
+ * @param multiplier the first operand
+ * @param multiplicand the second operand
+ * @return the product of the operands
+ *
+ * @jls 15.4 Floating-point Expressions
+ * @jls 15.17.1 Multiplication Operator *
+ */
+ public static Float16 multiply(Float16 multiplier, Float16 multiplicand) {
+ return valueOf(multiplier.floatValue() * multiplicand.floatValue());
+ }
+
+ /**
+ * Divides two {@code Float16} values as per the {@code /}
+ * operator semantics using the round to nearest rounding policy.
+ *
+ * The handling of signed zeros, NaNs, and infinities, other
+ * special cases by this method is the same as for the handling of
+ * those cases by the built-in {@code /} operator for
+ * floating-point division (JLS {@jls 15.17.2}).
+ *
+ * @apiNote This method corresponds to the division
+ * operation defined in IEEE 754.
+ *
+ * @param dividend the first operand
+ * @param divisor the second operand
+ * @return the quotient of the operands
+ *
+ * @jls 15.4 Floating-point Expressions
+ * @jls 15.17.2 Division Operator /
+ */
+ public static Float16 divide(Float16 dividend, Float16 divisor) {
+ return valueOf(dividend.floatValue() / divisor.floatValue());
+ }
+
+ /**
+ * {@return the square root of the operand} The square root is
+ * computed using the round to nearest rounding policy.
+ *
+ * The handling of zeros, NaN, infinities, and negative arguments
+ * by this method is analogous to the handling of those cases by
+ * {@link Math#sqrt(double)}.
+ *
+ * @apiNote
+ * This method corresponds to the squareRoot operation defined in
+ * IEEE 754.
+ *
+ * @param radicand the argument to have its square root taken
+ *
+ * @see Math#sqrt(double)
+ */
+ public static Float16 sqrt(Float16 radicand) {
+ // Rounding path of sqrt(Float16 -> double) -> Float16 is fine
+ // for preserving the correct final value. The conversion
+ // Float16 -> double preserves the exact numerical value. The
+ // conversion of double -> Float16 also benefits from the
+ // 2p+2 property of IEEE 754 arithmetic.
+ return valueOf(Math.sqrt(radicand.doubleValue()));
+ }
+
+ /**
+  * Returns the fused multiply add of the three arguments; that is,
+  * returns the exact product of the first two arguments summed
+  * with the third argument and then rounded once to the nearest
+  * {@code Float16}.
+  *
+  * Zeros, NaN, infinities, and other special cases are handled by
+  * this method analogously to the way
+  * {@link Math#fma(float, float, float)} handles them.
+  *
+  * @apiNote This method corresponds to the fusedMultiplyAdd
+  * operation defined in IEEE 754.
+  *
+  * @param a a value
+  * @param b a value
+  * @param c a value
+  *
+  * @return (a × b + c)
+  * computed, as if with unlimited range and precision, and rounded
+  * once to the nearest {@code Float16} value
+  *
+  * @see Math#fma(float, float, float)
+  * @see Math#fma(double, double, double)
+  */
+ public static Float16 fma(Float16 a, Float16 b, Float16 c) {
+     /*
+      * The double format has sufficient precision that a Float16
+      * fma can be computed by doing the arithmetic in double, with
+      * one rounding error for the sum, and then a second rounding
+      * error to round the product-sum to Float16. In pseudocode,
+      * this method is equivalent to the following code, assuming
+      * casting was defined between Float16 and double:
+      *
+      *     double product = (double)a * (double)b; // Always exact
+      *     double productSum = product + (double)c;
+      *     return (Float16)productSum;
+      *
+      * (Note that a similar relationship does *not* hold between
+      * the double format and computing a float fma.)
+      *
+      * Below is a sketch of the proof that simple double
+      * arithmetic can be used to implement a correctly rounded
+      * Float16 fma.
+      *
+      * ----------------------
+      *
+      * As preliminaries, the handling of NaN and Infinity
+      * arguments falls out as a consequence of general operation
+      * of non-finite values by double * and +. Any NaN argument to
+      * fma will lead to a NaN result, infinities will propagate or
+      * get turned into NaN as appropriate, etc.
+      *
+      * One or more zero arguments are also handled correctly,
+      * including the propagation of the sign of zero if all three
+      * arguments are zero.
+      *
+      * The double format has 53 logical bits of precision and its
+      * exponent range goes from -1022 to 1023. The Float16 format
+      * has 11 bits of logical precision and its exponent range
+      * goes from -14 to 15. Therefore, the individual powers of 2
+      * representable in the Float16 format range from the
+      * subnormal 2^(-24), MIN_VALUE, to 2^15, the leading bit
+      * position of MAX_VALUE.
+      *
+      * In cases where the numerical value of (a * b) + c is
+      * computed exactly in a double, after a single rounding to
+      * Float16, the result is necessarily correct since the one
+      * double -> Float16 conversion is the only source of
+      * numerical error. The operation as implemented in those
+      * cases would be equivalent to rounding the infinitely precise
+      * value to the result format, etc.
+      *
+      * However, for some inputs, the intermediate product-sum will
+      * *not* be exact and additional analysis is needed to justify
+      * not having any corrective computation to compensate for
+      * intermediate rounding errors.
+      *
+      * The following analysis will rely on the range of bit
+      * positions representable in the intermediate
+      * product-sum.
+      *
+      * For the product a*b of Float16 inputs, the range of
+      * exponents for nonzero finite results goes from 2^(-48)
+      * (from MIN_VALUE squared) to 2^31 (from the exact value of
+      * MAX_VALUE squared). This full range of exponent positions,
+      * (31 -(-48) + 1 ) = 80 exceeds the precision of
+      * double. However, only the product a*b can exceed the
+      * exponent range of Float16. Therefore, there are three main
+      * cases to consider:
+      *
+      * 1) Large exponent product, exponent > Float16.MAX_EXPONENT
+      *
+      * The magnitude of the overflow threshold for Float16 is:
+      *
+      *     MAX_VALUE + 1/2 * ulp(MAX_VALUE) = 0x1.ffcp15 + 0x0.002p15 = 0x1.ffep15
+      *
+      * Therefore, for any product greater than or equal in
+      * magnitude to (0x1.ffep15 + MAX_VALUE) = 0x1.ffdp16, the
+      * final fma result will certainly overflow to infinity (under
+      * round to nearest) since adding in c = -MAX_VALUE will still
+      * be at or above the overflow threshold.
+      *
+      * If the exponent of the product is 15 or 16, the smallest
+      * subnormal Float16 is 2^-24 and the ~40 bit wide range bit
+      * positions would fit in a single double exactly.
+      *
+      * 2) Exponent of product is within the range of _normal_
+      * Float16 values; Float16.MIN_EXPONENT <= exponent <= Float16.MAX_EXPONENT
+      *
+      * The exact product has at most 22 contiguous bits in its
+      * logical significand. The third number being added in has at
+      * most 11 contiguous bits in its significand and the lowest
+      * bit position that could be set is 2^(-24). Therefore, when
+      * the product has the maximum in-range exponent, 2^15, a
+      * single double has enough precision to hold down to the
+      * smallest subnormal bit position, 15 - (-24) + 1 = 40 <
+      * 53. If the product was large and rounded up, increasing the
+      * exponent, when the third operand was added, this would
+      * cause the exponent to go up to 16, which is within the
+      * range of double, so the product-sum is exact and will be
+      * correct when rounded to Float16.
+      *
+      * 3) Exponent of product is in the range of subnormal values or smaller,
+      * exponent < Float16.MIN_EXPONENT
+      *
+      * The smallest exponent possible in a product is 2^(-48).
+      * For moderately sized Float16 values added to the product,
+      * with an exponent of about 4, the sum will not be
+      * exact. Therefore, an analysis is needed to determine if the
+      * double-rounding is benign or would lead to a different
+      * final Float16 result. Double rounding can lead to a
+      * different result in two cases:
+      *
+      * 1) The first rounding from the exact value to the extended
+      * precision (here `double`) happens to be directed _toward_ 0
+      * to a value exactly midway between two adjacent working
+      * precision (here `Float16`) values, followed by a second
+      * rounding from there which again happens to be directed
+      * _toward_ 0 to one of these values (the one with lesser
+      * magnitude). A single rounding from the exact value to the
+      * working precision, in contrast, rounds to the value with
+      * larger magnitude.
+      *
+      * 2) Symmetrically, the first rounding to the extended
+      * precision happens to be directed _away_ from 0 to a value
+      * exactly midway between two adjacent working precision
+      * values, followed by a second rounding from there which
+      * again happens to be directed _away_ from 0 to one of these
+      * values (the one with larger magnitude). However, a single
+      * rounding from the exact value to the working precision
+      * rounds to the value with lesser magnitude.
+      *
+      * If the double rounding occurs in other cases, it is
+      * innocuous, returning the same value as a single rounding to
+      * the final format. Therefore, it is sufficient to show that
+      * the first rounding to double does not occur at the midpoint
+      * of two adjacent Float16 values:
+      *
+      * 1) If a*b and c have the same sign, the sum a*b + c has a
+      * significand with a large gap of 20 or more 0s between the
+      * bits of the significand of c to the left (at most 11 bits)
+      * and those of the product a*b to the right (at most 22
+      * bits). The rounding bit for the final working precision of
+      * `Float16` is the leftmost 0 in the gap.
+      *
+      *   a) If rounding to `double` is directed toward 0, all the
+      *   0s in the gap are preserved, thus the `Float16` rounding
+      *   bit is unaffected and remains 0. This means that the
+      *   `double` value is _not_ the midpoint of two adjacent
+      *   `Float16` values, so double rounding is harmless.
+      *
+      *   b) If rounding to `double` is directed away from 0, the
+      *   rightmost 0 in the gap might be replaced by a 1, but the
+      *   others are unaffected, including the `Float16` rounding
+      *   bit. Again, this shows that the `double` value is _not_
+      *   the midpoint of two adjacent `Float16` values, and double
+      *   rounding is innocuous.
+      *
+      * 2) If a*b and c have opposite signs, in the sum a*b + c
+      * the long gap of 0s above is replaced by a long gap of
+      * 1s. The `Float16` rounding bit is the leftmost 1 in the
+      * gap, or the second leftmost 1 iff c is a power of 2. In
+      * both cases, the rounding bit is followed by at least
+      * another 1.
+      *
+      *   a) If rounding to `double` is directed toward 0, the
+      *   `Float16` rounding bit and its follower are preserved and
+      *   both 1, so the `double` value is _not_ the midpoint of
+      *   two adjacent `Float16` values, and double rounding is
+      *   harmless.
+      *
+      *   b) If rounding to `double` is directed away from 0, the
+      *   `Float16` rounding bit and its follower are either
+      *   preserved (both 1), or both switch to 0. Either way, the
+      *   `double` value is again _not_ the midpoint of two
+      *   adjacent `Float16` values, and double rounding is
+      *   harmless.
+      */
+
+     // The product is numerically exact in float, so there is no
+     // need to widen to double before the multiply; only the
+     // finished product is widened for the addition.
+     float exactProduct = a.floatValue() * b.floatValue();
+     double productSum = (double) exactProduct + c.doubleValue();
+     return valueOf(productSum);
+ }
+
+ /**
+ * {@return the negation of the argument}
+ *
+ * Special cases:
+ *
+ * - If the argument is zero, the result is a zero with the
+ * opposite sign as the argument.
+ *
- If the argument is infinite, the result is an infinity
+ * with the opposite sign as the argument.
+ *
- If the argument is a NaN, the result is a NaN.
+ *
+ *
+ * @apiNote
+ * This method corresponds to the negate operation defined in IEEE
+ * 754.
+ *
+ * @param f16 the value to be negated
+ * @jls 15.15.4 Unary Minus Operator {@code -}
+ */
+ public static Float16 negate(Float16 f16) {
+ // Negate sign bit only. Per IEEE 754-2019 section 5.5.1,
+ // negate is a bit-level operation and not a logical
+ // operation. Therefore, in this case do _not_ use the float
+ // unary minus as an implementation as that is not guaranteed
+ // to flip the sign bit of a NaN.
+ return shortBitsToFloat16((short)(f16.value ^ (short)0x0000_8000));
+ }
+
+ /**
+ * {@return the absolute value of the argument}
+ *
+ * The handling of zeros, NaN, and infinities by this method is
+ * analogous to the handling of those cases by {@link
+ * Math#abs(float)}.
+ *
+ * @param f16 the argument whose absolute value is to be determined
+ *
+ * @see Math#abs(float)
+ * @see Math#abs(double)
+ */
+ public static Float16 abs(Float16 f16) {
+ // Zero out sign bit. Per IEE 754-2019 section 5.5.1, abs is a
+ // bit-level operation and not a logical operation.
+ return shortBitsToFloat16((short)(f16.value & (short)0x0000_7FFF));
+ }
+
+ /**
+  * Returns the unbiased exponent used in the representation of a
+  * {@code Float16}.
+  *
+  * <ul>
+  * <li> If the argument is NaN or infinite, then the result is
+  * {@link Float16#MAX_EXPONENT} + 1.
+  *
+  * <li> If the argument is zero or subnormal, then the result is
+  * {@link Float16#MIN_EXPONENT} - 1.
+  *
+  * </ul>
+  *
+  * @apiNote
+  * This method is analogous to the logB operation defined in IEEE
+  * 754, but returns a different value on subnormal arguments.
+  *
+  * @param f16 a {@code Float16} value
+  * @return the unbiased exponent of the argument
+  *
+  * @see Math#getExponent(float)
+  * @see Math#getExponent(double)
+  */
+ public static int getExponent(Float16 f16) {
+     // The extraction works directly on the raw bit pattern.
+     final short rawBits = f16.value;
+     return getExponent0(rawBits);
+ }
+
+ /**
+  * Extracts the true (unbiased) exponent from the bitwise
+  * representation of a float16: the exponent bits are masked out,
+  * shifted down to the low-order positions, and float16's bias
+  * adjust, 15, is then subtracted.
+  */
+ /*package*/ static int getExponent0(short bits) {
+     // package private to be usable in java.lang.Float.
+     int exponentField = bits & 0x0000_7c00; // Five exponent bits.
+     int biasedExponent = exponentField >> (PRECISION - 1);
+     return biasedExponent - 15;
+ }
+
+ /**
+  * Returns the size of an ulp of the argument. An ulp, unit in
+  * the last place, of a {@code Float16} value is the positive
+  * distance between this floating-point value and the {@code
+  * Float16} value next larger in magnitude. Note that for non-NaN
+  * <i>x</i>, <code>ulp(-<i>x</i>) == ulp(<i>x</i>)</code>.
+  *
+  * Special Cases:
+  * <ul>
+  * <li> If the argument is NaN, then the result is NaN.
+  *
+  * <li> If the argument is positive or negative infinity, then the
+  * result is positive infinity.
+  *
+  * <li> If the argument is positive or negative zero, then the result is
+  * {@code Float16.MIN_VALUE}.
+  *
+  * <li> If the argument is &plusmn;{@code Float16.MAX_VALUE}, then
+  * the result is equal to 2<sup>5</sup>, 32.0.
+  *
+  * </ul>
+  *
+  * @param f16 the floating-point value whose ulp is to be returned
+  * @return the size of an ulp of the argument
+  *
+  * @see Math#ulp(float)
+  * @see Math#ulp(double)
+  */
+ public static Float16 ulp(Float16 f16) {
+     int exp = getExponent(f16);
+
+     if (exp == MAX_EXPONENT + 1) {
+         // NaN or infinity
+         return abs(f16);
+     }
+     if (exp == MIN_EXPONENT - 1) {
+         // zero or subnormal
+         return Float16.MIN_VALUE;
+     }
+
+     assert exp <= MAX_EXPONENT && exp >= MIN_EXPONENT;
+     // For a normal x, ulp(x) is 2^(exp - (PRECISION - 1)), i.e.
+     // 2^-(PRECISION - 1) * 2^exp. Let scalb and the conversion to
+     // Float16 handle encoding issues.
+     return scalb(valueOf(1), exp - (PRECISION - 1));
+ }
+
+ /**
+  * Returns the floating-point value adjacent to {@code v} in
+  * the direction of positive infinity.
+  *
+  * Special Cases:
+  * <ul>
+  * <li> If the argument is NaN, the result is NaN.
+  *
+  * <li> If the argument is positive infinity, the result is
+  * positive infinity.
+  *
+  * <li> If the argument is zero, the result is
+  * {@link #MIN_VALUE}
+  *
+  * </ul>
+  *
+  * @apiNote This method corresponds to the nextUp
+  * operation defined in IEEE 754.
+  *
+  * @param v starting floating-point value
+  * @return The adjacent floating-point value closer to positive
+  * infinity.
+  *
+  * @see Math#nextUp(float)
+  * @see Math#nextUp(double)
+  */
+ public static Float16 nextUp(Float16 v) {
+     float f = v.floatValue();
+     if (!(f < Float.POSITIVE_INFINITY)) {
+         return v; // v is NaN or +Infinity
+     }
+     if (f == 0) {
+         // Either zero steps up to the smallest positive value.
+         return MIN_VALUE;
+     }
+     // Step the raw bit pattern by one: a non-negative pattern grows,
+     // a negative (sign bit set) pattern shrinks in magnitude.
+     int bits = float16ToRawShortBits(v);
+     int step = (bits >= 0) ? 1 : -1;
+     return shortBitsToFloat16((short) (bits + step));
+ }
+
+ /**
+  * Returns the floating-point value adjacent to {@code v} in
+  * the direction of negative infinity.
+  *
+  * Special Cases:
+  * <ul>
+  * <li> If the argument is NaN, the result is NaN.
+  *
+  * <li> If the argument is negative infinity, the result is
+  * negative infinity.
+  *
+  * <li> If the argument is zero, the result is
+  * -{@link #MIN_VALUE}
+  *
+  * </ul>
+  *
+  * @apiNote This method corresponds to the nextDown
+  * operation defined in IEEE 754.
+  *
+  * @param v starting floating-point value
+  * @return The adjacent floating-point value closer to negative
+  * infinity.
+  *
+  * @see Math#nextDown(float)
+  * @see Math#nextDown(double)
+  */
+ public static Float16 nextDown(Float16 v) {
+     float f = v.floatValue();
+     if (!(f > Float.NEGATIVE_INFINITY)) {
+         return v; // v is NaN or -Infinity
+     }
+     if (f == 0) {
+         // Either zero steps down to the negative value of smallest
+         // magnitude.
+         return negate(MIN_VALUE);
+     }
+     // Step the raw bit pattern by one: a non-negative pattern
+     // shrinks, a negative (sign bit set) pattern grows in magnitude.
+     int bits = float16ToRawShortBits(v);
+     int step = (bits >= 0) ? 1 : -1;
+     return shortBitsToFloat16((short) (bits - step));
+ }
+
+ /**
+  * Returns {@code v} &times; 2<sup>{@code scaleFactor}</sup>
+  * rounded as if performed by a single correctly rounded
+  * floating-point multiply. If the exponent of the result is
+  * between {@link Float16#MIN_EXPONENT} and {@link
+  * Float16#MAX_EXPONENT}, the answer is calculated exactly. If the
+  * exponent of the result would be larger than {@code
+  * Float16.MAX_EXPONENT}, an infinity is returned. Note that if the
+  * result is subnormal, precision may be lost; that is, when
+  * {@code scalb(x, n)} is subnormal, {@code scalb(scalb(x, n),
+  * -n)} may not equal <i>x</i>. When the result is non-NaN, the
+  * result has the same sign as {@code v}.
+  *
+  * Special cases:
+  * <ul>
+  * <li> If the first argument is NaN, NaN is returned.
+  *
+  * <li> If the first argument is infinite, then an infinity of the
+  * same sign is returned.
+  *
+  * <li> If the first argument is zero, then a zero of the same
+  * sign is returned.
+  *
+  * </ul>
+  *
+  * @apiNote This method corresponds to the scaleB operation
+  * defined in IEEE 754.
+  *
+  * @param v number to be scaled by a power of two.
+  * @param scaleFactor power of 2 used to scale {@code v}
+  * @return {@code v} &times; 2<sup>{@code scaleFactor}</sup>
+  *
+  * @see Math#scalb(float, int)
+  * @see Math#scalb(double, int)
+  */
+ public static Float16 scalb(Float16 v, int scaleFactor) {
+     // Magnitude of a power of two so large that scaling a finite
+     // nonzero value by it would be guaranteed to over or
+     // underflow; due to rounding, scaling down takes an
+     // additional power of two which is reflected here.
+     final int maxScale = Float16.MAX_EXPONENT + -Float16.MIN_EXPONENT +
+         Float16Consts.SIGNIFICAND_WIDTH + 1;
+
+     // Clamp the scaling factor to the range [-maxScale, maxScale].
+     int cappedAbove = Math.min(scaleFactor, maxScale);
+     int clampedScale = Math.max(cappedAbove, -maxScale);
+
+     final int doubleExpBias = 1023;
+
+     // Build the double 2^clampedScale directly from its bit pattern:
+     // the biased exponent sits just above the fraction bits.
+     long powerOfTwoBits = ((long) (clampedScale + doubleExpBias)) << (Double.PRECISION - 1);
+     double powerOfTwo = Double.longBitsToDouble(powerOfTwoBits);
+
+     /*
+      * Since a scale of +/- maxScale for Float16 fits well within
+      * the double exponent range and the Float16 -> double
+      * conversion is exact, the multiplication below will be exact.
+      * Therefore, the rounding that occurs when the double product
+      * is converted to Float16 will be the correctly rounded
+      * Float16 result.
+      */
+     return valueOf(v.doubleValue() * powerOfTwo);
+ }
+ /**
+ * Returns the first floating-point argument with the sign of the
+ * second floating-point argument.
+ * This method does not require NaN {@code sign}
+ * arguments to be treated as positive values; implementations are
+ * permitted to treat some NaN arguments as positive and other NaN
+ * arguments as negative to allow greater performance.
+ *
+ * @apiNote
+ * This method corresponds to the copySign operation defined in
+ * IEEE 754.
+ *
+ * @param magnitude the parameter providing the magnitude of the result
+ * @param sign the parameter providing the sign of the result
+ * @return a value with the magnitude of {@code magnitude}
+ * and the sign of {@code sign}.
+ *
+ * @see Math#copySign(float, float)
+ * @see Math#copySign(double, double)
+ */
+ public static Float16 copySign(Float16 magnitude, Float16 sign) {
+ return shortBitsToFloat16((short) ((float16ToRawShortBits(sign) & SIGN_BIT_MASK) |
+ (float16ToRawShortBits(magnitude) &
+ (EXP_BIT_MASK | SIGNIF_BIT_MASK) )));
+ }
+
+ /**
+  * Returns the signum function of the argument; zero if the argument
+  * is zero, 1.0 if the argument is greater than zero, -1.0 if the
+  * argument is less than zero.
+  *
+  * Special Cases:
+  * <ul>
+  * <li> If the argument is NaN, then the result is NaN.
+  *
+  * <li> If the argument is positive zero or negative zero, then the
+  * result is the same as the argument.
+  *
+  * </ul>
+  *
+  * @param f the floating-point value whose signum is to be returned
+  * @return the signum function of the argument
+  *
+  * @see Math#signum(float)
+  * @see Math#signum(double)
+  */
+ public static Float16 signum(Float16 f) {
+     // Zeros (of either sign) and NaN are returned unchanged.
+     if (f.floatValue() == 0.0f || isNaN(f)) {
+         return f;
+     }
+     // Everything else maps to 1.0 carrying the sign of the argument.
+     return copySign(valueOf(1), f);
+ }
+
+ // TODO: Temporary location for this functionality while Float16
+ // resides in incubator.
+ private static final class Float16ToDecimal {
+ /*
+ * For full details about this code see the following references:
+ *
+ * [1] Giulietti, "The Schubfach way to render doubles",
+ * https://drive.google.com/file/d/1gp5xv4CAa78SVgCeWfGqqI4FfYYYuNFb
+ *
+ * [2] IEEE Computer Society, "IEEE Standard for Floating-Point Arithmetic"
+ *
+ * [3] Bouvier & Zimmermann, "Division-Free Binary-to-Decimal Conversion"
+ *
+ * Divisions are avoided altogether for the benefit of those architectures
+ * that do not provide specific machine instructions or where they are slow.
+ * This is discussed in section 10 of [1].
+ */
+
+ /* The precision in bits */
+ static final int P = PRECISION;
+
+ /* Exponent width in bits */
+ private static final int W = (Float16.SIZE - 1) - (P - 1);
+
+ /* Minimum value of the exponent: -(2^(W-1)) - P + 3 */
+ static final int Q_MIN = (-1 << (W - 1)) - P + 3;
+
+ /* Maximum value of the exponent: 2^(W-1) - P */
+ static final int Q_MAX = (1 << (W - 1)) - P;
+
+ /* 10^(E_MIN - 1) <= MIN_VALUE < 10^E_MIN */
+ static final int E_MIN = -7;
+
+ /* 10^(E_MAX - 1) <= MAX_VALUE < 10^E_MAX */
+ static final int E_MAX = 5;
+
+ /* Threshold to detect tiny values, as in section 8.2.1 of [1] */
+ static final int C_TINY = 2;
+
+ /* The minimum and maximum k, as in section 8 of [1] */
+ static final int K_MIN = -8;
+ static final int K_MAX = 1;
+
+ /* H is as in section 8.1 of [1] */
+ static final int H = 5;
+
+ /* Minimum value of the significand of a normal value: 2^(P-1) */
+ private static final int C_MIN = 1 << (P - 1);
+
+ /* Mask to extract the biased exponent */
+ private static final int BQ_MASK = (1 << W) - 1;
+
+ /* Mask to extract the fraction bits */
+ private static final int T_MASK = (1 << (P - 1)) - 1;
+
+ /* Used in rop() */
+ private static final long MASK_32 = (1L << 32) - 1;
+
+ /* Used for left-to-right digit extraction */
+ private static final int MASK_15 = (1 << 15) - 1;
+
+ private static final int NON_SPECIAL = 0;
+ private static final int PLUS_ZERO = 1;
+ private static final int MINUS_ZERO = 2;
+ private static final int PLUS_INF = 3;
+ private static final int MINUS_INF = 4;
+ private static final int NAN = 5;
+
+ /*
+ * Room for the longer of the forms
+ * -ddd.dd H + 2 characters
+ * -ddddd.0 H + 3 characters
+ * -0.00ddddd H + 5 characters
+ * -d.ddddE-e H + 5 characters
+ * where there are H digits d
+ */
+ public static final int MAX_CHARS = H + 5;
+
+ private final byte[] bytes = new byte[MAX_CHARS];
+
+ /* Index into bytes of rightmost valid character */
+ private int index;
+
+ /*
+ * Private: instances are created only by the static toString() and
+ * appendTo() entry points of this class, one per conversion.
+ */
+ private Float16ToDecimal() {
+ }
+
+ /**
+ * Returns a string representation of the {@code Float16}
+ * argument. All characters mentioned below are ASCII characters.
+ *
+ * @param v the {@code Float16} to be converted.
+ * @return a string representation of the argument.
+ * @see Float16#toString(Float16)
+ */
+ public static String toString(Float16 v) {
+ return new Float16ToDecimal().toDecimalString(v);
+ }
+
+ /**
+  * Appends the rendering of the {@code v} to {@code app}.
+  *
+  * The outcome is the same as if {@code v} were first
+  * {@link #toString(Float16) rendered} and the resulting string were then
+  * {@link Appendable#append(CharSequence) appended} to {@code app}.
+  *
+  * @param v the {@code Float16} whose rendering is appended.
+  * @param app the {@link Appendable} to append to.
+  * @return the {@link Appendable} produced by the append operation
+  * @throws IOException If an I/O error occurs
+  */
+ public static Appendable appendTo(Float16 v, Appendable app)
+         throws IOException {
+     // Each conversion works on its own fresh converter instance.
+     Float16ToDecimal converter = new Float16ToDecimal();
+     return converter.appendDecimalTo(v, app);
+ }
+
+ /*
+  * Converts v and returns its rendering: the digits produced by the
+  * conversion for the NON_SPECIAL outcome, a fixed literal for the
+  * zero and infinity outcomes, and "NaN" for any other outcome.
+  */
+ private String toDecimalString(Float16 v) {
+     int kind = toDecimal(v);
+     switch (kind) {
+         case NON_SPECIAL:
+             return charsToString();
+         case PLUS_ZERO:
+             return "0.0";
+         case MINUS_ZERO:
+             return "-0.0";
+         case PLUS_INF:
+             return "Infinity";
+         case MINUS_INF:
+             return "-Infinity";
+         default:
+             return "NaN";
+     }
+ }
+
+ /*
+  * Converts v and appends its rendering to app, returning whatever
+  * the append operation returns. The zero, infinity and NaN outcomes
+  * append a fixed literal; the NON_SPECIAL outcome appends the
+  * characters held in bytes[0..index].
+  */
+ private Appendable appendDecimalTo(Float16 v, Appendable app)
+         throws IOException {
+     int kind = toDecimal(v);
+     if (kind != NON_SPECIAL) {
+         String literal = switch (kind) {
+             case PLUS_ZERO -> "0.0";
+             case MINUS_ZERO -> "-0.0";
+             case PLUS_INF -> "Infinity";
+             case MINUS_INF -> "-Infinity";
+             default -> "NaN";
+         };
+         return app.append(literal);
+     }
+
+     // Widen the rendered bytes to chars.
+     int length = index + 1;
+     char[] rendered = new char[length];
+     for (int i = 0; i < length; ++i) {
+         rendered[i] = (char) bytes[i];
+     }
+
+     // StringBuilder and StringBuffer accept the whole array at once;
+     // any other Appendable receives the characters one at a time.
+     if (app instanceof StringBuilder builder) {
+         return builder.append(rendered);
+     }
+     if (app instanceof StringBuffer buffer) {
+         return buffer.append(rendered);
+     }
+     for (char ch : rendered) {
+         app.append(ch);
+     }
+     return app;
+ }
+
+ /*
+ * Classifies v from its raw bits and, for finite nonzero values,
+ * resets index, appends a leading '-' when the value is negative,
+ * and delegates to toChars() or toDecimal(q, c, dk).
+ *
+ * Returns
+ * PLUS_ZERO iff v is 0.0
+ * MINUS_ZERO iff v is -0.0
+ * PLUS_INF iff v is POSITIVE_INFINITY
+ * MINUS_INF iff v is NEGATIVE_INFINITY
+ * NAN iff v is NaN
+ * and otherwise whatever toChars() or toDecimal(q, c, dk) returns
+ * (presumably NON_SPECIAL, which the callers treat as "digits are
+ * available" -- confirm against toChars()).
+ */
+ private int toDecimal(Float16 v) {
+ /*
+ * For full details see references [2] and [1].
+ *
+ * For finite v != 0, determine integers c and q such that
+ * |v| = c 2^q and
+ * Q_MIN <= q <= Q_MAX and
+ * either 2^(P-1) <= c < 2^P (normal)
+ * or 0 < c < 2^(P-1) and q = Q_MIN (subnormal)
+ */
+ int bits = float16ToRawShortBits(v);
+ /* t: the fraction bits; bq: the biased exponent field */
+ int t = bits & T_MASK;
+ int bq = (bits >>> P - 1) & BQ_MASK;
+ /* Exponent field not all ones: zero, subnormal or normal value */
+ if (bq < BQ_MASK) {
+ index = -1;
+ /* NOTE(review): relies on bits being negative exactly when the sign bit is set, i.e. on the raw short bits being sign-extended -- confirm */
+ if (bits < 0) {
+ append('-');
+ }
+ if (bq != 0) {
+ /* normal value. Here mq = -q */
+ int mq = -Q_MIN + 1 - bq;
+ int c = C_MIN | t;
+ /* The fast path discussed in section 8.3 of [1] */
+ if (0 < mq & mq < P) {
+ /* c >> mq loses no bits exactly when c 2^q is an integer */
+ int f = c >> mq;
+ if (f << mq == c) {
+ return toChars(f, 0);
+ }
+ }
+ return toDecimal(-mq, c, 0);
+ }
+ if (t != 0) {
+ /* subnormal value */
+ /* tiny significands are scaled by 10, with dk = -1 passed as the matching decimal exponent adjustment */
+ return t < C_TINY
+ ? toDecimal(Q_MIN, 10 * t, -1)
+ : toDecimal(Q_MIN, t, 0);
+ }
+ /* exponent and fraction both zero: only the sign bit can be set */
+ return bits == 0 ? PLUS_ZERO : MINUS_ZERO;
+ }
+ /* Exponent field all ones: NaN if any fraction bit is set, else an infinity */
+ if (t != 0) {
+ return NAN;
+ }
+ return bits > 0 ? PLUS_INF : MINUS_INF;
+ }
+
+ /*
+ * Selects the decimal to render for the value c 2^q, following
+ * figure 7 of [1], and hands it to toChars(), whose result is
+ * returned.
+ *
+ * q, c: binary exponent and significand, as determined by the caller
+ * dk: adjustment added to the decimal exponent k on the general
+ * paths; the caller passes -1 together with a significand scaled
+ * by 10 for tiny subnormals, and 0 otherwise
+ */
+ private int toDecimal(int q, int c, int dk) {
+ /*
+ * The skeleton corresponds to figure 7 of [1].
+ * The efficient computations are those summarized in figure 9.
+ * Also check the appendix.
+ *
+ * Here's a correspondence between Java names and names in [1],
+ * expressed as approximate LaTeX source code and informally.
+ * Other names are identical.
+ * cb: \bar{c} "c-bar"
+ * cbr: \bar{c}_r "c-bar-r"
+ * cbl: \bar{c}_l "c-bar-l"
+ *
+ * vb: \bar{v} "v-bar"
+ * vbr: \bar{v}_r "v-bar-r"
+ * vbl: \bar{v}_l "v-bar-l"
+ *
+ * rop: r_o' "r-o-prime"
+ */
+ /* out is 1 when c is odd, 0 when c is even */
+ int out = c & 0x1;
+ /* cb = 4 c, with cbl and cbr just below and above it */
+ long cb = c << 2;
+ long cbr = cb + 2;
+ long cbl;
+ int k;
+ /*
+ * flog10pow2(e) = floor(log_10(2^e))
+ * flog10threeQuartersPow2(e) = floor(log_10(3/4 2^e))
+ * flog2pow10(e) = floor(log_2(10^e))
+ */
+ if (c != C_MIN | q == Q_MIN) {
+ /* regular spacing */
+ cbl = cb - 2;
+ k = MathUtils.flog10pow2(q);
+ } else {
+ /* irregular spacing */
+ cbl = cb - 1;
+ k = MathUtils.flog10threeQuartersPow2(q);
+ }
+ int h = q + MathUtils.flog2pow10(-k) + 33;
+
+ /* g is as in the appendix */
+ long g = MathUtils.g1(k) + 1;
+
+ int vb = rop(g, cb << h);
+ int vbl = rop(g, cbl << h);
+ int vbr = rop(g, cbr << h);
+
+ int s = vb >> 2;
+ if (s >= 100) {
+ /*
+ * For n = 5, m = 1 the discussion in section 10 of [1] shows
+ * s' = floor(s / 10) = floor(s 52_429 / 2^19)
+ *
+ * sp10 = 10 s'
+ * tp10 = 10 t'
+ * upin iff u' = sp10 10^k in Rv
+ * wpin iff w' = tp10 10^k in Rv
+ * See section 9.3 of [1].
+ */
+ int sp10 = 10 * (int) (s * 52_429L >>> 19);
+ int tp10 = sp10 + 10;
+ boolean upin = vbl + out <= sp10 << 2;
+ boolean wpin = (tp10 << 2) + out <= vbr;
+ if (upin != wpin) {
+ /* Exactly one of u' or w' lies in Rv; note that k is used here without dk */
+ return toChars(upin ? sp10 : tp10, k);
+ }
+ }
+
+ /*
+ * 10 <= s < 100 or s >= 100 and u', w' not in Rv
+ * uin iff u = s 10^k in Rv
+ * win iff w = t 10^k in Rv
+ * See section 9.3 of [1].
+ */
+ int t = s + 1;
+ boolean uin = vbl + out <= s << 2;
+ boolean win = (t << 2) + out <= vbr;
+ if (uin != win) {
+ /* Exactly one of u or w lies in Rv */
+ return toChars(uin ? s : t, k + dk);
+ }
+ /*
+ * Both u and w lie in Rv: determine the one closest to v.
+ * See section 9.3 of [1].
+ */
+ /* cmp compares vb with 2 (s + t); on a tie the candidate with the even s is chosen */
+ int cmp = vb - (s + t << 1);
+ return toChars(cmp < 0 || cmp == 0 && (s & 0x1) == 0 ? s : t, k + dk);
+ }
+
+ /*
+ * Computes rop(cp g 2^(-95))
+ * See appendix and figure 11 of [1].
+ */
+ private static int rop(long g, long cp) {
+ long x1 = multiplyHigh(g, cp);
+ long vbp = x1 >>> 31;
+ return (int) (vbp | (x1 & MASK_32) + MASK_32 >>> 32);
+ }
+
+ /*
+ * Formats the decimal f 10^e.
+ *
+ * f is first scaled to exactly H digits, then split into a leading digit
+ * and four trailing digits, and finally handed to one of toChars1(),
+ * toChars2() or toChars3() depending on the adjusted exponent.
+ * Returns whatever the selected toChars?() method returns.
+ */
+ private int toChars(int f, int e) {
+ /*
+ * For details not discussed here see section 10 of [1].
+ *
+ * Determine len such that
+ * 10^(len-1) <= f < 10^len
+ */
+ int len = MathUtils.flog10pow2(Integer.SIZE - numberOfLeadingZeros(f));
+ if (f >= MathUtils.pow10(len)) {
+ len += 1;
+ }
+
+ /*
+ * Let fp and ep be the original f and e, respectively.
+ * Transform f and e to ensure
+ * 10^(H-1) <= f < 10^H
+ * fp 10^ep = f 10^(e-H) = 0.f 10^e
+ */
+ f *= (int)MathUtils.pow10(H - len);
+ e += len;
+
+ /*
+ * The toChars?() methods perform left-to-right digits extraction
+ * using ints, provided that the arguments are limited to 8 digits.
+ * Therefore, split the H = 5 digits of f into:
+ * h = the most significant digit of f
+ * l = the last 4, least significant digits of f
+ *
+ * For n = 5, m = 4 the discussion in section 10 of [1] shows
+ * floor(f / 10^4) = floor(107_375L f / 2^30)
+ */
+ int h = (int) (f * 107_375L >>> 30);
+ int l = f - 10_000 * h;
+
+ /* 0 < e <= 7: plain format without leading zeroes */
+ if (0 < e && e <= 7) {
+ return toChars1(h, l, e);
+ }
+ /* -3 < e <= 0: plain format with leading zeroes */
+ if (-3 < e && e <= 0) {
+ return toChars2(h, l, e);
+ }
+ /* otherwise: computerized scientific notation */
+ return toChars3(h, l, e);
+ }
+
+ private int toChars1(int h, int l, int e) {
+ /*
+ * 0 < e <= 7: plain format without leading zeroes.
+ * Left-to-right digits extraction:
+ * algorithm 1 in [3], with b = 10, k = 4, n = 15.
+ */
+ appendDigit(h);
+ int y = y(l);
+ int t;
+ int i = 1;
+ for (; i < e; ++i) {
+ t = 10 * y;
+ appendDigit(t >>> 15);
+ y = t & MASK_15;
+ }
+ append('.');
+ for (; i <= 4; ++i) {
+ t = 10 * y;
+ appendDigit(t >>> 15);
+ y = t & MASK_15;
+ }
+ /*
+ * As H = 5 < 7, where 7 is the threshold for plain format without
+ * leading zeros, it can happen that the 2nd loop above is not executed.
+ * The following line ensures the presence of a digit to the right
+ * of the decimal point.
+ */
+ appendDigit(0);
+ removeTrailingZeroes();
+ return NON_SPECIAL;
+ }
+
+ private int toChars2(int h, int l, int e) {
+ /* -3 < e <= 0: plain format with leading zeroes */
+ appendDigit(0);
+ append('.');
+ for (; e < 0; ++e) {
+ appendDigit(0);
+ }
+ appendDigit(h);
+ append4Digits(l);
+ removeTrailingZeroes();
+ return NON_SPECIAL;
+ }
+
+ private int toChars3(int h, int l, int e) {
+ /* -3 >= e | e > 7: computerized scientific notation */
+ appendDigit(h);
+ append('.');
+ append4Digits(l);
+ removeTrailingZeroes();
+ exponent(e - 1);
+ return NON_SPECIAL;
+ }
+
+ private void append4Digits(int m) {
+ /*
+ * Left-to-right digits extraction:
+ * algorithm 1 in [3], with b = 10, k = 4, n = 15.
+ */
+ int y = y(m);
+ for (int i = 0; i < 4; ++i) {
+ int t = 10 * y;
+ appendDigit(t >>> 15);
+ y = t & MASK_15;
+ }
+ }
+
+ private void removeTrailingZeroes() {
+ while (bytes[index] == '0') {
+ --index;
+ }
+ /* ... but do not remove the one directly to the right of '.' */
+ if (bytes[index] == '.') {
+ ++index;
+ }
+ }
+
+ private int y(int a) {
+ /*
+ * Algorithm 1 in [3] needs computation of
+ * floor((a + 1) 2^n / b^k) - 1
+ * with a < 10^4, b = 10, k = 4, n = 15.
+ * Noting that
+ * (a + 1) 2^n <= 10^4 2^15 < 10^9
+ * For n = 9, m = 4 the discussion in section 10 of [1] leads to:
+ */
+ return (int) (((a + 1) << 15) * 1_759_218_605L >>> 44) - 1;
+ }
+
+ private void exponent(int e) {
+ append('E');
+ if (e < 0) {
+ append('-');
+ e = -e;
+ }
+ appendDigit(e);
+ }
+
+ private void append(int c) {
+ bytes[++index] = (byte) c;
+ }
+
+ private void appendDigit(int d) {
+ bytes[++index] = (byte) ('0' + d);
+ }
+
+ /* Using the deprecated constructor enhances performance */
+ @SuppressWarnings("deprecation")
+ private String charsToString() {
+ return new String(bytes, 0, 0, index + 1);
+ }
+
+ }
+
+ /* TODO Temporary hack while Float16 resides in incubator */
+
+ /**
+ * This class exposes package private utilities for other classes.
+ * Thus, all methods are assumed to be invoked with correct arguments,
+ * so these are not checked at all.
+ */
+ private static final class MathUtils {
+ /*
+ * For full details about this code see the following reference:
+ *
+ * Giulietti, "The Schubfach way to render doubles",
+ * https://drive.google.com/file/d/1gp5xv4CAa78SVgCeWfGqqI4FfYYYuNFb
+ */
+
+ /*
+ * The boundaries for k in g0(int) and g1(int).
+ * K_MIN must be DoubleToDecimal.K_MIN or less.
+ * K_MAX must be DoubleToDecimal.K_MAX or more.
+ */
+ static final int K_MIN = -8;
+ static final int K_MAX = 1;
+
+ /* Must be DoubleToDecimal.H or more */
+ static final int H = 17;
+
+ /* C_10 = floor(log10(2) * 2^Q_10), A_10 = floor(log10(3/4) * 2^Q_10) */
+ private static final int Q_10 = 41;
+ private static final long C_10 = 661_971_961_083L;
+ private static final long A_10 = -274_743_187_321L;
+
+ /* C_2 = floor(log2(10) * 2^Q_2) */
+ private static final int Q_2 = 38;
+ private static final long C_2 = 913_124_641_741L;
+
+ private MathUtils() {
+ throw new RuntimeException("not supposed to be instantiated.");
+ }
+
+ /* The first powers of 10. The last entry must be 10^(DoubleToDecimal.H) */
+ private static final long[] pow10 = {
+ 1L,
+ 10L,
+ 100L,
+ 1_000L,
+ 10_000L,
+ 100_000L,
+ 1_000_000L,
+ 10_000_000L,
+ 100_000_000L,
+ 1_000_000_000L,
+ 10_000_000_000L,
+ 100_000_000_000L,
+ 1_000_000_000_000L,
+ 10_000_000_000_000L,
+ 100_000_000_000_000L,
+ 1_000_000_000_000_000L,
+ 10_000_000_000_000_000L,
+ 100_000_000_000_000_000L,
+ };
+
+ /**
+ * Returns 10<sup>{@code e}</sup>, read from the precomputed
+ * table of powers of 10.
+ *
+ * @param e The exponent which must meet
+ * 0 &le; {@code e} &le; {@link #H}.
+ * The argument is not checked.
+ * @return 10<sup>{@code e}</sup>.
+ */
+ static long pow10(int e) {
+ return pow10[e];
+ }
+
+ /**
+  * Computes &lfloor;log<sub>10</sub>2<sup>{@code e}</sup>&rfloor;, that is,
+  * the unique integer <i>k</i> such that
+  * 10<sup><i>k</i></sup> &le; 2<sup>{@code e}</sup>
+  * &lt; 10<sup><i>k</i>+1</sup>.
+  * <p>
+  * The result is correct when |{@code e}| &le; 6_432_162.
+  * Otherwise the result is undefined.
+  *
+  * @param e The exponent of 2, which should meet
+  *          |{@code e}| &le; 6_432_162 for safe results.
+  * @return &lfloor;log<sub>10</sub>2<sup>{@code e}</sup>&rfloor;.
+  */
+ static int flog10pow2(int e) {
+     /* Fixed-point product: C_10 approximates log10(2) scaled by 2^Q_10 */
+     long scaled = e * C_10;
+     return (int) (scaled >> Q_10);
+ }
+
+ /**
+  * Computes &lfloor;log<sub>10</sub>(3/4 &middot; 2<sup>{@code e}</sup>)&rfloor;,
+  * that is, the unique integer <i>k</i> such that
+  * 10<sup><i>k</i></sup> &le; 3/4 &middot; 2<sup>{@code e}</sup>
+  * &lt; 10<sup><i>k</i>+1</sup>.
+  * <p>
+  * The result is correct when
+  * -3_606_689 &le; {@code e} &le; 3_150_619.
+  * Otherwise the result is undefined.
+  *
+  * @param e The exponent of 2, which should meet
+  *          -3_606_689 &le; {@code e} &le; 3_150_619 for safe results.
+  * @return &lfloor;log<sub>10</sub>(3/4 &middot;
+  *         2<sup>{@code e}</sup>)&rfloor;.
+  */
+ static int flog10threeQuartersPow2(int e) {
+     /* A_10 approximates log10(3/4) in the same 2^Q_10 fixed-point scale as C_10 */
+     long scaled = e * C_10 + A_10;
+     return (int) (scaled >> Q_10);
+ }
+
+ /**
+  * Computes &lfloor;log<sub>2</sub>10<sup>{@code e}</sup>&rfloor;, that is,
+  * the unique integer <i>k</i> such that
+  * 2<sup><i>k</i></sup> &le; 10<sup>{@code e}</sup>
+  * &lt; 2<sup><i>k</i>+1</sup>.
+  * <p>
+  * The result is correct when |{@code e}| &le; 1_838_394.
+  * Otherwise the result is undefined.
+  *
+  * @param e The exponent of 10, which should meet
+  *          |{@code e}| &le; 1_838_394 for safe results.
+  * @return &lfloor;log<sub>2</sub>10<sup>{@code e}</sup>&rfloor;.
+  */
+ static int flog2pow10(int e) {
+     /* Fixed-point product: C_2 approximates log2(10) scaled by 2^Q_2 */
+     long scaled = e * C_2;
+     return (int) (scaled >> Q_2);
+ }
+
+ /**
+  * Returns the high part <i>g</i><sub>1</sub> of the precomputed
+  * constant <i>g</i> belonging to 10<sup>-{@code k}</sup>.
+  * <p>
+  * Let 10<sup>-{@code k}</sup> = <i>&beta;</i> 2<sup><i>r</i></sup>,
+  * for the unique pair of integer <i>r</i> and real <i>&beta;</i> meeting
+  * 2<sup>125</sup> &le; <i>&beta;</i> &lt; 2<sup>126</sup>.
+  * Further, let <i>g</i> = &lfloor;<i>&beta;</i>&rfloor; + 1.
+  * Split <i>g</i> into the higher 63 bits <i>g</i><sub>1</sub> and
+  * the lower 63 bits <i>g</i><sub>0</sub>. Thus,
+  * <i>g</i><sub>1</sub> = &lfloor;<i>g</i> 2<sup>-63</sup>&rfloor;
+  * and
+  * <i>g</i><sub>0</sub> = <i>g</i> - <i>g</i><sub>1</sub> 2<sup>63</sup>.
+  * <p>
+  * This method returns <i>g</i><sub>1</sub> while
+  * {@link #g0(int)} returns <i>g</i><sub>0</sub>.
+  * <p>
+  * If needed, the exponent <i>r</i> can be computed as
+  * <i>r</i> = {@code flog2pow10(-k)} - 125 (see {@link #flog2pow10(int)}).
+  *
+  * @param k The exponent of 10, which must meet
+  *          {@link #K_MIN} &le; {@code k} &le; {@link #K_MAX}.
+  *          The argument is not checked.
+  * @return <i>g</i><sub>1</sub> as described above.
+  */
+ static long g1(int k) {
+     /* The table stores (g1, g0) pairs, starting with the pair for K_MIN */
+     int pair = k - K_MIN;
+     return g[2 * pair];
+ }
+
+ /**
+  * Returns the low part <i>g</i><sub>0</sub> of the constant <i>g</i>
+  * described in {@link #g1(int)}.
+  *
+  * @param k The exponent of 10, which must meet
+  *          {@link #K_MIN} &le; {@code k} &le; {@link #K_MAX}.
+  *          The argument is not checked.
+  * @return <i>g</i><sub>0</sub> as described in
+  *         {@link #g1(int)}.
+  */
+ static long g0(int k) {
+     /* The table stores (g1, g0) pairs; g0 is the second entry of each pair */
+     int pair = k - K_MIN;
+     return g[2 * pair + 1];
+ }
+
+ /*
+ * The precomputed values for g1(int) and g0(int).
+ * The first entry must be for an exponent of K_MIN or less.
+ * The last entry must be for an exponent of K_MAX or more.
+ */
+ private static final long[] g = {
+ 0x5F5E_1000_0000_0000L, 0x0000_0000_0000_0001L, // -8
+ 0x4C4B_4000_0000_0000L, 0x0000_0000_0000_0001L, // -7
+ 0x7A12_0000_0000_0000L, 0x0000_0000_0000_0001L, // -6
+ 0x61A8_0000_0000_0000L, 0x0000_0000_0000_0001L, // -5
+ 0x4E20_0000_0000_0000L, 0x0000_0000_0000_0001L, // -4
+ 0x7D00_0000_0000_0000L, 0x0000_0000_0000_0001L, // -3
+ 0x6400_0000_0000_0000L, 0x0000_0000_0000_0001L, // -2
+ 0x5000_0000_0000_0000L, 0x0000_0000_0000_0001L, // -1
+ 0x4000_0000_0000_0000L, 0x0000_0000_0000_0001L, // 0
+ 0x6666_6666_6666_6666L, 0x3333_3333_3333_3334L, // 1
+ };
+
+ }
+}
diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Float16Consts.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Float16Consts.java
new file mode 100644
index 00000000000..48c4d2199b1
--- /dev/null
+++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Float16Consts.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.incubator.vector;
+
+import static jdk.incubator.vector.Float16.MIN_EXPONENT;
+import static jdk.incubator.vector.Float16.PRECISION;
+import static jdk.incubator.vector.Float16.SIZE;
+
+/**
+ * This class contains additional constants documenting limits of the
+ * {@code Float16} type.
+ */
+
+class Float16Consts {
+ /**
+ * Don't let anyone instantiate this class.
+ */
+ private Float16Consts() {}
+
+ /**
+ * The number of logical bits in the significand of a
+ * {@code Float16} number, including the implicit bit.
+ */
+ public static final int SIGNIFICAND_WIDTH = PRECISION;
+
+ /**
+ * The exponent the smallest positive {@code Float16}
+ * subnormal value would have if it could be normalized.
+ */
+ public static final int MIN_SUB_EXPONENT =
+ MIN_EXPONENT - (SIGNIFICAND_WIDTH - 1); // -24
+
+ /**
+ * Bias used in representing a {@code Float16} exponent.
+ */
+ public static final int EXP_BIAS =
+ (1 << (SIZE - SIGNIFICAND_WIDTH - 1)) - 1; // 15
+
+ /**
+ * Bit mask to isolate the sign bit of a {@code Float16}.
+ */
+ public static final int SIGN_BIT_MASK = 1 << (SIZE - 1);
+
+ /**
+ * Bit mask to isolate the exponent field of a {@code Float16}.
+ */
+ public static final int EXP_BIT_MASK =
+ ((1 << (SIZE - SIGNIFICAND_WIDTH)) - 1) << (SIGNIFICAND_WIDTH - 1);
+
+ /**
+ * Bit mask to isolate the significand field of a {@code Float16}.
+ */
+ public static final int SIGNIF_BIT_MASK = (1 << (SIGNIFICAND_WIDTH - 1)) - 1;
+
+ /**
+ * Bit mask to isolate the magnitude bits (combined exponent and
+ * significand fields) of a {@code Float16}.
+ */
+ public static final int MAG_BIT_MASK = EXP_BIT_MASK | SIGNIF_BIT_MASK;
+
+ static {
+ // verify bit masks cover all bit positions and that the bit
+ // masks are non-overlapping
+ assert(((SIGN_BIT_MASK | EXP_BIT_MASK | SIGNIF_BIT_MASK) == 0xFFFF) &&
+ (((SIGN_BIT_MASK & EXP_BIT_MASK) == 0) &&
+ ((SIGN_BIT_MASK & SIGNIF_BIT_MASK) == 0) &&
+ ((EXP_BIT_MASK & SIGNIF_BIT_MASK) == 0)) &&
+ ((SIGN_BIT_MASK | MAG_BIT_MASK) == 0xFFFF));
+ }
+}
diff --git a/test/jdk/java/math/BigDecimal/DoubleFloatValueTests.java b/test/jdk/java/math/BigDecimal/DoubleFloatValueTests.java
index d02c1a5bc7b..1d5010f84f6 100644
--- a/test/jdk/java/math/BigDecimal/DoubleFloatValueTests.java
+++ b/test/jdk/java/math/BigDecimal/DoubleFloatValueTests.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -23,8 +23,9 @@
/*
* @test
- * @bug 8205592
- * @summary Verify {double, float}Value methods work
+ * @bug 8205592 8339252 8341260
+ * @summary Verify {double, float, float16}Value methods work
+ * @modules jdk.incubator.vector
* @library /test/lib
* @key randomness
* @build jdk.test.lib.RandomFactory
@@ -35,6 +36,7 @@ import jdk.test.lib.RandomFactory;
import java.math.BigDecimal;
import java.util.Random;
+import jdk.incubator.vector.Float16;
public class DoubleFloatValueTests {
private static final BigDecimal HALF = BigDecimal.valueOf(5, 1);
@@ -64,6 +66,18 @@ public class DoubleFloatValueTests {
return bv.subtract(ulp.multiply(HALF));
}
+ private static BigDecimal nextHalfUp(Float16 v) {
+ BigDecimal bv = new BigDecimal(v.doubleValue());
+ BigDecimal ulp = new BigDecimal(Float16.ulp(v).doubleValue());
+ return bv.add(ulp.multiply(HALF));
+ }
+
+ private static BigDecimal nextHalfDown(Float16 v) {
+ BigDecimal bv = new BigDecimal(v.doubleValue());
+ BigDecimal ulp = new BigDecimal(v.doubleValue() - Float16.nextDown(v).doubleValue());
+ return bv.subtract(ulp.multiply(HALF));
+ }
+
private static String toDecHexString(double v) {
return v + " (" + Double.toHexString(v) + ")";
}
@@ -72,6 +86,10 @@ public class DoubleFloatValueTests {
return v + " (" + Float.toHexString(v) + ")";
}
+ private static String toDecHexString(Float16 v) {
+ return v + " (" + Float16.toHexString(v) + ")";
+ }
+
private static void checkDouble(BigDecimal bd, double exp) {
double res = bd.doubleValue();
if (exp != res ) {
@@ -90,6 +108,15 @@ public class DoubleFloatValueTests {
}
}
+ private static void checkFloat16(BigDecimal bv, Float16 exp) {
+ Float16 res = Float16.valueOf(bv); // bv.float16Value();
+ if (exp.floatValue() != res.floatValue()) {
+ String message = "Bad conversion: got " + toDecHexString(res) +
+ ", expected " + toDecHexString(exp);
+ throw new RuntimeException(message);
+ }
+ }
+
private static boolean isOdd(int n) {
return (n & 0x1) != 0;
}
@@ -112,6 +139,15 @@ public class DoubleFloatValueTests {
}
}
+ private static void testFloat16ValueNearMinValue() {
+ for (int n = 0; n < 100; ++n) {
+ BigDecimal b = nextHalfUp(Float16.multiply(Float16.valueOf(n), Float16.MIN_VALUE));
+ checkFloat16(b, Float16.multiply(Float16.valueOf((n + 1) / 2 * 2), Float16.MIN_VALUE));
+ checkFloat16(b.subtract(EPS), Float16.multiply(Float16.valueOf(n), Float16.MIN_VALUE));
+ checkFloat16(b.add(EPS), Float16.multiply(Float16.valueOf(n + 1), Float16.MIN_VALUE));
+ }
+ }
+
private static void testDoubleValueNearMinNormal() {
double v = Double.MIN_NORMAL;
for (int n = 0; n < 100; ++n) {
@@ -150,6 +186,25 @@ public class DoubleFloatValueTests {
}
}
+ private static void testFloat16ValueNearMinNormal() {
+ Float16 v = Float16.MIN_NORMAL;
+ for (int n = 0; n < 100; ++n) {
+ BigDecimal bv = nextHalfDown(v);
+ checkFloat16(bv, isOdd(n) ? Float16.nextDown(v) : v);
+ checkFloat16(bv.subtract(EPS), Float16.nextDown(v));
+ checkFloat16(bv.add(EPS), v);
+ v = Float16.nextDown(v);
+ }
+ v = Float16.MIN_NORMAL;
+ for (int n = 0; n < 100; ++n) {
+ BigDecimal bv = nextHalfUp(v);
+ checkFloat16(bv, isOdd(n) ? Float16.nextUp(v) : v);
+ checkFloat16(bv.subtract(EPS), v);
+ checkFloat16(bv.add(EPS), Float16.nextUp(v));
+ v = Float16.nextUp(v);
+ }
+ }
+
private static void testDoubleValueNearMaxValue() {
double v = Double.MAX_VALUE;
for (int n = 0; n < 100; ++n) {
@@ -180,6 +235,21 @@ public class DoubleFloatValueTests {
checkFloat(bv.add(EPS), Float.POSITIVE_INFINITY);
}
+ private static void testFloat16ValueNearMaxValue() {
+ Float16 v = Float16.MAX_VALUE;
+ for (int n = 0; n < 100; ++n) {
+ BigDecimal bv = nextHalfDown(v);
+ checkFloat16(bv, isOdd(n) ? v : Float16.nextDown(v));
+ checkFloat16(bv.subtract(EPS), Float16.nextDown(v));
+ checkFloat16(bv.add(EPS), v);
+ v = Float16.nextDown(v);
+ }
+ BigDecimal bv = nextHalfUp(Float16.MAX_VALUE);
+ checkFloat16(bv, Float16.POSITIVE_INFINITY);
+ checkFloat16(bv.subtract(EPS), Float16.MAX_VALUE);
+ checkFloat16(bv.add(EPS), Float16.POSITIVE_INFINITY);
+ }
+
private static void testDoubleValueRandom() {
Random r = RandomFactory.getRandom();
for (int i = 0; i < 10_000; ++i) {
@@ -228,18 +298,49 @@ public class DoubleFloatValueTests {
}
}
+ private static void testFloat16ValueRandom() {
+ Random r = RandomFactory.getRandom();
+ for (int i = 0; i < 10_000; ++i) {
+ Float16 v = Float16.valueOf(r.nextFloat(-Float16.MAX_VALUE.floatValue(), Float16.MAX_VALUE.floatValue()));
+ checkFloat16(new BigDecimal(v.floatValue()), v);
+ }
+ for (int i = 0; i < 10_000; ++i) {
+ Float16 v = Float16.valueOf(r.nextFloat(-1e4f, 1e4f));
+ checkFloat16(new BigDecimal(v.floatValue()), v);
+ }
+ for (int i = 0; i < 10_000; ++i) {
+ Float16 v = Float16.valueOf(r.nextFloat(-1e3f, 1e3f));
+ checkFloat16(new BigDecimal(v.floatValue()), v);
+ }
+ for (int i = 0; i < 10_000; ++i) {
+ Float16 v = Float16.valueOf(r.nextFloat(-1e-3f, 1e-3f));
+ checkFloat16(new BigDecimal(v.floatValue()), v);
+ }
+ for (int i = 0; i < 10_000; ++i) {
+ Float16 v = Float16.valueOf(r.nextFloat(-1e-4f, 1e-4f));
+ checkFloat16(new BigDecimal(v.floatValue()), v);
+ }
+ }
+
private static void testDoubleValueExtremes() {
checkDouble(BigDecimal.valueOf(1, 1000), 0.0);
checkDouble(BigDecimal.valueOf(-1, 1000), -0.0);
checkDouble(BigDecimal.valueOf(1, -1000), Double.POSITIVE_INFINITY);
- checkDouble(BigDecimal.valueOf(-1, -1000), -Double.POSITIVE_INFINITY);
+ checkDouble(BigDecimal.valueOf(-1, -1000), Double.NEGATIVE_INFINITY);
}
private static void testFloatValueExtremes() {
checkFloat(BigDecimal.valueOf(1, 1000), 0.0f);
checkFloat(BigDecimal.valueOf(-1, 1000), -0.0f);
checkFloat(BigDecimal.valueOf(1, -1000), Float.POSITIVE_INFINITY);
- checkFloat(BigDecimal.valueOf(-1, -1000), -Float.POSITIVE_INFINITY);
+ checkFloat(BigDecimal.valueOf(-1, -1000), Float.NEGATIVE_INFINITY);
+ }
+
+ private static void testFloat16ValueExtremes() {
+ checkFloat16(BigDecimal.valueOf(1, 1000), Float16.valueOf(0.0f));
+ checkFloat16(BigDecimal.valueOf(-1, 1000), Float16.valueOf(-0.0f));
+ checkFloat16(BigDecimal.valueOf(1, -1000), Float16.POSITIVE_INFINITY);
+ checkFloat16(BigDecimal.valueOf(-1, -1000), Float16.NEGATIVE_INFINITY);
}
public static void main(String[] args) {
@@ -254,6 +355,12 @@ public class DoubleFloatValueTests {
testFloatValueNearMaxValue();
testFloatValueRandom();
testFloatValueExtremes();
+
+ testFloat16ValueNearMinValue();
+ testFloat16ValueNearMinNormal();
+ testFloat16ValueNearMaxValue();
+ testFloat16ValueRandom();
+ testFloat16ValueExtremes();
}
}
diff --git a/test/jdk/jdk/incubator/vector/BasicFloat16ArithTests.java b/test/jdk/jdk/incubator/vector/BasicFloat16ArithTests.java
new file mode 100644
index 00000000000..4ed95f698cf
--- /dev/null
+++ b/test/jdk/jdk/incubator/vector/BasicFloat16ArithTests.java
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8329817 8334432 8339076 8341260
+ * @modules jdk.incubator.vector
+ * @summary Basic tests of Float16 arithmetic and similar operations
+ */
+
+import jdk.incubator.vector.Float16;
+import static jdk.incubator.vector.Float16.*;
+import java.util.HashSet;
+import java.util.List;
+
+public class BasicFloat16ArithTests {
+ private static float InfinityF = Float.POSITIVE_INFINITY;
+ private static float NaNf = Float.NaN;
+
+ private static final float MAX_VAL_FP16 = 0x1.ffcp15f;
+
+ public static void main(String... args) {
+ checkBitWise();
+ checkHash();
+ checkConstants();
+ checkNegate();
+ checkAbs();
+ checkIsNaN();
+ checkFiniteness();
+ checkMinMax();
+ checkArith();
+ checkSqrt();
+ checkGetExponent();
+ checkUlp();
+ checkValueOfDouble();
+ checkValueOfLong();
+ checkValueOfString();
+ checkBaseConversionRoundTrip();
+ FusedMultiplyAddTests.main();
+ }
+
+ /*
+ * The software implementation of Float16 delegates to float or
+ * double operations for most of the actual computation. This
+ * regression test takes that into account as it generally only
+ * has limited testing to probe whether or not the proper
+ * functionality is being delegated to.
+ *
+ * To make the test easier to read, float literals that are exact
+ * upon conversion to Float16 are used for the test data.
+ *
+ * The float <-> Float16 conversions are well-tested from prior
+ * work and are assumed to be correct by this regression test.
+ */
+
+ /**
+  * Verify handling of NaN representations.
+  *
+  * For every NaN bit pattern (maximum exponent, nonzero significand,
+  * either sign) check that float16ToShortBits yields the same bits as
+  * Float16.NaN and that float16ToRawShortBits yields the bits
+  * the value was created from.
+  */
+ private static void checkBitWise() {
+     short nanImage = float16ToRawShortBits(Float16.NaN);
+
+     int exponent = 0x7c00;
+     int sign     = 0x8000;
+
+     // All-zeros significand with a max exponent are infinite
+     // values, not NaN values.
+     for (int i = 0x1; i <= 0x03ff; i++) {
+         short posNaNasShort = (short)(       exponent | i);
+         short negNaNasShort = (short)(sign | exponent | i);
+
+         Float16 posf16 = shortBitsToFloat16(posNaNasShort);
+         Float16 negf16 = shortBitsToFloat16(negNaNasShort);
+
+         // Mask-off high-order 16 bits to avoid sign extension woes.
+         // checkInt takes (value, expected, message), so the computed
+         // bits go first to keep "expected"/"got" right in a failure report.
+         checkInt(float16ToShortBits(posf16) & 0xffff, nanImage & 0xffff, "positive NaN");
+         checkInt(float16ToShortBits(negf16) & 0xffff, nanImage & 0xffff, "negative NaN");
+
+         checkInt(float16ToRawShortBits(posf16) & 0xffff, posNaNasShort & 0xffff, "positive NaN");
+         checkInt(float16ToRawShortBits(negf16) & 0xffff, negNaNasShort & 0xffff, "negative NaN");
+     }
+ }
+
+ /**
+  * Verify correct number of hashValue's from Float16's.
+  *
+  * Walks all 2^16 bit patterns, requiring a previously unseen hash code
+  * for each non-NaN value and checking the total number of distinct
+  * hash codes at the end.
+  */
+ private static void checkHash() {
+     // Slightly over-allocate the HashSet.
+     HashSet<Integer> set = HashSet.newHashSet(Short.MAX_VALUE - Short.MIN_VALUE + 1);
+
+     // Each non-NaN value should have a distinct hashCode. All NaN
+     // values should share a single hashCode. Check the latter
+     // property by verifying the overall count of entries in the
+     // set.
+     for (int i = Short.MIN_VALUE; i <= Short.MAX_VALUE; i++) {
+         Float16 f16 = Float16.shortBitsToFloat16((short)i);
+         boolean addedToSet = set.add(f16.hashCode());
+
+         if (!Float16.isNaN(f16) && !addedToSet) {
+             throwRE("Existing hash value for " + f16);
+         }
+     }
+
+     // There are 2^16 = 65,536 total short values. Each of these
+     // bit patterns is a valid representation of a Float16
+     // value. However, NaNs have multiple possible encodings.
+     // With an exponent = 0x7c00, each nonzero significand 0x1 to
+     // 0x3ff is a NaN, for both positive and negative sign bits.
+     //
+     // Therefore, the total number of distinct hash codes for
+     // Float16 values should be:
+     // 65_536 - 2*(1_023) + 1 = 63_491
+
+     int setSize = set.size();
+     if (setSize != 63_491) {
+         throwRE("Unexpected number of distinct hash values " + setSize);
+     }
+ }
+
+ private static void checkConstants() {
+ checkInt(BYTES, 2, "Float16.BYTES");
+ checkInt(MAX_EXPONENT, 15, "Float16.MAX_EXPONENT");
+ checkInt(MIN_EXPONENT, -14, "Float16.MIN_EXPONENT");
+ checkInt(PRECISION, 11, "Float16.PRECISION");
+ checkInt(SIZE, 16, "Float16.SIZE");
+
+ checkFloat16(MIN_VALUE, 0x1.0p-24f, "Float16.MIN_VALUE");
+ checkFloat16(MIN_NORMAL, 0x1.0p-14f, "Float16.MIN_NORMAL");
+ checkFloat16(MAX_VALUE, 65504.0f, "Float16.MAX_VALUE");
+
+ checkFloat16(POSITIVE_INFINITY, InfinityF, "+infinity");
+ checkFloat16(NEGATIVE_INFINITY, -InfinityF, "-infinity");
+ checkFloat16(NaN, NaNf, "NaN");
+ }
+
+ private static void checkInt(int value, int expected, String message) {
+ if (value != expected) {
+ throwRE(String.format("Didn't get expected value for %s;%nexpected %d, got %d",
+ message, expected, value));
+ }
+ }
+
+ private static void checkFloat16(Float16 value16, float expected, String message) {
+ float value = value16.floatValue();
+ if (Float.compare(value, expected) != 0) {
+ throwRE(String.format("Didn't get expected value for %s;%nexpected %g (%a), got %g (%a)",
+ message, expected, expected, value, value));
+ }
+ }
+
+ private static void checkNegate() {
+ float[][] testCases = {
+ {-0.0f, 0.0f},
+ { 0.0f, -0.0f},
+
+ {-1.0f, 1.0f},
+ { 1.0f, -1.0f},
+
+ { InfinityF, -InfinityF},
+ {-InfinityF, InfinityF},
+
+ {NaNf, NaNf},
+ };
+
+ for(var testCase : testCases) {
+ float arg = testCase[0];
+ float expected = testCase[1];
+ Float16 result = negate(valueOf(arg));
+
+ if (Float.compare(expected, result.floatValue()) != 0) {
+ checkFloat16(result, expected, "negate(" + arg + ")");
+ }
+ }
+
+ return;
+ }
+
+ private static void checkAbs() {
+ float[][] testCases = {
+ {-0.0f, 0.0f},
+ { 0.0f, 0.0f},
+
+ {-1.0f, 1.0f},
+ { 1.0f, 1.0f},
+
+ { InfinityF, InfinityF},
+ {-InfinityF, InfinityF},
+
+ {NaNf, NaNf},
+ };
+
+ for(var testCase : testCases) {
+ float arg = testCase[0];
+ float expected = testCase[1];
+ Float16 result = abs(valueOf(arg));
+
+ if (Float.compare(expected, result.floatValue()) != 0) {
+ checkFloat16(result, expected, "abs(" + arg + ")");
+ }
+ }
+
+ return;
+ }
+
+ private static void checkIsNaN() {
+ if (!isNaN(NaN)) {
+ throwRE("Float16.isNaN() returns false for a NaN");
+ }
+
+ float[] testCases = {
+ -InfinityF,
+ InfinityF,
+ -0.0f,
+ +0.0f,
+ 1.0f,
+ -1.0f,
+ };
+
+ for(var testCase : testCases) {
+ boolean result = isNaN(valueOf(testCase));
+ if (result) {
+ throwRE("isNaN returned true for " + testCase);
+ }
+ }
+
+ return;
+ }
+
+ /**
+  * Verify isFinite and isInfinite on infinities, NaN, and a few
+  * finite values.
+  */
+ private static void checkFiniteness() {
+     float[] infinities = {
+         -InfinityF,
+          InfinityF,
+     };
+
+     for (var infinity : infinities) {
+         boolean result1 = isFinite(valueOf(infinity));
+         boolean result2 = isInfinite(valueOf(infinity));
+
+         if (result1) {
+             throwRE("Float16.isFinite returned true for " + infinity);
+         }
+
+         if (!result2) {
+             throwRE("Float16.isInfinite returned false for " + infinity);
+         }
+     }
+
+     // NaN is neither finite nor infinite
+     if (isFinite(NaN)) {
+         throwRE("Float16.isFinite() returns true for a NaN");
+     }
+
+     if (isInfinite(NaN)) {
+         throwRE("Float16.isInfinite() returns true for a NaN");
+     }
+
+     float[] finities = {
+         -0.0f,
+         +0.0f,
+          1.0f,
+         -1.0f,
+     };
+
+     for (var finity : finities) {
+         boolean result1 = isFinite(valueOf(finity));
+         boolean result2 = isInfinite(valueOf(finity));
+
+         // The failure here is isFinite answering false for a finite value
+         if (!result1) {
+             throwRE("Float16.isFinite returned false for " + finity);
+         }
+
+         if (result2) {
+             throwRE("Float16.isInfinite returned true for " + finity);
+         }
+     }
+ }
+
+ /**
+  * Cursory check that min and max return the smaller and the larger
+  * of two distinct finite arguments, respectively.
+  */
+ private static void checkMinMax() {
+     float small = 1.0f;
+     float large = 2.0f;
+
+     if (min(valueOf(small), valueOf(large)).floatValue() != small) {
+         throwRE(String.format("min(%g, %g) not equal to %g",
+                               small, large, small));
+     }
+
+     if (max(valueOf(small), valueOf(large)).floatValue() != large) {
+         throwRE(String.format("max(%g, %g) not equal to %g",
+                               small, large, large));
+     }
+ }
+
+ /*
+ * Cursory checks to make sure correct operation is being called
+ * with arguments in proper order.
+ */
+ private static void checkArith() {
+ float a = 1.0f;
+ Float16 a16 = valueOf(a);
+
+ float b = 2.0f;
+ Float16 b16 = valueOf(b);
+
+ if (add(a16, b16).floatValue() != (a + b)) {
+ throwRE("failure with " + a16 + " + " + b16);
+ }
+ if (add(b16, a16).floatValue() != (b + a)) {
+ throwRE("failure with " + b16 + " + " + a16);
+ }
+
+ if (subtract(a16, b16).floatValue() != (a - b)) {
+ throwRE("failure with " + a16 + " - " + b16);
+ }
+ if (subtract(b16, a16).floatValue() != (b - a)) {
+ throwRE("failure with " + b16 + " - " + a16);
+ }
+
+ if (multiply(a16, b16).floatValue() != (a * b)) {
+ throwRE("failure with " + a16 + " * " + b16);
+ }
+ if (multiply(b16, a16).floatValue() != (b * a)) {
+ throwRE("failure with " + b16 + " * " + a16);
+ }
+
+ if (divide(a16, b16).floatValue() != (a / b)) {
+ throwRE("failure with " + a16 + " / " + b16);
+ }
+ if (divide(b16, a16).floatValue() != (b / a)) {
+ throwRE("failure with " + b16 + " / " + a16);
+ }
+ return;
+ }
+
+ private static void checkSqrt() {
+ float[][] testCases = {
+ {-0.0f, -0.0f},
+ { 0.0f, 0.0f},
+
+ {1.0f, 1.0f},
+ {4.0f, 2.0f},
+ {9.0f, 3.0f},
+
+ { InfinityF, InfinityF},
+ {-InfinityF, NaNf},
+
+ {NaNf, NaNf},
+ };
+
+ for(var testCase : testCases) {
+ float arg = testCase[0];
+ float expected = testCase[1];
+ Float16 result = sqrt(valueOf(arg));
+
+ if (Float.compare(expected, result.floatValue()) != 0) {
+ checkFloat16(result, expected, "sqrt(" + arg + ")");
+ }
+ }
+
+ return;
+ }
+
+ private static void checkGetExponent() {
+ float[][] testCases = {
+ // Non-finite values
+ { InfinityF, MAX_EXPONENT + 1},
+ {-InfinityF, MAX_EXPONENT + 1},
+ { NaNf, MAX_EXPONENT + 1},
+
+ // Subnormal and almost subnormal values
+ {-0.0f, MIN_EXPONENT - 1},
+ {+0.0f, MIN_EXPONENT - 1},
+ { 0x1.0p-24f, MIN_EXPONENT - 1}, // Float16.MIN_VALUE
+ {-0x1.0p-24f, MIN_EXPONENT - 1}, // Float16.MIN_VALUE
+ { 0x1.0p-14f, MIN_EXPONENT}, // Float16.MIN_NORMAL
+ {-0x1.0p-14f, MIN_EXPONENT}, // Float16.MIN_NORMAL
+
+ // Normal values
+ { 1.0f, 0},
+ { 2.0f, 1},
+ { 4.0f, 2},
+
+ {MAX_VAL_FP16*0.5f, MAX_EXPONENT - 1},
+ {MAX_VAL_FP16, MAX_EXPONENT},
+ };
+
+ for(var testCase : testCases) {
+ float arg = testCase[0];
+ float expected = testCase[1];
+ // Exponents are in-range for Float16
+ Float16 result = valueOf(getExponent(valueOf(arg)));
+
+ if (Float.compare(expected, result.floatValue()) != 0) {
+ checkFloat16(result, expected, "getExponent(" + arg + ")");
+ }
+ }
+ return;
+ }
+
+ private static void checkUlp() {
+ float[][] testCases = {
+ { InfinityF, InfinityF},
+ {-InfinityF, InfinityF},
+ { NaNf, NaNf},
+
+ // Zeros, subnormals, and MIN_VALUE all have MIN_VALUE as an ulp.
+ {-0.0f, 0x1.0p-24f},
+ {+0.0f, 0x1.0p-24f},
+ { 0x1.0p-24f, 0x1.0p-24f},
+ {-0x1.0p-24f, 0x1.0p-24f},
+ { 0x1.0p-14f, 0x1.0p-24f},
+ {-0x1.0p-14f, 0x1.0p-24f},
+
+ // ulp is 10 bits away
+ {0x1.0p0f, 0x0.004p0f}, // 1.0f
+ {0x1.0p1f, 0x0.004p1f}, // 2.0f
+ {0x1.0p2f, 0x0.004p2f}, // 4.0f
+
+ {MAX_VAL_FP16*0.5f, 0x0.004p14f},
+ {MAX_VAL_FP16, 0x0.004p15f},
+ };
+
+ for(var testCase : testCases) {
+ float arg = testCase[0];
+ float expected = testCase[1];
+ // Exponents are in-range for Float16
+ Float16 result = ulp(valueOf(arg));
+
+ if (Float.compare(expected, result.floatValue()) != 0) {
+ checkFloat16(result, expected, "ulp(" + arg + ")");
+ }
+ }
+ return;
+ }
+
+ private static void throwRE(String message) {
+ throw new RuntimeException(message);
+ }
+
+ private static void checkValueOfDouble() {
+ /*
+ * Check that double -> Float16 conversion rounds properly
+ * around the midway point for each finite Float16 value by
+ * looping over the positive values and checking the negations
+ * along the way.
+ */
+
+ String roundUpMsg = "Didn't get half-way case rounding down";
+ String roundDownMsg = "Didn't get half-way case rounding up";
+
+ for(int i = 0; i <= Short.MAX_VALUE; i++ ) {
+ boolean isEven = ((i & 0x1) == 0);
+ Float16 f16 = Float16.shortBitsToFloat16((short)i);
+ Float16 f16Neg = negate(f16);
+
+ if (!isFinite(f16))
+ continue;
+
+ // System.out.println("\t" + toHexString(f16));
+
+ Float16 ulp = ulp(f16);
+ double halfWay = f16.doubleValue() + ulp.doubleValue() * 0.5;
+
+ // Under the round to nearest even rounding policy, the
+ // half-way case should round down to the starting value
+ // if the starting value is even; otherwise, it should round up.
+ float roundedBack = valueOf(halfWay).floatValue();
+ float roundedBackNeg = valueOf(-halfWay).floatValue();
+
+ if (isEven) {
+ checkFloat16(f16, roundedBack, roundDownMsg);
+ checkFloat16(f16Neg, roundedBackNeg, roundDownMsg);
+ } else {
+ checkFloat16(add(f16, ulp), roundedBack, roundUpMsg);
+ checkFloat16(subtract(f16Neg, ulp), roundedBackNeg, roundUpMsg);
+ }
+
+ // Should always round down
+ double halfWayNextDown = Math.nextDown(halfWay);
+ checkFloat16(f16, valueOf(halfWayNextDown).floatValue(), roundDownMsg);
+ checkFloat16(f16Neg, valueOf(-halfWayNextDown).floatValue(), roundDownMsg);
+
+ // Should always round up
+ double halfWayNextUp = Math.nextUp(halfWay);
+ checkFloat16(add(f16, ulp), valueOf( halfWayNextUp).floatValue(), roundUpMsg);
+ checkFloat16(subtract(f16Neg, ulp), valueOf(-halfWayNextUp).floatValue(), roundUpMsg);
+ }
+ }
+
+ private static void checkValueOfLong() {
+ checkFloat16(valueOf(-65_521), Float.NEGATIVE_INFINITY, "-infinity");
+ checkFloat16(valueOf(-65_520), Float.NEGATIVE_INFINITY, "-infinity");
+ checkFloat16(valueOf(-65_519), -MAX_VALUE.floatValue(), "-MAX_VALUE");
+ checkFloat16(valueOf(65_519), MAX_VALUE.floatValue(), "MAX_VALUE");
+ checkFloat16(valueOf(65_520), Float.POSITIVE_INFINITY, "+infinity");
+ checkFloat16(valueOf(65_521), Float.POSITIVE_INFINITY, "+infinity");
+ }
+
+ /**
+  * Checks {@code Float16.valueOf(String)}.
+  *
+  * Positive cases cover NaN and infinities, signed zeros, decimal
+  * integers and fractions, type suffixes, inputs around the
+  * overflow threshold 65520 and around the half-way point 65488,
+  * and hexadecimal floating-point strings. Negative cases are
+  * hexadecimal integers, which must be rejected with a
+  * {@code NumberFormatException}.
+  */
+ private static void checkValueOfString() {
+     String2Float16Case[] testCases = {
+         new String2Float16Case( "NaN", NaNf),
+         new String2Float16Case("+NaN", NaNf),
+         new String2Float16Case("-NaN", NaNf),
+
+         new String2Float16Case("+Infinity", +InfinityF),
+         new String2Float16Case("-Infinity", -InfinityF),
+
+         new String2Float16Case( "0.0", 0.0f),
+         new String2Float16Case("+0.0", 0.0f),
+         new String2Float16Case("-0.0", -0.0f),
+
+         // Decimal signed integers are accepted as input; hex
+         // signed integers are not, see negative test cases below.
+         new String2Float16Case( "1", 1.0f),
+         new String2Float16Case("-1", -1.0f),
+
+         new String2Float16Case( "12", 12.0f),
+         new String2Float16Case("-12", -12.0f),
+
+         new String2Float16Case( "123", 123.0f),
+         new String2Float16Case("-123", -123.0f),
+
+         new String2Float16Case( "1.0", 1.0f),
+         new String2Float16Case("-1.0", -1.0f),
+
+         // Check for FloatTypeSuffix handling
+         new String2Float16Case( "1.5f", 1.5f),
+         new String2Float16Case( "1.5F", 1.5f),
+         new String2Float16Case( "1.5D", 1.5f),
+         new String2Float16Case( "1.5d", 1.5f),
+
+         new String2Float16Case("65504.0", 65504.0f), // Float16.MAX_VALUE
+
+         new String2Float16Case("65520.0", InfinityF), // Float16.MAX_VALUE + 0.5*ulp
+
+         new String2Float16Case("65520.01", InfinityF), // Float16.MAX_VALUE + > 0.5*ulp
+         new String2Float16Case("65520.001", InfinityF), // Float16.MAX_VALUE + > 0.5*ulp
+         new String2Float16Case("65520.0001", InfinityF), // Float16.MAX_VALUE + > 0.5*ulp
+         new String2Float16Case("65520.00000000001", InfinityF), // Float16.MAX_VALUE + > 0.5*ulp
+
+         new String2Float16Case("65519.99999999999", 65504.0f), // Float16.MAX_VALUE + < 0.5*ulp
+         new String2Float16Case("0x1.ffdffffffffffp15", 65504.0f),
+         new String2Float16Case("0x1.ffdfffffffffp15", 65504.0f),
+
+         new String2Float16Case("65519.999999999999", 65504.0f),
+         new String2Float16Case("65519.9999999999999", 65504.0f),
+         new String2Float16Case("65519.99999999999999", 65504.0f),
+         new String2Float16Case("65519.999999999999999", 65504.0f),
+
+         // Float16.MAX_VALUE + < 0.5*ulp
+         new String2Float16Case("65519.9999999999999999999999999999999999999", 65504.0f),
+
+         // Near MAX_VALUE - 0.5 ulp
+         new String2Float16Case("65488.0", 65472.0f),
+         new String2Float16Case("65487.9999", 65472.0f),
+         new String2Float16Case("65487.99999999", 65472.0f),
+         new String2Float16Case("65487.9999999999999999", 65472.0f),
+
+         new String2Float16Case("65488.000001", MAX_VAL_FP16),
+
+         new String2Float16Case("65536.0", InfinityF), // Float16.MAX_VALUE + ulp
+
+         // Hex values
+         new String2Float16Case("0x1p2", 0x1.0p2f),
+         new String2Float16Case("0x1p2f", 0x1.0p2f),
+         new String2Float16Case("0x1p2d", 0x1.0p2f),
+         new String2Float16Case("0x1.0p1", 0x1.0p1f),
+
+         new String2Float16Case("-0x1p2", -0x1.0p2f),
+         new String2Float16Case("0x3.45p12", 0x3.45p12f),
+
+         new String2Float16Case("0x3.4500000001p12", 0x3.45p12f),
+
+         // Near half-way double + float cases in hex
+         new String2Float16Case("0x1.ffdfffffffffffffffffffffffffffffffffffp15", 65504.0f),
+
+     };
+
+     for (String2Float16Case testCase : testCases) {
+         String input = testCase.input();
+         float expected = testCase.expected();
+         Float16 result = Float16.valueOf(input);
+         checkFloat16(result, expected, "Float16.valueOf(String) " + input);
+     }
+
+     // Parameterized type: a raw List would make the String-typed
+     // loop variable below a compile-time error.
+     List<String> negativeCases = List.of("0x1",
+                                          "-0x1",
+                                          "0x12",
+                                          "-0x12");
+
+     for (String negativeCase : negativeCases) {
+         try {
+             Float16.valueOf(negativeCase);
+             // Reached only if no exception was thrown. The plain
+             // RuntimeException raised here is not caught below.
+             throwRE("Did not get expected exception for input " + negativeCase);
+         } catch (NumberFormatException expected) {
+             // Expected: hex signed integers are not accepted.
+         }
+     }
+ }
+
+ /**
+  * One string-conversion test case.
+  *
+  * @param input    the string handed to the conversion under test
+  * @param expected the expected conversion result, as a float
+  */
+ private record String2Float16Case(String input, float expected) {}
+
+ private static void checkBaseConversionRoundTrip() {
+ checkFloat16(Float16.NaN,
+ Float16.valueOf("NaN").floatValue(),
+ "base conversion of NaN");
+
+ // For each non-NaN value, make sure
+ // value -> string -> value
+ // sequence of conversions gives the expected result.
+
+ for(int i = Short.MIN_VALUE; i <= Short.MAX_VALUE; i++) {
+ Float16 f16 = Float16.shortBitsToFloat16((short)i);
+ if (Float16.isNaN(f16))
+ continue;
+
+ checkFloat16(f16,
+ Float16.valueOf(Float16.toString(f16)).floatValue(),
+ "base conversion");
+ }
+ return;
+ }
+
+ private static class FusedMultiplyAddTests {
+ /**
+  * Runs every group of fused multiply-add checks in a fixed order.
+  *
+  * @param args ignored
+  */
+ public static void main(String... args) {
+     // Special-value combinations first, then ordinary arithmetic,
+     // then the rounding corner cases.
+     testZeroNanInfCombos();
+     testNonFinite();
+     testZeroes();
+     testSimple();
+     testRounding();
+ }
+
+ private static void testZeroNanInfCombos() {
+ float [] testInputs = {
+ Float.NaN,
+ -InfinityF,
+ +InfinityF,
+ -0.0f,
+ +0.0f,
+ };
+
+ for (float i : testInputs) {
+ for (float j : testInputs) {
+ for (float k : testInputs) {
+ testFusedMacCase(i, j, k, Math.fma(i, j, k));
+ }
+ }
+ }
+ }
+
+ private static void testNonFinite() {
+ float [][] testCases = {
+ {1.0f, InfinityF, 2.0f,
+ InfinityF},
+
+ {1.0f, 2.0f, InfinityF,
+ InfinityF},
+
+ {InfinityF, 1.0f, InfinityF,
+ InfinityF},
+
+ {0x1.ffcp14f, 2.0f, -InfinityF,
+ -InfinityF},
+
+ {InfinityF, 1.0f, -InfinityF,
+ NaNf},
+
+ {-InfinityF, 1.0f, InfinityF,
+ NaNf},
+
+ {1.0f, NaNf, 2.0f,
+ NaNf},
+
+ {1.0f, 2.0f, NaNf,
+ NaNf},
+
+ {InfinityF, 2.0f, NaNf,
+ NaNf},
+
+ {NaNf, 2.0f, InfinityF,
+ NaNf},
+ };
+
+ for (float[] testCase: testCases) {
+ testFusedMacCase(testCase[0], testCase[1], testCase[2], testCase[3]);
+ }
+ }
+
+ /**
+  * Checks the sign of a zero result of fused multiply-add.
+  *
+  * Under round to nearest, an exact zero sum is {@code -0.0} only
+  * when both the product and the addend are negative zeros;
+  * otherwise it is {@code +0.0}.
+  */
+ private static void testZeroes() {
+     // Each row is {a, b, c, expected value of a*b + c}.
+     float [][] testCases = {
+         {+0.0f, +0.0f, +0.0f,
+          +0.0f},
+
+         {-0.0f, +0.0f, +0.0f,
+          +0.0f},
+
+         {+0.0f, +0.0f, -0.0f,
+          +0.0f},
+
+         // Takes the place of a second, identical copy of the row
+         // above: the product (-0 * -0) is +0, so the sum is +0.
+         {-0.0f, -0.0f, +0.0f,
+          +0.0f},
+
+         {-0.0f, +0.0f, -0.0f,
+          -0.0f},
+
+         {-0.0f, -0.0f, -0.0f,
+          +0.0f},
+
+         {-1.0f, +0.0f, -0.0f,
+          -0.0f},
+
+         {-1.0f, +0.0f, +0.0f,
+          +0.0f},
+
+         {-2.0f, +0.0f, -0.0f,
+          -0.0f},
+     };
+
+     for (float[] testCase : testCases) {
+         testFusedMacCase(testCase[0], testCase[1], testCase[2], testCase[3]);
+     }
+ }
+
+ private static void testSimple() {
+ final float ulpOneFp16 = ulp(valueOf(1.0f)).floatValue();
+
+ float [][] testCases = {
+ {1.0f, 2.0f, 3.0f,
+ 5.0f},
+
+ {1.0f, 2.0f, -2.0f,
+ 0.0f},
+
+ {5.0f, 5.0f, -25.0f,
+ 0.0f},
+
+ {0.5f*MAX_VAL_FP16, 2.0f, -0.5f*MAX_VAL_FP16,
+ 0.5f*MAX_VAL_FP16},
+
+ {MAX_VAL_FP16, 2.0f, -MAX_VAL_FP16,
+ MAX_VAL_FP16},
+
+ {MAX_VAL_FP16, 2.0f, 1.0f,
+ InfinityF},
+
+ {(1.0f + ulpOneFp16),
+ (1.0f + ulpOneFp16),
+ -1.0f - 2.0f*ulpOneFp16,
+ ulpOneFp16 * ulpOneFp16},
+
+ };
+
+ for (float[] testCase: testCases) {
+ testFusedMacCase(testCase[0], testCase[1], testCase[2], testCase[3]);
+ }
+ }
+
+ /**
+  * Checks that fused multiply-add rounds only once, using inputs
+  * whose exact result sits at the overflow threshold, far below
+  * the addend, or at half of the smallest subnormal.
+  */
+ private static void testRounding() {
+     // Each row is {a, b, c, expected value of a*b + c}.
+     float [][] testCases = {
+         // The product is equal to
+         // (MAX_VALUE + 1/2 * ulp(MAX_VALUE)) + MAX_VALUE = (0x1.ffcp15 + 0x0.002p15) + 0x1.ffcp15
+         // so after adding -MAX_VALUE the exact result is the
+         // overflow threshold 0x1.ffep15, which overflows.
+         {0x1.3p1f, 0x1.afp15f, -MAX_VAL_FP16,
+          InfinityF},
+
+         // Product exactly equals 0x1.ffep15, the overflow
+         // threshold; subtracting a non-zero finite value will
+         // result in MAX_VALUE, adding zero or a positive
+         // value will overflow.
+         {0x1.2p10f, 0x1.c7p5f, -0x1.0p-14f,
+          MAX_VAL_FP16},
+
+         {0x1.2p10f, 0x1.c7p5f, -0.0f,
+          InfinityF},
+
+         {0x1.2p10f, 0x1.c7p5f, +0.0f,
+          InfinityF},
+
+         {0x1.2p10f, 0x1.c7p5f, +0x1.0p-14f,
+          InfinityF},
+
+         {0x1.2p10f, 0x1.c7p5f, InfinityF,
+          InfinityF},
+
+         // PRECISION bits in the subnormal intermediate product
+         {0x1.ffcp-14f, 0x1.0p-24f, 0x1.0p13f, // Can be held exactly
+          0x1.0p13f},
+
+         {0x1.ffcp-14f, 0x1.0p-24f, 0x1.0p14f, // *Cannot* be held exactly
+          0x1.0p14f},
+
+         // Check values where the exact result cannot be
+         // exactly stored in a double.
+         {0x1.0p-24f, 0x1.0p-24f, 0x1.0p10f,
+          0x1.0p10f},
+
+         {0x1.0p-24f, 0x1.0p-24f, 0x1.0p14f,
+          0x1.0p14f},
+
+         // Exact result is +0x1.0p-25, half of the smallest
+         // subnormal; the tie rounds to even, i.e. to +0.0.
+         {0x1.0p-24f, -0.5f, 0x1.0p-24f,
+          0.0f},
+
+         // Exact result is -0x1.0p-25; the tie rounds to -0.0.
+         {0x1.0p-24f, -0.5f, 0.0f,
+          -0.0f},
+     };
+
+     for (float[] testCase : testCases) {
+         testFusedMacCase(testCase[0], testCase[1], testCase[2], testCase[3]);
+     }
+ }
+
+ private static void testFusedMacCase(float input1, float input2, float input3, float expected) {
+ Float16 a = valueOf(input1);
+ Float16 b = valueOf(input2);
+ Float16 c = valueOf(input3);
+ Float16 d = valueOf(expected);
+
+ test("Float16.fma(float)", a, b, c, Float16.fma(a, b, c), d);
+
+ // Permute first two inputs
+ test("Float16.fma(float)", b, a, c, Float16.fma(b, a, c), d);
+ return;
+ }
+ }
+
+ /**
+  * Compares a three-input result against its expected value with
+  * {@code Float16.compare}. On a mismatch, prints a description of
+  * the inputs and both values to {@code System.err} and throws.
+  *
+  * @param testName name of the operation under test, used in the report
+  * @param input1   first input of the operation
+  * @param input2   second input of the operation
+  * @param input3   third input of the operation
+  * @param result   value actually produced
+  * @param expected value that should have been produced
+  * @throws RuntimeException if {@code result} and {@code expected}
+  *         do not compare as equal; the exception carries the same
+  *         text that is printed
+  */
+ private static void test(String testName,
+                          Float16 input1, Float16 input2, Float16 input3,
+                          Float16 result, Float16 expected) {
+     if (Float16.compare(expected, result) == 0) {
+         return;
+     }
+
+     // Each value is shown in decimal and in hex form.
+     String message = "Failure for " + testName + ":\n" +
+         "\tFor inputs " + input1 + "\t(" + toHexString(input1) + ") and "
+         + input2 + "\t(" + toHexString(input2) + ") and "
+         + input3 + "\t(" + toHexString(input3) + ")\n" +
+         "\texpected " + expected + "\t(" + toHexString(expected) + ")\n" +
+         "\tgot " + result + "\t(" + toHexString(result) + ").";
+     System.err.println(message);
+     throw new RuntimeException(message);
+ }
+}