From 5b7bb40d1f5a8e1261cc252db2a09b5e4f07e5f0 Mon Sep 17 00:00:00 2001 From: David Holmes Date: Tue, 30 Jul 2024 22:35:28 +0000 Subject: [PATCH] 8325002: Exceptions::fthrow needs to ensure it truncates to a valid utf8 string Reviewed-by: djelinski, stuefe --- src/hotspot/share/utilities/exceptions.cpp | 27 +++++- src/hotspot/share/utilities/utf8.cpp | 63 +++++++++++++ src/hotspot/share/utilities/utf8.hpp | 3 +- test/hotspot/gtest/utilities/test_utf8.cpp | 104 ++++++++++++++++++++- 4 files changed, 194 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/utilities/exceptions.cpp b/src/hotspot/share/utilities/exceptions.cpp index c8f458dfa31..034444839ab 100644 --- a/src/hotspot/share/utilities/exceptions.cpp +++ b/src/hotspot/share/utilities/exceptions.cpp @@ -43,6 +43,7 @@ #include "runtime/atomic.hpp" #include "utilities/events.hpp" #include "utilities/exceptions.hpp" +#include "utilities/utf8.hpp" // Limit exception message components to 64K (the same max as Symbols) #define MAX_LEN 65535 @@ -262,8 +263,32 @@ void Exceptions::fthrow(JavaThread* thread, const char* file, int line, Symbol* va_list ap; va_start(ap, format); char msg[max_msg_size]; - os::vsnprintf(msg, max_msg_size, format, ap); + int ret = os::vsnprintf(msg, max_msg_size, format, ap); va_end(ap); + + // If ret == -1 then either there was a format conversion error, or the required buffer size + // exceeds INT_MAX and so couldn't be returned (undocumented behaviour of vsnprintf). Depending + // on the platform the buffer may be filled to its capacity (Linux), filled to the conversion + // that encountered the overflow (macOS), or is empty (Windows), so it is possible we + // have a truncated UTF-8 sequence. Similarly, if the buffer was too small and ret >= max_msg_size + // we may also have a truncated UTF-8 sequence. In such cases we need to fix the buffer so the UTF-8 + // sequence is valid. 
+ if (ret == -1 || ret >= max_msg_size) { + int len = (int) strlen(msg); + if (len > 0) { + // Truncation will only happen if the buffer was filled by vsnprintf, + // otherwise vsnprintf already terminated filling it at a well-defined point. + // But as this is not a clearly specified area we will perform our own UTF8 + // truncation anyway - though for those well-defined termination points it + // will be a no-op. + UTF8::truncate_to_legal_utf8((unsigned char*)msg, len + 1); + } + } + // UTF8::is_legal_utf8 should actually be called is_legal_utf8_class_name as the final + // parameter controls a check for a specific character appearing in the "name", which is only + // allowed for classfile versions <= 47. We pass `true` so that we allow such strings as this code + // knows nothing about the actual string content. + assert(UTF8::is_legal_utf8((const unsigned char*)msg, (int)strlen(msg), true), "must be"); _throw_msg(thread, file, line, h_name, msg); } diff --git a/src/hotspot/share/utilities/utf8.cpp b/src/hotspot/share/utilities/utf8.cpp index 6fd877120df..47cbb04da4b 100644 --- a/src/hotspot/share/utilities/utf8.cpp +++ b/src/hotspot/share/utilities/utf8.cpp @@ -392,6 +392,69 @@ bool UTF8::is_legal_utf8(const unsigned char* buffer, int length, return true; } +// Return true if `b` could be the starting byte of an encoded 2, 3 or 6 +// byte sequence, i.e. it lies in the range 0xC0..0xEF. +static bool is_starting_byte(unsigned char b) { + return b >= 0xC0 && b <= 0xEF; +} + +// Takes an incoming buffer that was valid UTF-8, but which has been truncated such that +// the last encoding may be partial, and modifies that buffer in place with a NUL-terminator +// inserted such that any partial encoding has gone. +// Note: if the incoming buffer is already valid then we may still drop the last encoding. +// To avoid that the caller can choose to check for validity first. +// The incoming buffer is still expected to be NUL-terminated. 
+// The incoming buffer is expected to be a realistic size - we assert that `length`, which counts the NUL terminator, is greater than 5. +void UTF8::truncate_to_legal_utf8(unsigned char* buffer, int length) { + assert(length > 5, "invalid length"); + assert(buffer[length - 1] == '\0', "Buffer should be NUL-terminated"); + + if (buffer[length - 2] < 128) { // valid "ascii" - common case + return; + } + + // Modified UTF-8 encodes characters in sequences of 1, 2, 3 or 6 bytes. + // The last byte is invalid if it is: + // - the 1st byte of a 2, 3 or 6 byte sequence + // 0b110xxxxx + // 0b1110xxxx + // 0b11101101 + // - the 2nd byte of a 3 or 6 byte sequence + // 0b10xxxxxx + // 0b1010xxxx + // - the 3rd, 4th or 5th byte of a 6 byte sequence + // 0b10xxxxxx + // 0b11101101 + // 0b1011xxxx + // + // Rather than checking all possible situations we simplify things noting that as we have already + // got a truncated string, then dropping one more character is not significant. So we work from the + // end of the buffer looking for the first byte that can be the starting byte of a UTF-8 encoded sequence, + // then we insert NUL at that location to terminate the buffer. There is an added complexity with 6 byte + // encodings as the first and fourth bytes are the same and overlap with the 3 byte encoding. + + for (int index = length - 2; index > 0; index--) { + if (is_starting_byte(buffer[index])) { + if (buffer[index] == 0xED) { + // Could be first byte of 3 or 6, or fourth byte of 6. + // If fourth the previous three bytes will encode a high + // surrogate value in the range EDA080 to EDAFBF. We only + // need to check for EDA to establish this as the "missing" + // values in EDAxxx would not be valid 3 byte encodings. 
+ if ((index - 3) >= 0 && + (buffer[index - 3] == 0xED) && + ((buffer[index - 2] & 0xF0) == 0xA0)) { + assert(buffer[index - 1] >= 0x80 && buffer[index - 1] <= 0xBF, "sanity check"); + // It was the fourth byte so truncate 3 bytes earlier + index -= 3; + } + } + buffer[index] = '\0'; + break; + } + } +} + //------------------------------------------------------------------------------------- bool UNICODE::is_latin1(jchar c) { diff --git a/src/hotspot/share/utilities/utf8.hpp b/src/hotspot/share/utilities/utf8.hpp index 80346f7da7d..9a18dd0ff93 100644 --- a/src/hotspot/share/utilities/utf8.hpp +++ b/src/hotspot/share/utilities/utf8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -88,6 +88,7 @@ class UTF8 : AllStatic { static bool is_legal_utf8(const unsigned char* buffer, int length, bool version_leq_47); + static void truncate_to_legal_utf8(unsigned char* buffer, int length); }; diff --git a/test/hotspot/gtest/utilities/test_utf8.cpp b/test/hotspot/gtest/utilities/test_utf8.cpp index ffd8121075b..80f6671207b 100644 --- a/test/hotspot/gtest/utilities/test_utf8.cpp +++ b/test/hotspot/gtest/utilities/test_utf8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -22,6 +22,8 @@ */ #include "precompiled.hpp" +#include "nmt/memflags.hpp" +#include "runtime/os.hpp" #include "utilities/utf8.hpp" #include "unittest.hpp" @@ -101,5 +103,105 @@ TEST_VM(utf8, jbyte_length) { UNICODE::as_utf8(str, 19, res, i); EXPECT_TRUE(test_stamp(res + i, sizeof(res) - i)); } +} + +TEST_VM(utf8, truncation) { + + // Test that truncation removes partial encodings as expected. + + const char orig_bytes[] = { 'A', 'B', 'C', 'D', 'E', '\0' }; + const int orig_length = sizeof(orig_bytes)/sizeof(char); + ASSERT_TRUE(UTF8::is_legal_utf8((const unsigned char*)orig_bytes, orig_length - 1, false)); + const char* orig_str = &orig_bytes[0]; + ASSERT_EQ((int)strlen(orig_str), orig_length - 1); + + unsigned char* temp_bytes; + const char* temp_str; + char* utf8; + int n_utf8; // Number of bytes in the encoding + + // Test 1: a valid UTF8 "ascii" ending string should be returned as-is + + temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * orig_length, mtTest); + strcpy((char*)temp_bytes, orig_str); + temp_str = (const char*) temp_bytes; + UTF8::truncate_to_legal_utf8(temp_bytes, orig_length); + ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be unchanged"; + ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be unchanged"; + os::free(temp_bytes); + + // Test 2: a UTF8 sequence that "ends" with a 2-byte encoding + // drops the 2-byte encoding + + jchar two_byte_char[] = { 0x00D1 }; // N with tilde + n_utf8 = 2; + utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL + UNICODE::convert_to_utf8(two_byte_char, 1, utf8); + int utf8_len = (int)strlen(utf8); + ASSERT_EQ(utf8_len, n_utf8) << "setup error"; + + // Now drop zero or one byte from the end and check it truncates as expected + for (int drop = 0; drop < n_utf8; drop++) { + int temp_len = orig_length + utf8_len - drop; + temp_bytes = (unsigned char*) 
os::malloc(sizeof(unsigned char) * temp_len, mtTest); + temp_str = (const char*) temp_bytes; + strcpy((char*)temp_bytes, orig_str); + strncat((char*)temp_bytes, utf8, utf8_len - drop); + ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error"; + UTF8::truncate_to_legal_utf8(temp_bytes, temp_len); + ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length"; + ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original"; + os::free(temp_bytes); + } + os::free(utf8); + + // Test 3: a UTF8 sequence that "ends" with a 3-byte encoding + // drops the 3-byte encoding + n_utf8 = 3; + jchar three_byte_char[] = { 0x0800 }; + utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL + UNICODE::convert_to_utf8(three_byte_char, 1, utf8); + utf8_len = (int)strlen(utf8); + ASSERT_EQ(utf8_len, n_utf8) << "setup error"; + + // Now drop zero to two bytes from the end and check it truncates as expected + for (int drop = 0; drop < n_utf8; drop++) { + int temp_len = orig_length + utf8_len - drop; + temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * temp_len, mtTest); + temp_str = (const char*) temp_bytes; + strcpy((char*)temp_bytes, orig_str); + strncat((char*)temp_bytes, utf8, utf8_len - drop); + ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error"; + UTF8::truncate_to_legal_utf8(temp_bytes, temp_len); + ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length"; + ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original"; + os::free(temp_bytes); + } + os::free(utf8); + + // Test 4: a UTF8 sequence that "ends" with a 6-byte encoding + // drops the 6-byte encoding + n_utf8 = 6; + jchar six_byte_char[] = { 0xD801, 0xDC37 }; // U+10437 as its UTF-16 surrogate pair + utf8 = (char*) os::malloc(sizeof(char) * (n_utf8 + 1), mtTest); // plus NUL + UNICODE::convert_to_utf8(six_byte_char, 2, utf8); + utf8_len = 
(int)strlen(utf8); + ASSERT_EQ(utf8_len, n_utf8) << "setup error"; + + // Now drop zero to five bytes from the end and check it truncates as expected + for (int drop = 0; drop < n_utf8; drop++) { + int temp_len = orig_length + utf8_len - drop; + temp_bytes = (unsigned char*) os::malloc(sizeof(unsigned char) * temp_len, mtTest); + temp_str = (const char*) temp_bytes; + strcpy((char*)temp_bytes, orig_str); + strncat((char*)temp_bytes, utf8, utf8_len - drop); + ASSERT_EQ((int)strlen(temp_str), temp_len - 1) << "setup error"; + UTF8::truncate_to_legal_utf8(temp_bytes, temp_len); + ASSERT_EQ((int)strlen(temp_str), orig_length - 1) << "bytes should be truncated to original length"; + ASSERT_EQ(strcmp(orig_str, temp_str), 0) << "bytes should be truncated to original"; + os::free(temp_bytes); + } + os::free(utf8); + }