diff --git a/CMakeLists.txt b/CMakeLists.txt index 94d6d05..fcd3dea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,9 @@ endif() project(iris VERSION 0.0.1 LANGUAGES CXX) -if(NOT DEFINED IRIS_ROOT) - set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}") -endif() +set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}") +set_property(GLOBAL PROPERTY IRIS_ROOT "${IRIS_ROOT}") + # ----------------------------------------------------------------- # Global settings diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp new file mode 100644 index 0000000..accbf24 --- /dev/null +++ b/include/iris/unicode/string.hpp @@ -0,0 +1,1229 @@ +// Copyright 2006 Nemanja Trifunovic +// Copyright 2026 The Iris Project Contributors + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +#ifndef IRIS_UNICODE_STRING_HPP +#define IRIS_UNICODE_STRING_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace iris::unicode { + +template +constexpr T bom[] = {static_cast(0xef), static_cast(0xbb), static_cast(0xbf)}; + +template +concept octet = std::integral && sizeof(T) == 1; + +template +concept utf8char = octet && (std::same_as || std::same_as); + +template +concept utf16char = std::same_as; + +template +concept utf32char = std::same_as; + +template +concept octet_input_iterator = std::input_iterator && octet>; + +template +concept utf8_input_iterator = octet_input_iterator && utf8char>; + +template +concept utf16_input_iterator = std::input_iterator && utf16char>; + +template +concept utf32_input_iterator = std::input_iterator && utf32char>; + + +template +concept octet_input_range = + std::ranges::input_range && + octet_input_iterator>; + + +namespace detail { + +template +struct select_output_value_type +{ + static_assert(std::output_iterator); + using type = DesiredValueT; +}; + +template + requires requires { + typename std::iter_value_t; + requires std::convertible_to>; + } +struct select_output_value_type +{ + static_assert(std::output_iterator>); + using type = std::iter_value_t; +}; + +template +concept maybe_value_type_sized = + requires { + typename std::iter_value_t; + requires sizeof(std::iter_value_t) == SizeofChar; + } || + !requires { + typename std::iter_value_t; + }; + +} // detail + +template +concept octet_output_iterator = + ( + std::output_iterator || + std::output_iterator + ) && + detail::maybe_value_type_sized; + +template +concept octet_output_range = + ( + std::ranges::output_range || + 
std::ranges::output_range + ) && + detail::maybe_value_type_sized, 1>; + +template +concept utf16_output_iterator = + std::output_iterator && + detail::maybe_value_type_sized; + +template +concept utf16_output_range = + std::ranges::output_range && + detail::maybe_value_type_sized, 2>; + + +template +struct is_nothrow_dereferenceable : std::false_type {}; + +template +struct is_nothrow_dereferenceable())>> : std::bool_constant())> {}; + +template +inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable::value; + +template +struct is_nothrow_prefix_incrementable : std::false_type {}; + +template +struct is_nothrow_prefix_incrementable())>> : std::bool_constant())> {}; + +template +inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable::value; + +template +struct is_nothrow_postfix_incrementable : std::false_type {}; + +template +struct is_nothrow_postfix_incrementable()++)>> : std::bool_constant()++)> {}; + +template +inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable::value; + +template +struct is_nothrow_sentinel : std::false_type {}; + +template + requires std::sentinel_for +struct is_nothrow_sentinel : std::bool_constant< + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) && + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) +> +{}; + +template +inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; + + +class unicode_error : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; + +class invalid_code_point : public unicode_error +{ + char32_t cp; + +public: + explicit invalid_code_point(char32_t codepoint) + : unicode_error("invalid code point") + , cp(codepoint) + {} + + [[nodiscard]] char32_t code_point() const noexcept { return cp; } +}; + +class invalid_utf8 : public unicode_error +{ + char8_t u8; + +public: + explicit invalid_utf8(char 
c) + : unicode_error("invalid UTF-8") + , u8(static_cast(c)) + {} + + explicit invalid_utf8(char8_t u) + : unicode_error("invalid UTF-8") + , u8(u) + {} + + [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } +}; + +class invalid_utf16 : public unicode_error +{ + char16_t u16; + +public: + explicit invalid_utf16(char16_t u) + : unicode_error("Invalid UTF-16") + , u16(u) + {} + + [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } +}; + +class not_enough_space : public unicode_error +{ +public: + not_enough_space() + : unicode_error("not enough space") + {} +}; + + +namespace detail { + +// Unicode constants +// Leading (high) surrogates: 0xd800 - 0xdbff +// Trailing (low) surrogates: 0xdc00 - 0xdfff +constexpr char16_t LEAD_SURROGATE_MIN = 0xd800u; +constexpr char16_t LEAD_SURROGATE_MAX = 0xdbffu; +constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u; +constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu; +constexpr char16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) +constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + +// Maximum valid value for a Unicode code point +constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; + +enum class [[nodiscard]] utf_error +{ + OK, + NOT_ENOUGH_SPACE, + INVALID_LEAD, + INCOMPLETE_SEQUENCE, + OVERLONG_SEQUENCE, + INVALID_CODE_POINT, +}; + +template +[[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept +{ + return static_cast(oc & 0xff); +} + +[[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept +{ + return static_cast(oc & 0xffff); +} + +template +[[nodiscard]] constexpr bool is_trail(Octet oc) noexcept +{ + return ((detail::mask8(oc) >> 6) == 0x2); +} + +[[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept +{ + return cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX); +} + +[[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept +{ + return cp >= 
static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX); +} + +[[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept +{ + return cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX); +} + +[[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept +{ + return cp <= CODE_POINT_MAX && !detail::is_surrogate(cp); +} + +[[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept +{ + return cp < char32_t(0x10000); +} + +[[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept +{ + if (cp < 0x80) { + if (length != 1) return true; + } else if (cp < 0x800) { + if (length != 2) return true; + } else if (cp < 0x10000) { + if (length != 3) return true; + } + return false; +} + +template +[[nodiscard]] constexpr int sequence_length(It lead_it) + noexcept(is_nothrow_dereferenceable_v) +{ + char8_t const lead = detail::mask8(*lead_it); + if (lead < 0x80) return 1; + if ((lead >> 5) == 0x6) return 2; + if ((lead >> 4) == 0xe) return 3; + if ((lead >> 3) == 0x1e) return 4; + return 0; +} + +/// Helper for get_sequence_x +template Se> +constexpr utf_error increase_safely(It& it, Se end) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (++it == end) { + return utf_error::NOT_ENOUGH_SPACE; + } + if (!detail::is_trail(*it)) { + return utf_error::INCOMPLETE_SEQUENCE; + } + return utf_error::OK; +} + +#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END) \ + do { \ + utf_error ret = increase_safely(IT, END); \ + if (ret != utf_error::OK) \ + return ret; \ + } while (false) + +// get_sequence_x functions decode utf-8 sequences of the length x +template Se> +constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_sentinel + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + code_point = 
static_cast(detail::mask8(*it)); + return utf_error::OK; +} + +template Se> +constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + code_point = static_cast(detail::mask8(*it)); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + return utf_error::OK; +} + +template Se> +constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + code_point = static_cast(detail::mask8(*it)); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = ((code_point << 12) & 0xffff) + ((detail::mask8(*it) << 6) & 0xfff); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = static_cast(code_point + ((*it) & 0x3f)); + return utf_error::OK; +} + +template Se> +constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + code_point = static_cast(detail::mask8(*it)); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = ((code_point << 18) & 0x1fffff) + ((detail::mask8(*it) << 12) & 0x3ffff); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = static_cast(code_point + ((detail::mask8(*it) << 6) & 0xfff)); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = static_cast(code_point + ((*it) & 0x3f)); + return utf_error::OK; +} + +#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR + +template Se> + requires std::forward_iterator +constexpr utf_error validate_next(It& it, Se end, 
char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It const original_it = it; + + char32_t cp = 0; + // Determine the sequence length based on the lead octet + int const length = detail::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err{}; + switch (length) { + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = detail::get_sequence_1(it, end, cp); + break; + case 2: + err = detail::get_sequence_2(it, end, cp); + break; + case 3: + err = detail::get_sequence_3(it, end, cp); + break; + case 4: + err = detail::get_sequence_4(it, end, cp); + break; + default: + std::unreachable(); + } + if (err != utf_error::OK) { + it = original_it; + return err; + } + + if (detail::is_code_point_valid(cp)) { + if (!detail::is_overlong_sequence(cp, length)) { + code_point = cp; + ++it; + return utf_error::OK; + } + + it = original_it; + return utf_error::OVERLONG_SEQUENCE; + } + + it = original_it; + return utf_error::INVALID_CODE_POINT; +} + +template Se> + requires std::forward_iterator +constexpr utf_error validate_next(It& it, Se end) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) +{ + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. 
stream iterators + It const original_it = it; + + char32_t cp = 0; + // Determine the sequence length based on the lead octet + int const length = detail::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err{}; + switch (length) { + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = detail::get_sequence_1(it, end, cp); + break; + case 2: + err = detail::get_sequence_2(it, end, cp); + break; + case 3: + err = detail::get_sequence_3(it, end, cp); + break; + case 4: + err = detail::get_sequence_4(it, end, cp); + break; + default: + std::unreachable(); + } + if (err != utf_error::OK) { + it = original_it; + return err; + } + + if (detail::is_code_point_valid(cp)) { + if (!detail::is_overlong_sequence(cp, length)) { + ++it; + return utf_error::OK; + } + + it = original_it; + return utf_error::OVERLONG_SEQUENCE; + } + + it = original_it; + return utf_error::INVALID_CODE_POINT; +} + +template Se> + requires std::forward_iterator +constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_postfix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) +{ + // Check the edge case: + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. 
stream iterators + It const original_it = it; + + char16_t const first_word = *it++; + if (!detail::is_surrogate(first_word)) { + code_point = first_word; + return utf_error::OK; + } + if (it == end) { + it = original_it; + return utf_error::NOT_ENOUGH_SPACE; + } + if (detail::is_lead_surrogate(first_word)) { + char16_t const second_word = *it++; + if (detail::is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return utf_error::OK; + } + it = original_it; + return utf_error::INCOMPLETE_SEQUENCE; + } + + it = original_it; + return utf_error::INVALID_LEAD; +} + +} // detail + +template Se> +[[nodiscard]] constexpr It find_invalid(It it, Se se) + noexcept(noexcept(detail::validate_next(it, se)) && std::is_nothrow_copy_constructible_v) +{ + while (it != se) { + detail::utf_error err_code = detail::validate_next(it, se); + if (err_code != detail::utf_error::OK) { + return it; + } + } + return it; +} + +[[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) +{ + std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); + return invalid == s.end() ? std::string_view::npos : static_cast(invalid - s.begin()); +} + +[[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) +{ + std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); + return invalid == s.end() ? 
std::u8string_view::npos : static_cast(invalid - s.begin()); +} + +template Se> +[[nodiscard]] constexpr bool is_valid(It it, Se se) + noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v) +{ + return unicode::find_invalid(it, se) == se; +} + +[[nodiscard]] constexpr bool is_valid(std::string_view s) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) +{ + return unicode::is_valid(s.begin(), s.end()); +} + +[[nodiscard]] constexpr bool is_valid(std::u8string_view s) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) +{ + return unicode::is_valid(s.begin(), s.end()); +} + +template Se> +[[nodiscard]] constexpr bool starts_with_bom(It it, Se end) + noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v) +{ + return + (it != end && detail::mask8(*it++) == bom[0]) && + (it != end && detail::mask8(*it++) == bom[1]) && + (it != end && detail::mask8(*it) == bom[2]); +} + +template +[[nodiscard]] constexpr bool starts_with_bom(R&& r) + noexcept(noexcept(unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r)))) +{ + return unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r)); +} + +[[nodiscard]] constexpr bool starts_with_bom(std::string_view s) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) +{ + return unicode::starts_with_bom(s.begin(), s.end()); +} + +[[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) +{ + return unicode::starts_with_bom(s.begin(), s.end()); +} + + +template +constexpr OutIt append8(char32_t cp, OutIt out) +{ + if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp); + + using octet_type = detail::select_output_value_type::type; + + if (cp < 0x80) { // one octet + *out++ = static_cast(cp); + } else if (cp < 0x800) { // two octets + *out++ = static_cast((cp >> 6) | 0xc0); + *out++ = static_cast((cp & 0x3f) | 0x80); + } else if (cp < 0x10000) { // three octets + *out++ 
= static_cast((cp >> 12) | 0xe0); + *out++ = static_cast(((cp >> 6) & 0x3f) | 0x80); + *out++ = static_cast((cp & 0x3f) | 0x80); + } else { // four octets + *out++ = static_cast((cp >> 18) | 0xf0); + *out++ = static_cast(((cp >> 12) & 0x3f) | 0x80); + *out++ = static_cast(((cp >> 6) & 0x3f) | 0x80); + *out++ = static_cast((cp & 0x3f) | 0x80); + } + return out; +} + +template +constexpr OutIt append16(char32_t cp, OutIt out) +{ + if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp); + + if (detail::is_in_bmp(cp)) { + *out++ = static_cast(cp); + } else { + // Code points from the supplementary planes are encoded via surrogate pairs + *out++ = static_cast(detail::LEAD_OFFSET + (cp >> 10)); + *out++ = static_cast(detail::TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return out; +} + +// Forwards automatically based on `sizeof(value_type)`, but overload may become +// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`. +template +constexpr OutIt append(char32_t cp, OutIt out) +{ + return unicode::append8(cp, std::move(out)); +} +template +constexpr OutIt append(char32_t cp, OutIt out) +{ + return unicode::append16(cp, std::move(out)); +} + + +template + requires octet_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append8(char32_t cp, OutR&& r) +{ + return std::ranges::subrange{ + unicode::append8(cp, std::ranges::begin(r)), std::ranges::end(r) + }; +} + +template + requires utf16_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append16(char32_t cp, OutR&& r) +{ + return std::ranges::subrange{ + unicode::append16(cp, std::ranges::begin(r)), std::ranges::end(r) + }; +} + +// Forwards automatically based on `sizeof(value_type)`, but overload may become +// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`. 
+template + requires octet_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append(char32_t cp, OutR&& r) +{ + return unicode::append8(cp, std::forward(r)); +} +template + requires utf16_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append(char32_t cp, OutR&& r) +{ + return unicode::append16(cp, std::forward(r)); +} + +constexpr void append(char32_t cp, std::string& str) +{ + unicode::append8(cp, std::back_inserter(str)); +} + +constexpr void append(char32_t cp, std::u8string& str) +{ + unicode::append8(cp, std::back_inserter(str)); +} + +constexpr void append(char32_t cp, std::u16string& str) +{ + unicode::append16(cp, std::back_inserter(str)); +} + +template Se, octet_output_iterator Out> +constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) +{ + while (start != end) { + It const sequence_start = start; + switch (detail::validate_next(start, end)) { + case detail::utf_error::OK: + for (It it = sequence_start; it != start; ++it) { + *out++ = *it; + } + break; + + case detail::utf_error::NOT_ENOUGH_SPACE: + out = unicode::append8(replacement, out); + start = end; + break; + + case detail::utf_error::INVALID_LEAD: + out = unicode::append8(replacement, out); + ++start; + break; + + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + case detail::utf_error::INVALID_CODE_POINT: + out = unicode::append8(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && detail::is_trail(*start)) { + ++start; + } + break; + + default: + std::unreachable(); + } + } + return out; +} + +template Se, octet_output_iterator Out> +constexpr Out replace_invalid(It start, Se end, Out out) +{ + constexpr char32_t replacement_marker = static_cast(detail::mask16(0xfffd)); + return unicode::replace_invalid(start, end, out, replacement_marker); +} + +[[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t 
replacement) +{ + std::string result; + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; +} + +[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) +{ + std::u8string result; + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; +} + +[[nodiscard]] constexpr std::string replace_invalid(std::string_view s) +{ + std::string result; + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) +{ + std::u8string result; + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +template Se> +[[nodiscard]] constexpr char32_t next(It& it, Se end) +{ + char32_t cp = 0; + switch (detail::validate_next(it, end, cp)) { + case detail::utf_error::OK: + break; + + case detail::utf_error::NOT_ENOUGH_SPACE: + throw not_enough_space(); + + case detail::utf_error::INVALID_LEAD: + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + + case detail::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); + + default: + std::unreachable(); + } + return cp; +} + +template +[[nodiscard]] constexpr std::pair::difference_type> +bounded_next(It it, It const last, typename std::iterator_traits::difference_type off = 1) +{ + typename std::iterator_traits::difference_type count = 0; + for (; it != last && count < off; ++count) { + char32_t cp = 0; + switch (detail::validate_next(it, last, cp)) { + case detail::utf_error::OK: + break; + + case detail::utf_error::NOT_ENOUGH_SPACE: + throw not_enough_space(); + + case detail::utf_error::INVALID_LEAD: + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + + case 
detail::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); + + default: + std::unreachable(); + } + } + return {it, count}; +} + +template Se> +[[nodiscard]] constexpr char32_t next16(It& it, Se end) +{ + char32_t cp = 0; + detail::utf_error err_code = detail::validate_next16(it, end, cp); + if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) { + throw not_enough_space(); + } + return cp; +} + +template Se> +[[nodiscard]] constexpr char32_t peek_next(It it, Se end) +{ + return unicode::next(it, end); +} + +template Se> +[[nodiscard]] constexpr char32_t prev(It& it, Se start) +{ + // can't do much if it == start + if (it == start) throw not_enough_space(); + + It end = it; + // Go back until we hit either a lead octet or start + while (detail::is_trail(*--it)) { + if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence + } + return unicode::peek_next(it, end); +} + +template +[[nodiscard]] constexpr std::pair::difference_type> +bounded_prev(It const start, It it, typename std::iterator_traits::difference_type off = 1) +{ + typename std::iterator_traits::difference_type count = 0; + for (; it != start && count < off; ++count) { + while (detail::is_trail(*--it)) { + if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence + } + } + return {it, count}; +} + +template Se, class distance_type> +constexpr void advance(It& it, distance_type n, Se end) +{ + constexpr distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) { + (void)unicode::prev(it, end); + } + } else { + // forward + for (distance_type i = zero; i < n; ++i) { + (void)unicode::next(it, end); + } + } +} + +template Se> +[[nodiscard]] constexpr typename std::iterator_traits::difference_type +distance(It first, Se last) +{ + typename std::iterator_traits::difference_type dist; + for (dist = 0; first != last; ++dist) { + (void)unicode::next(first, last); + } + return dist; +} + +// 
------------------------------------ + +template +class code_point_iterator +{ + It it; + It range_start; + It range_end; + +public: + using value_type = char32_t; + using pointer = char32_t*; + using reference = char32_t&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::bidirectional_iterator_tag; + + constexpr code_point_iterator() + requires std::is_default_constructible_v + = default; + + constexpr code_point_iterator(It it, It range_start, It range_end) + : it(std::move(it)) + , range_start(std::move(range_start)) + , range_end(std::move(range_end)) + { + if constexpr (std::random_access_iterator) { + if (this->it < this->range_start || this->it > this->range_end) { + throw std::out_of_range("Invalid utf-8 iterator position"); + } + } + } + + [[nodiscard]] constexpr It base() const { return it; } + + [[nodiscard]] constexpr char32_t operator*() const + { + It temp = it; + return unicode::next(temp, range_end); + } + + [[nodiscard]] constexpr bool operator==(code_point_iterator const& rhs) const noexcept + { + assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed"); + return it == rhs.it; + } + + constexpr code_point_iterator& operator++() + { + (void)unicode::next(it, range_end); + return *this; + } + + [[nodiscard]] constexpr code_point_iterator operator++(int) + { + code_point_iterator temp = *this; + (void)unicode::next(it, range_end); + return temp; + } + + constexpr code_point_iterator& operator--() + { + (void)unicode::prev(it, range_start); + return *this; + } + + [[nodiscard]] constexpr code_point_iterator operator--(int) + { + code_point_iterator temp = *this; + (void)unicode::prev(it, range_start); + return temp; + } +}; + +// ------------------------------------ + +template Se, utf16_output_iterator OutIt> +constexpr OutIt utf8to16(It start, Se end, OutIt out) +{ + while (start != end) { + char32_t const cp = unicode::next(start, end); + if (cp > 
0xffff) { // make a surrogate pair + *out++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); + *out++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); + } else { + *out++ = static_cast(cp); + } + } + return out; +} + +[[nodiscard]] constexpr std::u16string utf8to16(std::string_view str) +{ + std::u16string result; + unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view str) +{ + std::u16string result; + unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +template Se, octet_output_iterator OutIt> +constexpr OutIt utf16to8(It start, Se end, OutIt out) +{ + while (start != end) { + char32_t cp = static_cast(detail::mask16(*start++)); + // Take care of surrogate pairs first + if (detail::is_lead_surrogate(cp)) { + if (start != end) { + char32_t const trail_surrogate = static_cast(detail::mask16(*start++)); + if (detail::is_trail_surrogate(trail_surrogate)) { + cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET; + } else { + throw invalid_utf16(static_cast(trail_surrogate)); + } + } else { + throw invalid_utf16(static_cast(cp)); + } + + // Lone trail surrogate + } else if (detail::is_trail_surrogate(cp)) { + throw invalid_utf16(static_cast(cp)); + } + out = unicode::append8(cp, out); + } + return out; +} + +[[nodiscard]] constexpr std::string utf16to8(std::u16string_view str) +{ + std::string result; + unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view str) +{ + std::u8string result; + unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +template Se, class OutIt> +constexpr OutIt utf8to32(It start, Se end, OutIt out) +{ + while (start != end) { + *out++ = unicode::next(start, end); + } + return out; +} + +[[nodiscard]] constexpr std::u32string 
utf8to32(std::string_view str) +{ + std::u32string result; + unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view str) +{ + std::u32string result; + unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +template Se, octet_output_iterator OutIt> +constexpr OutIt utf32to8(It start, Se end, OutIt out) +{ + while (start != end) { + out = unicode::append8(*start++, out); + } + return out; +} + +[[nodiscard]] constexpr std::string utf32to8(std::u32string_view str) +{ + std::string result; + unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view str) +{ + std::u8string result; + unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +// TODO: add single char variations + +template +[[nodiscard]] constexpr std::basic_string transcode(std::string_view str) +{ + if constexpr (std::same_as) { + return std::u8string{str.begin(), str.end()}; + } else if constexpr (std::same_as) { + return unicode::utf8to16(str); + } else if constexpr (std::same_as) { + return unicode::utf8to32(str); + } else { + static_assert(std::same_as); + return std::string{str}; + } +} + +template +[[nodiscard]] constexpr std::basic_string transcode(std::u8string_view str) +{ + if constexpr (std::same_as) { + return std::u8string{str}; + } else if constexpr (std::same_as) { + return unicode::utf8to16(str); + } else if constexpr (std::same_as) { + return unicode::utf8to32(str); + } else { + static_assert(std::same_as); + return std::string{str.begin(), str.end()}; + } +} + +template +[[nodiscard]] constexpr std::basic_string transcode(std::u16string_view str) +{ + if constexpr (std::same_as) { + return unicode::utf16tou8(str); + } else if constexpr (std::same_as) { + return std::u16string{str}; + } else if constexpr 
(std::same_as) { + static_assert(false, "not implemented"); + return {}; // dummy + //return unicode::utf16to32(str); + } else { + static_assert(std::same_as); + return unicode::utf16to8(str); + } +} + +template +[[nodiscard]] constexpr std::basic_string transcode(std::u32string_view str) +{ + if constexpr (std::same_as) { + return unicode::utf32tou8(str); + } else if constexpr (std::same_as) { + static_assert(false, "not implemented"); + return {}; // dummy + //return unicode::utf32to16(str); + } else if constexpr (std::same_as) { + return std::u32string{str}; + } else { + static_assert(std::same_as); + return unicode::utf32to8(str); + } +} + +} // iris::unicode + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 34edcd0..5347bca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -97,6 +97,11 @@ function(iris_define_test_headers test_name) endfunction() function(_iris_define_test_impl test_name libs) + get_property(IRIS_ROOT GLOBAL PROPERTY IRIS_ROOT) + if(NOT DEFINED IRIS_ROOT OR IRIS_ROOT STREQUAL "") + message(FATAL_ERROR "IRIS_ROOT is not defined") + endif() + add_executable(${test_name}_test ${ARGN}) target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_FUNCTION_LIST_DIR}) target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_LIST_DIR}) @@ -116,11 +121,32 @@ function(_iris_define_test_impl test_name libs) target_link_libraries(${test_name}_test PRIVATE Iris::Iris iris_cxx_test ${libs}) add_test(NAME ${test_name}_test COMMAND ${test_name}_test --colour-mode=ansi) + set_tests_properties( + ${test_name}_test PROPERTIES + ENVIRONMENT "IRIS_ROOT=${IRIS_ROOT}" + ) if(MSVC) + set( + VS_DEBUGGER_ENVIRONMENT_LIST + "PATH=$(VC_ExecutablePath_x64)\;%PATH%" + "ASAN_SYMBOLIZER_PATH=$(VC_ExecutablePath_x64)\\llvm-symbolizer.exe" + "IRIS_ROOT=$" + ) + list(JOIN VS_DEBUGGER_ENVIRONMENT_LIST "\n" VS_DEBUGGER_ENVIRONMENT) + + set_target_properties( + ${test_name}_test PROPERTIES + VS_DEBUGGER_ENVIRONMENT 
"${VS_DEBUGGER_ENVIRONMENT}" + ) + get_property(IRIS_MSVC_ASAN_DIR GLOBAL PROPERTY IRIS_MSVC_ASAN_DIR) + set( + ENV_MODIFICATION + "PATH=path_list_append:${IRIS_MSVC_ASAN_DIR}" + ) set_tests_properties( ${test_name}_test PROPERTIES - ENVIRONMENT "PATH=${IRIS_MSVC_ASAN_DIR};$ENV{PATH}" + ENVIRONMENT_MODIFICATION "${ENV_MODIFICATION}" ) endif() endfunction() @@ -169,5 +195,7 @@ if(PROJECT_IS_TOP_LEVEL) foreach(test_name IN LISTS IRIS_TEST_IRIS_TESTS) iris_define_test_headers(iris_${test_name} iris_test.hpp) endforeach() + + add_subdirectory(unicode) endif() endif() diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt new file mode 100644 index 0000000..37baffc --- /dev/null +++ b/test/unicode/CMakeLists.txt @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: MIT + +add_subdirectory(string) diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt new file mode 100644 index 0000000..ebfa8d4 --- /dev/null +++ b/test/unicode/string/CMakeLists.txt @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: MIT + +set( + IRIS_TEST_UNICODE_STRING_TESTS + string + utf8_invalid +) + +foreach(test_name IN LISTS IRIS_TEST_UNICODE_STRING_TESTS) + iris_define_test(unicode_string_${test_name} ${test_name}.cpp) + set_target_properties(unicode_string_${test_name}_test PROPERTIES FOLDER "test/unicode/string") +endforeach() + +target_sources(unicode_string_utf8_invalid_test PRIVATE test_data/utf8_invalid.txt) diff --git a/test/unicode/string/LICENSE b/test/unicode/string/LICENSE new file mode 100644 index 0000000..36b7cd9 --- /dev/null +++ b/test/unicode/string/LICENSE @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to 
permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp new file mode 100644 index 0000000..86c0732 --- /dev/null +++ b/test/unicode/string/string.cpp @@ -0,0 +1,454 @@ +#include "iris_test.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace iris_unicode_test { + +namespace unicode = iris::unicode; + +template +constexpr std::array to_array_cast(Chars... cs) +{ + return std::array{ + static_cast(cs)... 
+ }; +} + +TEST_CASE("append") +{ + constexpr auto do_test = []() { + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x0448U, u); + return u; + }() == to_array_cast(0xd1, 0x88, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x65e5U, u); + return u; + }() == to_array_cast(0xe6, 0x97, 0xa5, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x3044U, u); + return u; + }() == to_array_cast(0xe3, 0x81, 0x84, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x10346U, u); + return u; + }() == to_array_cast(0xf0, 0x90, 0x8d, 0x86, 0)); + }; + + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); +} + +TEST_CASE("append16") +{ + constexpr auto do_test = []() { + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x0448U, u); + return u; + }() == to_array_cast(0x0448, 0, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x65e5U, u); + return u; + }() == to_array_cast(0x65e5, 0, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x10346U, u); + return u; + }() == to_array_cast(0xd800, 0xdf46, 0, 0, 0)); + }; + + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); +} + +TEST_CASE("next") +{ + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + char const* w = twochars; + unsigned int cp = unicode::next(w, twochars + 6); + + CHECK(cp == 0x65e5); + CHECK(w == twochars + 3); + + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x10346); + CHECK(w == threechars + 4); + + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x65e5); + CHECK(w == threechars + 7); + + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x0448); + CHECK(w == threechars + 9); +} + +TEST_CASE("next16") +{ + char16_t const u[3] = {0x65e5, 0xd800, 0xdf46}; + char16_t const* w = u; + char32_t cp = unicode::next16(w, w + 3); + CHECK(cp == 
0x65e5); + CHECK(w == u + 1); + + cp = unicode::next16(w, w + 2); + CHECK(cp == 0x10346); + CHECK(w == u + 3); +} + +TEST_CASE("peek_next") +{ + char const* const cw = "\xe6\x97\xa5\xd1\x88"; + unsigned int cp = unicode::peek_next(cw, cw + 6); + CHECK(cp == 0x65e5); +} + +TEST_CASE("prev") +{ + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + char const* w = twochars + 3; + unsigned int cp = unicode::prev(w, twochars); + CHECK(cp == 0x65e5); + CHECK(w == twochars); + + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = unicode::prev(w, threechars); + CHECK(cp == 0x0448); + CHECK(w == threechars + 7); + cp = unicode::prev(w, threechars); + CHECK(cp == 0x65e5); + CHECK(w == threechars + 4); + cp = unicode::prev(w, threechars); + CHECK(cp == 0x10346); + CHECK(w == threechars); +} + +TEST_CASE("advance") +{ + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + char const* w = threechars; + unicode::advance(w, 2, threechars + 9); + CHECK(w == threechars + 7); + unicode::advance(w, -2, threechars); + CHECK(w == threechars); + unicode::advance(w, 3, threechars + 9); + CHECK(w == threechars + 9); + unicode::advance(w, -2, threechars); + CHECK(w == threechars + 4); + unicode::advance(w, -1, threechars); + CHECK(w == threechars); +} + +TEST_CASE("distance") +{ + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::size_t const dist = static_cast(unicode::distance(twochars, twochars + 5)); + CHECK(dist == 2); +} + +TEST_CASE("is_valid") +{ + constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + + { + CHECK(!unicode::is_valid(utf_invalid)); + CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6)); + } + { + CHECK(unicode::is_valid(utf8_with_surrogates)); + CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9)); + } + { + std::u8string const utf_invalid_u8(reinterpret_cast(utf_invalid)); + 
CHECK(!unicode::is_valid(utf_invalid_u8)); + } + { + std::u8string const utf8_with_surrogates_u8(reinterpret_cast(utf8_with_surrogates)); + CHECK(unicode::is_valid(utf8_with_surrogates_u8)); // validate the u8string copy, not the char array again + } + + { + constexpr char const* twochars = "ab"; + CHECK(unicode::is_valid(twochars)); + + std::string const two_chars_string(twochars); + CHECK(unicode::is_valid(two_chars_string)); + } +} + +TEST_CASE("find_invalid") +{ + constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + { + char const* invalid = unicode::find_invalid(utf_invalid, utf_invalid + 6); + CHECK(invalid == utf_invalid + 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(utf_invalid); + CHECK(invalid_pos == 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(std::string{utf_invalid}); + CHECK(invalid_pos == 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(std::string_view{utf_invalid}); + CHECK(invalid_pos == 5); + } + { + std::u8string const utf_invalid_u8(reinterpret_cast(utf_invalid)); + std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8); + CHECK(invalid_pos == 5); + } +} + +TEST_CASE("replace_invalid (vector)") +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + std::vector replace_invalid_result; + + unicode::replace_invalid(invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + CHECK(unicode::is_valid(replace_invalid_result.begin(), replace_invalid_result.end())); + + char const fixed_invalid_sequence[] = "a????z"; + CHECK(sizeof(fixed_invalid_sequence) == replace_invalid_result.size()); + CHECK(std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} + +TEST_CASE("replace_invalid (string)") +{ + std::string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + std::string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, '?'); + 
CHECK(unicode::is_valid(replace_invalid_result)); + + std::string const fixed_invalid_sequence = "a????z"; + CHECK(fixed_invalid_sequence == replace_invalid_result); +} + +TEST_CASE("replace_invalid (u8string)") +{ + std::u8string const invalid_sequence(reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z")); + std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?'); + + CHECK(unicode::is_valid(replace_invalid_result)); + std::u8string const fixed_invalid_sequence(reinterpret_cast("a????z")); + CHECK(fixed_invalid_sequence == replace_invalid_result); +} + +TEST_CASE("starts_with_bom") +{ + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + + constexpr char threechars[] = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(!unicode::starts_with_bom(threechars)); + CHECK(!unicode::starts_with_bom(std::string{threechars})); + CHECK(!unicode::starts_with_bom(std::string_view{threechars})); + CHECK(!unicode::starts_with_bom(std::u8string{reinterpret_cast(threechars)})); +} + +TEST_CASE("increment") +{ + constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + unicode::code_point_iterator it(threechars, threechars, threechars + 9); + unicode::code_point_iterator it2 = it; + CHECK(it2 == it); + CHECK(*it == 0x10346); + CHECK(*++it == 0x65e5); + CHECK(*it++ == 0x65e5); + CHECK(*it == 0x0448); + CHECK(it != it2); + unicode::code_point_iterator endit(threechars + 9, threechars, threechars + 9); + CHECK(++it == endit); +} + +TEST_CASE("decrement") +{ + constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + unicode::code_point_iterator it(threechars + 9, threechars, threechars + 9); + CHECK(*--it == 0x0448); + CHECK(*it-- == 0x0448); + CHECK(*it == 0x65e5); + CHECK(--it == 
unicode::code_point_iterator(threechars, threechars, threechars + 9)); + CHECK(*it == 0x10346); +} + +// ----------------------------------- + +TEST_CASE("utf8to16") +{ + { + constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::vector utf16result; + unicode::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } + { + std::string const utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + // Just to make sure it compiles with string literals + CHECK(unicode::utf8to16(u8"simple") == u"simple"); + CHECK(unicode::utf8to16("simple") == u"simple"); + } + { + constexpr std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } + { + std::u8string const utf8_with_surrogates{reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e")}; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } +} + +TEST_CASE("utf16to8") +{ + { + constexpr char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string utf8result; + unicode::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + CHECK(utf8result.size() == 10); + } + { + std::u16string const utf16string{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string const u = unicode::utf16to8(utf16string); + CHECK(u.size() == 10); + } + { + std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + 
std::u16string_view const utf16stringview(utf16string); + std::string const u = unicode::utf16to8(utf16stringview); + CHECK(u.size() == 10); + } + { + std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::u16string_view const utf16stringview{utf16string}; + { + std::u8string const u = unicode::utf16tou8(utf16string); + CHECK(u.size() == 10); + } + { + std::u8string const u = unicode::utf16tou8(utf16stringview); + CHECK(u.size() == 10); + } + } +} + +// ----------------------------------------- + +TEST_CASE("utf8to32") +{ + { + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::vector utf32result; + unicode::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + CHECK(utf32result.size() == 2); + } + { + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } + { + constexpr std::string_view twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } + { + std::u8string const twochars{reinterpret_cast("\xe6\x97\xa5\xd1\x88")}; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } +} + +TEST_CASE("utf32to8") +{ + { + constexpr char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + std::string utf8result; + unicode::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + std::string const utf8result = unicode::utf32to8(utf32string); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + std::u32string_view const utf32stringview(utf32string); + std::string const utf8result = unicode::utf32to8(utf32stringview); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + 
std::u32string_view const utf32stringview{utf32string}; + std::u8string const utf8result = unicode::utf32tou8(utf32stringview); + CHECK(utf8result.size() == 9); + } +} + +TEST_CASE("transcode") +{ + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == "aこれはb試験ですc"); + + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == u8"aこれはb試験ですc"); + + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == u"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == u"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == u"aこれはb試験ですc"); + //STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == u"aこれはb試験ですc"); + + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == U"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == U"aこれはb試験ですc"); + //STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == U"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == U"aこれはb試験ですc"); +} + +} // iris_unicode_test diff --git a/test/unicode/string/test_data/utf8_invalid.txt b/test/unicode/string/test_data/utf8_invalid.txt new file mode 100644 index 0000000..ae83159 Binary files /dev/null and b/test/unicode/string/test_data/utf8_invalid.txt differ diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp new file mode 100644 index 0000000..b7f46b7 --- /dev/null +++ b/test/unicode/string/utf8_invalid.cpp @@ -0,0 +1,77 @@ +// TODO: we need secure "getenv" in iris library +#define _CRT_SECURE_NO_WARNINGS 1 + +#include "iris_test.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace iris_unicode_test { + 
+constexpr auto INVALID_LINES = std::to_array({ + 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, + 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, + 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, + 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, + 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, + 258, 259, 260, 261, 262, 263, 264, +}); + +TEST_CASE("utf8_invalid") +{ + namespace unicode = iris::unicode; + using iris::throwf; + + std::filesystem::path const IRIS_ROOT = [] { + char const* IRIS_ROOT_str = std::getenv("IRIS_ROOT"); + if (!IRIS_ROOT_str) throwf("IRIS_ROOT is not defined"); + return std::filesystem::path(IRIS_ROOT_str); + }(); + + auto const test_file_path = IRIS_ROOT / "test" / "unicode" / "string" / "test_data" / "utf8_invalid.txt"; + std::ifstream fs8(test_file_path, std::ios::binary); // raw bytes: the fixture is deliberately malformed, so skip text-mode translation + if (!fs8) { + throwf("could not open \"{}\"", test_file_path.string()); + } + + // Read it line by line (1-based line numbers, matched against INVALID_LINES) + unsigned line_count = 0; + while (fs8.good()) { // good() rather than !eof(): also stops if the stream fails without reaching EOF + std::string line; + + char byte; + while ((byte = static_cast(fs8.get())) != '\n' && fs8.good()) { + line.push_back(byte); + } + + ++line_count; + bool const expected_valid = std::ranges::find(INVALID_LINES, line_count) == INVALID_LINES.end(); + + // Fail on lines whose validity differs from what INVALID_LINES expects + if (!unicode::is_valid(line.begin(), line.end())) { + if (expected_valid) { + throwf("unexpected invalid utf-8 at line {}", line_count); + } + + // try fixing it: + std::string fixed_line; + unicode::replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); + if (!unicode::is_valid(fixed_line.begin(), fixed_line.end())) { + throwf("replace_invalid() resulted in an invalid utf-8 at line {}", line_count); + } + + } else if (!expected_valid) { + throwf("invalid utf-8 NOT detected at line {}", line_count); + } + } + CHECK(true); +} + +} // iris_unicode_test