From 510c67b3eded045fec91ae519bd4bf09016cd305 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 14:00:09 +0900 Subject: [PATCH 01/17] Add UTF string library --- include/iris/unicode/string.hpp | 957 ++++++++++++++++++ test/CMakeLists.txt | 4 + test/unicode/CMakeLists.txt | 3 + test/unicode/string/CMakeLists.txt | 3 + test/unicode/string/LICENSE | 23 + test/unicode/string/apitests.cpp | 257 +++++ test/unicode/string/negative.cpp | 61 ++ test/unicode/string/test_cpp11.cpp | 117 +++ test/unicode/string/test_cpp17.cpp | 86 ++ test/unicode/string/test_cpp20.cpp | 79 ++ .../unicode/string/test_data/utf8_invalid.txt | Bin 0 -> 20010 bytes 11 files changed, 1590 insertions(+) create mode 100644 include/iris/unicode/string.hpp create mode 100644 test/unicode/CMakeLists.txt create mode 100644 test/unicode/string/CMakeLists.txt create mode 100644 test/unicode/string/LICENSE create mode 100644 test/unicode/string/apitests.cpp create mode 100644 test/unicode/string/negative.cpp create mode 100644 test/unicode/string/test_cpp11.cpp create mode 100644 test/unicode/string/test_cpp17.cpp create mode 100644 test/unicode/string/test_cpp20.cpp create mode 100644 test/unicode/string/test_data/utf8_invalid.txt diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp new file mode 100644 index 0000000..08d8d1f --- /dev/null +++ b/include/iris/unicode/string.hpp @@ -0,0 +1,957 @@ +// Copyright 2006 Nemanja Trifunovic +// Copyright 2026 The Iris Project Contributors + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef IRIS_UTFLIB_UTF8_H +#define IRIS_UTFLIB_UTF8_H + +#include +#include +#include +#include +#include +#include + +#include + +namespace iris::utflib +{ + template + concept octet = std::integral && sizeof(T) == 1; + + template + concept utf8char = octet && (std::same_as || std::same_as); + + template + concept utf16char = std::same_as; + + template + concept utf32char = std::same_as; + + template + concept octet_iterator = std::input_iterator && octet>; + + template + concept utf8_iterator = octet_iterator && utf8char>; + + template + concept utf16_iterator = std::input_iterator && utf16char>; + + template + concept utf32_iterator = std::input_iterator && utf32char>; + + namespace traits + { + template + struct is_nothrow_dereferenceable : std::false_type {}; + + template + struct is_nothrow_dereferenceable())>> : std::bool_constant())> {}; + + template + inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable::value; + + template + struct is_nothrow_prefix_incrementable : std::false_type {}; + + template + struct is_nothrow_prefix_incrementable())>> : std::bool_constant())> {}; + + template + inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable::value; + + template + struct is_nothrow_postfix_incrementable : std::false_type {}; + + template + struct is_nothrow_postfix_incrementable()++)>> : std::bool_constant()++)> {}; + + template + inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable::value; + + template + struct is_nothrow_sentinel : std::false_type {}; + + template + requires std::sentinel_for + struct is_nothrow_sentinel : std::bool_constant< + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) && + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) + > + {}; + + template + inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; + } // namespace traits + + // Helper code - not intended to be directly called by the library users. May be changed at any time + namespace internal + { + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + constexpr char16_t LEAD_SURROGATE_MIN = 0xd800u; + constexpr char16_t LEAD_SURROGATE_MAX = 0xdbffu; + constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u; + constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu; + constexpr char16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; + + enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT }; + + template + [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept + { + return static_cast(0xff & oc); + } + + [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept + { + return static_cast(0xffff & oc); + } + + template + [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept + { + return ((internal::mask8(oc) >> 6) == 0x2); + } + + [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); + } + + [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept + { + return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept + { + return (cp <= CODE_POINT_MAX && !internal::is_surrogate(cp)); + } + + [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept + { + return cp < char32_t(0x10000); + } + + [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept + { + if (cp < 0x80) { + if (length != 1) + return true; + } else if (cp < 0x800) { + if (length != 2) + return true; + } else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; + } + + template + [[nodiscard]] constexpr int sequence_length(It lead_it) + noexcept(traits::is_nothrow_dereferenceable_v) + { + const char8_t lead = internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + /// Helper for get_sequence_x + template Se> + constexpr utf_error increase_safely(It& it, Se end) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_sentinel + >) + { + if (++it == end) + return utf_error::NOT_ENOUGH_ROOM; + + if (!internal::is_trail(*it)) + return utf_error::INCOMPLETE_SEQUENCE; + + return utf_error::OK; + } + +#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END) \ + do { \ + utf_error ret = increase_safely(IT, END); \ + if (ret != utf_error::OK) \ + return ret; \ + } while (false) + + /// get_sequence_x functions decode utf-8 sequences of the length x + template Se> + constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_sentinel + >) + { + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + code_point = static_cast(internal::mask8(*it)); + + return utf_error::OK; + } + + template Se> + constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_sentinel + >) + { + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + code_point = static_cast(internal::mask8(*it)); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return utf_error::OK; + } + + template Se> + constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_sentinel + >) + { + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + code_point = static_cast(internal::mask8(*it)); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = ((code_point << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return utf_error::OK; + } + + template Se> + constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_sentinel + >) + { + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + code_point = static_cast(internal::mask8(*it)); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = ((code_point << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = static_cast(code_point + ((internal::mask8(*it) << 6) & 0xfff)); + + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return utf_error::OK; + } + +#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR + + template Se> + requires std::forward_iterator + constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) + { + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It original_it = it; + + char32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = utf_error::OK; + switch (length) { + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = internal::get_sequence_1(it, end, cp); + break; + case 2: + err = internal::get_sequence_2(it, end, cp); + break; + case 3: + err = internal::get_sequence_3(it, end, cp); + break; + case 4: + err = internal::get_sequence_4(it, end, cp); + break; + } + + if (err == utf_error::OK) { + // Decoding succeeded. Now, security checks... + if (internal::is_code_point_valid(cp)) { + if (!internal::is_overlong_sequence(cp, length)) { + // Passed! Return here. + code_point = cp; + ++it; + return utf_error::OK; + } else + err = utf_error::OVERLONG_SEQUENCE; + } else + err = utf_error::INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template Se> + requires std::forward_iterator + constexpr utf_error validate_next(It& it, Se end) + noexcept(noexcept(internal::validate_next(it, end, std::declval()))) + { + char32_t ignored; + return internal::validate_next(it, end, ignored); + } + + template Se> + requires std::forward_iterator + constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + traits::is_nothrow_dereferenceable, + traits::is_nothrow_prefix_incrementable, + traits::is_nothrow_postfix_incrementable, + traits::is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) + { + // Check the edge case: + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It original_it = it; + + utf_error err = utf_error::OK; + + const char16_t first_word = *it++; + if (!internal::is_surrogate(first_word)) { + code_point = first_word; + return utf_error::OK; + } else { + if (it == end) + err = utf_error::NOT_ENOUGH_ROOM; + else if (internal::is_lead_surrogate(first_word)) { + const char16_t second_word = *it++; + if (internal::is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return utf_error::OK; + } else + err = utf_error::INCOMPLETE_SEQUENCE; + + } else { + err = utf_error::INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + + template > + requires std::output_iterator + constexpr It append(char32_t cp, It result) + noexcept(noexcept(*result++ = std::declval())) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + constexpr std::back_insert_iterator append(char32_t cp, std::back_insert_iterator result) + noexcept(noexcept(internal::append, typename container_type::value_type>(cp, result))) + { + return internal::append, typename container_type::value_type>(cp, result); + } + + template It> + constexpr It append16(char32_t cp, It result) + noexcept(noexcept(*result++ = std::declval())) + { + if (internal::is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + } // namespace internal + + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception + { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception + { + char32_t cp; + + public: + explicit invalid_code_point(char32_t codepoint) + : cp(codepoint) + { + } + virtual const char* what() const noexcept override { return "Invalid code point"; } + [[nodiscard]] char32_t code_point() const noexcept { return cp; } + }; + + class invalid_utf8 : public exception + { + char8_t u8; + + public: + explicit invalid_utf8(char c) + : u8(static_cast(c)) + { + } + explicit invalid_utf8(char8_t u) + : u8(u) + { + } + virtual const char* what() const noexcept override { return "Invalid UTF-8"; } + [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } + }; + + class invalid_utf16 : public exception + { + char16_t u16; + + public: + explicit invalid_utf16(char16_t u) + : u16(u) + { + } + virtual const char* what() const noexcept override { return "Invalid UTF-16"; } + [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } + }; + + class not_enough_room : public exception + { + public: + virtual const char* what() const noexcept override { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + // Byte order mark + constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; + + template Se> + [[nodiscard]] constexpr It find_invalid(It it, Se se) + noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v) + { + while (it != se) { + internal::utf_error err_code = internal::validate_next(it, se); + if (err_code != internal::utf_error::OK) + return it; + } + return it; + } + + [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) + noexcept(noexcept(utflib::find_invalid(s.begin(), s.end()))) + { + std::string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) + noexcept(noexcept(utflib::find_invalid(s.begin(), s.end()))) + { + std::u8string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::u8string_view::npos : static_cast(invalid - s.begin()); + } + + template Se> + [[nodiscard]] constexpr bool is_valid(It it, Se se) + noexcept(noexcept(utflib::find_invalid(it, se)) && traits::is_nothrow_sentinel_v) + { + return (utflib::find_invalid(it, se) == se); + } + + [[nodiscard]] constexpr bool is_valid(std::string_view s) + noexcept(noexcept(utflib::is_valid(s.begin(), s.end()))) + { + return utflib::is_valid(s.begin(), s.end()); + } + + [[nodiscard]] constexpr bool is_valid(std::u8string_view s) + noexcept(noexcept(utflib::is_valid(s.begin(), s.end()))) + { + return utflib::is_valid(s.begin(), s.end()); + } + + template Se> + [[nodiscard]] constexpr bool starts_with_bom(It it, Se end) + noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v) + { + return (((it != end) && (internal::mask8(*it++)) == bom[0]) && ((it != end) && (internal::mask8(*it++)) == bom[1]) && ((it != end) && (internal::mask8(*it)) == bom[2])); + } + + [[nodiscard]] constexpr bool starts_with_bom(std::string_view s) + noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end()))) + { + return utflib::starts_with_bom(s.begin(), s.end()); + } + + [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) + noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end()))) + { + return utflib::starts_with_bom(s.begin(), s.end()); + } + + template // TODO: add constraints + constexpr It append(char32_t cp, It result) + { + if (!internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append(cp, result); + } + + constexpr void append(char32_t cp, std::string& s) + { + utflib::append(cp, std::back_inserter(s)); + } + + constexpr void append(char32_t cp, std::u8string& s) + { + utflib::append(cp, std::back_inserter(s)); + } + + template // TODO: add constraints + constexpr It append16(char32_t cp, It result) + { + if (!internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + + constexpr void append16(char32_t cp, std::u16string& s) + { + utflib::append16(cp, std::back_inserter(s)); + } + + template Se, typename Out> // TODO: add constraints + constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) + { + while (start != end) { + It sequence_start = start; + internal::utf_error err_code = internal::validate_next(start, end); + switch (err_code) { + case internal::utf_error::OK: + for (It it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::utf_error::NOT_ENOUGH_ROOM: + out = utflib::append(replacement, out); + start = end; + break; + case internal::utf_error::INVALID_LEAD: + out = utflib::append(replacement, out); + ++start; + break; + case internal::utf_error::INCOMPLETE_SEQUENCE: + case internal::utf_error::OVERLONG_SEQUENCE: + case internal::utf_error::INVALID_CODE_POINT: + out = utflib::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template Se, typename Out> // TODO: add constraints + constexpr Out replace_invalid(It start, Se end, Out out) + { + constexpr char32_t replacement_marker = static_cast(internal::mask16(0xfffd)); + return utflib::replace_invalid(start, end, out, replacement_marker); + } + + [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + [[nodiscard]] constexpr std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template Se> + [[nodiscard]] constexpr char32_t next(It& it, Se end) + { + char32_t cp = 0; + internal::utf_error err_code = internal::validate_next(it, end, cp); + switch (err_code) { + case internal::utf_error::OK: + break; + case internal::utf_error::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::utf_error::INVALID_LEAD: + case internal::utf_error::INCOMPLETE_SEQUENCE: + case internal::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + case internal::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); + } + return cp; + } + + template Se> + [[nodiscard]] constexpr char32_t next16(It& it, Se end) + { + char32_t cp = 0; + internal::utf_error err_code = internal::validate_next16(it, end, cp); + if (err_code == internal::utf_error::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + + template Se> + [[nodiscard]] constexpr char32_t peek_next(It it, Se end) + { + return utflib::next(it, end); + } + + template Se> + [[nodiscard]] constexpr char32_t prior(It& it, Se start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + It end = it; + // Go back until we hit either a lead octet or start + while (internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utflib::peek_next(it, end); + } + + template Se, typename distance_type> + constexpr void advance(It& it, distance_type n, Se end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + (void)utflib::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + (void)utflib::next(it, end); + } + } + + template Se> + [[nodiscard]] constexpr typename std::iterator_traits::difference_type distance(It first, Se last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first != last; ++dist) + (void)utflib::next(first, last); + return dist; + } + + template Se, typename OutIt> // TODO: add constraints + constexpr OutIt utf16to8(It start, Se end, OutIt result) + { + while (start != end) { + char32_t cp = static_cast(internal::mask16(*start++)); + // Take care of surrogate pairs first + if (internal::is_lead_surrogate(cp)) { + if (start != end) { + const char32_t trail_surrogate = static_cast(internal::mask16(*start++)); + if (internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utflib::append(cp, result); + } + return result; + } + + [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) + { + std::string result; + utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template Se, typename OutIt> // TODO: add constraints + constexpr OutIt utf8to16(It start, Se end, OutIt result) + { + while (start != end) { + const char32_t cp = utflib::next(start, end); + if (cp > 0xffff) { // make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } else + *result++ = static_cast(cp); + } + return result; + } + + [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s) + { + std::u16string result; + utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template Se, typename OutIt> // TODO: add constraints + constexpr OutIt utf32to8(It start, Se end, OutIt result) + { + while (start != end) + result = utflib::append(*(start++), result); + + return result; + } + + [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) + { + std::string result; + utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s) + { + std::u8string result; + utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template Se, typename OutIt> + constexpr OutIt utf8to32(It start, Se end, OutIt result) + { + while (start != end) + (*result++) = utflib::next(start, end); + + return result; + } + + [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s) + { + std::u32string result; + utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + // The iterator class + template + class iterator + { + It it; + It range_start; + It range_end; + + public: + using value_type = char32_t; + using pointer = char32_t*; + using reference = char32_t&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::bidirectional_iterator_tag; + constexpr iterator() + requires std::is_default_constructible_v + = default; + constexpr explicit iterator(It octet_it, It rangestart, It rangeend) + : it(std::move(octet_it)) + , range_start(std::move(rangestart)) + , range_end(std::move(rangeend)) + { + if constexpr (std::random_access_iterator) { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + } + // the default "big three" are OK + [[nodiscard]] constexpr It base() const { return it; } + [[nodiscard]] constexpr char32_t operator*() const + { + It temp = it; + return utflib::next(temp, range_end); + } + [[nodiscard]] constexpr bool operator==(const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + constexpr iterator& operator++() + { + (void)utflib::next(it, range_end); + return *this; + } + constexpr iterator operator++(int) + { + iterator temp = *this; + (void)utflib::next(it, range_end); + return temp; + } + constexpr iterator& operator--() + { + (void)utflib::prior(it, range_start); + return *this; + } + constexpr iterator operator--(int) + { + iterator temp = *this; + (void)utflib::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace iris::utflib + +#endif // header guard diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 34edcd0..77d4c02 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -170,4 +170,8 @@ if(PROJECT_IS_TOP_LEVEL) iris_define_test_headers(iris_${test_name} iris_test.hpp) endforeach() endif() + + if(NOT DEFINED IRIS_CI_COMPONENT OR IRIS_CI_COMPONENT STREQUAL unicode) + add_subdirectory(unicode) + endif() endif() diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt new file mode 100644 index 0000000..37baffc --- /dev/null +++ b/test/unicode/CMakeLists.txt @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: MIT + +add_subdirectory(string) diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt new file mode 100644 index 0000000..4fd1e27 --- /dev/null +++ b/test/unicode/string/CMakeLists.txt @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: MIT + + diff --git a/test/unicode/string/LICENSE b/test/unicode/string/LICENSE new file mode 100644 index 0000000..36b7cd9 --- /dev/null +++ b/test/unicode/string/LICENSE @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/test/unicode/string/apitests.cpp b/test/unicode/string/apitests.cpp new file mode 100644 index 0000000..ba8fa90 --- /dev/null +++ b/test/unicode/string/apitests.cpp @@ -0,0 +1,257 @@ +#include "ftest.h" + +#include "utf8.h" + +#include + +#include + +using namespace iris::utflib; +using namespace std; + +TEST(CheckedAPITests, test_append) +{ + unsigned char u[5] = {0,0,0,0,0}; + append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x10346, u); + EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); + + // Ensure no warnings with plain char + char c[2] = {0,0}; + append('a', c); + EXPECT_EQ (c[0], 'a'); + EXPECT_EQ (c[1], 0); +} + +TEST(CheckedAPITests, test_append16) +{ + char16_t u[5] = {0,0}; + append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + +TEST(CheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + unsigned int cp = next(w, twochars + 6); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(CheckedAPITests, test_next16) +{ + const char16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const char16_t* w = u; + char32_t cp = next16(w, w + 3); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = next16(w, w + 2); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + +TEST(CheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + unsigned int cp = peek_next(cw, cw + 6); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(CheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + unsigned int cp = prior (w, twochars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(CheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + advance(w, 2, threechars + 9); + EXPECT_EQ(w, threechars + 7); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars); + advance(w, 3, threechars + 9); + EXPECT_EQ(w, threechars + 9); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars + 4); + advance(w, -1, threechars); + EXPECT_EQ(w, threechars); +} + +TEST(CheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = static_cast(iris::utflib::distance(twochars, twochars + 5)); + EXPECT_EQ (dist, 2); +} + +TEST(CheckedAPITests, test_utf32to8) +{ + char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector utf32result; + iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CheckedAPITests, test_utf16to8) +{ + char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(CheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector utf16result; + iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} + +TEST(CheckedAPITests, test_find_invalid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); + invalid = utf_invalid + find_invalid(utf_invalid); + EXPECT_EQ (invalid, utf_invalid + 5); +} + +TEST(CheckedAPITests, test_is_valid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid, utf_invalid + 6); + EXPECT_FALSE (bvalid); + bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); + EXPECT_TRUE (bvalid); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CheckedAPITests, test_starts_with_bom) +{ + unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; + bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); + EXPECT_TRUE (bbom); + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); + EXPECT_FALSE (no_bbom); +} + +TEST(CheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars, threechars, threechars + 9); + iris::utflib::iterator it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(CheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars+9, threechars, threechars + 9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, iris::utflib::iterator(threechars, threechars, threechars + 9)); + EXPECT_EQ (*it, 0x10346); +} diff --git a/test/unicode/string/negative.cpp b/test/unicode/string/negative.cpp new file mode 100644 index 0000000..665585b --- /dev/null +++ b/test/unicode/string/negative.cpp @@ -0,0 +1,61 @@ +#include "utf8.h" + +using namespace iris::utflib; + +#include +#include +#include +#include + +using namespace std; + +const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; +const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned); + +int main(int argc, char** argv) +{ + string test_file_path; + if (argc == 2) + test_file_path = argv[1]; + else { + cout << "Wrong number of arguments" << endl; + return 1; + } + // Open the test file + ifstream fs8(test_file_path.c_str()); + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 1; + } + + // Read it line by line + unsigned int line_count = 0; + char byte; + while (!fs8.eof()) { + string line; + while ((byte = static_cast(fs8.get())) != '\n' && !fs8.eof()) + line.push_back(byte); + + line_count++; + bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END); + // Print out lines that contain unexpected invalid UTF-8 + if (!is_valid(line.begin(), line.end())) { + if (expected_valid) { + cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; + return 1; + } + + // try fixing it: + string fixed_line; + replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); + if (!is_valid(fixed_line.begin(), fixed_line.end())) { + cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n'; + return 1; + } + } + else if (!expected_valid) { + cout << "Invalid utf-8 NOT detected at line " << line_count << '\n'; + return 1; + } + } +} diff --git a/test/unicode/string/test_cpp11.cpp b/test/unicode/string/test_cpp11.cpp new file mode 100644 index 0000000..9de19be --- /dev/null +++ b/test/unicode/string/test_cpp11.cpp @@ -0,0 +1,117 @@ +#include "ftest.h" + +#include "utf8.h" + +#include + +using namespace iris::utflib; +using namespace std; + +TEST(CPP11APITests, test_append) +{ + string u; + append(0x0448, u); + EXPECT_EQ (u[0], char(0xd1)); + EXPECT_EQ (u[1], char(0x88)); + EXPECT_EQ (u.length(), 2); + + u.clear(); + append(0x65e5, u); + EXPECT_EQ (u[0], char(0xe6)); + EXPECT_EQ (u[1], char(0x97)); + EXPECT_EQ (u[2], char(0xa5)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x3044, u); + EXPECT_EQ (u[0], char(0xe3)); + EXPECT_EQ (u[1], char(0x81)); + EXPECT_EQ (u[2], char(0x84)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x10346, u); + EXPECT_EQ (u[0], char(0xf0)); + EXPECT_EQ (u[1], char(0x90)); + EXPECT_EQ (u[2], char(0x8d)); + EXPECT_EQ (u[3], char(0x86)); + EXPECT_EQ (u.length(), 4); +} + +TEST(CPP11APITests, test_append16) +{ + u16string u; + append16(0x0448, u); + EXPECT_EQ (u[0], char16_t(0x0448)); + EXPECT_EQ (u.length(), 1); +} + +TEST(CPP11APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string u = utf16to8(utf16string); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP11APITests, test_utf8to16) +{ + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); + // Just to make sure it compiles with string literals + EXPECT_EQ(utf8to16(u8"simple"), u"simple"); + EXPECT_EQ(utf8to16("simple"), u"simple"); +} + +TEST(CPP11APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + string utf8result = utf32to8(utf32string); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP11APITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP11APITests, test_find_invalid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP11APITests, test_is_valid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP11APITests, test_replace_invalid) +{ + string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP11APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} diff --git a/test/unicode/string/test_cpp17.cpp b/test/unicode/string/test_cpp17.cpp new file mode 100644 index 0000000..2d3756c --- /dev/null +++ b/test/unicode/string/test_cpp17.cpp @@ -0,0 +1,86 @@ +#include "ftest.h" + +#include "utf8.h" + +#include + +using namespace iris::utflib; +using namespace std; + +TEST(CPP17APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP17APITests, test_utf8to16) +{ + string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP17APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview(utf32string); + string utf8result = utf32to8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP17APITests, test_utf8to32) +{ + string_view twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP17APITests, test_find_invalid) +{ + string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP17APITests, test_is_valid) +{ + string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP17APITests, test_replace_invalid) +{ + string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP17APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + string_view byte_order_mark_view(byte_order_mark); + bool bbom = starts_with_bom(byte_order_mark_view); + EXPECT_TRUE (bbom); + string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} + +TEST(CPP17APITests, string_class_and_literals) +{ + const char* twochars = "ab"; + EXPECT_TRUE (is_valid(twochars)); + const string two_chars_string(twochars); + EXPECT_TRUE (is_valid(two_chars_string)); +} diff --git a/test/unicode/string/test_cpp20.cpp b/test/unicode/string/test_cpp20.cpp new file mode 100644 index 0000000..330027d --- /dev/null +++ b/test/unicode/string/test_cpp20.cpp @@ -0,0 +1,79 @@ +#include "ftest.h" + +#include "utf8.h" + +#include + +using namespace iris::utflib; +using namespace std; + +TEST(CPP20APITests, test_utf16tou8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview{utf16string}; + u8string u = utf16tou8(utf16string); + EXPECT_EQ (u.size(), 10); + u = utf16tou8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP20APITests, tes20t_utf8to16) +{ + u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP20APITests, test_utf32tou8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview{utf32string}; + u8string utf8result = utf32tou8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP20APITests, test_utf8to32) +{ + u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP20APITests, test_find_invalid) +{ + u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP20APITests, test_is_valid) +{ + u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP20APITests, test_replace_invalid) +{ + u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); + u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const u8string fixed_invalid_sequence = reinterpret_cast("a????z"); + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP20APITests, test_starts_with_bom) +{ + u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} diff --git a/test/unicode/string/test_data/utf8_invalid.txt b/test/unicode/string/test_data/utf8_invalid.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae83159328313d3ba57b4f51ccb96db3dbfd79f1 GIT binary patch literal 20010 zcmdU1X_MU6bovBN~mKh6fF_0QAgsswpbL0B%%C@w9FK+vZcy8Zq1!0g4#f{pMW{IDslKE-otkk?5=P@_gAEfdHgKi|B z)6KH}<@4tc4-cDO*!0|9)AgEz{qtYa+uL8iaQ^dTMb=kWH&#~HSJo~$&h>6!#*Dho@!}-$y=+-} zZrF*Fp5I=UIFUH(`pIFC`VvH$4^Fcn+B29=>OVE`BhOEpq9iFbAT5eB>G^JyNft}X z7Z>0WFvZeHX99kq@03yXYq|L;(&dH9ts7p$uh^(gsw*RC$vY zY4j=~Y%hh*Ls*{|AHZ86gsAN%L)t()AWLb*#wIOE+*A(zFyznNBEzA8Rd`-IH)U&f zl+BGO;P3Eo+jsggj1)FaA*9fC*p>*2haGtyviXSFv1s@(Ed6 zy|{UCW$iNDZgp8wql;8_np>*S_2!1R46wF;5O}@=$MJ&$>lhf@1q;MT1j1~2xe?_7i8soFiY{eZsL0IJi`?X4n~y*;FwnIw&Q*V!K8y$>Sqno<}~ap zxg=aT3*GE6pk2;*FQ*rt3UdfN7l;l#mjeu74-IJ{(n-mr;kqdg$y0+MqwIUVID($= zG|Z_F+i|@vz?=zlqf5pBQa1vz!!p=R=vIe>T=mJGk5FQ{lBF9D?(XjHoLiQ|Zh&#Z zINqnIT992B>@!+mY^a@lK-LIZ@S`3Au0}Lh(x4YGx}?KiD-Jal^@6=_W@6P|9Jd7m zcInaw0X+|_wF8ZOgCOl6AX-Hh}9Dt0Fw9I6j?$C z5B8!sq4kz*6%4`L5@!&Vq**nj3a^Cc(Ve=r!$Ua%e4yv<`%c?$AbGI*hz-5=X%o z8|Vz8O-)9Upx1NveCQaw6r)v~%Vz3oqE6jmDvj%pT$QVj?=;{s%0$RSz$lKW7fYa7 z3Yw-Fr9Mr_h1xW86_brufYKN> z;4w#@-$#xF6?{KCM6^Ska10Ua&9&y5s&>7J`+^nFo4Ft`ildc2{3TOgedx)DuU#WA zSrUh-bK^l|j8%P^w_CfQC>NqpECww2Q+izRdgQlP4udwRqpL3?6To&!X_aRyNh2#6 z^rX*u#$Yt>g1N#?ePjUypS+y3i4X6kg`2faZaK;KU>}di8oiL! zQ5GOvxFH-Wf*Tw_cP3FGfYRSU-}Hc*mr$K0Fly#kV1zJkSKwG3q;@)uM}ts>F43pK znHGF$kPXz`Qmr5>RpL235Ds)ukYMADkZHW$Mm^AG(bgJ*c92zxS=Jqk#5yC(S+g-w z#aIod8u0PGA@H554=&S+Bg6s?mKx6Ks<{cNg+PkVBP^S>o>rrWF{GE})Yr&4^{X>q z964(uPqNA`Y|eysR_ZffxW|`^r|*j}zskMP#6n*_Kk@y2IYjX&5A4Tj&wrqhXye5X zeC&nyfAr}WKltMN8h0mk@l>vhT$!r5jq5Y#r+24i)^E;$KEzhpk36BL&U20S4A_Q zBIX>R_iNDO?^8R_%@e2}X0GgvKYi~{XJOtOG;b4RcOXFdT*2hzQb$@H+&1pKr?&8v zeC3!1ZV;~`-Vo83?N$B*;>YVpn+hVNXb_>`9U>}$tu_%~zWG93cc~!4!il)HN|OOh zXH&Kfs51WQyXu>bAnriXO<1@>YAP_0my~V8`Kxne1VY~19_D#s1sz-&Iak1}D&0RJ&LiOnofNn*|25_z`kt zh0AWmHJUNiV?7sOMK(T+n#FTHQyls)KlknU--~m9x*=l;`OniQyr|GV*xGAuxndg8yp|J#-}r(kMW+X%Ny%vLo`pj zu#+YH!-;o#PqjL;l==)vyJ^6k41NlSwBl%A*3;9d=EBR}AN$MA>i{}^iM+>BW*Dg-#cVmWred^bLQ|tG(d`$jE{#O1@ z{$Bn;{!#u({#pJ-{#E`>?#RdG<^Okmpx&T9wkOrGIYS*Cs-v?t+uh3i@fq!JugJg4 zf5?BztMXs+-|~q$Rsw%Dji*%r4Nt#XuqlT@w$9!0lJ`0&O#rERz;Cstl`FjfI~CY-KH*wC zeKZsnDcZTEO)c2G!zLd#6pON@Iz!i<1UoIyw^U1lXU)$|kAZIJ7f9UM(qU@j)0uHx1rmJ~9yQ0b-8TMMd{5G(Jh5B!z z(yQoME5JgvXe+90Pc&{#RyLf!X~jEHS=7Y4eLNcwCODBBYXZJ9z3ym8M980r?llo# zo3U|XAQ81D0#r@N*Jo|U7);F6Yiq`A!I?dU#!IsWiJ5wB%_tt%Nik)`a_ekBOx0^^ zh6}I6Q$TDb%tplIy0&IkR?7{A26qx>L&D2yYi6agpU}{aglSCcjtjY=reTf`qtgd# zhwRueJsJTTIwvAoT4%t;#|Xw)hAhajh^ZQG;p<$c$;oCn78)+9F%mD7ohAkCBiq`+ zUMD6DyI5LlPFUa+KNrt0kQ=vdA0v3s+toc^dif?SmL@jSCJI{W{#-oGx)E`$XZKT0 zPOMAMg+KZ2n_2nV}G!joR1Q@B8RBMcT8Cp+yH^rfrww6fT}V$d7rj<8yQA zjY*#$AN}5+=+hTC@9yZn(x(@LhcpQK_^*=#zwySWPldgB`5}Tmfi^d7IxK_4y(KmK ziN(`5Zhs|(MdNoFr_knB_PI0cp_pWED(W_Sjo5x@`|8)52#gsUh?90x9*Gax@TdJ| zXqrPi8HEA>yn@C^w85t-d;Qr$S6m}!((Fx7G(5r_6>-_4h?O+7(V^M3J>VIUuHR#S zc_VdU5~dHV#LK%B8$~!y{I{W=)R#;{vvz@(T$d8jhEMos^%|kK)=hCN;j(rP<6sKU zacHNyHZHTFJ!Drrau<=ot>$Wrtu61d-eN87CrI zzG;lr?odNO4hVh3lJ@nSjGoCzalixP<$ateuTe$MuusSmEr=yuC4JTQN!z$Mn=EbV z^zl1kbK6WPDEz!z*3Ji-vWsuW zu93qdH9RWB6Bc=iShUfz(G*bvf?axa_Z%AU^o$2&1BxI(ZQ_UdRRKrZ)r(|4-~pzb zDtL=7w&{y71wu&S^lG=au6bfaBYn_mr0Tnmn{qgt`r zqN6s1HE0f?bB$fV3;I48gD}kT0Zfku3D1}5ffjn_LEs-Q7t%xN1a0L<(D2Ji|0wO9 zr?5=7D_s`qn3TWz@LMQ{w{Wkg@WU$8^pCQc9xNIRlj4xn$WY=U3Q&?<(c9{v%{<1T zopb;*RfDcFjChc8L!4|C&C?Zzmf5XqG*k8dT#h*7rF>KXNuT<^^~>=i0We6Im^mtf zIW0~?Q4?xVEYSQTob13P)n1E_&zsD?S8*^8bN6B{Z;74Ih$Z8$H;bq53%T$C(Uh?^ zm$qr3hn}e8vcl=G@7gjjNod7h%9uicNlsI&wRB~valV0GYI9l(c4w3c-zqNpFAw!p z%I$FlFROx!paECb<4_qoxAm+d{h16F7xU4~Lmsx}noA%Mhmxm9*-Y`@j`g$)&IIz; z_Cn1+CAiPFzn%YP@%n}R!2-FwvChUI562TK`5qLaEWGu6?K*Dc5OBQ&^tC6biV48m z$MYg?l!%?8%gBNP5p$M#AvY_K+j+|tdY23;AUOw`7xO{|(9XU1<6)dtytL7|qe~z<}U$s32$R=i>DNdtrhy?T9?G#2A70_o*21 z;i)yKDeCZr8X!$D2h4jgSMwfB`wC-3y=F{9 z_BRqyArT|C$hi3jRK|#R%vgN0TvTohrspnSyn0(!qipS^{CX0Q;*gDV)62We$}1LK}1dW$W7G?26$^~H=_DL^As2yz}wTi5ml7xy+Ncq7l=09TTmb(-OouR-PuA`(!D_iB-8zzm~>~0IiBtT z3rtP-bAVPQasT%t2LxWrj}_}5Lcl7i`6~;j0$;p2j=S?j!YdrHmMfE?$^v_J&BPz( zf9S!6Gcl_!_B?tS&XC>9)hSK}3{AyEU8@dr98^!h_OSQMWd)q~%@?Q2*}3@}8EHk* z2oIc{o4+H1B@tVjOfX(Ib}U{WTnc{DZ0wRIG)`ffp2T}tpQtiY;pwE?GojCfuN@p{ zgM)FEMP~!>wL8z$b~H;gJjAqV-*|B6iq2-Ol8zj)jXQ;QB+WkTG76{TR+c*vAF17y5@tu1s zUU@e8zQDA~DKV{W0`uOAGMKr?zz4_ZR976M^VyrV>CAn`V3u@ROiL#)>(XhwhMqk} zr<(d0ozK;mr}3K{v!v5vS~`JQmrmn3j2B~cD%2RA&)1hHdL96lW0rJUOiL#)>(F`M VE;t6T_-vE^`IE6xPXA%#{2v5ASmOWy literal 0 HcmV?d00001 From 1d46090e99696f5201300a7b8b14d4acc0edcbbd Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 15:36:24 +0900 Subject: [PATCH 02/17] Combine unit tests --- include/iris/unicode/string.hpp | 239 +++++---- test/unicode/string/CMakeLists.txt | 11 + test/unicode/string/apitests.cpp | 257 --------- test/unicode/string/string.cpp | 488 ++++++++++++++++++ test/unicode/string/test_cpp11.cpp | 117 ----- test/unicode/string/test_cpp17.cpp | 86 --- test/unicode/string/test_cpp20.cpp | 79 --- .../string/{negative.cpp => utf8_invalid.cpp} | 0 8 files changed, 618 insertions(+), 659 deletions(-) delete mode 100644 test/unicode/string/apitests.cpp create mode 100644 test/unicode/string/string.cpp delete mode 100644 test/unicode/string/test_cpp11.cpp delete mode 100644 test/unicode/string/test_cpp17.cpp delete mode 100644 test/unicode/string/test_cpp20.cpp rename test/unicode/string/{negative.cpp => utf8_invalid.cpp} (100%) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index 08d8d1f..a09ebc8 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -26,77 +26,77 @@ DEALINGS IN THE SOFTWARE. */ -#ifndef IRIS_UTFLIB_UTF8_H -#define IRIS_UTFLIB_UTF8_H +#ifndef IRIS_UNICODE_STRING_HPP +#define IRIS_UNICODE_STRING_HPP #include -#include +#include #include #include #include #include +#include +#include -#include - -namespace iris::utflib +namespace iris::unicode { - template + template concept octet = std::integral && sizeof(T) == 1; - template + template concept utf8char = octet && (std::same_as || std::same_as); - - template + + template concept utf16char = std::same_as; - - template + + template concept utf32char = std::same_as; - template - concept octet_iterator = std::input_iterator && octet>; + template + concept octet_input_iterator = std::input_iterator && octet>; - template - concept utf8_iterator = octet_iterator && utf8char>; + template + concept utf8_input_iterator = octet_input_iterator && utf8char>; - template - concept utf16_iterator = std::input_iterator && utf16char>; - - template - concept utf32_iterator = std::input_iterator && utf32char>; + template + concept utf16_input_iterator = std::input_iterator && utf16char>; + + template + concept utf32_input_iterator = std::input_iterator && utf32char>; namespace traits { - template + template struct is_nothrow_dereferenceable : std::false_type {}; - template + template struct is_nothrow_dereferenceable())>> : std::bool_constant())> {}; - template + template inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable::value; - - template + + template struct is_nothrow_prefix_incrementable : std::false_type {}; - template + template struct is_nothrow_prefix_incrementable())>> : std::bool_constant())> {}; - template + template inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable::value; - - template + + template struct is_nothrow_postfix_incrementable : std::false_type {}; - template + template struct is_nothrow_postfix_incrementable()++)>> : std::bool_constant()++)> {}; - template + template inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable::value; - template + template struct is_nothrow_sentinel : std::false_type {}; - template + template requires std::sentinel_for struct is_nothrow_sentinel : std::bool_constant< noexcept(std::declval() == std::declval()) && @@ -106,7 +106,7 @@ namespace iris::utflib > {}; - template + template inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; } // namespace traits @@ -128,7 +128,7 @@ namespace iris::utflib enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT }; - template + template [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept { return static_cast(0xff & oc); @@ -139,7 +139,7 @@ namespace iris::utflib return static_cast(0xffff & oc); } - template + template [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept { return ((internal::mask8(oc) >> 6) == 0x2); @@ -185,7 +185,7 @@ namespace iris::utflib return false; } - template + template [[nodiscard]] constexpr int sequence_length(It lead_it) noexcept(traits::is_nothrow_dereferenceable_v) { @@ -203,7 +203,7 @@ namespace iris::utflib } /// Helper for get_sequence_x - template Se> + template Se> constexpr utf_error increase_safely(It& it, Se end) noexcept(std::conjunction_v< traits::is_nothrow_dereferenceable, @@ -228,7 +228,7 @@ namespace iris::utflib } while (false) /// get_sequence_x functions decode utf-8 sequences of the length x - template Se> + template Se> constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< traits::is_nothrow_dereferenceable, @@ -243,7 +243,7 @@ namespace iris::utflib return utf_error::OK; } - template Se> + template Se> constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< traits::is_nothrow_dereferenceable, @@ -263,7 +263,7 @@ namespace iris::utflib return utf_error::OK; } - template Se> + template Se> constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< traits::is_nothrow_dereferenceable, @@ -287,7 +287,7 @@ namespace iris::utflib return utf_error::OK; } - template Se> + template Se> constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< traits::is_nothrow_dereferenceable, @@ -317,7 +317,7 @@ namespace iris::utflib #undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR - template Se> + template Se> requires std::forward_iterator constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< @@ -376,7 +376,7 @@ namespace iris::utflib return err; } - template Se> + template Se> requires std::forward_iterator constexpr utf_error validate_next(It& it, Se end) noexcept(noexcept(internal::validate_next(it, end, std::declval()))) @@ -385,7 +385,7 @@ namespace iris::utflib return internal::validate_next(it, end, ignored); } - template Se> + template Se> requires std::forward_iterator constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< @@ -429,10 +429,9 @@ namespace iris::utflib return err; } - template > - requires std::output_iterator - constexpr It append(char32_t cp, It result) - noexcept(noexcept(*result++ = std::declval())) + template> + requires std::output_iterator + constexpr OutIt append(char32_t cp, OutIt result) noexcept { if (cp < 0x80) // one octet *(result++) = static_cast(cp); @@ -452,14 +451,14 @@ namespace iris::utflib return result; } - template + template constexpr std::back_insert_iterator append(char32_t cp, std::back_insert_iterator result) - noexcept(noexcept(internal::append, typename container_type::value_type>(cp, result))) + noexcept(noexcept(internal::append, class container_type::value_type>(cp, result))) { - return internal::append, typename container_type::value_type>(cp, result); + return internal::append, class container_type::value_type>(cp, result); } - template It> + template It> constexpr It append16(char32_t cp, It result) noexcept(noexcept(*result++ = std::declval())) { @@ -534,7 +533,7 @@ namespace iris::utflib // Byte order mark constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; - template Se> + template Se> [[nodiscard]] constexpr It find_invalid(It it, Se se) noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v) { @@ -547,39 +546,39 @@ namespace iris::utflib } [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) - noexcept(noexcept(utflib::find_invalid(s.begin(), s.end()))) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) { - std::string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end()); + std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); } [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) - noexcept(noexcept(utflib::find_invalid(s.begin(), s.end()))) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) { - std::u8string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end()); + std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); return (invalid == s.end()) ? std::u8string_view::npos : static_cast(invalid - s.begin()); } - template Se> + template Se> [[nodiscard]] constexpr bool is_valid(It it, Se se) - noexcept(noexcept(utflib::find_invalid(it, se)) && traits::is_nothrow_sentinel_v) + noexcept(noexcept(unicode::find_invalid(it, se)) && traits::is_nothrow_sentinel_v) { - return (utflib::find_invalid(it, se) == se); + return (unicode::find_invalid(it, se) == se); } [[nodiscard]] constexpr bool is_valid(std::string_view s) - noexcept(noexcept(utflib::is_valid(s.begin(), s.end()))) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) { - return utflib::is_valid(s.begin(), s.end()); + return unicode::is_valid(s.begin(), s.end()); } [[nodiscard]] constexpr bool is_valid(std::u8string_view s) - noexcept(noexcept(utflib::is_valid(s.begin(), s.end()))) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) { - return utflib::is_valid(s.begin(), s.end()); + return unicode::is_valid(s.begin(), s.end()); } - template Se> + template Se> [[nodiscard]] constexpr bool starts_with_bom(It it, Se end) noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v) { @@ -587,19 +586,19 @@ namespace iris::utflib } [[nodiscard]] constexpr bool starts_with_bom(std::string_view s) - noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end()))) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) { - return utflib::starts_with_bom(s.begin(), s.end()); + return unicode::starts_with_bom(s.begin(), s.end()); } [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) - noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end()))) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) { - return utflib::starts_with_bom(s.begin(), s.end()); + return unicode::starts_with_bom(s.begin(), s.end()); } - template // TODO: add constraints - constexpr It append(char32_t cp, It result) + template + constexpr OutIt append(char32_t cp, OutIt result) { if (!internal::is_code_point_valid(cp)) throw invalid_code_point(cp); @@ -609,15 +608,15 @@ namespace iris::utflib constexpr void append(char32_t cp, std::string& s) { - utflib::append(cp, std::back_inserter(s)); + unicode::append(cp, std::back_inserter(s)); } constexpr void append(char32_t cp, std::u8string& s) { - utflib::append(cp, std::back_inserter(s)); + unicode::append(cp, std::back_inserter(s)); } - template // TODO: add constraints + template // TODO: add constraints constexpr It append16(char32_t cp, It result) { if (!internal::is_code_point_valid(cp)) @@ -628,10 +627,10 @@ namespace iris::utflib constexpr void append16(char32_t cp, std::u16string& s) { - utflib::append16(cp, std::back_inserter(s)); + unicode::append16(cp, std::back_inserter(s)); } - template Se, typename Out> // TODO: add constraints + template Se, class Out> // TODO: add constraints constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) { while (start != end) { @@ -643,17 +642,17 @@ namespace iris::utflib *out++ = *it; break; case internal::utf_error::NOT_ENOUGH_ROOM: - out = utflib::append(replacement, out); + out = unicode::append(replacement, out); start = end; break; case internal::utf_error::INVALID_LEAD: - out = utflib::append(replacement, out); + out = unicode::append(replacement, out); ++start; break; case internal::utf_error::INCOMPLETE_SEQUENCE: case internal::utf_error::OVERLONG_SEQUENCE: case internal::utf_error::INVALID_CODE_POINT: - out = utflib::append(replacement, out); + out = unicode::append(replacement, out); ++start; // just one replacement mark for the sequence while (start != end && internal::is_trail(*start)) @@ -664,11 +663,11 @@ namespace iris::utflib return out; } - template Se, typename Out> // TODO: add constraints + template Se, class Out> // TODO: add constraints constexpr Out replace_invalid(It start, Se end, Out out) { constexpr char32_t replacement_marker = static_cast(internal::mask16(0xfffd)); - return utflib::replace_invalid(start, end, out, replacement_marker); + return unicode::replace_invalid(start, end, out, replacement_marker); } [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) @@ -699,7 +698,7 @@ namespace iris::utflib return result; } - template Se> + template Se> [[nodiscard]] constexpr char32_t next(It& it, Se end) { char32_t cp = 0; @@ -719,7 +718,7 @@ namespace iris::utflib return cp; } - template Se> + template Se> [[nodiscard]] constexpr char32_t next16(It& it, Se end) { char32_t cp = 0; @@ -729,13 +728,13 @@ namespace iris::utflib return cp; } - template Se> + template Se> [[nodiscard]] constexpr char32_t peek_next(It it, Se end) { - return utflib::next(it, end); + return unicode::next(it, end); } - template Se> + template Se> [[nodiscard]] constexpr char32_t prior(It& it, Se start) { // can't do much if it == start @@ -747,34 +746,34 @@ namespace iris::utflib while (internal::is_trail(*(--it))) if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence - return utflib::peek_next(it, end); + return unicode::peek_next(it, end); } - template Se, typename distance_type> + template Se, class distance_type> constexpr void advance(It& it, distance_type n, Se end) { const distance_type zero(0); if (n < zero) { // backward for (distance_type i = n; i < zero; ++i) - (void)utflib::prior(it, end); + (void)unicode::prior(it, end); } else { // forward for (distance_type i = zero; i < n; ++i) - (void)utflib::next(it, end); + (void)unicode::next(it, end); } } - template Se> - [[nodiscard]] constexpr typename std::iterator_traits::difference_type distance(It first, Se last) + template Se> + [[nodiscard]] constexpr class std::iterator_traits::difference_type distance(It first, Se last) { - typename std::iterator_traits::difference_type dist; + class std::iterator_traits::difference_type dist; for (dist = 0; first != last; ++dist) - (void)utflib::next(first, last); + (void)unicode::next(first, last); return dist; } - template Se, typename OutIt> // TODO: add constraints + template Se, class OutIt> // TODO: add constraints constexpr OutIt utf16to8(It start, Se end, OutIt result) { while (start != end) { @@ -795,7 +794,7 @@ namespace iris::utflib else if (internal::is_trail_surrogate(cp)) throw invalid_utf16(static_cast(cp)); - result = utflib::append(cp, result); + result = unicode::append(cp, result); } return result; } @@ -803,22 +802,22 @@ namespace iris::utflib [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) { std::string result; - utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); return result; } [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s) { std::u8string result; - utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); return result; } - template Se, typename OutIt> // TODO: add constraints + template Se, class OutIt> // TODO: add constraints constexpr OutIt utf8to16(It start, Se end, OutIt result) { while (start != end) { - const char32_t cp = utflib::next(start, end); + const char32_t cp = unicode::next(start, end); if (cp > 0xffff) { // make a surrogate pair *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); @@ -831,22 +830,22 @@ namespace iris::utflib [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) { std::u16string result; - utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); return result; } [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s) { std::u16string result; - utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); return result; } - template Se, typename OutIt> // TODO: add constraints + template Se, class OutIt> // TODO: add constraints constexpr OutIt utf32to8(It start, Se end, OutIt result) { while (start != end) - result = utflib::append(*(start++), result); + result = unicode::append(*(start++), result); return result; } @@ -854,22 +853,22 @@ namespace iris::utflib [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) { std::string result; - utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); return result; } [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s) { std::u8string result; - utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); return result; } - template Se, typename OutIt> + template Se, class OutIt> constexpr OutIt utf8to32(It start, Se end, OutIt result) { while (start != end) - (*result++) = utflib::next(start, end); + (*result++) = unicode::next(start, end); return result; } @@ -877,19 +876,19 @@ namespace iris::utflib [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) { std::u32string result; - utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); return result; } [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s) { std::u32string result; - utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); return result; } // The iterator class - template + template class iterator { It it; @@ -920,7 +919,7 @@ namespace iris::utflib [[nodiscard]] constexpr char32_t operator*() const { It temp = it; - return utflib::next(temp, range_end); + return unicode::next(temp, range_end); } [[nodiscard]] constexpr bool operator==(const iterator& rhs) const { @@ -930,28 +929,28 @@ namespace iris::utflib } constexpr iterator& operator++() { - (void)utflib::next(it, range_end); + (void)unicode::next(it, range_end); return *this; } constexpr iterator operator++(int) { iterator temp = *this; - (void)utflib::next(it, range_end); + (void)unicode::next(it, range_end); return temp; } constexpr iterator& operator--() { - (void)utflib::prior(it, range_start); + (void)unicode::prior(it, range_start); return *this; } constexpr iterator operator--(int) { iterator temp = *this; - (void)utflib::prior(it, range_start); + (void)unicode::prior(it, range_start); return temp; } }; // class iterator -} // namespace iris::utflib +} // iris::unicode -#endif // header guard +#endif diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt index 4fd1e27..ebfa8d4 100644 --- a/test/unicode/string/CMakeLists.txt +++ b/test/unicode/string/CMakeLists.txt @@ -1,3 +1,14 @@ # SPDX-License-Identifier: MIT +set( + IRIS_TEST_UNICODE_STRING_TESTS + string + utf8_invalid +) +foreach(test_name IN LISTS IRIS_TEST_UNICODE_STRING_TESTS) + iris_define_test(unicode_string_${test_name} ${test_name}.cpp) + set_target_properties(unicode_string_${test_name}_test PROPERTIES FOLDER "test/unicode/string") +endforeach() + +target_sources(unicode_string_utf8_invalid_test PRIVATE test_data/utf8_invalid.txt) diff --git a/test/unicode/string/apitests.cpp b/test/unicode/string/apitests.cpp deleted file mode 100644 index ba8fa90..0000000 --- a/test/unicode/string/apitests.cpp +++ /dev/null @@ -1,257 +0,0 @@ -#include "ftest.h" - -#include "utf8.h" - -#include - -#include - -using namespace iris::utflib; -using namespace std; - -TEST(CheckedAPITests, test_append) -{ - unsigned char u[5] = {0,0,0,0,0}; - append(0x0448, u); - EXPECT_EQ (u[0], 0xd1); - EXPECT_EQ (u[1], 0x88); - EXPECT_EQ (u[2], 0); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - append(0x65e5, u); - EXPECT_EQ (u[0], 0xe6); - EXPECT_EQ (u[1], 0x97); - EXPECT_EQ (u[2], 0xa5); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - append(0x3044, u); - EXPECT_EQ (u[0], 0xe3); - EXPECT_EQ (u[1], 0x81); - EXPECT_EQ (u[2], 0x84); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - append(0x10346, u); - EXPECT_EQ (u[0], 0xf0); - EXPECT_EQ (u[1], 0x90); - EXPECT_EQ (u[2], 0x8d); - EXPECT_EQ (u[3], 0x86); - EXPECT_EQ (u[4], 0); - - // Ensure no warnings with plain char - char c[2] = {0,0}; - append('a', c); - EXPECT_EQ (c[0], 'a'); - EXPECT_EQ (c[1], 0); -} - -TEST(CheckedAPITests, test_append16) -{ - char16_t u[5] = {0,0}; - append16(0x0448, u); - EXPECT_EQ (u[0], 0x0448); - EXPECT_EQ (u[1], 0x0000); - - append16(0x65e5, u); - EXPECT_EQ (u[0], 0x65e5); - EXPECT_EQ (u[1], 0x0000); - - append16(0x10346, u); - EXPECT_EQ (u[0], 0xd800); - EXPECT_EQ (u[1], 0xdf46); -} - -TEST(CheckedAPITests, test_next) -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - const char* w = twochars; - unsigned int cp = next(w, twochars + 6); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, twochars + 3); - - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - w = threechars; - - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, threechars + 4); - - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, threechars + 7); - - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x0448); - EXPECT_EQ (w, threechars + 9); -} - -TEST(CheckedAPITests, test_next16) -{ - const char16_t u[3] = {0x65e5, 0xd800, 0xdf46}; - const char16_t* w = u; - char32_t cp = next16(w, w + 3); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, u + 1); - - cp = next16(w, w + 2); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, u + 3); -} - -TEST(CheckedAPITests, test_peek_next) -{ - const char* const cw = "\xe6\x97\xa5\xd1\x88"; - unsigned int cp = peek_next(cw, cw + 6); - EXPECT_EQ (cp, 0x65e5); -} - -TEST(CheckedAPITests, test_prior) -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - const char* w = twochars + 3; - unsigned int cp = prior (w, twochars); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, twochars); - - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - w = threechars + 9; - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x0448); - EXPECT_EQ (w, threechars + 7); - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, threechars + 4); - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, threechars); -} - -TEST(CheckedAPITests, test_advance) -{ - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - const char* w = threechars; - advance(w, 2, threechars + 9); - EXPECT_EQ(w, threechars + 7); - advance(w, -2, threechars); - EXPECT_EQ(w, threechars); - advance(w, 3, threechars + 9); - EXPECT_EQ(w, threechars + 9); - advance(w, -2, threechars); - EXPECT_EQ(w, threechars + 4); - advance(w, -1, threechars); - EXPECT_EQ(w, threechars); -} - -TEST(CheckedAPITests, test_distance) -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - size_t dist = static_cast(iris::utflib::distance(twochars, twochars + 5)); - EXPECT_EQ (dist, 2); -} - -TEST(CheckedAPITests, test_utf32to8) -{ - char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; - string utf8result; - iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - EXPECT_EQ (utf8result.size(), 9); -} - -TEST(CheckedAPITests, test_utf8to32) -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - vector utf32result; - iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - EXPECT_EQ (utf32result.size(), 2); -} - -TEST(CheckedAPITests, test_utf16to8) -{ - char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - string utf8result; - iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - EXPECT_EQ (utf8result.size(), 10); -} - -TEST(CheckedAPITests, test_utf8to16) -{ - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - vector utf16result; - iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); -} - -TEST(CheckedAPITests, test_replace_invalid) -{ - char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - vector replace_invalid_result; - replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); - bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); - EXPECT_TRUE (bvalid); - const char fixed_invalid_sequence[] = "a????z"; - EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); - EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); -} - -TEST(CheckedAPITests, test_find_invalid) -{ - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); - EXPECT_EQ (invalid, utf_invalid + 5); - invalid = utf_invalid + find_invalid(utf_invalid); - EXPECT_EQ (invalid, utf_invalid + 5); -} - -TEST(CheckedAPITests, test_is_valid) -{ - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - EXPECT_FALSE (bvalid); - bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); - EXPECT_TRUE (bvalid); - bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST(CheckedAPITests, test_starts_with_bom) -{ - unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); - EXPECT_TRUE (bbom); - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); - EXPECT_FALSE (no_bbom); -} - -TEST(CheckedIteratrTests, test_increment) -{ - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars, threechars, threechars + 9); - iris::utflib::iterator it2 = it; - EXPECT_EQ (it2, it); - EXPECT_EQ (*it, 0x10346); - EXPECT_EQ (*(++it), 0x65e5); - EXPECT_EQ ((*it++), 0x65e5); - EXPECT_EQ (*it, 0x0448); - EXPECT_NE (it, it2); - iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); - EXPECT_EQ (++it, endit); -} - -TEST(CheckedIteratrTests, test_decrement) -{ - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars+9, threechars, threechars + 9); - EXPECT_EQ (*(--it), 0x0448); - EXPECT_EQ ((*it--), 0x0448); - EXPECT_EQ (*it, 0x65e5); - EXPECT_EQ (--it, iris::utflib::iterator(threechars, threechars, threechars + 9)); - EXPECT_EQ (*it, 0x10346); -} diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp new file mode 100644 index 0000000..abc34a2 --- /dev/null +++ b/test/unicode/string/string.cpp @@ -0,0 +1,488 @@ +#include "iris_test.hpp" + +#include + +#include + +namespace iris_unicode_test { + +namespace unicode = iris::unicode; + +using namespace iris::unicode; +using namespace std; + +TEST_CASE("append") +{ + unsigned char u[5] = {0, 0, 0, 0, 0}; + unicode::append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + unicode::append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + unicode::append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + unicode::append(0x10346, u); + EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); +} + +#if 0 + +TEST(CheckedAPITests, test_append16) +{ + char16_t u[5] = {0, 0}; + append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + +TEST(CheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + unsigned int cp = next(w, twochars + 6); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(CheckedAPITests, test_next16) +{ + const char16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const char16_t* w = u; + char32_t cp = next16(w, w + 3); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = next16(w, w + 2); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + +TEST(CheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + unsigned int cp = peek_next(cw, cw + 6); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(CheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + unsigned int cp = prior (w, twochars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(CheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + advance(w, 2, threechars + 9); + EXPECT_EQ(w, threechars + 7); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars); + advance(w, 3, threechars + 9); + EXPECT_EQ(w, threechars + 9); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars + 4); + advance(w, -1, threechars); + EXPECT_EQ(w, threechars); +} + +TEST(CheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = static_cast(iris::utflib::distance(twochars, twochars + 5)); + EXPECT_EQ (dist, 2); +} + +TEST(CheckedAPITests, test_utf32to8) +{ + char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector utf32result; + iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CheckedAPITests, test_utf16to8) +{ + char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(CheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector utf16result; + iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} + +TEST(CheckedAPITests, test_find_invalid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); + invalid = utf_invalid + find_invalid(utf_invalid); + EXPECT_EQ (invalid, utf_invalid + 5); +} + +TEST(CheckedAPITests, test_is_valid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid, utf_invalid + 6); + EXPECT_FALSE (bvalid); + bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); + EXPECT_TRUE (bvalid); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CheckedAPITests, test_starts_with_bom) +{ + unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; + bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); + EXPECT_TRUE (bbom); + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); + EXPECT_FALSE (no_bbom); +} + +TEST(CheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars, threechars, threechars + 9); + iris::utflib::iterator it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(CheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars+9, threechars, threechars + 9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, iris::utflib::iterator(threechars, threechars, threechars + 9)); + EXPECT_EQ (*it, 0x10346); +} + +TEST(CPP11APITests, test_append16) +{ + u16string u; + append16(0x0448, u); + EXPECT_EQ (u[0], char16_t(0x0448)); + EXPECT_EQ (u.length(), 1); +} + +TEST(CPP11APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string u = utf16to8(utf16string); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP11APITests, test_utf8to16) +{ + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); + // Just to make sure it compiles with string literals + EXPECT_EQ(utf8to16(u8"simple"), u"simple"); + EXPECT_EQ(utf8to16("simple"), u"simple"); +} + +TEST(CPP11APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + string utf8result = utf32to8(utf32string); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP11APITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP11APITests, test_find_invalid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP11APITests, test_is_valid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP11APITests, test_replace_invalid) +{ + string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP11APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} + + +TEST(CPP17APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP17APITests, test_utf8to16) +{ + string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP17APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview(utf32string); + string utf8result = utf32to8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP17APITests, test_utf8to32) +{ + string_view twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP17APITests, test_find_invalid) +{ + string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP17APITests, test_is_valid) +{ + string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP17APITests, test_replace_invalid) +{ + string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP17APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + string_view byte_order_mark_view(byte_order_mark); + bool bbom = starts_with_bom(byte_order_mark_view); + EXPECT_TRUE (bbom); + string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} + +TEST(CPP17APITests, string_class_and_literals) +{ + const char* twochars = "ab"; + EXPECT_TRUE (is_valid(twochars)); + const string two_chars_string(twochars); + EXPECT_TRUE (is_valid(two_chars_string)); +} + + +TEST(CPP20APITests, test_utf16tou8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview{utf16string}; + u8string u = utf16tou8(utf16string); + EXPECT_EQ (u.size(), 10); + u = utf16tou8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP20APITests, tes20t_utf8to16) +{ + u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP20APITests, test_utf32tou8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview{utf32string}; + u8string utf8result = utf32tou8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP20APITests, test_utf8to32) +{ + u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP20APITests, test_find_invalid) +{ + u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP20APITests, test_is_valid) +{ + u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP20APITests, test_replace_invalid) +{ + u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); + u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const u8string fixed_invalid_sequence = reinterpret_cast("a????z"); + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP20APITests, test_starts_with_bom) +{ + u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} + +#endif + +} // iris_unicode_test diff --git a/test/unicode/string/test_cpp11.cpp b/test/unicode/string/test_cpp11.cpp deleted file mode 100644 index 9de19be..0000000 --- a/test/unicode/string/test_cpp11.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#include "ftest.h" - -#include "utf8.h" - -#include - -using namespace iris::utflib; -using namespace std; - -TEST(CPP11APITests, test_append) -{ - string u; - append(0x0448, u); - EXPECT_EQ (u[0], char(0xd1)); - EXPECT_EQ (u[1], char(0x88)); - EXPECT_EQ (u.length(), 2); - - u.clear(); - append(0x65e5, u); - EXPECT_EQ (u[0], char(0xe6)); - EXPECT_EQ (u[1], char(0x97)); - EXPECT_EQ (u[2], char(0xa5)); - EXPECT_EQ (u.length(), 3); - - u.clear(); - append(0x3044, u); - EXPECT_EQ (u[0], char(0xe3)); - EXPECT_EQ (u[1], char(0x81)); - EXPECT_EQ (u[2], char(0x84)); - EXPECT_EQ (u.length(), 3); - - u.clear(); - append(0x10346, u); - EXPECT_EQ (u[0], char(0xf0)); - EXPECT_EQ (u[1], char(0x90)); - EXPECT_EQ (u[2], char(0x8d)); - EXPECT_EQ (u[3], char(0x86)); - EXPECT_EQ (u.length(), 4); -} - -TEST(CPP11APITests, test_append16) -{ - u16string u; - append16(0x0448, u); - EXPECT_EQ (u[0], char16_t(0x0448)); - EXPECT_EQ (u.length(), 1); -} - -TEST(CPP11APITests, test_utf16to8) -{ - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - string u = utf16to8(utf16string); - EXPECT_EQ (u.size(), 10); -} - -TEST(CPP11APITests, test_utf8to16) -{ - string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); - // Just to make sure it compiles with string literals - EXPECT_EQ(utf8to16(u8"simple"), u"simple"); - EXPECT_EQ(utf8to16("simple"), u"simple"); -} - -TEST(CPP11APITests, test_utf32to8) -{ - u32string utf32string = {0x448, 0x65E5, 0x10346}; - string utf8result = utf32to8(utf32string); - EXPECT_EQ (utf8result.size(), 9); -} - -TEST(CPP11APITests, test_utf8to32) -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); -} - -TEST(CPP11APITests, test_find_invalid) -{ - string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); -} - -TEST(CPP11APITests, test_is_valid) -{ - string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST(CPP11APITests, test_replace_invalid) -{ - string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - string replace_invalid_result = replace_invalid(invalid_sequence, '?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const string fixed_invalid_sequence = "a????z"; - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); -} - -TEST(CPP11APITests, test_starts_with_bom) -{ - string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - bool bbom = starts_with_bom(byte_order_mark); - EXPECT_TRUE (bbom); - string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); -} diff --git a/test/unicode/string/test_cpp17.cpp b/test/unicode/string/test_cpp17.cpp deleted file mode 100644 index 2d3756c..0000000 --- a/test/unicode/string/test_cpp17.cpp +++ /dev/null @@ -1,86 +0,0 @@ -#include "ftest.h" - -#include "utf8.h" - -#include - -using namespace iris::utflib; -using namespace std; - -TEST(CPP17APITests, test_utf16to8) -{ - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview(utf16string); - string u = utf16to8(utf16stringview); - EXPECT_EQ (u.size(), 10); -} - -TEST(CPP17APITests, test_utf8to16) -{ - string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); -} - -TEST(CPP17APITests, test_utf32to8) -{ - u32string utf32string = {0x448, 0x65E5, 0x10346}; - u32string_view utf32stringview(utf32string); - string utf8result = utf32to8(utf32stringview); - EXPECT_EQ (utf8result.size(), 9); -} - -TEST(CPP17APITests, test_utf8to32) -{ - string_view twochars = "\xe6\x97\xa5\xd1\x88"; - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); -} - -TEST(CPP17APITests, test_find_invalid) -{ - string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); -} - -TEST(CPP17APITests, test_is_valid) -{ - string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST(CPP17APITests, test_replace_invalid) -{ - string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - string replace_invalid_result = replace_invalid(invalid_sequence, '?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const string fixed_invalid_sequence = "a????z"; - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); -} - -TEST(CPP17APITests, test_starts_with_bom) -{ - string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - string_view byte_order_mark_view(byte_order_mark); - bool bbom = starts_with_bom(byte_order_mark_view); - EXPECT_TRUE (bbom); - string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); -} - -TEST(CPP17APITests, string_class_and_literals) -{ - const char* twochars = "ab"; - EXPECT_TRUE (is_valid(twochars)); - const string two_chars_string(twochars); - EXPECT_TRUE (is_valid(two_chars_string)); -} diff --git a/test/unicode/string/test_cpp20.cpp b/test/unicode/string/test_cpp20.cpp deleted file mode 100644 index 330027d..0000000 --- a/test/unicode/string/test_cpp20.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "ftest.h" - -#include "utf8.h" - -#include - -using namespace iris::utflib; -using namespace std; - -TEST(CPP20APITests, test_utf16tou8) -{ - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview{utf16string}; - u8string u = utf16tou8(utf16string); - EXPECT_EQ (u.size(), 10); - u = utf16tou8(utf16stringview); - EXPECT_EQ (u.size(), 10); -} - -TEST(CPP20APITests, tes20t_utf8to16) -{ - u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); -} - -TEST(CPP20APITests, test_utf32tou8) -{ - u32string utf32string = {0x448, 0x65E5, 0x10346}; - u32string_view utf32stringview{utf32string}; - u8string utf8result = utf32tou8(utf32stringview); - EXPECT_EQ (utf8result.size(), 9); -} - -TEST(CPP20APITests, test_utf8to32) -{ - u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); -} - -TEST(CPP20APITests, test_find_invalid) -{ - u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); - auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); -} - -TEST(CPP20APITests, test_is_valid) -{ - u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); - bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); - bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST(CPP20APITests, test_replace_invalid) -{ - u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); - u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const u8string fixed_invalid_sequence = reinterpret_cast("a????z"); - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); -} - -TEST(CPP20APITests, test_starts_with_bom) -{ - u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); - bool bbom = starts_with_bom(byte_order_mark); - EXPECT_TRUE (bbom); - u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); - bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); -} diff --git a/test/unicode/string/negative.cpp b/test/unicode/string/utf8_invalid.cpp similarity index 100% rename from test/unicode/string/negative.cpp rename to test/unicode/string/utf8_invalid.cpp From c078f214907750dadf75d7f7e1c60ff899a2d3f9 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 15:38:50 +0900 Subject: [PATCH 03/17] Fix code style --- include/iris/unicode/string.hpp | 1547 +++++++++++++++---------------- 1 file changed, 773 insertions(+), 774 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index a09ebc8..7b0b77a 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -38,918 +38,917 @@ DEALINGS IN THE SOFTWARE. #include #include -namespace iris::unicode -{ - template - concept octet = std::integral && sizeof(T) == 1; +namespace iris::unicode { - template - concept utf8char = octet && (std::same_as || std::same_as); +template +concept octet = std::integral && sizeof(T) == 1; - template - concept utf16char = std::same_as; +template +concept utf8char = octet && (std::same_as || std::same_as); - template - concept utf32char = std::same_as; +template +concept utf16char = std::same_as; - template - concept octet_input_iterator = std::input_iterator && octet>; +template +concept utf32char = std::same_as; - template - concept utf8_input_iterator = octet_input_iterator && utf8char>; +template +concept octet_input_iterator = std::input_iterator && octet>; - template - concept utf16_input_iterator = std::input_iterator && utf16char>; +template +concept utf8_input_iterator = octet_input_iterator && utf8char>; - template - concept utf32_input_iterator = std::input_iterator && utf32char>; +template +concept utf16_input_iterator = std::input_iterator && utf16char>; - namespace traits - { - template - struct is_nothrow_dereferenceable : std::false_type {}; +template +concept utf32_input_iterator = std::input_iterator && utf32char>; - template - struct is_nothrow_dereferenceable())>> : std::bool_constant())> {}; - template - inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable::value; +template +struct is_nothrow_dereferenceable : std::false_type {}; - template - struct is_nothrow_prefix_incrementable : std::false_type {}; +template +struct is_nothrow_dereferenceable())>> : std::bool_constant())> {}; - template - struct is_nothrow_prefix_incrementable())>> : std::bool_constant())> {}; +template +inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable::value; - template - inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable::value; +template +struct is_nothrow_prefix_incrementable : std::false_type {}; - template - struct is_nothrow_postfix_incrementable : std::false_type {}; +template +struct is_nothrow_prefix_incrementable())>> : std::bool_constant())> {}; - template - struct is_nothrow_postfix_incrementable()++)>> : std::bool_constant()++)> {}; +template +inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable::value; - template - inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable::value; +template +struct is_nothrow_postfix_incrementable : std::false_type {}; - template - struct is_nothrow_sentinel : std::false_type {}; +template +struct is_nothrow_postfix_incrementable()++)>> : std::bool_constant()++)> {}; - template - requires std::sentinel_for - struct is_nothrow_sentinel : std::bool_constant< - noexcept(std::declval() == std::declval()) && - noexcept(std::declval() != std::declval()) && - noexcept(std::declval() == std::declval()) && - noexcept(std::declval() != std::declval()) - > - {}; +template +inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable::value; - template - inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; - } // namespace traits +template +struct is_nothrow_sentinel : std::false_type {}; - // Helper code - not intended to be directly called by the library users. May be changed at any time - namespace internal - { - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - constexpr char16_t LEAD_SURROGATE_MIN = 0xd800u; - constexpr char16_t LEAD_SURROGATE_MAX = 0xdbffu; - constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u; - constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu; - constexpr char16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - - // Maximum valid value for a Unicode code point - constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; - - enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT }; - - template - [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept - { - return static_cast(0xff & oc); - } +template + requires std::sentinel_for +struct is_nothrow_sentinel : std::bool_constant< + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) && + noexcept(std::declval() == std::declval()) && + noexcept(std::declval() != std::declval()) +> +{}; - [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept - { - return static_cast(0xffff & oc); - } +template +inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; - template - [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept - { - return ((internal::mask8(oc) >> 6) == 0x2); - } - [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept - { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); - } +namespace detail { - [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept - { - return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); - } +// Unicode constants +// Leading (high) surrogates: 0xd800 - 0xdbff +// Trailing (low) surrogates: 0xdc00 - 0xdfff +constexpr char16_t LEAD_SURROGATE_MIN = 0xd800u; +constexpr char16_t LEAD_SURROGATE_MAX = 0xdbffu; +constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u; +constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu; +constexpr char16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) +constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept - { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); - } +// Maximum valid value for a Unicode code point +constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; - [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept - { - return (cp <= CODE_POINT_MAX && !internal::is_surrogate(cp)); - } +enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT }; - [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept - { - return cp < char32_t(0x10000); - } +template +[[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept +{ + return static_cast(0xff & oc); +} - [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept - { - if (cp < 0x80) { - if (length != 1) - return true; - } else if (cp < 0x800) { - if (length != 2) - return true; - } else if (cp < 0x10000) { - if (length != 3) - return true; - } - return false; - } +[[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept +{ + return static_cast(0xffff & oc); +} - template - [[nodiscard]] constexpr int sequence_length(It lead_it) - noexcept(traits::is_nothrow_dereferenceable_v) - { - const char8_t lead = internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } +template +[[nodiscard]] constexpr bool is_trail(Octet oc) noexcept +{ + return ((detail::mask8(oc) >> 6) == 0x2); +} - /// Helper for get_sequence_x - template Se> - constexpr utf_error increase_safely(It& it, Se end) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_sentinel - >) - { - if (++it == end) - return utf_error::NOT_ENOUGH_ROOM; - - if (!internal::is_trail(*it)) - return utf_error::INCOMPLETE_SEQUENCE; - - return utf_error::OK; - } +[[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept +{ + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); +} -#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END) \ - do { \ - utf_error ret = increase_safely(IT, END); \ - if (ret != utf_error::OK) \ - return ret; \ - } while (false) - - /// get_sequence_x functions decode utf-8 sequences of the length x - template Se> - constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_sentinel - >) - { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; - - code_point = static_cast(internal::mask8(*it)); - - return utf_error::OK; - } +[[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept +{ + return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); +} - template Se> - constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_sentinel - >) - { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; +[[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept +{ + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); +} - code_point = static_cast(internal::mask8(*it)); +[[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept +{ + return (cp <= CODE_POINT_MAX && !detail::is_surrogate(cp)); +} - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); +[[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept +{ + return cp < char32_t(0x10000); +} - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); +[[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept +{ + if (cp < 0x80) { + if (length != 1) + return true; + } else if (cp < 0x800) { + if (length != 2) + return true; + } else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; +} + +template +[[nodiscard]] constexpr int sequence_length(It lead_it) + noexcept(is_nothrow_dereferenceable_v) +{ + const char8_t lead = detail::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; +} + +/// Helper for get_sequence_x +template Se> +constexpr utf_error increase_safely(It& it, Se end) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (++it == end) + return utf_error::NOT_ENOUGH_ROOM; - return utf_error::OK; - } + if (!detail::is_trail(*it)) + return utf_error::INCOMPLETE_SEQUENCE; - template Se> - constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_sentinel - >) - { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + return utf_error::OK; +} - code_point = static_cast(internal::mask8(*it)); +#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END) \ +do { \ +utf_error ret = increase_safely(IT, END); \ +if (ret != utf_error::OK) \ + return ret; \ +} while (false) + +/// get_sequence_x functions decode utf-8 sequences of the length x +template Se> +constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_sentinel + >) +{ + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + code_point = static_cast(detail::mask8(*it)); - code_point = ((code_point << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); + return utf_error::OK; +} - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); +template Se> +constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; - code_point = static_cast(code_point + ((*it) & 0x3f)); + code_point = static_cast(detail::mask8(*it)); - return utf_error::OK; - } + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - template Se> - constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_sentinel - >) - { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - code_point = static_cast(internal::mask8(*it)); + return utf_error::OK; +} - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); +template Se> +constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; - code_point = ((code_point << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); + code_point = static_cast(detail::mask8(*it)); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = static_cast(code_point + ((internal::mask8(*it) << 6) & 0xfff)); + code_point = ((code_point << 12) & 0xffff) + ((detail::mask8(*it) << 6) & 0xfff); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = static_cast(code_point + ((*it) & 0x3f)); + code_point = static_cast(code_point + ((*it) & 0x3f)); - return utf_error::OK; - } + return utf_error::OK; +} -#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR +template Se> +constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel + >) +{ + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; - template Se> - requires std::forward_iterator - constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_sentinel, - std::is_nothrow_copy_constructible - >) - { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; - - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - It original_it = it; - - char32_t cp = 0; - // Determine the sequence length based on the lead octet - const int length = internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = utf_error::OK; - switch (length) { - case 0: - return utf_error::INVALID_LEAD; - case 1: - err = internal::get_sequence_1(it, end, cp); - break; - case 2: - err = internal::get_sequence_2(it, end, cp); - break; - case 3: - err = internal::get_sequence_3(it, end, cp); - break; - case 4: - err = internal::get_sequence_4(it, end, cp); - break; - } - - if (err == utf_error::OK) { - // Decoding succeeded. Now, security checks... - if (internal::is_code_point_valid(cp)) { - if (!internal::is_overlong_sequence(cp, length)) { - // Passed! Return here. - code_point = cp; - ++it; - return utf_error::OK; - } else - err = utf_error::OVERLONG_SEQUENCE; - } else - err = utf_error::INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } + code_point = static_cast(detail::mask8(*it)); - template Se> - requires std::forward_iterator - constexpr utf_error validate_next(It& it, Se end) - noexcept(noexcept(internal::validate_next(it, end, std::declval()))) - { - char32_t ignored; - return internal::validate_next(it, end, ignored); - } + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - template Se> - requires std::forward_iterator - constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) - noexcept(std::conjunction_v< - traits::is_nothrow_dereferenceable, - traits::is_nothrow_prefix_incrementable, - traits::is_nothrow_postfix_incrementable, - traits::is_nothrow_sentinel, - std::is_nothrow_copy_constructible - >) - { - // Check the edge case: - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - It original_it = it; - - utf_error err = utf_error::OK; - - const char16_t first_word = *it++; - if (!internal::is_surrogate(first_word)) { - code_point = first_word; - return utf_error::OK; - } else { - if (it == end) - err = utf_error::NOT_ENOUGH_ROOM; - else if (internal::is_lead_surrogate(first_word)) { - const char16_t second_word = *it++; - if (internal::is_trail_surrogate(static_cast(second_word))) { - code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; - return utf_error::OK; - } else - err = utf_error::INCOMPLETE_SEQUENCE; - - } else { - err = utf_error::INVALID_LEAD; - } - } - // error branch - it = original_it; - return err; - } + code_point = ((code_point << 18) & 0x1fffff) + ((detail::mask8(*it) << 12) & 0x3ffff); - template> - requires std::output_iterator - constexpr OutIt append(char32_t cp, OutIt result) noexcept - { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - template - constexpr std::back_insert_iterator append(char32_t cp, std::back_insert_iterator result) - noexcept(noexcept(internal::append, class container_type::value_type>(cp, result))) - { - return internal::append, class container_type::value_type>(cp, result); - } + code_point = static_cast(code_point + ((detail::mask8(*it) << 6) & 0xfff)); - template It> - constexpr It append16(char32_t cp, It result) - noexcept(noexcept(*result++ = std::declval())) - { - if (internal::is_in_bmp(cp)) - *(result++) = static_cast(cp); - else { - // Code points from the supplementary planes are encoded via surrogate pairs - *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); - *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); - } - return result; - } - } // namespace internal + IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception - { - }; + code_point = static_cast(code_point + ((*it) & 0x3f)); - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception - { - char32_t cp; + return utf_error::OK; +} - public: - explicit invalid_code_point(char32_t codepoint) - : cp(codepoint) - { - } - virtual const char* what() const noexcept override { return "Invalid code point"; } - [[nodiscard]] char32_t code_point() const noexcept { return cp; } - }; +#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR - class invalid_utf8 : public exception - { - char8_t u8; +template Se> + requires std::forward_iterator +constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) +{ + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It original_it = it; + + char32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = detail::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = utf_error::OK; + switch (length) { + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = detail::get_sequence_1(it, end, cp); + break; + case 2: + err = detail::get_sequence_2(it, end, cp); + break; + case 3: + err = detail::get_sequence_3(it, end, cp); + break; + case 4: + err = detail::get_sequence_4(it, end, cp); + break; + } + + if (err == utf_error::OK) { + // Decoding succeeded. Now, security checks... + if (detail::is_code_point_valid(cp)) { + if (!detail::is_overlong_sequence(cp, length)) { + // Passed! Return here. + code_point = cp; + ++it; + return utf_error::OK; + } else + err = utf_error::OVERLONG_SEQUENCE; + } else + err = utf_error::INVALID_CODE_POINT; + } - public: - explicit invalid_utf8(char c) - : u8(static_cast(c)) - { - } - explicit invalid_utf8(char8_t u) - : u8(u) - { - } - virtual const char* what() const noexcept override { return "Invalid UTF-8"; } - [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } - }; + // Failure branch - restore the original value of the iterator + it = original_it; + return err; +} - class invalid_utf16 : public exception - { - char16_t u16; +template Se> + requires std::forward_iterator +constexpr utf_error validate_next(It& it, Se end) + noexcept(noexcept(detail::validate_next(it, end, std::declval()))) +{ + char32_t ignored; + return detail::validate_next(it, end, ignored); +} + +template Se> + requires std::forward_iterator +constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_postfix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) +{ + // Check the edge case: + if (it == end) + return utf_error::NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It original_it = it; + + utf_error err = utf_error::OK; + + const char16_t first_word = *it++; + if (!detail::is_surrogate(first_word)) { + code_point = first_word; + return utf_error::OK; + } else { + if (it == end) + err = utf_error::NOT_ENOUGH_ROOM; + else if (detail::is_lead_surrogate(first_word)) { + const char16_t second_word = *it++; + if (detail::is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return utf_error::OK; + } else + err = utf_error::INCOMPLETE_SEQUENCE; - public: - explicit invalid_utf16(char16_t u) - : u16(u) - { + } else { + err = utf_error::INVALID_LEAD; } - virtual const char* what() const noexcept override { return "Invalid UTF-16"; } - [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } - }; + } + // error branch + it = original_it; + return err; +} - class not_enough_room : public exception +template> + requires std::output_iterator +constexpr OutIt append(char32_t cp, OutIt result) noexcept +{ + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; +} + +template +constexpr std::back_insert_iterator append(char32_t cp, std::back_insert_iterator result) + noexcept(noexcept(detail::append, class container_type::value_type>(cp, result))) + { + return detail::append, class container_type::value_type>(cp, result); + } + + template It> + constexpr It append16(char32_t cp, It result) + noexcept(noexcept(*result++ = std::declval())) { - public: - virtual const char* what() const noexcept override { return "Not enough space"; } - }; + if (detail::is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } - /// The library API - functions intended to be called by the users +} // detail - // Byte order mark - constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; +// Base for the exceptions that may be thrown from the library +class exception : public ::std::exception +{ +}; - template Se> - [[nodiscard]] constexpr It find_invalid(It it, Se se) - noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v) - { - while (it != se) { - internal::utf_error err_code = internal::validate_next(it, se); - if (err_code != internal::utf_error::OK) - return it; - } - return it; - } +// Exceptions that may be thrown from the library functions. +class invalid_code_point : public exception +{ + char32_t cp; - [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) - noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) +public: + explicit invalid_code_point(char32_t codepoint) + : cp(codepoint) { - std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); } + virtual const char* what() const noexcept override { return "Invalid code point"; } + [[nodiscard]] char32_t code_point() const noexcept { return cp; } +}; - [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) - noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) - { - std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::u8string_view::npos : static_cast(invalid - s.begin()); - } +class invalid_utf8 : public exception +{ + char8_t u8; - template Se> - [[nodiscard]] constexpr bool is_valid(It it, Se se) - noexcept(noexcept(unicode::find_invalid(it, se)) && traits::is_nothrow_sentinel_v) +public: + explicit invalid_utf8(char c) + : u8(static_cast(c)) { - return (unicode::find_invalid(it, se) == se); } - - [[nodiscard]] constexpr bool is_valid(std::string_view s) - noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) + explicit invalid_utf8(char8_t u) + : u8(u) { - return unicode::is_valid(s.begin(), s.end()); } + virtual const char* what() const noexcept override { return "Invalid UTF-8"; } + [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } +}; - [[nodiscard]] constexpr bool is_valid(std::u8string_view s) - noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) - { - return unicode::is_valid(s.begin(), s.end()); - } +class invalid_utf16 : public exception +{ + char16_t u16; - template Se> - [[nodiscard]] constexpr bool starts_with_bom(It it, Se end) - noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v) +public: + explicit invalid_utf16(char16_t u) + : u16(u) { - return (((it != end) && (internal::mask8(*it++)) == bom[0]) && ((it != end) && (internal::mask8(*it++)) == bom[1]) && ((it != end) && (internal::mask8(*it)) == bom[2])); } + virtual const char* what() const noexcept override { return "Invalid UTF-16"; } + [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } +}; - [[nodiscard]] constexpr bool starts_with_bom(std::string_view s) - noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) - { - return unicode::starts_with_bom(s.begin(), s.end()); - } +class not_enough_room : public exception +{ +public: + virtual const char* what() const noexcept override { return "Not enough space"; } +}; - [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) - noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) - { - return unicode::starts_with_bom(s.begin(), s.end()); - } +/// The library API - functions intended to be called by the users - template - constexpr OutIt append(char32_t cp, OutIt result) - { - if (!internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); +// Byte order mark +constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; - return internal::append(cp, result); +template Se> +[[nodiscard]] constexpr It find_invalid(It it, Se se) + noexcept(noexcept(detail::validate_next(it, se)) && std::is_nothrow_copy_constructible_v) +{ + while (it != se) { + detail::utf_error err_code = detail::validate_next(it, se); + if (err_code != detail::utf_error::OK) + return it; } + return it; +} - constexpr void append(char32_t cp, std::string& s) - { - unicode::append(cp, std::back_inserter(s)); - } +[[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) +{ + std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); +} - constexpr void append(char32_t cp, std::u8string& s) - { - unicode::append(cp, std::back_inserter(s)); - } +[[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) + noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) +{ + std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::u8string_view::npos : static_cast(invalid - s.begin()); +} - template // TODO: add constraints - constexpr It append16(char32_t cp, It result) - { - if (!internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); +template Se> +[[nodiscard]] constexpr bool is_valid(It it, Se se) + noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v) +{ + return (unicode::find_invalid(it, se) == se); +} - return internal::append16(cp, result); - } +[[nodiscard]] constexpr bool is_valid(std::string_view s) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) +{ + return unicode::is_valid(s.begin(), s.end()); +} - constexpr void append16(char32_t cp, std::u16string& s) - { - unicode::append16(cp, std::back_inserter(s)); - } +[[nodiscard]] constexpr bool is_valid(std::u8string_view s) + noexcept(noexcept(unicode::is_valid(s.begin(), s.end()))) +{ + return unicode::is_valid(s.begin(), s.end()); +} - template Se, class Out> // TODO: add constraints - constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) - { - while (start != end) { - It sequence_start = start; - internal::utf_error err_code = internal::validate_next(start, end); - switch (err_code) { - case internal::utf_error::OK: - for (It it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::utf_error::NOT_ENOUGH_ROOM: - out = unicode::append(replacement, out); - start = end; - break; - case internal::utf_error::INVALID_LEAD: - out = unicode::append(replacement, out); - ++start; - break; - case internal::utf_error::INCOMPLETE_SEQUENCE: - case internal::utf_error::OVERLONG_SEQUENCE: - case internal::utf_error::INVALID_CODE_POINT: - out = unicode::append(replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } +template Se> +[[nodiscard]] constexpr bool starts_with_bom(It it, Se end) + noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v) +{ + return (((it != end) && (detail::mask8(*it++)) == bom[0]) && ((it != end) && (detail::mask8(*it++)) == bom[1]) && ((it != end) && (detail::mask8(*it)) == bom[2])); +} - template Se, class Out> // TODO: add constraints - constexpr Out replace_invalid(It start, Se end, Out out) - { - constexpr char32_t replacement_marker = static_cast(internal::mask16(0xfffd)); - return unicode::replace_invalid(start, end, out, replacement_marker); - } +[[nodiscard]] constexpr bool starts_with_bom(std::string_view s) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) +{ + return unicode::starts_with_bom(s.begin(), s.end()); +} - [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } +[[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) + noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end()))) +{ + return unicode::starts_with_bom(s.begin(), s.end()); +} - [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) - { - std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } +template +constexpr OutIt append(char32_t cp, OutIt result) +{ + if (!detail::is_code_point_valid(cp)) + throw invalid_code_point(cp); - [[nodiscard]] constexpr std::string replace_invalid(std::string_view s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } + return detail::append(cp, result); +} - [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) - { - std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } +constexpr void append(char32_t cp, std::string& s) +{ + unicode::append(cp, std::back_inserter(s)); +} - template Se> - [[nodiscard]] constexpr char32_t next(It& it, Se end) - { - char32_t cp = 0; - internal::utf_error err_code = internal::validate_next(it, end, cp); +constexpr void append(char32_t cp, std::u8string& s) +{ + unicode::append(cp, std::back_inserter(s)); +} + +template // TODO: add constraints +constexpr It append16(char32_t cp, It result) +{ + if (!detail::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return detail::append16(cp, result); +} + +constexpr void append16(char32_t cp, std::u16string& s) +{ + unicode::append16(cp, std::back_inserter(s)); +} + +template Se, class Out> // TODO: add constraints +constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) +{ + while (start != end) { + It sequence_start = start; + detail::utf_error err_code = detail::validate_next(start, end); switch (err_code) { - case internal::utf_error::OK: + case detail::utf_error::OK: + for (It it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case detail::utf_error::NOT_ENOUGH_ROOM: + out = unicode::append(replacement, out); + start = end; + break; + case detail::utf_error::INVALID_LEAD: + out = unicode::append(replacement, out); + ++start; + break; + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + case detail::utf_error::INVALID_CODE_POINT: + out = unicode::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && detail::is_trail(*start)) + ++start; break; - case internal::utf_error::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case internal::utf_error::INVALID_LEAD: - case internal::utf_error::INCOMPLETE_SEQUENCE: - case internal::utf_error::OVERLONG_SEQUENCE: - throw invalid_utf8(static_cast(*it)); - case internal::utf_error::INVALID_CODE_POINT: - throw invalid_code_point(cp); } - return cp; } + return out; +} - template Se> - [[nodiscard]] constexpr char32_t next16(It& it, Se end) - { - char32_t cp = 0; - internal::utf_error err_code = internal::validate_next16(it, end, cp); - if (err_code == internal::utf_error::NOT_ENOUGH_ROOM) +template Se, class Out> // TODO: add constraints +constexpr Out replace_invalid(It start, Se end, Out out) +{ + constexpr char32_t replacement_marker = static_cast(detail::mask16(0xfffd)); + return unicode::replace_invalid(start, end, out, replacement_marker); +} + +[[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) +{ + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; +} + +[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) +{ + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; +} + +[[nodiscard]] constexpr std::string replace_invalid(std::string_view s) +{ + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) +{ + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +template Se> +[[nodiscard]] constexpr char32_t next(It& it, Se end) +{ + char32_t cp = 0; + detail::utf_error err_code = detail::validate_next(it, end, cp); + switch (err_code) { + case detail::utf_error::OK: + break; + case detail::utf_error::NOT_ENOUGH_ROOM: throw not_enough_room(); - return cp; + case detail::utf_error::INVALID_LEAD: + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + case detail::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); } + return cp; +} - template Se> - [[nodiscard]] constexpr char32_t peek_next(It it, Se end) - { - return unicode::next(it, end); - } +template Se> +[[nodiscard]] constexpr char32_t next16(It& it, Se end) +{ + char32_t cp = 0; + detail::utf_error err_code = detail::validate_next16(it, end, cp); + if (err_code == detail::utf_error::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; +} + +template Se> +[[nodiscard]] constexpr char32_t peek_next(It it, Se end) +{ + return unicode::next(it, end); +} - template Se> - [[nodiscard]] constexpr char32_t prior(It& it, Se start) - { - // can't do much if it == start +template Se> +[[nodiscard]] constexpr char32_t prev(It& it, Se start) +{ + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + It end = it; + // Go back until we hit either a lead octet or start + while (detail::is_trail(*(--it))) if (it == start) - throw not_enough_room(); + throw invalid_utf8(*it); // error - no lead byte in the sequence + return unicode::peek_next(it, end); +} - It end = it; - // Go back until we hit either a lead octet or start - while (internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return unicode::peek_next(it, end); - } +template Se, class distance_type> +constexpr void advance(It& it, distance_type n, Se end) +{ + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + (void)unicode::prev(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + (void)unicode::next(it, end); + } +} + +template Se> +[[nodiscard]] constexpr class std::iterator_traits::difference_type distance(It first, Se last) +{ + class std::iterator_traits::difference_type dist; + for (dist = 0; first != last; ++dist) + (void)unicode::next(first, last); + return dist; +} + +template Se, class OutIt> // TODO: add constraints +constexpr OutIt utf16to8(It start, Se end, OutIt result) +{ + while (start != end) { + char32_t cp = static_cast(detail::mask16(*start++)); + // Take care of surrogate pairs first + if (detail::is_lead_surrogate(cp)) { + if (start != end) { + const char32_t trail_surrogate = static_cast(detail::mask16(*start++)); + if (detail::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } else + throw invalid_utf16(static_cast(cp)); - template Se, class distance_type> - constexpr void advance(It& it, distance_type n, Se end) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - (void)unicode::prior(it, end); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - (void)unicode::next(it, end); } - } + // Lone trail surrogate + else if (detail::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); - template Se> - [[nodiscard]] constexpr class std::iterator_traits::difference_type distance(It first, Se last) - { - class std::iterator_traits::difference_type dist; - for (dist = 0; first != last; ++dist) - (void)unicode::next(first, last); - return dist; + result = unicode::append(cp, result); } + return result; +} - template Se, class OutIt> // TODO: add constraints - constexpr OutIt utf16to8(It start, Se end, OutIt result) - { - while (start != end) { - char32_t cp = static_cast(internal::mask16(*start++)); - // Take care of surrogate pairs first - if (internal::is_lead_surrogate(cp)) { - if (start != end) { - const char32_t trail_surrogate = static_cast(internal::mask16(*start++)); - if (internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); +[[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) +{ + std::string result; + unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - result = unicode::append(cp, result); - } - return result; - } +[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s) +{ + std::u8string result; + unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) - { - std::string result; - unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } +template Se, class OutIt> // TODO: add constraints +constexpr OutIt utf8to16(It start, Se end, OutIt result) +{ + while (start != end) { + const char32_t cp = unicode::next(start, end); + if (cp > 0xffff) { // make a surrogate pair + *result++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); + } else + *result++ = static_cast(cp); + } + return result; +} + +[[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) +{ + std::u16string result; + unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s) - { - std::u8string result; - unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } +[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s) +{ + std::u16string result; + unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - template Se, class OutIt> // TODO: add constraints - constexpr OutIt utf8to16(It start, Se end, OutIt result) - { - while (start != end) { - const char32_t cp = unicode::next(start, end); - if (cp > 0xffff) { // make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } else - *result++ = static_cast(cp); - } - return result; - } +template Se, class OutIt> // TODO: add constraints +constexpr OutIt utf32to8(It start, Se end, OutIt result) +{ + while (start != end) + result = unicode::append(*(start++), result); - [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) - { - std::u16string result; - unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } + return result; +} - [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s) - { - std::u16string result; - unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } +[[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) +{ + std::string result; + unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - template Se, class OutIt> // TODO: add constraints - constexpr OutIt utf32to8(It start, Se end, OutIt result) - { - while (start != end) - result = unicode::append(*(start++), result); +[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s) +{ + std::u8string result; + unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; +} - return result; - } +template Se, class OutIt> +constexpr OutIt utf8to32(It start, Se end, OutIt result) +{ + while (start != end) + (*result++) = unicode::next(start, end); - [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) + return result; +} + +[[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) +{ + std::u32string result; + unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s) +{ + std::u32string result; + unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; +} + +// The iterator class +template +class iterator +{ + It it; + It range_start; + It range_end; + +public: + using value_type = char32_t; + using pointer = char32_t*; + using reference = char32_t&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::bidirectional_iterator_tag; + constexpr iterator() + requires std::is_default_constructible_v + = default; + constexpr explicit iterator(It octet_it, It rangestart, It rangeend) + : it(std::move(octet_it)) + , range_start(std::move(rangestart)) + , range_end(std::move(rangeend)) { - std::string result; - unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; + if constexpr (std::random_access_iterator) { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } } - - [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s) + // the default "big three" are OK + [[nodiscard]] constexpr It base() const { return it; } + [[nodiscard]] constexpr char32_t operator*() const { - std::u8string result; - unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; + It temp = it; + return unicode::next(temp, range_end); } - - template Se, class OutIt> - constexpr OutIt utf8to32(It start, Se end, OutIt result) + [[nodiscard]] constexpr bool operator==(const iterator& rhs) const { - while (start != end) - (*result++) = unicode::next(start, end); - - return result; + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); } - - [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) + constexpr iterator& operator++() { - std::u32string result; - unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; + (void)unicode::next(it, range_end); + return *this; } - - [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s) + constexpr iterator operator++(int) { - std::u32string result; - unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; + iterator temp = *this; + (void)unicode::next(it, range_end); + return temp; } - - // The iterator class - template - class iterator + constexpr iterator& operator--() { - It it; - It range_start; - It range_end; - - public: - using value_type = char32_t; - using pointer = char32_t*; - using reference = char32_t&; - using difference_type = std::ptrdiff_t; - using iterator_category = std::bidirectional_iterator_tag; - constexpr iterator() - requires std::is_default_constructible_v - = default; - constexpr explicit iterator(It octet_it, It rangestart, It rangeend) - : it(std::move(octet_it)) - , range_start(std::move(rangestart)) - , range_end(std::move(rangeend)) - { - if constexpr (std::random_access_iterator) { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - } - // the default "big three" are OK - [[nodiscard]] constexpr It base() const { return it; } - [[nodiscard]] constexpr char32_t operator*() const - { - It temp = it; - return unicode::next(temp, range_end); - } - [[nodiscard]] constexpr bool operator==(const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - constexpr iterator& operator++() - { - (void)unicode::next(it, range_end); - return *this; - } - constexpr iterator operator++(int) - { - iterator temp = *this; - (void)unicode::next(it, range_end); - return temp; - } - constexpr iterator& operator--() - { - (void)unicode::prior(it, range_start); - return *this; - } - constexpr iterator operator--(int) - { - iterator temp = *this; - (void)unicode::prior(it, range_start); - return temp; - } - }; // class iterator + (void)unicode::prev(it, range_start); + return *this; + } + constexpr iterator operator--(int) + { + iterator temp = *this; + (void)unicode::prev(it, range_start); + return temp; + } +}; } // iris::unicode From cb8aaf4b0661b57d45e17e840900af06bb5049a3 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 18:10:32 +0900 Subject: [PATCH 04/17] Refactor until append8/append16 --- include/iris/unicode/string.hpp | 693 +++++++++++++++------------ test/unicode/string/string.cpp | 202 ++++---- test/unicode/string/utf8_invalid.cpp | 10 +- 3 files changed, 506 insertions(+), 399 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index 7b0b77a..98ed0c3 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -25,7 +25,6 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - #ifndef IRIS_UNICODE_STRING_HPP #define IRIS_UNICODE_STRING_HPP @@ -37,9 +36,12 @@ DEALINGS IN THE SOFTWARE. #include #include #include +#include namespace iris::unicode { +constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; + template concept octet = std::integral && sizeof(T) == 1; @@ -65,6 +67,65 @@ template concept utf32_input_iterator = std::input_iterator && utf32char>; +namespace detail { + +template +struct select_output_value_type +{ + static_assert(std::output_iterator); + using type = DesiredValueT; +}; + +template + requires requires { + typename std::iter_value_t; + requires std::convertible_to>; + } +struct select_output_value_type +{ + static_assert(std::output_iterator>); + using type = std::iter_value_t; +}; + +template +concept maybe_value_type_sized = + requires { + typename std::iter_value_t; + requires sizeof(std::iter_value_t) == SizeofChar; + } || + !requires { + typename std::iter_value_t; + }; + +} // detail + +template +concept octet_output_iterator = + ( + std::output_iterator || + std::output_iterator + ) && + detail::maybe_value_type_sized; + +template +concept octet_output_range = + ( + std::ranges::output_range || + std::ranges::output_range + ) && + detail::maybe_value_type_sized, 1>; + +template +concept utf16_output_iterator = + std::output_iterator && + detail::maybe_value_type_sized; + +template +concept utf16_output_range = + std::ranges::output_range && + detail::maybe_value_type_sized, 2>; + + template struct is_nothrow_dereferenceable : std::false_type {}; @@ -109,6 +170,64 @@ template inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel::value; +class unicode_error : public std::runtime_error +{ + using std::runtime_error::runtime_error; +}; + +class invalid_code_point : public unicode_error +{ + char32_t cp; + +public: + explicit invalid_code_point(char32_t codepoint) + : unicode_error("invalid code point") + , cp(codepoint) + {} + + [[nodiscard]] char32_t code_point() const noexcept { return cp; } +}; + +class invalid_utf8 : public unicode_error +{ + char8_t u8; + +public: + explicit invalid_utf8(char c) + : unicode_error("invalid UTF-8") + , u8(static_cast(c)) + {} + + explicit invalid_utf8(char8_t u) + : unicode_error("invalid UTF-8") + , u8(u) + {} + + [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } +}; + +class invalid_utf16 : public unicode_error +{ + char16_t u16; + +public: + explicit invalid_utf16(char16_t u) + : unicode_error("Invalid UTF-16") + , u16(u) + {} + + [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } +}; + +class not_enough_space : public unicode_error +{ +public: + not_enough_space() + : unicode_error("not enough space") + {} +}; + + namespace detail { // Unicode constants @@ -124,7 +243,15 @@ constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROG // Maximum valid value for a Unicode code point constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; -enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT }; +enum class utf_error +{ + OK, + NOT_ENOUGH_SPACE, + INVALID_LEAD, + INCOMPLETE_SEQUENCE, + OVERLONG_SEQUENCE, + INVALID_CODE_POINT, +}; template [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept @@ -145,22 +272,22 @@ template [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); + return cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX); } [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept { - return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + return cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX); } [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + return cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX); } [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept { - return (cp <= CODE_POINT_MAX && !detail::is_surrogate(cp)); + return cp <= CODE_POINT_MAX && !detail::is_surrogate(cp); } [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept @@ -171,14 +298,11 @@ template [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept { if (cp < 0x80) { - if (length != 1) - return true; + if (length != 1) return true; } else if (cp < 0x800) { - if (length != 2) - return true; + if (length != 2) return true; } else if (cp < 0x10000) { - if (length != 3) - return true; + if (length != 3) return true; } return false; } @@ -187,17 +311,12 @@ template [[nodiscard]] constexpr int sequence_length(It lead_it) noexcept(is_nothrow_dereferenceable_v) { - const char8_t lead = detail::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; + char8_t const lead = detail::mask8(*lead_it); + if (lead < 0x80) return 1; + if ((lead >> 5) == 0x6) return 2; + if ((lead >> 4) == 0xe) return 3; + if ((lead >> 3) == 0x1e) return 4; + return 0; } /// Helper for get_sequence_x @@ -209,21 +328,21 @@ constexpr utf_error increase_safely(It& it, Se end) is_nothrow_sentinel >) { - if (++it == end) - return utf_error::NOT_ENOUGH_ROOM; - - if (!detail::is_trail(*it)) + if (++it == end) { + return utf_error::NOT_ENOUGH_SPACE; + } + if (!detail::is_trail(*it)) { return utf_error::INCOMPLETE_SEQUENCE; - + } return utf_error::OK; } #define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END) \ -do { \ -utf_error ret = increase_safely(IT, END); \ -if (ret != utf_error::OK) \ - return ret; \ -} while (false) + do { \ + utf_error ret = increase_safely(IT, END); \ + if (ret != utf_error::OK) \ + return ret; \ + } while (false) /// get_sequence_x functions decode utf-8 sequences of the length x template Se> @@ -233,11 +352,8 @@ constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) is_nothrow_sentinel >) { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; - + if (it == end) return utf_error::NOT_ENOUGH_SPACE; code_point = static_cast(detail::mask8(*it)); - return utf_error::OK; } @@ -249,15 +365,11 @@ constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point) is_nothrow_sentinel >) { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + if (it == end) return utf_error::NOT_ENOUGH_SPACE; code_point = static_cast(detail::mask8(*it)); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - return utf_error::OK; } @@ -269,19 +381,13 @@ constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point) is_nothrow_sentinel >) { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + if (it == end) return utf_error::NOT_ENOUGH_SPACE; code_point = static_cast(detail::mask8(*it)); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = ((code_point << 12) & 0xffff) + ((detail::mask8(*it) << 6) & 0xfff); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = static_cast(code_point + ((*it) & 0x3f)); - return utf_error::OK; } @@ -293,23 +399,15 @@ constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point) is_nothrow_sentinel >) { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + if (it == end) return utf_error::NOT_ENOUGH_SPACE; code_point = static_cast(detail::mask8(*it)); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = ((code_point << 18) & 0x1fffff) + ((detail::mask8(*it) << 12) & 0x3ffff); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = static_cast(code_point + ((detail::mask8(*it) << 6) & 0xfff)); - IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end); - code_point = static_cast(code_point + ((*it) & 0x3f)); - return utf_error::OK; } @@ -325,8 +423,7 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) std::is_nothrow_copy_constructible >) { - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + if (it == end) return utf_error::NOT_ENOUGH_SPACE; // Save the original value of it so we can go back in case of failure // Of course, it does not make much sense with i.e. stream iterators @@ -337,22 +434,24 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) const int length = detail::sequence_length(it); // Get trail octets and calculate the code point - utf_error err = utf_error::OK; + utf_error err{}; switch (length) { - case 0: - return utf_error::INVALID_LEAD; - case 1: - err = detail::get_sequence_1(it, end, cp); - break; - case 2: - err = detail::get_sequence_2(it, end, cp); - break; - case 3: - err = detail::get_sequence_3(it, end, cp); - break; - case 4: - err = detail::get_sequence_4(it, end, cp); - break; + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = detail::get_sequence_1(it, end, cp); + break; + case 2: + err = detail::get_sequence_2(it, end, cp); + break; + case 3: + err = detail::get_sequence_3(it, end, cp); + break; + case 4: + err = detail::get_sequence_4(it, end, cp); + break; + default: + std::unreachable(); } if (err == utf_error::OK) { @@ -363,10 +462,12 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) code_point = cp; ++it; return utf_error::OK; - } else + } else { err = utf_error::OVERLONG_SEQUENCE; - } else + } + } else { err = utf_error::INVALID_CODE_POINT; + } } // Failure branch - restore the original value of the iterator @@ -395,142 +496,36 @@ constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point) >) { // Check the edge case: - if (it == end) - return utf_error::NOT_ENOUGH_ROOM; + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + // Save the original value of it so we can go back in case of failure // Of course, it does not make much sense with i.e. stream iterators - It original_it = it; + It const original_it = it; - utf_error err = utf_error::OK; - - const char16_t first_word = *it++; + char16_t const first_word = *it++; if (!detail::is_surrogate(first_word)) { code_point = first_word; return utf_error::OK; - } else { - if (it == end) - err = utf_error::NOT_ENOUGH_ROOM; - else if (detail::is_lead_surrogate(first_word)) { - const char16_t second_word = *it++; - if (detail::is_trail_surrogate(static_cast(second_word))) { - code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; - return utf_error::OK; - } else - err = utf_error::INCOMPLETE_SEQUENCE; - - } else { - err = utf_error::INVALID_LEAD; - } } - // error branch - it = original_it; - return err; -} - -template> - requires std::output_iterator -constexpr OutIt append(char32_t cp, OutIt result) noexcept -{ - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); + if (it == end) { + it = original_it; + return utf_error::NOT_ENOUGH_SPACE; } - return result; -} - -template -constexpr std::back_insert_iterator append(char32_t cp, std::back_insert_iterator result) - noexcept(noexcept(detail::append, class container_type::value_type>(cp, result))) - { - return detail::append, class container_type::value_type>(cp, result); - } - - template It> - constexpr It append16(char32_t cp, It result) - noexcept(noexcept(*result++ = std::declval())) - { - if (detail::is_in_bmp(cp)) - *(result++) = static_cast(cp); - else { - // Code points from the supplementary planes are encoded via surrogate pairs - *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); - *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + if (detail::is_lead_surrogate(first_word)) { + char16_t const second_word = *it++; + if (detail::is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return utf_error::OK; } - return result; - } - -} // detail - -// Base for the exceptions that may be thrown from the library -class exception : public ::std::exception -{ -}; - -// Exceptions that may be thrown from the library functions. -class invalid_code_point : public exception -{ - char32_t cp; - -public: - explicit invalid_code_point(char32_t codepoint) - : cp(codepoint) - { - } - virtual const char* what() const noexcept override { return "Invalid code point"; } - [[nodiscard]] char32_t code_point() const noexcept { return cp; } -}; - -class invalid_utf8 : public exception -{ - char8_t u8; - -public: - explicit invalid_utf8(char c) - : u8(static_cast(c)) - { - } - explicit invalid_utf8(char8_t u) - : u8(u) - { - } - virtual const char* what() const noexcept override { return "Invalid UTF-8"; } - [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; } -}; - -class invalid_utf16 : public exception -{ - char16_t u16; - -public: - explicit invalid_utf16(char16_t u) - : u16(u) - { + it = original_it; + return utf_error::INCOMPLETE_SEQUENCE; } - virtual const char* what() const noexcept override { return "Invalid UTF-16"; } - [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } -}; - -class not_enough_room : public exception -{ -public: - virtual const char* what() const noexcept override { return "Not enough space"; } -}; -/// The library API - functions intended to be called by the users + it = original_it; + return utf_error::INVALID_LEAD; +} -// Byte order mark -constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; +} // detail template Se> [[nodiscard]] constexpr It find_invalid(It it, Se se) @@ -538,8 +533,9 @@ template Se> { while (it != se) { detail::utf_error err_code = detail::validate_next(it, se); - if (err_code != detail::utf_error::OK) + if (err_code != detail::utf_error::OK) { return it; + } } return it; } @@ -548,21 +544,21 @@ template Se> noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) { std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + return invalid == s.end() ? std::string_view::npos : static_cast(invalid - s.begin()); } [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) noexcept(noexcept(unicode::find_invalid(s.begin(), s.end()))) { std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::u8string_view::npos : static_cast(invalid - s.begin()); + return invalid == s.end() ? std::u8string_view::npos : static_cast(invalid - s.begin()); } template Se> [[nodiscard]] constexpr bool is_valid(It it, Se se) noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v) { - return (unicode::find_invalid(it, se) == se); + return unicode::find_invalid(it, se) == se; } [[nodiscard]] constexpr bool is_valid(std::string_view s) @@ -581,7 +577,10 @@ template Se> [[nodiscard]] constexpr bool starts_with_bom(It it, Se end) noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v) { - return (((it != end) && (detail::mask8(*it++)) == bom[0]) && ((it != end) && (detail::mask8(*it++)) == bom[1]) && ((it != end) && (detail::mask8(*it)) == bom[2])); + return + (it != end && (detail::mask8(*it++)) == bom[0]) && + (it != end && (detail::mask8(*it++)) == bom[1]) && + (it != end && (detail::mask8(*it)) == bom[2]); } [[nodiscard]] constexpr bool starts_with_bom(std::string_view s) @@ -596,73 +595,154 @@ template Se> return unicode::starts_with_bom(s.begin(), s.end()); } -template -constexpr OutIt append(char32_t cp, OutIt result) + +template +constexpr OutIt append8(char32_t cp, OutIt out) { - if (!detail::is_code_point_valid(cp)) - throw invalid_code_point(cp); + if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp); - return detail::append(cp, result); + using octet_type = detail::select_output_value_type::type; + + if (cp < 0x80) { // one octet + *out++ = static_cast(cp); + } else if (cp < 0x800) { // two octets + *out++ = static_cast((cp >> 6) | 0xc0); + *out++ = static_cast((cp & 0x3f) | 0x80); + } else if (cp < 0x10000) { // three octets + *out++ = static_cast((cp >> 12) | 0xe0); + *out++ = static_cast(((cp >> 6) & 0x3f) | 0x80); + *out++ = static_cast((cp & 0x3f) | 0x80); + } else { // four octets + *out++ = static_cast((cp >> 18) | 0xf0); + *out++ = static_cast(((cp >> 12) & 0x3f) | 0x80); + *out++ = static_cast(((cp >> 6) & 0x3f) | 0x80); + *out++ = static_cast((cp & 0x3f) | 0x80); + } + return out; } -constexpr void append(char32_t cp, std::string& s) +template +constexpr OutIt append16(char32_t cp, OutIt out) { - unicode::append(cp, std::back_inserter(s)); + if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp); + + if (detail::is_in_bmp(cp)) { + *out++ = static_cast(cp); + } else { + // Code points from the supplementary planes are encoded via surrogate pairs + *out++ = static_cast(detail::LEAD_OFFSET + (cp >> 10)); + *out++ = static_cast(detail::TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return out; } -constexpr void append(char32_t cp, std::u8string& s) +// Forwards automatically based on `sizeof(value_type)`, but overload may become +// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`. +template +constexpr OutIt append(char32_t cp, OutIt out) +{ + return unicode::append8(cp, std::move(out)); +} +template +constexpr OutIt append(char32_t cp, OutIt out) { - unicode::append(cp, std::back_inserter(s)); + return unicode::append16(cp, std::move(out)); } -template // TODO: add constraints -constexpr It append16(char32_t cp, It result) + +template + requires octet_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append8(char32_t cp, OutR&& r) { - if (!detail::is_code_point_valid(cp)) - throw invalid_code_point(cp); + return std::ranges::subrange{ + unicode::append8(cp, std::ranges::begin(r)), std::ranges::end(r) + }; +} - return detail::append16(cp, result); +template + requires utf16_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append16(char32_t cp, OutR&& r) +{ + return std::ranges::subrange{ + unicode::append16(cp, std::ranges::begin(r)), std::ranges::end(r) + }; +} + +// Forwards automatically based on `sizeof(value_type)`, but overload may become +// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`. +template + requires octet_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append(char32_t cp, OutR&& r) +{ + return unicode::append8(cp, std::forward(r)); +} +template + requires utf16_output_range> +constexpr std::ranges::subrange>, std::ranges::sentinel_t>> +append(char32_t cp, OutR&& r) +{ + return unicode::append16(cp, std::forward(r)); } -constexpr void append16(char32_t cp, std::u16string& s) +constexpr void append(char32_t cp, std::string& str) { - unicode::append16(cp, std::back_inserter(s)); + unicode::append8(cp, std::back_inserter(str)); } -template Se, class Out> // TODO: add constraints +constexpr void append(char32_t cp, std::u8string& str) +{ + unicode::append8(cp, std::back_inserter(str)); +} + +constexpr void append(char32_t cp, std::u16string& str) +{ + unicode::append16(cp, std::back_inserter(str)); +} + +template Se, octet_output_iterator Out> constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement) { while (start != end) { - It sequence_start = start; - detail::utf_error err_code = detail::validate_next(start, end); - switch (err_code) { - case detail::utf_error::OK: - for (It it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case detail::utf_error::NOT_ENOUGH_ROOM: - out = unicode::append(replacement, out); - start = end; - break; - case detail::utf_error::INVALID_LEAD: - out = unicode::append(replacement, out); - ++start; - break; - case detail::utf_error::INCOMPLETE_SEQUENCE: - case detail::utf_error::OVERLONG_SEQUENCE: - case detail::utf_error::INVALID_CODE_POINT: - out = unicode::append(replacement, out); + It const sequence_start = start; + switch (detail::validate_next(start, end)) { + case detail::utf_error::OK: + for (It it = sequence_start; it != start; ++it) { + *out++ = *it; + } + break; + + case detail::utf_error::NOT_ENOUGH_SPACE: + out = unicode::append8(replacement, out); + start = end; + break; + + case detail::utf_error::INVALID_LEAD: + out = unicode::append8(replacement, out); + ++start; + break; + + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + case detail::utf_error::INVALID_CODE_POINT: + out = unicode::append8(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && detail::is_trail(*start)) { ++start; - // just one replacement mark for the sequence - while (start != end && detail::is_trail(*start)) - ++start; - break; + } + break; + + default: + std::unreachable(); } } return out; } -template Se, class Out> // TODO: add constraints +template Se, octet_output_iterator Out> constexpr Out replace_invalid(It start, Se end, Out out) { constexpr char32_t replacement_marker = static_cast(detail::mask16(0xfffd)); @@ -672,47 +752,48 @@ constexpr Out replace_invalid(It start, Se end, Out out) [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) { std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); return result; } [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) { std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); return result; } [[nodiscard]] constexpr std::string replace_invalid(std::string_view s) { std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result)); return result; } [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) { std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result)); return result; } template Se> [[nodiscard]] constexpr char32_t next(It& it, Se end) { - char32_t cp = 0; - detail::utf_error err_code = detail::validate_next(it, end, cp); - switch (err_code) { - case detail::utf_error::OK: - break; - case detail::utf_error::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case detail::utf_error::INVALID_LEAD: - case detail::utf_error::INCOMPLETE_SEQUENCE: - case detail::utf_error::OVERLONG_SEQUENCE: - throw invalid_utf8(static_cast(*it)); - case detail::utf_error::INVALID_CODE_POINT: - throw invalid_code_point(cp); + char32_t cp = 0; + switch (detail::validate_next(it, end, cp)) { + case detail::utf_error::OK: + break; + case detail::utf_error::NOT_ENOUGH_SPACE: + throw not_enough_space(); + case detail::utf_error::INVALID_LEAD: + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + case detail::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); + default: + std::unreachable(); } return cp; } @@ -722,8 +803,9 @@ template Se> { char32_t cp = 0; detail::utf_error err_code = detail::validate_next16(it, end, cp); - if (err_code == detail::utf_error::NOT_ENOUGH_ROOM) - throw not_enough_room(); + if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) { + throw not_enough_space(); + } return cp; } @@ -737,50 +819,53 @@ template Se> [[nodiscard]] constexpr char32_t prev(It& it, Se start) { // can't do much if it == start - if (it == start) - throw not_enough_room(); + if (it == start) throw not_enough_space(); It end = it; // Go back until we hit either a lead octet or start - while (detail::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence + while (detail::is_trail(*(--it))) { + if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence + } return unicode::peek_next(it, end); } template Se, class distance_type> constexpr void advance(It& it, distance_type n, Se end) { - const distance_type zero(0); + constexpr distance_type zero(0); if (n < zero) { // backward - for (distance_type i = n; i < zero; ++i) + for (distance_type i = n; i < zero; ++i) { (void)unicode::prev(it, end); + } } else { // forward - for (distance_type i = zero; i < n; ++i) + for (distance_type i = zero; i < n; ++i) { (void)unicode::next(it, end); + } } } template Se> -[[nodiscard]] constexpr class std::iterator_traits::difference_type distance(It first, Se last) +[[nodiscard]] constexpr typename std::iterator_traits::difference_type +distance(It first, Se last) { - class std::iterator_traits::difference_type dist; - for (dist = 0; first != last; ++dist) + typename std::iterator_traits::difference_type dist; + for (dist = 0; first != last; ++dist) { (void)unicode::next(first, last); + } return dist; } -template Se, class OutIt> // TODO: add constraints -constexpr OutIt utf16to8(It start, Se end, OutIt result) +template Se, octet_output_iterator OutIt> +constexpr OutIt utf16to8(It start, Se end, OutIt out) { while (start != end) { char32_t cp = static_cast(detail::mask16(*start++)); // Take care of surrogate pairs first if (detail::is_lead_surrogate(cp)) { if (start != end) { - const char32_t trail_surrogate = static_cast(detail::mask16(*start++)); + char32_t const trail_surrogate = static_cast(detail::mask16(*start++)); if (detail::is_trail_surrogate(trail_surrogate)) cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET; else @@ -793,9 +878,9 @@ constexpr OutIt utf16to8(It start, Se end, OutIt result) else if (detail::is_trail_surrogate(cp)) throw invalid_utf16(static_cast(cp)); - result = unicode::append(cp, result); + out = unicode::append8(cp, out); } - return result; + return out; } [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) @@ -812,18 +897,19 @@ constexpr OutIt utf16to8(It start, Se end, OutIt result) return result; } -template Se, class OutIt> // TODO: add constraints -constexpr OutIt utf8to16(It start, Se end, OutIt result) +template Se, utf16_output_iterator OutIt> +constexpr OutIt utf8to16(It start, Se end, OutIt out) { while (start != end) { - const char32_t cp = unicode::next(start, end); + char32_t const cp = unicode::next(start, end); if (cp > 0xffff) { // make a surrogate pair - *result++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); - } else - *result++ = static_cast(cp); + *out++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); + *out++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); + } else { + *out++ = static_cast(cp); + } } - return result; + return out; } [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) @@ -840,13 +926,13 @@ constexpr OutIt utf8to16(It start, Se end, OutIt result) return result; } -template Se, class OutIt> // TODO: add constraints -constexpr OutIt utf32to8(It start, Se end, OutIt result) +template Se, octet_output_iterator OutIt> +constexpr OutIt utf32to8(It start, Se end, OutIt out) { - while (start != end) - result = unicode::append(*(start++), result); - - return result; + while (start != end) { + out = unicode::append8(*start++, out); + } + return out; } [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) @@ -864,12 +950,12 @@ constexpr OutIt utf32to8(It start, Se end, OutIt result) } template Se, class OutIt> -constexpr OutIt utf8to32(It start, Se end, OutIt result) +constexpr OutIt utf8to32(It start, Se end, OutIt out) { - while (start != end) - (*result++) = unicode::next(start, end); - - return result; + while (start != end) { + *out++ = unicode::next(start, end); + } + return out; } [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) @@ -900,17 +986,20 @@ class iterator using reference = char32_t&; using difference_type = std::ptrdiff_t; using iterator_category = std::bidirectional_iterator_tag; + constexpr iterator() requires std::is_default_constructible_v = default; + constexpr explicit iterator(It octet_it, It rangestart, It rangeend) : it(std::move(octet_it)) , range_start(std::move(rangestart)) , range_end(std::move(rangeend)) { if constexpr (std::random_access_iterator) { - if (it < range_start || it > range_end) + if (it < range_start || it > range_end) { throw std::out_of_range("Invalid utf-8 iterator position"); + } } } // the default "big three" are OK diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index abc34a2..27738b8 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -2,66 +2,90 @@ #include +#include #include +#include +#include + +#include namespace iris_unicode_test { namespace unicode = iris::unicode; -using namespace iris::unicode; -using namespace std; +template +constexpr std::array to_array_cast(Chars... cs) +{ + return std::array{ + static_cast(cs)... + }; +} TEST_CASE("append") { - unsigned char u[5] = {0, 0, 0, 0, 0}; - unicode::append(0x0448, u); - EXPECT_EQ (u[0], 0xd1); - EXPECT_EQ (u[1], 0x88); - EXPECT_EQ (u[2], 0); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - unicode::append(0x65e5, u); - EXPECT_EQ (u[0], 0xe6); - EXPECT_EQ (u[1], 0x97); - EXPECT_EQ (u[2], 0xa5); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - unicode::append(0x3044, u); - EXPECT_EQ (u[0], 0xe3); - EXPECT_EQ (u[1], 0x81); - EXPECT_EQ (u[2], 0x84); - EXPECT_EQ (u[3], 0); - EXPECT_EQ (u[4], 0); - - unicode::append(0x10346, u); - EXPECT_EQ (u[0], 0xf0); - EXPECT_EQ (u[1], 0x90); - EXPECT_EQ (u[2], 0x8d); - EXPECT_EQ (u[3], 0x86); - EXPECT_EQ (u[4], 0); + constexpr auto do_test = []() { + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x0448U, u); + return u; + }() == to_array_cast(0xd1, 0x88, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x65e5U, u); + return u; + }() == to_array_cast(0xe6, 0x97, 0xa5, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x3044U, u); + return u; + }() == to_array_cast(0xe3, 0x81, 0x84, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x10346U, u); + return u; + }() == to_array_cast(0xf0, 0x90, 0x8d, 0x86, 0)); + }; + + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); +} + +TEST_CASE("append16") +{ + constexpr auto do_test = []() { + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x0448U, u); + return u; + }() == to_array_cast(0x0448, 0, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x65e5U, u); + return u; + }() == to_array_cast(0x65e5, 0, 0, 0, 0)); + + STATIC_CHECK([] { + std::array u{}; + unicode::append(0x10346U, u); + return u; + }() == to_array_cast(0xd800, 0xdf46, 0, 0, 0)); + }; + + do_test.operator()(); + do_test.operator()(); + do_test.operator()(); } #if 0 -TEST(CheckedAPITests, test_append16) -{ - char16_t u[5] = {0, 0}; - append16(0x0448, u); - EXPECT_EQ (u[0], 0x0448); - EXPECT_EQ (u[1], 0x0000); - - append16(0x65e5, u); - EXPECT_EQ (u[0], 0x65e5); - EXPECT_EQ (u[1], 0x0000); - - append16(0x10346, u); - EXPECT_EQ (u[0], 0xd800); - EXPECT_EQ (u[1], 0xdf46); -} - -TEST(CheckedAPITests, test_next) +TEST_CASE("next") { const char* twochars = "\xe6\x97\xa5\xd1\x88"; const char* w = twochars; @@ -85,7 +109,7 @@ TEST(CheckedAPITests, test_next) EXPECT_EQ (w, threechars + 9); } -TEST(CheckedAPITests, test_next16) +TEST_CASE("next16") { const char16_t u[3] = {0x65e5, 0xd800, 0xdf46}; const char16_t* w = u; @@ -98,14 +122,14 @@ TEST(CheckedAPITests, test_next16) EXPECT_EQ (w, u + 3); } -TEST(CheckedAPITests, test_peek_next) +TEST_CASE("peek_next") { const char* const cw = "\xe6\x97\xa5\xd1\x88"; unsigned int cp = peek_next(cw, cw + 6); EXPECT_EQ (cp, 0x65e5); } -TEST(CheckedAPITests, test_prior) +TEST_CASE("prior") { const char* twochars = "\xe6\x97\xa5\xd1\x88"; const char* w = twochars + 3; @@ -126,7 +150,7 @@ TEST(CheckedAPITests, test_prior) EXPECT_EQ (w, threechars); } -TEST(CheckedAPITests, test_advance) +TEST_CASE("advance") { const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; const char* w = threechars; @@ -142,14 +166,14 @@ TEST(CheckedAPITests, test_advance) EXPECT_EQ(w, threechars); } -TEST(CheckedAPITests, test_distance) +TEST_CASE("distance") { const char* twochars = "\xe6\x97\xa5\xd1\x88"; size_t dist = static_cast(iris::utflib::distance(twochars, twochars + 5)); EXPECT_EQ (dist, 2); } -TEST(CheckedAPITests, test_utf32to8) +TEST_CASE("utf32to8") { char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; string utf8result; @@ -157,7 +181,7 @@ TEST(CheckedAPITests, test_utf32to8) EXPECT_EQ (utf8result.size(), 9); } -TEST(CheckedAPITests, test_utf8to32) +TEST_CASE("utf8to32") { const char* twochars = "\xe6\x97\xa5\xd1\x88"; vector utf32result; @@ -165,7 +189,7 @@ TEST(CheckedAPITests, test_utf8to32) EXPECT_EQ (utf32result.size(), 2); } -TEST(CheckedAPITests, test_utf16to8) +TEST_CASE("utf16to8") { char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; string utf8result; @@ -173,7 +197,7 @@ TEST(CheckedAPITests, test_utf16to8) EXPECT_EQ (utf8result.size(), 10); } -TEST(CheckedAPITests, test_utf8to16) +TEST_CASE("utf8to16") { char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; vector utf16result; @@ -183,7 +207,7 @@ TEST(CheckedAPITests, test_utf8to16) EXPECT_EQ (utf16result[3], 0xdd1e); } -TEST(CheckedAPITests, test_replace_invalid) +TEST_CASE("replace_invalid") { char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; vector replace_invalid_result; @@ -195,7 +219,7 @@ TEST(CheckedAPITests, test_replace_invalid) EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); } -TEST(CheckedAPITests, test_find_invalid) +TEST_CASE("find_invalid") { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); @@ -204,7 +228,7 @@ TEST(CheckedAPITests, test_find_invalid) EXPECT_EQ (invalid, utf_invalid + 5); } -TEST(CheckedAPITests, test_is_valid) +TEST_CASE("is_valid") { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid, utf_invalid + 6); @@ -218,7 +242,7 @@ TEST(CheckedAPITests, test_is_valid) EXPECT_TRUE (bvalid); } -TEST(CheckedAPITests, test_starts_with_bom) +TEST_CASE("starts_with_bom") { unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); @@ -228,7 +252,7 @@ TEST(CheckedAPITests, test_starts_with_bom) EXPECT_FALSE (no_bbom); } -TEST(CheckedIteratrTests, test_increment) +TEST_CASE("increment") { const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; iris::utflib::iterator it(threechars, threechars, threechars + 9); @@ -243,7 +267,7 @@ TEST(CheckedIteratrTests, test_increment) EXPECT_EQ (++it, endit); } -TEST(CheckedIteratrTests, test_decrement) +TEST_CASE("decrement") { const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; iris::utflib::iterator it(threechars+9, threechars, threechars + 9); @@ -254,22 +278,14 @@ TEST(CheckedIteratrTests, test_decrement) EXPECT_EQ (*it, 0x10346); } -TEST(CPP11APITests, test_append16) -{ - u16string u; - append16(0x0448, u); - EXPECT_EQ (u[0], char16_t(0x0448)); - EXPECT_EQ (u.length(), 1); -} - -TEST(CPP11APITests, test_utf16to8) +TEST_CASE("utf16to8") { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; string u = utf16to8(utf16string); EXPECT_EQ (u.size(), 10); } -TEST(CPP11APITests, test_utf8to16) +TEST_CASE("utf8to16") { string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; u16string utf16result = utf8to16(utf8_with_surrogates); @@ -281,28 +297,28 @@ TEST(CPP11APITests, test_utf8to16) EXPECT_EQ(utf8to16("simple"), u"simple"); } -TEST(CPP11APITests, test_utf32to8) +TEST_CASE("utf32to8") { u32string utf32string = {0x448, 0x65E5, 0x10346}; string utf8result = utf32to8(utf32string); EXPECT_EQ (utf8result.size(), 9); } -TEST(CPP11APITests, test_utf8to32) +TEST_CASE("utf8to32") { const char* twochars = "\xe6\x97\xa5\xd1\x88"; u32string utf32result = utf8to32(twochars); EXPECT_EQ (utf32result.size(), 2); } -TEST(CPP11APITests, test_find_invalid) +TEST_CASE("find_invalid") { string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; auto invalid = find_invalid(utf_invalid); EXPECT_EQ (invalid, 5); } -TEST(CPP11APITests, test_is_valid) +TEST_CASE("is_valid") { string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid); @@ -312,7 +328,7 @@ TEST(CPP11APITests, test_is_valid) EXPECT_TRUE (bvalid); } -TEST(CPP11APITests, test_replace_invalid) +TEST_CASE("replace_invalid") { string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; string replace_invalid_result = replace_invalid(invalid_sequence, '?'); @@ -322,7 +338,7 @@ TEST(CPP11APITests, test_replace_invalid) EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); } -TEST(CPP11APITests, test_starts_with_bom) +TEST_CASE("starts_with_bom") { string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; bool bbom = starts_with_bom(byte_order_mark); @@ -333,7 +349,7 @@ TEST(CPP11APITests, test_starts_with_bom) } -TEST(CPP17APITests, test_utf16to8) +TEST_CASE("utf16to8") { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; u16string_view utf16stringview(utf16string); @@ -341,7 +357,7 @@ TEST(CPP17APITests, test_utf16to8) EXPECT_EQ (u.size(), 10); } -TEST(CPP17APITests, test_utf8to16) +TEST_CASE("utf8to16") { string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; u16string utf16result = utf8to16(utf8_with_surrogates); @@ -350,7 +366,7 @@ TEST(CPP17APITests, test_utf8to16) EXPECT_EQ (utf16result[3], 0xdd1e); } -TEST(CPP17APITests, test_utf32to8) +TEST_CASE("utf32to8") { u32string utf32string = {0x448, 0x65E5, 0x10346}; u32string_view utf32stringview(utf32string); @@ -358,21 +374,21 @@ TEST(CPP17APITests, test_utf32to8) EXPECT_EQ (utf8result.size(), 9); } -TEST(CPP17APITests, test_utf8to32) +TEST_CASE("utf8to32") { string_view twochars = "\xe6\x97\xa5\xd1\x88"; u32string utf32result = utf8to32(twochars); EXPECT_EQ (utf32result.size(), 2); } -TEST(CPP17APITests, test_find_invalid) +TEST_CASE("find_invalid") { string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; auto invalid = find_invalid(utf_invalid); EXPECT_EQ (invalid, 5); } -TEST(CPP17APITests, test_is_valid) +TEST_CASE("is_valid") { string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid); @@ -382,7 +398,7 @@ TEST(CPP17APITests, test_is_valid) EXPECT_TRUE (bvalid); } -TEST(CPP17APITests, test_replace_invalid) +TEST_CASE("replace_invalid") { string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; string replace_invalid_result = replace_invalid(invalid_sequence, '?'); @@ -392,7 +408,7 @@ TEST(CPP17APITests, test_replace_invalid) EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); } -TEST(CPP17APITests, test_starts_with_bom) +TEST_CASE("starts_with_bom") { string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; string_view byte_order_mark_view(byte_order_mark); @@ -412,7 +428,7 @@ TEST(CPP17APITests, string_class_and_literals) } -TEST(CPP20APITests, test_utf16tou8) +TEST_CASE("utf16tou8") { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; u16string_view utf16stringview{utf16string}; @@ -431,7 +447,7 @@ TEST(CPP20APITests, tes20t_utf8to16) EXPECT_EQ (utf16result[3], 0xdd1e); } -TEST(CPP20APITests, test_utf32tou8) +TEST_CASE("utf32tou8") { u32string utf32string = {0x448, 0x65E5, 0x10346}; u32string_view utf32stringview{utf32string}; @@ -439,21 +455,21 @@ TEST(CPP20APITests, test_utf32tou8) EXPECT_EQ (utf8result.size(), 9); } -TEST(CPP20APITests, test_utf8to32) +TEST_CASE("utf8to32") { u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); u32string utf32result = utf8to32(twochars); EXPECT_EQ (utf32result.size(), 2); } -TEST(CPP20APITests, test_find_invalid) +TEST_CASE("find_invalid") { u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); auto invalid = find_invalid(utf_invalid); EXPECT_EQ (invalid, 5); } -TEST(CPP20APITests, test_is_valid) +TEST_CASE("is_valid") { u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); bool bvalid = is_valid(utf_invalid); @@ -463,7 +479,7 @@ TEST(CPP20APITests, test_is_valid) EXPECT_TRUE (bvalid); } -TEST(CPP20APITests, test_replace_invalid) +TEST_CASE("replace_invalid") { u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); @@ -473,7 +489,7 @@ TEST(CPP20APITests, test_replace_invalid) EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); } -TEST(CPP20APITests, test_starts_with_bom) +TEST_CASE("starts_with_bom") { u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); bool bbom = starts_with_bom(byte_order_mark); diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp index 665585b..7dd6588 100644 --- a/test/unicode/string/utf8_invalid.cpp +++ b/test/unicode/string/utf8_invalid.cpp @@ -1,6 +1,4 @@ -#include "utf8.h" - -using namespace iris::utflib; +#include #include #include @@ -8,14 +6,16 @@ using namespace iris::utflib; #include using namespace std; +using namespace iris::unicode; const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned); +#if 0 int main(int argc, char** argv) { string test_file_path; - if (argc == 2) + if (argc == 2) test_file_path = argv[1]; else { cout << "Wrong number of arguments" << endl; @@ -59,3 +59,5 @@ int main(int argc, char** argv) } } } + +#endif From d8a8313ba222e1a58c1e9c0fa8fdcdd555c2d867 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 18:42:32 +0900 Subject: [PATCH 05/17] Refactor until `replace_invalid` --- include/iris/unicode/string.hpp | 17 +- test/unicode/string/string.cpp | 459 ++++++++++++++++---------------- 2 files changed, 235 insertions(+), 241 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index 98ed0c3..b9d0520 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -801,7 +801,7 @@ template Se> template Se> [[nodiscard]] constexpr char32_t next16(It& it, Se end) { - char32_t cp = 0; + char32_t cp = 0; detail::utf_error err_code = detail::validate_next16(it, end, cp); if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) { throw not_enough_space(); @@ -823,7 +823,7 @@ template Se> It end = it; // Go back until we hit either a lead octet or start - while (detail::is_trail(*(--it))) { + while (detail::is_trail(*--it)) { if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence } return unicode::peek_next(it, end); @@ -866,18 +866,19 @@ constexpr OutIt utf16to8(It start, Se end, OutIt out) if (detail::is_lead_surrogate(cp)) { if (start != end) { char32_t const trail_surrogate = static_cast(detail::mask16(*start++)); - if (detail::is_trail_surrogate(trail_surrogate)) + if (detail::is_trail_surrogate(trail_surrogate)) { cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET; - else + } else { throw invalid_utf16(static_cast(trail_surrogate)); - } else + } + } else { throw invalid_utf16(static_cast(cp)); + } - } // Lone trail surrogate - else if (detail::is_trail_surrogate(cp)) + } else if (detail::is_trail_surrogate(cp)) { throw invalid_utf16(static_cast(cp)); - + } out = unicode::append8(cp, out); } return out; diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index 27738b8..c7f1dea 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -83,420 +84,412 @@ TEST_CASE("append16") do_test.operator()(); } -#if 0 - TEST_CASE("next") { - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - const char* w = twochars; - unsigned int cp = next(w, twochars + 6); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, twochars + 3); + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + char const* w = twochars; + unsigned int cp = unicode::next(w, twochars + 6); - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(cp == 0x65e5); + CHECK(w == twochars + 3); + + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; w = threechars; - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, threechars + 4); + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x10346); + CHECK(w == threechars + 4); - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, threechars + 7); + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x65e5); + CHECK(w == threechars + 7); - cp = next(w, threechars + 9); - EXPECT_EQ (cp, 0x0448); - EXPECT_EQ (w, threechars + 9); + cp = unicode::next(w, threechars + 9); + CHECK(cp == 0x0448); + CHECK(w == threechars + 9); } TEST_CASE("next16") { - const char16_t u[3] = {0x65e5, 0xd800, 0xdf46}; - const char16_t* w = u; - char32_t cp = next16(w, w + 3); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, u + 1); + char16_t const u[3] = {0x65e5, 0xd800, 0xdf46}; + char16_t const* w = u; + char32_t cp = unicode::next16(w, w + 3); + CHECK(cp == 0x65e5); + CHECK(w == u + 1); - cp = next16(w, w + 2); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, u + 3); + cp = unicode::next16(w, w + 2); + CHECK(cp == 0x10346); + CHECK(w == u + 3); } TEST_CASE("peek_next") { - const char* const cw = "\xe6\x97\xa5\xd1\x88"; - unsigned int cp = peek_next(cw, cw + 6); - EXPECT_EQ (cp, 0x65e5); + char const* const cw = "\xe6\x97\xa5\xd1\x88"; + unsigned int cp = unicode::peek_next(cw, cw + 6); + CHECK(cp == 0x65e5); } -TEST_CASE("prior") +TEST_CASE("prev") { - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - const char* w = twochars + 3; - unsigned int cp = prior (w, twochars); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, twochars); + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + char const* w = twochars + 3; + unsigned int cp = unicode::prev(w, twochars); + CHECK(cp == 0x65e5); + CHECK(w == twochars); - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; w = threechars + 9; - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x0448); - EXPECT_EQ (w, threechars + 7); - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x65e5); - EXPECT_EQ (w, threechars + 4); - cp = prior(w, threechars); - EXPECT_EQ (cp, 0x10346); - EXPECT_EQ (w, threechars); + cp = unicode::prev(w, threechars); + CHECK(cp == 0x0448); + CHECK(w == threechars + 7); + cp = unicode::prev(w, threechars); + CHECK(cp == 0x65e5); + CHECK(w == threechars + 4); + cp = unicode::prev(w, threechars); + CHECK(cp == 0x10346); + CHECK(w == threechars); } TEST_CASE("advance") { - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - const char* w = threechars; - advance(w, 2, threechars + 9); - EXPECT_EQ(w, threechars + 7); - advance(w, -2, threechars); - EXPECT_EQ(w, threechars); - advance(w, 3, threechars + 9); - EXPECT_EQ(w, threechars + 9); - advance(w, -2, threechars); - EXPECT_EQ(w, threechars + 4); - advance(w, -1, threechars); - EXPECT_EQ(w, threechars); + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + char const* w = threechars; + unicode::advance(w, 2, threechars + 9); + CHECK(w == threechars + 7); + unicode::advance(w, -2, threechars); + CHECK(w == threechars); + unicode::advance(w, 3, threechars + 9); + CHECK(w == threechars + 9); + unicode::advance(w, -2, threechars); + CHECK(w == threechars + 4); + unicode::advance(w, -1, threechars); + CHECK(w == threechars); } TEST_CASE("distance") { - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - size_t dist = static_cast(iris::utflib::distance(twochars, twochars + 5)); - EXPECT_EQ (dist, 2); + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::size_t const dist = static_cast(unicode::distance(twochars, twochars + 5)); + CHECK(dist == 2); } -TEST_CASE("utf32to8") +TEST_CASE("replace_invalid (vector)") { - char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; - string utf8result; - iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - EXPECT_EQ (utf8result.size(), 9); -} + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + std::vector replace_invalid_result; -TEST_CASE("utf8to32") -{ - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - vector utf32result; - iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - EXPECT_EQ (utf32result.size(), 2); -} + unicode::replace_invalid(invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + CHECK(unicode::is_valid(replace_invalid_result.begin(), replace_invalid_result.end())); -TEST_CASE("utf16to8") -{ - char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - string utf8result; - iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - EXPECT_EQ (utf8result.size(), 10); + char const fixed_invalid_sequence[] = "a????z"; + CHECK(sizeof(fixed_invalid_sequence) == replace_invalid_result.size()); + CHECK(std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); } -TEST_CASE("utf8to16") +TEST_CASE("replace_invalid (string)") { - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - vector utf16result; - iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); + std::string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + std::string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, '?'); + CHECK(unicode::is_valid(replace_invalid_result)); + + std::string const fixed_invalid_sequence = "a????z"; + CHECK(fixed_invalid_sequence == replace_invalid_result); } -TEST_CASE("replace_invalid") +TEST_CASE("replace_invalid (u8string)") { - char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - vector replace_invalid_result; - replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); - bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); - EXPECT_TRUE (bvalid); - const char fixed_invalid_sequence[] = "a????z"; - EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); - EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); + std::u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); + std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?'); + + CHECK(unicode::is_valid(replace_invalid_result)); + std::u8string const fixed_invalid_sequence = reinterpret_cast("a????z"); + CHECK(fixed_invalid_sequence == replace_invalid_result); } +#if 0 + TEST_CASE("find_invalid") { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); - EXPECT_EQ (invalid, utf_invalid + 5); + char const* invalid = find_invalid(utf_invalid, utf_invalid + 6); + CHECK(invalid == utf_invalid + 5); invalid = utf_invalid + find_invalid(utf_invalid); - EXPECT_EQ (invalid, utf_invalid + 5); + CHECK(invalid == utf_invalid + 5); } TEST_CASE("is_valid") { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - EXPECT_FALSE (bvalid); + CHECK(!bvalid); bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); + CHECK(!bvalid); char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); - EXPECT_TRUE (bvalid); + CHECK(bvalid); bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); + CHECK(bvalid); } TEST_CASE("starts_with_bom") { unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); - EXPECT_TRUE (bbom); - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(bbom); + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); - EXPECT_FALSE (no_bbom); + CHECK(!no_bbom); } TEST_CASE("increment") { - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars, threechars, threechars + 9); - iris::utflib::iterator it2 = it; - EXPECT_EQ (it2, it); - EXPECT_EQ (*it, 0x10346); - EXPECT_EQ (*(++it), 0x65e5); - EXPECT_EQ ((*it++), 0x65e5); - EXPECT_EQ (*it, 0x0448); + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars, threechars, threechars + 9); + iris::utflib::iterator it2 = it; + CHECK(it2 == it); + CHECK(*it == 0x10346); + CHECK(*(++it) == 0x65e5); + CHECK((*it++) == 0x65e5); + CHECK(*it == 0x0448); EXPECT_NE (it, it2); - iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); - EXPECT_EQ (++it, endit); + iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); + CHECK(++it == endit); } TEST_CASE("decrement") { - const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars+9, threechars, threechars + 9); - EXPECT_EQ (*(--it), 0x0448); - EXPECT_EQ ((*it--), 0x0448); - EXPECT_EQ (*it, 0x65e5); - EXPECT_EQ (--it, iris::utflib::iterator(threechars, threechars, threechars + 9)); - EXPECT_EQ (*it, 0x10346); + char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + iris::utflib::iterator it(threechars+9, threechars, threechars + 9); + CHECK(*(--it) == 0x0448); + CHECK((*it--) == 0x0448); + CHECK(*it == 0x65e5); + CHECK(--it == iris::utflib::iterator(threechars, threechars, threechars + 9)); + CHECK(*it == 0x10346); +} + +TEST_CASE("utf32to8") +{ + char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + std::string utf8result; + iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + CHECK(utf8result.size() == 9); +} + +TEST_CASE("utf8to32") +{ + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::vector utf32result; + iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + CHECK(utf32result.size() == 2); +} + +TEST_CASE("utf16to8") +{ + char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string utf8result; + iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + CHECK(utf8result.size() == 10); +} + +TEST_CASE("utf8to16") +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::vector utf16result; + iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); } TEST_CASE("utf16to8") { - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - string u = utf16to8(utf16string); - EXPECT_EQ (u.size(), 10); + std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string u = utf16to8(utf16string); + CHECK(u.size() == 10); } TEST_CASE("utf8to16") { - string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); - // Just to make sure it compiles with string literals - EXPECT_EQ(utf8to16(u8"simple"), u"simple"); - EXPECT_EQ(utf8to16("simple"), u"simple"); + std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string utf16result = utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + // Just to make sure it compiles with std::string literals + CHECK(utf8to16(u8"simple") == u"simple"); + CHECK(utf8to16("simple") == u"simple"); } TEST_CASE("utf32to8") { - u32string utf32string = {0x448, 0x65E5, 0x10346}; - string utf8result = utf32to8(utf32string); - EXPECT_EQ (utf8result.size(), 9); + std::u32string utf32string = {0x448, 0x65E5, 0x10346}; + std::string utf8result = utf32to8(utf32string); + CHECK(utf8result.size() == 9); } TEST_CASE("utf8to32") { - const char* twochars = "\xe6\x97\xa5\xd1\x88"; - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); + char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string utf32result = utf8to32(twochars); + CHECK(utf32result.size() == 2); } TEST_CASE("find_invalid") { - string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); + CHECK(invalid == 5); } TEST_CASE("is_valid") { - string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + CHECK(!bvalid); + std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST_CASE("replace_invalid") -{ - string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - string replace_invalid_result = replace_invalid(invalid_sequence, '?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const string fixed_invalid_sequence = "a????z"; - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); + CHECK(bvalid); } TEST_CASE("starts_with_bom") { - string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; bool bbom = starts_with_bom(byte_order_mark); - EXPECT_TRUE (bbom); - string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(bbom); + std::string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); + CHECK(!no_bbom); } TEST_CASE("utf16to8") { - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; u16string_view utf16stringview(utf16string); - string u = utf16to8(utf16stringview); - EXPECT_EQ (u.size(), 10); + std::string u = utf16to8(utf16stringview); + CHECK(u.size() == 10); } TEST_CASE("utf8to16") { - string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); + std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string utf16result = utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); } TEST_CASE("utf32to8") { - u32string utf32string = {0x448, 0x65E5, 0x10346}; + std::u32string utf32string = {0x448, 0x65E5, 0x10346}; u32string_view utf32stringview(utf32string); - string utf8result = utf32to8(utf32stringview); - EXPECT_EQ (utf8result.size(), 9); + std::string utf8result = utf32to8(utf32stringview); + CHECK(utf8result.size() == 9); } TEST_CASE("utf8to32") { - string_view twochars = "\xe6\x97\xa5\xd1\x88"; - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); + std::string_view twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string utf32result = utf8to32(twochars); + CHECK(utf32result.size() == 2); } TEST_CASE("find_invalid") { - string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); + CHECK(invalid == 5); } TEST_CASE("is_valid") { - string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + CHECK(!bvalid); + std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST_CASE("replace_invalid") -{ - string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; - string replace_invalid_result = replace_invalid(invalid_sequence, '?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const string fixed_invalid_sequence = "a????z"; - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); + CHECK(bvalid); } TEST_CASE("starts_with_bom") { - string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - string_view byte_order_mark_view(byte_order_mark); + std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + std::string_view byte_order_mark_view(byte_order_mark); bool bbom = starts_with_bom(byte_order_mark_view); - EXPECT_TRUE (bbom); - string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(bbom); + std::string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); + CHECK(!no_bbom); } TEST(CPP17APITests, string_class_and_literals) { - const char* twochars = "ab"; - EXPECT_TRUE (is_valid(twochars)); - const string two_chars_string(twochars); - EXPECT_TRUE (is_valid(two_chars_string)); + char const* twochars = "ab"; + CHECK(is_valid(twochars)); + std::string const two_chars_string(twochars); + CHECK(is_valid(two_chars_string)); } TEST_CASE("utf16tou8") { - u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; u16string_view utf16stringview{utf16string}; - u8string u = utf16tou8(utf16string); - EXPECT_EQ (u.size(), 10); + std::u8string u = utf16tou8(utf16string); + CHECK(u.size() == 10); u = utf16tou8(utf16stringview); - EXPECT_EQ (u.size(), 10); + CHECK(u.size() == 10); } -TEST(CPP20APITests, tes20t_utf8to16) +TEST_CASE("utf8to16") { - u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; - u16string utf16result = utf8to16(utf8_with_surrogates); - EXPECT_EQ (utf16result.size(), 4); - EXPECT_EQ (utf16result[2], 0xd834); - EXPECT_EQ (utf16result[3], 0xdd1e); + std::u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; + std::u16string utf16result = utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); } TEST_CASE("utf32tou8") { - u32string utf32string = {0x448, 0x65E5, 0x10346}; + std::u32string utf32string = {0x448, 0x65E5, 0x10346}; u32string_view utf32stringview{utf32string}; - u8string utf8result = utf32tou8(utf32stringview); - EXPECT_EQ (utf8result.size(), 9); + std::u8string utf8result = utf32tou8(utf32stringview); + CHECK(utf8result.size() == 9); } TEST_CASE("utf8to32") { - u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); - u32string utf32result = utf8to32(twochars); - EXPECT_EQ (utf32result.size(), 2); + std::u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); + std::u32string utf32result = utf8to32(twochars); + CHECK(utf32result.size() == 2); } TEST_CASE("find_invalid") { - u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + std::u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); auto invalid = find_invalid(utf_invalid); - EXPECT_EQ (invalid, 5); + CHECK(invalid == 5); } TEST_CASE("is_valid") { - u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); + std::u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); bool bvalid = is_valid(utf_invalid); - EXPECT_FALSE (bvalid); - u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); + CHECK(!bvalid); + std::u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); bvalid = is_valid(utf8_with_surrogates); - EXPECT_TRUE (bvalid); -} - -TEST_CASE("replace_invalid") -{ - u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); - u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); - bool bvalid = is_valid(replace_invalid_result); - EXPECT_TRUE (bvalid); - const u8string fixed_invalid_sequence = reinterpret_cast("a????z"); - EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); + CHECK(bvalid); } TEST_CASE("starts_with_bom") { - u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); + std::u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); bool bbom = starts_with_bom(byte_order_mark); - EXPECT_TRUE (bbom); - u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); + CHECK(bbom); + std::u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); bool no_bbom = starts_with_bom(threechars); - EXPECT_FALSE (no_bbom); + CHECK(!no_bbom); } #endif From 0e2952541c6cde9dfa1e55267d48daf90cb8f585 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 19:50:12 +0900 Subject: [PATCH 06/17] Refactor until `is_valid` --- test/unicode/string/string.cpp | 68 ++++++++++++---------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index c7f1dea..1bd4ef2 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -173,6 +173,28 @@ TEST_CASE("distance") CHECK(dist == 2); } +TEST_CASE("is_valid") +{ + { + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + CHECK(!unicode::is_valid(utf_invalid)); + CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6)); + } + { + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + CHECK(unicode::is_valid(utf8_with_surrogates)); + CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9)); + } + { + std::u8string const utf_invalid(std::from_range, "\xe6\x97\xa5\xd1\x88\xfa"); + CHECK(!unicode::is_valid(utf_invalid)); + } + { + std::u8string const utf8_with_surrogates(std::from_range, "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); + CHECK(unicode::is_valid(utf8_with_surrogates)); + } +} + TEST_CASE("replace_invalid (vector)") { char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; @@ -217,20 +239,6 @@ TEST_CASE("find_invalid") CHECK(invalid == utf_invalid + 5); } -TEST_CASE("is_valid") -{ - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - CHECK(!bvalid); - bvalid = is_valid(utf_invalid); - CHECK(!bvalid); - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); - CHECK(bvalid); - bvalid = is_valid(utf8_with_surrogates); - CHECK(bvalid); -} - TEST_CASE("starts_with_bom") { unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; @@ -341,16 +349,6 @@ TEST_CASE("find_invalid") CHECK(invalid == 5); } -TEST_CASE("is_valid") -{ - std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid); - CHECK(!bvalid); - std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates); - CHECK(bvalid); -} - TEST_CASE("starts_with_bom") { std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; @@ -401,16 +399,6 @@ TEST_CASE("find_invalid") CHECK(invalid == 5); } -TEST_CASE("is_valid") -{ - std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - bool bvalid = is_valid(utf_invalid); - CHECK(!bvalid); - std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - bvalid = is_valid(utf8_with_surrogates); - CHECK(bvalid); -} - TEST_CASE("starts_with_bom") { std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; @@ -422,7 +410,7 @@ TEST_CASE("starts_with_bom") CHECK(!no_bbom); } -TEST(CPP17APITests, string_class_and_literals) +TEST_CASE("string_class_and_literals") { char const* twochars = "ab"; CHECK(is_valid(twochars)); @@ -472,16 +460,6 @@ TEST_CASE("find_invalid") CHECK(invalid == 5); } -TEST_CASE("is_valid") -{ - std::u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); - bool bvalid = is_valid(utf_invalid); - CHECK(!bvalid); - std::u8string utf8_with_surrogates = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); - bvalid = is_valid(utf8_with_surrogates); - CHECK(bvalid); -} - TEST_CASE("starts_with_bom") { std::u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); From b45687916744bf2a9aadbd3799ccf1862fd6f142 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 19:57:58 +0900 Subject: [PATCH 07/17] Refactor until `find_invalid` --- test/unicode/string/string.cpp | 67 ++++++++++++++++------------------ 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index 1bd4ef2..bee8ee0 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -175,26 +175,53 @@ TEST_CASE("distance") TEST_CASE("is_valid") { + constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + { - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; CHECK(!unicode::is_valid(utf_invalid)); CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6)); } { - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; CHECK(unicode::is_valid(utf8_with_surrogates)); CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9)); } { - std::u8string const utf_invalid(std::from_range, "\xe6\x97\xa5\xd1\x88\xfa"); - CHECK(!unicode::is_valid(utf_invalid)); + std::u8string const utf_invalid_u8(std::from_range, utf_invalid); + CHECK(!unicode::is_valid(utf_invalid_u8)); } { - std::u8string const utf8_with_surrogates(std::from_range, "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"); + std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates); CHECK(unicode::is_valid(utf8_with_surrogates)); } } +TEST_CASE("find_invalid") +{ + constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + { + char const* invalid = unicode::find_invalid(utf_invalid, utf_invalid + 6); + CHECK(invalid == utf_invalid + 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(utf_invalid); + CHECK(invalid_pos == 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(std::string{utf_invalid}); + CHECK(invalid_pos == 5); + } + { + std::size_t const invalid_pos = unicode::find_invalid(std::string_view{utf_invalid}); + CHECK(invalid_pos == 5); + } + { + std::u8string const utf_invalid_u8(std::from_range, utf_invalid); + std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8); + CHECK(invalid_pos == 5); + } +} + TEST_CASE("replace_invalid (vector)") { char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; @@ -230,15 +257,6 @@ TEST_CASE("replace_invalid (u8string)") #if 0 -TEST_CASE("find_invalid") -{ - char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - char const* invalid = find_invalid(utf_invalid, utf_invalid + 6); - CHECK(invalid == utf_invalid + 5); - invalid = utf_invalid + find_invalid(utf_invalid); - CHECK(invalid == utf_invalid + 5); -} - TEST_CASE("starts_with_bom") { unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; @@ -342,13 +360,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("find_invalid") -{ - std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - auto invalid = find_invalid(utf_invalid); - CHECK(invalid == 5); -} - TEST_CASE("starts_with_bom") { std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; @@ -392,13 +403,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("find_invalid") -{ - std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; - auto invalid = find_invalid(utf_invalid); - CHECK(invalid == 5); -} - TEST_CASE("starts_with_bom") { std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; @@ -453,13 +457,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("find_invalid") -{ - std::u8string utf_invalid = reinterpret_cast("\xe6\x97\xa5\xd1\x88\xfa"); - auto invalid = find_invalid(utf_invalid); - CHECK(invalid == 5); -} - TEST_CASE("starts_with_bom") { std::u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); From 2a43f6bac23cb988d490c002de78d0c89cea65a0 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 20:59:31 +0900 Subject: [PATCH 08/17] Refactor until `starts_with_bom` --- include/iris/unicode/string.hpp | 26 +++++++++++---- test/unicode/string/string.cpp | 57 +++++++++------------------------ 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index b9d0520..1bada79 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -40,7 +40,8 @@ DEALINGS IN THE SOFTWARE. namespace iris::unicode { -constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; +template +constexpr T bom[] = {static_cast(0xef), static_cast(0xbb), static_cast(0xbf)}; template concept octet = std::integral && sizeof(T) == 1; @@ -67,6 +68,12 @@ template concept utf32_input_iterator = std::input_iterator && utf32char>; +template +concept octet_input_range = + std::ranges::input_range && + octet_input_iterator>; + + namespace detail { template @@ -256,12 +263,12 @@ enum class utf_error template [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept { - return static_cast(0xff & oc); + return static_cast(oc & 0xff); } [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept { - return static_cast(0xffff & oc); + return static_cast(oc & 0xffff); } template @@ -578,9 +585,16 @@ template Se> noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v) { return - (it != end && (detail::mask8(*it++)) == bom[0]) && - (it != end && (detail::mask8(*it++)) == bom[1]) && - (it != end && (detail::mask8(*it)) == bom[2]); + (it != end && detail::mask8(*it++) == bom[0]) && + (it != end && detail::mask8(*it++) == bom[1]) && + (it != end && detail::mask8(*it) == bom[2]); +} + +template +[[nodiscard]] constexpr bool starts_with_bom(R&& r) + noexcept(noexcept(unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r)))) +{ + return unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r)); } [[nodiscard]] constexpr bool starts_with_bom(std::string_view s) diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index bee8ee0..47100ff 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -247,26 +247,31 @@ TEST_CASE("replace_invalid (string)") TEST_CASE("replace_invalid (u8string)") { - std::u8string invalid_sequence = reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); + std::u8string const invalid_sequence(std::from_range, "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?'); CHECK(unicode::is_valid(replace_invalid_result)); - std::u8string const fixed_invalid_sequence = reinterpret_cast("a????z"); + std::u8string const fixed_invalid_sequence(std::from_range, "a????z"); CHECK(fixed_invalid_sequence == replace_invalid_result); } -#if 0 - TEST_CASE("starts_with_bom") { - unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); - CHECK(bbom); - char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); - CHECK(!no_bbom); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + CHECK(unicode::starts_with_bom(unicode::bom)); + + constexpr char threechars[] = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + CHECK(!unicode::starts_with_bom(threechars)); + CHECK(!unicode::starts_with_bom(std::string{threechars})); + CHECK(!unicode::starts_with_bom(std::string_view{threechars})); + CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars})); } +#if 0 + TEST_CASE("increment") { char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; @@ -360,17 +365,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("starts_with_bom") -{ - std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - bool bbom = starts_with_bom(byte_order_mark); - CHECK(bbom); - std::string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars); - CHECK(!no_bbom); -} - - TEST_CASE("utf16to8") { std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; @@ -403,17 +397,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("starts_with_bom") -{ - std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; - std::string_view byte_order_mark_view(byte_order_mark); - bool bbom = starts_with_bom(byte_order_mark_view); - CHECK(bbom); - std::string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - bool no_bbom = starts_with_bom(threechars); - CHECK(!no_bbom); -} - TEST_CASE("string_class_and_literals") { char const* twochars = "ab"; @@ -457,16 +440,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("starts_with_bom") -{ - std::u8string byte_order_mark = reinterpret_cast("\xef\xbb\xbf"); - bool bbom = starts_with_bom(byte_order_mark); - CHECK(bbom); - std::u8string threechars = reinterpret_cast("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); - bool no_bbom = starts_with_bom(threechars); - CHECK(!no_bbom); -} - #endif } // iris_unicode_test From 6ca87b1300d5bb220992382e0313eb1faf5b91e8 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 21:13:26 +0900 Subject: [PATCH 09/17] Refactor until `increment`/`decrement` --- include/iris/unicode/string.hpp | 53 +++++++++++++++++++-------------- test/unicode/string/string.cpp | 28 ++++++++--------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index 1bada79..d54b1ff 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -38,6 +38,8 @@ DEALINGS IN THE SOFTWARE. #include #include +#include + namespace iris::unicode { template @@ -434,11 +436,11 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) // Save the original value of it so we can go back in case of failure // Of course, it does not make much sense with i.e. stream iterators - It original_it = it; + It const original_it = it; char32_t cp = 0; // Determine the sequence length based on the lead octet - const int length = detail::sequence_length(it); + int const length = detail::sequence_length(it); // Get trail octets and calculate the code point utf_error err{}; @@ -460,26 +462,24 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) default: std::unreachable(); } + if (err != utf_error::OK) { + it = original_it; + return err; + } - if (err == utf_error::OK) { - // Decoding succeeded. Now, security checks... - if (detail::is_code_point_valid(cp)) { - if (!detail::is_overlong_sequence(cp, length)) { - // Passed! Return here. - code_point = cp; - ++it; - return utf_error::OK; - } else { - err = utf_error::OVERLONG_SEQUENCE; - } - } else { - err = utf_error::INVALID_CODE_POINT; + if (detail::is_code_point_valid(cp)) { + if (!detail::is_overlong_sequence(cp, length)) { + code_point = cp; + ++it; + return utf_error::OK; } + + it = original_it; + return utf_error::OVERLONG_SEQUENCE; } - // Failure branch - restore the original value of the iterator it = original_it; - return err; + return utf_error::INVALID_CODE_POINT; } template Se> @@ -798,14 +798,18 @@ template Se> switch (detail::validate_next(it, end, cp)) { case detail::utf_error::OK: break; + case detail::utf_error::NOT_ENOUGH_SPACE: throw not_enough_space(); + case detail::utf_error::INVALID_LEAD: case detail::utf_error::INCOMPLETE_SEQUENCE: case detail::utf_error::OVERLONG_SEQUENCE: throw invalid_utf8(static_cast(*it)); + case detail::utf_error::INVALID_CODE_POINT: throw invalid_code_point(cp); + default: std::unreachable(); } @@ -1017,35 +1021,40 @@ class iterator } } } - // the default "big three" are OK + [[nodiscard]] constexpr It base() const { return it; } + [[nodiscard]] constexpr char32_t operator*() const { It temp = it; return unicode::next(temp, range_end); } - [[nodiscard]] constexpr bool operator==(const iterator& rhs) const + + [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); + assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed"); + return it == rhs.it; } + constexpr iterator& operator++() { (void)unicode::next(it, range_end); return *this; } + constexpr iterator operator++(int) { iterator temp = *this; (void)unicode::next(it, range_end); return temp; } + constexpr iterator& operator--() { (void)unicode::prev(it, range_start); return *this; } + constexpr iterator operator--(int) { iterator temp = *this; diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index 47100ff..fba0bda 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -270,34 +270,34 @@ TEST_CASE("starts_with_bom") CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars})); } -#if 0 - TEST_CASE("increment") { - char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars, threechars, threechars + 9); - iris::utflib::iterator it2 = it; + constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + unicode::iterator it(threechars, threechars, threechars + 9); + unicode::iterator it2 = it; CHECK(it2 == it); CHECK(*it == 0x10346); - CHECK(*(++it) == 0x65e5); - CHECK((*it++) == 0x65e5); + CHECK(*++it == 0x65e5); + CHECK(*it++ == 0x65e5); CHECK(*it == 0x0448); - EXPECT_NE (it, it2); - iris::utflib::iterator endit (threechars + 9, threechars, threechars + 9); + CHECK(it != it2); + unicode::iterator endit(threechars + 9, threechars, threechars + 9); CHECK(++it == endit); } TEST_CASE("decrement") { - char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - iris::utflib::iterator it(threechars+9, threechars, threechars + 9); - CHECK(*(--it) == 0x0448); - CHECK((*it--) == 0x0448); + constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + unicode::iterator it(threechars + 9, threechars, threechars + 9); + CHECK(*--it == 0x0448); + CHECK(*it-- == 0x0448); CHECK(*it == 0x65e5); - CHECK(--it == iris::utflib::iterator(threechars, threechars, threechars + 9)); + CHECK(--it == unicode::iterator(threechars, threechars, threechars + 9)); CHECK(*it == 0x10346); } +#if 0 + TEST_CASE("utf32to8") { char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; From 8f06ee4a048aff92e49c8a7fe9f0de6adc6f142b Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 21:15:08 +0900 Subject: [PATCH 10/17] Refactor until "string_class_and_literals" --- test/unicode/string/string.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index fba0bda..b25aca6 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -194,6 +194,14 @@ TEST_CASE("is_valid") std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates); CHECK(unicode::is_valid(utf8_with_surrogates)); } + + { + constexpr char const* twochars = "ab"; + CHECK(unicode::is_valid(twochars)); + + std::string const two_chars_string(twochars); + CHECK(unicode::is_valid(two_chars_string)); + } } TEST_CASE("find_invalid") @@ -397,15 +405,6 @@ TEST_CASE("utf8to32") CHECK(utf32result.size() == 2); } -TEST_CASE("string_class_and_literals") -{ - char const* twochars = "ab"; - CHECK(is_valid(twochars)); - std::string const two_chars_string(twochars); - CHECK(is_valid(two_chars_string)); -} - - TEST_CASE("utf16tou8") { std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; From 96d1476ade49622c3dddc0e1f9a96bc7ce3b3fc6 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 23:11:29 +0900 Subject: [PATCH 11/17] Refactor until string conversion --- include/iris/unicode/string.hpp | 143 +++++++++++++----- test/unicode/string/string.cpp | 249 +++++++++++++++++--------------- 2 files changed, 235 insertions(+), 157 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index d54b1ff..eb61690 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -875,6 +875,37 @@ distance(It first, Se last) return dist; } +// -------------------------------- + +template Se, utf16_output_iterator OutIt> +constexpr OutIt utf8to16(It start, Se end, OutIt out) +{ + while (start != end) { + char32_t const cp = unicode::next(start, end); + if (cp > 0xffff) { // make a surrogate pair + *out++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); + *out++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); + } else { + *out++ = static_cast(cp); + } + } + return out; +} + +[[nodiscard]] constexpr std::u16string utf8to16(std::string_view str) +{ + std::u16string result; + unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + +[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view str) +{ + std::u16string result; + unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); + return result; +} + template Se, octet_output_iterator OutIt> constexpr OutIt utf16to8(It start, Se end, OutIt out) { @@ -902,46 +933,40 @@ constexpr OutIt utf16to8(It start, Se end, OutIt out) return out; } -[[nodiscard]] constexpr std::string utf16to8(std::u16string_view s) +[[nodiscard]] constexpr std::string utf16to8(std::u16string_view str) { std::string result; - unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); return result; } -[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s) +[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view str) { std::u8string result; - unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); return result; } -template Se, utf16_output_iterator OutIt> -constexpr OutIt utf8to16(It start, Se end, OutIt out) +template Se, class OutIt> +constexpr OutIt utf8to32(It start, Se end, OutIt out) { while (start != end) { - char32_t const cp = unicode::next(start, end); - if (cp > 0xffff) { // make a surrogate pair - *out++ = static_cast((cp >> 10) + detail::LEAD_OFFSET); - *out++ = static_cast((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN); - } else { - *out++ = static_cast(cp); - } + *out++ = unicode::next(start, end); } return out; } -[[nodiscard]] constexpr std::u16string utf8to16(std::string_view s) +[[nodiscard]] constexpr std::u32string utf8to32(std::string_view str) { - std::u16string result; - unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + std::u32string result; + unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); return result; } -[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s) +[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view str) { - std::u16string result; - unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result)); + std::u32string result; + unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); return result; } @@ -954,44 +979,86 @@ constexpr OutIt utf32to8(It start, Se end, OutIt out) return out; } -[[nodiscard]] constexpr std::string utf32to8(std::u32string_view s) +[[nodiscard]] constexpr std::string utf32to8(std::u32string_view str) { std::string result; - unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); return result; } -[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s) +[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view str) { std::u8string result; - unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result)); + unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); return result; } -template Se, class OutIt> -constexpr OutIt utf8to32(It start, Se end, OutIt out) + +template +[[nodiscard]] constexpr std::basic_string transcode(std::string_view str) { - while (start != end) { - *out++ = unicode::next(start, end); + if constexpr (std::same_as) { + return std::u8string{std::from_range, str}; + } else if constexpr (std::same_as) { + return unicode::utf8to16(str); + } else if constexpr (std::same_as) { + return unicode::utf8to32(str); + } else { + static_assert(std::same_as); + return std::string{str}; } - return out; } -[[nodiscard]] constexpr std::u32string utf8to32(std::string_view s) +template +[[nodiscard]] constexpr std::basic_string transcode(std::u8string_view str) { - std::u32string result; - unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; + if constexpr (std::same_as) { + return std::u8string{str}; + } else if constexpr (std::same_as) { + return unicode::utf8to16(str); + } else if constexpr (std::same_as) { + return unicode::utf8to32(str); + } else { + static_assert(std::same_as); + return std::string{std::from_range, str}; + } } -[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s) -{ - std::u32string result; - unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; +template +[[nodiscard]] constexpr std::basic_string transcode(std::u16string_view str) +{ + if constexpr (std::same_as) { + return unicode::utf16tou8(str); + } else if constexpr (std::same_as) { + return std::u16string{str}; + } else if constexpr (std::same_as) { + static_assert(false, "not implemented"); + return {}; // dummy + //return unicode::utf16to32(str); + } else { + static_assert(std::same_as); + return unicode::utf16to8(str); + } } -// The iterator class +template +[[nodiscard]] constexpr std::basic_string transcode(std::u32string_view str) +{ + if constexpr (std::same_as) { + return unicode::utf32tou8(str); + } else if constexpr (std::same_as) { + static_assert(false, "not implemented"); + return {}; // dummy + //return unicode::utf32to16(str); + } else if constexpr (std::same_as) { + return std::u32string{str}; + } else { + static_assert(std::same_as); + return unicode::utf32to8(str); + } +} + + template class iterator { diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index b25aca6..eab9eb1 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -187,11 +187,11 @@ TEST_CASE("is_valid") CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9)); } { - std::u8string const utf_invalid_u8(std::from_range, utf_invalid); + std::u8string const utf_invalid_u8(reinterpret_cast(utf_invalid)); CHECK(!unicode::is_valid(utf_invalid_u8)); } { - std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates); + std::u8string const utf8_with_surrogates_u8(reinterpret_cast(utf8_with_surrogates)); CHECK(unicode::is_valid(utf8_with_surrogates)); } @@ -224,7 +224,7 @@ TEST_CASE("find_invalid") CHECK(invalid_pos == 5); } { - std::u8string const utf_invalid_u8(std::from_range, utf_invalid); + std::u8string const utf_invalid_u8(reinterpret_cast(utf_invalid)); std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8); CHECK(invalid_pos == 5); } @@ -255,11 +255,11 @@ TEST_CASE("replace_invalid (string)") TEST_CASE("replace_invalid (u8string)") { - std::u8string const invalid_sequence(std::from_range, "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"); + std::u8string const invalid_sequence(reinterpret_cast("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z")); std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?'); CHECK(unicode::is_valid(replace_invalid_result)); - std::u8string const fixed_invalid_sequence(std::from_range, "a????z"); + std::u8string const fixed_invalid_sequence(reinterpret_cast("a????z")); CHECK(fixed_invalid_sequence == replace_invalid_result); } @@ -275,7 +275,7 @@ TEST_CASE("starts_with_bom") CHECK(!unicode::starts_with_bom(threechars)); CHECK(!unicode::starts_with_bom(std::string{threechars})); CHECK(!unicode::starts_with_bom(std::string_view{threechars})); - CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars})); + CHECK(!unicode::starts_with_bom(std::u8string{reinterpret_cast(threechars)})); } TEST_CASE("increment") @@ -304,141 +304,152 @@ TEST_CASE("decrement") CHECK(*it == 0x10346); } -#if 0 - -TEST_CASE("utf32to8") -{ - char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; - std::string utf8result; - iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - CHECK(utf8result.size() == 9); -} - -TEST_CASE("utf8to32") -{ - char const* twochars = "\xe6\x97\xa5\xd1\x88"; - std::vector utf32result; - iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - CHECK(utf32result.size() == 2); -} - -TEST_CASE("utf16to8") -{ - char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - std::string utf8result; - iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - CHECK(utf8result.size() == 10); -} +// ----------------------------------- TEST_CASE("utf8to16") { - char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - std::vector utf16result; - iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - CHECK(utf16result.size() == 4); - CHECK(utf16result[2] == 0xd834); - CHECK(utf16result[3] == 0xdd1e); + { + constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::vector utf16result; + unicode::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } + { + std::string const utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + // Just to make sure it compiles with string literals + CHECK(unicode::utf8to16(u8"simple") == u"simple"); + CHECK(unicode::utf8to16("simple") == u"simple"); + } + { + constexpr std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } + { + std::u8string const utf8_with_surrogates{reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e")}; + std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates); + CHECK(utf16result.size() == 4); + CHECK(utf16result[2] == 0xd834); + CHECK(utf16result[3] == 0xdd1e); + } } TEST_CASE("utf16to8") { - std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - std::string u = utf16to8(utf16string); - CHECK(u.size() == 10); -} - -TEST_CASE("utf8to16") -{ - std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - std::u16string utf16result = utf8to16(utf8_with_surrogates); - CHECK(utf16result.size() == 4); - CHECK(utf16result[2] == 0xd834); - CHECK(utf16result[3] == 0xdd1e); - // Just to make sure it compiles with std::string literals - CHECK(utf8to16(u8"simple") == u"simple"); - CHECK(utf8to16("simple") == u"simple"); + { + constexpr char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string utf8result; + unicode::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + CHECK(utf8result.size() == 10); + } + { + std::u16string const utf16string{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::string const u = unicode::utf16to8(utf16string); + CHECK(u.size() == 10); + } + { + std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::u16string_view const utf16stringview(utf16string); + std::string const u = unicode::utf16to8(utf16stringview); + CHECK(u.size() == 10); + } + { + std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + std::u16string_view const utf16stringview{utf16string}; + { + std::u8string const u = unicode::utf16tou8(utf16string); + CHECK(u.size() == 10); + } + { + std::u8string const u = unicode::utf16tou8(utf16stringview); + CHECK(u.size() == 10); + } + } } -TEST_CASE("utf32to8") -{ - std::u32string utf32string = {0x448, 0x65E5, 0x10346}; - std::string utf8result = utf32to8(utf32string); - CHECK(utf8result.size() == 9); -} +// ----------------------------------------- TEST_CASE("utf8to32") { - char const* twochars = "\xe6\x97\xa5\xd1\x88"; - std::u32string utf32result = utf8to32(twochars); - CHECK(utf32result.size() == 2); -} - -TEST_CASE("utf16to8") -{ - std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview(utf16string); - std::string u = utf16to8(utf16stringview); - CHECK(u.size() == 10); -} - -TEST_CASE("utf8to16") -{ - std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; - std::u16string utf16result = utf8to16(utf8_with_surrogates); - CHECK(utf16result.size() == 4); - CHECK(utf16result[2] == 0xd834); - CHECK(utf16result[3] == 0xdd1e); + { + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::vector utf32result; + unicode::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + CHECK(utf32result.size() == 2); + } + { + constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } + { + constexpr std::string_view twochars = "\xe6\x97\xa5\xd1\x88"; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } + { + std::u8string const twochars{reinterpret_cast("\xe6\x97\xa5\xd1\x88")}; + std::u32string const utf32result = unicode::utf8to32(twochars); + CHECK(utf32result.size() == 2); + } } TEST_CASE("utf32to8") { - std::u32string utf32string = {0x448, 0x65E5, 0x10346}; - u32string_view utf32stringview(utf32string); - std::string utf8result = utf32to8(utf32stringview); - CHECK(utf8result.size() == 9); -} - -TEST_CASE("utf8to32") -{ - std::string_view twochars = "\xe6\x97\xa5\xd1\x88"; - std::u32string utf32result = utf8to32(twochars); - CHECK(utf32result.size() == 2); + { + constexpr char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + std::string utf8result; + unicode::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + std::string const utf8result = unicode::utf32to8(utf32string); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + std::u32string_view const utf32stringview(utf32string); + std::string const utf8result = unicode::utf32to8(utf32stringview); + CHECK(utf8result.size() == 9); + } + { + std::u32string const utf32string = {0x448, 0x65E5, 0x10346}; + std::u32string_view const utf32stringview{utf32string}; + std::u8string const utf8result = unicode::utf32tou8(utf32stringview); + CHECK(utf8result.size() == 9); + } } -TEST_CASE("utf16tou8") +TEST_CASE("transcode") { - std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview{utf16string}; - std::u8string u = utf16tou8(utf16string); - CHECK(u.size() == 10); - u = utf16tou8(utf16stringview); - CHECK(u.size() == 10); -} + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == "aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == "aこれはb試験ですc"); -TEST_CASE("utf8to16") -{ - std::u8string utf8_with_surrogates{ reinterpret_cast("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") }; - std::u16string utf16result = utf8to16(utf8_with_surrogates); - CHECK(utf16result.size() == 4); - CHECK(utf16result[2] == 0xd834); - CHECK(utf16result[3] == 0xdd1e); -} + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == u8"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == u8"aこれはb試験ですc"); -TEST_CASE("utf32tou8") -{ - std::u32string utf32string = {0x448, 0x65E5, 0x10346}; - u32string_view utf32stringview{utf32string}; - std::u8string utf8result = utf32tou8(utf32stringview); - CHECK(utf8result.size() == 9); -} + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == u"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == u"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == u"aこれはb試験ですc"); + //STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == u"aこれはb試験ですc"); -TEST_CASE("utf8to32") -{ - std::u8string twochars = reinterpret_cast("\xe6\x97\xa5\xd1\x88"); - std::u32string utf32result = utf8to32(twochars); - CHECK(utf32result.size() == 2); + STATIC_CHECK(unicode::transcode("aこれはb試験ですc") == U"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(u8"aこれはb試験ですc") == U"aこれはb試験ですc"); + //STATIC_CHECK(unicode::transcode(u"aこれはb試験ですc") == U"aこれはb試験ですc"); + STATIC_CHECK(unicode::transcode(U"aこれはb試験ですc") == U"aこれはb試験ですc"); } -#endif - } // iris_unicode_test From a183495a55369e0c31df1e2b0aa39d35edabaf56 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sat, 7 Mar 2026 23:13:57 +0900 Subject: [PATCH 12/17] Refactor class `iterator` --- include/iris/unicode/string.hpp | 147 ++++++++++++++++---------------- 1 file changed, 74 insertions(+), 73 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index eb61690..e63f345 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -875,7 +875,80 @@ distance(It first, Se last) return dist; } -// -------------------------------- +// ------------------------------------ + +template +class iterator +{ + It it; + It range_start; + It range_end; + +public: + using value_type = char32_t; + using pointer = char32_t*; + using reference = char32_t&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::bidirectional_iterator_tag; + + constexpr iterator() + requires std::is_default_constructible_v + = default; + + constexpr explicit iterator(It octet_it, It rangestart, It rangeend) + : it(std::move(octet_it)) + , range_start(std::move(rangestart)) + , range_end(std::move(rangeend)) + { + if constexpr (std::random_access_iterator) { + if (it < range_start || it > range_end) { + throw std::out_of_range("Invalid utf-8 iterator position"); + } + } + } + + [[nodiscard]] constexpr It base() const { return it; } + + [[nodiscard]] constexpr char32_t operator*() const + { + It temp = it; + return unicode::next(temp, range_end); + } + + [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept + { + assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed"); + return it == rhs.it; + } + + constexpr iterator& operator++() + { + (void)unicode::next(it, range_end); + return *this; + } + + [[nodiscard]] constexpr iterator operator++(int) + { + iterator temp = *this; + (void)unicode::next(it, range_end); + return temp; + } + + constexpr iterator& operator--() + { + (void)unicode::prev(it, range_start); + return *this; + } + + [[nodiscard]] constexpr iterator operator--(int) + { + iterator temp = *this; + (void)unicode::prev(it, range_start); + return temp; + } +}; + +// ------------------------------------ template Se, utf16_output_iterator OutIt> constexpr OutIt utf8to16(It start, Se end, OutIt out) @@ -1058,78 +1131,6 @@ template } } - -template -class iterator -{ - It it; - It range_start; - It range_end; - -public: - using value_type = char32_t; - using pointer = char32_t*; - using reference = char32_t&; - using difference_type = std::ptrdiff_t; - using iterator_category = std::bidirectional_iterator_tag; - - constexpr iterator() - requires std::is_default_constructible_v - = default; - - constexpr explicit iterator(It octet_it, It rangestart, It rangeend) - : it(std::move(octet_it)) - , range_start(std::move(rangestart)) - , range_end(std::move(rangeend)) - { - if constexpr (std::random_access_iterator) { - if (it < range_start || it > range_end) { - throw std::out_of_range("Invalid utf-8 iterator position"); - } - } - } - - [[nodiscard]] constexpr It base() const { return it; } - - [[nodiscard]] constexpr char32_t operator*() const - { - It temp = it; - return unicode::next(temp, range_end); - } - - [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept - { - assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed"); - return it == rhs.it; - } - - constexpr iterator& operator++() - { - (void)unicode::next(it, range_end); - return *this; - } - - constexpr iterator operator++(int) - { - iterator temp = *this; - (void)unicode::next(it, range_end); - return temp; - } - - constexpr iterator& operator--() - { - (void)unicode::prev(it, range_start); - return *this; - } - - constexpr iterator operator--(int) - { - iterator temp = *this; - (void)unicode::prev(it, range_start); - return temp; - } -}; - } // iris::unicode #endif From 2071cd7c2b6d9d4980a855cb09d78d5872f13947 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sun, 8 Mar 2026 00:28:26 +0900 Subject: [PATCH 13/17] Port test "utf8_invalid" --- test/CMakeLists.txt | 23 ++++++- test/unicode/string/string.cpp | 1 - test/unicode/string/utf8_invalid.cpp | 90 ++++++++++++++++------------ 3 files changed, 74 insertions(+), 40 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 77d4c02..952d4eb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -116,11 +116,32 @@ function(_iris_define_test_impl test_name libs) target_link_libraries(${test_name}_test PRIVATE Iris::Iris iris_cxx_test ${libs}) add_test(NAME ${test_name}_test COMMAND ${test_name}_test --colour-mode=ansi) + set_tests_properties( + ${test_name}_test PROPERTIES + ENVIRONMENT "IRIS_ROOT=${IRIS_ROOT}" + ) if(MSVC) + set( + VS_DEBUGGER_ENVIRONMENT_LIST + "PATH=$(VC_ExecutablePath_x64)\;%PATH%" + "ASAN_SYMBOLIZER_PATH=$(VC_ExecutablePath_x64)\\llvm-symbolizer.exe" + "IRIS_ROOT=$" + ) + list(JOIN VS_DEBUGGER_ENVIRONMENT_LIST "\n" VS_DEBUGGER_ENVIRONMENT) + + set_target_properties( + ${test_name}_test PROPERTIES + VS_DEBUGGER_ENVIRONMENT "${VS_DEBUGGER_ENVIRONMENT}" + ) + get_property(IRIS_MSVC_ASAN_DIR GLOBAL PROPERTY IRIS_MSVC_ASAN_DIR) + set( + ENV_MODIFICATION + "PATH=path_list_append:${IRIS_MSVC_ASAN_DIR}" + ) set_tests_properties( ${test_name}_test PROPERTIES - ENVIRONMENT "PATH=${IRIS_MSVC_ASAN_DIR};$ENV{PATH}" + ENVIRONMENT_MODIFICATION "${ENV_MODIFICATION}" ) endif() endfunction() diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index eab9eb1..9517be0 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp index 7dd6588..b7f46b7 100644 --- a/test/unicode/string/utf8_invalid.cpp +++ b/test/unicode/string/utf8_invalid.cpp @@ -1,63 +1,77 @@ +// TODO: we need secure "getenv" in iris library +#define _CRT_SECURE_NO_WARNINGS 1 + +#include "iris_test.hpp" + #include +#include +#include #include -#include #include +#include #include +#include -using namespace std; -using namespace iris::unicode; +namespace iris_unicode_test { -const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; -const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned); +constexpr auto INVALID_LINES = std::to_array({ + 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, + 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, + 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, + 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, + 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, + 258, 259, 260, 261, 262, 263, 264, +}); -#if 0 -int main(int argc, char** argv) +TEST_CASE("utf8_invalid") { - string test_file_path; - if (argc == 2) - test_file_path = argv[1]; - else { - cout << "Wrong number of arguments" << endl; - return 1; - } - // Open the test file - ifstream fs8(test_file_path.c_str()); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 1; + namespace unicode = iris::unicode; + using iris::throwf; + + std::filesystem::path const IRIS_ROOT = [] { + char const* IRIS_ROOT_str = std::getenv("IRIS_ROOT"); + if (!IRIS_ROOT_str) throwf("IRIS_ROOT is not defined"); + return std::filesystem::path(IRIS_ROOT_str); + }(); + + auto const test_file_path = IRIS_ROOT / "test" / "unicode" / "string" / "test_data" / "utf8_invalid.txt"; + std::ifstream fs8(test_file_path); + if (!fs8) { + throwf("could not open \"{}\"", test_file_path.string()); } // Read it line by line - unsigned int line_count = 0; - char byte; + unsigned line_count = 0; while (!fs8.eof()) { - string line; - while ((byte = static_cast(fs8.get())) != '\n' && !fs8.eof()) + std::string line; + + char byte; + while ((byte = static_cast(fs8.get())) != '\n' && !fs8.eof()) { line.push_back(byte); + } + + ++line_count; + bool const expected_valid = std::ranges::find(INVALID_LINES, line_count) == INVALID_LINES.end(); - line_count++; - bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END); // Print out lines that contain unexpected invalid UTF-8 - if (!is_valid(line.begin(), line.end())) { + if (!unicode::is_valid(line.begin(), line.end())) { if (expected_valid) { - cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; - return 1; + throwf("unexpected invalid utf-8 at line {}", line_count); } // try fixing it: - string fixed_line; - replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); - if (!is_valid(fixed_line.begin(), fixed_line.end())) { - cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n'; - return 1; + std::string fixed_line; + unicode::replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); + if (!unicode::is_valid(fixed_line.begin(), fixed_line.end())) { + throwf("replace_invalid() resulted in an invalid utf-8 at line {}", line_count); } - } - else if (!expected_valid) { - cout << "Invalid utf-8 NOT detected at line " << line_count << '\n'; - return 1; + + } else if (!expected_valid) { + throwf("invalid utf-8 NOT detected at line {}", line_count); } } + CHECK(true); } -#endif +} // iris_unicode_test From 65d37a2f34d2ed5aa15bb9d8c74c05d76fb0bae8 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sun, 8 Mar 2026 00:33:39 +0900 Subject: [PATCH 14/17] Enable unicode string tests in CI --- test/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 952d4eb..e01f75a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -190,9 +190,7 @@ if(PROJECT_IS_TOP_LEVEL) foreach(test_name IN LISTS IRIS_TEST_IRIS_TESTS) iris_define_test_headers(iris_${test_name} iris_test.hpp) endforeach() - endif() - if(NOT DEFINED IRIS_CI_COMPONENT OR IRIS_CI_COMPONENT STREQUAL unicode) add_subdirectory(unicode) endif() endif() From 92500773e9940dc059704974e86d69b9d51373e0 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sun, 8 Mar 2026 00:41:11 +0900 Subject: [PATCH 15/17] Degrade `std::from_range` constructor --- include/iris/unicode/string.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index e63f345..6dc0a3a 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -1071,7 +1071,7 @@ template [[nodiscard]] constexpr std::basic_string transcode(std::string_view str) { if constexpr (std::same_as) { - return std::u8string{std::from_range, str}; + return std::u8string{str.begin(), str.end()}; } else if constexpr (std::same_as) { return unicode::utf8to16(str); } else if constexpr (std::same_as) { @@ -1093,7 +1093,7 @@ template return unicode::utf8to32(str); } else { static_assert(std::same_as); - return std::string{std::from_range, str}; + return std::string{str.begin(), str.end()}; } } From fb357682a282d2cb1e1c5dbb1db67edd9a8c8492 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Sun, 8 Mar 2026 17:25:55 +0900 Subject: [PATCH 16/17] Refactor iterator --- include/iris/unicode/string.hpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index 6dc0a3a..d6640da 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -878,7 +878,7 @@ distance(It first, Se last) // ------------------------------------ template -class iterator +class code_point_iterator { It it; It range_start; @@ -891,17 +891,17 @@ class iterator using difference_type = std::ptrdiff_t; using iterator_category = std::bidirectional_iterator_tag; - constexpr iterator() + constexpr code_point_iterator() requires std::is_default_constructible_v = default; - constexpr explicit iterator(It octet_it, It rangestart, It rangeend) - : it(std::move(octet_it)) - , range_start(std::move(rangestart)) - , range_end(std::move(rangeend)) + constexpr code_point_iterator(It it, It range_start, It range_end) + : it(std::move(it)) + , range_start(std::move(range_start)) + , range_end(std::move(range_end)) { if constexpr (std::random_access_iterator) { - if (it < range_start || it > range_end) { + if (this->it < this->range_start || this->it > this->range_end) { throw std::out_of_range("Invalid utf-8 iterator position"); } } @@ -915,34 +915,34 @@ class iterator return unicode::next(temp, range_end); } - [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept + [[nodiscard]] constexpr bool operator==(code_point_iterator const& rhs) const noexcept { assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed"); return it == rhs.it; } - constexpr iterator& operator++() + constexpr code_point_iterator& operator++() { (void)unicode::next(it, range_end); return *this; } - [[nodiscard]] constexpr iterator operator++(int) + [[nodiscard]] constexpr code_point_iterator operator++(int) { - iterator temp = *this; + code_point_iterator temp = *this; (void)unicode::next(it, range_end); return temp; } - constexpr iterator& operator--() + constexpr code_point_iterator& operator--() { (void)unicode::prev(it, range_start); return *this; } - [[nodiscard]] constexpr iterator operator--(int) + [[nodiscard]] constexpr code_point_iterator operator--(int) { - iterator temp = *this; + code_point_iterator temp = *this; (void)unicode::prev(it, range_start); return temp; } @@ -1066,6 +1066,7 @@ constexpr OutIt utf32to8(It start, Se end, OutIt out) return result; } +// TODO: add single char variations template [[nodiscard]] constexpr std::basic_string transcode(std::string_view str) From 00eee1765a0fee478c9818b7ddf777240f95ce50 Mon Sep 17 00:00:00 2001 From: Nana Sakisaka <1901813+saki7@users.noreply.github.com> Date: Mon, 9 Mar 2026 00:05:37 +0900 Subject: [PATCH 17/17] Add `bounded_prev`/`bounded_next` --- CMakeLists.txt | 6 +- include/iris/unicode/string.hpp | 102 ++++++++++++++++++++++++++++++-- test/CMakeLists.txt | 5 ++ test/unicode/string/string.cpp | 10 ++-- 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 94d6d05..fcd3dea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,9 @@ endif() project(iris VERSION 0.0.1 LANGUAGES CXX) -if(NOT DEFINED IRIS_ROOT) - set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}") -endif() +set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}") +set_property(GLOBAL PROPERTY IRIS_ROOT "${IRIS_ROOT}") + # ----------------------------------------------------------------- # Global settings diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp index d6640da..accbf24 100644 --- a/include/iris/unicode/string.hpp +++ b/include/iris/unicode/string.hpp @@ -252,7 +252,7 @@ constexpr char32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROG // Maximum valid value for a Unicode code point constexpr char32_t CODE_POINT_MAX = 0x0010ffffu; -enum class utf_error +enum class [[nodiscard]] utf_error { OK, NOT_ENOUGH_SPACE, @@ -353,7 +353,7 @@ constexpr utf_error increase_safely(It& it, Se end) return ret; \ } while (false) -/// get_sequence_x functions decode utf-8 sequences of the length x +// get_sequence_x functions decode utf-8 sequences of the length x template Se> constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point) noexcept(std::conjunction_v< @@ -485,10 +485,60 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point) template Se> requires std::forward_iterator constexpr utf_error validate_next(It& it, Se end) - noexcept(noexcept(detail::validate_next(it, end, std::declval()))) + noexcept(std::conjunction_v< + is_nothrow_dereferenceable, + is_nothrow_prefix_incrementable, + is_nothrow_sentinel, + std::is_nothrow_copy_constructible + >) { - char32_t ignored; - return detail::validate_next(it, end, ignored); + if (it == end) return utf_error::NOT_ENOUGH_SPACE; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + It const original_it = it; + + char32_t cp = 0; + // Determine the sequence length based on the lead octet + int const length = detail::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err{}; + switch (length) { + case 0: + return utf_error::INVALID_LEAD; + case 1: + err = detail::get_sequence_1(it, end, cp); + break; + case 2: + err = detail::get_sequence_2(it, end, cp); + break; + case 3: + err = detail::get_sequence_3(it, end, cp); + break; + case 4: + err = detail::get_sequence_4(it, end, cp); + break; + default: + std::unreachable(); + } + if (err != utf_error::OK) { + it = original_it; + return err; + } + + if (detail::is_code_point_valid(cp)) { + if (!detail::is_overlong_sequence(cp, length)) { + ++it; + return utf_error::OK; + } + + it = original_it; + return utf_error::OVERLONG_SEQUENCE; + } + + it = original_it; + return utf_error::INVALID_CODE_POINT; } template Se> @@ -816,6 +866,35 @@ template Se> return cp; } +template +[[nodiscard]] constexpr std::pair::difference_type> +bounded_next(It it, It const last, typename std::iterator_traits::difference_type off = 1) +{ + typename std::iterator_traits::difference_type count = 0; + for (; it != last && count < off; ++count) { + char32_t cp = 0; + switch (detail::validate_next(it, last, cp)) { + case detail::utf_error::OK: + break; + + case detail::utf_error::NOT_ENOUGH_SPACE: + throw not_enough_space(); + + case detail::utf_error::INVALID_LEAD: + case detail::utf_error::INCOMPLETE_SEQUENCE: + case detail::utf_error::OVERLONG_SEQUENCE: + throw invalid_utf8(static_cast(*it)); + + case detail::utf_error::INVALID_CODE_POINT: + throw invalid_code_point(cp); + + default: + std::unreachable(); + } + } + return {it, count}; +} + template Se> [[nodiscard]] constexpr char32_t next16(It& it, Se end) { @@ -847,6 +926,19 @@ template Se> return unicode::peek_next(it, end); } +template +[[nodiscard]] constexpr std::pair::difference_type> +bounded_prev(It const start, It it, typename std::iterator_traits::difference_type off = 1) +{ + typename std::iterator_traits::difference_type count = 0; + for (; it != start && count < off; ++count) { + while (detail::is_trail(*--it)) { + if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence + } + } + return {it, count}; +} + template Se, class distance_type> constexpr void advance(It& it, distance_type n, Se end) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e01f75a..5347bca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -97,6 +97,11 @@ function(iris_define_test_headers test_name) endfunction() function(_iris_define_test_impl test_name libs) + get_property(IRIS_ROOT GLOBAL PROPERTY IRIS_ROOT) + if(NOT DEFINED IRIS_ROOT OR IRIS_ROOT STREQUAL "") + message(FATAL_ERROR "IRIS_ROOT is not defined") + endif() + add_executable(${test_name}_test ${ARGN}) target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_FUNCTION_LIST_DIR}) target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_LIST_DIR}) diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp index 9517be0..86c0732 100644 --- a/test/unicode/string/string.cpp +++ b/test/unicode/string/string.cpp @@ -280,26 +280,26 @@ TEST_CASE("starts_with_bom") TEST_CASE("increment") { constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - unicode::iterator it(threechars, threechars, threechars + 9); - unicode::iterator it2 = it; + unicode::code_point_iterator it(threechars, threechars, threechars + 9); + unicode::code_point_iterator it2 = it; CHECK(it2 == it); CHECK(*it == 0x10346); CHECK(*++it == 0x65e5); CHECK(*it++ == 0x65e5); CHECK(*it == 0x0448); CHECK(it != it2); - unicode::iterator endit(threechars + 9, threechars, threechars + 9); + unicode::code_point_iterator endit(threechars + 9, threechars, threechars + 9); CHECK(++it == endit); } TEST_CASE("decrement") { constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; - unicode::iterator it(threechars + 9, threechars, threechars + 9); + unicode::code_point_iterator it(threechars + 9, threechars, threechars + 9); CHECK(*--it == 0x0448); CHECK(*it-- == 0x0448); CHECK(*it == 0x65e5); - CHECK(--it == unicode::iterator(threechars, threechars, threechars + 9)); + CHECK(--it == unicode::code_point_iterator(threechars, threechars, threechars + 9)); CHECK(*it == 0x10346); }