From 510c67b3eded045fec91ae519bd4bf09016cd305 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 14:00:09 +0900
Subject: [PATCH 01/17] Add UTF string library

---
 include/iris/unicode/string.hpp               | 957 ++++++++++++++++++
 test/CMakeLists.txt                           |   4 +
 test/unicode/CMakeLists.txt                   |   3 +
 test/unicode/string/CMakeLists.txt            |   3 +
 test/unicode/string/LICENSE                   |  23 +
 test/unicode/string/apitests.cpp              | 257 +++++
 test/unicode/string/negative.cpp              |  61 ++
 test/unicode/string/test_cpp11.cpp            | 117 +++
 test/unicode/string/test_cpp17.cpp            |  86 ++
 test/unicode/string/test_cpp20.cpp            |  79 ++
 .../unicode/string/test_data/utf8_invalid.txt | Bin 0 -> 20010 bytes
 11 files changed, 1590 insertions(+)
 create mode 100644 include/iris/unicode/string.hpp
 create mode 100644 test/unicode/CMakeLists.txt
 create mode 100644 test/unicode/string/CMakeLists.txt
 create mode 100644 test/unicode/string/LICENSE
 create mode 100644 test/unicode/string/apitests.cpp
 create mode 100644 test/unicode/string/negative.cpp
 create mode 100644 test/unicode/string/test_cpp11.cpp
 create mode 100644 test/unicode/string/test_cpp17.cpp
 create mode 100644 test/unicode/string/test_cpp20.cpp
 create mode 100644 test/unicode/string/test_data/utf8_invalid.txt
diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
new file mode 100644
index 0000000..08d8d1f
--- /dev/null
+++ b/include/iris/unicode/string.hpp
@@ -0,0 +1,957 @@
+// Copyright 2006 Nemanja Trifunovic
+// Copyright 2026 The Iris Project Contributors
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef IRIS_UTFLIB_UTF8_H
+#define IRIS_UTFLIB_UTF8_H
+
+#include <concepts>
+#include <exception>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+#include <cstring>
+
+namespace iris::utflib
+{
+    template <typename T>
+    concept octet = std::integral<T> && sizeof(T) == 1;
+
+    template <typename T>
+    concept utf8char = octet<T> && (std::same_as<T, char> || std::same_as<T, char8_t>);
+    
+    template <typename T>
+    concept utf16char = std::same_as<T, char16_t>;
+    
+    template <typename T>
+    concept utf32char = std::same_as<T, char32_t>;
+
+    template <typename It>
+    concept octet_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>;
+
+    template <typename It>
+    concept utf8_iterator = octet_iterator<It> && utf8char<std::iter_value_t<It>>;
+
+    template <typename It>
+    concept utf16_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>;
+    
+    template <typename It>
+    concept utf32_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
+
+    namespace traits
+    {
+        template <typename T, typename = void>
+        struct is_nothrow_dereferenceable : std::false_type {};
+
+        template <typename T>
+        struct is_nothrow_dereferenceable<T, std::void_t<decltype(*std::declval<T>())>> : std::bool_constant<noexcept(*std::declval<T>())> {};
+
+        template <typename T>
+        inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable<T>::value;
+        
+        template <typename T, typename = void>
+        struct is_nothrow_prefix_incrementable : std::false_type {};
+
+        template <typename T>
+        struct is_nothrow_prefix_incrementable<T, std::void_t<decltype(++std::declval<T>())>> : std::bool_constant<noexcept(++std::declval<T>())> {};
+
+        template <typename T>
+        inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable<T>::value;
+        
+        template <typename T, typename = void>
+        struct is_nothrow_postfix_incrementable : std::false_type {};
+
+        template <typename T>
+        struct is_nothrow_postfix_incrementable<T, std::void_t<decltype(std::declval<T>()++)>> : std::bool_constant<noexcept(std::declval<T>()++)> {};
+
+        template <typename T>
+        inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable<T>::value;
+
+        template <typename It, typename Se>
+        struct is_nothrow_sentinel : std::false_type {};
+
+        template <typename It, typename Se>
+            requires std::sentinel_for<Se, It>
+        struct is_nothrow_sentinel<It, Se> : std::bool_constant<
+            noexcept(std::declval<It&>() == std::declval<Se&>()) &&
+            noexcept(std::declval<It&>() != std::declval<Se&>()) &&
+            noexcept(std::declval<Se&>() == std::declval<It&>()) &&
+            noexcept(std::declval<Se&>() != std::declval<It&>())
+        >
+        {};
+
+        template <typename It, typename Se>
+        inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
+    } // namespace traits
+
+    // Helper code - not intended to be directly called by the library users. May be changed at any time
+    namespace internal
+    {
+        // Unicode constants
+        // Leading (high) surrogates: 0xd800 - 0xdbff
+        // Trailing (low) surrogates: 0xdc00 - 0xdfff
+        constexpr char16_t LEAD_SURROGATE_MIN  = 0xd800u;
+        constexpr char16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+        constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+        constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+        constexpr char16_t LEAD_OFFSET         = 0xd7c0u;     // LEAD_SURROGATE_MIN - (0x10000 >> 10)
+        constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
+
+        // Maximum valid value for a Unicode code point
+        constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
+
+        enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT };
+
+        template <octet Octet>
+        [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
+        {
+            return static_cast<char8_t>(0xff & oc);
+        }
+
+        [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept
+        {
+            return static_cast<char16_t>(0xffff & oc);
+        }
+
+        template <octet Octet>
+        [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept
+        {
+            return ((internal::mask8(oc) >> 6) == 0x2);
+        }
+
+        [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept
+        {
+            return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(LEAD_SURROGATE_MAX));
+        }
+
+        [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept
+        {
+            return (cp >= static_cast<char32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+        }
+
+        [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept
+        {
+            return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+        }
+
+        [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept
+        {
+            return (cp <= CODE_POINT_MAX && !internal::is_surrogate(cp));
+        }
+
+        [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept
+        {
+            return cp < char32_t(0x10000);
+        }
+
+        // Reports whether `cp` was decoded from more octets (`length`) than its shortest UTF-8 form needs.
+        [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept
+        {
+            // Shortest forms: below 0x80 -> 1 octet, below 0x800 -> 2 octets, below 0x10000 -> 3 octets.
+            if (cp < 0x80)
+                return length != 1;
+            if (cp < 0x800)
+                return length != 2;
+            if (cp < 0x10000)
+                return length != 3;
+
+            // Values of 0x10000 and above are never reported as overlong by this check.
+            return false;
+        }
+
+        template <octet_iterator It>
+        [[nodiscard]] constexpr int sequence_length(It lead_it)
+            noexcept(traits::is_nothrow_dereferenceable_v<It&>)
+        {
+            const char8_t lead = internal::mask8(*lead_it);
+            if (lead < 0x80)
+                return 1;
+            else if ((lead >> 5) == 0x6)
+                return 2;
+            else if ((lead >> 4) == 0xe)
+                return 3;
+            else if ((lead >> 3) == 0x1e)
+                return 4;
+            else
+                return 0;
+        }
+
+        /// Helper for get_sequence_x
+        template <octet_iterator It, std::sentinel_for<It> Se>
+        constexpr utf_error increase_safely(It& it, Se end)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>
+            >)
+        {
+            if (++it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            if (!internal::is_trail(*it))
+                return utf_error::INCOMPLETE_SEQUENCE;
+
+            return utf_error::OK;
+        }
+
+#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END)                                                                                                                                                                                      \
+    do {                                                                                                                                                                                                                                       \
+        utf_error ret = increase_safely(IT, END);                                                                                                                                                                                              \
+        if (ret != utf_error::OK)                                                                                                                                                                                                                    \
+            return ret;                                                                                                                                                                                                                        \
+    } while (false)
+
+        /// get_sequence_x functions decode utf-8 sequences of the length x
+        template <octet_iterator It, std::sentinel_for<It> Se>
+        constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_sentinel<It, Se>
+            >)
+        {
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            code_point = static_cast<char32_t>(internal::mask8(*it));
+
+            return utf_error::OK;
+        }
+
+        template <octet_iterator It, std::sentinel_for<It> Se>
+        constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>
+            >)
+        {
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            code_point = static_cast<char32_t>(internal::mask8(*it));
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+            return utf_error::OK;
+        }
+
+        template <octet_iterator It, std::sentinel_for<It> Se>
+        constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>
+            >)
+        {
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            code_point = static_cast<char32_t>(internal::mask8(*it));
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = ((code_point << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
+
+            return utf_error::OK;
+        }
+
+        template <octet_iterator It, std::sentinel_for<It> Se>
+        constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>
+            >)
+        {
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            code_point = static_cast<char32_t>(internal::mask8(*it));
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = ((code_point << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = static_cast<char32_t>(code_point + ((internal::mask8(*it) << 6) & 0xfff));
+
+            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+
+            code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
+
+            return utf_error::OK;
+        }
+
+#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR
+
+        template <octet_iterator It, std::sentinel_for<It> Se>
+            requires std::forward_iterator<It>
+        constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>,
+                std::is_nothrow_copy_constructible<It>
+            >)
+        {
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+
+            // Save the original value of it so we can go back in case of failure
+            // Of course, it does not make much sense with i.e. stream iterators
+            It original_it = it;
+
+            char32_t cp = 0;
+            // Determine the sequence length based on the lead octet
+            const int length = internal::sequence_length(it);
+
+            // Get trail octets and calculate the code point
+            utf_error err = utf_error::OK;
+            switch (length) {
+                case 0:
+                    return utf_error::INVALID_LEAD;
+                case 1:
+                    err = internal::get_sequence_1(it, end, cp);
+                    break;
+                case 2:
+                    err = internal::get_sequence_2(it, end, cp);
+                    break;
+                case 3:
+                    err = internal::get_sequence_3(it, end, cp);
+                    break;
+                case 4:
+                    err = internal::get_sequence_4(it, end, cp);
+                    break;
+            }
+
+            if (err == utf_error::OK) {
+                // Decoding succeeded. Now, security checks...
+                if (internal::is_code_point_valid(cp)) {
+                    if (!internal::is_overlong_sequence(cp, length)) {
+                        // Passed! Return here.
+                        code_point = cp;
+                        ++it;
+                        return utf_error::OK;
+                    } else
+                        err = utf_error::OVERLONG_SEQUENCE;
+                } else
+                    err = utf_error::INVALID_CODE_POINT;
+            }
+
+            // Failure branch - restore the original value of the iterator
+            it = original_it;
+            return err;
+        }
+
+        template <octet_iterator It, std::sentinel_for<It> Se>
+            requires std::forward_iterator<It>
+        constexpr utf_error validate_next(It& it, Se end)
+            noexcept(noexcept(internal::validate_next(it, end, std::declval<char32_t&>())))
+        {
+            char32_t ignored;
+            return internal::validate_next(it, end, ignored);
+        }
+
+        template <utf16_iterator It, std::sentinel_for<It> Se>
+            requires std::forward_iterator<It>
+        constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)
+            noexcept(std::conjunction_v<
+                traits::is_nothrow_dereferenceable<It&>,
+                traits::is_nothrow_prefix_incrementable<It&>,
+                traits::is_nothrow_postfix_incrementable<It&>,
+                traits::is_nothrow_sentinel<It, Se>,
+                std::is_nothrow_copy_constructible<It>
+            >)
+        {
+            // Check the edge case:
+            if (it == end)
+                return utf_error::NOT_ENOUGH_ROOM;
+            // Save the original value of it so we can go back in case of failure
+            // Of course, it does not make much sense with i.e. stream iterators
+            It original_it = it;
+
+            utf_error err = utf_error::OK;
+
+            const char16_t first_word = *it++;
+            if (!internal::is_surrogate(first_word)) {
+                code_point = first_word;
+                return utf_error::OK;
+            } else {
+                if (it == end)
+                    err = utf_error::NOT_ENOUGH_ROOM;
+                else if (internal::is_lead_surrogate(first_word)) {
+                    const char16_t second_word = *it++;
+                    if (internal::is_trail_surrogate(static_cast<char32_t>(second_word))) {
+                        code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
+                        return utf_error::OK;
+                    } else
+                        err = utf_error::INCOMPLETE_SEQUENCE;
+
+                } else {
+                    err = utf_error::INVALID_LEAD;
+                }
+            }
+            // error branch
+            it = original_it;
+            return err;
+        }
+
+        template <typename It, octet octet_type = std::iter_value_t<It>>
+            requires std::output_iterator<It, octet_type>
+        constexpr It append(char32_t cp, It result)
+            noexcept(noexcept(*result++ = std::declval<octet_type>()))
+        {
+            if (cp < 0x80) // one octet
+                *(result++) = static_cast<octet_type>(cp);
+            else if (cp < 0x800) { // two octets
+                *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
+                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+            } else if (cp < 0x10000) { // three octets
+                *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
+                *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+            } else { // four octets
+                *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
+                *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f) | 0x80);
+                *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+            }
+            return result;
+        }
+
+        template <typename container_type>
+        constexpr std::back_insert_iterator<container_type> append(char32_t cp, std::back_insert_iterator<container_type> result)
+            noexcept(noexcept(internal::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result)))
+        {
+            return internal::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result);
+        }
+
+        template <std::output_iterator<char16_t> It>
+        constexpr It append16(char32_t cp, It result)
+            noexcept(noexcept(*result++ = std::declval<char16_t>()))
+        {
+            if (internal::is_in_bmp(cp))
+                *(result++) = static_cast<char16_t>(cp);
+            else {
+                // Code points from the supplementary planes are encoded via surrogate pairs
+                *(result++) = static_cast<char16_t>(LEAD_OFFSET + (cp >> 10));
+                *(result++) = static_cast<char16_t>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+            }
+            return result;
+        }
+    } // namespace internal
+
+    // Base for the exceptions that may be thrown from the library
+    class exception : public ::std::exception
+    {
+    };
+
+    // Exceptions that may be thrown from the library functions.
+    class invalid_code_point : public exception
+    {
+        char32_t cp;
+
+    public:
+        explicit invalid_code_point(char32_t codepoint)
+            : cp(codepoint)
+        {
+        }
+        virtual const char* what() const noexcept override { return "Invalid code point"; }
+        [[nodiscard]] char32_t code_point() const noexcept { return cp; }
+    };
+
+    class invalid_utf8 : public exception
+    {
+        char8_t u8;
+
+    public:
+        explicit invalid_utf8(char c)
+            : u8(static_cast<char8_t>(c))
+        {
+        }
+        explicit invalid_utf8(char8_t u)
+            : u8(u)
+        {
+        }
+        virtual const char* what() const noexcept override { return "Invalid UTF-8"; }
+        [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; }
+    };
+
+    class invalid_utf16 : public exception
+    {
+        char16_t u16;
+
+    public:
+        explicit invalid_utf16(char16_t u)
+            : u16(u)
+        {
+        }
+        virtual const char* what() const noexcept override { return "Invalid UTF-16"; }
+        [[nodiscard]] char16_t utf16_word() const noexcept { return u16; }
+    };
+
+    class not_enough_room : public exception
+    {
+    public:
+        virtual const char* what() const noexcept override { return "Not enough space"; }
+    };
+
+    /// The library API - functions intended to be called by the users
+
+    // UTF-8 byte order mark; `inline` makes this one shared array, so taking its address (e.g. std::begin(bom)) from inline code is safe
+    inline constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};
+
+    // Scans [it, se) and returns an iterator to the first position where a valid UTF-8 sequence does not start.
+    // When every sequence validates, the returned iterator compares equal to `se`.
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr It find_invalid(It it, Se se)
+        noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v<It>)
+    {
+        // validate_next() steps over a good sequence and leaves `it` where it was on a bad one.
+        while (it != se && internal::validate_next(it, se) == internal::utf_error::OK)
+            continue;
+        return it;
+    }
+
+    [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s)
+        noexcept(noexcept(utflib::find_invalid(s.begin(), s.end())))
+    {
+        std::string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end());
+        return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+    }
+
+    [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s)
+        noexcept(noexcept(utflib::find_invalid(s.begin(), s.end())))
+    {
+        std::u8string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end());
+        return (invalid == s.end()) ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr bool is_valid(It it, Se se)
+        noexcept(noexcept(utflib::find_invalid(it, se)) && traits::is_nothrow_sentinel_v<It, Se>)
+    {
+        return (utflib::find_invalid(it, se) == se);
+    }
+
+    [[nodiscard]] constexpr bool is_valid(std::string_view s)
+        noexcept(noexcept(utflib::is_valid(s.begin(), s.end())))
+    {
+        return utflib::is_valid(s.begin(), s.end());
+    }
+
+    [[nodiscard]] constexpr bool is_valid(std::u8string_view s)
+        noexcept(noexcept(utflib::is_valid(s.begin(), s.end())))
+    {
+        return utflib::is_valid(s.begin(), s.end());
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr bool starts_with_bom(It it, Se end)
+        noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v<It, Se>)
+    {
+        return (((it != end) && (internal::mask8(*it++)) == bom[0]) && ((it != end) && (internal::mask8(*it++)) == bom[1]) && ((it != end) && (internal::mask8(*it)) == bom[2]));
+    }
+
+    [[nodiscard]] constexpr bool starts_with_bom(std::string_view s)
+        noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end())))
+    {
+        return utflib::starts_with_bom(s.begin(), s.end());
+    }
+
+    [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s)
+        noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end())))
+    {
+        return utflib::starts_with_bom(s.begin(), s.end());
+    }
+
+    template <typename It>  // TODO: add constraints
+    constexpr It append(char32_t cp, It result)
+    {
+        if (!internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        return internal::append(cp, result);
+    }
+
+    constexpr void append(char32_t cp, std::string& s)
+    {
+        utflib::append(cp, std::back_inserter(s));
+    }
+
+    constexpr void append(char32_t cp, std::u8string& s)
+    {
+        utflib::append(cp, std::back_inserter(s));
+    }
+
+    template <typename It>  // TODO: add constraints
+    constexpr It append16(char32_t cp, It result)
+    {
+        if (!internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        return internal::append16(cp, result);
+    }
+
+    constexpr void append16(char32_t cp, std::u16string& s)
+    {
+        utflib::append16(cp, std::back_inserter(s));
+    }
+
+    // Copies [start, end) to `out`, writing `replacement` (UTF-8 encoded) in place of each malformed sequence.
+    // Throws invalid_code_point (via append) if `replacement` has to be emitted and is not a valid code point.
+    template <octet_iterator It, std::sentinel_for<It> Se, typename Out>  // TODO: add constraints
+    constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)
+    {
+        while (start != end) {
+            const It sequence_start = start;
+            switch (internal::validate_next(start, end)) {
+                case internal::utf_error::OK:
+                    for (It it = sequence_start; it != start; ++it)
+                        *out++ = *it;
+                    break;
+                case internal::utf_error::NOT_ENOUGH_ROOM:
+                    // Truncated tail: one marker, then stop. Returning avoids `start = end`, which is ill-formed when Se is not It.
+                    return utflib::append(replacement, out);
+                case internal::utf_error::INVALID_LEAD:
+                    out = utflib::append(replacement, out);
+                    ++start;
+                    break;
+                case internal::utf_error::INCOMPLETE_SEQUENCE:
+                case internal::utf_error::OVERLONG_SEQUENCE:
+                case internal::utf_error::INVALID_CODE_POINT:
+                    out = utflib::append(replacement, out);
+                    ++start;
+                    // just one replacement mark for the sequence
+                    while (start != end && internal::is_trail(*start))
+                        ++start;
+                    break;
+            }
+        }
+        return out;
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se, typename Out>  // TODO: add constraints
+    constexpr Out replace_invalid(It start, Se end, Out out)
+    {
+        constexpr char32_t replacement_marker = static_cast<char32_t>(internal::mask16(0xfffd));
+        return utflib::replace_invalid(start, end, out, replacement_marker);
+    }
+
+    [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement)
+    {
+        std::u8string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::string replace_invalid(std::string_view s)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s)
+    {
+        std::u8string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr char32_t next(It& it, Se end)
+    {
+        char32_t cp               = 0;
+        internal::utf_error err_code = internal::validate_next(it, end, cp);
+        switch (err_code) {
+            case internal::utf_error::OK:
+                break;
+            case internal::utf_error::NOT_ENOUGH_ROOM:
+                throw not_enough_room();
+            case internal::utf_error::INVALID_LEAD:
+            case internal::utf_error::INCOMPLETE_SEQUENCE:
+            case internal::utf_error::OVERLONG_SEQUENCE:
+                throw invalid_utf8(static_cast<char8_t>(*it));
+            case internal::utf_error::INVALID_CODE_POINT:
+                throw invalid_code_point(cp);
+        }
+        return cp;
+    }
+
+    template <utf16_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr char32_t next16(It& it, Se end) // decodes the UTF-16 code point at `it` and advances past it
+    {
+        char32_t cp = 0;
+        const internal::utf_error err_code = internal::validate_next16(it, end, cp);
+        if (err_code == internal::utf_error::NOT_ENOUGH_ROOM)
+            throw not_enough_room();
+        return (err_code == internal::utf_error::OK) ? cp : throw invalid_utf16(*it); // on failure `it` was rewound to the offending word; returning 0 would hide the error and stall caller loops
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr char32_t peek_next(It it, Se end)
+    {
+        return utflib::next(it, end);
+    }
+
+    // Moves `it` back to the lead octet of the sequence that ends just before it and returns the code point decoded there.
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr char32_t prior(It& it, Se start)
+    {
+        if (it == start) // can't do much if it == start
+            throw not_enough_room();
+
+        It end = it;
+        // Go back until we hit either a lead octet or start
+        while (internal::is_trail(*(--it)))
+            if (it == start)
+                throw invalid_utf8(static_cast<char8_t>(*it)); // no lead byte; cast as next() does, since a bare octet is ambiguous between the char/char8_t ctors
+        return utflib::peek_next(it, end);
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se, typename distance_type>
+    constexpr void advance(It& it, distance_type n, Se end)
+    {
+        const distance_type zero(0);
+        if (!(n < zero)) {
+            // forward: `end` is passed to next() as the upper bound
+            for (distance_type step = zero; step < n; ++step)
+                (void)utflib::next(it, end);
+        } else {
+            // backward: the same `end` argument is passed to prior() as its bound
+            for (distance_type step = n; step < zero; ++step)
+                (void)utflib::prior(it, end);
+        }
+    }
+
+    template <octet_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr typename std::iterator_traits<It>::difference_type distance(It first, Se last)
+    {
+        typename std::iterator_traits<It>::difference_type count = 0;
+        for (; first != last; ++count)
+            (void)utflib::next(first, last); // one next() call per counted code point
+        return count;
+    }
+
+    template <utf16_iterator It, std::sentinel_for<It> Se, typename OutIt> // TODO: add constraints
+    constexpr OutIt utf16to8(It start, Se end, OutIt result)
+    {
+        while (start != end) {
+            char32_t code_point = static_cast<char32_t>(internal::mask16(*start++));
+            if (internal::is_lead_surrogate(code_point)) {
+                // A lead surrogate must be followed by a trail surrogate; anything else throws.
+                if (start == end)
+                    throw invalid_utf16(static_cast<char16_t>(code_point));
+
+                const char32_t trail = static_cast<char32_t>(internal::mask16(*start++));
+                if (!internal::is_trail_surrogate(trail))
+                    throw invalid_utf16(static_cast<char16_t>(trail));
+
+                // Fold the pair into a single code point.
+                code_point = (code_point << 10) + trail + internal::SURROGATE_OFFSET;
+            } else if (internal::is_trail_surrogate(code_point)) {
+                // Trail surrogate with no lead in front of it.
+                throw invalid_utf16(static_cast<char16_t>(code_point));
+            }
+
+            result = utflib::append(code_point, result);
+        }
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)
+    {
+        std::string utf8_text; // filled through a back_inserter by the iterator overload
+        utflib::utf16to8(std::begin(s), std::end(s), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s)
+    {
+        std::u8string utf8_text; // char8_t-based result; utflib::utf16to8 does the conversion
+        utflib::utf16to8(std::begin(s), std::end(s), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    template <utf8_iterator It, std::sentinel_for<It> Se, typename OutIt>  // TODO: add constraints
+    constexpr OutIt utf8to16(It start, Se end, OutIt result)
+    {
+        while (start != end) {
+            const char32_t code_point = utflib::next(start, end);
+            if (code_point > 0xffff) { // does not fit in 16 bits: write two units, lead then trail
+                *result++ = static_cast<char16_t>((code_point >> 10) + internal::LEAD_OFFSET);
+                *result++ = static_cast<char16_t>((code_point & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+            } else
+                *result++ = static_cast<char16_t>(code_point); // fits in one 16-bit unit
+        }
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)
+    {
+        std::u16string utf16_text; // filled through a back_inserter by the iterator overload
+        utflib::utf8to16(std::begin(s), std::end(s), std::back_inserter(utf16_text));
+        return utf16_text;
+    }
+
+    [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s)
+    {
+        std::u16string utf16_text; // same as the std::string_view overload, for char8_t input
+        utflib::utf8to16(std::begin(s), std::end(s), std::back_inserter(utf16_text));
+        return utf16_text;
+    }
+
+    template <utf32_iterator It, std::sentinel_for<It> Se, typename OutIt>  // TODO: add constraints
+    constexpr OutIt utf32to8(It start, Se end, OutIt result)
+    {
+        // Pass each input element to append(); its return value becomes the new output position.
+        for (; start != end; ++start)
+            result = utflib::append(*start, result);
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)
+    {
+        std::string utf8_text; // filled through a back_inserter by the iterator overload
+        utflib::utf32to8(std::begin(s), std::end(s), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s)
+    {
+        std::u8string utf8_text; // char8_t-based result; utflib::utf32to8 does the conversion
+        utflib::utf32to8(std::begin(s), std::end(s), std::back_inserter(utf8_text));
+        return utf8_text;
+    }
+
+    template <utf8_iterator It, std::sentinel_for<It> Se, typename OutIt>
+    constexpr OutIt utf8to32(It start, Se end, OutIt result)
+    {
+        // Each value returned by next() is written through `result`, which then moves on.
+        for (; start != end; ++result)
+            *result = utflib::next(start, end);
+        return result;
+    }
+
+    [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
+    {
+        std::u32string utf32_text; // filled through a back_inserter by the iterator overload
+        utflib::utf8to32(std::begin(s), std::end(s), std::back_inserter(utf32_text));
+        return utf32_text;
+    }
+
+    [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s)
+    {
+        std::u32string utf32_text; // same as the std::string_view overload, for char8_t input
+        utflib::utf8to32(std::begin(s), std::end(s), std::back_inserter(utf32_text));
+        return utf32_text;
+    }
+
+    // Bidirectional adaptor over an octet range; dereferencing decodes a code point via utflib::next.
+    template <octet_iterator It>
+    class iterator
+    {
+        It it{};          // current position; value-initialized so a default-constructed object is never indeterminate
+        It range_start{}; // bound handed to prior() when stepping backwards
+        It range_end{};   // bound handed to next() when decoding or stepping forwards
+
+    public:
+        using value_type = char32_t;
+        using pointer = char32_t*;
+        using reference = char32_t&;
+        using difference_type = std::ptrdiff_t;
+        using iterator_category = std::bidirectional_iterator_tag;
+        constexpr iterator()
+            requires std::is_default_constructible_v<It>
+        = default;
+        constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
+            : it(std::move(octet_it))
+            , range_start(std::move(rangestart))
+            , range_end(std::move(rangeend))
+        {
+            if constexpr (std::random_access_iterator<It>) { // the ordering check needs operator<
+                if (it < range_start || it > range_end)
+                    throw std::out_of_range("Invalid utf-8 iterator position");
+            }
+        }
+        // the default "big three" are OK
+        [[nodiscard]] constexpr It base() const { return it; } // underlying octet position
+        [[nodiscard]] constexpr char32_t operator*() const
+        {
+            It temp = it; // decode on a copy so dereferencing does not move the iterator
+            return utflib::next(temp, range_end);
+        }
+        [[nodiscard]] constexpr bool operator==(const iterator& rhs) const
+        {
+            if (range_start != rhs.range_start || range_end != rhs.range_end) // both sides must share one range
+                throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+            return (it == rhs.it);
+        }
+        constexpr iterator& operator++()
+        {
+            (void)utflib::next(it, range_end); // decoded value is discarded; only the move matters
+            return *this;
+        }
+        constexpr iterator operator++(int)
+        {
+            iterator temp = *this;
+            (void)utflib::next(it, range_end);
+            return temp;
+        }
+        constexpr iterator& operator--()
+        {
+            (void)utflib::prior(it, range_start); // decoded value is discarded; only the move matters
+            return *this;
+        }
+        constexpr iterator operator--(int)
+        {
+            iterator temp = *this;
+            (void)utflib::prior(it, range_start);
+            return temp;
+        }
+    }; // class iterator
+
+} // namespace iris::utflib
+
+#endif // header guard
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 34edcd0..77d4c02 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -170,4 +170,8 @@ if(PROJECT_IS_TOP_LEVEL)
             iris_define_test_headers(iris_${test_name} iris_test.hpp)
         endforeach()
     endif()
+
+    if(NOT DEFINED IRIS_CI_COMPONENT OR IRIS_CI_COMPONENT STREQUAL unicode)
+        add_subdirectory(unicode)
+    endif()
 endif()
diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt
new file mode 100644
index 0000000..37baffc
--- /dev/null
+++ b/test/unicode/CMakeLists.txt
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+add_subdirectory(string)
diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt
new file mode 100644
index 0000000..4fd1e27
--- /dev/null
+++ b/test/unicode/string/CMakeLists.txt
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+
diff --git a/test/unicode/string/LICENSE b/test/unicode/string/LICENSE
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/test/unicode/string/LICENSE
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/test/unicode/string/apitests.cpp b/test/unicode/string/apitests.cpp
new file mode 100644
index 0000000..ba8fa90
--- /dev/null
+++ b/test/unicode/string/apitests.cpp
@@ -0,0 +1,257 @@
+#include "ftest.h"
+
+#include "utf8.h"
+
+#include <string>
+
+#include <cassert>
+
+using namespace iris::utflib;
+using namespace std;
+
+TEST(CheckedAPITests, test_append)
+{
+    unsigned char u[5] = {0,0,0,0,0};
+    append(0x0448, u);
+    EXPECT_EQ (u[0], 0xd1);
+    EXPECT_EQ (u[1], 0x88);
+    EXPECT_EQ (u[2], 0);
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    append(0x65e5, u);
+    EXPECT_EQ (u[0], 0xe6);
+    EXPECT_EQ (u[1], 0x97);
+    EXPECT_EQ (u[2], 0xa5);
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    append(0x3044, u);
+    EXPECT_EQ (u[0], 0xe3);
+    EXPECT_EQ (u[1], 0x81);
+    EXPECT_EQ (u[2], 0x84);
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    append(0x10346, u);
+    EXPECT_EQ (u[0], 0xf0);
+    EXPECT_EQ (u[1], 0x90);
+    EXPECT_EQ (u[2], 0x8d);
+    EXPECT_EQ (u[3], 0x86);
+    EXPECT_EQ (u[4], 0);
+
+    // Ensure no warnings with plain char
+    char c[2] = {0,0};
+    append('a', c);
+    EXPECT_EQ (c[0], 'a');
+    EXPECT_EQ (c[1], 0);
+}
+
+TEST(CheckedAPITests, test_append16)
+{
+    char16_t u[5] = {0,0};
+    append16(0x0448, u);
+    EXPECT_EQ (u[0], 0x0448);
+    EXPECT_EQ (u[1], 0x0000);
+
+    append16(0x65e5, u);
+    EXPECT_EQ (u[0], 0x65e5);
+    EXPECT_EQ (u[1], 0x0000);
+
+    append16(0x10346, u);
+    EXPECT_EQ (u[0], 0xd800);
+    EXPECT_EQ (u[1], 0xdf46);
+}
+
+TEST(CheckedAPITests, test_next)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    const char* w = twochars;
+    unsigned int cp = next(w, twochars + 6);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, twochars + 3);
+
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    w = threechars;
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, threechars + 4);
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, threechars + 7);
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x0448);
+    EXPECT_EQ (w, threechars + 9);
+}
+
+TEST(CheckedAPITests, test_next16)
+{
+    const char16_t u[3] = {0x65e5, 0xd800, 0xdf46};
+    const char16_t* w = u;
+    char32_t cp = next16(w, w + 3);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, u + 1);
+
+    cp = next16(w, w + 2);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, u + 3);
+}
+
+TEST(CheckedAPITests, test_peek_next)
+{
+    const char* const cw = "\xe6\x97\xa5\xd1\x88";
+    unsigned int cp = peek_next(cw, cw + 6);
+    EXPECT_EQ (cp, 0x65e5);
+}
+
+TEST(CheckedAPITests, test_prior)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    const char* w = twochars + 3;
+    unsigned int cp = prior (w, twochars);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, twochars);
+
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    w = threechars + 9;
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x0448);
+    EXPECT_EQ (w, threechars + 7);
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, threechars + 4);
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, threechars);
+}
+
+TEST(CheckedAPITests, test_advance)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    const char* w = threechars;
+    advance(w, 2, threechars + 9);
+    EXPECT_EQ(w, threechars + 7);
+    advance(w, -2, threechars);
+    EXPECT_EQ(w, threechars);
+    advance(w, 3, threechars + 9);
+    EXPECT_EQ(w, threechars + 9);
+    advance(w, -2, threechars);
+    EXPECT_EQ(w, threechars + 4);
+    advance(w, -1, threechars);
+    EXPECT_EQ(w, threechars);
+}
+
+TEST(CheckedAPITests, test_distance)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    size_t dist = static_cast<size_t>(iris::utflib::distance(twochars, twochars + 5));
+    EXPECT_EQ (dist, 2);
+}
+
+TEST(CheckedAPITests, test_utf32to8)
+{
+    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+    string utf8result;
+    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CheckedAPITests, test_utf8to32)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    vector<unsigned int> utf32result;
+    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CheckedAPITests, test_utf16to8)
+{
+    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    string utf8result;
+    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    EXPECT_EQ (utf8result.size(), 10);
+}
+
+TEST(CheckedAPITests, test_utf8to16)
+{
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    vector <char16_t> utf16result;
+    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CheckedAPITests, test_replace_invalid)
+{
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    EXPECT_TRUE (bvalid);
+    const char fixed_invalid_sequence[] = "a????z";
+    EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size());
+    EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+}
+
+TEST(CheckedAPITests, test_find_invalid)
+{
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    const char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+    EXPECT_EQ (invalid, utf_invalid + 5);
+    invalid = utf_invalid + find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, utf_invalid + 5);
+}
+
+TEST(CheckedAPITests, test_is_valid)
+{
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+    EXPECT_FALSE (bvalid);
+    bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
+    EXPECT_TRUE (bvalid);
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CheckedAPITests, test_starts_with_bom)
+{
+    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
+    EXPECT_TRUE (bbom);
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    bool no_bbom = starts_with_bom(threechars, threechars + 9); // 9 octets; sizeof(threechars) would be the pointer size
+    EXPECT_FALSE (no_bbom);
+}
+
+TEST(CheckedIteratrTests, test_increment)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<const char*> it(threechars, threechars, threechars + 9);
+    iris::utflib::iterator<const char*> it2 = it;
+    EXPECT_EQ (it2, it);
+    EXPECT_EQ (*it, 0x10346);
+    EXPECT_EQ (*(++it), 0x65e5);
+    EXPECT_EQ ((*it++), 0x65e5);
+    EXPECT_EQ (*it, 0x0448);
+    EXPECT_NE (it, it2);
+    iris::utflib::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
+    EXPECT_EQ (++it, endit);
+}
+
+TEST(CheckedIteratrTests, test_decrement)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<const char*> it(threechars+9, threechars, threechars + 9);
+    EXPECT_EQ (*(--it), 0x0448);
+    EXPECT_EQ ((*it--), 0x0448);
+    EXPECT_EQ (*it, 0x65e5);
+    EXPECT_EQ (--it, iris::utflib::iterator<const char*>(threechars, threechars, threechars + 9));
+    EXPECT_EQ (*it, 0x10346);
+}
diff --git a/test/unicode/string/negative.cpp b/test/unicode/string/negative.cpp
new file mode 100644
index 0000000..665585b
--- /dev/null
+++ b/test/unicode/string/negative.cpp
@@ -0,0 +1,61 @@
+#include "utf8.h"
+
+using namespace iris::utflib;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+using namespace std;
+
+const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264};
+const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
+
+int main(int argc, char** argv)
+{ // Usage: <program> <test file>. Returns 1 on a bad invocation, an unreadable file, or any mismatch.
+    string test_file_path;
+    if (argc == 2)
+        test_file_path = argv[1];
+    else {
+        cout << "Wrong number of arguments" << endl;
+        return 1;
+    }
+    // Open the test file in binary mode so its octets are read without text-mode translation
+    ifstream fs8(test_file_path.c_str(), ios::in | ios::binary);
+    if (!fs8.is_open()) {
+        cout << "Could not open " << test_file_path << endl;
+        return 1;
+    }
+
+    // Read it line by line; line_count is 1-based to match the entries of INVALID_LINES
+    unsigned int line_count = 0;
+    char byte;
+    while (!fs8.eof()) {
+        string line;
+        while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
+            line.push_back(byte);
+
+        line_count++;
+        bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
+        // Print out lines that contain unexpected invalid UTF-8
+        if (!is_valid(line.begin(), line.end())) {
+            if (expected_valid) {
+                cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
+                return 1;
+            }
+
+            // try fixing it: the repaired line must itself pass is_valid
+            string fixed_line;
+            replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
+            if (!is_valid(fixed_line.begin(), fixed_line.end())) {
+                cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
+                return 1;
+            }
+        }
+        else if (!expected_valid) { // line passed validation although it is listed as invalid
+            cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
+            return 1;
+        }
+    }
+}
diff --git a/test/unicode/string/test_cpp11.cpp b/test/unicode/string/test_cpp11.cpp
new file mode 100644
index 0000000..9de19be
--- /dev/null
+++ b/test/unicode/string/test_cpp11.cpp
@@ -0,0 +1,117 @@
+#include "ftest.h"
+
+#include "utf8.h"
+
+#include <string>
+
+using namespace iris::utflib;
+using namespace std;
+
+TEST(CPP11APITests, test_append)
+{
+    string u;
+    append(0x0448, u);
+    EXPECT_EQ (u[0], char(0xd1));
+    EXPECT_EQ (u[1], char(0x88));
+    EXPECT_EQ (u.length(), 2);
+
+    u.clear();
+    append(0x65e5, u);
+    EXPECT_EQ (u[0], char(0xe6));
+    EXPECT_EQ (u[1], char(0x97));
+    EXPECT_EQ (u[2], char(0xa5));
+    EXPECT_EQ (u.length(), 3);
+
+    u.clear();
+    append(0x3044, u);
+    EXPECT_EQ (u[0], char(0xe3));
+    EXPECT_EQ (u[1], char(0x81));
+    EXPECT_EQ (u[2], char(0x84));
+    EXPECT_EQ (u.length(), 3);
+
+    u.clear();
+    append(0x10346, u);
+    EXPECT_EQ (u[0], char(0xf0));
+    EXPECT_EQ (u[1], char(0x90));
+    EXPECT_EQ (u[2], char(0x8d));
+    EXPECT_EQ (u[3], char(0x86));
+    EXPECT_EQ (u.length(), 4);
+}
+
+TEST(CPP11APITests, test_append16)
+{
+    u16string u;
+    append16(0x0448, u);
+    EXPECT_EQ (u[0], char16_t(0x0448));
+    EXPECT_EQ (u.length(), 1);
+}
+
+TEST(CPP11APITests, test_utf16to8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    string u = utf16to8(utf16string);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP11APITests, test_utf8to16)
+{
+    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+    // Just to make sure it compiles with string literals
+    EXPECT_EQ(utf8to16(u8"simple"), u"simple");
+    EXPECT_EQ(utf8to16("simple"), u"simple");
+}
+
+TEST(CPP11APITests, test_utf32to8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    string utf8result = utf32to8(utf32string);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP11APITests, test_utf8to32)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP11APITests, test_find_invalid)
+{
+    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP11APITests, test_is_valid)
+{
+    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP11APITests, test_replace_invalid)
+{
+    string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const string fixed_invalid_sequence = "a????z";
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP11APITests, test_starts_with_bom)
+{
+    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    bool bbom = starts_with_bom(byte_order_mark);
+    EXPECT_TRUE (bbom);
+    string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
diff --git a/test/unicode/string/test_cpp17.cpp b/test/unicode/string/test_cpp17.cpp
new file mode 100644
index 0000000..2d3756c
--- /dev/null
+++ b/test/unicode/string/test_cpp17.cpp
@@ -0,0 +1,86 @@
+#include "ftest.h"
+
+#include "utf8.h"
+
+#include <string>
+
+using namespace iris::utflib;
+using namespace std;
+
+TEST(CPP17APITests, test_utf16to8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    u16string_view utf16stringview(utf16string);
+    string u = utf16to8(utf16stringview);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP17APITests, test_utf8to16)
+{
+    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CPP17APITests, test_utf32to8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    u32string_view utf32stringview(utf32string);
+    string utf8result = utf32to8(utf32stringview);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP17APITests, test_utf8to32)
+{
+    string_view twochars = "\xe6\x97\xa5\xd1\x88";
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP17APITests, test_find_invalid)
+{
+    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP17APITests, test_is_valid)
+{
+    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP17APITests, test_replace_invalid)
+{
+    string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const string fixed_invalid_sequence = "a????z";
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP17APITests, test_starts_with_bom)
+{
+    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    string_view byte_order_mark_view(byte_order_mark);
+    bool bbom = starts_with_bom(byte_order_mark_view);
+    EXPECT_TRUE (bbom);
+    string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
+
+TEST(CPP17APITests, string_class_and_literals)
+{
+    const char* twochars = "ab";
+    EXPECT_TRUE (is_valid(twochars));
+    const string two_chars_string(twochars);
+    EXPECT_TRUE (is_valid(two_chars_string));
+}
diff --git a/test/unicode/string/test_cpp20.cpp b/test/unicode/string/test_cpp20.cpp
new file mode 100644
index 0000000..330027d
--- /dev/null
+++ b/test/unicode/string/test_cpp20.cpp
@@ -0,0 +1,79 @@
+#include "ftest.h"
+
+#include "utf8.h"
+
+#include <string>
+
+using namespace iris::utflib;
+using namespace std;
+
+TEST(CPP20APITests, test_utf16tou8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    u16string_view utf16stringview{utf16string};
+    u8string u = utf16tou8(utf16string);
+    EXPECT_EQ (u.size(), 10);
+    u = utf16tou8(utf16stringview);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP20APITests, tes20t_utf8to16)
+{
+    u8string utf8_with_surrogates{ reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CPP20APITests, test_utf32tou8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    u32string_view utf32stringview{utf32string};
+    u8string utf8result = utf32tou8(utf32stringview);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP20APITests, test_utf8to32)
+{
+    u8string twochars = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88");
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP20APITests, test_find_invalid)
+{
+    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP20APITests, test_is_valid)
+{
+    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    u8string utf8_with_surrogates = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP20APITests, test_replace_invalid)
+{
+    u8string invalid_sequence = reinterpret_cast<const char8_t*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
+    u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const u8string fixed_invalid_sequence = reinterpret_cast<const char8_t*>("a????z");
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP20APITests, test_starts_with_bom)
+{
+    u8string byte_order_mark = reinterpret_cast<const char8_t*>("\xef\xbb\xbf");
+    bool bbom = starts_with_bom(byte_order_mark);
+    EXPECT_TRUE (bbom);
+    u8string threechars = reinterpret_cast<const char8_t*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
diff --git a/test/unicode/string/test_data/utf8_invalid.txt b/test/unicode/string/test_data/utf8_invalid.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae83159328313d3ba57b4f51ccb96db3dbfd79f1
GIT binary patch
literal 20010
zcmdU1X_MU6b<Jn|iktqTp*7Q-jhrDXNtLMKM676WmB>ovBN~mKh6fF_0QAgssw<g9
zPVB69_B~eiRg$wfi5<(NoqdzD?}_N_8!aVKVkMU4oO2)WfL=I64|^&Wip4?q0O#HP
z-uK>pbL0B%%C@w9FK+vZcy8Zq1!0g4#f{pMW{IDslKE-otkk?5=P@_gAEfdHgKi|B
z)6KH}<@4tc4-cDO*!0|9)AgEz{qtYa+uL8iaQ^dTMb=kWH&#~HSJo~$&h>6!#*<DE
z`k*J0t{?Vg7!Slha(hAKFH1K*<QIwD7$d$5l0!dr4%{S&K|nU_`>Dho@!}-$y=+-}
zZrF*Fp5I=UIFUH(`pIFC`VvH$4^Fcn+B29=>OVE`BhOEpq9iFbAT5eB>G^JyNft}X
z7Z>0WF<m^Yf@;dOOxiK%;In<edP&d%A#OC3zMEu$HwayfKFGYd2L#BOg!n)K0`Eiz
zl0$chQJhBKP16PmZgkurY~Y!yjF!be@FRQ~5B9ozA5<J=sicFwJ+Opv9T3s+6SoC{
z7K(29UH1S|O2o}1B^~|H&DjM_L60;9QJVQK$cr7T1PmVSi`xSEhIEXfqn3z}cttyL
z1Pan2ka{k7hrGfN$_Vg19vno2bl`><sz1me?5jt8KM5##P$eD6so=xOx*)X|W4KO`
zq}fUqP<jI}o~V|`j3LEC5G>vZeHX99kq@03yXYq|L;(&dH9ts7p$uh^(gsw*RC$vY
zY4j=~Y%hh*Ls*{|AHZ86gsAN%L)t()AWLb*#wIOE+*A(zFyznNBEzA8Rd`-IH)U&f
zl+BGO;P3Eo+jsggj1)FaA*9fC*p>*2haGtyv<O~kifZRMR!;vz9=>iXSFv1s@(Ed6
zy|{UCW$iNDZgp8wql;8_np>*S_2!1R46wF;5O}@=$MJ&$>lhf@1q<phFe})AG^|$v
z%y>;MT1j1~2xe?_7i8soFiY{eZsL0IJi`?X4n~y*;FwnIw&Q*V!K8y$>Sqno<}~ap
zxg=aT3*GE6pk2;*FQ*rt3UdfN7l;l#mjeu74-IJ{(n-mr;kqdg$y0+MqwIUVID($=
zG|Z_F+i|@vz?=zlqf5pBQa1vz!!p=R=vIe>T=mJGk5FQ{lBF9D?(XjHoLiQ|Zh&#Z
zINqnIT992B>@!+mY^a@lK-LIZ@S`3Au0}Lh(x4YGx}?KiD-Jal^@6=_W@6P|9Jd7m
zcInaw0X+|_wF8ZOgCOl<s8XATL#xwH68ESzV`Smbv8G$t>6AX-Hh}9<Kkfk{mj<9*
zOR(7ltUH|;{I?KG%3$0Xkf_k#bHnl2Rxw@;u+amVgSgv<QlZcQ(c>Dt0Fw9I6j?$C
z5B8!sq4kz*6%4`L5@!&Vq**nj3a^Cc(Ve=r!$Ua%e4yv<`%c?$A<pUm4Bzdb(_wr^
zu3=nf`a=9+1XXi`@S+YvOdKJwyp@p&7S{DWXz#G=!zdAsE0j(s04BDyc8)fNqypj5
zATtLlT$ZKv@$X^P$d+CPK_c?ODxy6E|GU^6!~nky4&0E&m_lT32n!?wOB?6ZS2_sV
z@gdoKY7l3L<GIy|+oAuukpqcPW0STHnytam&|vgKrr)6HH<27;M0>bGI*hz-5=X%o
z8|Vz8O-)9Upx1NveCQaw6r)v~%Vz3oqE6jmDvj%pT$QVj?=;{s%0$RSz$lKW7fYa7
z3Yw-Fr9Mr_h1xW86<N|t+_cLWgapZ=qoql@aLzmoyCEVR6NBTqh!TQ4PCa{mLM=Z8
zOEH;4senARlwja0G_-Z8i~uQI9#UTFhb&+arTDJX@%{EWW8Q6(XrLYo>_brufYKN>
z;4w#@-$#xF6?{KCM6^Ska10Ua&9&y5s&>7J`+^nFo4Ft`ildc2{3TOgedx)DuU#WA
zSrUh-bK^l|j8%P^w_CfQC>NqpECww2Q+izRdgQlP4udwRqpL3?6To&!X_aRyNh2#6
z^r<sc3WHuG5YJMgF${4Iu6C|zszeANK1{VTWJG5j0xS0R!(r)7l{%!*DeD3ity84S
zXwYjhk)lv6A{c8Q)>X*u#$Yt>g1N#?ePjUypS+y3i4X6kg`2faZaK;KU>}di8oiL!
zQ5GOvxFH-Wf*Tw_cP3FGfYRSU-}Hc*mr$K0Fly#kV1zJkSKwG3q;@)uM}ts>F43pK
znHGF$kPXz`Qmr5>RpL235Ds)ukYMADkZHW$Mm^AG(bgJ*c92zxS=Jqk#5yC(S+g-w
z#aIod8u0PGA@H554=&S+Bg6s?mKx6Ks<{cNg+PkVBP^S>o>rrWF{GE})Yr&4^{X>q
z964(uPqNA`Y|eysR_ZffxW|`^r|*j}zskMP#6n*_Kk@y2IYjX&5A4Tj&wrqhXye5X
zeC&nyfAr}WKltMN8h0mk@l>vhT$!r5jq5Y#r+24i)^E;$KEzhp<?KZIahe94aTVo;
zaTH_)4YWwLdx(iAUS~odDO@w2i)wR)5uMk$LbuZRtP@l^POrmk0-dZY9;%Z9Z@Y^3
zk>k36BL&<L*_Z&_9wYGfGjk$tifoP{u1&D~%Da!}guEbfp$f^#)EJSkz4Lfp%!?uy
zE0~)KS(W7LZwKZagpO<Lem_w=Q1+v&q84D?au((dgPdXBTe}tW{?fYEDCBpRd2j7j
z@m{)F7f=P*oHlRF_^N;}-<%V1(!AFQSVjEA&3Pdw%zHx-tB{`r<eX%Vn>U20S4A_Q
zBIX>R_iNDO?^8R_%@e2}X0GgvKYi~{XJOtOG;b4RcOXFdT*2hzQb$@H+&1pKr?&8v
zeC3!1ZV;~`-Vo83?N$B*;>YVpn+hVNXb_>`9U>}$tu_%~zWG93cc~!4!il)HN|OOh
zXH&Kfs51WQyXu>bAnriXO<1@>YAP_0my~V8`Kxne1VY~19_D#s<Uodx&6r)x15$bR
z4Cqr!jC?HDVg(N?9hlq&k(p2AIjl1Ny>1sz-&Iak1}D&0RJ&LiOnofNn*|25_z`kt
zh0AWmHJUNiV?7sOMK(T+n#FT<e=eS1L<VZa)SgqykkzB@)yvYTTRXu1Ol7CZep<P}
zYjuEEH{P5LzwnXuY1%OFYaG`lELWJ}7g1*8Zg_PiX8;WzLd@n}@#dn?;30utxI5lF
z6dJrO)|*qdV&r#xa~>HQyls)Klkn<OcsBNO33KpbgMzCTSU|hDS(dWXl8`0(%?gdZ
zEK{iA#nT7S#_^YZ`O|Ozim!afJHP6yzvgSd?(5%WZ0R)r8~Wb;4d3`p-~27#`fcC-
z9pCv~-~Bz``<~MvnRg7{GvD|9Kk$P;^us^$qd)fJKk<{#KKK0n;=T8$e)?yA_I*G1
z^S|)^5B%Z>U--~m9x*=l;`OniQyr|GV*xGAuxndg8yp|J#-}r(kMW+X%Ny%vLo`pj
zu<Z2}U70IZ9xyRqJ@RPB?GD!UXIz#(UEwp!a>#+YH<tFXFI;kO*9m68*sm;}{pS8|
z6t8hhJ}e)RUy@&zUy)yxUz1;#-;m#w-;&>!-;o#PqjL;l==)vyJ^6k41NlSwBl%<b
z6Zup5Gx>A*3;9d=EBR}AN$MA>i{}^iM+>BW*Dg-#cVmWred^bLQ|tG(d`$jE{#O1@
z{$Bn;{!#u({#pJ-{#E`>?#RdG<^Okmpx&T9wkOrGIYS*Cs-v?t+uh3i@fq!JugJg4
zf5?BztMXs+-|~q$<Ui?(XX78J*Qk%pNqt<HrH^guqxGiN$7}LG^1t%Q`_Mz7m$T6a
z>Rsw%Dji*%r4Nt#XuqlT@w$9!0lJ`0&O#rERz<aoZA)$s>;Cstl`FjfI~CY-KH*wC
zeKZsnDcZTEO)c2G!zLd#6pON@Iz!i<1UoIyw^U1lXU)$|kAZIJ7f9UM(q<m^$xL-b
zlyK08-3iu>U@j)0uHx1rmJ~9yQ0b-<I!=}zF1L)aWWc@Eytxe>8TMMd{5G(Jh5B!z
z(yQoME5JgvXe+90Pc&{#RyLf!X~jEHS=7Y4eLNcwCODBBYXZJ9z3ym8M980r?llo#
zo3U|XAQ81D0#r@N*Jo|U7);F6Yiq`A!I?dU#!IsWiJ5wB%_tt%Nik)`a_ekBOx0^^
zh6}I6Q$TDb%tplIy0&IkR?7{A26qx>L&D2yYi6agpU}{aglSCcjtjY=reTf`qtgd#
zhwRueJsJTTIwvAoT4%t;#|Xw)hAhajh^ZQG;p<$c$;oCn78)+9F%mD7ohAkCBiq`+
zUMD6DyI5LlPFUa+KNrt0kQ=vdA0v3s+toc^dif?SmL@jSCJI{W{#-oGx)E`$XZKT0
zPOMAM<a3Ls?>g+KZ2n_2nV}G!joR1Q@B8RBMcT8Cp+yH^rfrww6fT}V$d7rj<8yQA
zjY*#$AN}5+=+hTC@9yZn(x(@LhcpQK_^*=#zwySWPldgB`5}Tmfi^d7IxK_4y(KmK
ziN(`5Zhs|(MdNoFr_knB_PI0cp_pWED(W_Sjo5x@`|8)52#gsUh?90x9*Gax@TdJ|
zXqrPi8HEA>yn@C^w85t-d;Qr$S6m}!((Fx7G(5r_6>-_4h?O+7(V^M3J>VIUuHR#S
zc_VdU5~dHV#LK%B8$~!y{I{W=)R#;{vvz@(T$d8jhEMos^%|kK)=hCN;j(rP<6sKU
zacHNyHZHTFJ!Drrau<r%j__E+ge_1K#Wm71EjSc}jHg><=ot>$Wrtu61d-eN87CrI
zzG;lr?odNO4hVh3lJ@nSjGoCzalixP<$ateuTe$MuusSmEr=yuC4JTQN!z$Mn=EbV
z^z<YJ<QaH8GlbP%YzW$G<Y?PJh-?jc3I{6<Y|mu-C>l1kbK6WPDEz!z*3Ji-vWsuW
zu93qdH9RWB6Bc=iShUfz(G*bvf?axa_Z%AU^o$2&1BxI(ZQ_UdRRKrZ)r(|4-~pzb
zDt<to=b?qwqg9+8;4}+bK$AT>L=7w&{y71wu&S^lG=au6bfaBYn_mr0Tnmn{qgt`r
zqN6s1HE0f?bB$fV3;I48gD}kT0Zfku3D1}5ffjn_LEs-Q7t%xN1a0L<(D2Ji|0wO9
zr?5=7D_s`qn3TWz@LMQ{w{Wkg@WU$8^pCQc9xNIRlj4xn$WY=U3Q&?<(c9{v%{<1T
zopb;*RfDcFjChc8L!4|C&C?Zzmf5XqG*k8dT#h*7rF>KXNuT<^^~>=i0We6Im^mtf
zIW0~?Q4?xVEYSQTob13P)n1E_&zsD?S8*^8bN6B{Z;74Ih$Z8$H;bq53%T$C(Uh?^
zm$qr3hn}e8vcl=G@7gjjNod7h%9uicNlsI&wRB~valV0GYI9l(c4w3c-zqNpFAw!p
z%I$FlFROx!paECb<4_qoxAm+d{h16F7xU4~Lmsx}noA%Mhmxm9*-Y`@j`g$)&IIz;
z_Cn1+CAiPFzn%YP@%n}R!2-FwvChUI562TK`5qLaEWGu6?K*Dc5OBQ&^tC6biV48m
z$MYg?l!%?8%gBNP5p$M#AvY_K+j+|tdY23;AUOw`7xO{|(<?CzEHHIWH4o_gCUVjn
z)DDXC$K0c!H|UjaS|){)-k$#5;?41jo(|RNG#rhQp^K9%JepD*=0wFb;BSzyP$QZ=
zi~~?OGRBC_iBx0>9XU1<6)dtytL7|qe~z<<z=bl^9%a!SmY<0#08xIhtH~Z))iX29
zAk$ustik024TohdSH!X$aO6~j+C*hK9>}U$s32$R=i>DNdtrhy?T9?G#2A70_o*21
z;i)yKDeCZr8X!$D2h4jgSMwfB`wC-3y=F{<h#8_m<T4vbWQ-{K8x1I9M9nTo1Br>9
z_BRqyArT|C$hi3jRK|#R%vgN0TvTohrs<i?*X~FrVgy?C3ryTFMm!D7n$oO{uz*_<
z2}wRt%wepbXs#|6@E^bNXgNDMan!||<MruUqi}4*0#hgG`K=TDsw8=opFu7mI5*8<
zxH<!rVy>pnSyn0(!qipS^{CX0Q;*gDV)62We$}1LK}1dW$W7G?26$^~H=_DL^<E7|
z-MIiX>As2yz}wTi5ml7xy+Ncq7l=09TTmb(-OouR-PuA`(!D_iB-8zzm~>~0IiBtT
z3rtP-bAVPQasT%t2LxWrj}_}5Lcl7i`6~;j0$;p2j=S?j!YdrHmMfE?$^v_J&BPz(
zf9S!6Gcl_!_B?tS&XC>9)hSK}3{AyEU8@dr98^!h_OSQMWd)q~%@?Q2*}3@}8EHk*
z2oIc{o4+H1B@tVjOfX(Ib}U{WTnc{DZ0wRIG)`ffp2T}tpQtiY;pwE?GojCfuN@p{
zgM)FEMP~!>wL8z$b~H;gJjAqV-*|B6iq<wnMBkn!;ycgQ%EA_nHwIDT*Ul9U0z}jk
zA*AuS)6%HnYv+nqB1F`Zfr!tamPQR;I}aCfY*{(J_p~%((-v~vEx3l?nkabZL#IW=
zwk;70u1%ptuItrD6Pg+Oc;6qW>2-Ol8zj)jXQ;QB+WkTG76{TR+c*vAF17y5@tu1s
zUU@e8zQDA~DKV{W0`uOAGMKr?zz4_ZR976M^VyrV>CAn`V3u@ROiL#)>(XhwhMqk}
zr<(d0ozK;mr}3K{v!v5vS~`JQmrmn3j2B~cD%2RA&)1hHdL96lW0rJUOiL#)>(F`M
VE;t6T_-vE^`IE6xPXA%#{2v5ASmOWy

literal 0
HcmV?d00001


From 1d46090e99696f5201300a7b8b14d4acc0edcbbd Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 15:36:24 +0900
Subject: [PATCH 02/17] Combine unit tests

---
 include/iris/unicode/string.hpp               | 239 +++++----
 test/unicode/string/CMakeLists.txt            |  11 +
 test/unicode/string/apitests.cpp              | 257 ---------
 test/unicode/string/string.cpp                | 488 ++++++++++++++++++
 test/unicode/string/test_cpp11.cpp            | 117 -----
 test/unicode/string/test_cpp17.cpp            |  86 ---
 test/unicode/string/test_cpp20.cpp            |  79 ---
 .../string/{negative.cpp => utf8_invalid.cpp} |   0
 8 files changed, 618 insertions(+), 659 deletions(-)
 delete mode 100644 test/unicode/string/apitests.cpp
 create mode 100644 test/unicode/string/string.cpp
 delete mode 100644 test/unicode/string/test_cpp11.cpp
 delete mode 100644 test/unicode/string/test_cpp17.cpp
 delete mode 100644 test/unicode/string/test_cpp20.cpp
 rename test/unicode/string/{negative.cpp => utf8_invalid.cpp} (100%)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index 08d8d1f..a09ebc8 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -26,77 +26,77 @@ DEALINGS IN THE SOFTWARE.
 */
 
 
-#ifndef IRIS_UTFLIB_UTF8_H
-#define IRIS_UTFLIB_UTF8_H
+#ifndef IRIS_UNICODE_STRING_HPP
+#define IRIS_UNICODE_STRING_HPP
 
 #include <concepts>
-#include <exception>
+#include <stdexcept>
 #include <iterator>
 #include <stdexcept>
 #include <string>
 #include <string_view>
+#include <type_traits>
+#include <utility>
 
-#include <cstring>
-
-namespace iris::utflib
+namespace iris::unicode
 {
-    template <typename T>
+    template<class T>
     concept octet = std::integral<T> && sizeof(T) == 1;
 
-    template <typename T>
+    template<class T>
     concept utf8char = octet<T> && (std::same_as<T, char> || std::same_as<T, char8_t>);
-    
-    template <typename T>
+
+    template<class T>
     concept utf16char = std::same_as<T, char16_t>;
-    
-    template <typename T>
+
+    template<class T>
     concept utf32char = std::same_as<T, char32_t>;
 
-    template <typename It>
-    concept octet_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>;
+    template<class It>
+    concept octet_input_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>;
 
-    template <typename It>
-    concept utf8_iterator = octet_iterator<It> && utf8char<std::iter_value_t<It>>;
+    template<class It>
+    concept utf8_input_iterator = octet_input_iterator<It> && utf8char<std::iter_value_t<It>>;
 
-    template <typename It>
-    concept utf16_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>;
-    
-    template <typename It>
-    concept utf32_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
+    template<class It>
+    concept utf16_input_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>;
+
+    template<class It>
+    concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
 
     namespace traits
     {
-        template <typename T, typename = void>
+        template<class T, class = void>
         struct is_nothrow_dereferenceable : std::false_type {};
 
-        template <typename T>
+        template<class T>
         struct is_nothrow_dereferenceable<T, std::void_t<decltype(*std::declval<T>())>> : std::bool_constant<noexcept(*std::declval<T>())> {};
 
-        template <typename T>
+        template<class T>
         inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable<T>::value;
-        
-        template <typename T, typename = void>
+
+        template<class T, class = void>
         struct is_nothrow_prefix_incrementable : std::false_type {};
 
-        template <typename T>
+        template<class T>
         struct is_nothrow_prefix_incrementable<T, std::void_t<decltype(++std::declval<T>())>> : std::bool_constant<noexcept(++std::declval<T>())> {};
 
-        template <typename T>
+        template<class T>
         inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable<T>::value;
-        
-        template <typename T, typename = void>
+
+        template<class T, class = void>
         struct is_nothrow_postfix_incrementable : std::false_type {};
 
-        template <typename T>
+        template<class T>
         struct is_nothrow_postfix_incrementable<T, std::void_t<decltype(std::declval<T>()++)>> : std::bool_constant<noexcept(std::declval<T>()++)> {};
 
-        template <typename T>
+        template<class T>
         inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable<T>::value;
 
-        template <typename It, typename Se>
+        template<class It, class Se>
         struct is_nothrow_sentinel : std::false_type {};
 
-        template <typename It, typename Se>
+        template<class It, class Se>
             requires std::sentinel_for<Se, It>
         struct is_nothrow_sentinel<It, Se> : std::bool_constant<
             noexcept(std::declval<It&>() == std::declval<Se&>()) &&
@@ -106,7 +106,7 @@ namespace iris::utflib
         >
         {};
 
-        template <typename It, typename Se>
+        template<class It, class Se>
         inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
     } // namespace traits
 
@@ -128,7 +128,7 @@ namespace iris::utflib
 
         enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT };
 
-        template <octet Octet>
+        template<octet Octet>
         [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
         {
             return static_cast<char8_t>(0xff & oc);
@@ -139,7 +139,7 @@ namespace iris::utflib
             return static_cast<char16_t>(0xffff & oc);
         }
 
-        template <octet Octet>
+        template<octet Octet>
         [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept
         {
             return ((internal::mask8(oc) >> 6) == 0x2);
@@ -185,7 +185,7 @@ namespace iris::utflib
             return false;
         }
 
-        template <octet_iterator It>
+        template<octet_input_iterator It>
         [[nodiscard]] constexpr int sequence_length(It lead_it)
             noexcept(traits::is_nothrow_dereferenceable_v<It&>)
         {
@@ -203,7 +203,7 @@ namespace iris::utflib
         }
 
         /// Helper for get_sequence_x
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
         constexpr utf_error increase_safely(It& it, Se end)
             noexcept(std::conjunction_v<
                 traits::is_nothrow_dereferenceable<It&>,
@@ -228,7 +228,7 @@ namespace iris::utflib
     } while (false)
 
         /// get_sequence_x functions decode utf-8 sequences of the length x
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
         constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
                 traits::is_nothrow_dereferenceable<It&>,
@@ -243,7 +243,7 @@ namespace iris::utflib
             return utf_error::OK;
         }
 
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
         constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
                 traits::is_nothrow_dereferenceable<It&>,
@@ -263,7 +263,7 @@ namespace iris::utflib
             return utf_error::OK;
         }
 
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
         constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
                 traits::is_nothrow_dereferenceable<It&>,
@@ -287,7 +287,7 @@ namespace iris::utflib
             return utf_error::OK;
         }
 
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
         constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
                 traits::is_nothrow_dereferenceable<It&>,
@@ -317,7 +317,7 @@ namespace iris::utflib
 
 #undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR
 
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
             requires std::forward_iterator<It>
         constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
@@ -376,7 +376,7 @@ namespace iris::utflib
             return err;
         }
 
-        template <octet_iterator It, std::sentinel_for<It> Se>
+        template<octet_input_iterator It, std::sentinel_for<It> Se>
             requires std::forward_iterator<It>
         constexpr utf_error validate_next(It& it, Se end)
             noexcept(noexcept(internal::validate_next(it, end, std::declval<char32_t&>())))
@@ -385,7 +385,7 @@ namespace iris::utflib
             return internal::validate_next(it, end, ignored);
         }
 
-        template <utf16_iterator It, std::sentinel_for<It> Se>
+        template<utf16_input_iterator It, std::sentinel_for<It> Se>
             requires std::forward_iterator<It>
         constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)
             noexcept(std::conjunction_v<
@@ -429,10 +429,9 @@ namespace iris::utflib
             return err;
         }
 
-        template <typename It, octet octet_type = std::iter_value_t<It>>
-            requires std::output_iterator<It, octet_type>
-        constexpr It append(char32_t cp, It result)
-            noexcept(noexcept(*result++ = std::declval<octet_type>()))
+        template<class OutIt, octet octet_type = std::iter_value_t<OutIt>>
+            requires std::output_iterator<OutIt, octet_type>
+        constexpr OutIt append(char32_t cp, OutIt result) noexcept
         {
             if (cp < 0x80) // one octet
                 *(result++) = static_cast<octet_type>(cp);
@@ -452,14 +451,14 @@ namespace iris::utflib
             return result;
         }
 
-        template <typename container_type>
+        template<class container_type>
         constexpr std::back_insert_iterator<container_type> append(char32_t cp, std::back_insert_iterator<container_type> result)
-            noexcept(noexcept(internal::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result)))
+            noexcept(noexcept(internal::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result)))
         {
-            return internal::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result);
+            return internal::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result);
         }
 
-        template <std::output_iterator<char16_t> It>
+        template<std::output_iterator<char16_t> It>
         constexpr It append16(char32_t cp, It result)
             noexcept(noexcept(*result++ = std::declval<char16_t>()))
         {
@@ -534,7 +533,7 @@ namespace iris::utflib
     // Byte order mark
     constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr It find_invalid(It it, Se se)
         noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v<It>)
     {
@@ -547,39 +546,39 @@ namespace iris::utflib
     }
 
     [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s)
-        noexcept(noexcept(utflib::find_invalid(s.begin(), s.end())))
+        noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
     {
-        std::string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end());
+        std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
         return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
     }
 
     [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s)
-        noexcept(noexcept(utflib::find_invalid(s.begin(), s.end())))
+        noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
     {
-        std::u8string_view::const_iterator invalid = utflib::find_invalid(s.begin(), s.end());
+        std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
         return (invalid == s.end()) ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr bool is_valid(It it, Se se)
-        noexcept(noexcept(utflib::find_invalid(it, se)) && traits::is_nothrow_sentinel_v<It, Se>)
+        noexcept(noexcept(unicode::find_invalid(it, se)) && traits::is_nothrow_sentinel_v<It, Se>)
     {
-        return (utflib::find_invalid(it, se) == se);
+        return (unicode::find_invalid(it, se) == se);
     }
 
     [[nodiscard]] constexpr bool is_valid(std::string_view s)
-        noexcept(noexcept(utflib::is_valid(s.begin(), s.end())))
+        noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
     {
-        return utflib::is_valid(s.begin(), s.end());
+        return unicode::is_valid(s.begin(), s.end());
     }
 
     [[nodiscard]] constexpr bool is_valid(std::u8string_view s)
-        noexcept(noexcept(utflib::is_valid(s.begin(), s.end())))
+        noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
     {
-        return utflib::is_valid(s.begin(), s.end());
+        return unicode::is_valid(s.begin(), s.end());
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr bool starts_with_bom(It it, Se end)
         noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v<It, Se>)
     {
@@ -587,19 +586,19 @@ namespace iris::utflib
     }
 
     [[nodiscard]] constexpr bool starts_with_bom(std::string_view s)
-        noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end())))
+        noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
     {
-        return utflib::starts_with_bom(s.begin(), s.end());
+        return unicode::starts_with_bom(s.begin(), s.end());
     }
 
     [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s)
-        noexcept(noexcept(utflib::starts_with_bom(s.begin(), s.end())))
+        noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
     {
-        return utflib::starts_with_bom(s.begin(), s.end());
+        return unicode::starts_with_bom(s.begin(), s.end());
     }
 
-    template <typename It>  // TODO: add constraints
-    constexpr It append(char32_t cp, It result)
+    template<class OutIt>
+    constexpr OutIt append(char32_t cp, OutIt result)
     {
         if (!internal::is_code_point_valid(cp))
             throw invalid_code_point(cp);
@@ -609,15 +608,15 @@ namespace iris::utflib
 
     constexpr void append(char32_t cp, std::string& s)
     {
-        utflib::append(cp, std::back_inserter(s));
+        unicode::append(cp, std::back_inserter(s));
     }
 
     constexpr void append(char32_t cp, std::u8string& s)
     {
-        utflib::append(cp, std::back_inserter(s));
+        unicode::append(cp, std::back_inserter(s));
     }
 
-    template <typename It>  // TODO: add constraints
+    template<class It>  // TODO: add constraints
     constexpr It append16(char32_t cp, It result)
     {
         if (!internal::is_code_point_valid(cp))
@@ -628,10 +627,10 @@ namespace iris::utflib
 
     constexpr void append16(char32_t cp, std::u16string& s)
     {
-        utflib::append16(cp, std::back_inserter(s));
+        unicode::append16(cp, std::back_inserter(s));
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se, typename Out>  // TODO: add constraints
+    template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
     constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)
     {
         while (start != end) {
@@ -643,17 +642,17 @@ namespace iris::utflib
                         *out++ = *it;
                     break;
                 case internal::utf_error::NOT_ENOUGH_ROOM:
-                    out   = utflib::append(replacement, out);
+                    out   = unicode::append(replacement, out);
                     start = end;
                     break;
                 case internal::utf_error::INVALID_LEAD:
-                    out = utflib::append(replacement, out);
+                    out = unicode::append(replacement, out);
                     ++start;
                     break;
                 case internal::utf_error::INCOMPLETE_SEQUENCE:
                 case internal::utf_error::OVERLONG_SEQUENCE:
                 case internal::utf_error::INVALID_CODE_POINT:
-                    out = utflib::append(replacement, out);
+                    out = unicode::append(replacement, out);
                     ++start;
                     // just one replacement mark for the sequence
                     while (start != end && internal::is_trail(*start))
@@ -664,11 +663,11 @@ namespace iris::utflib
         return out;
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se, typename Out>  // TODO: add constraints
+    template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
     constexpr Out replace_invalid(It start, Se end, Out out)
     {
         constexpr char32_t replacement_marker = static_cast<char32_t>(internal::mask16(0xfffd));
-        return utflib::replace_invalid(start, end, out, replacement_marker);
+        return unicode::replace_invalid(start, end, out, replacement_marker);
     }
 
     [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement)
@@ -699,7 +698,7 @@ namespace iris::utflib
         return result;
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr char32_t next(It& it, Se end)
     {
         char32_t cp               = 0;
@@ -719,7 +718,7 @@ namespace iris::utflib
         return cp;
     }
 
-    template <utf16_iterator It, std::sentinel_for<It> Se>
+    template<utf16_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr char32_t next16(It& it, Se end)
     {
         char32_t cp               = 0;
@@ -729,13 +728,13 @@ namespace iris::utflib
         return cp;
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr char32_t peek_next(It it, Se end)
     {
-        return utflib::next(it, end);
+        return unicode::next(it, end);
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
     [[nodiscard]] constexpr char32_t prior(It& it, Se start)
     {
         // can't do much if it == start
@@ -747,34 +746,34 @@ namespace iris::utflib
         while (internal::is_trail(*(--it)))
             if (it == start)
                 throw invalid_utf8(*it); // error - no lead byte in the sequence
-        return utflib::peek_next(it, end);
+        return unicode::peek_next(it, end);
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se, typename distance_type>
+    template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>
     constexpr void advance(It& it, distance_type n, Se end)
     {
         const distance_type zero(0);
         if (n < zero) {
             // backward
             for (distance_type i = n; i < zero; ++i)
-                (void)utflib::prior(it, end);
+                (void)unicode::prior(it, end);
         } else {
             // forward
             for (distance_type i = zero; i < n; ++i)
-                (void)utflib::next(it, end);
+                (void)unicode::next(it, end);
         }
     }
 
-    template <octet_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr typename std::iterator_traits<It>::difference_type distance(It first, Se last)
+    template<octet_input_iterator It, std::sentinel_for<It> Se>
+    [[nodiscard]] constexpr class std::iterator_traits<It>::difference_type distance(It first, Se last)
     {
-        typename std::iterator_traits<It>::difference_type dist;
+        class std::iterator_traits<It>::difference_type dist;
         for (dist = 0; first != last; ++dist)
-            (void)utflib::next(first, last);
+            (void)unicode::next(first, last);
         return dist;
     }
 
-    template <utf16_iterator It, std::sentinel_for<It> Se, typename OutIt> // TODO: add constraints
+    template<utf16_input_iterator It, std::sentinel_for<It> Se, class OutIt> // TODO: add constraints
     constexpr OutIt utf16to8(It start, Se end, OutIt result)
     {
         while (start != end) {
@@ -795,7 +794,7 @@ namespace iris::utflib
             else if (internal::is_trail_surrogate(cp))
                 throw invalid_utf16(static_cast<char16_t>(cp));
 
-            result = utflib::append(cp, result);
+            result = unicode::append(cp, result);
         }
         return result;
     }
@@ -803,22 +802,22 @@ namespace iris::utflib
     [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)
     {
         std::string result;
-        utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
     [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s)
     {
         std::u8string result;
-        utflib::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
-    template <utf8_iterator It, std::sentinel_for<It> Se, typename OutIt>  // TODO: add constraints
+    template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
     constexpr OutIt utf8to16(It start, Se end, OutIt result)
     {
         while (start != end) {
-            const char32_t cp = utflib::next(start, end);
+            const char32_t cp = unicode::next(start, end);
             if (cp > 0xffff) { // make a surrogate pair
                 *result++ = static_cast<char16_t>((cp >> 10) + internal::LEAD_OFFSET);
                 *result++ = static_cast<char16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
@@ -831,22 +830,22 @@ namespace iris::utflib
     [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)
     {
         std::u16string result;
-        utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
     [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s)
     {
         std::u16string result;
-        utflib::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
-    template <utf32_iterator It, std::sentinel_for<It> Se, typename OutIt>  // TODO: add constraints
+    template<utf32_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
     constexpr OutIt utf32to8(It start, Se end, OutIt result)
     {
         while (start != end)
-            result = utflib::append(*(start++), result);
+            result = unicode::append(*(start++), result);
 
         return result;
     }
@@ -854,22 +853,22 @@ namespace iris::utflib
     [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)
     {
         std::string result;
-        utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
     [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s)
     {
         std::u8string result;
-        utflib::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
-    template <utf8_iterator It, std::sentinel_for<It> Se, typename OutIt>
+    template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
     constexpr OutIt utf8to32(It start, Se end, OutIt result)
     {
         while (start != end)
-            (*result++) = utflib::next(start, end);
+            (*result++) = unicode::next(start, end);
 
         return result;
     }
@@ -877,19 +876,19 @@ namespace iris::utflib
     [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
     {
         std::u32string result;
-        utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
     [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s)
     {
         std::u32string result;
-        utflib::utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
         return result;
     }
 
     // The iterator class
-    template <octet_iterator It>
+    template<octet_input_iterator It>
     class iterator
     {
         It it;
@@ -920,7 +919,7 @@ namespace iris::utflib
         [[nodiscard]] constexpr char32_t operator*() const
         {
             It temp = it;
-            return utflib::next(temp, range_end);
+            return unicode::next(temp, range_end);
         }
         [[nodiscard]] constexpr bool operator==(const iterator& rhs) const
         {
@@ -930,28 +929,28 @@ namespace iris::utflib
         }
         constexpr iterator& operator++()
         {
-            (void)utflib::next(it, range_end);
+            (void)unicode::next(it, range_end);
             return *this;
         }
         constexpr iterator operator++(int)
         {
             iterator temp = *this;
-            (void)utflib::next(it, range_end);
+            (void)unicode::next(it, range_end);
             return temp;
         }
         constexpr iterator& operator--()
         {
-            (void)utflib::prior(it, range_start);
+            (void)unicode::prior(it, range_start);
             return *this;
         }
         constexpr iterator operator--(int)
         {
             iterator temp = *this;
-            (void)utflib::prior(it, range_start);
+            (void)unicode::prior(it, range_start);
             return temp;
         }
     }; // class iterator
 
-} // namespace iris::utflib
+} // iris::unicode
 
-#endif // header guard
+#endif
diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt
index 4fd1e27..ebfa8d4 100644
--- a/test/unicode/string/CMakeLists.txt
+++ b/test/unicode/string/CMakeLists.txt
@@ -1,3 +1,14 @@
 # SPDX-License-Identifier: MIT
 
+set(
+    IRIS_TEST_UNICODE_STRING_TESTS
+    string
+    utf8_invalid
+)
 
+foreach(test_name IN LISTS IRIS_TEST_UNICODE_STRING_TESTS)  # one iteration per entry of the list set above
+    iris_define_test(unicode_string_${test_name} ${test_name}.cpp)  # presumably creates target unicode_string_<name>_test (the name used on the next line) - confirm in iris_define_test
+    set_target_properties(unicode_string_${test_name}_test PROPERTIES FOLDER "test/unicode/string")  # FOLDER only affects how IDE generators group the target
+endforeach()
+
+target_sources(unicode_string_utf8_invalid_test PRIVATE test_data/utf8_invalid.txt)
diff --git a/test/unicode/string/apitests.cpp b/test/unicode/string/apitests.cpp
deleted file mode 100644
index ba8fa90..0000000
--- a/test/unicode/string/apitests.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-#include "ftest.h"
-
-#include "utf8.h"
-
-#include <string>
-
-#include <cassert>
-
-using namespace iris::utflib;
-using namespace std;
-
-TEST(CheckedAPITests, test_append)
-{
-    unsigned char u[5] = {0,0,0,0,0};
-    append(0x0448, u);
-    EXPECT_EQ (u[0], 0xd1);
-    EXPECT_EQ (u[1], 0x88);
-    EXPECT_EQ (u[2], 0);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    append(0x65e5, u);
-    EXPECT_EQ (u[0], 0xe6);
-    EXPECT_EQ (u[1], 0x97);
-    EXPECT_EQ (u[2], 0xa5);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    append(0x3044, u);
-    EXPECT_EQ (u[0], 0xe3);
-    EXPECT_EQ (u[1], 0x81);
-    EXPECT_EQ (u[2], 0x84);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    append(0x10346, u);
-    EXPECT_EQ (u[0], 0xf0);
-    EXPECT_EQ (u[1], 0x90);
-    EXPECT_EQ (u[2], 0x8d);
-    EXPECT_EQ (u[3], 0x86);
-    EXPECT_EQ (u[4], 0);
-
-    // Ensure no warnings with plain char
-    char c[2] = {0,0};
-    append('a', c);
-    EXPECT_EQ (c[0], 'a');
-    EXPECT_EQ (c[1], 0);
-}
-
-TEST(CheckedAPITests, test_append16)
-{
-    char16_t u[5] = {0,0};
-    append16(0x0448, u);
-    EXPECT_EQ (u[0], 0x0448);
-    EXPECT_EQ (u[1], 0x0000);
-
-    append16(0x65e5, u);
-    EXPECT_EQ (u[0], 0x65e5);
-    EXPECT_EQ (u[1], 0x0000);
-
-    append16(0x10346, u);
-    EXPECT_EQ (u[0], 0xd800);
-    EXPECT_EQ (u[1], 0xdf46);
-}
-
-TEST(CheckedAPITests, test_next)
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    const char* w = twochars;
-    unsigned int cp = next(w, twochars + 6);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, twochars + 3);
-
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    w = threechars;
-
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, threechars + 4);
-
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, threechars + 7);
-
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x0448);
-    EXPECT_EQ (w, threechars + 9);
-}
-
-TEST(CheckedAPITests, test_next16)
-{
-    const char16_t u[3] = {0x65e5, 0xd800, 0xdf46};
-    const char16_t* w = u;
-    char32_t cp = next16(w, w + 3);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, u + 1);
-
-    cp = next16(w, w + 2);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, u + 3);
-}
-
-TEST(CheckedAPITests, test_peek_next)
-{
-    const char* const cw = "\xe6\x97\xa5\xd1\x88";
-    unsigned int cp = peek_next(cw, cw + 6);
-    EXPECT_EQ (cp, 0x65e5);
-}
-
-TEST(CheckedAPITests, test_prior)
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    const char* w = twochars + 3;
-    unsigned int cp = prior (w, twochars);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, twochars);
-
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    w = threechars + 9;
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x0448);
-    EXPECT_EQ (w, threechars + 7);
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, threechars + 4);
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, threechars);
-}
-
-TEST(CheckedAPITests, test_advance)
-{
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    const char* w = threechars;
-    advance(w, 2, threechars + 9);
-    EXPECT_EQ(w, threechars + 7);
-    advance(w, -2, threechars);
-    EXPECT_EQ(w, threechars);
-    advance(w, 3, threechars + 9);
-    EXPECT_EQ(w, threechars + 9);
-    advance(w, -2, threechars);
-    EXPECT_EQ(w, threechars + 4);
-    advance(w, -1, threechars);
-    EXPECT_EQ(w, threechars);
-}
-
-TEST(CheckedAPITests, test_distance)
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    size_t dist = static_cast<size_t>(iris::utflib::distance(twochars, twochars + 5));
-    EXPECT_EQ (dist, 2);
-}
-
-TEST(CheckedAPITests, test_utf32to8)
-{
-    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-    string utf8result;
-    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-    EXPECT_EQ (utf8result.size(), 9);
-}
-
-TEST(CheckedAPITests, test_utf8to32)
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    vector<unsigned int> utf32result;
-    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-    EXPECT_EQ (utf32result.size(), 2);
-}
-
-TEST(CheckedAPITests, test_utf16to8)
-{
-    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    string utf8result;
-    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-    EXPECT_EQ (utf8result.size(), 10);
-}
-
-TEST(CheckedAPITests, test_utf8to16)
-{
-    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    vector <char16_t> utf16result;
-    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
-}
-
-TEST(CheckedAPITests, test_replace_invalid)
-{
-    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    vector<char> replace_invalid_result;
-    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
-    bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
-    EXPECT_TRUE (bvalid);
-    const char fixed_invalid_sequence[] = "a????z";
-    EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size());
-    EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
-}
-
-TEST(CheckedAPITests, test_find_invalid)
-{
-    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-    const char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-    EXPECT_EQ (invalid, utf_invalid + 5);
-    invalid = utf_invalid + find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, utf_invalid + 5);
-}
-
-TEST(CheckedAPITests, test_is_valid)
-{
-    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
-    EXPECT_FALSE (bvalid);
-    bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
-    EXPECT_TRUE (bvalid);
-    bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST(CheckedAPITests, test_starts_with_bom)
-{
-    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
-    EXPECT_TRUE (bbom);
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
-    EXPECT_FALSE (no_bbom);
-}
-
-TEST(CheckedIteratrTests, test_increment)
-{
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<const char*> it(threechars, threechars, threechars + 9);
-    iris::utflib::iterator<const char*> it2 = it;
-    EXPECT_EQ (it2, it);
-    EXPECT_EQ (*it, 0x10346);
-    EXPECT_EQ (*(++it), 0x65e5);
-    EXPECT_EQ ((*it++), 0x65e5);
-    EXPECT_EQ (*it, 0x0448);
-    EXPECT_NE (it, it2);
-    iris::utflib::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
-    EXPECT_EQ (++it, endit);
-}
-
-TEST(CheckedIteratrTests, test_decrement)
-{
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<const char*> it(threechars+9, threechars, threechars + 9);
-    EXPECT_EQ (*(--it), 0x0448);
-    EXPECT_EQ ((*it--), 0x0448);
-    EXPECT_EQ (*it, 0x65e5);
-    EXPECT_EQ (--it, iris::utflib::iterator<const char*>(threechars, threechars, threechars + 9));
-    EXPECT_EQ (*it, 0x10346);
-}
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
new file mode 100644
index 0000000..abc34a2
--- /dev/null
+++ b/test/unicode/string/string.cpp
@@ -0,0 +1,488 @@
+#include "iris_test.hpp"
+
+#include <iris/unicode/string.hpp>
+
+#include <string>
+
+namespace iris_unicode_test {
+
+namespace unicode = iris::unicode;
+
+using namespace iris::unicode;
+using namespace std;
+
+TEST_CASE("append") // checks unicode::append() against known UTF-8 byte sequences of 2, 3 and 4 bytes
+{
+    unsigned char u[5] = {0, 0, 0, 0, 0}; // zero-filled scratch buffer, reused (not cleared) by every append below
+    unicode::append(0x0448, u); // U+0448: expected to encode as 2 bytes (D1 88)
+    EXPECT_EQ (u[0], 0xd1);
+    EXPECT_EQ (u[1], 0x88);
+    EXPECT_EQ (u[2], 0); // bytes past the encoded sequence must still hold their initial 0
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    unicode::append(0x65e5, u); // U+65E5: expected 3 bytes (E6 97 A5), overwriting the previous 2-byte result
+    EXPECT_EQ (u[0], 0xe6);
+    EXPECT_EQ (u[1], 0x97);
+    EXPECT_EQ (u[2], 0xa5);
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    unicode::append(0x3044, u); // U+3044: expected 3 bytes (E3 81 84)
+    EXPECT_EQ (u[0], 0xe3);
+    EXPECT_EQ (u[1], 0x81);
+    EXPECT_EQ (u[2], 0x84);
+    EXPECT_EQ (u[3], 0);
+    EXPECT_EQ (u[4], 0);
+
+    unicode::append(0x10346, u); // U+10346: expected 4 bytes (F0 90 8D 86)
+    EXPECT_EQ (u[0], 0xf0);
+    EXPECT_EQ (u[1], 0x90);
+    EXPECT_EQ (u[2], 0x8d);
+    EXPECT_EQ (u[3], 0x86);
+    EXPECT_EQ (u[4], 0); // fifth byte is expected to stay untouched even by the longest case
+}
+
+#if 0
+
+TEST(CheckedAPITests, test_append16)
+{
+    char16_t u[5] = {0, 0};
+    append16(0x0448, u);
+    EXPECT_EQ (u[0], 0x0448);
+    EXPECT_EQ (u[1], 0x0000);
+
+    append16(0x65e5, u);
+    EXPECT_EQ (u[0], 0x65e5);
+    EXPECT_EQ (u[1], 0x0000);
+
+    append16(0x10346, u);
+    EXPECT_EQ (u[0], 0xd800);
+    EXPECT_EQ (u[1], 0xdf46);
+}
+
+TEST(CheckedAPITests, test_next)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    const char* w = twochars;
+    unsigned int cp = next(w, twochars + 6);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, twochars + 3);
+
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    w = threechars;
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, threechars + 4);
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, threechars + 7);
+
+    cp = next(w, threechars + 9);
+    EXPECT_EQ (cp, 0x0448);
+    EXPECT_EQ (w, threechars + 9);
+}
+
+TEST(CheckedAPITests, test_next16)
+{
+    const char16_t u[3] = {0x65e5, 0xd800, 0xdf46};
+    const char16_t* w = u;
+    char32_t cp = next16(w, w + 3);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, u + 1);
+
+    cp = next16(w, w + 2);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, u + 3);
+}
+
+TEST(CheckedAPITests, test_peek_next)
+{
+    const char* const cw = "\xe6\x97\xa5\xd1\x88";
+    unsigned int cp = peek_next(cw, cw + 6);
+    EXPECT_EQ (cp, 0x65e5);
+}
+
+TEST(CheckedAPITests, test_prior)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    const char* w = twochars + 3;
+    unsigned int cp = prior (w, twochars);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, twochars);
+
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    w = threechars + 9;
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x0448);
+    EXPECT_EQ (w, threechars + 7);
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x65e5);
+    EXPECT_EQ (w, threechars + 4);
+    cp = prior(w, threechars);
+    EXPECT_EQ (cp, 0x10346);
+    EXPECT_EQ (w, threechars);
+}
+
+TEST(CheckedAPITests, test_advance)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    const char* w = threechars;
+    advance(w, 2, threechars + 9);
+    EXPECT_EQ(w, threechars + 7);
+    advance(w, -2, threechars);
+    EXPECT_EQ(w, threechars);
+    advance(w, 3, threechars + 9);
+    EXPECT_EQ(w, threechars + 9);
+    advance(w, -2, threechars);
+    EXPECT_EQ(w, threechars + 4);
+    advance(w, -1, threechars);
+    EXPECT_EQ(w, threechars);
+}
+
+TEST(CheckedAPITests, test_distance)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    size_t dist = static_cast<size_t>(iris::utflib::distance(twochars, twochars + 5));
+    EXPECT_EQ (dist, 2);
+}
+
+TEST(CheckedAPITests, test_utf32to8)
+{
+    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+    string utf8result;
+    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CheckedAPITests, test_utf8to32)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    vector<unsigned int> utf32result;
+    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CheckedAPITests, test_utf16to8)
+{
+    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    string utf8result;
+    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    EXPECT_EQ (utf8result.size(), 10);
+}
+
+TEST(CheckedAPITests, test_utf8to16)
+{
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    vector <char16_t> utf16result;
+    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CheckedAPITests, test_replace_invalid)
+{
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    EXPECT_TRUE (bvalid);
+    const char fixed_invalid_sequence[] = "a????z";
+    EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size());
+    EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+}
+
+TEST(CheckedAPITests, test_find_invalid)
+{
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    const char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+    EXPECT_EQ (invalid, utf_invalid + 5);
+    invalid = utf_invalid + find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, utf_invalid + 5);
+}
+
+TEST(CheckedAPITests, test_is_valid)
+{
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+    EXPECT_FALSE (bvalid);
+    bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
+    EXPECT_TRUE (bvalid);
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CheckedAPITests, test_starts_with_bom) // starts_with_bom() must accept a UTF-8 BOM and reject text without one
+{
+    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; // the three BOM bytes
+    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); // sizeof is correct here: byte_order_mark is an array
+    EXPECT_TRUE (bbom);
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 9 bytes of UTF-8 that do not begin with a BOM
+    bool no_bbom = starts_with_bom(threechars, threechars + 9); // explicit length: sizeof(threechars) is the pointer size, not the string length
+    EXPECT_FALSE (no_bbom);
+}
+
+TEST(CheckedIteratrTests, test_increment)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<const char*> it(threechars, threechars, threechars + 9);
+    iris::utflib::iterator<const char*> it2 = it;
+    EXPECT_EQ (it2, it);
+    EXPECT_EQ (*it, 0x10346);
+    EXPECT_EQ (*(++it), 0x65e5);
+    EXPECT_EQ ((*it++), 0x65e5);
+    EXPECT_EQ (*it, 0x0448);
+    EXPECT_NE (it, it2);
+    iris::utflib::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
+    EXPECT_EQ (++it, endit);
+}
+
+TEST(CheckedIteratrTests, test_decrement)
+{
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<const char*> it(threechars+9, threechars, threechars + 9);
+    EXPECT_EQ (*(--it), 0x0448);
+    EXPECT_EQ ((*it--), 0x0448);
+    EXPECT_EQ (*it, 0x65e5);
+    EXPECT_EQ (--it, iris::utflib::iterator<const char*>(threechars, threechars, threechars + 9));
+    EXPECT_EQ (*it, 0x10346);
+}
+
+TEST(CPP11APITests, test_append16)
+{
+    u16string u;
+    append16(0x0448, u);
+    EXPECT_EQ (u[0], char16_t(0x0448));
+    EXPECT_EQ (u.length(), 1);
+}
+
+TEST(CPP11APITests, test_utf16to8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    string u = utf16to8(utf16string);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP11APITests, test_utf8to16)
+{
+    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+    // Just to make sure it compiles with string literals
+    EXPECT_EQ(utf8to16(u8"simple"), u"simple");
+    EXPECT_EQ(utf8to16("simple"), u"simple");
+}
+
+TEST(CPP11APITests, test_utf32to8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    string utf8result = utf32to8(utf32string);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP11APITests, test_utf8to32)
+{
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP11APITests, test_find_invalid)
+{
+    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP11APITests, test_is_valid)
+{
+    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP11APITests, test_replace_invalid)
+{
+    string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const string fixed_invalid_sequence = "a????z";
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP11APITests, test_starts_with_bom)
+{
+    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    bool bbom = starts_with_bom(byte_order_mark);
+    EXPECT_TRUE (bbom);
+    string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
+
+
+TEST(CPP17APITests, test_utf16to8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    u16string_view utf16stringview(utf16string);
+    string u = utf16to8(utf16stringview);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP17APITests, test_utf8to16)
+{
+    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CPP17APITests, test_utf32to8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    u32string_view utf32stringview(utf32string);
+    string utf8result = utf32to8(utf32stringview);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP17APITests, test_utf8to32)
+{
+    string_view twochars = "\xe6\x97\xa5\xd1\x88";
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP17APITests, test_find_invalid)
+{
+    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP17APITests, test_is_valid)
+{
+    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP17APITests, test_replace_invalid)
+{
+    string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const string fixed_invalid_sequence = "a????z";
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP17APITests, test_starts_with_bom)
+{
+    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    string_view byte_order_mark_view(byte_order_mark);
+    bool bbom = starts_with_bom(byte_order_mark_view);
+    EXPECT_TRUE (bbom);
+    string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
+
+TEST(CPP17APITests, string_class_and_literals)
+{
+    const char* twochars = "ab";
+    EXPECT_TRUE (is_valid(twochars));
+    const string two_chars_string(twochars);
+    EXPECT_TRUE (is_valid(two_chars_string));
+}
+
+
+TEST(CPP20APITests, test_utf16tou8)
+{
+    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    u16string_view utf16stringview{utf16string};
+    u8string u = utf16tou8(utf16string);
+    EXPECT_EQ (u.size(), 10);
+    u = utf16tou8(utf16stringview);
+    EXPECT_EQ (u.size(), 10);
+}
+
+TEST(CPP20APITests, tes20t_utf8to16)
+{
+    u8string utf8_with_surrogates{ reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
+    u16string utf16result = utf8to16(utf8_with_surrogates);
+    EXPECT_EQ (utf16result.size(), 4);
+    EXPECT_EQ (utf16result[2], 0xd834);
+    EXPECT_EQ (utf16result[3], 0xdd1e);
+}
+
+TEST(CPP20APITests, test_utf32tou8)
+{
+    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    u32string_view utf32stringview{utf32string};
+    u8string utf8result = utf32tou8(utf32stringview);
+    EXPECT_EQ (utf8result.size(), 9);
+}
+
+TEST(CPP20APITests, test_utf8to32)
+{
+    u8string twochars = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88");
+    u32string utf32result = utf8to32(twochars);
+    EXPECT_EQ (utf32result.size(), 2);
+}
+
+TEST(CPP20APITests, test_find_invalid)
+{
+    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    auto invalid = find_invalid(utf_invalid);
+    EXPECT_EQ (invalid, 5);
+}
+
+TEST(CPP20APITests, test_is_valid)
+{
+    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    bool bvalid = is_valid(utf_invalid);
+    EXPECT_FALSE (bvalid);
+    u8string utf8_with_surrogates = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
+    bvalid = is_valid(utf8_with_surrogates);
+    EXPECT_TRUE (bvalid);
+}
+
+TEST(CPP20APITests, test_replace_invalid)
+{
+    u8string invalid_sequence = reinterpret_cast<const char8_t*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
+    u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');
+    bool bvalid = is_valid(replace_invalid_result);
+    EXPECT_TRUE (bvalid);
+    const u8string fixed_invalid_sequence = reinterpret_cast<const char8_t*>("a????z");
+    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+}
+
+TEST(CPP20APITests, test_starts_with_bom)
+{
+    u8string byte_order_mark = reinterpret_cast<const char8_t*>("\xef\xbb\xbf");
+    bool bbom = starts_with_bom(byte_order_mark);
+    EXPECT_TRUE (bbom);
+    u8string threechars = reinterpret_cast<const char8_t*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
+    bool no_bbom = starts_with_bom(threechars);
+    EXPECT_FALSE (no_bbom);
+}
+
+#endif
+
+} // iris_unicode_test
diff --git a/test/unicode/string/test_cpp11.cpp b/test/unicode/string/test_cpp11.cpp
deleted file mode 100644
index 9de19be..0000000
--- a/test/unicode/string/test_cpp11.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "ftest.h"
-
-#include "utf8.h"
-
-#include <string>
-
-using namespace iris::utflib;
-using namespace std;
-
-TEST(CPP11APITests, test_append)
-{
-    string u;
-    append(0x0448, u);
-    EXPECT_EQ (u[0], char(0xd1));
-    EXPECT_EQ (u[1], char(0x88));
-    EXPECT_EQ (u.length(), 2);
-
-    u.clear();
-    append(0x65e5, u);
-    EXPECT_EQ (u[0], char(0xe6));
-    EXPECT_EQ (u[1], char(0x97));
-    EXPECT_EQ (u[2], char(0xa5));
-    EXPECT_EQ (u.length(), 3);
-
-    u.clear();
-    append(0x3044, u);
-    EXPECT_EQ (u[0], char(0xe3));
-    EXPECT_EQ (u[1], char(0x81));
-    EXPECT_EQ (u[2], char(0x84));
-    EXPECT_EQ (u.length(), 3);
-
-    u.clear();
-    append(0x10346, u);
-    EXPECT_EQ (u[0], char(0xf0));
-    EXPECT_EQ (u[1], char(0x90));
-    EXPECT_EQ (u[2], char(0x8d));
-    EXPECT_EQ (u[3], char(0x86));
-    EXPECT_EQ (u.length(), 4);
-}
-
-TEST(CPP11APITests, test_append16)
-{
-    u16string u;
-    append16(0x0448, u);
-    EXPECT_EQ (u[0], char16_t(0x0448));
-    EXPECT_EQ (u.length(), 1);
-}
-
-TEST(CPP11APITests, test_utf16to8)
-{
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    string u = utf16to8(utf16string);
-    EXPECT_EQ (u.size(), 10);
-}
-
-TEST(CPP11APITests, test_utf8to16)
-{
-    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
-    // Just to make sure it compiles with string literals
-    EXPECT_EQ(utf8to16(u8"simple"), u"simple");
-    EXPECT_EQ(utf8to16("simple"), u"simple");
-}
-
-TEST(CPP11APITests, test_utf32to8)
-{
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
-    string utf8result = utf32to8(utf32string);
-    EXPECT_EQ (utf8result.size(), 9);
-}
-
-TEST(CPP11APITests, test_utf8to32)
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
-}
-
-TEST(CPP11APITests, test_find_invalid)
-{
-    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
-}
-
-TEST(CPP11APITests, test_is_valid)
-{
-    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST(CPP11APITests, test_replace_invalid)
-{
-    string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const string fixed_invalid_sequence = "a????z";
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
-}
-
-TEST(CPP11APITests, test_starts_with_bom)
-{
-    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
-    bool bbom = starts_with_bom(byte_order_mark);
-    EXPECT_TRUE (bbom);
-    string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
-}
diff --git a/test/unicode/string/test_cpp17.cpp b/test/unicode/string/test_cpp17.cpp
deleted file mode 100644
index 2d3756c..0000000
--- a/test/unicode/string/test_cpp17.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "ftest.h"
-
-#include "utf8.h"
-
-#include <string>
-
-using namespace iris::utflib;
-using namespace std;
-
-TEST(CPP17APITests, test_utf16to8)
-{
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    u16string_view utf16stringview(utf16string);
-    string u = utf16to8(utf16stringview);
-    EXPECT_EQ (u.size(), 10);
-}
-
-TEST(CPP17APITests, test_utf8to16)
-{
-    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
-}
-
-TEST(CPP17APITests, test_utf32to8)
-{
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
-    u32string_view utf32stringview(utf32string);
-    string utf8result = utf32to8(utf32stringview);
-    EXPECT_EQ (utf8result.size(), 9);
-}
-
-TEST(CPP17APITests, test_utf8to32)
-{
-    string_view twochars = "\xe6\x97\xa5\xd1\x88";
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
-}
-
-TEST(CPP17APITests, test_find_invalid)
-{
-    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
-}
-
-TEST(CPP17APITests, test_is_valid)
-{
-    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST(CPP17APITests, test_replace_invalid)
-{
-    string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const string fixed_invalid_sequence = "a????z";
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
-}
-
-TEST(CPP17APITests, test_starts_with_bom)
-{
-    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
-    string_view byte_order_mark_view(byte_order_mark);
-    bool bbom = starts_with_bom(byte_order_mark_view);
-    EXPECT_TRUE (bbom);
-    string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
-}
-
-TEST(CPP17APITests, string_class_and_literals)
-{
-    const char* twochars = "ab";
-    EXPECT_TRUE (is_valid(twochars));
-    const string two_chars_string(twochars);
-    EXPECT_TRUE (is_valid(two_chars_string));
-}
diff --git a/test/unicode/string/test_cpp20.cpp b/test/unicode/string/test_cpp20.cpp
deleted file mode 100644
index 330027d..0000000
--- a/test/unicode/string/test_cpp20.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-#include "ftest.h"
-
-#include "utf8.h"
-
-#include <string>
-
-using namespace iris::utflib;
-using namespace std;
-
-TEST(CPP20APITests, test_utf16tou8)
-{
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    u16string_view utf16stringview{utf16string};
-    u8string u = utf16tou8(utf16string);
-    EXPECT_EQ (u.size(), 10);
-    u = utf16tou8(utf16stringview);
-    EXPECT_EQ (u.size(), 10);
-}
-
-TEST(CPP20APITests, tes20t_utf8to16)
-{
-    u8string utf8_with_surrogates{ reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
-}
-
-TEST(CPP20APITests, test_utf32tou8)
-{
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
-    u32string_view utf32stringview{utf32string};
-    u8string utf8result = utf32tou8(utf32stringview);
-    EXPECT_EQ (utf8result.size(), 9);
-}
-
-TEST(CPP20APITests, test_utf8to32)
-{
-    u8string twochars = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88");
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
-}
-
-TEST(CPP20APITests, test_find_invalid)
-{
-    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
-    auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
-}
-
-TEST(CPP20APITests, test_is_valid)
-{
-    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
-    bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    u8string utf8_with_surrogates = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
-    bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST(CPP20APITests, test_replace_invalid)
-{
-    u8string invalid_sequence = reinterpret_cast<const char8_t*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
-    u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const u8string fixed_invalid_sequence = reinterpret_cast<const char8_t*>("a????z");
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
-}
-
-TEST(CPP20APITests, test_starts_with_bom)
-{
-    u8string byte_order_mark = reinterpret_cast<const char8_t*>("\xef\xbb\xbf");
-    bool bbom = starts_with_bom(byte_order_mark);
-    EXPECT_TRUE (bbom);
-    u8string threechars = reinterpret_cast<const char8_t*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
-    bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
-}
diff --git a/test/unicode/string/negative.cpp b/test/unicode/string/utf8_invalid.cpp
similarity index 100%
rename from test/unicode/string/negative.cpp
rename to test/unicode/string/utf8_invalid.cpp

From c078f214907750dadf75d7f7e1c60ff899a2d3f9 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 15:38:50 +0900
Subject: [PATCH 03/17] Fix code style

---
 include/iris/unicode/string.hpp | 1547 +++++++++++++++----------------
 1 file changed, 773 insertions(+), 774 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index a09ebc8..7b0b77a 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -38,918 +38,917 @@ DEALINGS IN THE SOFTWARE.
 #include <type_traits>
 #include <utility>
 
-namespace iris::unicode
-{
-    template<class T>
-    concept octet = std::integral<T> && sizeof(T) == 1;
+namespace iris::unicode {
 
-    template<class T>
-    concept utf8char = octet<T> && (std::same_as<T, char> || std::same_as<T, char8_t>);
+template<class T>
+concept octet = std::integral<T> && sizeof(T) == 1;
 
-    template<class T>
-    concept utf16char = std::same_as<T, char16_t>;
+template<class T>
+concept utf8char = octet<T> && (std::same_as<T, char> || std::same_as<T, char8_t>);
 
-    template<class T>
-    concept utf32char = std::same_as<T, char32_t>;
+template<class T>
+concept utf16char = std::same_as<T, char16_t>;
 
-    template<class It>
-    concept octet_input_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>;
+template<class T>
+concept utf32char = std::same_as<T, char32_t>;
 
-    template<class It>
-    concept utf8_input_iterator = octet_input_iterator<It> && utf8char<std::iter_value_t<It>>;
+template<class It>
+concept octet_input_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>;
 
-    template<class It>
-    concept utf16_input_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>;
+template<class It>
+concept utf8_input_iterator = octet_input_iterator<It> && utf8char<std::iter_value_t<It>>;
 
-    template<class It>
-    concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
+template<class It>
+concept utf16_input_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>;
 
-    namespace traits
-    {
-        template<class T, class = void>
-        struct is_nothrow_dereferenceable : std::false_type {};
+template<class It>
+concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
 
-        template<class T>
-        struct is_nothrow_dereferenceable<T, std::void_t<decltype(*std::declval<T>())>> : std::bool_constant<noexcept(*std::declval<T>())> {};
 
-        template<class T>
-        inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable<T>::value;
+template<class T, class = void>
+struct is_nothrow_dereferenceable : std::false_type {};
 
-        template<class T, class = void>
-        struct is_nothrow_prefix_incrementable : std::false_type {};
+template<class T>
+struct is_nothrow_dereferenceable<T, std::void_t<decltype(*std::declval<T>())>> : std::bool_constant<noexcept(*std::declval<T>())> {};
 
-        template<class T>
-        struct is_nothrow_prefix_incrementable<T, std::void_t<decltype(++std::declval<T>())>> : std::bool_constant<noexcept(++std::declval<T>())> {};
+template<class T>
+inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable<T>::value;
 
-        template<class T>
-        inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable<T>::value;
+template<class T, class = void>
+struct is_nothrow_prefix_incrementable : std::false_type {};
 
-        template<class T, class = void>
-        struct is_nothrow_postfix_incrementable : std::false_type {};
+template<class T>
+struct is_nothrow_prefix_incrementable<T, std::void_t<decltype(++std::declval<T>())>> : std::bool_constant<noexcept(++std::declval<T>())> {};
 
-        template<class T>
-        struct is_nothrow_postfix_incrementable<T, std::void_t<decltype(std::declval<T>()++)>> : std::bool_constant<noexcept(std::declval<T>()++)> {};
+template<class T>
+inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable<T>::value;
 
-        template<class T>
-        inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable<T>::value;
+template<class T, class = void>
+struct is_nothrow_postfix_incrementable : std::false_type {};
 
-        template<class It, class Se>
-        struct is_nothrow_sentinel : std::false_type {};
+template<class T>
+struct is_nothrow_postfix_incrementable<T, std::void_t<decltype(std::declval<T>()++)>> : std::bool_constant<noexcept(std::declval<T>()++)> {};
 
-        template<class It, class Se>
-            requires std::sentinel_for<Se, It>
-        struct is_nothrow_sentinel<It, Se> : std::bool_constant<
-            noexcept(std::declval<It&>() == std::declval<Se&>()) &&
-            noexcept(std::declval<It&>() != std::declval<Se&>()) &&
-            noexcept(std::declval<Se&>() == std::declval<It&>()) &&
-            noexcept(std::declval<Se&>() != std::declval<It&>())
-        >
-        {};
+template<class T>
+inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable<T>::value;
 
-        template<class It, class Se>
-        inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
-    } // namespace traits
+template<class It, class Se>
+struct is_nothrow_sentinel : std::false_type {};
 
-    // Helper code - not intended to be directly called by the library users. May be changed at any time
-    namespace internal
-    {
-        // Unicode constants
-        // Leading (high) surrogates: 0xd800 - 0xdbff
-        // Trailing (low) surrogates: 0xdc00 - 0xdfff
-        constexpr char16_t LEAD_SURROGATE_MIN  = 0xd800u;
-        constexpr char16_t LEAD_SURROGATE_MAX  = 0xdbffu;
-        constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u;
-        constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu;
-        constexpr char16_t LEAD_OFFSET         = 0xd7c0u;     // LEAD_SURROGATE_MIN - (0x10000 >> 10)
-        constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
-
-        // Maximum valid value for a Unicode code point
-        constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
-
-        enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT };
-
-        template<octet Octet>
-        [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
-        {
-            return static_cast<char8_t>(0xff & oc);
-        }
+template<class It, class Se>
+    requires std::sentinel_for<Se, It>
+struct is_nothrow_sentinel<It, Se> : std::bool_constant<
+    noexcept(std::declval<It&>() == std::declval<Se&>()) &&
+    noexcept(std::declval<It&>() != std::declval<Se&>()) &&
+    noexcept(std::declval<Se&>() == std::declval<It&>()) &&
+    noexcept(std::declval<Se&>() != std::declval<It&>())
+>
+{};
 
-        [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept
-        {
-            return static_cast<char16_t>(0xffff & oc);
-        }
+template<class It, class Se>
+inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
 
-        template<octet Octet>
-        [[nodiscard]] constexpr bool is_trail(Octet oc) noexcept
-        {
-            return ((internal::mask8(oc) >> 6) == 0x2);
-        }
 
-        [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept
-        {
-            return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(LEAD_SURROGATE_MAX));
-        }
+namespace detail {
 
-        [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept
-        {
-            return (cp >= static_cast<char32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
-        }
+// Unicode constants
+// Leading (high) surrogates: 0xd800 - 0xdbff
+// Trailing (low) surrogates: 0xdc00 - 0xdfff
+constexpr char16_t LEAD_SURROGATE_MIN  = 0xd800u;
+constexpr char16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+constexpr char16_t LEAD_OFFSET         = 0xd7c0u;     // LEAD_SURROGATE_MIN - (0x10000 >> 10)
+constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
 
-        [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept
-        {
-            return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
-        }
+// Maximum valid value for a Unicode code point
+constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
 
-        [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept
-        {
-            return (cp <= CODE_POINT_MAX && !internal::is_surrogate(cp));
-        }
+enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT };
 
-        [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept
-        {
-            return cp < char32_t(0x10000);
-        }
+template<octet Octet>
+[[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
+{
+    return static_cast<char8_t>(0xff & oc);
+}
 
-        [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept
-        {
-            if (cp < 0x80) {
-                if (length != 1)
-                    return true;
-            } else if (cp < 0x800) {
-                if (length != 2)
-                    return true;
-            } else if (cp < 0x10000) {
-                if (length != 3)
-                    return true;
-            }
-            return false;
-        }
+[[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept
+{
+    return static_cast<char16_t>(0xffff & oc);
+}
 
-        template<octet_input_iterator It>
-        [[nodiscard]] constexpr int sequence_length(It lead_it)
-            noexcept(traits::is_nothrow_dereferenceable_v<It&>)
-        {
-            const char8_t lead = internal::mask8(*lead_it);
-            if (lead < 0x80)
-                return 1;
-            else if ((lead >> 5) == 0x6)
-                return 2;
-            else if ((lead >> 4) == 0xe)
-                return 3;
-            else if ((lead >> 3) == 0x1e)
-                return 4;
-            else
-                return 0;
-        }
+template<octet Octet>
+[[nodiscard]] constexpr bool is_trail(Octet oc) noexcept
+{
+    return ((detail::mask8(oc) >> 6) == 0x2);
+}
 
-        /// Helper for get_sequence_x
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-        constexpr utf_error increase_safely(It& it, Se end)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>
-            >)
-        {
-            if (++it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
-
-            if (!internal::is_trail(*it))
-                return utf_error::INCOMPLETE_SEQUENCE;
-
-            return utf_error::OK;
-        }
+[[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept
+{
+    return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(LEAD_SURROGATE_MAX));
+}
 
-#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END)                                                                                                                                                                                      \
-    do {                                                                                                                                                                                                                                       \
-        utf_error ret = increase_safely(IT, END);                                                                                                                                                                                              \
-        if (ret != utf_error::OK)                                                                                                                                                                                                                    \
-            return ret;                                                                                                                                                                                                                        \
-    } while (false)
-
-        /// get_sequence_x functions decode utf-8 sequences of the length x
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-        constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_sentinel<It, Se>
-            >)
-        {
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
-
-            code_point = static_cast<char32_t>(internal::mask8(*it));
-
-            return utf_error::OK;
-        }
+[[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept
+{
+    return (cp >= static_cast<char32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+}
 
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-        constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>
-            >)
-        {
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
+[[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept
+{
+    return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+}
 
-            code_point = static_cast<char32_t>(internal::mask8(*it));
+[[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept
+{
+    return (cp <= CODE_POINT_MAX && !detail::is_surrogate(cp));
+}
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+[[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept
+{
+    return cp < char32_t(0x10000);
+}
 
-            code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+[[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept
+{
+    if (cp < 0x80) {
+        if (length != 1)
+            return true;
+    } else if (cp < 0x800) {
+        if (length != 2)
+            return true;
+    } else if (cp < 0x10000) {
+        if (length != 3)
+            return true;
+    }
+    return false;
+}
+
+template<octet_input_iterator It>
+[[nodiscard]] constexpr int sequence_length(It lead_it)
+    noexcept(is_nothrow_dereferenceable_v<It&>)
+{
+    const char8_t lead = detail::mask8(*lead_it);
+    if (lead < 0x80)
+        return 1;
+    else if ((lead >> 5) == 0x6)
+        return 2;
+    else if ((lead >> 4) == 0xe)
+        return 3;
+    else if ((lead >> 3) == 0x1e)
+        return 4;
+    else
+        return 0;
+}
+
+/// Helper for get_sequence_x
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error increase_safely(It& it, Se end)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (++it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
 
-            return utf_error::OK;
-        }
+    if (!detail::is_trail(*it))
+        return utf_error::INCOMPLETE_SEQUENCE;
 
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-        constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>
-            >)
-        {
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
+    return utf_error::OK;
+}
 
-            code_point = static_cast<char32_t>(internal::mask8(*it));
+#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END)                                                                                                                                                                                      \
+do {                                                                                                                                                                                                                                       \
+utf_error ret = increase_safely(IT, END);                                                                                                                                                                                              \
+if (ret != utf_error::OK)                                                                                                                                                                                                                    \
+    return ret;                                                                                                                                                                                                                        \
+} while (false)
+
+/// get_sequence_x functions decode utf-8 sequences of the length x
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point = static_cast<char32_t>(detail::mask8(*it));
 
-            code_point = ((code_point << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
+    return utf_error::OK;
+}
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
 
-            code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
+    code_point = static_cast<char32_t>(detail::mask8(*it));
 
-            return utf_error::OK;
-        }
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-        constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>
-            >)
-        {
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
+    code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
 
-            code_point = static_cast<char32_t>(internal::mask8(*it));
+    return utf_error::OK;
+}
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
 
-            code_point = ((code_point << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);
+    code_point = static_cast<char32_t>(detail::mask8(*it));
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-            code_point = static_cast<char32_t>(code_point + ((internal::mask8(*it) << 6) & 0xfff));
+    code_point = ((code_point << 12) & 0xffff) + ((detail::mask8(*it) << 6) & 0xfff);
 
-            IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-            code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
+    code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
 
-            return utf_error::OK;
-        }
+    return utf_error::OK;
+}
 
-#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
 
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-            requires std::forward_iterator<It>
-        constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>,
-                std::is_nothrow_copy_constructible<It>
-            >)
-        {
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
-
-            // Save the original value of it so we can go back in case of failure
-            // Of course, it does not make much sense with i.e. stream iterators
-            It original_it = it;
-
-            char32_t cp = 0;
-            // Determine the sequence length based on the lead octet
-            const int length = internal::sequence_length(it);
-
-            // Get trail octets and calculate the code point
-            utf_error err = utf_error::OK;
-            switch (length) {
-                case 0:
-                    return utf_error::INVALID_LEAD;
-                case 1:
-                    err = internal::get_sequence_1(it, end, cp);
-                    break;
-                case 2:
-                    err = internal::get_sequence_2(it, end, cp);
-                    break;
-                case 3:
-                    err = internal::get_sequence_3(it, end, cp);
-                    break;
-                case 4:
-                    err = internal::get_sequence_4(it, end, cp);
-                    break;
-            }
-
-            if (err == utf_error::OK) {
-                // Decoding succeeded. Now, security checks...
-                if (internal::is_code_point_valid(cp)) {
-                    if (!internal::is_overlong_sequence(cp, length)) {
-                        // Passed! Return here.
-                        code_point = cp;
-                        ++it;
-                        return utf_error::OK;
-                    } else
-                        err = utf_error::OVERLONG_SEQUENCE;
-                } else
-                    err = utf_error::INVALID_CODE_POINT;
-            }
-
-            // Failure branch - restore the original value of the iterator
-            it = original_it;
-            return err;
-        }
+    code_point = static_cast<char32_t>(detail::mask8(*it));
 
-        template<octet_input_iterator It, std::sentinel_for<It> Se>
-            requires std::forward_iterator<It>
-        constexpr utf_error validate_next(It& it, Se end)
-            noexcept(noexcept(internal::validate_next(it, end, std::declval<char32_t&>())))
-        {
-            char32_t ignored;
-            return internal::validate_next(it, end, ignored);
-        }
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-        template<utf16_input_iterator It, std::sentinel_for<It> Se>
-            requires std::forward_iterator<It>
-        constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)
-            noexcept(std::conjunction_v<
-                traits::is_nothrow_dereferenceable<It&>,
-                traits::is_nothrow_prefix_incrementable<It&>,
-                traits::is_nothrow_postfix_incrementable<It&>,
-                traits::is_nothrow_sentinel<It, Se>,
-                std::is_nothrow_copy_constructible<It>
-            >)
-        {
-            // Check the edge case:
-            if (it == end)
-                return utf_error::NOT_ENOUGH_ROOM;
-            // Save the original value of it so we can go back in case of failure
-            // Of course, it does not make much sense with i.e. stream iterators
-            It original_it = it;
-
-            utf_error err = utf_error::OK;
-
-            const char16_t first_word = *it++;
-            if (!internal::is_surrogate(first_word)) {
-                code_point = first_word;
-                return utf_error::OK;
-            } else {
-                if (it == end)
-                    err = utf_error::NOT_ENOUGH_ROOM;
-                else if (internal::is_lead_surrogate(first_word)) {
-                    const char16_t second_word = *it++;
-                    if (internal::is_trail_surrogate(static_cast<char32_t>(second_word))) {
-                        code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
-                        return utf_error::OK;
-                    } else
-                        err = utf_error::INCOMPLETE_SEQUENCE;
-
-                } else {
-                    err = utf_error::INVALID_LEAD;
-                }
-            }
-            // error branch
-            it = original_it;
-            return err;
-        }
+    code_point = ((code_point << 18) & 0x1fffff) + ((detail::mask8(*it) << 12) & 0x3ffff);
 
-        template<class OutIt, octet octet_type = std::iter_value_t<OutIt>>
-            requires std::output_iterator<OutIt, octet_type>
-        constexpr OutIt append(char32_t cp, OutIt result) noexcept
-        {
-            if (cp < 0x80) // one octet
-                *(result++) = static_cast<octet_type>(cp);
-            else if (cp < 0x800) { // two octets
-                *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
-                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
-            } else if (cp < 0x10000) { // three octets
-                *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
-                *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
-            } else { // four octets
-                *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
-                *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f) | 0x80);
-                *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-                *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
-            }
-            return result;
-        }
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-        template<class container_type>
-        constexpr std::back_insert_iterator<container_type> append(char32_t cp, std::back_insert_iterator<container_type> result)
-            noexcept(noexcept(internal::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result)))
-        {
-            return internal::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result);
-        }
+    code_point = static_cast<char32_t>(code_point + ((detail::mask8(*it) << 6) & 0xfff));
 
-        template<std::output_iterator<char16_t> It>
-        constexpr It append16(char32_t cp, It result)
-            noexcept(noexcept(*result++ = std::declval<char16_t>()))
-        {
-            if (internal::is_in_bmp(cp))
-                *(result++) = static_cast<char16_t>(cp);
-            else {
-                // Code points from the supplementary planes are encoded via surrogate pairs
-                *(result++) = static_cast<char16_t>(LEAD_OFFSET + (cp >> 10));
-                *(result++) = static_cast<char16_t>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
-            }
-            return result;
-        }
-    } // namespace internal
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
 
-    // Base for the exceptions that may be thrown from the library
-    class exception : public ::std::exception
-    {
-    };
+    code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
 
-    // Exceptions that may be thrown from the library functions.
-    class invalid_code_point : public exception
-    {
-        char32_t cp;
+    return utf_error::OK;
+}
 
-    public:
-        explicit invalid_code_point(char32_t codepoint)
-            : cp(codepoint)
-        {
-        }
-        virtual const char* what() const noexcept override { return "Invalid code point"; }
-        [[nodiscard]] char32_t code_point() const noexcept { return cp; }
-    };
+#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR
 
-    class invalid_utf8 : public exception
-    {
-        char8_t u8;
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Decodes and validates one UTF-8 sequence starting at `it`
+    requires std::forward_iterator<It>  // `it` is copied below so it can be rewound on failure
+constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)  // `code_point` is assigned only when OK is returned
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
+{
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
+
+    // Save the original value of it so we can go back in case of failure
+    // Of course, it does not make much sense with e.g. stream iterators
+    It original_it = it;
+
+    char32_t cp = 0;
+    // Determine the sequence length based on the lead octet
+    const int length = detail::sequence_length(it);
+
+    // Get trail octets and calculate the code point
+    utf_error err = utf_error::OK;
+    switch (length) {
+        case 0:
+            return utf_error::INVALID_LEAD;
+        case 1:
+            err = detail::get_sequence_1(it, end, cp);
+            break;
+        case 2:
+            err = detail::get_sequence_2(it, end, cp);
+            break;
+        case 3:
+            err = detail::get_sequence_3(it, end, cp);
+            break;
+        case 4:
+            err = detail::get_sequence_4(it, end, cp);
+            break;
+    }
+
+    if (err == utf_error::OK) {
+        // Decoding succeeded. Now, security checks...
+        if (detail::is_code_point_valid(cp)) {
+            if (!detail::is_overlong_sequence(cp, length)) {
+                // Passed! Return here.
+                code_point = cp;
+                ++it;  // step past the last octet read by get_sequence_N
+                return utf_error::OK;
+            } else
+                err = utf_error::OVERLONG_SEQUENCE;
+        } else
+            err = utf_error::INVALID_CODE_POINT;
+    }
 
-    public:
-        explicit invalid_utf8(char c)
-            : u8(static_cast<char8_t>(c))
-        {
-        }
-        explicit invalid_utf8(char8_t u)
-            : u8(u)
-        {
-        }
-        virtual const char* what() const noexcept override { return "Invalid UTF-8"; }
-        [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; }
-    };
+    // Failure branch - restore the original value of the iterator
+    it = original_it;
+    return err;
+}
 
-    class invalid_utf16 : public exception
-    {
-        char16_t u16;
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Overload that validates one sequence and discards the decoded code point
+    requires std::forward_iterator<It>
+constexpr utf_error validate_next(It& it, Se end)
+    noexcept(noexcept(detail::validate_next(it, end, std::declval<char32_t&>())))
+{
+    char32_t ignored;  // output slot for the 3-argument overload; never read here
+    return detail::validate_next(it, end, ignored);
+}
+
+template<utf16_input_iterator It, std::sentinel_for<It> Se>  // Decodes and validates one UTF-16 code point starting at `it`
+    requires std::forward_iterator<It>  // `it` is copied below so it can be rewound on failure
+constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)  // `code_point` is assigned only when OK is returned
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_postfix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
+{
+    // Check the edge case:
+    if (it == end)
+        return utf_error::NOT_ENOUGH_ROOM;
+    // Save the original value of it so we can go back in case of failure
+    // Of course, it does not make much sense with e.g. stream iterators
+    It original_it = it;
+
+    utf_error err = utf_error::OK;
+
+    const char16_t first_word = *it++;  // consume the first code unit
+    if (!detail::is_surrogate(first_word)) {
+        code_point = first_word;  // a non-surrogate unit is the code point itself
+        return utf_error::OK;
+    } else {
+        if (it == end)
+            err = utf_error::NOT_ENOUGH_ROOM;  // a surrogate with nothing after it
+        else if (detail::is_lead_surrogate(first_word)) {
+            const char16_t second_word = *it++;
+            if (detail::is_trail_surrogate(static_cast<char32_t>(second_word))) {
+                code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
+                return utf_error::OK;
+            } else
+                err = utf_error::INCOMPLETE_SEQUENCE;  // lead surrogate not followed by a trail surrogate
 
-    public:
-        explicit invalid_utf16(char16_t u)
-            : u16(u)
-        {
+        } else {
+            err = utf_error::INVALID_LEAD;  // a surrogate that is not a lead surrogate came first
         }
-        virtual const char* what() const noexcept override { return "Invalid UTF-16"; }
-        [[nodiscard]] char16_t utf16_word() const noexcept { return u16; }
-    };
+    }
+    // error branch
+    it = original_it;
+    return err;
+}
 
-    class not_enough_room : public exception
+template<class OutIt, octet octet_type = std::iter_value_t<OutIt>>  // Encodes `cp` as 1-4 UTF-8 octets through `result`; `cp` is not validated here
+    requires std::output_iterator<OutIt, octet_type>
+constexpr OutIt append(char32_t cp, OutIt result) noexcept(noexcept(*(result++) = std::declval<octet_type>()))  // conditional: writing through an output iterator may throw
+{
+    if (cp < 0x80) // one octet
+        *(result++) = static_cast<octet_type>(cp);
+    else if (cp < 0x800) { // two octets
+        *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
+        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    } else if (cp < 0x10000) { // three octets
+        *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
+        *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    } else { // four octets
+        *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
+        *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f) | 0x80);
+        *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    }
+    return result;  // iterator past the last octet written
+}
+
+template<class container_type>  // Overload for std::back_insert_iterator: forwards to the generic append, naming the octet type explicitly
+constexpr std::back_insert_iterator<container_type> append(char32_t cp, std::back_insert_iterator<container_type> result)
+    noexcept(noexcept(detail::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result)))
+{
+    return detail::append<std::back_insert_iterator<container_type>, typename container_type::value_type>(cp, result);  // `typename`, not `class`: value_type is a typedef
+}
+
+    template<std::output_iterator<char16_t> It>  // Encodes `cp` as UTF-16 through `result`; `cp` is not validated here
+    constexpr It append16(char32_t cp, It result)
+        noexcept(noexcept(*result++ = std::declval<char16_t>()))
     {
-    public:
-        virtual const char* what() const noexcept override { return "Not enough space"; }
-    };
+        if (detail::is_in_bmp(cp))
+            *(result++) = static_cast<char16_t>(cp);  // written as one code unit
+        else {
+            // Code points from the supplementary planes are encoded via surrogate pairs
+            *(result++) = static_cast<char16_t>(LEAD_OFFSET + (cp >> 10));
+            *(result++) = static_cast<char16_t>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+        }
+        return result;  // iterator past the last code unit written
+    }
 
-    /// The library API - functions intended to be called by the users
+} // detail
 
-    // Byte order mark
-    constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};
+// Base for the exceptions that may be thrown from the library
+class exception : public ::std::exception  // adds no members; the exception classes below derive from it
+{
+};
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr It find_invalid(It it, Se se)
-        noexcept(noexcept(internal::validate_next(it, se)) && std::is_nothrow_copy_constructible_v<It>)
-    {
-        while (it != se) {
-            internal::utf_error err_code = internal::validate_next(it, se);
-            if (err_code != internal::utf_error::OK)
-                return it;
-        }
-        return it;
-    }
+// Exceptions that may be thrown from the library functions.
+class invalid_code_point : public exception  // carries the rejected code point
+{
+    char32_t cp;  // value passed to the constructor
 
-    [[nodiscard]] constexpr std::size_t find_invalid(std::string_view s)
-        noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
+public:
+    explicit invalid_code_point(char32_t codepoint)
+        : cp(codepoint)
     {
-        std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
-        return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
     }
+    virtual const char* what() const noexcept override { return "Invalid code point"; }
+    [[nodiscard]] char32_t code_point() const noexcept { return cp; }  // accessor for the stored code point
+};
 
-    [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s)
-        noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
-    {
-        std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
-        return (invalid == s.end()) ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
-    }
+class invalid_utf8 : public exception  // carries one octet of the rejected UTF-8 input
+{
+    char8_t u8;  // octet passed to the constructor
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr bool is_valid(It it, Se se)
-        noexcept(noexcept(unicode::find_invalid(it, se)) && traits::is_nothrow_sentinel_v<It, Se>)
+public:
+    explicit invalid_utf8(char c)  // a `char` octet is converted to char8_t
+        : u8(static_cast<char8_t>(c))
     {
-        return (unicode::find_invalid(it, se) == se);
     }
-
-    [[nodiscard]] constexpr bool is_valid(std::string_view s)
-        noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
+    explicit invalid_utf8(char8_t u)
+        : u8(u)
     {
-        return unicode::is_valid(s.begin(), s.end());
     }
+    virtual const char* what() const noexcept override { return "Invalid UTF-8"; }
+    [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; }  // accessor for the stored octet
+};
 
-    [[nodiscard]] constexpr bool is_valid(std::u8string_view s)
-        noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
-    {
-        return unicode::is_valid(s.begin(), s.end());
-    }
+class invalid_utf16 : public exception  // carries one code unit of the rejected UTF-16 input
+{
+    char16_t u16;  // code unit passed to the constructor
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr bool starts_with_bom(It it, Se end)
-        noexcept(noexcept(internal::mask8(*it++)) && traits::is_nothrow_sentinel_v<It, Se>)
+public:
+    explicit invalid_utf16(char16_t u)
+        : u16(u)
     {
-        return (((it != end) && (internal::mask8(*it++)) == bom[0]) && ((it != end) && (internal::mask8(*it++)) == bom[1]) && ((it != end) && (internal::mask8(*it)) == bom[2]));
     }
+    virtual const char* what() const noexcept override { return "Invalid UTF-16"; }
+    [[nodiscard]] char16_t utf16_word() const noexcept { return u16; }  // accessor for the stored code unit
+};
 
-    [[nodiscard]] constexpr bool starts_with_bom(std::string_view s)
-        noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
-    {
-        return unicode::starts_with_bom(s.begin(), s.end());
-    }
+class not_enough_room : public exception  // thrown by next/next16/prev when the range holds no (complete) sequence to read
+{
+public:
+    virtual const char* what() const noexcept override { return "Not enough space"; }
+};
 
-    [[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s)
-        noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
-    {
-        return unicode::starts_with_bom(s.begin(), s.end());
-    }
+/// The library API - functions intended to be called by the users
 
-    template<class OutIt>
-    constexpr OutIt append(char32_t cp, OutIt result)
-    {
-        if (!internal::is_code_point_valid(cp))
-            throw invalid_code_point(cp);
+// Byte order mark
+inline constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};  // `inline`: one shared definition across translation units including this header
 
-        return internal::append(cp, result);
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Returns an iterator to the first sequence in [it, se) that fails validation
+[[nodiscard]] constexpr It find_invalid(It it, Se se)
+    noexcept(noexcept(detail::validate_next(it, se)) && std::is_nothrow_copy_constructible_v<It>)
+{
+    while (it != se) {
+        detail::utf_error err_code = detail::validate_next(it, se);  // moves `it` forward when the sequence is valid
+        if (err_code != detail::utf_error::OK)
+            return it;
     }
+    return it;  // the loop ended without an error
+}
 
-    constexpr void append(char32_t cp, std::string& s)
-    {
-        unicode::append(cp, std::back_inserter(s));
-    }
+[[nodiscard]] constexpr std::size_t find_invalid(std::string_view s)  // Offset of the first invalid UTF-8 sequence in `s`, or npos if there is none
+    noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
+{
+    std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
+    return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+}
 
-    constexpr void append(char32_t cp, std::u8string& s)
-    {
-        unicode::append(cp, std::back_inserter(s));
-    }
+[[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s)  // Offset of the first invalid UTF-8 sequence in `s`, or npos if there is none
+    noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
+{
+    std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
+    return (invalid == s.end()) ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+}
 
-    template<class It>  // TODO: add constraints
-    constexpr It append16(char32_t cp, It result)
-    {
-        if (!internal::is_code_point_valid(cp))
-            throw invalid_code_point(cp);
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // True when find_invalid reaches `se`, i.e. no invalid sequence in [it, se)
+[[nodiscard]] constexpr bool is_valid(It it, Se se)
+    noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v<It, Se>)
+{
+    return (unicode::find_invalid(it, se) == se);
+}
 
-        return internal::append16(cp, result);
-    }
+[[nodiscard]] constexpr bool is_valid(std::string_view s)  // String-view convenience wrapper around the iterator overload
+    noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
+{
+    return unicode::is_valid(s.begin(), s.end());
+}
 
-    constexpr void append16(char32_t cp, std::u16string& s)
-    {
-        unicode::append16(cp, std::back_inserter(s));
-    }
+[[nodiscard]] constexpr bool is_valid(std::u8string_view s)  // u8string-view convenience wrapper around the iterator overload
+    noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
+{
+    return unicode::is_valid(s.begin(), s.end());
+}
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
-    constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)
-    {
-        while (start != end) {
-            It sequence_start = start;
-            internal::utf_error err_code  = internal::validate_next(start, end);
-            switch (err_code) {
-                case internal::utf_error::OK:
-                    for (It it = sequence_start; it != start; ++it)
-                        *out++ = *it;
-                    break;
-                case internal::utf_error::NOT_ENOUGH_ROOM:
-                    out   = unicode::append(replacement, out);
-                    start = end;
-                    break;
-                case internal::utf_error::INVALID_LEAD:
-                    out = unicode::append(replacement, out);
-                    ++start;
-                    break;
-                case internal::utf_error::INCOMPLETE_SEQUENCE:
-                case internal::utf_error::OVERLONG_SEQUENCE:
-                case internal::utf_error::INVALID_CODE_POINT:
-                    out = unicode::append(replacement, out);
-                    ++start;
-                    // just one replacement mark for the sequence
-                    while (start != end && internal::is_trail(*start))
-                        ++start;
-                    break;
-            }
-        }
-        return out;
-    }
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // True when the first three octets of [it, end) equal `bom`
+[[nodiscard]] constexpr bool starts_with_bom(It it, Se end)  // a range shorter than three octets yields false
+    noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v<It, Se>)
+{
+    return (((it != end) && (detail::mask8(*it++)) == bom[0]) && ((it != end) && (detail::mask8(*it++)) == bom[1]) && ((it != end) && (detail::mask8(*it)) == bom[2]));
+}
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
-    constexpr Out replace_invalid(It start, Se end, Out out)
-    {
-        constexpr char32_t replacement_marker = static_cast<char32_t>(internal::mask16(0xfffd));
-        return unicode::replace_invalid(start, end, out, replacement_marker);
-    }
+[[nodiscard]] constexpr bool starts_with_bom(std::string_view s)  // String-view convenience wrapper around the iterator overload
+    noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
+{
+    return unicode::starts_with_bom(s.begin(), s.end());
+}
 
-    [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-        return result;
-    }
+[[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s)  // u8string-view convenience wrapper around the iterator overload
+    noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
+{
+    return unicode::starts_with_bom(s.begin(), s.end());
+}
 
-    [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement)
-    {
-        std::u8string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-        return result;
-    }
+template<class OutIt>  // Checked UTF-8 encoder: validates `cp`, then writes its octets through `result`
+constexpr OutIt append(char32_t cp, OutIt result)
+{
+    if (!detail::is_code_point_valid(cp))
+        throw invalid_code_point(cp);  // nothing has been written at this point
 
-    [[nodiscard]] constexpr std::string replace_invalid(std::string_view s)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+    return detail::append(cp, result);
+}
 
-    [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s)
-    {
-        std::u8string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+constexpr void append(char32_t cp, std::string& s)  // Appends the UTF-8 encoding of `cp` to `s` via the iterator overload
+{
+    unicode::append(cp, std::back_inserter(s));
+}
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr char32_t next(It& it, Se end)
-    {
-        char32_t cp               = 0;
-        internal::utf_error err_code = internal::validate_next(it, end, cp);
+constexpr void append(char32_t cp, std::u8string& s)  // Appends the UTF-8 encoding of `cp` to `s` via the iterator overload
+{
+    unicode::append(cp, std::back_inserter(s));
+}
+
+template<class It>  // TODO: add constraints
+constexpr It append16(char32_t cp, It result)  // Checked UTF-16 encoder: validates `cp`, then writes its code units through `result`
+{
+    if (!detail::is_code_point_valid(cp))
+        throw invalid_code_point(cp);  // nothing has been written at this point
+
+    return detail::append16(cp, result);
+}
+
+constexpr void append16(char32_t cp, std::u16string& s)  // Appends the UTF-16 encoding of `cp` to `s` via the iterator overload
+{
+    unicode::append16(cp, std::back_inserter(s));
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
+constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)  // Copies [start, end) to `out`, writing `replacement` in place of each invalid sequence
+{
+    while (start != end) {
+        It sequence_start = start;  // remembered so a valid sequence can be copied verbatim
+        detail::utf_error err_code  = detail::validate_next(start, end);
         switch (err_code) {
-            case internal::utf_error::OK:
+            case detail::utf_error::OK:
+                for (It it = sequence_start; it != start; ++it)
+                    *out++ = *it;
+                break;
+            case detail::utf_error::NOT_ENOUGH_ROOM:
+                out   = unicode::append(replacement, out);
+                start = end;  // input ended mid-sequence: stop after one replacement
+                break;
+            case detail::utf_error::INVALID_LEAD:
+                out = unicode::append(replacement, out);
+                ++start;  // skip only the offending octet
+                break;
+            case detail::utf_error::INCOMPLETE_SEQUENCE:
+            case detail::utf_error::OVERLONG_SEQUENCE:
+            case detail::utf_error::INVALID_CODE_POINT:
+                out = unicode::append(replacement, out);
+                ++start;
+                // just one replacement mark for the sequence
+                while (start != end && detail::is_trail(*start))
+                    ++start;
                 break;
-            case internal::utf_error::NOT_ENOUGH_ROOM:
-                throw not_enough_room();
-            case internal::utf_error::INVALID_LEAD:
-            case internal::utf_error::INCOMPLETE_SEQUENCE:
-            case internal::utf_error::OVERLONG_SEQUENCE:
-                throw invalid_utf8(static_cast<char8_t>(*it));
-            case internal::utf_error::INVALID_CODE_POINT:
-                throw invalid_code_point(cp);
         }
-        return cp;
     }
+    return out;  // iterator past the last element written
+}
 
-    template<utf16_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr char32_t next16(It& it, Se end)
-    {
-        char32_t cp               = 0;
-        internal::utf_error err_code = internal::validate_next16(it, end, cp);
-        if (err_code == internal::utf_error::NOT_ENOUGH_ROOM)
+template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
+constexpr Out replace_invalid(It start, Se end, Out out)  // Same as the 4-argument overload with U+FFFD as the replacement
+{
+    constexpr char32_t replacement_marker = static_cast<char32_t>(detail::mask16(0xfffd));
+    return unicode::replace_invalid(start, end, out, replacement_marker);
+}
+
+[[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement)  // Returns a copy of `s` with invalid sequences replaced by `replacement`
+{
+    std::string result;
+    replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement)  // Returns a copy of `s` with invalid sequences replaced by `replacement`
+{
+    std::u8string result;
+    replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    return result;
+}
+
+[[nodiscard]] constexpr std::string replace_invalid(std::string_view s)  // Returns a copy of `s` with invalid sequences replaced by the default marker
+{
+    std::string result;
+    replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s)  // Returns a copy of `s` with invalid sequences replaced by the default marker
+{
+    std::u8string result;
+    replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Decodes the UTF-8 code point at `it` and advances `it` past it
+[[nodiscard]] constexpr char32_t next(It& it, Se end)  // throws not_enough_room, invalid_utf8 or invalid_code_point on failure
+{
+    char32_t cp               = 0;
+    detail::utf_error err_code = detail::validate_next(it, end, cp);
+    switch (err_code) {
+        case detail::utf_error::OK:
+            break;
+        case detail::utf_error::NOT_ENOUGH_ROOM:
             throw not_enough_room();
-        return cp;
+        case detail::utf_error::INVALID_LEAD:
+        case detail::utf_error::INCOMPLETE_SEQUENCE:
+        case detail::utf_error::OVERLONG_SEQUENCE:
+            throw invalid_utf8(static_cast<char8_t>(*it));  // reports the octet `it` points at after the failed decode
+        case detail::utf_error::INVALID_CODE_POINT:
+            throw invalid_code_point(cp);
     }
+    return cp;
+}
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr char32_t peek_next(It it, Se end)
-    {
-        return unicode::next(it, end);
-    }
+template<utf16_input_iterator It, std::sentinel_for<It> Se>  // Decodes the UTF-16 code point at `it` and advances `it` past it
+[[nodiscard]] constexpr char32_t next16(It& it, Se end)  // throws not_enough_room or invalid_utf16 on failure
+{
+    char32_t cp = 0;
+    const detail::utf_error err_code = detail::validate_next16(it, end, cp);
+    if (err_code == detail::utf_error::NOT_ENOUGH_ROOM) throw not_enough_room();
+    if (err_code != detail::utf_error::OK) throw invalid_utf16(static_cast<char16_t>(*it));  // other errors used to fall through and return 0
+    return cp;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Like next(), but `it` is taken by value so the caller's iterator does not move
+[[nodiscard]] constexpr char32_t peek_next(It it, Se end)
+{
+    return unicode::next(it, end);
+}
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr char32_t prior(It& it, Se start)
-    {
-        // can't do much if it == start
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Moves `it` back to the lead octet of the preceding sequence and returns its code point
+[[nodiscard]] constexpr char32_t prev(It& it, Se start)  // throws not_enough_room if it == start, invalid_utf8 if no lead octet is found
+{
+    // can't do much if it == start
+    if (it == start)
+        throw not_enough_room();
+
+    It end = it;
+    // Go back until we hit either a lead octet or start
+    while (detail::is_trail(*(--it)))
         if (it == start)
-            throw not_enough_room();
+            throw invalid_utf8(static_cast<char8_t>(*it)); // error - no lead byte in the sequence (cast as in next())
+    return unicode::peek_next(it, end);
+}
 
-        It end = it;
-        // Go back until we hit either a lead octet or start
-        while (internal::is_trail(*(--it)))
-            if (it == start)
-                throw invalid_utf8(*it); // error - no lead byte in the sequence
-        return unicode::peek_next(it, end);
-    }
+template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>  // Moves `it` by `n` code points; negative `n` moves backward
+constexpr void advance(It& it, distance_type n, Se end)  // `end` bounds the walk: passed to next() going forward, to prev() going backward
+{
+    const distance_type zero(0);
+    if (n < zero) {
+        // backward
+        for (distance_type i = n; i < zero; ++i)
+            (void)unicode::prev(it, end);
+    } else {
+        // forward
+        for (distance_type i = zero; i < n; ++i)
+            (void)unicode::next(it, end);
+    }
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>  // Counts the code points in [first, last) by decoding with next()
+[[nodiscard]] constexpr typename std::iterator_traits<It>::difference_type distance(It first, Se last)
+{
+    typename std::iterator_traits<It>::difference_type dist = 0;  // `typename`, not `class`: difference_type is a typedef
+    for (; first != last; ++dist)
+        (void)unicode::next(first, last);  // throws on invalid input, as next() does
+    return dist;
+}
+
+template<utf16_input_iterator It, std::sentinel_for<It> Se, class OutIt> // TODO: add constraints
+constexpr OutIt utf16to8(It start, Se end, OutIt result)  // Converts UTF-16 in [start, end) to UTF-8 written through `result`; throws invalid_utf16 on bad surrogates
+{
+    while (start != end) {
+        char32_t cp = static_cast<char32_t>(detail::mask16(*start++));
+        // Take care of surrogate pairs first
+        if (detail::is_lead_surrogate(cp)) {
+            if (start != end) {
+                const char32_t trail_surrogate = static_cast<char32_t>(detail::mask16(*start++));
+                if (detail::is_trail_surrogate(trail_surrogate))
+                    cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET;
+                else
+                    throw invalid_utf16(static_cast<char16_t>(trail_surrogate));  // lead surrogate followed by a non-trail unit
+            } else
+                throw invalid_utf16(static_cast<char16_t>(cp));  // lead surrogate at the end of the input
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>
-    constexpr void advance(It& it, distance_type n, Se end)
-    {
-        const distance_type zero(0);
-        if (n < zero) {
-            // backward
-            for (distance_type i = n; i < zero; ++i)
-                (void)unicode::prior(it, end);
-        } else {
-            // forward
-            for (distance_type i = zero; i < n; ++i)
-                (void)unicode::next(it, end);
         }
-    }
+        // Lone trail surrogate
+        else if (detail::is_trail_surrogate(cp))
+            throw invalid_utf16(static_cast<char16_t>(cp));
 
-    template<octet_input_iterator It, std::sentinel_for<It> Se>
-    [[nodiscard]] constexpr class std::iterator_traits<It>::difference_type distance(It first, Se last)
-    {
-        class std::iterator_traits<It>::difference_type dist;
-        for (dist = 0; first != last; ++dist)
-            (void)unicode::next(first, last);
-        return dist;
+        result = unicode::append(cp, result);
     }
+    return result;  // iterator past the last octet written
+}
 
-    template<utf16_input_iterator It, std::sentinel_for<It> Se, class OutIt> // TODO: add constraints
-    constexpr OutIt utf16to8(It start, Se end, OutIt result)
-    {
-        while (start != end) {
-            char32_t cp = static_cast<char32_t>(internal::mask16(*start++));
-            // Take care of surrogate pairs first
-            if (internal::is_lead_surrogate(cp)) {
-                if (start != end) {
-                    const char32_t trail_surrogate = static_cast<char32_t>(internal::mask16(*start++));
-                    if (internal::is_trail_surrogate(trail_surrogate))
-                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
-                    else
-                        throw invalid_utf16(static_cast<char16_t>(trail_surrogate));
-                } else
-                    throw invalid_utf16(static_cast<char16_t>(cp));
-
-            }
-            // Lone trail surrogate
-            else if (internal::is_trail_surrogate(cp))
-                throw invalid_utf16(static_cast<char16_t>(cp));
+[[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)  // Returns `s` converted to UTF-8 in a std::string
+{
+    std::string result;
+    unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-            result = unicode::append(cp, result);
-        }
-        return result;
-    }
+[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s)  // Returns `s` converted to UTF-8 in a std::u8string
+{
+    std::u8string result;
+    unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-    [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)
-    {
-        std::string result;
-        unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
+constexpr OutIt utf8to16(It start, Se end, OutIt result)  // Converts UTF-8 in [start, end) to UTF-16 written through `result`; throws as next() does
+{
+    while (start != end) {
+        const char32_t cp = unicode::next(start, end);  // decodes one code point and advances `start`
+        if (cp > 0xffff) { // make a surrogate pair
+            *result++ = static_cast<char16_t>((cp >> 10) + detail::LEAD_OFFSET);
+            *result++ = static_cast<char16_t>((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
+        } else
+            *result++ = static_cast<char16_t>(cp);
+    }
+    return result;  // iterator past the last code unit written
+}
+
+[[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)  // Returns `s` converted to UTF-16
+{
+    std::u16string result;
+    unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-    [[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s)
-    {
-        std::u8string result;
-        unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s)  // Returns `s` converted to UTF-16
+{
+    std::u16string result;
+    unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-    template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
-    constexpr OutIt utf8to16(It start, Se end, OutIt result)
-    {
-        while (start != end) {
-            const char32_t cp = unicode::next(start, end);
-            if (cp > 0xffff) { // make a surrogate pair
-                *result++ = static_cast<char16_t>((cp >> 10) + internal::LEAD_OFFSET);
-                *result++ = static_cast<char16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
-            } else
-                *result++ = static_cast<char16_t>(cp);
-        }
-        return result;
-    }
+template<utf32_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
+constexpr OutIt utf32to8(It start, Se end, OutIt result)  // Encodes each code point in [start, end) as UTF-8 through `result`
+{
+    while (start != end)
+        result = unicode::append(*(start++), result);  // the checked append throws invalid_code_point for an invalid code point
 
-    [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)
-    {
-        std::u16string result;
-        unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+    return result;
+}
 
-    [[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s)
-    {
-        std::u16string result;
-        unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
+[[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)  // Returns `s` converted to UTF-8 in a std::string
+{
+    std::string result;
+    unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-    template<utf32_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
-    constexpr OutIt utf32to8(It start, Se end, OutIt result)
-    {
-        while (start != end)
-            result = unicode::append(*(start++), result);
+[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s)  // Returns `s` converted to UTF-8 in a std::u8string
+{
+    std::u8string result;
+    unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
 
-        return result;
-    }
+template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
+constexpr OutIt utf8to32(It start, Se end, OutIt result)
+{
+    while (start != end)
+        (*result++) = unicode::next(start, end);
 
-    [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)
+    return result;
+}
+
+[[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
+{
+    std::u32string result;
+    unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s)
+{
+    std::u32string result;
+    unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+// The iterator class
+template<octet_input_iterator It>
+class iterator
+{
+    It it;
+    It range_start;
+    It range_end;
+
+public:
+    using value_type = char32_t;
+    using pointer = char32_t*;
+    using reference = char32_t&;
+    using difference_type = std::ptrdiff_t;
+    using iterator_category = std::bidirectional_iterator_tag;
+    constexpr iterator()
+        requires std::is_default_constructible_v<It>
+    = default;
+    constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
+        : it(std::move(octet_it))
+        , range_start(std::move(rangestart))
+        , range_end(std::move(rangeend))
     {
-        std::string result;
-        unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
+        if constexpr (std::random_access_iterator<It>) {
+            if (it < range_start || it > range_end)
+                throw std::out_of_range("Invalid utf-8 iterator position");
+        }
     }
-
-    [[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s)
+    // the default "big three" are OK
+    [[nodiscard]] constexpr It base() const { return it; }
+    [[nodiscard]] constexpr char32_t operator*() const
     {
-        std::u8string result;
-        unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
+        It temp = it;
+        return unicode::next(temp, range_end);
     }
-
-    template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
-    constexpr OutIt utf8to32(It start, Se end, OutIt result)
+    [[nodiscard]] constexpr bool operator==(const iterator& rhs) const
     {
-        while (start != end)
-            (*result++) = unicode::next(start, end);
-
-        return result;
+        if (range_start != rhs.range_start || range_end != rhs.range_end)
+            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+        return (it == rhs.it);
     }
-
-    [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
+    constexpr iterator& operator++()
     {
-        std::u32string result;
-        unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
+        (void)unicode::next(it, range_end);
+        return *this;
     }
-
-    [[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s)
+    constexpr iterator operator++(int)
     {
-        std::u32string result;
-        unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
+        iterator temp = *this;
+        (void)unicode::next(it, range_end);
+        return temp;
     }
-
-    // The iterator class
-    template<octet_input_iterator It>
-    class iterator
+    constexpr iterator& operator--()
     {
-        It it;
-        It range_start;
-        It range_end;
-
-    public:
-        using value_type = char32_t;
-        using pointer = char32_t*;
-        using reference = char32_t&;
-        using difference_type = std::ptrdiff_t;
-        using iterator_category = std::bidirectional_iterator_tag;
-        constexpr iterator()
-            requires std::is_default_constructible_v<It>
-        = default;
-        constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
-            : it(std::move(octet_it))
-            , range_start(std::move(rangestart))
-            , range_end(std::move(rangeend))
-        {
-            if constexpr (std::random_access_iterator<It>) {
-                if (it < range_start || it > range_end)
-                    throw std::out_of_range("Invalid utf-8 iterator position");
-            }
-        }
-        // the default "big three" are OK
-        [[nodiscard]] constexpr It base() const { return it; }
-        [[nodiscard]] constexpr char32_t operator*() const
-        {
-            It temp = it;
-            return unicode::next(temp, range_end);
-        }
-        [[nodiscard]] constexpr bool operator==(const iterator& rhs) const
-        {
-            if (range_start != rhs.range_start || range_end != rhs.range_end)
-                throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
-            return (it == rhs.it);
-        }
-        constexpr iterator& operator++()
-        {
-            (void)unicode::next(it, range_end);
-            return *this;
-        }
-        constexpr iterator operator++(int)
-        {
-            iterator temp = *this;
-            (void)unicode::next(it, range_end);
-            return temp;
-        }
-        constexpr iterator& operator--()
-        {
-            (void)unicode::prior(it, range_start);
-            return *this;
-        }
-        constexpr iterator operator--(int)
-        {
-            iterator temp = *this;
-            (void)unicode::prior(it, range_start);
-            return temp;
-        }
-    }; // class iterator
+        (void)unicode::prev(it, range_start);
+        return *this;
+    }
+    constexpr iterator operator--(int)
+    {
+        iterator temp = *this;
+        (void)unicode::prev(it, range_start);
+        return temp;
+    }
+};
 
 } // iris::unicode
 

From cb8aaf4b0661b57d45e17e840900af06bb5049a3 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 18:10:32 +0900
Subject: [PATCH 04/17] Refactor until append8/append16

---
 include/iris/unicode/string.hpp      | 693 +++++++++++++++------------
 test/unicode/string/string.cpp       | 202 ++++----
 test/unicode/string/utf8_invalid.cpp |  10 +-
 3 files changed, 506 insertions(+), 399 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index 7b0b77a..98ed0c3 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -25,7 +25,6 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 */
 
-
 #ifndef IRIS_UNICODE_STRING_HPP
 #define IRIS_UNICODE_STRING_HPP
 
@@ -37,9 +36,12 @@ DEALINGS IN THE SOFTWARE.
 #include <string_view>
 #include <type_traits>
 #include <utility>
+#include <ranges>
 
 namespace iris::unicode {
 
+inline constexpr char8_t bom[] = {0xef, 0xbb, 0xbf}; // UTF-8 byte order mark
+
 template<class T>
 concept octet = std::integral<T> && sizeof(T) == 1;
 
@@ -65,6 +67,65 @@ template<class It>
 concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
 
 
+namespace detail {
+
+template<class OutIt, class DesiredValueT>
+struct select_output_value_type
+{
+    static_assert(std::output_iterator<OutIt, DesiredValueT>);
+    using type = DesiredValueT;
+};
+
+template<class OutIt, class DesiredValueT>
+    requires requires {
+        typename std::iter_value_t<OutIt>;
+        requires std::convertible_to<DesiredValueT, std::iter_value_t<OutIt>>;
+    }
+struct select_output_value_type<OutIt, DesiredValueT>
+{
+    static_assert(std::output_iterator<OutIt, std::iter_value_t<OutIt>>);
+    using type = std::iter_value_t<OutIt>;
+};
+
+template<class OutIt, std::size_t SizeofChar>
+concept maybe_value_type_sized =
+    requires {
+        typename std::iter_value_t<OutIt>;
+        requires sizeof(std::iter_value_t<OutIt>) == SizeofChar;
+    } ||
+    !requires {
+        typename std::iter_value_t<OutIt>;
+    };
+
+} // detail
+
+template<class OutIt>
+concept octet_output_iterator =
+    (
+        std::output_iterator<OutIt, char8_t> ||
+        std::output_iterator<OutIt, char>
+    ) &&
+    detail::maybe_value_type_sized<OutIt, 1>;
+
+template<class R>
+concept octet_output_range =
+    (
+        std::ranges::output_range<R, char8_t> ||
+        std::ranges::output_range<R, char>
+    ) &&
+    detail::maybe_value_type_sized<std::ranges::iterator_t<R>, 1>;
+
+template<class OutIt>
+concept utf16_output_iterator =
+    std::output_iterator<OutIt, char16_t> &&
+    detail::maybe_value_type_sized<OutIt, 2>;
+
+template<class R>
+concept utf16_output_range =
+    std::ranges::output_range<R, char16_t> &&
+    detail::maybe_value_type_sized<std::ranges::iterator_t<R>, 2>;
+
+
 template<class T, class = void>
 struct is_nothrow_dereferenceable : std::false_type {};
 
@@ -109,6 +170,64 @@ template<class It, class Se>
 inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
 
 
+class unicode_error : public std::runtime_error
+{
+    using std::runtime_error::runtime_error;
+};
+
+class invalid_code_point : public unicode_error
+{
+    char32_t cp;
+
+public:
+    explicit invalid_code_point(char32_t codepoint)
+        : unicode_error("invalid code point")
+        , cp(codepoint)
+    {}
+
+    [[nodiscard]] char32_t code_point() const noexcept { return cp; }
+};
+
+class invalid_utf8 : public unicode_error
+{
+    char8_t u8;
+
+public:
+    explicit invalid_utf8(char c)
+        : unicode_error("invalid UTF-8")
+        , u8(static_cast<char8_t>(c))
+    {}
+
+    explicit invalid_utf8(char8_t u)
+        : unicode_error("invalid UTF-8")
+        , u8(u)
+    {}
+
+    [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; }
+};
+
+class invalid_utf16 : public unicode_error // reports a bad UTF-16 code unit
+{
+    char16_t u16; // code unit supplied at the throw site
+
+public:
+    explicit invalid_utf16(char16_t u)
+        : unicode_error("invalid UTF-16")
+        , u16(u)
+    {}
+
+    [[nodiscard]] char16_t utf16_word() const noexcept { return u16; } // the stored code unit
+};
+
+class not_enough_space : public unicode_error
+{
+public:
+    not_enough_space()
+        : unicode_error("not enough space")
+    {}
+};
+
+
 namespace detail {
 
 // Unicode constants
@@ -124,7 +243,15 @@ constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROG
 // Maximum valid value for a Unicode code point
 constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
 
-enum class utf_error { OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT };
+enum class utf_error
+{
+    OK,
+    NOT_ENOUGH_SPACE,
+    INVALID_LEAD,
+    INCOMPLETE_SEQUENCE,
+    OVERLONG_SEQUENCE,
+    INVALID_CODE_POINT,
+};
 
 template<octet Octet>
 [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
@@ -145,22 +272,22 @@ template<octet Octet>
 
 [[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept
 {
-    return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(LEAD_SURROGATE_MAX));
+    return cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(LEAD_SURROGATE_MAX);
 }
 
 [[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept
 {
-    return (cp >= static_cast<char32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+    return cp >= static_cast<char32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX);
 }
 
 [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept
 {
-    return (cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX));
+    return cp >= static_cast<char32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<char32_t>(TRAIL_SURROGATE_MAX);
 }
 
 [[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept
 {
-    return (cp <= CODE_POINT_MAX && !detail::is_surrogate(cp));
+    return cp <= CODE_POINT_MAX && !detail::is_surrogate(cp);
 }
 
 [[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept
@@ -171,14 +298,11 @@ template<octet Octet>
 [[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept
 {
     if (cp < 0x80) {
-        if (length != 1)
-            return true;
+        if (length != 1) return true;
     } else if (cp < 0x800) {
-        if (length != 2)
-            return true;
+        if (length != 2) return true;
     } else if (cp < 0x10000) {
-        if (length != 3)
-            return true;
+        if (length != 3) return true;
     }
     return false;
 }
@@ -187,17 +311,12 @@ template<octet_input_iterator It>
 [[nodiscard]] constexpr int sequence_length(It lead_it)
     noexcept(is_nothrow_dereferenceable_v<It&>)
 {
-    const char8_t lead = detail::mask8(*lead_it);
-    if (lead < 0x80)
-        return 1;
-    else if ((lead >> 5) == 0x6)
-        return 2;
-    else if ((lead >> 4) == 0xe)
-        return 3;
-    else if ((lead >> 3) == 0x1e)
-        return 4;
-    else
-        return 0;
+    char8_t const lead = detail::mask8(*lead_it);
+    if (lead < 0x80) return 1;
+    if ((lead >> 5) == 0x6) return 2;
+    if ((lead >> 4) == 0xe) return 3;
+    if ((lead >> 3) == 0x1e) return 4;
+    return 0;
 }
 
 /// Helper for get_sequence_x
@@ -209,21 +328,21 @@ constexpr utf_error increase_safely(It& it, Se end)
         is_nothrow_sentinel<It, Se>
     >)
 {
-    if (++it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
-
-    if (!detail::is_trail(*it))
+    if (++it == end) {
+        return utf_error::NOT_ENOUGH_SPACE;
+    }
+    if (!detail::is_trail(*it)) {
         return utf_error::INCOMPLETE_SEQUENCE;
-
+    }
     return utf_error::OK;
 }
 
 #define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END)                                                                                                                                                                                      \
-do {                                                                                                                                                                                                                                       \
-utf_error ret = increase_safely(IT, END);                                                                                                                                                                                              \
-if (ret != utf_error::OK)                                                                                                                                                                                                                    \
-    return ret;                                                                                                                                                                                                                        \
-} while (false)
+    do {                                                                                                                                                                                                                                       \
+        utf_error ret = detail::increase_safely(IT, END);                                                                                                                                                                                      \
+        if (ret != utf_error::OK)                                                                                                                                                                                                              \
+            return ret;                                                                                                                                                                                                                        \
+    } while (false)
 
 /// get_sequence_x functions decode utf-8 sequences of the length x
 template<octet_input_iterator It, std::sentinel_for<It> Se>
@@ -233,11 +352,8 @@ constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
         is_nothrow_sentinel<It, Se>
     >)
 {
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
-
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
     code_point = static_cast<char32_t>(detail::mask8(*it));
-
     return utf_error::OK;
 }
 
@@ -249,15 +365,11 @@ constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
         is_nothrow_sentinel<It, Se>
     >)
 {
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
 
     code_point = static_cast<char32_t>(detail::mask8(*it));
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
-
     return utf_error::OK;
 }
 
@@ -269,19 +381,13 @@ constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
         is_nothrow_sentinel<It, Se>
     >)
 {
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
 
     code_point = static_cast<char32_t>(detail::mask8(*it));
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = ((code_point << 12) & 0xffff) + ((detail::mask8(*it) << 6) & 0xfff);
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
-
     return utf_error::OK;
 }
 
@@ -293,23 +399,15 @@ constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
         is_nothrow_sentinel<It, Se>
     >)
 {
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
 
     code_point = static_cast<char32_t>(detail::mask8(*it));
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = ((code_point << 18) & 0x1fffff) + ((detail::mask8(*it) << 12) & 0x3ffff);
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = static_cast<char32_t>(code_point + ((detail::mask8(*it) << 6) & 0xfff));
-
     IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
-
     code_point = static_cast<char32_t>(code_point + ((*it) & 0x3f));
-
     return utf_error::OK;
 }
 
@@ -325,8 +423,7 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
         std::is_nothrow_copy_constructible<It>
     >)
 {
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
 
     // Save the original value of it so we can go back in case of failure
     // Of course, it does not make much sense with i.e. stream iterators
@@ -337,22 +434,24 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
     const int length = detail::sequence_length(it);
 
     // Get trail octets and calculate the code point
-    utf_error err = utf_error::OK;
+    utf_error err{};
     switch (length) {
-        case 0:
-            return utf_error::INVALID_LEAD;
-        case 1:
-            err = detail::get_sequence_1(it, end, cp);
-            break;
-        case 2:
-            err = detail::get_sequence_2(it, end, cp);
-            break;
-        case 3:
-            err = detail::get_sequence_3(it, end, cp);
-            break;
-        case 4:
-            err = detail::get_sequence_4(it, end, cp);
-            break;
+    case 0:
+        return utf_error::INVALID_LEAD;
+    case 1:
+        err = detail::get_sequence_1(it, end, cp);
+        break;
+    case 2:
+        err = detail::get_sequence_2(it, end, cp);
+        break;
+    case 3:
+        err = detail::get_sequence_3(it, end, cp);
+        break;
+    case 4:
+        err = detail::get_sequence_4(it, end, cp);
+        break;
+    default:
+        std::unreachable();
     }
 
     if (err == utf_error::OK) {
@@ -363,10 +462,12 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
                 code_point = cp;
                 ++it;
                 return utf_error::OK;
-            } else
+            } else {
                 err = utf_error::OVERLONG_SEQUENCE;
-        } else
+            }
+        } else {
             err = utf_error::INVALID_CODE_POINT;
+        }
     }
 
     // Failure branch - restore the original value of the iterator
@@ -395,142 +496,36 @@ constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)
     >)
 {
     // Check the edge case:
-    if (it == end)
-        return utf_error::NOT_ENOUGH_ROOM;
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+
     // Save the original value of it so we can go back in case of failure
     // Of course, it does not make much sense with i.e. stream iterators
-    It original_it = it;
+    It const original_it = it;
 
-    utf_error err = utf_error::OK;
-
-    const char16_t first_word = *it++;
+    char16_t const first_word = *it++;
     if (!detail::is_surrogate(first_word)) {
         code_point = first_word;
         return utf_error::OK;
-    } else {
-        if (it == end)
-            err = utf_error::NOT_ENOUGH_ROOM;
-        else if (detail::is_lead_surrogate(first_word)) {
-            const char16_t second_word = *it++;
-            if (detail::is_trail_surrogate(static_cast<char32_t>(second_word))) {
-                code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
-                return utf_error::OK;
-            } else
-                err = utf_error::INCOMPLETE_SEQUENCE;
-
-        } else {
-            err = utf_error::INVALID_LEAD;
-        }
     }
-    // error branch
-    it = original_it;
-    return err;
-}
-
-template<class OutIt, octet octet_type = std::iter_value_t<OutIt>>
-    requires std::output_iterator<OutIt, octet_type>
-constexpr OutIt append(char32_t cp, OutIt result) noexcept
-{
-    if (cp < 0x80) // one octet
-        *(result++) = static_cast<octet_type>(cp);
-    else if (cp < 0x800) { // two octets
-        *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
-        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
-    } else if (cp < 0x10000) { // three octets
-        *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
-        *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
-    } else { // four octets
-        *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
-        *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f) | 0x80);
-        *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-        *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    if (it == end) {
+        it = original_it;
+        return utf_error::NOT_ENOUGH_SPACE;
     }
-    return result;
-}
-
-template<class container_type>
-constexpr std::back_insert_iterator<container_type> append(char32_t cp, std::back_insert_iterator<container_type> result)
-    noexcept(noexcept(detail::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result)))
-    {
-        return detail::append<std::back_insert_iterator<container_type>, class container_type::value_type>(cp, result);
-    }
-
-    template<std::output_iterator<char16_t> It>
-    constexpr It append16(char32_t cp, It result)
-        noexcept(noexcept(*result++ = std::declval<char16_t>()))
-    {
-        if (detail::is_in_bmp(cp))
-            *(result++) = static_cast<char16_t>(cp);
-        else {
-            // Code points from the supplementary planes are encoded via surrogate pairs
-            *(result++) = static_cast<char16_t>(LEAD_OFFSET + (cp >> 10));
-            *(result++) = static_cast<char16_t>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+    if (detail::is_lead_surrogate(first_word)) {
+        char16_t const second_word = *it++;
+        if (detail::is_trail_surrogate(static_cast<char32_t>(second_word))) {
+            code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
+            return utf_error::OK;
         }
-        return result;
-    }
-
-} // detail
-
-// Base for the exceptions that may be thrown from the library
-class exception : public ::std::exception
-{
-};
-
-// Exceptions that may be thrown from the library functions.
-class invalid_code_point : public exception
-{
-    char32_t cp;
-
-public:
-    explicit invalid_code_point(char32_t codepoint)
-        : cp(codepoint)
-    {
-    }
-    virtual const char* what() const noexcept override { return "Invalid code point"; }
-    [[nodiscard]] char32_t code_point() const noexcept { return cp; }
-};
-
-class invalid_utf8 : public exception
-{
-    char8_t u8;
-
-public:
-    explicit invalid_utf8(char c)
-        : u8(static_cast<char8_t>(c))
-    {
-    }
-    explicit invalid_utf8(char8_t u)
-        : u8(u)
-    {
-    }
-    virtual const char* what() const noexcept override { return "Invalid UTF-8"; }
-    [[nodiscard]] char8_t utf8_octet() const noexcept { return u8; }
-};
-
-class invalid_utf16 : public exception
-{
-    char16_t u16;
-
-public:
-    explicit invalid_utf16(char16_t u)
-        : u16(u)
-    {
+        it = original_it;
+        return utf_error::INCOMPLETE_SEQUENCE;
     }
-    virtual const char* what() const noexcept override { return "Invalid UTF-16"; }
-    [[nodiscard]] char16_t utf16_word() const noexcept { return u16; }
-};
-
-class not_enough_room : public exception
-{
-public:
-    virtual const char* what() const noexcept override { return "Not enough space"; }
-};
 
-/// The library API - functions intended to be called by the users
+    it = original_it;
+    return utf_error::INVALID_LEAD;
+}
 
-// Byte order mark
-constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};
+} // detail
 
 template<octet_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr It find_invalid(It it, Se se)
@@ -538,8 +533,9 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
 {
     while (it != se) {
         detail::utf_error err_code = detail::validate_next(it, se);
-        if (err_code != detail::utf_error::OK)
+        if (err_code != detail::utf_error::OK) {
             return it;
+        }
     }
     return it;
 }
@@ -548,21 +544,21 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
 {
     std::string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
-    return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+    return invalid == s.end() ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
 }
 
 [[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s)
     noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
 {
     std::u8string_view::const_iterator invalid = unicode::find_invalid(s.begin(), s.end());
-    return (invalid == s.end()) ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+    return invalid == s.end() ? std::u8string_view::npos : static_cast<std::size_t>(invalid - s.begin());
 }
 
 template<octet_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr bool is_valid(It it, Se se)
     noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v<It, Se>)
 {
-    return (unicode::find_invalid(it, se) == se);
+    return unicode::find_invalid(it, se) == se;
 }
 
 [[nodiscard]] constexpr bool is_valid(std::string_view s)
@@ -581,7 +577,10 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr bool starts_with_bom(It it, Se end)
     noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v<It, Se>)
 {
-    return (((it != end) && (detail::mask8(*it++)) == bom[0]) && ((it != end) && (detail::mask8(*it++)) == bom[1]) && ((it != end) && (detail::mask8(*it)) == bom[2]));
+    return
+        (it != end && (detail::mask8(*it++)) == bom[0]) &&
+        (it != end && (detail::mask8(*it++)) == bom[1]) &&
+        (it != end && (detail::mask8(*it)) == bom[2]);
 }
 
 [[nodiscard]] constexpr bool starts_with_bom(std::string_view s)
@@ -596,73 +595,154 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     return unicode::starts_with_bom(s.begin(), s.end());
 }
 
-template<class OutIt>
-constexpr OutIt append(char32_t cp, OutIt result)
+
+template<octet_output_iterator OutIt>
+constexpr OutIt append8(char32_t cp, OutIt out)
 {
-    if (!detail::is_code_point_valid(cp))
-        throw invalid_code_point(cp);
+    if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp);
 
-    return detail::append(cp, result);
+    using octet_type = detail::select_output_value_type<OutIt, char>::type;
+
+    if (cp < 0x80) { // one octet
+        *out++ = static_cast<octet_type>(cp);
+    } else if (cp < 0x800) { // two octets
+        *out++ = static_cast<octet_type>((cp >> 6) | 0xc0);
+        *out++ = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    } else if (cp < 0x10000) { // three octets
+        *out++ = static_cast<octet_type>((cp >> 12) | 0xe0);
+        *out++ = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+        *out++ = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    } else { // four octets
+        *out++ = static_cast<octet_type>((cp >> 18) | 0xf0);
+        *out++ = static_cast<octet_type>(((cp >> 12) & 0x3f) | 0x80);
+        *out++ = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+        *out++ = static_cast<octet_type>((cp & 0x3f) | 0x80);
+    }
+    return out;
 }
 
-constexpr void append(char32_t cp, std::string& s)
+template<utf16_output_iterator OutIt>
+constexpr OutIt append16(char32_t cp, OutIt out)
 {
-    unicode::append(cp, std::back_inserter(s));
+    if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp);
+
+    if (detail::is_in_bmp(cp)) {
+        *out++ = static_cast<char16_t>(cp);
+    } else {
+        // Code points from the supplementary planes are encoded via surrogate pairs
+        *out++ = static_cast<char16_t>(detail::LEAD_OFFSET + (cp >> 10));
+        *out++ = static_cast<char16_t>(detail::TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+    }
+    return out;
 }
 
-constexpr void append(char32_t cp, std::u8string& s)
+// Forwards automatically based on `sizeof(value_type)`, but overload may become
+// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`.
+template<octet_output_iterator OutIt>
+constexpr OutIt append(char32_t cp, OutIt out)
+{
+    return unicode::append8(cp, std::move(out));
+}
+template<utf16_output_iterator OutIt>
+constexpr OutIt append(char32_t cp, OutIt out)
 {
-    unicode::append(cp, std::back_inserter(s));
+    return unicode::append16(cp, std::move(out));
 }
 
-template<class It>  // TODO: add constraints
-constexpr It append16(char32_t cp, It result)
+
+template<class OutR>
+    requires octet_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append8(char32_t cp, OutR&& r)
 {
-    if (!detail::is_code_point_valid(cp))
-        throw invalid_code_point(cp);
+    return std::ranges::subrange{
+        unicode::append8(cp, std::ranges::begin(r)), std::ranges::end(r)
+    };
+}
 
-    return detail::append16(cp, result);
+template<class OutR>
+    requires utf16_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append16(char32_t cp, OutR&& r)
+{
+    return std::ranges::subrange{
+        unicode::append16(cp, std::ranges::begin(r)), std::ranges::end(r)
+    };
+}
+
+// Forwards automatically based on `sizeof(value_type)`, but overload may become
+// ambiguous on `value_type`-agnostic iterators such as `std::back_insert_iterator`.
+template<class OutR>
+    requires octet_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append(char32_t cp, OutR&& r)
+{
+    return unicode::append8(cp, std::forward<OutR>(r));
+}
+template<class OutR>
+    requires utf16_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append(char32_t cp, OutR&& r)
+{
+    return unicode::append16(cp, std::forward<OutR>(r));
 }
 
-constexpr void append16(char32_t cp, std::u16string& s)
+constexpr void append(char32_t cp, std::string& str)
 {
-    unicode::append16(cp, std::back_inserter(s));
+    unicode::append8(cp, std::back_inserter(str));
 }
 
-template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
+constexpr void append(char32_t cp, std::u8string& str)
+{
+    unicode::append8(cp, std::back_inserter(str));
+}
+
+constexpr void append(char32_t cp, std::u16string& str)
+{
+    unicode::append16(cp, std::back_inserter(str));
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator Out>
 constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)
 {
     while (start != end) {
-        It sequence_start = start;
-        detail::utf_error err_code  = detail::validate_next(start, end);
-        switch (err_code) {
-            case detail::utf_error::OK:
-                for (It it = sequence_start; it != start; ++it)
-                    *out++ = *it;
-                break;
-            case detail::utf_error::NOT_ENOUGH_ROOM:
-                out   = unicode::append(replacement, out);
-                start = end;
-                break;
-            case detail::utf_error::INVALID_LEAD:
-                out = unicode::append(replacement, out);
-                ++start;
-                break;
-            case detail::utf_error::INCOMPLETE_SEQUENCE:
-            case detail::utf_error::OVERLONG_SEQUENCE:
-            case detail::utf_error::INVALID_CODE_POINT:
-                out = unicode::append(replacement, out);
+        It const sequence_start = start;
+        switch (detail::validate_next(start, end)) {
+        case detail::utf_error::OK:
+            for (It it = sequence_start; it != start; ++it) {
+                *out++ = *it;
+            }
+            break;
+
+        case detail::utf_error::NOT_ENOUGH_SPACE:
+            out = unicode::append8(replacement, out);
+            start = end;
+            break;
+
+        case detail::utf_error::INVALID_LEAD:
+            out = unicode::append8(replacement, out);
+            ++start;
+            break;
+
+        case detail::utf_error::INCOMPLETE_SEQUENCE:
+        case detail::utf_error::OVERLONG_SEQUENCE:
+        case detail::utf_error::INVALID_CODE_POINT:
+            out = unicode::append8(replacement, out);
+            ++start;
+            // just one replacement mark for the sequence
+            while (start != end && detail::is_trail(*start)) {
                 ++start;
-                // just one replacement mark for the sequence
-                while (start != end && detail::is_trail(*start))
-                    ++start;
-                break;
+            }
+            break;
+
+        default:
+            std::unreachable();
         }
     }
     return out;
 }
 
-template<octet_input_iterator It, std::sentinel_for<It> Se, class Out>  // TODO: add constraints
+template<octet_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator Out>
 constexpr Out replace_invalid(It start, Se end, Out out)
 {
     constexpr char32_t replacement_marker = static_cast<char32_t>(detail::mask16(0xfffd));
@@ -672,47 +752,48 @@ constexpr Out replace_invalid(It start, Se end, Out out)
 [[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement)
 {
     std::string result;
-    replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
     return result;
 }
 
 [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement)
 {
     std::u8string result;
-    replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
     return result;
 }
 
 [[nodiscard]] constexpr std::string replace_invalid(std::string_view s)
 {
     std::string result;
-    replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result));
     return result;
 }
 
 [[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s)
 {
     std::u8string result;
-    replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result));
     return result;
 }
 
 template<octet_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr char32_t next(It& it, Se end)
 {
-    char32_t cp               = 0;
-    detail::utf_error err_code = detail::validate_next(it, end, cp);
-    switch (err_code) {
-        case detail::utf_error::OK:
-            break;
-        case detail::utf_error::NOT_ENOUGH_ROOM:
-            throw not_enough_room();
-        case detail::utf_error::INVALID_LEAD:
-        case detail::utf_error::INCOMPLETE_SEQUENCE:
-        case detail::utf_error::OVERLONG_SEQUENCE:
-            throw invalid_utf8(static_cast<char8_t>(*it));
-        case detail::utf_error::INVALID_CODE_POINT:
-            throw invalid_code_point(cp);
+    char32_t cp = 0;
+    switch (detail::validate_next(it, end, cp)) {
+    case detail::utf_error::OK:
+        break;
+    case detail::utf_error::NOT_ENOUGH_SPACE:
+        throw not_enough_space();
+    case detail::utf_error::INVALID_LEAD:
+    case detail::utf_error::INCOMPLETE_SEQUENCE:
+    case detail::utf_error::OVERLONG_SEQUENCE:
+        throw invalid_utf8(static_cast<char8_t>(*it));
+    case detail::utf_error::INVALID_CODE_POINT:
+        throw invalid_code_point(cp);
+    default:
+        std::unreachable();
     }
     return cp;
 }
@@ -722,8 +803,9 @@ template<utf16_input_iterator It, std::sentinel_for<It> Se>
 {
     char32_t cp               = 0;
     detail::utf_error err_code = detail::validate_next16(it, end, cp);
-    if (err_code == detail::utf_error::NOT_ENOUGH_ROOM)
-        throw not_enough_room();
+    if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) {
+        throw not_enough_space();
+    }
     return cp;
 }
 
@@ -737,50 +819,53 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr char32_t prev(It& it, Se start)
 {
     // can't do much if it == start
-    if (it == start)
-        throw not_enough_room();
+    if (it == start) throw not_enough_space();
 
     It end = it;
     // Go back until we hit either a lead octet or start
-    while (detail::is_trail(*(--it)))
-        if (it == start)
-            throw invalid_utf8(*it); // error - no lead byte in the sequence
+    while (detail::is_trail(*(--it))) {
+        if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence
+    }
     return unicode::peek_next(it, end);
 }
 
 template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>
 constexpr void advance(It& it, distance_type n, Se end)
 {
-    const distance_type zero(0);
+    constexpr distance_type zero(0);
     if (n < zero) {
         // backward
-        for (distance_type i = n; i < zero; ++i)
+        for (distance_type i = n; i < zero; ++i) {
             (void)unicode::prev(it, end);
+        }
     } else {
         // forward
-        for (distance_type i = zero; i < n; ++i)
+        for (distance_type i = zero; i < n; ++i) {
             (void)unicode::next(it, end);
+        }
     }
 }
 
 template<octet_input_iterator It, std::sentinel_for<It> Se>
-[[nodiscard]] constexpr class std::iterator_traits<It>::difference_type distance(It first, Se last)
+[[nodiscard]] constexpr typename std::iterator_traits<It>::difference_type
+distance(It first, Se last)
 {
-    class std::iterator_traits<It>::difference_type dist;
-    for (dist = 0; first != last; ++dist)
+    typename std::iterator_traits<It>::difference_type dist;
+    for (dist = 0; first != last; ++dist) {
         (void)unicode::next(first, last);
+    }
     return dist;
 }
 
-template<utf16_input_iterator It, std::sentinel_for<It> Se, class OutIt> // TODO: add constraints
-constexpr OutIt utf16to8(It start, Se end, OutIt result)
+template<utf16_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator OutIt>
+constexpr OutIt utf16to8(It start, Se end, OutIt out)
 {
     while (start != end) {
         char32_t cp = static_cast<char32_t>(detail::mask16(*start++));
         // Take care of surrogate pairs first
         if (detail::is_lead_surrogate(cp)) {
             if (start != end) {
-                const char32_t trail_surrogate = static_cast<char32_t>(detail::mask16(*start++));
+                char32_t const trail_surrogate = static_cast<char32_t>(detail::mask16(*start++));
                 if (detail::is_trail_surrogate(trail_surrogate))
                     cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET;
                 else
@@ -793,9 +878,9 @@ constexpr OutIt utf16to8(It start, Se end, OutIt result)
         else if (detail::is_trail_surrogate(cp))
             throw invalid_utf16(static_cast<char16_t>(cp));
 
-        result = unicode::append(cp, result);
+        out = unicode::append8(cp, out);
     }
-    return result;
+    return out;
 }
 
 [[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)
@@ -812,18 +897,19 @@ constexpr OutIt utf16to8(It start, Se end, OutIt result)
     return result;
 }
 
-template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
-constexpr OutIt utf8to16(It start, Se end, OutIt result)
+template<utf8_input_iterator It, std::sentinel_for<It> Se, utf16_output_iterator OutIt>
+constexpr OutIt utf8to16(It start, Se end, OutIt out)
 {
     while (start != end) {
-        const char32_t cp = unicode::next(start, end);
+        char32_t const cp = unicode::next(start, end);
         if (cp > 0xffff) { // make a surrogate pair
-            *result++ = static_cast<char16_t>((cp >> 10) + detail::LEAD_OFFSET);
-            *result++ = static_cast<char16_t>((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
-        } else
-            *result++ = static_cast<char16_t>(cp);
+            *out++ = static_cast<char16_t>((cp >> 10) + detail::LEAD_OFFSET);
+            *out++ = static_cast<char16_t>((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
+        } else {
+            *out++ = static_cast<char16_t>(cp);
+        }
     }
-    return result;
+    return out;
 }
 
 [[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)
@@ -840,13 +926,13 @@ constexpr OutIt utf8to16(It start, Se end, OutIt result)
     return result;
 }
 
-template<utf32_input_iterator It, std::sentinel_for<It> Se, class OutIt>  // TODO: add constraints
-constexpr OutIt utf32to8(It start, Se end, OutIt result)
+template<utf32_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator OutIt>
+constexpr OutIt utf32to8(It start, Se end, OutIt out)
 {
-    while (start != end)
-        result = unicode::append(*(start++), result);
-
-    return result;
+    while (start != end) {
+        out = unicode::append8(*start++, out);
+    }
+    return out;
 }
 
 [[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)
@@ -864,12 +950,12 @@ constexpr OutIt utf32to8(It start, Se end, OutIt result)
 }
 
 template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
-constexpr OutIt utf8to32(It start, Se end, OutIt result)
+constexpr OutIt utf8to32(It start, Se end, OutIt out)
 {
-    while (start != end)
-        (*result++) = unicode::next(start, end);
-
-    return result;
+    while (start != end) {
+        *out++ = unicode::next(start, end);
+    }
+    return out;
 }
 
 [[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
@@ -900,17 +986,20 @@ class iterator
     using reference = char32_t&;
     using difference_type = std::ptrdiff_t;
     using iterator_category = std::bidirectional_iterator_tag;
+
     constexpr iterator()
         requires std::is_default_constructible_v<It>
     = default;
+
     constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
         : it(std::move(octet_it))
         , range_start(std::move(rangestart))
         , range_end(std::move(rangeend))
     {
         if constexpr (std::random_access_iterator<It>) {
-            if (it < range_start || it > range_end)
+            if (it < range_start || it > range_end) {
                 throw std::out_of_range("Invalid utf-8 iterator position");
+            }
         }
     }
     // the default "big three" are OK
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index abc34a2..27738b8 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -2,66 +2,90 @@
 
 #include <iris/unicode/string.hpp>
 
+#include <algorithm>
 #include <string>
+#include <array>
+#include <ranges>
+
+#include <cstdint>
 
 namespace iris_unicode_test {
 
 namespace unicode = iris::unicode;
 
-using namespace iris::unicode;
-using namespace std;
+template<class T, class... Chars>
+constexpr std::array<T, sizeof...(Chars)> to_array_cast(Chars... cs)
+{
+    return std::array<T, sizeof...(Chars)>{
+        static_cast<T>(cs)...
+    };
+}
 
 TEST_CASE("append")
 {
-    unsigned char u[5] = {0, 0, 0, 0, 0};
-    unicode::append(0x0448, u);
-    EXPECT_EQ (u[0], 0xd1);
-    EXPECT_EQ (u[1], 0x88);
-    EXPECT_EQ (u[2], 0);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    unicode::append(0x65e5, u);
-    EXPECT_EQ (u[0], 0xe6);
-    EXPECT_EQ (u[1], 0x97);
-    EXPECT_EQ (u[2], 0xa5);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    unicode::append(0x3044, u);
-    EXPECT_EQ (u[0], 0xe3);
-    EXPECT_EQ (u[1], 0x81);
-    EXPECT_EQ (u[2], 0x84);
-    EXPECT_EQ (u[3], 0);
-    EXPECT_EQ (u[4], 0);
-
-    unicode::append(0x10346, u);
-    EXPECT_EQ (u[0], 0xf0);
-    EXPECT_EQ (u[1], 0x90);
-    EXPECT_EQ (u[2], 0x8d);
-    EXPECT_EQ (u[3], 0x86);
-    EXPECT_EQ (u[4], 0);
+    constexpr auto do_test = []<class T>() {
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x0448U, u);
+            return u;
+        }() == to_array_cast<T>(0xd1, 0x88, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x65e5U, u);
+            return u;
+        }() == to_array_cast<T>(0xe6, 0x97, 0xa5, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x3044U, u);
+            return u;
+        }() == to_array_cast<T>(0xe3, 0x81, 0x84, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x10346U, u);
+            return u;
+        }() == to_array_cast<T>(0xf0, 0x90, 0x8d, 0x86, 0));
+    };
+
+    do_test.operator()<char8_t>();
+    do_test.operator()<char>();
+    do_test.operator()<unsigned char>();
+    do_test.operator()<std::int8_t>();
+    do_test.operator()<std::uint8_t>();
+}
+
+TEST_CASE("append16")
+{
+    constexpr auto do_test = []<class T>() {
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x0448U, u);
+            return u;
+        }() == to_array_cast<T>(0x0448, 0, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x65e5U, u);
+            return u;
+        }() == to_array_cast<T>(0x65e5, 0, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x10346U, u);
+            return u;
+        }() == to_array_cast<T>(0xd800, 0xdf46, 0, 0, 0));
+    };
+
+    do_test.operator()<char16_t>();
+    do_test.operator()<std::int16_t>();
+    do_test.operator()<std::uint16_t>();
 }
 
 #if 0
 
-TEST(CheckedAPITests, test_append16)
-{
-    char16_t u[5] = {0, 0};
-    append16(0x0448, u);
-    EXPECT_EQ (u[0], 0x0448);
-    EXPECT_EQ (u[1], 0x0000);
-
-    append16(0x65e5, u);
-    EXPECT_EQ (u[0], 0x65e5);
-    EXPECT_EQ (u[1], 0x0000);
-
-    append16(0x10346, u);
-    EXPECT_EQ (u[0], 0xd800);
-    EXPECT_EQ (u[1], 0xdf46);
-}
-
-TEST(CheckedAPITests, test_next)
+TEST_CASE("next")
 {
     const char* twochars = "\xe6\x97\xa5\xd1\x88";
     const char* w = twochars;
@@ -85,7 +109,7 @@ TEST(CheckedAPITests, test_next)
     EXPECT_EQ (w, threechars + 9);
 }
 
-TEST(CheckedAPITests, test_next16)
+TEST_CASE("next16")
 {
     const char16_t u[3] = {0x65e5, 0xd800, 0xdf46};
     const char16_t* w = u;
@@ -98,14 +122,14 @@ TEST(CheckedAPITests, test_next16)
     EXPECT_EQ (w, u + 3);
 }
 
-TEST(CheckedAPITests, test_peek_next)
+TEST_CASE("peek_next")
 {
     const char* const cw = "\xe6\x97\xa5\xd1\x88";
     unsigned int cp = peek_next(cw, cw + 6);
     EXPECT_EQ (cp, 0x65e5);
 }
 
-TEST(CheckedAPITests, test_prior)
+TEST_CASE("prior")
 {
     const char* twochars = "\xe6\x97\xa5\xd1\x88";
     const char* w = twochars + 3;
@@ -126,7 +150,7 @@ TEST(CheckedAPITests, test_prior)
     EXPECT_EQ (w, threechars);
 }
 
-TEST(CheckedAPITests, test_advance)
+TEST_CASE("advance")
 {
     const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     const char* w = threechars;
@@ -142,14 +166,14 @@ TEST(CheckedAPITests, test_advance)
     EXPECT_EQ(w, threechars);
 }
 
-TEST(CheckedAPITests, test_distance)
+TEST_CASE("distance")
 {
     const char* twochars = "\xe6\x97\xa5\xd1\x88";
     size_t dist = static_cast<size_t>(iris::utflib::distance(twochars, twochars + 5));
     EXPECT_EQ (dist, 2);
 }
 
-TEST(CheckedAPITests, test_utf32to8)
+TEST_CASE("utf32to8")
 {
     char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
     string utf8result;
@@ -157,7 +181,7 @@ TEST(CheckedAPITests, test_utf32to8)
     EXPECT_EQ (utf8result.size(), 9);
 }
 
-TEST(CheckedAPITests, test_utf8to32)
+TEST_CASE("utf8to32")
 {
     const char* twochars = "\xe6\x97\xa5\xd1\x88";
     vector<unsigned int> utf32result;
@@ -165,7 +189,7 @@ TEST(CheckedAPITests, test_utf8to32)
     EXPECT_EQ (utf32result.size(), 2);
 }
 
-TEST(CheckedAPITests, test_utf16to8)
+TEST_CASE("utf16to8")
 {
     char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     string utf8result;
@@ -173,7 +197,7 @@ TEST(CheckedAPITests, test_utf16to8)
     EXPECT_EQ (utf8result.size(), 10);
 }
 
-TEST(CheckedAPITests, test_utf8to16)
+TEST_CASE("utf8to16")
 {
     char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     vector <char16_t> utf16result;
@@ -183,7 +207,7 @@ TEST(CheckedAPITests, test_utf8to16)
     EXPECT_EQ (utf16result[3], 0xdd1e);
 }
 
-TEST(CheckedAPITests, test_replace_invalid)
+TEST_CASE("replace_invalid")
 {
     char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
     vector<char> replace_invalid_result;
@@ -195,7 +219,7 @@ TEST(CheckedAPITests, test_replace_invalid)
     EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
 }
 
-TEST(CheckedAPITests, test_find_invalid)
+TEST_CASE("find_invalid")
 {
     char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
     const char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
@@ -204,7 +228,7 @@ TEST(CheckedAPITests, test_find_invalid)
     EXPECT_EQ (invalid, utf_invalid + 5);
 }
 
-TEST(CheckedAPITests, test_is_valid)
+TEST_CASE("is_valid")
 {
     char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
@@ -218,7 +242,7 @@ TEST(CheckedAPITests, test_is_valid)
     EXPECT_TRUE (bvalid);
 }
 
-TEST(CheckedAPITests, test_starts_with_bom)
+TEST_CASE("starts_with_bom")
 {
     unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
     bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
@@ -228,7 +252,7 @@ TEST(CheckedAPITests, test_starts_with_bom)
     EXPECT_FALSE (no_bbom);
 }
 
-TEST(CheckedIteratrTests, test_increment)
+TEST_CASE("increment")
 {
     const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     iris::utflib::iterator<const char*> it(threechars, threechars, threechars + 9);
@@ -243,7 +267,7 @@ TEST(CheckedIteratrTests, test_increment)
     EXPECT_EQ (++it, endit);
 }
 
-TEST(CheckedIteratrTests, test_decrement)
+TEST_CASE("decrement")
 {
     const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     iris::utflib::iterator<const char*> it(threechars+9, threechars, threechars + 9);
@@ -254,22 +278,14 @@ TEST(CheckedIteratrTests, test_decrement)
     EXPECT_EQ (*it, 0x10346);
 }
 
-TEST(CPP11APITests, test_append16)
-{
-    u16string u;
-    append16(0x0448, u);
-    EXPECT_EQ (u[0], char16_t(0x0448));
-    EXPECT_EQ (u.length(), 1);
-}
-
-TEST(CPP11APITests, test_utf16to8)
+TEST_CASE("utf16to8")
 {
     u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     string u = utf16to8(utf16string);
     EXPECT_EQ (u.size(), 10);
 }
 
-TEST(CPP11APITests, test_utf8to16)
+TEST_CASE("utf8to16")
 {
     string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     u16string utf16result = utf8to16(utf8_with_surrogates);
@@ -281,28 +297,28 @@ TEST(CPP11APITests, test_utf8to16)
     EXPECT_EQ(utf8to16("simple"), u"simple");
 }
 
-TEST(CPP11APITests, test_utf32to8)
+TEST_CASE("utf32to8")
 {
     u32string utf32string = {0x448, 0x65E5, 0x10346};
     string utf8result = utf32to8(utf32string);
     EXPECT_EQ (utf8result.size(), 9);
 }
 
-TEST(CPP11APITests, test_utf8to32)
+TEST_CASE("utf8to32")
 {
     const char* twochars = "\xe6\x97\xa5\xd1\x88";
     u32string utf32result = utf8to32(twochars);
     EXPECT_EQ (utf32result.size(), 2);
 }
 
-TEST(CPP11APITests, test_find_invalid)
+TEST_CASE("find_invalid")
 {
     string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     auto invalid = find_invalid(utf_invalid);
     EXPECT_EQ (invalid, 5);
 }
 
-TEST(CPP11APITests, test_is_valid)
+TEST_CASE("is_valid")
 {
     string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid);
@@ -312,7 +328,7 @@ TEST(CPP11APITests, test_is_valid)
     EXPECT_TRUE (bvalid);
 }
 
-TEST(CPP11APITests, test_replace_invalid)
+TEST_CASE("replace_invalid")
 {
     string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
     string replace_invalid_result = replace_invalid(invalid_sequence, '?');
@@ -322,7 +338,7 @@ TEST(CPP11APITests, test_replace_invalid)
     EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
 }
 
-TEST(CPP11APITests, test_starts_with_bom)
+TEST_CASE("starts_with_bom")
 {
     string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
     bool bbom = starts_with_bom(byte_order_mark);
@@ -333,7 +349,7 @@ TEST(CPP11APITests, test_starts_with_bom)
 }
 
 
-TEST(CPP17APITests, test_utf16to8)
+TEST_CASE("utf16to8")
 {
     u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     u16string_view utf16stringview(utf16string);
@@ -341,7 +357,7 @@ TEST(CPP17APITests, test_utf16to8)
     EXPECT_EQ (u.size(), 10);
 }
 
-TEST(CPP17APITests, test_utf8to16)
+TEST_CASE("utf8to16")
 {
     string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     u16string utf16result = utf8to16(utf8_with_surrogates);
@@ -350,7 +366,7 @@ TEST(CPP17APITests, test_utf8to16)
     EXPECT_EQ (utf16result[3], 0xdd1e);
 }
 
-TEST(CPP17APITests, test_utf32to8)
+TEST_CASE("utf32to8")
 {
     u32string utf32string = {0x448, 0x65E5, 0x10346};
     u32string_view utf32stringview(utf32string);
@@ -358,21 +374,21 @@ TEST(CPP17APITests, test_utf32to8)
     EXPECT_EQ (utf8result.size(), 9);
 }
 
-TEST(CPP17APITests, test_utf8to32)
+TEST_CASE("utf8to32")
 {
     string_view twochars = "\xe6\x97\xa5\xd1\x88";
     u32string utf32result = utf8to32(twochars);
     EXPECT_EQ (utf32result.size(), 2);
 }
 
-TEST(CPP17APITests, test_find_invalid)
+TEST_CASE("find_invalid")
 {
     string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     auto invalid = find_invalid(utf_invalid);
     EXPECT_EQ (invalid, 5);
 }
 
-TEST(CPP17APITests, test_is_valid)
+TEST_CASE("is_valid")
 {
     string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid);
@@ -382,7 +398,7 @@ TEST(CPP17APITests, test_is_valid)
     EXPECT_TRUE (bvalid);
 }
 
-TEST(CPP17APITests, test_replace_invalid)
+TEST_CASE("replace_invalid")
 {
     string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
     string replace_invalid_result = replace_invalid(invalid_sequence, '?');
@@ -392,7 +408,7 @@ TEST(CPP17APITests, test_replace_invalid)
     EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
 }
 
-TEST(CPP17APITests, test_starts_with_bom)
+TEST_CASE("starts_with_bom")
 {
     string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
     string_view byte_order_mark_view(byte_order_mark);
@@ -412,7 +428,7 @@ TEST(CPP17APITests, string_class_and_literals)
 }
 
 
-TEST(CPP20APITests, test_utf16tou8)
+TEST_CASE("utf16tou8")
 {
     u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     u16string_view utf16stringview{utf16string};
@@ -431,7 +447,7 @@ TEST(CPP20APITests, tes20t_utf8to16)
     EXPECT_EQ (utf16result[3], 0xdd1e);
 }
 
-TEST(CPP20APITests, test_utf32tou8)
+TEST_CASE("utf32tou8")
 {
     u32string utf32string = {0x448, 0x65E5, 0x10346};
     u32string_view utf32stringview{utf32string};
@@ -439,21 +455,21 @@ TEST(CPP20APITests, test_utf32tou8)
     EXPECT_EQ (utf8result.size(), 9);
 }
 
-TEST(CPP20APITests, test_utf8to32)
+TEST_CASE("utf8to32")
 {
     u8string twochars = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88");
     u32string utf32result = utf8to32(twochars);
     EXPECT_EQ (utf32result.size(), 2);
 }
 
-TEST(CPP20APITests, test_find_invalid)
+TEST_CASE("find_invalid")
 {
     u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
     auto invalid = find_invalid(utf_invalid);
     EXPECT_EQ (invalid, 5);
 }
 
-TEST(CPP20APITests, test_is_valid)
+TEST_CASE("is_valid")
 {
     u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
     bool bvalid = is_valid(utf_invalid);
@@ -463,7 +479,7 @@ TEST(CPP20APITests, test_is_valid)
     EXPECT_TRUE (bvalid);
 }
 
-TEST(CPP20APITests, test_replace_invalid)
+TEST_CASE("replace_invalid")
 {
     u8string invalid_sequence = reinterpret_cast<const char8_t*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
     u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');
@@ -473,7 +489,7 @@ TEST(CPP20APITests, test_replace_invalid)
     EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
 }
 
-TEST(CPP20APITests, test_starts_with_bom)
+TEST_CASE("starts_with_bom")
 {
     u8string byte_order_mark = reinterpret_cast<const char8_t*>("\xef\xbb\xbf");
     bool bbom = starts_with_bom(byte_order_mark);
diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp
index 665585b..7dd6588 100644
--- a/test/unicode/string/utf8_invalid.cpp
+++ b/test/unicode/string/utf8_invalid.cpp
@@ -1,6 +1,4 @@
-#include "utf8.h"
-
-using namespace iris::utflib;
+#include <iris/unicode/string.hpp>
 
 #include <string>
 #include <iostream>
@@ -8,14 +6,16 @@ using namespace iris::utflib;
 #include <algorithm>
 
 using namespace std;
+using namespace iris::unicode;
 
 const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264};
 const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
 
+#if 0
 int main(int argc, char** argv)
 {
     string test_file_path;
-    if (argc == 2) 
+    if (argc == 2)
         test_file_path = argv[1];
     else {
         cout << "Wrong number of arguments" << endl;
@@ -59,3 +59,5 @@ int main(int argc, char** argv)
         }
     }
 }
+
+#endif

From d8a8313ba222e1a58c1e9c0fa8fdcdd555c2d867 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 18:42:32 +0900
Subject: [PATCH 05/17] Refactor until `replace_invalid`

---
 include/iris/unicode/string.hpp |  17 +-
 test/unicode/string/string.cpp  | 459 ++++++++++++++++----------------
 2 files changed, 235 insertions(+), 241 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index 98ed0c3..b9d0520 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -801,7 +801,7 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
 template<utf16_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr char32_t next16(It& it, Se end)
 {
-    char32_t cp               = 0;
+    char32_t cp = 0;
     detail::utf_error err_code = detail::validate_next16(it, end, cp);
     if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) {
         throw not_enough_space();
@@ -823,7 +823,7 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
 
     It end = it;
     // Go back until we hit either a lead octet or start
-    while (detail::is_trail(*(--it))) {
+    while (detail::is_trail(*--it)) {
         if (it == start) throw invalid_utf8(*it); // error - no lead byte in the sequence
     }
     return unicode::peek_next(it, end);
@@ -866,18 +866,19 @@ constexpr OutIt utf16to8(It start, Se end, OutIt out)
         if (detail::is_lead_surrogate(cp)) {
             if (start != end) {
                 char32_t const trail_surrogate = static_cast<char32_t>(detail::mask16(*start++));
-                if (detail::is_trail_surrogate(trail_surrogate))
+                if (detail::is_trail_surrogate(trail_surrogate)) {
                     cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET;
-                else
+                } else {
                     throw invalid_utf16(static_cast<char16_t>(trail_surrogate));
-            } else
+                }
+            } else {
                 throw invalid_utf16(static_cast<char16_t>(cp));
+            }
 
-        }
         // Lone trail surrogate
-        else if (detail::is_trail_surrogate(cp))
+        } else if (detail::is_trail_surrogate(cp)) {
             throw invalid_utf16(static_cast<char16_t>(cp));
-
+        }
         out = unicode::append8(cp, out);
     }
     return out;
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index 27738b8..c7f1dea 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -2,6 +2,7 @@
 
 #include <iris/unicode/string.hpp>
 
+#include <vector>
 #include <algorithm>
 #include <string>
 #include <array>
@@ -83,420 +84,412 @@ TEST_CASE("append16")
     do_test.operator()<std::uint16_t>();
 }
 
-#if 0
-
 TEST_CASE("next")
 {
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    const char* w = twochars;
-    unsigned int cp = next(w, twochars + 6);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, twochars + 3);
+    char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    char const* w = twochars;
+    unsigned int cp = unicode::next(w, twochars + 6);
 
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    CHECK(cp == 0x65e5);
+    CHECK(w == twochars + 3);
+
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     w = threechars;
 
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, threechars + 4);
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x10346);
+    CHECK(w == threechars + 4);
 
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, threechars + 7);
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x65e5);
+    CHECK(w == threechars + 7);
 
-    cp = next(w, threechars + 9);
-    EXPECT_EQ (cp, 0x0448);
-    EXPECT_EQ (w, threechars + 9);
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x0448);
+    CHECK(w == threechars + 9);
 }
 
 TEST_CASE("next16")
 {
-    const char16_t u[3] = {0x65e5, 0xd800, 0xdf46};
-    const char16_t* w = u;
-    char32_t cp = next16(w, w + 3);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, u + 1);
+    char16_t const u[3] = {0x65e5, 0xd800, 0xdf46};
+    char16_t const* w = u;
+    char32_t cp = unicode::next16(w, w + 3);
+    CHECK(cp == 0x65e5);
+    CHECK(w == u + 1);
 
-    cp = next16(w, w + 2);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, u + 3);
+    cp = unicode::next16(w, w + 2);
+    CHECK(cp == 0x10346);
+    CHECK(w == u + 3);
 }
 
 TEST_CASE("peek_next")
 {
-    const char* const cw = "\xe6\x97\xa5\xd1\x88";
-    unsigned int cp = peek_next(cw, cw + 6);
-    EXPECT_EQ (cp, 0x65e5);
+    char const* const cw = "\xe6\x97\xa5\xd1\x88";
+    unsigned int cp = unicode::peek_next(cw, cw + 6);
+    CHECK(cp == 0x65e5);
 }
 
-TEST_CASE("prior")
+TEST_CASE("prev")
 {
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    const char* w = twochars + 3;
-    unsigned int cp = prior (w, twochars);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, twochars);
+    char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    char const* w = twochars + 3;
+    unsigned int cp = unicode::prev(w, twochars);
+    CHECK(cp == 0x65e5);
+    CHECK(w == twochars);
 
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     w = threechars + 9;
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x0448);
-    EXPECT_EQ (w, threechars + 7);
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x65e5);
-    EXPECT_EQ (w, threechars + 4);
-    cp = prior(w, threechars);
-    EXPECT_EQ (cp, 0x10346);
-    EXPECT_EQ (w, threechars);
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x0448);
+    CHECK(w == threechars + 7);
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x65e5);
+    CHECK(w == threechars + 4);
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x10346);
+    CHECK(w == threechars);
 }
 
 TEST_CASE("advance")
 {
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    const char* w = threechars;
-    advance(w, 2, threechars + 9);
-    EXPECT_EQ(w, threechars + 7);
-    advance(w, -2, threechars);
-    EXPECT_EQ(w, threechars);
-    advance(w, 3, threechars + 9);
-    EXPECT_EQ(w, threechars + 9);
-    advance(w, -2, threechars);
-    EXPECT_EQ(w, threechars + 4);
-    advance(w, -1, threechars);
-    EXPECT_EQ(w, threechars);
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    char const* w = threechars;
+    unicode::advance(w, 2, threechars + 9);
+    CHECK(w == threechars + 7);
+    unicode::advance(w, -2, threechars);
+    CHECK(w == threechars);
+    unicode::advance(w, 3, threechars + 9);
+    CHECK(w == threechars + 9);
+    unicode::advance(w, -2, threechars);
+    CHECK(w == threechars + 4);
+    unicode::advance(w, -1, threechars);
+    CHECK(w == threechars);
 }
 
 TEST_CASE("distance")
 {
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    size_t dist = static_cast<size_t>(iris::utflib::distance(twochars, twochars + 5));
-    EXPECT_EQ (dist, 2);
+    constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    std::size_t const dist = static_cast<size_t>(unicode::distance(twochars, twochars + 5));
+    CHECK(dist == 2);
 }
 
-TEST_CASE("utf32to8")
+TEST_CASE("replace_invalid (vector)")
 {
-    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-    string utf8result;
-    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-    EXPECT_EQ (utf8result.size(), 9);
-}
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    std::vector<char> replace_invalid_result;
 
-TEST_CASE("utf8to32")
-{
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    vector<unsigned int> utf32result;
-    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-    EXPECT_EQ (utf32result.size(), 2);
-}
+    unicode::replace_invalid(invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    CHECK(unicode::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()));
 
-TEST_CASE("utf16to8")
-{
-    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    string utf8result;
-    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-    EXPECT_EQ (utf8result.size(), 10);
+    char const fixed_invalid_sequence[] = "a????z";
+    CHECK(sizeof(fixed_invalid_sequence) == replace_invalid_result.size());
+    CHECK(std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
 }
 
-TEST_CASE("utf8to16")
+TEST_CASE("replace_invalid (string)")
 {
-    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    vector <char16_t> utf16result;
-    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
+    std::string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    std::string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, '?');
+    CHECK(unicode::is_valid(replace_invalid_result));
+
+    std::string const fixed_invalid_sequence = "a????z";
+    CHECK(fixed_invalid_sequence == replace_invalid_result);
 }
 
-TEST_CASE("replace_invalid")
+TEST_CASE("replace_invalid (u8string)")
 {
-    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    vector<char> replace_invalid_result;
-    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
-    bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
-    EXPECT_TRUE (bvalid);
-    const char fixed_invalid_sequence[] = "a????z";
-    EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size());
-    EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+    std::u8string invalid_sequence = reinterpret_cast<char8_t const*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
+    std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?');
+
+    CHECK(unicode::is_valid(replace_invalid_result));
+    std::u8string const fixed_invalid_sequence = reinterpret_cast<char8_t const*>("a????z");
+    CHECK(fixed_invalid_sequence == replace_invalid_result);
 }
 
+#if 0
+
 TEST_CASE("find_invalid")
 {
     char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-    const char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-    EXPECT_EQ (invalid, utf_invalid + 5);
+    char const* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+    CHECK(invalid == utf_invalid + 5);
     invalid = utf_invalid + find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, utf_invalid + 5);
+    CHECK(invalid == utf_invalid + 5);
 }
 
 TEST_CASE("is_valid")
 {
     char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
-    EXPECT_FALSE (bvalid);
+    CHECK(!bvalid);
     bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
+    CHECK(!bvalid);
     char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
-    EXPECT_TRUE (bvalid);
+    CHECK(bvalid);
     bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
+    CHECK(bvalid);
 }
 
 TEST_CASE("starts_with_bom")
 {
     unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
     bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
-    EXPECT_TRUE (bbom);
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    CHECK(bbom);
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
-    EXPECT_FALSE (no_bbom);
+    CHECK(!no_bbom);
 }
 
 TEST_CASE("increment")
 {
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<const char*> it(threechars, threechars, threechars + 9);
-    iris::utflib::iterator<const char*> it2 = it;
-    EXPECT_EQ (it2, it);
-    EXPECT_EQ (*it, 0x10346);
-    EXPECT_EQ (*(++it), 0x65e5);
-    EXPECT_EQ ((*it++), 0x65e5);
-    EXPECT_EQ (*it, 0x0448);
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<char const*> it(threechars, threechars, threechars + 9);
+    iris::utflib::iterator<char const*> it2 = it;
+    CHECK(it2 == it);
+    CHECK(*it == 0x10346);
+    CHECK(*(++it) == 0x65e5);
+    CHECK((*it++) == 0x65e5);
+    CHECK(*it == 0x0448);
     EXPECT_NE (it, it2);
-    iris::utflib::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
-    EXPECT_EQ (++it, endit);
+    iris::utflib::iterator<char const*> endit (threechars + 9, threechars, threechars + 9);
+    CHECK(++it == endit);
 }
 
 TEST_CASE("decrement")
 {
-    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<const char*> it(threechars+9, threechars, threechars + 9);
-    EXPECT_EQ (*(--it), 0x0448);
-    EXPECT_EQ ((*it--), 0x0448);
-    EXPECT_EQ (*it, 0x65e5);
-    EXPECT_EQ (--it, iris::utflib::iterator<const char*>(threechars, threechars, threechars + 9));
-    EXPECT_EQ (*it, 0x10346);
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    iris::utflib::iterator<char const*> it(threechars+9, threechars, threechars + 9);
+    CHECK(*(--it) == 0x0448);
+    CHECK((*it--) == 0x0448);
+    CHECK(*it == 0x65e5);
+    CHECK(--it == iris::utflib::iterator<char const*>(threechars, threechars, threechars + 9));
+    CHECK(*it == 0x10346);
+}
+
+TEST_CASE("utf32to8")
+{
+    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+    std::string utf8result;
+    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    CHECK(utf8result.size() == 9);
+}
+
+TEST_CASE("utf8to32")
+{
+    char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    std::vector<unsigned int> utf32result;
+    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    CHECK(utf32result.size() == 2);
+}
+
+TEST_CASE("utf16to8")
+{
+    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    std::string utf8result;
+    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    CHECK(utf8result.size() == 10);
+}
+
+TEST_CASE("utf8to16")
+{
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    std::vector<char16_t> utf16result;
+    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    CHECK(utf16result.size() == 4);
+    CHECK(utf16result[2] == 0xd834);
+    CHECK(utf16result[3] == 0xdd1e);
 }
 
 TEST_CASE("utf16to8")
 {
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    string u = utf16to8(utf16string);
-    EXPECT_EQ (u.size(), 10);
+    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    std::string u = utf16to8(utf16string);
+    CHECK(u.size() == 10);
 }
 
 TEST_CASE("utf8to16")
 {
-    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
-    // Just to make sure it compiles with string literals
-    EXPECT_EQ(utf8to16(u8"simple"), u"simple");
-    EXPECT_EQ(utf8to16("simple"), u"simple");
+    std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    std::u16string utf16result = utf8to16(utf8_with_surrogates);
+    CHECK(utf16result.size() == 4);
+    CHECK(utf16result[2] == 0xd834);
+    CHECK(utf16result[3] == 0xdd1e);
+    // Just to make sure it compiles with std::string literals
+    CHECK(utf8to16(u8"simple") == u"simple");
+    CHECK(utf8to16("simple") == u"simple");
 }
 
 TEST_CASE("utf32to8")
 {
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
-    string utf8result = utf32to8(utf32string);
-    EXPECT_EQ (utf8result.size(), 9);
+    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
+    std::string utf8result = utf32to8(utf32string);
+    CHECK(utf8result.size() == 9);
 }
 
 TEST_CASE("utf8to32")
 {
-    const char* twochars = "\xe6\x97\xa5\xd1\x88";
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
+    char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    std::u32string utf32result = utf8to32(twochars);
+    CHECK(utf32result.size() == 2);
 }
 
 TEST_CASE("find_invalid")
 {
-    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
+    CHECK(invalid == 5);
 }
 
 TEST_CASE("is_valid")
 {
-    string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    CHECK(!bvalid);
+    std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST_CASE("replace_invalid")
-{
-    string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const string fixed_invalid_sequence = "a????z";
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+    CHECK(bvalid);
 }
 
 TEST_CASE("starts_with_bom")
 {
-    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
     bool bbom = starts_with_bom(byte_order_mark);
-    EXPECT_TRUE (bbom);
-    string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    CHECK(bbom);
+    std::string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
+    CHECK(!no_bbom);
 }
 
 
 TEST_CASE("utf16to8")
 {
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     u16string_view utf16stringview(utf16string);
-    string u = utf16to8(utf16stringview);
-    EXPECT_EQ (u.size(), 10);
+    std::string u = utf16to8(utf16stringview);
+    CHECK(u.size() == 10);
 }
 
 TEST_CASE("utf8to16")
 {
-    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
+    std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    std::u16string utf16result = utf8to16(utf8_with_surrogates);
+    CHECK(utf16result.size() == 4);
+    CHECK(utf16result[2] == 0xd834);
+    CHECK(utf16result[3] == 0xdd1e);
 }
 
 TEST_CASE("utf32to8")
 {
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
     u32string_view utf32stringview(utf32string);
-    string utf8result = utf32to8(utf32stringview);
-    EXPECT_EQ (utf8result.size(), 9);
+    std::string utf8result = utf32to8(utf32stringview);
+    CHECK(utf8result.size() == 9);
 }
 
 TEST_CASE("utf8to32")
 {
-    string_view twochars = "\xe6\x97\xa5\xd1\x88";
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
+    std::string_view twochars = "\xe6\x97\xa5\xd1\x88";
+    std::u32string utf32result = utf8to32(twochars);
+    CHECK(utf32result.size() == 2);
 }
 
 TEST_CASE("find_invalid")
 {
-    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
+    CHECK(invalid == 5);
 }
 
 TEST_CASE("is_valid")
 {
-    string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
+    std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
     bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    CHECK(!bvalid);
+    std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
     bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST_CASE("replace_invalid")
-{
-    string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-    string replace_invalid_result = replace_invalid(invalid_sequence, '?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const string fixed_invalid_sequence = "a????z";
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+    CHECK(bvalid);
 }
 
 TEST_CASE("starts_with_bom")
 {
-    string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
-    string_view byte_order_mark_view(byte_order_mark);
+    std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
+    std::string_view byte_order_mark_view(byte_order_mark);
     bool bbom = starts_with_bom(byte_order_mark_view);
-    EXPECT_TRUE (bbom);
-    string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    CHECK(bbom);
+    std::string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
     bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
+    CHECK(!no_bbom);
 }
 
 TEST(CPP17APITests, string_class_and_literals)
 {
-    const char* twochars = "ab";
-    EXPECT_TRUE (is_valid(twochars));
-    const string two_chars_string(twochars);
-    EXPECT_TRUE (is_valid(two_chars_string));
+    char const* twochars = "ab";
+    CHECK(is_valid(twochars));
+    std::string const two_chars_string(twochars);
+    CHECK(is_valid(two_chars_string));
 }
 
 
 TEST_CASE("utf16tou8")
 {
-    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
     u16string_view utf16stringview{utf16string};
-    u8string u = utf16tou8(utf16string);
-    EXPECT_EQ (u.size(), 10);
+    std::u8string u = utf16tou8(utf16string);
+    CHECK(u.size() == 10);
     u = utf16tou8(utf16stringview);
-    EXPECT_EQ (u.size(), 10);
+    CHECK(u.size() == 10);
 }
 
-TEST(CPP20APITests, tes20t_utf8to16)
+TEST_CASE("utf8to16")
 {
-    u8string utf8_with_surrogates{ reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
-    u16string utf16result = utf8to16(utf8_with_surrogates);
-    EXPECT_EQ (utf16result.size(), 4);
-    EXPECT_EQ (utf16result[2], 0xd834);
-    EXPECT_EQ (utf16result[3], 0xdd1e);
+    std::u8string utf8_with_surrogates{ reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
+    std::u16string utf16result = utf8to16(utf8_with_surrogates);
+    CHECK(utf16result.size() == 4);
+    CHECK(utf16result[2] == 0xd834);
+    CHECK(utf16result[3] == 0xdd1e);
 }
 
 TEST_CASE("utf32tou8")
 {
-    u32string utf32string = {0x448, 0x65E5, 0x10346};
+    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
     u32string_view utf32stringview{utf32string};
-    u8string utf8result = utf32tou8(utf32stringview);
-    EXPECT_EQ (utf8result.size(), 9);
+    std::u8string utf8result = utf32tou8(utf32stringview);
+    CHECK(utf8result.size() == 9);
 }
 
 TEST_CASE("utf8to32")
 {
-    u8string twochars = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88");
-    u32string utf32result = utf8to32(twochars);
-    EXPECT_EQ (utf32result.size(), 2);
+    std::u8string twochars = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88");
+    std::u32string utf32result = utf8to32(twochars);
+    CHECK(utf32result.size() == 2);
 }
 
 TEST_CASE("find_invalid")
 {
-    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    std::u8string utf_invalid = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xfa");
     auto invalid = find_invalid(utf_invalid);
-    EXPECT_EQ (invalid, 5);
+    CHECK(invalid == 5);
 }
 
 TEST_CASE("is_valid")
 {
-    u8string utf_invalid = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xfa");
+    std::u8string utf_invalid = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xfa");
     bool bvalid = is_valid(utf_invalid);
-    EXPECT_FALSE (bvalid);
-    u8string utf8_with_surrogates = reinterpret_cast<const char8_t*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
+    CHECK(!bvalid);
+    std::u8string utf8_with_surrogates = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
     bvalid = is_valid(utf8_with_surrogates);
-    EXPECT_TRUE (bvalid);
-}
-
-TEST_CASE("replace_invalid")
-{
-    u8string invalid_sequence = reinterpret_cast<const char8_t*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
-    u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');
-    bool bvalid = is_valid(replace_invalid_result);
-    EXPECT_TRUE (bvalid);
-    const u8string fixed_invalid_sequence = reinterpret_cast<const char8_t*>("a????z");
-    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
+    CHECK(bvalid);
 }
 
 TEST_CASE("starts_with_bom")
 {
-    u8string byte_order_mark = reinterpret_cast<const char8_t*>("\xef\xbb\xbf");
+    std::u8string byte_order_mark = reinterpret_cast<char8_t const*>("\xef\xbb\xbf");
     bool bbom = starts_with_bom(byte_order_mark);
-    EXPECT_TRUE (bbom);
-    u8string threechars = reinterpret_cast<const char8_t*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
+    CHECK(bbom);
+    std::u8string threechars = reinterpret_cast<char8_t const*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
     bool no_bbom = starts_with_bom(threechars);
-    EXPECT_FALSE (no_bbom);
+    CHECK(!no_bbom);
 }
 
 #endif

From 0e2952541c6cde9dfa1e55267d48daf90cb8f585 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 19:50:12 +0900
Subject: [PATCH 06/17] Refactor until `is_valid`

---
 test/unicode/string/string.cpp | 68 ++++++++++++----------------------
 1 file changed, 23 insertions(+), 45 deletions(-)

diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index c7f1dea..1bd4ef2 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -173,6 +173,28 @@ TEST_CASE("distance")
     CHECK(dist == 2);
 }
 
+TEST_CASE("is_valid")
+{
+    {
+        char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+        CHECK(!unicode::is_valid(utf_invalid));
+        CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6));
+    }
+    {
+        char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        CHECK(unicode::is_valid(utf8_with_surrogates));
+        CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9));
+    }
+    {
+        std::u8string const utf_invalid(std::from_range, "\xe6\x97\xa5\xd1\x88\xfa");
+        CHECK(!unicode::is_valid(utf_invalid));
+    }
+    {
+        std::u8string const utf8_with_surrogates(std::from_range, "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
+        CHECK(unicode::is_valid(utf8_with_surrogates));
+    }
+}
+
 TEST_CASE("replace_invalid (vector)")
 {
     char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
@@ -217,20 +239,6 @@ TEST_CASE("find_invalid")
     CHECK(invalid == utf_invalid + 5);
 }
 
-TEST_CASE("is_valid")
-{
-    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
-    CHECK(!bvalid);
-    bvalid = is_valid(utf_invalid);
-    CHECK(!bvalid);
-    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
-    CHECK(bvalid);
-    bvalid = is_valid(utf8_with_surrogates);
-    CHECK(bvalid);
-}
-
 TEST_CASE("starts_with_bom")
 {
     unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
@@ -341,16 +349,6 @@ TEST_CASE("find_invalid")
     CHECK(invalid == 5);
 }
 
-TEST_CASE("is_valid")
-{
-    std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid);
-    CHECK(!bvalid);
-    std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates);
-    CHECK(bvalid);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
@@ -401,16 +399,6 @@ TEST_CASE("find_invalid")
     CHECK(invalid == 5);
 }
 
-TEST_CASE("is_valid")
-{
-    std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    bool bvalid = is_valid(utf_invalid);
-    CHECK(!bvalid);
-    std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    bvalid = is_valid(utf8_with_surrogates);
-    CHECK(bvalid);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
@@ -422,7 +410,7 @@ TEST_CASE("starts_with_bom")
     CHECK(!no_bbom);
 }
 
-TEST(CPP17APITests, string_class_and_literals)
+TEST_CASE("string_class_and_literals")
 {
     char const* twochars = "ab";
     CHECK(is_valid(twochars));
@@ -472,16 +460,6 @@ TEST_CASE("find_invalid")
     CHECK(invalid == 5);
 }
 
-TEST_CASE("is_valid")
-{
-    std::u8string utf_invalid = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xfa");
-    bool bvalid = is_valid(utf_invalid);
-    CHECK(!bvalid);
-    std::u8string utf8_with_surrogates = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
-    bvalid = is_valid(utf8_with_surrogates);
-    CHECK(bvalid);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::u8string byte_order_mark = reinterpret_cast<char8_t const*>("\xef\xbb\xbf");

From b45687916744bf2a9aadbd3799ccf1862fd6f142 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 19:57:58 +0900
Subject: [PATCH 07/17] Refactor until `find_invalid`

---
 test/unicode/string/string.cpp | 67 ++++++++++++++++------------------
 1 file changed, 32 insertions(+), 35 deletions(-)

diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index 1bd4ef2..bee8ee0 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -175,26 +175,53 @@ TEST_CASE("distance")
 
 TEST_CASE("is_valid")
 {
+    constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+
     {
-        char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
         CHECK(!unicode::is_valid(utf_invalid));
         CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6));
     }
     {
-        char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
         CHECK(unicode::is_valid(utf8_with_surrogates));
         CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9));
     }
     {
-        std::u8string const utf_invalid(std::from_range, "\xe6\x97\xa5\xd1\x88\xfa");
-        CHECK(!unicode::is_valid(utf_invalid));
+        std::u8string const utf_invalid_u8(std::from_range, utf_invalid);
+        CHECK(!unicode::is_valid(utf_invalid_u8));
     }
     {
-        std::u8string const utf8_with_surrogates(std::from_range, "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e");
+        std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates);
         CHECK(unicode::is_valid(utf8_with_surrogates));
     }
 }
 
+TEST_CASE("find_invalid")
+{
+    constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    {
+        char const* invalid = unicode::find_invalid(utf_invalid, utf_invalid + 6);
+        CHECK(invalid == utf_invalid + 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(utf_invalid);
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(std::string{utf_invalid});
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(std::string_view{utf_invalid});
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::u8string const utf_invalid_u8(std::from_range, utf_invalid);
+        std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8);
+        CHECK(invalid_pos == 5);
+    }
+}
+
 TEST_CASE("replace_invalid (vector)")
 {
     char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
@@ -230,15 +257,6 @@ TEST_CASE("replace_invalid (u8string)")
 
 #if 0
 
-TEST_CASE("find_invalid")
-{
-    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-    char const* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-    CHECK(invalid == utf_invalid + 5);
-    invalid = utf_invalid + find_invalid(utf_invalid);
-    CHECK(invalid == utf_invalid + 5);
-}
-
 TEST_CASE("starts_with_bom")
 {
     unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
@@ -342,13 +360,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("find_invalid")
-{
-    std::string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    auto invalid = find_invalid(utf_invalid);
-    CHECK(invalid == 5);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
@@ -392,13 +403,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("find_invalid")
-{
-    std::string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
-    auto invalid = find_invalid(utf_invalid);
-    CHECK(invalid == 5);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
@@ -453,13 +457,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("find_invalid")
-{
-    std::u8string utf_invalid = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xfa");
-    auto invalid = find_invalid(utf_invalid);
-    CHECK(invalid == 5);
-}
-
 TEST_CASE("starts_with_bom")
 {
     std::u8string byte_order_mark = reinterpret_cast<char8_t const*>("\xef\xbb\xbf");

From 2a43f6bac23cb988d490c002de78d0c89cea65a0 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 20:59:31 +0900
Subject: [PATCH 08/17] Refactor until `starts_with_bom`

---
 include/iris/unicode/string.hpp | 26 +++++++++++----
 test/unicode/string/string.cpp  | 57 +++++++++------------------------
 2 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index b9d0520..1bada79 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -40,7 +40,8 @@ DEALINGS IN THE SOFTWARE.
 
 namespace iris::unicode {
 
-constexpr char8_t bom[] = {0xef, 0xbb, 0xbf};
+template<class T>
+constexpr T bom[] = {static_cast<T>(0xef), static_cast<T>(0xbb), static_cast<T>(0xbf)};
 
 template<class T>
 concept octet = std::integral<T> && sizeof(T) == 1;
@@ -67,6 +68,12 @@ template<class It>
 concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>;
 
 
+template<class R>
+concept octet_input_range =
+    std::ranges::input_range<R> &&
+    octet_input_iterator<std::ranges::iterator_t<R>>;
+
+
 namespace detail {
 
 template<class OutIt, class DesiredValueT>
@@ -256,12 +263,12 @@ enum class utf_error
 template<octet Octet>
 [[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
 {
-    return static_cast<char8_t>(0xff & oc);
+    return static_cast<char8_t>(oc & 0xff);
 }
 
 [[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept
 {
-    return static_cast<char16_t>(0xffff & oc);
+    return static_cast<char16_t>(oc & 0xffff);
 }
 
 template<octet Octet>
@@ -578,9 +585,16 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v<It, Se>)
 {
     return
-        (it != end && (detail::mask8(*it++)) == bom[0]) &&
-        (it != end && (detail::mask8(*it++)) == bom[1]) &&
-        (it != end && (detail::mask8(*it)) == bom[2]);
+        (it != end && detail::mask8(*it++) == bom<char8_t>[0]) &&
+        (it != end && detail::mask8(*it++) == bom<char8_t>[1]) &&
+        (it != end && detail::mask8(*it)   == bom<char8_t>[2]);
+}
+
+template<octet_input_range R>
+[[nodiscard]] constexpr bool starts_with_bom(R&& r)
+    noexcept(noexcept(unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r))))
+{
+    return unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r));
 }
 
 [[nodiscard]] constexpr bool starts_with_bom(std::string_view s)
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index bee8ee0..47100ff 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -247,26 +247,31 @@ TEST_CASE("replace_invalid (string)")
 
 TEST_CASE("replace_invalid (u8string)")
 {
-    std::u8string invalid_sequence = reinterpret_cast<char8_t const*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
+    std::u8string const invalid_sequence(std::from_range, "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
     std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?');
 
     CHECK(unicode::is_valid(replace_invalid_result));
-    std::u8string const fixed_invalid_sequence = reinterpret_cast<char8_t const*>("a????z");
+    std::u8string const fixed_invalid_sequence(std::from_range, "a????z");
     CHECK(fixed_invalid_sequence == replace_invalid_result);
 }
 
-#if 0
-
 TEST_CASE("starts_with_bom")
 {
-    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
-    CHECK(bbom);
-    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
-    CHECK(!no_bbom);
+    CHECK(unicode::starts_with_bom(unicode::bom<char>));
+    CHECK(unicode::starts_with_bom(unicode::bom<unsigned char>));
+    CHECK(unicode::starts_with_bom(unicode::bom<char8_t>));
+    CHECK(unicode::starts_with_bom(unicode::bom<std::int8_t>));
+    CHECK(unicode::starts_with_bom(unicode::bom<std::uint8_t>));
+
+    constexpr char threechars[] = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    CHECK(!unicode::starts_with_bom(threechars));
+    CHECK(!unicode::starts_with_bom(std::string{threechars}));
+    CHECK(!unicode::starts_with_bom(std::string_view{threechars}));
+    CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars}));
 }
 
+#if 0
+
 TEST_CASE("increment")
 {
     char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
@@ -360,17 +365,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("starts_with_bom")
-{
-    std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
-    bool bbom = starts_with_bom(byte_order_mark);
-    CHECK(bbom);
-    std::string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars);
-    CHECK(!no_bbom);
-}
-
-
 TEST_CASE("utf16to8")
 {
     std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
@@ -403,17 +397,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("starts_with_bom")
-{
-    std::string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
-    std::string_view byte_order_mark_view(byte_order_mark);
-    bool bbom = starts_with_bom(byte_order_mark_view);
-    CHECK(bbom);
-    std::string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    bool no_bbom = starts_with_bom(threechars);
-    CHECK(!no_bbom);
-}
-
 TEST_CASE("string_class_and_literals")
 {
     char const* twochars = "ab";
@@ -457,16 +440,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("starts_with_bom")
-{
-    std::u8string byte_order_mark = reinterpret_cast<char8_t const*>("\xef\xbb\xbf");
-    bool bbom = starts_with_bom(byte_order_mark);
-    CHECK(bbom);
-    std::u8string threechars = reinterpret_cast<char8_t const*>("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88");
-    bool no_bbom = starts_with_bom(threechars);
-    CHECK(!no_bbom);
-}
-
 #endif
 
 } // iris_unicode_test

From 6ca87b1300d5bb220992382e0313eb1faf5b91e8 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 21:13:26 +0900
Subject: [PATCH 09/17] Refactor until `increment`/`decrement`

---
 include/iris/unicode/string.hpp | 53 +++++++++++++++++++--------------
 test/unicode/string/string.cpp  | 28 ++++++++---------
 2 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index 1bada79..d54b1ff 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -38,6 +38,8 @@ DEALINGS IN THE SOFTWARE.
 #include <utility>
 #include <ranges>
 
+#include <cassert>
+
 namespace iris::unicode {
 
 template<class T>
@@ -434,11 +436,11 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
 
     // Save the original value of it so we can go back in case of failure
     // Of course, it does not make much sense with i.e. stream iterators
-    It original_it = it;
+    It const original_it = it;
 
     char32_t cp = 0;
     // Determine the sequence length based on the lead octet
-    const int length = detail::sequence_length(it);
+    int const length = detail::sequence_length(it);
 
     // Get trail octets and calculate the code point
     utf_error err{};
@@ -460,26 +462,24 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
     default:
         std::unreachable();
     }
+    if (err != utf_error::OK) {
+        it = original_it;
+        return err;
+    }
 
-    if (err == utf_error::OK) {
-        // Decoding succeeded. Now, security checks...
-        if (detail::is_code_point_valid(cp)) {
-            if (!detail::is_overlong_sequence(cp, length)) {
-                // Passed! Return here.
-                code_point = cp;
-                ++it;
-                return utf_error::OK;
-            } else {
-                err = utf_error::OVERLONG_SEQUENCE;
-            }
-        } else {
-            err = utf_error::INVALID_CODE_POINT;
+    if (detail::is_code_point_valid(cp)) {
+        if (!detail::is_overlong_sequence(cp, length)) {
+            code_point = cp;
+            ++it;
+            return utf_error::OK;
         }
+
+        it = original_it;
+        return utf_error::OVERLONG_SEQUENCE;
     }
 
-    // Failure branch - restore the original value of the iterator
     it = original_it;
-    return err;
+    return utf_error::INVALID_CODE_POINT;
 }
 
 template<octet_input_iterator It, std::sentinel_for<It> Se>
@@ -798,14 +798,18 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     switch (detail::validate_next(it, end, cp)) {
     case detail::utf_error::OK:
         break;
+
     case detail::utf_error::NOT_ENOUGH_SPACE:
         throw not_enough_space();
+
     case detail::utf_error::INVALID_LEAD:
     case detail::utf_error::INCOMPLETE_SEQUENCE:
     case detail::utf_error::OVERLONG_SEQUENCE:
         throw invalid_utf8(static_cast<char8_t>(*it));
+
     case detail::utf_error::INVALID_CODE_POINT:
         throw invalid_code_point(cp);
+
     default:
         std::unreachable();
     }
@@ -1017,35 +1021,40 @@ class iterator
             }
         }
     }
-    // the default "big three" are OK
+
     [[nodiscard]] constexpr It base() const { return it; }
+
     [[nodiscard]] constexpr char32_t operator*() const
     {
         It temp = it;
         return unicode::next(temp, range_end);
     }
-    [[nodiscard]] constexpr bool operator==(const iterator& rhs) const
+
+    [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept
     {
-        if (range_start != rhs.range_start || range_end != rhs.range_end)
-            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
-        return (it == rhs.it);
+        assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed");
+        return it == rhs.it;
     }
+
     constexpr iterator& operator++()
     {
         (void)unicode::next(it, range_end);
         return *this;
     }
+
     constexpr iterator operator++(int)
     {
         iterator temp = *this;
         (void)unicode::next(it, range_end);
         return temp;
     }
+
     constexpr iterator& operator--()
     {
         (void)unicode::prev(it, range_start);
         return *this;
     }
+
     constexpr iterator operator--(int)
     {
         iterator temp = *this;
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index 47100ff..fba0bda 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -270,34 +270,34 @@ TEST_CASE("starts_with_bom")
     CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars}));
 }
 
-#if 0
-
 TEST_CASE("increment")
 {
-    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<char const*> it(threechars, threechars, threechars + 9);
-    iris::utflib::iterator<char const*> it2 = it;
+    constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    unicode::iterator<char const*> it(threechars, threechars, threechars + 9);
+    unicode::iterator<char const*> it2 = it;
     CHECK(it2 == it);
     CHECK(*it == 0x10346);
-    CHECK(*(++it) == 0x65e5);
-    CHECK((*it++) == 0x65e5);
+    CHECK(*++it == 0x65e5);
+    CHECK(*it++ == 0x65e5);
     CHECK(*it == 0x0448);
-    EXPECT_NE (it, it2);
-    iris::utflib::iterator<char const*> endit (threechars + 9, threechars, threechars + 9);
+    CHECK(it != it2);
+    unicode::iterator<char const*> endit(threechars + 9, threechars, threechars + 9);
     CHECK(++it == endit);
 }
 
 TEST_CASE("decrement")
 {
-    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    iris::utflib::iterator<char const*> it(threechars+9, threechars, threechars + 9);
-    CHECK(*(--it) == 0x0448);
-    CHECK((*it--) == 0x0448);
+    constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    unicode::iterator<char const*> it(threechars + 9, threechars, threechars + 9);
+    CHECK(*--it == 0x0448);
+    CHECK(*it-- == 0x0448);
     CHECK(*it == 0x65e5);
-    CHECK(--it == iris::utflib::iterator<char const*>(threechars, threechars, threechars + 9));
+    CHECK(--it == unicode::iterator<char const*>(threechars, threechars, threechars + 9));
     CHECK(*it == 0x10346);
 }
 
+#if 0
+
 TEST_CASE("utf32to8")
 {
     char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};

From 8f06ee4a048aff92e49c8a7fe9f0de6adc6f142b Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 21:15:08 +0900
Subject: [PATCH 10/17] Refactor until "string_class_and_literals"

---
 test/unicode/string/string.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index fba0bda..b25aca6 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -194,6 +194,14 @@ TEST_CASE("is_valid")
         std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates);
         CHECK(unicode::is_valid(utf8_with_surrogates));
     }
+
+    {
+        constexpr char const* twochars = "ab";
+        CHECK(unicode::is_valid(twochars));
+
+        std::string const two_chars_string(twochars);
+        CHECK(unicode::is_valid(two_chars_string));
+    }
 }
 
 TEST_CASE("find_invalid")
@@ -397,15 +405,6 @@ TEST_CASE("utf8to32")
     CHECK(utf32result.size() == 2);
 }
 
-TEST_CASE("string_class_and_literals")
-{
-    char const* twochars = "ab";
-    CHECK(is_valid(twochars));
-    std::string const two_chars_string(twochars);
-    CHECK(is_valid(two_chars_string));
-}
-
-
 TEST_CASE("utf16tou8")
 {
     std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};

From 96d1476ade49622c3dddc0e1f9a96bc7ce3b3fc6 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 23:11:29 +0900
Subject: [PATCH 11/17] Refactor until string conversion

---
 include/iris/unicode/string.hpp | 143 +++++++++++++-----
 test/unicode/string/string.cpp  | 249 +++++++++++++++++---------------
 2 files changed, 235 insertions(+), 157 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index d54b1ff..eb61690 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -875,6 +875,37 @@ distance(It first, Se last)
     return dist;
 }
 
+// --------------------------------
+
+template<utf8_input_iterator It, std::sentinel_for<It> Se, utf16_output_iterator OutIt>
+constexpr OutIt utf8to16(It start, Se end, OutIt out)
+{
+    while (start != end) {
+        char32_t const cp = unicode::next(start, end);
+        if (cp > 0xffff) { // make a surrogate pair
+            *out++ = static_cast<char16_t>((cp >> 10) + detail::LEAD_OFFSET);
+            *out++ = static_cast<char16_t>((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
+        } else {
+            *out++ = static_cast<char16_t>(cp);
+        }
+    }
+    return out;
+}
+
+[[nodiscard]] constexpr std::u16string utf8to16(std::string_view str)
+{
+    std::u16string result;
+    unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result));
+    return result;
+}
+
+[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view str)
+{
+    std::u16string result;
+    unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result));
+    return result;
+}
+
 template<utf16_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator OutIt>
 constexpr OutIt utf16to8(It start, Se end, OutIt out)
 {
@@ -902,46 +933,40 @@ constexpr OutIt utf16to8(It start, Se end, OutIt out)
     return out;
 }
 
-[[nodiscard]] constexpr std::string utf16to8(std::u16string_view s)
+[[nodiscard]] constexpr std::string utf16to8(std::u16string_view str)
 {
     std::string result;
-    unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+    unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
-[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view s)
+[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view str)
 {
     std::u8string result;
-    unicode::utf16to8(s.begin(), s.end(), std::back_inserter(result));
+    unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
-template<utf8_input_iterator It, std::sentinel_for<It> Se, utf16_output_iterator OutIt>
-constexpr OutIt utf8to16(It start, Se end, OutIt out)
+template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
+constexpr OutIt utf8to32(It start, Se end, OutIt out)
 {
     while (start != end) {
-        char32_t const cp = unicode::next(start, end);
-        if (cp > 0xffff) { // make a surrogate pair
-            *out++ = static_cast<char16_t>((cp >> 10) + detail::LEAD_OFFSET);
-            *out++ = static_cast<char16_t>((cp & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
-        } else {
-            *out++ = static_cast<char16_t>(cp);
-        }
+        *out++ = unicode::next(start, end);
     }
     return out;
 }
 
-[[nodiscard]] constexpr std::u16string utf8to16(std::string_view s)
+[[nodiscard]] constexpr std::u32string utf8to32(std::string_view str)
 {
-    std::u16string result;
-    unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+    std::u32string result;
+    unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
-[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view s)
+[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view str)
 {
-    std::u16string result;
-    unicode::utf8to16(s.begin(), s.end(), std::back_inserter(result));
+    std::u32string result;
+    unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
@@ -954,44 +979,86 @@ constexpr OutIt utf32to8(It start, Se end, OutIt out)
     return out;
 }
 
-[[nodiscard]] constexpr std::string utf32to8(std::u32string_view s)
+[[nodiscard]] constexpr std::string utf32to8(std::u32string_view str)
 {
     std::string result;
-    unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+    unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
-[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view s)
+[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view str)
 {
     std::u8string result;
-    unicode::utf32to8(s.begin(), s.end(), std::back_inserter(result));
+    unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result));
     return result;
 }
 
-template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
-constexpr OutIt utf8to32(It start, Se end, OutIt out)
+
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::string_view str)
 {
-    while (start != end) {
-        *out++ = unicode::next(start, end);
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return std::u8string{std::from_range, str};
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return unicode::utf8to16(str);
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        return unicode::utf8to32(str);
+    } else {
+        static_assert(std::same_as<CharT, char>);
+        return std::string{str};
     }
-    return out;
 }
 
-[[nodiscard]] constexpr std::u32string utf8to32(std::string_view s)
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u8string_view str)
 {
-    std::u32string result;
-    unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
-    return result;
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return std::u8string{str};
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return unicode::utf8to16(str);
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        return unicode::utf8to32(str);
+    } else {
+        static_assert(std::same_as<CharT, char>);
+        return std::string{std::from_range, str};
+    }
 }
 
-[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view s)
-{
-    std::u32string result;
-    unicode::utf8to32(s.begin(), s.end(), std::back_inserter(result));
-    return result;
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u16string_view str)
+{
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return unicode::utf16tou8(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return std::u16string{str};
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        static_assert(false, "not implemented");
+        return {}; // dummy
+        //return unicode::utf16to32(str);
+    } else {
+        static_assert(std::same_as<CharT, char>);
+        return unicode::utf16to8(str);
+    }
 }
 
-// The iterator class
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u32string_view str)
+{
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return unicode::utf32tou8(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        static_assert(false, "not implemented");
+        return {}; // dummy
+        //return unicode::utf32to16(str);
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        return std::u32string{str};
+    } else {
+        static_assert(std::same_as<CharT, char>);
+        return unicode::utf32to8(str);
+    }
+}
+
+
 template<octet_input_iterator It>
 class iterator
 {
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index b25aca6..eab9eb1 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -187,11 +187,11 @@ TEST_CASE("is_valid")
         CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9));
     }
     {
-        std::u8string const utf_invalid_u8(std::from_range, utf_invalid);
+        std::u8string const utf_invalid_u8(reinterpret_cast<char8_t const*>(utf_invalid));
         CHECK(!unicode::is_valid(utf_invalid_u8));
     }
     {
-        std::u8string const utf8_with_surrogates_u8(std::from_range, utf8_with_surrogates);
+        std::u8string const utf8_with_surrogates_u8(reinterpret_cast<char8_t const*>(utf8_with_surrogates));
         CHECK(unicode::is_valid(utf8_with_surrogates));
     }
 
@@ -224,7 +224,7 @@ TEST_CASE("find_invalid")
         CHECK(invalid_pos == 5);
     }
     {
-        std::u8string const utf_invalid_u8(std::from_range, utf_invalid);
+        std::u8string const utf_invalid_u8(reinterpret_cast<char8_t const*>(utf_invalid));
         std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8);
         CHECK(invalid_pos == 5);
     }
@@ -255,11 +255,11 @@ TEST_CASE("replace_invalid (string)")
 
 TEST_CASE("replace_invalid (u8string)")
 {
-    std::u8string const invalid_sequence(std::from_range, "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z");
+    std::u8string const invalid_sequence(reinterpret_cast<char8_t const*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"));
     std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?');
 
     CHECK(unicode::is_valid(replace_invalid_result));
-    std::u8string const fixed_invalid_sequence(std::from_range, "a????z");
+    std::u8string const fixed_invalid_sequence(reinterpret_cast<char8_t const*>("a????z"));
     CHECK(fixed_invalid_sequence == replace_invalid_result);
 }
 
@@ -275,7 +275,7 @@ TEST_CASE("starts_with_bom")
     CHECK(!unicode::starts_with_bom(threechars));
     CHECK(!unicode::starts_with_bom(std::string{threechars}));
     CHECK(!unicode::starts_with_bom(std::string_view{threechars}));
-    CHECK(!unicode::starts_with_bom(std::u8string{std::from_range, threechars}));
+    CHECK(!unicode::starts_with_bom(std::u8string{reinterpret_cast<char8_t const*>(threechars)}));
 }
 
 TEST_CASE("increment")
@@ -304,141 +304,152 @@ TEST_CASE("decrement")
     CHECK(*it == 0x10346);
 }
 
-#if 0
-
-TEST_CASE("utf32to8")
-{
-    char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-    std::string utf8result;
-    iris::utflib::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-    CHECK(utf8result.size() == 9);
-}
-
-TEST_CASE("utf8to32")
-{
-    char const* twochars = "\xe6\x97\xa5\xd1\x88";
-    std::vector<unsigned int> utf32result;
-    iris::utflib::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-    CHECK(utf32result.size() == 2);
-}
-
-TEST_CASE("utf16to8")
-{
-    char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    std::string utf8result;
-    iris::utflib::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-    CHECK(utf8result.size() == 10);
-}
+// -----------------------------------
 
 TEST_CASE("utf8to16")
 {
-    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    std::vector<char16_t> utf16result;
-    iris::utflib::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-    CHECK(utf16result.size() == 4);
-    CHECK(utf16result[2] == 0xd834);
-    CHECK(utf16result[3] == 0xdd1e);
+    {
+        constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::vector<char16_t> utf16result;
+        unicode::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+    }
+    {
+        std::string const utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+        // Just to make sure it compiles with string literals
+        CHECK(unicode::utf8to16(u8"simple") == u"simple");
+        CHECK(unicode::utf8to16("simple") == u"simple");
+    }
+    {
+        constexpr std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+    }
+    {
+        std::u8string const utf8_with_surrogates{reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e")};
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+    }
 }
 
 TEST_CASE("utf16to8")
 {
-    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    std::string u = utf16to8(utf16string);
-    CHECK(u.size() == 10);
-}
-
-TEST_CASE("utf8to16")
-{
-    std::string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    std::u16string utf16result = utf8to16(utf8_with_surrogates);
-    CHECK(utf16result.size() == 4);
-    CHECK(utf16result[2] == 0xd834);
-    CHECK(utf16result[3] == 0xdd1e);
-    // Just to make sure it compiles with std::string literals
-    CHECK(utf8to16(u8"simple") == u"simple");
-    CHECK(utf8to16("simple") == u"simple");
+    {
+        constexpr char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::string utf8result;
+        unicode::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+        CHECK(utf8result.size() == 10);
+    }
+    {
+        std::u16string const utf16string{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::string const u = unicode::utf16to8(utf16string);
+        CHECK(u.size() == 10);
+    }
+    {
+        std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::u16string_view const utf16stringview(utf16string);
+        std::string const u = unicode::utf16to8(utf16stringview);
+        CHECK(u.size() == 10);
+    }
+    {
+        std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::u16string_view const utf16stringview{utf16string};
+        {
+            std::u8string const u = unicode::utf16tou8(utf16string);
+            CHECK(u.size() == 10);
+        }
+        {
+            std::u8string const u = unicode::utf16tou8(utf16stringview);
+            CHECK(u.size() == 10);
+        }
+    }
 }
 
-TEST_CASE("utf32to8")
-{
-    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
-    std::string utf8result = utf32to8(utf32string);
-    CHECK(utf8result.size() == 9);
-}
+// -----------------------------------------
 
 TEST_CASE("utf8to32")
 {
-    char const* twochars = "\xe6\x97\xa5\xd1\x88";
-    std::u32string utf32result = utf8to32(twochars);
-    CHECK(utf32result.size() == 2);
-}
-
-TEST_CASE("utf16to8")
-{
-    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    u16string_view utf16stringview(utf16string);
-    std::string u = utf16to8(utf16stringview);
-    CHECK(u.size() == 10);
-}
-
-TEST_CASE("utf8to16")
-{
-    std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-    std::u16string utf16result = utf8to16(utf8_with_surrogates);
-    CHECK(utf16result.size() == 4);
-    CHECK(utf16result[2] == 0xd834);
-    CHECK(utf16result[3] == 0xdd1e);
+    {
+        constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88";
+        std::vector<unsigned int> utf32result;
+        unicode::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88";
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        constexpr std::string_view twochars = "\xe6\x97\xa5\xd1\x88";
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        std::u8string const twochars{reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88")};
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
 }
 
 TEST_CASE("utf32to8")
 {
-    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
-    u32string_view utf32stringview(utf32string);
-    std::string utf8result = utf32to8(utf32stringview);
-    CHECK(utf8result.size() == 9);
-}
-
-TEST_CASE("utf8to32")
-{
-    std::string_view twochars = "\xe6\x97\xa5\xd1\x88";
-    std::u32string utf32result = utf8to32(twochars);
-    CHECK(utf32result.size() == 2);
+    {
+        constexpr char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+        std::string utf8result;
+        unicode::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::string const utf8result = unicode::utf32to8(utf32string);
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::u32string_view const utf32stringview(utf32string);
+        std::string const utf8result = unicode::utf32to8(utf32stringview);
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::u32string_view const utf32stringview{utf32string};
+        std::u8string const utf8result = unicode::utf32tou8(utf32stringview);
+        CHECK(utf8result.size() == 9);
+    }
 }
 
-TEST_CASE("utf16tou8")
+TEST_CASE("transcode")
 {
-    std::u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-    u16string_view utf16stringview{utf16string};
-    std::u8string u = utf16tou8(utf16string);
-    CHECK(u.size() == 10);
-    u = utf16tou8(utf16stringview);
-    CHECK(u.size() == 10);
-}
+    STATIC_CHECK(unicode::transcode<char>("aこれはb試験ですc")       == "aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char>(u8"aこれはb試験ですc")     == "aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char>(u"aこれはb試験ですc")      == "aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char>(U"aこれはb試験ですc")      == "aこれはb試験ですc");
 
-TEST_CASE("utf8to16")
-{
-    std::u8string utf8_with_surrogates{ reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e") };
-    std::u16string utf16result = utf8to16(utf8_with_surrogates);
-    CHECK(utf16result.size() == 4);
-    CHECK(utf16result[2] == 0xd834);
-    CHECK(utf16result[3] == 0xdd1e);
-}
+    STATIC_CHECK(unicode::transcode<char8_t>("aこれはb試験ですc")    == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(u8"aこれはb試験ですc")  == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(u"aこれはb試験ですc")   == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(U"aこれはb試験ですc")   == u8"aこれはb試験ですc");
 
-TEST_CASE("utf32tou8")
-{
-    std::u32string utf32string = {0x448, 0x65E5, 0x10346};
-    u32string_view utf32stringview{utf32string};
-    std::u8string utf8result = utf32tou8(utf32stringview);
-    CHECK(utf8result.size() == 9);
-}
+    STATIC_CHECK(unicode::transcode<char16_t>("aこれはb試験ですc")   == u"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char16_t>(u8"aこれはb試験ですc") == u"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char16_t>(u"aこれはb試験ですc")  == u"aこれはb試験ですc");
+    //STATIC_CHECK(unicode::transcode<char16_t>(U"aこれはb試験ですc")  == u"aこれはb試験ですc");
 
-TEST_CASE("utf8to32")
-{
-    std::u8string twochars = reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88");
-    std::u32string utf32result = utf8to32(twochars);
-    CHECK(utf32result.size() == 2);
+    STATIC_CHECK(unicode::transcode<char32_t>("aこれはb試験ですc")   == U"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char32_t>(u8"aこれはb試験ですc") == U"aこれはb試験ですc");
+    //STATIC_CHECK(unicode::transcode<char32_t>(u"aこれはb試験ですc")  == U"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char32_t>(U"aこれはb試験ですc")  == U"aこれはb試験ですc");
 }
 
-#endif
-
 } // iris_unicode_test

From a183495a55369e0c31df1e2b0aa39d35edabaf56 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sat, 7 Mar 2026 23:13:57 +0900
Subject: [PATCH 12/17] Refactor class `iterator`

---
 include/iris/unicode/string.hpp | 147 ++++++++++++++++----------------
 1 file changed, 74 insertions(+), 73 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index eb61690..e63f345 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -875,7 +875,80 @@ distance(It first, Se last)
     return dist;
 }
 
-// --------------------------------
+// ------------------------------------
+
+template<octet_input_iterator It>
+class iterator
+{
+    It it;
+    It range_start;
+    It range_end;
+
+public:
+    using value_type = char32_t;
+    using pointer = char32_t*;
+    using reference = char32_t&;
+    using difference_type = std::ptrdiff_t;
+    using iterator_category = std::bidirectional_iterator_tag;
+
+    constexpr iterator()
+        requires std::is_default_constructible_v<It>
+    = default;
+
+    constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
+        : it(std::move(octet_it))
+        , range_start(std::move(rangestart))
+        , range_end(std::move(rangeend))
+    {
+        if constexpr (std::random_access_iterator<It>) {
+            if (it < range_start || it > range_end) {
+                throw std::out_of_range("Invalid utf-8 iterator position");
+            }
+        }
+    }
+
+    [[nodiscard]] constexpr It base() const { return it; }
+
+    [[nodiscard]] constexpr char32_t operator*() const
+    {
+        It temp = it;
+        return unicode::next(temp, range_end);
+    }
+
+    [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept
+    {
+        assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed");
+        return it == rhs.it;
+    }
+
+    constexpr iterator& operator++()
+    {
+        (void)unicode::next(it, range_end);
+        return *this;
+    }
+
+    [[nodiscard]] constexpr iterator operator++(int)
+    {
+        iterator temp = *this;
+        (void)unicode::next(it, range_end);
+        return temp;
+    }
+
+    constexpr iterator& operator--()
+    {
+        (void)unicode::prev(it, range_start);
+        return *this;
+    }
+
+    [[nodiscard]] constexpr iterator operator--(int)
+    {
+        iterator temp = *this;
+        (void)unicode::prev(it, range_start);
+        return temp;
+    }
+};
+
+// ------------------------------------
 
 template<utf8_input_iterator It, std::sentinel_for<It> Se, utf16_output_iterator OutIt>
 constexpr OutIt utf8to16(It start, Se end, OutIt out)
@@ -1058,78 +1131,6 @@ template<class CharT>
     }
 }
 
-
-template<octet_input_iterator It>
-class iterator
-{
-    It it;
-    It range_start;
-    It range_end;
-
-public:
-    using value_type = char32_t;
-    using pointer = char32_t*;
-    using reference = char32_t&;
-    using difference_type = std::ptrdiff_t;
-    using iterator_category = std::bidirectional_iterator_tag;
-
-    constexpr iterator()
-        requires std::is_default_constructible_v<It>
-    = default;
-
-    constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
-        : it(std::move(octet_it))
-        , range_start(std::move(rangestart))
-        , range_end(std::move(rangeend))
-    {
-        if constexpr (std::random_access_iterator<It>) {
-            if (it < range_start || it > range_end) {
-                throw std::out_of_range("Invalid utf-8 iterator position");
-            }
-        }
-    }
-
-    [[nodiscard]] constexpr It base() const { return it; }
-
-    [[nodiscard]] constexpr char32_t operator*() const
-    {
-        It temp = it;
-        return unicode::next(temp, range_end);
-    }
-
-    [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept
-    {
-        assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed");
-        return it == rhs.it;
-    }
-
-    constexpr iterator& operator++()
-    {
-        (void)unicode::next(it, range_end);
-        return *this;
-    }
-
-    constexpr iterator operator++(int)
-    {
-        iterator temp = *this;
-        (void)unicode::next(it, range_end);
-        return temp;
-    }
-
-    constexpr iterator& operator--()
-    {
-        (void)unicode::prev(it, range_start);
-        return *this;
-    }
-
-    constexpr iterator operator--(int)
-    {
-        iterator temp = *this;
-        (void)unicode::prev(it, range_start);
-        return temp;
-    }
-};
-
 } // iris::unicode
 
 #endif

From 2071cd7c2b6d9d4980a855cb09d78d5872f13947 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sun, 8 Mar 2026 00:28:26 +0900
Subject: [PATCH 13/17] Port test "utf8_invalid"

---
 test/CMakeLists.txt                  | 23 ++++++-
 test/unicode/string/string.cpp       |  1 -
 test/unicode/string/utf8_invalid.cpp | 90 ++++++++++++++++------------
 3 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 77d4c02..952d4eb 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -116,11 +116,32 @@ function(_iris_define_test_impl test_name libs)
     target_link_libraries(${test_name}_test PRIVATE Iris::Iris iris_cxx_test ${libs})
     add_test(NAME ${test_name}_test COMMAND ${test_name}_test --colour-mode=ansi)
 
+    set_tests_properties(
+        ${test_name}_test PROPERTIES
+        ENVIRONMENT "IRIS_ROOT=${IRIS_ROOT}"
+    )
     if(MSVC)
+        set(
+            VS_DEBUGGER_ENVIRONMENT_LIST
+            "PATH=$(VC_ExecutablePath_x64)\;%PATH%"
+            "ASAN_SYMBOLIZER_PATH=$(VC_ExecutablePath_x64)\\llvm-symbolizer.exe"
+            "IRIS_ROOT=$<SHELL_PATH:${IRIS_ROOT}>"
+        )
+        list(JOIN VS_DEBUGGER_ENVIRONMENT_LIST "\n" VS_DEBUGGER_ENVIRONMENT)
+
+        set_target_properties(
+            ${test_name}_test PROPERTIES
+            VS_DEBUGGER_ENVIRONMENT "${VS_DEBUGGER_ENVIRONMENT}"
+        )
+
         get_property(IRIS_MSVC_ASAN_DIR GLOBAL PROPERTY IRIS_MSVC_ASAN_DIR)
+        set(
+            ENV_MODIFICATION
+            "PATH=path_list_append:${IRIS_MSVC_ASAN_DIR}"
+        )
         set_tests_properties(
             ${test_name}_test PROPERTIES
-            ENVIRONMENT "PATH=${IRIS_MSVC_ASAN_DIR};$ENV{PATH}"
+            ENVIRONMENT_MODIFICATION "${ENV_MODIFICATION}"
         )
     endif()
 endfunction()
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index eab9eb1..9517be0 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -6,7 +6,6 @@
 #include <algorithm>
 #include <string>
 #include <array>
-#include <ranges>
 
 #include <cstdint>
 
diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp
index 7dd6588..b7f46b7 100644
--- a/test/unicode/string/utf8_invalid.cpp
+++ b/test/unicode/string/utf8_invalid.cpp
@@ -1,63 +1,77 @@
+// TODO: we need secure "getenv" in iris library
+#define _CRT_SECURE_NO_WARNINGS 1
+
+#include "iris_test.hpp"
+
 #include <iris/unicode/string.hpp>
+#include <iris/exception.hpp>
 
+#include <stdexcept>
 #include <string>
-#include <iostream>
 #include <fstream>
+#include <filesystem>
 #include <algorithm>
+#include <array>
 
-using namespace std;
-using namespace iris::unicode;
+namespace iris_unicode_test {
 
-const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264};
-const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
+constexpr auto INVALID_LINES = std::to_array<unsigned>({
+    75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109,
+    110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153,
+    154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176,
+    177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232,
+    233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257,
+    258, 259, 260, 261, 262, 263, 264,
+});
 
-#if 0
-int main(int argc, char** argv)
+TEST_CASE("utf8_invalid")
 {
-    string test_file_path;
-    if (argc == 2)
-        test_file_path = argv[1];
-    else {
-        cout << "Wrong number of arguments" << endl;
-        return 1;
-    }
-    // Open the test file
-    ifstream fs8(test_file_path.c_str());
-    if (!fs8.is_open()) {
-        cout << "Could not open " << test_file_path << endl;
-        return 1;
+    namespace unicode = iris::unicode;
+    using iris::throwf;
+
+    std::filesystem::path const IRIS_ROOT = [] {
+        char const* IRIS_ROOT_str = std::getenv("IRIS_ROOT");
+        if (!IRIS_ROOT_str) throwf<std::invalid_argument>("IRIS_ROOT is not defined");
+        return std::filesystem::path(IRIS_ROOT_str);
+    }();
+
+    auto const test_file_path = IRIS_ROOT / "test" / "unicode" / "string" / "test_data" / "utf8_invalid.txt";
+    std::ifstream fs8(test_file_path, std::ios::binary); // raw octets: no text-mode newline/EOF translation
+    if (!fs8) {
+        throwf<std::invalid_argument>("could not open \"{}\"", test_file_path.string());
     }
 
     // Read it line by line
-    unsigned int line_count = 0;
-    char byte;
+    unsigned line_count = 0;
     while (!fs8.eof()) {
-        string line;
-        while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
+        std::string line;
+
+        char byte;
+        while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof()) {
             line.push_back(byte);
+        }
+
+        ++line_count;
+        bool const expected_valid = std::ranges::find(INVALID_LINES, line_count) == INVALID_LINES.end();
 
-        line_count++;
-        bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
         // Print out lines that contain unexpected invalid UTF-8
-        if (!is_valid(line.begin(), line.end())) {
+        if (!unicode::is_valid(line.begin(), line.end())) {
             if (expected_valid) {
-                cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
-                return 1;
+                throwf<std::runtime_error>("unexpected invalid utf-8 at line {}", line_count);
             }
 
             // try fixing it:
-            string fixed_line;
-            replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
-            if (!is_valid(fixed_line.begin(), fixed_line.end())) {
-                cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
-                return 1;
+            std::string fixed_line;
+            unicode::replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
+            if (!unicode::is_valid(fixed_line.begin(), fixed_line.end())) {
+                throwf<std::runtime_error>("replace_invalid() resulted in an invalid utf-8 at line {}", line_count);
             }
-        }
-        else if (!expected_valid) {
-            cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
-            return 1;
+
+        } else if (!expected_valid) {
+            throwf<std::runtime_error>("invalid utf-8 NOT detected at line {}", line_count);
         }
     }
+    CHECK(true);
 }
 
-#endif
+} // iris_unicode_test

From 65d37a2f34d2ed5aa15bb9d8c74c05d76fb0bae8 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sun, 8 Mar 2026 00:33:39 +0900
Subject: [PATCH 14/17] Enable unicode string tests in CI

---
 test/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 952d4eb..e01f75a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -190,9 +190,7 @@ if(PROJECT_IS_TOP_LEVEL)
         foreach(test_name IN LISTS IRIS_TEST_IRIS_TESTS)
             iris_define_test_headers(iris_${test_name} iris_test.hpp)
         endforeach()
-    endif()
 
-    if(NOT DEFINED IRIS_CI_COMPONENT OR IRIS_CI_COMPONENT STREQUAL unicode)
         add_subdirectory(unicode)
     endif()
 endif()

From 92500773e9940dc059704974e86d69b9d51373e0 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sun, 8 Mar 2026 00:41:11 +0900
Subject: [PATCH 15/17] Degrade `std::from_range` constructor

---
 include/iris/unicode/string.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index e63f345..6dc0a3a 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -1071,7 +1071,7 @@ template<class CharT>
 [[nodiscard]] constexpr std::basic_string<CharT> transcode(std::string_view str)
 {
     if constexpr (std::same_as<CharT, char8_t>) {
-        return std::u8string{std::from_range, str};
+        return std::u8string{str.begin(), str.end()};
     } else if constexpr (std::same_as<CharT, char16_t>) {
         return unicode::utf8to16(str);
     } else if constexpr (std::same_as<CharT, char32_t>) {
@@ -1093,7 +1093,7 @@ template<class CharT>
         return unicode::utf8to32(str);
     } else {
         static_assert(std::same_as<CharT, char>);
-        return std::string{std::from_range, str};
+        return std::string{str.begin(), str.end()};
     }
 }
 

From fb357682a282d2cb1e1c5dbb1db67edd9a8c8492 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Sun, 8 Mar 2026 17:25:55 +0900
Subject: [PATCH 16/17] Refactor iterator

---
 include/iris/unicode/string.hpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index 6dc0a3a..d6640da 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -878,7 +878,7 @@ distance(It first, Se last)
 // ------------------------------------
 
 template<octet_input_iterator It>
-class iterator
+class code_point_iterator
 {
     It it;
     It range_start;
@@ -891,17 +891,17 @@ class iterator
     using difference_type = std::ptrdiff_t;
     using iterator_category = std::bidirectional_iterator_tag;
 
-    constexpr iterator()
+    constexpr code_point_iterator()
         requires std::is_default_constructible_v<It>
     = default;
 
-    constexpr explicit iterator(It octet_it, It rangestart, It rangeend)
-        : it(std::move(octet_it))
-        , range_start(std::move(rangestart))
-        , range_end(std::move(rangeend))
+    constexpr code_point_iterator(It it, It range_start, It range_end)
+        : it(std::move(it))
+        , range_start(std::move(range_start))
+        , range_end(std::move(range_end))
     {
         if constexpr (std::random_access_iterator<It>) {
-            if (it < range_start || it > range_end) {
+            if (this->it < this->range_start || this->it > this->range_end) {
                 throw std::out_of_range("Invalid utf-8 iterator position");
             }
         }
@@ -915,34 +915,34 @@ class iterator
         return unicode::next(temp, range_end);
     }
 
-    [[nodiscard]] constexpr bool operator==(iterator const& rhs) const noexcept
+    [[nodiscard]] constexpr bool operator==(code_point_iterator const& rhs) const noexcept
     {
         assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed");
         return it == rhs.it;
     }
 
-    constexpr iterator& operator++()
+    constexpr code_point_iterator& operator++()
     {
         (void)unicode::next(it, range_end);
         return *this;
     }
 
-    [[nodiscard]] constexpr iterator operator++(int)
+    [[nodiscard]] constexpr code_point_iterator operator++(int)
     {
-        iterator temp = *this;
+        code_point_iterator temp = *this;
         (void)unicode::next(it, range_end);
         return temp;
     }
 
-    constexpr iterator& operator--()
+    constexpr code_point_iterator& operator--()
     {
         (void)unicode::prev(it, range_start);
         return *this;
     }
 
-    [[nodiscard]] constexpr iterator operator--(int)
+    [[nodiscard]] constexpr code_point_iterator operator--(int)
     {
-        iterator temp = *this;
+        code_point_iterator temp = *this;
         (void)unicode::prev(it, range_start);
         return temp;
     }
@@ -1066,6 +1066,7 @@ constexpr OutIt utf32to8(It start, Se end, OutIt out)
     return result;
 }
 
+// TODO: add single char variations
 
 template<class CharT>
 [[nodiscard]] constexpr std::basic_string<CharT> transcode(std::string_view str)

From 00eee1765a0fee478c9818b7ddf777240f95ce50 Mon Sep 17 00:00:00 2001
From: Nana Sakisaka <1901813+saki7@users.noreply.github.com>
Date: Mon, 9 Mar 2026 00:05:37 +0900
Subject: [PATCH 17/17] Add `bounded_prev`/`bounded_next`

---
 CMakeLists.txt                  |   6 +-
 include/iris/unicode/string.hpp | 102 ++++++++++++++++++++++++++++++--
 test/CMakeLists.txt             |   5 ++
 test/unicode/string/string.cpp  |  10 ++--
 4 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94d6d05..fcd3dea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,9 +8,9 @@ endif()
 
 project(iris VERSION 0.0.1 LANGUAGES CXX)
 
-if(NOT DEFINED IRIS_ROOT)
-    set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}")
-endif()
+set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}")
+set_property(GLOBAL PROPERTY IRIS_ROOT "${IRIS_ROOT}")
+
 
 # -----------------------------------------------------------------
 # Global settings
diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
index d6640da..accbf24 100644
--- a/include/iris/unicode/string.hpp
+++ b/include/iris/unicode/string.hpp
@@ -252,7 +252,7 @@ constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROG
 // Maximum valid value for a Unicode code point
 constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
 
-enum class utf_error
+enum class [[nodiscard]] utf_error
 {
     OK,
     NOT_ENOUGH_SPACE,
@@ -353,7 +353,7 @@ constexpr utf_error increase_safely(It& it, Se end)
         return ret;                                                                                                                                                                                                                        \
     } while (false)
 
-/// get_sequence_x functions decode utf-8 sequences of the length x
+// get_sequence_x functions decode utf-8 sequences of the length x
 template<octet_input_iterator It, std::sentinel_for<It> Se>
 constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
     noexcept(std::conjunction_v<
@@ -485,10 +485,60 @@ constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
 template<octet_input_iterator It, std::sentinel_for<It> Se>
     requires std::forward_iterator<It>
 constexpr utf_error validate_next(It& it, Se end)
-    noexcept(noexcept(detail::validate_next(it, end, std::declval<char32_t&>())))
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
 {
-    char32_t ignored;
-    return detail::validate_next(it, end, ignored);
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+
+    // Save the original value of `it` so we can go back in case of failure.
+    // Of course, this does not make much sense with e.g. stream iterators.
+    It const original_it = it;
+
+    char32_t cp = 0;
+    // Determine the sequence length based on the lead octet
+    int const length = detail::sequence_length(it);
+
+    // Get trail octets and calculate the code point
+    utf_error err{};
+    switch (length) {
+    case 0:
+        return utf_error::INVALID_LEAD;
+    case 1:
+        err = detail::get_sequence_1(it, end, cp);
+        break;
+    case 2:
+        err = detail::get_sequence_2(it, end, cp);
+        break;
+    case 3:
+        err = detail::get_sequence_3(it, end, cp);
+        break;
+    case 4:
+        err = detail::get_sequence_4(it, end, cp);
+        break;
+    default:
+        std::unreachable();
+    }
+    if (err != utf_error::OK) {
+        it = original_it;
+        return err;
+    }
+
+    if (detail::is_code_point_valid(cp)) {
+        if (!detail::is_overlong_sequence(cp, length)) {
+            ++it;
+            return utf_error::OK;
+        }
+
+        it = original_it;
+        return utf_error::OVERLONG_SEQUENCE;
+    }
+
+    it = original_it;
+    return utf_error::INVALID_CODE_POINT;
 }
 
 template<utf16_input_iterator It, std::sentinel_for<It> Se>
@@ -816,6 +866,40 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     return cp;
 }
 
+// Advances `it` by at most `off` code points, stopping early when `last` is reached.
+// Every code point stepped over is checked with `detail::validate_next`.
+// Returns the advanced iterator together with the number of code points stepped over.
+// Throws `not_enough_space`, `invalid_utf8` or `invalid_code_point`, depending on the
+// error reported by `detail::validate_next`.
+template<octet_input_iterator It>
+[[nodiscard]] constexpr std::pair<It, typename std::iterator_traits<It>::difference_type>
+bounded_next(It it, It const last, typename std::iterator_traits<It>::difference_type off = 1)
+{
+    typename std::iterator_traits<It>::difference_type count = 0;
+    for (; it != last && count < off; ++count) {
+        char32_t cp = 0;
+        switch (detail::validate_next(it, last, cp)) {
+        case detail::utf_error::OK:
+            break;
+
+        case detail::utf_error::NOT_ENOUGH_SPACE:
+            throw not_enough_space();
+
+        case detail::utf_error::INVALID_LEAD:
+        case detail::utf_error::INCOMPLETE_SEQUENCE:
+        case detail::utf_error::OVERLONG_SEQUENCE:
+            throw invalid_utf8(static_cast<char8_t>(*it));
+
+        case detail::utf_error::INVALID_CODE_POINT:
+            throw invalid_code_point(cp);
+
+        default:
+            std::unreachable();
+        }
+    }
+    return {it, count};
+}
+
 template<utf16_input_iterator It, std::sentinel_for<It> Se>
 [[nodiscard]] constexpr char32_t next16(It& it, Se end)
 {
@@ -847,6 +926,19 @@ template<octet_input_iterator It, std::sentinel_for<It> Se>
     return unicode::peek_next(it, end);
 }
 
+template<octet_input_iterator It>
+[[nodiscard]] constexpr std::pair<It, typename std::iterator_traits<It>::difference_type>
+bounded_prev(It const start, It it, typename std::iterator_traits<It>::difference_type off = 1) requires std::bidirectional_iterator<It>
+{ // Moves `it` back to the nearest preceding non-trail octet up to `off` times, never before `start`; returns {position, steps taken}.
+    typename std::iterator_traits<It>::difference_type count = 0;
+    for (; it != start && count < off; ++count) {
+        while (detail::is_trail(*--it)) { // `--it` is why a bidirectional iterator is required; skips trail octets
+            if (it == start) throw invalid_utf8(static_cast<char8_t>(*it)); // error - no lead byte in the sequence
+        }
+    }
+    return {it, count};
+}
+
 template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>
 constexpr void advance(It& it, distance_type n, Se end)
 {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e01f75a..5347bca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -97,6 +97,11 @@ function(iris_define_test_headers test_name)
 endfunction()
 
 function(_iris_define_test_impl test_name libs)
+    get_property(IRIS_ROOT GLOBAL PROPERTY IRIS_ROOT)
+    if(NOT DEFINED IRIS_ROOT OR IRIS_ROOT STREQUAL "")
+        message(FATAL_ERROR "IRIS_ROOT is not defined")
+    endif()
+
     add_executable(${test_name}_test ${ARGN})
     target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_FUNCTION_LIST_DIR})
     target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_LIST_DIR})
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
index 9517be0..86c0732 100644
--- a/test/unicode/string/string.cpp
+++ b/test/unicode/string/string.cpp
@@ -280,26 +280,26 @@ TEST_CASE("starts_with_bom")
 TEST_CASE("increment")
 {
     constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    unicode::iterator<char const*> it(threechars, threechars, threechars + 9);
-    unicode::iterator<char const*> it2 = it;
+    unicode::code_point_iterator<char const*> it(threechars, threechars, threechars + 9); // starts on the first octet of the 9-octet buffer
+    unicode::code_point_iterator<char const*> it2 = it; // copy taken before `it` is advanced, used by the comparisons below
     CHECK(it2 == it);
     CHECK(*it == 0x10346);
     CHECK(*++it == 0x65e5);
     CHECK(*it++ == 0x65e5);
     CHECK(*it == 0x0448);
     CHECK(it != it2);
-    unicode::iterator<char const*> endit(threechars + 9, threechars, threechars + 9);
+    unicode::code_point_iterator<char const*> endit(threechars + 9, threechars, threechars + 9); // positioned one past the last octet
     CHECK(++it == endit);
 }
 
 TEST_CASE("decrement")
 {
     constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
-    unicode::iterator<char const*> it(threechars + 9, threechars, threechars + 9);
+    unicode::code_point_iterator<char const*> it(threechars + 9, threechars, threechars + 9); // begins one past the last octet
     CHECK(*--it == 0x0448);
     CHECK(*it-- == 0x0448);
     CHECK(*it == 0x65e5);
-    CHECK(--it == unicode::iterator<char const*>(threechars, threechars, threechars + 9));
+    CHECK(--it == unicode::code_point_iterator<char const*>(threechars, threechars, threechars + 9)); // expected to land back on the first octet
     CHECK(*it == 0x10346);
 }