diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94d6d05..fcd3dea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,9 +8,9 @@ endif()
 
 project(iris VERSION 0.0.1 LANGUAGES CXX)
 
-if(NOT DEFINED IRIS_ROOT)
-    set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}")
-endif()
+set(IRIS_ROOT "${CMAKE_CURRENT_LIST_DIR}")  # directory containing this CMakeLists.txt
+set_property(GLOBAL PROPERTY IRIS_ROOT "${IRIS_ROOT}")  # same value, stored as a global property
+
 
 # -----------------------------------------------------------------
 # Global settings
diff --git a/include/iris/unicode/string.hpp b/include/iris/unicode/string.hpp
new file mode 100644
index 0000000..accbf24
--- /dev/null
+++ b/include/iris/unicode/string.hpp
@@ -0,0 +1,1229 @@
+// Copyright 2006 Nemanja Trifunovic
+// Copyright 2026 The Iris Project Contributors
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef IRIS_UNICODE_STRING_HPP
+#define IRIS_UNICODE_STRING_HPP
+
+#include <concepts>
+#include <cstddef>
+#include <iterator>
+#include <ranges>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+
+#include <cassert>
+
+namespace iris::unicode {
+
+template<class T> // UTF-8 byte order mark (EF BB BF), each octet cast to the requested character type
+constexpr T bom[3]{static_cast<T>(0xef), static_cast<T>(0xbb), static_cast<T>(0xbf)};
+
+template<class T>
+concept octet = std::integral<T> && sizeof(T) == 1; // any integral type exactly one byte wide
+
+template<class T>
+concept utf8char = octet<T> && (std::same_as<T, char> || std::same_as<T, char8_t>); // exactly `char` or `char8_t`
+
+template<class T>
+concept utf16char = std::same_as<T, char16_t>; // exactly `char16_t`
+
+template<class T>
+concept utf32char = std::same_as<T, char32_t>; // exactly `char32_t`
+
+template<class It>
+concept octet_input_iterator = std::input_iterator<It> && octet<std::iter_value_t<It>>; // input iterator whose value type is an octet
+
+template<class It>
+concept utf8_input_iterator = octet_input_iterator<It> && utf8char<std::iter_value_t<It>>; // as above, value type `char` or `char8_t`
+
+template<class It>
+concept utf16_input_iterator = std::input_iterator<It> && utf16char<std::iter_value_t<It>>; // input iterator over `char16_t`
+
+template<class It>
+concept utf32_input_iterator = std::input_iterator<It> && utf32char<std::iter_value_t<It>>; // input iterator over `char32_t`
+
+
+template<class R>
+concept octet_input_range = // an input range whose iterator satisfies octet_input_iterator
+    std::ranges::input_range<R> &&
+    octet_input_iterator<std::ranges::iterator_t<R>>;
+
+
+namespace detail {
+
+template<class OutIt, class DesiredValueT>
+struct select_output_value_type // primary template: used when the constrained specialization below does not apply
+{
+    static_assert(std::output_iterator<OutIt, DesiredValueT>);
+    using type = DesiredValueT; // fall back to the requested type
+};
+
+template<class OutIt, class DesiredValueT>
+    requires requires {
+        typename std::iter_value_t<OutIt>;
+        requires std::convertible_to<DesiredValueT, std::iter_value_t<OutIt>>;
+    }
+struct select_output_value_type<OutIt, DesiredValueT> // OutIt names a value type that DesiredValueT converts to
+{
+    static_assert(std::output_iterator<OutIt, std::iter_value_t<OutIt>>);
+    using type = std::iter_value_t<OutIt>; // prefer the iterator's own value type
+};
+
+template<class OutIt, std::size_t SizeofChar>
+concept maybe_value_type_sized = // OutIt's value type is SizeofChar bytes wide, or OutIt names no value type at all
+    requires {
+        typename std::iter_value_t<OutIt>;
+        requires sizeof(std::iter_value_t<OutIt>) == SizeofChar;
+    } ||
+    !requires {
+        typename std::iter_value_t<OutIt>;
+    };
+
+} // detail
+
+template<class OutIt>
+concept octet_output_iterator = // accepts `char8_t` or `char` writes; a value type, if named, must be 1 byte wide
+    (
+        std::output_iterator<OutIt, char8_t> ||
+        std::output_iterator<OutIt, char>
+    ) &&
+    detail::maybe_value_type_sized<OutIt, 1>;
+
+template<class R>
+concept octet_output_range = // range counterpart of octet_output_iterator
+    (
+        std::ranges::output_range<R, char8_t> ||
+        std::ranges::output_range<R, char>
+    ) &&
+    detail::maybe_value_type_sized<std::ranges::iterator_t<R>, 1>;
+
+template<class OutIt>
+concept utf16_output_iterator = // accepts `char16_t` writes; a value type, if named, must be 2 bytes wide
+    std::output_iterator<OutIt, char16_t> &&
+    detail::maybe_value_type_sized<OutIt, 2>;
+
+template<class R>
+concept utf16_output_range = // range counterpart of utf16_output_iterator
+    std::ranges::output_range<R, char16_t> &&
+    detail::maybe_value_type_sized<std::ranges::iterator_t<R>, 2>;
+
+
+template<class T, class = void>
+struct is_nothrow_dereferenceable : std::false_type {}; // fallback: `*t` is not a valid expression
+
+template<class T>
+struct is_nothrow_dereferenceable<T, std::void_t<decltype(*std::declval<T>())>> : std::bool_constant<noexcept(*std::declval<T>())> {}; // `*t` is valid: true iff it is noexcept
+
+template<class T>
+inline constexpr bool is_nothrow_dereferenceable_v = is_nothrow_dereferenceable<T>::value;
+
+template<class T, class = void>
+struct is_nothrow_prefix_incrementable : std::false_type {}; // fallback: `++t` is not a valid expression
+
+template<class T>
+struct is_nothrow_prefix_incrementable<T, std::void_t<decltype(++std::declval<T>())>> : std::bool_constant<noexcept(++std::declval<T>())> {}; // `++t` is valid: true iff it is noexcept
+
+template<class T>
+inline constexpr bool is_nothrow_prefix_incrementable_v = is_nothrow_prefix_incrementable<T>::value;
+
+template<class T, class = void>
+struct is_nothrow_postfix_incrementable : std::false_type {}; // fallback: `t++` is not a valid expression
+
+template<class T>
+struct is_nothrow_postfix_incrementable<T, std::void_t<decltype(std::declval<T>()++)>> : std::bool_constant<noexcept(std::declval<T>()++)> {}; // `t++` is valid: true iff it is noexcept
+
+template<class T>
+inline constexpr bool is_nothrow_postfix_incrementable_v = is_nothrow_postfix_incrementable<T>::value;
+
+template<class It, class Se>
+struct is_nothrow_sentinel : std::false_type {}; // fallback: Se does not model std::sentinel_for<It>
+
+template<class It, class Se>
+    requires std::sentinel_for<Se, It>
+struct is_nothrow_sentinel<It, Se> : std::bool_constant< // true only if == and != are noexcept in both operand orders
+    noexcept(std::declval<It&>() == std::declval<Se&>()) &&
+    noexcept(std::declval<It&>() != std::declval<Se&>()) &&
+    noexcept(std::declval<Se&>() == std::declval<It&>()) &&
+    noexcept(std::declval<Se&>() != std::declval<It&>())
+>
+{};
+
+template<class It, class Se>
+inline constexpr bool is_nothrow_sentinel_v = is_nothrow_sentinel<It, Se>::value;
+
+
+// Base class of the exception types below; inherits std::runtime_error's constructors.
+class unicode_error : public std::runtime_error
+{
+    using std::runtime_error::runtime_error;
+};
+
+// Reports a rejected code point, retrievable through code_point().
+class invalid_code_point : public unicode_error
+{
+public:
+    explicit invalid_code_point(char32_t codepoint)
+        : unicode_error("invalid code point"), rejected_{codepoint}
+    {}
+
+    [[nodiscard]] char32_t code_point() const noexcept { return rejected_; }
+private:
+    char32_t rejected_;
+};
+
+// Reports an offending UTF-8 octet, retrievable through utf8_octet().
+class invalid_utf8 : public unicode_error
+{
+public:
+    explicit invalid_utf8(char c)
+        : unicode_error("invalid UTF-8"), octet_{static_cast<char8_t>(c)}
+    {}
+
+    explicit invalid_utf8(char8_t u)
+        : unicode_error("invalid UTF-8"), octet_{u}
+    {}
+
+    [[nodiscard]] char8_t utf8_octet() const noexcept { return octet_; }
+private:
+    char8_t octet_;
+};
+
+// Reports an offending UTF-16 code unit, retrievable through utf16_word().
+class invalid_utf16 : public unicode_error
+{
+public:
+    explicit invalid_utf16(char16_t u)
+        : unicode_error("Invalid UTF-16"), word_{u}
+    {}
+
+    [[nodiscard]] char16_t utf16_word() const noexcept { return word_; }
+private:
+    char16_t word_;
+};
+
+// Exception with the message "not enough space"; carries no additional data.
+class not_enough_space : public unicode_error
+{
+public:
+    not_enough_space()
+        : unicode_error("not enough space") {}
+};
+
+
+namespace detail {
+
+// Unicode constants
+// Leading (high) surrogates: 0xd800 - 0xdbff
+// Trailing (low) surrogates: 0xdc00 - 0xdfff
+constexpr char16_t LEAD_SURROGATE_MIN  = 0xd800u;
+constexpr char16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+constexpr char16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+constexpr char16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+constexpr char16_t LEAD_OFFSET         = 0xd7c0u;     // LEAD_SURROGATE_MIN - (0x10000 >> 10); added to (cp >> 10) by append16
+constexpr char32_t SURROGATE_OFFSET    = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN (mod 2^32); added by validate_next16
+
+// Maximum valid value for a Unicode code point
+constexpr char32_t CODE_POINT_MAX = 0x0010ffffu;
+
+enum class [[nodiscard]] utf_error // status code returned by the decoding helpers in this namespace
+{
+    OK,                  // a sequence was decoded and accepted
+    NOT_ENOUGH_SPACE,    // the end of the input was reached before the sequence was complete
+    INVALID_LEAD,        // the first unit cannot start a sequence
+    INCOMPLETE_SEQUENCE, // a unit that should continue the sequence does not
+    OVERLONG_SEQUENCE,   // is_overlong_sequence() rejected the UTF-8 sequence length
+    INVALID_CODE_POINT,  // is_code_point_valid() rejected the decoded value
+};
+
+// Reinterprets any one-byte integral value as an unsigned UTF-8 code unit (0..255).
+template<octet Octet>
+[[nodiscard]] constexpr char8_t mask8(Octet oc) noexcept
+{ return static_cast<char8_t>(static_cast<unsigned char>(oc)); }
+
+// Keeps the low 16 bits of a UTF-16 code unit.
+[[nodiscard]] constexpr char16_t mask16(char16_t oc) noexcept
+{ return static_cast<char16_t>(0xffff & oc); }
+
+template<octet Octet>
+[[nodiscard]] constexpr bool is_trail(Octet oc) noexcept
+{
+    // A trail (continuation) octet has the bit pattern 10xxxxxx.
+    return (detail::mask8(oc) & 0xc0) == 0x80;
+}
+
+// True when `cp` lies in the lead (high) surrogate range.
+[[nodiscard]] constexpr bool is_lead_surrogate(char32_t cp) noexcept
+{ return !(cp < char32_t{LEAD_SURROGATE_MIN} || cp > char32_t{LEAD_SURROGATE_MAX}); }
+
+// True when `cp` lies in the trail (low) surrogate range.
+[[nodiscard]] constexpr bool is_trail_surrogate(char32_t cp) noexcept
+{ return !(cp < char32_t{TRAIL_SURROGATE_MIN} || cp > char32_t{TRAIL_SURROGATE_MAX}); }
+
+// True when `cp` is any surrogate, lead or trail (the two ranges are adjacent).
+[[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept
+{ return !(cp < char32_t{LEAD_SURROGATE_MIN} || cp > char32_t{TRAIL_SURROGATE_MAX}); }
+
+// True when `cp` is at most CODE_POINT_MAX and not a surrogate.
+[[nodiscard]] constexpr bool is_code_point_valid(char32_t cp) noexcept
+{
+    if (cp > CODE_POINT_MAX) return false;
+    return !detail::is_surrogate(cp);
+}
+
+// True when `cp` is below 0x10000, i.e. inside the Basic Multilingual Plane.
+[[nodiscard]] constexpr bool is_in_bmp(char32_t cp) noexcept
+{
+    return cp <= 0xffff;
+}
+
+// Reports whether `length` differs from the shortest UTF-8 encoding length of `cp`.
+// Only code points below 0x10000 are checked; larger ones always yield false.
+[[nodiscard]] constexpr bool is_overlong_sequence(char32_t cp, int length) noexcept
+{
+    if (cp >= 0x10000) {
+        return false;
+    }
+    // Shortest form: 1 octet below 0x80, 2 below 0x800, otherwise 3.
+    int const shortest = cp < 0x80 ? 1 : (cp < 0x800 ? 2 : 3);
+    return length != shortest;
+}
+
+template<octet_input_iterator It> // maps the lead octet at `lead_it` to its sequence length; 0 means "not a valid lead"
+[[nodiscard]] constexpr int sequence_length(It lead_it)
+    noexcept(is_nothrow_dereferenceable_v<It&>)
+{
+    char8_t const first = detail::mask8(*lead_it);
+    if ((first & 0x80) == 0x00) return 1; // 0xxxxxxx
+    if ((first & 0xe0) == 0xc0) return 2; // 110xxxxx
+    if ((first & 0xf0) == 0xe0) return 3; // 1110xxxx
+    if ((first & 0xf8) == 0xf0) return 4; // 11110xxx
+    return 0;                             // 10xxxxxx or 11111xxx
+}
+
+/// Helper for get_sequence_x: steps `it` to the next octet and checks that it is a trail octet.
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error increase_safely(It& it, Se end)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    ++it;
+    if (it == end) {
+        return utf_error::NOT_ENOUGH_SPACE; // ran out of input
+    }
+    // The octet is only inspected here; `it` stays on it.
+    bool const continuation = detail::is_trail(*it);
+    return continuation ? utf_error::OK : utf_error::INCOMPLETE_SEQUENCE;
+}
+
+#define IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(IT, END)                 \
+    do { /* step IT once; leave the enclosing function on any error */   \
+        utf_error const status_ = increase_safely(IT, END);              \
+        if (status_ != utf_error::OK)                                    \
+            return status_;                                              \
+    } while (false)
+
+// get_sequence_x functions decode UTF-8 sequences of length x; on success `it` rests on the last octet read
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_1(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    bool const exhausted = (it == end);
+    if (!exhausted) code_point = detail::mask8(*it); // the octet value is the code point
+    return exhausted ? utf_error::NOT_ENOUGH_SPACE : utf_error::OK;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_2(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+    // 110xxxxx 10yyyyyy  ->  xxxxx yyyyyy
+    code_point = detail::mask8(*it);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point = ((code_point & 0x1f) << 6) | (detail::mask8(*it) & 0x3f);
+    return utf_error::OK;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_3(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+    // 1110xxxx 10yyyyyy 10zzzzzz  ->  xxxx yyyyyy zzzzzz
+    code_point = detail::mask8(*it);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point = ((code_point & 0x0f) << 12) | ((detail::mask8(*it) & 0x3f) << 6);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point |= detail::mask8(*it) & 0x3f;
+    return utf_error::OK;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+constexpr utf_error get_sequence_4(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>
+    >)
+{
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+    // 11110xxx 10yyyyyy 10zzzzzz 10wwwwww  ->  xxx yyyyyy zzzzzz wwwwww
+    code_point = detail::mask8(*it);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point = ((code_point & 0x07) << 18) | ((detail::mask8(*it) & 0x3f) << 12);
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point |= (detail::mask8(*it) & 0x3f) << 6;
+    IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR(it, end);
+    code_point |= detail::mask8(*it) & 0x3f;
+    return utf_error::OK;
+}
+
+#undef IRIS_UTFLIB_INCREASE_AND_RETURN_ON_ERROR
+
+// Decodes and validates the UTF-8 sequence at `it`. On OK, `code_point` holds the value and `it` is one past
+// the sequence; on any error `it` is restored. `code_point` is also set for INVALID_CODE_POINT (for diagnostics).
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+    requires std::forward_iterator<It>
+constexpr utf_error validate_next(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
+{
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+
+    // Remember the entry position so that every failure path can rewind to it.
+    It const original_it = it;
+
+    char32_t cp = 0;
+    // Sequence length implied by the lead octet (0 = not a valid lead).
+    int const length = detail::sequence_length(it);
+
+    // Read the trail octets and assemble the code point.
+    utf_error err{};
+    switch (length) {
+    case 0:
+        return utf_error::INVALID_LEAD;
+    case 1:
+        err = detail::get_sequence_1(it, end, cp);
+        break;
+    case 2:
+        err = detail::get_sequence_2(it, end, cp);
+        break;
+    case 3:
+        err = detail::get_sequence_3(it, end, cp);
+        break;
+    case 4:
+        err = detail::get_sequence_4(it, end, cp);
+        break;
+    default:
+        std::unreachable();
+    }
+    if (err == utf_error::OK) {
+        if (!detail::is_code_point_valid(cp)) {
+            // Hand the rejected value back: callers build invalid_code_point from it,
+            // which would otherwise always report 0.
+            code_point = cp;
+            err = utf_error::INVALID_CODE_POINT;
+        } else if (detail::is_overlong_sequence(cp, length)) {
+            err = utf_error::OVERLONG_SEQUENCE;
+        } else {
+            code_point = cp;
+            ++it; // the get_sequence_x helpers stop on the last octet of the sequence
+            return utf_error::OK;
+        }
+    }
+
+    it = original_it;
+    return err;
+}
+
+// Validates the UTF-8 sequence at `it` without reporting its value: on OK `it` is advanced past the
+// sequence, on any error `it` is left where it was on entry.
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+    requires std::forward_iterator<It>
+constexpr utf_error validate_next(It& it, Se end)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
+{
+    if (it == end) {
+        return utf_error::NOT_ENOUGH_SPACE;
+    }
+
+    It const entry = it; // rewind target for the failure paths
+    // The decoded value is only needed for the checks further down.
+    char32_t decoded = 0;
+    int const octets = detail::sequence_length(it);
+
+    // Dispatch on the length announced by the lead octet.
+    utf_error status{};
+    switch (octets) {
+    case 1:
+        status = detail::get_sequence_1(it, end, decoded);
+        break;
+    case 2:
+        status = detail::get_sequence_2(it, end, decoded);
+        break;
+    case 3:
+        status = detail::get_sequence_3(it, end, decoded);
+        break;
+    case 4:
+        status = detail::get_sequence_4(it, end, decoded);
+        break;
+    case 0:
+        return utf_error::INVALID_LEAD;
+    default:
+        std::unreachable();
+    }
+
+    // Decoding worked; now reject invalid values and overlong encodings.
+    if (status == utf_error::OK) {
+        if (!detail::is_code_point_valid(decoded)) {
+            status = utf_error::INVALID_CODE_POINT;
+        } else if (detail::is_overlong_sequence(decoded, octets)) {
+            status = utf_error::OVERLONG_SEQUENCE;
+        } else {
+            ++it; // step off the last octet of the accepted sequence
+            return utf_error::OK;
+        }
+    }
+
+    // Failure: undo any advance made while decoding.
+    it = entry;
+    return status;
+}
+
+// Decodes one UTF-16 code point at `it`. On OK `code_point` is set and `it` is advanced past the
+// unit(s) read; on any error `it` is restored and `code_point` is left untouched.
+template<utf16_input_iterator It, std::sentinel_for<It> Se>
+    requires std::forward_iterator<It>
+constexpr utf_error validate_next16(It& it, Se end, char32_t& code_point)
+    noexcept(std::conjunction_v<
+        is_nothrow_dereferenceable<It&>,
+        is_nothrow_prefix_incrementable<It&>,
+        is_nothrow_postfix_incrementable<It&>,
+        is_nothrow_sentinel<It, Se>,
+        std::is_nothrow_copy_constructible<It>
+    >)
+{
+    if (it == end) return utf_error::NOT_ENOUGH_SPACE;
+
+    It const entry = it; // rewind target for the failure paths
+
+    char16_t const first_word = *it++;
+    if (!detail::is_surrogate(first_word)) {
+        // A non-surrogate unit is the code point itself.
+        code_point = first_word;
+        return utf_error::OK;
+    }
+
+    utf_error status = utf_error::INVALID_LEAD; // verdict for a trail surrogate in first position
+    if (it == end) {
+        status = utf_error::NOT_ENOUGH_SPACE;
+    } else if (detail::is_lead_surrogate(first_word)) {
+        // A lead surrogate must be followed by a trail surrogate.
+        char16_t const second_word = *it++;
+        if (detail::is_trail_surrogate(static_cast<char32_t>(second_word))) {
+            code_point = static_cast<char32_t>(first_word << 10) + static_cast<char32_t>(second_word) + SURROGATE_OFFSET;
+            return utf_error::OK;
+        }
+        status = utf_error::INCOMPLETE_SEQUENCE;
+    }
+
+    it = entry;
+    return status;
+}
+
+} // detail
+
+// Returns the position of the first invalid UTF-8 sequence in [it, se), or the end position if there is none.
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr It find_invalid(It it, Se se)
+    noexcept(noexcept(detail::validate_next(it, se)) && std::is_nothrow_copy_constructible_v<It>)
+{
+    // validate_next advances `it` on success and leaves it in place on failure,
+    // so the loop stops with `it` on the offending sequence (or at the end).
+    while (it != se && detail::validate_next(it, se) == detail::utf_error::OK) {
+        // nothing to do: all work happens in the condition
+    }
+    return it;
+}
+
+[[nodiscard]] constexpr std::size_t find_invalid(std::string_view s) // offset of the first invalid sequence, or npos
+    noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
+{
+    auto const bad = unicode::find_invalid(s.begin(), s.end());
+    return bad != s.end() ? static_cast<std::size_t>(bad - s.begin()) : std::string_view::npos;
+}
+
+[[nodiscard]] constexpr std::size_t find_invalid(std::u8string_view s) // offset of the first invalid sequence, or npos
+    noexcept(noexcept(unicode::find_invalid(s.begin(), s.end())))
+{
+    auto const bad = unicode::find_invalid(s.begin(), s.end());
+    return bad != s.end() ? static_cast<std::size_t>(bad - s.begin()) : std::u8string_view::npos;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr bool is_valid(It it, Se se) // true when find_invalid reaches `se` without finding an invalid sequence
+    noexcept(noexcept(unicode::find_invalid(it, se)) && is_nothrow_sentinel_v<It, Se>)
+{
+    return unicode::find_invalid(it, se) == se;
+}
+
+[[nodiscard]] constexpr bool is_valid(std::string_view s) // checks the whole view via the iterator overload
+    noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
+{
+    return unicode::is_valid(s.begin(), s.end());
+}
+
+[[nodiscard]] constexpr bool is_valid(std::u8string_view s) // checks the whole view via the iterator overload
+    noexcept(noexcept(unicode::is_valid(s.begin(), s.end())))
+{
+    return unicode::is_valid(s.begin(), s.end());
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr bool starts_with_bom(It it, Se end) // true when [it, end) begins with the three BOM octets
+    noexcept(noexcept(detail::mask8(*it++)) && is_nothrow_sentinel_v<It, Se>)
+{
+    // Each octet is read only after the end check, in order; the third one is not stepped over.
+    if (!(it != end && detail::mask8(*it++) == bom<char8_t>[0])) return false;
+    if (!(it != end && detail::mask8(*it++) == bom<char8_t>[1])) return false;
+    return it != end && detail::mask8(*it) == bom<char8_t>[2];
+}
+
+template<octet_input_range R>
+[[nodiscard]] constexpr bool starts_with_bom(R&& r) // range overload: forwards begin(r)/end(r) to the iterator overload
+    noexcept(noexcept(unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r))))
+{
+    return unicode::starts_with_bom(std::ranges::begin(r), std::ranges::end(r));
+}
+
+[[nodiscard]] constexpr bool starts_with_bom(std::string_view s) // checks the start of the view via the iterator overload
+    noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
+{
+    return unicode::starts_with_bom(s.begin(), s.end());
+}
+
+[[nodiscard]] constexpr bool starts_with_bom(std::u8string_view s) // checks the start of the view via the iterator overload
+    noexcept(noexcept(unicode::starts_with_bom(s.begin(), s.end())))
+{
+    return unicode::starts_with_bom(s.begin(), s.end());
+}
+
+
+template<octet_output_iterator OutIt> // UTF-8 encodes `cp` into `out`; throws invalid_code_point if `cp` is invalid
+constexpr OutIt append8(char32_t cp, OutIt out)
+{
+    if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp);
+    using octet_type = typename detail::select_output_value_type<OutIt, char>::type;
+    auto const put = [&out](char32_t bits) { *out++ = static_cast<octet_type>(bits); };
+    auto const put_trail = [&](int shift) { put(((cp >> shift) & 0x3f) | 0x80); }; // 10xxxxxx
+    if (cp < 0x80) { // one octet
+        put(cp);
+    } else if (cp < 0x800) { // two octets
+        put((cp >> 6) | 0xc0);
+        put_trail(0);
+    } else if (cp < 0x10000) { // three octets
+        put((cp >> 12) | 0xe0);
+        put_trail(6);
+        put_trail(0);
+    } else { // four octets
+        put((cp >> 18) | 0xf0);
+        put_trail(12);
+        put_trail(6);
+        put_trail(0);
+    }
+    return out;
+}
+
+template<utf16_output_iterator OutIt> // UTF-16 encodes `cp` into `out`; throws invalid_code_point if `cp` is invalid
+constexpr OutIt append16(char32_t cp, OutIt out)
+{
+    if (!detail::is_code_point_valid(cp)) throw invalid_code_point(cp);
+
+    if (detail::is_in_bmp(cp)) {
+        *out++ = static_cast<char16_t>(cp); // BMP code points are a single code unit
+        return out;
+    }
+    // Supplementary planes: lead surrogate from the high bits, trail surrogate from the low ten bits.
+    *out++ = static_cast<char16_t>(detail::LEAD_OFFSET + (cp >> 10));
+    *out++ = static_cast<char16_t>(detail::TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+    return out;
+}
+
+// Forwards automatically based on `sizeof(value_type)`, but overload resolution may become
+// ambiguous for `value_type`-agnostic iterators such as `std::back_insert_iterator`.
+template<octet_output_iterator OutIt>
+constexpr OutIt append(char32_t cp, OutIt out) // octet output iterators: forwards to append8
+{
+    return unicode::append8(cp, std::move(out));
+}
+template<utf16_output_iterator OutIt>
+constexpr OutIt append(char32_t cp, OutIt out) // UTF-16 output iterators: forwards to append16
+{
+    return unicode::append16(cp, std::move(out));
+}
+
+
+template<class OutR>
+    requires octet_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append8(char32_t cp, OutR&& r) // writes from begin(r); returns the remainder [position after the write, end(r))
+{
+    return std::ranges::subrange{
+        unicode::append8(cp, std::ranges::begin(r)), std::ranges::end(r)
+    };
+}
+
+template<class OutR>
+    requires utf16_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append16(char32_t cp, OutR&& r) // writes from begin(r); returns the remainder [position after the write, end(r))
+{
+    return std::ranges::subrange{
+        unicode::append16(cp, std::ranges::begin(r)), std::ranges::end(r)
+    };
+}
+
+// Forwards automatically based on `sizeof(value_type)`, but overload resolution may become
+// ambiguous for `value_type`-agnostic iterators such as `std::back_insert_iterator`.
+template<class OutR>
+    requires octet_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append(char32_t cp, OutR&& r) // octet output ranges: forwards to append8
+{
+    return unicode::append8(cp, std::forward<OutR>(r));
+}
+template<class OutR>
+    requires utf16_output_range<std::decay_t<OutR>>
+constexpr std::ranges::subrange<std::ranges::iterator_t<std::decay_t<OutR>>, std::ranges::sentinel_t<std::decay_t<OutR>>>
+append(char32_t cp, OutR&& r) // UTF-16 output ranges: forwards to append16
+{
+    return unicode::append16(cp, std::forward<OutR>(r));
+}
+
+constexpr void append(char32_t cp, std::string& str) // appends the UTF-8 encoding of `cp` to the end of `str`
+{
+    unicode::append8(cp, std::back_inserter(str));
+}
+
+constexpr void append(char32_t cp, std::u8string& str) // appends the UTF-8 encoding of `cp` to the end of `str`
+{
+    unicode::append8(cp, std::back_inserter(str));
+}
+
+constexpr void append(char32_t cp, std::u16string& str) // appends the UTF-16 encoding of `cp` to the end of `str`
+{
+    unicode::append16(cp, std::back_inserter(str));
+}
+
+// Copies [start, end) to `out`, writing `replacement` (UTF-8 encoded) once per invalid sequence; returns the
+// advanced `out`. append8 throws invalid_code_point if `replacement` itself is not a valid code point.
+template<octet_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator Out>
+constexpr Out replace_invalid(It start, Se end, Out out, char32_t replacement)
+{
+    while (start != end) {
+        It const sequence_start = start;
+        switch (detail::validate_next(start, end)) {
+        case detail::utf_error::OK: // copy the accepted octets through unchanged
+            for (It it = sequence_start; it != start; ++it) {
+                *out++ = *it;
+            }
+            break;
+        case detail::utf_error::NOT_ENOUGH_SPACE: // the input ends inside a sequence
+            out = unicode::append8(replacement, out);
+            // `start = end` only compiles when It is assignable from Se; ranges::next also handles other sentinels.
+            start = std::ranges::next(std::move(start), end);
+            break;
+        case detail::utf_error::INVALID_LEAD:
+            out = unicode::append8(replacement, out);
+            // skip only the offending octet
+            ++start;
+            break;
+        case detail::utf_error::INCOMPLETE_SEQUENCE:
+        case detail::utf_error::OVERLONG_SEQUENCE:
+        case detail::utf_error::INVALID_CODE_POINT:
+            out = unicode::append8(replacement, out);
+            ++start;
+            // just one replacement mark for the sequence
+            while (start != end && detail::is_trail(*start)) {
+                ++start;
+            }
+            break;
+        default:
+            std::unreachable();
+        }
+    }
+    return out;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator Out>
+constexpr Out replace_invalid(It start, Se end, Out out)
+{
+    // Default substitute: U+FFFD.
+    return unicode::replace_invalid(start, end, out, static_cast<char32_t>(detail::mask16(0xfffd)));
+}
+
+[[nodiscard]] constexpr std::string replace_invalid(std::string_view s, char32_t replacement) // returns a copy of `s` with invalid sequences replaced
+{
+    std::string result;
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s, char32_t replacement) // returns a copy of `s` with invalid sequences replaced
+{
+    std::u8string result;
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+    return result;
+}
+
+[[nodiscard]] constexpr std::string replace_invalid(std::string_view s) // as above, using the iterator overload's default replacement
+{
+    std::string result;
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string replace_invalid(std::u8string_view s) // as above, using the iterator overload's default replacement
+{
+    std::u8string result;
+    unicode::replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+    return result;
+}
+
+// Decodes the code point at `it` and advances `it` past it. Throws not_enough_space, invalid_utf8 or
+// invalid_code_point when validation fails; `it` is then unchanged.
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr char32_t next(It& it, Se end)
+{
+    using detail::utf_error;
+
+    char32_t decoded = 0;
+    switch (detail::validate_next(it, end, decoded)) {
+    case utf_error::OK:
+        return decoded;
+
+    case utf_error::NOT_ENOUGH_SPACE:
+        throw not_enough_space();
+    case utf_error::INVALID_CODE_POINT:
+        throw invalid_code_point(decoded);
+    case utf_error::INVALID_LEAD:
+    case utf_error::INCOMPLETE_SEQUENCE:
+    case utf_error::OVERLONG_SEQUENCE:
+        throw invalid_utf8(static_cast<char8_t>(*it)); // validate_next left `it` on the sequence's first octet
+    default:
+        std::unreachable();
+    }
+}
+
+// Advances `it` by up to `off` code points without passing `last`; returns the new position and the number of
+// code points skipped. Throws not_enough_space, invalid_utf8 or invalid_code_point on an invalid sequence.
+template<octet_input_iterator It>
+[[nodiscard]] constexpr std::pair<It, typename std::iterator_traits<It>::difference_type>
+bounded_next(It it, It const last, typename std::iterator_traits<It>::difference_type off = 1)
+{
+    using detail::utf_error;
+    typename std::iterator_traits<It>::difference_type count = 0;
+    while (it != last && count < off) {
+        char32_t decoded = 0;
+        switch (detail::validate_next(it, last, decoded)) {
+        case utf_error::OK:
+            ++count;
+            break;
+        case utf_error::NOT_ENOUGH_SPACE:
+            throw not_enough_space();
+        case utf_error::INVALID_CODE_POINT:
+            throw invalid_code_point(decoded);
+        case utf_error::INVALID_LEAD:
+        case utf_error::INCOMPLETE_SEQUENCE:
+        case utf_error::OVERLONG_SEQUENCE:
+            throw invalid_utf8(static_cast<char8_t>(*it));
+        default:
+            std::unreachable();
+        }
+    }
+    return {it, count};
+}
+
+// Decodes one UTF-16 code point at `it`, advancing past it; throws not_enough_space or invalid_utf16 on failure.
+template<utf16_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr char32_t next16(It& it, Se end)
+{
+    char32_t cp = 0;
+    detail::utf_error const err_code = detail::validate_next16(it, end, cp);
+    if (err_code == detail::utf_error::NOT_ENOUGH_SPACE) throw not_enough_space();
+    if (err_code != detail::utf_error::OK) throw invalid_utf16(static_cast<char16_t>(*it)); // `it` was rewound to the bad unit
+    return cp;
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr char32_t peek_next(It it, Se end) // like next(), but `it` is taken by value so the caller's iterator is not moved
+{
+    return unicode::next(it, end);
+}
+
+// Moves `it` back to the nearest preceding non-trail octet and returns the code point decoded from there.
+// Throws not_enough_space if `it == start`, invalid_utf8 if no lead octet is found; peek_next errors propagate.
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr char32_t prev(It& it, Se start)
+{
+    if (it == start) throw not_enough_space(); // can't do much if it == start
+    It end = it;
+    while (detail::is_trail(*--it)) { // go back until we hit either a lead octet or start
+        // The cast selects the char8_t constructor; passing e.g. `unsigned char` directly is ambiguous.
+        if (it == start) throw invalid_utf8(static_cast<char8_t>(*it)); // error - no lead byte in the sequence
+    }
+    return unicode::peek_next(it, end);
+}
+
+template<octet_input_iterator It> // steps `it` back over up to `off` sequences, stopping at `start`; returns {position, steps}
+[[nodiscard]] constexpr std::pair<It, typename std::iterator_traits<It>::difference_type>
+bounded_prev(It const start, It it, typename std::iterator_traits<It>::difference_type off = 1)
+{
+    typename std::iterator_traits<It>::difference_type count = 0;
+    for (; it != start && count < off; ++count) {
+        while (detail::is_trail(*--it)) {
+            // no lead byte in the sequence; the cast avoids an ambiguous constructor call for e.g. `unsigned char`
+            if (it == start) throw invalid_utf8(static_cast<char8_t>(*it));
+        }
+    }
+    return {it, count};
+}
+
+// Moves `it` by `n` code points: forward via unicode::next for n >= 0, backward via unicode::prev for n < 0.
+// `end` is handed to whichever helper runs, so for negative `n` it serves as prev's `start` bound.
+template<octet_input_iterator It, std::sentinel_for<It> Se, class distance_type>
+constexpr void advance(It& it, distance_type n, Se end)
+{
+    constexpr distance_type zero(0);
+    if (!(n < zero)) {
+        for (distance_type done = zero; done < n; ++done) {
+            (void)unicode::next(it, end);
+        }
+        return;
+    }
+    for (distance_type left = n; left < zero; ++left) {
+        (void)unicode::prev(it, end);
+    }
+}
+
+template<octet_input_iterator It, std::sentinel_for<It> Se>
+[[nodiscard]] constexpr typename std::iterator_traits<It>::difference_type
+distance(It first, Se last)
+{
+    typename std::iterator_traits<It>::difference_type code_points = 0; // one per successful unicode::next call
+    for (; first != last; ++code_points) {
+        (void)unicode::next(first, last); // moves `first` forward; the decoded value itself is not needed
+    }
+    return code_points;
+}
+
+// ------------------------------------
+
+template<octet_input_iterator It>
+class code_point_iterator
+{
+    It it;          // current position in the underlying octet sequence
+    It range_start; // bound handed to unicode::prev when stepping backwards
+    It range_end;   // bound handed to unicode::next when decoding or stepping forwards
+
+public:
+    using value_type = char32_t;
+    using pointer = char32_t*;
+    using reference = char32_t&;
+    using difference_type = std::ptrdiff_t;
+    using iterator_category = std::bidirectional_iterator_tag;
+
+    constexpr code_point_iterator()
+        requires std::is_default_constructible_v<It>
+    = default;
+
+    constexpr code_point_iterator(It it, It range_start, It range_end)
+        : it(std::move(it))
+        , range_start(std::move(range_start))
+        , range_end(std::move(range_end))
+    {
+        if constexpr (std::random_access_iterator<It>) {
+            bool const inside = !(this->it < this->range_start) && !(this->it > this->range_end);
+            if (!inside) throw std::out_of_range("Invalid utf-8 iterator position");
+            // The position check is compiled in only for random-access iterators.
+        }
+    }
+
+    [[nodiscard]] constexpr It base() const { return it; }
+
+    [[nodiscard]] constexpr char32_t operator*() const
+    {
+        It cursor = it; // decode from a copy so that dereferencing leaves `it` where it is
+        return unicode::next(cursor, range_end);
+    }
+
+    [[nodiscard]] constexpr bool operator==(code_point_iterator const& rhs) const noexcept
+    {
+        assert(range_start == rhs.range_start && range_end == rhs.range_end && "comparing incompatible iterator range is not allowed");
+        return it == rhs.it; // only the position takes part in the comparison
+    }
+
+    constexpr code_point_iterator& operator++()
+    {
+        (void)unicode::next(it, range_end); // the decoded value is discarded; only the movement matters
+        return *this;
+    }
+
+    [[nodiscard]] constexpr code_point_iterator operator++(int)
+    {
+        code_point_iterator before = *this;
+        ++*this;
+        return before;
+    }
+
+    constexpr code_point_iterator& operator--()
+    {
+        (void)unicode::prev(it, range_start); // the decoded value is discarded; only the movement matters
+        return *this;
+    }
+
+    [[nodiscard]] constexpr code_point_iterator operator--(int)
+    {
+        code_point_iterator before = *this;
+        --*this;
+        return before;
+    }
+};
+
+// ------------------------------------
+
+template<utf8_input_iterator It, std::sentinel_for<It> Se, utf16_output_iterator OutIt>
+constexpr OutIt utf8to16(It start, Se end, OutIt out)
+{
+    while (start != end) {
+        char32_t const decoded = unicode::next(start, end);
+        if (decoded <= 0xffff) { // fits in a single 16-bit unit
+            *out++ = static_cast<char16_t>(decoded);
+        } else { // needs two units: lead surrogate first, then trail surrogate
+            *out++ = static_cast<char16_t>((decoded >> 10) + detail::LEAD_OFFSET);
+            *out++ = static_cast<char16_t>((decoded & 0x3ff) + detail::TRAIL_SURROGATE_MIN);
+        }
+    }
+    return out;
+}
+
+[[nodiscard]] constexpr std::u16string utf8to16(std::string_view str)
+{
+    std::u16string result; // filled through the back_inserter below
+    unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload; its returned iterator is not needed
+    return result;
+}
+
+[[nodiscard]] constexpr std::u16string utf8to16(std::u8string_view str)
+{
+    std::u16string result; // filled through the back_inserter below
+    unicode::utf8to16(str.begin(), str.end(), std::back_inserter(result)); // same iterator overload as the std::string_view version
+    return result;
+}
+
+template<utf16_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator OutIt>
+constexpr OutIt utf16to8(It start, Se end, OutIt out)
+{
+    while (start != end) {
+        char32_t cp = static_cast<char32_t>(detail::mask16(*start++));
+        if (detail::is_lead_surrogate(cp)) {
+            // A lead surrogate must be followed by one more unit...
+            if (start == end) {
+                throw invalid_utf16(static_cast<char16_t>(cp));
+            }
+
+            // ...and that unit has to be a trail surrogate.
+            char32_t const trail_surrogate = static_cast<char32_t>(detail::mask16(*start++));
+            if (!detail::is_trail_surrogate(trail_surrogate)) {
+                throw invalid_utf16(static_cast<char16_t>(trail_surrogate));
+            }
+            cp = (cp << 10) + trail_surrogate + detail::SURROGATE_OFFSET;
+
+        } else if (detail::is_trail_surrogate(cp)) {
+            // A trail surrogate that was not preceded by a lead surrogate is rejected.
+            throw invalid_utf16(static_cast<char16_t>(cp));
+        }
+        out = unicode::append8(cp, out);
+    }
+    return out;
+}
+
+[[nodiscard]] constexpr std::string utf16to8(std::u16string_view str)
+{
+    std::string result; // filled through the back_inserter below
+    unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload; its returned iterator is not needed
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string utf16tou8(std::u16string_view str)
+{
+    std::u8string result; // same conversion as utf16to8, but collected as char8_t
+    unicode::utf16to8(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload of utf16to8
+    return result;
+}
+
+template<utf8_input_iterator It, std::sentinel_for<It> Se, class OutIt>
+constexpr OutIt utf8to32(It start, Se end, OutIt out)
+{
+    while (start != end) { // `start` is moved forward by unicode::next on every pass
+        *out++ = unicode::next(start, end); // each decoded code point becomes one output element
+    }
+    return out; // output iterator after the last write
+}
+
+[[nodiscard]] constexpr std::u32string utf8to32(std::string_view str)
+{
+    std::u32string result; // filled through the back_inserter below
+    unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload; its returned iterator is not needed
+    return result;
+}
+
+[[nodiscard]] constexpr std::u32string utf8to32(std::u8string_view str)
+{
+    std::u32string result; // filled through the back_inserter below
+    unicode::utf8to32(str.begin(), str.end(), std::back_inserter(result)); // same iterator overload as the std::string_view version
+    return result;
+}
+
+template<utf32_input_iterator It, std::sentinel_for<It> Se, octet_output_iterator OutIt>
+constexpr OutIt utf32to8(It start, Se end, OutIt out)
+{
+    while (start != end) {
+        out = unicode::append8(*start++, out); // append8 hands back the output iterator to continue from
+    }
+    return out; // output iterator after the last append8 call
+}
+
+[[nodiscard]] constexpr std::string utf32to8(std::u32string_view str)
+{
+    std::string result; // filled through the back_inserter below
+    unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload; its returned iterator is not needed
+    return result;
+}
+
+[[nodiscard]] constexpr std::u8string utf32tou8(std::u32string_view str)
+{
+    std::u8string result; // same conversion as utf32to8, but collected as char8_t
+    unicode::utf32to8(str.begin(), str.end(), std::back_inserter(result)); // delegates to the iterator overload of utf32to8
+    return result;
+}
+
+// TODO: add single char variations
+
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::string_view str)
+{
+    if constexpr (std::same_as<CharT, char32_t>) {
+        return unicode::utf8to32(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return unicode::utf8to16(str);
+    } else if constexpr (std::same_as<CharT, char8_t>) {
+        return std::u8string{str.begin(), str.end()}; // element-wise copy into char8_t storage
+    } else {
+        static_assert(std::same_as<CharT, char>); // no other target character type is accepted
+        return std::string{str};
+    }
+}
+
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u8string_view str)
+{
+    if constexpr (std::same_as<CharT, char32_t>) {
+        return unicode::utf8to32(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return unicode::utf8to16(str);
+    } else if constexpr (std::same_as<CharT, char8_t>) {
+        return std::u8string{str}; // already the requested type: plain copy
+    } else {
+        static_assert(std::same_as<CharT, char>); // no other target character type is accepted
+        return std::string{str.begin(), str.end()};
+    }
+}
+
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u16string_view str)
+{
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return unicode::utf16tou8(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        return std::u16string{str}; // already the requested type: plain copy
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        static_assert(false, "not implemented"); // requesting char32_t from UTF-16 input is a compile-time error for now
+        return {}; // dummy
+        //return unicode::utf16to32(str);
+    } else {
+        static_assert(std::same_as<CharT, char>); // no other target character type is accepted
+        return unicode::utf16to8(str);
+    }
+}
+
+template<class CharT>
+[[nodiscard]] constexpr std::basic_string<CharT> transcode(std::u32string_view str)
+{
+    if constexpr (std::same_as<CharT, char8_t>) {
+        return unicode::utf32tou8(str);
+    } else if constexpr (std::same_as<CharT, char16_t>) {
+        static_assert(false, "not implemented"); // requesting char16_t from UTF-32 input is a compile-time error for now
+        return {}; // dummy
+        //return unicode::utf32to16(str);
+    } else if constexpr (std::same_as<CharT, char32_t>) {
+        return std::u32string{str}; // already the requested type: plain copy
+    } else {
+        static_assert(std::same_as<CharT, char>); // no other target character type is accepted
+        return unicode::utf32to8(str);
+    }
+}
+
+} // iris::unicode
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 34edcd0..5347bca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -97,6 +97,11 @@ function(iris_define_test_headers test_name)
 endfunction()
 
 function(_iris_define_test_impl test_name libs)
+    get_property(IRIS_ROOT GLOBAL PROPERTY IRIS_ROOT)
+    if(NOT DEFINED IRIS_ROOT OR IRIS_ROOT STREQUAL "")
+        message(FATAL_ERROR "IRIS_ROOT is not defined")
+    endif()
+
     add_executable(${test_name}_test ${ARGN})
     target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_FUNCTION_LIST_DIR})
     target_include_directories(${test_name}_test PRIVATE ${CMAKE_CURRENT_LIST_DIR})
@@ -116,11 +121,32 @@ function(_iris_define_test_impl test_name libs)
     target_link_libraries(${test_name}_test PRIVATE Iris::Iris iris_cxx_test ${libs})
     add_test(NAME ${test_name}_test COMMAND ${test_name}_test --colour-mode=ansi)
 
+    set_tests_properties(
+        ${test_name}_test PROPERTIES
+        ENVIRONMENT "IRIS_ROOT=${IRIS_ROOT}"
+    )
     if(MSVC)
+        set(
+            VS_DEBUGGER_ENVIRONMENT_LIST
+            "PATH=$(VC_ExecutablePath_x64)\;%PATH%"
+            "ASAN_SYMBOLIZER_PATH=$(VC_ExecutablePath_x64)\\llvm-symbolizer.exe"
+            "IRIS_ROOT=$<SHELL_PATH:${IRIS_ROOT}>"
+        )
+        list(JOIN VS_DEBUGGER_ENVIRONMENT_LIST "\n" VS_DEBUGGER_ENVIRONMENT)
+
+        set_target_properties(
+            ${test_name}_test PROPERTIES
+            VS_DEBUGGER_ENVIRONMENT "${VS_DEBUGGER_ENVIRONMENT}"
+        )
+
         get_property(IRIS_MSVC_ASAN_DIR GLOBAL PROPERTY IRIS_MSVC_ASAN_DIR)
+        set(
+            ENV_MODIFICATION
+            "PATH=path_list_append:${IRIS_MSVC_ASAN_DIR}"
+        )
         set_tests_properties(
             ${test_name}_test PROPERTIES
-            ENVIRONMENT "PATH=${IRIS_MSVC_ASAN_DIR};$ENV{PATH}"
+            ENVIRONMENT_MODIFICATION "${ENV_MODIFICATION}"
         )
     endif()
 endfunction()
@@ -169,5 +195,7 @@ if(PROJECT_IS_TOP_LEVEL)
         foreach(test_name IN LISTS IRIS_TEST_IRIS_TESTS)
             iris_define_test_headers(iris_${test_name} iris_test.hpp)
         endforeach()
+
+        add_subdirectory(unicode)
     endif()
 endif()
diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt
new file mode 100644
index 0000000..37baffc
--- /dev/null
+++ b/test/unicode/CMakeLists.txt
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+add_subdirectory(string) # descends into test/unicode/string, which defines the unicode string tests
diff --git a/test/unicode/string/CMakeLists.txt b/test/unicode/string/CMakeLists.txt
new file mode 100644
index 0000000..ebfa8d4
--- /dev/null
+++ b/test/unicode/string/CMakeLists.txt
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: MIT
+
+set(
+    IRIS_TEST_UNICODE_STRING_TESTS
+    string
+    utf8_invalid
+)
+
+foreach(test_name IN LISTS IRIS_TEST_UNICODE_STRING_TESTS) # one test per name listed above, built from <name>.cpp
+    iris_define_test(unicode_string_${test_name} ${test_name}.cpp)
+    set_target_properties(unicode_string_${test_name}_test PROPERTIES FOLDER "test/unicode/string")
+endforeach()
+
+target_sources(unicode_string_utf8_invalid_test PRIVATE test_data/utf8_invalid.txt) # NOTE(review): presumably listed so the data file shows up with the target in IDEs - confirm
diff --git a/test/unicode/string/LICENSE b/test/unicode/string/LICENSE
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/test/unicode/string/LICENSE
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/test/unicode/string/string.cpp b/test/unicode/string/string.cpp
new file mode 100644
index 0000000..86c0732
--- /dev/null
+++ b/test/unicode/string/string.cpp
@@ -0,0 +1,454 @@
+#include "iris_test.hpp"
+
+#include <iris/unicode/string.hpp>
+
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <array>
+
+#include <cstdint>
+
+namespace iris_unicode_test {
+
+namespace unicode = iris::unicode;
+
+template<class T, class... Chars>
+constexpr std::array<T, sizeof...(Chars)> to_array_cast(Chars... cs)
+{
+    // Cast every argument to T and pack the results, in order, into an array.
+    using result_type = std::array<T, sizeof...(Chars)>;
+    return result_type{static_cast<T>(cs)...};
+}
+
+TEST_CASE("append")
+{
+    constexpr auto do_test = []<class T>() { // T is the 8-bit element type of the destination array
+        STATIC_CHECK([] {
+            std::array<T, 5> u{}; // zero-filled, so untouched elements stay 0 in the expected value
+            unicode::append(0x0448U, u);
+            return u;
+        }() == to_array_cast<T>(0xd1, 0x88, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x65e5U, u);
+            return u;
+        }() == to_array_cast<T>(0xe6, 0x97, 0xa5, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x3044U, u);
+            return u;
+        }() == to_array_cast<T>(0xe3, 0x81, 0x84, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x10346U, u);
+            return u;
+        }() == to_array_cast<T>(0xf0, 0x90, 0x8d, 0x86, 0));
+    };
+
+    // Run the same expectations for every supported octet-sized element type.
+    do_test.operator()<char8_t>();
+    do_test.operator()<char>();
+    do_test.operator()<unsigned char>();
+    do_test.operator()<std::int8_t>();
+    do_test.operator()<std::uint8_t>();
+}
+
+TEST_CASE("append16")
+{
+    constexpr auto do_test = []<class T>() { // T is the 16-bit element type of the destination array
+        STATIC_CHECK([] {
+            std::array<T, 5> u{}; // zero-filled, so untouched elements stay 0 in the expected value
+            unicode::append(0x0448U, u);
+            return u;
+        }() == to_array_cast<T>(0x0448, 0, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x65e5U, u);
+            return u;
+        }() == to_array_cast<T>(0x65e5, 0, 0, 0, 0));
+
+        STATIC_CHECK([] {
+            std::array<T, 5> u{};
+            unicode::append(0x10346U, u); // above 0xffff: expected as the two units 0xd800, 0xdf46
+            return u;
+        }() == to_array_cast<T>(0xd800, 0xdf46, 0, 0, 0));
+    };
+
+    do_test.operator()<char16_t>();
+    do_test.operator()<std::int16_t>();
+    do_test.operator()<std::uint16_t>();
+}
+
+TEST_CASE("next")
+{
+    char const* twochars = "\xe6\x97\xa5\xd1\x88"; // 3 octets for U+65E5 followed by 2 octets for U+0448
+    char const* w = twochars;
+    unsigned int cp = unicode::next(w, twochars + 6); // + 6 is one past the literal's terminating NUL
+
+    CHECK(cp == 0x65e5);
+    CHECK(w == twochars + 3); // the iterator was moved past the decoded sequence
+
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4 + 3 + 2 octets
+    w = threechars;
+
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x10346);
+    CHECK(w == threechars + 4);
+
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x65e5);
+    CHECK(w == threechars + 7);
+
+    cp = unicode::next(w, threechars + 9);
+    CHECK(cp == 0x0448);
+    CHECK(w == threechars + 9);
+}
+
+TEST_CASE("next16")
+{
+    char16_t const u[3] = {0x65e5, 0xd800, 0xdf46}; // one single-unit code point, then a two-unit surrogate pair
+    char16_t const* w = u;
+    char32_t cp = unicode::next16(w, w + 3);
+    CHECK(cp == 0x65e5);
+    CHECK(w == u + 1);
+
+    cp = unicode::next16(w, w + 2); // w is u + 1 here, so the bound is still u + 3
+    CHECK(cp == 0x10346);
+    CHECK(w == u + 3);
+}
+
+TEST_CASE("peek_next")
+{
+    char const* const cw = "\xe6\x97\xa5\xd1\x88"; // const pointer: peek_next takes its iterator by value
+    unsigned int cp = unicode::peek_next(cw, cw + 6);
+    CHECK(cp == 0x65e5);
+}
+
+TEST_CASE("prev")
+{
+    char const* twochars = "\xe6\x97\xa5\xd1\x88";
+    char const* w = twochars + 3; // just past the first (3-octet) code point
+    unsigned int cp = unicode::prev(w, twochars);
+    CHECK(cp == 0x65e5);
+    CHECK(w == twochars); // the iterator was moved back to the start of the decoded sequence
+
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4 + 3 + 2 octets
+    w = threechars + 9;
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x0448);
+    CHECK(w == threechars + 7);
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x65e5);
+    CHECK(w == threechars + 4);
+    cp = unicode::prev(w, threechars);
+    CHECK(cp == 0x10346);
+    CHECK(w == threechars);
+}
+
+TEST_CASE("advance")
+{
+    char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4 + 3 + 2 octets
+    char const* w = threechars;
+    unicode::advance(w, 2, threechars + 9); // forward: the third argument is the end of the range
+    CHECK(w == threechars + 7);
+    unicode::advance(w, -2, threechars); // backward: the third argument is the start of the range
+    CHECK(w == threechars);
+    unicode::advance(w, 3, threechars + 9);
+    CHECK(w == threechars + 9);
+    unicode::advance(w, -2, threechars);
+    CHECK(w == threechars + 4);
+    unicode::advance(w, -1, threechars);
+    CHECK(w == threechars);
+}
+
+TEST_CASE("distance")
+{
+    constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88"; // 3-octet code point followed by a 2-octet one
+    std::size_t const dist = static_cast<std::size_t>(unicode::distance(twochars, twochars + 5)); // std::-qualified like the rest of this file
+    CHECK(dist == 2); // counted in code points, not octets
+}
+
+TEST_CASE("is_valid")
+{
+    constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; // two well-formed sequences followed by a stray 0xfa octet
+    constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+
+    {
+        CHECK(!unicode::is_valid(utf_invalid));
+        CHECK(!unicode::is_valid(utf_invalid, utf_invalid + 6));
+    }
+    {
+        CHECK(unicode::is_valid(utf8_with_surrogates));
+        CHECK(unicode::is_valid(utf8_with_surrogates, utf8_with_surrogates + 9));
+    }
+    {
+        std::u8string const utf_invalid_u8(reinterpret_cast<char8_t const*>(utf_invalid));
+        CHECK(!unicode::is_valid(utf_invalid_u8));
+    }
+    {
+        std::u8string const utf8_with_surrogates_u8(reinterpret_cast<char8_t const*>(utf8_with_surrogates));
+        CHECK(unicode::is_valid(utf8_with_surrogates_u8)); // check the u8string built above, not the char array again
+    }
+
+    {
+        constexpr char const* twochars = "ab";
+        CHECK(unicode::is_valid(twochars));
+
+        std::string const two_chars_string(twochars);
+        CHECK(unicode::is_valid(two_chars_string));
+    }
+}
+
+TEST_CASE("find_invalid")
+{
+    constexpr char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; // the octet at index 5 (0xfa) is the one expected to be reported
+    {
+        char const* invalid = unicode::find_invalid(utf_invalid, utf_invalid + 6); // iterator form: result is a pointer
+        CHECK(invalid == utf_invalid + 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(utf_invalid); // whole-string forms: result is an index
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(std::string{utf_invalid});
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::size_t const invalid_pos = unicode::find_invalid(std::string_view{utf_invalid});
+        CHECK(invalid_pos == 5);
+    }
+    {
+        std::u8string const utf_invalid_u8(reinterpret_cast<char8_t const*>(utf_invalid));
+        std::size_t const invalid_pos = unicode::find_invalid(utf_invalid_u8);
+        CHECK(invalid_pos == 5);
+    }
+}
+
+TEST_CASE("replace_invalid (vector)")
+{
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    std::vector<char> replace_invalid_result;
+
+    unicode::replace_invalid(invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); // sizeof: the terminating NUL is part of the input
+    CHECK(unicode::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()));
+
+    char const fixed_invalid_sequence[] = "a????z"; // compared including its terminating NUL
+    CHECK(sizeof(fixed_invalid_sequence) == replace_invalid_result.size());
+    CHECK(std::ranges::equal(replace_invalid_result, fixed_invalid_sequence)); // length-aware: never reads past the vector if the size check above failed
+}
+
+TEST_CASE("replace_invalid (string)")
+{
+    std::string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    std::string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, '?'); // '?' is the replacement seen in the expected text below
+    CHECK(unicode::is_valid(replace_invalid_result));
+
+    std::string const fixed_invalid_sequence = "a????z";
+    CHECK(fixed_invalid_sequence == replace_invalid_result);
+}
+
+TEST_CASE("replace_invalid (u8string)")
+{
+    std::u8string const invalid_sequence(reinterpret_cast<char8_t const*>("a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z")); // same octets as the std::string variant of this test
+    std::u8string const replace_invalid_result = unicode::replace_invalid(invalid_sequence, u8'?');
+
+    CHECK(unicode::is_valid(replace_invalid_result));
+    std::u8string const fixed_invalid_sequence(reinterpret_cast<char8_t const*>("a????z"));
+    CHECK(fixed_invalid_sequence == replace_invalid_result);
+}
+
+TEST_CASE("starts_with_bom")
+{
+    // unicode::bom<T> itself must be recognised for every octet-sized element type.
+    CHECK(unicode::starts_with_bom(unicode::bom<char>));
+    CHECK(unicode::starts_with_bom(unicode::bom<unsigned char>));
+    CHECK(unicode::starts_with_bom(unicode::bom<char8_t>));
+    CHECK(unicode::starts_with_bom(unicode::bom<std::int8_t>));
+    CHECK(unicode::starts_with_bom(unicode::bom<std::uint8_t>));
+
+    constexpr char threechars[] = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // negative case, tried through several argument types
+    CHECK(!unicode::starts_with_bom(threechars));
+    CHECK(!unicode::starts_with_bom(std::string{threechars}));
+    CHECK(!unicode::starts_with_bom(std::string_view{threechars}));
+    CHECK(!unicode::starts_with_bom(std::u8string{reinterpret_cast<char8_t const*>(threechars)}));
+}
+
+TEST_CASE("increment")
+{
+    constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4 + 3 + 2 octets
+    unicode::code_point_iterator<char const*> it(threechars, threechars, threechars + 9); // (position, range start, range end)
+    unicode::code_point_iterator<char const*> it2 = it;
+    CHECK(it2 == it);
+    CHECK(*it == 0x10346);
+    CHECK(*++it == 0x65e5);
+    CHECK(*it++ == 0x65e5); // post-increment yields the iterator as it was before moving
+    CHECK(*it == 0x0448);
+    CHECK(it != it2);
+    unicode::code_point_iterator<char const*> endit(threechars + 9, threechars, threechars + 9);
+    CHECK(++it == endit);
+}
+
+TEST_CASE("decrement")
+{
+    constexpr char const* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; // 4 + 3 + 2 octets
+    unicode::code_point_iterator<char const*> it(threechars + 9, threechars, threechars + 9); // starts at the end of the range
+    CHECK(*--it == 0x0448);
+    CHECK(*it-- == 0x0448); // post-decrement yields the iterator as it was before moving
+    CHECK(*it == 0x65e5);
+    CHECK(--it == unicode::code_point_iterator<char const*>(threechars, threechars, threechars + 9));
+    CHECK(*it == 0x10346);
+}
+
+// -----------------------------------
+
+TEST_CASE("utf8to16")
+{
+    // Each sub-block feeds the same 9 octets through a different overload and expects 4 UTF-16 units.
+    {
+        constexpr char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::vector<char16_t> utf16result;
+        unicode::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834); // the last input sequence becomes a two-unit pair
+        CHECK(utf16result[3] == 0xdd1e);
+    }
+    {
+        std::string const utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+        // Just to make sure it compiles with string literals
+        CHECK(unicode::utf8to16(u8"simple") == u"simple");
+        CHECK(unicode::utf8to16("simple") == u"simple");
+    }
+    {
+        constexpr std::string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+    }
+    {
+        std::u8string const utf8_with_surrogates{reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e")};
+        std::u16string const utf16result = unicode::utf8to16(utf8_with_surrogates);
+        CHECK(utf16result.size() == 4);
+        CHECK(utf16result[2] == 0xd834);
+        CHECK(utf16result[3] == 0xdd1e);
+    }
+}
+
+TEST_CASE("utf16to8")
+{
+    // The same five UTF-16 units are converted through each overload; only the resulting length (10) is checked.
+    {
+        constexpr char16_t utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::string utf8result;
+        unicode::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+        CHECK(utf8result.size() == 10);
+    }
+    {
+        std::u16string const utf16string{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::string const u = unicode::utf16to8(utf16string);
+        CHECK(u.size() == 10);
+    }
+    {
+        std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::u16string_view const utf16stringview(utf16string);
+        std::string const u = unicode::utf16to8(utf16stringview);
+        CHECK(u.size() == 10);
+    }
+    {
+        std::u16string const utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+        std::u16string_view const utf16stringview{utf16string};
+        {
+            std::u8string const u = unicode::utf16tou8(utf16string); // char8_t-returning variant
+            CHECK(u.size() == 10);
+        }
+        {
+            std::u8string const u = unicode::utf16tou8(utf16stringview);
+            CHECK(u.size() == 10);
+        }
+    }
+}
+
+// -----------------------------------------
+
+TEST_CASE("utf8to32")
+{
+    // Five octets holding two code points; each overload must produce exactly two elements.
+    {
+        constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88";
+        std::vector<unsigned int> utf32result; // the iterator overload accepts a non-char32_t output container here
+        unicode::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        constexpr char const* twochars = "\xe6\x97\xa5\xd1\x88";
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        constexpr std::string_view twochars = "\xe6\x97\xa5\xd1\x88";
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
+    {
+        std::u8string const twochars{reinterpret_cast<char8_t const*>("\xe6\x97\xa5\xd1\x88")};
+        std::u32string const utf32result = unicode::utf8to32(twochars);
+        CHECK(utf32result.size() == 2);
+    }
+}
+
+TEST_CASE("utf32to8")
+{
+    // Three code points are converted through each overload; only the resulting length (9) is checked.
+    {
+        constexpr char32_t utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+        std::string utf8result;
+        unicode::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); // + 3 leaves the trailing 0 element out
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::string const utf8result = unicode::utf32to8(utf32string);
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::u32string_view const utf32stringview(utf32string);
+        std::string const utf8result = unicode::utf32to8(utf32stringview);
+        CHECK(utf8result.size() == 9);
+    }
+    {
+        std::u32string const utf32string = {0x448, 0x65E5, 0x10346};
+        std::u32string_view const utf32stringview{utf32string};
+        std::u8string const utf8result = unicode::utf32tou8(utf32stringview); // char8_t-returning variant
+        CHECK(utf8result.size() == 9);
+    }
+}
+
+TEST_CASE("transcode")
+{
+    STATIC_CHECK(unicode::transcode<char>("aこれはb試験ですc")       == "aこれはb試験ですc"); // rows: target type; columns: source literal kind
+    STATIC_CHECK(unicode::transcode<char>(u8"aこれはb試験ですc")     == "aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char>(u"aこれはb試験ですc")      == "aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char>(U"aこれはb試験ですc")      == "aこれはb試験ですc");
+
+    STATIC_CHECK(unicode::transcode<char8_t>("aこれはb試験ですc")    == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(u8"aこれはb試験ですc")  == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(u"aこれはb試験ですc")   == u8"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char8_t>(U"aこれはb試験ですc")   == u8"aこれはb試験ですc");
+
+    STATIC_CHECK(unicode::transcode<char16_t>("aこれはb試験ですc")   == u"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char16_t>(u8"aこれはb試験ですc") == u"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char16_t>(u"aこれはb試験ですc")  == u"aこれはb試験ですc");
+    //STATIC_CHECK(unicode::transcode<char16_t>(U"aこれはb試験ですc")  == u"aこれはb試験ですc"); // disabled: UTF-32 -> UTF-16 is static_assert(false, "not implemented")
+
+    STATIC_CHECK(unicode::transcode<char32_t>("aこれはb試験ですc")   == U"aこれはb試験ですc");
+    STATIC_CHECK(unicode::transcode<char32_t>(u8"aこれはb試験ですc") == U"aこれはb試験ですc");
+    //STATIC_CHECK(unicode::transcode<char32_t>(u"aこれはb試験ですc")  == U"aこれはb試験ですc"); // disabled: UTF-16 -> UTF-32 is static_assert(false, "not implemented")
+    STATIC_CHECK(unicode::transcode<char32_t>(U"aこれはb試験ですc")  == U"aこれはb試験ですc");
+}
+
+} // iris_unicode_test
diff --git a/test/unicode/string/test_data/utf8_invalid.txt b/test/unicode/string/test_data/utf8_invalid.txt
new file mode 100644
index 0000000..ae83159
Binary files /dev/null and b/test/unicode/string/test_data/utf8_invalid.txt differ
diff --git a/test/unicode/string/utf8_invalid.cpp b/test/unicode/string/utf8_invalid.cpp
new file mode 100644
index 0000000..b7f46b7
--- /dev/null
+++ b/test/unicode/string/utf8_invalid.cpp
@@ -0,0 +1,77 @@
+// TODO: we need secure "getenv" in iris library
+#define _CRT_SECURE_NO_WARNINGS 1
+
+#include "iris_test.hpp"
+
+#include <iris/unicode/string.hpp>
+#include <iris/exception.hpp>
+
+#include <stdexcept>
+#include <string>
+#include <fstream>
+#include <filesystem>
+#include <algorithm>
+#include <array>
+
+namespace iris_unicode_test {
+
+// 1-based line numbers of test_data/utf8_invalid.txt that the "utf8_invalid" test expects to be invalid UTF-8.
+constexpr auto INVALID_LINES = std::to_array<unsigned>({
+    75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110,
+    114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156,
+    157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210,
+    211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249,
+    250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264,
+});
+
+// Validates test_data/utf8_invalid.txt line by line: lines listed in INVALID_LINES must be
+// rejected by is_valid() and be repairable by replace_invalid(); all other lines must be accepted.
+TEST_CASE("utf8_invalid")
+{
+    namespace unicode = iris::unicode;
+    using iris::throwf;
+
+    // The data file is located relative to the IRIS_ROOT environment variable.
+    std::filesystem::path const IRIS_ROOT = [] {
+        char const* IRIS_ROOT_str = std::getenv("IRIS_ROOT");
+        if (!IRIS_ROOT_str) throwf<std::invalid_argument>("IRIS_ROOT is not defined");
+        return std::filesystem::path(IRIS_ROOT_str);
+    }();
+
+    auto const test_file_path = IRIS_ROOT / "test" / "unicode" / "string" / "test_data" / "utf8_invalid.txt";
+    std::ifstream fs8(test_file_path);
+    if (!fs8) {
+        throwf<std::invalid_argument>("could not open \"{}\"", test_file_path.string());
+    }
+
+    // Read it line by line. The loop is driven by the result of getline() rather than by
+    // eof(): a read error does not necessarily set eofbit, so a loop that only tests eof()
+    // might never terminate.
+    unsigned line_count = 0;
+    for (std::string line; std::getline(fs8, line);) {
+        ++line_count;
+        bool const expected_valid = std::ranges::find(INVALID_LINES, line_count) == INVALID_LINES.end();
+
+        // Both an unexpectedly invalid line and an undetected invalid line are failures.
+        if (!unicode::is_valid(line.begin(), line.end())) {
+            if (expected_valid) {
+                throwf<std::runtime_error>("unexpected invalid utf-8 at line {}", line_count);
+            }
+
+            // try fixing it: the repaired line must itself validate
+            std::string fixed_line;
+            unicode::replace_invalid(line.begin(), line.end(), std::back_inserter(fixed_line));
+            if (!unicode::is_valid(fixed_line.begin(), fixed_line.end())) {
+                throwf<std::runtime_error>("replace_invalid() resulted in an invalid utf-8 at line {}", line_count);
+            }
+        } else if (!expected_valid) {
+            throwf<std::runtime_error>("invalid utf-8 NOT detected at line {}", line_count);
+        }
+    }
+    if (fs8.bad()) throwf<std::runtime_error>("read error in \"{}\"", test_file_path.string());
+
+    // The data must reach the last expected-invalid line; otherwise a truncated or empty
+    // file would let the test pass without checking anything.
+    CHECK(line_count >= INVALID_LINES.back());
+}
+
+} // iris_unicode_test