|
std::size_t |
|
remove_dot_segments( |
|
char* dest0, |
|
char const* end, |
|
core::string_view input) noexcept |
|
{ |
|
// 1. The input buffer `s` is initialized with |
|
// the now-appended path components and the |
|
// output buffer `dest0` is initialized to |
|
// the empty string. |
|
char* dest = dest0; |
|
bool const is_absolute = input.starts_with('/'); |
|
|
|
// Step 2 is a loop through 5 production rules: |
|
// https://www.rfc-editor.org/rfc/rfc3986#section-5.2.4 |
|
// |
|
// There are no transitions between all rules, |
|
// which enables some optimizations. |
|
// |
|
// Initial: |
|
// - Rule A: handle initial dots |
|
// If the input buffer begins with a |
|
// prefix of "../" or "./", then remove |
|
// that prefix from the input buffer. |
|
// Rule A can only happen at the beginning. |
|
// Errata 4547: Keep "../" in the beginning |
|
// https://www.rfc-editor.org/errata/eid4547 |
|
// |
|
// Then: |
|
// - Rule D: ignore a final ".." or "." |
|
// if the input buffer consists only of "." |
|
// or "..", then remove that from the input |
|
// buffer. |
|
// Rule D can only happen after Rule A because: |
|
// - B and C write "/" to the input |
|
// - E writes "/" to input or returns |
|
// |
|
// Then: |
|
// - Rule B: ignore ".": write "/" to the input |
|
// - Rule C: apply "..": remove seg and write "/" |
|
// - Rule E: copy complete segment |
|
auto append = |
|
[](char*& first, char const* last, core::string_view in) |
|
{ |
|
// append `in` to `dest` |
|
BOOST_ASSERT(in.size() <= std::size_t(last - first)); |
|
std::memmove(first, in.data(), in.size()); |
|
first += in.size(); |
|
ignore_unused(last); |
|
}; |
|
|
|
auto dot_starts_with = []( |
|
core::string_view str, core::string_view dots, std::size_t& n) |
|
{ |
|
// starts_with for encoded/decoded dots |
|
// or decoded otherwise. return how many |
|
// chars in str match the dots |
|
n = 0; |
|
for (char c: dots) |
|
{ |
|
if (str.starts_with(c)) |
|
{ |
|
str.remove_prefix(1); |
|
++n; |
|
continue; |
|
} |
|
|
|
// In the general case, we would need to |
|
// check if the next char is an encoded |
|
// dot. |
|
// However, an encoded dot in `str` |
|
// would have already been decoded in |
|
// url_base::normalize_path(). |
|
// This needs to be undone if |
|
// `remove_dot_segments` is used in a |
|
// different context. |
|
// if (str.size() > 2 && |
|
// c == '.' |
|
// && |
|
// str[0] == '%' && |
|
// str[1] == '2' && |
|
// (str[2] == 'e' || |
|
// str[2] == 'E')) |
|
// { |
|
// str.remove_prefix(3); |
|
// n += 3; |
|
// continue; |
|
// } |
|
|
|
n = 0; |
|
return false; |
|
} |
|
return true; |
|
}; |
|
|
|
auto dot_equal = [&dot_starts_with]( |
|
core::string_view str, core::string_view dots) |
|
{ |
|
std::size_t n = 0; |
|
dot_starts_with(str, dots, n); |
|
return n == str.size(); |
|
}; |
|
|
|
// Rule A |
|
std::size_t n; |
|
while (!input.empty()) |
|
{ |
|
if (dot_starts_with(input, "../", n)) |
|
{ |
|
// Errata 4547 |
|
append(dest, end, "../"); |
|
input.remove_prefix(n); |
|
continue; |
|
} |
|
else if (!dot_starts_with(input, "./", n)) |
|
{ |
|
break; |
|
} |
|
input.remove_prefix(n); |
|
} |
|
|
|
// Rule D |
|
if( dot_equal(input, ".")) |
|
{ |
|
input = {}; |
|
} |
|
else if( dot_equal(input, "..") ) |
|
{ |
|
// Errata 4547 |
|
append(dest, end, ".."); |
|
input = {}; |
|
} |
|
|
|
// 2. While the input buffer is not empty, |
|
// loop as follows: |
|
while (!input.empty()) |
|
{ |
|
// Rule B |
|
bool const is_dot_seg = dot_starts_with(input, "/./", n); |
|
if (is_dot_seg) |
|
{ |
|
input.remove_prefix(n - 1); |
|
continue; |
|
} |
|
|
|
bool const is_final_dot_seg = dot_equal(input, "/."); |
|
if (is_final_dot_seg) |
|
{ |
|
// We can't remove "." from a core::string_view |
|
// So what we do here is equivalent to |
|
// replacing s with '/' as required |
|
// in Rule B and executing the next |
|
// iteration, which would append this |
|
// '/' to the output, as required by |
|
// Rule E |
|
append(dest, end, input.substr(0, 1)); |
|
input = {}; |
|
break; |
|
} |
|
|
|
// Rule C |
|
bool const is_dotdot_seg = dot_starts_with(input, "/../", n); |
|
if (is_dotdot_seg) |
|
{ |
|
core::string_view cur_out(dest0, dest - dest0); |
|
std::size_t p = cur_out.find_last_of('/'); |
|
bool const has_multiple_segs = p != core::string_view::npos; |
|
if (has_multiple_segs) |
|
{ |
|
// output has multiple segments |
|
// "erase" [p, end] if not "/.." |
|
core::string_view last_seg(dest0 + p, dest - (dest0 + p)); |
|
bool const prev_is_dotdot_seg = dot_equal(last_seg, "/.."); |
|
if (!prev_is_dotdot_seg) |
|
{ |
|
dest = dest0 + p; |
|
} |
|
else |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
} |
|
else if (dest0 != dest) |
|
{ |
|
// Only one segment in the output: remove it |
|
core::string_view last_seg(dest0, dest - dest0); |
|
bool const prev_is_dotdot_seg = dot_equal(last_seg, ".."); |
|
if (!prev_is_dotdot_seg) |
|
{ |
|
dest = dest0; |
|
if (!is_absolute) |
|
{ |
|
input.remove_prefix(1); |
|
} |
|
} |
|
else |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
} |
|
else |
|
{ |
|
// Output is empty |
|
if (is_absolute) |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
else |
|
{ |
|
// AFREITAS: Although we have no formal proof |
|
// for that, the output can't be relative |
|
// and empty at this point because relative |
|
// paths will fall in the `dest0 != dest` |
|
// case above of this rule C and then the |
|
// general case of rule E for "..". |
|
append(dest, end, ".."); |
|
} |
|
} |
|
input.remove_prefix(n - 1); |
|
continue; |
|
} |
|
|
|
bool const is_final_dotdot_seg = dot_equal(input, "/.."); |
|
if (is_final_dotdot_seg) |
|
{ |
|
core::string_view cur_out(dest0, dest - dest0); |
|
std::size_t p = cur_out.find_last_of('/'); |
|
bool const has_multiple_segs = p != core::string_view::npos; |
|
if (has_multiple_segs) |
|
{ |
|
// output has multiple segments |
|
// "erase" [p, end] if not "/.." |
|
core::string_view last_seg(dest0 + p, dest - (dest0 + p)); |
|
bool const prev_is_dotdot_seg = dot_equal(last_seg, "/.."); |
|
if (!prev_is_dotdot_seg) |
|
{ |
|
dest = dest0 + p; |
|
append(dest, end, "/"); |
|
} |
|
else |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
} |
|
else if (dest0 != dest) |
|
{ |
|
// Only one segment in the output: remove it |
|
core::string_view last_seg(dest0, dest - dest0); |
|
bool const prev_is_dotdot_seg = dot_equal(last_seg, ".."); |
|
if (!prev_is_dotdot_seg) { |
|
dest = dest0; |
|
} |
|
else |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
} |
|
else |
|
{ |
|
// Output is empty: append dotdot |
|
if (is_absolute) |
|
{ |
|
append(dest, end, "/.."); |
|
} |
|
else |
|
{ |
|
// AFREITAS: Although we have no formal proof |
|
// for that, the output can't be relative |
|
// and empty at this point because relative |
|
// paths will fall in the `dest0 != dest` |
|
// case above of this rule C and then the |
|
// general case of rule E for "..". |
|
append(dest, end, ".."); |
|
} |
|
} |
|
input = {}; |
|
break; |
|
} |
|
|
|
// Rule E |
|
std::size_t p = input.find_first_of('/', 1); |
|
if (p != core::string_view::npos) |
|
{ |
|
append(dest, end, input.substr(0, p)); |
|
input.remove_prefix(p); |
|
} |
|
else |
|
{ |
|
append(dest, end, input); |
|
input = {}; |
|
} |
|
} |
|
|
|
// 3. Finally, the output buffer is set |
|
// as the result of remove_dot_segments, |
|
// and we return its size |
|
return dest - dest0; |
|
} |
Hi,
I was wondering if I could use Boost.URL to implement resolving of relative IRIs as mandated in the SPARQL standard: https://www.w3.org/TR/2013/REC-sparql11-query-20130321/#relIRIs
Basically, it refers to section 5.2 of RFC 3986, but explicitly states that "Neither Syntax-Based Normalization nor Scheme-Based Normalization (described in sections 6.2.2 and 6.2.3 of RFC3986)" should be performed. Boost.URL, however, performs this normalization without any way to turn it off.
So that leads me to two questions:
normalize_path with calls to url/src/detail/normalize.cpp
Lines 316 to 613 in 3f8a428
Thank you.