Skip to content

Commit 698ed92

Browse files
committed
feature: Buffer.lastIndexOf
* Remove unnecessary templating from SearchString SearchString used to have separate PatternChar and SubjectChar template type arguments, apparently to support things like searching for an 8-bit string inside a 16-bit string or vice versa. However, SearchString is only used from node_buffer.cc, where PatternChar and SubjectChar are always the same. Since this is extra complexity that's unused and untested (simplifying to a single Char template argument still compiles and didn't break any unit tests), I removed it. * Use Boyer-Hoore[-Horspool] for both indexOf and lastIndexOf Add test cases for lastIndexOf. Test the fallback from BMH to Boyer-Moore, which looks like it was totally untested before. * Extra bounds checks in node_buffer.cc * Extra asserts in string_search.h * Buffer.lastIndexOf: clean up, enforce consistency w/ String.lastIndexOf * Polyfill memrchr(3) for non-GNU systems
1 parent 6b6f2c1 commit 698ed92

4 files changed

Lines changed: 431 additions & 252 deletions

File tree

lib/buffer.js

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,48 @@ Buffer.prototype.compare = function compare(target,
598598
return binding.compareOffset(this, target, start, thisStart, end, thisEnd);
599599
};
600600

601-
function slowIndexOf(buffer, val, byteOffset, encoding) {
601+
602+
// Finds either the first index of `val` in `buffer` at offset >= `byteOffset`,
603+
// OR the last index of `val` in `buffer` at offset <= `byteOffset`.
604+
//
605+
// Arguments:
606+
// - buffer - a Buffer to search
607+
// - val - a string, Buffer, or number
608+
// - byteOffset - an index into `buffer`; will be clamped to an int32
609+
// - encoding - an optional encoding, relevant is val is a string
610+
// - dir - true for indexOf, false for lastIndexOf
611+
function bidirectionalIndexOf(buffer, val, byteOffset, encoding, dir) {
612+
if (typeof byteOffset === 'string') {
613+
encoding = byteOffset;
614+
byteOffset = undefined;
615+
} else if (byteOffset > 0x7fffffff) {
616+
byteOffset = 0x7fffffff;
617+
} else if (byteOffset < -0x80000000) {
618+
byteOffset = -0x80000000;
619+
}
620+
byteOffset = +byteOffset; // Coerce to Number.
621+
if (isNaN(byteOffset)) {
622+
// If the offset is undefined, null, NaN, "foo", etc, search whole buffer.
623+
byteOffset = dir ? 0 : (buffer.length - 1);
624+
}
625+
dir = !!dir; // Cast to bool.
626+
627+
if (typeof val === 'string') {
628+
if (encoding === undefined) {
629+
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);
630+
}
631+
return slowIndexOf(buffer, val, byteOffset, encoding, dir);
632+
} else if (val instanceof Buffer) {
633+
return binding.indexOfBuffer(buffer, val, byteOffset, encoding, dir);
634+
} else if (typeof val === 'number') {
635+
return binding.indexOfNumber(buffer, val, byteOffset, dir);
636+
}
637+
638+
throw new TypeError('"val" argument must be string, number or Buffer');
639+
}
640+
641+
642+
function slowIndexOf(buffer, val, byteOffset, encoding, dir) {
602643
var loweredCase = false;
603644
for (;;) {
604645
switch (encoding) {
@@ -609,13 +650,13 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
609650
case 'utf16le':
610651
case 'utf-16le':
611652
case 'binary':
612-
return binding.indexOfString(buffer, val, byteOffset, encoding);
653+
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);
613654

614655
case 'base64':
615656
case 'ascii':
616657
case 'hex':
617658
return binding.indexOfBuffer(
618-
buffer, Buffer.from(val, encoding), byteOffset, encoding);
659+
buffer, Buffer.from(val, encoding), byteOffset, encoding, dir);
619660

620661
default:
621662
if (loweredCase) {
@@ -628,29 +669,14 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
628669
}
629670
}
630671

672+
631673
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
632-
if (typeof byteOffset === 'string') {
633-
encoding = byteOffset;
634-
byteOffset = 0;
635-
} else if (byteOffset > 0x7fffffff) {
636-
byteOffset = 0x7fffffff;
637-
} else if (byteOffset < -0x80000000) {
638-
byteOffset = -0x80000000;
639-
}
640-
byteOffset >>= 0;
674+
return bidirectionalIndexOf(this, val, byteOffset, encoding, true);
675+
};
641676

642-
if (typeof val === 'string') {
643-
if (encoding === undefined) {
644-
return binding.indexOfString(this, val, byteOffset, encoding);
645-
}
646-
return slowIndexOf(this, val, byteOffset, encoding);
647-
} else if (val instanceof Buffer) {
648-
return binding.indexOfBuffer(this, val, byteOffset, encoding);
649-
} else if (typeof val === 'number') {
650-
return binding.indexOfNumber(this, val, byteOffset);
651-
}
652677

653-
throw new TypeError('"val" argument must be string, number or Buffer');
678+
Buffer.prototype.lastIndexOf = function lastIndexOf(val, byteOffset, encoding) {
679+
return bidirectionalIndexOf(this, val, byteOffset, encoding, false);
654680
};
655681

656682

src/node_buffer.cc

Lines changed: 80 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -944,9 +944,44 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
944944
}
945945

946946

947+
// Computes the offset for starting an indexOf or lastIndexOf search.
948+
// Returns either a valid offset in [0...<length - 1>], ie inside the Buffer,
949+
// or -1 to signal that there is no possible match.
950+
int64_t IndexOfOffset(size_t length, int64_t offset_i64, bool is_forward) {
951+
int64_t length_i64 = static_cast<int64_t>(length);
952+
if (length_i64 == 0) {
953+
// Empty buffer, no match.
954+
return -1;
955+
}
956+
if (offset_i64 < 0) {
957+
if (offset_i64 + length_i64 >= 0) {
958+
// Negative offsets count backwards from the end of the buffer.
959+
return length_i64 + offset_i64;
960+
} else if (is_forward) {
961+
// indexOf from before the start of the buffer: search the whole buffer.
962+
return 0;
963+
} else {
964+
// lastIndexOf from before the start of the buffer: no match.
965+
return -1;
966+
}
967+
} else {
968+
if (offset_i64 < length_i64) {
969+
// Valid positive offset.
970+
return offset_i64;
971+
} else if (is_forward) {
972+
// indexOf from past the end of the buffer: no match.
973+
return -1;
974+
} else {
975+
// lastIndexOf from past the end of the buffer: search the whole buffer.
976+
return length_i64 - 1;
977+
}
978+
}
979+
}
980+
947981
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
948982
ASSERT(args[1]->IsString());
949983
ASSERT(args[2]->IsNumber());
984+
ASSERT(args[4]->IsBoolean());
950985

951986
enum encoding enc = ParseEncoding(args.GetIsolate(),
952987
args[3],
@@ -956,31 +991,26 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
956991
SPREAD_ARG(args[0], ts_obj);
957992

958993
Local<String> needle = args[1].As<String>();
994+
int64_t offset_i64 = args[2]->IntegerValue();
995+
bool is_forward = args[4]->IsTrue();
996+
959997
const char* haystack = ts_obj_data;
960998
const size_t haystack_length = ts_obj_length;
961999
// Extended latin-1 characters are 2 bytes in Utf8.
9621000
const size_t needle_length =
9631001
enc == BINARY ? needle->Length() : needle->Utf8Length();
9641002

965-
9661003
if (needle_length == 0 || haystack_length == 0) {
9671004
return args.GetReturnValue().Set(-1);
9681005
}
9691006

970-
int64_t offset_i64 = args[2]->IntegerValue();
971-
size_t offset = 0;
972-
973-
if (offset_i64 < 0) {
974-
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
975-
offset = 0;
976-
} else {
977-
offset = static_cast<size_t>(haystack_length + offset_i64);
978-
}
979-
} else {
980-
offset = static_cast<size_t>(offset_i64);
1007+
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
1008+
if (opt_offset <= -1) {
1009+
return args.GetReturnValue().Set(-1);
9811010
}
982-
983-
if (haystack_length < offset || needle_length + offset > haystack_length) {
1011+
size_t offset = static_cast<size_t>(opt_offset);
1012+
CHECK_LT(offset, haystack_length);
1013+
if (is_forward && needle_length + offset > haystack_length) {
9841014
return args.GetReturnValue().Set(-1);
9851015
}
9861016

@@ -1008,13 +1038,15 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10081038
haystack_length / 2,
10091039
decoded_string,
10101040
decoder.size() / 2,
1011-
offset / 2);
1041+
offset / 2,
1042+
is_forward);
10121043
} else {
10131044
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
10141045
haystack_length / 2,
10151046
reinterpret_cast<const uint16_t*>(*needle_value),
10161047
needle_value.length(),
1017-
offset / 2);
1048+
offset / 2,
1049+
is_forward);
10181050
}
10191051
result *= 2;
10201052
} else if (enc == UTF8) {
@@ -1026,7 +1058,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10261058
haystack_length,
10271059
reinterpret_cast<const uint8_t*>(*needle_value),
10281060
needle_length,
1029-
offset);
1061+
offset,
1062+
is_forward);
10301063
} else if (enc == BINARY) {
10311064
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
10321065
if (needle_data == nullptr) {
@@ -1039,7 +1072,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10391072
haystack_length,
10401073
needle_data,
10411074
needle_length,
1042-
offset);
1075+
offset,
1076+
is_forward);
10431077
free(needle_data);
10441078
}
10451079

@@ -1050,17 +1084,18 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10501084
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10511085
ASSERT(args[1]->IsObject());
10521086
ASSERT(args[2]->IsNumber());
1087+
ASSERT(args[4]->IsBoolean());
10531088

10541089
enum encoding enc = ParseEncoding(args.GetIsolate(),
10551090
args[3],
10561091
UTF8);
10571092

10581093
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
1094+
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[1]);
10591095
SPREAD_ARG(args[0], ts_obj);
10601096
SPREAD_ARG(args[1], buf);
1061-
1062-
if (buf_length > 0)
1063-
CHECK_NE(buf_data, nullptr);
1097+
int64_t offset_i64 = args[2]->IntegerValue();
1098+
bool is_forward = args[4]->IsTrue();
10641099

10651100
const char* haystack = ts_obj_data;
10661101
const size_t haystack_length = ts_obj_length;
@@ -1071,19 +1106,13 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10711106
return args.GetReturnValue().Set(-1);
10721107
}
10731108

1074-
int64_t offset_i64 = args[2]->IntegerValue();
1075-
size_t offset = 0;
1076-
1077-
if (offset_i64 < 0) {
1078-
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
1079-
offset = 0;
1080-
else
1081-
offset = static_cast<size_t>(haystack_length + offset_i64);
1082-
} else {
1083-
offset = static_cast<size_t>(offset_i64);
1109+
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
1110+
if (opt_offset <= -1) {
1111+
return args.GetReturnValue().Set(-1);
10841112
}
1085-
1086-
if (haystack_length < offset || needle_length + offset > haystack_length) {
1113+
size_t offset = static_cast<size_t>(opt_offset);
1114+
CHECK_LT(offset, haystack_length);
1115+
if (is_forward && needle_length + offset > haystack_length) {
10871116
return args.GetReturnValue().Set(-1);
10881117
}
10891118

@@ -1098,15 +1127,17 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10981127
haystack_length / 2,
10991128
reinterpret_cast<const uint16_t*>(needle),
11001129
needle_length / 2,
1101-
offset / 2);
1130+
offset / 2,
1131+
is_forward);
11021132
result *= 2;
11031133
} else {
11041134
result = SearchString(
11051135
reinterpret_cast<const uint8_t*>(haystack),
11061136
haystack_length,
11071137
reinterpret_cast<const uint8_t*>(needle),
11081138
needle_length,
1109-
offset);
1139+
offset,
1140+
is_forward);
11101141
}
11111142

11121143
args.GetReturnValue().Set(
@@ -1116,28 +1147,29 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
11161147
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
11171148
ASSERT(args[1]->IsNumber());
11181149
ASSERT(args[2]->IsNumber());
1150+
ASSERT(args[3]->IsBoolean());
11191151

11201152
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
11211153
SPREAD_ARG(args[0], ts_obj);
11221154

11231155
uint32_t needle = args[1]->Uint32Value();
11241156
int64_t offset_i64 = args[2]->IntegerValue();
1125-
size_t offset;
1126-
1127-
if (offset_i64 < 0) {
1128-
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
1129-
offset = 0;
1130-
else
1131-
offset = static_cast<size_t>(ts_obj_length + offset_i64);
1132-
} else {
1133-
offset = static_cast<size_t>(offset_i64);
1134-
}
1157+
bool is_forward = args[3]->IsTrue();
11351158

1136-
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
1159+
int64_t opt_offset = IndexOfOffset(ts_obj_length, offset_i64, is_forward);
1160+
if (opt_offset <= -1) {
11371161
return args.GetReturnValue().Set(-1);
1162+
}
1163+
size_t offset = static_cast<size_t>(opt_offset);
1164+
CHECK_LT(offset, ts_obj_length);
11381165

1139-
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
1140-
char* ptr_char = static_cast<char*>(ptr);
1166+
const void* ptr;
1167+
if (is_forward) {
1168+
ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
1169+
} else {
1170+
ptr = node::stringsearch::MemrchrFill(ts_obj_data, needle, offset + 1);
1171+
}
1172+
const char* ptr_char = static_cast<const char*>(ptr);
11411173
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
11421174
: -1);
11431175
}

0 commit comments

Comments
 (0)