string repeat: Don't allocate repeated string all at once (#9124)

* string repeat: Don't allocate repeated string all at once

This used to allocate one string and fill it with the necessary
repetitions, which could be a very very large string.

Now, it instead uses one buffer and fills it to a chunk size,
and then writes that.

This fixes:

1. We no longer crash with too large max/count values. Before they
caused a bad_alloc because we tried to fill all RAM.
2. We no longer fill all RAM if given a big-but-not-too-big value. You
could've caused fish to eat *most* of your RAM here.
3. It can start writing almost immediately, instead of waiting
potentially minutes to start.

Performance is about the same to slightly faster overall.
This commit is contained in:
Fabian Boehm 2022-08-09 19:58:56 +02:00 committed by GitHub
parent 6128b58be6
commit eac808a819
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 92 additions and 37 deletions

View File

@ -1493,29 +1493,6 @@ static int string_collect(parser_t &parser, io_streams_t &streams, int argc, con
return appended > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
// Helper function to abstract the repeat logic from string_repeat
// returns the to_repeat string, repeated count times.
static wcstring wcsrepeat(const wcstring &to_repeat, size_t count) {
wcstring repeated;
repeated.reserve(to_repeat.length() * count);
for (size_t j = 0; j < count; j++) {
repeated += to_repeat;
}
return repeated;
}
// Helper function to abstract the repeat until logic from string_repeat
// returns the to_repeat string, repeated until max char has been reached.
static wcstring wcsrepeat_until(const wcstring &to_repeat, size_t max) {
if (to_repeat.length() == 0) return {};
size_t count = max / to_repeat.length();
size_t mod = max % to_repeat.length();
return wcsrepeat(to_repeat, count) + to_repeat.substr(0, mod);
}
static int string_repeat(parser_t &parser, io_streams_t &streams, int argc, const wchar_t **argv) {
options_t opts;
opts.count_valid = true;
@ -1525,32 +1502,84 @@ static int string_repeat(parser_t &parser, io_streams_t &streams, int argc, cons
int optind;
int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams);
if (retval != STATUS_CMD_OK) return retval;
if (opts.max == 0 && opts.count == 0) {
// XXX: This used to be allowed, but returned 1.
// Keep it that way for now instead of adding an error.
// streams.err.append(L"Count or max must be greater than zero");
return STATUS_CMD_ERROR;
}
bool all_empty = true;
bool first = true;
arg_iterator_t aiter(argv, optind, streams);
while (const wcstring *word = aiter.nextstr()) {
if (!first && !opts.quiet) {
streams.out.append(L'\n');
// If the string is empty, there is nothing to repeat.
if (word->empty()) {
continue;
}
first = false;
const bool limit_repeat =
(opts.max > 0 && word->length() * opts.count > static_cast<size_t>(opts.max)) ||
!opts.count;
const wcstring repeated =
limit_repeat ? wcsrepeat_until(*word, opts.max) : wcsrepeat(*word, opts.count);
if (!repeated.empty()) {
all_empty = false;
if (opts.quiet) {
// Early out if we can - see #7495.
return STATUS_CMD_OK;
}
if (!first && !opts.quiet) {
streams.out.append(L'\n');
}
first = false;
auto &w = *word;
// The maximum size of the string is either the "max" characters,
// or it's the "count" repetitions, whichever ends up lower.
size_t max = opts.max;
if (max == 0 || (opts.count > 0 && w.length() * opts.count < max)) {
max = w.length() * opts.count;
}
// Append if not quiet.
if (!opts.quiet) {
streams.out.append(repeated);
// Reserve a string to avoid writing constantly.
// The 1500 here is a total gluteal extraction, but 500 seems to perform slightly worse.
const size_t chunk_size = 1500;
// The + word length is so we don't have to hit the chunk size exactly,
// which would require us to restart in the middle of the string.
// E.g. imagine repeating "12345678". The first chunk is hit after a last "1234",
// so we would then have to restart by appending "5678", which requires a substring.
// So let's not bother.
//
// Unless of course we don't even print the entire word, in which case we just need max.
wcstring chunk;
chunk.reserve(std::min(chunk_size + w.length(), max));
for (size_t i = max; i > 0; i -= w.length()) {
// Build up the chunk.
if (i >= w.length()) {
chunk.append(w);
} else {
chunk.append(w.substr(0, i));
break;
}
if (chunk.length() >= chunk_size) {
// We hit the chunk size, write it repeatedly until we can't anymore.
streams.out.append(chunk);
while (i >= chunk.length()) {
streams.out.append(chunk);
// We can easily be asked to write *a lot* of data,
// so we need to check every so often if the pipe has been closed.
// If we didn't, running `string repeat -n LARGENUMBER foo | pv`
// and pressing ctrl-c seems to hang.
if (streams.out.flush_and_check_error() != STATUS_CMD_OK) {
return STATUS_CMD_ERROR;
}
i -= chunk.length();
}
chunk.clear();
}
}
// Flush the remainder.
if (!chunk.empty()) {
streams.out.append(chunk);
}
}

View File

@ -504,6 +504,32 @@ string repeat -n3 ""
or echo string repeat empty string failed
# CHECK: string repeat empty string failed
# See that we hit the expected length
# First with "max", i.e. maximum number of characters
string repeat -m 5000 aab | string length
# CHECK: 5000
string repeat -m 5000 ab | string length
# CHECK: 5000
string repeat -m 5000 a | string length
# CHECK: 5000
string repeat -m 17 aab | string length
# CHECK: 17
string repeat -m 17 ab | string length
# CHECK: 17
string repeat -m 17 a | string length
# CHECK: 17
# Then with "count", i.e. number of repetitions.
# (these are count * length long)
string repeat -n 17 aab | string length
# CHECK: 51
string repeat -n 17 ab | string length
# CHECK: 34
string repeat -n 17 a | string length
# CHECK: 17
# And a more tricksy case with a long string that we truncate.
string repeat -m 5 (string repeat -n 500000 aaaaaaaaaaaaaaaaaa) | string length
# CHECK: 5
# Test equivalent matches with/without the --entire, --regex, and --invert flags.
string match -e x abc dxf xyz jkx x z
or echo exit 1