mirror of
https://github.com/fish-shell/fish-shell.git
synced 2025-01-19 10:12:49 +08:00
Factor out PCRE2 into new re component
This migrates our PCRE2 dependency from builtin/string.cpp to new files re.h/re.cpp, allowing regexes to be used in other places in fish. No user-visible behavior change expected here.
This commit is contained in:
parent
a1dd93df41
commit
7ae1727359
|
@ -110,7 +110,7 @@ set(FISH_SRCS
|
|||
src/null_terminated_array.cpp src/operation_context.cpp src/output.cpp
|
||||
src/pager.cpp src/parse_execution.cpp src/parse_tree.cpp src/parse_util.cpp
|
||||
src/parser.cpp src/parser_keywords.cpp src/path.cpp src/postfork.cpp
|
||||
src/proc.cpp src/reader.cpp src/redirection.cpp src/screen.cpp
|
||||
src/proc.cpp src/re.cpp src/reader.cpp src/redirection.cpp src/screen.cpp
|
||||
src/signal.cpp src/termsize.cpp src/timer.cpp src/tinyexpr.cpp
|
||||
src/tokenizer.cpp src/topic_monitor.cpp src/trace.cpp src/utf8.cpp src/util.cpp
|
||||
src/wait_handle.cpp src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp
|
||||
|
|
|
@ -71,6 +71,7 @@
|
|||
#include "parser.h"
|
||||
#include "path.h"
|
||||
#include "proc.h"
|
||||
#include "re.h"
|
||||
#include "reader.h"
|
||||
#include "redirection.h"
|
||||
#include "screen.h"
|
||||
|
@ -6682,6 +6683,170 @@ static void test_killring() {
|
|||
do_test((kill_entries() == wcstring_list_t{L"a", L"c", L"b", L"d"}));
|
||||
}
|
||||
|
||||
namespace {
|
||||
using namespace re;
|
||||
|
||||
// Basic tests for re, which wraps PCRE2.
|
||||
static void test_re_errs() {
|
||||
say(L"Testing re");
|
||||
flags_t flags{};
|
||||
re_error_t error{};
|
||||
maybe_t<regex_t> re;
|
||||
do_test(!regex_t::try_compile(L"abc[", flags, &error));
|
||||
do_test(error.code != 0);
|
||||
do_test(!error.message().empty());
|
||||
|
||||
error = re_error_t{};
|
||||
do_test(!regex_t::try_compile(L"abc(", flags, &error).has_value());
|
||||
do_test(error.code != 0);
|
||||
do_test(!error.message().empty());
|
||||
}
|
||||
|
||||
static void test_re_basic() {
|
||||
// Match a character twice.
|
||||
using namespace re;
|
||||
wcstring subject = L"AAbCCd11e";
|
||||
auto substr_from_range = [&](maybe_t<match_range_t> r) {
|
||||
do_test(r.has_value());
|
||||
do_test(r->begin <= r->end);
|
||||
do_test(r->end <= subject.size());
|
||||
return subject.substr(r->begin, r->end - r->begin);
|
||||
};
|
||||
auto re = regex_t::try_compile(L"(.)\\1");
|
||||
do_test(re.has_value());
|
||||
auto md = re->prepare();
|
||||
wcstring_list_t matches;
|
||||
wcstring_list_t captures;
|
||||
while (auto r = re->match(md, subject)) {
|
||||
matches.push_back(substr_from_range(r));
|
||||
captures.push_back(substr_from_range(re->group(md, 1)));
|
||||
do_test(!re->group(md, 2));
|
||||
}
|
||||
do_test(join_strings(matches, L',') == L"AA,CC,11");
|
||||
do_test(join_strings(captures, L',') == L"A,C,1");
|
||||
}
|
||||
|
||||
static void test_re_reset() {
|
||||
using namespace re;
|
||||
auto re = regex_t::try_compile(L"([0-9])");
|
||||
wcstring s = L"012345";
|
||||
auto md = re->prepare();
|
||||
for (size_t idx = 0; idx < s.size(); idx++) {
|
||||
md.reset();
|
||||
for (size_t j = 0; j <= idx; j++) {
|
||||
auto m = re->match(md, s);
|
||||
match_range_t expected{j, j + 1};
|
||||
do_test(m == expected);
|
||||
do_test(re->group(md, 1) == expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_re_named() {
|
||||
// Named capture groups.
|
||||
using namespace re;
|
||||
auto re = regex_t::try_compile(L"A(?<FOO>x+)?");
|
||||
do_test(re->capture_group_count() == 1);
|
||||
|
||||
wcstring subject = L"AxxAAx";
|
||||
auto md = re->prepare();
|
||||
|
||||
auto r = re->match(md, subject);
|
||||
do_test((r == match_range_t{0, 3}));
|
||||
do_test(re->substring_for_group(md, L"QQQ", subject) == none());
|
||||
do_test(re->substring_for_group(md, L"FOO", subject) == L"xx");
|
||||
|
||||
r = re->match(md, subject);
|
||||
do_test((r == match_range_t{3, 4}));
|
||||
do_test(re->substring_for_group(md, L"QQQ", subject) == none());
|
||||
do_test(re->substring_for_group(md, L"FOO", subject) == none());
|
||||
|
||||
r = re->match(md, subject);
|
||||
do_test((r == match_range_t{4, 6}));
|
||||
do_test(re->substring_for_group(md, L"QQQ", subject) == none());
|
||||
do_test(re->substring_for_group(md, L"FOO", subject) == wcstring(L"x"));
|
||||
}
|
||||
|
||||
static void test_re_name_extraction() {
|
||||
// Names of capture groups can be extracted.
|
||||
using namespace re;
|
||||
auto re = regex_t::try_compile(L"(?<FOO>dd)ff(?<BAR>cc)aaa(?<alpha>)ff(?<BETA>)");
|
||||
do_test(re.has_value());
|
||||
do_test(re->capture_group_count() == 4);
|
||||
// PCRE2 returns these sorted.
|
||||
do_test(join_strings(re->capture_group_names(), L',') == L"BAR,BETA,FOO,alpha");
|
||||
|
||||
// Mixed named and positional captures.
|
||||
re = regex_t::try_compile(L"(abc)(?<FOO>def)(ghi)(?<BAR>jkl)");
|
||||
do_test(re.has_value());
|
||||
do_test(re->capture_group_count() == 4);
|
||||
do_test(join_strings(re->capture_group_names(), L',') == L"BAR,FOO");
|
||||
auto md = re->prepare();
|
||||
const wcstring subject = L"abcdefghijkl";
|
||||
auto m = re->match(md, subject);
|
||||
do_test((m == match_range_t{0, 12}));
|
||||
do_test((re->group(md, 1) == match_range_t{0, 3}));
|
||||
do_test((re->group(md, 2) == match_range_t{3, 6}));
|
||||
do_test((re->group(md, 3) == match_range_t{6, 9}));
|
||||
do_test((re->group(md, 4) == match_range_t{9, 12}));
|
||||
do_test(re->substring_for_group(md, L"FOO", subject) == wcstring(L"def"));
|
||||
do_test(re->substring_for_group(md, L"BAR", subject) == wcstring(L"jkl"));
|
||||
}
|
||||
|
||||
static void test_re_substitute() {
|
||||
// Names of capture groups can be extracted.
|
||||
using namespace re;
|
||||
auto re = regex_t::try_compile(L"[a-z]+(\\d+)");
|
||||
do_test(re.has_value());
|
||||
do_test(re->capture_group_count() == 1);
|
||||
maybe_t<wcstring> res{};
|
||||
int repl_count{};
|
||||
sub_flags_t sflags{};
|
||||
const wcstring subj = L"AAabc123ZZ AAabc123ZZ";
|
||||
const wcstring repl = L"$1qqq";
|
||||
res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count);
|
||||
do_test(res && *res == L"AA123qqqZZ AAabc123ZZ");
|
||||
do_test(repl_count == 1);
|
||||
|
||||
res = re->substitute(subj, repl, sflags, 5, nullptr, &repl_count);
|
||||
do_test(res && *res == L"AAabc123ZZ AA123qqqZZ");
|
||||
do_test(repl_count == 1);
|
||||
|
||||
sflags.global = true;
|
||||
res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count);
|
||||
do_test(res && *res == L"AA123qqqZZ AA123qqqZZ");
|
||||
do_test(repl_count == 2);
|
||||
|
||||
sflags.literal = true;
|
||||
res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count);
|
||||
do_test(res && *res == L"AA$1qqqZZ AA$1qqqZZ");
|
||||
do_test(repl_count == 2);
|
||||
|
||||
sflags.literal = false;
|
||||
sflags.extended = true;
|
||||
res = re->substitute(subj, L"\\x21", sflags, 0, nullptr, &repl_count); // \x21 = !
|
||||
do_test(res && *res == L"AA!ZZ AA!ZZ");
|
||||
do_test(repl_count == 2);
|
||||
|
||||
// Test with a bad escape; \b is unsupported.
|
||||
re_error_t error{};
|
||||
res = re->substitute(subj, L"AAA\\bZZZ", sflags, 0, &error);
|
||||
do_test(!res.has_value());
|
||||
do_test(error.code == -57 /* PCRE2_ERROR_BADREPESCAPE */);
|
||||
do_test(error.message() == L"bad escape sequence in replacement string");
|
||||
do_test(error.offset == 5 /* the b */);
|
||||
|
||||
// Test a very long replacement as we used a fixed-size buffer.
|
||||
sflags = sub_flags_t{};
|
||||
sflags.global = true;
|
||||
re = regex_t::try_compile(L"A");
|
||||
res =
|
||||
re->substitute(wcstring(4096, L'A'), wcstring(4096, L'X'), sflags, 0, nullptr, &repl_count);
|
||||
do_test(res && *res == wcstring(4096 * 4096, L'X'));
|
||||
do_test(repl_count == 4096);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
/// Holder type for the termsize test. Only the declaration of test() appears here.
struct termsize_tester_t {
    /// Entry point of the test.
    static void test();
};
|
||||
|
@ -6860,6 +7025,12 @@ static const test_t s_tests[]{
|
|||
{TEST_GROUP("timer_format"), test_timer_format},
|
||||
{TEST_GROUP("termsize"), termsize_tester_t::test},
|
||||
{TEST_GROUP("killring"), test_killring},
|
||||
{TEST_GROUP("re"), test_re_errs},
|
||||
{TEST_GROUP("re"), test_re_basic},
|
||||
{TEST_GROUP("re"), test_re_reset},
|
||||
{TEST_GROUP("re"), test_re_named},
|
||||
{TEST_GROUP("re"), test_re_name_extraction},
|
||||
{TEST_GROUP("re"), test_re_substitute},
|
||||
};
|
||||
|
||||
void list_tests() {
|
||||
|
|
288
src/re.cpp
Normal file
288
src/re.cpp
Normal file
|
@ -0,0 +1,288 @@
|
|||
#include "config.h" // IWYU pragma: keep
|
||||
|
||||
#include "re.h"
|
||||
|
||||
#include "flog.h"
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH WCHAR_T_BITS
|
||||
#ifdef _WIN32
|
||||
#define PCRE2_STATIC
|
||||
#endif
|
||||
|
||||
#include "pcre2.h"
|
||||
|
||||
using namespace re;
|
||||
using namespace re::adapters;
|
||||
|
||||
/// Deleter for bytecode_ptr_t: frees the pcre2_code stored behind the type-erased pointer.
/// Null pointers are ignored.
void bytecode_deleter_t::operator()(const void *ptr) {
    if (!ptr) return;
    // The pointer is stored as const void *, while pcre2_code_free takes a mutable pcre2_code *.
    auto *code = static_cast<pcre2_code *>(const_cast<void *>(ptr));
    pcre2_code_free(code);
}
|
||||
|
||||
/// Deleter for match_data_ptr_t: frees the pcre2_match_data stored behind the type-erased
/// pointer. Null pointers are ignored.
void match_data_deleter_t::operator()(void *ptr) {
    if (!ptr) return;
    auto *match_data = static_cast<pcre2_match_data *>(ptr);
    pcre2_match_data_free(match_data);
}
|
||||
|
||||
// Get underlying pcre2_code from a bytecode_ptr_t.
|
||||
const pcre2_code *get_code(const bytecode_ptr_t &ptr) {
|
||||
assert(ptr && "Null pointer");
|
||||
return static_cast<const pcre2_code *>(ptr.get());
|
||||
}
|
||||
|
||||
// Get underlying match_data_t.
|
||||
pcre2_match_data *get_md(const match_data_ptr_t &ptr) {
|
||||
assert(ptr && "Null pointer");
|
||||
return static_cast<pcre2_match_data *>(ptr.get());
|
||||
}
|
||||
|
||||
// Convert a wcstring to a PCRE2_SPTR.
|
||||
PCRE2_SPTR to_sptr(const wcstring &str) { return reinterpret_cast<PCRE2_SPTR>(str.c_str()); }
|
||||
|
||||
/// \return a message for an error code.
|
||||
static wcstring message_for_code(error_code_t code) {
|
||||
wchar_t buf[128] = {};
|
||||
pcre2_get_error_message(code, reinterpret_cast<PCRE2_UCHAR *>(buf),
|
||||
sizeof(buf) / sizeof(wchar_t));
|
||||
return buf;
|
||||
}
|
||||
|
||||
/// Compile \p pattern into a regex.
/// \return the regex on success, or none on failure; on failure, if \p error is not null, it
/// receives the PCRE2 error code and the offset reported by pcre2_compile. \p error is left
/// untouched on success.
maybe_t<regex_t> regex_t::try_compile(const wcstring &pattern, const flags_t &flags,
                                      re_error_t *error) {
    // Disable some sequences that can lead to security problems.
    uint32_t compile_opts = PCRE2_NEVER_UTF;
#if PCRE2_CODE_UNIT_WIDTH < 32
    compile_opts |= PCRE2_NEVER_BACKSLASH_C;
#endif
    if (flags.icase) {
        compile_opts |= PCRE2_CASELESS;
    }

    error_code_t failure_code = 0;
    PCRE2_SIZE failure_offset = 0;
    pcre2_code *compiled = pcre2_compile(to_sptr(pattern), pattern.size(), compile_opts,
                                         &failure_code, &failure_offset, nullptr);
    if (compiled) {
        return regex_t{bytecode_ptr_t(compiled)};
    }

    // Compilation failed: report why, if the caller asked.
    if (error) {
        error->code = failure_code;
        error->offset = failure_offset;
    }
    return none();
}
|
||||
|
||||
/// Allocate a match data sized for this regex's pattern and wrap it in a match_data_t.
/// Allocation failure is treated as fatal.
match_data_t regex_t::prepare() const {
    pcre2_match_data *raw = pcre2_match_data_create_from_pattern(get_code(code_), nullptr);
    // A null result is taken to mean memory exhaustion.
    if (unlikely(raw == nullptr)) {
        DIE("Out of memory");
    }
    match_data_ptr_t owned(static_cast<void *>(raw));
    return match_data_t{std::move(owned)};
}
|
||||
|
||||
void match_data_t::reset() {
|
||||
start_offset = 0;
|
||||
max_capture = 0;
|
||||
last_empty = false;
|
||||
}
|
||||
|
||||
/// Find the next match of this regex in \p subject, continuing from the position stored in
/// \p md, and update \p md so that a subsequent call resumes after this match.
/// \return the range of the whole match, or none if there are no further matches or if PCRE2
/// reported an unexpected error (which is logged). Whenever none is returned, \p md records zero
/// matched captures so that group() cannot report data from an earlier match.
maybe_t<match_range_t> regex_t::match(match_data_t &md, const wcstring &subject) const {
    pcre2_match_data *const match_data = get_md(md.data);
    assert(match_data && "Invalid match data");

    // Handle exhausted matches.
    if (md.start_offset > subject.size() || (md.last_empty && md.start_offset == subject.size())) {
        md.max_capture = 0;
        return none();
    }
    PCRE2_SIZE start_offset = md.start_offset;

    // See pcre2demo.c for an explanation of this logic.
    // After an empty match, first look for a non-empty match anchored at the same position.
    uint32_t options = md.last_empty ? PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED : 0;
    error_code_t code = pcre2_match(get_code(code_), to_sptr(subject), subject.size(), start_offset,
                                    options, match_data, nullptr);
    if (code == PCRE2_ERROR_NOMATCH && !md.last_empty) {
        // Failed to match. Mark the subject as fully consumed.
        md.start_offset = subject.size();
        md.max_capture = 0;
        return none();
    } else if (code == PCRE2_ERROR_NOMATCH && md.last_empty) {
        // Failed to find a non-empty-string match at a point where there was a previous
        // empty-string match. Advance by one character and try again.
        md.start_offset += 1;
        md.last_empty = false;
        return this->match(md, subject);
    } else if (code < 0) {
        FLOG(error, "pcre2_match unexpected error:", message_for_code(code));
        // Clear the capture count, as on the other no-match paths, so stale captures are not
        // visible through group().
        md.max_capture = 0;
        return none();
    }

    // Match succeeded.
    // Start at end of previous match, marking if it was empty.
    const auto *ovector = pcre2_get_ovector_pointer(match_data);
    md.start_offset = ovector[1];
    md.max_capture = static_cast<size_t>(code);
    md.last_empty = ovector[0] == ovector[1];
    return match_range_t{ovector[0], ovector[1]};
}
|
||||
|
||||
/// \return the range captured by group \p group_idx in the most recent match recorded in \p md,
/// or none if the index is out of range or the group is unset.
maybe_t<match_range_t> regex_t::group(const match_data_t &md, size_t group_idx) const {
    // Only groups below the recorded capture count hold data from the last match.
    if (group_idx >= md.max_capture) {
        return none();
    }
    pcre2_match_data *const data = get_md(md.data);
    if (group_idx >= pcre2_get_ovector_count(data)) {
        return none();
    }

    // The ovector holds (begin, end) pairs, one pair per group.
    const PCRE2_SIZE *pairs = pcre2_get_ovector_pointer(data);
    const PCRE2_SIZE first = pairs[2 * group_idx];
    const PCRE2_SIZE last = pairs[2 * group_idx + 1];
    if (first == PCRE2_UNSET || last == PCRE2_UNSET) {
        return none();
    }

    // From PCRE2 docs: "Note that when a pattern such as (?=ab\K) matches, the reported start of
    // the match can be greater than the end of the match."
    // Never report an end before the start.
    return match_range_t{first, std::max(first, last)};
}
|
||||
|
||||
/// \return the range captured by the group called \p name in the most recent match recorded in
/// \p match_data, or none if the name does not resolve to a group number or the group is unset.
maybe_t<match_range_t> regex_t::group(const match_data_t &match_data, const wcstring &name) const {
    // Beware, pcre2_substring_copy_byname and pcre2_substring_copy_bynumber both have a bug
    // on at least one Ubuntu (running PCRE2) where it outputs garbage for the first character.
    // So resolve the name to a number and read the range out of the ovector instead.
    const int group_number = pcre2_substring_number_from_name(get_code(code_), to_sptr(name));
    if (group_number > 0) {
        return this->group(match_data, static_cast<size_t>(group_number));
    }
    return none();
}
|
||||
|
||||
/// \return the part of \p subject covered by \p range, or none if \p range is none.
/// The range must be well-formed and lie within \p subject.
static maybe_t<wcstring> range_to_substr(const wcstring &subject, maybe_t<match_range_t> range) {
    if (range) {
        assert(range->begin <= range->end && range->end <= subject.size() && "Invalid range");
        const size_t length = range->end - range->begin;
        return subject.substr(range->begin, length);
    }
    return none();
}
|
||||
|
||||
/// \return the text of \p subject captured by group \p group_idx, or none if that group has no
/// range in \p md.
maybe_t<wcstring> regex_t::substring_for_group(const match_data_t &md, size_t group_idx,
                                               const wcstring &subject) const {
    maybe_t<match_range_t> captured = this->group(md, group_idx);
    return range_to_substr(subject, captured);
}
|
||||
|
||||
/// \return the text of \p subject captured by the group called \p name, or none if that group
/// has no range in \p md.
maybe_t<wcstring> regex_t::substring_for_group(const match_data_t &md, const wcstring &name,
                                               const wcstring &subject) const {
    maybe_t<match_range_t> captured = this->group(md, name);
    return range_to_substr(subject, captured);
}
|
||||
|
||||
size_t regex_t::capture_group_count() const {
|
||||
uint32_t count{};
|
||||
pcre2_pattern_info(get_code(code_), PCRE2_INFO_CAPTURECOUNT, &count);
|
||||
return count;
|
||||
}
|
||||
|
||||
/// \return the names of all named capture groups, in the order of PCRE2's name table.
///
/// The name table is an array of fixed-size entries, each \c name_entry_size code units long.
/// An entry starts with the group number (two code units in the 8-bit library, one code unit
/// otherwise), followed by the nul-terminated name. The entries are walked with plain code-unit
/// pointer arithmetic rather than an overlay struct.
wcstring_list_t regex_t::capture_group_names() const {
    // The names are read as wchar_t strings, so a code unit must be exactly a wchar_t.
    static_assert(sizeof(wchar_t) == sizeof(PCRE2_UCHAR),
                  "PCRE2 code unit width must match wchar_t");
    // Number of leading code units holding the group number.
    constexpr uint32_t name_offset = (PCRE2_CODE_UNIT_WIDTH == 8) ? 2 : 1;

    PCRE2_SPTR name_table{};
    uint32_t name_entry_size{};
    uint32_t name_count{};

    const auto *code = get_code(code_);
    pcre2_pattern_info(code, PCRE2_INFO_NAMETABLE, &name_table);
    pcre2_pattern_info(code, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size);
    pcre2_pattern_info(code, PCRE2_INFO_NAMECOUNT, &name_count);

    wcstring_list_t result;
    result.reserve(name_count);
    for (uint32_t i = 0; i < name_count; ++i) {
        // Entry i begins i * name_entry_size code units into the table.
        PCRE2_SPTR entry = name_table + static_cast<size_t>(i) * name_entry_size;
        result.emplace_back(reinterpret_cast<const wchar_t *>(entry + name_offset));
    }
    return result;
}
|
||||
|
||||
/// Replace matches of this regex in \p subject with \p replacement, starting the search at
/// \p start_idx.
/// \param flags selects global replacement, literal replacement text, and extended escapes.
/// \param out_error if not null, receives the PCRE2 error code and offset on failure.
/// \param out_repl_count if not null, receives the number of replacements (0 on failure).
/// \return the resulting string, \p subject itself if nothing was replaced, or none on error.
maybe_t<wcstring> regex_t::substitute(const wcstring &subject, const wcstring &replacement,
                                      sub_flags_t flags, size_t start_idx, re_error_t *out_error,
                                      int *out_repl_count) const {
    constexpr size_t stack_bufflen = 256;
    wchar_t buffer[stack_bufflen];

    // SUBSTITUTE_GLOBAL means more than one substitution happens.
    uint32_t options = PCRE2_SUBSTITUTE_UNSET_EMPTY        // don't error on unmatched
                       | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  // return required length on overflow
                       | (flags.global ? PCRE2_SUBSTITUTE_GLOBAL : 0)      // replace multiple
                       | (flags.literal ? PCRE2_SUBSTITUTE_LITERAL : 0)    // $1 etc. are literal
                       | (flags.extended ? PCRE2_SUBSTITUTE_EXTENDED : 0)  // backslash escapes
        ;

    // Run the substitution into 'out', whose capacity is *len on entry. PCRE2 updates *len.
    // The output is cast to PCRE2_UCHAR so that it follows the configured code unit width.
    // Also publishes the replacement count, clamped to zero for error codes.
    auto run = [&](wchar_t *out, size_t *len) -> error_code_t {
        error_code_t result = pcre2_substitute(
            get_code(code_), to_sptr(subject), subject.size(), start_idx, options,
            nullptr /* match_data */, nullptr /* context */, to_sptr(replacement),
            replacement.size(), reinterpret_cast<PCRE2_UCHAR *>(out), len);
        if (out_repl_count) {
            *out_repl_count = std::max(result, 0);
        }
        return result;
    };

    // First attempt: the stack buffer.
    size_t bufflen = stack_bufflen;
    error_code_t rc = run(buffer, &bufflen);
    if (rc == 0) {
        // No replacements.
        return subject;
    } else if (rc > 0) {
        // Some replacement which fit in our buffer.
        // Note we may have had embedded nuls, so construct with an explicit length.
        assert(bufflen <= stack_bufflen && "bufflen should not exceed buffer size");
        return wcstring(buffer, bufflen);
    } else if (rc == PCRE2_ERROR_NOMEMORY) {
        // bufflen has been updated to required buffer size.
        // Try again with a real string.
        wcstring res(bufflen, L'\0');
        rc = run(&res[0], &bufflen);
        if (rc >= 0) {
            res.resize(bufflen);
            return res;
        }
    }

    // Some error. The offset may be returned in the bufflen.
    if (out_error) {
        out_error->code = rc;
        out_error->offset = (bufflen == PCRE2_UNSET ? 0 : bufflen);
    }
    return none();
}
|
||||
|
||||
regex_t::regex_t(adapters::bytecode_ptr_t &&code) : code_(std::move(code)) {
|
||||
assert(code_ && "Null impl");
|
||||
}
|
||||
|
||||
/// \return the message PCRE2 associates with this error's code.
wcstring re_error_t::message() const {
    const error_code_t which = this->code;
    return message_for_code(which);
}
|
146
src/re.h
Normal file
146
src/re.h
Normal file
|
@ -0,0 +1,146 @@
|
|||
// Wraps PCRE2.
|
||||
#ifndef FISH_RE_H
|
||||
#define FISH_RE_H
|
||||
|
||||
#include "common.h"
|
||||
#include "maybe.h"
|
||||
|
||||
namespace re {
|
||||
|
||||
namespace adapters {

/// Deleter that lets a unique_ptr own a pcre2_code through a type-erased const pointer.
struct bytecode_deleter_t {
    void operator()(const void *ptr);
};

/// Owning handle for a pcre2_code.
using bytecode_ptr_t = std::unique_ptr<const void, bytecode_deleter_t>;

/// Deleter that lets a unique_ptr own a pcre2_match_data through a type-erased pointer.
struct match_data_deleter_t {
    void operator()(void *ptr);
};

/// Owning handle for a pcre2_match_data.
using match_data_ptr_t = std::unique_ptr<void, match_data_deleter_t>;

}  // namespace adapters
|
||||
|
||||
/// Error code type alias.
|
||||
using error_code_t = int;
|
||||
|
||||
/// Options which affect how a regex is compiled.
struct flags_t {
    /// Match without regard to case.
    bool icase = false;
};
|
||||
|
||||
/// Options which affect how a substitution is performed.
struct sub_flags_t {
    /// Perform multiple substitutions rather than only one.
    bool global = false;
    /// Treat $1 in the replacement as literal text, not a capture reference.
    bool literal = false;
    /// Apply PCRE2 extended backslash escapes to the replacement.
    bool extended = false;
};
|
||||
|
||||
/// A type wrapping up error information.
|
||||
/// Beware, GNU defines error_t; hence we use an re_ prefix again.
|
||||
struct re_error_t {
|
||||
error_code_t code{}; // error code
|
||||
size_t offset{}; // offset of the error in the pattern
|
||||
|
||||
/// \return our error message.
|
||||
wcstring message() const;
|
||||
};
|
||||
|
||||
/// The half-open range [begin, end) of a subject which matched.
struct match_range_t {
    size_t begin;
    size_t end;

    /// Two ranges are equal when both endpoints agree.
    bool operator==(match_range_t rhs) const {
        if (begin != rhs.begin) return false;
        return end == rhs.end;
    }

    bool operator!=(match_range_t rhs) const { return begin != rhs.begin || end != rhs.end; }
};
|
||||
|
||||
/// A match data is the "stateful" object, storing string indices for where to start the next match,
|
||||
/// capture results, etc. Create one via regex_t::prepare(). These are tied to the regex which
|
||||
/// created them.
|
||||
class match_data_t : noncopyable_t {
|
||||
public:
|
||||
match_data_t(match_data_t &&) = default;
|
||||
match_data_t &operator=(match_data_t &&) = default;
|
||||
~match_data_t() = default;
|
||||
|
||||
/// \return a "count" of the number of capture groups which matched.
|
||||
/// This is really one more than the highest matching group.
|
||||
/// 0 is considered a "group" for the entire match, so this will always return at least 1 for a
|
||||
/// successful match.
|
||||
size_t matched_capture_group_count() const { return max_capture; }
|
||||
|
||||
/// Reset this data, as if this were freshly issued by a call to prepare().
|
||||
void reset();
|
||||
|
||||
private:
|
||||
explicit match_data_t(adapters::match_data_ptr_t &&data) : data(std::move(data)) {}
|
||||
|
||||
// Next start position. This may exceed the needle length, which indicates exhaustion.
|
||||
size_t start_offset{0};
|
||||
|
||||
// One more than the highest numbered capturing pair that was set (e.g. 1 if no captures).
|
||||
size_t max_capture{0};
|
||||
|
||||
// If set, the last match was empty.
|
||||
bool last_empty{false};
|
||||
|
||||
// Underlying pcre2_match_data.
|
||||
adapters::match_data_ptr_t data{};
|
||||
|
||||
friend class regex_t;
|
||||
};
|
||||
|
||||
/// The compiled form of a PCRE2 regex.
|
||||
/// This is thread safe.
|
||||
class regex_t : noncopyable_t {
|
||||
public:
|
||||
/// Compile a pattern into a regex. \return the resulting regex, or none on error.
|
||||
/// If \p error is not null, populate it with the error information.
|
||||
static maybe_t<regex_t> try_compile(const wcstring &pattern, const flags_t &flags = flags_t{},
|
||||
re_error_t *out_error = nullptr);
|
||||
|
||||
/// Create a match data for this regex.
|
||||
/// The result is tied to this regex; it should not be used for others.
|
||||
match_data_t prepare() const;
|
||||
|
||||
/// Match against a string \p subject, populating \p md.
|
||||
/// \return a range on a successful match, none on no match.
|
||||
maybe_t<match_range_t> match(match_data_t &md, const wcstring &subject) const;
|
||||
|
||||
/// \return the matched range for an indexed or named capture group. 0 means the entire match.
|
||||
maybe_t<match_range_t> group(const match_data_t &md, size_t group_idx) const;
|
||||
maybe_t<match_range_t> group(const match_data_t &md, const wcstring &name) const;
|
||||
|
||||
/// \return the matched substring for a capture group.
|
||||
maybe_t<wcstring> substring_for_group(const match_data_t &md, size_t group_idx,
|
||||
const wcstring &subject) const;
|
||||
maybe_t<wcstring> substring_for_group(const match_data_t &md, const wcstring &name,
|
||||
const wcstring &subject) const;
|
||||
|
||||
/// \return the number of indexed capture groups.
|
||||
size_t capture_group_count() const;
|
||||
|
||||
/// \return the list of capture group names.
|
||||
/// Note PCRE provides these in sorted order, not specification order.
|
||||
wcstring_list_t capture_group_names() const;
|
||||
|
||||
/// Search \p subject for matches for this regex, starting at \p start_idx, and replacing them
|
||||
/// with \p replacement. If \p repl_count is not null, populate it with the number of
|
||||
/// replacements which occurred. This may fail for e.g. bad escapes in the replacement string.
|
||||
maybe_t<wcstring> substitute(const wcstring &subject, const wcstring &replacement,
|
||||
sub_flags_t flags, size_t start_idx = 0,
|
||||
re_error_t *out_error = nullptr,
|
||||
int *out_repl_count = nullptr) const;
|
||||
|
||||
regex_t(regex_t &&other) = default;
|
||||
regex_t &operator=(regex_t &&) = default;
|
||||
~regex_t() = default;
|
||||
|
||||
private:
|
||||
regex_t(adapters::bytecode_ptr_t &&);
|
||||
adapters::bytecode_ptr_t code_;
|
||||
};
|
||||
|
||||
} // namespace re
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user