From e384e63b2459002e471726e364da46c5337da062 Mon Sep 17 00:00:00 2001 From: Neeraj Jaiswal Date: Fri, 24 Feb 2023 21:25:49 +0530 Subject: [PATCH] re: port regex make anchored to rust and helper ffi functions for regex --- fish-rust/src/ffi.rs | 33 +++++++++++++++++++++++++++++-- fish-rust/src/re.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++ src/fish_tests.cpp | 17 ---------------- src/re.cpp | 26 +++++++++++++++++-------- src/re.h | 17 ++++++++++++---- 5 files changed, 108 insertions(+), 31 deletions(-) create mode 100644 fish-rust/src/re.rs diff --git a/fish-rust/src/ffi.rs b/fish-rust/src/ffi.rs index 860ceebdf..fd200eead 100644 --- a/fish-rust/src/ffi.rs +++ b/fish-rust/src/ffi.rs @@ -1,7 +1,13 @@ use crate::wchar; +use crate::wchar_ffi::WCharToFFI; +#[rustfmt::skip] +use ::std::fmt::{self, Debug, Formatter}; +#[rustfmt::skip] +use ::std::pin::Pin; +#[rustfmt::skip] +use ::std::slice; +use crate::wchar::wstr; use autocxx::prelude::*; -use core::pin::Pin; -use core::slice; use cxx::SharedPtr; // autocxx has been hacked up to know about this. @@ -10,14 +16,17 @@ pub type wchar_t = u32; include_cpp! { #include "builtin.h" #include "common.h" + #include "env.h" #include "event.h" #include "fallback.h" #include "fds.h" #include "flog.h" #include "io.h" + #include "parse_constants.h" #include "parser.h" #include "parse_util.h" #include "proc.h" + #include "re.h" #include "tokenizer.h" #include "wildcard.h" #include "wutil.h" @@ -74,6 +83,12 @@ include_cpp! 
{ generate!("signal_get_desc") generate!("fd_event_signaller_t") + + generate_pod!("re::flags_t") + generate_pod!("re::re_error_t") + generate!("re::regex_t") + generate!("re::regex_result_ffi") + generate!("re::try_compile_ffi") } impl parser_t { @@ -89,6 +104,10 @@ impl parser_t { } } +pub fn try_compile(anchored: &wstr, flags: &re::flags_t) -> Pin> { + re::try_compile_ffi(&anchored.to_ffi(), flags).within_box() +} + impl job_t { #[allow(clippy::mut_from_ref)] pub fn get_procs(&self) -> &mut [UniquePtr] { @@ -115,6 +134,12 @@ impl From for wchar::WString { } } +impl Debug for re::regex_t { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_str("regex_t") + } +} + /// A bogus trait for turning &mut Foo into Pin<&mut Foo>. /// autocxx enforces that non-const methods must be called through Pin, /// but this means we can't pass around mutable references to types like parser_t. @@ -133,11 +158,15 @@ pub trait Repin { } // Implement Repin for our types. +impl Repin for env_stack_t {} impl Repin for io_streams_t {} impl Repin for job_t {} impl Repin for output_stream_t {} impl Repin for parser_t {} impl Repin for process_t {} +impl Repin for re::regex_result_ffi {} + +unsafe impl Send for re::regex_t {} pub use autocxx::c_int; pub use ffi::*; diff --git a/fish-rust/src/re.rs b/fish-rust/src/re.rs new file mode 100644 index 000000000..72b0ad6b4 --- /dev/null +++ b/fish-rust/src/re.rs @@ -0,0 +1,46 @@ +use crate::wchar::{wstr, WString, L}; + +/// Adjust a pattern so that it is anchored at both beginning and end. +/// This is a workaround for the fact that PCRE2_ENDANCHORED is unavailable on pre-2017 PCRE2 +/// (e.g. 10.21, on Xenial). +pub fn regex_make_anchored(pattern: &wstr) -> WString { + let mut anchored = pattern.to_owned(); + // PATTERN -> ^(?:PATTERN)$. 
+ let prefix = L!("^(?:"); + let suffix = L!(")$"); + anchored.reserve(pattern.len() + prefix.len() + suffix.len()); + anchored.insert_utfstr(0, prefix); + anchored.push_utfstr(suffix); + anchored +} + +use crate::ffi_tests::add_test; +add_test!("test_regex_make_anchored", || { + use crate::ffi; + use crate::wchar::L; + use crate::wchar_ffi::WCharToFFI; + + let flags = ffi::re::flags_t { icase: false }; + let mut result = ffi::try_compile(&regex_make_anchored(L!("ab(.+?)")), &flags); + assert!(!result.has_error()); + + let re = result.as_mut().get_regex(); + + assert!(!re.is_null()); + assert!(!re.matches_ffi(&L!("").to_ffi())); + assert!(!re.matches_ffi(&L!("ab").to_ffi())); + assert!(re.matches_ffi(&L!("abcd").to_ffi())); + assert!(!re.matches_ffi(&L!("xabcd").to_ffi())); + assert!(re.matches_ffi(&L!("abcdefghij").to_ffi())); + + let mut result = ffi::try_compile(&regex_make_anchored(L!("(a+)|(b+)")), &flags); + assert!(!result.has_error()); + + let re = result.as_mut().get_regex(); + assert!(!re.is_null()); + assert!(!re.matches_ffi(&L!("").to_ffi())); + assert!(!re.matches_ffi(&L!("aabb").to_ffi())); + assert!(re.matches_ffi(&L!("aaaa").to_ffi())); + assert!(re.matches_ffi(&L!("bbbb").to_ffi())); + assert!(!re.matches_ffi(&L!("aaaax").to_ffi())); +}); diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 024a743c3..deb7a6275 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -6828,23 +6828,6 @@ static void test_re_basic() { } do_test(join_strings(matches, L',') == L"AA,CC,11"); do_test(join_strings(captures, L',') == L"A,C,1"); - - // Test make_anchored - re = regex_t::try_compile(make_anchored(L"ab(.+?)")); - do_test(re.has_value()); - do_test(!re->match(L"")); - do_test(!re->match(L"ab")); - do_test((re->match(L"abcd") == match_range_t{0, 4})); - do_test(!re->match(L"xabcd")); - do_test((re->match(L"abcdefghij") == match_range_t{0, 10})); - - re = regex_t::try_compile(make_anchored(L"(a+)|(b+)")); - do_test(re.has_value()); - do_test(!re->match(L"")); 
- do_test(!re->match(L"aabb")); - do_test((re->match(L"aaaa") == match_range_t{0, 4})); - do_test((re->match(L"bbbb") == match_range_t{0, 4})); - do_test(!re->match(L"aaaax")); } static void test_re_reset() { diff --git a/src/re.cpp b/src/re.cpp index 54ee295bc..b14bf3d68 100644 --- a/src/re.cpp +++ b/src/re.cpp @@ -135,6 +135,10 @@ maybe_t regex_t::match(const wcstring &subject) const { return this->match(md, subject); } +bool regex_t::matches_ffi(const wcstring &subject) const { + return this->match(subject).has_value(); +} + maybe_t regex_t::group(const match_data_t &md, size_t group_idx) const { if (group_idx >= md.max_capture || group_idx >= pcre2_get_ovector_count(get_md(md.data))) { return none(); @@ -295,12 +299,18 @@ regex_t::regex_t(adapters::bytecode_ptr_t &&code) : code_(std::move(code)) { wcstring re_error_t::message() const { return message_for_code(this->code); } -wcstring re::make_anchored(wcstring pattern) { - // PATTERN -> ^(:?PATTERN)$. - const wchar_t *prefix = L"^(?:"; - const wchar_t *suffix = L")$"; - pattern.reserve(pattern.size() + wcslen(prefix) + wcslen(suffix)); - pattern.insert(0, prefix); - pattern.append(suffix); - return pattern; +re::regex_result_ffi re::try_compile_ffi(const wcstring &pattern, const flags_t &flags) { + re_error_t error{}; + auto regex = regex_t::try_compile(pattern, flags, &error); + + if (regex) { + return regex_result_ffi{std::make_unique(regex.acquire()), error}; + } + + return re::regex_result_ffi{nullptr, error}; } + +bool re::regex_result_ffi::has_error() const { return error.code != 0; } +re::re_error_t re::regex_result_ffi::get_error() const { return error; }; + +std::unique_ptr re::regex_result_ffi::get_regex() { return std::move(regex); } diff --git a/src/re.h b/src/re.h index 134b01c5e..c1cd0f34d 100644 --- a/src/re.h +++ b/src/re.h @@ -114,6 +114,9 @@ class regex_t : noncopyable_t { /// A convenience function which calls prepare() for you. 
maybe_t match(const wcstring &subject) const; + /// FFI convenience: calls prepare() for you; \return whether the subject matches. + bool matches_ffi(const wcstring &subject) const; + /// \return the matched range for an indexed or named capture group. 0 means the entire match. maybe_t group(const match_data_t &md, size_t group_idx) const; maybe_t group(const match_data_t &md, const wcstring &name) const; @@ -148,10 +151,16 @@ class regex_t : noncopyable_t { adapters::bytecode_ptr_t code_; }; -/// Adjust a pattern so that it is anchored at both beginning and end. -/// This is a workaround for the fact that PCRE2_ENDANCHORED is unavailable on pre-2017 PCRE2 -/// (e.g. 10.21, on Xenial). -wcstring make_anchored(wcstring pattern); +struct regex_result_ffi { + std::unique_ptr regex; + re::re_error_t error; + + bool has_error() const; + std::unique_ptr get_regex(); + re::re_error_t get_error() const; +}; + +regex_result_ffi try_compile_ffi(const wcstring &pattern, const flags_t &flags); } // namespace re #endif