//! A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
//! extended to support marks, tokenizing multiple strings and disposing of unused string segments.

use crate::ffi::{valid_var_name_char, wcharz_t};
use crate::future_feature_flags::{feature_test, FeatureFlag};
use crate::parse_constants::SOURCE_OFFSET_INVALID;
use crate::redirection::RedirectionMode;
use crate::wchar::{WExt, L};
use crate::wchar_ffi::{wchar_t, wstr, WCharFromFFI, WCharToFFI, WString};
use crate::wutil::wgettext;
use cxx::{CxxWString, SharedPtr, UniquePtr};
use libc::{c_int, STDIN_FILENO, STDOUT_FILENO};
use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, Not};
use std::os::fd::RawFd;
use widestring_suffix::widestrs;
#[cxx::bridge]
mod tokenizer_ffi {
    extern "C++" {
        include!("wutil.h");
        include!("redirection.h");
        type wcharz_t = super::wcharz_t;
        type RedirectionMode = super::RedirectionMode;
    }

    /// Token types. XXX Why this isn't ParseTokenType, I'm not really sure.
    enum TokenType {
        /// Error reading token
        error,
        /// String token
        string,
        /// Pipe token
        pipe,
        /// && token
        andand,
        /// || token
        oror,
        /// End token (semicolon or newline, not literal end)
        end,
        /// redirection token
        redirect,
        /// send job to bg token
        background,
        /// comment token
        comment,
    }

    enum TokenizerError {
        none,
        unterminated_quote,
        unterminated_subshell,
        unterminated_slice,
        unterminated_escape,
        invalid_redirect,
        invalid_pipe,
        invalid_pipe_ampersand,
        closing_unopened_subshell,
        illegal_slice,
        closing_unopened_brace,
        unterminated_brace,
        expected_pclose_found_bclose,
        expected_bclose_found_pclose,
    }

    extern "Rust" {
        fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString>;
    }

    struct Tok {
        // Offset of the token.
        offset: u32,
        // Length of the token.
        length: u32,

        // If an error, this is the offset of the error within the token. A value of 0 means it occurred
        // at 'offset'.
        error_offset_within_token: u32,
        error_length: u32,

        // If an error, this is the error code.
        error: TokenizerError,

        // The type of the token.
        type_: TokenType,
    }
    // TODO static_assert(sizeof(Tok) <= 32, "Tok expected to be 32 bytes or less");

    extern "Rust" {
        fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool;
        #[cxx_name = "get_source"]
        fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString>;
    }

    extern "Rust" {
        type Tokenizer;
        fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer>;
        #[cxx_name = "next"]
        fn next_ffi(self: &mut Tokenizer) -> UniquePtr<Tok>;
        #[cxx_name = "text_of"]
        fn text_of_ffi(self: &Tokenizer, tok: &Tok) -> UniquePtr<CxxWString>;
        #[cxx_name = "is_token_delimiter"]
        fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool;
    }

    extern "Rust" {
        #[cxx_name = "tok_command"]
        fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString>;
    }

    /// Struct wrapping up a parsed pipe or redirection.
    struct PipeOrRedir {
        // The redirected fd, or -1 on overflow.
        // In the common case of a pipe, this is 1 (STDOUT_FILENO).
        // For example, in the case of "3>&1" this will be 3.
        fd: i32,

        // Whether we are a pipe (true) or redirection (false).
        is_pipe: bool,

        // The redirection mode if the type is redirect.
        // Ignored for pipes.
        mode: RedirectionMode,

        // Whether, in addition to this redirection, stderr should also be dup'd to stdout
        // For example &| or &>
        stderr_merge: bool,

        // Number of characters consumed when parsing the string.
        consumed: usize,
    }

    extern "Rust" {
        fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir>;
        fn is_valid(self: &PipeOrRedir) -> bool;
        fn oflags(self: &PipeOrRedir) -> i32;
        fn token_type(self: &PipeOrRedir) -> TokenType;
    }

    enum MoveWordStyle {
        move_word_style_punctuation,     // stop at punctuation
        move_word_style_path_components, // stops at path components
        move_word_style_whitespace,      // stops at whitespace
    }

    /// Our state machine that implements "one word" movement or erasure.
    struct MoveWordStateMachine {
        state: u8,
        style: MoveWordStyle,
    }

    extern "Rust" {
        fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine>;
        #[cxx_name = "consume_char"]
        fn consume_char_ffi(self: &mut MoveWordStateMachine, c: wchar_t) -> bool;
        fn reset(self: &mut MoveWordStateMachine);
    }

    extern "Rust" {
        #[cxx_name = "variable_assignment_equals_pos"]
        fn variable_assignment_equals_pos_ffi(txt: &CxxWString) -> SharedPtr<usize>;
    }
}

pub use tokenizer_ffi::{
    MoveWordStateMachine, MoveWordStyle, PipeOrRedir, Tok, TokenType, TokenizerError,
};

#[derive(Clone, Copy)]
pub struct TokFlags(pub u8);

impl BitAnd for TokFlags {
    type Output = bool;
    fn bitand(self, rhs: Self) -> Self::Output {
        (self.0 & rhs.0) != 0
    }
}

impl BitOr for TokFlags {
    type Output = Self;
    fn bitor(self, rhs: Self) -> Self::Output {
        Self(self.0 | rhs.0)
    }
}

/// Flag telling the tokenizer to accept incomplete parameters, i.e. parameters with mismatched
/// parentheses, etc. This is useful for tab-completion.
pub const TOK_ACCEPT_UNFINISHED: TokFlags = TokFlags(1);

/// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting.
pub const TOK_SHOW_COMMENTS: TokFlags = TokFlags(2);

/// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells
/// the tokenizer to return each of them as a separate END.
pub const TOK_SHOW_BLANK_LINES: TokFlags = TokFlags(4);

/// Make an effort to continue after an error.
pub const TOK_CONTINUE_AFTER_ERROR: TokFlags = TokFlags(8);
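
// Flags combine with `|` and are tested with `&` via the impls above. An
// illustrative sketch (not part of the original file):
//
//     let flags = TOK_SHOW_COMMENTS | TOK_SHOW_BLANK_LINES;
//     assert!(flags & TOK_SHOW_COMMENTS);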

/// Get the error message for an error \p err.
pub fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString> {
    let s: &'static wstr = err.into();
    s.to_ffi()
}

impl From<TokenizerError> for &'static wstr {
    #[widestrs]
    fn from(err: TokenizerError) -> Self {
        match err {
            TokenizerError::none => ""L,
            TokenizerError::unterminated_quote => {
                wgettext!("Unexpected end of string, quotes are not balanced")
            }
            TokenizerError::unterminated_subshell => {
                wgettext!("Unexpected end of string, expecting ')'")
            }
            TokenizerError::unterminated_slice => {
                wgettext!("Unexpected end of string, square brackets do not match")
            }
            TokenizerError::unterminated_escape => {
                wgettext!("Unexpected end of string, incomplete escape sequence")
            }
            TokenizerError::invalid_redirect => {
                wgettext!("Invalid input/output redirection")
            }
            TokenizerError::invalid_pipe => {
                wgettext!("Cannot use stdin (fd 0) as pipe output")
            }
            TokenizerError::invalid_pipe_ampersand => {
                wgettext!("|& is not valid. In fish, use &| to pipe both stdout and stderr.")
            }
            TokenizerError::closing_unopened_subshell => {
                wgettext!("Unexpected ')' for unopened parenthesis")
            }
            TokenizerError::illegal_slice => {
                wgettext!("Unexpected '[' at this location")
            }
            TokenizerError::closing_unopened_brace => {
                wgettext!("Unexpected '}' for unopened brace expansion")
            }
            TokenizerError::unterminated_brace => {
                wgettext!("Unexpected end of string, incomplete parameter expansion")
            }
            TokenizerError::expected_pclose_found_bclose => {
                wgettext!("Unexpected '}' found, expecting ')'")
            }
            TokenizerError::expected_bclose_found_pclose => {
                wgettext!("Unexpected ')' found, expecting '}'")
            }
            _ => {
                panic!("Unexpected tokenizer error");
            }
        }
    }
}

impl Tok {
    fn new(r#type: TokenType) -> Tok {
        Tok {
            offset: 0,
            length: 0,
            error_offset_within_token: SOURCE_OFFSET_INVALID,
            error_length: 0,
            error: TokenizerError::none,
            type_: r#type,
        }
    }
    pub fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool {
        let loc = loc as u32;
        self.offset <= loc && loc - self.offset <= self.length
    }
    pub fn get_source<'a, 'b>(self: &'a Tok, str: &'b wstr) -> &'b wstr {
        &str[self.offset as usize..(self.offset + self.length) as usize]
    }
    fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString> {
        self.get_source(&str.from_ffi()).to_ffi()
    }
}
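
// A token's offset and length index into the string handed to `get_source`. As
// an illustrative sketch: for the input "echo hi", the token for "hi" carries
// offset 5 and length 2, so
//
//     tok.get_source(L!("echo hi")) == L!("hi")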

/// The tokenizer struct.
pub struct Tokenizer {
    /// A pointer into the original string, showing where the next token begins.
    token_cursor: usize,
    /// The start of the original string.
    start: WString, // TODO Avoid copying once we drop the FFI.
    /// Whether we have additional tokens.
    has_next: bool,
    /// Whether incomplete tokens are accepted.
    accept_unfinished: bool,
    /// Whether comments should be returned.
    show_comments: bool,
    /// Whether all blank lines are returned.
    show_blank_lines: bool,
    /// Whether to attempt to continue after an error.
    continue_after_error: bool,
    /// Whether to continue the previous line after the comment.
    continue_line_after_comment: bool,
}

impl Tokenizer {
    /// Constructor for a tokenizer. `start` is the string to be tokenized; it is copied into the
    /// tokenizer (see the TODO on `start` above about avoiding the copy).
    ///
    /// \param start The string to tokenize
    /// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
    /// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid
    /// token. Setting TOK_SHOW_COMMENTS will return comments as tokens.
    pub fn new(start: &wstr, flags: TokFlags) -> Self {
        Tokenizer {
            token_cursor: 0,
            start: start.to_owned(),
            has_next: true,
            accept_unfinished: flags & TOK_ACCEPT_UNFINISHED,
            show_comments: flags & TOK_SHOW_COMMENTS,
            show_blank_lines: flags & TOK_SHOW_BLANK_LINES,
            continue_after_error: flags & TOK_CONTINUE_AFTER_ERROR,
            continue_line_after_comment: false,
        }
    }
}

fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer> {
    Box::new(Tokenizer::new(start.into(), TokFlags(flags)))
}

impl Tokenizer {
    /// Returns the next token, or none if we are at the end.
    pub fn next(&mut self) -> Option<Tok> {
        // TODO Implement IntoIterator.
        if !self.has_next {
            return None;
        }

        // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past
        // it.
        loop {
            let i = self.token_cursor;
            if self.start.get(i..i + 2) == Some(L!("\\\n")) {
                self.token_cursor += 2;
                self.continue_line_after_comment = true;
            } else if i < self.start.len() && iswspace_not_nl(self.start.char_at(i)) {
                self.token_cursor += 1;
            } else {
                break;
            }
        }

        while self.start.char_at(self.token_cursor) == '#' {
            // We have a comment, walk over the comment.
            let comment_start = self.token_cursor;
            self.token_cursor = comment_end(&self.start, self.token_cursor);
            let comment_len = self.token_cursor - comment_start;

            // If we are going to continue after the comment, skip any trailing newline.
            if self.start.as_char_slice().get(self.token_cursor) == Some(&'\n')
                && self.continue_line_after_comment
            {
                self.token_cursor += 1;
            }

            // Maybe return the comment.
            if self.show_comments {
                let mut result = Tok::new(TokenType::comment);
                result.offset = comment_start as u32;
                result.length = comment_len as u32;
                return Some(result);
            }

            while self.token_cursor < self.start.len()
                && iswspace_not_nl(self.start.char_at(self.token_cursor))
            {
                self.token_cursor += 1;
            }
        }

        // We made it past the comments and ate any trailing newlines we wanted to ignore.
        self.continue_line_after_comment = false;
        let start_pos = self.token_cursor;

        let this_char = self.start.char_at(self.token_cursor);
        let next_char = self
            .start
            .as_char_slice()
            .get(self.token_cursor + 1)
            .copied();
        let buff = &self.start[self.token_cursor..];
        match this_char {
            '\0' => {
                self.has_next = false;
                None
            }
            '\r' | // carriage-return
            '\n' | // newline
            ';' => {
                let mut result = Tok::new(TokenType::end);
                result.offset = start_pos as u32;
                result.length = 1;
                self.token_cursor += 1;
                // Hack: when we get a newline, swallow as many as we can. This compresses multiple
                // subsequent newlines into a single one.
                if !self.show_blank_lines {
                    while self.token_cursor < self.start.len() {
                        let c = self.start.char_at(self.token_cursor);
                        if c != '\n' && c != '\r' && c != ' ' && c != '\t' {
                            break;
                        }
                        self.token_cursor += 1;
                    }
                }
                Some(result)
            }
            '&' => {
                if next_char == Some('&') {
                    // && is and.
                    let mut result = Tok::new(TokenType::andand);
                    result.offset = start_pos as u32;
                    result.length = 2;
                    self.token_cursor += 2;
                    Some(result)
                } else if next_char == Some('>') || next_char == Some('|') {
                    // &> and &| redirect both stdout and stderr.
                    let redir = PipeOrRedir::try_from(buff)
                        .expect("Should always succeed to parse a &> or &| redirection");
                    let mut result = Tok::new(redir.token_type());
                    result.offset = start_pos as u32;
                    result.length = redir.consumed as u32;
                    self.token_cursor += redir.consumed;
                    Some(result)
                } else {
                    let mut result = Tok::new(TokenType::background);
                    result.offset = start_pos as u32;
                    result.length = 1;
                    self.token_cursor += 1;
                    Some(result)
                }
            }
            '|' => {
                if next_char == Some('|') {
                    // || is or.
                    let mut result = Tok::new(TokenType::oror);
                    result.offset = start_pos as u32;
                    result.length = 2;
                    self.token_cursor += 2;
                    Some(result)
                } else if next_char == Some('&') {
                    // |& is a bashism; in fish it's &|.
                    Some(self.call_error(
                        TokenizerError::invalid_pipe_ampersand,
                        self.token_cursor,
                        self.token_cursor,
                        Some(2),
                        2,
                    ))
                } else {
                    let pipe = PipeOrRedir::try_from(buff)
                        .expect("Should always succeed to parse a | pipe");
                    let mut result = Tok::new(pipe.token_type());
                    result.offset = start_pos as u32;
                    result.length = pipe.consumed as u32;
                    self.token_cursor += pipe.consumed;
                    Some(result)
                }
            }
            '>' | '<' => {
                // There's some duplication with the code in the default case below. The key
                // difference here is that we must never parse these as a string; a failed
                // redirection is an error!
                match PipeOrRedir::try_from(buff) {
                    Ok(redir_or_pipe) => {
                        if redir_or_pipe.fd < 0 {
                            Some(self.call_error(
                                TokenizerError::invalid_redirect,
                                self.token_cursor,
                                self.token_cursor,
                                Some(redir_or_pipe.consumed),
                                redir_or_pipe.consumed,
                            ))
                        } else {
                            let mut result = Tok::new(redir_or_pipe.token_type());
                            result.offset = start_pos as u32;
                            result.length = redir_or_pipe.consumed as u32;
                            self.token_cursor += redir_or_pipe.consumed;
                            Some(result)
                        }
                    }
                    Err(()) => Some(self.call_error(
                        TokenizerError::invalid_redirect,
                        self.token_cursor,
                        self.token_cursor,
                        Some(0),
                        0,
                    )),
                }
            }
            _ => {
                // Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
                let error_location = self.token_cursor;
                let redir_or_pipe = if this_char.is_ascii_digit() {
                    PipeOrRedir::try_from(buff).ok()
                } else {
                    None
                };

                match redir_or_pipe {
                    Some(redir_or_pipe) => {
                        // It looks like a redirection or a pipe. But we don't support piping fd 0.
                        // Note that fd 0 may be -1, indicating overflow; but we don't treat that
                        // as a tokenizer error.
                        if redir_or_pipe.is_pipe && redir_or_pipe.fd == 0 {
                            Some(self.call_error(
                                TokenizerError::invalid_pipe,
                                error_location,
                                error_location,
                                Some(redir_or_pipe.consumed),
                                redir_or_pipe.consumed,
                            ))
                        } else {
                            let mut result = Tok::new(redir_or_pipe.token_type());
                            result.offset = start_pos as u32;
                            result.length = redir_or_pipe.consumed as u32;
                            self.token_cursor += redir_or_pipe.consumed;
                            Some(result)
                        }
                    }
                    None => {
                        // Not a redirection or pipe, so just a string.
                        Some(self.read_string())
                    }
                }
            }
        }
    }
    fn next_ffi(&mut self) -> UniquePtr<Tok> {
        match self.next() {
            Some(tok) => UniquePtr::new(tok),
            None => UniquePtr::null(),
        }
    }
}
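
// A minimal driver sketch (illustrative only, not part of the FFI surface):
// walk a command line and collect the text of each string token.
//
//     let mut t = Tokenizer::new(L!("echo hi > /dev/null"), TokFlags(0));
//     while let Some(tok) = t.next() {
//         if tok.type_ == TokenType::string {
//             let text = t.text_of(&tok); // "echo", "hi", "/dev/null"
//         }
//     }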

/// Test if a character is whitespace. Differs from iswspace in that it does not consider a
/// newline to be whitespace.
fn iswspace_not_nl(c: char) -> bool {
    match c {
        ' ' | '\t' | '\r' => true,
        '\n' => false,
        _ => c.is_whitespace(),
    }
}

impl Tokenizer {
    /// Returns the text of a token, as a string.
    pub fn text_of(&self, tok: &Tok) -> &wstr {
        tok.get_source(&self.start)
    }
    fn text_of_ffi(&self, tok: &Tok) -> UniquePtr<CxxWString> {
        self.text_of(tok).to_ffi()
    }

    /// Return an error token and mark that we no longer have a next token.
    fn call_error(
        &mut self,
        error_type: TokenizerError,
        token_start: usize,
        error_loc: usize,
        token_length: Option<usize>,
        error_len: usize,
    ) -> Tok {
        assert!(
            error_type != TokenizerError::none,
            "TokenizerError::none passed to call_error"
        );
        assert!(error_loc >= token_start, "Invalid error location");
        assert!(self.token_cursor >= token_start, "Invalid buff location");

        // If continue_after_error is set and we have a real token length, then skip past it.
        // Otherwise give up.
        match token_length {
            Some(token_length) if self.continue_after_error => {
                assert!(
                    self.token_cursor < error_loc + token_length,
                    "Unable to continue past error"
                );
                self.token_cursor = error_loc + token_length;
            }
            _ => self.has_next = false,
        }

        Tok {
            offset: token_start as u32,
            length: token_length.unwrap_or(self.token_cursor - token_start) as u32,
            error_offset_within_token: (error_loc - token_start) as u32,
            error_length: error_len as u32,
            error: error_type,
            type_: TokenType::error,
        }
    }
}

impl Tokenizer {
    /// Read the next token as a string.
    fn read_string(&mut self) -> Tok {
        let mut mode = TOK_MODE_REGULAR_TEXT;
        let mut paran_offsets = vec![];
        let mut brace_offsets = vec![];
        let mut expecting = vec![];
        let mut quoted_cmdsubs = vec![];
        let mut slice_offset = 0;
        let buff_start = self.token_cursor;
        let mut is_token_begin = true;

        fn process_opening_quote(
            this: &mut Tokenizer,
            quoted_cmdsubs: &mut Vec<usize>,
            paran_offsets: &mut Vec<usize>,
            quote: char,
        ) -> Result<(), usize> {
            if let Some(end) = quote_end(&this.start, this.token_cursor, quote) {
                if this.start.char_at(end) == '$' {
                    quoted_cmdsubs.push(paran_offsets.len());
                }
                this.token_cursor = end;
                Ok(())
            } else {
                let error_loc = this.token_cursor;
                this.token_cursor = this.start.len();
                Err(error_loc)
            }
        }

        while self.token_cursor != self.start.len() {
            let c = self.start.char_at(self.token_cursor);

            // Make sure this character isn't being escaped before anything else
            if mode & TOK_MODE_CHAR_ESCAPE {
                mode &= !TOK_MODE_CHAR_ESCAPE;
                // and do nothing more
            } else if myal(c) {
                // Early exit optimization in case the character is just a letter,
                // which has no special meaning to the tokenizer, i.e. the same mode continues.
            }
            // Now proceed with the evaluation of the token, first checking to see if the token
            // has been explicitly ignored (escaped).
            else if c == '\\' {
                mode |= TOK_MODE_CHAR_ESCAPE;
            } else if c == '#' && is_token_begin {
                self.token_cursor = comment_end(&self.start, self.token_cursor) - 1;
            } else if c == '(' {
                paran_offsets.push(self.token_cursor);
                expecting.push(')');
                mode |= TOK_MODE_SUBSHELL;
            } else if c == '{' {
                brace_offsets.push(self.token_cursor);
                expecting.push('}');
                mode |= TOK_MODE_CURLY_BRACES;
            } else if c == ')' {
                if expecting.last() == Some(&'}') {
                    return self.call_error(
                        TokenizerError::expected_bclose_found_pclose,
                        self.token_cursor,
                        self.token_cursor,
                        Some(1),
                        1,
                    );
                }
                if paran_offsets.is_empty() {
                    return self.call_error(
                        TokenizerError::closing_unopened_subshell,
                        self.token_cursor,
                        self.token_cursor,
                        Some(1),
                        1,
                    );
                }
                paran_offsets.pop();
                if paran_offsets.is_empty() {
                    mode &= !TOK_MODE_SUBSHELL;
                }
                expecting.pop();
                // Check if the ) completed a quoted command substitution.
                if quoted_cmdsubs.last() == Some(&paran_offsets.len()) {
                    quoted_cmdsubs.pop();
                    // The "$(" part of a quoted command substitution closes double quotes. To keep
                    // quotes balanced, act as if there was an invisible double quote after the ")".
                    if let Err(error_loc) =
                        process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, '"')
                    {
                        if !self.accept_unfinished {
                            return self.call_error(
                                TokenizerError::unterminated_quote,
                                buff_start,
                                error_loc,
                                None,
                                0,
                            );
                        }
                        break;
                    }
                }
            } else if c == '}' {
                if expecting.last() == Some(&')') {
                    return self.call_error(
                        TokenizerError::expected_pclose_found_bclose,
                        self.token_cursor,
                        self.token_cursor,
                        Some(1),
                        1,
                    );
                }
                if brace_offsets.is_empty() {
                    return self.call_error(
                        TokenizerError::closing_unopened_brace,
                        self.token_cursor,
                        self.start.len(),
                        None,
                        0,
                    );
                }
                brace_offsets.pop();
                if brace_offsets.is_empty() {
                    mode &= !TOK_MODE_CURLY_BRACES;
                }
                expecting.pop();
            } else if c == '[' {
                if self.token_cursor != buff_start {
                    mode |= TOK_MODE_ARRAY_BRACKETS;
                    slice_offset = self.token_cursor;
                } else {
                    // This is actually allowed so the test operator `[` can be used as the head of a
                    // command
                }
            }
            // Only exit bracket mode if we are in bracket mode.
            // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
            // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
            else if c == ']' && (mode & TOK_MODE_ARRAY_BRACKETS) {
                mode &= !TOK_MODE_ARRAY_BRACKETS;
            } else if c == '\'' || c == '"' {
                if let Err(error_loc) =
                    process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, c)
                {
                    if !self.accept_unfinished {
                        return self.call_error(
                            TokenizerError::unterminated_quote,
                            buff_start,
                            error_loc,
                            None,
                            1,
                        );
                    }
                    break;
                }
            } else if mode == TOK_MODE_REGULAR_TEXT
                && !tok_is_string_character(
                    c,
                    self.start
                        .as_char_slice()
                        .get(self.token_cursor + 1)
                        .copied(),
                )
            {
                break;
            }

            let next = self
                .start
                .as_char_slice()
                .get(self.token_cursor + 1)
                .copied();
            is_token_begin = is_token_delimiter(c, next);
            self.token_cursor += 1;
        }

        if !self.accept_unfinished && mode != TOK_MODE_REGULAR_TEXT {
            // These are all "unterminated", so the only char we can mark as an error
            // is the opener (the closing char could be anywhere!)
            //
            // (except for TOK_MODE_CHAR_ESCAPE, which is one long by definition)
            if mode & TOK_MODE_CHAR_ESCAPE {
                return self.call_error(
                    TokenizerError::unterminated_escape,
                    buff_start,
                    self.token_cursor - 1,
                    None,
                    1,
                );
            } else if mode & TOK_MODE_ARRAY_BRACKETS {
                return self.call_error(
                    TokenizerError::unterminated_slice,
                    buff_start,
                    slice_offset,
                    None,
                    1,
                );
            } else if mode & TOK_MODE_SUBSHELL {
                assert!(!paran_offsets.is_empty());
                let offset_of_open_paran = *paran_offsets.last().unwrap();

                return self.call_error(
                    TokenizerError::unterminated_subshell,
                    buff_start,
                    offset_of_open_paran,
                    None,
                    1,
                );
            } else if mode & TOK_MODE_CURLY_BRACES {
                assert!(!brace_offsets.is_empty());
                let offset_of_open_brace = *brace_offsets.last().unwrap();

                return self.call_error(
                    TokenizerError::unterminated_brace,
                    buff_start,
                    offset_of_open_brace,
                    None,
                    1,
                );
            } else {
                panic!("Unknown non-regular-text mode");
            }
        }

        let mut result = Tok::new(TokenType::string);
        result.offset = buff_start as u32;
        result.length = (self.token_cursor - buff_start) as u32;
        result
    }
}
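
/// Return the position of the terminating quote for the quote at `s[pos]`, or None if the quote
/// is unterminated. Inside double quotes, a "$(" sequence also ends the quoted region because it
/// begins a command substitution; in that case the position of the '$' is returned.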
pub fn quote_end(s: &wstr, mut pos: usize, quote: char) -> Option<usize> {
    loop {
        pos += 1;

        if pos == s.len() {
            return None;
        }

        let c = s.char_at(pos);
        if c == '\\' {
            pos += 1;
            if pos == s.len() {
                return None;
            }
        } else if c == quote ||
            // Command substitutions also end a double quoted string. This is how we
            // support command substitutions inside double quotes.
            (quote == '"' && c == '$' && s.as_char_slice().get(pos+1) == Some(&'('))
        {
            return Some(pos);
        }
    }
}
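
/// Return the position of the newline that ends the comment starting at `pos`, or the end of the
/// string if the comment runs to the end.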
pub fn comment_end(s: &wstr, mut pos: usize) -> usize {
    loop {
        pos += 1;
        if pos == s.len() || s.char_at(pos) == '\n' {
            return pos;
        }
    }
}

/// Tests if this character can be a part of a string. Hash (#) starts a comment if it's the first
/// character in a token; otherwise it is considered a string character. See issue #953.
fn tok_is_string_character(c: char, next: Option<char>) -> bool {
    match c {
        // Unconditional separators.
        '\0' | ' ' | '\n' | '|' | '\t' | ';' | '\r' | '<' | '>' => false,
        '&' => {
            if feature_test(FeatureFlag::ampersand_nobg_in_token) {
                // Unlike in other shells, '&' is not special if followed by a string character.
                next.map(|nc| tok_is_string_character(nc, None))
                    .unwrap_or(false)
            } else {
                false
            }
        }
        _ => true,
    }
}

/// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster
/// by adding a fast path for the most common characters. This is obviously not a suitable
/// replacement for iswalpha.
fn myal(c: char) -> bool {
    ('a'..='z').contains(&c) || ('A'..='Z').contains(&c)
}

#[derive(Clone, Copy, PartialEq, Eq)]
struct TokModes(u8);

const TOK_MODE_REGULAR_TEXT: TokModes = TokModes(0); // regular text
const TOK_MODE_SUBSHELL: TokModes = TokModes(1 << 0); // inside of subshell parentheses
const TOK_MODE_ARRAY_BRACKETS: TokModes = TokModes(1 << 1); // inside of array brackets
const TOK_MODE_CURLY_BRACES: TokModes = TokModes(1 << 2);
const TOK_MODE_CHAR_ESCAPE: TokModes = TokModes(1 << 3);

impl BitAnd for TokModes {
    type Output = bool;
    fn bitand(self, rhs: Self) -> Self::Output {
        (self.0 & rhs.0) != 0
    }
}
impl BitAndAssign for TokModes {
    fn bitand_assign(&mut self, rhs: Self) {
        self.0 &= rhs.0
    }
}
impl BitOrAssign for TokModes {
    fn bitor_assign(&mut self, rhs: Self) {
        self.0 |= rhs.0
    }
}
impl Not for TokModes {
    type Output = TokModes;
    fn not(self) -> Self::Output {
        TokModes(!self.0)
    }
}

/// Tests if this character can delimit tokens.
pub fn is_token_delimiter(c: char, next: Option<char>) -> bool {
    c == '(' || !tok_is_string_character(c, next)
}

fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool {
    is_token_delimiter(
        c.try_into().unwrap(),
        next.as_ref().map(|c| (*c).try_into().unwrap()),
    )
}

/// \return the first token from the string, skipping variable assignments like A=B.
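/// For example, given "VAR=val echo hi", this returns "echo".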
pub fn tok_command(str: &wstr) -> WString {
    let mut t = Tokenizer::new(str, TokFlags(0));
    while let Some(token) = t.next() {
        if token.type_ != TokenType::string {
            return WString::new();
        }
        let text = t.text_of(&token);
        if variable_assignment_equals_pos(text).is_some() {
            continue;
        }
        return text.to_owned();
    }
    WString::new()
}
fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString> {
    tok_command(&str.from_ffi()).to_ffi()
}

impl TryFrom<&wstr> for PipeOrRedir {
    type Error = ();

    /// Examples of supported syntaxes.
    /// Note we are only responsible for parsing the redirection part, not 'cmd' or 'file'.
    ///
    ///   cmd | cmd        normal pipe
    ///   cmd &| cmd       normal pipe plus stderr-merge
    ///   cmd >| cmd       pipe with explicit fd
    ///   cmd 2>| cmd      pipe with explicit fd
    ///   cmd < file       stdin redirection
    ///   cmd > file       redirection
    ///   cmd >> file      appending redirection
    ///   cmd >? file      noclobber redirection
    ///   cmd >>? file     appending noclobber redirection
    ///   cmd 2> file      file redirection with explicit fd
    ///   cmd >&2          fd redirection with no explicit src fd (stdout is used)
    ///   cmd 1>&2         fd redirection with an explicit src fd
    ///   cmd <&2          fd redirection with no explicit src fd (stdin is used)
    ///   cmd 3<&0         fd redirection with an explicit src fd
    ///   cmd &> file      redirection with stderr merge
    ///   cmd ^ file       caret (stderr) redirection, perhaps disabled via feature flags
    ///   cmd ^^ file      caret (stderr) redirection, perhaps disabled via feature flags
    fn try_from(buff: &wstr) -> Result<PipeOrRedir, ()> {
        // Extract a range of leading fd.
        let mut cursor = buff.chars().take_while(|c| c.is_ascii_digit()).count();
        let fd_buff = &buff[..cursor];
        let has_fd = !fd_buff.is_empty();

        // Try consuming a given character.
        // Return true if consumed. On success, advances cursor.
        let try_consume = |cursor: &mut usize, c| -> bool {
            if buff.char_at(*cursor) != c {
                false
            } else {
                *cursor += 1;
                true
            }
        };

        // Like try_consume, but asserts on failure.
        let consume = |cursor: &mut usize, c| {
            assert!(buff.char_at(*cursor) == c, "Failed to consume char");
            *cursor += 1;
        };

        let c = buff.char_at(cursor);
        let mut result = PipeOrRedir {
            fd: -1,
            is_pipe: false,
            mode: RedirectionMode::overwrite,
            stderr_merge: false,
            consumed: 0,
        };
        match c {
            '|' => {
                if has_fd {
                    // Like 123|
                    return Err(());
                }
                consume(&mut cursor, '|');
                assert!(
                    buff.char_at(cursor) != '|',
                    "|| passed as redirection, this should have been handled as 'or' by the caller"
                );
                result.fd = STDOUT_FILENO;
                result.is_pipe = true;
            }
            '>' => {
                consume(&mut cursor, '>');
                if try_consume(&mut cursor, '>') {
                    result.mode = RedirectionMode::append;
                }
                if try_consume(&mut cursor, '|') {
                    // Note we differ from bash here.
                    // Consider `echo foo 2>| bar`
                    // In fish, this is a *pipe*. Run bar as a command and attach foo's stderr to bar's
                    // stdin, while leaving stdout as tty.
                    // In bash, this is a *redirection* to bar as a file. It is like > but ignores
                    // noclobber.
                    result.is_pipe = true;
                    result.fd = if has_fd {
                        parse_fd(fd_buff) // like 2>|
                    } else {
                        STDOUT_FILENO // like >|
                    };
                } else if try_consume(&mut cursor, '&') {
                    // This is a redirection to an fd.
                    // Note that we allow ">>&", but it's still just writing to the fd - "appending" to
                    // it doesn't make sense.
                    result.mode = RedirectionMode::fd;
                    result.fd = if has_fd {
                        parse_fd(fd_buff) // like 1>&2
                    } else {
                        STDOUT_FILENO // like >&2
                    };
                } else {
                    // This is a redirection to a file.
                    result.fd = if has_fd {
                        parse_fd(fd_buff) // like 1> file.txt
                    } else {
                        STDOUT_FILENO // like > file.txt
                    };
                    if result.mode != RedirectionMode::append {
                        result.mode = RedirectionMode::overwrite;
                    }
                    // Note 'echo abc >>? file' is valid: it means append and noclobber.
                    // But here "noclobber" means the file must not exist, so appending
                    // can be ignored.
                    if try_consume(&mut cursor, '?') {
                        result.mode = RedirectionMode::noclob;
                    }
                }
            }
            '<' => {
                consume(&mut cursor, '<');
                if try_consume(&mut cursor, '&') {
                    result.mode = RedirectionMode::fd;
                } else {
                    result.mode = RedirectionMode::input;
                }
                result.fd = if has_fd {
                    parse_fd(fd_buff) // like 1<&3 or 1< /tmp/file.txt
                } else {
                    STDIN_FILENO // like <&3 or < /tmp/file.txt
                };
            }
            '&' => {
                consume(&mut cursor, '&');
                if try_consume(&mut cursor, '|') {
                    // &| is pipe with stderr merge.
                    result.fd = STDOUT_FILENO;
                    result.is_pipe = true;
                    result.stderr_merge = true;
                } else if try_consume(&mut cursor, '>') {
                    result.fd = STDOUT_FILENO;
                    result.stderr_merge = true;
                    result.mode = RedirectionMode::overwrite;
                    if try_consume(&mut cursor, '>') {
                        result.mode = RedirectionMode::append; // like &>>
                    }
                    if try_consume(&mut cursor, '?') {
                        result.mode = RedirectionMode::noclob; // like &>? or &>>?
                    }
                } else {
                    return Err(());
                }
            }
            _ => {
                // Not a redirection.
                return Err(());
            }
        }

        result.consumed = cursor;
        assert!(
            result.consumed > 0,
            "Should have consumed at least one character on success"
        );
        Ok(result)
    }
}
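
// Parsing sketch (illustrative): PipeOrRedir::try_from(L!("2>|")) yields a pipe
// with fd 2; L!("&>") yields an overwrite redirection with stderr_merge set; and
// L!("abc") yields Err(()) because no redirection operator is present.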

fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir> {
    match PipeOrRedir::try_from(Into::<&wstr>::into(buff)) {
        Ok(p) => UniquePtr::new(p),
        Err(()) => UniquePtr::null(),
    }
}

impl PipeOrRedir {
    /// \return the oflags (as in open(2)) for this redirection.
    pub fn oflags(&self) -> c_int {
        self.mode.oflags().unwrap_or(-1)
    }

    // \return if we are "valid". Here "valid" means only that the source fd did not overflow.
    // For example 99999999999> is invalid.
    fn is_valid(&self) -> bool {
        self.fd >= 0
    }

    // \return the token type for this redirection.
    fn token_type(&self) -> TokenType {
        if self.is_pipe {
            TokenType::pipe
        } else {
            TokenType::redirect
        }
    }
}

// Parse an fd from the non-empty string [start, end), all of which are digits.
// Return the fd, or -1 on overflow.
fn parse_fd(s: &wstr) -> RawFd {
    assert!(!s.is_empty());
    let chars: Vec<u8> = s
        .chars()
        .map(|c| {
            assert!(c.is_ascii_digit());
            c as u8
        })
        .collect();
    let s = std::str::from_utf8(chars.as_slice()).unwrap();
    s.parse().unwrap_or(-1)
}

fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine> {
    Box::new(MoveWordStateMachine::new(syl))
}

impl MoveWordStateMachine {
    pub fn new(style: MoveWordStyle) -> Self {
        MoveWordStateMachine { state: 0, style }
    }

    pub fn consume_char(&mut self, c: char) -> bool {
        match self.style {
            MoveWordStyle::move_word_style_punctuation => self.consume_char_punctuation(c),
            MoveWordStyle::move_word_style_path_components => self.consume_char_path_components(c),
            MoveWordStyle::move_word_style_whitespace => self.consume_char_whitespace(c),
            _ => panic!(),
        }
    }
    pub fn consume_char_ffi(&mut self, c: wchar_t) -> bool {
        self.consume_char(c.try_into().unwrap())
    }

    pub fn reset(&mut self) {
        self.state = 0;
    }

    fn consume_char_punctuation(&mut self, c: char) -> bool {
        const S_ALWAYS_ONE: u8 = 0;
        const S_REST: u8 = 1;
        const S_WHITESPACE_REST: u8 = 2;
        const S_WHITESPACE: u8 = 3;
        const S_ALPHANUMERIC: u8 = 4;
        const S_END: u8 = 5;

        let mut consumed = false;
        while self.state != S_END && !consumed {
            match self.state {
                S_ALWAYS_ONE => {
                    // Always consume the first character.
                    consumed = true;
                    if c.is_whitespace() {
                        self.state = S_WHITESPACE;
                    } else if c.is_alphanumeric() {
                        self.state = S_ALPHANUMERIC;
                    } else {
                        // Don't allow switching type (ws->nonws) after non-whitespace and
                        // non-alphanumeric.
                        self.state = S_REST;
                    }
                }
                S_REST => {
                    if c.is_whitespace() {
                        // Consume only trailing whitespace.
                        self.state = S_WHITESPACE_REST;
                    } else if c.is_alphanumeric() {
                        // Consume only alnums.
                        self.state = S_ALPHANUMERIC;
                    } else {
                        consumed = false;
                        self.state = S_END;
                    }
                }
                S_WHITESPACE_REST | S_WHITESPACE => {
                    // "whitespace" consumes whitespace and switches to alnums,
                    // "whitespace_rest" only consumes whitespace.
                    if c.is_whitespace() {
                        // Consumed whitespace.
                        consumed = true;
                    } else {
                        self.state = if self.state == S_WHITESPACE {
                            S_ALPHANUMERIC
                        } else {
                            S_END
                        };
                    }
                }
                S_ALPHANUMERIC => {
                    if c.is_alphanumeric() {
                        consumed = true; // consumed alphanumeric
                    } else {
                        self.state = S_END;
                    }
                }
                _ => {}
            }
        }
        consumed
    }

    fn consume_char_path_components(&mut self, c: char) -> bool {
        const S_INITIAL_PUNCTUATION: u8 = 0;
        const S_WHITESPACE: u8 = 1;
        const S_SEPARATOR: u8 = 2;
        const S_SLASH: u8 = 3;
        const S_PATH_COMPONENT_CHARACTERS: u8 = 4;
        const S_INITIAL_SEPARATOR: u8 = 5;
        const S_END: u8 = 6;

        let mut consumed = false;
        while self.state != S_END && !consumed {
            match self.state {
                S_INITIAL_PUNCTUATION => {
                    if !is_path_component_character(c) && !c.is_whitespace() {
                        self.state = S_INITIAL_SEPARATOR;
                    } else {
                        if !is_path_component_character(c) {
                            consumed = true;
                        }
                        self.state = S_WHITESPACE;
                    }
                }
                S_WHITESPACE => {
                    if c.is_whitespace() {
                        consumed = true; // consumed whitespace
                    } else if c == '/' || is_path_component_character(c) {
                        self.state = S_SLASH; // path component
                    } else {
                        self.state = S_SEPARATOR; // path separator
                    }
                }
                S_SEPARATOR => {
                    if !c.is_whitespace() && !is_path_component_character(c) {
                        consumed = true; // consumed separator
                    } else {
                        self.state = S_END;
                    }
                }
                S_SLASH => {
                    if c == '/' {
                        consumed = true; // consumed slash
                    } else {
                        self.state = S_PATH_COMPONENT_CHARACTERS;
                    }
                }
                S_PATH_COMPONENT_CHARACTERS => {
                    if is_path_component_character(c) {
                        consumed = true; // consumed string character except slash
                    } else {
                        self.state = S_END;
                    }
                }
                S_INITIAL_SEPARATOR => {
                    if is_path_component_character(c) {
                        consumed = true;
                        self.state = S_PATH_COMPONENT_CHARACTERS;
                    } else if c.is_whitespace() {
                        self.state = S_END;
                    } else {
                        consumed = true;
                    }
                }
                _ => {}
            }
        }
        consumed
    }

    fn consume_char_whitespace(&mut self, c: char) -> bool {
        // Consume a "word" of printable characters plus any leading whitespace.
        const S_ALWAYS_ONE: u8 = 0;
        const S_BLANK: u8 = 1;
        const S_GRAPH: u8 = 2;
        const S_END: u8 = 3;

        let mut consumed = false;
        while self.state != S_END && !consumed {
            match self.state {
                S_ALWAYS_ONE => {
                    consumed = true; // always consume the first character
                    // If it's not whitespace, only consume those from here.
                    if !c.is_whitespace() {
                        self.state = S_GRAPH;
                    } else {
                        // If it's whitespace, keep consuming whitespace until the graphs.
                        self.state = S_BLANK;
                    }
                }
                S_BLANK => {
                    if c.is_whitespace() {
                        consumed = true; // consumed whitespace
                    } else {
                        self.state = S_GRAPH;
                    }
                }
                S_GRAPH => {
                    if !c.is_whitespace() {
                        consumed = true; // consumed printable non-space
                    } else {
                        self.state = S_END;
                    }
                }
                _ => {}
            }
        }
        consumed
    }
}

fn is_path_component_character(c: char) -> bool {
    tok_is_string_character(c, None) && !L!("/={,}'\":@").as_char_slice().contains(&c)
}

/// The position of the equal sign in a variable assignment like foo=bar.
///
/// Return the location of the equals sign, or None if the string does
/// not look like a variable assignment like FOO=bar. The detection
/// works similarly to some POSIX shells: only letters and numbers are
/// allowed on the left hand side, no quotes or escaping.
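/// For example, "FOO=bar" yields Some(3), while "=bar" (empty name) and "echo"
/// (no '=') yield None.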
pub fn variable_assignment_equals_pos(txt: &wstr) -> Option<usize> {
    let mut found_potential_variable = false;

    // TODO bracket indexing
    for (i, c) in txt.chars().enumerate() {
        if !found_potential_variable {
            if !valid_var_name_char(c as wchar_t) {
                return None;
            }
            found_potential_variable = true;
        } else {
            if c == '=' {
                return Some(i);
            }
            if !valid_var_name_char(c as wchar_t) {
                return None;
            }
        }
    }
    None
}

fn variable_assignment_equals_pos_ffi(txt: &CxxWString) -> SharedPtr<usize> {
    match variable_assignment_equals_pos(&txt.from_ffi()) {
        Some(p) => SharedPtr::new(p),
        None => SharedPtr::null(),
    }
}
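
// A few illustrative unit tests added in this edit. They are a sketch: they
// exercise only the pure-Rust paths above (no FFI), and the expected values are
// read off the implementations rather than taken from fish's own test suite.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_fd_overflow() {
        assert_eq!(parse_fd(L!("3")), 3);
        // An fd too large for i32 reports -1 rather than panicking.
        assert_eq!(parse_fd(L!("99999999999999999999")), -1);
    }

    #[test]
    fn pipe_or_redir_basics() {
        // "2>|" is a pipe carrying fd 2 (fish semantics; bash would treat it as a redirection).
        let pipe = PipeOrRedir::try_from(L!("2>|")).expect("should parse");
        assert!(pipe.is_pipe);
        assert_eq!(pipe.fd, 2);
        // A string with no redirection operator is rejected.
        assert!(PipeOrRedir::try_from(L!("abc")).is_err());
    }

    #[test]
    fn token_delimiters() {
        // '(' always delimits; an ordinary letter never does.
        assert!(is_token_delimiter('(', Some('a')));
        assert!(!is_token_delimiter('a', None));
    }
}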