2016-05-04 05:35:12 +08:00
// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
// extended to support marks, tokenizing multiple strings and disposing of unused string segments.
2016-05-19 06:30:21 +08:00
# include "config.h" // IWYU pragma: keep
2016-05-04 05:35:12 +08:00
# include <fcntl.h>
# include <limits.h>
# include <unistd.h>
2005-09-20 21:26:39 +08:00
# include <wchar.h>
# include <wctype.h>
2017-02-11 10:47:02 +08:00
2015-07-25 23:14:25 +08:00
# include <string>
2017-02-11 10:47:02 +08:00
# include <type_traits>
2006-02-28 21:17:16 +08:00
2005-09-20 21:26:39 +08:00
# include "common.h"
2016-05-04 05:35:12 +08:00
# include "fallback.h" // IWYU pragma: keep
2015-07-25 23:14:25 +08:00
# include "tokenizer.h"
2016-05-04 05:35:12 +08:00
# include "wutil.h" // IWYU pragma: keep
2006-07-20 06:55:49 +08:00
2018-03-12 08:36:10 +08:00
// Tokenizer error descriptors. Each one is heap-allocated once during static initialization and
// handed out by pointer inside error tokens; call_error() compares against TOK_ERROR_NONE by
// address. No matching delete is visible in this file.

// Placeholder meaning "no error"; carries an empty message.
tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");

// Errors for input that ended too early. These also carry a dedicated parse error code.
tokenizer_error *TOK_UNTERMINATED_QUOTE =
    new tokenizer_error(L"Unexpected end of string, quotes are not balanced",
                        parse_error_tokenizer_unterminated_quote);
tokenizer_error *TOK_UNTERMINATED_SUBSHELL =
    new tokenizer_error(L"Unexpected end of string, expecting ')'",
                        parse_error_tokenizer_unterminated_subshell);
tokenizer_error *TOK_UNTERMINATED_SLICE =
    new tokenizer_error(L"Unexpected end of string, square brackets do not match",
                        parse_error_tokenizer_unterminated_slice);
tokenizer_error *TOK_UNTERMINATED_ESCAPE =
    new tokenizer_error(L"Unexpected end of string, incomplete escape sequence",
                        parse_error_tokenizer_unterminated_escape);

// Errors constructed from a message only.
tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error(L"Invalid input/output redirection");
tokenizer_error *TOK_INVALID_PIPE =
    new tokenizer_error(L"Cannot use stdin (fd 0) as pipe output");
tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL =
    new tokenizer_error(L"Unexpected ')' for unopened parenthesis");
tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error(L"Unexpected '[' at this location");
tokenizer_error *TOK_CLOSING_UNOPENED_BRACE =
    new tokenizer_error(L"Unexpected '}' for unopened brace expansion");
tokenizer_error *TOK_UNTERMINATED_BRACE =
    new tokenizer_error(L"Unexpected end of string, incomplete parameter expansion");
tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE =
    new tokenizer_error(L"Unexpected '}' found, expecting ')'");
tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE =
    new tokenizer_error(L"Unexpected ')' found, expecting '}'");
2018-02-24 09:28:12 +08:00
2018-02-24 06:30:15 +08:00
/// Return an error token and mark that we no longer have a next token.
2018-03-12 08:36:10 +08:00
tok_t tokenizer_t : : call_error ( tokenizer_error * error_type , const wchar_t * token_start ,
2018-02-24 06:30:15 +08:00
const wchar_t * error_loc ) {
2018-02-20 08:31:39 +08:00
assert ( error_type ! = TOK_ERROR_NONE & & " TOK_ERROR_NONE passed to call_error " ) ;
2018-02-24 06:30:15 +08:00
assert ( error_loc > = token_start & & " Invalid error location " ) ;
assert ( this - > buff > = token_start & & " Invalid buff location " ) ;
2018-02-20 07:10:10 +08:00
this - > has_next = false ;
2018-02-24 06:30:15 +08:00
tok_t result ;
result . type = TOK_ERROR ;
result . error = error_type ;
result . offset = token_start - this - > start ;
result . length = this - > buff - token_start ;
result . error_offset = error_loc - token_start ;
return result ;
2005-09-20 21:26:39 +08:00
}
2018-02-20 07:10:10 +08:00
/// Construct a tokenizer reading from \p start, which must be non-null. The pointer is stored as
/// both the beginning of the input and the current read position. \p flags is a bit set; the
/// TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS and TOK_SHOW_BLANK_LINES bits are unpacked into the
/// corresponding boolean members.
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
    assert(start != nullptr && "Invalid start");

    accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
    show_comments = static_cast<bool>(flags & TOK_SHOW_COMMENTS);
    show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
}
2016-05-04 05:35:12 +08:00
bool tokenizer_t : : next ( struct tok_t * result ) {
2015-07-26 14:05:47 +08:00
assert ( result ! = NULL ) ;
2018-02-24 06:30:15 +08:00
maybe_t < tok_t > tok = this - > tok_next ( ) ;
if ( ! tok ) {
2015-07-26 14:05:47 +08:00
return false ;
}
2018-02-24 06:30:15 +08:00
* result = std : : move ( * tok ) ;
2015-07-26 14:05:47 +08:00
return true ;
}
2016-05-04 05:35:12 +08:00
/// Decide whether \p c may appear inside a string token.
///
/// NUL, space, tab, newline, carriage return, `|`, `;`, `<`, `>` and `&` always end a string. The
/// caret `^` ends a string only when it is the first character of the token (\p is_first), where it
/// acts as a redirection. Everything else, including `#`, is a string character here; comment
/// detection for a leading `#` is handled by the caller. See issue #953.
static bool tok_is_string_character(wchar_t c, bool is_first) {
    // Conditional separator: only a leading caret separates.
    if (c == L'^') {
        return !is_first;
    }

    // Unconditional separators.
    const bool is_separator = c == L'\0' || c == L' ' || c == L'\n' || c == L'|' || c == L'\t' ||
                              c == L';' || c == L'\r' || c == L'<' || c == L'>' || c == L'&';
    return !is_separator;
}
2005-09-20 21:26:39 +08:00
2016-05-04 05:35:12 +08:00
/// Cheap check for the unaccented ASCII letters a-z and A-Z. read_string uses it as a fast path for
/// the most common characters that carry no special meaning. It is deliberately not a replacement
/// for iswalpha.
/// \return 1 if \p c is an ASCII letter, 0 otherwise.
static inline int myal(wchar_t c) {
    const int is_lower = (L'a' <= c && c <= L'z');
    const int is_upper = (L'A' <= c && c <= L'Z');
    return is_lower || is_upper;
}
// Bit flags tracking which nested constructs read_string is currently inside. Several may be set
// at once. ENUM_FLAGS is defined elsewhere; read_string combines these values with |=, &= and ~,
// so presumably the macro supplies those operators for the enum.
ENUM_FLAGS(tok_mode) {
    regular_text = 0,         // no flag set: plain text outside any construct
    subshell = 1 << 0,        // inside of subshell parentheses
    array_brackets = 1 << 1,  // inside of array (slice) brackets
    curly_braces = 1 << 2,    // inside of curly braces
    char_escape = 1 << 3,     // previous character was a backslash; the next one is taken as-is
};
2005-09-20 21:26:39 +08:00
2016-05-04 05:35:12 +08:00
/// Read the next token as a string.
2018-02-24 06:30:15 +08:00
tok_t tokenizer_t : : read_string ( ) {
2018-03-12 01:13:55 +08:00
tok_mode mode { tok_mode : : regular_text } ;
2018-03-11 08:42:56 +08:00
std : : vector < int > paran_offsets ;
2018-03-12 09:06:45 +08:00
std : : vector < int > brace_offsets ;
2018-03-12 08:36:10 +08:00
std : : vector < char > expecting ;
2018-03-11 08:42:56 +08:00
int slice_offset = 0 ;
2018-02-20 07:10:10 +08:00
const wchar_t * const buff_start = this - > buff ;
2012-07-11 11:30:54 +08:00
bool is_first = true ;
2012-11-18 18:23:22 +08:00
2018-03-11 08:42:56 +08:00
while ( true ) {
wchar_t c = * this - > buff ;
# if false
wcstring msg = L " Handling 0x%x (%lc) " ;
tok_mode mode_begin = mode ;
# endif
2018-03-12 06:10:16 +08:00
if ( c = = L ' \0 ' ) {
break ;
}
2018-03-11 08:42:56 +08:00
// Make sure this character isn't being escaped before anything else
if ( ( mode & tok_mode : : char_escape ) = = tok_mode : : char_escape ) {
mode & = ~ ( tok_mode : : char_escape ) ;
// and do nothing more
}
2018-03-12 01:13:55 +08:00
else if ( myal ( c ) ) {
// Early exit optimization in case the character is just a letter,
// which has no special meaning to the tokenizer, i.e. the same mode continues.
}
// Now proceed with the evaluation of the token, first checking to see if the token
// has been explicitly ignored (escaped).
else if ( c = = L ' \\ ' ) {
2018-03-12 06:10:16 +08:00
mode | = tok_mode : : char_escape ;
}
else if ( c = = L ' ( ' ) {
paran_offsets . push_back ( this - > buff - this - > start ) ;
2018-03-12 08:36:10 +08:00
expecting . push_back ( L ' ) ' ) ;
2018-03-12 06:10:16 +08:00
mode | = tok_mode : : subshell ;
}
2018-03-12 08:36:10 +08:00
else if ( c = = L ' { ' ) {
2018-03-12 09:06:45 +08:00
brace_offsets . push_back ( this - > buff - this - > start ) ;
2018-03-12 08:36:10 +08:00
expecting . push_back ( L ' } ' ) ;
mode | = tok_mode : : curly_braces ;
}
2018-03-12 06:10:16 +08:00
else if ( c = = L ' ) ' ) {
2018-03-12 08:36:10 +08:00
if ( expecting . size ( ) > 0 & & expecting . back ( ) = = L ' } ' ) {
return this - > call_error ( TOK_EXPECTED_BCLOSE_FOUND_PCLOSE , this - > start , this - > buff ) ;
}
2018-03-12 06:10:16 +08:00
switch ( paran_offsets . size ( ) ) {
case 0 :
2018-03-12 06:16:53 +08:00
return this - > call_error ( TOK_CLOSING_UNOPENED_SUBSHELL , this - > start , this - > buff ) ;
2018-03-12 06:10:16 +08:00
case 1 :
mode & = ~ ( tok_mode : : subshell ) ;
default :
paran_offsets . pop_back ( ) ;
2012-11-19 08:30:30 +08:00
}
2018-03-12 09:06:45 +08:00
expecting . pop_back ( ) ;
2018-03-12 06:10:16 +08:00
}
2018-03-12 08:36:10 +08:00
else if ( c = = L ' } ' ) {
if ( expecting . size ( ) > 0 & & expecting . back ( ) = = L ' ) ' ) {
return this - > call_error ( TOK_EXPECTED_PCLOSE_FOUND_BCLOSE , this - > start , this - > buff ) ;
}
2018-03-12 09:06:45 +08:00
switch ( brace_offsets . size ( ) ) {
2018-03-12 08:36:10 +08:00
case 0 :
return this - > call_error ( TOK_CLOSING_UNOPENED_BRACE , this - > start , this - > buff ) ;
case 1 :
mode & = ~ ( tok_mode : : curly_braces ) ;
default :
2018-03-12 09:06:45 +08:00
brace_offsets . pop_back ( ) ;
2018-03-12 08:36:10 +08:00
}
2018-03-12 09:06:45 +08:00
expecting . pop_back ( ) ;
2018-03-12 08:36:10 +08:00
}
2018-03-12 06:10:16 +08:00
else if ( c = = L ' [ ' ) {
if ( this - > buff ! = buff_start ) {
if ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) {
// Nested brackets should not overwrite the existing slice_offset
//mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
//prints an error message with the caret pointing at token_start,
//not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
// return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff);
2018-03-12 06:16:53 +08:00
return this - > call_error ( TOK_UNTERMINATED_SLICE , this - > start , this - > buff ) ;
2015-08-11 09:30:44 +08:00
}
2018-03-12 06:10:16 +08:00
slice_offset = this - > buff - this - > start ;
mode | = tok_mode : : array_brackets ;
2018-03-11 08:42:56 +08:00
}
2018-03-12 06:10:16 +08:00
else {
// This is actually allowed so the test operator `[` can be used as the head of a command
2018-03-11 08:42:56 +08:00
}
2018-03-12 06:10:16 +08:00
}
// Only exit bracket mode if we are in bracket mode.
// Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
// e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
else if ( c = = L ' ] ' & & ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) ) {
mode & = ~ ( tok_mode : : array_brackets ) ;
}
else if ( c = = L ' \' ' | | c = = L ' " ' ) {
const wchar_t * end = quote_end ( this - > buff ) ;
if ( end ) {
this - > buff = end ;
} else {
const wchar_t * error_loc = this - > buff ;
this - > buff + = wcslen ( this - > buff ) ;
if ( ( ! this - > accept_unfinished ) ) {
return this - > call_error ( TOK_UNTERMINATED_QUOTE , buff_start , error_loc ) ;
2015-08-11 09:30:44 +08:00
}
2018-03-11 08:42:56 +08:00
break ;
}
2018-03-12 06:10:16 +08:00
}
else if ( mode = = tok_mode : : regular_text & & ! tok_is_string_character ( c , is_first ) ) {
break ;
}
2012-11-18 18:23:22 +08:00
2018-03-11 08:42:56 +08:00
# if false
if ( mode ! = mode_begin ) {
msg . append ( L " : mode 0x%x -> 0x%x \n " ) ;
} else {
msg . push_back ( L ' \n ' ) ;
}
debug ( 0 , msg . c_str ( ) , c , c , int ( mode_begin ) , int ( mode ) ) ;
# endif
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2012-07-11 11:30:54 +08:00
is_first = false ;
2012-11-19 08:30:30 +08:00
}
2005-09-20 21:26:39 +08:00
2018-03-11 08:42:56 +08:00
if ( ( ! this - > accept_unfinished ) & & ( mode ! = tok_mode : : regular_text ) ) {
2018-02-24 06:30:15 +08:00
tok_t error ;
2018-03-11 08:42:56 +08:00
if ( ( mode & tok_mode : : char_escape ) = = tok_mode : : char_escape ) {
error = this - > call_error ( TOK_UNTERMINATED_ESCAPE , buff_start ,
2018-03-12 06:10:16 +08:00
this - > buff - 1 ) ;
2018-03-11 08:42:56 +08:00
}
else if ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) {
error = this - > call_error ( TOK_UNTERMINATED_SLICE , buff_start ,
this - > start + slice_offset ) ;
}
else if ( ( mode & tok_mode : : subshell ) = = tok_mode : : subshell ) {
assert ( paran_offsets . size ( ) > 0 ) ;
size_t offset_of_open_paran = paran_offsets . back ( ) ;
2016-05-04 05:35:12 +08:00
2018-03-11 08:42:56 +08:00
error = this - > call_error ( TOK_UNTERMINATED_SUBSHELL , buff_start ,
this - > start + offset_of_open_paran ) ;
2013-09-12 05:22:16 +08:00
}
2018-03-12 08:36:10 +08:00
else if ( ( mode & tok_mode : : curly_braces ) = = tok_mode : : curly_braces ) {
2018-03-12 09:06:45 +08:00
assert ( brace_offsets . size ( ) > 0 ) ;
size_t offset_of_open_brace = brace_offsets . back ( ) ;
2018-03-12 08:36:10 +08:00
error = this - > call_error ( TOK_UNTERMINATED_BRACE , buff_start ,
this - > start + offset_of_open_brace ) ;
}
2018-02-24 06:30:15 +08:00
return error ;
2012-11-19 08:30:30 +08:00
}
2005-09-20 21:26:39 +08:00
2018-02-24 06:30:15 +08:00
tok_t result ;
result . type = TOK_STRING ;
result . offset = buff_start - this - > start ;
result . length = this - > buff - buff_start ;
return result ;
2005-09-20 21:26:39 +08:00
}
2018-02-24 07:19:58 +08:00
// Outcome of reading a redirection or an "fd pipe" (like 2>|) from a string with
// read_redirection_or_fd_pipe(), which yields none() instead of this struct on error.
struct parsed_redir_or_pipe_t {
    // How many characters of the input the redirection or pipe occupies.
    size_t consumed = 0;
    // Kind of token parsed: always either TOK_PIPE or TOK_REDIRECT.
    token_type type = TOK_REDIRECT;
    // How the target is opened; only meaningful when type is TOK_REDIRECT.
    redirection_type_t redirection_mode = redirection_type_t::overwrite;
    // The fd being redirected, or -1 if the written number overflowed an int.
    int fd = 0;
};
2012-11-18 18:23:22 +08:00
2018-02-24 07:19:58 +08:00
static maybe_t < parsed_redir_or_pipe_t > read_redirection_or_fd_pipe ( const wchar_t * buff ) {
bool errored = false ;
parsed_redir_or_pipe_t result ;
2013-10-14 04:26:52 +08:00
size_t idx = 0 ;
2014-01-15 17:40:40 +08:00
2016-05-04 05:35:12 +08:00
// Determine the fd. This may be specified as a prefix like '2>...' or it may be implicit like
// '>' or '^'. Try parsing out a number; if we did not get any digits then infer it from the
// first character. Watch out for overflow.
2013-10-14 07:58:40 +08:00
long long big_fd = 0 ;
2016-05-04 05:35:12 +08:00
for ( ; iswdigit ( buff [ idx ] ) ; idx + + ) {
// Note that it's important we consume all the digits here, even if it overflows.
if ( big_fd < = INT_MAX ) big_fd = big_fd * 10 + ( buff [ idx ] - L ' 0 ' ) ;
2013-10-14 04:26:52 +08:00
}
2014-01-15 17:40:40 +08:00
2018-02-24 07:19:58 +08:00
result . fd = ( big_fd > INT_MAX ? - 1 : static_cast < int > ( big_fd ) ) ;
2014-01-15 17:40:40 +08:00
2016-05-04 05:35:12 +08:00
if ( idx = = 0 ) {
// We did not find a leading digit, so there's no explicit fd. Infer it from the type.
switch ( buff [ idx ] ) {
case L ' > ' : {
2018-02-24 07:19:58 +08:00
result . fd = STDOUT_FILENO ;
2014-01-15 17:40:40 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
case L ' < ' : {
2018-02-24 07:19:58 +08:00
result . fd = STDIN_FILENO ;
2014-01-15 17:40:40 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
case L ' ^ ' : {
2018-02-24 07:19:58 +08:00
result . fd = STDERR_FILENO ;
2014-01-15 17:40:40 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
default : {
2014-01-15 17:40:40 +08:00
errored = true ;
break ;
2016-05-04 05:35:12 +08:00
}
2012-11-19 08:30:30 +08:00
}
2013-10-14 04:26:52 +08:00
}
2016-06-13 02:34:35 +08:00
2016-05-04 05:35:12 +08:00
// Either way we should have ended on the redirection character itself like '>'.
2016-06-12 17:16:46 +08:00
// Don't allow an fd with a caret redirection - see #1873
2016-05-04 05:35:12 +08:00
wchar_t redirect_char = buff [ idx + + ] ; // note increment of idx
2016-06-12 17:16:46 +08:00
if ( redirect_char = = L ' > ' | | ( redirect_char = = L ' ^ ' & & idx = = 1 ) ) {
2018-02-24 07:19:58 +08:00
result . redirection_mode = redirection_type_t : : overwrite ;
2016-05-04 05:35:12 +08:00
if ( buff [ idx ] = = redirect_char ) {
// Doubled up like ^^ or >>. That means append.
2018-02-24 07:19:58 +08:00
result . redirection_mode = redirection_type_t : : append ;
2013-10-14 04:26:52 +08:00
idx + + ;
2012-11-19 08:30:30 +08:00
}
2016-05-04 05:35:12 +08:00
} else if ( redirect_char = = L ' < ' ) {
2018-02-24 07:19:58 +08:00
result . redirection_mode = redirection_type_t : : input ;
2016-05-04 05:35:12 +08:00
} else {
// Something else.
2013-10-14 04:26:52 +08:00
errored = true ;
2012-11-18 18:23:22 +08:00
}
2014-01-15 17:40:40 +08:00
2018-02-24 07:19:58 +08:00
// Bail on error.
2016-05-04 05:35:12 +08:00
if ( errored ) {
2018-02-24 07:19:58 +08:00
return none ( ) ;
2015-10-08 02:38:13 +08:00
}
2014-01-15 17:40:40 +08:00
2018-02-24 07:19:58 +08:00
// Optional characters like & or ?, or the pipe char |.
wchar_t opt_char = buff [ idx ] ;
if ( opt_char = = L ' & ' ) {
result . redirection_mode = redirection_type_t : : fd ;
idx + + ;
} else if ( opt_char = = L ' ? ' ) {
result . redirection_mode = redirection_type_t : : noclob ;
idx + + ;
} else if ( opt_char = = L ' | ' ) {
// So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets
// handled elsewhere.
result . type = TOK_PIPE ;
idx + + ;
}
2014-01-15 17:40:40 +08:00
2018-02-24 07:19:58 +08:00
result . consumed = idx ;
return result ;
2005-09-20 21:26:39 +08:00
}
2018-02-24 07:19:58 +08:00
/// Interpret \p str as a redirection and report its mode.
///
/// \param str    text such as `2>>`.
/// \param out_fd if non-null, receives the redirected fd on success.
/// \return the redirection mode, or none() if \p str is not a redirection, is an fd pipe, or names
///         an fd that overflowed.
maybe_t<redirection_type_t> redirection_type_for_string(const wcstring &str, int *out_fd) {
    auto parsed = read_redirection_or_fd_pipe(str.c_str());

    // Redirections only, no pipes, and the fd must be representable.
    const bool is_valid_redirection = parsed && parsed->type == TOK_REDIRECT && parsed->fd >= 0;
    if (!is_valid_redirection) {
        return none();
    }

    if (out_fd != nullptr) {
        *out_fd = parsed->fd;
    }
    return parsed->redirection_mode;
}
2013-12-29 08:18:38 +08:00
2016-05-04 05:35:12 +08:00
int fd_redirected_by_pipe ( const wcstring & str ) {
// Hack for the common case.
if ( str = = L " | " ) {
2013-12-29 08:18:38 +08:00
return STDOUT_FILENO ;
}
2018-02-24 07:19:58 +08:00
auto v = read_redirection_or_fd_pipe ( str . c_str ( ) ) ;
return ( v & & v - > type = = TOK_PIPE ) ? v - > fd : - 1 ;
2013-12-29 08:18:38 +08:00
}
2013-10-14 07:58:40 +08:00
2018-02-24 07:19:58 +08:00
/// Map a redirection mode to the open(2) flags used for its target file.
/// \return the flag set for append, overwrite, noclob and input redirections; -1 for any other
///         mode.
int oflags_for_redirection_type(redirection_type_t type) {
    int oflags = -1;
    switch (type) {
        case redirection_type_t::append:
            oflags = O_CREAT | O_APPEND | O_WRONLY;
            break;
        case redirection_type_t::overwrite:
            oflags = O_CREAT | O_WRONLY | O_TRUNC;
            break;
        case redirection_type_t::noclob:
            oflags = O_CREAT | O_EXCL | O_WRONLY;
            break;
        case redirection_type_t::input:
            oflags = O_RDONLY;
            break;
        default:
            break;
    }
    return oflags;
}
2016-05-04 05:35:12 +08:00
/// Like iswspace, except that a newline is never considered whitespace. Space, tab and carriage
/// return are answered directly; every other character is delegated to iswspace.
static bool iswspace_not_nl(wchar_t c) {
    if (c == L'\n') {
        return false;
    }
    if (c == L' ' || c == L'\t' || c == L'\r') {
        return true;
    }
    return iswspace(c) != 0;
}
2005-09-20 21:26:39 +08:00
2018-02-24 06:30:15 +08:00
maybe_t < tok_t > tokenizer_t : : tok_next ( ) {
2016-05-04 05:35:12 +08:00
if ( ! this - > has_next ) {
2018-02-24 06:30:15 +08:00
return none ( ) ;
2012-11-18 18:23:22 +08:00
}
2018-02-20 07:47:02 +08:00
// Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
for ( ; ; ) {
2016-05-04 05:35:12 +08:00
if ( this - > buff [ 0 ] = = L ' \\ ' & & this - > buff [ 1 ] = = L ' \n ' ) {
2015-07-26 15:58:32 +08:00
this - > buff + = 2 ;
this - > continue_line_after_comment = true ;
2018-02-20 07:47:02 +08:00
} else if ( iswspace_not_nl ( this - > buff [ 0 ] ) ) {
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2016-05-04 05:35:12 +08:00
} else {
2012-11-19 08:30:30 +08:00
break ;
}
2012-11-18 18:23:22 +08:00
}
2012-11-19 08:30:30 +08:00
2016-05-04 05:35:12 +08:00
while ( * this - > buff = = L ' # ' ) {
2018-02-20 07:47:02 +08:00
// We have a comment, walk over the comment.
const wchar_t * comment_start = this - > buff ;
while ( this - > buff [ 0 ] ! = L ' \n ' & & this - > buff [ 0 ] ! = L ' \0 ' ) this - > buff + + ;
size_t comment_len = this - > buff - comment_start ;
// If we are going to continue after the comment, skip any trailing newline.
if ( this - > buff [ 0 ] = = L ' \n ' & & this - > continue_line_after_comment ) this - > buff + + ;
2015-03-13 20:05:22 +08:00
2018-02-20 07:47:02 +08:00
// Maybe return the comment.
if ( this - > show_comments ) {
2018-02-24 06:30:15 +08:00
tok_t result ;
result . type = TOK_COMMENT ;
result . offset = comment_start - this - > start ;
result . length = comment_len ;
return result ;
2012-11-19 08:30:30 +08:00
}
2018-02-20 07:47:02 +08:00
while ( iswspace_not_nl ( this - > buff [ 0 ] ) ) this - > buff + + ;
2012-11-19 08:30:30 +08:00
}
2012-11-18 18:23:22 +08:00
2018-02-20 07:47:02 +08:00
// We made it past the comments and ate any trailing newlines we wanted to ignore.
2015-07-26 15:58:32 +08:00
this - > continue_line_after_comment = false ;
2018-02-24 06:30:15 +08:00
size_t start_pos = this - > buff - this - > start ;
2012-11-18 18:23:22 +08:00
2018-02-24 06:30:15 +08:00
tok_t result ;
result . offset = start_pos ;
2016-05-04 05:35:12 +08:00
switch ( * this - > buff ) {
case L ' \0 ' : {
2015-07-26 15:58:32 +08:00
this - > has_next = false ;
2018-02-24 06:30:15 +08:00
return none ( ) ;
2016-05-04 05:35:12 +08:00
}
case L ' \r ' : // carriage-return
case L ' \n ' : // newline
case L ' ; ' : {
2018-02-24 06:30:15 +08:00
result . type = TOK_END ;
result . length = 1 ;
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2016-05-04 05:35:12 +08:00
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
// subsequent newlines into a single one.
if ( ! this - > show_blank_lines ) {
while ( * this - > buff = = L ' \n ' | | * this - > buff = = 13 /* CR */ | | * this - > buff = = ' ' | |
* this - > buff = = ' \t ' ) {
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2014-11-26 02:43:03 +08:00
}
2014-11-24 17:20:57 +08:00
}
2012-11-19 16:31:03 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
case L ' & ' : {
2018-03-02 04:56:15 +08:00
if ( this - > buff [ 1 ] = = L ' & ' ) {
result . type = TOK_ANDAND ;
result . length = 2 ;
this - > buff + = 2 ;
} else {
result . type = TOK_BACKGROUND ;
result . length = 1 ;
this - > buff + + ;
}
2012-11-19 16:31:03 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
case L ' | ' : {
2018-03-02 04:56:15 +08:00
if ( this - > buff [ 1 ] = = L ' | ' ) {
result . type = TOK_OROR ;
result . length = 2 ;
this - > buff + = 2 ;
} else {
result . type = TOK_PIPE ;
result . redirected_fd = 1 ;
result . length = 1 ;
this - > buff + + ;
}
2012-11-19 16:31:03 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
2012-11-19 16:31:03 +08:00
case L ' > ' :
case L ' < ' :
2016-05-04 05:35:12 +08:00
case L ' ^ ' : {
// There's some duplication with the code in the default case below. The key difference
// here is that we must never parse these as a string; a failed redirection is an error!
2018-02-24 07:19:58 +08:00
auto redir_or_pipe = read_redirection_or_fd_pipe ( this - > buff ) ;
if ( ! redir_or_pipe | | redir_or_pipe - > fd < 0 ) {
2018-02-24 06:30:15 +08:00
return this - > call_error ( TOK_INVALID_REDIRECT , this - > buff , this - > buff ) ;
2013-10-14 04:26:52 +08:00
}
2018-02-24 07:19:58 +08:00
result . type = redir_or_pipe - > type ;
result . redirected_fd = redir_or_pipe - > fd ;
result . length = redir_or_pipe - > consumed ;
this - > buff + = redir_or_pipe - > consumed ;
2016-05-04 05:35:12 +08:00
break ;
2013-10-14 04:26:52 +08:00
}
2016-05-04 05:35:12 +08:00
default : {
// Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
2015-08-11 09:30:44 +08:00
const wchar_t * error_location = this - > buff ;
2018-02-24 07:19:58 +08:00
maybe_t < parsed_redir_or_pipe_t > redir_or_pipe ;
2016-05-04 05:35:12 +08:00
if ( iswdigit ( * this - > buff ) ) {
2018-02-24 07:19:58 +08:00
redir_or_pipe = read_redirection_or_fd_pipe ( this - > buff ) ;
2015-10-08 02:38:13 +08:00
}
2014-01-15 17:40:40 +08:00
2018-02-24 07:19:58 +08:00
if ( redir_or_pipe & & redir_or_pipe - > consumed > 0 ) {
2016-05-04 05:35:12 +08:00
// It looks like a redirection or a pipe. But we don't support piping fd 0. Note
// that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
// error.
2018-02-24 07:19:58 +08:00
if ( redir_or_pipe - > type = = TOK_PIPE & & redir_or_pipe - > fd = = 0 ) {
2018-02-24 06:30:15 +08:00
return this - > call_error ( TOK_INVALID_PIPE , error_location , error_location ) ;
2012-11-19 16:31:03 +08:00
}
2018-02-24 07:19:58 +08:00
result . type = redir_or_pipe - > type ;
result . redirected_fd = redir_or_pipe - > fd ;
result . length = redir_or_pipe - > consumed ;
this - > buff + = redir_or_pipe - > consumed ;
2016-05-04 05:35:12 +08:00
} else {
// Not a redirection or pipe, so just a string.
2018-02-24 06:30:15 +08:00
result = this - > read_string ( ) ;
2013-10-14 04:26:52 +08:00
}
2016-05-04 05:35:12 +08:00
break ;
2012-11-18 18:23:22 +08:00
}
2012-11-19 08:30:30 +08:00
}
2018-02-24 06:30:15 +08:00
return result ;
2005-09-20 21:26:39 +08:00
}
2016-05-04 05:35:12 +08:00
/// Return the text of the first token of \p str if that token is a string; otherwise (no tokens,
/// or a first token of any other type) return an empty string.
wcstring tok_first(const wcstring &str) {
    tokenizer_t tokenizer(str.c_str(), 0);
    tok_t first;

    const bool have_string = tokenizer.next(&first) && first.type == TOK_STRING;
    if (!have_string) {
        return wcstring();
    }
    return tokenizer.text_of(first);
}
2016-05-04 05:35:12 +08:00
bool move_word_state_machine_t : : consume_char_punctuation ( wchar_t c ) {
2018-02-25 23:30:15 +08:00
enum { s_always_one = 0 , s_rest , s_whitespace_rest , s_whitespace , s_alphanumeric , s_end } ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
bool consumed = false ;
2016-05-04 05:35:12 +08:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_always_one : {
// Always consume the first character.
2012-12-21 09:37:09 +08:00
consumed = true ;
2018-02-25 23:30:15 +08:00
if ( iswspace ( c ) ) {
state = s_whitespace ;
} else {
// Don't allow switching type (ws->nonws) after non-whitespace.
state = s_rest ;
}
2012-12-21 09:37:09 +08:00
break ;
2016-05-04 05:35:12 +08:00
}
2018-02-25 23:30:15 +08:00
case s_rest : {
if ( iswspace ( c ) ) {
// Consume only trailing whitespace.
state = s_whitespace_rest ;
} else if ( iswalnum ( c ) ) {
// Consume only alnums.
state = s_alphanumeric ;
} else {
consumed = false ;
state = s_end ;
}
break ;
}
case s_whitespace_rest :
2016-05-04 05:35:12 +08:00
case s_whitespace : {
2018-02-25 23:30:15 +08:00
// "whitespace" consumes whitespace and switches to alnums,
// "whitespace_rest" only consumes whitespace.
2016-05-04 05:35:12 +08:00
if ( iswspace ( c ) ) {
// Consumed whitespace.
2012-12-21 09:37:09 +08:00
consumed = true ;
2016-05-04 05:35:12 +08:00
} else {
2018-02-25 23:30:15 +08:00
state = state = = s_whitespace ? s_alphanumeric : s_end ;
2012-12-21 09:37:09 +08:00
}
break ;
2016-05-04 05:35:12 +08:00
}
case s_alphanumeric : {
if ( iswalnum ( c ) ) {
consumed = true ; // consumed alphanumeric
} else {
2012-12-21 09:37:09 +08:00
state = s_end ;
}
break ;
2016-05-04 05:35:12 +08:00
}
2012-12-21 09:37:09 +08:00
case s_end :
2016-05-04 05:35:12 +08:00
default : { break ; }
2012-12-21 09:37:09 +08:00
}
}
return consumed ;
}
2005-09-20 21:26:39 +08:00
2016-05-04 05:35:12 +08:00
bool move_word_state_machine_t : : is_path_component_character ( wchar_t c ) {
// Always treat separators as first. All this does is ensure that we treat ^ as a string
// character instead of as stderr redirection, which I hypothesize is usually what is desired.
return tok_is_string_character ( c , true ) & & ! wcschr ( L " /= { , } ' \ " " , c ) ;
2012-12-11 08:23:08 +08:00
}
2016-05-04 05:35:12 +08:00
bool move_word_state_machine_t : : consume_char_path_components ( wchar_t c ) {
enum {
2012-12-21 09:37:09 +08:00
s_initial_punctuation ,
s_whitespace ,
s_separator ,
s_slash ,
s_path_component_characters ,
s_end
} ;
2012-12-23 04:21:31 +08:00
2017-01-14 12:34:15 +08:00
// fwprintf(stdout, L"state %d, consume '%lc'\n", state, c);
2012-12-11 08:23:08 +08:00
bool consumed = false ;
2016-05-04 05:35:12 +08:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_initial_punctuation : {
if ( ! is_path_component_character ( c ) ) {
2012-12-21 09:37:09 +08:00
consumed = true ;
}
state = s_whitespace ;
break ;
2016-05-04 05:35:12 +08:00
}
case s_whitespace : {
if ( iswspace ( c ) ) {
consumed = true ; // consumed whitespace
} else if ( c = = L ' / ' | | is_path_component_character ( c ) ) {
state = s_slash ; // path component
} else {
state = s_separator ; // path separator
2012-12-11 08:23:08 +08:00
}
break ;
2016-05-04 05:35:12 +08:00
}
case s_separator : {
if ( ! iswspace ( c ) & & ! is_path_component_character ( c ) ) {
consumed = true ; // consumed separator
2016-05-04 07:23:30 +08:00
} else {
2012-12-11 08:23:08 +08:00
state = s_end ;
}
break ;
2016-05-04 05:35:12 +08:00
}
case s_slash : {
if ( c = = L ' / ' ) {
consumed = true ; // consumed slash
} else {
2012-12-21 09:37:09 +08:00
state = s_path_component_characters ;
2012-12-11 08:23:08 +08:00
}
break ;
2016-05-04 05:35:12 +08:00
}
case s_path_component_characters : {
if ( is_path_component_character ( c ) ) {
consumed = true ; // consumed string character except slash
} else {
2012-12-11 08:23:08 +08:00
state = s_end ;
}
break ;
2016-05-04 05:35:12 +08:00
}
2012-12-11 08:23:08 +08:00
case s_end :
2016-11-03 09:29:14 +08:00
default : { break ; }
2012-12-11 08:23:08 +08:00
}
}
return consumed ;
}
2016-05-04 05:35:12 +08:00
bool move_word_state_machine_t : : consume_char_whitespace ( wchar_t c ) {
enum { s_always_one = 0 , s_blank , s_graph , s_end } ;
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
bool consumed = false ;
2016-05-04 05:35:12 +08:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_always_one : {
consumed = true ; // always consume the first character
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
state = s_blank ;
break ;
2016-05-04 05:35:12 +08:00
}
case s_blank : {
if ( iswblank ( c ) ) {
consumed = true ; // consumed whitespace
} else {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
state = s_graph ;
}
break ;
2016-05-04 05:35:12 +08:00
}
case s_graph : {
if ( iswgraph ( c ) ) {
consumed = true ; // consumed printable non-space
} else {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
state = s_end ;
}
break ;
2016-05-04 05:35:12 +08:00
}
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
case s_end :
2016-05-04 05:35:12 +08:00
default : { break ; }
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
}
}
return consumed ;
}
2016-05-04 05:35:12 +08:00
bool move_word_state_machine_t : : consume_char ( wchar_t c ) {
switch ( style ) {
case move_word_style_punctuation : {
2012-12-23 04:21:31 +08:00
return consume_char_punctuation ( c ) ;
2016-05-04 05:35:12 +08:00
}
case move_word_style_path_components : {
2012-12-23 04:21:31 +08:00
return consume_char_path_components ( c ) ;
2016-05-04 05:35:12 +08:00
}
case move_word_style_whitespace : {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
return consume_char_whitespace ( c ) ;
2016-05-04 05:35:12 +08:00
}
2012-12-21 09:37:09 +08:00
}
2016-11-07 09:48:26 +08:00
DIE ( " should not reach this statement " ) ; // silence some compiler errors about not returning
2012-12-21 09:37:09 +08:00
}
2016-05-04 05:35:12 +08:00
/// Create a word-movement state machine for the given style. The state starts at 0, which is the
/// initial state of every style's consumer.
move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl)
    : state(0), style(syl) {
}
2012-12-21 09:37:09 +08:00
2016-05-04 05:35:12 +08:00
void move_word_state_machine_t : : reset ( ) { state = 0 ; }