2005-09-20 21:26:39 +08:00
/** \file tokenizer.c
2006-10-01 23:59:18 +08:00
A specialized tokenizer for the fish language. In the future, the
tokenizer should be extended to support marks, tokenizing multiple
strings, and disposing of unused string segments.
2005-09-20 21:26:39 +08:00
*/
# include <wchar.h>
# include <wctype.h>
# include <unistd.h>
2013-12-24 06:53:56 +08:00
# include <fcntl.h>
2015-07-25 23:14:25 +08:00
# include <assert.h>
# include <string>
2016-04-21 14:00:54 +08:00
# include <limits.h>
2006-02-28 21:17:16 +08:00
2015-07-25 23:14:25 +08:00
# include "fallback.h" // IWYU pragma: keep
2005-09-20 21:26:39 +08:00
# include "common.h"
2016-04-21 14:00:54 +08:00
# include "wutil.h" // IWYU pragma: keep
2015-07-25 23:14:25 +08:00
# include "tokenizer.h"
2006-07-20 06:55:49 +08:00
2012-02-18 07:55:54 +08:00
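/* Report error code e with message x at buffer position 'where' on tokenizer t. If the tokenizer has squash_errors set, the message is replaced by an empty string. */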
2015-08-11 09:30:44 +08:00
# define TOK_CALL_ERROR(t, e, x, where) do { (t)->call_error((e), where, (t)->squash_errors ? L"" : (x)); } while (0)
2005-09-20 21:26:39 +08:00
/**
Error string for unexpected end of string
*/
2006-11-17 22:59:05 +08:00
# define QUOTE_ERROR _( L"Unexpected end of string, quotes are not balanced" )
2006-10-07 08:56:25 +08:00
2005-09-20 21:26:39 +08:00
/**
Error string for mismatched parentheses
*/
2006-11-17 22:59:05 +08:00
# define PARAN_ERROR _( L"Unexpected end of string, parenthesis do not match" )
2006-06-21 08:48:36 +08:00
2013-09-12 05:22:16 +08:00
/**
Error string for mismatched square brackets
*/
# define SQUARE_BRACKET_ERROR _( L"Unexpected end of string, square brackets do not match" )
2015-08-11 09:30:44 +08:00
/**
Error string for an unterminated escape (backslash without continuation)
*/
# define UNTERMINATED_ESCAPE_ERROR _( L"Unexpected end of string, incomplete escape sequence" )
2013-09-12 05:22:16 +08:00
2005-09-20 21:26:39 +08:00
/**
Error string for invalid redirections
*/
2006-11-17 22:59:05 +08:00
# define REDIRECT_ERROR _( L"Invalid input / output redirection" )
2005-09-20 21:26:39 +08:00
2005-10-07 22:08:57 +08:00
/**
Error string for when trying to pipe from fd 0
*/
2013-10-14 04:26:52 +08:00
# define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" )
2005-10-07 22:08:57 +08:00
2005-09-20 21:26:39 +08:00
/**
Set the latest tokens string to be the specified error message
*/
2015-08-11 09:30:44 +08:00
void tokenizer_t : : call_error ( enum tokenizer_error error_type , const wchar_t * where , const wchar_t * error_message )
2005-09-20 21:26:39 +08:00
{
2015-07-26 15:58:32 +08:00
this - > last_type = TOK_ERROR ;
this - > error = error_type ;
2015-08-11 09:30:44 +08:00
this - > global_error_offset = where ? where - this - > orig_buff : 0 ;
2015-07-26 15:58:32 +08:00
this - > last_token = error_message ;
2005-09-20 21:26:39 +08:00
}
2015-08-11 09:30:44 +08:00
/**
   Construct a tokenizer over the string b, which must not be NULL.

   The option bits in flags (TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS,
   TOK_SQUASH_ERRORS, TOK_SHOW_BLANK_LINES) are decoded into the matching
   boolean members. The first token is read immediately by calling
   tok_next(), so it is ready for the first call to next().
*/
tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) :
    buff(b),
    orig_buff(b),
    last_type(TOK_NONE),
    last_pos(0),
    has_next(false),
    accept_unfinished((flags & TOK_ACCEPT_UNFINISHED) != 0),
    show_comments((flags & TOK_SHOW_COMMENTS) != 0),
    show_blank_lines((flags & TOK_SHOW_BLANK_LINES) != 0),
    error(TOK_ERROR_NONE),
    global_error_offset(-1),
    squash_errors((flags & TOK_SQUASH_ERRORS) != 0),
    continue_line_after_comment(false)
{
    assert(b != NULL);

    /* An empty input has no tokens at all */
    has_next = (*b != L'\0');
    tok_next();
}
2015-07-26 14:05:47 +08:00
bool tokenizer_t : : next ( struct tok_t * result )
{
assert ( result ! = NULL ) ;
if ( ! this - > has_next )
{
return false ;
}
2015-08-11 09:30:44 +08:00
const size_t current_pos = this - > buff - this - > orig_buff ;
2016-02-28 08:13:14 +08:00
/* We want to copy our last_token into result->text. If we just do this naively via =, we are liable to trigger std::string's CoW implementation: result->text's storage will be deallocated and instead will acquire a reference to last_token's storage. But last_token will be overwritten soon, which will trigger a new allocation and a copy. So our attempt to re-use result->text's storage will have failed. To ensure that doesn't happen, use assign() with wchar_t */
result - > text . assign ( this - > last_token . data ( ) , this - > last_token . size ( ) ) ;
2015-07-26 14:05:47 +08:00
result - > type = this - > last_type ;
2015-08-11 09:30:44 +08:00
result - > offset = this - > last_pos ;
2015-07-26 15:12:36 +08:00
result - > error = this - > last_type = = TOK_ERROR ? this - > error : TOK_ERROR_NONE ;
assert ( this - > buff > = this - > orig_buff ) ;
2015-08-11 09:30:44 +08:00
/* Compute error offset */
result - > error_offset = 0 ;
if ( this - > last_type = = TOK_ERROR & & this - > global_error_offset > = this - > last_pos & & this - > global_error_offset < current_pos )
{
result - > error_offset = this - > global_error_offset - this - > last_pos ;
}
2015-07-26 14:05:47 +08:00
assert ( this - > buff > = this - > orig_buff ) ;
2015-07-26 15:12:36 +08:00
result - > length = current_pos > = this - > last_pos ? current_pos - this - > last_pos : 0 ;
2015-07-26 15:58:32 +08:00
this - > tok_next ( ) ;
2015-07-26 14:05:47 +08:00
return true ;
}
2005-09-20 21:26:39 +08:00
/**
   Tests if this character can be part of a string token.

   The caret (^) is a separator only when it is the first character of a
   token; anywhere else it is an ordinary string character. Hash (#) is
   always reported as a string character here; see #953.

   \param c        the character to classify
   \param is_first whether c is the first character of the token
   \return true if c may appear in a string token, false if it separates tokens
*/
static bool tok_is_string_character(wchar_t c, bool is_first)
{
    /* Conditional separator */
    if (c == L'^')
    {
        return !is_first;
    }

    /* Unconditional separators */
    const bool is_separator =
        c == L'\0' || c == L' '  || c == L'\n' || c == L'|' || c == L'\t' ||
        c == L';'  || c == L'\r' || c == L'<'  || c == L'>' || c == L'&';

    return !is_separator;
}
2005-09-20 21:26:39 +08:00
2005-10-26 18:51:02 +08:00
/**
   Cheap test for the most common 'non-magical' characters: the ASCII
   letters a-z and A-Z. read_string uses it as a fast path. It is not a
   replacement for iswalpha.

   \return non-zero if c is an ASCII letter, zero otherwise
*/
static int myal(wchar_t c)
{
    if (L'a' <= c && c <= L'z')
    {
        return 1;
    }
    if (L'A' <= c && c <= L'Z')
    {
        return 1;
    }
    return 0;
}
/**
Read the next token as a string
*/
2015-07-26 15:58:32 +08:00
void tokenizer_t : : read_string ( )
2005-09-20 21:26:39 +08:00
{
2012-11-19 08:30:30 +08:00
long len ;
int do_loop = 1 ;
2015-08-11 09:30:44 +08:00
size_t paran_count = 0 ;
// up to 96 open parens, before we give up on good error reporting
const size_t paran_offsets_max = 96 ;
size_t paran_offsets [ paran_offsets_max ] ;
2015-08-11 10:30:21 +08:00
// where the open bracket is
size_t offset_of_bracket = 0 ;
2005-09-20 21:26:39 +08:00
2015-08-11 09:30:44 +08:00
const wchar_t * const start = this - > buff ;
2012-07-11 11:30:54 +08:00
bool is_first = true ;
2012-11-18 18:23:22 +08:00
2012-11-24 03:12:22 +08:00
enum tok_mode_t
{
2012-11-22 09:48:35 +08:00
mode_regular_text = 0 , // regular text
mode_subshell = 1 , // inside of subshell
mode_array_brackets = 2 , // inside of array brackets
mode_array_brackets_and_subshell = 3 // inside of array brackets and subshell, like in '$foo[(ech'
} mode = mode_regular_text ;
2012-11-19 08:30:30 +08:00
while ( 1 )
2012-11-18 18:23:22 +08:00
{
2015-07-26 15:58:32 +08:00
if ( ! myal ( * this - > buff ) )
2012-11-18 18:23:22 +08:00
{
2015-07-26 15:58:32 +08:00
if ( * this - > buff = = L ' \\ ' )
2012-11-19 08:30:30 +08:00
{
2015-08-11 09:30:44 +08:00
const wchar_t * error_location = this - > buff ;
2015-07-26 15:58:32 +08:00
this - > buff + + ;
if ( * this - > buff = = L ' \0 ' )
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
if ( ( ! this - > accept_unfinished ) )
2012-11-19 08:30:30 +08:00
{
2015-08-11 09:30:44 +08:00
TOK_CALL_ERROR ( this , TOK_UNTERMINATED_ESCAPE , UNTERMINATED_ESCAPE_ERROR , error_location ) ;
2012-11-19 08:30:30 +08:00
return ;
}
else
{
2012-11-21 06:51:30 +08:00
/* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */
2015-07-26 15:58:32 +08:00
this - > buff - - ;
2012-11-22 09:48:35 +08:00
do_loop = 0 ;
2012-11-19 08:30:30 +08:00
}
}
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2012-11-19 08:30:30 +08:00
continue ;
}
2012-11-24 03:12:22 +08:00
2012-11-19 08:30:30 +08:00
switch ( mode )
2012-11-18 18:23:22 +08:00
{
2012-11-22 09:48:35 +08:00
case mode_regular_text :
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
switch ( * this - > buff )
2012-11-19 16:31:03 +08:00
{
case L ' ( ' :
{
paran_count = 1 ;
2015-08-11 09:30:44 +08:00
paran_offsets [ 0 ] = this - > buff - this - > orig_buff ;
2012-11-22 09:48:35 +08:00
mode = mode_subshell ;
2012-11-19 16:31:03 +08:00
break ;
}
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
case L ' [ ' :
{
2015-07-26 15:58:32 +08:00
if ( this - > buff ! = start )
2015-08-11 10:30:21 +08:00
{
2012-11-22 09:48:35 +08:00
mode = mode_array_brackets ;
2015-08-11 10:30:21 +08:00
offset_of_bracket = this - > buff - this - > orig_buff ;
}
2012-11-19 16:31:03 +08:00
break ;
}
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
case L ' \' ' :
case L ' " ' :
{
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
const wchar_t * end = quote_end ( this - > buff ) ;
2012-11-19 16:31:03 +08:00
if ( end )
{
2015-07-26 15:58:32 +08:00
this - > buff = end ;
2012-11-19 16:31:03 +08:00
}
else
{
2015-08-11 09:30:44 +08:00
const wchar_t * error_loc = this - > buff ;
2015-07-26 15:58:32 +08:00
this - > buff + = wcslen ( this - > buff ) ;
2012-11-19 16:31:03 +08:00
2015-07-26 15:58:32 +08:00
if ( ! this - > accept_unfinished )
2012-11-19 16:31:03 +08:00
{
2015-08-11 09:30:44 +08:00
TOK_CALL_ERROR ( this , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR , error_loc ) ;
2012-11-19 16:31:03 +08:00
return ;
}
do_loop = 0 ;
}
break ;
}
2012-11-19 08:30:30 +08:00
2012-11-19 16:31:03 +08:00
default :
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
if ( ! tok_is_string_character ( * ( this - > buff ) , is_first ) )
2012-11-19 16:31:03 +08:00
{
do_loop = 0 ;
}
2012-11-19 08:30:30 +08:00
}
}
break ;
}
2012-11-22 09:48:35 +08:00
case mode_array_brackets_and_subshell :
case mode_subshell :
2015-08-11 09:30:44 +08:00
{
2015-07-26 15:58:32 +08:00
switch ( * this - > buff )
2012-11-19 08:30:30 +08:00
{
2012-11-19 16:31:03 +08:00
case L ' \' ' :
case L ' \" ' :
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
const wchar_t * end = quote_end ( this - > buff ) ;
2012-11-19 16:31:03 +08:00
if ( end )
{
2015-07-26 15:58:32 +08:00
this - > buff = end ;
2012-11-19 16:31:03 +08:00
}
else
{
2015-08-11 09:30:44 +08:00
const wchar_t * error_loc = this - > buff ;
2015-07-26 15:58:32 +08:00
this - > buff + = wcslen ( this - > buff ) ;
if ( ( ! this - > accept_unfinished ) )
2012-11-19 16:31:03 +08:00
{
2015-08-11 09:30:44 +08:00
TOK_CALL_ERROR ( this , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR , error_loc ) ;
2012-11-19 16:31:03 +08:00
return ;
}
do_loop = 0 ;
}
break ;
2012-11-19 08:30:30 +08:00
}
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
case L ' ( ' :
2015-08-11 09:30:44 +08:00
if ( paran_count < paran_offsets_max )
{
paran_offsets [ paran_count ] = this - > buff - this - > orig_buff ;
}
2012-11-19 16:31:03 +08:00
paran_count + + ;
break ;
case L ' ) ' :
2015-08-11 09:30:44 +08:00
assert ( paran_count > 0 ) ;
2012-11-19 16:31:03 +08:00
paran_count - - ;
if ( paran_count = = 0 )
{
2012-11-22 09:48:35 +08:00
mode = ( mode = = mode_array_brackets_and_subshell ? mode_array_brackets : mode_regular_text ) ;
2012-11-19 16:31:03 +08:00
}
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
}
2012-11-19 08:30:30 +08:00
break ;
2015-08-11 09:30:44 +08:00
}
2012-11-24 03:12:22 +08:00
2012-11-22 09:48:35 +08:00
case mode_array_brackets :
2015-08-11 09:30:44 +08:00
{
2015-07-26 15:58:32 +08:00
switch ( * this - > buff )
2012-11-19 08:30:30 +08:00
{
2012-11-19 16:31:03 +08:00
case L ' ( ' :
paran_count = 1 ;
2015-08-11 09:30:44 +08:00
paran_offsets [ 0 ] = this - > buff - this - > orig_buff ;
2012-11-22 09:48:35 +08:00
mode = mode_array_brackets_and_subshell ;
2012-11-19 16:31:03 +08:00
break ;
case L ' ] ' :
2012-11-22 09:48:35 +08:00
mode = mode_regular_text ;
2012-11-19 16:31:03 +08:00
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
2012-11-19 08:30:30 +08:00
}
break ;
2015-08-11 09:30:44 +08:00
}
2012-11-18 18:23:22 +08:00
}
2012-11-19 08:30:30 +08:00
}
2012-11-18 18:23:22 +08:00
2012-11-19 08:30:30 +08:00
if ( ! do_loop )
break ;
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2012-07-11 11:30:54 +08:00
is_first = false ;
2012-11-19 08:30:30 +08:00
}
2005-09-20 21:26:39 +08:00
2015-07-26 15:58:32 +08:00
if ( ( ! this - > accept_unfinished ) & & ( mode ! = mode_regular_text ) )
2012-11-19 08:30:30 +08:00
{
2013-09-12 05:22:16 +08:00
switch ( mode )
{
case mode_subshell :
2015-08-11 09:30:44 +08:00
{
// Determine the innermost opening paran offset by interrogating paran_offsets
assert ( paran_count > 0 ) ;
size_t offset_of_open_paran = 0 ;
if ( paran_count < = paran_offsets_max )
{
offset_of_open_paran = paran_offsets [ paran_count - 1 ] ;
}
TOK_CALL_ERROR ( this , TOK_UNTERMINATED_SUBSHELL , PARAN_ERROR , this - > orig_buff + offset_of_open_paran ) ;
2013-09-12 05:22:16 +08:00
break ;
2015-08-11 09:30:44 +08:00
}
2013-09-12 05:22:16 +08:00
case mode_array_brackets :
case mode_array_brackets_and_subshell :
2015-08-11 09:30:44 +08:00
{
2015-08-11 10:30:21 +08:00
TOK_CALL_ERROR ( this , TOK_UNTERMINATED_SLICE , SQUARE_BRACKET_ERROR , this - > orig_buff + offset_of_bracket ) ;
2013-09-12 05:22:16 +08:00
break ;
2015-08-11 09:30:44 +08:00
}
2013-09-12 05:22:16 +08:00
default :
assert ( 0 & & " Unexpected mode in read_string " ) ;
break ;
}
2012-11-19 08:30:30 +08:00
return ;
}
2005-09-20 21:26:39 +08:00
2015-07-26 15:58:32 +08:00
len = this - > buff - start ;
2005-09-20 21:26:39 +08:00
2015-07-26 15:58:32 +08:00
this - > last_token . assign ( start , len ) ;
this - > last_type = TOK_STRING ;
2005-09-20 21:26:39 +08:00
}
/**
Read the next token as a comment .
*/
2015-07-26 15:58:32 +08:00
void tokenizer_t : : read_comment ( )
2005-09-20 21:26:39 +08:00
{
2015-07-26 15:58:32 +08:00
const wchar_t * start = this - > buff ;
while ( * ( this - > buff ) ! = L ' \n ' & & * ( this - > buff ) ! = L ' \0 ' )
this - > buff + + ;
2012-11-24 03:12:22 +08:00
2015-07-26 15:58:32 +08:00
size_t len = this - > buff - start ;
this - > last_token . assign ( start , len ) ;
this - > last_type = TOK_COMMENT ;
2005-09-20 21:26:39 +08:00
}
2013-10-14 07:58:40 +08:00
2013-10-14 04:26:52 +08:00
/* Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were consumed. If zero, then this string was not a redirection.
2013-10-14 07:58:40 +08:00
Also returns by reference the redirection mode , and the fd to redirection . If there is overflow , * out_fd is set to - 1.
2005-09-20 21:26:39 +08:00
*/
2013-10-14 04:26:52 +08:00
static size_t read_redirection_or_fd_pipe ( const wchar_t * buff , enum token_type * out_redirection_mode , int * out_fd )
2005-09-20 21:26:39 +08:00
{
2013-10-14 04:26:52 +08:00
bool errored = false ;
int fd = 0 ;
2013-10-01 04:57:36 +08:00
enum token_type redirection_mode = TOK_NONE ;
2012-11-18 18:23:22 +08:00
2013-10-14 04:26:52 +08:00
size_t idx = 0 ;
2014-01-15 17:40:40 +08:00
2013-10-14 07:58:40 +08:00
/* Determine the fd. This may be specified as a prefix like '2>...' or it may be implicit like '>' or '^'. Try parsing out a number; if we did not get any digits then infer it from the first character. Watch out for overflow. */
long long big_fd = 0 ;
2013-10-14 04:26:52 +08:00
for ( ; iswdigit ( buff [ idx ] ) ; idx + + )
2012-11-18 18:23:22 +08:00
{
2013-10-14 07:58:40 +08:00
/* Note that it's important we consume all the digits here, even if it overflows. */
if ( big_fd < = INT_MAX )
big_fd = big_fd * 10 + ( buff [ idx ] - L ' 0 ' ) ;
2013-10-14 04:26:52 +08:00
}
2014-01-15 17:40:40 +08:00
2013-10-14 07:58:40 +08:00
fd = ( big_fd > INT_MAX ? - 1 : static_cast < int > ( big_fd ) ) ;
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
if ( idx = = 0 )
{
/* We did not find a leading digit, so there's no explicit fd. Infer it from the type */
switch ( buff [ idx ] )
2012-11-19 08:30:30 +08:00
{
2014-01-15 17:40:40 +08:00
case L ' > ' :
fd = STDOUT_FILENO ;
break ;
case L ' < ' :
fd = STDIN_FILENO ;
break ;
case L ' ^ ' :
fd = STDERR_FILENO ;
break ;
default :
errored = true ;
break ;
2012-11-19 08:30:30 +08:00
}
2013-10-14 04:26:52 +08:00
}
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
/* Either way we should have ended on the redirection character itself like '>' */
wchar_t redirect_char = buff [ idx + + ] ; //note increment of idx
if ( redirect_char = = L ' > ' | | redirect_char = = L ' ^ ' )
{
redirection_mode = TOK_REDIRECT_OUT ;
if ( buff [ idx ] = = redirect_char )
2012-11-19 08:30:30 +08:00
{
2013-10-14 04:26:52 +08:00
/* Doubled up like ^^ or >>. That means append */
redirection_mode = TOK_REDIRECT_APPEND ;
idx + + ;
2012-11-19 08:30:30 +08:00
}
}
2013-10-14 04:26:52 +08:00
else if ( redirect_char = = L ' < ' )
2012-11-19 08:30:30 +08:00
{
2013-10-01 04:57:36 +08:00
redirection_mode = TOK_REDIRECT_IN ;
2012-11-18 18:23:22 +08:00
}
else
{
2013-10-14 04:26:52 +08:00
/* Something else */
errored = true ;
2012-11-18 18:23:22 +08:00
}
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
/* Don't return valid-looking stuff on error */
if ( errored )
2012-11-19 08:30:30 +08:00
{
2013-10-14 04:26:52 +08:00
idx = 0 ;
redirection_mode = TOK_NONE ;
2012-11-19 08:30:30 +08:00
}
2015-10-08 02:38:13 +08:00
else
{
/* Optional characters like & or ?, or the pipe char | */
wchar_t opt_char = buff [ idx ] ;
if ( opt_char = = L ' & ' )
{
redirection_mode = TOK_REDIRECT_FD ;
idx + + ;
}
else if ( opt_char = = L ' ? ' )
{
redirection_mode = TOK_REDIRECT_NOCLOB ;
idx + + ;
}
else if ( opt_char = = L ' | ' )
{
/* So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets handled elsewhere. */
redirection_mode = TOK_PIPE ;
idx + + ;
}
}
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
/* Return stuff */
if ( out_redirection_mode ! = NULL )
* out_redirection_mode = redirection_mode ;
if ( out_fd ! = NULL )
* out_fd = fd ;
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
return idx ;
2005-09-20 21:26:39 +08:00
}
2013-12-24 06:53:56 +08:00
enum token_type redirection_type_for_string ( const wcstring & str , int * out_fd )
2013-10-14 07:58:40 +08:00
{
enum token_type mode = TOK_NONE ;
int fd = 0 ;
read_redirection_or_fd_pipe ( str . c_str ( ) , & mode , & fd ) ;
/* Redirections only, no pipes */
if ( mode = = TOK_PIPE | | fd < 0 )
mode = TOK_NONE ;
2013-12-24 06:53:56 +08:00
if ( out_fd ! = NULL )
* out_fd = fd ;
2013-10-14 07:58:40 +08:00
return mode ;
}
2013-12-29 08:18:38 +08:00
int fd_redirected_by_pipe ( const wcstring & str )
{
/* Hack for the common case */
if ( str = = L " | " )
{
return STDOUT_FILENO ;
}
2014-01-15 17:40:40 +08:00
2013-12-29 08:18:38 +08:00
enum token_type mode = TOK_NONE ;
int fd = 0 ;
read_redirection_or_fd_pipe ( str . c_str ( ) , & mode , & fd ) ;
/* Pipes only */
if ( mode ! = TOK_PIPE | | fd < 0 )
fd = - 1 ;
return fd ;
}
2013-10-14 07:58:40 +08:00
2013-12-24 06:53:56 +08:00
int oflags_for_redirection_type ( enum token_type type )
{
switch ( type )
{
2014-01-15 17:40:40 +08:00
case TOK_REDIRECT_APPEND :
return O_CREAT | O_APPEND | O_WRONLY ;
case TOK_REDIRECT_OUT :
return O_CREAT | O_WRONLY | O_TRUNC ;
case TOK_REDIRECT_NOCLOB :
return O_CREAT | O_EXCL | O_WRONLY ;
case TOK_REDIRECT_IN :
return O_RDONLY ;
2013-12-24 06:53:56 +08:00
default :
return - 1 ;
}
}
2005-09-20 21:26:39 +08:00
/**
   Test if a character is whitespace. Unlike iswspace, a newline is not
   treated as whitespace.
*/
static bool my_iswspace(wchar_t c)
{
    if (c == L'\n')
    {
        return false;
    }
    return iswspace(c) != 0;
}
2015-07-26 15:58:32 +08:00
void tokenizer_t : : tok_next ( )
2005-09-20 21:26:39 +08:00
{
2015-07-26 15:58:32 +08:00
if ( this - > last_type = = TOK_ERROR )
2012-11-18 18:23:22 +08:00
{
2015-07-26 15:58:32 +08:00
this - > has_next = false ;
2012-11-19 08:30:30 +08:00
return ;
2012-11-18 18:23:22 +08:00
}
2012-11-19 08:30:30 +08:00
2015-07-26 15:58:32 +08:00
if ( ! this - > has_next )
2012-11-18 18:23:22 +08:00
{
2012-11-19 08:30:30 +08:00
/* wprintf( L"EOL\n" );*/
2015-07-26 15:58:32 +08:00
this - > last_type = TOK_END ;
2012-11-18 18:23:22 +08:00
return ;
}
2012-11-19 08:30:30 +08:00
while ( 1 )
2012-11-18 18:23:22 +08:00
{
2015-07-26 15:58:32 +08:00
if ( this - > buff [ 0 ] = = L ' \\ ' & & this - > buff [ 1 ] = = L ' \n ' )
2012-11-24 04:03:36 +08:00
{
2015-07-26 15:58:32 +08:00
this - > buff + = 2 ;
this - > continue_line_after_comment = true ;
2012-11-24 04:03:36 +08:00
}
2015-07-26 15:58:32 +08:00
else if ( my_iswspace ( this - > buff [ 0 ] ) )
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2012-11-19 08:30:30 +08:00
}
else
{
break ;
}
2012-11-18 18:23:22 +08:00
}
2012-11-19 08:30:30 +08:00
2015-07-26 15:58:32 +08:00
while ( * this - > buff = = L ' # ' )
2012-11-18 18:23:22 +08:00
{
2015-07-26 15:58:32 +08:00
if ( this - > show_comments )
2012-11-19 08:30:30 +08:00
{
2015-07-26 15:58:32 +08:00
this - > last_pos = this - > buff - this - > orig_buff ;
this - > read_comment ( ) ;
2015-03-13 20:05:22 +08:00
2015-07-26 15:58:32 +08:00
if ( this - > buff [ 0 ] = = L ' \n ' & & this - > continue_line_after_comment )
this - > buff + + ;
2015-03-13 20:05:22 +08:00
2012-11-19 08:30:30 +08:00
return ;
}
else
{
2015-07-26 15:58:32 +08:00
while ( * ( this - > buff ) ! = L ' \n ' & & * ( this - > buff ) ! = L ' \0 ' )
this - > buff + + ;
2015-03-13 20:05:22 +08:00
2015-07-26 15:58:32 +08:00
if ( this - > buff [ 0 ] = = L ' \n ' & & this - > continue_line_after_comment )
this - > buff + + ;
2012-11-19 08:30:30 +08:00
}
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
while ( my_iswspace ( * ( this - > buff ) ) ) {
this - > buff + + ;
2015-03-13 20:05:22 +08:00
}
2012-11-19 08:30:30 +08:00
}
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
this - > continue_line_after_comment = false ;
2015-03-13 20:05:22 +08:00
2015-07-26 15:58:32 +08:00
this - > last_pos = this - > buff - this - > orig_buff ;
2012-11-18 18:23:22 +08:00
2015-07-26 15:58:32 +08:00
switch ( * this - > buff )
2012-11-19 08:30:30 +08:00
{
2012-11-19 16:31:03 +08:00
case L ' \0 ' :
2015-07-26 15:58:32 +08:00
this - > last_type = TOK_END ;
2012-11-19 16:31:03 +08:00
/*fwprintf( stderr, L"End of string\n" );*/
2015-07-26 15:58:32 +08:00
this - > has_next = false ;
2012-11-19 16:31:03 +08:00
break ;
2016-01-22 11:56:39 +08:00
case L ' \r ' : // carriage-return
case L ' \n ' : // newline
2014-11-26 02:43:03 +08:00
case L ' ; ' :
2015-07-26 15:58:32 +08:00
this - > last_type = TOK_END ;
this - > buff + + ;
2014-11-24 17:20:57 +08:00
// Hack: when we get a newline, swallow as many as we can
// This compresses multiple subsequent newlines into a single one
2015-07-26 15:58:32 +08:00
if ( ! this - > show_blank_lines )
2014-11-24 17:20:57 +08:00
{
2015-07-26 15:58:32 +08:00
while ( * this - > buff = = L ' \n ' | | * this - > buff = = 13 /* CR */ | | * this - > buff = = ' ' | | * this - > buff = = ' \t ' )
2014-11-26 02:43:03 +08:00
{
2015-07-26 15:58:32 +08:00
this - > buff + + ;
2014-11-26 02:43:03 +08:00
}
2014-11-24 17:20:57 +08:00
}
2015-07-26 15:58:32 +08:00
this - > last_token . clear ( ) ;
2012-11-19 16:31:03 +08:00
break ;
case L ' & ' :
2015-07-26 15:58:32 +08:00
this - > last_type = TOK_BACKGROUND ;
this - > buff + + ;
2012-11-19 16:31:03 +08:00
break ;
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
case L ' | ' :
2015-07-26 15:58:32 +08:00
this - > last_token = L " 1 " ;
this - > last_type = TOK_PIPE ;
this - > buff + + ;
2012-11-19 16:31:03 +08:00
break ;
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
case L ' > ' :
case L ' < ' :
case L ' ^ ' :
2013-10-14 04:26:52 +08:00
{
/* There's some duplication with the code in the default case below. The key difference here is that we must never parse these as a string; a failed redirection is an error! */
enum token_type mode = TOK_NONE ;
int fd = - 1 ;
2015-07-26 15:58:32 +08:00
size_t consumed = read_redirection_or_fd_pipe ( this - > buff , & mode , & fd ) ;
2013-10-14 07:58:40 +08:00
if ( consumed = = 0 | | fd < 0 )
2013-10-14 04:26:52 +08:00
{
2015-08-11 09:30:44 +08:00
TOK_CALL_ERROR ( this , TOK_OTHER , REDIRECT_ERROR , this - > buff ) ;
2013-10-14 04:26:52 +08:00
}
else
{
2015-07-26 15:58:32 +08:00
this - > buff + = consumed ;
this - > last_type = mode ;
this - > last_token = to_string ( fd ) ;
2013-10-14 04:26:52 +08:00
}
}
break ;
2012-11-18 18:23:22 +08:00
2012-11-19 16:31:03 +08:00
default :
2012-11-18 18:23:22 +08:00
{
2013-10-14 04:26:52 +08:00
/* Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string */
2015-08-11 09:30:44 +08:00
const wchar_t * error_location = this - > buff ;
2013-10-14 04:26:52 +08:00
size_t consumed = 0 ;
enum token_type mode = TOK_NONE ;
int fd = - 1 ;
2015-07-26 15:58:32 +08:00
if ( iswdigit ( * this - > buff ) )
2015-10-08 02:38:13 +08:00
{
2015-07-26 15:58:32 +08:00
consumed = read_redirection_or_fd_pipe ( this - > buff , & mode , & fd ) ;
2015-10-08 02:38:13 +08:00
}
2014-01-15 17:40:40 +08:00
2013-10-14 04:26:52 +08:00
if ( consumed > 0 )
2012-11-19 08:30:30 +08:00
{
2013-10-14 07:58:40 +08:00
/* It looks like a redirection or a pipe. But we don't support piping fd 0. Note that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer error. */
2013-10-14 04:26:52 +08:00
if ( mode = = TOK_PIPE & & fd = = 0 )
2012-11-19 16:31:03 +08:00
{
2015-08-11 09:30:44 +08:00
TOK_CALL_ERROR ( this , TOK_OTHER , PIPE_ERROR , error_location ) ;
2013-10-14 04:26:52 +08:00
}
else
{
2015-07-26 15:58:32 +08:00
this - > buff + = consumed ;
this - > last_type = mode ;
this - > last_token = to_string ( fd ) ;
2012-11-19 16:31:03 +08:00
}
2012-11-19 08:30:30 +08:00
}
2013-10-14 04:26:52 +08:00
else
{
2014-11-02 12:06:16 +08:00
/* Not a redirection or pipe, so just a string */
2015-07-26 15:58:32 +08:00
this - > read_string ( ) ;
2013-10-14 04:26:52 +08:00
}
2012-11-18 18:23:22 +08:00
}
2013-10-14 04:26:52 +08:00
break ;
2012-11-19 08:30:30 +08:00
}
2005-09-20 21:26:39 +08:00
}
2015-07-26 15:58:32 +08:00
/**
   Return the text of the first token of str if that token is a string.

   The string is tokenized with TOK_SQUASH_ERRORS.

   \return the first token's text, or an empty string if there is no token
           or the first token is not a TOK_STRING
*/
wcstring tok_first(const wcstring &str)
{
    tokenizer_t tokenizer(str.c_str(), TOK_SQUASH_ERRORS);
    tok_t first;

    const bool got_string = tokenizer.next(&first) && first.type == TOK_STRING;
    if (!got_string)
    {
        return wcstring();
    }

    /* Steal the token's storage rather than copying it */
    wcstring result;
    result.swap(first.text);
    return result;
}
2012-12-21 09:37:09 +08:00
bool move_word_state_machine_t : : consume_char_punctuation ( wchar_t c )
{
enum
{
s_always_one = 0 ,
s_whitespace ,
s_alphanumeric ,
s_end
} ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
case s_always_one :
/* Always consume the first character */
consumed = true ;
state = s_whitespace ;
break ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
else
{
state = s_alphanumeric ;
}
break ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
case s_alphanumeric :
if ( iswalnum ( c ) )
{
/* Consumed alphanumeric */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
case s_end :
default :
break ;
}
}
return consumed ;
}
2005-09-20 21:26:39 +08:00
2012-12-21 09:37:09 +08:00
bool move_word_state_machine_t : : is_path_component_character ( wchar_t c )
2012-12-11 08:23:08 +08:00
{
2012-12-21 09:37:09 +08:00
/* Always treat separators as first. All this does is ensure that we treat ^ as a string character instead of as stderr redirection, which I hypothesize is usually what is desired. */
return tok_is_string_character ( c , true ) & & ! wcschr ( L " /= { , } ' \ " " , c ) ;
2012-12-11 08:23:08 +08:00
}
2012-12-21 09:37:09 +08:00
bool move_word_state_machine_t : : consume_char_path_components ( wchar_t c )
2012-12-11 08:23:08 +08:00
{
2012-12-21 09:37:09 +08:00
enum
{
s_initial_punctuation ,
s_whitespace ,
s_separator ,
s_slash ,
s_path_component_characters ,
s_end
} ;
2012-12-23 04:21:31 +08:00
2012-12-11 08:23:08 +08:00
//printf("state %d, consume '%lc'\n", state, c);
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
2012-12-21 09:37:09 +08:00
case s_initial_punctuation :
if ( ! is_path_component_character ( c ) )
{
consumed = true ;
}
state = s_whitespace ;
break ;
2012-12-23 04:21:31 +08:00
2012-12-11 08:23:08 +08:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
2012-12-21 09:37:09 +08:00
else if ( c = = L ' / ' | | is_path_component_character ( c ) )
2012-12-11 08:23:08 +08:00
{
2012-12-21 09:37:09 +08:00
/* Path component */
2012-12-11 08:23:08 +08:00
state = s_slash ;
}
else
{
2012-12-21 09:37:09 +08:00
/* Path separator */
2012-12-11 08:23:08 +08:00
state = s_separator ;
}
break ;
case s_separator :
2012-12-21 09:37:09 +08:00
if ( ! iswspace ( c ) & & ! is_path_component_character ( c ) )
2012-12-11 08:23:08 +08:00
{
/* Consumed separator */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
case s_slash :
if ( c = = L ' / ' )
{
/* Consumed slash */
consumed = true ;
}
else
{
2012-12-21 09:37:09 +08:00
state = s_path_component_characters ;
2012-12-11 08:23:08 +08:00
}
break ;
2012-12-21 09:37:09 +08:00
case s_path_component_characters :
if ( is_path_component_character ( c ) )
2012-12-11 08:23:08 +08:00
{
/* Consumed string character except slash */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
/* We won't get here, but keep the compiler happy */
case s_end :
default :
break ;
}
}
return consumed ;
}
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
bool move_word_state_machine_t : : consume_char_whitespace ( wchar_t c )
{
enum
{
s_always_one = 0 ,
s_blank ,
s_graph ,
s_end
} ;
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
case s_always_one :
/* Always consume the first character */
consumed = true ;
state = s_blank ;
break ;
case s_blank :
if ( iswblank ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
else
{
state = s_graph ;
}
break ;
case s_graph :
if ( iswgraph ( c ) )
{
/* Consumed printable non-space */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
case s_end :
default :
break ;
}
}
return consumed ;
}
2012-12-21 09:37:09 +08:00
bool move_word_state_machine_t : : consume_char ( wchar_t c )
{
switch ( style )
{
2012-12-23 04:21:31 +08:00
case move_word_style_punctuation :
return consume_char_punctuation ( c ) ;
case move_word_style_path_components :
return consume_char_path_components ( c ) ;
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-31 06:44:25 +08:00
case move_word_style_whitespace :
return consume_char_whitespace ( c ) ;
2012-12-23 04:21:31 +08:00
default :
return false ;
2012-12-21 09:37:09 +08:00
}
}
/**
   Create a state machine for the given word style, starting in state 0
   (the initial state of every style's step function).
*/
move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl) :
    state(0),
    style(syl)
{
}
void move_word_state_machine_t : : reset ( )
{
state = 0 ;
}