/** \file tokenizer.h

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/
# ifndef FISH_TOKENIZER_H
# define FISH_TOKENIZER_H
# include <wchar.h>
2012-11-22 14:09:35 +08:00
# include "common.h"
/**
   Token types.

   The enumerators rely on implicit numbering, so their order must not
   be changed without checking every user of the numeric values.
*/
enum token_type
{
    TOK_NONE,            /**< Tokenizer not yet constructed */
    TOK_ERROR,           /**< Error reading token */
    TOK_STRING,          /**< String token */
    TOK_PIPE,            /**< Pipe token */
    TOK_END,             /**< End token (semicolon or newline, not literal end) */
    TOK_REDIRECT_OUT,    /**< redirection token */
    TOK_REDIRECT_APPEND, /**< redirection append token */
    TOK_REDIRECT_IN,     /**< input redirection token */
    TOK_REDIRECT_FD,     /**< redirection to new fd token */
    TOK_REDIRECT_NOCLOB, /**< redirection token (presumably the no-clobber variant, going by the name - TODO confirm) */
    TOK_BACKGROUND,      /**< send job to bg token */
    TOK_COMMENT          /**< comment token */
};
/**
   Tokenizer error types.

   The enumerators rely on implicit numbering, starting at zero.
*/
enum tokenizer_error
{
    TOK_UNTERMINATED_QUOTE,
    TOK_UNTERMINATED_SUBSHELL,
    TOK_UNTERMINATED_ESCAPE,
    TOK_OTHER
};
/**
   Flag telling the tokenizer to accept incomplete parameters,
   i.e. parameters with mismatching parentheses, etc. This is useful
   for tab-completion.
*/
#define TOK_ACCEPT_UNFINISHED 1

/**
   Flag telling the tokenizer not to remove comments. Useful for
   syntax highlighting.
*/
#define TOK_SHOW_COMMENTS 2

/**
   Flag telling the tokenizer to not generate error messages, which we
   need to do when tokenizing off of the main thread (since wgettext is
   not thread safe).
*/
#define TOK_SQUASH_ERRORS 4

/**
   Type used to pass tokenizer flags. The TOK_* flag values above are
   distinct bits, so they can presumably be combined with bitwise OR
   (TODO confirm against the tokenizer implementation).
*/
typedef unsigned int tok_flags_t;
2005-09-20 21:26:39 +08:00
/**
2012-11-18 18:23:22 +08:00
The tokenizer struct .
2005-09-20 21:26:39 +08:00
*/
2012-11-22 09:48:35 +08:00
struct tokenizer_t
2005-09-20 21:26:39 +08:00
{
2012-11-19 08:30:30 +08:00
/** A pointer into the original string, showing where the next token begins */
const wchar_t * buff ;
/** A copy of the original string */
const wchar_t * orig_buff ;
2012-11-22 14:09:35 +08:00
/** The last token */
wcstring last_token ;
2012-11-19 08:30:30 +08:00
/** Type of last token*/
int last_type ;
2012-11-22 14:09:35 +08:00
2012-11-19 08:30:30 +08:00
/** Offset of last token*/
size_t last_pos ;
/** Whether there are more tokens*/
bool has_next ;
/** Whether incomplete tokens are accepted*/
bool accept_unfinished ;
/** Whether commants should be returned*/
bool show_comments ;
/** Type of last quote, can be either ' or ".*/
wchar_t last_quote ;
/** Last error */
int error ;
2012-02-18 07:55:54 +08:00
/* Whether we are squashing errors */
bool squash_errors ;
2012-08-05 08:44:14 +08:00
/* Cached line number information */
size_t cached_lineno_offset ;
int cached_lineno_count ;
/** Return the line number of the character at the given offset */
int line_number_of_character_at_offset ( size_t offset ) ;
2012-11-22 09:48:35 +08:00
/**
Constructor for a tokenizer . b is the string that is to be
tokenized . It is not copied , and should not be freed by the caller
until after the tokenizer is destroyed .
2005-09-20 21:26:39 +08:00
2012-11-22 09:48:35 +08:00
\ param b The string to tokenize
\ param flags Flags to the tokenizer . Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
to accept incomplete tokens , such as a subshell without a closing
parenthesis , as a valid token . Setting TOK_SHOW_COMMENTS will return comments as tokens
2012-11-18 18:23:22 +08:00
2012-11-22 09:48:35 +08:00
*/
tokenizer_t ( const wchar_t * b , tok_flags_t flags ) ;
} ;
2005-09-20 21:26:39 +08:00
/**
Jump to the next token .
*/
2012-11-22 09:48:35 +08:00
void tok_next ( tokenizer_t * tok ) ;
2005-09-20 21:26:39 +08:00
/**
Returns the type of the last token . Must be one of the values in the token_type enum .
*/
2012-11-22 09:48:35 +08:00
int tok_last_type ( tokenizer_t * tok ) ;
2005-09-20 21:26:39 +08:00
/**
Returns the last token string . The string should not be freed by the caller .
*/
2012-11-22 14:09:35 +08:00
const wchar_t * tok_last ( tokenizer_t * tok ) ;
2005-09-20 21:26:39 +08:00
/**
Returns the type of quote from the last TOK_QSTRING
*/
2012-11-22 09:48:35 +08:00
wchar_t tok_last_quote ( tokenizer_t * tok ) ;
2005-09-20 21:26:39 +08:00
/**
Returns true as long as there are more tokens left
*/
2012-11-22 09:48:35 +08:00
int tok_has_next ( tokenizer_t * tok ) ;
/**
   Returns the position of the beginning of the current token in the original string.

   \param tok The tokenizer to query
*/
int tok_get_pos(const tokenizer_t *tok);

/**
   Returns the extent of the current token.

   \param tok The tokenizer to query
*/
size_t tok_get_extent(const tokenizer_t *tok);
2005-09-20 21:26:39 +08:00
/**
Returns the original string to tokenizer
*/
2012-11-22 09:48:35 +08:00
const wchar_t * tok_string ( tokenizer_t * tok ) ;
2005-09-20 21:26:39 +08:00
/**
Returns only the first token from the specified string . This is a
convenience function , used to retrieve the first token of a
string . This can be useful for error messages , etc .
2012-11-22 14:23:48 +08:00
On failure , returns the empty string .
2005-09-20 21:26:39 +08:00
*/
2012-11-22 14:23:48 +08:00
wcstring tok_first ( const wchar_t * str ) ;
/**
   Indicates whether a character can be part of a string, or is a string separator.
   Separators include newline, tab, |, ^, >, <, etc.

   \param c The character to classify
   \param is_first Should indicate whether this is the first character in a potential string
*/
bool tok_is_string_character(wchar_t c, bool is_first);
2005-09-20 21:26:39 +08:00
/**
Move tokenizer position
*/
2012-11-22 09:48:35 +08:00
void tok_set_pos ( tokenizer_t * tok , int pos ) ;
/**
   Returns a string description of the specified token type.

   \param type The token type to describe
*/
const wchar_t *tok_get_desc(int type);
2005-09-20 21:26:39 +08:00
2006-10-07 08:56:25 +08:00
/**
Get tokenizer error type . Should only be called if tok_last_tope returns TOK_ERROR .
*/
2012-11-22 09:48:35 +08:00
int tok_get_error ( tokenizer_t * tok ) ;
2006-10-07 08:56:25 +08:00
2012-12-21 09:37:09 +08:00
/** The available styles of word movement, named after where each one stops. */
enum move_word_style_t
{
    /* stop at punctuation */
    move_word_style_punctuation,

    /* stops at path components */
    move_word_style_path_components
};
2006-10-07 08:56:25 +08:00
2012-12-11 08:23:08 +08:00
/* Our state machine that implements "one word" movement or erasure. */
class move_word_state_machine_t
{
2012-12-21 09:37:09 +08:00
private :
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
bool consume_char_punctuation ( wchar_t c ) ;
bool consume_char_path_components ( wchar_t c ) ;
bool is_path_component_character ( wchar_t c ) ;
2012-12-23 04:21:31 +08:00
2012-12-21 09:37:09 +08:00
int state ;
move_word_style_t style ;
2012-12-11 08:23:08 +08:00
public :
2012-12-21 09:37:09 +08:00
move_word_state_machine_t ( move_word_style_t st ) ;
2012-12-11 08:23:08 +08:00
bool consume_char ( wchar_t c ) ;
2012-12-21 09:37:09 +08:00
void reset ( ) ;
2012-12-11 08:23:08 +08:00
} ;
2005-10-04 23:11:39 +08:00
# endif