fish-shell/tokenizer.h

/** \file tokenizer.h

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/

#ifndef FISH_TOKENIZER_H
#define FISH_TOKENIZER_H

#include <wchar.h>
#include "common.h"

/**
   Token types. The numeric values are spelled out explicitly; they are
   the same values the enumerators receive by default numbering.
*/
enum token_type
{
    /** Tokenizer not yet constructed */
    TOK_NONE = 0,
    /** Error reading token */
    TOK_ERROR = 1,
    /** String token */
    TOK_STRING = 2,
    /** Pipe token */
    TOK_PIPE = 3,
    /** End token (semicolon or newline, not literal end) */
    TOK_END = 4,
    /** Output redirection token */
    TOK_REDIRECT_OUT = 5,
    /** Appending output redirection token */
    TOK_REDIRECT_APPEND = 6,
    /** Input redirection token */
    TOK_REDIRECT_IN = 7,
    /** Redirection to new fd token */
    TOK_REDIRECT_FD = 8,
    /** Noclobber redirection token (NOTE(review): presumably the `>?` form - confirm) */
    TOK_REDIRECT_NOCLOB = 9,
    /** Send job to background token */
    TOK_BACKGROUND = 10,
    /** Comment token */
    TOK_COMMENT = 11
};

/**
   Tokenizer error types. Numeric values are written out explicitly and
   match the default numbering.
*/
enum tokenizer_error
{
    /** Unterminated quote */
    TOK_UNTERMINATED_QUOTE = 0,
    /** Unterminated subshell */
    TOK_UNTERMINATED_SUBSHELL = 1,
    /** Unterminated escape */
    TOK_UNTERMINATED_ESCAPE = 2,
    /** Any other error */
    TOK_OTHER = 3
};


/*
   Tokenizer flags. Each flag occupies its own bit, so several of them
   can be OR-ed together into a single tok_flags_t value.
*/

/**
   Accept incomplete parameters, i.e. parameters with mismatching
   parentheses, etc. This is useful for tab-completion.
*/
#define TOK_ACCEPT_UNFINISHED (1 << 0)

/**
   Do not remove comments. Useful for syntax highlighting.
*/
#define TOK_SHOW_COMMENTS (1 << 1)

/**
   Do not generate error messages. This is needed when tokenizing off of
   the main thread, since wgettext is not thread safe.
*/
#define TOK_SQUASH_ERRORS (1 << 2)

/** Integer type used to pass a combination of the TOK_* flags above. */
typedef unsigned int tok_flags_t;

/**
   The tokenizer struct. Holds the state of one tokenization pass over a
   single string; the tok_* functions declared in this header take a
   pointer to it.
*/
struct tokenizer_t
{
    /** A pointer into the original string, showing where the next token begins */
    const wchar_t *buff;
    /** The original string. It is not copied by the constructor, so this
        points at caller-owned storage that must outlive the tokenizer. */
    const wchar_t *orig_buff;
    /** The last token */
    wcstring last_token;

    /** Type of last token */
    enum token_type last_type;

    /** Offset of last token */
    size_t last_pos;
    /** Whether there are more tokens */
    bool has_next;
    /** Whether incomplete tokens are accepted (cf. TOK_ACCEPT_UNFINISHED) */
    bool accept_unfinished;
    /** Whether comments should be returned (cf. TOK_SHOW_COMMENTS) */
    bool show_comments;
    /** Type of last quote, can be either ' or ". */
    wchar_t last_quote;
    /** Last error. NOTE(review): presumably a value of enum tokenizer_error
        stored as int - confirm against the implementation. */
    int error;
    /** Whether we are squashing errors (cf. TOK_SQUASH_ERRORS) */
    bool squash_errors;

    /** Cached line number information. NOTE(review): presumably consulted
        by line_number_of_character_at_offset to avoid rescanning - confirm. */
    size_t cached_lineno_offset;
    int cached_lineno_count;

    /** Return the line number of the character at the given offset */
    int line_number_of_character_at_offset(size_t offset);

    /**
      Constructor for a tokenizer. b is the string that is to be
      tokenized. It is not copied, and should not be freed by the caller
      until after the tokenizer is destroyed.

      \param b The string to tokenize
      \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
      to accept incomplete tokens, such as a subshell without a closing
      parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens.
      See also TOK_SQUASH_ERRORS.

    */
    tokenizer_t(const wchar_t *b, tok_flags_t flags);
};

/**
  Jump to the next token.

  \param tok The tokenizer to advance
*/
void tok_next(tokenizer_t *tok);

/**
  Returns the type of the last token. Must be one of the values in the token_type enum.

  \param tok The tokenizer to query
  \return The type of the last token
*/
enum token_type tok_last_type(tokenizer_t *tok);

/**
  Returns the last token string. The string should not be freed by the caller.

  \param tok The tokenizer to query
  \return The text of the last token. NOTE(review): presumably points into
  tok->last_token and is therefore only valid until the tokenizer is
  advanced or destroyed - confirm against the implementation.
*/
const wchar_t *tok_last(tokenizer_t *tok);

/**
  Returns the type of quote of the last quoted string token. Per the
  documentation of tokenizer_t::last_quote this is either ' or ".

  \param tok The tokenizer to query
*/
wchar_t tok_last_quote(tokenizer_t *tok);

/**
  Returns true (as an int) as long as there are more tokens left

  \param tok The tokenizer to query
*/
int tok_has_next(tokenizer_t *tok);

/**
  Returns the position of the beginning of the current token in the original string.
  Note that the position is returned as an int, whereas tokenizer_t::last_pos is a size_t.

  \param tok The tokenizer to query
*/
int tok_get_pos(const tokenizer_t *tok);

/**
  Returns the extent of the current token.
  NOTE(review): presumably the token's length in characters - confirm.

  \param tok The tokenizer to query
*/
size_t tok_get_extent(const tokenizer_t *tok);

/**
  Returns the token type after the current one, without adjusting the position.
  Optionally returns the next string by reference.

  \param tok The tokenizer to peek into
  \param out_next_string Receives the next token's string.
  NOTE(review): "optionally" presumably means NULL may be passed to skip this - confirm.
  \return The type of the token following the current one
*/
enum token_type tok_peek_next(tokenizer_t *tok, wcstring *out_next_string);

/**
   Returns the original string that was passed to the tokenizer

   \param tok The tokenizer to query
 */
const wchar_t *tok_string(tokenizer_t *tok);

/**
   Returns only the first token from the specified string. This is a
   convenience function, used to retrieve the first token of a
   string. This can be useful for error messages, etc.

   On failure, returns the empty string.

   \param str The string to take the first token from
   \return The first token, or the empty string on failure
*/
wcstring tok_first(const wchar_t *str);

/**
   Indicates whether a character can be part of a string, or is a string separator.
   Separators include newline, tab, |, ^, >, <, etc.

   \param c The character to classify
   \param is_first Whether this is the first character in a potential string
   \return true if c can be part of a string, false if it is a separator
*/
bool tok_is_string_character(wchar_t c, bool is_first);

/**
   Move tokenizer position

   \param tok The tokenizer to reposition
   \param pos The new position. NOTE(review): presumably an offset into the
   original string, in the same units tok_get_pos returns - confirm.
*/
void tok_set_pos(tokenizer_t *tok, int pos);

/**
   Returns a string description of the specified token type

   \param type The token type. NOTE(review): presumably a value of
   enum token_type passed as int - confirm.
*/
const wchar_t *tok_get_desc(int type);

/**
   Get tokenizer error type. Should only be called if tok_last_type returns TOK_ERROR.

   \param tok The tokenizer to query
   \return The error type. NOTE(review): presumably a value of
   enum tokenizer_error returned as int - confirm.
*/
int tok_get_error(tokenizer_t *tok);

/**
   Helper function to determine redirection type from a string.

   \param str The string to examine
   \return The redirection token type, or TOK_NONE if the redirection is invalid
*/
enum token_type redirection_type_for_string(const wcstring &str);

/* Word-boundary style used by move_word_state_machine_t. */
enum move_word_style_t
{
    /* stop at punctuation */
    move_word_style_punctuation = 0,
    /* stops at path components */
    move_word_style_path_components = 1
};

/* Our state machine that implements "one word" movement or erasure.
   Characters are fed in one at a time through consume_char(); the style
   given at construction selects between punctuation-based and
   path-component-based word boundaries. */
class move_word_state_machine_t
{
private:

    /* Per-style character handlers. NOTE(review): presumably consume_char()
       dispatches to one of these according to `style` - confirm in the
       implementation file. */
    bool consume_char_punctuation(wchar_t c);
    bool consume_char_path_components(wchar_t c);
    /* NOTE(review): by its name, classifies c as a path component character
       or not; the exact character set is defined in the implementation. */
    bool is_path_component_character(wchar_t c);

    /* Current state of the machine; the meaning of the values is private
       to the implementation. */
    int state;
    /* Word-boundary style this machine uses. */
    move_word_style_t style;

public:

    /* Creates a state machine using the word style st. */
    move_word_state_machine_t(move_word_style_t st);
    /* Feeds the next character to the machine. NOTE(review): the return
       value presumably tells whether c was consumed as part of the current
       word - confirm against the implementation. */
    bool consume_char(wchar_t c);
    /* Resets the machine. NOTE(review): presumably back to its initial
       state so that it can be reused - confirm. */
    void reset();
};


#endif
Remove trailing whitespaces and change tabs to spaces 2012-11-18 18:23:22 +08:00			`/** \file tokenizer.h`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`A specialized tokenizer for tokenizing the fish language. In the`
			`future, the tokenizer should be extended to support marks,`
			`tokenizing multiple strings and disposing of unused string`
			`segments.`
			`*/`

Add header guards to the header files. darcs-hash:20051004151139-35ec8-7af69b9d7647d145dc621f7eaea726e729cff554.gz 2005-10-04 23:11:39 +08:00			`#ifndef FISH_TOKENIZER_H`
			`#define FISH_TOKENIZER_H`

			`#include <wchar.h>`
Modify tokenizer to store last token in a wcstring 2012-11-22 14:09:35 +08:00			`#include "common.h"`
Add header guards to the header files. darcs-hash:20051004151139-35ec8-7af69b9d7647d145dc621f7eaea726e729cff554.gz 2005-10-04 23:11:39 +08:00
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`/**`
			`Token types`
			`*/`
			`enum token_type`
			`{`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`TOK_NONE, /*< Tokenizer not yet constructed /`
			`TOK_ERROR, /*< Error reading token /`
			`TOK_STRING,/*< String token /`
			`TOK_PIPE,/*< Pipe token /`
Stuff 2013-06-02 13:14:47 +08:00			`TOK_END,/*< End token (semicolon or newline, not literal end) /`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`TOK_REDIRECT_OUT, /*< redirection token /`
			`TOK_REDIRECT_APPEND,/*< redirection append token /`
			`TOK_REDIRECT_IN,/*< input redirection token /`
			`TOK_REDIRECT_FD,/*< redirection to new fd token /`
			`TOK_REDIRECT_NOCLOB, /*<? redirection token /`
			`TOK_BACKGROUND,/*< send job to bg token /`
			`TOK_COMMENT/*< comment token /`
Initial work towards making autosuggestion smarter by recognizing paths 2012-02-16 03:33:41 +08:00			`};`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00
			`/**`
			`Tokenizer error types`
			`*/`
			`enum tokenizer_error`
			`{`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`TOK_UNTERMINATED_QUOTE,`
			`TOK_UNTERMINATED_SUBSHELL,`
			`TOK_UNTERMINATED_ESCAPE,`
			`TOK_OTHER`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00			`}`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`;`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Flag telling the tokenizer to accept incomplete parameters,`
			`i.e. parameters with mismatching paranthesis, etc. This is useful`
			`for tab-completion.`
			`*/`
			`#define TOK_ACCEPT_UNFINISHED 1`

			`/**`
			`Flag telling the tokenizer not to remove comments. Useful for`
			`syntax highlighting.`
			`*/`
			`#define TOK_SHOW_COMMENTS 2`

Fix a crash when using quotes due to wgettext thread safety issues. 2012-02-18 07:55:54 +08:00			`/** Flag telling the tokenizer to not generate error messages, which we need to do when tokenizing off of the main thread (since wgettext is not thread safe).`
			`*/`
			`#define TOK_SQUASH_ERRORS 4`

Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`typedef unsigned int tok_flags_t;`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
Remove trailing whitespaces and change tabs to spaces 2012-11-18 18:23:22 +08:00			`The tokenizer struct.`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`struct tokenizer_t`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`{`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`/** A pointer into the original string, showing where the next token begins */`
			`const wchar_t *buff;`
			`/** A copy of the original string */`
			`const wchar_t *orig_buff;`
Modify tokenizer to store last token in a wcstring 2012-11-22 14:09:35 +08:00			`/** The last token */`
			`wcstring last_token;`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00
			`/** Type of last token*/`
Make tok_last_type return an enum token_type instead of int 2013-10-01 04:57:36 +08:00			`enum token_type last_type;`
Modify tokenizer to store last token in a wcstring 2012-11-22 14:09:35 +08:00
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`/** Offset of last token*/`
			`size_t last_pos;`
			`/** Whether there are more tokens*/`
			`bool has_next;`
			`/** Whether incomplete tokens are accepted*/`
			`bool accept_unfinished;`
			`/** Whether commants should be returned*/`
			`bool show_comments;`
			`/** Type of last quote, can be either ' or ".*/`
			`wchar_t last_quote;`
			`/** Last error */`
			`int error;`
Fix a crash when using quotes due to wgettext thread safety issues. 2012-02-18 07:55:54 +08:00			`/* Whether we are squashing errors */`
			`bool squash_errors;`
Fix to restore an optimization from parse_util_get_line_from_offset in a more thread-safe way 2012-08-05 08:44:14 +08:00
			`/* Cached line number information */`
			`size_t cached_lineno_offset;`
			`int cached_lineno_count;`

			`/** Return the line number of the character at the given offset */`
			`int line_number_of_character_at_offset(size_t offset);`

Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`/**`
			`Constructor for a tokenizer. b is the string that is to be`
			`tokenized. It is not copied, and should not be freed by the caller`
			`until after the tokenizer is destroyed.`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`\param b The string to tokenize`
			`\param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer`
			`to accept incomplete tokens, such as a subshell without a closing`
			`parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens`
Remove trailing whitespaces and change tabs to spaces 2012-11-18 18:23:22 +08:00
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`*/`
			`tokenizer_t(const wchar_t *b, tok_flags_t flags);`
			`};`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Jump to the next token.`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`void tok_next(tokenizer_t *tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns the type of the last token. Must be one of the values in the token_type enum.`
			`*/`
Make tok_last_type return an enum token_type instead of int 2013-10-01 04:57:36 +08:00			`enum token_type tok_last_type(tokenizer_t *tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns the last token string. The string should not be freed by the caller.`
			`*/`
Modify tokenizer to store last token in a wcstring 2012-11-22 14:09:35 +08:00			`const wchar_t tok_last(tokenizer_t tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns the type of quote from the last TOK_QSTRING`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`wchar_t tok_last_quote(tokenizer_t *tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns true as long as there are more tokens left`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`int tok_has_next(tokenizer_t *tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns the position of the beginning of the current token in the original string`
			`*/`
Improvements to new parser. All functions and completions now parse. 2013-07-23 09:26:15 +08:00			`int tok_get_pos(const tokenizer_t *tok);`

			`/** Returns the extent of the current token */`
			`size_t tok_get_extent(const tokenizer_t *tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
Better error messages for `EDITOR=vim git...` type commands. https://github.com/fish-shell/fish-shell/issues/809 2013-10-01 05:55:25 +08:00			`/** Returns the token type after the current one, without adjusting the position. Optionally returns the next string by reference. */`
			`enum token_type tok_peek_next(tokenizer_t tok, wcstring out_next_string);`

Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`/**`
			`Returns the original string to tokenizer`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`const wchar_t tok_string(tokenizer_t tok);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns only the first token from the specified string. This is a`
			`convenience function, used to retrieve the first token of a`
			`string. This can be useful for error messages, etc.`

Make tok_first return a wcstring instead of a wchar_t* 2012-11-22 14:23:48 +08:00			`On failure, returns the empty string.`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`*/`
Make tok_first return a wcstring instead of a wchar_t* 2012-11-22 14:23:48 +08:00			`wcstring tok_first(const wchar_t *str);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
Changes to make word movement less aggressive https://github.com/fish-shell/fish-shell/issues/384 2012-11-19 18:41:57 +08:00			`/**`
			`Indicates whether a character can be part of a string, or is a string separator.`
			`Separators include newline, tab, \|, ^, >, <, etc.`

			`is_first should indicate whether this is the first character in a potential string.`
			`*/`
			`bool tok_is_string_character(wchar_t c, bool is_first);`

Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00			`/**`
			`Move tokenizer position`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`void tok_set_pos(tokenizer_t *tok, int pos);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
			`/**`
			`Returns a string description of the specified token type`
			`*/`
Apply new indentation, brace, and whitespace style 2012-11-19 08:30:30 +08:00			`const wchar_t *tok_get_desc(int type);`
Initial revision darcs-hash:20050920132639-ac50b-fa3b476891e1f5f67207cf4cc7bf623834cc5edc.gz 2005-09-20 21:26:39 +08:00
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00			`/**`
			`Get tokenizer error type. Should only be called if tok_last_tope returns TOK_ERROR.`
			`*/`
Work towards refactoring tokenizer to be a real object 2012-11-22 09:48:35 +08:00			`int tok_get_error(tokenizer_t *tok);`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00
Syntax highlighting for file redirections 2013-10-14 07:58:40 +08:00			`/* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid */`
			`enum token_type redirection_type_for_string(const wcstring &str);`

Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00			`enum move_word_style_t`
			`{`
			`move_word_style_punctuation, //stop at punctuation`
			`move_word_style_path_components //stops at path components`
			`};`
Add autoindentation support darcs-hash:20061007005625-ac50b-11873654797eb1e98fd17893022bdf995be3e2aa.gz 2006-10-07 08:56:25 +08:00
forward-word should accept a word of an autosuggestion https://github.com/fish-shell/fish-shell/issues/435 2012-12-11 08:23:08 +08:00			`/* Our state machine that implements "one word" movement or erasure. */`
			`class move_word_state_machine_t`
			`{`
Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00			`private:`
Formatting 2012-12-23 04:21:31 +08:00
Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00			`bool consume_char_punctuation(wchar_t c);`
			`bool consume_char_path_components(wchar_t c);`
			`bool is_path_component_character(wchar_t c);`
Formatting 2012-12-23 04:21:31 +08:00
Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00			`int state;`
			`move_word_style_t style;`
forward-word should accept a word of an autosuggestion https://github.com/fish-shell/fish-shell/issues/435 2012-12-11 08:23:08 +08:00
			`public:`
Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00
			`move_word_state_machine_t(move_word_style_t st);`
forward-word should accept a word of an autosuggestion https://github.com/fish-shell/fish-shell/issues/435 2012-12-11 08:23:08 +08:00			`bool consume_char(wchar_t c);`
Changes to work recognition per https://github.com/fish-shell/fish-shell/issues/384 Word movement should be very similar to fish 1.x backward-kill-word remains more liberal, but now stops at any of {,'"=} 2012-12-21 09:37:09 +08:00			`void reset();`
forward-word should accept a word of an autosuggestion https://github.com/fish-shell/fish-shell/issues/435 2012-12-11 08:23:08 +08:00			`};`


Add header guards to the header files. darcs-hash:20051004151139-35ec8-7af69b9d7647d145dc621f7eaea726e729cff554.gz 2005-10-04 23:11:39 +08:00			`#endif`