From 44ce3e67315e71ee646106219200b2a7f93fd3b3 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Mon, 19 Nov 2012 02:41:57 -0800
Subject: [PATCH] Changes to make word movement less aggressive
 https://github.com/fish-shell/fish-shell/issues/384

---
 reader.cpp    | 150 ++++++++++++++++++++++++++++++++------------------
 tokenizer.cpp |   5 +-
 tokenizer.h   |   8 +++
 3 files changed, 107 insertions(+), 56 deletions(-)

diff --git a/reader.cpp b/reader.cpp
index 79775dab2..5548dabfb 100644
--- a/reader.cpp
+++ b/reader.cpp
@@ -1950,8 +1950,9 @@ class move_word_state_machine_t
     enum
     {
         s_whitespace,
-        s_punctuation,
-        s_alphanumeric_or_punctuation_except_slash,
+        s_separator,
+        s_slash,
+        s_nonseparators_except_slash,
         s_end
     } state;
 
@@ -1963,28 +1964,75 @@ public:
 
     bool consume_char(wchar_t c)
     {
-        /* Note fall-through in all of these! */
-        switch (state)
+        //printf("state %d, consume '%lc'\n", state, c);
+        bool consumed = false;
+        /* Always treat separators as first. All this does is ensure that we treat ^ as a string character instead of as stderr redirection, which I hypothesize is usually what is desired. */
+        bool was_first = true;
+        while (state != s_end && ! consumed)
         {
-            case s_whitespace:
-                if (iswspace(c))
-                    return true;
-                state = s_punctuation;
+            switch (state)
+            {
+                case s_whitespace:
+                    if (iswspace(c))
+                    {
+                        /* Consumed whitespace */
+                        consumed = true;
+                    }
+                    else if (tok_is_string_character(c, was_first))
+                    {
+                        /* String path */
+                        state = s_slash;
+                    }
+                    else
+                    {
+                        /* Separator path */
+                        state = s_separator;
+                    }
+                    break;
 
-            case s_punctuation:
-                if (iswpunct(c))
-                    return true;
-                state = s_alphanumeric_or_punctuation_except_slash;
+                case s_separator:
+                    if (! iswspace(c) && ! tok_is_string_character(c, was_first))
+                    {
+                        /* Consumed separator */
+                        consumed = true;
+                    }
+                    else
+                    {
+                        state = s_end;
+                    }
+                    break;
 
-            case s_alphanumeric_or_punctuation_except_slash:
-                if (c != L'/' && (iswalnum(c) || iswpunct(c)))
-                    return true;
-                state = s_end;
+                case s_slash:
+                    if (c == L'/')
+                    {
+                        /* Consumed slash */
+                        consumed = true;
+                    }
+                    else
+                    {
+                        state = s_nonseparators_except_slash;
+                    }
+                    break;
 
-            case s_end:
-            default:
-                return false;
+                case s_nonseparators_except_slash:
+                    if (c != L'/' && tok_is_string_character(c, was_first))
+                    {
+                        /* Consumed string character except slash */
+                        consumed = true;
+                    }
+                    else
+                    {
+                        state = s_end;
+                    }
+                    break;
+
+                    /* We won't get here, but keep the compiler happy */
+                case s_end:
+                default:
+                    break;
+            }
         }
+        return consumed;
     }
 };
 
@@ -2000,8 +2048,10 @@ public:
    The regex we implement:
 
       WHITESPACE*
-      PUNCTUATION*
-      ALPHANUMERIC_OR_PUNCTUATION_EXCEPT_SLASH*
+        (SEPARATOR+)
+      |
+        (SLASH*
+         TOK_STRING_CHARACTERS_EXCEPT_SLASH*)
 
    Interesting test case:
      /foo/bar/baz/ -> /foo/bar/ -> /foo/ -> /
@@ -2013,61 +2063,55 @@ enum move_word_dir_t
     MOVE_DIR_LEFT,
     MOVE_DIR_RIGHT
 };
+
 static void move_word(bool move_right, bool erase, bool newv)
 {
-    const size_t start_buff_pos = data->buff_pos;
-    size_t end_buff_pos = data->buff_pos; //this is the last character we delete; we always delete at least one character
-
     /* Return if we are already at the edge */
     const size_t boundary = move_right ? data->command_length() : 0;
     if (data->buff_pos == boundary)
         return;
 
-    /*
-      If we are beyond the last character and moving left, start by
-      moving one step, since otherwise we'll start on the \0, which
-      should be ignored.
-    */
-    if (! move_right && (end_buff_pos == data->command_length()))
-    {
-        if (!end_buff_pos)
-            return;
-
-        end_buff_pos--;
-    }
-
-    const wchar_t * const command_line = data->command_line.c_str();
+    /* When moving left, a value of 1 means the character at index 0. */
     move_word_state_machine_t state;
-    const size_t last_valid_end_buff_pos = (move_right ? data->command_length() - 1 : 0);
-    while (end_buff_pos != last_valid_end_buff_pos)
+    const wchar_t * const command_line = data->command_line.c_str();
+    const size_t start_buff_pos = data->buff_pos;
+
+    size_t buff_pos = data->buff_pos;
+    while (buff_pos != boundary)
     {
-        size_t next_buff_pos = (move_right ? end_buff_pos + 1 : end_buff_pos - 1);
-        wchar_t c = command_line[next_buff_pos];
+        size_t idx = (move_right ? buff_pos : buff_pos - 1);
+        wchar_t c = command_line[idx];
         if (! state.consume_char(c))
             break;
-        end_buff_pos = next_buff_pos;
+        buff_pos = (move_right ? buff_pos + 1 : buff_pos - 1);
     }
 
+    /* Always consume at least one character */
+    if (buff_pos == start_buff_pos)
+        buff_pos = (move_right ? buff_pos + 1 : buff_pos - 1);
+
+    /* If we are moving left, buff_pos-1 is the index of the first character we do not delete (possibly -1). If we are moving right, then buff_pos is that index - possibly data->command_length(). */
     if (erase)
     {
-        size_t start_idx = mini(start_buff_pos, end_buff_pos); //first char to delete
-        size_t end_idx = maxi(start_buff_pos, end_buff_pos); //last char to delete
-
-        // Don't try to delete past the end
-        end_idx = mini(end_idx, data->command_length() - 1);
-
-        // Don't autosuggest after a kill
+        /* Don't autosuggest after a kill */
         data->suppress_autosuggestion = true;
 
-        reader_kill(start_idx, end_idx - start_idx + 1, move_right?KILL_APPEND:KILL_PREPEND, newv);
+        if (move_right)
+        {
+            reader_kill(start_buff_pos, buff_pos - start_buff_pos, KILL_APPEND, newv);
+        }
+        else
+        {
+            reader_kill(buff_pos, start_buff_pos - buff_pos, KILL_PREPEND, newv);
+        }
     }
     else
     {
-        data->buff_pos = move_right ? end_buff_pos + 1 : end_buff_pos;
+        data->buff_pos = buff_pos;
         reader_repaint();
     }
-}
 
+}
 
 const wchar_t *reader_get_buffer(void)
 {
diff --git a/tokenizer.cpp b/tokenizer.cpp
index 1f8e97cf9..ebe163b53 100644
--- a/tokenizer.cpp
+++ b/tokenizer.cpp
@@ -224,7 +224,7 @@ int tokenizer::line_number_of_character_at_offset(size_t offset)
 /**
    Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the first character.
 */
-static bool is_string_char(wchar_t c, bool is_first)
+bool tok_is_string_character(wchar_t c, bool is_first)
 {
     switch (c)
     {
@@ -248,7 +248,6 @@ static bool is_string_char(wchar_t c, bool is_first)
 
         default:
             return true;
-
     }
 }
 
@@ -368,7 +367,7 @@ static void read_string(tokenizer *tok)
 
                         default:
                         {
-                            if (!is_string_char(*(tok->buff), is_first))
+                            if (!tok_is_string_character(*(tok->buff), is_first))
                             {
                                 do_loop=0;
                             }
diff --git a/tokenizer.h b/tokenizer.h
index daa4a0b1b..ae6b6ecc9 100644
--- a/tokenizer.h
+++ b/tokenizer.h
@@ -167,6 +167,14 @@ const wchar_t *tok_string(tokenizer *tok);
 */
 wchar_t *tok_first(const wchar_t *str);
 
+/**
+   Indicates whether a character can be part of a string, or is a string separator.
+   Separators include newline, tab, |, ^, >, <, etc.
+
+   is_first should indicate whether this is the first character in a potential string.
+*/
+bool tok_is_string_character(wchar_t c, bool is_first);
+
 /**
    Move tokenizer position
 */