Overhaul continue_job() and try_select()

Convert `select_try()` to return a well-defined enum describing its state, and handle each of the three possible cases with clear reasons why we are blocking or not blocking in each subsequent call to `process_mark_finished_children()`.
2025-01-22 12:07:49 +08:00 · 2018-10-02 15:10:42 -05:00 · 2018-10-02 15:10:42 -05:00 · 39a05a359a
commit 39a05a359a
parent 1bfbed94ae
2 changed files with 105 additions and 86 deletions
--- a/src/proc.cpp
+++ b/src/proc.cpp
@ -218,22 +218,25 @@ void job_t::set_flag(job_flag_t flag, bool set) { this->flags.set(flag, set); }
 bool job_t::get_flag(job_flag_t flag) const { return this->flags.get(flag); }
-int job_t::signal(int signal) {
+bool job_t::signal(int signal) {
-    pid_t my_pgid = getpgrp();
+    // Presumably we are distinguishing between the two cases below because we do
-    int res = 0;
+    // not want to send ourselves the signal in question in case the job shares
    // a pgid with the shell.
-    if (pgid != my_pgid) {
+    if (pgid != getpgrp()) {
-        res = killpg(pgid, signal);
+        if (killpg(pgid, signal) == -1) {
            wperror(L"killpg");
            return false;
        }
    } else {
-        for (const process_ptr_t &p : processes) {
+        for (const auto &p : processes) {
-            if (!p->completed && p->pid && kill(p->pid, signal)) {
+            if (!p->completed && p->pid && kill(p->pid, signal) == -1) {
-                res = -1;
+                return false;
                break;
            }
        }
    }
-    return res;
+    return true;
 }
 static void mark_job_complete(const job_t *j) {
@ -681,19 +684,17 @@ void proc_update_jiffies() {
 /// Check if there are buffers associated with the job, and select on them for a while if available.
 ///
 /// \param j the job to test
-///
+/// \return the status of the select operation
-/// \return 1 if buffers were available, zero otherwise
+static select_try_t select_try(job_t *j) {
 static int select_try(job_t *j) {
    fd_set fds;
    int maxfd = -1;
    FD_ZERO(&fds);
    const io_chain_t chain = j->all_io_redirections();
-    for (size_t idx = 0; idx < chain.size(); idx++) {
+    for (const auto &io : chain) {
        const io_data_t *io = chain.at(idx).get();
        if (io->io_mode == IO_BUFFER) {
-            const io_pipe_t *io_pipe = static_cast<const io_pipe_t *>(io);
+            auto io_pipe = static_cast<const io_pipe_t *>(io.get());
            int fd = io_pipe->pipe_fd[0];
            FD_SET(fd, &fds);
            maxfd = std::max(maxfd, fd);
@ -702,20 +703,20 @@ static int select_try(job_t *j) {
    }
    if (maxfd >= 0) {
-        int retval;
+        struct timeval timeout;
        struct timeval tv;
-        tv.tv_sec = 0;
+        timeout.tv_sec = 0;
-        tv.tv_usec = 10000;
+        timeout.tv_usec = 10000;
-        retval = select(maxfd + 1, &fds, 0, 0, &tv);
+        int retval = select(maxfd + 1, &fds, 0, 0, &timeout);
        if (retval == 0) {
            debug(4, L"select_try hit timeout");
            return select_try_t::TIMEOUT;
        }
-        return retval > 0;
+        return select_try_t::DATA_READY;
    }
-    return -1;
+    return select_try_t::IO_ERROR;
 }
 /// Read from descriptors until they are empty.
@ -915,39 +916,41 @@ static bool terminal_return_from_job(job_t *j) {
    return true;
 }
-void job_t::continue_job(bool cont) {
+void job_t::continue_job(bool send_sigcont) {
    // Put job first in the job list.
    promote();
    set_flag(job_flag_t::NOTIFIED, false);
-    debug(4, L"%ls job %d, gid %d (%ls), %ls, %ls", cont ? L"Continue" : L"Start", job_id,
+    debug(4, L"%ls job %d, gid %d (%ls), %ls, %ls", send_sigcont ? L"Continue" : L"Start", job_id,
          pgid, command_wcstr(), is_completed() ? L"COMPLETED" : L"UNCOMPLETED",
          is_interactive ? L"INTERACTIVE" : L"NON-INTERACTIVE");
    if (!is_completed()) {
        if (get_flag(job_flag_t::TERMINAL) && is_foreground()) {
-            // Put the job into the foreground.
+            // Put the job into the foreground and give it control of the terminal.
            // Hack: ensure that stdin is marked as blocking first (issue #176).
            make_fd_blocking(STDIN_FILENO);
-            if (!terminal_give_to_job(this, cont)) return;
+            if (!terminal_give_to_job(this, send_sigcont)) {
                // This returns without bubbling up the error. Presumably that is OK.
                return;
            }
        }
-        // Send the job a continue signal, if necessary.
+        // If both requested and necessary, send the job a continue signal
-        if (cont) {
+        if (send_sigcont) {
-            for (process_ptr_t &p : processes) p->stopped = false;
+            // reset the status of each process instance
            for (auto &p : processes) {
                p->stopped = false;
            }
-            if (get_flag(job_flag_t::JOB_CONTROL)) {
+            // This code used to check for JOB_CONTROL to decide between using killpg to signal
-                if (killpg(pgid, SIGCONT)) {
+            // all processes in the group or iterating over each process in the group and sending
-                    wperror(L"killpg (SIGCONT)");
+            // the signal individually. job_t::signal() does the same, but uses the shell's own
-                    return;
+            // pgroup to make that distinction.
-                }
+            if (!signal(SIGCONT)) {
-            } else {
+                // This returns without bubbling up the error. Presumably that is OK.
-                for (const process_ptr_t &p : processes) {
+                debug(2, "Failed to send SIGCONT to any processes in pgroup %d!", pgid);
-                    if (kill(p->pid, SIGCONT) < 0) {
+                return;
                        wperror(L"kill (SIGCONT)");
                        return;
                    }
                }
            }
        }
@ -955,36 +958,45 @@ void job_t::continue_job(bool cont) {
            // Look for finished processes first, to avoid select() if it's already done.
            process_mark_finished_children(false);
-            // Wait for job to report.
+            // sigcont is NOT sent only when a job is first created. In the event of "nested jobs"
            // (i.e. new jobs started due to the evaluation of functions), we can start a process
            // that feeds its output to a child job (e.g. `some_func | some_builtin`, which is two
            // jobs), and if the first job in that expression generates enough data that it fills
            // its pipe buffer, it'll hang waiting for the second job to read from it. If we block
            // on foreground jobs in this case, we'll keep waiting forever.
            bool block_on_fg = send_sigcont;
            // Wait for data to become available or the status of our own job to change
            while (!reader_exit_forced() && !is_stopped() && !is_completed()) {
-                switch (select_try(this)) {
+                auto result = select_try(this);
-                    case 1: {
+
-                        // debug(1, L"select_try() 1" );
+                if (result == select_try_t::DATA_READY) {
-                        read_try(this);
+                    // Read the data that we know is now available, then scan for finished processes
-                        process_mark_finished_children(false);
+                    // but do not block. We don't block so long as we have IO to process, once the
-                        break;
+                    // fd buffers are empty we'll block in the second case below.
-                    }
+                    read_try(this);
-                    case 0: {
+                    process_mark_finished_children(false);
-                        // debug(1, L"select_try() 0" );
+                }
-                        // No FDs are ready. Look for finished processes.
+                else if (result == select_try_t::TIMEOUT) {
-                        process_mark_finished_children(true);
+                    // Our select_try() timeout is ~10ms, so this can be EXTREMELY chatty but this is very useful
-                        break;
+                    // if trying to debug an unknown hang in fish. Uncomment to see if we're stuck here.
-                    }
+                    // debug(1, L"select_try: no fds returned valid data within the timeout" );
-                    case -1: {
+
-                        // debug(1, L"select_try() -1" );
+                    // No FDs are ready. Look for finished processes instead.
-                        // If there is no funky IO magic, we can use waitpid instead of handling
+                    process_mark_finished_children(block_on_fg);
-                        // child deaths through signals. This gives a rather large speed boost (A
+                }
-                        // factor 3 startup time improvement on my 300 MHz machine) on short-lived
+                else {
-                        // jobs.
+                    // This is easily encountered by simply transferring control of the terminal to another process,
-                        //
+                    // then suspending it. For example, `nvim`, then `ctrl+z`. Since we are not the foreground process
-                        // This will return early if we get a signal, like SIGHUP.
+                    debug(3, L"select_try: interrupted read from job file descriptors" );
-                        process_mark_finished_children(true);
+
-                        break;
+                    // This tends to happen when the foreground process has changed, e.g. it was suspended and control
-                    }
+                    // has returned to the shell. It's a good time to block on foreground processes.
-                    default: {
+                    process_mark_finished_children(block_on_fg);
-                        DIE("unexpected return value from select_try()");
+
-                        break;
+                    // If it turns out that we encountered this because the file descriptor we were reading from has
-                    }
+                    // died, process_mark_finished_children() should take care of changing the status of our is_completed()
                    // (assuming it is appropriate to do so), in which case we will break out of this loop.
                }
            }
        }
@ -992,19 +1004,15 @@ void job_t::continue_job(bool cont) {
    if (is_foreground()) {
        if (is_completed()) {
-            // It's possible that the job will produce output and exit before we've even read from
+            // It's possible that the job will produce output and exit before we've even read from it.
-            // it.
+            // In that case, make sure we read that output now, before we've executed any subsequent calls.
-            //
+            // This is why prompt colors were getting screwed up - the builtin `echo` calls were sometimes
-            // We'll eventually read the output, but it may be after we've executed subsequent calls
+            // having their output combined with the `set_color` calls in the wrong order!
            // This is why my prompt colors kept getting screwed up - the builtin echo calls
            // were sometimes having their output combined with the set_color calls in the wrong
            // order!
            read_try(this);
-            const std::unique_ptr<process_t> &p = processes.back();
+            // Set $status only if we are in the foreground and the last process in the job has finished
-
+            // and is not a short-circuited builtin.
-            // Mark process status only if we are in the foreground and the last process in a pipe,
+            auto &p = processes.back();
            // and it is not a short circuited builtin.
            if ((WIFEXITED(p->status) || WIFSIGNALED(p->status)) && p->pid) {
                int status = proc_format_status(p->status);
                proc_set_last_status(get_flag(job_flag_t::NEGATE) ? !status : status);
--- a/src/proc.h
+++ b/src/proc.h
@ -41,6 +41,16 @@ enum {
    JOB_CONTROL_NONE,
 };
 /// The return value of select_try(), indicating IO readiness or an error
 enum class select_try_t {
    /// One or more fds have data ready for read
    DATA_READY,
    /// The timeout elapsed without any data becoming available for read
    TIMEOUT,
    /// The select operation was aborted due to an interrupt or IO error
    IO_ERROR,
 };
 /// A structure representing a single fish process. Contains variables for tracking process state
 /// and the process argument list. Actually, a fish process can be either a regular external
 /// process, an internal builtin which may or may not spawn a fake IO process during execution, a
@ -239,19 +249,20 @@ class job_t {
    /// The job is in a stopped state
    bool is_stopped() const;
    // (This function would just be called `continue` but that's obviously a reserved keyword)
    /// Resume a (possibly) stopped job. Puts job in the foreground.  If cont is true, restore the
    /// saved terminal modes and send the process group a SIGCONT signal to wake it up before we
    /// block.
    ///
-    /// \param cont Whether the function should wait for the job to complete before returning
+    /// \param send_sigcont Whether SIGCONT should be sent to the job if it is in the foreground.
-    // (This would just be called `continue` but that's obviously a reserved keyword)
+    void continue_job(bool send_sigcont);
    void continue_job(bool cont);
    /// Promotes the job to the front of the job list.
    void promote();
    /// Send the specified signal to all processes in this job.
-    int signal(int signal);
+    /// \return true on success, false on failure.
    bool signal(int signal);
    /// Return the job instance matching this unique job id.
    /// If id is 0 or less, return the last job used.