Merge 'Rearchitect WSL compatibility from SIGCONT to keepalive process'

This merges a sequence of commits that undoes the SIGCONT orchestration used for WSL compatibility. The essential problem is this: In Unix and Linux, exited processes are still valid until it is reaped; you can, say, make an exited process a group leader. But in Windows, when a process exits, it is gone, and most syscalls (other than, say, waitpid) fail for it. This is known as the WSL Rick Grimes problem. This manifests as various race conditions in WSL between a parent operating on a child, and the child exiting. Prior to this merge, these were addressed by having the child wait for the parent to send it a SIGCONT. This resolved the race. This merge removes this approach and replaces it with a simpler mechanism that leverages the existing keepalive machinery. A keepalive process is created for all platforms when we have a pipeline that contains a builtin. This is necessary to keep the whole process group alive. The fix is, on WSL, we always create a keepalive and make it the group leader. Because the keepalive does not call exec and its lifetime is bound to a C++ stack frame, it is easy to resolve the race. This improves performance a bit (except on WSL), since child processes no longer have to synchronize with the parent process, but the big win is simplicity. This removes the notion of the single global stopped child, of which there could only be one, and which had be resumed at the right time(s), of which there were several.
2025-02-21 12:58:29 +08:00 · 2018-02-07 12:57:01 -08:00 · 2018-02-07 12:57:01 -08:00 · 56a46a0bab
commit 56a46a0bab
parent a04a6d116e cb03be9fe6
5 changed files with 66 additions and 122 deletions
--- a/src/common.cpp
+++ b/src/common.cpp
@ -1997,6 +1997,25 @@ void assert_is_locked(void *vmutex, const char *who, const char *caller) {
    }
 }

+/// Detect if we are Windows Subsystem for Linux by inspecting /proc/sys/kernel/osrelease
+/// and checking if "Microsoft" is in the first line.
+/// See https://github.com/Microsoft/WSL/issues/423
+bool is_windows_subsystem_for_linux() {
+    ASSERT_IS_NOT_FORKED_CHILD();
+    static bool s_is_wsl = false;
+    static std::once_flag oflag;
+    std::call_once(oflag, []() {
+        // 'e' sets CLOEXEC if possible.
+        FILE *fp = fopen("/proc/sys/kernel/osrelease", "re");
+        if (fp) {
+            char buff[256];
+            if (fgets(buff, sizeof buff, fp)) s_is_wsl = (strstr(buff, "Microsoft") != NULL);
+            fclose(fp);
+        }
+    });
+    return s_is_wsl;
+}
+
 template <typename CharType_t>
 static CharType_t **make_null_terminated_array_helper(
    const std::vector<std::basic_string<CharType_t> > &argv) {
--- a/src/common.h
+++ b/src/common.h
@ -728,6 +728,9 @@ void assert_is_not_forked_child(const char *who);
 #define ASSERT_IS_NOT_FORKED_CHILD_TRAMPOLINE(x) assert_is_not_forked_child(x)
 #define ASSERT_IS_NOT_FORKED_CHILD() ASSERT_IS_NOT_FORKED_CHILD_TRAMPOLINE(__FUNCTION__)

+/// Return whether we are running in Windows Subsystem for Linux.
+bool is_windows_subsystem_for_linux();
+
 extern "C" {
 __attribute__((noinline)) void debug_thread_error(void);
 }
--- a/src/exec.cpp
+++ b/src/exec.cpp
@ -397,13 +397,11 @@ void internal_exec(job_t *j, const io_chain_t &&all_ios) {

 /// Execute an internal builtin. Given a parser, a job within that parser, and a process within that
 /// job corresponding to a builtin, execute the builtin with the given streams. If pipe_read is set,
-/// assign stdin to it; otherwise infer stdin from the IO chain. unblock_previous is a hack used to
-/// prevent jobs from finishing; see commit cdb72b7024.
+/// assign stdin to it; otherwise infer stdin from the IO chain.
 /// return true on success, false if there is an exec error.
 static bool exec_internal_builtin_proc(parser_t &parser, job_t *j, process_t *p,
                                       const io_pipe_t *pipe_read, const io_chain_t &proc_io_chain,
-                                       io_streams_t &streams,
-                                       const std::function<void(void)> &unblock_previous) {
+                                       io_streams_t &streams) {
    assert(p->type == INTERNAL_BUILTIN && "Process must be a builtin");
    int local_builtin_stdin = STDIN_FILENO;
    bool close_stdin = false;
@ -492,9 +490,7 @@ static bool exec_internal_builtin_proc(parser_t &parser, job_t *j, process_t *p,
    const int fg = j->get_flag(JOB_FOREGROUND);
    j->set_flag(JOB_FOREGROUND, false);

-    // Main loop may need to perform a blocking read from previous command's output.
-    // Make sure read source is not blocked.
-    unblock_previous();
+    // Note this call may block for a long time, while the builtin performs I/O.
    p->status = builtin_run(parser, p->get_argv(), streams);

    // Restore the fg flag, which is temporarily set to false during builtin
@ -576,14 +572,21 @@ void exec_job(parser_t &parser, job_t *j) {
        }
    }

+    // When running under WSL, create a keepalive process unconditionally if our first process is external.
+    // This is because WSL does not permit joining the pgrp of an exited process.
+    // (see https://github.com/Microsoft/WSL/issues/2786), also fish PR #4676
+    if (j->processes.front()->type == EXTERNAL && is_windows_subsystem_for_linux())
+        needs_keepalive = true;
+
    if (needs_keepalive) {
        // Call fork. No need to wait for threads since our use is confined and simple.
+        pid_t parent_pid = getpid();
        keepalive.pid = execute_fork(false);
        if (keepalive.pid == 0) {
            // Child
            keepalive.pid = getpid();
            child_set_group(j, &keepalive);
-            pause();
+            run_as_keepalive(parent_pid);
            exit_without_destructors(0);
        } else {
            // Parent
@ -606,10 +609,6 @@ void exec_job(parser_t &parser, job_t *j) {
    //
    // We are careful to set these to -1 when closed, so if we exit the loop abruptly, we can still
    // close them.
-    bool pgrp_set = false;
-    // This is static since processes can block on input/output across jobs the main exec_job loop
-    // is only ever run in a single thread, so this is OK.
-    static pid_t blocked_pid = -1;
    int pipe_current_read = -1, pipe_current_write = -1, pipe_next_read = -1;
    for (std::unique_ptr<process_t> &unique_p : j->processes) {
        if (exec_error) {
@ -630,7 +629,6 @@ void exec_job(parser_t &parser, job_t *j) {
        // Set to true if we end up forking for this process.
        bool child_forked = false;
        bool child_spawned = false;
-        bool block_child = true;

        // The pipes the current process write to and read from. Unfortunately these can't be just
        // allocated on the stack, since j->io wants shared_ptr.
@ -736,23 +734,10 @@ void exec_job(parser_t &parser, job_t *j) {
        // a pipeline.
        shared_ptr<io_buffer_t> block_output_io_buffer;

-        // Used to SIGCONT the previously SIGSTOP'd process so the main loop or next command in job
-        // can read from its output.
-        auto unblock_previous = [&j]() {
-            // We've already called waitpid after forking the child, so we've guaranteed that
-            // they're already SIGSTOP'd. Otherwise we'd be risking a deadlock because we can call
-            // SIGCONT before they've actually stopped, and they'll remain suspended indefinitely.
-            if (blocked_pid != -1) {
-                debug(2, L"Unblocking process %d.\n", blocked_pid);
-                kill(blocked_pid, SIGCONT);
-                blocked_pid = -1;
-            }
-        };
-
        // This is the io_streams we pass to internal builtins.
        std::unique_ptr<io_streams_t> builtin_io_streams(new io_streams_t(stdout_read_limit));

-        auto do_fork = [&j, &p, &pid, &exec_error, &process_net_io_chain, &block_child,
+        auto do_fork = [&j, &p, &pid, &exec_error, &process_net_io_chain,
                        &child_forked](bool drain_threads, const char *fork_type,
                                       std::function<void()> child_action) -> bool {
            pid = execute_fork(drain_threads);
@ -760,16 +745,7 @@ void exec_job(parser_t &parser, job_t *j) {
                // This is the child process. Setup redirections, print correct output to
                // stdout and stderr, and then exit.
                p->pid = getpid();
-                blocked_pid = -1;
                child_set_group(j, p);
-                // Make child processes pause after executing setup_child_process() to give
-                // down-chain commands in the job a chance to join their process group and read
-                // their pipes. The process will be resumed when the next command in the chain is
-                // started.
-                if (block_child) {
-                    kill(p->pid, SIGSTOP);
-                }
-                // The parent will wake us up when we're ready to execute.
                setup_child_process(p, process_net_io_chain);
                child_action();
                DIE("Child process returned control to do_fork lambda!");
@ -785,10 +761,8 @@ void exec_job(parser_t &parser, job_t *j) {
                debug(2, L"Fork #%d, pid %d: %s for '%ls'", g_fork_count, pid, fork_type,
                      p->argv0());
                child_forked = true;
-                if (block_child) {
-                    debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
-                }
                p->pid = pid;
+                set_child_group(j, p->pid);
            }

            return true;
@ -858,7 +832,7 @@ void exec_job(parser_t &parser, job_t *j) {

            case INTERNAL_BUILTIN: {
                if (!exec_internal_builtin_proc(parser, j, p, pipe_read.get(), process_net_io_chain,
-                                                *builtin_io_streams, unblock_previous)) {
+                                                *builtin_io_streams)) {
                    exec_error = true;
                }
                break;
@ -944,9 +918,8 @@ void exec_job(parser_t &parser, job_t *j) {
                bool must_fork = redirection_is_to_real_file(stdout_io.get()) ||
                                 redirection_is_to_real_file(stderr_io.get());
                if (!must_fork && p->is_last_in_job) {
-                    // We are handling reads directly in the main loop. Make sure source is
-                    // unblocked. Note that we may still end up forking.
-                    unblock_previous();
+                    // We are handling reads directly in the main loop. Note that we may still end
+                    // up forking.
                    const bool stdout_is_to_buffer = stdout_io && stdout_io->io_mode == IO_BUFFER;
                    const bool no_stdout_output = stdout_buffer.empty();
                    const bool no_stderr_output = stderr_buffer.empty();
@ -1099,8 +1072,9 @@ void exec_job(parser_t &parser, job_t *j) {
                }

                // This is the parent process. Store away information on the child, and possibly
-                // fice it control over the terminal.
+                // give it control over the terminal.
                p->pid = pid;
+                set_child_group(j, p->pid);
                break;
            }

@ -1111,51 +1085,6 @@ void exec_job(parser_t &parser, job_t *j) {
            }
        }

-        bool child_blocked = block_child && child_forked;
-        if (child_blocked) {
-            // We have to wait to ensure the child has set their progress group and is in SIGSTOP
-            // state otherwise, we can later call SIGCONT before they call SIGSTOP and they'll be
-            // blocked indefinitely. The child is SIGSTOP'd and is guaranteed to have called
-            // child_set_group() at this point. but we only need to call set_child_group for the
-            // first process in the group. If needs_keepalive is set, this has already been called
-            // for the keepalive process.
-            int result;
-            int pid_status;
-            while ((result = waitpid(p->pid, &pid_status, WUNTRACED)) == -1 && errno == EINTR) {
-                // This could be a superfluous interrupt or Ctrl+C at the terminal In all cases, it
-                // is OK to retry since the forking code above is specifically designed to never,
-                // ever hang/block in a child process before the SIGSTOP call is reached.
-                ;  // do nothing
-            }
-            if (result == -1) {
-                exec_error = true;
-                debug(1, L"waitpid(%d) call in unblock_pid failed:!\n", p->pid);
-                wperror(L"waitpid");
-                break;
-            }
-            // We are not unblocking the child via SIGCONT just yet to give the next process a
-            // chance to open the pipes and join the process group. We only unblock the last process
-            // in the job chain because no one awaits it.
-        }
-        // Regardless of whether the child blocked or not: only once per job, and only once we've
-        // actually executed an external command.
-        if ((child_spawned || child_forked) && !pgrp_set) {
-            // This should be called after waitpid if child_forked and pipes_to_next_command it can
-            // be called at any time if child_spawned.
-            set_child_group(j, p->pid);
-            // we can't rely on p->is_first_in_job because a builtin may have been the first.
-            pgrp_set = true;
-        }
-        // If the command we ran _before_ this one was SIGSTOP'd to let this one catch up, unblock
-        // it now. this must be after the wait_pid on the process we just started, if any.
-        unblock_previous();
-
-        if (child_blocked) {
-            // Store the newly-blocked command's PID so that it can be SIGCONT'd once the next
-            // process in the chain is started. That may be in this job or in another job.
-            blocked_pid = p->pid;
-        }
-
        // Close the pipe the current process uses to read from the previous process_t.
        if (pipe_current_read >= 0) {
            exec_close(pipe_current_read);
@ -1167,11 +1096,6 @@ void exec_job(parser_t &parser, job_t *j) {
            exec_close(pipe_current_write);
            pipe_current_write = -1;
        }
-
-        // Unblock the last process because there's no need for it to stay SIGSTOP'd for anything.
-        if (p->is_last_in_job) {
-            unblock_previous();
-        }
    }

    // Clean up any file descriptors we left open.
--- a/src/postfork.cpp
+++ b/src/postfork.cpp
@ -60,36 +60,16 @@ static void debug_safe_int(int level, const char *format, int val) {
 }

 /// Called only by the child to set its own process group (possibly creating a new group in the
-/// process if it is the first in a JOB_CONTROL job. The parent will wait for this to finish.
-/// A process that isn't already in control of the terminal can't give itself control of the
-/// terminal without hanging, but it's not right for the child to try and give itself control
-/// from the very beginning because the parent may not have gotten around to doing so yet. Let
-/// the parent figure it out; if the child doesn't have terminal control and it later tries to
-/// read from the terminal, the kernel will send it SIGTTIN and it'll hang anyway.
-/// The key here is that the parent should transfer control of the terminal (if appropriate)
-/// prior to sending the child SIGCONT to wake it up to exec.
-///
+/// process if it is the first in a JOB_CONTROL job.
 /// Returns true on sucess, false on failiure.
 bool child_set_group(job_t *j, process_t *p) {
    bool retval = true;
-
    if (j->get_flag(JOB_CONTROL)) {
        // New jobs have the pgid set to -2
        if (j->pgid == -2) {
            j->pgid = p->pid;
        }
-        // Retry on EPERM because there's no way that a child cannot join an existing progress group
-        // because we are SIGSTOPing the previous job in the chain. Sometimes we have to try a few
-        // times to get the kernel to see the new group. (Linux 4.4.0)
-        int failure = setpgid(p->pid, j->pgid);
-        while (failure == -1 && (errno == EPERM || errno == EINTR)) {
-            debug_safe(4, "Retrying setpgid in child process");
-            failure = setpgid(p->pid, j->pgid);
-        }
-        // TODO: Figure out why we're testing whether the pgid is correct after attempting to
-        // set it failed. This was added in commit 4e912ef8 from 2012-02-27.
-        failure = failure && getpgid(p->pid) != j->pgid;
-        if (failure) {  //!OCLINT(collapsible if statements)
+        if (setpgid(p->pid, j->pgid) < 0) {
            char pid_buff[128];
            char job_id_buff[128];
            char getpgid_buff[128];
@ -112,7 +92,7 @@ bool child_set_group(job_t *j, process_t *p) {
            retval = false;
        }
    } else {
-        // This is probably stays unused in the child.
+        // The child does not actualyl use this field.
        j->pgid = getpgrp();
    }

@ -123,11 +103,7 @@ bool child_set_group(job_t *j, process_t *p) {
 /// guaranteeing the job control process group has been created and that the child belongs to the
 /// correct process group. Here we can update our job_t structure to reflect the correct process
 /// group in the case of JOB_CONTROL, and we can give the new process group control of the terminal
-/// if it's to run in the foreground. Note that we can guarantee the child won't try to read from
-/// the terminal before we've had a chance to run this code, because we haven't woken them up with a
-/// SIGCONT yet. This musn't be called as a part of setup_child_process because that can hang
-/// indefinitely until data is available to read/write in the case of IO_FILE, which means we'll
-/// never reach our SIGSTOP and everything hangs.
+/// if it's to run in the foreground.
 bool set_child_group(job_t *j, pid_t child_pid) {
    bool retval = true;

@ -136,6 +112,13 @@ bool set_child_group(job_t *j, pid_t child_pid) {
        if (j->pgid == -2) {
            j->pgid = child_pid;
        }
+        // The parent sets the child's group. This incurs the well-known unavoidable race with the
+        // child exiting, so ignore ESRCH and EPERM (in case the pid was recycled).
+        if (setpgid(child_pid, j->pgid) < 0) {
+            if (errno != ESRCH && errno != EPERM) {
+                safe_perror("setpgid");
+            }
+        }
    } else {
        j->pgid = getpgrp();
    }
@ -551,3 +534,15 @@ bool do_builtin_io(const char *out, size_t outlen, const char *err, size_t errle
    errno = saved_errno;
    return success;
 }
+
+void run_as_keepalive(pid_t parent_pid) {
+    // Run this process as a keepalive. In typical usage the parent process will kill us. However
+    // this may not happen if the parent process exits abruptly, either via kill or exec. What we do
+    // is poll our ppid() and exit when it differs from parent_pid. We can afford to do this with
+    // low frequency because in the great majority of cases, fish will kill(9) us.
+    for (;;) {
+        // Note sleep is async-safe.
+        if (sleep(1)) break;
+        if (getppid() != parent_pid) break;
+    }
+}
--- a/src/postfork.h
+++ b/src/postfork.h
@ -47,6 +47,9 @@ bool do_builtin_io(const char *out, size_t outlen, const char *err, size_t errle
 void safe_report_exec_error(int err, const char *actual_cmd, const char *const *argv,
                            const char *const *envv);

+/// Runs the process as a keepalive, until the parent process given by parent_pid exits.
+void run_as_keepalive(pid_t parent_pid);
+
 #if FISH_USE_POSIX_SPAWN
 /// Initializes and fills in a posix_spawnattr_t; on success, the caller should destroy it via
 /// posix_spawnattr_destroy.