Changed how process groups are assigned to child processes

There is no more race condition between parent and child with
regards to setting the process groups. Each child sets it for themselves
and then blocks indefinitely until the parent does what it needs to for
them (having waited for them to set their process groups). They are not
SIGCONT'd until the next process in the chain (if any) starts so that
that process can join their process group and open the pipes.
This commit is contained in:
Mahmoud Al-Qudsi 2017-07-26 21:45:22 -05:00 committed by Kurtis Rader
parent c81cf56c0b
commit 25afc9b377
5 changed files with 120 additions and 125 deletions

View File

@ -21,6 +21,7 @@
#include <algorithm>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <type_traits>
#include <vector>
@ -383,21 +384,6 @@ void exec_job(parser_t &parser, job_t *j) {
return;
}
auto unblock_pid = [] (pid_t blocked_pid) {
//this is correct, except there's a race condition if the child hasn't yet SIGSTOP'd
//in that case, they never receive the SIGCONT
pid_t pid_status{};
if (waitpid(blocked_pid, &pid_status, WUNTRACED) != -1) {
// if (WIFSTOPPED(pid_status)) {
debug(2, L"Unblocking process %d.\n", blocked_pid);
kill(blocked_pid, SIGCONT);
return true;
// }
}
debug(2, L"waitpid call in unblock_pid failed!\n");
return false;
};
debug(4, L"Exec job '%ls' with id %d", j->command_wcstr(), j->job_id);
// Verify that all IO_BUFFERs are output. We used to support a (single, hacked-in) magical input
@ -485,14 +471,14 @@ void exec_job(parser_t &parser, job_t *j) {
if (keepalive.pid == 0) {
// Child
keepalive.pid = getpid();
set_child_group(j, &keepalive, 1);
child_set_group(j, &keepalive);
pause();
exit_without_destructors(0);
} else {
// Parent
debug(2, L"Fork #%d, pid %d: keepalive fork for '%ls'", g_fork_count, keepalive.pid,
j->command_wcstr());
set_child_group(j, &keepalive, 0);
set_child_group(j, keepalive.pid);
}
}
@ -527,7 +513,8 @@ void exec_job(parser_t &parser, job_t *j) {
// See if we need a pipe.
const bool pipes_to_next_command = !p->is_last_in_job;
bool command_blocked = false;
//set to true if we end up forking for this process
bool child_forked = false;
// The pipes the current process write to and read from. Unfortunately these can't be just
// allocated on the stack, since j->io wants shared_ptr.
@ -632,12 +619,16 @@ void exec_job(parser_t &parser, job_t *j) {
// This is the IO buffer we use for storing the output of a block or function when it is in
// a pipeline.
shared_ptr<io_buffer_t> block_output_io_buffer;
auto unblock_previous = [&] () {
auto unblock_previous = [&j] () {
//we've already called waitpid after forking the child, so we've guaranteed that they're
//already SIGSTOP'd. Otherwise we'd be risking a deadlock because we can call SIGCONT before
//they've actually stopped, and they'll remain suspended indefinitely.
if (blocked_pid != -1) {
//now that next command in the chain has been started, unblock the previous command
unblock_pid(blocked_pid);
blocked_pid = -1;
debug(2, L"Unblocking process %d.\n", blocked_pid);
kill(blocked_pid, SIGCONT);
}
blocked_pid = -1;
};
// This is the io_streams we pass to internal builtins.
@ -893,14 +884,10 @@ void exec_job(parser_t &parser, job_t *j) {
// This is the child process. Write out the contents of the pipeline.
p->pid = getpid();
setup_child_process(j, p, process_net_io_chain);
// Start child processes that are part of a chain in a stopped state
// to ensure that they are still running when the next command in the
// chain is started.
// Make child processes pause after executing setup_child_process() to give down-chain
// commands in the job a chance to join their process group and read their pipes.
// The process will be resumed when the next command in the chain is started.
// Note that this may span multiple jobs, as jobs can read from each other.
if (pipes_to_next_command) {
kill(p->pid, SIGSTOP);
}
kill(p->pid, SIGSTOP);
exec_write_and_exit(block_output_io_buffer->fd, buffer, count, status);
} else {
@ -908,14 +895,9 @@ void exec_job(parser_t &parser, job_t *j) {
// possibly give it control over the terminal.
debug(2, L"Fork #%d, pid %d: internal block or function for '%ls'",
g_fork_count, pid, p->argv0());
if (pipes_to_next_command) {
//it actually blocked itself after forking above, but print in here for output
//synchronization & so we can assign command_blocked in the correct address space
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
command_blocked = true;
}
child_forked = true;
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
p->pid = pid;
set_child_group(j, p, 0);
}
} else {
@ -1030,14 +1012,11 @@ void exec_job(parser_t &parser, job_t *j) {
// stdout and stderr, and then exit.
p->pid = getpid();
setup_child_process(j, p, process_net_io_chain);
// Start child processes that are part of a chain in a stopped state
// to ensure that they are still running when the next command in the
// chain is started.
// Make child processes pause after executing setup_child_process() to give down-chain
// commands in the job a chance to join their process group and read their pipes.
// The process will be resumed when the next command in the chain is started.
// Note that this may span multiple jobs, as jobs can read from each other.
if (pipes_to_next_command) {
kill(p->pid, SIGSTOP);
}
kill(p->pid, SIGSTOP);
do_builtin_io(outbuff, outbuff_len, errbuff, errbuff_len);
exit_without_destructors(p->status);
} else {
@ -1045,15 +1024,9 @@ void exec_job(parser_t &parser, job_t *j) {
// possibly give it control over the terminal.
debug(2, L"Fork #%d, pid %d: internal builtin for '%ls'", g_fork_count, pid,
p->argv0());
if (pipes_to_next_command) {
//it actually blocked itself after forking above, but print in here for output
//synchronization & so we can assign command_blocked in the correct address space
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
command_blocked = true;
}
child_forked = true;
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
p->pid = pid;
set_child_group(j, p, 0);
}
}
@ -1127,15 +1100,10 @@ void exec_job(parser_t &parser, job_t *j) {
// This is the child process.
p->pid = getpid();
setup_child_process(j, p, process_net_io_chain);
// Start child processes that are part of a chain in a stopped state
// to ensure that they are still running when the next command in the
// chain is started.
// Make child processes pause after executing setup_child_process() to give down-chain
// commands in the job a chance to join their process group and read their pipes.
// The process will be resumed when the next command in the chain is started.
// Note that this may span multiple jobs, as jobs can read from each other.
if (pipes_to_next_command) {
kill(p->pid, SIGSTOP);
}
kill(p->pid, SIGSTOP);
safe_launch_process(p, actual_cmd, argv, envv);
// safe_launch_process _never_ returns...
@ -1147,19 +1115,14 @@ void exec_job(parser_t &parser, job_t *j) {
job_mark_process_as_failed(j, p);
exec_error = true;
}
if (pipes_to_next_command) {
//it actually blocked itself after forking above, but print in here for output
//synchronization & so we can assign command_blocked in the correct address space
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
command_blocked = true;
}
child_forked = true;
debug(2, L"Blocking process %d waiting for next command in chain.\n", pid);
}
}
// This is the parent process. Store away information on the child, and possibly
// fice it control over the terminal.
p->pid = pid;
set_child_group(j, p, 0);
break;
}
@ -1170,10 +1133,31 @@ void exec_job(parser_t &parser, job_t *j) {
}
}
//if the command we ran before this one was SIGSTOP'd to let this one catch up, unblock it now.
if (child_forked) {
//we have to wait to ensure the child has set their progress group and is in SIGSTOP state
//otherwise, we can later call SIGCONT before they call SIGSTOP and they'll be blocked indefinitely.
pid_t pid_status{};
if (waitpid(p->pid, &pid_status, WUNTRACED) != -1) {
//the child is SIGSTOP'd and is guaranteed to have called child_set_group() at this point.
set_child_group(j, p->pid); //update our own process group info to match
//we are not unblocking the child via SIGCONT just yet to give the next process a chance to open
//the pipes and join the process group. We only unblock the last process in the job chain because
//no one awaits it.
if (!pipes_to_next_command)
{
kill(p->pid, SIGCONT);
}
}
else {
debug(2, L"waitpid(%d) call in unblock_pid failed:!\n", p->pid);
wperror(L"waitpid");
}
}
//if the command we ran _before_ this one was SIGSTOP'd to let this one catch up, unblock it now.
//this must be after the wait_pid on the process we just started, if any.
unblock_previous();
if (command_blocked) {
if (child_forked) {
//store the newly-blocked command's PID so that it can be SIGCONT'd once the next process
//in the chain is started. That may be in this job or in another job.
blocked_pid = p->pid;

View File

@ -59,14 +59,18 @@ static void debug_safe_int(int level, const char *format, int val) {
debug_safe(level, format, buff);
}
/// This function should be called by both the parent process and the child right after fork() has
/// been called. If job control is enabled, the child is put in the jobs group, and if the child is
/// also in the foreground, it is also given control of the terminal. When called in the parent
/// process, this function may fail, since the child might have already finished and called exit.
/// The parent process may safely ignore the exit status of this call.
/// Called only by the child to set its own process group (possibly creating a new group in the
/// process if it is the first in a JOB_CONTROL job. The parent will wait for this to finish.
/// A process that isn't already in control of the terminal can't give itself control of the
/// terminal without hanging, but it's not right for the child to try and give itself control
/// from the very beginning because the parent may not have gotten around to doing so yet. Let
/// the parent figure it out; if the child doesn't have terminal control and it later tries to
/// read from the terminal, the kernel will send it SIGTTIN and it'll hang anyway.
/// The key here is that the parent should transfer control of the terminal (if appropriate)
/// prior to sending the child SIGCONT to wake it up to exec.
///
/// Returns true on sucess, false on failiure.
bool set_child_group(job_t *j, process_t *p, int print_errors) {
bool child_set_group(job_t *j, process_t *p) {
bool retval = true;
if (j->get_flag(JOB_CONTROL)) {
@ -74,32 +78,53 @@ bool set_child_group(job_t *j, process_t *p, int print_errors) {
if (j->pgid == -2) {
j->pgid = p->pid;
}
int failure = setpgid(p->pid, j->pgid);
// TODO: Figure out why we're testing whether the pgid is correct after attempting to
// set it failed. This was added in commit 4e912ef8 from 2012-02-27.
failure = failure && getpgid(p->pid) != j->pgid;
if (failure) { //!OCLINT(collapsible if statements)
char pid_buff[128];
char job_id_buff[128];
char getpgid_buff[128];
char job_pgid_buff[128];
char argv0[64];
char command[64];
if (setpgid(p->pid, j->pgid)) { //!OCLINT(collapsible if statements)
// TODO: Figure out why we're testing whether the pgid is correct after attempting to
// set it failed. This was added in commit 4e912ef8 from 2012-02-27.
if (getpgid(p->pid) != j->pgid && print_errors) {
char pid_buff[128];
char job_id_buff[128];
char getpgid_buff[128];
char job_pgid_buff[128];
char argv0[64];
char command[64];
format_long_safe(pid_buff, p->pid);
format_long_safe(job_id_buff, j->job_id);
format_long_safe(getpgid_buff, getpgid(p->pid));
format_long_safe(job_pgid_buff, j->pgid);
narrow_string_safe(argv0, p->argv0());
narrow_string_safe(command, j->command_wcstr());
format_long_safe(pid_buff, p->pid);
format_long_safe(job_id_buff, j->job_id);
format_long_safe(getpgid_buff, getpgid(p->pid));
format_long_safe(job_pgid_buff, j->pgid);
narrow_string_safe(argv0, p->argv0());
narrow_string_safe(command, j->command_wcstr());
debug_safe(
1, "Could not send own process %s, '%s' in job %s, '%s' from group %s to group %s",
pid_buff, argv0, job_id_buff, command, getpgid_buff, job_pgid_buff);
debug_safe(
1, "Could not send process %s, '%s' in job %s, '%s' from group %s to group %s",
pid_buff, argv0, job_id_buff, command, getpgid_buff, job_pgid_buff);
safe_perror("setpgid");
retval = false;
}
} else {
//this is probably stays unused in the child
j->pgid = getpgrp();
}
safe_perror("setpgid");
retval = false;
}
return retval;
}
/// Called only by the parent only after a child forks and successfully calls child_set_group, guaranteeing
/// the job control process group has been created and that the child belongs to the correct process group.
/// Here we can update our job_t structure to reflect the correct process group in the case of JOB_CONTROL,
/// and we can give the new process group control of the terminal if it's to run in the foreground. Note that
/// we can guarantee the child won't try to read from the terminal before we've had a chance to run this code,
/// because we haven't woken them up with a SIGCONT yet.
bool set_child_group(job_t *j, pid_t child_pid) {
bool retval = true;
if (j->get_flag(JOB_CONTROL)) {
// New jobs have the pgid set to -2
if (j->pgid == -2) {
j->pgid = child_pid;
}
} else {
j->pgid = getpgrp();
@ -107,33 +132,16 @@ bool set_child_group(job_t *j, process_t *p, int print_errors) {
if (j->get_flag(JOB_TERMINAL) && j->get_flag(JOB_FOREGROUND)) { //!OCLINT(early exit)
if (tcgetpgrp(STDIN_FILENO) == j->pgid) {
//we've already assigned the process group control of the terminal when the first process in the job
//was started. There's no need to do so again, and on some platforms this can cause an EPERM error.
//In addition, if we've given control of the terminal to a process group, attempting to call tcsetpgrp
//from the background will cause SIGTTOU to be sent to everything in our process group (unless we
//handle it)..
debug(4, L"Process group %d already has control of terminal\n", j->pgid);
}
else {
debug(4, L"Attempting bring process group to foreground via tcsetpgrp for job->pgid %d\n", j->pgid);
debug(4, L"caller session id: %d, pgid %d has session id: %d\n", getsid(0), j->pgid, getsid(j->pgid));
int result = -1;
errno = EINTR;
while (result == -1 && errno == EINTR) {
signal_block(true);
result = tcsetpgrp(STDIN_FILENO, j->pgid);
signal_unblock(true);
}
if (result == -1) {
if (errno == ENOTTY) redirect_tty_output();
if (print_errors) {
char job_id_buff[64];
char command_buff[64];
char job_pgid_buff[128];
format_long_safe(job_id_buff, j->job_id);
narrow_string_safe(command_buff, j->command_wcstr());
format_long_safe(job_pgid_buff, j->pgid);
debug_safe(1, "Could not send job %s ('%s') with pgid %s to foreground", job_id_buff,
command_buff, job_pgid_buff);
safe_perror("tcsetpgrp");
retval = false;
}
}
//no need to duplicate the code here, a function already exists that does just this
retval = terminal_give_to_job(j, false /*new job, so not continuing*/);
}
}
@ -242,10 +250,10 @@ static int handle_child_io(const io_chain_t &io_chain) {
}
int setup_child_process(job_t *j, process_t *p, const io_chain_t &io_chain) {
bool ok = true;
bool ok = false;
if (p) {
ok = set_child_group(j, p, 1);
ok = child_set_group(j, p);
}
if (ok) {

View File

@ -18,7 +18,8 @@ class io_chain_t;
class job_t;
class process_t;
bool set_child_group(job_t *j, process_t *p, int print_errors);
bool set_child_group(job_t *j, pid_t child_pid); //called by parent
bool child_set_group(job_t *j, process_t *p); //called by child
/// Initialize a new child process. This should be called right away after forking in the child
/// process. If job control is enabled for this job, the process is put in the process group of the

View File

@ -782,7 +782,7 @@ static void read_try(job_t *j) {
/// \param j The job to give the terminal to.
/// \param cont If this variable is set, we are giving back control to a job that has previously
/// been stopped. In that case, we need to set the terminal attributes to those saved in the job.
static bool terminal_give_to_job(job_t *j, int cont) {
bool terminal_give_to_job(job_t *j, int cont) {
errno = 0;
if (j->pgid == 0) {
debug(2, "terminal_give_to_job() returning early due to no process group");

View File

@ -365,3 +365,5 @@ void proc_pop_interactive();
int proc_format_status(int status);
#endif
bool terminal_give_to_job(job_t *j, int cont);