Correct a race in topic monitor

This fixes a race condition in the topic monitor. A thread may decide to
enter the wait queue, but before it does the generation list changes, and
so our thread will wait forever, resulting in a hang.

It also simplifies the implementation of the topic monitor considerably;
on reflection the whole "metagen" thing isn't providing any value and we
should just compare generations directly.

In the new design, we have a lock-protected list of current generations,
along with a boolean as to whether someone is reading from the pipe. The
reader (only one at a time) is responsible for broadcasting notifications
via a condition variable.
This commit is contained in:
ridiculousfish 2019-05-31 09:13:21 -07:00
parent d920a618de
commit 4e03d3c264
3 changed files with 111 additions and 89 deletions

View File

@ -635,6 +635,9 @@ class acquired_lock {
/// Create from a global lock. /// Create from a global lock.
/// This is used in weird cases where a global lock protects more than one piece of data. /// This is used in weird cases where a global lock protects more than one piece of data.
static acquired_lock from_global(std::mutex &lk, Data *v) { return acquired_lock{lk, v}; } static acquired_lock from_global(std::mutex &lk, Data *v) { return acquired_lock{lk, v}; }
/// \return a reference to the lock, for use with a condition variable.
std::unique_lock<std::mutex> &get_lock() { return lock; }
}; };
// A lock that owns a piece of data // A lock that owns a piece of data

View File

@ -26,6 +26,12 @@
/// pointless at-exit handler for the dtor. /// pointless at-exit handler for the dtor.
static topic_monitor_t *const s_principal = new topic_monitor_t(); static topic_monitor_t *const s_principal = new topic_monitor_t();
/// \return the metagen for a topic generation list.
/// The metagen is simply the sum of topic generations. Note it is monotone.
static generation_t metagen_for(const generation_list_t &lst) {
return std::accumulate(lst.begin(), lst.end(), generation_t{0});
}
topic_monitor_t &topic_monitor_t::principal() { topic_monitor_t &topic_monitor_t::principal() {
// Do not attempt to move s_principal to a function-level static, it needs to be accessed from a // Do not attempt to move s_principal to a function-level static, it needs to be accessed from a
// signal handler so it must not be lazily created. // signal handler so it must not be lazily created.
@ -71,9 +77,7 @@ void topic_monitor_t::post(topic_t topic) {
// Ignore EAGAIN and other errors (which conceivably could occur during shutdown). // Ignore EAGAIN and other errors (which conceivably could occur during shutdown).
} }
generation_list_t topic_monitor_t::updated_gens() { generation_list_t topic_monitor_t::updated_gens_in_data(acquired_lock<data_t> &data) {
auto current_gens = current_gen_.acquire();
// Atomically acquire the pending updates, swapping in 0. // Atomically acquire the pending updates, swapping in 0.
// If there are no pending updates (likely), just return. // If there are no pending updates (likely), just return.
// Otherwise CAS in 0 and update our topics. // Otherwise CAS in 0 and update our topics.
@ -82,7 +86,7 @@ generation_list_t topic_monitor_t::updated_gens() {
bool cas_success; bool cas_success;
do { do {
raw = pending_updates_.load(relaxed); raw = pending_updates_.load(relaxed);
if (raw == 0) return *current_gens; if (raw == 0) return data->current_gens;
cas_success = pending_updates_.compare_exchange_weak(raw, 0, relaxed, relaxed); cas_success = pending_updates_.compare_exchange_weak(raw, 0, relaxed, relaxed);
} while (!cas_success); } while (!cas_success);
@ -90,76 +94,88 @@ generation_list_t topic_monitor_t::updated_gens() {
auto topics = topic_set_t::from_raw(raw); auto topics = topic_set_t::from_raw(raw);
for (topic_t topic : topic_iter_t{}) { for (topic_t topic : topic_iter_t{}) {
if (topics.get(topic)) { if (topics.get(topic)) {
current_gens->at(topic) += 1; data->current_gens.at(topic) += 1;
FLOG(topic_monitor, "Updating topic", (int)topic, "to", current_gens->at(topic)); FLOG(topic_monitor, "Updating topic", (int)topic, "to", data->current_gens.at(topic));
} }
} }
return *current_gens; // Report our change.
data_notifier_.notify_all();
return data->current_gens;
} }
void topic_monitor_t::await_metagen(generation_t mgen) { generation_list_t topic_monitor_t::updated_gens() {
// Fast check of the metagen before taking the lock. If it's changed we're done. auto data = data_.acquire();
generation_t current = current_metagen(); return updated_gens_in_data(data);
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen, ": current", current); }
if (mgen != current) return;
// Take the lock (which may take a long time) and then check again. bool topic_monitor_t::try_update_gens_maybe_becoming_reader(generation_list_t *gens) {
std::unique_lock<std::mutex> locker{wait_queue_lock_}; bool become_reader = false;
current = current_metagen(); auto data = data_.acquire();
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen, ": current", current, for (;;) {
"acquired lock"); // See if the updated gen list has changed. If so we don't need to become the reader.
if (mgen != current) return; auto current = updated_gens_in_data(data);
FLOG(topic_monitor, "TID", thread_id(), "local mgen", metagen_for(*gens), ": current",
metagen_for(current));
if (*gens != current) {
*gens = current;
break;
}
// Our metagen hasn't changed. Push our metagen onto the queue, then wait until we're the // The generations haven't changed. Perhaps we become the reader.
// lowest. If multiple waiters are the lowest, then anyone can be the observer. if (!data->has_reader) {
// Note the reason for picking the lowest metagen is to avoid a priority inversion where a lower become_reader = true;
// metagen (therefore someone who should see changes) is blocked waiting for a higher metagen data->has_reader = true;
// (who has already seen the changes). break;
wait_queue_.push(mgen); }
while (wait_queue_.top() != mgen) { // Not the reader, wait until the reader notifies us and loop again.
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen, "releasing lock for", data_notifier_.wait(data.get_lock());
wait_queue_.top());
wait_queue_notifier_.wait(locker);
} }
wait_queue_.pop(); return become_reader;
}
// We now have the lowest metagen in the wait queue. Notice we still hold the lock. generation_list_t topic_monitor_t::await_gens(const generation_list_t &input_gens) {
// Read until the metagen changes. It may already have changed. generation_list_t gens = input_gens;
// Note because changes are coalesced, we can read a lot, potentially draining the pipe. while (gens == input_gens) {
current = current_metagen(); bool become_reader = try_update_gens_maybe_becoming_reader(&gens);
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen, "considering waiting for mgen", if (become_reader) {
current); // Now we are the reader. Read from the pipe, and then update with any changes.
while (mgen == current) { // Note we no longer hold the lock.
int fd = pipes_.read.fd(); assert(gens == input_gens &&
"Generations should not have changed if we are the reader.");
int fd = pipes_.read.fd();
#if TOPIC_MONITOR_TSAN_WORKAROUND #if TOPIC_MONITOR_TSAN_WORKAROUND
// Under tsan our notifying pipe is non-blocking, so we would busy-loop on the read() call // Under tsan our notifying pipe is non-blocking, so we would busy-loop on the read()
// until data is available (that is, fish would use 100% cpu while waiting for processes). // call until data is available (that is, fish would use 100% cpu while waiting for
// The select prevents that. // processes). The select prevents that.
fd_set fds; fd_set fds;
FD_ZERO(&fds); FD_ZERO(&fds);
FD_SET(fd, &fds); FD_SET(fd, &fds);
(void)select(fd + 1, &fds, nullptr, nullptr, nullptr /* timeout */); (void)select(fd + 1, &fds, nullptr, nullptr, nullptr /* timeout */);
#endif #endif
uint8_t ignored[PIPE_BUF]; uint8_t ignored[PIPE_BUF];
(void)read(fd, ignored, sizeof ignored); (void)read(fd, ignored, sizeof ignored);
current = current_metagen();
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen,
"read() complete, current mgen is", current);
}
// Release the lock and wake up the remaining waiters. // We are finished reading. We must stop being the reader, and post on the condition
FLOG(topic_monitor, "TID", thread_id(), "local mgen", mgen, "awakening all waiters"); // variable to wake up any other threads waiting for us to finish reading.
locker.unlock(); auto data = data_.acquire();
wait_queue_notifier_.notify_all(); gens = data->current_gens;
FLOG(topic_monitor, "TID", thread_id(), "local mgen", metagen_for(input_gens),
"read() complete, current mgen is", metagen_for(gens));
assert(data->has_reader && "We should be the reader");
data->has_reader = false;
data_notifier_.notify_all();
}
}
return gens;
} }
topic_set_t topic_monitor_t::check(generation_list_t *gens, topic_set_t topics, bool wait) { topic_set_t topic_monitor_t::check(generation_list_t *gens, topic_set_t topics, bool wait) {
if (topics.none()) return topics; if (topics.none()) return topics;
generation_list_t current = updated_gens();
topic_set_t changed{}; topic_set_t changed{};
for (;;) { for (;;) {
// Load the topic list and see if anything has changed. // Load the topic list and see if anything has changed.
generation_list_t current = updated_gens();
for (topic_t topic : topic_iter_t{}) { for (topic_t topic : topic_iter_t{}) {
if (topics.get(topic)) { if (topics.get(topic)) {
assert(gens->at(topic) <= current.at(topic) && assert(gens->at(topic) <= current.at(topic) &&
@ -176,9 +192,8 @@ topic_set_t topic_monitor_t::check(generation_list_t *gens, topic_set_t topics,
break; break;
} }
// Try again. Note that we use the metagen corresponding to the topic list we just // Wait until our gens change.
// inspected, not the current one (which may have updates since we checked). current = await_gens(current);
await_metagen(metagen_for(current));
} }
return changed; return changed;
} }

View File

@ -11,7 +11,6 @@
#include <condition_variable> #include <condition_variable>
#include <limits> #include <limits>
#include <numeric> #include <numeric>
#include <queue>
/** Topic monitoring support. Topics are conceptually "a thing that can happen." For example, /** Topic monitoring support. Topics are conceptually "a thing that can happen." For example,
delivery of a SIGINT, a child process exits, etc. It is possible to post to a topic, which means delivery of a SIGINT, a child process exits, etc. It is possible to post to a topic, which means
@ -73,47 +72,52 @@ class topic_monitor_t {
static_assert(sizeof(topic_set_raw_t) * CHAR_BIT >= enum_count<topic_t>(), static_assert(sizeof(topic_set_raw_t) * CHAR_BIT >= enum_count<topic_t>(),
"topic_set_raw is too small"); "topic_set_raw is too small");
/// The current topic generation list, protected by a mutex. Note this may be opportunistically // Some stuff that needs to be protected by the same lock.
/// updated at the point it is queried. struct data_t {
owning_lock<generation_list_t> current_gen_{{}}; /// The current generation list.
generation_list_t current_gens{};
/// Whether there is a thread currently reading from the notifier pipe.
bool has_reader{false};
};
owning_lock<data_t> data_{};
/// Condition variable for broadcasting notifications.
/// This is associated with data_'s mutex.
std::condition_variable data_notifier_{};
/// The set of topics which have pending increments. /// The set of topics which have pending increments.
/// This is managed via atomics. /// This is managed via atomics.
std::atomic<topic_set_raw_t> pending_updates_{}; std::atomic<topic_set_raw_t> pending_updates_{};
/// When a topic set is queried in a blocking way, the waiters are put into a queue. The waiter /// Self-pipes used to communicate changes.
/// with the smallest metagen is responsible for announcing the change to the rest of the /// The writer is a signal handler.
/// waiters. (The metagen is just the sum of the current generations.) Note that this is a /// "The reader" refers to a thread that wants to wait for changes. Only one thread can be the
/// max-heap that defaults to std::less; by using std::greater it becomes a min heap. This is /// reader at a given time.
/// protected by wait_queue_lock_.
std::priority_queue<generation_t, std::vector<generation_t>, std::greater<generation_t>>
wait_queue_;
/// Mutex guarding the wait queue.
std::mutex wait_queue_lock_{};
/// Condition variable for broadcasting notifications.
std::condition_variable wait_queue_notifier_{};
/// Pipes used to communicate changes from the signal handler.
autoclose_pipes_t pipes_; autoclose_pipes_t pipes_;
/// \return the metagen for a topic generation list. /// Apply any pending updates to the data.
/// The metagen is simply the sum of topic generations. Note it is monotone. /// This accepts data because it must be locked.
static inline generation_t metagen_for(const generation_list_t &lst) { /// \return the updated generation list.
return std::accumulate(lst.begin(), lst.end(), generation_t{0}); generation_list_t updated_gens_in_data(acquired_lock<data_t> &data);
}
/// Wait for the current metagen to become different from \p gen. /// Given a list of input generations, attempt to update them to something newer.
/// If it is already different, return immediately. /// If \p gens is older, then just return those by reference, and directly return false (not
void await_metagen(generation_t gen); /// becoming the reader).
/// If \p gens is current and there is not a reader, then do not update \p gens and return true,
/// indicating we should become the reader. Now it is our responsibility to read from the pipes
/// and notify on a change via the condition variable.
/// If \p gens is current, and there is already a reader, then wait until the reader notifies us
/// and try again.
bool try_update_gens_maybe_becoming_reader(generation_list_t *gens);
/// Return the current generation list, opportunistically applying any pending updates. /// Wait for some entry in the list of generations to change.
/// \return the new gens.
generation_list_t await_gens(const generation_list_t &input_gens);
/// \return the current generation list, opportunistically applying any pending updates.
generation_list_t updated_gens(); generation_list_t updated_gens();
/// \return the metagen for the current topic generation list.
inline generation_t current_metagen() { return metagen_for(updated_gens()); }
public: public:
topic_monitor_t(); topic_monitor_t();
~topic_monitor_t(); ~topic_monitor_t();