caddyhttp: Honor grace period in background (#5043)

* caddyhttp: Honor grace period in background This avoids blocking during config reloads. * Don't quit process until servers shut down * Make tests more likely to pass on fast CI (#5045) * caddyhttp: Even faster shutdowns Simultaneously shut down all HTTP servers, rather than one at a time. In practice there usually won't be more than 1 that lingers. But this code ensures that they all Shutdown() in their own goroutine and then we wait for them at the end (if exiting). We also wait for them to start up so we can be fairly confident the shutdowns have begun; i.e. old servers no longer accepting new connections. * Fix comment typo * Pull functions out of loop, for readability
2025-03-24 21:05:16 +08:00 · 2022-09-19 21:54:47 -06:00 · 2022-09-19 21:54:47 -06:00 · da8b7fe58f
commit da8b7fe58f
parent 0950ba4f0b
9 changed files with 78 additions and 10 deletions
--- a/caddy.go
+++ b/caddy.go
@ -31,6 +31,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/caddyserver/caddy/v2/notify"
@ -676,6 +677,10 @@ func Validate(cfg *Config) error {
 // Errors are logged along the way, and an appropriate exit
 // code is emitted.
 func exitProcess(ctx context.Context, logger *zap.Logger) {
+	// let the rest of the program know we're quitting
+	atomic.StoreInt32(exiting, 1)
+
+	// give the OS or service/process manager our 2 weeks' notice: we quit
 	if err := notify.Stopping(); err != nil {
 		Log().Error("unable to notify service manager of stopping state", zap.Error(err))
 	}
@ -739,6 +744,12 @@ func exitProcess(ctx context.Context, logger *zap.Logger) {
 	}()
 }

+var exiting = new(int32) // accessed atomically
+
+// Exiting returns true if the process is exiting.
+// EXPERIMENTAL API: subject to change or removal.
+func Exiting() bool { return atomic.LoadInt32(exiting) == 1 }
+
 // Duration can be an integer or a string. An integer is
 // interpreted as nanoseconds. If a string, it is a Go
 // time.Duration value such as `300ms`, `1.5h`, or `2h45m`;
--- a/caddytest/caddytest.go
+++ b/caddytest/caddytest.go
@ -43,7 +43,7 @@ type Defaults struct {

 // Default testing values
 var Default = Defaults{
-	AdminPort:          2019,
+	AdminPort:          2999, // different from what a real server also running on a developer's machine might be
 	Certifcates:        []string{"/caddy.localhost.crt", "/caddy.localhost.key"},
 	TestRequestTimeout: 5 * time.Second,
 	LoadRequestTimeout: 5 * time.Second,
--- a/caddytest/integration/caddyfile_test.go
+++ b/caddytest/integration/caddyfile_test.go
@ -16,6 +16,7 @@ func TestRespond(t *testing.T) {
  {
    http_port     9080
    https_port    9443
+    grace_period  1
  }
  
  localhost:9080 {
@ -37,6 +38,7 @@ func TestRedirect(t *testing.T) {
  {
    http_port     9080
    https_port    9443
+    grace_period  1
  }
  
  localhost:9080 {
@ -86,6 +88,7 @@ func TestReadCookie(t *testing.T) {
  {
    http_port     9080
    https_port    9443
+    grace_period  1
  }
  
  localhost:9080 {
@ -109,6 +112,7 @@ func TestReplIndex(t *testing.T) {
  {
    http_port     9080
    https_port    9443
+    grace_period  1
  }

  localhost:9080 {
--- a/caddytest/integration/handler_test.go
+++ b/caddytest/integration/handler_test.go
@ -13,6 +13,7 @@ func TestBrowse(t *testing.T) {
 	{
 		http_port     9080
 		https_port    9443
+		grace_period  1
 	}
 	http://localhost:9080 {
 		file_server browse
--- a/caddytest/integration/map_test.go
+++ b/caddytest/integration/map_test.go
@ -13,6 +13,7 @@ func TestMap(t *testing.T) {
 	tester.InitServer(`{
 		http_port     9080
 		https_port    9443
+		grace_period  1
 	}

 	localhost:9080 {
--- a/caddytest/integration/reverseproxy_test.go
+++ b/caddytest/integration/reverseproxy_test.go
@ -18,6 +18,7 @@ func TestSRVReverseProxy(t *testing.T) {
 	{
 		"apps": {
 		  "http": {
+			"grace_period": 1,
 			"servers": {
 			  "srv0": {
 				"listen": [
@ -50,6 +51,7 @@ func TestSRVWithDial(t *testing.T) {
 	{
 		"apps": {
 		  "http": {
+			"grace_period": 1,
 			"servers": {
 			  "srv0": {
 				"listen": [
@ -115,6 +117,7 @@ func TestDialWithPlaceholderUnix(t *testing.T) {
 	{
 		"apps": {
 		  "http": {
+			"grace_period": 1,
 			"servers": {
 			  "srv0": {
 				"listen": [
@ -156,6 +159,7 @@ func TestReverseProxyWithPlaceholderDialAddress(t *testing.T) {
 	{
 		"apps": {
 			"http": {
+				"grace_period": 1,
 				"servers": {
 					"srv0": {
 						"listen": [
@ -239,6 +243,7 @@ func TestReverseProxyWithPlaceholderTCPDialAddress(t *testing.T) {
 	{
 		"apps": {
 			"http": {
+				"grace_period": 1,
 				"servers": {
 					"srv0": {
 						"listen": [
@ -321,6 +326,7 @@ func TestSRVWithActiveHealthcheck(t *testing.T) {
 	{
 		"apps": {
 		  "http": {
+			"grace_period": 1,
 			"servers": {
 			  "srv0": {
 				"listen": [
--- a/caddytest/integration/sni_test.go
+++ b/caddytest/integration/sni_test.go
@ -15,6 +15,7 @@ func TestDefaultSNI(t *testing.T) {
      "http": {
        "http_port": 9080,
        "https_port": 9443,
+        "grace_period": 1,
        "servers": {
          "srv0": {
            "listen": [
@ -112,6 +113,7 @@ func TestDefaultSNIWithNamedHostAndExplicitIP(t *testing.T) {
      "http": {
        "http_port": 9080,
        "https_port": 9443,
+        "grace_period": 1,
        "servers": {
          "srv0": {
            "listen": [
@ -212,6 +214,7 @@ func TestDefaultSNIWithPortMappingOnly(t *testing.T) {
      "http": {
        "http_port": 9080,
        "https_port": 9443,
+        "grace_period": 1,
        "servers": {
          "srv0": {
            "listen": [
--- a/caddytest/integration/stream_test.go
+++ b/caddytest/integration/stream_test.go
@ -27,6 +27,7 @@ func TestH2ToH2CStream(t *testing.T) {
      "http": {
        "http_port": 9080,
        "https_port": 9443,
+		"grace_period": 1,
        "servers": {
          "srv0": {
            "listen": [
@ -216,6 +217,7 @@ func TestH2ToH1ChunkedResponse(t *testing.T) {
    "http": {
      "http_port": 9080,
      "https_port": 9443,
+	  "grace_period": 1,
      "servers": {
        "srv0": {
          "listen": [
--- a/modules/caddyhttp/app.go
+++ b/modules/caddyhttp/app.go
@ -505,22 +505,62 @@ func (app *App) Stop() error {
 		app.logger.Debug("servers shutting down with eternal grace period")
 	}

-	// shut down servers
-	for _, server := range app.Servers {
+	// goroutines aren't guaranteed to be scheduled right away,
+	// so we'll use one WaitGroup to wait for all the goroutines
+	// to start their server shutdowns, and another to wait for
+	// them to finish; we'll always block for them to start so
+	// that when we return the caller can be confident* that the
+	// old servers are no longer accepting new connections
+	// (* the scheduler might still pause them right before
+	// calling Shutdown(), but it's unlikely)
+	var startedShutdown, finishedShutdown sync.WaitGroup
+
+	// these will run in goroutines
+	stopServer := func(server *Server) {
+		defer finishedShutdown.Done()
+		startedShutdown.Done()
+
 		if err := server.server.Shutdown(ctx); err != nil {
 			app.logger.Error("server shutdown",
 				zap.Error(err),
 				zap.Strings("addresses", server.Listen))
 		}
+	}
+	stopH3Server := func(server *Server) {
+		defer finishedShutdown.Done()
+		startedShutdown.Done()

-		if server.h3server != nil {
-			// TODO: CloseGracefully, once implemented upstream (see https://github.com/lucas-clemente/quic-go/issues/2103)
-			if err := server.h3server.Close(); err != nil {
-				app.logger.Error("HTTP/3 server shutdown",
-					zap.Error(err),
-					zap.Strings("addresses", server.Listen))
-			}
+		if server.h3server == nil {
+			return
 		}
+		// TODO: CloseGracefully, once implemented upstream (see https://github.com/lucas-clemente/quic-go/issues/2103)
+		if err := server.h3server.Close(); err != nil {
+			app.logger.Error("HTTP/3 server shutdown",
+				zap.Error(err),
+				zap.Strings("addresses", server.Listen))
+		}
+	}
+
+	for _, server := range app.Servers {
+		startedShutdown.Add(2)
+		finishedShutdown.Add(2)
+		go stopServer(server)
+		go stopH3Server(server)
+	}
+
+	// block until all the goroutines have been run by the scheduler;
+	// this means that they have likely called Shutdown() by now
+	startedShutdown.Wait()
+
+	// if the process is exiting, we need to block here and wait
+	// for the grace periods to complete, otherwise the process will
+	// terminate before the servers are finished shutting down; but
+	// we don't really need to wait for the grace period to finish
+	// if the process isn't exiting (but note that frequent config
+	// reloads with long grace periods for a sustained length of time
+	// may deplete resources)
+	if caddy.Exiting() {
+		finishedShutdown.Wait()
 	}

 	return nil