TUN-3869: Improve reliability of graceful shutdown.

- Don't rely on edge to close connection on graceful shutdown in h2mux, start muxer shutdown from cloudflared.
- Don't retry failed connections after graceful shutdown has started.
- After graceful shutdown channel is closed we stop waiting for retry timer and don't try to restart tunnel loop.
- Use readonly channel for graceful shutdown in functions that only consume the signal
This commit is contained in:
Igor Postelnik
2021-02-04 18:07:49 -06:00
parent dbd90f270e
commit 0b16a473da
6 changed files with 95 additions and 83 deletions

View File

@@ -58,16 +58,18 @@ type Supervisor struct {
useReconnectToken bool
reconnectCh chan ReconnectSignal
gracefulShutdownC chan struct{}
gracefulShutdownC <-chan struct{}
}
var errEarlyShutdown = errors.New("shutdown started")
type tunnelError struct {
index int
addr *net.TCPAddr
err error
}
func NewSupervisor(config *TunnelConfig, reconnectCh chan ReconnectSignal, gracefulShutdownC chan struct{}) (*Supervisor, error) {
func NewSupervisor(config *TunnelConfig, reconnectCh chan ReconnectSignal, gracefulShutdownC <-chan struct{}) (*Supervisor, error) {
cloudflaredUUID, err := uuid.NewRandom()
if err != nil {
return nil, fmt.Errorf("failed to generate cloudflared instance ID: %w", err)
@@ -108,6 +110,9 @@ func (s *Supervisor) Run(
connectedSignal *signal.Signal,
) error {
if err := s.initialize(ctx, connectedSignal); err != nil {
if err == errEarlyShutdown {
return nil
}
return err
}
var tunnelsWaiting []int
@@ -130,6 +135,7 @@ func (s *Supervisor) Run(
}
}
shuttingDown := false
for {
select {
// Context cancelled
@@ -143,7 +149,7 @@ func (s *Supervisor) Run(
// (note that this may also be caused by context cancellation)
case tunnelError := <-s.tunnelErrors:
tunnelsActive--
if tunnelError.err != nil {
if tunnelError.err != nil && !shuttingDown {
s.log.Err(tunnelError.err).Int(connection.LogFieldConnIndex, tunnelError.index).Msg("Connection terminated")
tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
s.waitForNextTunnel(tunnelError.index)
@@ -181,6 +187,8 @@ func (s *Supervisor) Run(
// No more tunnels outstanding, clear backoff timer
backoff.SetGracePeriod()
}
case <-s.gracefulShutdownC:
shuttingDown = true
}
}
}
@@ -203,6 +211,8 @@ func (s *Supervisor) initialize(
return ctx.Err()
case tunnelError := <-s.tunnelErrors:
return tunnelError.err
case <-s.gracefulShutdownC:
return errEarlyShutdown
case <-connectedSignal.Wait():
}
// At least one successful connection, so start the rest