TUN-7373: Streaming logs override for same actor

To help accommodate web browser interactions with websockets, when a streaming logs session is requested for the same actor while already serving a session for that user in a separate request, the original request will be closed and the new request start streaming logs instead. This should help with rogue sessions holding on for too long with no client on the other side (before idle timeout or connection close).
2025-07-27 22:59:58 +00:00 · 2023-04-21 11:54:37 -07:00
parent ee5e447d44
commit 38cd455e4d
109 changed files with 12691 additions and 1798 deletions
--- a/management/service.go
+++ b/management/service.go
@@ -7,7 +7,6 @@ import (
 	"net/http"
 	"os"
 	"sync"
-	"sync/atomic"
 	"time"

 	"github.com/go-chi/chi/v5"
@@ -24,7 +23,7 @@ const (
 	// value will return this error to incoming requests.
 	StatusSessionLimitExceeded websocket.StatusCode = 4002
 	reasonSessionLimitExceeded                      = "limit exceeded for streaming sessions"
-
+	// There is a limited idle time while not actively serving a session for a request before dropping the connection.
 	StatusIdleLimitExceeded websocket.StatusCode = 4003
 	reasonIdleLimitExceeded                      = "session was idle for too long"
 )
@@ -41,9 +40,6 @@ type ManagementService struct {
 	log    *zerolog.Logger
 	router chi.Router

-	// streaming signifies if the service is already streaming logs. Helps limit the number of active users streaming logs
-	// from this cloudflared instance.
-	streaming atomic.Bool
 	// streamingMut is a lock to prevent concurrent requests to start streaming. Utilizing the atomic.Bool is not
 	// sufficient to complete this operation since many other checks during an incoming new request are needed
 	// to validate this before setting streaming to true.
@@ -67,6 +63,7 @@ func New(managementHostname string,
 		label:     label,
 	}
 	r := chi.NewRouter()
+	r.Use(ValidateAccessTokenQueryMiddleware)
 	r.Get("/ping", ping)
 	r.Head("/ping", ping)
 	r.Get("/logs", s.logs)
@@ -92,7 +89,6 @@ type getHostDetailsResponse struct {
 }

 func (m *ManagementService) getHostDetails(w http.ResponseWriter, r *http.Request) {
-
 	var getHostDetailsResponse = getHostDetailsResponse{
 		ClientID: m.clientID.String(),
 	}
@@ -157,12 +153,11 @@ func (m *ManagementService) readEvents(c *websocket.Conn, ctx context.Context, e
 }

 // streamLogs will begin the process of reading from the Session listener and write the log events to the client.
-func (m *ManagementService) streamLogs(c *websocket.Conn, ctx context.Context, session *Session) {
-	defer m.logger.Close(session)
-	for m.streaming.Load() {
+func (m *ManagementService) streamLogs(c *websocket.Conn, ctx context.Context, session *session) {
+	for session.Active() {
 		select {
 		case <-ctx.Done():
-			m.streaming.Store(false)
+			session.Stop()
 			return
 		case event := <-session.listener:
 			err := WriteEvent(c, ctx, &EventLog{
@@ -176,7 +171,7 @@ func (m *ManagementService) streamLogs(c *websocket.Conn, ctx context.Context, s
 					m.log.Err(c.Close(websocket.StatusInternalError, err.Error())).Send()
 				}
 				// Any errors when writing the messages to the client will stop streaming and close the connection
-				m.streaming.Store(false)
+				session.Stop()
 				return
 			}
 		default:
@@ -185,28 +180,38 @@ func (m *ManagementService) streamLogs(c *websocket.Conn, ctx context.Context, s
 	}
 }

-// startStreaming will check the conditions of the request and begin streaming or close the connection for invalid
-// requests.
-func (m *ManagementService) startStreaming(c *websocket.Conn, ctx context.Context, event *ClientEvent) {
+// canStartStream will check the conditions of the request and return if the session can begin streaming.
+func (m *ManagementService) canStartStream(session *session) bool {
 	m.streamingMut.Lock()
 	defer m.streamingMut.Unlock()
-	// Limits to one user for streaming logs
-	if m.streaming.Load() {
-		m.log.Warn().
-			Msgf("Another management session request was attempted but one session already being served; there is a limit of streaming log sessions to reduce overall performance impact.")
-		m.log.Err(c.Close(StatusSessionLimitExceeded, reasonSessionLimitExceeded)).Send()
-		return
+	// Limits to one actor for streaming logs
+	if m.logger.ActiveSessions() > 0 {
+		// Allow the same user to preempt their existing session to disconnect their old session and start streaming
+		// with this new session instead.
+		if existingSession := m.logger.ActiveSession(session.actor); existingSession != nil {
+			m.log.Info().
+				Msgf("Another management session request for the same actor was requested; the other session will be disconnected to handle the new request.")
+			existingSession.Stop()
+			m.logger.Remove(existingSession)
+			existingSession.cancel()
+		} else {
+			m.log.Warn().
+				Msgf("Another management session request was attempted but one session already being served; there is a limit of streaming log sessions to reduce overall performance impact.")
+			return false
+		}
 	}
+	return true
+}
+
+// parseFilters will check the ClientEvent for start_streaming and assign filters if provided to the session
+func (m *ManagementService) parseFilters(c *websocket.Conn, event *ClientEvent, session *session) bool {
 	// Expect the first incoming request
 	startEvent, ok := IntoClientEvent[EventStartStreaming](event, StartStreaming)
 	if !ok {
-		m.log.Warn().Err(c.Close(StatusInvalidCommand, reasonInvalidCommand)).Msgf("expected start_streaming as first recieved event")
-		return
+		return false
 	}
-	m.streaming.Store(true)
-	listener := m.logger.Listen(startEvent.Filters)
-	m.log.Debug().Msgf("Streaming logs")
-	go m.streamLogs(c, ctx, listener)
+	session.Filters(startEvent.Filters)
+	return true
 }

 // Management Streaming Logs accept handler
@@ -227,11 +232,23 @@ func (m *ManagementService) logs(w http.ResponseWriter, r *http.Request) {
 	ping := time.NewTicker(15 * time.Second)
 	defer ping.Stop()

-	// Close the connection if no operation has occurred after the idle timeout.
+	// Close the connection if no operation has occurred after the idle timeout. The timeout is halted
+	// when streaming logs is active.
 	idleTimeout := 5 * time.Minute
 	idle := time.NewTimer(idleTimeout)
 	defer idle.Stop()

+	// Fetch the claims from the request context to acquire the actor
+	claims, ok := ctx.Value(accessClaimsCtxKey).(*managementTokenClaims)
+	if !ok || claims == nil {
+		// Typically should never happen as it is provided in the context from the middleware
+		m.log.Err(c.Close(websocket.StatusInternalError, "missing access_token")).Send()
+		return
+	}
+
+	session := newSession(logWindow, claims.Actor, cancel)
+	defer m.logger.Remove(session)
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -242,12 +259,28 @@ func (m *ManagementService) logs(w http.ResponseWriter, r *http.Request) {
 			switch event.Type {
 			case StartStreaming:
 				idle.Stop()
-				m.startStreaming(c, ctx, event)
+				// Expect the first incoming request
+				startEvent, ok := IntoClientEvent[EventStartStreaming](event, StartStreaming)
+				if !ok {
+					m.log.Warn().Msgf("expected start_streaming as first recieved event")
+					m.log.Err(c.Close(StatusInvalidCommand, reasonInvalidCommand)).Send()
+					return
+				}
+				// Make sure the session can start
+				if !m.canStartStream(session) {
+					m.log.Err(c.Close(StatusSessionLimitExceeded, reasonSessionLimitExceeded)).Send()
+					return
+				}
+				session.Filters(startEvent.Filters)
+				m.logger.Listen(session)
+				m.log.Debug().Msgf("Streaming logs")
+				go m.streamLogs(c, ctx, session)
 				continue
 			case StopStreaming:
 				idle.Reset(idleTimeout)
-				// TODO: limit StopStreaming to only halt streaming for clients that are already streaming
-				m.streaming.Store(false)
+				// Stop the current session for the current actor who requested it
+				session.Stop()
+				m.logger.Remove(session)
 			case UnknownClientEventType:
 				fallthrough
 			default: