TUN-1350: Enhance error messages with cloudflarestatus.com link, if relevant

This commit is contained in:
Nick Vollmar
2019-01-10 14:55:44 -06:00
parent 8de19dc647
commit 62b1ab8c98
25 changed files with 1632 additions and 33 deletions

View File

@@ -1,9 +1,8 @@
package origin
import (
"context"
"time"
"golang.org/x/net/context"
)
// Redeclare time functions so they can be overridden in tests.

View File

@@ -1,10 +1,9 @@
package origin
import (
"context"
"testing"
"time"
"golang.org/x/net/context"
)
func immediateTimeAfter(time.Duration) <-chan time.Time {

View File

@@ -0,0 +1,117 @@
package origin
import (
"encoding/json"
"io/ioutil"
"net/http"
"strings"
"time"
"github.com/cloudflare/golibs/lrucache"
)
// StatusPage.io API docs:
// https://www.cloudflarestatus.com/api/v2/#incidents-unresolved
const (
// activeIncidentsURL is the endpoint fetched to list unresolved incidents.
activeIncidentsURL = "https://yh6f0r4529hb.statuspage.io/api/v2/incidents/unresolved.json"
// argoTunnelKeyword is the lowercase phrase searched for in incident names
// and update bodies to decide whether an incident is relevant.
argoTunnelKeyword = "argo tunnel"
// incidentDetailsPrefix is prepended to an incident ID to build the URL of
// its details page.
incidentDetailsPrefix = "https://www.cloudflarestatus.com/incidents/"
)
// IncidentLookup is an object that checks for active incidents in
// the Cloudflare infrastructure.
type IncidentLookup interface {
// ActiveIncidents returns the incidents currently considered active.
// It reports no error; an empty or nil slice means nothing was found.
ActiveIncidents() []Incident
}
// NewIncidentLookup returns a new IncidentLookup instance that caches its
// results with a 1-minute TTL. Cache misses are served by
// fetchActiveIncidents.
func NewIncidentLookup() IncidentLookup {
return newCachedIncidentLookup(fetchActiveIncidents)
}
// IncidentUpdate is a single update attached to an Incident. Only the text
// of the update is kept.
type IncidentUpdate struct {
// Body is the text of the update.
Body string
}
// Incident is one incident as decoded from the status page JSON.
type Incident struct {
// Name is the incident's title.
Name string
// ID is the incident identifier; URL appends it to the details-page prefix.
ID string `json:"id"`
// Updates holds the updates decoded from the "incident_updates" array.
Updates []IncidentUpdate `json:"incident_updates"`
}
// StatusPage is the decoded form of the status page JSON response. Only the
// list of incidents is kept.
type StatusPage struct {
// Incidents is the list of incidents found in the response.
Incidents []Incident
}
// URL returns the address of the incident's details page, formed by
// appending the incident ID to incidentDetailsPrefix.
func (i Incident) URL() string {
return incidentDetailsPrefix + i.ID
}
// parseStatusPage decodes a status page JSON document into a StatusPage.
//
// On success it returns the decoded page and a nil error. If data cannot be
// decoded it returns a nil page together with the decoding error, so callers
// can never observe a partially populated result.
func parseStatusPage(data []byte) (*StatusPage, error) {
	var result StatusPage
	if err := json.Unmarshal(data, &result); err != nil {
		return nil, err
	}
	return &result, nil
}
// isArgoTunnelIncident reports whether the incident mentions Argo Tunnel,
// either in its name or in the body of any of its updates. The match is
// case-insensitive.
func isArgoTunnelIncident(i Incident) bool {
	// mentionsKeyword lowercases the text so it can be compared against the
	// lowercase keyword constant.
	mentionsKeyword := func(text string) bool {
		return strings.Contains(strings.ToLower(text), argoTunnelKeyword)
	}

	if mentionsKeyword(i.Name) {
		return true
	}
	for idx := range i.Updates {
		if mentionsKeyword(i.Updates[idx].Body) {
			return true
		}
	}
	return false
}
// fetchActiveIncidents downloads the list of unresolved incidents from
// activeIncidentsURL and returns only those that mention Argo Tunnel.
//
// The lookup is best-effort: a network error, a timeout, a non-200 response,
// or an unreadable or unparsable body all yield a nil slice rather than an
// error, so a failing status page never becomes a failure of the caller.
func fetchActiveIncidents() (incidents []Incident) {
	// http.Get would use http.DefaultClient, which has no timeout and could
	// block forever on an unresponsive server. Bound the whole exchange
	// (connect, headers and body read) instead.
	const fetchTimeout = 5 * time.Second
	client := &http.Client{Timeout: fetchTimeout}

	resp, err := client.Get(activeIncidentsURL)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()

	// Anything other than 200 is not the document we asked for; don't try
	// to interpret an error page as an incident list.
	if resp.StatusCode != http.StatusOK {
		return nil
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil
	}
	statusPage, err := parseStatusPage(body)
	if err != nil {
		return nil
	}

	for _, incident := range statusPage.Incidents {
		if isArgoTunnelIncident(incident) {
			incidents = append(incidents, incident)
		}
	}
	return incidents
}
// cachedIncidentLookup implements IncidentLookup by wrapping another lookup
// function and remembering its most recent result for a limited time.
type cachedIncidentLookup struct {
// cache stores the last lookup result under cacheKey.
cache *lrucache.LRUCache
// ttl is added to the current time to compute the expiry of a stored result.
ttl time.Duration
// uncachedLookup is called when the cache yields no usable entry.
uncachedLookup func() []Incident
}
// newCachedIncidentLookup wraps uncachedLookup in a cachedIncidentLookup
// backed by a capacity-1 cache and a one-minute TTL.
func newCachedIncidentLookup(uncachedLookup func() []Incident) *cachedIncidentLookup {
	lookup := new(cachedIncidentLookup)
	lookup.cache = lrucache.NewLRUCache(1)
	lookup.ttl = time.Minute
	lookup.uncachedLookup = uncachedLookup
	return lookup
}
// We only need one cache entry. Always use the empty string as its key.
// (newCachedIncidentLookup creates the cache with a capacity of 1.)
const cacheKey = ""
// ActiveIncidents returns the cached incident list when the cache holds a
// usable []Incident under cacheKey. Otherwise it calls uncachedLookup, stores
// the result with an expiry of now + ttl, and returns it.
//
// A cached value of any other type is ignored rather than causing a panic.
func (c *cachedIncidentLookup) ActiveIncidents() []Incident {
	cached, found := c.cache.GetNotStale(cacheKey)
	if found {
		if hit, isIncidentList := cached.([]Incident); isIncidentList {
			return hit
		}
	}

	fresh := c.uncachedLookup()
	// The expiry is computed after the lookup returns, so the TTL starts
	// when the result is stored, not when the lookup began.
	expiry := time.Now().Add(c.ttl)
	c.cache.Set(cacheKey, fresh, expiry)
	return fresh
}

View File

@@ -0,0 +1,386 @@
package origin
import (
"testing"
"time"
"github.com/cloudflare/golibs/lrucache"
"github.com/stretchr/testify/assert"
)
// TestParseStatusPage checks that parseStatusPage fails on a non-JSON body
// and, for a JSON status page document, decodes each incident's name, id and
// update bodies.
func TestParseStatusPage(t *testing.T) {
testCases := []struct {
input []byte
output *StatusPage
fail bool
}{
// An HTML gateway error page is not JSON, so parsing must fail.
{
input: []byte(`<html>
<head><title>504 Gateway Time-out</title></head>
<body><center><h1>504 Gateway Time-out</h1></center></body>
</html>`),
output: nil,
fail: true,
},
// A full JSON document with two incidents. The expected output below
// contains only the name, id and update bodies of each incident.
{
input: []byte(`{
"page": {
"id": "yh6f0r4529hb",
"name": "Cloudflare",
"url": "https://www.cloudflarestatus.com",
"time_zone": "Etc/UTC",
"updated_at": "2019-01-10T20:11:38.750Z"
},
"incidents": [
{
"name": "Cloudflare API service issues",
"status": "resolved",
"created_at": "2018-09-17T19:29:21.132Z",
"updated_at": "2018-09-18T07:45:41.313Z",
"monitoring_at": "2018-09-17T21:35:06.492Z",
"resolved_at": "2018-09-18T07:45:41.290Z",
"shortlink": "http://stspg.io/7f079791e",
"id": "q746ybtyb6q0",
"page_id": "yh6f0r4529hb",
"incident_updates": [
{
"status": "resolved",
"body": "Cloudflare has resolved the issue and the service have resumed normal operation.",
"created_at": "2018-09-18T07:45:41.290Z",
"updated_at": "2018-09-18T07:45:41.290Z",
"display_at": "2018-09-18T07:45:41.290Z",
"affected_components": [
{
"code": "g4tb35rs9yw7",
"name": "Cloudflare customer dashboard and APIs - Cloudflare APIs",
"old_status": "operational",
"new_status": "operational"
}
],
"deliver_notifications": true,
"tweet_id": null,
"id": "zl5g2pl5zhfs",
"incident_id": "q746ybtyb6q0",
"custom_tweet": null
},
{
"status": "monitoring",
"body": "Cloudflare has implemented a fix for this issue and is currently monitoring the results.\r\n\r\nWe will update the status once the issue is resolved.",
"created_at": "2018-09-17T21:35:06.492Z",
"updated_at": "2018-09-17T21:35:06.492Z",
"display_at": "2018-09-17T21:35:06.492Z",
"affected_components": [
{
"code": "g4tb35rs9yw7",
"name": "Cloudflare customer dashboard and APIs - Cloudflare APIs",
"old_status": "degraded_performance",
"new_status": "operational"
}
],
"deliver_notifications": false,
"tweet_id": null,
"id": "0001sv3chdnx",
"incident_id": "q746ybtyb6q0",
"custom_tweet": null
},
{
"status": "investigating",
"body": "We are continuing to investigate this issue.",
"created_at": "2018-09-17T19:30:08.049Z",
"updated_at": "2018-09-17T19:30:08.049Z",
"display_at": "2018-09-17T19:30:08.049Z",
"affected_components": [
{
"code": "g4tb35rs9yw7",
"name": "Cloudflare customer dashboard and APIs - Cloudflare APIs",
"old_status": "operational",
"new_status": "degraded_performance"
}
],
"deliver_notifications": false,
"tweet_id": null,
"id": "qdr164tfpq7m",
"incident_id": "q746ybtyb6q0",
"custom_tweet": null
},
{
"status": "investigating",
"body": "Cloudflare is investigating issues with APIs and Page Rule delays for Page Rule updates. Cloudflare Page Rule service delivery is unaffected and is operating normally. Also, these issues do not affect the Cloudflare CDN and therefore, do not impact customer websites.",
"created_at": "2018-09-17T19:29:21.201Z",
"updated_at": "2018-09-17T19:29:21.201Z",
"display_at": "2018-09-17T19:29:21.201Z",
"affected_components": [
{
"code": "g4tb35rs9yw7",
"name": "Cloudflare customer dashboard and APIs - Cloudflare APIs",
"old_status": "operational",
"new_status": "operational"
}
],
"deliver_notifications": false,
"tweet_id": null,
"id": "qzl2n0q8tskg",
"incident_id": "q746ybtyb6q0",
"custom_tweet": null
}
],
"components": [
{
"status": "operational",
"name": "Cloudflare APIs",
"created_at": "2014-10-09T03:32:07.158Z",
"updated_at": "2019-01-01T22:58:30.846Z",
"position": 2,
"description": null,
"showcase": false,
"id": "g4tb35rs9yw7",
"page_id": "yh6f0r4529hb",
"group_id": "1km35smx8p41",
"group": false,
"only_show_if_degraded": false,
"automation_email": "component+g4tb35rs9yw7@notifications.statuspage.io"
}
],
"impact": "minor"
},
{
"name": "Web Analytics Delays",
"status": "resolved",
"created_at": "2018-09-17T18:05:39.907Z",
"updated_at": "2018-09-17T22:53:05.078Z",
"monitoring_at": null,
"resolved_at": "2018-09-17T22:53:05.057Z",
"shortlink": "http://stspg.io/cb208928c",
"id": "wqfk9mzs5qt1",
"page_id": "yh6f0r4529hb",
"incident_updates": [
{
"status": "resolved",
"body": "Cloudflare has resolved the issue and Web Analytics have resumed normal operation.",
"created_at": "2018-09-17T22:53:05.057Z",
"updated_at": "2018-09-17T22:53:05.057Z",
"display_at": "2018-09-17T22:53:05.057Z",
"affected_components": [
{
"code": "4c231tkdlpcl",
"name": "Cloudflare customer dashboard and APIs - Analytics",
"old_status": "degraded_performance",
"new_status": "operational"
}
],
"deliver_notifications": false,
"tweet_id": null,
"id": "93y1w00yqzk4",
"incident_id": "wqfk9mzs5qt1",
"custom_tweet": null
},
{
"status": "investigating",
"body": "There is a delay in processing Cloudflare Web Analytics. This affects timely delivery of customer data.\n\nThese delays do not impact analytics for DNS and Rate Limiting.",
"created_at": "2018-09-17T18:05:40.033Z",
"updated_at": "2018-09-17T18:05:40.033Z",
"display_at": "2018-09-17T18:05:40.033Z",
"affected_components": [
{
"code": "4c231tkdlpcl",
"name": "Cloudflare customer dashboard and APIs - Analytics",
"old_status": "operational",
"new_status": "degraded_performance"
}
],
"deliver_notifications": false,
"tweet_id": null,
"id": "362t6lv0vrpk",
"incident_id": "wqfk9mzs5qt1",
"custom_tweet": null
}
],
"components": [
{
"status": "operational",
"name": "Analytics",
"created_at": "2014-11-13T11:54:10.191Z",
"updated_at": "2018-12-31T08:20:52.349Z",
"position": 3,
"description": "Customer data",
"showcase": false,
"id": "4c231tkdlpcl",
"page_id": "yh6f0r4529hb",
"group_id": "1km35smx8p41",
"group": false,
"only_show_if_degraded": false,
"automation_email": "component+4c231tkdlpcl@notifications.statuspage.io"
}
],
"impact": "minor"
}
]
}`),
output: &StatusPage{
Incidents: []Incident{
Incident{
Name: "Cloudflare API service issues",
ID: "q746ybtyb6q0",
Updates: []IncidentUpdate{
IncidentUpdate{
Body: "Cloudflare has resolved the issue and the service have resumed normal operation.",
},
IncidentUpdate{
Body: "Cloudflare has implemented a fix for this issue and is currently monitoring the results.\r\n\r\nWe will update the status once the issue is resolved.",
},
IncidentUpdate{
Body: "We are continuing to investigate this issue.",
},
IncidentUpdate{
Body: "Cloudflare is investigating issues with APIs and Page Rule delays for Page Rule updates. Cloudflare Page Rule service delivery is unaffected and is operating normally. Also, these issues do not affect the Cloudflare CDN and therefore, do not impact customer websites.",
},
},
},
Incident{
Name: "Web Analytics Delays",
ID: "wqfk9mzs5qt1",
Updates: []IncidentUpdate{
IncidentUpdate{
Body: "Cloudflare has resolved the issue and Web Analytics have resumed normal operation.",
},
IncidentUpdate{
Body: "There is a delay in processing Cloudflare Web Analytics. This affects timely delivery of customer data.\n\nThese delays do not impact analytics for DNS and Rate Limiting.",
},
},
},
},
},
fail: false,
},
}
// For a case expected to fail only the error is checked; the returned value
// is compared against the expected output only on success.
for _, testCase := range testCases {
output, err := parseStatusPage(testCase.input)
if testCase.fail {
assert.Error(t, err)
} else {
assert.Nil(t, err)
assert.Equal(t, testCase.output, output)
}
}
}
// TestIsArgoTunnelIncident checks that an incident is classified as relevant
// when "argo tunnel" appears, in any letter case, in its name or in one of
// its update bodies.
func TestIsArgoTunnelIncident(t *testing.T) {
	type scenario struct {
		incident Incident
		want     bool
	}
	scenarios := []scenario{
		// The zero-value incident has no text to match.
		{incident: Incident{}, want: false},
		// Matching on the name ignores letter case.
		{incident: Incident{Name: "An Argo Tunnel incident"}, want: true},
		{incident: Incident{Name: "an argo tunnel incident"}, want: true},
		{incident: Incident{Name: "an aRgO TuNnEl incident"}, want: true},
		// The keyword includes the space, and unrelated names never match.
		{incident: Incident{Name: "an argotunnel incident"}, want: false},
		{incident: Incident{Name: "irrelevant"}, want: false},
		// A keyword in any single update body is enough.
		{
			incident: Incident{
				Name: "irrelevant",
				Updates: []IncidentUpdate{
					{Body: "irrelevant"},
					{Body: "an Argo Tunnel incident"},
					{Body: "irrelevant"},
				},
			},
			want: true,
		},
		// A matching name wins even when no update mentions the keyword.
		{
			incident: Incident{
				Name: "an Argo Tunnel incident",
				Updates: []IncidentUpdate{
					{Body: "irrelevant"},
					{Body: "irrelevant"},
					{Body: "irrelevant"},
				},
			},
			want: true,
		},
	}
	for idx := range scenarios {
		current := scenarios[idx]
		got := isArgoTunnelIncident(current.incident)
		assert.Equal(t, current.want, got, "Test case failed: %v", current.incident)
	}
}
// TestIncidentURL checks that URL joins the details-page prefix and the
// incident ID.
func TestIncidentURL(t *testing.T) {
	const id = "s6k0dnn5347b"
	actual := Incident{ID: id}.URL()
	assert.Equal(t, "https://www.cloudflarestatus.com/incidents/s6k0dnn5347b", actual)
}
func TestNewCachedIncidentLookup(t *testing.T) {
c := newCachedIncidentLookup(func() []Incident { return nil })
assert.Equal(t, time.Minute, c.ttl)
assert.Equal(t, 1, c.cache.Capacity())
}
// TestCachedIncidentLookup checks the caching behaviour of ActiveIncidents:
// the first call reaches the uncached lookup, calls made while the entry is
// fresh are served from the cache, and a call made after the TTL has elapsed
// reaches the uncached lookup again.
//
// The uncached lookup counts its invocations. Without the counter, the final
// assertion would pass even if an expired entry were still served from the
// cache.
func TestCachedIncidentLookup(t *testing.T) {
	expected := []Incident{
		{
			Name: "An incident",
			ID:   "incidentID",
		},
	}
	var (
		shouldCallUncachedLookup bool
		uncachedCalls            int
	)
	c := &cachedIncidentLookup{
		cache: lrucache.NewLRUCache(1),
		ttl:   50 * time.Millisecond,
		uncachedLookup: func() []Incident {
			if !shouldCallUncachedLookup {
				t.Fatal("uncachedLookup shouldn't have been called")
			}
			uncachedCalls++
			return expected
		},
	}

	// Cold cache: the underlying lookup must run exactly once.
	shouldCallUncachedLookup = true
	assert.Equal(t, expected, c.ActiveIncidents())
	assert.Equal(t, 1, uncachedCalls, "first call should reach the uncached lookup")

	// Warm cache: repeated calls must not reach the underlying lookup.
	shouldCallUncachedLookup = false
	assert.Equal(t, expected, c.ActiveIncidents())
	assert.Equal(t, expected, c.ActiveIncidents())
	assert.Equal(t, 1, uncachedCalls, "cached calls should not reach the uncached lookup")

	// Wait a little longer than the TTL so the entry is definitely expired.
	time.Sleep(c.ttl + 10*time.Millisecond)
	shouldCallUncachedLookup = true
	assert.Equal(t, expected, c.ActiveIncidents())
	assert.Equal(t, 2, uncachedCalls, "expired entry should trigger a new uncached lookup")
}
// TestCachedIncidentLookupDoesntPanic checks that a cache entry of the wrong
// type is ignored: ActiveIncidents must fall back to the uncached lookup
// instead of panicking on the type assertion.
func TestCachedIncidentLookupDoesntPanic(t *testing.T) {
	want := []Incident{
		{Name: "An incident", ID: "incidentID"},
	}
	lookup := &cachedIncidentLookup{
		cache:          lrucache.NewLRUCache(1),
		ttl:            50 * time.Millisecond,
		uncachedLookup: func() []Incident { return want },
	}

	// Store a non-[]Incident value under the key, with an expiry far enough
	// away that it is still present when ActiveIncidents reads it.
	farFuture := time.Now().Add(30 * time.Minute)
	lookup.cache.Set(cacheKey, 42, farFuture)

	got := lookup.ActiveIncidents()
	assert.Equal(t, want, got)
}

View File

@@ -1,11 +1,10 @@
package origin
import (
"context"
"fmt"
"net"
"time"
"golang.org/x/net/context"
)
const (

View File

@@ -2,9 +2,9 @@ package origin
import (
"bufio"
"context"
"crypto/tls"
"fmt"
"github.com/google/uuid"
"io"
"net"
"net/http"
@@ -13,7 +13,6 @@ import (
"strings"
"time"
"golang.org/x/net/context"
"golang.org/x/sync/errgroup"
"github.com/cloudflare/cloudflared/h2mux"
@@ -23,6 +22,7 @@ import (
"github.com/cloudflare/cloudflared/websocket"
raven "github.com/getsentry/raven-go"
"github.com/google/uuid"
"github.com/pkg/errors"
_ "github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
@@ -63,6 +63,7 @@ type TunnelConfig struct {
NoChunkedEncoding bool
WSGI bool
CompressionQuality uint64
IncidentLookup IncidentLookup
}
type dialError struct {
@@ -265,6 +266,9 @@ func ServeTunnel(
logger.WithError(castedErr.cause).Error("Register tunnel error from server side")
// Don't send registration error return from server to Sentry. They are
// logged on server side
if incidents := config.IncidentLookup.ActiveIncidents(); len(incidents) > 0 {
logger.Error(activeIncidentsMsg(incidents))
}
return castedErr.cause, !castedErr.permanent
case clientRegisterTunnelError:
logger.WithError(castedErr.cause).Error("Register tunnel error on client side")
@@ -696,3 +700,17 @@ func trialZoneMsg(url string) []string {
" " + url,
}
}
// activeIncidentsMsg builds a one-line message listing the given incidents,
// each rendered as "Name (URL)" and separated by "; ". The preamble is
// singular for one incident and plural for several.
//
// It returns the empty string when incidents is empty, so the message never
// claims an active incident when there is none.
func activeIncidentsMsg(incidents []Incident) string {
	if len(incidents) == 0 {
		return ""
	}

	preamble := "There is an active Cloudflare incident that may be related:"
	if len(incidents) > 1 {
		preamble = "There are active Cloudflare incidents that may be related:"
	}

	// The final length is known, so size the slice once up front.
	incidentStrings := make([]string, 0, len(incidents))
	for _, incident := range incidents {
		incidentStrings = append(incidentStrings, fmt.Sprintf("%s (%s)", incident.Name, incident.URL()))
	}
	return preamble + " " + strings.Join(incidentStrings, "; ")
}