diff --git a/cmd/derpprobe/derpprobe.go b/cmd/derpprobe/derpprobe.go
index bf66943c5..ca389aa38 100644
--- a/cmd/derpprobe/derpprobe.go
+++ b/cmd/derpprobe/derpprobe.go
@@ -19,7 +19,9 @@
"log"
"net"
"net/http"
+ "os"
"sort"
+ "strings"
"sync"
"time"
@@ -54,7 +56,15 @@
func main() {
flag.Parse()
+
+ // proactively load the DERP map. Nothing terrible happens if this fails, so we ignore
+ // the error. The Slack bot will print a notification that the DERP map was empty.
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ defer cancel()
+ _, _ = getDERPMap(ctx)
+
go probeLoop()
+ go slackLoop()
log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve)))
}
@@ -138,10 +148,14 @@ func getOverallStatus() (o overallStatus) {
func serve(w http.ResponseWriter, r *http.Request) {
st := getOverallStatus()
summary := "All good"
- if len(st.bad) > 0 {
+ if (float64(len(st.bad)) / float64(len(st.bad)+len(st.good))) > 0.25 {
+ // This will generate an alert and page a human.
+ // It also ends up in Slack, but as part of the alert handling pipeline not
+ // because we generated a Slack notification from here.
w.WriteHeader(500)
summary = fmt.Sprintf("%d problems", len(st.bad))
}
+
io.WriteString(w, "
\n")
fmt.Fprintf(w, "derp probe
\n%s:", summary)
for _, s := range st.bad {
@@ -153,6 +167,71 @@ func serve(w http.ResponseWriter, r *http.Request) {
io.WriteString(w, "
\n")
}
+func notifySlack(text string) error {
+ type SlackRequestBody struct {
+ Text string `json:"text"`
+ }
+
+ slackBody, err := json.Marshal(SlackRequestBody{Text: text})
+ if err != nil {
+ return err
+ }
+
+ webhookUrl := os.Getenv("SLACK_WEBHOOK")
+ if webhookUrl == "" {
+ return errors.New("No SLACK_WEBHOOK configured")
+ }
+
+ req, err := http.NewRequest("POST", webhookUrl, bytes.NewReader(slackBody))
+ if err != nil {
+ return err
+ }
+ req.Header.Add("Content-Type", "application/json")
+
+ client := &http.Client{Timeout: 10 * time.Second}
+ resp, err := client.Do(req)
+ if err != nil {
+ return err
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != 200 {
+ return errors.New(resp.Status)
+ }
+
+ body, _ := io.ReadAll(resp.Body)
+ if string(body) != "ok" {
+ return errors.New("Non-ok response returned from Slack")
+ }
+ return nil
+}
+
+// We only page a human if it looks like there is a significant outage across multiple regions.
+// To Slack, we report all failures great and small.
+func slackLoop() {
+ inBadState := false
+ for {
+ time.Sleep(time.Second * 30)
+ st := getOverallStatus()
+
+ if len(st.bad) > 0 && !inBadState {
+ err := notifySlack(strings.Join(st.bad, "\n"))
+ if err == nil {
+ inBadState = true
+ } else {
+ log.Printf("%d problems, notify Slack failed: %v", len(st.bad), err)
+ }
+ }
+
+ if len(st.bad) == 0 && inBadState {
+ err := notifySlack("All DERPs recovered.")
+ if err == nil {
+ inBadState = false
+ }
+ }
+ }
+}
+
func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion {
ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
for _, r := range dm.Regions {