net/dns, health: raise health warning for failing forwarded DNS queries (#12888)
updates tailscale/corp#21823 Misconfigured, broken, or blocked DNS will often present as "internet is broken'" to the end user. This plumbs the health tracker into the dns manager and forwarder and adds a health warning with a 5 second delay that is raised on failures in the forwarder and lowered on successes. Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
@ -23,6 +23,7 @@ import (
|
||||
dns "golang.org/x/net/dns/dnsmessage"
|
||||
"tailscale.com/control/controlknobs"
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/net/dns/publicdns"
|
||||
"tailscale.com/net/dnscache"
|
||||
"tailscale.com/net/neterror"
|
||||
@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) {
|
||||
binary.BigEndian.PutUint16(opt[3:5], maxSize)
|
||||
}
|
||||
|
||||
// dnsForwarderFailing should be raised when the forwarder is unable to reach the
|
||||
// upstream resolvers. This is a high severity warning as it results in "no internet".
|
||||
// This warning must be cleared when the forwarder is working again.
|
||||
//
|
||||
// We allow for 5 second grace period to ensure this is not raised for spurious errors
|
||||
// under the assumption that DNS queries are relatively frequent and a subsequent
|
||||
// successful query will clear any one-off errors.
|
||||
var dnsForwarderFailing = health.Register(&health.Warnable{
|
||||
Code: "dns-forward-failing",
|
||||
Title: "DNS unavailable",
|
||||
Severity: health.SeverityHigh,
|
||||
DependsOn: []*health.Warnable{health.NetworkStatusWarnable},
|
||||
Text: health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."),
|
||||
ImpactsConnectivity: true,
|
||||
TimeToVisible: 5 * time.Second,
|
||||
})
|
||||
|
||||
type route struct {
|
||||
Suffix dnsname.FQDN
|
||||
Resolvers []resolverAndDelay
|
||||
@ -188,6 +206,7 @@ type forwarder struct {
|
||||
netMon *netmon.Monitor // always non-nil
|
||||
linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it
|
||||
dialer *tsdial.Dialer
|
||||
health *health.Tracker // always non-nil
|
||||
|
||||
controlKnobs *controlknobs.Knobs // or nil
|
||||
|
||||
@ -219,7 +238,7 @@ type forwarder struct {
|
||||
missingUpstreamRecovery func()
|
||||
}
|
||||
|
||||
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
|
||||
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
|
||||
if netMon == nil {
|
||||
panic("nil netMon")
|
||||
}
|
||||
@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
|
||||
netMon: netMon,
|
||||
linkSel: linkSel,
|
||||
dialer: dialer,
|
||||
health: health,
|
||||
controlKnobs: knobs,
|
||||
missingUpstreamRecovery: func() {},
|
||||
}
|
||||
@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
resolvers = f.resolvers(domain)
|
||||
if len(resolvers) == 0 {
|
||||
metricDNSFwdErrorNoUpstream.Add(1)
|
||||
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
|
||||
f.logf("no upstream resolvers set, returning SERVFAIL")
|
||||
|
||||
// Attempt to recompile the DNS configuration
|
||||
@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
case responseChan <- res:
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
f.health.SetHealthy(dnsForwarderFailing)
|
||||
}
|
||||
}
|
||||
|
||||
@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
return fmt.Errorf("waiting to send response: %w", ctx.Err())
|
||||
case responseChan <- packet{v, query.family, query.addr}:
|
||||
metricDNSFwdSuccess.Add(1)
|
||||
f.health.SetHealthy(dnsForwarderFailing)
|
||||
return nil
|
||||
}
|
||||
case err := <-errc:
|
||||
@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
case <-ctx.Done():
|
||||
metricDNSFwdErrorContext.Add(1)
|
||||
metricDNSFwdErrorContextGotError.Add(1)
|
||||
var resolverAddrs []string
|
||||
for _, rr := range resolvers {
|
||||
resolverAddrs = append(resolverAddrs, rr.name.Addr)
|
||||
}
|
||||
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
|
||||
case responseChan <- res:
|
||||
}
|
||||
}
|
||||
@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
for _, rr := range resolvers {
|
||||
resolverAddrs = append(resolverAddrs, rr.name.Addr)
|
||||
}
|
||||
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
|
||||
return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err())
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user