net/dns, health: raise health warning for failing forwarded DNS queries (#12888)

updates tailscale/corp#21823

Misconfigured, broken, or blocked DNS will often present as
"internet is broken'" to the end user.  This  plumbs the health tracker
into the dns manager and forwarder and adds a health warning
with a 5 second delay that is raised on failures in the forwarder and
lowered on successes.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
Jonathan Nobels
2024-07-29 13:48:46 -04:00
committed by GitHub
parent 3088c6105e
commit 19b0c8a024
7 changed files with 53 additions and 8 deletions

View File

@ -23,6 +23,7 @@ import (
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/net/dns/publicdns"
"tailscale.com/net/dnscache"
"tailscale.com/net/neterror"
@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) {
binary.BigEndian.PutUint16(opt[3:5], maxSize)
}
// dnsForwarderFailing should be raised when the forwarder is unable to reach the
// upstream resolvers. This is a high severity warning as it results in "no internet".
// This warning must be cleared when the forwarder is working again.
//
// We allow for 5 second grace period to ensure this is not raised for spurious errors
// under the assumption that DNS queries are relatively frequent and a subsequent
// successful query will clear any one-off errors.
var dnsForwarderFailing = health.Register(&health.Warnable{
Code: "dns-forward-failing",
Title: "DNS unavailable",
Severity: health.SeverityHigh,
DependsOn: []*health.Warnable{health.NetworkStatusWarnable},
Text: health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."),
ImpactsConnectivity: true,
TimeToVisible: 5 * time.Second,
})
type route struct {
Suffix dnsname.FQDN
Resolvers []resolverAndDelay
@ -188,6 +206,7 @@ type forwarder struct {
netMon *netmon.Monitor // always non-nil
linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it
dialer *tsdial.Dialer
health *health.Tracker // always non-nil
controlKnobs *controlknobs.Knobs // or nil
@ -219,7 +238,7 @@ type forwarder struct {
missingUpstreamRecovery func()
}
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
if netMon == nil {
panic("nil netMon")
}
@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
netMon: netMon,
linkSel: linkSel,
dialer: dialer,
health: health,
controlKnobs: knobs,
missingUpstreamRecovery: func() {},
}
@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
resolvers = f.resolvers(domain)
if len(resolvers) == 0 {
metricDNSFwdErrorNoUpstream.Add(1)
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
f.logf("no upstream resolvers set, returning SERVFAIL")
// Attempt to recompile the DNS configuration
@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case responseChan <- res:
return nil
}
} else {
f.health.SetHealthy(dnsForwarderFailing)
}
}
@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
return fmt.Errorf("waiting to send response: %w", ctx.Err())
case responseChan <- packet{v, query.family, query.addr}:
metricDNSFwdSuccess.Add(1)
f.health.SetHealthy(dnsForwarderFailing)
return nil
}
case err := <-errc:
@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case <-ctx.Done():
metricDNSFwdErrorContext.Add(1)
metricDNSFwdErrorContextGotError.Add(1)
var resolverAddrs []string
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
case responseChan <- res:
}
}
@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
for _, rr := range resolvers {
resolverAddrs = append(resolverAddrs, rr.name.Addr)
}
f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err())
}
}