cmd/containerboot: wait for consistent state on shutdown (#14263)
Some checks are pending
checklocks / checklocks (push) Waiting to run
CodeQL / Analyze (go) (push) Waiting to run
Dockerfile build / deploy (push) Waiting to run
CI / race-root-integration (1/4) (push) Waiting to run
CI / race-root-integration (2/4) (push) Waiting to run
CI / race-root-integration (3/4) (push) Waiting to run
CI / race-root-integration (4/4) (push) Waiting to run
CI / test (-coverprofile=/tmp/coverage.out, amd64) (push) Waiting to run
CI / test (-race, amd64, 1/3) (push) Waiting to run
CI / test (-race, amd64, 2/3) (push) Waiting to run
CI / test (-race, amd64, 3/3) (push) Waiting to run
CI / test (386) (push) Waiting to run
CI / windows (push) Waiting to run
CI / privileged (push) Waiting to run
CI / vm (push) Waiting to run
CI / race-build (push) Waiting to run
CI / cross (386, linux) (push) Waiting to run
CI / cross (amd64, darwin) (push) Waiting to run
CI / cross (amd64, freebsd) (push) Waiting to run
CI / cross (amd64, openbsd) (push) Waiting to run
CI / cross (amd64, windows) (push) Waiting to run
CI / cross (arm, 5, linux) (push) Waiting to run
CI / cross (arm, 7, linux) (push) Waiting to run
CI / cross (arm64, darwin) (push) Waiting to run
CI / cross (arm64, linux) (push) Waiting to run
CI / cross (arm64, windows) (push) Waiting to run
CI / cross (loong64, linux) (push) Waiting to run
CI / ios (push) Waiting to run
CI / crossmin (amd64, illumos) (push) Waiting to run
CI / crossmin (amd64, plan9) (push) Waiting to run
CI / crossmin (amd64, solaris) (push) Waiting to run
CI / crossmin (ppc64, aix) (push) Waiting to run
CI / android (push) Waiting to run
CI / wasm (push) Waiting to run
CI / tailscale_go (push) Waiting to run
CI / fuzz (push) Waiting to run
CI / depaware (push) Waiting to run
CI / go_generate (push) Waiting to run
CI / go_mod_tidy (push) Waiting to run
CI / licenses (push) Waiting to run
CI / staticcheck (386, windows) (push) Waiting to run
CI / staticcheck (amd64, darwin) (push) Waiting to run
CI / staticcheck (amd64, linux) (push) Waiting to run
CI / staticcheck (amd64, windows) (push) Waiting to run
CI / notify_slack (push) Blocked by required conditions
CI / check_mergeability (push) Blocked by required conditions
Some checks are pending
checklocks / checklocks (push) Waiting to run
CodeQL / Analyze (go) (push) Waiting to run
Dockerfile build / deploy (push) Waiting to run
CI / race-root-integration (1/4) (push) Waiting to run
CI / race-root-integration (2/4) (push) Waiting to run
CI / race-root-integration (3/4) (push) Waiting to run
CI / race-root-integration (4/4) (push) Waiting to run
CI / test (-coverprofile=/tmp/coverage.out, amd64) (push) Waiting to run
CI / test (-race, amd64, 1/3) (push) Waiting to run
CI / test (-race, amd64, 2/3) (push) Waiting to run
CI / test (-race, amd64, 3/3) (push) Waiting to run
CI / test (386) (push) Waiting to run
CI / windows (push) Waiting to run
CI / privileged (push) Waiting to run
CI / vm (push) Waiting to run
CI / race-build (push) Waiting to run
CI / cross (386, linux) (push) Waiting to run
CI / cross (amd64, darwin) (push) Waiting to run
CI / cross (amd64, freebsd) (push) Waiting to run
CI / cross (amd64, openbsd) (push) Waiting to run
CI / cross (amd64, windows) (push) Waiting to run
CI / cross (arm, 5, linux) (push) Waiting to run
CI / cross (arm, 7, linux) (push) Waiting to run
CI / cross (arm64, darwin) (push) Waiting to run
CI / cross (arm64, linux) (push) Waiting to run
CI / cross (arm64, windows) (push) Waiting to run
CI / cross (loong64, linux) (push) Waiting to run
CI / ios (push) Waiting to run
CI / crossmin (amd64, illumos) (push) Waiting to run
CI / crossmin (amd64, plan9) (push) Waiting to run
CI / crossmin (amd64, solaris) (push) Waiting to run
CI / crossmin (ppc64, aix) (push) Waiting to run
CI / android (push) Waiting to run
CI / wasm (push) Waiting to run
CI / tailscale_go (push) Waiting to run
CI / fuzz (push) Waiting to run
CI / depaware (push) Waiting to run
CI / go_generate (push) Waiting to run
CI / go_mod_tidy (push) Waiting to run
CI / licenses (push) Waiting to run
CI / staticcheck (386, windows) (push) Waiting to run
CI / staticcheck (amd64, darwin) (push) Waiting to run
CI / staticcheck (amd64, linux) (push) Waiting to run
CI / staticcheck (amd64, windows) (push) Waiting to run
CI / notify_slack (push) Blocked by required conditions
CI / check_mergeability (push) Blocked by required conditions
tailscaled's ipn package writes a collection of keys to state after authenticating to control, but one at a time. If containerboot happens to send a SIGTERM signal to tailscaled in the middle of writing those keys, it may shut down with an inconsistent state Secret and never recover. While we can't durably fix this with our current single-use auth keys (no atomic operation to auth + write state), we can reduce the window for this race condition by checking for partial state before sending SIGTERM to tailscaled. Best effort only. Updates #14080 Change-Id: I0532d51b6f0b7d391e538468bd6a0a80dbe1d9f7 Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
@ -25,6 +25,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -50,9 +51,7 @@ func TestContainerBoot(t *testing.T) {
|
||||
defer lapi.Close()
|
||||
|
||||
kube := kubeServer{FSRoot: d}
|
||||
if err := kube.Start(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
kube.Start(t)
|
||||
defer kube.Close()
|
||||
|
||||
tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To("foo"), Version: "alpha0"}
|
||||
@ -138,15 +137,29 @@ func TestContainerBoot(t *testing.T) {
|
||||
|
||||
// WantCmds is the commands that containerboot should run in this phase.
|
||||
WantCmds []string
|
||||
|
||||
// WantKubeSecret is the secret keys/values that should exist in the
|
||||
// kube secret.
|
||||
WantKubeSecret map[string]string
|
||||
|
||||
// Update the kube secret with these keys/values at the beginning of the
|
||||
// phase (simulates our fake tailscaled doing it).
|
||||
UpdateKubeSecret map[string]string
|
||||
|
||||
// WantFiles files that should exist in the container and their
|
||||
// contents.
|
||||
WantFiles map[string]string
|
||||
// WantFatalLog is the fatal log message we expect from containerboot.
|
||||
// If set for a phase, the test will finish on that phase.
|
||||
WantFatalLog string
|
||||
|
||||
// WantLog is a log message we expect from containerboot.
|
||||
WantLog string
|
||||
|
||||
// If set for a phase, the test will expect containerboot to exit with
|
||||
// this error code, and the test will finish on that phase without
|
||||
// waiting for the successful startup log message.
|
||||
WantExitCode *int
|
||||
|
||||
// The signal to send to containerboot at the start of the phase.
|
||||
Signal *syscall.Signal
|
||||
|
||||
EndpointStatuses map[string]int
|
||||
}
|
||||
@ -434,7 +447,8 @@ func TestContainerBoot(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
WantFatalLog: "no forwarding rules for egress addresses [::1/128], host supports IPv6: false",
|
||||
WantLog: "no forwarding rules for egress addresses [::1/128], host supports IPv6: false",
|
||||
WantExitCode: ptr.To(1),
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -936,7 +950,64 @@ func TestContainerBoot(t *testing.T) {
|
||||
},
|
||||
Phases: []phase{
|
||||
{
|
||||
WantFatalLog: "TS_EGRESS_PROXIES_CONFIG_PATH is only supported for Tailscale running on Kubernetes",
|
||||
WantLog: "TS_EGRESS_PROXIES_CONFIG_PATH is only supported for Tailscale running on Kubernetes",
|
||||
WantExitCode: ptr.To(1),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "kube_shutdown_during_state_write",
|
||||
Env: map[string]string{
|
||||
"KUBERNETES_SERVICE_HOST": kube.Host,
|
||||
"KUBERNETES_SERVICE_PORT_HTTPS": kube.Port,
|
||||
"TS_ENABLE_HEALTH_CHECK": "true",
|
||||
},
|
||||
KubeSecret: map[string]string{
|
||||
"authkey": "tskey-key",
|
||||
},
|
||||
Phases: []phase{
|
||||
{
|
||||
// Normal startup.
|
||||
WantCmds: []string{
|
||||
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking",
|
||||
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
|
||||
},
|
||||
WantKubeSecret: map[string]string{
|
||||
"authkey": "tskey-key",
|
||||
},
|
||||
},
|
||||
{
|
||||
// SIGTERM before state is finished writing, should wait for
|
||||
// consistent state before propagating SIGTERM to tailscaled.
|
||||
Signal: ptr.To(unix.SIGTERM),
|
||||
UpdateKubeSecret: map[string]string{
|
||||
"_machinekey": "foo",
|
||||
"_profiles": "foo",
|
||||
"profile-baff": "foo",
|
||||
// Missing "_current-profile" key.
|
||||
},
|
||||
WantKubeSecret: map[string]string{
|
||||
"authkey": "tskey-key",
|
||||
"_machinekey": "foo",
|
||||
"_profiles": "foo",
|
||||
"profile-baff": "foo",
|
||||
},
|
||||
WantLog: "Waiting for tailscaled to finish writing state to Secret \"tailscale\"",
|
||||
},
|
||||
{
|
||||
// tailscaled has finished writing state, should propagate SIGTERM.
|
||||
UpdateKubeSecret: map[string]string{
|
||||
"_current-profile": "foo",
|
||||
},
|
||||
WantKubeSecret: map[string]string{
|
||||
"authkey": "tskey-key",
|
||||
"_machinekey": "foo",
|
||||
"_profiles": "foo",
|
||||
"profile-baff": "foo",
|
||||
"_current-profile": "foo",
|
||||
},
|
||||
WantLog: "HTTP server at [::]:9002 closed",
|
||||
WantExitCode: ptr.To(0),
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -984,26 +1055,36 @@ func TestContainerBoot(t *testing.T) {
|
||||
|
||||
var wantCmds []string
|
||||
for i, p := range test.Phases {
|
||||
for k, v := range p.UpdateKubeSecret {
|
||||
kube.SetSecret(k, v)
|
||||
}
|
||||
lapi.Notify(p.Notify)
|
||||
if p.WantFatalLog != "" {
|
||||
if p.Signal != nil {
|
||||
cmd.Process.Signal(*p.Signal)
|
||||
}
|
||||
if p.WantLog != "" {
|
||||
err := tstest.WaitFor(2*time.Second, func() error {
|
||||
state, err := cmd.Process.Wait()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if state.ExitCode() != 1 {
|
||||
return fmt.Errorf("process exited with code %d but wanted %d", state.ExitCode(), 1)
|
||||
}
|
||||
waitLogLine(t, time.Second, cbOut, p.WantFatalLog)
|
||||
waitLogLine(t, time.Second, cbOut, p.WantLog)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
if p.WantExitCode != nil {
|
||||
state, err := cmd.Process.Wait()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if state.ExitCode() != *p.WantExitCode {
|
||||
t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode())
|
||||
}
|
||||
|
||||
// Early test return, we don't expect the successful startup log message.
|
||||
return
|
||||
}
|
||||
|
||||
wantCmds = append(wantCmds, p.WantCmds...)
|
||||
waitArgs(t, 2*time.Second, d, argFile, strings.Join(wantCmds, "\n"))
|
||||
err := tstest.WaitFor(2*time.Second, func() error {
|
||||
@ -1059,6 +1140,9 @@ func TestContainerBoot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal")
|
||||
if cmd.ProcessState != nil {
|
||||
t.Fatalf("containerboot should be running but exited with exit code %d", cmd.ProcessState.ExitCode())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -1290,18 +1374,18 @@ func (k *kubeServer) Reset() {
|
||||
k.secret = map[string]string{}
|
||||
}
|
||||
|
||||
func (k *kubeServer) Start() error {
|
||||
func (k *kubeServer) Start(t *testing.T) {
|
||||
root := filepath.Join(k.FSRoot, "var/run/secrets/kubernetes.io/serviceaccount")
|
||||
|
||||
if err := os.MkdirAll(root, 0700); err != nil {
|
||||
return err
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(root, "namespace"), []byte("default"), 0600); err != nil {
|
||||
return err
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(root, "token"), []byte("bearer_token"), 0600); err != nil {
|
||||
return err
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
k.srv = httptest.NewTLSServer(k)
|
||||
@ -1310,13 +1394,11 @@ func (k *kubeServer) Start() error {
|
||||
|
||||
var cert bytes.Buffer
|
||||
if err := pem.Encode(&cert, &pem.Block{Type: "CERTIFICATE", Bytes: k.srv.Certificate().Raw}); err != nil {
|
||||
return err
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(root, "ca.crt"), cert.Bytes(), 0600); err != nil {
|
||||
return err
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (k *kubeServer) Close() {
|
||||
@ -1365,6 +1447,7 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, fmt.Sprintf("reading request body: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer r.Body.Close()
|
||||
|
||||
switch r.Method {
|
||||
case "GET":
|
||||
@ -1397,12 +1480,13 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
|
||||
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
|
||||
}
|
||||
for _, op := range req {
|
||||
if op.Op == "remove" {
|
||||
switch op.Op {
|
||||
case "remove":
|
||||
if !strings.HasPrefix(op.Path, "/data/") {
|
||||
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
|
||||
}
|
||||
delete(k.secret, strings.TrimPrefix(op.Path, "/data/"))
|
||||
} else if op.Op == "replace" {
|
||||
case "replace":
|
||||
path, ok := strings.CutPrefix(op.Path, "/data/")
|
||||
if !ok {
|
||||
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
|
||||
@ -1419,7 +1503,7 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
k.secret[path] = val
|
||||
}
|
||||
} else {
|
||||
default:
|
||||
panic(fmt.Sprintf("unsupported json-patch op %q", op.Op))
|
||||
}
|
||||
}
|
||||
@ -1437,7 +1521,7 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
|
||||
panic(fmt.Sprintf("unknown content type %q", r.Header.Get("Content-Type")))
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("unhandled HTTP method %q", r.Method))
|
||||
panic(fmt.Sprintf("unhandled HTTP request %s %s", r.Method, r.URL))
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user