From 2802a01b8199a10c41241eb2699ab158be276df7 Mon Sep 17 00:00:00 2001 From: Christine Dodrill Date: Mon, 31 May 2021 17:04:49 -0400 Subject: [PATCH] tstest/integration/vms: test vms as they are ready (#2022) Instead of testing all the VMs at once when they are all ready, this patch changes the testing logic so that the vms are tested as soon as they register with testcontrol. Also limit the amount of VM ram used at once with the `-ram-limit` flag. That uses a semaphore to guard resource use. Also document CentOS' sins. Updates #1988 Signed-off-by: Christine Dodrill --- tstest/integration/vms/vms_test.go | 215 ++++++++++++++--------------- 1 file changed, 104 insertions(+), 111 deletions(-) diff --git a/tstest/integration/vms/vms_test.go b/tstest/integration/vms/vms_test.go index 25c612879..236b258e1 100644 --- a/tstest/integration/vms/vms_test.go +++ b/tstest/integration/vms/vms_test.go @@ -7,6 +7,7 @@ package vms import ( + "context" "crypto/sha256" "encoding/hex" "flag" @@ -31,6 +32,7 @@ import ( expect "github.com/google/goexpect" "github.com/pkg/sftp" "golang.org/x/crypto/ssh" + "golang.org/x/sync/semaphore" "inet.af/netaddr" "tailscale.com/net/interfaces" "tailscale.com/tstest" @@ -40,7 +42,8 @@ import ( const securePassword = "hunter2" -var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive (10G+ ram) VM based integration tests") +var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive VM based integration tests") +var vmRamLimit = flag.Int("ram-limit", 4096, "the maximum number of megabytes of ram that can be used for VMs, must be greater than or equal to 1024") var distroRex *regexValue = func() *regexValue { result := ®exValue{r: regexp.MustCompile(`.*`)} flag.Var(result, "distro-regex", "The regex that matches what distros should be run") @@ -259,9 +262,8 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() { if err != nil { t.Fatalf("can't find cache dir: %v", err) } - cdir = filepath.Join(cdir, "within", "mkvm") + cdir = filepath.Join(cdir, "tailscale", "vm-test") os.MkdirAll(filepath.Join(cdir, "qcow2"), 0755) - os.MkdirAll(filepath.Join(cdir, "seed"), 0755) port := 23100 + n @@ -280,6 +282,7 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() { "-drive", driveArg, "-cdrom", filepath.Join(tdir, d.name, "seed", "seed.iso"), "-vnc", fmt.Sprintf(":%d", n), + "-smbios", "type=1,serial=ds=nocloud;h=" + d.name, } t.Logf("running: qemu-system-x86_64 %s", strings.Join(args, " ")) @@ -378,7 +381,7 @@ func TestVMIntegrationEndToEnd(t *testing.T) { var ( ipMu sync.Mutex - ipMap = []ipMapping{} + ipMap = map[string]ipMapping{} ) mux := http.NewServeMux() @@ -398,7 +401,8 @@ func TestVMIntegrationEndToEnd(t *testing.T) { if err != nil { log.Panicf("bad port: %v", port) } - ipMap = append(ipMap, ipMapping{r.UserAgent(), port, host}) + distro := r.UserAgent() + ipMap[distro] = ipMapping{distro, port, host} t.Logf("%s: %v", name, host) }) @@ -424,140 +428,129 @@ func TestVMIntegrationEndToEnd(t *testing.T) { loginServer := fmt.Sprintf("http://%s", ln.Addr()) t.Logf("loginServer: %s", loginServer) - var numDistros = 0 + tstest.FixLogs(t) + defer tstest.UnfixLogs(t) - cancels := make(chan func(), len(distros)) + ramsem := semaphore.NewWeighted(int64(*vmRamLimit)) - t.Run("mkvm", func(t *testing.T) { + t.Run("do", func(t *testing.T) { for n, distro := range distros { n, distro := n, distro if rex.MatchString(distro.name) { t.Logf("%s matches %s", distro.name, rex) - numDistros++ } else { continue } t.Run(distro.name, func(t *testing.T) { + ctx, done := context.WithCancel(context.Background()) + defer done() + + if distro.name == "opensuse-leap-15-1" { + t.Skip("OpenSUSE Leap 15.1's cloud-init image just doesn't work for some reason, see https://github.com/tailscale/tailscale/issues/1988") + } + t.Parallel() + err := ramsem.Acquire(ctx, int64(distro.mem)) + if err != nil { + t.Fatalf("can't acquire ram semaphore: %v", err) + } + defer ramsem.Release(int64(distro.mem)) + cancel := mkVM(t, n, distro, string(pubkey), loginServer, dir) - cancels <- cancel + defer cancel() + var ipm ipMapping + + t.Run("wait-for-start", func(t *testing.T) { + waiter := time.NewTicker(time.Second) + defer waiter.Stop() + var ok bool + for { + <-waiter.C + ipMu.Lock() + if ipm, ok = ipMap[distro.name]; ok { + ipMu.Unlock() + break + } + ipMu.Unlock() + } + }) + + testDistro(t, loginServer, signer, ipm) }) } }) +} - close(cancels) - for cancel := range cancels { - //lint:ignore SA9001 They do actually get ran - defer cancel() - - if len(cancels) == 0 { - t.Log("all VMs started") - break - } +func testDistro(t *testing.T, loginServer string, signer ssh.Signer, ipm ipMapping) { + t.Helper() + port := ipm.port + hostport := fmt.Sprintf("127.0.0.1:%d", port) + ccfg := &ssh.ClientConfig{ + User: "root", + Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)}, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), } - t.Run("wait-for-vms", func(t *testing.T) { - t.Log("waiting for VMs to register") - waiter := time.NewTicker(time.Second) - defer waiter.Stop() - n := 0 - for { - <-waiter.C - ipMu.Lock() - if len(ipMap) == numDistros { - ipMu.Unlock() - break - } else { - if n%30 == 0 { - t.Logf("ipMap: %d", len(ipMap)) - t.Logf("distros: %d", numDistros) - } - } - n++ - ipMu.Unlock() + // NOTE(Xe): This deadline loop helps to make things a bit faster, centos + // sometimes is slow at starting its sshd and will sometimes randomly kill + // SSH sessions on transition to multi-user.target. I don't know why they + // don't use socket activation. + const maxRetries = 5 + var working bool + for i := 0; i < maxRetries; i++ { + cli, err := ssh.Dial("tcp", hostport, ccfg) + if err == nil { + working = true + cli.Close() + break } - }) - ipMu.Lock() - defer ipMu.Unlock() - t.Run("join-net", func(t *testing.T) { - for _, ipm := range ipMap { - ipm := ipm - port := ipm.port - t.Run(ipm.name, func(t *testing.T) { - tstest.FixLogs(t) - t.Parallel() + time.Sleep(10 * time.Second) + } - hostport := fmt.Sprintf("127.0.0.1:%d", port) + if !working { + t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries) + } - // NOTE(Xe): This retry loop helps to make things a bit faster, centos sometimes is slow at starting its sshd. I don't know why they don't use socket activation. - const maxRetries = 5 - var working bool - for i := 0; i < maxRetries; i++ { - conn, err := net.Dial("tcp", hostport) - if err == nil { - working = true - conn.Close() - break - } + t.Logf("about to ssh into 127.0.0.1:%d", port) + cli, err := ssh.Dial("tcp", hostport, ccfg) + if err != nil { + t.Fatal(err) + } + copyBinaries(t, cli) - time.Sleep(5 * time.Second) - } + timeout := 5 * time.Minute - if !working { - t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries) - } + e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer())) + if err != nil { + t.Fatalf("%d: can't register a shell session: %v", port, err) + } + defer e.Close() - t.Logf("about to ssh into 127.0.0.1:%d", port) - cli, err := ssh.Dial("tcp", hostport, &ssh.ClientConfig{ - User: "root", - Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)}, - HostKeyCallback: ssh.InsecureIgnoreHostKey(), - }) - if err != nil { - t.Fatal(err) - } - copyBinaries(t, cli) + t.Log("opened session") - timeout := 5 * time.Minute - - e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer())) - if err != nil { - t.Fatalf("%d: can't register a shell session: %v", port, err) - } - defer e.Close() - - t.Log("opened session") - - _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) - if err != nil { - t.Fatalf("%d: can't get a shell: %v", port, err) - } - t.Logf("got shell for %d", port) - err = e.Send("systemctl start tailscaled.service\n") - if err != nil { - t.Fatalf("can't send command to start tailscaled: %v", err) - } - _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) - if err != nil { - t.Fatalf("%d: can't get a shell: %v", port, err) - } - err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer)) - if err != nil { - t.Fatalf("%d: can't send tailscale up command: %v", port, err) - } - _, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout) - if err != nil { - t.Fatalf("not successful: %v", err) - } - }) - } - }) - - if numNodes := cs.NumNodes(); numNodes != len(ipMap) { - t.Errorf("wanted %d nodes, got: %d", len(ipMap), numNodes) + _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) + if err != nil { + t.Fatalf("%d: can't get a shell: %v", port, err) + } + t.Logf("got shell for %d", port) + err = e.Send("systemctl start tailscaled.service\n") + if err != nil { + t.Fatalf("can't send command to start tailscaled: %v", err) + } + _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) + if err != nil { + t.Fatalf("%d: can't get a shell: %v", port, err) + } + err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer)) + if err != nil { + t.Fatalf("%d: can't send tailscale up command: %v", port, err) + } + _, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout) + if err != nil { + t.Fatalf("not successful: %v", err) } }