Commit 00e0fe4b authored by David Crawshaw's avatar David Crawshaw

misc/ios: retry loop to handle builder flakiness

After moving the darwin/arm builder to new hardware several new flaky
error messages appeared. This provided enough information to Google
to make it clear that iOS build systems have been flaky for many
years, and that is unlikely to change any time soon.

However, all of the pain of lldb and using a breakpoint early in
program initialization gives us an advantage: all install and
initialization flakiness appears to happen before the Go program ever
gets going. So if we see an error or we timeout before we reach our
breakpoint (before any test code has executed), we can assume it is
the fault of the builder and restart without risking hiding a flaky
Go test.

This code has successfully processed the last 8 builds. I am hopeful.

Change-Id: Ide24aaae4fa7bdab9d8f4432bb85d8f2256c7606
Reviewed-on: https://go-review.googlesource.com/8241Reviewed-by: 's avatarHyang-Ah Hana Kim <hyangah@gmail.com>
parent dd95244d
......@@ -26,6 +26,10 @@ import (
const debug = false
var errRetry = errors.New("failed to start test harness (retry attempted)")
var tmpdir string
func main() {
log.SetFlags(0)
log.SetPrefix("go_darwin_arm_exec: ")
......@@ -36,39 +40,39 @@ func main() {
log.Fatal("usage: go_darwin_arm_exec a.out")
}
if err := run(os.Args[1], os.Args[2:]); err != nil {
fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err)
os.Exit(1)
var err error
tmpdir, err = ioutil.TempDir("", "go_darwin_arm_exec_")
if err != nil {
log.Fatal(err)
}
}
func run(bin string, args []string) (err error) {
type waitPanic struct {
err error
}
defer func() {
if r := recover(); r != nil {
if w, ok := r.(waitPanic); ok {
err = w.err
return
}
panic(r)
// Approximately 1 in a 100 binaries fail to start. If it happens,
// try again. These failures happen for several reasons beyond
// our control, but all of them are safe to retry as they happen
// before lldb encounters the initial getwd breakpoint. As we
// know the tests haven't started, we are not hiding flaky tests
// with this retry.
for i := 0; i < 5; i++ {
if i > 0 {
fmt.Fprintln(os.Stderr, "start timeout, trying again")
}
err = run(os.Args[1], os.Args[2:])
if err == nil || err != errRetry {
break
}
}()
defer exec.Command("killall", "ios-deploy").Run() // cleanup
exec.Command("killall", "ios-deploy").Run()
tmpdir, err := ioutil.TempDir("", "go_darwin_arm_exec_")
if err != nil {
log.Fatal(err)
}
if !debug {
defer os.RemoveAll(tmpdir)
os.RemoveAll(tmpdir)
}
if err != nil {
fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err)
os.Exit(1)
}
}
func run(bin string, args []string) (err error) {
appdir := filepath.Join(tmpdir, "gotest.app")
os.RemoveAll(appdir)
if err := os.MkdirAll(appdir, 0755); err != nil {
return err
}
......@@ -109,9 +113,31 @@ func run(bin string, args []string) (err error) {
return fmt.Errorf("codesign: %v", err)
}
if err := os.Chdir(tmpdir); err != nil {
oldwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(appdir, "..")); err != nil {
return err
}
defer os.Chdir(oldwd)
type waitPanic struct {
err error
}
defer func() {
if r := recover(); r != nil {
if w, ok := r.(waitPanic); ok {
err = w.err
return
}
panic(r)
}
}()
defer exec.Command("killall", "ios-deploy").Run() // cleanup
exec.Command("killall", "ios-deploy").Run()
// ios-deploy invokes lldb to give us a shell session with the app.
cmd = exec.Command(
......@@ -175,11 +201,11 @@ func run(bin string, args []string) (err error) {
w.printBuf()
return fmt.Errorf("failed (stage %s): %v", stage, err)
case i := <-w.find(str, timeout):
if i >= 0 {
w.clearTo(i + len(str))
} else {
log.Printf("timed out on stage %s, continuing", stage)
if i < 0 {
log.Printf("timed out on stage %q, retrying", stage)
return errRetry
}
w.clearTo(i + len(str))
return nil
}
}
......@@ -192,7 +218,11 @@ func run(bin string, args []string) (err error) {
// Wait for installation and connection.
if err := waitFor("ios-deploy before run", "(lldb) connect\r\nProcess 0 connected\r\n", 0); err != nil {
return err
// Retry if we see a rare and longstanding ios-deploy bug.
// https://github.com/phonegap/ios-deploy/issues/11
// Assertion failed: (AMDeviceStartService(device, CFSTR("com.apple.debugserver"), &gdbfd, NULL) == 0)
log.Printf("%v, retrying", err)
return errRetry
}
// Script LLDB. Oh dear.
......@@ -205,9 +235,21 @@ func run(bin string, args []string) (err error) {
do(`breakpoint set -n getwd`) // in runtime/cgo/gcc_darwin_arm.go
fmt.Fprintln(lldb, `run`)
// Sometimes we don't see "reason = breakpoint", so we time out
// and try to continue.
if err := waitFor("br getwd", "stop reason = breakpoint", 10*time.Second); err != nil {
if err := waitFor("br getwd", "stop reason = breakpoint", 20*time.Second); err != nil {
// At this point we see several flaky errors from the iOS
// build infrastructure. The most common is never reaching
// the breakpoint, which we catch with a timeout. Very
// occasionally lldb can produce errors like:
//
// Breakpoint 1: no locations (pending).
// WARNING: Unable to resolve breakpoint to any actual locations.
//
// As no actual test code has been executed by this point,
// we treat all errors as recoverable.
if err != errRetry {
log.Printf("%v, retrying", err)
err = errRetry
}
return err
}
if err := waitFor("br getwd prompt", "(lldb)", 0); err != nil {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment