如何处理好Golang中的panic和recover

题记

Go 语言自发布以来，一直以高性能、高并发著称。因为标准库提供了 http 包，即使刚学不久的程序员，也能轻松写出 http 服务程序。

不过，任何语言都有它的坑。本文将围绕 panic 和 recover，介绍它们的实现原理以及使用中常见的坑。

初识 panic 和 recover

panic

panic 一词在英文中有恐慌、恐慌的等意思。在 Go 语言中，panic 主要用于主动抛出异常，类似 java 等语言中的 throw。

recover

recover 一词在英文中有恢复、复原等意思。在 Go 语言中，recover 主要用于捕获异常，让程序回到正常状态，类似 java 等语言中的 try ... catch。

C 语言中没有 try ... catch，也没有 panic 和 recover。异常处理与 if error then return 方式的差别，主要体现在函数调用栈的回溯深度上。

异常捕获可以理解为程序调用栈的长距离跳转。在 C 语言里，这是通过 setjmp 和 longjmp 这两个函数来实现的，例如以下代码：

#include <setjmp.h>
#include <stdio.h>

/* Jump buffer: filled in by setjmp() in main() and consumed by longjmp() in divide(). */
static jmp_buf env;

/*
 * Returns to / by.
 * When by is 0 it does not return to its caller: it calls longjmp(env, 1),
 * which resumes execution at the setjmp(env) call site with a result of 1.
 */
double divide(double to, double by)
{
    if(by == 0)
    {
        longjmp(env, 1);
    }
    return to / by;
}

/*
 * Calls divide(2, 0). Because divide() jumps away on a zero divisor,
 * the printf("done\n") below is never reached in this program.
 */
void test_divide()
{
    divide(2, 0);
    printf("done\n");
}

/*
 * Saves the jump target with setjmp() and runs test_divide().
 * The first pass through setjmp() yields 0 and takes the if-branch;
 * after longjmp(env, 1) the same setjmp() yields 1, so the else-branch
 * prints the error message and the program returns -1.
 */
int main()
{
    if (setjmp(env) == 0)
    {
        test_divide();
    }
    else
    {
        printf("Cannot / 0\n");
        return -1;
    }
    return 0;
}

由于发生了长距离跳转，程序直接从 divide 函数内跳转到 main 函数内，中断了正常的执行流，所以以上代码运行后将输出 Cannot / 0，而不会输出 done。

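try catch、recover、setjmp 等机制会将程序当前状态（主要是 cpu 的栈指针寄存器 sp 和程序计数器 pc；Go 的 recover 依赖 defer 来维护 sp 和 pc）保存到一块与 throw、panic、longjmp 共享的内存里。发生异常时，从该内存中取出之前保存的 sp 和 pc，将函数栈直接调回到 sp 指向的位置并从 pc 处继续执行，程序便从异常状态恢复到正常状态。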

深入 panic 和 recover

源码

panic 和 recover 的源码在 Go 源码的 src/runtime/panic.go 里，分别是名为 gopanic 和 gorecover 的函数。

// gopanic 的代码，在 src/runtime/panic.go 第 454 行

// gopanic is the implementation of the predeclared function panic.
// e is the value passed to panic. The function runs the goroutine's deferred
// calls one by one; if one of them recovers, control is handed to recovery and
// never comes back here, otherwise the panic is reported via fatalpanic.
func gopanic(e interface{}) {
    gp := getg()
    if gp.m.curg != gp {
        print("panic: ")
        printany(e)
        print("\n")
        throw("panic on system stack")
    }

    if gp.m.mallocing != 0 {
        print("panic: ")
        printany(e)
        print("\n")
        throw("panic during malloc")
    }
    if gp.m.preemptoff != "" {
        print("panic: ")
        printany(e)
        print("\n")
        print("preempt off reason: ")
        print(gp.m.preemptoff)
        print("\n")
        throw("panic during preemptoff")
    }
    if gp.m.locks != 0 {
        print("panic: ")
        printany(e)
        print("\n")
        throw("panic holding locks")
    }

    // Push a new _panic record carrying e onto the front of gp's panic list.
    var p _panic
    p.arg = e
    p.link = gp._panic
    gp._panic = (*_panic)(noescape(unsafe.Pointer(&p)))

    atomic.Xadd(&runningPanicDefers, 1)

    for {
        d := gp._defer
        if d == nil {
            break
        }

        // If the defer was started by an earlier panic or Goexit (and, since we are back here, that triggered a new panic), take the defer off the list. The earlier panic or Goexit will not continue running.
        if d.started {
            if d._panic != nil {
                d._panic.aborted = true
            }
            d._panic = nil
            d.fn = nil
            gp._defer = d.link
            freedefer(d)
            continue
        }

        // Mark the defer as started, but keep it on the list, so that traceback can find and update the defer's argument frame if stack growth or a garbage collection happens before reflectcall starts executing d.fn.
        d.started = true

        // Record the panic that is running the defer. If a new panic is raised inside this panic's deferred function, the new panic will find d in the list and mark d._panic as aborted.
        d._panic = (*_panic)(noescape(unsafe.Pointer(&p)))

        p.argp = unsafe.Pointer(getargp(0))
        reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
        p.argp = nil

        // reflectcall did not panic. Remove d.
        if gp._defer != d {
            throw("bad defer entry in panic")
        }
        d._panic = nil
        d.fn = nil
        gp._defer = d.link

        // GC() would trigger stack shrinkage here to test stack copying. It is test-only code, so it is commented out. See stack_test.go:TestStackPanic
        //GC()

        pc := d.pc
        sp := unsafe.Pointer(d.sp) // must be a pointer so it gets adjusted during stack copy
        // The defer record's memory is dynamically allocated and must be freed once the defer has run. So if defers never get to run (for example, defers created endlessly inside an infinite loop), memory will leak.
        freedefer(d)
        if p.recovered {
            atomic.Xadd(&runningPanicDefers, -1)

            gp._panic = p.link
            // Aborted panics are marked but remain on the g.panic list. Remove them from the list.
            for gp._panic != nil && gp._panic.aborted {
                gp._panic = gp._panic.link
            }
            if gp._panic == nil { // must be done with signal
                gp.sig = 0
            }
            // Pass information about the recovering frame to recovery.
            gp.sigcode0 = uintptr(sp)
            gp.sigcode1 = pc
            mcall(recovery)
            throw("recovery failed") // mcall should not return
        }
    }

    // Once every defer has been walked, nothing recovered (as noted above, mcall running recovery does not return), so continue with the rest of the panic flow, e.g. printing the call stack and the error message.
    // Because it is unsafe to call arbitrary user code after freezing the world, preprintpanics is called to invoke all necessary Error and String methods and prepare the panic strings before startpanic.
    preprintpanics(gp._panic)

    fatalpanic(gp._panic) // should not return
    *(*int)(nil) = 0      // fatalpanic should not return, so this is normally never executed. If it is reached, this write through a nil pointer faults.
}

// gorecover 的代码，在 src/runtime/panic.go 第 585 行

// gorecover is the implementation of the predeclared function recover.
// It cannot split the stack because it needs to reliably find the stack segment of its caller.
// It returns the panic argument when the call is allowed to recover, and nil otherwise.
//
// TODO(rsc): Once we commit to CopyStackAlways,
// this doesn't need to be nosplit.
//go:nosplit
func gorecover(argp uintptr) interface{} {
    // While a panic is being handled, recover must be called from the topmost deferred function.
    // p.argp is the argument pointer of the topmost deferred function call; compare it with the argp passed by the caller, and if they match, that caller is the one who can recover.
    gp := getg()
    p := gp._panic
    if p != nil && !p.recovered && argp == uintptr(p.argp) {
        // Only flag the panic here; gopanic checks p.recovered after the deferred call returns.
        p.recovered = true
        return p.arg
    }
    return nil
}

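从函数代码中可以看到，panic 内部的主要流程是这样：

1. 获取当前调用者所在的 g，也就是 goroutine；
2. 遍历并执行 g 中的 defer 函数；
3. 如果 defer 函数中调用了 recover，并且发现已经发生了 panic，则将该 panic 标记为 recovered；
4. 在遍历 defer 的过程中，如果发现 panic 已经被标记为 recovered，则提取出该 defer 的 sp 与 pc，保存在 g 的两个状态码字段中；
5. 调用 runtime.mcall 切换到 m->g0 并跳转到 recovery 函数，将前面获取的 g 作为参数传给 recovery 函数。

runtime.mcall 的代码在 Go 源码的 src/runtime/asm_xxx.s 中，xxx 是平台类型，如 amd64。代码如下：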

// src/runtime/asm_amd64.s 第 274 行

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
    MOVQ    fn+0(FP), DI

    get_tls(CX)
    MOVQ    g(CX), AX   // save state in g->sched
    MOVQ    0(SP), BX   // caller's PC
    MOVQ    BX, (g_sched+gobuf_pc)(AX)
    LEAQ    fn+0(FP), BX    // caller's SP
    MOVQ    BX, (g_sched+gobuf_sp)(AX)
    MOVQ    AX, (g_sched+gobuf_g)(AX)
    MOVQ    BP, (g_sched+gobuf_bp)(AX)

    // switch to m->g0 & its stack, call fn
    MOVQ    g(CX), BX
    MOVQ    g_m(BX), BX
    MOVQ    m_g0(BX), SI
    CMPQ    SI, AX  // if g == m->g0 call badmcall
    JNE 3(PC)       // g != m->g0: skip the two badmcall instructions below
    MOVQ    $runtime·badmcall(SB), AX
    JMP AX
    MOVQ    SI, g(CX)   // g = m->g0
    MOVQ    (g_sched+gobuf_sp)(SI), SP  // sp = m->g0->sched.sp
    PUSHQ   AX          // AX still holds the original g; it is the argument for fn(g)
    MOVQ    DI, DX      // NOTE(review): presumably DX carries the func value as closure context — confirm against the Go amd64 ABI
    MOVQ    0(DI), DI   // load the first word of the func value into DI, which is the target of the CALL below
    CALL    DI
    POPQ    AX
    // Reached only if fn returned, which the contract above forbids: jump to badmcall2.
    MOVQ    $runtime·badmcall2(SB), AX
    JMP AX
    RET

这里之所以要切换到 m->g0，是因为 Go 的 runtime 有自己的堆栈，独立于普通的 goroutine；recovery 是在 runtime 环境下执行的，所以要先切换到 m->g0 再执行 recovery 函数。

recovery 函数利用 g 中保存的两个状态码恢复栈指针 sp 和程序计数器 pc，并调用 gogo 重新调度 g，让 g 从调用了 recover 的延迟函数返回之后的位置继续执行。代码如下：

  // recovery 的代码，在 src/runtime/panic.go 第 637 行

// recovery unwinds the stack after a deferred function calls recover after a panic,
// and then continues execution as though the caller of the deferred function had returned normally.
// gp is the goroutine being recovered; the target sp and pc are read from gp.sigcode0 and gp.sigcode1.
  func recovery(gp *g) {
      // Info about defer passed in G struct.
      sp := gp.sigcode0
      pc := gp.sigcode1

      // The deferred function's arguments must already be on the stack (this is enforced here by checking that sp lies within the goroutine's stack address range).
      if sp != 0 && (sp < gp.stack.lo || gp.stack.hi < sp) {
          print("recover: ", hex(sp), " not in [", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n")
          throw("bad recovery")
      }

  // Make the deferproc of the deferred function return again, this time returning 1. The calling function will jump to the standard return epilogue.
      gp.sched.sp = sp
      gp.sched.pc = pc
      gp.sched.lr = 0
      gp.sched.ret = 1
      gogo(&gp.sched)
  }

// gogo 的代码，同样在 src/runtime/asm_amd64.s 中（第 274 行是上文 mcall 的位置，gogo 的行号请以对应版本的源码为准）

// func gogo(buf *gobuf)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
    MOVQ    buf+0(FP), BX       // gobuf
    MOVQ    gobuf_g(BX), DX
    MOVQ    0(DX), CX       // make sure g != nil
    get_tls(CX)
    MOVQ    DX, g(CX)
    MOVQ    gobuf_sp(BX), SP    // restore SP from the gobuf, ready for the jump below
    MOVQ    gobuf_ret(BX), AX
    MOVQ    gobuf_ctxt(BX), DX
    MOVQ    gobuf_bp(BX), BP
    MOVQ    $0, gobuf_sp(BX)    // from here on, clear the gobuf fields to help the garbage collector
    MOVQ    $0, gobuf_ret(BX)
    MOVQ    $0, gobuf_ctxt(BX)
    MOVQ    $0, gobuf_bp(BX)
    MOVQ    gobuf_pc(BX), BX    // load the saved pc from the gobuf as the jump target
    JMP BX

以上便是 Go 底层处理异常的流程，精简为三步便是：

1. 在 defer 函数中调用 recover；
2. 触发 panic 后切换到 runtime 环境，取得在 defer 中调用了 recover 的 g 的 sp 和 pc；
3. 恢复到 defer 中 recover 之后的处理逻辑继续执行。

都有哪些坑

前面提到，panic 主要用于主动触发异常。在业务代码中，如果程序启动阶段资源初始化出错，可以主动调用 panic 立即结束程序。

但是，Go 的 runtime 代码中也有很多地方会触发 panic，对于不了解底层实现的新手来说，这些都是容易踩到的坑。

接下来，笔者给大家细数下都有哪些坑。

数组( slice )下标越界
这个比较好理解，对于静态类型语言，数组下标越界是致命错误。如下代码可以验证：

package main

import (
    "fmt"
)

func foo(){ // reads past the end of a one-element slice; the deferred closure recovers the resulting panic
    defer func(){
        if err := recover(); err != nil { // err is non-nil only when foo is unwinding because of a panic
            fmt.Println(err)
        }
    }()
    var bar = []int{1}
    fmt.Println(bar[1]) // index 1 is out of range for a slice of length 1
}

func main(){
    foo()
    fmt.Println("exit") // reached only if foo returned, i.e. its panic was recovered
}

输出：

runtime error: index out of range
exit

由于代码中做了 recover，程序在输出错误信息后继续执行，并输出了 exit。

如果注释掉 recover ，将输出：

panic: runtime error: index out of range

goroutine 1 [running]:
main.foo()
    /home/letian/work/go/src/test/test.go:14 +0x3e
main.main()
    /home/letian/work/go/src/test/test.go:18 +0x22
exit status 2

访问未初始化的指针或 nil 指针
对于有 c/c++ 开发经验的人来说，这个很好理解。但对于没用过指针的新手来说，这是最常见的一类错误。
如下代码可以验证：

package main

import (
    "fmt"
)

func foo(){ // dereferences a nil *int; the deferred closure recovers the resulting panic
    defer func(){
        if err := recover(); err != nil { // err is non-nil only when foo is unwinding because of a panic
            fmt.Println(err)
        }
    }()
    var bar *int // zero value of a pointer type is nil; it is never assigned
    fmt.Println(*bar) // dereference of the nil pointer declared above
}

func main(){
    foo()
    fmt.Println("exit") // reached only if foo returned, i.e. its panic was recovered
}

输出：

runtime error: invalid memory address or nil pointer dereference
exit

如果注释掉 recover ，将输出：

panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x4869ff]

goroutine 1 [running]:
main.foo()
    /home/letian/work/go/src/test/test.go:14 +0x3f
main.main()
    /home/letian/work/go/src/test/test.go:18 +0x22
exit status 2

往已经关闭的 chan 中发送数据
这也是刚开始使用 chan 的新手容易犯的错误。如下代码可以验证：

package main

import (
    "fmt"
)

func foo(){ // sends on a channel after closing it; the deferred closure recovers the resulting panic
    defer func(){
        if err := recover(); err != nil { // err is non-nil only when foo is unwinding because of a panic
            fmt.Println(err)
        }
    }()
    var bar = make(chan int, 1)
    close(bar)
    bar<-1 // send on the channel that was closed on the previous line
}

func main(){
    foo()
    fmt.Println("exit") // reached only if foo returned, i.e. its panic was recovered
}

输出：

send on closed channel
exit

如果注释掉 recover ，将输出：

panic: send on closed channel

goroutine 1 [running]:
main.foo()
    /home/letian/work/go/src/test/test.go:15 +0x83
main.main()
    /home/letian/work/go/src/test/test.go:19 +0x22
exit status 2

源码处理逻辑在 src/runtime/chan.go 的 chansend 函数中，如下：

// src/runtime/chan.go 第 269 行

// chansend sends the value pointed to by ep on channel c and reports whether the send happened.
// If block is false, the protocol will not sleep but returns false when it cannot complete.
// A sleeping sender can be woken with g.param == nil when the channel involved in the sleep has been closed.
// It is easiest to loop and re-run the operation; we will then see that the channel is now closed.
// callerpc is only used in the raceenabled branch below.
func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
    if c == nil {
        if !block {
            return false
        }
        gopark(nil, nil, waitReasonChanSendNilChan, traceEvGoStop, 2)
        throw("unreachable")
    }

    if debugChan {
        print("chansend: chan=", c, "\n")
    }

    if raceenabled {
        racereadpc(c.raceaddr(), callerpc, funcPC(chansend))
    }

    // Fast path: check for failed non-blocking operation without acquiring the lock.
    //
    // After observing that the channel is not closed, we observe that the channel is
    // not ready for sending. Each of these observations is a single word-sized read
    // (first c.closed and second c.recvq.first or c.qcount depending on kind of channel).
    // Because a closed channel cannot transition from 'ready for sending' to
    // 'not ready for sending', even if the channel is closed between the two observations,
    // they imply a moment between the two when the channel was both not yet closed
    // and not ready for sending. We behave as if we observed the channel at that moment,
    // and report that the send cannot proceed.
    //
    // It is okay if the reads are reordered here: if we observe that the channel is not
    // ready for sending and then observe that it is not closed, that implies that the
    // channel wasn't closed during the first observation.
    if !block && c.closed == 0 && ((c.dataqsiz == 0 && c.recvq.first == nil) ||
        (c.dataqsiz > 0 && c.qcount == c.dataqsiz)) {
        return false
    }

    var t0 int64
    if blockprofilerate > 0 {
        t0 = cputicks()
    }

    lock(&c.lock)

    // The channel is already closed: release the lock first, then raise the "send on closed channel" panic.
    if c.closed != 0 {
        unlock(&c.lock)
        panic(plainError("send on closed channel"))
    }

    if sg := c.recvq.dequeue(); sg != nil {
        // Found a waiting receiver. We pass the value we want to send
        // directly to the receiver, bypassing the channel buffer (if any).
        send(c, sg, ep, func() { unlock(&c.lock) }, 3)
        return true
    }

    if c.qcount < c.dataqsiz {
        // Space is available in the channel buffer. Enqueue the element to send.
        qp := chanbuf(c, c.sendx)
        if raceenabled {
            raceacquire(qp)
            racerelease(qp)
        }
        typedmemmove(c.elemtype, qp, ep)
        c.sendx++
        if c.sendx == c.dataqsiz {
            c.sendx = 0
        }
        c.qcount++
        unlock(&c.lock)
        return true
    }

    if !block {
        unlock(&c.lock)
        return false
    }

    // Block on the channel. Some receiver will complete our operation for us.
    gp := getg()
    mysg := acquireSudog()
    mysg.releasetime = 0
    if t0 != 0 {
        mysg.releasetime = -1
    }
    // No stack splits between assigning elem and enqueuing mysg
    // on gp.waiting where copystack can find it.
    mysg.elem = ep
    mysg.waitlink = nil
    mysg.g = gp
    mysg.isSelect = false
    mysg.c = c
    gp.waiting = mysg
    gp.param = nil
    c.sendq.enqueue(mysg)
    goparkunlock(&c.lock, waitReasonChanSend, traceEvGoBlockSend, 3)
    // Ensure the value being sent is kept alive until the
    // receiver copies it out. The sudog has a pointer to the
    // stack object, but sudogs aren't considered as roots of the
    // stack tracer.
    KeepAlive(ep)

    // someone woke us up.
    if mysg != gp.waiting {
        throw("G waiting list is corrupted")
    }
    gp.waiting = nil
    // A nil gp.param after wakeup means the wakeup came from a close (see the function comment):
    // anything else with the channel still open is treated as a fatal spurious wakeup.
    if gp.param == nil {
        if c.closed == 0 {
            throw("chansend: spurious wakeup")
        }
        panic(plainError("send on closed channel"))
    }
    gp.param = nil
    if mysg.releasetime > 0 {
        blockevent(mysg.releasetime-t0, 2)
    }
    mysg.c = nil
    releaseSudog(mysg)
    return true
}

并发读写相同 map

对于刚学并发编程的同学来说，并发读写 map 也是很容易遇到的问题。如下代码可以验证：

  package main

  import (
      "fmt"
  )

  func foo(){ // writes to a map in a loop while a second goroutine reads the same map without synchronization
      defer func(){
          if err := recover(); err != nil { // only a panic raised in foo's own goroutine could be seen here
              fmt.Println(err)
          }
      }()
      var bar = make(map[int]int)
      go func(){
          defer func(){
              if err := recover(); err != nil { // same guard for the reader goroutine
                  fmt.Println(err)
              }
          }()
          for{
              _ = bar[1] // unsynchronized read, racing with the write loop below
          }
      }()
      for{
          bar[1]=1 // unsynchronized write, racing with the read loop above
      }
  }

  func main(){
      foo()
      fmt.Println("exit") // foo loops forever, so this line is reached only if foo stops via a recovered panic
  }

输出：

fatal error: concurrent map read and map write

  goroutine 5 [running]:
  runtime.throw(0x4bd8b0, 0x21)
      /home/letian/.gvm/gos/go1.12/src/runtime/panic.go:617 +0x72 fp=0xc00004c780 sp=0xc00004c750 pc=0x427f22
  runtime.mapaccess1_fast64(0x49eaa0, 0xc000088180, 0x1, 0xc0000260d8)
      /home/letian/.gvm/gos/go1.12/src/runtime/map_fast64.go:21 +0x1a8 fp=0xc00004c7a8 sp=0xc00004c780 pc=0x40eb58
  main.foo.func2(0xc000088180)
      /home/letian/work/go/src/test/test.go:21 +0x5c fp=0xc00004c7d8 sp=0xc00004c7a8 pc=0x48708c
  runtime.goexit()
      /home/letian/.gvm/gos/go1.12/src/runtime/asm_amd64.s:1337 +0x1 fp=0xc00004c7e0 sp=0xc00004c7d8 pc=0x450e51
  created by main.foo
      /home/letian/work/go/src/test/test.go:14 +0x68

  goroutine 1 [runnable]:
  main.foo()
      /home/letian/work/go/src/test/test.go:25 +0x8b
  main.main()
      /home/letian/work/go/src/test/test.go:30 +0x22
  exit status 2

从输出中可以看到，程序并没有输出末尾的 exit，而是直接打印调用栈后退出了。在 src/runtime/map.go 中可以找到如下代码：

  if h.flags&hashWriting != 0 {
      throw("concurrent map read and map write")
  }

与前面几种情况不同，runtime 中通过 throw 抛出的致命错误无法在业务代码中用 recover 捕获。所以，对存在并发读写的 map 必须加锁保护。

类型断言
在对 interface 进行类型断言时，如果 interface 的实际类型与断言的目标类型不一致，也会触发 panic。如下代码可以验证：

package main

import (
    "fmt"
)

func foo(){ // asserts a string-valued interface to []string; the deferred closure recovers the resulting panic
    defer func(){
        if err := recover(); err != nil { // err is non-nil only when foo is unwinding because of a panic
            fmt.Println(err)
        }
    }()
    var i interface{} = "abc"
    _ = i.([]string) // single-result assertion: the dynamic type is string, not []string
}

func main(){
    foo()
    fmt.Println("exit") // reached only if foo returned, i.e. its panic was recovered
}

输出：

interface conversion: interface {} is string, not []string
exit

源码在 src/runtime/iface.go 中，如下两个函数：

// panicdottypeE is called when doing an e.(T) conversion and the conversion fails.
// have = the dynamic type we have.
// want = the static type we're trying to convert to.
// iface = the static type we're converting from.
func panicdottypeE(have, want, iface *_type) {
    // The panic value is a *TypeAssertionError built from the three types; its last field is left as the empty string.
    panic(&TypeAssertionError{iface, have, want, ""})
}

// panicdottypeI is called when doing an i.(T) conversion and the conversion fails.
// Same args as panicdottypeE, but "have" is the dynamic itab we have.
func panicdottypeI(have *itab, want, iface *_type) {
    // have may be nil; in that case t stays nil and is passed on as the dynamic type.
    var t *_type
    if have != nil {
        t = have._type
    }
    panicdottypeE(t, want, iface)
}

下回预告

Go语言踩坑记之channel与goroutine

如何处理好Golang中的panic和recover

题记

初识 panic 和 recover

深入 panic 和 recover

源码

都有哪些坑

数组( slice )下标越界

访问未初始化的指针或 nil 指针

并发读写相同 map

更多的 panic

下回预告

推荐文章