From 8a0d036a145909ab89099f6cfec5cebb223ca99d Mon Sep 17 00:00:00 2001 From: asahi Date: Sat, 11 Jan 2025 15:31:52 +0800 Subject: [PATCH] =?UTF-8?q?=E9=98=85=E8=AF=BBsync.Pool=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Golang/Golang Document.md | 398 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 398 insertions(+) diff --git a/Golang/Golang Document.md b/Golang/Golang Document.md index 661db2e..14b9a99 100644 --- a/Golang/Golang Document.md +++ b/Golang/Golang Document.md @@ -310,6 +310,404 @@ ok git.kazusa.red/asahi/fuzz-demo 10.360s > #### new interesting > `new interesting`指会扩充code coverage的用例输入,在fuzz test刚开始时,new interesting数量通常会因发现新的代码路径快速增加,然后,会随着时间的推移逐渐减少 +## Go Sync +### sync.Pool +`sync.Pool`为golang标准库中的实现,用于降低allocation和减少垃圾回收。 + +### sync.Pool使用示例 +golang中`sync.Pool`使用示例如下: +```go +package main + +import ( + "fmt" + "sync" +) + +type JobState int + +const ( + JobStateFresh JobState = iota + JobStateRunning + JobStateRecycled +) + +type Job struct { + state JobState +} + +func (j *Job) Run() { + switch j.state { + case JobStateRecycled: + fmt.Println("this job came from the pool") + case JobStateFresh: + fmt.Println("this job just got allocated") + } + + j.state = JobStateRunning +} + +func main() { + pool := &sync.Pool{ + New: func() any { + return &Job{state: JobStateFresh} + }, + } + + // get a job from the pool + job := pool.Get().(*Job) + + // run it + job.Run() + + // put it back in the pool + job.state = JobStateRecycled + pool.Put(job) +} +``` + +### Pool和垃圾回收 +`sync.Pool`对象实际是由两部分组成: +- `local pool` +- `victim pool` + +调用Pool中的方法时,实际行为如下: +- `Put`:调用Put方法时,会将对象添加到`local` +- `Get`:调用Get时,首先会从`local`中查找,如果`local`中未找到,那么会从`victim`中查找,如果victim中仍然不存在,那么则是会调用`New` + +在`sync.Pool`中,`local`被用作primary cache,victim则被用作victim cache。 + +#### poolCleanup +poolCleanUp方法实现如下: +```go +func poolCleanup() { + // This function is called with the world stopped, at the beginning of a 
garbage collection. + // It must not allocate and probably should not call any runtime functions. + + // Because the world is stopped, no pool user can be in a + // pinned section (in effect, this has all Ps pinned). + + // Drop victim caches from all pools. + for _, p := range oldPools { + p.victim = nil + p.victimSize = 0 + } + + // Move primary cache to victim cache. + for _, p := range allPools { + p.victim = p.local + p.victimSize = p.localSize + p.local = nil + p.localSize = 0 + } + + // The pools with non-empty primary caches now have non-empty + // victim caches and no pools have primary caches. + oldPools, allPools = allPools, nil +} + +var ( + allPoolsMu Mutex + + // allPools is the set of pools that have non-empty primary + // caches. Protected by either 1) allPoolsMu and pinning or 2) + // STW. + allPools []*Pool + + // oldPools is the set of pools that may have non-empty victim + // caches. Protected by STW. + oldPools []*Pool +) + +func init() { + runtime_registerPoolCleanup(poolCleanup) +} +``` +#### allPools & oldPools +所有被实例化的`sync.Pool`对象,`在发生变化时`,都会将其自身注册到`allPools`静态变量中。 + +其中,`allPools`引用了所有`local`(primary cache)不为空的pool实例,而`oldPools`则引用了所有`victim`(victim cache)不为空的pool实例。 + +在init方法中,将poolCleanup注册到了runtime,在STW的上下文中,poolCleanup将会`在垃圾回收之前`被runtime调用。 + +poolCleanup方法逻辑比较简单,具体如下: +- 将`victim`丢弃,并且将`local`转移到`victim`,最后将`local`置为空 +- 将静态变量中`allPools`的值转移到`oldPools`,并且将`allPools`的值置为空 + +这代表pool中的对象如果长期未被访问,那么将会从pool中被淘汰。 + +> `poolCleanup`方法在STW时会被调用,第一次STW时,未使用对象会从local移动到victim,而第二次STW,则是会从victim中被丢弃,之后被后续的垃圾回收清理。 + +### Proc Pinning +关于`sync.Pool`,其实际结构如下: +```go +type Pool struct { + noCopy noCopy + + local unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal + localSize uintptr // size of the local array + + victim unsafe.Pointer // local from previous cycle + victimSize uintptr // size of victims array + + // New optionally specifies a function to generate + // a value when Get would otherwise return nil. 
+ // It may not be changed concurrently with calls to Get. + New func() any +} +``` +#### per-P +关于调度的actor,其存在如下角色: +- goroutine:`G's` +- machines:`M's`代表系统线程 +- processor:`P's`代表处理器物理线程 + +其中,`goroutine`由操作系统线程执行,而操作系统线程在执行时需要获取实际的cpu物理线程。 + +在goroutine运行时,存在一些`safe-point`,在`safe-point`处,goroutine可以在`clean`状态被停止。故而,`抢占只能发生在safe-point`。 + +`proc pinning`会禁止抢占,在pinning后,P(物理线程)将会被独占,在`unpin`发生之前,goroutine会一直执行,并不会被停止,甚至不会被GC停止。`unpin之前,P无法被其他goroutine使用`。 + +一旦`pinned`后,execution flow在P上不会被中断,`这也意味着在访问threadlocal数据时无需加锁`。 + +如下是围绕Pinning的逻辑: + +```go +// pin pins the current goroutine to P, disables preemption and +// returns poolLocal pool for the P and the P's id. +// Caller must call runtime_procUnpin() when done with the pool. +func (p *Pool) pin() (*poolLocal, int) { + pid := runtime_procPin() + // In pinSlow we store to local and then to localSize, here we load in opposite order. + // Since we've disabled preemption, GC cannot happen in between. + // Thus here we must observe local at least as large localSize. + // We can observe a newer/larger local, it is fine (we must observe its zero-initialized-ness). 
+ s := runtime_LoadAcquintptr(&p.localSize) // load-acquire + l := p.local // load-consume + if uintptr(pid) < s { + return indexLocal(l, pid), pid + } + return p.pinSlow() +} + +func indexLocal(l unsafe.Pointer, i int) *poolLocal { + lp := unsafe.Pointer(uintptr(l) + uintptr(i)*unsafe.Sizeof(poolLocal{})) + return (*poolLocal)(lp) +} +``` + +#### local & localSize +- `local`:local是一个由`poolLocal`对象组成的`c-style`数组 +- `localSize`:localSize是`local`数组的大小 +- `poolLocal`: local数组中的每个poolLocal都关联一个给定的P +- `runtime_procPin`:该方法会返回`pin`所关联的processor id,processor id从0开始依次加1,直到`GOMAXPROCS - 1` + +分析上述`indexLocal`方法的逻辑,其根据processor id的值,计算了pid关联poolLocal对象地址的偏移量,并返回poolLocal对象的指针。这令我们可以并发安全地访问poolLocal对象而无需加锁,`只需要pinned并且直接访问threadlocal变量`。 + +#### PinSlow +`pinSlow`方法是针对`pin`的fallback方法,其代表我们针对local数组大小的假设是错误的,本次绑定的P其并没有对应的poolLocal。 + +代码进入到pinSlow有如下可能: +- `GOMAXPROCS`被更新过,从而有了额外可用的P +- 该pool对象是新创建的 + +pinSlow的代码如下: +```go +func (p *Pool) pinSlow() (*poolLocal, int) { + // Retry under the mutex. + // Can not lock the mutex while pinned. + runtime_procUnpin() + allPoolsMu.Lock() + defer allPoolsMu.Unlock() + pid := runtime_procPin() + // poolCleanup won't be called while we are pinned. + s := p.localSize + l := p.local + if uintptr(pid) < s { + return indexLocal(l, pid), pid + } + if p.local == nil { + allPools = append(allPools, p) + } + // If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one. 
+ size := runtime.GOMAXPROCS(0) + local := make([]poolLocal, size) + atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release + runtime_StoreReluintptr(&p.localSize, uintptr(size)) // store-release + return &local[pid], pid +} +``` +当处于`pinned`状态时,不能获取针对`allPools`变量的锁(`allPoolsMu`),否则有可能会导致死锁。 + +> 如果在处于pinned状态的情况下获取锁,那么此时锁可能被其他goroutine持有,而持有锁的goroutine可能正在等待我们释放P + +故而,在pinSlow中,首先`unpin`,然后获取锁,并且在获取锁之后重新进入`pin`状态 + +在重新进入pin状态并且获取到allPoolsMu的锁之后,首先会检测目前pid是否有关联的poolLocal对象,如果有,则直接返回,这通常在如下场景下发生: +- 在阻塞获取allPoolsMu锁时,其他goroutine已经为我们扩充了local数组的大小 +- 我们不再绑定在之前的P上了,我们可能绑定在另一个pid小于local数组大小的P上 + +如果目前pool对象其local数组为空,那么其会先将pool实例注册到allPools中;之后(无论local数组此前是否为空)都会执行如下逻辑: +- 创建一个新的poolLocal slice,slice大小和GOMAXPROCS相同,并将新创建slice的头一个元素地址存储到`p.local`中 +- 将slice大小存储在`p.localSize`中 + +### Pool Local +`poolLocal`结构如下: +```go +// Local per-P Pool appendix. +type poolLocalInternal struct { + private any // Can be used only by the respective P. + shared poolChain // Local P can pushHead/popHead; any P can popTail. +} + +type poolLocal struct { + poolLocalInternal + + // Prevents false sharing on widespread platforms with + // 128 mod (cache line size) = 0 . + pad [128 - unsafe.Sizeof(poolLocalInternal{})%128]byte +} +``` +> 对于poolLocalInternal中的poolChain,local P可以执行pushHead/popHead逻辑,而任何P都可以执行popTail逻辑 + +### pool的Put/Get +#### Put +其中,Put相关逻辑如下: +```go +// Put adds x to the pool. +func (p *Pool) Put(x any) { + if x == nil { + return + } + if race.Enabled { + if fastrandn(4) == 0 { + // Randomly drop x on floor. 
+ return + } + race.ReleaseMerge(poolRaceAddr(x)) + race.Disable() + } + l, _ := p.pin() + if l.private == nil { + l.private = x + } else { + l.shared.pushHead(x) + } + runtime_procUnpin() + if race.Enabled { + race.Enable() + } +} +``` +其核心逻辑如下: +- pin,并获取poolLocal +- 如果poolLocal中private为空,将item放到private中 +- 如果private不为空,将其放入shared中,LIFO +- 然后unpin + +#### Get +Get的相关逻辑如下: +```go +// Get selects an arbitrary item from the Pool, removes it from the +// Pool, and returns it to the caller. +// Get may choose to ignore the pool and treat it as empty. +// Callers should not assume any relation between values passed to Put and +// the values returned by Get. +// +// If Get would otherwise return nil and p.New is non-nil, Get returns +// the result of calling p.New. +func (p *Pool) Get() any { + if race.Enabled { + race.Disable() + } + l, pid := p.pin() + x := l.private + l.private = nil + if x == nil { + // Try to pop the head of the local shard. We prefer + // the head over the tail for temporal locality of + // reuse. + x, _ = l.shared.popHead() + if x == nil { + x = p.getSlow(pid) + } + } + runtime_procUnpin() + if race.Enabled { + race.Enable() + if x != nil { + race.Acquire(poolRaceAddr(x)) + } + } + if x == nil && p.New != nil { + x = p.New() + } + return x +} +``` +以下是pool的Get核心流程: +- pin, 并且获取poolLocal +- 将private清空,并且判断之前private是否有值,如果有值,将使用该值 +- 如果private之前没有值,那么对shared执行pop操作,LIFO,如果pop操作获取的值不为空,使用该值 +- 如果对shared执行LIFO pop操作的也为空,那么会执行slow path的getSlow方法 +- 如果在getSlow仍然未获取到值的情况下,会调用`New`方法来获取值 + +> #### LIFO +> 对于poolLocal的shared队列,其使用的是LIFO,最后添加到队列的元素会被最先弹出。这代表我们希望使用最新分配的对象,旧分配的对象会随着`STW`被逐渐淘汰。 + +##### slow path +在调用Get方法时,slow path仅当private和shared都为空时被触发,这代表当前threadlocal pool为空。 + +在触发slow path场景下,会尝试从其他P中窃取对象,如果在窃取仍然失败的场景下,才会去`victim`中进行查找。 + +Get方法中slow path实现如下: +```go +func (p *Pool) getSlow(pid int) any { + // See the comment in pin regarding ordering of the loads. 
+ size := runtime_LoadAcquintptr(&p.localSize) // load-acquire + locals := p.local // load-consume + // Try to steal one element from other procs. + for i := 0; i < int(size); i++ { + l := indexLocal(locals, (pid+i+1)%int(size)) + if x, _ := l.shared.popTail(); x != nil { + return x + } + } + + // Try the victim cache. We do this after attempting to steal + // from all primary caches because we want objects in the + // victim cache to age out if at all possible. + size = atomic.LoadUintptr(&p.victimSize) + if uintptr(pid) >= size { + return nil + } + locals = p.victim + l := indexLocal(locals, pid) + if x := l.private; x != nil { + l.private = nil + return x + } + for i := 0; i < int(size); i++ { + l := indexLocal(locals, (pid+i)%int(size)) + if x, _ := l.shared.popTail(); x != nil { + return x + } + } + + // Mark the victim cache as empty for future gets don't bother + // with it. + atomic.StoreUintptr(&p.victimSize, 0) + + return nil +} +``` +- 首先,会尝试对`pool.local`数组中所有的poolLocal对象都调用popTail方法,如果任一方法返回值不为空,那么将会使用该返回的值。`窃取操作会尝试窃取尾部的对象,这是最先被创建的对象`。 +- 如果在local中未能找到和窃取到对象,那么会从victim中进行查找 + - 首先,获取victim中当前pid对象的poolLocal对象,检查poolLocal对象private是否不为空,如果不为空,使用该值并将victim.private清空 + - 如果private为空,那么则对victim中所有P关联的poolLocal对象执行popTail操作,如果任何一个pop操作返回不为空,那么使用返回的对象 + - 如果所有victim中的poolLocal对象都返回为空,那么会将victim中`p.victimSize`标识为空,后续再次执行slow path时,如果感知到victimSize为空,那么便不会再次查找victim + + ## syntax ### iota `iota`关键字代表连续的整数变量,`0, 1, 2`,每当`const`关键字出现时,其重置为0