如何快速的随机从 map 中返回一个值

前言

本文翻译自 lukechampine.com/hackmap.htm… go 的 map 源码解析都会引用的一片文章。

第一部分：问题

map.Int(n)

func randSliceValue(xs []string) string {
	return xs[rand.Intn(len(xs))]
}
复制代码

O(1)range

一种方式是我们可以把 map 展开，然后复用切片中随机获取一个值的方法

func randMapKey(m map[string]int) string {
	mapKeys = make([]string, 0, len(m)) // pre-allocate exact size
	for key := range m {
		mapKeys = append(mapKeys, key)
	}
	return mapKeys[rand.Intn(len(mapKeys))]
}
复制代码

O(n)

rangerangerangemapiterinit

func randMapKey(m map[string]int) string {
	r := rand.Intn(len(m))
	for k := range m {
		if r == 0 {
			return k
		}
		r--
	}
	panic("unreachable")
}
复制代码

O(n)O(1)

第二部分：深入理解

unsafe[]byte[]byte

map

map.gohitermapiterinitmapinternextrangehitermapinitinitmapiternext

hmapnew(hiter)mapiterinitlen(m)mapiternext

mapiterinitmapiternexthashmap.goatomic.Or8interface{}

// runtime representation of an interface{}
type emptyInterface struct {
	typ unsafe.Pointer
	val unsafe.Pointer
}

func mapTypeAndValue(m interface{}) (*maptype, *hmap) {
	ei := (*emptyInterface)(unsafe.Pointer(&m))
	return (*maptype)(ei.typ), (*hmap)(ei.val)
}

func iterKey(it *hiter) interface{} {
	ei := emptyInterface{
		typ: unsafe.Pointer(it.t.key), // it.t is the maptype
		val: it.key,
	}
	return *(*interface{})(unsafe.Pointer(&ei))
}
复制代码

randMapKeyO(n)O(1)

func randMapKey(m interface{}) interface{} {
	// get map internals
	t, h := mapTypeAndValue(m)
	// initialize iterator
	it := new(hiter)
	mapiterinit(t, h, it)
	// advance iterator a random number of times
	r := rand.Intn(h.count) // h.count == len(m)
	for i := 0; i < r; i++ {
		mapiternext(it)
	}
	// return current iterator key as an interface{}
	return iterKey(it)
}
复制代码

unsafe

第三部分：可以的

实际上我们是可以花费常数时间(虽然这个常数时间可以有点大)复杂度来获取 map 的随机 key 的。但是呢，为了达到常数时间复杂度的地步，我们需要更深的理解 go 中的 map 是如何运作的。

hmaphmapunsafe.Pointertophash

const bucketCnt = 8 // number of cells per bucket

type bmap struct {
	tophash [bucketCnt]uint8
	// Followed by bucketCnt keys and then bucketCnt values.
	// NOTE: packing all the keys together and then all the values together makes the
	// code a bit more complicated than alternating key/value/key/value/... but it allows
	// us to eliminate padding which would be needed for, e.g., map[int64]int8.
	// Followed by an overflow pointer.
}
复制代码

map[string]intbmap

type bmap struct {
	tophash  [bucketCnt]uint8
	keys     [bucketCnt]string
	values   [bucketCnt]int
	overflow *bmap
}
复制代码

uintptrhashhash

// h is an hmap, t is a maptype
hash := t.key.alg.hash(key, uintptr(h.hash0))
bucketIndex := hash & (uintptr(1) << h.B - 1)
复制代码

tophashtophashtophashtophash

// calculate tophash
top := uint8(hash >> (unsafe.Sizeof(hash)*8 - 8))

// seek to offset of bucketIndex in h.buckets
b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucketIndex*uintptr(t.bucketsize)))

// iterate through the cells of b. If a tophash matches top, it means we've
// already inserted a value with this key, so overwrite it. Otherwise, store
// the key/value in the first empty cell.
for i := 0; i < bucketCnt; i++ {
	if b.tophash[i] == top {
		// overwrite the existing value
		// [ code omitted ]
		return
	} else if b.tophash[i] == 0 {
		// insert the new key/value
		// [ code omitted ]
		b.tophash[i] = top
		h.count++
		return
	}
}
复制代码

正如你期待的那样，为了获得 map 中的值，我们仅仅是重复了此过程。但是这个过程是返回 cell 中的数据而不是往 cell 中插入或者覆盖。

randMapKeyh.buckets

mapinterinit

[foo] [bar] [---] [---] [---] [---] [---] [---]
复制代码

foobarfoofoobar

幸运的是，我们有一个替代的方案：随机的获取一个索引，如果那个 cell 是空的，我们再次随机的获取一个索引即可。平均下来，我们没获取到一个非空的 cell 需要的次数为 k/n ，其中 k 是 cell 的个数，n 是非空的 cell 的个数。当然，算法必须保证随机值是均匀分布的，这样每个索引的取值都是等可能的。

randMapKeyadd

func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
	return unsafe.Pointer(uintptr(p) + x)
}

func cellKey(t *maptype, b *bmap, i int) interface{} {
	// dataOffset is where the cell data begins in a bmap.
	const dataOffset = unsafe.Offsetof(struct {
		tophash [bucketCnt]uint8
		cells   int64
	}{}.cells)

	k := add(unsafe.Pointer(b), dataOffset+uintptr(i)*uintptr(t.keysize))
	if t.indirectkey {
		// if the map's key type is too big, a pointer will be stored in
		// the map instead of the actual data. In that case, we need to
		// dereference the pointer.
		k = *(*unsafe.Pointer)(k)
	}

	ei := emptyInterface{
		typ: unsafe.Pointer(t.key),
		val: k,
	}
	return *(*interface{})(unsafe.Pointer(&ei))
}

func randMapKey(m interface{}) interface{} {
	// get map internals
	t, h := mapTypeAndValue(m)
	numBuckets := 1 << h.B

	// loop until we hit a valid cell
	for {
		// pick random indices
		bucketIndex := rand.Intn(numBuckets)
		cellIndex := rand.Intn(bucketCnt)

		// lookup cell
		b := (*bmap)(add(h.buckets, uintptr(bucketIndex)*uintptr(t.bucketsize)))
		if b.tophash[cellIndex] == 0 {
			// cell is empty; try again
			continue
		}
		return cellKey(t, b, cellIndex)
	}
}
复制代码

第四部分：不

bmapoverflowbmapbmapoverflow

randMapKey

           bucket0   bucket1   bucket2   bucket3
overflow0 [|||||||] [|||||||] [|||||||] [|||||||]
overflow1 [|||||||]           [|||||||]
overflow2                     [|||||||]
复制代码

之前，我们仅仅从第一行的四个 bucket 中进行选择。现在我们需要在 12 个 bucket 中进行选择，即使一些 bucket 并不存在。操作和之前的遇到空的 cell 相同，如果一个 bucket 并不存在，我们从新选择一个随机值再来一次就好了。

hmapoverflowoverflownil

func (b *bmap) overflow(t *maptype) *bmap {
	offset := uintptr(t.bucketsize)-unsafe.Sizeof(uintptr(0))
	return *(**bmap)(add(unsafe.Pointer(b), offset))
}

func maxOverflow(t *maptype, h *hmap) int {
	numBuckets := uintptr(1 << h.B)
	max := 0
	for i := uintptr(0); i < numBuckets; i++ {
		over := 0
		b := (*bmap)(add(h.buckets, i*uintptr(t.bucketsize)))
		for b = b.overflow(t); b != nil; over++ {
			b = b.overflow(t)
		}
		if over > max {
			max = over
		}
	}
	return max
}
复制代码

randMapKey

func randMapKey(m interface{}) interface{} {
	// get map internals
	t, h := mapTypeAndValue(m)
	numBuckets := 1 << h.B
	numOver := maxOverflow(t, h) + 1 // add 1 to account for "base" bucket

	// loop until we hit a valid cell
loop:
	for {
		// pick random indices
		bucketIndex := rand.Intn(numBuckets)
		overIndex := rand.Intn(numOver)
		cellIndex := rand.Intn(bucketCnt)

		// seek to index in h.buckets
		b := (*bmap)(add(h.buckets, uintptr(bucketIndex)*uintptr(t.bucketsize)))

		// seek to index in overflow chain
		for i := 0; i < overIndex; i++ {
			b = b.overflow(t)
			if b == nil {
				// invalid bucket; try again
				continue loop
			}
		}

		// lookup cell
		if b.tophash[cellIndex] == 0 {
			// cell is empty; try again
			continue loop
		}
		return cellKey(t, b, cellIndex)
	}
}
复制代码

我们可以确定对于含有 overflow 的 bucket 是可以运行的. 我在担心是否还有第五部分的内容

第五部分：当然还有第五部分

(这是最后一部分，我保证)

evacuatedh.oldbucketsnil

h.bucketsh.oldbuckets

maxOverflowh.bucketsh.oldbuckets

maxOverflow

func maxOverflow(t *maptype, h *hmap) int {
	numBuckets := uintptr(1 << h.B)
	max := 0
	for i := uintptr(0); i < numBuckets; i++ {
		over := 0
		b := (*bmap)(add(h.buckets, i*uintptr(t.bucketsize)))
		for b = b.overflow(t); b != nil; over++ {
			b = b.overflow(t)
		}
		if over > max {
			max = over
		}
	}

	// check oldbuckets too, if it exists
	if h.oldbuckets != nil {
		for i := uintptr(0); i < numBuckets/2; i++ {
			var over int
			b := (*bmap)(add(h.oldbuckets, i*uintptr(t.bucketsize)))
			if evacuated(b) {
				// we already counted this bucket in the first loop
				continue
			}
			for b = b.overflow(t); b != nil; over++ {
				b = b.overflow(t)
			}
			if over > max {
				max = over
			}
		}
	}
	return max
}
复制代码

randMapKey

func randMapKey(m interface{}) interface{} {
	// get map internals
	t, h := mapTypeAndValue(m)
	numBuckets := uintptr(1 << h.B)
	numOver := maxOverflow(t, h) + 1 // add 1 to account for "base" bucket

	// loop until we hit a valid cell
loop:
	for {
		// pick a random index
		bucketIndex := uintptr(rand.Intn(int(numBuckets)))
		overIndex := rand.Intn(numOver)
		cellIndex := rand.Intn(bucketCnt)

		// seek to index in h.buckets
		b := (*bmap)(add(h.buckets, bucketIndex*uintptr(t.bucketsize)))

		// if the oldbucket hasn't been evacuated, then we need to use that
		// pointer instead.
		usingOldBucket := false
		if h.oldbuckets != nil {
			numOldBuckets := numBuckets / 2
			oldBucketIndex := bucketIndex & (numOldBuckets - 1)
			oldB := (*bmap)(add(h.oldbuckets, oldBucketIndex*uintptr(t.bucketsize)))
			if !evacuated(oldB) {
				b = oldB
				usingOldBucket = true
			}
		}

		// seek to index in overflow chain
		for i := 0; i < overIndex; i++ {
			b = b.overflow(t)
			if b == nil {
				// invalid bucket; try again
				continue loop
			}
		}

		// lookup cell
		if b.tophash[cellIndex] == 0 {
			// cell is empty; try again
			continue loop
		}

		// grab key and dereference if necessary (same as cellKey)
		k := add(unsafe.Pointer(b), dataOffset+uintptr(cellIndex)*uintptr(t.keysize))
		if t.indirectkey {
			k = *(*unsafe.Pointer)(k)
		}

		// if this is an old bucket, we need to check whether this key is destined
		// for the new bucket. Otherwise, we will have a 2x bias towards oldbucket
		// values, since two different bucket selections can result in the same
		// oldbucket.
		if usingOldBucket {
			hash := t.key.alg.hash(k, uintptr(h.hash0))
			if hash&(numBuckets-1) != bucketIndex {
				// this key is destined for a different bucket
				continue loop
			}
		}

		// pack key into interface{} (same as cellKey)
		ei := emptyInterface{
			typ: unsafe.Pointer(t.key),
			val: k,
		}
		return *(*interface{})(unsafe.Pointer(&ei))
	}
}
复制代码

感觉并不坏，全部情况都考虑到了！如果你没看到最初的代码地址，那么看可以看这个完整的代码。