前言
本文翻译自 lukechampine.com/hackmap.htm… go 的 map 源码解析都会引用的一片文章。
第一部分:问题
map.Int(n)
func randSliceValue(xs []string) string {
return xs[rand.Intn(len(xs))]
}
复制代码
O(1)range
一种方式是我们可以把 map 展开,然后复用切片中随机获取一个值的方法
func randMapKey(m map[string]int) string {
mapKeys = make([]string, 0, len(m)) // pre-allocate exact size
for key := range m {
mapKeys = append(mapKeys, key)
}
return mapKeys[rand.Intn(len(mapKeys))]
}
复制代码
O(n)
rangerangerangemapiterinit
func randMapKey(m map[string]int) string {
r := rand.Intn(len(m))
for k := range m {
if r == 0 {
return k
}
r--
}
panic("unreachable")
}
复制代码
O(n)O(1)
第二部分:深入理解
unsafe[]byte[]byte
map
map.gohitermapiterinitmapinternextrangehitermapinitinitmapiternext
hmapnew(hiter)mapiterinitlen(m)mapiternext
mapiterinitmapiternexthashmap.goatomic.Or8interface{}
// runtime representation of an interface{}
type emptyInterface struct {
typ unsafe.Pointer
val unsafe.Pointer
}
func mapTypeAndValue(m interface{}) (*maptype, *hmap) {
ei := (*emptyInterface)(unsafe.Pointer(&m))
return (*maptype)(ei.typ), (*hmap)(ei.val)
}
func iterKey(it *hiter) interface{} {
ei := emptyInterface{
typ: unsafe.Pointer(it.t.key), // it.t is the maptype
val: it.key,
}
return *(*interface{})(unsafe.Pointer(&ei))
}
复制代码
randMapKeyO(n)O(1)
func randMapKey(m interface{}) interface{} {
// get map internals
t, h := mapTypeAndValue(m)
// initialize iterator
it := new(hiter)
mapiterinit(t, h, it)
// advance iterator a random number of times
r := rand.Intn(h.count) // h.count == len(m)
for i := 0; i < r; i++ {
mapiternext(it)
}
// return current iterator key as an interface{}
return iterKey(it)
}
复制代码
unsafe
第三部分:可以的
实际上我们是可以花费常数时间(虽然这个常数时间可以有点大)复杂度来获取 map 的随机 key 的。但是呢,为了达到常数时间复杂度的地步,我们需要更深的理解 go 中的 map 是如何运作的。
hmaphmapunsafe.Pointertophash
const bucketCnt = 8 // number of cells per bucket
type bmap struct {
tophash [bucketCnt]uint8
// Followed by bucketCnt keys and then bucketCnt values.
// NOTE: packing all the keys together and then all the values together makes the
// code a bit more complicated than alternating key/value/key/value/... but it allows
// us to eliminate padding which would be needed for, e.g., map[int64]int8.
// Followed by an overflow pointer.
}
复制代码
map[string]intbmap
type bmap struct {
tophash [bucketCnt]uint8
keys [bucketCnt]string
values [bucketCnt]int
overflow *bmap
}
复制代码
uintptrhashhash
// h is an hmap, t is a maptype
hash := t.key.alg.hash(key, uintptr(h.hash0))
bucketIndex := hash & (uintptr(1) << h.B - 1)
复制代码
tophashtophashtophashtophash
// calculate tophash
top := uint8(hash >> (unsafe.Sizeof(hash)*8 - 8))
// seek to offset of bucketIndex in h.buckets
b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucketIndex*uintptr(t.bucketsize)))
// iterate through the cells of b. If a tophash matches top, it means we've
// already inserted a value with this key, so overwrite it. Otherwise, store
// the key/value in the first empty cell.
for i := 0; i < bucketCnt; i++ {
if b.tophash[i] == top {
// overwrite the existing value
// [ code omitted ]
return
} else if b.tophash[i] == 0 {
// insert the new key/value
// [ code omitted ]
b.tophash[i] = top
h.count++
return
}
}
复制代码
正如你期待的那样,为了获得 map 中的值,我们仅仅是重复了此过程。但是这个过程是返回 cell 中的数据而不是往 cell 中插入或者覆盖。
randMapKeyh.buckets
mapinterinit
[foo] [bar] [---] [---] [---] [---] [---] [---]
复制代码
foobarfoofoobar
幸运的是,我们有一个替代的方案:随机的获取一个索引,如果那个 cell 是空的,我们再次随机的获取一个索引即可。平均下来,我们没获取到一个非空的 cell 需要的次数为 k/n ,其中 k 是 cell 的个数,n 是非空的 cell 的个数。当然,算法必须保证随机值是均匀分布的,这样每个索引的取值都是等可能的。
randMapKeyadd
func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
return unsafe.Pointer(uintptr(p) + x)
}
func cellKey(t *maptype, b *bmap, i int) interface{} {
// dataOffset is where the cell data begins in a bmap.
const dataOffset = unsafe.Offsetof(struct {
tophash [bucketCnt]uint8
cells int64
}{}.cells)
k := add(unsafe.Pointer(b), dataOffset+uintptr(i)*uintptr(t.keysize))
if t.indirectkey {
// if the map's key type is too big, a pointer will be stored in
// the map instead of the actual data. In that case, we need to
// dereference the pointer.
k = *(*unsafe.Pointer)(k)
}
ei := emptyInterface{
typ: unsafe.Pointer(t.key),
val: k,
}
return *(*interface{})(unsafe.Pointer(&ei))
}
func randMapKey(m interface{}) interface{} {
// get map internals
t, h := mapTypeAndValue(m)
numBuckets := 1 << h.B
// loop until we hit a valid cell
for {
// pick random indices
bucketIndex := rand.Intn(numBuckets)
cellIndex := rand.Intn(bucketCnt)
// lookup cell
b := (*bmap)(add(h.buckets, uintptr(bucketIndex)*uintptr(t.bucketsize)))
if b.tophash[cellIndex] == 0 {
// cell is empty; try again
continue
}
return cellKey(t, b, cellIndex)
}
}
复制代码
第四部分:不
bmapoverflowbmapbmapoverflow
randMapKey
bucket0 bucket1 bucket2 bucket3
overflow0 [|||||||] [|||||||] [|||||||] [|||||||]
overflow1 [|||||||] [|||||||]
overflow2 [|||||||]
复制代码
之前,我们仅仅从第一行的四个 bucket 中进行选择。现在我们需要在 12 个 bucket 中进行选择,即使一些 bucket 并不存在。操作和之前的遇到空的 cell 相同,如果一个 bucket 并不存在,我们从新选择一个随机值再来一次就好了。
hmapoverflowoverflownil
func (b *bmap) overflow(t *maptype) *bmap {
offset := uintptr(t.bucketsize)-unsafe.Sizeof(uintptr(0))
return *(**bmap)(add(unsafe.Pointer(b), offset))
}
func maxOverflow(t *maptype, h *hmap) int {
numBuckets := uintptr(1 << h.B)
max := 0
for i := uintptr(0); i < numBuckets; i++ {
over := 0
b := (*bmap)(add(h.buckets, i*uintptr(t.bucketsize)))
for b = b.overflow(t); b != nil; over++ {
b = b.overflow(t)
}
if over > max {
max = over
}
}
return max
}
复制代码
randMapKey
func randMapKey(m interface{}) interface{} {
// get map internals
t, h := mapTypeAndValue(m)
numBuckets := 1 << h.B
numOver := maxOverflow(t, h) + 1 // add 1 to account for "base" bucket
// loop until we hit a valid cell
loop:
for {
// pick random indices
bucketIndex := rand.Intn(numBuckets)
overIndex := rand.Intn(numOver)
cellIndex := rand.Intn(bucketCnt)
// seek to index in h.buckets
b := (*bmap)(add(h.buckets, uintptr(bucketIndex)*uintptr(t.bucketsize)))
// seek to index in overflow chain
for i := 0; i < overIndex; i++ {
b = b.overflow(t)
if b == nil {
// invalid bucket; try again
continue loop
}
}
// lookup cell
if b.tophash[cellIndex] == 0 {
// cell is empty; try again
continue loop
}
return cellKey(t, b, cellIndex)
}
}
复制代码
我们可以确定 对于含有 overflow 的 bucket 是可以运行的. 我在担心是否还有第五部分的内容
第五部分:当然还有第五部分
(这是最后一部分,我保证)
evacuatedh.oldbucketsnil
h.bucketsh.oldbuckets
maxOverflowh.bucketsh.oldbuckets
maxOverflow
func maxOverflow(t *maptype, h *hmap) int {
numBuckets := uintptr(1 << h.B)
max := 0
for i := uintptr(0); i < numBuckets; i++ {
over := 0
b := (*bmap)(add(h.buckets, i*uintptr(t.bucketsize)))
for b = b.overflow(t); b != nil; over++ {
b = b.overflow(t)
}
if over > max {
max = over
}
}
// check oldbuckets too, if it exists
if h.oldbuckets != nil {
for i := uintptr(0); i < numBuckets/2; i++ {
var over int
b := (*bmap)(add(h.oldbuckets, i*uintptr(t.bucketsize)))
if evacuated(b) {
// we already counted this bucket in the first loop
continue
}
for b = b.overflow(t); b != nil; over++ {
b = b.overflow(t)
}
if over > max {
max = over
}
}
}
return max
}
复制代码
randMapKey
func randMapKey(m interface{}) interface{} {
// get map internals
t, h := mapTypeAndValue(m)
numBuckets := uintptr(1 << h.B)
numOver := maxOverflow(t, h) + 1 // add 1 to account for "base" bucket
// loop until we hit a valid cell
loop:
for {
// pick a random index
bucketIndex := uintptr(rand.Intn(int(numBuckets)))
overIndex := rand.Intn(numOver)
cellIndex := rand.Intn(bucketCnt)
// seek to index in h.buckets
b := (*bmap)(add(h.buckets, bucketIndex*uintptr(t.bucketsize)))
// if the oldbucket hasn't been evacuated, then we need to use that
// pointer instead.
usingOldBucket := false
if h.oldbuckets != nil {
numOldBuckets := numBuckets / 2
oldBucketIndex := bucketIndex & (numOldBuckets - 1)
oldB := (*bmap)(add(h.oldbuckets, oldBucketIndex*uintptr(t.bucketsize)))
if !evacuated(oldB) {
b = oldB
usingOldBucket = true
}
}
// seek to index in overflow chain
for i := 0; i < overIndex; i++ {
b = b.overflow(t)
if b == nil {
// invalid bucket; try again
continue loop
}
}
// lookup cell
if b.tophash[cellIndex] == 0 {
// cell is empty; try again
continue loop
}
// grab key and dereference if necessary (same as cellKey)
k := add(unsafe.Pointer(b), dataOffset+uintptr(cellIndex)*uintptr(t.keysize))
if t.indirectkey {
k = *(*unsafe.Pointer)(k)
}
// if this is an old bucket, we need to check whether this key is destined
// for the new bucket. Otherwise, we will have a 2x bias towards oldbucket
// values, since two different bucket selections can result in the same
// oldbucket.
if usingOldBucket {
hash := t.key.alg.hash(k, uintptr(h.hash0))
if hash&(numBuckets-1) != bucketIndex {
// this key is destined for a different bucket
continue loop
}
}
// pack key into interface{} (same as cellKey)
ei := emptyInterface{
typ: unsafe.Pointer(t.key),
val: k,
}
return *(*interface{})(unsafe.Pointer(&ei))
}
}
复制代码
感觉并不坏,全部情况都考虑到了!如果你没看到最初的代码地址,那么看可以看这个完整的代码。