Skip to content

Commit cdcb659

Browse files
author
alex cai
committed
filter
1 parent a3fdee3 commit cdcb659

File tree

3 files changed

+261
-37
lines changed

3 files changed

+261
-37
lines changed

README.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@
1313
## Example
1414

1515
```go
16-
1716
```
17+
## 冲突的数量
18+
19+
### 只使用一个hash函数(fnv.New32a)
20+
21+
使用2MB的空间,即约1677万(2^24 = 16 777 216)个位的表示空间:
22+
23+
- 元素个数为1000万时,hash冲突率约为6.7%
24+
- 元素个数为700万时,hash冲突率约为1.3%
25+
- 元素个数不超过600万时,hash冲突率约为0%
26+
27+
注意:hash对象会累积已写入的数据,因此每计算一个hash值之前(或之后)都必须调用 alg.Reset()。
1828

filter.go

+124-18
Original file line numberDiff line numberDiff line change
@@ -2,51 +2,157 @@ package bitFilter
22

33
import ()
44

5+
// Bucket counts for common filter sizes. One bucket is a uint64, i.e. 8 bytes
// (2^3), so a filter occupying 2^k bytes consists of 2^(k-3) buckets.
const (
	// Size2KB is the bucket count for 2KB of storage: 8*1024*2 = 16 384 bits.
	Size2KB uint64 = 1 << 8 // 1 << (11 - 3)

	// Size2MB is the bucket count for 2MB of storage:
	// 8*1024*1024*2 = 16 777 216 bits.
	Size2MB uint64 = 1 << 18 // 1 << (21 - 3)

	// Larger capacities.
	Size8MB   uint64 = 1 << 20 // 1 << (23 - 3)
	Size32MB  uint64 = 1 << 22 // 1 << (25 - 3)
	Size128MB uint64 = 1 << 24 // 1 << (27 - 3)
	Size512MB uint64 = 1 << 26 // 1 << (29 - 3)
	Size2GB   uint64 = 1 << 28 // 1 << (31 - 3)
)
20+
521
// TBits is a bitmap filter with a side table for keys whose hash keys collide.
type TBits struct {
	// buckets is the bitmap. Hash key h is stored in bit h&63 of buckets[h>>6].
	buckets []uint64

	// collision maps a colliding key to its hash key. In theory this table
	// has to stay small for the filter to be effective.
	collision map[uint64]uint32
}
1028

1129
// bitMask holds one single-bit mask per bit position of a uint64 bucket.
// It is zero-valued here and filled in by init.
var bitMask [64]uint64
1533

16-
func New(spaceBytes int) *TBits {
34+
func init() {
35+
var i uint
36+
for i = 0; i < 64; i++ {
37+
bitMask[i] = 1 << i
38+
}
39+
}
40+
41+
func New(len uint64) *TBits {
1742
return &TBits{
18-
bytes: make([]byte, spaceBytes),
19-
bits: spaceBytes << 3,
20-
collision: make(map[uint32]bool),
43+
buckets: make([]uint64, len),
44+
collision: make(map[uint64]uint32),
2145
}
2246
}
2347

24-
func (b *TBits) AddAllHashKeys(hashKeys []uint32) {
48+
// 复制一个结构
49+
func (b *TBits) Copy() *TBits {
50+
new := &TBits{
51+
buckets: b.buckets[:],
52+
collision: make(map[uint64]uint32),
53+
}
54+
for key, hashKey := range b.collision {
55+
new.collision[key] = hashKey
56+
}
57+
58+
return new
59+
}
60+
61+
// 初始化所有hash key值
62+
func (b *TBits) InitAllHashKeys(hashKeys []uint32) {
2563
var bucket uint32
2664
for _, key := range hashKeys {
27-
bucket = key >> 3 // 确定在那个位上
65+
bucket = key >> 6 // 确定在那个位上
2866
//println(bucket, key&7)
29-
b.bytes[bucket] |= bitMask[key&7]
67+
b.buckets[bucket] |= bitMask[key&63] // 63 = 1<<6 - 1
3068
}
3169
}
3270

33-
func (b *TBits) AddCollisionKeys(keys []uint32) {
34-
for _, key := range keys {
35-
b.collision[key] = true
71+
// 初始化冲突的key值
72+
func (b *TBits) InitCollisionKeys(keys []uint64, hashKeys []uint32) {
73+
for index, key := range keys {
74+
b.collision[key] = hashKeys[index]
3675
}
3776
}
3877

3978
// Filter 过滤器,如果元素存在,则返回true,否则返回
40-
func (b *TBits) Filter(key, hashKey uint32) (isExist bool) {
79+
func (b *TBits) Filter(key uint64, hashKey uint32) (isExist bool) {
4180
if _, isExist = b.collision[key]; isExist {
4281
return isExist
4382
}
4483

45-
bucket := hashKey >> 3
46-
if b.bytes[bucket]&bitMask[hashKey&7] == 0 {
47-
//println("==")
84+
bucket := hashKey >> 6
85+
if b.buckets[bucket]&bitMask[hashKey&63] == 0 {
4886
return false
4987
}
5088

5189
return true
5290
}
91+
92+
// Add 新增key
93+
// 如果如果hashKey已经存在,但是又没有冲突的话,则返回false,这时需要先添加原来的冲突
94+
func (b *TBits) Add(key uint64, hashKey uint32) (ok bool) {
95+
var (
96+
bucket uint32 = hashKey >> 6 // 确定在那个位上
97+
keyMask = bitMask[hashKey&63] // 63 = 1<<6 - 1
98+
)
99+
if b.buckets[bucket]&keyMask == keyMask {
100+
// 判断该haskkey是否已经冲突
101+
if isExist := b.hasHashKeyInCollision(hashKey); !isExist {
102+
// 需要添加原来的key到冲突key map才可以继续
103+
return false
104+
}
105+
106+
// 原来已经冲突了,这是直接增加一个冲突即可
107+
b.collision[key] = hashKey
108+
} else {
109+
// 该key完全是新的
110+
b.buckets[bucket] |= keyMask
111+
}
112+
113+
return true
114+
}
115+
116+
// Remove 减少key
117+
func (b *TBits) Remove(key uint64, hashKey uint32) {
118+
var oldKeys []uint64
119+
if _, isExist := b.collision[key]; isExist {
120+
for k, hk := range b.collision {
121+
if hk == hashKey {
122+
oldKeys = append(oldKeys, k)
123+
}
124+
}
125+
126+
if len(oldKeys) > 2 {
127+
// 有三个或者三个以上冲突
128+
delete(b.collision, key)
129+
} else {
130+
// 只有两个冲突,则全部删除
131+
// 冲突的key,至少也会有两个
132+
for _, k := range oldKeys {
133+
delete(b.collision, k)
134+
}
135+
}
136+
} else {
137+
var (
138+
bucket uint32 = hashKey >> 6 // 确定在那个位上
139+
keyMask = bitMask[hashKey&63] // 63 = 1<<6 - 1
140+
)
141+
b.buckets[bucket] &= ^keyMask
142+
}
143+
}
144+
145+
// AddCollisionKey 增加冲突key
146+
func (b *TBits) AddCollisionKey(key uint64, hashKey uint32) {
147+
b.collision[key] = hashKey
148+
}
149+
150+
func (b *TBits) hasHashKeyInCollision(hashKey uint32) (isExist bool) {
151+
for _, hk := range b.collision {
152+
if hk == hashKey {
153+
return true
154+
}
155+
}
156+
157+
return isExist
158+
}

filter_test.go

+126-18
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,49 @@
11
package bitFilter
22

33
import (
4+
"fmt"
5+
//"hash"
6+
"encoding/binary"
7+
"hash/fnv"
48
"testing"
59
)
610

711
var bits *TBits
812
var maxMask uint32
913

10-
func init() {
11-
bits = New(1024)
12-
maxMask = 1024<<3 - 1
13-
}
14-
1514
func TestFilter(t *testing.T) {
16-
if len(bits.bytes) != 1024 {
17-
t.Fatalf("len bytes: %d != 2014", len(bits.bytes))
15+
bits = New(128)
16+
maxMask = 128<<6 - 1
17+
18+
if len(bits.buckets) != 128 {
19+
t.Fatalf("len buckets: %d != 128", len(bits.buckets))
1820
}
1921

20-
bits.AddCollisionKeys([]uint32{0, 1, 2, 3, 4, 8192, 8193, 8194, 8195, 8196})
21-
bits.AddAllHashKeys([]uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})
22+
keys := []uint64{0, 1, 2, 3, 4, 8192, 8193, 8194, 8195, 8196, 16385}
23+
collHashKeys := []uint32{0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 1}
24+
hashKeys := []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
25+
bits.InitCollisionKeys(keys, collHashKeys)
26+
bits.InitAllHashKeys(hashKeys)
2227

23-
if len(bits.collision) != 10 {
24-
t.Fatalf("collision len != 10")
28+
if len(bits.collision) != len(collHashKeys) {
29+
t.Fatalf("collision len != %d", len(collHashKeys))
2530
}
2631

27-
if bits.bytes[0] != 255 {
28-
t.Fatalf("bytes[0]:%d != 255", bits.bytes[0])
32+
if bits.buckets[0] != 1023 {
33+
t.Fatalf("buckets[0]:%d != 1023", bits.buckets[0])
2934
}
3035

31-
if bits.bytes[1] != 3 {
32-
t.Fatalf("bytes[1]:%d != 3", bits.bytes[1])
36+
if bits.buckets[1] != 0 {
37+
t.Fatalf("buckets[1]:%d != 0", bits.buckets[1])
3338
}
3439

35-
var i, hashKey uint32
36-
var exist bool
40+
var (
41+
i uint64
42+
hashKey uint32
43+
exist bool
44+
)
3745
for i = 0; i < 10000; i++ {
38-
hashKey = i & maxMask
46+
hashKey = uint32(i) & maxMask
3947
//println(i, hashKey)
4048
exist = bits.Filter(i, hashKey)
4149
if hashKey < 10 {
@@ -49,3 +57,103 @@ func TestFilter(t *testing.T) {
4957
}
5058
}
5159
}
60+
61+
func TestNew(t *testing.T) {
62+
newBits := bits.Copy()
63+
64+
if len(newBits.collision) != 11 {
65+
t.Fatalf("collision len != 11")
66+
}
67+
68+
if newBits.buckets[0] != 1023 {
69+
t.Fatalf("buckets[0]:%d != 1023", newBits.buckets[0])
70+
}
71+
72+
if newBits.buckets[1] != 0 {
73+
t.Fatalf("buckets[1]:%d != 0", newBits.buckets[1])
74+
}
75+
76+
newBits.Remove(8192, 0)
77+
78+
if len(newBits.collision) != 9 {
79+
t.Fatalf("collision len:9 != %d", len(newBits.collision))
80+
}
81+
82+
if newBits.buckets[0] != 1023 {
83+
t.Fatalf("buckets[0]:%d != 1023", newBits.buckets[0])
84+
}
85+
86+
if isExist := newBits.Filter(8192, 0); !isExist {
87+
// 8192这个值虽然remove了,但是0这个值还没remove
88+
// 所以这里还是存在的,这是误判
89+
t.Fatalf("error")
90+
}
91+
92+
newBits.Remove(0, 0)
93+
if isExist := newBits.Filter(0, 0); isExist {
94+
t.Fatalf("error")
95+
}
96+
97+
if newBits.buckets[0] != 1022 {
98+
t.Fatalf("buckets[0]:%d != 1022", newBits.buckets[0])
99+
}
100+
101+
newBits.Remove(16385, 1)
102+
if len(newBits.collision) != 8 {
103+
t.Fatalf("collision len:8 != %d", len(newBits.collision))
104+
}
105+
106+
if isExist := newBits.Filter(1, 1); !isExist {
107+
t.Fatalf("error")
108+
}
109+
110+
if newBits.buckets[0] != 1022 {
111+
t.Fatalf("buckets[0]:%d != 1022", newBits.buckets[0])
112+
}
113+
114+
}
115+
116+
func TestRand(t *testing.T) {
117+
bits = New(Size2MB)
118+
maxMask = uint32(Size2MB<<6 - 1)
119+
120+
var exist = make(map[uint32]bool)
121+
var errorCount int
122+
var projectId, posId, n1, n2 uint32
123+
var b1 = make([]byte, 4)
124+
var b2 = make([]byte, 4)
125+
alg := fnv.New32a()
126+
n1, n2 = 68000, 100
127+
for projectId = 1; projectId < n1; projectId++ {
128+
binary.BigEndian.PutUint32(b1, projectId)
129+
for posId = 1; posId < n2; posId++ {
130+
binary.BigEndian.PutUint32(b2, posId)
131+
alg.Reset()
132+
alg.Write(append(b1, b2...))
133+
hashKey := alg.Sum32()
134+
hashKey = hashKey<<6 - 1
135+
if _, ok := exist[hashKey]; ok {
136+
errorCount++
137+
} else {
138+
exist[hashKey] = true
139+
}
140+
}
141+
}
142+
143+
fmt.Printf("Error Count: %d in %d*%d\nError Rate: %f\n", errorCount, n1, n2, float32(errorCount)/(float32(n1)*float32(n2)))
144+
145+
projectId, posId = 32314, 23413
146+
binary.BigEndian.PutUint32(b1, projectId)
147+
binary.BigEndian.PutUint32(b2, posId)
148+
alg.Reset()
149+
alg.Write(append(b1, b2...))
150+
fmt.Printf("fnv.New32a: %d, %d: %d\n", projectId, posId, alg.Sum32()&maxMask)
151+
}
152+
153+
func BenchmarkFilter(b *testing.B) {
154+
b.RunParallel(func(pb *testing.PB) {
155+
for pb.Next() {
156+
_ = bits.Filter(1234, 43)
157+
}
158+
})
159+
}

0 commit comments

Comments
 (0)