-
Notifications
You must be signed in to change notification settings - Fork 43
/
bloomfilter.py
298 lines (247 loc) · 11.7 KB
/
bloomfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# disable E0611, No name %r in module %r. pylint is unable to correctly assess the content of hashlib
# pylint: disable=E0611
# disable E1103, %s %r has no %r member (but some types could not be inferred). Because pylint can not correctly assess
# the content of hashlib is can not assess its members either.
# pylint: disable=E1103
"""
This module provides the bloom filter support.
The Bloom filter, conceived by Burton Howard Bloom in 1970, is a space-efficient probabilistic data
structure that is used to test whether an element is a member of a set. False positives are
possible, but false negatives are not. Elements can be added to the set, but not removed (though
this can be addressed with a counting filter). The more elements that are added to the set, the
larger the probability of false positives.
Initial Bloomfilter implementation based on pybloom by Jay Baird <[email protected]> and Bob
Ippolito <[email protected]>. Simplified, and optimized to use just python code.
@author: Boudewijn Schoon
@organization: Technical University Delft
@contact: [email protected]
"""
from hashlib import sha1, sha256, sha384, sha512, md5
from math import ceil, log
from struct import Struct
from binascii import hexlify, unhexlify
import logging
logger = logging.getLogger(__name__)
class BloomFilter(object):
"""
A Bloom filter, conceived by Burton Howard Bloom in 1970 is a space-efficient probabilistic data structure that is
used to test whether an element is a member of a set. False positive matches are possible, but false negatives are
not; i.e. a query returns either "inside set (may be wrong)" or "definitely not in set". Elements can be added to
the set, but not removed. The more elements that are added to the set, the larger the probability of false
positives.
The BloomFilter constructor takes parameters that are interpreted differently, depending on their type. The
following type combination, and their interpretations, are possible:
- BloomFilter(int:m_size, float:f_error_rate, str:prefix="")
Will create a BloomFilter instance that is m_size bits large with approximately f_error_rate chance for false
positives. Typically this is used to create a bloom filter where the size it can occupy is limited or fixed.
Note that m_size must be a multiple of 8.
- BloomFilter(float:f_error_rate, int:n_capacity, str:prefix="")
Will create a BloomFilter instance with approximately f_error_rate chance for false positives when n_capacity keys
are added. The m_size, i.e. bits required for storage, is approximated from the f_error_rate and n_capacity.
- BloomFilter(str:bytes, int:k_functions, str:prefix="")
Will create a BloomFilter instance from a binary string and a number of functions. Typically this is used to
retrieve a bloom filter that was serialised. For example:
original = BloomFilter(128, 0.25)
original.add_keys(str(i) for i in xrange(100))
storage = (original.bytes, original.functions, original.prefix)
# storage can be written to disk, socket, etc
clone = BloomFilter(storage[0], storage[1], storage[2])
"""
@staticmethod
def _get_k_functions(m_size, n_capacity):
return int(ceil(log(2) * m_size / n_capacity))
@staticmethod
def _get_n_capacity(m_size, f_error_rate):
return int(m_size * (log(2) ** 2 / abs(log(f_error_rate))))
@classmethod
def _overload_constructor_arguments(cls, args, kargs):
# matches: BloomFilter(str:bytes, int:k_functions, str:prefix="")
if len(args) >= 2 and isinstance(args[0], str) and isinstance(args[1], int):
bytes_ = args[0]
m_size = len(bytes_) * 8
k_functions = args[1]
prefix = kargs.get("prefix", args[2] if len(args) >= 3 else "")
assert 0 < len(bytes_), len(bytes_)
logger.debug("bloom filter based on %d bytes and k_functions %d", len(bytes_), k_functions)
filter_ = long(hexlify(bytes_[::-1]), 16)
# matches: BloomFilter(int:m_size, float:f_error_rate, str:prefix="")
elif len(args) >= 2 and isinstance(args[0], int) and isinstance(args[1], float):
m_size = args[0]
f_error_rate = args[1]
prefix = kargs.get("prefix", args[2] if len(args) >= 3 else "")
assert 0 < m_size, m_size
assert m_size % 8 == 0, "size must be a multiple of eight (%d)" % m_size
assert 0.0 < f_error_rate < 1.0, f_error_rate
logger.debug("constructing bloom filter based on m_size %d bits and f_error_rate %f", m_size, f_error_rate)
k_functions = cls._get_k_functions(m_size, cls._get_n_capacity(m_size, f_error_rate))
filter_ = 0
# matches: BloomFilter(float:f_error_rate, int:n_capacity, str:prefix="")
elif len(args) >= 2 and isinstance(args[0], float) and isinstance(args[1], int):
f_error_rate = args[0]
n_capacity = args[1]
prefix = kargs.get("prefix", args[2] if len(args) >= 3 else "")
assert 0.0 < f_error_rate < 1.0, f_error_rate
assert 0 < n_capacity, n_capacity
logger.debug("constructing bloom filter based on f_error_rate %f and %d n_capacity", f_error_rate,
n_capacity)
m_size = int(ceil(abs((n_capacity * log(f_error_rate)) / (log(2) ** 2)) / 8.0) * 8)
k_functions = cls._get_k_functions(m_size, n_capacity)
filter_ = 0
else:
raise RuntimeError("Unknown combination of argument types %s" % str([type(arg) for arg in args]))
return m_size, k_functions, prefix, filter_
def __init__(self, *args, **kargs):
self._logger = logging.getLogger(self.__class__.__name__)
# get constructor arguments required to build the bloom filter
self._m_size, self._k_functions, self._prefix, self._filter = self._overload_constructor_arguments(args, kargs)
assert isinstance(self._m_size, int), type(self._m_size)
assert 0 < self._m_size, self._m_size
assert self._m_size % 8 == 0, "size must be a multiple of eight (%d)" % self._m_size
assert isinstance(self._k_functions, int), type(self._k_functions)
assert 0 < self._k_functions <= self._m_size, [self._k_functions, self._m_size]
assert isinstance(self._prefix, str), type(self._prefix)
assert 0 <= len(self._prefix) < 256, len(self._prefix)
assert isinstance(self._filter, (int, long)), type(self._filter)
# determine hash function
if self._m_size >= (1 << 31):
fmt_code, chunk_size = "Q", 8
elif self._m_size >= (1 << 15):
fmt_code, chunk_size = "L", 4
else:
fmt_code, chunk_size = "H", 2
# we need at most chunk_size * k bits from our hash function
bits_required = chunk_size * self._k_functions * 8
assert bits_required <= 512, \
"Combining multiple hashfunctions is not implemented, cannot create a hash for %d bits" % bits_required
if bits_required > 384:
hashfn = sha512
elif bits_required > 256:
hashfn = sha384
elif bits_required > 160:
hashfn = sha256
elif bits_required > 128:
hashfn = sha1
else:
hashfn = md5
self._fmt_unpack = Struct("".join((">",
fmt_code * self._k_functions,
"x" * (hashfn().digest_size - bits_required / 8)))).unpack
self._salt = hashfn(self._prefix)
def add(self, key):
"""
Add KEY to the BloomFilter.
"""
filter_ = self._filter
hash_ = self._salt.copy()
hash_.update(key)
for pos in self._fmt_unpack(hash_.digest()):
filter_ |= 1 << (pos % self._m_size)
self._filter = filter_
def add_keys(self, keys):
"""
Add a sequence of KEYS to the BloomFilter.
"""
filter_ = self._filter
salt_copy = self._salt.copy
m_size = self._m_size
fmt_unpack = self._fmt_unpack
for key in keys:
assert isinstance(key, str)
hash_ = salt_copy()
hash_.update(key)
# 04/05/12 Boudewijn: using a list instead of a generator is significantly faster.
# while generators are more memory efficient, this list will be relatively short.
# 07/05/12 Niels: using no list at all is even more efficient/faster
for pos in fmt_unpack(hash_.digest()):
filter_ |= 1 << (pos % m_size)
self._filter = filter_
def clear(self):
"""
Set all bits in the filter to zero.
"""
self._filter = 0
def __contains__(self, key):
filter_ = self._filter
m_size_ = self._m_size
hash_ = self._salt.copy()
hash_.update(key)
for pos in self._fmt_unpack(hash_.digest()):
if not filter_ & (1 << (pos % m_size_)):
return False
return True
def not_filter(self, iterator):
"""
Yields all tuples in iterator where the first element in the tuple is NOT in the bloom
filter.
"""
filter_ = self._filter
salt_copy = self._salt.copy
m_size = self._m_size
fmt_unpack = self._fmt_unpack
for tup in iterator:
assert isinstance(tup, tuple)
assert len(tup) > 0
assert isinstance(tup[0], str)
hash_ = salt_copy()
hash_.update(tup[0])
# 04/05/12 Boudewijn: using a list instead of a generator is significantly faster.
# while generators are more memory efficient, this list will be relatively short.
# 07/05/12 Niels: using no list at all is even more efficient/faster
for pos in fmt_unpack(hash_.digest()):
if not filter_ & (1 << (pos % m_size)):
yield tup
break
def get_capacity(self, f_error_rate):
"""
Returns the capacity given a certain error rate.
@rtype: int
"""
assert isinstance(f_error_rate, float)
assert 0 < f_error_rate < 1
return self._get_n_capacity(self._m_size, f_error_rate)
def get_bits_checked(self):
"""
Returns the number of bits in the bloom filter that are set.
@rtype: int
"""
# get_bits_checked does not take any parameters, hence it should be a property like size, functions, etc.
self._logger.warning("get_bits_checked function is deprecated, please use the bits_checked property")
return self.bits_checked
@property
def bits_checked(self):
"""
The number of bits in the bloom filter that are set.
@rtype: int
"""
return sum(1 if self._filter & (1 << i) else 0 for i in range(self._m_size))
@property
def size(self):
"""
The size of the bloom filter in bits (m).
@rtype: int
"""
return self._m_size
@property
def functions(self):
"""
The number of functions used for each item (k).
"""
return self._k_functions
@property
def prefix(self):
"""
The prefix.
@rtype: string
"""
return self._prefix
@property
def bytes(self):
"""
The binary representation of the bits in the bloom filter. Note that to reconstruct the bloom filter, not the
bytes as well as the number of functions are required.
@rtype: string
"""
# hex should be m_size/4, hex is 16 instead of 8 -> hence half the number of "hexes" in m_size
hex_ = '%x' % self._filter
padding = '0' * (self._m_size / 4 - len(hex_))
return unhexlify(padding + hex_)[::-1]