Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simple prehash interning + some convenience functionality #1

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
cb56dbc
Make buildable without ulib
markflorisson Mar 6, 2013
77f6611
Add string interning mechanism
markflorisson Mar 6, 2013
2c742df
Add python/cython-level way to build function hashtable
markflorisson Mar 7, 2013
71f542d
Add siphash
markflorisson Mar 8, 2013
9d41ebe
Use siphash key interning scheme
markflorisson Mar 11, 2013
3b9dff9
Create methodtable module
markflorisson Mar 11, 2013
4edb5f4
Deallocate vtable when going out of scope
markflorisson Mar 11, 2013
fdbe2a5
Make extension module sources and names configurable
markflorisson Mar 11, 2013
94b1ff4
Allow building from different working directory
markflorisson Mar 11, 2013
e6524a0
Add packages to setup.py
markflorisson Mar 11, 2013
d48b7f4
Fix typo in capsule_getpointer
markflorisson Mar 11, 2013
29f7b0f
Use separate hasher and add table generation method to method table
markflorisson Mar 14, 2013
5f0ffba
Use prehash as id
markflorisson Mar 14, 2013
836be99
Make sure the extensions build on Python 3
markflorisson Mar 21, 2013
7bc2663
More Python 3 fixes.
Mar 26, 2013
9e83a79
Add string method to perfect hashing vtable
markflorisson Apr 3, 2013
31ca4a0
Some py3 compatibility
markflorisson Apr 3, 2013
c7078fd
Some more str -> bytes conversion for py3
markflorisson Apr 3, 2013
e4d9c4f
Add some error checking to see whether we succeeded building hash table
markflorisson Apr 8, 2013
8d0c824
Add utility to print secret table keys
markflorisson Apr 8, 2013
5851fd1
Add more thorough test to build hash-based function table
markflorisson Apr 9, 2013
4cb4abe
Add and use pstdint.h
markflorisson Apr 9, 2013
f967cea
Some C89 compatibility
markflorisson Apr 9, 2013
c41a368
Make sure we can draw hashes when sizeof(long) == 4
markflorisson Apr 9, 2013
d09f8b9
Add test for pstdint.h
markflorisson Apr 9, 2013
c14ea77
Fix type cast of entry id in hash table
markflorisson Apr 9, 2013
e0f55af
Print table in hashing error message
markflorisson Apr 12, 2013
17d2402
Add some error messages (TODO: use errnos or error return codes)
markflorisson Apr 12, 2013
316307c
Allow for larger hash tables
markflorisson Apr 12, 2013
4aa139d
Update bucketsort test
markflorisson Apr 12, 2013
9c04c4c
Add more thorough intern test
markflorisson Apr 12, 2013
cd2c8de
Add better test for method table
markflorisson Apr 14, 2013
c1ec5db
Disable global intern exception test
markflorisson Apr 15, 2013
63a9b0b
Verify ids in interning test
markflorisson Apr 15, 2013
18b192b
Guard some module-level test calls
markflorisson Apr 15, 2013
0c17bbc
Temporarily switch to deterministic interning keys
markflorisson Apr 15, 2013
cd3a4f5
Make sure displacements xor inbounds
markflorisson Apr 15, 2013
7e3d0cc
Merge branch 'devel'
markflorisson Apr 15, 2013
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions demo/customslots.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ cdef extern from "customslots.h":
void *pointer
Py_ssize_t objoffset
uintptr_t flags

ctypedef struct PyCustomSlot:
uintrptr_t id
uintptr_t id
pyx_data data

int PyCustomSlots_Check(obj)
Expand Down
27 changes: 27 additions & 0 deletions extensibletype/extensibletype.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Fixed-width unsigned integer types, declared against pstdint.h so that the
# other declarations in this file can use them.
cdef extern from "pstdint.h":
    ctypedef unsigned int uint32_t
    ctypedef unsigned long long uint64_t
    ctypedef unsigned short uint16_t
    ctypedef unsigned char uint8_t
    # NOTE(review): uintptr_t is declared here as an alias of the 64-bit
    # type; presumably the header's own definition is what the C compiler
    # uses -- TODO confirm this is what is wanted on 32-bit targets.
    ctypedef uint64_t uintptr_t

# Structures and entry point of the perfect-hashing table, as declared in
# perfecthash.h.
cdef extern from "perfecthash.h":
    # One table slot: a 64-bit id plus an untyped pointer.
    ctypedef struct PyCustomSlots_Entry:
        uint64_t id
        void *ptr

    # Table header.  `entries` points at the slot array; the remaining
    # fields are the hash parameters.
    # NOTE(review): presumably n is the number of entries, b the number of
    # displacement bins, r a shift and m_f / m_g bit masks -- verify against
    # perfecthash.h.
    ctypedef struct PyCustomSlots_Table:
        uint64_t flags
        uint64_t m_f, m_g
        PyCustomSlots_Entry *entries
        uint16_t n, b
        uint8_t r

    # A table header followed by inline storage: 64 uint16_t values in `d`
    # and 64 entries in `entries_mem`.
    ctypedef struct PyCustomSlots_Table_64_64:
        PyCustomSlots_Table base
        uint16_t d[64]
        PyCustomSlots_Entry entries_mem[64]


    # Takes a table and an array of 64-bit hashes and returns an int status.
    # NOTE(review): callers in this project treat a negative return value as
    # failure -- confirm against perfecthash.h.
    int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes)

105 changes: 46 additions & 59 deletions extensibletype/extensibletype.pyx
Original file line number Diff line number Diff line change
@@ -1,42 +1,30 @@
cimport numpy as cnp
import numpy as np

cdef extern from "stdint.h":
ctypedef unsigned int uint32_t
ctypedef unsigned long long uint64_t
ctypedef unsigned short uint16_t
ctypedef unsigned char uint8_t
ctypedef uint64_t uintptr_t
import hashlib

cdef extern from "perfecthash.h":
ctypedef struct PyCustomSlots_Entry:
char *id
uintptr_t flags
void *ptr

ctypedef struct PyCustomSlots_Table:
uint64_t flags
uint64_t m_f, m_g
PyCustomSlots_Entry *entries
uint16_t n, b
uint8_t r

ctypedef struct PyCustomSlots_Table_64_64:
PyCustomSlots_Table base
uint16_t d[64]
PyCustomSlots_Entry entries_mem[64]


int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes)
void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes,
uint8_t *number_of_bins_by_size)
uint16_t *number_of_bins_by_size)

def bucket_argsort(cnp.ndarray[uint16_t, mode='c'] p,
cnp.ndarray[uint8_t, mode='c'] binsizes,
cnp.ndarray[uint8_t, mode='c'] number_of_bins_by_size):
cnp.ndarray[uint16_t, mode='c'] number_of_bins_by_size):
_PyCustomSlots_bucket_argsort(&p[0], &binsizes[0],
&number_of_bins_by_size[0])

def get_random_hashes(rng, nitems):
    """
    Draw `nitems` random integers from `rng` and return them as uint64.

    The draws come from ``rng.randint(-2**31, 2**31-1, size=nitems)`` and
    are then converted with ``astype(np.uint64)``, so negative draws end up
    as large unsigned values.
    """
    lower_bound = -2**31
    upper_bound = 2**31-1
    signed_draws = rng.randint(lower_bound, upper_bound, size=nitems)
    return signed_draws.astype(np.uint64)

def draw_hashes(rng, nitems):
    """
    Draw `nitems` random 64-bit hashes and return them as a uint64 array.

    Each hash is assembled from two calls to get_random_hashes(): the first
    draw is shifted into the upper 32 bits, the second supplies the lower
    32 bits.
    """
    assert sizeof(long) >= 4

    hashes = get_random_hashes(rng, nitems)
    hashes <<= 32
    # get_random_hashes() converts signed draws to uint64, so every negative
    # draw has all of its upper 32 bits set.  OR-ing that in unmasked would
    # overwrite the upper half computed above with ones for about half of
    # the hashes; keep only the low 32 bits of the second draw.
    hashes |= get_random_hashes(rng, nitems) & np.uint64(0xFFFFFFFF)

    return hashes

def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1):
"""Used for testing. Takes the hashes as input, and returns
a permutation array and hash parameters:
Expand All @@ -49,8 +37,7 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1):
table.base.b = 64
table.base.entries = &table.entries_mem[0]
for i in range(64):
table.entries_mem[i].id = NULL
table.entries_mem[i].flags = i
table.entries_mem[i].id = hashes[i]
table.entries_mem[i].ptr = NULL

cdef int r
Expand All @@ -61,38 +48,38 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1):
p = np.zeros(64, dtype=np.uint16)

for i in range(64):
p[i] = table.entries_mem[i].flags
p[i] = table.entries_mem[i].id & 0xFF
d[i] = table.d[i]

return p, table.base.r, table.base.m_f, table.base.m_g, d

cdef extern from "md5sum.h":
ctypedef struct MD5_CTX:
uint32_t i[2]
uint32_t buf[4]
unsigned char in_ "in"[64]
unsigned char digest[16]

void MD5Init(MD5_CTX *mdContext)
void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf,
unsigned int inLen)
void MD5Final(MD5_CTX *mdContext)
cdef extern from "hash.h":
uint64_t hash_crapwow64(unsigned char *buf, uint64_t len, uint64_t seed)

def crapwowbench(int repeat=1):
cdef int r
cdef MD5_CTX ctx
for r in range(repeat):
hash_crapwow64("asdf", 4, 0xf123456781234567)


def md5bench(int repeat=1):
cdef int r
cdef MD5_CTX ctx
for r in range(repeat):
MD5Init(&ctx)
MD5Update(&ctx, "asdf", 4)
MD5Final(&ctx)
#cdef extern from "md5sum.h":
# ctypedef struct MD5_CTX:
# uint32_t i[2]
# uint32_t buf[4]
# unsigned char in_ "in"[64]
# unsigned char digest[16]
#
# void MD5Init(MD5_CTX *mdContext)
# void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf,
# unsigned int inLen)
# void MD5Final(MD5_CTX *mdContext)
#
#cdef extern from "hash.h":
# uint64_t hash_crapwow64(unsigned char *buf, uint64_t len, uint64_t seed)
#
#def crapwowbench(int repeat=1):
# cdef int r
# cdef MD5_CTX ctx
# for r in range(repeat):
# hash_crapwow64("asdf", 4, 0xf123456781234567)
#
#
#def md5bench(int repeat=1):
# cdef int r
# cdef MD5_CTX ctx
# for r in range(repeat):
# MD5Init(&ctx)
# MD5Update(&ctx, "asdf", 4)
# MD5Final(&ctx)

17 changes: 17 additions & 0 deletions extensibletype/intern.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from extensibletype cimport *

# NOTE(review): declared as `unsigned int` here; presumably the C compiler
# uses the definition from Python.h instead -- TODO confirm.
cdef extern from "Python.h":
    ctypedef unsigned int Py_uintptr_t

# `string_t` is a `char *` on the Cython side; the quoted C name makes the
# generated C code spell it `const char *`.
cdef extern from *:
    ctypedef char *string_t "const char *"

# Interning API declared in globalinterning.h.
cdef extern from "globalinterning.h":
    ctypedef void *intern_table_t

    # Functions operating on an explicitly passed table.  `except NULL`
    # makes a NULL return from intern_create_table raise in the caller.
    intern_table_t *intern_create_table(intern_table_t *table) except NULL
    void intern_destroy_table(intern_table_t *table)
    # `except? 0`: a return value of 0 makes the caller check whether a
    # Python exception is pending, so 0 may also be a legitimate result.
    uint64_t intern_key(intern_table_t *table, string_t key) except? 0

    # Functions that take no table argument.
    # NOTE(review): presumably these operate on a process-wide table --
    # verify against globalinterning.h.
    int PyIntern_Initialize() except -1
    uint64_t PyIntern_AddKey(string_t key) except? 0
20 changes: 20 additions & 0 deletions extensibletype/intern.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
def global_intern(bytes key):
    """
    Pass `key` to PyIntern_AddKey() and return the 64-bit integer it yields.

    An exception signalled by PyIntern_AddKey() propagates to the caller.
    """
    cdef uint64_t key_id = PyIntern_AddKey(key)
    return key_id

def global_intern_initialize():
    """
    Call PyIntern_Initialize(); an error it signals is raised as an exception.
    """
    PyIntern_Initialize()

cdef class InternTable(object):
    """
    Wrap intern tables (intern_table_t).

    The native table is created when the object is allocated and destroyed
    when the object is deallocated.
    """

    # Storage handed to intern_create_table(), and the pointer it returned.
    cdef intern_table_t _table
    cdef intern_table_t *table

    def __cinit__(self):
        # Use __cinit__ rather than __init__: it runs exactly once per
        # object, even when a subclass overrides __init__ without chaining
        # up, and it cannot be re-invoked to create a second table over the
        # first.  C attributes are zero-initialised before it runs, so
        # `self.table` is NULL if creation raises.
        self.table = intern_create_table(&self._table)

    def __dealloc__(self):
        # __dealloc__ also runs when __cinit__ raised, in which case there
        # is no table to destroy.
        if self.table != NULL:
            intern_destroy_table(self.table)
            self.table = NULL

    def intern(self, bytes key):
        """
        Pass `key` to intern_key() for this table and return the 64-bit
        integer it yields.
        """
        return intern_key(self.table, key)
183 changes: 183 additions & 0 deletions extensibletype/methodtable.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
from libc cimport stdlib
cimport numpy as cnp
import numpy as np

from extensibletype cimport *
from . import extensibletype

import intern

def roundup(x):
    """
    Round `x` up to a power of two.

    A value that already is a power of two is returned unchanged, and any
    value less than or equal to zero yields 0.  Unlike a fixed sequence of
    shifts, this works for integers of any width.
    """
    x -= 1
    if x < 0:
        return 0

    # Smear the highest set bit into every lower position, doubling the
    # shift each round until it exceeds the width of the value.
    shift = 1
    while x >> shift:
        x |= x >> shift
        shift <<= 1

    return x + 1

class HashingError(Exception):
    """
    Raised when we can't create a perfect hash-based function table.

    PerfectHashMethodTable.generate_table() raises it when
    PyCustomSlots_PerfectHash() returns a negative value.
    """

# Allocate a zero-filled PyCustomSlots_Table in one calloc() block, laid out
# as: table header, then `nbins` uint16_t values, then `size` entries.
# `size` is first rounded up to a power of two and `nbins` is four times
# that.  Raises MemoryError when the allocation fails.
cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL:
    cdef PyCustomSlots_Table *table
    cdef uint16_t nbins
    cdef size_t header_bytes, displacement_bytes, entry_bytes

    size = roundup(size)
    # nbins is a uint16_t, so four times the size has to fit in 16 bits.
    assert size * 4 <= 0xFFFF, hex(size)
    nbins = size * 4

    header_bytes = sizeof(PyCustomSlots_Table)
    displacement_bytes = sizeof(uint16_t) * nbins
    entry_bytes = sizeof(PyCustomSlots_Entry) * size

    table = <PyCustomSlots_Table *> stdlib.calloc(
        1, header_bytes + displacement_bytes + entry_bytes)
    if table == NULL:
        raise MemoryError

    table.flags = 0
    table.n = size
    table.b = nbins

    assert table.b >= table.n, (table.b, table.n, nbins)

    # The entries start right after the header and the uint16_t array.
    table.entries = <PyCustomSlots_Entry *> (
        (<char *> table) + header_bytes + displacement_bytes)

    return table

def make_bytes(s):
    """
    Return `s` ASCII-encoded when it is a `str`; anything else is returned
    unchanged.
    """
    if not isinstance(s, str):
        return s

    # Python 3
    return s.encode("ascii")

cdef class Hasher(object):
    """
    Generate globally unique hashes for signature strings.
    """

    def hash_signature(self, signature):
        """
        Return the 64-bit id obtained by interning `signature` through
        intern.global_intern().  `signature` is passed through make_bytes()
        first.
        """
        # An md5-digest based hash value was tried here earlier; the id
        # returned by the interning module is used instead.
        cdef uint64_t hashvalue = intern.global_intern(make_bytes(signature))
        return hashvalue


cdef class PerfectHashMethodTable(object):
    """
    Simple wrapper for hash-based virtual method tables.

    Call generate_table() to build the table; afterwards find_method() looks
    up a signature, and `table_ptr` / `size` expose the address and the
    number of slots of the native table.
    """

    cdef PyCustomSlots_Table *table
    # Points just past the table header, inside the same allocation.
    cdef uint16_t *displacements
    cdef Hasher hasher

    cdef object id_to_signature, signatures

    def __init__(self, hasher):
        self.hasher = hasher
        # For debugging
        self.id_to_signature = {}

    def generate_table(self, n, ids, flags, funcs, method_names=None):
        """
        Allocate the table and perfect-hash the given methods into it.

        n            -- requested number of slots (allocate_hash_table()
                        rounds it up to a power of two)
        ids          -- signature strings, hashed through the Hasher
        flags        -- one value per signature; only its length is used
        funcs        -- one integer per signature, stored as the slot pointer
        method_names -- unused

        Raises ValueError when more signatures than `n` are given and
        HashingError when no perfect hash could be built.
        """
        cdef Py_ssize_t i
        cdef Py_ssize_t nitems
        cdef cnp.ndarray[uint64_t] hashes

        # Validate before allocating anything.
        assert len(ids) == len(flags) == len(funcs)
        nitems = len(ids)
        if nitems > n:
            # Filling more than `n` slots would either have its hashes
            # overwritten by the filler below or run past the entries array.
            raise ValueError(
                "Got %d signatures for a table of %d slots" % (nitems, n))

        # NOTE(review): a table left over from an earlier call is not
        # released here, matching the disabled free() in __dealloc__.
        self.table = allocate_hash_table(n)
        self.displacements = <uint16_t *> (<char *> self.table +
                                           sizeof(PyCustomSlots_Table))

        hashes = np.zeros(self.table.n, dtype=np.uint64)

        intern.global_intern_initialize()

        # Initialize hash table entries, build hash ids
        for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)):
            entry_id = self.hasher.hash_signature(signature)

            self.table.entries[i].id = entry_id
            self.table.entries[i].ptr = <void *> <uintptr_t> func

            hashes[i] = entry_id
            self.id_to_signature[entry_id] = signature

        # Every slot that holds no method gets a random filler hash.  Start
        # at `nitems` rather than `n` so that no slot is left with a zero
        # hash when fewer than `n` signatures are given.
        hashes[nitems:self.table.n] = extensibletype.draw_hashes(
            np.random, self.table.n - nitems)
        assert len(np.unique(hashes)) == len(hashes)

        assert self.table.b >= self.table.n, (self.table.b, self.table.n)

        # Perfect hash our table
        if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0:
            # TODO: sensible error messages
            raise HashingError(
                "Unable to create perfect hash table for table: %s" % self)

        # Check that every signature can be found again.
        for i, signature in enumerate(ids):
            assert self.find_method(signature) is not None, (i, signature)

        # For debugging
        self.signatures = ids

    def find_method(self, signature):
        """
        Find method of the given signature. Use from non-performance
        critical code.

        Returns None when the signature is not in the table (or no table
        has been generated yet), otherwise a tuple of the stored pointer as
        an integer and the low 8 bits of the entry id.
        """
        cdef uint64_t prehash
        cdef uint64_t idx

        if self.table == NULL:
            # generate_table() has not run, so there is nothing to search.
            return None

        prehash = intern.global_intern(make_bytes(signature))

        assert 0 <= self.displacements[prehash & self.table.m_g] < self.table.b
        idx = (((prehash >> self.table.r) & self.table.m_f) ^
               self.displacements[prehash & self.table.m_g])

        assert 0 <= idx < self.size, (idx, self.size)

        if self.table.entries[idx].id != prehash:
            return None

        return (<uintptr_t> self.table.entries[idx].ptr,
                self.table.entries[idx].id & 0xFF)

    def __str__(self):
        buf = ["PerfectHashMethodTable("]
        if self.table != NULL:
            for i in range(self.table.n):
                entry_id = self.table.entries[i].id
                ptr = <uintptr_t> self.table.entries[i].ptr
                sig = self.id_to_signature.get(entry_id, "<empty>")
                s = "    id: 0x%-16x  funcptr: %20d  signature: %s" % (
                    entry_id, ptr, sig)
                buf.append(s)

        buf.append(")")

        return "\n".join(buf)

    def __dealloc__(self):
        # NOTE(review): releasing the table is disabled, so the allocation
        # made by generate_table() is never freed.  Confirm whether holders
        # of `table_ptr` may outlive this object before re-enabling.
        # stdlib.free(self.table)
        # self.table = NULL
        pass

    property table_ptr:
        # Address of the native table as an integer (0 before
        # generate_table() has run).
        def __get__(self):
            return <uintptr_t> self.table

    property size:
        # Number of slots in the table (0 before generate_table() has run).
        def __get__(self):
            if self.table == NULL:
                return 0
            return self.table.n

Loading