-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[[source]] | ||
url = "https://pypi.org/simple" | ||
verify_ssl = true | ||
name = "pypi" | ||
|
||
[packages] | ||
graphviz = "*" | ||
|
||
[dev-packages] | ||
|
||
[requires] | ||
python_version = "3.9" | ||
python_full_version = "3.9.6" |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
""" | ||
B+Tree implementation in Python, held entirely in memory
""" | ||
|
||
import bisect | ||
from math import ceil | ||
from typing import TypeVar, Optional, List | ||
|
||
import graphviz | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
class TreeNode:
    """A single node of a B+Tree.

    A node stores a sorted list of keys.  Internal (non-leaf) nodes also
    store child nodes; leaf nodes are additionally chained to their left
    and right neighbours through ``previous`` / ``next``.
    """

    def __init__(self, is_leaf=False, order=3):
        """Create an empty node.

        Args:
            is_leaf: whether this node is a leaf.
            order: maximum number of children of a node; a node therefore
                holds at most ``order - 1`` keys (see ``max_num_of_keys``).
        """
        self.order: int = order
        self.is_leaf: bool = is_leaf
        self.keys: List[T] = []  # kept sorted by insert()
        self.children: List["TreeNode"] = []
        self.parent: Optional["TreeNode"] = None

        # Neighbour links; split() maintains them for leaf nodes only.
        self.next: Optional["TreeNode"] = None
        self.previous: Optional["TreeNode"] = None

    def __str__(self):
        """Return the string form of the node's key list."""
        return str(list(self.keys))

    @property
    def max_num_of_keys(self):
        """Largest number of keys the node may hold before it must split."""
        return self.order - 1

    @property
    def max_num_of_child(self):
        """Largest number of children the node may hold."""
        return self.order

    @property
    def min_num_of_key(self):
        """Smallest number of keys, computed as ``ceil(order / 2) - 1``."""
        return ceil(self.order / 2) - 1

    def insert(self, key):
        """Insert ``key`` into this node, keeping ``keys`` sorted."""
        bisect.insort(self.keys, key)

    def split(self):
        """Split this node in two around its middle key.

        The upper half of the keys (and, for internal nodes, of the
        children) moves to a newly created right sibling.

        * Leaf: the middle key stays in this node as its largest key, and
          the sibling is spliced into the leaf chain directly after this
          node.
        * Internal node: the middle key is removed from this node, so both
          halves keep exactly one more child than keys.

        The sibling's ``parent`` is not set here and the sibling is not
        attached to any parent; the caller has to do both.

        Returns:
            A ``(mid_key, sibling_node)`` tuple: the separator key to be
            inserted into the parent, and the new right sibling.
        """
        mid_idx = len(self.keys) // 2
        mid_key = self.keys[mid_idx]

        sibling_node = TreeNode(is_leaf=self.is_leaf, order=self.order)
        sibling_node.keys = self.keys[mid_idx + 1:]

        if self.is_leaf:
            # Leaves keep a copy of the separator so the key stays findable.
            self.keys = self.keys[: mid_idx + 1]

            # Splice the sibling between this leaf and its old successor;
            # assigning self.next alone would drop the rest of the chain.
            old_next = self.next
            sibling_node.next = old_next
            if old_next is not None:
                old_next.previous = sibling_node
            sibling_node.previous = self
            self.next = sibling_node
        else:
            # The separator moves up to the parent only; keeping it here
            # would leave this node with as many keys as children.
            self.keys = self.keys[:mid_idx]

            # Hand the upper half of the children over to the sibling.
            sibling_node.children = self.children[mid_idx + 1:]
            self.children = self.children[: mid_idx + 1]
            for child in sibling_node.children:
                child.parent = sibling_node

        return mid_key, sibling_node
|
||
|
||
class BPlusTree:
    """In-memory B+Tree built from ``TreeNode`` objects.

    Keys are stored in the leaves; internal nodes only hold separator keys
    that route a lookup to the correct child.
    """

    def __init__(self, order=4):
        """Create an empty tree whose nodes have at most ``order`` children."""
        self.root = TreeNode(is_leaf=True, order=order)
        self.order = order

    def _find_leaf_node(self, node, key) -> "TreeNode":
        """Descend from ``node`` to the leaf that should contain ``key``.

        At each internal node the child chosen is the one at the index of
        the first separator that is >= ``key``; when every separator is
        smaller than ``key`` the last child is taken.
        """
        while not node.is_leaf:
            # Separators are sorted, so binary search finds the first
            # separator with key <= separator.
            idx = bisect.bisect_left(node.keys, key)
            if idx < len(node.keys):
                node = node.children[idx]
            else:
                node = node.children[-1]
        return node

    def _split_and_promote(self, node: "TreeNode"):
        """Split ``node`` if it is over capacity, repeating up the tree.

        Each split pushes a separator key into the parent, which may
        overflow in turn.  When the root itself splits, a new root is
        created above it.
        """
        while len(node.keys) > node.max_num_of_keys:
            mid_key, sibling = node.split()
            parent = node.parent

            if parent is None:
                # The root was split: grow the tree by one level.
                new_root = TreeNode(is_leaf=False, order=self.order)
                new_root.keys = [mid_key]
                new_root.children = [node, sibling]
                node.parent = new_root
                sibling.parent = new_root
                self.root = new_root
                return

            sibling.parent = parent
            # Place the separator and the sibling relative to the node that
            # was split rather than searching the keys again: the separator
            # belongs at the node's own child index and the sibling goes
            # immediately to its right, even if an equal key already exists.
            pos = parent.children.index(node)
            parent.keys.insert(pos, mid_key)
            parent.children.insert(pos + 1, sibling)

            node = parent

    def insert(self, key):
        """Insert ``key`` into the tree, splitting nodes as required."""
        leaf = self._find_leaf_node(self.root, key)
        leaf.insert(key)
        self._split_and_promote(leaf)

    def find(self, key):
        """Return ``key`` if it is stored in the tree, otherwise ``None``."""
        leaf = self._find_leaf_node(self.root, key)
        idx = bisect.bisect_left(leaf.keys, key)
        if idx < len(leaf.keys) and leaf.keys[idx] == key:
            return key
        return None

    def graph(self):
        """Render the tree with graphviz to a file named ``graph`` and open it.

        Nodes are visited breadth-first starting at the root.  Each graph
        node is labelled with the tree node's key list.
        """
        from collections import deque

        dot = graphviz.Digraph()
        dot.attr("node", shape="square")

        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            # Identify nodes by object identity, not by their key list:
            # two different nodes holding equal keys must stay separate.
            node_id = str(id(node))
            dot.node(node_id, str(node.keys))

            if node.parent is not None:
                dot.edge(str(id(node.parent)), node_id)

            pending.extend(node.children)

        dot.render("graph", view=True)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import random | ||
from unittest import TestCase | ||
from uuid import uuid4 | ||
|
||
from python.b_tree_in_memory import BPlusTree | ||
|
||
|
||
class BTreeTest(TestCase):
    """Insert / lookup tests for ``BPlusTree``."""

    def test_small_inputs(self):
        """Insert 0..99 in ascending order and look every key up again.

        The tree is rendered with ``graph()`` on failure (and once at the
        end) as a debugging aid.
        """
        test_num_of_keys = 100
        btree = BPlusTree(order=5)
        inputs = list(range(test_num_of_keys))

        for i in inputs:
            try:
                btree.insert(i)
            except Exception:
                btree.graph()
                self.fail(f"cannot insert key {i}")

        for i in inputs:
            try:
                key = btree.find(key=i)
            except Exception:
                self.fail(f"cannot find key {i}")
            if key is None:
                btree.graph()
                self.fail(f"cannot find key {i}")

        btree.graph()

    def test_insert_and_retrieve(self):
        """Insert shuffled int and string keys for a range of tree orders."""
        test_num_of_keys = 1000
        test_orders = 100
        test_array = [
            list(range(1, test_num_of_keys + 1)),
            [str(uuid4()) for _ in range(test_num_of_keys)],
        ]
        for inputs in test_array:
            for order in range(3, test_orders):
                with self.subTest(
                    msg=f"test orders {order} with input type {type(inputs[0])}"
                ):
                    btree = BPlusTree(order=order)

                    # Shuffle a copy that is actually used for insertion;
                    # shuffling a temporary list would leave the insertion
                    # order untouched.
                    shuffled = list(inputs)
                    random.shuffle(shuffled)

                    for i in shuffled:
                        btree.insert(i)
                    for i in inputs:
                        key = btree.find(key=i)
                        if key is None:
                            self.fail(f"cannot find key {i}")

    def test_delete_without_underflow(self):
        """Delete one key from a small tree, rendering it before and after."""
        btree = BPlusTree(order=4)
        for i in range(8):
            btree.insert(i)

        btree.graph()

        # NOTE(review): this relies on BPlusTree.delete(); confirm that the
        # implementation under test provides it.
        btree.delete(2)
        btree.graph()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,14 @@ | ||
# go-caskdb | ||
Create your own DB from scratch | ||
|
||
[![codecov](https://codecov.io/gh/luqmansen/go-caskdb/branch/master/graph/badge.svg)](https://codecov.io/gh/luqmansen/go-caskdb) | ||
[![Actions Status](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml/badge.svg)](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml) | ||
- Storage Engine | ||
- B+Tree | ||
- SSTable | ||
- Page Cache | ||
- Execution Engine | ||
- SQL Parser | ||
- SQL Optimizer | ||
- Server | ||
|
||
[Riak's Bitcask paper](https://riak.com/assets/bitcask-intro.pdf) implementation in Golang | ||
Distributed System | ||
|
||
## Todo | ||
|
||
- [ ] Implement key deletion | ||
- [ ] Implement CRC | ||
- [ ] Implement Max file size | ||
- [ ] Implement Log Merging | ||
- [ ] Implement merge trigger | ||
- [ ] Fragmentation | ||
- [ ] Dead bytes | ||
- [ ] Implement merge interval | ||
- [ ] Add support for ranged query | ||
|
||
## Benchmark | ||
|
||
| Ops | Result | | ||
|---------------------------------|-------------------------------------------------------------| | ||
| Unbuffered Write | `BenchmarkDiskStorage_Set-8 651841 1737 ns/op` |
| Buffered Write | `BenchmarkDiskStorage_Set-8 2569089 501.8 ns/op` | | ||
| Buffered Write + Sync after set | `BenchmarkDiskStorage_Set-8 7879 313756 ns/op` |
|
||
## Credits | ||
|
||
This repo is inspired by [py-caskdb](https://github.com/avinassh/py-caskdb/) | ||
- Raft |