diff --git a/.gitignore b/.gitignore index 6994dc0..0a728af 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ # test data output testdata/ +venv/ + # Output of the go coverage tool, specifically when used with LiteIDE *.out diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/db.iml b/.idea/db.iml new file mode 100644 index 0000000..f4a2bef --- /dev/null +++ b/.idea/db.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..5c038fe --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,33 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..4af1825 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..7aa31ea --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..b60581b --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/Makefile b/go/Makefile similarity index 100% rename from Makefile rename to go/Makefile diff --git a/datafile.go b/go/datafile.go similarity index 100% rename from datafile.go rename to go/datafile.go diff --git a/entry.go b/go/entry.go similarity index 100% rename from entry.go rename to go/entry.go diff --git a/entry_test.go b/go/entry_test.go similarity index 100% rename from entry_test.go rename to 
go/entry_test.go diff --git a/go.mod b/go/go.mod similarity index 100% rename from go.mod rename to go/go.mod diff --git a/go.sum b/go/go.sum similarity index 100% rename from go.sum rename to go/go.sum diff --git a/header.go b/go/header.go similarity index 100% rename from header.go rename to go/header.go diff --git a/header_test.go b/go/header_test.go similarity index 100% rename from header_test.go rename to go/header_test.go diff --git a/options.go b/go/options.go similarity index 100% rename from options.go rename to go/options.go diff --git a/options_test.go b/go/options_test.go similarity index 100% rename from options_test.go rename to go/options_test.go diff --git a/store_disk.go b/go/store_disk.go similarity index 100% rename from store_disk.go rename to go/store_disk.go diff --git a/store_disk_test.go b/go/store_disk_test.go similarity index 100% rename from store_disk_test.go rename to go/store_disk_test.go diff --git a/python/Pipfile b/python/Pipfile new file mode 100644 index 0000000..30c2ed5 --- /dev/null +++ b/python/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +graphviz = "*" + +[dev-packages] + +[requires] +python_version = "3.9" +python_full_version = "3.9.6" diff --git a/python/Pipfile.lock b/python/Pipfile.lock new file mode 100644 index 0000000..71bcdc7 --- /dev/null +++ b/python/Pipfile.lock @@ -0,0 +1,31 @@ +{ + "_meta": { + "hash": { + "sha256": "b4e57cf75f4f743a2c55d3179e617886271fb2bc9951b2229605ae1d382fcb68" + }, + "pipfile-spec": 6, + "requires": { + "python_full_version": "3.9.6", + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "graphviz": { + "hashes": [ + "sha256:09d6bc81e6a9fa392e7ba52135a9d49f1ed62526f96499325930e87ca1b5925d", + "sha256:81f848f2904515d8cd359cc611faba817598d2feaac4027b266aa3eda7b3dde5" + ], + "index": "pypi", + "markers": "python_version 
class TreeNode:
    """A single node of a B+Tree.

    Leaf nodes hold the stored keys and are chained together through
    ``next`` / ``previous``. Internal nodes hold separator keys plus one
    more child than they have keys; child ``i`` covers every key that is
    ``<=`` separator ``i`` (and greater than separator ``i - 1``).
    """

    def __init__(self, is_leaf=False, order=3):
        # ``order`` is the maximum number of children; a node may therefore
        # hold at most ``order - 1`` keys (see ``max_num_of_keys``).
        self.order: int = order
        self.is_leaf: bool = is_leaf
        self.keys: List[T] = []  # always kept sorted (see ``insert``)
        self.children: List["TreeNode"] = []  # empty for leaf nodes
        self.parent: Optional["TreeNode"] = None

        # Doubly linked list across the leaf level; unused on internal nodes.
        self.next: Optional["TreeNode"] = None
        self.previous: Optional["TreeNode"] = None

    def __str__(self):
        return str(list(self.keys))

    @property
    def max_num_of_keys(self):
        """Largest number of keys the node may hold without overflowing."""
        return self.order - 1

    @property
    def max_num_of_child(self):
        """Largest number of children the node may hold."""
        return self.order

    @property
    def min_num_of_key(self):
        """Smallest number of keys a node of this order should hold."""
        return ceil(self.order / 2) - 1

    def insert(self, key):
        """Insert ``key`` into this node, keeping ``keys`` sorted."""
        bisect.insort(self.keys, key)

    def split(self):
        """Split this node in two around its middle key.

        This node keeps the lower half and a newly created right sibling
        receives the upper half. The sibling's ``parent`` is NOT set here;
        attaching it to the tree is the caller's job.

        Returns:
            A ``(mid_key, sibling)`` tuple where ``mid_key`` is the
            separator the caller must insert into the parent.
        """
        mid_idx = len(self.keys) // 2
        mid_key = self.keys[mid_idx]

        sibling_node = TreeNode(is_leaf=self.is_leaf, order=self.order)
        sibling_node.keys = self.keys[mid_idx + 1:]

        if self.is_leaf:
            # Copy-up: the separator stays in the left leaf because lookups
            # send keys that are <= separator to the left child.
            self.keys = self.keys[: mid_idx + 1]

            # Splice the sibling between this leaf and its old successor so
            # the leaf chain stays intact after repeated splits.
            sibling_node.next = self.next
            sibling_node.previous = self
            if self.next is not None:
                self.next.previous = sibling_node
            self.next = sibling_node
        else:
            # Push-up: the separator moves to the parent only, which keeps
            # len(children) == len(keys) + 1 on both halves.
            self.keys = self.keys[:mid_idx]

            sibling_node.children = self.children[mid_idx + 1:]
            self.children = self.children[: mid_idx + 1]

            # The migrated children now belong to the sibling.
            for child in sibling_node.children:
                child.parent = sibling_node

        return mid_key, sibling_node
class BPlusTree:
    """In-memory B+Tree that stores keys and supports insert and lookup.

    Keys must be mutually comparable, because every node keeps its keys
    sorted. ``order`` is forwarded to each ``TreeNode`` the tree creates.
    """

    def __init__(self, order=4):
        self.root = TreeNode(is_leaf=True, order=order)
        self.order = order

    def _find_leaf_node(self, node, key) -> "TreeNode":
        """Descend from ``node`` to the leaf that should contain ``key``."""
        while not node.is_leaf:
            # First separator that is >= key, i.e. the first child whose
            # range (<= separator) can hold the key.
            idx = bisect.bisect_left(node.keys, key)
            # A key greater than every separator belongs to the last child.
            node = node.children[min(idx, len(node.children) - 1)]
        return node

    def _split_and_promote(self, node: "TreeNode"):
        """Split ``node`` while it overflows, pushing separators upward.

        Each split may overflow the parent in turn, so the loop walks up
        the tree until a node fits or a new root has been created.
        """
        while len(node.keys) > node.max_num_of_keys:
            mid_key, sibling = node.split()
            parent = node.parent

            if parent is None:
                # The root itself was split: grow the tree by one level.
                new_root = TreeNode(is_leaf=False, order=self.order)
                new_root.keys = [mid_key]
                new_root.children = [node, sibling]
                node.parent = new_root
                sibling.parent = new_root
                self.root = new_root
                return

            sibling.parent = parent
            # Place the separator by the split child's slot instead of
            # searching for its value: the separator for child i always
            # lives at key index i and the new sibling right after the child.
            slot = parent.children.index(node)
            parent.keys.insert(slot, mid_key)
            parent.children.insert(slot + 1, sibling)

            node = parent

    def insert(self, key):
        """Insert ``key`` into the tree, rebalancing on overflow."""
        leaf = self._find_leaf_node(self.root, key)
        leaf.insert(key)
        self._split_and_promote(leaf)

    def find(self, key):
        """Return ``key`` if it is stored in the tree, otherwise ``None``."""
        leaf = self._find_leaf_node(self.root, key)
        idx = bisect.bisect_left(leaf.keys, key)
        if idx < len(leaf.keys) and leaf.keys[idx] == key:
            return key
        return None

    def graph(self):
        """Render the tree with Graphviz to ``graph`` and open the viewer.

        Debugging aid only: it writes the rendered output next to the
        current working directory and launches a viewer (``view=True``).
        """
        from collections import deque

        dot = graphviz.Digraph()
        dot.attr("node", shape="square")

        # Breadth-first walk; every node is reached exactly once through
        # its parent, so each edge is emitted exactly once.
        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            # Use the object identity as the Graphviz node id so two nodes
            # that happen to hold equal key lists are not merged.
            node_id = str(id(node))
            dot.node(node_id, str(node.keys))

            if node.parent is not None:
                dot.edge(str(id(node.parent)), node_id)

            pending.extend(node.children)

        dot.render("graph", view=True)
class BTreeTest(TestCase):
    """Insert / lookup tests for ``BPlusTree``."""

    def test_small_inputs(self):
        """Insert 0..99 in ascending order and look every key up again."""
        test_num_of_keys = 100
        btree = BPlusTree(order=5)
        inputs = list(range(test_num_of_keys))

        for i in inputs:
            try:
                btree.insert(i)
            except Exception:
                # Dump the tree before failing to help debugging.
                btree.graph()
                self.fail(f"cannot insert key {i}")

        for i in inputs:
            # Keep the try body to the lookup only, so the failure raised
            # below is not swallowed and re-reported by the handler.
            try:
                key = btree.find(key=i)
            except Exception:
                self.fail(f"cannot find key {i}")
            if key is None:
                btree.graph()
                self.fail(f"cannot find key {i}")

        # NOTE(review): graph() renders a file and opens a viewer on every
        # run; consider guarding this when running unattended.
        btree.graph()

    def test_insert_and_retrieve(self):
        """Insert shuffled int and str keys for many orders and find them all."""
        test_num_of_keys = 1000
        test_orders = 100
        test_array = [
            list(range(1, test_num_of_keys + 1)),
            [str(uuid4()) for _ in range(test_num_of_keys)],
        ]
        for inputs in test_array:
            for order in range(3, test_orders):
                with self.subTest(
                    msg=f"test orders {order} with input type {type(inputs[0])}"
                ):
                    btree = BPlusTree(order=order)
                    # random.shuffle works in place and returns None, so the
                    # copy has to be kept and iterated for the shuffle to
                    # have any effect.
                    shuffled = list(inputs)
                    random.shuffle(shuffled)

                    for i in shuffled:
                        btree.insert(i)
                    for i in shuffled:
                        key = btree.find(key=i)
                        if key is None:
                            self.fail(f"cannot find key {i}")

    def test_delete_without_underflow(self):
        """Delete one key from a small tree (requires ``BPlusTree.delete``)."""
        btree = BPlusTree(order=4)
        if not hasattr(btree, "delete"):
            # Report a skip instead of an AttributeError while deletion is
            # still missing from the tree.
            self.skipTest("BPlusTree.delete is not implemented yet")

        for i in range(8):
            btree.insert(i)

        btree.graph()

        btree.delete(2)
        btree.graph()
Status](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml/badge.svg)](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml) +- Storage Engine + - B+Tree + - SSTable +- Page Cache +- Execution Engine +- SQL Parser + - SQL Optimizer +- Server -[Riak's Bitcask paper](https://riak.com/assets/bitcask-intro.pdf) implementation in Golang +Distributed System -## Todo - -- [ ] Implement key deletion -- [ ] Implement CRC -- [ ] Implement Max file size -- [ ] Implement Log Merging - - [ ] Implement merge trigger - - [ ] Fragmentation - - [ ] Dead bytes - - [ ] Implement merge interval -- [ ] Add support for ranged query - -## Benchmark - -| Ops | Result | -|---------------------------------|-------------------------------------------------------------| -| Unbuffered Write | `BenchmarkDiskStorage_Set-8 651841 1737 ns/op` -| Buffered Write | `BenchmarkDiskStorage_Set-8 2569089 501.8 ns/op` | -| Buffered Write + Sync after set | `BenchmarkDiskStorage_Set-8 7879 313756 ns/op` - -## Credits - -This repo is inspired by [py-caskdb](https://github.com/avinassh/py-caskdb/) \ No newline at end of file +- Raft