Skip to content

Commit

Permalink
move stuff around
Browse files Browse the repository at this point in the history
  • Loading branch information
luqmansen committed Oct 14, 2024
1 parent ac5c282 commit 71d5af8
Show file tree
Hide file tree
Showing 24 changed files with 344 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
# test data output
testdata/

venv/

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

Expand Down
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions .idea/db.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
13 changes: 13 additions & 0 deletions python/Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
graphviz = "*"

[dev-packages]

[requires]
python_version = "3.9"
python_full_version = "3.9.6"
31 changes: 31 additions & 0 deletions python/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

147 changes: 147 additions & 0 deletions python/b_tree_in_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
B+Tree implementation in python, held entirely in memory (no disk persistence)
"""

import bisect
from math import ceil
from typing import TypeVar, Optional, List

import graphviz

T = TypeVar("T")


class TreeNode:
    """A single node of the B+Tree.

    Leaf nodes hold the stored keys and are chained together through
    ``next`` / ``previous``; internal nodes hold separator keys plus the
    child nodes they route to.
    """

    def __init__(self, is_leaf=False, order=3):
        """Create an empty node.

        Args:
            is_leaf: whether the node is a leaf (stores keys) or internal.
            order: maximum number of children; a node overflows once it
                holds more than ``order - 1`` keys.
        """
        self.order: int = order
        self.is_leaf: bool = is_leaf
        self.keys: List[T] = []
        self.children: List["TreeNode"] = []
        self.parent: Optional["TreeNode"] = None

        # Sibling links; only maintained for leaf nodes (see ``split``).
        self.next: Optional["TreeNode"] = None
        self.previous: Optional["TreeNode"] = None

    def __str__(self):
        return str(list(self.keys))

    @property
    def max_num_of_keys(self):
        """Largest number of keys the node may hold without overflowing."""
        return self.order - 1

    @property
    def max_num_of_child(self):
        """Largest number of children the node may have."""
        return self.order

    @property
    def min_num_of_key(self):
        """Smallest number of keys: ``ceil(order / 2) - 1``."""
        return ceil(self.order / 2) - 1

    def insert(self, key):
        """Insert ``key`` into this node's key list, keeping it sorted."""
        bisect.insort(self.keys, key)

    def split(self):
        """Split this node in two around its middle key.

        The upper half of the keys moves into a newly created right sibling
        of the same kind and order. The caller is responsible for attaching
        the sibling to a parent.

        * Leaf: the middle key stays in this (left) node, because lookups
          route ``key <= separator`` to the left. The sibling is spliced
          into the leaf chain right after this node.
        * Internal: the middle key is promoted only, it is not kept in
          either half, so both halves keep ``len(children) == len(keys) + 1``.
          Children that move are re-parented to the sibling.

        Returns:
            ``(mid_key, sibling_node)``: the separator to promote to the
            parent and the new right sibling.
        """
        mid_idx = len(self.keys) // 2
        mid_key = self.keys[mid_idx]

        sibling_node = TreeNode(is_leaf=self.is_leaf, order=self.order)
        sibling_node.keys = self.keys[mid_idx + 1:]

        if self.is_leaf:
            self.keys = self.keys[: mid_idx + 1]

            # Splice the sibling between this leaf and its former successor
            # so the doubly linked leaf chain stays intact.
            successor = self.next
            sibling_node.previous = self
            sibling_node.next = successor
            if successor is not None:
                successor.previous = sibling_node
            self.next = sibling_node
        else:
            # The middle key moves up to the parent; keeping it here too
            # would leave this node with one key per child.
            self.keys = self.keys[:mid_idx]

            sibling_node.children = self.children[mid_idx + 1:]
            self.children = self.children[: mid_idx + 1]
            for child in sibling_node.children:
                child.parent = sibling_node

        return mid_key, sibling_node


class BPlusTree:
    """In-memory B+Tree supporting key insertion, lookup and rendering."""

    def __init__(self, order=4):
        """Create an empty tree whose root is a single leaf.

        Args:
            order: maximum number of children per node; passed on to every
                node the tree creates.
        """
        self.root = TreeNode(is_leaf=True, order=order)
        self.order = order

    def _find_leaf_node(self, node, key) -> TreeNode:
        """Descend from ``node`` to the leaf that should contain ``key``.

        At each internal node the child in front of the first separator
        that is ``>= key`` is followed; if every separator is smaller, the
        last child is followed.
        """
        while not node.is_leaf:
            # Binary search for the first separator >= key.
            idx = bisect.bisect_left(node.keys, key)
            if idx < len(node.keys):
                node = node.children[idx]
            else:
                node = node.children[-1]
        return node

    def _split_and_promote(self, node: TreeNode):
        """Split ``node`` while it overflows, pushing separators upwards.

        Each split promotes the middle key into the parent, which may in
        turn overflow; the loop then continues with the parent. Splitting
        the root creates a new root above it.
        """
        while len(node.keys) > node.max_num_of_keys:
            mid_key, sibling = node.split()
            parent = node.parent

            if parent is None:
                # The root itself was split: grow the tree by one level.
                new_root = TreeNode(is_leaf=False, order=self.order)
                new_root.keys = [mid_key]
                new_root.children = [node, sibling]
                node.parent = new_root
                sibling.parent = new_root
                self.root = new_root
                return

            sibling.parent = parent
            # Place the separator and the sibling directly after the node
            # that was split, using its position instead of a search by
            # key value.
            slot = parent.children.index(node)
            parent.keys.insert(slot, mid_key)
            parent.children.insert(slot + 1, sibling)

            node = parent

    def insert(self, key):
        """Insert ``key`` into the tree, splitting nodes on overflow."""
        leaf = self._find_leaf_node(self.root, key)
        leaf.insert(key)

        if len(leaf.keys) > leaf.max_num_of_keys:
            self._split_and_promote(leaf)

    def find(self, key):
        """Return ``key`` if it is stored in the tree, otherwise ``None``."""
        leaf = self._find_leaf_node(self.root, key)
        return key if key in leaf.keys else None

    def graph(self):
        """Render the tree breadth-first with Graphviz and open the result.

        Writes the output under the name ``graph`` in the current working
        directory. Each node is labelled with its key list but identified
        by ``id(node)``, so two nodes holding equal key lists are still
        drawn as separate boxes.
        """
        from collections import deque

        dot = graphviz.Digraph()
        dot.attr("node", shape="square")

        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            node_id = str(id(node))
            dot.node(node_id, str(node.keys))

            if node.parent is not None:
                dot.edge(str(id(node.parent)), node_id)

            pending.extend(node.children)

        dot.render("graph", view=True)
63 changes: 63 additions & 0 deletions python/test_b_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import random
from unittest import TestCase
from uuid import uuid4

from python.b_tree_in_memory import BPlusTree


class BTreeTest(TestCase):
    """Insertion and lookup tests for BPlusTree, plus a deletion scenario."""

    def test_small_inputs(self):
        """Insert 0..99 in ascending order into an order-5 tree, then find each key."""
        test_num_of_keys = 100
        btree = BPlusTree(order=5)
        inputs = list(range(test_num_of_keys))

        for i in inputs:
            try:
                btree.insert(i)
            except Exception:
                # Render the tree so the failing state can be inspected.
                btree.graph()
                self.fail(f"cannot insert key {i}")

        for i in inputs:
            try:
                key = btree.find(key=i)
            except Exception:
                self.fail(f"cannot find key {i}")
            if key is None:
                btree.graph()
                self.fail(f"cannot find key {i}")

        btree.graph()

    def test_insert_and_retrieve(self):
        """Insert 1000 int keys and 1000 UUID strings in random order for orders 3..99."""
        test_num_of_keys = 1000
        test_orders = 100
        test_array = [
            list(range(1, test_num_of_keys + 1)),
            [str(uuid4()) for _ in range(test_num_of_keys)],
        ]
        for inputs in test_array:
            for order in range(3, test_orders):
                with self.subTest(
                    msg=f"test orders {order} with input type {type(inputs[0])}"
                ):
                    btree = BPlusTree(order=order)
                    # random.shuffle works in place, so shuffle a named copy
                    # and insert from that copy; shuffling a temporary list
                    # would leave the insertion order unchanged.
                    shuffled = list(inputs)
                    random.shuffle(shuffled)

                    for i in shuffled:
                        btree.insert(i)
                    for i in shuffled:
                        key = btree.find(key=i)
                        if key is None:
                            self.fail(f"cannot find key {i}")

    def test_delete_without_underflow(self):
        """Delete one key from a small order-4 tree, rendering before and after."""
        btree = BPlusTree(order=4)
        for i in range(8):
            btree.insert(i)

        btree.graph()

        # NOTE(review): no `delete` method is visible on BPlusTree in
        # b_tree_in_memory.py; confirm it exists before relying on this test.
        btree.delete(2)
        btree.graph()
38 changes: 11 additions & 27 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,14 @@
# go-caskdb
Create your own DB from scratch

[![codecov](https://codecov.io/gh/luqmansen/go-caskdb/branch/master/graph/badge.svg)](https://codecov.io/gh/luqmansen/go-caskdb)
[![Actions Status](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml/badge.svg)](https://github.com/luqmansen/go-caskdb/actions/workflows/test.yml)
- Storage Engine
- B+Tree
- SSTable
- Page Cache
- Execution Engine
- SQL Parser
- SQL Optimizer
- Server

[Riak's Bitcask paper](https://riak.com/assets/bitcask-intro.pdf) implementation in Golang
Distributed System

## Todo

- [ ] Implement key deletion
- [ ] Implement CRC
- [ ] Implement Max file size
- [ ] Implement Log Merging
- [ ] Implement merge trigger
- [ ] Fragmentation
- [ ] Dead bytes
- [ ] Implement merge interval
- [ ] Add support for ranged query

## Benchmark

| Ops | Result |
|---------------------------------|-------------------------------------------------------------|
| Unbuffered Write | `BenchmarkDiskStorage_Set-8 651841 1737 ns/op`
| Buffered Write | `BenchmarkDiskStorage_Set-8 2569089 501.8 ns/op` |
| Buffered Write + Sync after set | `BenchmarkDiskStorage_Set-8 7879 313756 ns/op`

## Credits

This repo is inspired by [py-caskdb](https://github.com/avinassh/py-caskdb/)
- Raft

0 comments on commit 71d5af8

Please sign in to comment.