Skip to content

Commit

Permalink
feat: generate data randomly for ibd file
Browse files Browse the repository at this point in the history
  • Loading branch information
yongcai committed Dec 30, 2024
1 parent d5aced1 commit 0988258
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 64 deletions.
52 changes: 49 additions & 3 deletions devtools/deploy_mysqld.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@
import json
from pprint import pprint

from dataclasses import dataclass, asdict
from dataclasses import dataclass, asdict, fields
from testcontainers.mysql import MySqlContainer
from docker.models.containers import Container
from testcontainers.core.config import testcontainers_config as c

from sqlalchemy import create_engine


from pyinnodb import const
from pyinnodb import disk_struct
from pyinnodb.disk_struct.index import MIndexHeader, MSDIPage, MSystemRecord
from pyinnodb.sdi.table import Column, Table

c.ryuk_disabled = True


Expand All @@ -20,8 +26,8 @@ def main():
pass


@main.command()
def list():
@main.command(name="list")
def tlist():
data = load_deploy()
pprint(data)

Expand Down Expand Up @@ -138,5 +144,45 @@ def exec(version, sql, file):
result = conn.exec_driver_sql(sql)
print(result.all()[0][1])

@main.command()
@click.option("--version", type=click.STRING, default="")
@click.option("--table", type=click.STRING, default="")
@click.option("--size", type=click.INT, default=100)
@click.option("--idx", type=click.INT, default=-1)
@click.option("--int-range", type=click.INT, default=256)
@click.option("--str-size", type=click.INT, default=20)
def rand_data(version, table, size, idx, int_range, str_size):
    """Insert `size` randomly generated rows into `table` of the deployed
    MySQL `version`.

    Column types are recovered from the SDI (serialized dictionary
    information) record embedded in the table's .ibd file; `--idx` selects
    one table when the SDI page describes several.
    """
    deploy_container = load_deploy()
    if version not in deploy_container:
        # Deploy the requested version on demand, then reload the registry.
        mDeploy(version)
        deploy_container = load_deploy()

    table_ibd = deploy_container.get(version).datadir + f"/test/{table}.ibd"
    if not os.path.exists(table_ibd):
        print(f"\n\n\n{table} does not exist yet, please create it first\n\n\n")
        os.system(deploy_container.get(version).cmd)
        return

    # Fix: the original leaked the file handle (open without close);
    # `with` guarantees it is released even on early return.
    with open(table_ibd, "rb") as f:
        fsp = disk_struct.MFspPage.parse_stream(f)
        if fsp.sdi_version == 0:
            print("this version of mysql is not supported")
            return
        f.seek(fsp.sdi_page_no * const.PAGE_SIZE)
        sdi_page = MSDIPage.parse_stream(f)
        all_tables = [
            d
            for d in sdi_page.iterate_sdi_record(f)
            if d["dd_object_type"] == "Table"
        ]

    # Fix: guard against zero Table records and out-of-range --idx, which
    # previously crashed with IndexError on all_tables[idx].
    if not all_tables:
        print("no table definition found in the SDI page")
        return
    if len(all_tables) > 1 and idx == -1:
        print("there is more than one table, please use --idx to specify one")
        return
    if len(all_tables) == 1:
        idx = 0
    if not 0 <= idx < len(all_tables):
        print(f"--idx must be between 0 and {len(all_tables) - 1}")
        return

    dd_object = Table(**all_tables[idx]["dd_object"])
    sql = dd_object.gen_rand_data_sql(size, int_range, str_size)
    engine = create_engine(deploy_container.get(version).url)
    with engine.connect() as conn:
        conn.exec_driver_sql(sql)
        conn.commit()
    print(f"insert {size} record randomly into {dd_object.schema_ref}.{dd_object.name}")


if __name__ == "__main__":
main()
30 changes: 7 additions & 23 deletions src/pyinnodb/cli/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pyinnodb.disk_struct.index import MSDIPage, MIndexPage
from pyinnodb.sdi.table import Table
from pyinnodb.disk_struct.data import MGeo
import dataclasses

import json

Expand Down Expand Up @@ -68,9 +69,7 @@ def tosql(ctx, mode, sdi_idx, schema):
if table_object.comment
else ""
)
print(
f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}"
)
print(f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}")
else:
table_object = Table(**sdi_page.ddl(f, sdi_idx)["dd_object"])
root_page_no = int(table_object.indexes[0].private_data.get("root", 4))
Expand All @@ -82,28 +81,11 @@ def tosql(ctx, mode, sdi_idx, schema):
if first_leaf_page_no is None:
print("no data")
return

values = []

def transfter(nd):
vs = []
for field in nd:
if isinstance(field, dict) or isinstance(field, list):
vs.append(repr(json.dumps(field)))
elif field is None:
vs.append("NULL")
elif (
isinstance(field, date)
or isinstance(field, timedelta)
or isinstance(field, datetime)
):
vs.append(f"'{str(field)}'")
elif isinstance(field, MGeo):
d = field.build().hex() # .zfill(50)
vs.append("0x" + d)
elif isinstance(field, bytes):
vs.append("0x"+field.hex())
else:
vs.append(repr(field))
vs = table_object.transfer(nd)
values.append(f"({','.join(vs)})")

default_value_parser = MIndexPage.default_value_parser(
Expand All @@ -117,7 +99,9 @@ def transfter(nd):

table_name = f"`{table_object.schema_ref}`.`{table_object.name}`"
print(
f"INSERT INTO {table_name}({','.join(table_object.DataClass._fields)}) values {', '.join(values)}"
f"INSERT INTO {table_name}({','.join(
table_object.keys()
)}) values {', '.join(values)}"
)

return
Expand Down
68 changes: 35 additions & 33 deletions src/pyinnodb/const/dd_column_type.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum
from collections import namedtuple
from datetime import datetime, date, timedelta


class DDColumnType(Enum):
Expand Down Expand Up @@ -116,42 +117,43 @@ def is_big(cls, t):
DDColumnType.VECTOR,
]

DDColConf = namedtuple("DDColConf", "type size")
DDColConf = namedtuple("DDColConf", "type size pytype")

nop = namedtuple("nop", "")

class DDColConf(DDColConf, Enum):
DECIMAL = DDColumnType.DECIMAL, 0
TINY = DDColumnType.TINY, 1
SHORT = DDColumnType.SHORT, 2
LONG = DDColumnType.LONG, 4
FLOAT = DDColumnType.FLOAT, 4
DOUBLE = DDColumnType.DOUBLE, 8
TYPE_NULL = DDColumnType.TYPE_NULL, 0
TIMESTAMP = DDColumnType.TIMESTAMP, 0
LONGLONG = DDColumnType.LONGLONG, 8
INT24 = DDColumnType.INT24, 3
DATE = DDColumnType.DATE, 0
TIME = DDColumnType.TIME, 0
DATETIME = DDColumnType.DATETIME, 0
YEAR = DDColumnType.YEAR, 1
NEWDATE = DDColumnType.NEWDATE, 3
VARCHAR = DDColumnType.VARCHAR, 0
BIT = DDColumnType.BIT, 0
TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0
DATETIME2 = DDColumnType.DATETIME2, 0
TIME2 = DDColumnType.TIME2, 0
NEWDECIMAL = DDColumnType.NEWDECIMAL, 0
ENUM = DDColumnType.ENUM, 0
SET = DDColumnType.SET, 0
TINY_BLOB = DDColumnType.TINY_BLOB, 0
MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0
LONG_BLOB = DDColumnType.LONG_BLOB, 0
BLOB = DDColumnType.BLOB, 0
VAR_STRING = DDColumnType.VAR_STRING, 0
STRING = DDColumnType.STRING, 0
GEOMETRY = DDColumnType.GEOMETRY, 0
JSON = DDColumnType.JSON, 0
VECTOR = DDColumnType.VECTOR, 0
DECIMAL = DDColumnType.DECIMAL, 0, float
TINY = DDColumnType.TINY, 1, int
SHORT = DDColumnType.SHORT, 2, int
LONG = DDColumnType.LONG, 4, int
FLOAT = DDColumnType.FLOAT, 4, float
DOUBLE = DDColumnType.DOUBLE, 8, float
TYPE_NULL = DDColumnType.TYPE_NULL, 0, int
TIMESTAMP = DDColumnType.TIMESTAMP, 0, int
LONGLONG = DDColumnType.LONGLONG, 8, int
INT24 = DDColumnType.INT24, 3, int
DATE = DDColumnType.DATE, 0, date
TIME = DDColumnType.TIME, 0, timedelta
DATETIME = DDColumnType.DATETIME, 0, datetime
YEAR = DDColumnType.YEAR, 1, int
NEWDATE = DDColumnType.NEWDATE, 3, date
VARCHAR = DDColumnType.VARCHAR, 0, str
BIT = DDColumnType.BIT, 0, int
TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0, int
DATETIME2 = DDColumnType.DATETIME2, 0, datetime
TIME2 = DDColumnType.TIME2, 0, timedelta
NEWDECIMAL = DDColumnType.NEWDECIMAL, 0, float
ENUM = DDColumnType.ENUM, 0, str
SET = DDColumnType.SET, 0, set
TINY_BLOB = DDColumnType.TINY_BLOB, 0, str
MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0, str
LONG_BLOB = DDColumnType.LONG_BLOB, 0, str
BLOB = DDColumnType.BLOB, 0, str
VAR_STRING = DDColumnType.VAR_STRING, 0, str
STRING = DDColumnType.STRING, 0, str
GEOMETRY = DDColumnType.GEOMETRY, 0, nop
JSON = DDColumnType.JSON, 0, str
VECTOR = DDColumnType.VECTOR, 0, list

@classmethod
def get_col_type_conf(cls, type) -> DDColConf:
Expand Down
95 changes: 90 additions & 5 deletions src/pyinnodb/sdi/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import struct
import decimal
import dataclasses
import random
import re

import sys
Expand All @@ -11,14 +12,15 @@
from functools import cache
else:
cache = lambda x: x
from dataclasses import dataclass
from dataclasses import dataclass, field
from collections import namedtuple
from base64 import b64decode
from datetime import timedelta, datetime, date


from .. import const
from ..const.dd_column_type import DDColumnType, DDColConf
from ..const.dd_column_type import DDColumnType, DDColConf, nop
from ..disk_struct.varsize import VarSize, OffPagePointer

from ..disk_struct.data import MTime2, MDatetime, MDate, MTimestamp, MGeo
from ..disk_struct.json import MJson
from ..disk_struct.rollback import MRollbackPointer
Expand Down Expand Up @@ -124,6 +126,25 @@ class Column:
collation_id: int = 0
is_explicit_collation: bool = False

@property
@cache
def pytype(self):
    """Python type mapped from this column's DD type via DDColConf
    (e.g. LONG -> int, VARCHAR -> str); `nop` marks unsupported types."""
    return DDColConf.get_col_type_conf(self.type).pytype

@property
@cache
def dfield(self):
    """Build the dataclasses.field() descriptor for this column.

    Columns whose pytype is the `nop` sentinel (types without a plain
    Python mapping) become keyword-only with a None default; all other
    columns stay positional with no default. The originating Column is
    stashed in the field metadata under "col".
    """
    meta = {"col": self}
    if self.pytype == nop:
        # Unmappable type: keyword-only so it can be omitted by callers.
        return field(default=None, kw_only=True, metadata=meta)
    return field(default=dataclasses.MISSING, kw_only=False, metadata=meta)

def index_prefix(self, ie: IndexElement):
if ie.length == 4294967295:
return 0, False
Expand Down Expand Up @@ -702,6 +723,70 @@ def DataClassHiddenCol(self):

return namedtuple(self.name, " ".join(cols))

def keys(self, no_primary=False, for_rand=False):
    """Return the column names of this table's DataClass.

    no_primary drops primary-key columns; for_rand additionally drops
    primary keys and keeps only int/str columns (the types supported by
    the random-data generator).
    """
    all_fields = dataclasses.fields(self.DataClass)
    names = [f.name for f in all_fields]
    if not (no_primary or for_rand):
        return names
    primary = {col.name for col in self.get_primary_key_col()}
    names = [n for n in names if n not in primary]
    if not for_rand:
        return names
    # Only int/str columns can be filled with random data.
    simple = {f.name for f in all_fields if f.type in (int, str)}
    return [n for n in names if n in simple]

def gen_rand_data_sql(self, size, int_range=256, str_size=20):
    """Build one INSERT statement with `size` rows of random data for
    this table's non-primary int/str columns."""
    rand_key = self.keys(for_rand=True)
    rendered = [
        "(" + ",".join(self.transfer(row, rand_key)) + ")"
        for row in self.gen_rand_data(size, int_range, str_size)
    ]
    return f"INSERT INTO `{self.schema_ref}`.`{self.name}`({','.join(rand_key)}) values {', '.join(rendered)}"

def gen_rand_data(self, size, int_range=256, str_size=20):
    """Generate `size` rows of random values for the columns selected by
    keys(for_rand=True).

    int columns get a random value in [0, int_range]; str columns get a
    random hex string. NOTE(review): randbytes(str_size).hex() yields
    2*str_size characters, not str_size — confirm the column widths
    tolerate this before relying on str_size as a length cap.
    """
    names = set(self.keys(for_rand=True))
    # Hoisted out of the row loop: the selected fields are loop-invariant.
    rand_fields = [
        f for f in dataclasses.fields(self.DataClass) if f.name in names
    ]
    rows = []
    for _ in range(size):  # row index itself is unused
        row = []
        for f in rand_fields:
            if f.type == int:
                row.append(random.randint(0, int_range))
            elif f.type == str:
                # random.randbytes requires Python 3.9+
                row.append(random.randbytes(str_size).hex())
        rows.append(row)
    return rows

def transfer(self, dc, keys=None):
    """Render a row's values as SQL literal strings.

    dc may be a DataClass instance (all fields, or the subset named by
    `keys`) or a plain sequence of values. Returns a list of strings
    ready to be comma-joined inside a VALUES(...) tuple.
    """
    if keys is None:
        value = dataclasses.astuple(dc)
    elif isinstance(dc, self.DataClass):
        value = [getattr(dc, k) for k in keys]
    else:
        # Fix: the original only accepted `list` here, leaving `value`
        # unbound (NameError) for tuples or other sequences.
        value = dc
    vs = []
    for f in value:
        if isinstance(f, (dict, list)):
            # JSON columns: embed as a quoted JSON string.
            vs.append(repr(json.dumps(f)))
        elif f is None:
            vs.append("NULL")
        elif isinstance(f, (date, timedelta, datetime)):
            vs.append(f"'{str(f)}'")
        elif isinstance(f, MGeo):
            # Geometry: emit as a hex blob literal.
            vs.append("0x" + f.build().hex())
        elif isinstance(f, bytes):
            vs.append("0x" + f.hex())
        else:
            vs.append(repr(f))
    return vs


@property
@cache
def DataClass(self):
Expand All @@ -715,9 +800,9 @@ def DataClass(self):
continue
if c.is_virtual or c.generation_expression_utf8 != "":
continue
cols.append(c.name)
cols.append([c.name, c.pytype, c.dfield])

return namedtuple(self.name, " ".join(cols))
return dataclasses.make_dataclass(self.name, cols)

@property
@cache
Expand Down

0 comments on commit 0988258

Please sign in to comment.