feat: generate data randomly for ibd file #60

Merged
merged 1 commit on Dec 30, 2024
52 changes: 49 additions & 3 deletions devtools/deploy_mysqld.py
@@ -5,13 +5,19 @@
import json
from pprint import pprint

from dataclasses import dataclass, asdict
from dataclasses import dataclass, asdict, fields
from testcontainers.mysql import MySqlContainer
from docker.models.containers import Container
from testcontainers.core.config import testcontainers_config as c

from sqlalchemy import create_engine


from pyinnodb import const
from pyinnodb import disk_struct
from pyinnodb.disk_struct.index import MIndexHeader, MSDIPage, MSystemRecord
from pyinnodb.sdi.table import Column, Table

c.ryuk_disabled = True


@@ -20,8 +26,8 @@ def main():
pass


@main.command()
def list():
@main.command(name="list")
def tlist():
data = load_deploy()
pprint(data)

@@ -138,5 +144,45 @@ def exec(version, sql, file):
result = conn.exec_driver_sql(sql)
print(result.all()[0][1])

@main.command()
@click.option("--version", type=click.STRING, default="")
@click.option("--table", type=click.STRING, default="")
@click.option("--size", type=click.INT, default=100)
@click.option("--idx", type=click.INT, default=-1)
@click.option("--int-range", type=click.INT, default=256)
@click.option("--str-size", type=click.INT, default=20)
def rand_data(version, table, size, idx, int_range, str_size):
deploy_container = load_deploy()
if version not in deploy_container:
mDeploy(version)
deploy_container = load_deploy()

table_ibd = deploy_container.get(version).datadir + f"/test/{table}.ibd"
if not os.path.exists(table_ibd):
print(f"\n\n\n{table} is not exists now, please create first\n\n\n")
os.system(deploy_container.get(version).cmd)
else:
f = open(table_ibd, "rb")
fsp = disk_struct.MFspPage.parse_stream(f)
if fsp.sdi_version == 0:
print("version of mysql is not support")
return
f.seek(fsp.sdi_page_no * const.PAGE_SIZE)
sdi_page = MSDIPage.parse_stream(f)
all_tables = [d for d in sdi_page.iterate_sdi_record(f) if d["dd_object_type"] == "Table"]
if len(all_tables) > 1 and idx == -1:
print("these is more than one table, please use --idx to specify one")
return
elif len(all_tables) == 1:
idx = 0
dd_object = Table(**all_tables[idx]["dd_object"])
sql = dd_object.gen_rand_data_sql(size, int_range, str_size)
engine = create_engine(deploy_container.get(version).url)
with engine.connect() as conn:
conn.exec_driver_sql(sql)
conn.commit()
print(f"insert {size} record randomly into {dd_object.schema_ref}.{dd_object.name}")


if __name__ == "__main__":
main()
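
For reference, the same flow can be driven from Python directly. A minimal sketch, assuming a MySQL 8.0 table file t1.ibd (the path and table are hypothetical) and using only calls that appear in the diff above:

from pyinnodb import const, disk_struct
from pyinnodb.disk_struct.index import MSDIPage
from pyinnodb.sdi.table import Table

with open("t1.ibd", "rb") as f:  # hypothetical path to an .ibd with an SDI page
    fsp = disk_struct.MFspPage.parse_stream(f)  # page 0 holds the FSP header
    f.seek(fsp.sdi_page_no * const.PAGE_SIZE)   # jump to the SDI page
    sdi_page = MSDIPage.parse_stream(f)
    tables = [d for d in sdi_page.iterate_sdi_record(f)
              if d["dd_object_type"] == "Table"]
    table = Table(**tables[0]["dd_object"])
    # one multi-row INSERT covering the non-primary-key int/str columns
    print(table.gen_rand_data_sql(10, int_range=256, str_size=20))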
30 changes: 7 additions & 23 deletions src/pyinnodb/cli/sql.py
@@ -3,6 +3,7 @@
from pyinnodb.disk_struct.index import MSDIPage, MIndexPage
from pyinnodb.sdi.table import Table
from pyinnodb.disk_struct.data import MGeo
import dataclasses

import json

@@ -68,9 +69,7 @@ def tosql(ctx, mode, sdi_idx, schema):
if table_object.comment
else ""
)
print(
f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}"
)
print(f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}")
else:
table_object = Table(**sdi_page.ddl(f, sdi_idx)["dd_object"])
root_page_no = int(table_object.indexes[0].private_data.get("root", 4))
@@ -82,28 +81,11 @@
if first_leaf_page_no is None:
print("no data")
return

values = []

def transfter(nd):
vs = []
for field in nd:
if isinstance(field, dict) or isinstance(field, list):
vs.append(repr(json.dumps(field)))
elif field is None:
vs.append("NULL")
elif (
isinstance(field, date)
or isinstance(field, timedelta)
or isinstance(field, datetime)
):
vs.append(f"'{str(field)}'")
elif isinstance(field, MGeo):
d = field.build().hex() # .zfill(50)
vs.append("0x" + d)
elif isinstance(field, bytes):
vs.append("0x"+field.hex())
else:
vs.append(repr(field))
vs = table_object.transfer(nd)
values.append(f"({','.join(vs)})")

default_value_parser = MIndexPage.default_value_parser(
@@ -117,7 +99,9 @@ def transfter(nd):

table_name = f"`{table_object.schema_ref}`.`{table_object.name}`"
print(
f"INSERT INTO {table_name}({','.join(table_object.DataClass._fields)}) values {', '.join(values)}"
f"INSERT INTO {table_name}({','.join(
table_object.keys()
)}) values {', '.join(values)}"
)

return
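The inline transfter helper is removed in favor of Table.transfer, added in table.py further down. A rough sketch of the mapping it performs, assuming table is a parsed Table (row values and key names are hypothetical):

from datetime import date

row = [42, None, date(2024, 12, 30), b"\xde\xad", {"k": 1}]
fragments = table.transfer(row, keys=["a", "b", "c", "d", "e"])
# int   -> repr:                  '42'
# None  -> 'NULL'
# date  -> quoted string:         "'2024-12-30'"
# bytes -> hex literal:           '0xdead'
# dict  -> repr(json.dumps(...)): '\'{"k": 1}\''
print("(" + ",".join(fragments) + ")")  # (42,NULL,'2024-12-30',0xdead,'{"k": 1}')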
68 changes: 35 additions & 33 deletions src/pyinnodb/const/dd_column_type.py
@@ -1,5 +1,6 @@
from enum import Enum
from collections import namedtuple
from datetime import datetime, date, timedelta


class DDColumnType(Enum):
@@ -116,42 +117,43 @@ def is_big(cls, t):
DDColumnType.VECTOR,
]

DDColConf = namedtuple("DDColConf", "type size")
DDColConf = namedtuple("DDColConf", "type size pytype")

nop = namedtuple("nop", "")

class DDColConf(DDColConf, Enum):
DECIMAL = DDColumnType.DECIMAL, 0
TINY = DDColumnType.TINY, 1
SHORT = DDColumnType.SHORT, 2
LONG = DDColumnType.LONG, 4
FLOAT = DDColumnType.FLOAT, 4
DOUBLE = DDColumnType.DOUBLE, 8
TYPE_NULL = DDColumnType.TYPE_NULL, 0
TIMESTAMP = DDColumnType.TIMESTAMP, 0
LONGLONG = DDColumnType.LONGLONG, 8
INT24 = DDColumnType.INT24, 3
DATE = DDColumnType.DATE, 0
TIME = DDColumnType.TIME, 0
DATETIME = DDColumnType.DATETIME, 0
YEAR = DDColumnType.YEAR, 1
NEWDATE = DDColumnType.NEWDATE, 3
VARCHAR = DDColumnType.VARCHAR, 0
BIT = DDColumnType.BIT, 0
TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0
DATETIME2 = DDColumnType.DATETIME2, 0
TIME2 = DDColumnType.TIME2, 0
NEWDECIMAL = DDColumnType.NEWDECIMAL, 0
ENUM = DDColumnType.ENUM, 0
SET = DDColumnType.SET, 0
TINY_BLOB = DDColumnType.TINY_BLOB, 0
MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0
LONG_BLOB = DDColumnType.LONG_BLOB, 0
BLOB = DDColumnType.BLOB, 0
VAR_STRING = DDColumnType.VAR_STRING, 0
STRING = DDColumnType.STRING, 0
GEOMETRY = DDColumnType.GEOMETRY, 0
JSON = DDColumnType.JSON, 0
VECTOR = DDColumnType.VECTOR, 0
DECIMAL = DDColumnType.DECIMAL, 0, float
TINY = DDColumnType.TINY, 1, int
SHORT = DDColumnType.SHORT, 2, int
LONG = DDColumnType.LONG, 4, int
FLOAT = DDColumnType.FLOAT, 4, float
DOUBLE = DDColumnType.DOUBLE, 8, float
TYPE_NULL = DDColumnType.TYPE_NULL, 0, int
TIMESTAMP = DDColumnType.TIMESTAMP, 0, int
LONGLONG = DDColumnType.LONGLONG, 8, int
INT24 = DDColumnType.INT24, 3, int
DATE = DDColumnType.DATE, 0, date
TIME = DDColumnType.TIME, 0, timedelta
DATETIME = DDColumnType.DATETIME, 0, datetime
YEAR = DDColumnType.YEAR, 1, int
NEWDATE = DDColumnType.NEWDATE, 3, date
VARCHAR = DDColumnType.VARCHAR, 0, str
BIT = DDColumnType.BIT, 0, int
TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0, int
DATETIME2 = DDColumnType.DATETIME2, 0, datetime
TIME2 = DDColumnType.TIME2, 0, timedelta
NEWDECIMAL = DDColumnType.NEWDECIMAL, 0, float
ENUM = DDColumnType.ENUM, 0, str
SET = DDColumnType.SET, 0, set
TINY_BLOB = DDColumnType.TINY_BLOB, 0, str
MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0, str
LONG_BLOB = DDColumnType.LONG_BLOB, 0, str
BLOB = DDColumnType.BLOB, 0, str
VAR_STRING = DDColumnType.VAR_STRING, 0, str
STRING = DDColumnType.STRING, 0, str
GEOMETRY = DDColumnType.GEOMETRY, 0, nop
JSON = DDColumnType.JSON, 0, str
VECTOR = DDColumnType.VECTOR, 0, list

@classmethod
def get_col_type_conf(cls, type) -> DDColConf:
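With pytype on the tuple, the Python type for a column kind becomes a single lookup. A minimal sketch; whether get_col_type_conf takes the enum member or its integer value is an assumption here (the new Column.pytype property in table.py below passes the raw Column.type from the SDI):

from pyinnodb.const.dd_column_type import DDColumnType, DDColConf

# assumption: the lookup key matches what Column.type stores in the SDI
conf = DDColConf.get_col_type_conf(DDColumnType.LONG.value)
print(conf.type, conf.size, conf.pytype)  # LONG: a 4-byte column decoded as int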
95 changes: 90 additions & 5 deletions src/pyinnodb/sdi/table.py
@@ -3,6 +3,7 @@
import struct
import decimal
import dataclasses
import random
import re

import sys
@@ -11,14 +12,15 @@
from functools import cache
else:
cache = lambda x: x
from dataclasses import dataclass
from dataclasses import dataclass, field
from collections import namedtuple
from base64 import b64decode
from datetime import timedelta, datetime, date


from .. import const
from ..const.dd_column_type import DDColumnType, DDColConf
from ..const.dd_column_type import DDColumnType, DDColConf, nop
from ..disk_struct.varsize import VarSize, OffPagePointer

from ..disk_struct.data import MTime2, MDatetime, MDate, MTimestamp, MGeo
from ..disk_struct.json import MJson
from ..disk_struct.rollback import MRollbackPointer
@@ -124,6 +126,25 @@ class Column:
collation_id: int = 0
is_explicit_collation: bool = False

@property
@cache
def pytype(self):
return DDColConf.get_col_type_conf(self.type).pytype

@property
@cache
def dfield(self):
kw_only = False
default = dataclasses.MISSING
if self.pytype == nop:
kw_only = True
default = None
return field(
default=default,
kw_only=kw_only,
metadata={"col": self},
)

def index_prefix(self, ie: IndexElement):
if ie.length == 4294967295:
return 0, False
@@ -702,6 +723,70 @@ def DataClassHiddenCol(self):

return namedtuple(self.name, " ".join(cols))

def keys(self, no_primary=False, for_rand=False):
v = [f.name for f in dataclasses.fields(self.DataClass)]
if not no_primary and not for_rand:
return v
primary_key_name = [f.name for f in self.get_primary_key_col()]
v = [f for f in v if f not in primary_key_name]
if not for_rand:
return v
target = [f.name for f in dataclasses.fields(self.DataClass) if f.type in [int, str]]

return [f for f in v if f in target]

def gen_rand_data_sql(self, size, int_range=256, str_size=20):
rand_key = self.keys(for_rand=True)
values = []
for dc in self.gen_rand_data(size, int_range, str_size):
values.append("(" + ",".join(self.transfer(dc, rand_key)) + ")")

return f"INSERT INTO `{self.schema_ref}`.`{self.name}`({','.join(rand_key)}) values {', '.join(values)}"

def gen_rand_data(self, size, int_range=256, str_size=20):
keys = self.keys(for_rand=True)
vs = []
for i in range(size):
v = []
for f in dataclasses.fields(self.DataClass):
if f.name not in keys:
continue
if f.type == int:
v.append(random.randint(0, int_range))
elif f.type == str:
v.append(random.randbytes(str_size).hex())
vs.append(v)
return vs

def transfer(self, dc, keys=None):
vs = []
if keys is None:
value = dataclasses.astuple(dc)
elif isinstance(dc, self.DataClass):
value = [getattr(dc, k) for k in keys]
elif isinstance(dc, list):
value = dc
for f in value:
if isinstance(f, dict) or isinstance(f, list):
vs.append(repr(json.dumps(f)))
elif f is None:
vs.append("NULL")
elif (
isinstance(f, date)
or isinstance(f, timedelta)
or isinstance(f, datetime)
):
vs.append(f"'{str(f)}'")
elif isinstance(f, MGeo):
d = f.build().hex() # .zfill(50)
vs.append("0x" + d)
elif isinstance(f, bytes):
vs.append("0x"+f.hex())
else:
vs.append(repr(f))
return vs


@property
@cache
def DataClass(self):
Expand All @@ -715,9 +800,9 @@ def DataClass(self):
continue
if c.is_virtual or c.generation_expression_utf8 != "":
continue
cols.append(c.name)
cols.append([c.name, c.pytype, c.dfield])

return namedtuple(self.name, " ".join(cols))
return dataclasses.make_dataclass(self.name, cols)

@property
@cache
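
Taken together: DataClass is now a real dataclass whose field types come from pytype, keys(for_rand=True) keeps only the non-primary-key int/str fields, and gen_rand_data_sql turns random rows into a single INSERT statement. An end-to-end sketch, assuming table was parsed from SDI as in rand_data above and describes a hypothetical t(id INT PRIMARY KEY, a INT, b VARCHAR(64)) in schema test:

import random

random.seed(0)  # only to make the illustration repeatable

print(table.keys(for_rand=True))  # ['a', 'b']; the primary key id is dropped
rows = table.gen_rand_data(2, int_range=256, str_size=4)
# ints come from randint(0, int_range) (both ends inclusive); strings are the
# hex of str_size random bytes, e.g. [[197, 'c5d71484'], [215, 'f8de2665']]
sql = table.gen_rand_data_sql(2, int_range=256, str_size=4)
# e.g. INSERT INTO `test`.`t`(a,b) values (197,'c5d71484'), (215,'f8de2665')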