From 09882584c9e67fbc87a4a2909ea37e3bc024f541 Mon Sep 17 00:00:00 2001 From: yongcai Date: Mon, 30 Dec 2024 22:58:37 +0800 Subject: [PATCH] feat: generate data randomly for ibd file --- devtools/deploy_mysqld.py | 52 ++++++++++++++- src/pyinnodb/cli/sql.py | 30 ++------- src/pyinnodb/const/dd_column_type.py | 68 ++++++++++---------- src/pyinnodb/sdi/table.py | 95 ++++++++++++++++++++++++++-- 4 files changed, 181 insertions(+), 64 deletions(-) diff --git a/devtools/deploy_mysqld.py b/devtools/deploy_mysqld.py index 499797f..d1eef7b 100644 --- a/devtools/deploy_mysqld.py +++ b/devtools/deploy_mysqld.py @@ -5,13 +5,19 @@ import json from pprint import pprint -from dataclasses import dataclass, asdict +from dataclasses import dataclass, asdict, fields from testcontainers.mysql import MySqlContainer from docker.models.containers import Container from testcontainers.core.config import testcontainers_config as c from sqlalchemy import create_engine + +from pyinnodb import const +from pyinnodb import disk_struct +from pyinnodb.disk_struct.index import MIndexHeader, MSDIPage, MSystemRecord +from pyinnodb.sdi.table import Column, Table + c.ryuk_disabled = True @@ -20,8 +26,8 @@ def main(): pass -@main.command() -def list(): +@main.command(name="list") +def tlist(): data = load_deploy() pprint(data) @@ -138,5 +144,45 @@ def exec(version, sql, file): result = conn.exec_driver_sql(sql) print(result.all()[0][1]) +@main.command() +@click.option("--version", type=click.STRING, default="") +@click.option("--table", type=click.STRING, default="") +@click.option("--size", type=click.INT, default=100) +@click.option("--idx", type=click.INT, default=-1) +@click.option("--int-range", type=click.INT, default=256) +@click.option("--str-size", type=click.INT, default=20) +def rand_data(version, table, size, idx, int_range, str_size): + deploy_container = load_deploy() + if version not in deploy_container: + mDeploy(version) + deploy_container = load_deploy() + + table_ibd = 
deploy_container.get(version).datadir + f"/test/{table}.ibd" + if not os.path.exists(table_ibd): + print(f"\n\n\n{table} does not exist yet, please create it first\n\n\n") + os.system(deploy_container.get(version).cmd) + else: + f = open(table_ibd, "rb") + fsp = disk_struct.MFspPage.parse_stream(f) + if fsp.sdi_version == 0: + print("this version of mysql is not supported") + return + f.seek(fsp.sdi_page_no * const.PAGE_SIZE) + sdi_page = MSDIPage.parse_stream(f) + all_tables = [d for d in sdi_page.iterate_sdi_record(f) if d["dd_object_type"] == "Table"] + if len(all_tables) > 1 and idx == -1: + print("there is more than one table, please use --idx to specify one") + return + elif len(all_tables) == 1: + idx = 0 + dd_object = Table(**all_tables[idx]["dd_object"]) + sql = dd_object.gen_rand_data_sql(size, int_range, str_size) + engine = create_engine(deploy_container.get(version).url) + with engine.connect() as conn: + conn.exec_driver_sql(sql) + conn.commit() + print(f"inserted {size} records randomly into {dd_object.schema_ref}.{dd_object.name}") + + if __name__ == "__main__": main() diff --git a/src/pyinnodb/cli/sql.py b/src/pyinnodb/cli/sql.py index ee6c57a..2f689ff 100644 --- a/src/pyinnodb/cli/sql.py +++ b/src/pyinnodb/cli/sql.py @@ -3,6 +3,7 @@ from pyinnodb.disk_struct.index import MSDIPage, MIndexPage from pyinnodb.sdi.table import Table from pyinnodb.disk_struct.data import MGeo +import dataclasses import json @@ -68,9 +69,7 @@ def tosql(ctx, mode, sdi_idx, schema): if table_object.comment else "" ) - print( - f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}" - ) + print(f"CREATE TABLE {table_name} ({columns_dec}) {desc}{parts}{comment}") else: table_object = Table(**sdi_page.ddl(f, sdi_idx)["dd_object"]) root_page_no = int(table_object.indexes[0].private_data.get("root", 4)) @@ -82,28 +81,11 @@ def tosql(ctx, mode, sdi_idx, schema): if first_leaf_page_no is None: print("no data") return + values = [] def transfter(nd): - vs = [] - for field in nd: - 
if isinstance(field, dict) or isinstance(field, list): - vs.append(repr(json.dumps(field))) - elif field is None: - vs.append("NULL") - elif ( - isinstance(field, date) - or isinstance(field, timedelta) - or isinstance(field, datetime) - ): - vs.append(f"'{str(field)}'") - elif isinstance(field, MGeo): - d = field.build().hex() # .zfill(50) - vs.append("0x" + d) - elif isinstance(field, bytes): - vs.append("0x"+field.hex()) - else: - vs.append(repr(field)) + vs = table_object.transfer(nd) values.append(f"({','.join(vs)})") default_value_parser = MIndexPage.default_value_parser( @@ -117,7 +99,9 @@ def transfter(nd): table_name = f"`{table_object.schema_ref}`.`{table_object.name}`" print( - f"INSERT INTO {table_name}({','.join(table_object.DataClass._fields)}) values {', '.join(values)}" + f"INSERT INTO {table_name}({','.join( + table_object.keys() + )}) values {', '.join(values)}" ) return diff --git a/src/pyinnodb/const/dd_column_type.py b/src/pyinnodb/const/dd_column_type.py index 2e86210..7eb37fa 100644 --- a/src/pyinnodb/const/dd_column_type.py +++ b/src/pyinnodb/const/dd_column_type.py @@ -1,5 +1,6 @@ from enum import Enum from collections import namedtuple +from datetime import datetime, date, timedelta class DDColumnType(Enum): @@ -116,42 +117,43 @@ def is_big(cls, t): DDColumnType.VECTOR, ] -DDColConf = namedtuple("DDColConf", "type size") +DDColConf = namedtuple("DDColConf", "type size pytype") +nop = namedtuple("nop", "") class DDColConf(DDColConf, Enum): - DECIMAL = DDColumnType.DECIMAL, 0 - TINY = DDColumnType.TINY, 1 - SHORT = DDColumnType.SHORT, 2 - LONG = DDColumnType.LONG, 4 - FLOAT = DDColumnType.FLOAT, 4 - DOUBLE = DDColumnType.DOUBLE, 8 - TYPE_NULL = DDColumnType.TYPE_NULL, 0 - TIMESTAMP = DDColumnType.TIMESTAMP, 0 - LONGLONG = DDColumnType.LONGLONG, 8 - INT24 = DDColumnType.INT24, 3 - DATE = DDColumnType.DATE, 0 - TIME = DDColumnType.TIME, 0 - DATETIME = DDColumnType.DATETIME, 0 - YEAR = DDColumnType.YEAR, 1 - NEWDATE = DDColumnType.NEWDATE, 3 - 
VARCHAR = DDColumnType.VARCHAR, 0 - BIT = DDColumnType.BIT, 0 - TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0 - DATETIME2 = DDColumnType.DATETIME2, 0 - TIME2 = DDColumnType.TIME2, 0 - NEWDECIMAL = DDColumnType.NEWDECIMAL, 0 - ENUM = DDColumnType.ENUM, 0 - SET = DDColumnType.SET, 0 - TINY_BLOB = DDColumnType.TINY_BLOB, 0 - MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0 - LONG_BLOB = DDColumnType.LONG_BLOB, 0 - BLOB = DDColumnType.BLOB, 0 - VAR_STRING = DDColumnType.VAR_STRING, 0 - STRING = DDColumnType.STRING, 0 - GEOMETRY = DDColumnType.GEOMETRY, 0 - JSON = DDColumnType.JSON, 0 - VECTOR = DDColumnType.VECTOR, 0 + DECIMAL = DDColumnType.DECIMAL, 0, float + TINY = DDColumnType.TINY, 1, int + SHORT = DDColumnType.SHORT, 2, int + LONG = DDColumnType.LONG, 4, int + FLOAT = DDColumnType.FLOAT, 4, float + DOUBLE = DDColumnType.DOUBLE, 8, float + TYPE_NULL = DDColumnType.TYPE_NULL, 0, int + TIMESTAMP = DDColumnType.TIMESTAMP, 0, int + LONGLONG = DDColumnType.LONGLONG, 8, int + INT24 = DDColumnType.INT24, 3, int + DATE = DDColumnType.DATE, 0, date + TIME = DDColumnType.TIME, 0, timedelta + DATETIME = DDColumnType.DATETIME, 0, datetime + YEAR = DDColumnType.YEAR, 1, int + NEWDATE = DDColumnType.NEWDATE, 3, date + VARCHAR = DDColumnType.VARCHAR, 0, str + BIT = DDColumnType.BIT, 0, int + TIMESTAMP2 = DDColumnType.TIMESTAMP2, 0, int + DATETIME2 = DDColumnType.DATETIME2, 0, datetime + TIME2 = DDColumnType.TIME2, 0, timedelta + NEWDECIMAL = DDColumnType.NEWDECIMAL, 0, float + ENUM = DDColumnType.ENUM, 0, str + SET = DDColumnType.SET, 0, set + TINY_BLOB = DDColumnType.TINY_BLOB, 0, str + MEDIUM_BLOB = DDColumnType.MEDIUM_BLOB, 0, str + LONG_BLOB = DDColumnType.LONG_BLOB, 0, str + BLOB = DDColumnType.BLOB, 0, str + VAR_STRING = DDColumnType.VAR_STRING, 0, str + STRING = DDColumnType.STRING, 0, str + GEOMETRY = DDColumnType.GEOMETRY, 0, nop + JSON = DDColumnType.JSON, 0, str + VECTOR = DDColumnType.VECTOR, 0, list @classmethod def get_col_type_conf(cls, type) -> DDColConf: diff --git 
a/src/pyinnodb/sdi/table.py b/src/pyinnodb/sdi/table.py index 97b5cc8..7fb7feb 100644 --- a/src/pyinnodb/sdi/table.py +++ b/src/pyinnodb/sdi/table.py @@ -3,6 +3,7 @@ import struct import decimal import dataclasses +import random import re import sys @@ -11,14 +12,15 @@ from functools import cache else: cache = lambda x: x -from dataclasses import dataclass +from dataclasses import dataclass, field from collections import namedtuple from base64 import b64decode +from datetime import timedelta, datetime, date + from .. import const -from ..const.dd_column_type import DDColumnType, DDColConf +from ..const.dd_column_type import DDColumnType, DDColConf, nop from ..disk_struct.varsize import VarSize, OffPagePointer - from ..disk_struct.data import MTime2, MDatetime, MDate, MTimestamp, MGeo from ..disk_struct.json import MJson from ..disk_struct.rollback import MRollbackPointer @@ -124,6 +126,25 @@ class Column: collation_id: int = 0 is_explicit_collation: bool = False + @property + @cache + def pytype(self): + return DDColConf.get_col_type_conf(self.type).pytype + + @property + @cache + def dfield(self): + kw_only = False + default = dataclasses.MISSING + if self.pytype == nop: + kw_only = True + default = None + return field( + default=default, + kw_only=kw_only, + metadata={"col": self}, + ) + def index_prefix(self, ie: IndexElement): if ie.length == 4294967295: return 0, False @@ -702,6 +723,70 @@ def DataClassHiddenCol(self): return namedtuple(self.name, " ".join(cols)) + def keys(self, no_primary=False, for_rand=False): + v = [f.name for f in dataclasses.fields(self.DataClass)] + if not no_primary and not for_rand: + return v + primary_key_name = [f.name for f in self.get_primary_key_col()] + v = [f for f in v if f not in primary_key_name] + if not for_rand: + return v + target = [f.name for f in dataclasses.fields(self.DataClass) if f.type in [int, str]] + + return [f for f in v if f in target] + + def gen_rand_data_sql(self, size, int_range=256, str_size=20): + 
rand_key = self.keys(for_rand=True) + values = [] + for dc in self.gen_rand_data(size, int_range, str_size): + values.append("(" + ",".join(self.transfer(dc, rand_key)) + ")") + + return f"INSERT INTO `{self.schema_ref}`.`{self.name}`({','.join(rand_key)}) values {', '.join(values)}" + + def gen_rand_data(self, size, int_range=256, str_size=20): + keys = self.keys(for_rand=True) + vs = [] + for i in range(size): + v = [] + for f in dataclasses.fields(self.DataClass): + if f.name not in keys: + continue + if f.type == int: + v.append(random.randint(0, int_range)) + elif f.type == str: + v.append(random.randbytes(str_size).hex()) + vs.append(v) + return vs + + def transfer(self, dc, keys=None): + vs = [] + if keys is None: + value = dataclasses.astuple(dc) + elif isinstance(dc, self.DataClass): + value = [getattr(dc, k) for k in keys] + elif isinstance(dc, list): + value = dc + for f in value: + if isinstance(f, dict) or isinstance(f, list): + vs.append(repr(json.dumps(f))) + elif f is None: + vs.append("NULL") + elif ( + isinstance(f, date) + or isinstance(f, timedelta) + or isinstance(f, datetime) + ): + vs.append(f"'{str(f)}'") + elif isinstance(f, MGeo): + d = f.build().hex() # .zfill(50) + vs.append("0x" + d) + elif isinstance(f, bytes): + vs.append("0x"+f.hex()) + else: + vs.append(repr(f)) + return vs + + @property @cache def DataClass(self): @@ -715,9 +800,9 @@ def DataClass(self): continue if c.is_virtual or c.generation_expression_utf8 != "": continue - cols.append(c.name) + cols.append([c.name, c.pytype, c.dfield]) - return namedtuple(self.name, " ".join(cols)) + return dataclasses.make_dataclass(self.name, cols) @property @cache