Merge pull request #51 from piercefreeman/feature/optimize-insert
Optimize insertion speed
piercefreeman authored Dec 27, 2024
2 parents c6ba7f4 + 188c4c7 commit 4f0c1aa
Showing 2 changed files with 135 additions and 25 deletions.
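The change replaces per-row INSERT statements with multi-row INSERTs, batched so that each statement stays under PostgreSQL's limit of 65,535 bind parameters, with an additional 5,000-row cap to bound memory. A minimal standalone sketch of the sizing arithmetic, using hypothetical helper names rather than iceaxe's actual API:

    from math import ceil

    PG_MAX_PARAMETERS = 65535  # PostgreSQL's per-statement bind-parameter limit

    def batch_size_for(num_fields: int, cap: int = 5000) -> int:
        # Each row consumes num_fields parameters, so rows_per_batch * num_fields
        # must stay at or below PG_MAX_PARAMETERS; the cap bounds per-batch memory.
        return min(PG_MAX_PARAMETERS // num_fields, cap)

    def num_batches(num_rows: int, num_fields: int) -> int:
        return ceil(num_rows / batch_size_for(num_fields))

    # The benchmark below inserts 500,000 UserDemo rows with (as in the benchmark)
    # two insertable columns, name and email: 65535 // 2 = 32767, capped to 5000
    # rows per batch, so 100 INSERT statements instead of 500,000.
    print(batch_size_for(2))        # 5000
    print(num_batches(500_000, 2))  # 100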
45 changes: 45 additions & 0 deletions iceaxe/__tests__/benchmarks/test_bulk_insert.py
@@ -0,0 +1,45 @@
import time
from typing import Sequence

import pytest

from iceaxe.__tests__.conf_models import UserDemo
from iceaxe.logging import CONSOLE, LOGGER
from iceaxe.session import DBConnection


def generate_test_users(count: int) -> Sequence[UserDemo]:
"""
Generate a sequence of test users for bulk insertion.
:param count: Number of users to generate
:return: Sequence of UserDemo instances
"""
return [
UserDemo(name=f"User {i}", email=f"user{i}@example.com") for i in range(count)
]


@pytest.mark.asyncio
@pytest.mark.integration_tests
async def test_bulk_insert_performance(db_connection: DBConnection):
"""
Test the performance of bulk inserting 500k records.
"""
NUM_USERS = 500_000
users = generate_test_users(NUM_USERS)
LOGGER.info(f"Generated {NUM_USERS} test users")

start_time = time.time()

await db_connection.insert(users)

total_time = time.time() - start_time
records_per_second = NUM_USERS / total_time

CONSOLE.print("\nBulk Insert Performance:")
CONSOLE.print(f"Total time: {total_time:.2f} seconds")
CONSOLE.print(f"Records per second: {records_per_second:.2f}")

result = await db_connection.conn.fetchval("SELECT COUNT(*) FROM userdemo")
assert result == NUM_USERS
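Assuming pytest-asyncio is installed (implied by the asyncio marker above) and the integration_tests marker is enabled in the project's pytest configuration, the benchmark can presumably be run against a live test PostgreSQL database with something like:

    # hypothetical invocation; -s keeps the CONSOLE output visible
    pytest -m integration_tests iceaxe/__tests__/benchmarks/test_bulk_insert.py -s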
115 changes: 90 additions & 25 deletions iceaxe/session.py
@@ -1,6 +1,7 @@
from collections import defaultdict
from contextlib import asynccontextmanager
from json import loads as json_loads
+from math import ceil
from typing import (
    Any,
    Literal,
@@ -33,6 +34,9 @@

TableType = TypeVar("TableType", bound=TableBase)

+# PostgreSQL has a limit of 65535 parameters per query
+PG_MAX_PARAMETERS = 65535
+

class DBConnection:
    """
@@ -235,37 +239,98 @@ async def insert(self, objects: Sequence[TableBase]):
        if not objects:
            return

-        for model, model_objects in self._aggregate_models_by_table(objects):
-            table_name = QueryIdentifier(model.get_table_name())
-            fields = {
-                field: info
-                for field, info in model.model_fields.items()
-                if (not info.exclude and not info.autoincrement)
-            }
-            field_string = ", ".join(f'"{field}"' for field in fields)
-            primary_key = self._get_primary_key(model)
-
-            placeholders = ", ".join(f"${i}" for i in range(1, len(fields) + 1))
-            query = f"INSERT INTO {table_name} ({field_string}) VALUES ({placeholders})"
-            if primary_key:
-                query += f" RETURNING {primary_key}"
+        # Reuse a single transaction for all inserts
+        async with self._ensure_transaction():
+            for model, model_objects in self._aggregate_models_by_table(objects):
+                # For each table, build batched insert queries
+                table_name = QueryIdentifier(model.get_table_name())
+                fields = {
+                    field: info
+                    for field, info in model.model_fields.items()
+                    if (not info.exclude and not info.autoincrement)
+                }
+                primary_key = self._get_primary_key(model)
+                field_names = list(
+                    fields.keys()
+                )  # Iterate over these in order for each row
+                field_identifiers = ", ".join(f'"{f}"' for f in field_names)
+
+                # Calculate max batch size based on number of fields
+                # Each row uses len(fields) parameters, so max_batch_size * len(fields) <= PG_MAX_PARAMETERS
+                max_batch_size = PG_MAX_PARAMETERS // len(fields)
+                # Cap at 5000 rows per batch to avoid excessive memory usage
+                max_batch_size = min(max_batch_size, 5000)
+
+                total = len(model_objects)
+                num_batches = ceil(total / max_batch_size)
+
+                for batch_idx in range(num_batches):
+                    start_idx = batch_idx * max_batch_size
+                    end_idx = (batch_idx + 1) * max_batch_size
+                    batch_objects = model_objects[start_idx:end_idx]
+
+                    # Build the multi-row VALUES clause
+                    # e.g. for 3 rows with 2 columns, we'd want:
+                    # VALUES ($1, $2), ($3, $4), ($5, $6)
+                    num_rows = len(batch_objects)
+                    if not num_rows:
+                        continue
+
-            async with self._ensure_transaction():
-                for obj in model_objects:
-                    obj_values = obj.model_dump()
-                    values = [
-                        info.to_db_value(obj_values[field])
-                        for field, info in fields.items()
-                    ]
-                    result = await self.conn.fetchrow(query, *values)
+                    # placeholders per row: ($1, $2, ...)
+                    # but we have to shift the placeholder index for each row
+                    placeholders: list[str] = []
+                    values: list[Any] = []
+                    param_index = 1
+
+                    for obj in batch_objects:
+                        obj_values = obj.model_dump()
+                        row_values = []
+                        for field in field_names:
+                            info = fields[field]
+                            row_values.append(info.to_db_value(obj_values[field]))
+                        values.extend(row_values)
+                        row_placeholder = (
+                            "("
+                            + ", ".join(
+                                f"${p}"
+                                for p in range(
+                                    param_index, param_index + len(field_names)
+                                )
+                            )
+                            + ")"
+                        )
+                        placeholders.append(row_placeholder)
+                        param_index += len(field_names)
+
+                    placeholders_clause = ", ".join(placeholders)
+
+                    query = f"""
+                        INSERT INTO {table_name} ({field_identifiers})
+                        VALUES {placeholders_clause}
+                    """
+                    if primary_key:
+                        query += f" RETURNING {primary_key}"
+
+                    # Insert them in one go
+                    if primary_key:
+                        rows = await self.conn.fetch(query, *values)
+                        # 'rows' should be a list of Record objects, one per inserted row
+                        # Update each object in the same order
+                        for obj, row in zip(batch_objects, rows):
+                            setattr(obj, primary_key, row[primary_key])
+                    else:
+                        # No need to fetch anything if there's no primary key
+                        await self.conn.execute(query, *values)
+
-                    if primary_key and result:
-                        setattr(obj, primary_key, result[primary_key])
-                    obj.clear_modified_attributes()
+                    # Mark as unmodified
+                    for obj in batch_objects:
+                        obj.clear_modified_attributes()

        # Register modification callbacks outside the main insert loop
        for obj in objects:
            obj.register_modified_callback(self.modification_tracker.track_modification)

        # Clear modification status
        self.modification_tracker.clear_status(objects)

    @overload
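For a concrete picture of what the batched path sends to the database, the following simplified, standalone snippet mirrors the placeholder construction above for a two-row batch. It assumes UserDemo's insertable columns are name and email and that its autoincrement primary key is a column named id; those column names are illustrative, not confirmed by this diff.

    # Simplified, standalone illustration (not iceaxe internals) of the statement
    # shape the batched path produces for a tiny UserDemo batch.
    field_names = ["name", "email"]
    rows = [("User 0", "user0@example.com"), ("User 1", "user1@example.com")]

    placeholders = []
    values = []
    param_index = 1
    for row in rows:
        values.extend(row)
        placeholders.append(
            "(" + ", ".join(f"${p}" for p in range(param_index, param_index + len(field_names))) + ")"
        )
        param_index += len(field_names)

    query = (
        'INSERT INTO "userdemo" ("name", "email") '
        f"VALUES {', '.join(placeholders)} "
        "RETURNING id"
    )
    print(query)   # INSERT INTO "userdemo" ("name", "email") VALUES ($1, $2), ($3, $4) RETURNING id
    print(values)  # ['User 0', 'user0@example.com', 'User 1', 'user1@example.com']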
