Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use orjson to improve JSON marshalling performance #691

Merged
merged 2 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@ Changes for crate
Unreleased
==========

- Switched JSON encoder to use the `orjson`_ library, to improve JSON
marshalling performance. Thanks, @widmogrod.

orjson is fast and in some spots even more correct when compared against
Python's stdlib ``json`` module. Contrary to the stdlib variant, orjson
will serialize to ``bytes`` instead of ``str``. When sending data to CrateDB,
``crate-python`` uses a custom encoder to add support for additional data
types.

- Python's ``Decimal`` type will be serialized to ``str``.
- Python's ``dt.datetime`` and ``dt.date`` types will be serialized to
``int`` (``LONG``) after converting to milliseconds since epoch, to
optimally accommodate CrateDB's `TIMESTAMP`_ representation.
- NumPy's data types will be handled natively by ``orjson``, without any
custom encoding.

.. _orjson: https://github.com/ijl/orjson
.. _TIMESTAMP: https://cratedb.com/docs/crate/reference/en/latest/general/ddl/data-types.html#type-timestamp

2024/11/23 1.0.1
================
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def read(path):
packages=find_namespace_packages("src"),
package_dir={"": "src"},
install_requires=[
"orjson<4",
"urllib3",
"verlib2",
],
Expand Down
80 changes: 54 additions & 26 deletions src/crate/client/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,22 @@


import calendar
import datetime as dt
import heapq
import io
import json
import logging
import os
import re
import socket
import ssl
import threading
import typing as t
from base64 import b64encode
from datetime import date, datetime, timezone
from decimal import Decimal
from time import time
from urllib.parse import urlparse
from uuid import UUID

import orjson
import urllib3
from urllib3 import connection_from_url
from urllib3.connection import HTTPConnection
Expand Down Expand Up @@ -86,25 +86,53 @@
return None


class CrateJsonEncoder(json.JSONEncoder):
    """
    JSON encoder for the stdlib `json` module, with additional type support.

    - `Decimal` and `UUID` values are serialized to `str`.
    - `datetime` and `date` values are serialized to `int`, as
      milliseconds since the Unix epoch.
    """

    # Two epoch references are kept, because naive and aware datetime
    # objects can not be subtracted from each other.
    epoch_aware = datetime(1970, 1, 1, tzinfo=timezone.utc)
    epoch_naive = datetime(1970, 1, 1)

    def default(self, o):
        """Convert `o` to a JSON-compatible value, or defer to the base class."""
        if isinstance(o, (Decimal, UUID)):
            return str(o)

        # `datetime` is a subclass of `date`, so it must be checked first.
        if isinstance(o, datetime):
            epoch = self.epoch_naive if o.tzinfo is None else self.epoch_aware
            elapsed = o - epoch
            whole_seconds = elapsed.seconds + elapsed.days * 24 * 3600
            return int(elapsed.microseconds / 1000.0 + whole_seconds * 1000.0)

        if isinstance(o, date):
            return calendar.timegm(o.timetuple()) * 1000

        # Anything else: let the base class raise its `TypeError`.
        return super().default(o)
# Unix epoch reference points. Two variants are needed, because naive and
# timezone-aware datetime objects can not be subtracted from each other.
epoch_aware = dt.datetime(1970, 1, 1, tzinfo=dt.timezone.utc)
epoch_naive = dt.datetime(1970, 1, 1)


def json_encoder(obj: t.Any) -> t.Union[int, str]:
    """
    Encoder function for orjson, with additional type support.

    - Python's `Decimal` type will be serialized to `str`.
    - Python's `dt.datetime` and `dt.date` types will be
      serialized to `int` after converting to milliseconds
      since epoch.

    Timezone-aware datetime objects are measured against the UTC epoch.
    Naive datetime objects are measured against the naive epoch, i.e. they
    are taken at face value, without any timezone conversion.

    :param obj: The object handed over as not natively serializable.
    :return: A JSON-compatible replacement value (`int` or `str`).
    :raises TypeError: If the type of `obj` is not supported.

    https://github.com/ijl/orjson#default
    https://cratedb.com/docs/crate/reference/en/latest/general/ddl/data-types.html#type-timestamp
    """
    if isinstance(obj, Decimal):
        return str(obj)
    # `dt.datetime` is a subclass of `dt.date`, so it must be checked first.
    if isinstance(obj, dt.datetime):
        if obj.tzinfo is not None:
            delta = obj - epoch_aware
        else:
            delta = obj - epoch_naive
        return int(
            delta.microseconds / 1000.0
            + (delta.seconds + delta.days * 24 * 3600) * 1000.0
        )
    if isinstance(obj, dt.date):
        return calendar.timegm(obj.timetuple()) * 1000
    # Name the offending type, so the failure is actionable for the caller.
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )

Check warning on line 118 in src/crate/client/http.py

View check run for this annotation

Codecov / codecov/patch

src/crate/client/http.py#L118

Added line #L118 was not covered by tests


def json_dumps(obj: t.Any) -> bytes:
    """
    Serialize `obj` to JSON using `orjson`, with additional type support.

    Values which `orjson` does not serialize by itself are handed to
    `json_encoder`. The result is `bytes`, not `str`.

    https://github.com/ijl/orjson
    """
    # Combine the orjson option flags up front, to keep the call compact.
    options = (
        orjson.OPT_PASSTHROUGH_DATETIME
        | orjson.OPT_NON_STR_KEYS
        | orjson.OPT_SERIALIZE_NUMPY
    )
    return orjson.dumps(obj, default=json_encoder, option=options)


class Server:
Expand Down Expand Up @@ -180,7 +208,7 @@

def _json_from_response(response):
try:
return json.loads(response.data.decode("utf-8"))
return orjson.loads(response.data)
except ValueError as ex:
raise ProgrammingError(
"Invalid server response of content-type '{}':\n{}".format(
Expand Down Expand Up @@ -223,7 +251,7 @@
if response.status == 503:
raise ConnectionError(message)
if response.headers.get("content-type", "").startswith("application/json"):
data = json.loads(response.data.decode("utf-8"))
data = orjson.loads(response.data)
error = data.get("error", {})
error_trace = data.get("error_trace", None)
if "results" in data:
Expand Down Expand Up @@ -323,7 +351,7 @@
kwargs["ssl_minimum_version"] = ssl.TLSVersion.MINIMUM_SUPPORTED


def _create_sql_payload(stmt, args, bulk_args):
def _create_sql_payload(stmt, args, bulk_args) -> bytes:
if not isinstance(stmt, str):
raise ValueError("stmt is not a string")
if args and bulk_args:
Expand All @@ -334,7 +362,7 @@
data["args"] = args
if bulk_args:
data["bulk_args"] = bulk_args
return json.dumps(data, cls=CrateJsonEncoder)
return json_dumps(data)


def _get_socket_opts(
Expand Down
10 changes: 5 additions & 5 deletions tests/client/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@
)
from crate.client.http import (
Client,
CrateJsonEncoder,
_get_socket_opts,
_remove_certs_for_non_https,
json_dumps,
)

REQUEST = "crate.client.http.Server.request"
Expand Down Expand Up @@ -724,10 +724,10 @@ def test_username(self):
class TestCrateJsonEncoder(TestCase):
def test_naive_datetime(self):
data = dt.datetime.fromisoformat("2023-06-26T09:24:00.123")
result = json.dumps(data, cls=CrateJsonEncoder)
self.assertEqual(result, "1687771440123")
result = json_dumps(data)
self.assertEqual(result, b"1687771440123")

def test_aware_datetime(self):
data = dt.datetime.fromisoformat("2023-06-26T09:24:00.123+02:00")
result = json.dumps(data, cls=CrateJsonEncoder)
self.assertEqual(result, "1687764240123")
result = json_dumps(data)
self.assertEqual(result, b"1687764240123")
Loading