fix: replace pydantic with msgspec

This ends up giving us a 1.5x speed improvement:

```
❯ hyperfine --warmup 1 ".\old\Scripts\python.exe test_nzb_old.py" ".\.venv\Scripts\python.exe test_nzb.py"
Benchmark 1: .\old\Scripts\python.exe test_nzb_old.py
  Time (mean ± σ):      6.042 s ±  0.295 s    [User: 5.752 s, System: 0.259 s]
  Range (min … max):    5.754 s …  6.413 s    10 runs

Benchmark 2: .\.venv\Scripts\python.exe test_nzb.py
  Time (mean ± σ):      3.902 s ±  0.019 s    [User: 3.624 s, System: 0.222 s]
  Range (min … max):    3.871 s …  3.930 s    10 runs

Summary
  .\.venv\Scripts\python.exe test_nzb.py ran
    1.55 ± 0.08 times faster than .\old\Scripts\python.exe test_nzb_old.py
```
Ravencentric · Feb 3, 2025 · d17e8b1 · d17e8b1
1 parent 0846a73
commit d17e8b1
Show file tree

Hide file tree

Showing 8 changed files with 119 additions and 145 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
   "Typing :: Typed",
 ]
-dependencies = ["natsort>=8.4.0", "pydantic>=2.9.2", "xmltodict>=0.13.0"]
+dependencies = ["msgspec>=0.19.0", "natsort>=8.4.0", "xmltodict>=0.14.2"]
 
 [project.urls]
 Homepage = "https://nzb.ravencentric.cc"
@@ -24,20 +24,16 @@ Repository = "https://github.com/Ravencentric/nzb"
 
 [dependency-groups]
 docs = [
-    "mkdocs-autorefs>=1.3.0",
-    "mkdocs-material>=9.5.50",
-    "mkdocstrings[python]>=0.27.0",
-]
-test = [
-    "coverage[toml]>=7.6.10",
-    "pytest>=8.3.4",
-    "rnzb>=0.3.1",
+  "mkdocs-autorefs>=1.3.0",
+  "mkdocs-material>=9.5.50",
+  "mkdocstrings[python]>=0.27.0",
 ]
+test = ["coverage[toml]>=7.6.10", "pytest>=8.3.4", "rnzb>=0.3.1"]
 lint = [
-    "mypy>=1.14.1",
-    "ruff>=0.9.3",
-    "types-xmltodict>=0.14.0.20241009",
-    "typing-extensions>=4.12.2",
+  "mypy>=1.14.1",
+  "ruff>=0.9.3",
+  "types-xmltodict>=0.14.0.20241009",
+  "typing-extensions>=4.12.2",
 ]
 dev = [
   { include-group = "docs" },

diff --git a/src/nzb/_core.py b/src/nzb/_core.py
@@ -5,10 +5,11 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal, overload
 
+import msgspec
 import xmltodict
 from natsort import natsorted
 
-from nzb._models import File, Meta, ParentModel
+from nzb._models import File, Meta
 from nzb._parser import parse_doctype, parse_files, parse_metadata
 from nzb._utils import construct_meta, nzb_to_dict, read_nzb_file, realpath, remove_meta_fields, sort_meta
 
@@ -20,7 +21,7 @@
     from nzb._types import StrPath
 
 
-class Nzb(ParentModel):
+class Nzb(msgspec.Struct, frozen=True, eq=True, kw_only=True, cache_hash=True, dict=True):
     """
     Represents a complete NZB file.
 
@@ -147,7 +148,7 @@ def from_json(cls, json: str, /) -> Nzb:
             Raised if the NZB is invalid.
 
         """
-        return cls.model_validate_json(json)
+        return msgspec.json.decode(json, type=cls)
 
     def to_json(self, *, pretty: bool = False) -> str:
         """
@@ -164,8 +165,12 @@ def to_json(self, *, pretty: bool = False) -> str:
             JSON string representing the NZB.
 
         """
-        indent = 2 if pretty else None
-        return self.model_dump_json(indent=indent)
+        jsonified = msgspec.json.encode(self).decode()
+
+        if pretty:
+            return msgspec.json.format(jsonified)
+
+        return jsonified
 
     @cached_property
     def size(self) -> int:

diff --git a/src/nzb/_models.py b/src/nzb/_models.py
@@ -1,20 +1,16 @@
 from __future__ import annotations
 
 import re
+from datetime import datetime
 from functools import cached_property
 from os.path import splitext
 
-from pydantic import BaseModel, ConfigDict
+from msgspec import Struct
 
-from nzb._types import UTCDateTime
 from nzb._utils import name_is_par2, name_is_rar, stem_is_obfuscated
 
 
-class ParentModel(BaseModel):
-    model_config = ConfigDict(frozen=True, str_strip_whitespace=True)
-
-
-class Meta(ParentModel):
+class Meta(Struct, frozen=True, eq=True, kw_only=True, cache_hash=True, dict=True):
     """Optional creator-definable metadata for the contents of the NZB."""
 
     title: str | None = None
@@ -30,7 +26,7 @@ class Meta(ParentModel):
     """Category."""
 
 
-class Segment(ParentModel):
+class Segment(Struct, frozen=True, eq=True, kw_only=True, cache_hash=True, dict=True):
     """One part segment of a file."""
 
     size: int
@@ -41,13 +37,13 @@ class Segment(ParentModel):
     """Message ID of the segment."""
 
 
-class File(ParentModel):
+class File(Struct, frozen=True, eq=True, kw_only=True, cache_hash=True, dict=True):
     """Represents a complete file, consisting of segments that make up a file."""
 
     poster: str
     """The poster of the file."""
 
-    posted_at: UTCDateTime
+    posted_at: datetime
     """The date and time when the file was posted, in UTC."""
 
     subject: str

diff --git a/src/nzb/_parser.py b/src/nzb/_parser.py
@@ -8,6 +8,7 @@
 import re
 from typing import Any, TypeAlias, cast
 
+import msgspec
 from natsort import natsorted
 
 from nzb._exceptions import InvalidNzbError
@@ -82,8 +83,8 @@ def parse_metadata(nzb: dict[str, Any]) -> Meta:
 
     return Meta(
         title=title,
-        passwords=passwords,  # type: ignore[arg-type]
-        tags=tags,  # type: ignore[arg-type]
+        passwords=tuple(passwords),
+        tags=tuple(tags),
         category=category,
     )
 
@@ -124,16 +125,16 @@ def parse_segments(segmentdict: dict[str, list[dict[str, str]] | dict[str, str]
 
     for segment in segments:
         try:
-            size = segment["@bytes"]
-            number = segment["@number"]
+            size = int(segment["@bytes"])
+            number = int(segment["@number"])
             message_id = segment["#text"]
-        except KeyError:
+        except (KeyError, ValueError):
             # This segment is broken
             # We do not error here because a few missing
             # segments don't invalidate the nzb.
             continue
 
-        segmentlist.append(Segment(size=size, number=number, message_id=message_id))  # type: ignore[arg-type]
+        segmentlist.append(Segment(size=size, number=number, message_id=message_id))
 
     return tuple(natsorted(segmentlist, key=lambda seg: seg.number))
 
@@ -197,15 +198,21 @@ def parse_files(nzb: dict[str, Any]) -> tuple[File, ...]:
         else:
             grouplist.extend(groups)
 
-        filelist.append(
-            File(
-                poster=file.get("@poster"),
-                posted_at=file.get("@date"),
-                subject=file.get("@subject"),
-                groups=natsorted(grouplist),  # type: ignore[arg-type]
-                segments=parse_segments(file.get("segments")),
+        try:
+            _file = msgspec.convert(
+                {
+                    "poster": file.get("@poster"),
+                    "posted_at": file.get("@date"),
+                    "subject": file.get("@subject"),
+                    "groups": natsorted(grouplist),
+                    "segments": parse_segments(file.get("segments")),
+                },
+                type=File,
+                strict=False,
             )
-        )
+        except msgspec.ValidationError as e:
+            raise InvalidNzbError(str(e)) from None
+        filelist.append(_file)
 
     if not filelist:  # pragma: no cover
         # I cannot think of any case where this will ever be raised

diff --git a/src/nzb/_types.py b/src/nzb/_types.py
@@ -1,13 +1,7 @@
 from __future__ import annotations
 
-from datetime import datetime, timezone
 from os import PathLike
-from typing import Annotated, TypeAlias
-
-from pydantic import AfterValidator
+from typing import TypeAlias
 
 StrPath: TypeAlias = str | PathLike[str]
 """String or pathlib.Path"""
-
-UTCDateTime = Annotated[datetime, AfterValidator(lambda dt: dt.astimezone(timezone.utc))]
-"""datetime that's always in UTC."""
diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import textwrap
 from pathlib import Path
 
 import pytest
@@ -105,3 +106,30 @@ def test_read_nzb_file(tmp_path: Path) -> None:
 
     with pytest.raises(InvalidNzbError, match="^Failed to read NZB file"):
         read_nzb_file(tmp_file)
+
+
+def test_nzb_with_missing_file_attributes() -> None:
+    nzb = textwrap.dedent("""
+    <?xml version="1.0" encoding="iso-8859-1" ?>
+    <!DOCTYPE nzb PUBLIC "-//newzBin//DTD NZB 1.1//EN" "http://www.newzbin.com/DTD/nzb/nzb-1.1.dtd">
+    <nzb xmlns="http://www.newzbin.com/DTD/2003/nzb">
+        <head>
+            <meta type="title">Your File!</meta>
+            <meta type="password">secret</meta>
+            <meta type="tag">HD</meta>
+            <meta type="category">TV</meta>
+        </head>
+    <file poster="Joe Bloggs &lt;bloggs@nowhere.example&gt;" date="not a date" subject="Here's your file!  abc-mr2a.r01 (1/2)">
+            <groups>
+                <group>alt.binaries.newzbin</group>
+                <group>alt.binaries.mojo</group>
+            </groups>
+            <segments>
+            <segment bytes="102394" number="1">123456789abcdef@news.newzbin.com</segment>
+            <segment bytes="4501" number="2">987654321fedbca@news.newzbin.com</segment>
+            </segments>
+        </file>
+    </nzb>
+    """).strip()
+    with pytest.raises(InvalidNzbError, match=r"Invalid RFC3339 encoded datetime - at `\$\.posted_at`"):
+        Nzb.from_str(nzb)
diff --git a/tests/test_nzbparser.py b/tests/test_nzbparser.py
@@ -242,3 +242,5 @@ def test_json_roundtrip(nzb_file: str) -> None:
 
     assert original == deserialized
     assert original_rnzb == deserialized_rnzb
+
+    assert deserialized.to_json(pretty=True) == deserialized_rnzb.to_json(pretty=True)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -242,3 +242,5 @@ def test_json_roundtrip(nzb_file: str) -> None:

		assert original == deserialized
		assert original_rnzb == deserialized_rnzb

		assert deserialized.to_json(pretty=True) == deserialized_rnzb.to_json(pretty=True)