Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a memory leak caused by schematics json schemas #50

Merged
merged 2 commits into from
Dec 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# uranium
.virtualenv
.env
bin
build
Expand Down
33 changes: 21 additions & 12 deletions transmute_core/object_serializers/schematics_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,16 @@ def _translate_to_model(self, model):
if isinstance(model, BaseType):
return model

if model in self._models:
return self._models[model]

if isinstance(model, ModelMeta):
return ModelType(model)
self._models[model] = ModelType(model)

if model not in self._models and isinstance(model, tuple):
self._models[model] = ListType(self._translate_to_model(model[0]))

if model in self._models:
return self._models[model]

return model
return self._models.get(model, model)

@staticmethod
def _to_key(model):
Expand All @@ -92,7 +92,7 @@ def _to_key(model):
return model

def load(self, model, value):
model = _enforce_instance(model)
model = _enforce_type_instance_or_model_class(model)
try:
context = get_import_context(oo=True)
model = self._translate_to_model(model)
Expand All @@ -108,7 +108,7 @@ def load(self, model, value):
raise SerializationException(str(e))

def dump(self, model, value):
model = _enforce_instance(model)
model = _enforce_type_instance_or_model_class(model)
try:
model = self._translate_to_model(model)
return model.to_primitive(value)
Expand All @@ -118,7 +118,12 @@ def dump(self, model, value):
def to_json_schema(self, model):
if isinstance(model, Serializable):
model = model.type
model = _enforce_instance(model)
# ensure that the model is an instance,
# as further processing steps required that it is.
model = _enforce_type_instance_or_model_class(model)
# once the model is an instance of something, there
# are several legacy transforms (like taking a tuple of schematics
# models) that require a second transformation to a pure schematics model.
model = self._translate_to_model(model)
return _to_json_schema(model)

Expand Down Expand Up @@ -174,11 +179,15 @@ def _dict_type_to_json_schema(dict_type):
return {"type": "object", "additionalProperties": _to_json_schema(dict_type.field)}


def _enforce_instance(model_or_class):
def _enforce_type_instance_or_model_class(model_or_class):
"""
It's a common mistake to not initialize a
schematics class. We should handle that by just
calling the default constructor.
As a broad range of schematics classes and instances are supported, this
ensures that those options are normalized to one of the following:

* an uninstantiated model
* an instantiated type

This enables the rest of the code to only have to handle those two cases.
"""
if isinstance(model_or_class, type) and issubclass(model_or_class, BaseType):
return model_or_class()
Expand Down
28 changes: 26 additions & 2 deletions transmute_core/tests/test_schematics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from schematics.types.compound import DictType, ModelType
from schematics.exceptions import ValidationError
from transmute_core.exceptions import SerializationException, NoSerializerFound
from transmute_core.object_serializers.schematics_serializer import SchematicsSerializer


class Card(Model):
Expand Down Expand Up @@ -85,8 +86,7 @@ def test_schematics_to_json_schema(object_serializer_set):
(IntType, {"type": "integer"}),
(datetime, {"type": "string", "format": "date-time"}),
(UTCDateTimeType, {"type": "string", "format": "date-time"}),
(Serializable(fget=lambda: None,
type=StringType()), {"type": "string"}),
(Serializable(fget=lambda: None, type=StringType()), {"type": "string"}),
],
)
def test_to_json_schema(object_serializer_set, inp, expected):
Expand Down Expand Up @@ -165,3 +165,27 @@ def test_can_handle(object_serializer_set, cls, should_handle):
the object serializer set should not raise an exception if it can handle these types.
"""
object_serializer_set[cls]


def test_schematics_uses_cached_entries():
"""
Regression test. Validating that SchematicsSerializer uses the
same json schema it used previously, as before erroneous
use of the instantiated model as a key was causing memory leaks.
"""
serializer = SchematicsSerializer()
# A nested schema type is required as primitives have
# hard-coded dictionaries representing the json.
class SchematicsBody(Model):
name = StringType(max_length=5)

# ensure that instances have the same key as well.
instance = ModelType(SchematicsBody)
original_payload = serializer.to_json_schema(instance)
second_payload = serializer.to_json_schema(instance)
assert original_payload is second_payload

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we expect two different instances of the same thing to be the same payload? e.g.

    instance = ModelType(SchematicsBody)
    second_instance = ModelType(SchematicsBody)
    original_payload = serializer.to_json_schema(instance)
    second_payload = serializer.to_json_schema(second_instance)
    assert original_payload is second_payload

Copy link
Owner Author

@toumorokoshi toumorokoshi Dec 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as far as transmute is concerned, it would be a weird pattern to wrap your type in a ModelType in a non-nested scenario. But could definitely see people doing that.

This won't cause a memory leak as ModelTypes are only created dynamically once when the raw type is passed through, and cached from that point on. Assuming the user isn't dynamically creating modeltypes, that cache won't grow unbounded.

I think it's a good optimization, but if I was to clean this up properly I'd probably add more strict validation to reject weird edge cases like this. If you'd like to follow up with that optimization in a PR I encourage it, but I'm personally not sure the extra complexity is worth a few fewer cache entries.


# classes are also valid output to recieve that the json schema
original_payload = serializer.to_json_schema(SchematicsBody)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These SchematicsBody classes do not get instantiated within the to_json_schema method because it fails the issubclass(model_or_class, BaseType) check in _enforce_instance.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SchematicsBody classes should not get instantiated. you could argue _enforce_instance is a misnomer here, as schematics has an incongruent apis between "types" and "models":

  • types need to be an instance to call validate
  • models need to be a class to instantiate and call validate

that incongruency leads to a lot of the code that exists in this file, dealing with specific edge cases around the various classes and their similar, but slightly mismatched, APIs.

to clarify, I could rename this method to "_enforce_instance_type_or_model". How does that sound?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, got it, thanks for clarifying. That change sounds great!

second_payload = serializer.to_json_schema(SchematicsBody)
assert original_payload is second_payload
5 changes: 2 additions & 3 deletions uranium
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ ROOT = os.path.dirname(__file__)
URANIUM_GITHUB_ACCOUNT = os.getenv("URANIUM_GITHUB_ACCOUNT", "toumorokoshi")
URANIUM_GITHUB_BRANCH = os.getenv("URANIUM_GITHUB_BRANCH", "master")
URANIUM_STANDALONE_URL = "https://raw.githubusercontent.com/{account}/uranium/{branch}/uranium/scripts/uranium_standalone".format(
account=URANIUM_GITHUB_ACCOUNT, branch=URANIUM_GITHUB_BRANCH
)
CACHE_DIRECTORY = os.path.join(ROOT, ".env", ".uranium")
account=URANIUM_GITHUB_ACCOUNT, branch=URANIUM_GITHUB_BRANCH)
CACHE_DIRECTORY = os.path.join(ROOT, ".virtualenv", ".uranium")
CACHED_URANIUM_STANDALONE = os.path.join(CACHE_DIRECTORY, "uranium_standalone")
# set uranium to use to 2.0a0
os.environ["URANIUM_VERSION"] = "2.0a1"
Expand Down