Skip to content

Commit

Permalink
Merge pull request #1389 from Sage-Bionetworks/develop-fix-cache
Browse files Browse the repository at this point in the history
[bug fix] Fix check_synapse_cache_size function; allow file size to be float
  • Loading branch information
linglp authored Apr 5, 2024
2 parents 41e9fb9 + ad61a5c commit f91c4ed
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 53 deletions.
7 changes: 2 additions & 5 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,9 @@
from schematic.utils.general import (
entity_type_mapping,
get_dir_size,
convert_gb_to_bytes,
create_temp_folder,
check_synapse_cache_size,
clear_synapse_cache,
profile,
calculate_datetime,
)

from schematic.utils.schema_utils import get_class_label_from_display_name
Expand Down Expand Up @@ -234,8 +231,8 @@ def _purge_synapse_cache(self, maximum_storage_allowed_cache_gb=1):
# try clearing the cache
# scan a directory and check size of files
if os.path.exists(self.root_synapse_cache):
maximum_storage_allowed_cache_bytes = convert_gb_to_bytes(
maximum_storage_allowed_cache_gb
maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
1024**3
)
nbytes = get_dir_size(self.root_synapse_cache)
dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
Expand Down
17 changes: 4 additions & 13 deletions schematic/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,14 @@ def calculate_datetime(

def check_synapse_cache_size(
directory: str = "/root/.synapseCache",
) -> Union[float, int]:
) -> float:
"""use du --sh command to calculate size of .synapseCache.
Args:
directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache'
Returns:
float or integer: returns size of .synapsecache directory in bytes
float: returns size of .synapsecache directory in bytes
"""
# Note: this command might fail for Windows users.
# Since it is primarily meant to run on AWS, that is acceptable.
Expand All @@ -154,8 +154,8 @@ def check_synapse_cache_size(
size_in_mb = float(size.rstrip("M"))
byte_size = size_in_mb * 1000000
elif "G" in size:
size_in_gb = int(size.rstrip("G"))
byte_size = convert_gb_to_bytes(size_in_gb)
size_in_gb = float(size.rstrip("G"))
byte_size = size_in_gb * (1024**3)
elif "B" in size:
byte_size = float(size.rstrip("B"))
else:
Expand All @@ -180,15 +180,6 @@ def clear_synapse_cache(synapse_cache: cache.Cache, minutes: int) -> int:
return num_of_deleted_files


def convert_gb_to_bytes(g_bytes: Union[int, float]) -> Union[int, float]:
    """Convert a size in gigabytes to bytes, using 1 GB = 1024**3 bytes.

    Args:
        g_bytes: size in GB (binary, i.e. multiples of 1024**3 bytes);
            may be fractional, e.g. 1.1

    Returns:
        The equivalent number of bytes. An int input yields an int,
        a float input yields a float.
    """
    # Multiplying by a power of two is exact for floats, so fractional
    # sizes convert without any extra rounding error.
    return g_bytes * 1024**3


def entity_type_mapping(syn: Synapse, entity_id: str) -> str:
"""Return the entity type of manifest
Expand Down
110 changes: 75 additions & 35 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import time
from datetime import datetime
from unittest import mock
from pathlib import Path
from typing import Union, Generator
from _pytest.fixtures import FixtureRequest

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -147,8 +150,7 @@
"duplicated_component": {
"validation_rules": ["#Patient unique^^#Patient int"],
"parsed_rules": "raises_exception",
}

},
}

TEST_DN_DICT = {
Expand All @@ -159,22 +161,55 @@
"bio_things": {"class": "BioThings", "property": "bioThings"},
}

# Parameters for test_check_synapse_cache_size. Each tuple is:
#   (number of bytes written to the mock csv by create_temp_query_file,
#    expected check_synapse_cache_size result when run locally,
#    expected check_synapse_cache_size result when run in GitHub Actions)
test_disk_storage = [
    (2, 4000, 16000),
    (1000, 4000, 16000),
    (2000000, 1900000, 2000000),
    (1073741825, 1073741824, 1181116006.4),
]


# fixture that builds a mock synapse cache containing one file of the requested size
@pytest.fixture()
def create_temp_query_file(
    tmp_path: Path, request: FixtureRequest
) -> Generator[tuple[Path, Path, Path], None, None]:
    """Build a mock synapse cache holding a single csv of ``request.param`` bytes.

    The following layout is created under ``tmp_path``::

        .synapseCache/123/456/mock_synapse_table_query.csv

    Args:
        tmp_path (Path): temporary directory to build the mock cache in
        request (FixtureRequest): fixture request; ``request.param`` is the
            number of null bytes written to the csv
    Yields:
        Generator[tuple[Path, Path, Path]]: path of the mock synapse cache
        directory, the mock table query folder and the csv
    """
    cache_root = tmp_path / ".synapseCache/"
    sub_dir = cache_root / "123"
    query_dir = sub_dir / "456"

    # create the nested folders from the outermost to the innermost
    for folder in (cache_root, sub_dir, query_dir):
        folder.mkdir()

    # fill the mock table query csv with the requested number of null bytes
    csv_file = query_dir / "mock_synapse_table_query.csv"
    csv_file.write_bytes(b"\0" * request.param)

    yield cache_root, query_dir, csv_file


class TestGeneral:
def test_clear_synapse_cache(self, tmp_path):
@pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True)
def test_clear_synapse_cache(self, create_temp_query_file) -> None:
# define location of mock synapse cache
mock_synapse_cache_dir = tmp_path / ".synapseCache/"
mock_synapse_cache_dir.mkdir()
mock_sub_folder = mock_synapse_cache_dir / "123"
mock_sub_folder.mkdir()
mock_table_query_folder = mock_sub_folder / "456"
mock_table_query_folder.mkdir()

# create mock table query csv and a mock cache map
mock_synapse_table_query_csv = (
mock_table_query_folder / "mock_synapse_table_query.csv"
)
mock_synapse_table_query_csv.write_text("mock table query content")
(
mock_synapse_cache_dir,
mock_table_query_folder,
mock_synapse_table_query_csv,
) = create_temp_query_file
# create a mock cache map
mock_cache_map = mock_table_query_folder / ".cacheMap"
mock_cache_map.write_text(
f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'"
Expand Down Expand Up @@ -222,22 +257,25 @@ def test_calculate_datetime_raise_error(self):

# this test might fail on Windows machines
@pytest.mark.not_windows
def test_check_synapse_cache_size(self, tmp_path):
mock_synapse_cache_dir = tmp_path / ".synapseCache"
mock_synapse_cache_dir.mkdir()

mock_synapse_table_query_csv = (
mock_synapse_cache_dir / "mock_synapse_table_query.csv"
)
mock_synapse_table_query_csv.write_text("example file for calculating cache")

file_size = check_synapse_cache_size(mock_synapse_cache_dir)
@pytest.mark.parametrize(
"create_temp_query_file,local_disk_size,gh_disk_size",
test_disk_storage,
indirect=["create_temp_query_file"],
)
def test_check_synapse_cache_size(
self,
create_temp_query_file,
local_disk_size: int,
gh_disk_size: Union[int, float],
) -> None:
mock_synapse_cache_dir, _, _ = create_temp_query_file
disk_size = check_synapse_cache_size(mock_synapse_cache_dir)

# For some reason, the reported size differs when running in GitHub Actions.
if IN_GITHUB_ACTIONS:
assert file_size == 8000
assert disk_size == gh_disk_size
else:
assert file_size == 4000
assert disk_size == local_disk_size

def test_find_duplicates(self):
mock_list = ["foo", "bar", "foo"]
Expand Down Expand Up @@ -775,18 +813,20 @@ def test_parse_single_set_validation_rules(self, test_individual_rule_set):
@pytest.mark.parametrize(
"component_names",
[
["duplicated_component", ['Patient', 'Biospecimen', 'Patient']],
["individual_component", ['Patient', 'Biospecimen']],
["no_component", []]
["duplicated_component", ["Patient", "Biospecimen", "Patient"]],
["individual_component", ["Patient", "Biospecimen"]],
["no_component", []],
],
ids=["duplicated_component", "individual_component", "no_component"],
)
def test_check_for_duplicate_components(self, component_names):
"""Test that we are properly identifying duplicates in a list.
Exception should only be triggered when the duplicate component list is passed.
Exception should only be triggered when the duplicate component list is passed.
"""
try:
check_for_duplicate_components(component_names=component_names[1], validation_rule_string='dummy_str')
check_for_duplicate_components(
component_names=component_names[1], validation_rule_string="dummy_str"
)
except:
assert component_names[0] == "duplicated_component"

Expand All @@ -812,7 +852,7 @@ def test_parse_validation_rules(self, test_rule_name):
)
assert expected_parsed_rules == parsed_validation_rules
except:
assert test_rule_name in ["str_rule", "duplicated_component"]
assert test_rule_name in ["str_rule", "duplicated_component"]

@pytest.mark.parametrize(
"test_rule_name",
Expand All @@ -836,6 +876,7 @@ def test_extract_component_validation_rules(self, test_rule_name):
component
]
)

@pytest.mark.parametrize(
"test_dn",
list(TEST_DN_DICT.keys()),
Expand Down Expand Up @@ -950,8 +991,7 @@ def test_get_label_from_display_name(self, test_dn: str, data_model_labels: str)

class TestValidateUtils:
def test_validate_schema(self, helpers):
"""
"""
""" """

# Get data model path
data_model_path = helpers.get_data_path("example.model.jsonld")
Expand Down

0 comments on commit f91c4ed

Please sign in to comment.