From 1d48c368f3a7bc811ccf234651838c3b342156c2 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 16 Aug 2024 09:51:13 -0700 Subject: [PATCH 01/38] update existing file paths --- schematic/store/synapse.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index db3ab10ad..fd253196d 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -788,12 +788,20 @@ def fill_in_entity_id_filename( # the columns Filename and entityId are assumed to be present in manifest schema # TODO: use idiomatic panda syntax if dataset_files: + all_files = self._get_file_entityIds( + dataset_files=dataset_files, only_new_files=False, manifest=manifest + ) new_files = self._get_file_entityIds( dataset_files=dataset_files, only_new_files=True, manifest=manifest ) - # update manifest so that it contains new dataset files + all_files = pd.DataFrame(all_files) new_files = pd.DataFrame(new_files) + existing_files = manifest["entityId"].isin(all_files["entityId"]) + manifest.loc[existing_files, "Filename"] = all_files["Filename"] + + # update manifest so that it contains new dataset files + manifest = ( pd.concat([manifest, new_files], sort=False) .reset_index() From e2d92cc075bf99638001da851c0e358a45a90522 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 16 Aug 2024 09:56:54 -0700 Subject: [PATCH 02/38] update logic for when no manfiest is passed in --- schematic/store/synapse.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index fd253196d..cf5e561da 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -797,8 +797,10 @@ def fill_in_entity_id_filename( all_files = pd.DataFrame(all_files) new_files = pd.DataFrame(new_files) - existing_files = manifest["entityId"].isin(all_files["entityId"]) - manifest.loc[existing_files, "Filename"] = all_files["Filename"] + + if not manifest.empty: + existing_files = manifest["entityId"].isin(all_files["entityId"]) + manifest.loc[existing_files, "Filename"] = all_files["Filename"] # update manifest so that it contains new dataset files From dc7fac4b5c85d798167166be2bda75a50b629403 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 16 Aug 2024 11:15:38 -0700 Subject: [PATCH 03/38] only update paths when there is a mismatch --- schematic/store/synapse.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index cf5e561da..e28b83e2c 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -799,8 +799,20 @@ def fill_in_entity_id_filename( new_files = pd.DataFrame(new_files) if not manifest.empty: - existing_files = manifest["entityId"].isin(all_files["entityId"]) - manifest.loc[existing_files, "Filename"] = all_files["Filename"] + synpase_files_in_manifest = all_files["entityId"].isin( + manifest["entityId"] + ) + file_paths_match = ( + manifest["Filename"] + == all_files.loc[synpase_files_in_manifest, "Filename"] + ).all() + if not file_paths_match: + manifest_files_in_synapse = manifest["entityId"].isin( + all_files["entityId"] + ) + manifest.loc[manifest_files_in_synapse, "Filename"] = all_files.loc[ + synpase_files_in_manifest, "Filename" + ] # update manifest so that it contains new dataset files From 2ecb5d6e40174c51fd12bc0af3dca03daeb2a425 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 16 Aug 2024 14:20:39 -0700 Subject: [PATCH 04/38] update test --- tests/test_store.py | 56 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index b89735e2e..3602beec4 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -432,27 +432,59 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): assert manifest_data == "syn51204513" @pytest.mark.parametrize( - "existing_manifest_df", + "existing_manifest_df,fill_in", [ - pd.DataFrame(), - pd.DataFrame( - { - "Filename": ["existing_mock_file_path"], - "entityId": ["existing_mock_entity_id"], - } + ( + pd.DataFrame(), + [ + { + "Filename": ["mock_file_path"], + "entityId": ["mock_entity_id"], + }, + { + "Filename": ["mock_file_path"], + "entityId": ["mock_entity_id"], + }, + ], + ), + ( + pd.DataFrame( + { + "Filename": ["existing_mock_file_path"], + "entityId": ["existing_mock_entity_id"], + } + ), + [ + { + "Filename": ["existing_mock_file_path", "mock_file_path"], + "entityId": ["existing_mock_entity_id", "mock_entity_id"], + }, + { + "Filename": ["mock_file_path"], + "entityId": ["mock_entity_id"], + }, + ], ), ], ) - def test_fill_in_entity_id_filename(self, synapse_store, existing_manifest_df): + def test_fill_in_entity_id_filename( + self, synapse_store, existing_manifest_df, fill_in + ): + fill_in_first_return_value = { + "Filename": ["mock_file_path"], + "entityId": ["mock_entity_id"], + } + fill_in_first_return_value = { + "Filename": ["existing_mock_file_path", "mock_file_path"], + "entityId": ["existing_mock_entity_id", "mock_entity_id"], + } + with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", return_value=["syn123", "syn124", "syn125"], ) as mock_get_file_storage, patch( "schematic.store.synapse.SynapseStorage._get_file_entityIds", - return_value={ - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], - }, + side_effect=fill_in, ) as mock_get_file_entity_id: dataset_files, new_manifest = synapse_store.fill_in_entity_id_filename( datasetId="test_syn_id", manifest=existing_manifest_df From 0bd91bf6ae4c6c8bc990b41505b17f73244732aa Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Fri, 16 Aug 2024 14:21:42 -0700 Subject: [PATCH 05/38] clean and update name --- tests/test_store.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 3602beec4..96e1654d1 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -432,7 +432,7 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): assert manifest_data == "syn51204513" @pytest.mark.parametrize( - "existing_manifest_df,fill_in", + "existing_manifest_df,fill_in_return_value", [ ( pd.DataFrame(), @@ -468,23 +468,14 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): ], ) def test_fill_in_entity_id_filename( - self, synapse_store, existing_manifest_df, fill_in + self, synapse_store, existing_manifest_df, fill_in_return_value ): - fill_in_first_return_value = { - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], - } - fill_in_first_return_value = { - "Filename": ["existing_mock_file_path", "mock_file_path"], - "entityId": ["existing_mock_entity_id", "mock_entity_id"], - } - with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", return_value=["syn123", "syn124", "syn125"], ) as mock_get_file_storage, patch( "schematic.store.synapse.SynapseStorage._get_file_entityIds", - side_effect=fill_in, + side_effect=fill_in_return_value, ) as mock_get_file_entity_id: dataset_files, new_manifest = synapse_store.fill_in_entity_id_filename( datasetId="test_syn_id", manifest=existing_manifest_df From 3d5941c27f1a07112c2d762b308bf9e18f3a7c33 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 20 Aug 2024 11:49:16 -0700 Subject: [PATCH 06/38] add test for submission --- tests/test_metadata.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index bf0c4d97b..8065f793a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -2,8 +2,9 @@ import logging import os -from typing import Optional, Generator +from contextlib import contextmanager from pathlib import Path +from typing import Generator, Optional from unittest.mock import patch import pytest @@ -15,6 +16,11 @@ logger = logging.getLogger(__name__) +@contextmanager +def does_not_raise(): + yield + + def metadata_model(helpers, data_model_labels): metadata_model = MetadataModel( inputMModelLocation=helpers.get_data_path("example.model.jsonld"), @@ -143,3 +149,20 @@ def test_submit_metadata_manifest( hide_blanks=hide_blanks, ) assert mock_manifest_id == "mock manifest id" + + def test_submit_filebased_manifest(self, helpers): + meta_data_model = metadata_model(helpers, "class_label") + + manifest_path = helpers.get_data_path( + "mock_manifests/filepath_submission_test_manifest.csv" + ) + + with does_not_raise(): + manifest_id = meta_data_model.submit_metadata_manifest( + manifest_path=manifest_path, + dataset_id="syn62276880", + manifest_record_type="file_only", + restrict_rules=False, + file_annotations_upload=True, + ) + assert manifest_id == "syn62280543" From 53ab10c5418fa9a8bb3ddb0fbc138792430bf96c Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 20 Aug 2024 12:58:30 -0700 Subject: [PATCH 07/38] add test manifest --- .../data/mock_manifests/filepath_submission_test_manifest.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/data/mock_manifests/filepath_submission_test_manifest.csv diff --git a/tests/data/mock_manifests/filepath_submission_test_manifest.csv b/tests/data/mock_manifests/filepath_submission_test_manifest.csv new file mode 100644 index 000000000..9def5eccc --- /dev/null +++ b/tests/data/mock_manifests/filepath_submission_test_manifest.csv @@ -0,0 +1,3 @@ +Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId +schematic - main/Test Filename Upload/txt1.txt,,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954 +schematic - main/Test Filename Upload/txt2.txt,,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956 From fa9b263446ea1f71c6ef5ad1277a9487ee848be4 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 20 Aug 2024 15:49:31 -0700 Subject: [PATCH 08/38] redo matching logic --- schematic/store/synapse.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index e28b83e2c..243f8e931 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -798,30 +798,32 @@ def fill_in_entity_id_filename( all_files = pd.DataFrame(all_files) new_files = pd.DataFrame(new_files) - if not manifest.empty: - synpase_files_in_manifest = all_files["entityId"].isin( - manifest["entityId"] - ) - file_paths_match = ( - manifest["Filename"] - == all_files.loc[synpase_files_in_manifest, "Filename"] - ).all() - if not file_paths_match: - manifest_files_in_synapse = manifest["entityId"].isin( - all_files["entityId"] - ) - manifest.loc[manifest_files_in_synapse, "Filename"] = all_files.loc[ - synpase_files_in_manifest, "Filename" - ] - # update manifest so that it contains new dataset files - manifest = ( pd.concat([manifest, new_files], sort=False) .reset_index() .drop("index", axis=1) ) + manifest_reindex = manifest.set_index("entityId") + all_files_reindex = all_files.set_index("entityId") + all_files_reindex_like_manifest = all_files_reindex.reindex_like( + manifest_reindex + ) + + file_paths_match = ( + manifest_reindex["Filename"] + == all_files_reindex_like_manifest["Filename"] + ) + + if not file_paths_match.all(): + manifest_reindex.loc[ + ~file_paths_match, "Filename" + ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] + manifest = manifest_reindex.reset_index() + entityIdCol = manifest.pop("entityId") + manifest.insert(len(manifest.columns), "entityId", entityIdCol) + manifest = manifest.fillna("") return dataset_files, manifest From 48a48fd297609640999b41e576560b0585ed9627 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 20 Aug 2024 16:22:21 -0700 Subject: [PATCH 09/38] update test assertion --- tests/test_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 5829dcde1..3ce9e58a0 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -798,4 +798,4 @@ def test_get_manifest_with_files(self, helpers, component, datasetId): assert n_rows == 4 elif component == "BulkRNA-seqAssay": assert filename_in_manifest_columns - assert n_rows == 3 + assert n_rows == 4 From abf127711bf3efda8cb46008435dfdc1a518ca81 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 21 Aug 2024 09:21:02 -0700 Subject: [PATCH 10/38] update tests for filepaths as well --- tests/test_manifest.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 3ce9e58a0..b4daaaf76 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -769,7 +769,9 @@ def test_create_manifests( ) def test_get_manifest_with_files(self, helpers, component, datasetId): """ - Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well + Test to ensure that + when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well + when generating a file based manifest from a dataset thathas had files added that the files are added correctly """ path_to_data_model = helpers.get_data_path("example.model.jsonld") @@ -799,3 +801,14 @@ def test_get_manifest_with_files(self, helpers, component, datasetId): elif component == "BulkRNA-seqAssay": assert filename_in_manifest_columns assert n_rows == 4 + + expected_files = pd.Series( + [ + "schematic - main/BulkRNASeq and files/txt1.txt", + "schematic - main/BulkRNASeq and files/txt2.txt", + "schematic - main/BulkRNASeq and files/txt4.txt", + "schematic - main/BulkRNASeq and files/txt3.txt", + ], + name="Filename", + ) + assert expected_files.equals(manifest["Filename"]) From eb66a379f68720808a7f9561ff71ac5549090307 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 21 Aug 2024 09:53:06 -0700 Subject: [PATCH 11/38] add exceptions and messages --- schematic/store/synapse.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 243f8e931..0b69b14df 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -291,8 +291,19 @@ def query_fileview( self.storageFileviewTable = self.syn.tableQuery( query=self.fileview_query, ).asDataFrame() - except SynapseHTTPError: - raise AccessCredentialsError(self.storageFileview) + except SynapseHTTPError as exc: + exception_text = str(exc) + if "Unknown column path" in exception_text: + raise ValueError( + "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation (https://sagebionetworks.jira.com/wiki/spaces/SCHEM/pages/2645262364/Data+Model+Validation+Rules#Filename-Validation)." + ) from exc + elif "Unknown column" in exception_text: + missing_columns = exception_text.split("Unknown column ")[-1] + raise ValueError( + f"The column(s) ({missing_columns}) specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." + ) from exc + else: + raise AccessCredentialsError(self.storageFileview) def _build_query( self, columns: Optional[list] = None, where_clauses: Optional[list] = None From e0d6b8b8093ccc8781a49ca62c186713c619c399 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 21 Aug 2024 13:45:59 -0700 Subject: [PATCH 12/38] add tests for exceptions raised --- tests/test_store.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/tests/test_store.py b/tests/test_store.py index 96e1654d1..f91234b27 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -7,6 +7,7 @@ import math import os import shutil +from contextlib import contextmanager from time import sleep from typing import Any, Generator from unittest.mock import AsyncMock, patch @@ -19,7 +20,7 @@ from synapseclient.entity import File from synapseclient.models import Annotations -from schematic.configuration.configuration import Configuration +from schematic.configuration.configuration import CONFIG, Configuration from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.base import BaseStorage @@ -31,6 +32,11 @@ logger = logging.getLogger(__name__) +@contextmanager +def does_not_raise(): + yield + + @pytest.fixture def test_download_manifest_id(): yield "syn51203973" @@ -214,6 +220,34 @@ def test_view_query( # tests that the query was valid and successful, that a view subset has actually been retrived assert synapse_store_special_scope.storageFileviewTable.empty is False + @pytest.mark.parametrize( + "asset_view,columns,expectation", + [ + ("syn62339865", ["path"], pytest.raises(ValueError)), + ("syn62340177", ["id"], pytest.raises(ValueError)), + ("syn23643253", ["id", "path"], does_not_raise()), + ], + ) + def test_view_query_exception( + self, + synapse_store_special_scope: SynapseStorage, + asset_view: str, + columns: list, + expectation: str, + ) -> None: + project_scope = ["syn23643250"] + + # set to use appropriate test view and set scope + CONFIG.synapse_master_fileview_id = asset_view + synapse_store_special_scope.project_scope = project_scope + + # ensure approriate exception is raised + with expectation: + synapse_store_special_scope.query_fileview(columns) + + # reset config to default fileview + CONFIG.synapse_master_fileview_id = "syn23643253" + def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { "author": "bruno, milen, sujay", From 1c5621f4a31dbb7bc3feda4c5dad30184435f9db Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 21 Aug 2024 14:50:59 -0700 Subject: [PATCH 13/38] update test --- tests/test_api.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 97183186f..4798faf2c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -12,13 +12,11 @@ import pytest from schematic.configuration.configuration import Configuration -from schematic.schemas.data_model_parser import DataModelParser from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_parser import DataModelParser from schematic.schemas.data_model_relationships import DataModelRelationships - from schematic_api.api import create_app - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -763,9 +761,9 @@ def test_generate_manifest_file_based_annotations( # make sure Filename, entityId, and component get filled with correct value assert google_sheet_df["Filename"].to_list() == [ - "TestDataset-Annotations-v3/Sample_A.txt", - "TestDataset-Annotations-v3/Sample_B.txt", - "TestDataset-Annotations-v3/Sample_C.txt", + "schematic - main/TestDataset-Annotations-v3/Sample_A.txt", + "schematic - main/TestDataset-Annotations-v3/Sample_B.txt", + "schematic - main/TestDataset-Annotations-v3/Sample_C.txt", ] assert google_sheet_df["entityId"].to_list() == [ "syn25614636", From 4c2edd99a536deb21d8a055a87d67b70a18fd9b0 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 22 Aug 2024 13:47:08 -0700 Subject: [PATCH 14/38] Revert "update test" This reverts commit 1c5621f4a31dbb7bc3feda4c5dad30184435f9db. --- tests/test_api.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 4798faf2c..97183186f 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -12,11 +12,13 @@ import pytest from schematic.configuration.configuration import Configuration -from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_relationships import DataModelRelationships + from schematic_api.api import create_app + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -761,9 +763,9 @@ def test_generate_manifest_file_based_annotations( # make sure Filename, entityId, and component get filled with correct value assert google_sheet_df["Filename"].to_list() == [ - "schematic - main/TestDataset-Annotations-v3/Sample_A.txt", - "schematic - main/TestDataset-Annotations-v3/Sample_B.txt", - "schematic - main/TestDataset-Annotations-v3/Sample_C.txt", + "TestDataset-Annotations-v3/Sample_A.txt", + "TestDataset-Annotations-v3/Sample_B.txt", + "TestDataset-Annotations-v3/Sample_C.txt", ] assert google_sheet_df["entityId"].to_list() == [ "syn25614636", From 1ae2d901ab066cfa44c3f3292001f75fbc680d1d Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 22 Aug 2024 13:48:32 -0700 Subject: [PATCH 15/38] update test --- tests/test_store.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index f91234b27..0a62048da 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -242,11 +242,12 @@ def test_view_query_exception( synapse_store_special_scope.project_scope = project_scope # ensure approriate exception is raised - with expectation: - synapse_store_special_scope.query_fileview(columns) - - # reset config to default fileview - CONFIG.synapse_master_fileview_id = "syn23643253" + try: + with expectation: + synapse_store_special_scope.query_fileview(columns) + finally: + # reset config to default fileview + CONFIG.synapse_master_fileview_id = "syn23643253" def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { From 0b60395304888b621a2a6e4137e63b12d48fb2bd Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 22 Aug 2024 15:51:33 -0700 Subject: [PATCH 16/38] add comments --- schematic/store/synapse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 0b69b14df..d41b427d7 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -816,21 +816,26 @@ def fill_in_entity_id_filename( .drop("index", axis=1) ) + # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata manifest_reindex = manifest.set_index("entityId") all_files_reindex = all_files.set_index("entityId") all_files_reindex_like_manifest = all_files_reindex.reindex_like( manifest_reindex ) + # Check if individual file paths in manifest and from synapse match file_paths_match = ( manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"] ) + # If all the paths do not match, update the manifest with the filepaths from synapse if not file_paths_match.all(): manifest_reindex.loc[ ~file_paths_match, "Filename" ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] + + # reformat manifest for further use manifest = manifest_reindex.reset_index() entityIdCol = manifest.pop("entityId") manifest.insert(len(manifest.columns), "entityId", entityIdCol) From 6feea81f8d5102043bbd69bb090e73c868b82945 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 26 Aug 2024 14:24:07 -0700 Subject: [PATCH 17/38] update exception message --- schematic/store/synapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index d41b427d7..e02ff40b5 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -295,7 +295,7 @@ def query_fileview( exception_text = str(exc) if "Unknown column path" in exception_text: raise ValueError( - "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation (https://sagebionetworks.jira.com/wiki/spaces/SCHEM/pages/2645262364/Data+Model+Validation+Rules#Filename-Validation)." + "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation." ) from exc elif "Unknown column" in exception_text: missing_columns = exception_text.split("Unknown column ")[-1] From 976645a26cb809a433f8c6dc41c1e966f8b584fa Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 26 Aug 2024 14:31:56 -0700 Subject: [PATCH 18/38] update type hint --- tests/test_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_store.py b/tests/test_store.py index 0a62048da..409dda5fa 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -232,7 +232,7 @@ def test_view_query_exception( self, synapse_store_special_scope: SynapseStorage, asset_view: str, - columns: list, + columns: list[str], expectation: str, ) -> None: project_scope = ["syn23643250"] From 208f22534588a68bfae24f6677371b5df74d63b2 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 26 Aug 2024 14:39:34 -0700 Subject: [PATCH 19/38] update comment --- tests/test_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_store.py b/tests/test_store.py index 409dda5fa..bb8d7ea8b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -246,7 +246,7 @@ def test_view_query_exception( with expectation: synapse_store_special_scope.query_fileview(columns) finally: - # reset config to default fileview + # reset config to default fileview so subsequent tests do not use the test fileviews utilized by this test CONFIG.synapse_master_fileview_id = "syn23643253" def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: From d0628b2be4bbbd5790a59c35d62294c9afbbbe84 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 26 Aug 2024 15:04:17 -0700 Subject: [PATCH 20/38] change structure of --- schematic/store/synapse.py | 72 ++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 0e71a7fd9..a287859f5 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -787,47 +787,49 @@ def fill_in_entity_id_filename( # note that if there is an existing manifest and there are files in the dataset # the columns Filename and entityId are assumed to be present in manifest schema # TODO: use idiomatic panda syntax - if dataset_files: - all_files = self._get_file_entityIds( - dataset_files=dataset_files, only_new_files=False, manifest=manifest - ) - new_files = self._get_file_entityIds( - dataset_files=dataset_files, only_new_files=True, manifest=manifest - ) + if not dataset_files: + manifest = manifest.fillna("") + return dataset_files, manifest - all_files = pd.DataFrame(all_files) - new_files = pd.DataFrame(new_files) + all_files = self._get_file_entityIds( + dataset_files=dataset_files, only_new_files=False, manifest=manifest + ) + new_files = self._get_file_entityIds( + dataset_files=dataset_files, only_new_files=True, manifest=manifest + ) - # update manifest so that it contains new dataset files - manifest = ( - pd.concat([manifest, new_files], sort=False) - .reset_index() - .drop("index", axis=1) - ) + all_files = pd.DataFrame(all_files) + new_files = pd.DataFrame(new_files) - # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata - manifest_reindex = manifest.set_index("entityId") - all_files_reindex = all_files.set_index("entityId") - all_files_reindex_like_manifest = all_files_reindex.reindex_like( - manifest_reindex - ) + # update manifest so that it contains new dataset files + manifest = ( + pd.concat([manifest, new_files], sort=False) + .reset_index() + .drop("index", axis=1) + ) - # Check if individual file paths in manifest and from synapse match - file_paths_match = ( - manifest_reindex["Filename"] - == all_files_reindex_like_manifest["Filename"] - ) + # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata + manifest_reindex = manifest.set_index("entityId") + all_files_reindex = all_files.set_index("entityId") + all_files_reindex_like_manifest = all_files_reindex.reindex_like( + manifest_reindex + ) + + # Check if individual file paths in manifest and from synapse match + file_paths_match = ( + manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"] + ) - # If all the paths do not match, update the manifest with the filepaths from synapse - if not file_paths_match.all(): - manifest_reindex.loc[ - ~file_paths_match, "Filename" - ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] + # If all the paths do not match, update the manifest with the filepaths from synapse + if not file_paths_match.all(): + manifest_reindex.loc[ + ~file_paths_match, "Filename" + ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] - # reformat manifest for further use - manifest = manifest_reindex.reset_index() - entityIdCol = manifest.pop("entityId") - manifest.insert(len(manifest.columns), "entityId", entityIdCol) + # reformat manifest for further use + manifest = manifest_reindex.reset_index() + entityIdCol = manifest.pop("entityId") + manifest.insert(len(manifest.columns), "entityId", entityIdCol) manifest = manifest.fillna("") return dataset_files, manifest From c2b832d314348740b4b44c2f2ee59bbc34e0b57d Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 09:58:16 -0700 Subject: [PATCH 21/38] move test to integration dir --- tests/integration/test_metadata_model.py | 42 ++++++++++++++++++++++++ tests/test_metadata.py | 22 ------------- 2 files changed, 42 insertions(+), 22 deletions(-) create mode 100644 tests/integration/test_metadata_model.py diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py new file mode 100644 index 000000000..29167c991 --- /dev/null +++ b/tests/integration/test_metadata_model.py @@ -0,0 +1,42 @@ +import logging +from contextlib import contextmanager +from unittest.mock import patch + +from schematic.models.metadata import MetadataModel + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +@contextmanager +def does_not_raise(): + yield + + +def metadata_model(helpers, data_model_labels): + metadata_model = MetadataModel( + inputMModelLocation=helpers.get_data_path("example.model.jsonld"), + data_model_labels=data_model_labels, + inputMModelLocationType="local", + ) + + return metadata_model + + +class TestMetadataModel: + def test_submit_filebased_manifest(self, helpers): + meta_data_model = metadata_model(helpers, "class_label") + + manifest_path = helpers.get_data_path( + "mock_manifests/filepath_submission_test_manifest.csv" + ) + + with does_not_raise(): + manifest_id = meta_data_model.submit_metadata_manifest( + manifest_path=manifest_path, + dataset_id="syn62276880", + manifest_record_type="file_only", + restrict_rules=False, + file_annotations_upload=True, + ) + assert manifest_id == "syn62280543" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8065f793a..b192f5387 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -16,11 +16,6 @@ logger = logging.getLogger(__name__) -@contextmanager -def does_not_raise(): - yield - - def metadata_model(helpers, data_model_labels): metadata_model = MetadataModel( inputMModelLocation=helpers.get_data_path("example.model.jsonld"), @@ -149,20 +144,3 @@ def test_submit_metadata_manifest( hide_blanks=hide_blanks, ) assert mock_manifest_id == "mock manifest id" - - def test_submit_filebased_manifest(self, helpers): - meta_data_model = metadata_model(helpers, "class_label") - - manifest_path = helpers.get_data_path( - "mock_manifests/filepath_submission_test_manifest.csv" - ) - - with does_not_raise(): - manifest_id = meta_data_model.submit_metadata_manifest( - manifest_path=manifest_path, - dataset_id="syn62276880", - manifest_record_type="file_only", - restrict_rules=False, - file_annotations_upload=True, - ) - assert manifest_id == "syn62280543" From 2dd6fdf7108c0c8b1d08ee5f7f6be5d2300fcaf2 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 10:05:50 -0700 Subject: [PATCH 22/38] move function to conftest --- tests/conftest.py | 11 +++++++++++ tests/integration/test_metadata_model.py | 11 +---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9a0b5789d..6ce21df5d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ from dotenv import load_dotenv from schematic.configuration.configuration import CONFIG +from schematic.models.metadata import MetadataModel from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_parser import DataModelParser from schematic.store.synapse import SynapseStorage @@ -149,3 +150,13 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer: """Fixture to instantiate a DataModelGraphExplorer object.""" dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") return dmge + + +def metadata_model(helpers, data_model_labels): + metadata_model = MetadataModel( + inputMModelLocation=helpers.get_data_path("example.model.jsonld"), + data_model_labels=data_model_labels, + inputMModelLocationType="local", + ) + + return metadata_model diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 29167c991..1ed8899f5 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -3,6 +3,7 @@ from unittest.mock import patch from schematic.models.metadata import MetadataModel +from tests.conftest import metadata_model logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -13,16 +14,6 @@ def does_not_raise(): yield -def metadata_model(helpers, data_model_labels): - metadata_model = MetadataModel( - inputMModelLocation=helpers.get_data_path("example.model.jsonld"), - data_model_labels=data_model_labels, - inputMModelLocationType="local", - ) - - return metadata_model - - class TestMetadataModel: def test_submit_filebased_manifest(self, helpers): meta_data_model = metadata_model(helpers, "class_label") From 096fe550ba5810ad50a08986a927bdc8193d34f4 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 11:11:38 -0700 Subject: [PATCH 23/38] update exception message --- schematic/store/synapse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 69af178fc..c9ca36c10 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -298,9 +298,9 @@ def query_fileview( "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation." ) from exc elif "Unknown column" in exception_text: - missing_columns = exception_text.split("Unknown column ")[-1] + missing_column = exception_text.split("Unknown column ")[-1] raise ValueError( - f"The column(s) ({missing_columns}) specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." + f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." ) from exc else: raise AccessCredentialsError(self.storageFileview) From 05a791eb3180128c0ca81e60907fa84ae7bc8cb7 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 11:12:14 -0700 Subject: [PATCH 24/38] update query tests --- tests/test_store.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 024581411..4f13c10da 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -208,18 +208,26 @@ def test_view_query( synapse_store_special_scope.project_scope = project_scope - synapse_store_special_scope.query_fileview(columns, where_clauses) - # tests ._build_query() - assert synapse_store_special_scope.fileview_query == expected - # tests that the query was valid and successful, that a view subset has actually been retrived - assert synapse_store_special_scope.storageFileviewTable.empty is False + with does_not_raise(): + synapse_store_special_scope.query_fileview(columns, where_clauses) + # tests ._build_query() + assert synapse_store_special_scope.fileview_query == expected + # tests that the query was valid and successful, that a view subset has actually been retrived + assert synapse_store_special_scope.storageFileviewTable.empty is False @pytest.mark.parametrize( - "asset_view,columns,expectation", + "asset_view,columns,message", [ - ("syn62339865", ["path"], pytest.raises(ValueError)), - ("syn62340177", ["id"], pytest.raises(ValueError)), - ("syn23643253", ["id", "path"], does_not_raise()), + ( + "syn62339865", + ["path"], + r"The path column has not been added to the fileview. .*", + ), + ( + "syn62340177", + ["id"], + r"The columns id specified in the query do not exist in the fileview. .*", + ), ], ) def test_view_query_exception( @@ -227,7 +235,7 @@ def test_view_query_exception( synapse_store_special_scope: SynapseStorage, asset_view: str, columns: list[str], - expectation: str, + message: str, ) -> None: project_scope = ["syn23643250"] @@ -237,7 +245,7 @@ def test_view_query_exception( # ensure approriate exception is raised try: - with expectation: + with pytest.raises(ValueError, match=message): synapse_store_special_scope.query_fileview(columns) finally: # reset config to default fileview so subsequent tests do not use the test fileviews utilized by this test From 2b84f4d7b386c3e731584863c5f02a2168d14ac8 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 11:32:59 -0700 Subject: [PATCH 25/38] update comments --- tests/test_store.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 4f13c10da..bebad9920 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -203,16 +203,19 @@ def test_view_query( where_clauses: list, expected: str, ) -> None: - # Ensure correct view is being utilized + # GIVEN a the correct fileview assert synapse_store_special_scope.storageFileview == "syn23643253" + # AND the approrpiate project scope synapse_store_special_scope.project_scope = project_scope + # WHEN the query is built and run + # THEN it should complete without raising an exception with does_not_raise(): synapse_store_special_scope.query_fileview(columns, where_clauses) - # tests ._build_query() + # AND the query string should be as expected assert synapse_store_special_scope.fileview_query == expected - # tests that the query was valid and successful, that a view subset has actually been retrived + # AND query should have recieved a non-empty table assert synapse_store_special_scope.storageFileviewTable.empty is False @pytest.mark.parametrize( @@ -239,16 +242,18 @@ def test_view_query_exception( ) -> None: project_scope = ["syn23643250"] - # set to use appropriate test view and set scope + # GIVEN the appropriate test file view CONFIG.synapse_master_fileview_id = asset_view + # AND the approrpiate project scope synapse_store_special_scope.project_scope = project_scope - # ensure approriate exception is raised + # WHEN the query is built and run try: + # THEN it should raise a ValueError with the appropriate message with pytest.raises(ValueError, match=message): synapse_store_special_scope.query_fileview(columns) finally: - # reset config to default fileview so subsequent tests do not use the test fileviews utilized by this test + # AND the fileview should be reset to the default so the other tests are not affected regardless of the outcome of the query CONFIG.synapse_master_fileview_id = "syn23643253" def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: From cb0f0fbe329f6c61ea165dbd21f0dceb00a6d660 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 12:56:06 -0700 Subject: [PATCH 26/38] update imports --- tests/test_metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b192f5387..fca1d3db5 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -2,7 +2,6 @@ import logging import os -from contextlib import contextmanager from pathlib import Path from typing import Generator, Optional from unittest.mock import patch From a916641319b0d7dc37991fcffd087609c353751b Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 12:56:43 -0700 Subject: [PATCH 27/38] update get manifest test --- tests/test_manifest.py | 67 +++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index b4daaaf76..06bd7b168 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -763,24 +763,52 @@ def test_create_manifests( assert all_results == expected_result @pytest.mark.parametrize( - "component,datasetId", - [("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")], + "component,datasetId,expected_file_based,expected_rows,expected_files", + [ + ("Biospecimen", "syn61260107", False, 4, None), + ( + "BulkRNA-seqAssay", + "syn61374924", + True, + 4, + pd.Series( + [ + "schematic - main/BulkRNASeq and files/txt1.txt", + "schematic - main/BulkRNASeq and files/txt2.txt", + "schematic - main/BulkRNASeq and files/txt4.txt", + "schematic - main/BulkRNASeq and files/txt3.txt", + ], + name="Filename", + ), + ), + ], ids=["Record based", "File based"], ) - def test_get_manifest_with_files(self, helpers, component, datasetId): + def test_get_manifest_with_files( + self, + helpers, + component, + datasetId, + expected_file_based, + expected_rows, + expected_files, + ): """ Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well when generating a file based manifest from a dataset thathas had files added that the files are added correctly """ + # GIVEN the example data model path_to_data_model = helpers.get_data_path("example.model.jsonld") + # AND a graph data model graph_data_model = generate_graph_data_model( helpers, path_to_data_model=path_to_data_model, data_model_labels="class_label", ) + # AND a manifest generator generator = ManifestGenerator( path_to_data_model=path_to_data_model, graph=graph_data_model, @@ -788,27 +816,24 @@ def test_get_manifest_with_files(self, helpers, component, datasetId): use_annotations=True, ) + # WHEN a manifest is generated for the appropriate dataset as a dataframe manifest = generator.get_manifest( dataset_id=datasetId, output_format="dataframe" ) - filename_in_manifest_columns = "Filename" in manifest.columns + # AND it is determined if the manifest is filebased + is_file_based = "Filename" in manifest.columns + + # AND the number of rows are checked n_rows = manifest.shape[0] - if component == "Biospecimen": - assert not filename_in_manifest_columns - assert n_rows == 4 - elif component == "BulkRNA-seqAssay": - assert filename_in_manifest_columns - assert n_rows == 4 - - expected_files = pd.Series( - [ - "schematic - main/BulkRNASeq and files/txt1.txt", - "schematic - main/BulkRNASeq and files/txt2.txt", - "schematic - main/BulkRNASeq and files/txt4.txt", - "schematic - main/BulkRNASeq and files/txt3.txt", - ], - name="Filename", - ) - assert expected_files.equals(manifest["Filename"]) + # THEN the manifest should have the expected number of rows + assert n_rows == expected_rows + + # AND the manifest should be filebased or not as expected + assert is_file_based == expected_file_based + + # AND if the manifest is file based + if expected_file_based: + # THEN the manifest should have the expected files + assert manifest["Filename"].equals(expected_files) From e3107a535d1284398442cdf4faa295c3d981b910 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 13:02:17 -0700 Subject: [PATCH 28/38] cover other cases in query test --- tests/test_store.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_store.py b/tests/test_store.py index bebad9920..a06d8e95f 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -187,6 +187,18 @@ def test_login(self) -> None: None, "SELECT name,id,path FROM syn23643253 WHERE projectId IN ('syn23643250', '') ;", ), + ( + None, + ["name", "id", "path"], + ["parentId='syn61682648'", "type='file'"], + "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' ;", + ), + ( + ["syn23643250"], + None, + ["parentId='syn61682648'", "type='file'"], + "SELECT * FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;", + ), ( ["syn23643250"], ["name", "id", "path"], From 57ec340f0b6df364aba23fcee757a989121150ba Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 13:10:56 -0700 Subject: [PATCH 29/38] update query test --- tests/test_store.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index a06d8e95f..3517439af 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -122,7 +122,7 @@ def dmge( yield dmge -@pytest.fixture +@pytest.fixture(scope="module") def synapse_store_special_scope(): yield SynapseStorage(perform_query=False) @@ -166,44 +166,57 @@ def test_login(self) -> None: shutil.rmtree("test_cache_dir") @pytest.mark.parametrize( - "project_scope,columns,where_clauses,expected", + "project_scope,columns,where_clauses,expected,expected_new_query", [ - (None, None, None, "SELECT * FROM syn23643253 ;"), + (None, None, None, "SELECT * FROM syn23643253 ;", True), ( ["syn23643250"], None, None, "SELECT * FROM syn23643253 WHERE projectId IN ('syn23643250', '') ;", + True, ), ( None, None, ["projectId IN ('syn23643250')"], "SELECT * FROM syn23643253 WHERE projectId IN ('syn23643250') ;", + True, ), ( ["syn23643250"], ["name", "id", "path"], None, "SELECT name,id,path FROM syn23643253 WHERE projectId IN ('syn23643250', '') ;", + True, ), ( None, ["name", "id", "path"], ["parentId='syn61682648'", "type='file'"], "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' ;", + True, ), ( ["syn23643250"], None, ["parentId='syn61682648'", "type='file'"], "SELECT * FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;", + True, + ), + ( + ["syn23643250"], + ["name", "id", "path"], + ["parentId='syn61682648'", "type='file'"], + "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;", + True, ), ( ["syn23643250"], ["name", "id", "path"], ["parentId='syn61682648'", "type='file'"], "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;", + False, ), ], ) @@ -214,6 +227,7 @@ def test_view_query( columns: list, where_clauses: list, expected: str, + expected_new_query: bool, ) -> None: # GIVEN a the correct fileview assert synapse_store_special_scope.storageFileview == "syn23643253" @@ -229,6 +243,8 @@ def test_view_query( assert synapse_store_special_scope.fileview_query == expected # AND query should have recieved a non-empty table assert synapse_store_special_scope.storageFileviewTable.empty is False + # AND the query should be new if expected + assert synapse_store_special_scope.new_query_different == expected_new_query @pytest.mark.parametrize( "asset_view,columns,message", From bb043f6a2cfd724a847d05db5674c842fbc71fc0 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 13:14:48 -0700 Subject: [PATCH 30/38] change structure of test --- tests/test_store.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 3517439af..38db2d74a 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -505,7 +505,7 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): assert manifest_data == "syn51204513" @pytest.mark.parametrize( - "existing_manifest_df,fill_in_return_value", + "existing_manifest_df,fill_in_return_value,expected_df", [ ( pd.DataFrame(), @@ -519,6 +519,9 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): "entityId": ["mock_entity_id"], }, ], + pd.DataFrame( + {"Filename": ["mock_file_path"], "entityId": ["mock_entity_id"]} + ), ), ( pd.DataFrame( @@ -537,11 +540,17 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): "entityId": ["mock_entity_id"], }, ], + pd.DataFrame( + { + "Filename": ["existing_mock_file_path", "mock_file_path"], + "entityId": ["existing_mock_entity_id", "mock_entity_id"], + } + ), ), ], ) def test_fill_in_entity_id_filename( - self, synapse_store, existing_manifest_df, fill_in_return_value + self, synapse_store, existing_manifest_df, fill_in_return_value, expected_df ): with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", @@ -553,17 +562,7 @@ def test_fill_in_entity_id_filename( dataset_files, new_manifest = synapse_store.fill_in_entity_id_filename( datasetId="test_syn_id", manifest=existing_manifest_df ) - if not existing_manifest_df.empty: - expected_df = pd.DataFrame( - { - "Filename": ["existing_mock_file_path", "mock_file_path"], - "entityId": ["existing_mock_entity_id", "mock_entity_id"], - } - ) - else: - expected_df = pd.DataFrame( - {"Filename": ["mock_file_path"], "entityId": ["mock_entity_id"]} - ) + assert_frame_equal(new_manifest, expected_df) assert dataset_files == ["syn123", "syn124", "syn125"] From ed9837bd71c472642ff82afe5ec00fff06a11ca8 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 27 Aug 2024 13:26:05 -0700 Subject: [PATCH 31/38] update mock data names --- tests/test_store.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 38db2d74a..d2bf6d57a 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -511,16 +511,19 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): pd.DataFrame(), [ { - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], }, { - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], }, ], pd.DataFrame( - {"Filename": ["mock_file_path"], "entityId": ["mock_entity_id"]} + { + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], + } ), ), ( @@ -532,18 +535,18 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): ), [ { - "Filename": ["existing_mock_file_path", "mock_file_path"], - "entityId": ["existing_mock_entity_id", "mock_entity_id"], + "Filename": ["existing_mock_file_path", "new_mock_file_path"], + "entityId": ["existing_mock_entity_id", "new_mock_entity_id"], }, { - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], }, ], pd.DataFrame( { - "Filename": ["existing_mock_file_path", "mock_file_path"], - "entityId": ["existing_mock_entity_id", "mock_entity_id"], + "Filename": ["existing_mock_file_path", "new_mock_file_path"], + "entityId": ["existing_mock_entity_id", "new_mock_entity_id"], } ), ), From dfc0e13b4d6f42070dfca0148d1e20f5a398a6f8 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 28 Aug 2024 10:08:22 -0700 Subject: [PATCH 32/38] update 'does_not_raise' --- tests/integration/test_metadata_model.py | 8 +------- tests/test_store.py | 7 +------ 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 1ed8899f5..bd7e55242 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,19 +1,13 @@ import logging -from contextlib import contextmanager +from contextlib import nullcontext as does_not_raise from unittest.mock import patch -from schematic.models.metadata import MetadataModel from tests.conftest import metadata_model logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) -@contextmanager -def does_not_raise(): - yield - - class TestMetadataModel: def test_submit_filebased_manifest(self, helpers): meta_data_model = metadata_model(helpers, "class_label") diff --git a/tests/test_store.py b/tests/test_store.py index d2bf6d57a..94e897dc0 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -7,7 +7,7 @@ import math import os import shutil -from contextlib import contextmanager +from contextlib import nullcontext as does_not_raise from time import sleep from typing import Any, Generator from unittest.mock import AsyncMock, patch @@ -32,11 +32,6 @@ logger = logging.getLogger(__name__) -@contextmanager -def does_not_raise(): - yield - - @pytest.fixture def test_download_manifest_id(): yield "syn51203973" From 250d71de63cc7840886563697cdc08856958dd45 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Wed, 28 Aug 2024 10:09:25 -0700 Subject: [PATCH 33/38] update imports --- tests/integration/test_metadata_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index bd7e55242..7276812e2 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,6 +1,5 @@ import logging from contextlib import nullcontext as does_not_raise -from unittest.mock import patch from tests.conftest import metadata_model From 7b82453774a2c82eb2ae0f12506ade2cf26e38ee Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 29 Aug 2024 10:05:02 -0700 Subject: [PATCH 34/38] modifiy query exception raised --- schematic/store/synapse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index c9ca36c10..139002a20 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -296,12 +296,12 @@ def query_fileview( if "Unknown column path" in exception_text: raise ValueError( "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation." - ) from exc + ) elif "Unknown column" in exception_text: missing_column = exception_text.split("Unknown column ")[-1] raise ValueError( f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." - ) from exc + ) else: raise AccessCredentialsError(self.storageFileview) From 06532e050d5caa703bb1b0ec8d7ba50578488ab7 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 29 Aug 2024 10:13:51 -0700 Subject: [PATCH 35/38] mock config for query exception tests --- tests/test_store.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 94e897dc0..9ad00a052 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -10,7 +10,7 @@ from contextlib import nullcontext as does_not_raise from time import sleep from typing import Any, Generator -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pandas as pd import pytest @@ -258,26 +258,31 @@ def test_view_query( ) def test_view_query_exception( self, - synapse_store_special_scope: SynapseStorage, asset_view: str, columns: list[str], message: str, ) -> None: + # GIVEN a project scope project_scope = ["syn23643250"] - # GIVEN the appropriate test file view - CONFIG.synapse_master_fileview_id = asset_view - # AND the approrpiate project scope - synapse_store_special_scope.project_scope = project_scope - - # WHEN the query is built and run - try: + # AND a test configuration + TEST_CONFIG = Configuration() + with patch( + "schematic.store.synapse.CONFIG", return_value=TEST_CONFIG + ) as mock_config: + # AND the appropriate test file view + mock_config.synapse_master_fileview_id = asset_view + # AND a real path to the synapse config file + mock_config.synapse_configuration_path = CONFIG.synapse_configuration_path + # AND a unique synapse storage object that uses the values modified in the test config + synapse_store = SynapseStorage(perform_query=False) + # AND the given project scope + synapse_store.project_scope = project_scope + + # WHEN the query is built and run # THEN it should raise a ValueError with the appropriate message with pytest.raises(ValueError, match=message): - synapse_store_special_scope.query_fileview(columns) - finally: - # AND the fileview should be reset to the default so the other tests are not affected regardless of the outcome of the query - CONFIG.synapse_master_fileview_id = "syn23643253" + synapse_store.query_fileview(columns) def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { From 5dc117b5aabac8592245501f418fe834619453b9 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 29 Aug 2024 11:06:11 -0700 Subject: [PATCH 36/38] update test data --- .../data/mock_manifests/filepath_submission_test_manifest.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/mock_manifests/filepath_submission_test_manifest.csv b/tests/data/mock_manifests/filepath_submission_test_manifest.csv index 9def5eccc..11e0d5077 100644 --- a/tests/data/mock_manifests/filepath_submission_test_manifest.csv +++ b/tests/data/mock_manifests/filepath_submission_test_manifest.csv @@ -1,3 +1,3 @@ Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId -schematic - main/Test Filename Upload/txt1.txt,,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954 -schematic - main/Test Filename Upload/txt2.txt,,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956 +schematic - main/Test Filename Upload/txt1.txt,1,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954 +schematic - main/Test Filename Upload/txt2.txt,2,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956 From b2b09ca947a8cb3e982e391b7637464d46e12429 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 29 Aug 2024 11:06:28 -0700 Subject: [PATCH 37/38] update test --- tests/integration/test_metadata_model.py | 33 ++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 7276812e2..26602bb92 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,6 +1,9 @@ import logging from contextlib import nullcontext as does_not_raise +from pytest_mock import MockerFixture + +from schematic.store.synapse import SynapseStorage from tests.conftest import metadata_model logging.basicConfig(level=logging.DEBUG) @@ -8,19 +11,45 @@ class TestMetadataModel: - def test_submit_filebased_manifest(self, helpers): + def test_submit_filebased_manifest(self, helpers, mocker: MockerFixture): + # spys + spy_upload_file_as_csv = mocker.spy(SynapseStorage, "upload_manifest_as_csv") + spy_upload_file_as_table = mocker.spy( + SynapseStorage, "upload_manifest_as_table" + ) + spy_upload_file_combo = mocker.spy(SynapseStorage, "upload_manifest_combo") + spy_add_annotations = mocker.spy( + SynapseStorage, "add_annotations_to_entities_files" + ) + + # GIVEN a metadata model object using class labels meta_data_model = metadata_model(helpers, "class_label") + # AND a filebased test manifset manifest_path = helpers.get_data_path( "mock_manifests/filepath_submission_test_manifest.csv" ) + # WHEN the manifest it submitted + # THEN submission should complete without error with does_not_raise(): manifest_id = meta_data_model.submit_metadata_manifest( manifest_path=manifest_path, dataset_id="syn62276880", - manifest_record_type="file_only", + manifest_record_type="file_and_entities", restrict_rules=False, file_annotations_upload=True, + hide_blanks=False, ) + + # AND the manifest should be submitted to the correct place assert manifest_id == "syn62280543" + + # AND the manifest should be uploaded as a CSV + spy_upload_file_as_csv.assert_called_once() + # AND annotations should be added to the files + spy_add_annotations.assert_called_once() + + # AND the manifest should not be uploaded as a table or combination of table, file, and entities + spy_upload_file_as_table.assert_not_called() + spy_upload_file_combo.assert_not_called() From 8c77ccfce7bfa780e5a3f7f2551d37ebd01bd074 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Thu, 29 Aug 2024 11:23:03 -0700 Subject: [PATCH 38/38] update test file --- .../data/mock_manifests/filepath_submission_test_manifest.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/mock_manifests/filepath_submission_test_manifest.csv b/tests/data/mock_manifests/filepath_submission_test_manifest.csv index 11e0d5077..3b1a349fc 100644 --- a/tests/data/mock_manifests/filepath_submission_test_manifest.csv +++ b/tests/data/mock_manifests/filepath_submission_test_manifest.csv @@ -1,3 +1,3 @@ Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId -schematic - main/Test Filename Upload/txt1.txt,1,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954 -schematic - main/Test Filename Upload/txt2.txt,2,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956 +schematic - main/Test Filename Upload/txt1.txt,1.0,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954 +schematic - main/Test Filename Upload/txt2.txt,2.0,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956