diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py
index e62f11e01..139002a20 100644
--- a/schematic/store/synapse.py
+++ b/schematic/store/synapse.py
@@ -291,8 +291,19 @@ def query_fileview(
             self.storageFileviewTable = self.syn.tableQuery(
                 query=self.fileview_query,
             ).asDataFrame()
-        except SynapseHTTPError:
-            raise AccessCredentialsError(self.storageFileview)
+        except SynapseHTTPError as exc:
+            exception_text = str(exc)
+            if "Unknown column path" in exception_text:
+                raise ValueError(
+                    "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
+                )
+            elif "Unknown column" in exception_text:
+                missing_column = exception_text.split("Unknown column ")[-1]
+                raise ValueError(
+                    f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
+                )
+            else:
+                raise AccessCredentialsError(self.storageFileview)
 
     def _build_query(
         self, columns: Optional[list] = None, where_clauses: Optional[list] = None
@@ -788,18 +799,49 @@ def fill_in_entity_id_filename(
         # note that if there is an existing manifest and there are files in the dataset
         # the columns Filename and entityId are assumed to be present in manifest schema
         # TODO: use idiomatic panda syntax
-        if dataset_files:
-            new_files = self._get_file_entityIds(
-                dataset_files=dataset_files, only_new_files=True, manifest=manifest
-            )
+        if not dataset_files:
+            manifest = manifest.fillna("")
+            return dataset_files, manifest
 
-            # update manifest so that it contains new dataset files
-            new_files = pd.DataFrame(new_files)
-            manifest = (
-                pd.concat([manifest, new_files], sort=False)
-                .reset_index()
-                .drop("index", axis=1)
-            )
+        all_files = self._get_file_entityIds(
+            dataset_files=dataset_files, only_new_files=False, manifest=manifest
+        )
+        new_files = self._get_file_entityIds(
+            dataset_files=dataset_files, only_new_files=True, manifest=manifest
+        )
+
+        all_files = pd.DataFrame(all_files)
+        new_files = pd.DataFrame(new_files)
+
+        # update manifest so that it contains new dataset files
+        manifest = (
+            pd.concat([manifest, new_files], sort=False)
+            .reset_index()
+            .drop("index", axis=1)
+        )
+
+        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
+        manifest_reindex = manifest.set_index("entityId")
+        all_files_reindex = all_files.set_index("entityId")
+        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
+            manifest_reindex
+        )
+
+        # Check if individual file paths in manifest and from synapse match
+        file_paths_match = (
+            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
+        )
+
+        # If any of the paths do not match, update the manifest with the file paths from synapse
+        if not file_paths_match.all():
+            manifest_reindex.loc[
+                ~file_paths_match, "Filename"
+            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
+
+        # reformat manifest for further use
+        manifest = manifest_reindex.reset_index()
+        entityIdCol = manifest.pop("entityId")
+        manifest.insert(len(manifest.columns), "entityId", entityIdCol)
 
         manifest = manifest.fillna("")
         return dataset_files, manifest
diff --git a/tests/conftest.py b/tests/conftest.py
index 9a0b5789d..6ce21df5d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@
 from dotenv import load_dotenv
 
 from schematic.configuration.configuration import CONFIG
+from schematic.models.metadata import MetadataModel
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.synapse import SynapseStorage
@@ -149,3 +150,13 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
     """Fixture to instantiate a DataModelGraphExplorer object."""
     dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
     return dmge
+
+
+def metadata_model(helpers, data_model_labels):
+    metadata_model = MetadataModel(
+        inputMModelLocation=helpers.get_data_path("example.model.jsonld"),
+        data_model_labels=data_model_labels,
+        inputMModelLocationType="local",
+    )
+
+    return metadata_model
diff --git a/tests/data/mock_manifests/filepath_submission_test_manifest.csv b/tests/data/mock_manifests/filepath_submission_test_manifest.csv
new file mode 100644
index 000000000..3b1a349fc
--- /dev/null
+++ b/tests/data/mock_manifests/filepath_submission_test_manifest.csv
@@ -0,0 +1,3 @@
+Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId
+schematic - main/Test Filename Upload/txt1.txt,1.0,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954
+schematic - main/Test Filename Upload/txt2.txt,2.0,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956
diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py
new file mode 100644
index 000000000..26602bb92
--- /dev/null
+++ b/tests/integration/test_metadata_model.py
@@ -0,0 +1,55 @@
+import logging
+from contextlib import nullcontext as does_not_raise
+
+from pytest_mock import MockerFixture
+
+from schematic.store.synapse import SynapseStorage
+from tests.conftest import metadata_model
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+class TestMetadataModel:
+    def test_submit_filebased_manifest(self, helpers, mocker: MockerFixture):
+        # spies
+        spy_upload_file_as_csv = mocker.spy(SynapseStorage, "upload_manifest_as_csv")
+        spy_upload_file_as_table = mocker.spy(
+            SynapseStorage, "upload_manifest_as_table"
+        )
+        spy_upload_file_combo = mocker.spy(SynapseStorage, "upload_manifest_combo")
+        spy_add_annotations = mocker.spy(
+            SynapseStorage, "add_annotations_to_entities_files"
+        )
+
+        # GIVEN a metadata model object using class labels
+        meta_data_model = metadata_model(helpers, "class_label")
+
+        # AND a file-based test manifest
+        manifest_path = helpers.get_data_path(
+            "mock_manifests/filepath_submission_test_manifest.csv"
+        )
+
+        # WHEN the manifest is submitted
+        # THEN submission should complete without error
+        with does_not_raise():
+            manifest_id = meta_data_model.submit_metadata_manifest(
+                manifest_path=manifest_path,
+                dataset_id="syn62276880",
+                manifest_record_type="file_and_entities",
+                restrict_rules=False,
+                file_annotations_upload=True,
+                hide_blanks=False,
+            )
+
+        # AND the manifest should be submitted to the correct place
+        assert manifest_id == "syn62280543"
+
+        # AND the manifest should be uploaded as a CSV
+        spy_upload_file_as_csv.assert_called_once()
+        # AND annotations should be added to the files
+        spy_add_annotations.assert_called_once()
+
+        # AND the manifest should not be uploaded as a table or combination of table, file, and entities
+        spy_upload_file_as_table.assert_not_called()
+        spy_upload_file_combo.assert_not_called()
diff --git a/tests/test_manifest.py b/tests/test_manifest.py
index 3ce9e58a0..06bd7b168 100644
--- a/tests/test_manifest.py
+++ b/tests/test_manifest.py
@@ -763,22 +763,52 @@ def test_create_manifests(
         assert all_results == expected_result
 
     @pytest.mark.parametrize(
-        "component,datasetId",
-        [("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")],
+        "component,datasetId,expected_file_based,expected_rows,expected_files",
+        [
+            ("Biospecimen", "syn61260107", False, 4, None),
+            (
+                "BulkRNA-seqAssay",
+                "syn61374924",
+                True,
+                4,
+                pd.Series(
+                    [
+                        "schematic - main/BulkRNASeq and files/txt1.txt",
+                        "schematic - main/BulkRNASeq and files/txt2.txt",
+                        "schematic - main/BulkRNASeq and files/txt4.txt",
+                        "schematic - main/BulkRNASeq and files/txt3.txt",
+                    ],
+                    name="Filename",
+                ),
+            ),
+        ],
         ids=["Record based", "File based"],
     )
-    def test_get_manifest_with_files(self, helpers, component, datasetId):
+    def test_get_manifest_with_files(
+        self,
+        helpers,
+        component,
+        datasetId,
+        expected_file_based,
+        expected_rows,
+        expected_files,
+    ):
         """
-        Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
+        Test to ensure that
+        when generating a record based manifest that has files in the dataset, the files are not added to the manifest as well
+        when generating a file based manifest from a dataset that has had files added, the files are added correctly
         """
+        # GIVEN the example data model
         path_to_data_model = helpers.get_data_path("example.model.jsonld")
 
+        # AND a graph data model
         graph_data_model = generate_graph_data_model(
             helpers,
             path_to_data_model=path_to_data_model,
             data_model_labels="class_label",
         )
 
+        # AND a manifest generator
         generator = ManifestGenerator(
             path_to_data_model=path_to_data_model,
             graph=graph_data_model,
@@ -786,16 +816,24 @@ def test_get_manifest_with_files(self, helpers, component, datasetId):
             use_annotations=True,
         )
 
+        # WHEN a manifest is generated for the appropriate dataset as a dataframe
         manifest = generator.get_manifest(
             dataset_id=datasetId, output_format="dataframe"
         )
 
-        filename_in_manifest_columns = "Filename" in manifest.columns
+        # AND it is determined if the manifest is file based
+        is_file_based = "Filename" in manifest.columns
+
+        # AND the number of rows is checked
         n_rows = manifest.shape[0]
 
-        if component == "Biospecimen":
-            assert not filename_in_manifest_columns
-            assert n_rows == 4
-        elif component == "BulkRNA-seqAssay":
-            assert filename_in_manifest_columns
-            assert n_rows == 4
+        # THEN the manifest should have the expected number of rows
+        assert n_rows == expected_rows
+
+        # AND the manifest should be file based or not as expected
+        assert is_file_based == expected_file_based
+
+        # AND if the manifest is file based
+        if expected_file_based:
+            # THEN the manifest should have the expected files
+            assert manifest["Filename"].equals(expected_files)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index bf0c4d97b..fca1d3db5 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -2,8 +2,8 @@
 import logging
 import os
 
-from typing import Optional, Generator
 from pathlib import Path
+from typing import Generator, Optional
 from unittest.mock import patch
 
 import pytest
diff --git a/tests/test_store.py b/tests/test_store.py
index cefa6d56a..9ad00a052 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -7,9 +7,10 @@
 import math
 import os
 import shutil
+from contextlib import nullcontext as does_not_raise
 from time import sleep
 from typing import Any, Generator
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -19,7 +20,7 @@
 from synapseclient.entity import File
 from synapseclient.models import Annotations
 
-from schematic.configuration.configuration import Configuration
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.base import BaseStorage
@@ -116,7 +117,7 @@ def dmge(
     yield dmge
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def synapse_store_special_scope():
     yield SynapseStorage(perform_query=False)
 
@@ -160,32 +161,57 @@ def test_login(self) -> None:
         shutil.rmtree("test_cache_dir")
 
     @pytest.mark.parametrize(
-        "project_scope,columns,where_clauses,expected",
+        "project_scope,columns,where_clauses,expected,expected_new_query",
         [
-            (None, None, None, "SELECT * FROM syn23643253 ;"),
+            (None, None, None, "SELECT * FROM syn23643253 ;", True),
             (
                 ["syn23643250"],
                 None,
                 None,
                 "SELECT * FROM syn23643253 WHERE projectId IN ('syn23643250', '') ;",
+                True,
             ),
             (
                 None,
                 None,
                 ["projectId IN ('syn23643250')"],
                 "SELECT * FROM syn23643253 WHERE projectId IN ('syn23643250') ;",
+                True,
             ),
             (
                 ["syn23643250"],
                 ["name", "id", "path"],
                 None,
                 "SELECT name,id,path FROM syn23643253 WHERE projectId IN ('syn23643250', '') ;",
+                True,
+            ),
+            (
+                None,
+                ["name", "id", "path"],
+                ["parentId='syn61682648'", "type='file'"],
+                "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' ;",
+                True,
+            ),
+            (
+                ["syn23643250"],
+                None,
+                ["parentId='syn61682648'", "type='file'"],
+                "SELECT * FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;",
+                True,
+            ),
+            (
+                ["syn23643250"],
+                ["name", "id", "path"],
+                ["parentId='syn61682648'", "type='file'"],
+                "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;",
+                True,
             ),
             (
                 ["syn23643250"],
                 ["name", "id", "path"],
                 ["parentId='syn61682648'", "type='file'"],
                 "SELECT name,id,path FROM syn23643253 WHERE parentId='syn61682648' AND type='file' AND projectId IN ('syn23643250', '') ;",
+                False,
             ),
         ],
     )
@@ -196,17 +222,67 @@ def test_view_query(
         self,
         synapse_store_special_scope: SynapseStorage,
         project_scope: list,
         columns: list,
         where_clauses: list,
         expected: str,
+        expected_new_query: bool,
     ) -> None:
-        # Ensure correct view is being utilized
+        # GIVEN the correct fileview
        assert synapse_store_special_scope.storageFileview == "syn23643253"
 
+        # AND the appropriate project scope
         synapse_store_special_scope.project_scope = project_scope
 
-        synapse_store_special_scope.query_fileview(columns, where_clauses)
-        # tests ._build_query()
-        assert synapse_store_special_scope.fileview_query == expected
-        # tests that the query was valid and successful, that a view subset has actually been retrived
-        assert synapse_store_special_scope.storageFileviewTable.empty is False
+        # WHEN the query is built and run
+        # THEN it should complete without raising an exception
+        with does_not_raise():
+            synapse_store_special_scope.query_fileview(columns, where_clauses)
+            # AND the query string should be as expected
+            assert synapse_store_special_scope.fileview_query == expected
+            # AND the query should have received a non-empty table
+            assert synapse_store_special_scope.storageFileviewTable.empty is False
+            # AND the query should be new if expected
+            assert synapse_store_special_scope.new_query_different == expected_new_query
+
+    @pytest.mark.parametrize(
"asset_view,columns,message", + [ + ( + "syn62339865", + ["path"], + r"The path column has not been added to the fileview. .*", + ), + ( + "syn62340177", + ["id"], + r"The columns id specified in the query do not exist in the fileview. .*", + ), + ], + ) + def test_view_query_exception( + self, + asset_view: str, + columns: list[str], + message: str, + ) -> None: + # GIVEN a project scope + project_scope = ["syn23643250"] + + # AND a test configuration + TEST_CONFIG = Configuration() + with patch( + "schematic.store.synapse.CONFIG", return_value=TEST_CONFIG + ) as mock_config: + # AND the appropriate test file view + mock_config.synapse_master_fileview_id = asset_view + # AND a real path to the synapse config file + mock_config.synapse_configuration_path = CONFIG.synapse_configuration_path + # AND a unique synapse storage object that uses the values modified in the test config + synapse_store = SynapseStorage(perform_query=False) + # AND the given project scope + synapse_store.project_scope = project_scope + + # WHEN the query is built and run + # THEN it should raise a ValueError with the appropriate message + with pytest.raises(ValueError, match=message): + synapse_store.query_fileview(columns) def test_getFileAnnotations(self, synapse_store: SynapseStorage) -> None: expected_dict = { @@ -429,42 +505,67 @@ def test_getDatasetManifest(self, synapse_store, downloadFile): assert manifest_data == "syn51204513" @pytest.mark.parametrize( - "existing_manifest_df", + "existing_manifest_df,fill_in_return_value,expected_df", [ - pd.DataFrame(), - pd.DataFrame( - { - "Filename": ["existing_mock_file_path"], - "entityId": ["existing_mock_entity_id"], - } + ( + pd.DataFrame(), + [ + { + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], + }, + { + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], + }, + ], + pd.DataFrame( + { + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], + } + ), + ), + ( + pd.DataFrame( + { + "Filename": ["existing_mock_file_path"], + "entityId": ["existing_mock_entity_id"], + } + ), + [ + { + "Filename": ["existing_mock_file_path", "new_mock_file_path"], + "entityId": ["existing_mock_entity_id", "new_mock_entity_id"], + }, + { + "Filename": ["new_mock_file_path"], + "entityId": ["new_mock_entity_id"], + }, + ], + pd.DataFrame( + { + "Filename": ["existing_mock_file_path", "new_mock_file_path"], + "entityId": ["existing_mock_entity_id", "new_mock_entity_id"], + } + ), ), ], ) - def test_fill_in_entity_id_filename(self, synapse_store, existing_manifest_df): + def test_fill_in_entity_id_filename( + self, synapse_store, existing_manifest_df, fill_in_return_value, expected_df + ): with patch( "schematic.store.synapse.SynapseStorage.getFilesInStorageDataset", return_value=["syn123", "syn124", "syn125"], ) as mock_get_file_storage, patch( "schematic.store.synapse.SynapseStorage._get_file_entityIds", - return_value={ - "Filename": ["mock_file_path"], - "entityId": ["mock_entity_id"], - }, + side_effect=fill_in_return_value, ) as mock_get_file_entity_id: dataset_files, new_manifest = synapse_store.fill_in_entity_id_filename( datasetId="test_syn_id", manifest=existing_manifest_df ) - if not existing_manifest_df.empty: - expected_df = pd.DataFrame( - { - "Filename": ["existing_mock_file_path", "mock_file_path"], - "entityId": ["existing_mock_entity_id", "mock_entity_id"], - } - ) - else: - expected_df = pd.DataFrame( - {"Filename": ["mock_file_path"], "entityId": ["mock_entity_id"]} - ) + 
             assert_frame_equal(new_manifest, expected_df)
             assert dataset_files == ["syn123", "syn124", "syn125"]
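
A minimal sketch of the caller-visible behavior added in query_fileview above, assuming Synapse credentials are configured and the configured fileview lacks the path column (as syn62339865 does in the new test); the store name is illustrative:

    from schematic.store.synapse import SynapseStorage

    # Build the storage object without running the fileview query up front
    store = SynapseStorage(perform_query=False)
    store.project_scope = ["syn23643250"]

    try:
        # A query against a missing column now raises a descriptive ValueError
        # instead of the blanket AccessCredentialsError
        store.query_fileview(["path"])
    except ValueError as err:
        print(err)  # "The path column has not been added to the fileview. ..."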