Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update existing file paths in manifests at generation to conform to new convention #1467

Merged
merged 40 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
1d48c36
update existing file paths
GiaJordan Aug 16, 2024
e2d92cc
update logic for when no manfiest is passed in
GiaJordan Aug 16, 2024
dc7fac4
only update paths when there is a mismatch
GiaJordan Aug 16, 2024
2ecb5d6
update test
GiaJordan Aug 16, 2024
0bd91bf
clean and update name
GiaJordan Aug 16, 2024
3d5941c
add test for submission
GiaJordan Aug 20, 2024
53ab10c
add test manifest
GiaJordan Aug 20, 2024
fa9b263
redo matching logic
GiaJordan Aug 20, 2024
48a48fd
update test assertion
GiaJordan Aug 20, 2024
abf1277
update tests for filepaths as well
GiaJordan Aug 21, 2024
eb66a37
add exceptions and messages
GiaJordan Aug 21, 2024
e0d6b8b
add tests for exceptions raised
GiaJordan Aug 21, 2024
1c5621f
update test
GiaJordan Aug 21, 2024
4c2edd9
Revert "update test"
GiaJordan Aug 22, 2024
1ae2d90
update test
GiaJordan Aug 22, 2024
0b60395
add comments
GiaJordan Aug 22, 2024
6feea81
update exception message
GiaJordan Aug 26, 2024
976645a
update type hint
GiaJordan Aug 26, 2024
208f225
update comment
GiaJordan Aug 26, 2024
838634b
Merge branch 'develop' into develop-filepath2-manifest-gen-FDS-2278
GiaJordan Aug 26, 2024
d0628b2
change structure of
GiaJordan Aug 26, 2024
2711064
Merge branch 'develop' into develop-filepath2-manifest-gen-FDS-2278
GiaJordan Aug 27, 2024
c2b832d
move test to integration dir
GiaJordan Aug 27, 2024
2dd6fdf
move function to conftest
GiaJordan Aug 27, 2024
096fe55
update exception message
GiaJordan Aug 27, 2024
05a791e
update query tests
GiaJordan Aug 27, 2024
2b84f4d
update comments
GiaJordan Aug 27, 2024
cb0f0fb
update imports
GiaJordan Aug 27, 2024
a916641
update get manifest test
GiaJordan Aug 27, 2024
e3107a5
cover other cases in query test
GiaJordan Aug 27, 2024
57ec340
update query test
GiaJordan Aug 27, 2024
bb043f6
change structure of test
GiaJordan Aug 27, 2024
ed9837b
update mock data names
GiaJordan Aug 27, 2024
dfc0e13
update 'does_not_raise'
GiaJordan Aug 28, 2024
250d71d
update imports
GiaJordan Aug 28, 2024
7b82453
modifiy query exception raised
GiaJordan Aug 29, 2024
06532e0
mock config for query exception tests
GiaJordan Aug 29, 2024
5dc117b
update test data
GiaJordan Aug 29, 2024
b2b09ca
update test
GiaJordan Aug 29, 2024
8c77ccf
update test file
GiaJordan Aug 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 55 additions & 13 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,19 @@ def query_fileview(
self.storageFileviewTable = self.syn.tableQuery(
query=self.fileview_query,
).asDataFrame()
except SynapseHTTPError:
raise AccessCredentialsError(self.storageFileview)
except SynapseHTTPError as exc:
exception_text = str(exc)
if "Unknown column path" in exception_text:
raise ValueError(
"The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
) from exc
elif "Unknown column" in exception_text:
missing_column = exception_text.split("Unknown column ")[-1]
raise ValueError(
f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
) from exc
else:
raise AccessCredentialsError(self.storageFileview)
GiaJordan marked this conversation as resolved.
Show resolved Hide resolved

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
Expand Down Expand Up @@ -788,18 +799,49 @@ def fill_in_entity_id_filename(
# note that if there is an existing manifest and there are files in the dataset
# the columns Filename and entityId are assumed to be present in manifest schema
# TODO: use idiomatic panda syntax
if dataset_files:
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)
if not dataset_files:
manifest = manifest.fillna("")
return dataset_files, manifest

# update manifest so that it contains new dataset files
new_files = pd.DataFrame(new_files)
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)
all_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=False, manifest=manifest
)
new_files = self._get_file_entityIds(
dataset_files=dataset_files, only_new_files=True, manifest=manifest
)

all_files = pd.DataFrame(all_files)
new_files = pd.DataFrame(new_files)

# update manifest so that it contains new dataset files
manifest = (
pd.concat([manifest, new_files], sort=False)
.reset_index()
.drop("index", axis=1)
)

# Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
manifest_reindex = manifest.set_index("entityId")
all_files_reindex = all_files.set_index("entityId")
all_files_reindex_like_manifest = all_files_reindex.reindex_like(
manifest_reindex
)

# Check if individual file paths in manifest and from synapse match
file_paths_match = (
manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
)

# If all the paths do not match, update the manifest with the filepaths from synapse
if not file_paths_match.all():
manifest_reindex.loc[
~file_paths_match, "Filename"
] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

# reformat manifest for further use
manifest = manifest_reindex.reset_index()
entityIdCol = manifest.pop("entityId")
manifest.insert(len(manifest.columns), "entityId", entityIdCol)

manifest = manifest.fillna("")
return dataset_files, manifest
Expand Down
11 changes: 11 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dotenv import load_dotenv

from schematic.configuration.configuration import CONFIG
from schematic.models.metadata import MetadataModel
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import SynapseStorage
Expand Down Expand Up @@ -149,3 +150,13 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
"""Fixture to instantiate a DataModelGraphExplorer object."""
dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
return dmge


def metadata_model(helpers, data_model_labels):
metadata_model = MetadataModel(
inputMModelLocation=helpers.get_data_path("example.model.jsonld"),
data_model_labels=data_model_labels,
inputMModelLocationType="local",
)

return metadata_model
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId
GiaJordan marked this conversation as resolved.
Show resolved Hide resolved
schematic - main/Test Filename Upload/txt1.txt,,,BulkRNA-seqAssay,,,01ded8fc-0915-4959-85ab-64e9644c8787,syn62276954
schematic - main/Test Filename Upload/txt2.txt,,,BulkRNA-seqAssay,,,fd122bb5-3353-4c94-b1f5-0bb93a3e9fc9,syn62276956
33 changes: 33 additions & 0 deletions tests/integration/test_metadata_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import logging
from contextlib import contextmanager
from unittest.mock import patch

from schematic.models.metadata import MetadataModel
from tests.conftest import metadata_model

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


@contextmanager
def does_not_raise():
yield


class TestMetadataModel:
def test_submit_filebased_manifest(self, helpers):
meta_data_model = metadata_model(helpers, "class_label")

manifest_path = helpers.get_data_path(
"mock_manifests/filepath_submission_test_manifest.csv"
)

with does_not_raise():
manifest_id = meta_data_model.submit_metadata_manifest(
manifest_path=manifest_path,
dataset_id="syn62276880",
manifest_record_type="file_only",
restrict_rules=False,
file_annotations_upload=True,
)
assert manifest_id == "syn62280543"
GiaJordan marked this conversation as resolved.
Show resolved Hide resolved
60 changes: 49 additions & 11 deletions tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,39 +763,77 @@ def test_create_manifests(
assert all_results == expected_result

@pytest.mark.parametrize(
"component,datasetId",
[("Biospecimen", "syn61260107"), ("BulkRNA-seqAssay", "syn61374924")],
"component,datasetId,expected_file_based,expected_rows,expected_files",
[
("Biospecimen", "syn61260107", False, 4, None),
(
"BulkRNA-seqAssay",
"syn61374924",
True,
4,
pd.Series(
[
"schematic - main/BulkRNASeq and files/txt1.txt",
"schematic - main/BulkRNASeq and files/txt2.txt",
"schematic - main/BulkRNASeq and files/txt4.txt",
"schematic - main/BulkRNASeq and files/txt3.txt",
],
name="Filename",
),
),
],
ids=["Record based", "File based"],
)
def test_get_manifest_with_files(self, helpers, component, datasetId):
def test_get_manifest_with_files(
self,
helpers,
component,
datasetId,
expected_file_based,
expected_rows,
expected_files,
):
"""
Test to ensure that when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
Test to ensure that
when generating a record based manifset that has files in the dataset that the files are not added to the manifest as well
when generating a file based manifest from a dataset thathas had files added that the files are added correctly
"""
# GIVEN the example data model
GiaJordan marked this conversation as resolved.
Show resolved Hide resolved
path_to_data_model = helpers.get_data_path("example.model.jsonld")

# AND a graph data model
graph_data_model = generate_graph_data_model(
helpers,
path_to_data_model=path_to_data_model,
data_model_labels="class_label",
)

# AND a manifest generator
generator = ManifestGenerator(
path_to_data_model=path_to_data_model,
graph=graph_data_model,
root=component,
use_annotations=True,
)

# WHEN a manifest is generated for the appropriate dataset as a dataframe
manifest = generator.get_manifest(
dataset_id=datasetId, output_format="dataframe"
)

filename_in_manifest_columns = "Filename" in manifest.columns
# AND it is determined if the manifest is filebased
is_file_based = "Filename" in manifest.columns

# AND the number of rows are checked
n_rows = manifest.shape[0]

if component == "Biospecimen":
assert not filename_in_manifest_columns
assert n_rows == 4
elif component == "BulkRNA-seqAssay":
assert filename_in_manifest_columns
assert n_rows == 4
# THEN the manifest should have the expected number of rows
assert n_rows == expected_rows

# AND the manifest should be filebased or not as expected
assert is_file_based == expected_file_based

# AND if the manifest is file based
if expected_file_based:
# THEN the manifest should have the expected files
assert manifest["Filename"].equals(expected_files)
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import logging
import os
from typing import Optional, Generator
from pathlib import Path
from typing import Generator, Optional
from unittest.mock import patch

import pytest
Expand Down
Loading
Loading