Skip to content

Commit

Permalink
fixes from the tests of the day
Browse files Browse the repository at this point in the history
  • Loading branch information
nitrosx committed Oct 31, 2024
1 parent 524d1a6 commit dda7237
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
2 changes: 1 addition & 1 deletion src/scicat_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ class FileHandlingOptions:
ingestor_files_directory: str = "../ingestor"
message_to_file: bool = True
message_file_extension: str = "message.json"
use_full_file_path: bool = False
file_path_type: str = "relative" # allowed values: absolute and relative


@dataclass(kw_only=True)
Expand Down
55 changes: 40 additions & 15 deletions src/scicat_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ def extract_paths_from_h5_file(
for key in temp_keys:
output_paths += [
key + "/" + subkey
for subkey in extract_paths_from_h5_file(
for subkey
in extract_paths_from_h5_file(
_h5_object[key], copy.deepcopy(_path)
)
]
Expand All @@ -184,6 +185,8 @@ def extract_paths_from_h5_file(
master_key + "/" + subkey
for subkey in extract_paths_from_h5_file(_h5_object[master_key], _path)
]
else:
output_paths = [master_key]

return output_paths

Expand Down Expand Up @@ -400,7 +403,7 @@ def create_data_file_list(
file_path=hash_file_path, compute_file_hash=False
)
)
if source_folder:
if source_folder and config.file_path_type == "relative":
for data_file in data_file_list:
data_file.path = str(
pathlib.Path(data_file.path).relative_to(source_folder)
Expand Down Expand Up @@ -579,18 +582,28 @@ def scicat_dataset_to_dict(dataset: ScicatDataset) -> dict:
return {k: v for k, v in asdict(dataset).items() if v is not None}


def _define_dataset_source_folder(
    datafilelist: list[DataFileListItem],
    data_file_path: pathlib.Path,
    source_folder_config: str = "common_path",
) -> pathlib.Path | None:
    """
    Return the dataset source folder selected by ``source_folder_config``.

    Supported values of ``source_folder_config``:

    - ``"data_file"``: the directory that contains ``data_file_path``.
    - ``"common_path"`` (default): the common path between all the data
      files in ``datafilelist`` (computed from each item's ``path``).

    Any other value yields ``None``.

    Raises:
        ValueError: propagated from ``os.path.commonpath`` in
            ``"common_path"`` mode, e.g. when ``datafilelist`` is empty or
            mixes absolute and relative paths.
    """
    import os

    if source_folder_config == "data_file":
        return pathlib.Path(os.path.dirname(data_file_path))
    if source_folder_config == "common_path":
        return pathlib.Path(os.path.commonpath([item.path for item in datafilelist]))
    # Unknown mode: the caller gets no source folder instead of an exception.
    return None

def _path_to_relative(
datafilelist_item: DataFileListItem, dataset_source_folder: pathlib.Path
datafilelist_item: DataFileListItem,
dataset_source_folder: pathlib.Path,
file_path_type: str = "relative"
) -> DataFileListItem:
"""
Copy the datafiles item and transform the path to the relative path
Expand All @@ -599,32 +612,44 @@ def _path_to_relative(
from copy import copy

origdatablock_datafilelist_item = copy(datafilelist_item)
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
if file_path_type == "relative":
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
return origdatablock_datafilelist_item


def _prepare_origdatablock_datafilelist(
    datafiles_list: list[DataFileListItem],
    dataset_source_folder: pathlib.Path,
    file_path_type: str = "relative",
) -> list[DataFileListItem]:
    """
    Prepare the datafiles list for the origdatablock entry in scicat.

    Every item of ``datafiles_list`` is passed through ``_path_to_relative``
    together with ``dataset_source_folder`` and ``file_path_type``. With the
    default ``file_path_type="relative"`` the file paths need to be, and are
    made, relative to the dataset source folder; with any other value the
    copied items keep their original paths.

    Returns a new list; the order of ``datafiles_list`` is preserved.
    """
    return [
        _path_to_relative(item, dataset_source_folder, file_path_type)
        for item in datafiles_list
    ]


def create_origdatablock_instance(
data_file_list: list[DataFileListItem],
scicat_dataset: dict,
config: FileHandlingOptions,
) -> OrigDataBlockInstance:
dataset_source_folder = _define_dataset_source_folder(data_file_list)
origdatablock_datafiles_list = _prepare_origdatablock_datafilelist(
data_file_list, dataset_source_folder
data_file_list,
scicat_dataset["sourceFolder"],
config.file_path_type
)
return OrigDataBlockInstance(
datasetId=scicat_dataset["pid"],
Expand Down
2 changes: 1 addition & 1 deletion src/scicat_offline_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def main() -> None:
nexus_file=nexus_file_path,
ingestor_directory=ingestor_directory,
config=fh_options,
source_folder=variable_map["source_folder"] if config.ingestion.file_handling.use_full_file_path==False else "",
source_folder=variable_map["source_folder"],
logger=logger,
# TODO: add done_writing_message_file and nexus_structure_file
)
Expand Down

0 comments on commit dda7237

Please sign in to comment.