Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix from testing 20241031 #86

Merged
merged 6 commits on
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion resources/config.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"ingestor_files_directory": "../ingestor",
"message_to_file": true,
"message_file_extension": "message.json",
"use_full_file_path": false
"file_path_type": "relative"
}
},
"kafka": {
Expand Down
1 change: 0 additions & 1 deletion src/scicat_communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def render_full_url(url: str, config: SciCatOptions) -> str:
for endpoint in urls.keys():
if url.startswith(endpoint):
return url.replace(endpoint, urls[endpoint])

return url


Expand Down
2 changes: 1 addition & 1 deletion src/scicat_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ class FileHandlingOptions:
ingestor_files_directory: str = "../ingestor"
message_to_file: bool = True
message_file_extension: str = "message.json"
use_full_file_path: bool = False
file_path_type: str = "relative" # allowed values: absolute and relative


@dataclass(kw_only=True)
Expand Down
44 changes: 31 additions & 13 deletions src/scicat_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ def extract_paths_from_h5_file(
master_key + "/" + subkey
for subkey in extract_paths_from_h5_file(_h5_object[master_key], _path)
]
else:
output_paths = [master_key]

return output_paths

Expand Down Expand Up @@ -424,7 +426,7 @@ def create_data_file_list(
file_path=hash_file_path, compute_file_hash=False
)
)
if source_folder:
if source_folder and config.file_path_type == "relative":
for data_file in data_file_list:
data_file.path = str(
pathlib.Path(data_file.path).relative_to(source_folder)
Expand Down Expand Up @@ -603,18 +605,29 @@ def scicat_dataset_to_dict(dataset: ScicatDataset) -> dict:
return {k: v for k, v in asdict(dataset).items() if v is not None}


def _define_dataset_source_folder(datafilelist: list[DataFileListItem]) -> pathlib.Path:
def _define_dataset_source_folder(
datafilelist: list[DataFileListItem],
data_file_path: pathlib.Path,
source_folder_config: str = "common_path",
) -> pathlib.Path | None:
"""
Return the dataset source folder, which is the common path
between all the data files associated with the dataset
"""
import os

return pathlib.Path(os.path.commonpath([item.path for item in datafilelist]))
if source_folder_config == "data_file":
return pathlib.Path(os.path.dirname(data_file_path))
elif source_folder_config == "common_path":
return pathlib.Path(os.path.commonpath([item.path for item in datafilelist]))
else:
return None


def _path_to_relative(
datafilelist_item: DataFileListItem, dataset_source_folder: pathlib.Path
datafilelist_item: DataFileListItem,
dataset_source_folder: pathlib.Path,
file_path_type: str = "relative",
) -> DataFileListItem:
"""
Copy the datafiles item and transform the path to the relative path
Expand All @@ -623,32 +636,37 @@ def _path_to_relative(
from copy import copy

origdatablock_datafilelist_item = copy(datafilelist_item)
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
if file_path_type == "relative":
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
return origdatablock_datafilelist_item


def _prepare_origdatablock_datafilelist(
datafiles_list: list[DataFileListItem], dataset_source_folder: pathlib.Path
datafiles_list: list[DataFileListItem],
dataset_source_folder: pathlib.Path,
file_path_type: str = "relative",
) -> list[DataFileListItem]:
"""
Prepare the datafiles list for the origdatablock entry in scicat
That means that the file paths needs to be relative to the dataset source folder
"""
return [_path_to_relative(item, dataset_source_folder) for item in datafiles_list]
return [
_path_to_relative(item, dataset_source_folder, file_path_type)
for item in datafiles_list
]


def create_origdatablock_instance(
data_file_list: list[DataFileListItem],
scicat_dataset: dict,
config: FileHandlingOptions,
) -> OrigDataBlockInstance:
dataset_source_folder = _define_dataset_source_folder(data_file_list)
origdatablock_datafiles_list = _prepare_origdatablock_datafilelist(
data_file_list, dataset_source_folder
data_file_list, scicat_dataset["sourceFolder"], config.file_path_type
)
return OrigDataBlockInstance(
datasetId=scicat_dataset["pid"],
Expand Down