Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add author and tools details in RO-Crate #18820

Open
wants to merge 17 commits into
base: dev
Choose a base branch
from
213 changes: 213 additions & 0 deletions lib/galaxy/model/store/ro_crate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
class WorkflowRunCrateProfileBuilder:
def __init__(self, model_store: Any):
self.model_store = model_store
self.toolbox = self.model_store.app.toolbox
self.invocation: WorkflowInvocation = model_store.included_invocations[0]
self.workflow: Workflow = self.invocation.workflow
self.param_type_mapping = {
Expand Down Expand Up @@ -85,6 +86,8 @@ def __init__(self, model_store: Any):
self.file_entities: Dict[int, Any] = {}
self.param_entities: Dict[int, Any] = {}
self.pv_entities: Dict[str, Any] = {}
# Cache for tools to avoid duplicating entities for the same tool
self.tool_cache: Dict[str, ContextEntity] = {}

def build_crate(self):
crate = ROCrate()
Expand Down Expand Up @@ -222,6 +225,216 @@ def _add_workflows(self, crate: ROCrate):
crate.mainEntity["name"] = self.workflow.name
crate.mainEntity["subjectOf"] = cwl_wf

# Adding multiple creators if available
if self.workflow.creator_metadata:
for creator_data in self.workflow.creator_metadata:
if creator_data.get("class") == "Person":
# Create the person entity
creator_entity = crate.add(
ContextEntity(
crate,
creator_data.get("identifier", ""), # Default to empty string if identifier is missing
properties={
"@type": "Person",
"name": creator_data.get("name", ""), # Default to empty string if name is missing
"orcid": creator_data.get(
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
"identifier", ""
), # Assuming identifier is ORCID, or adjust as needed
"url": creator_data.get("url", ""), # Add URL if available, otherwise empty string
"email": creator_data.get(
"email", ""
), # Add email if available, otherwise empty string
},
)
)
# Append the person creator entity to the mainEntity
crate.mainEntity.append_to("creator", creator_entity)

elif creator_data.get("class") == "Organization":
# Create the organization entity
organization_entity = crate.add(
ContextEntity(
crate,
creator_data.get(
"url", ""
), # Use URL as identifier if available, otherwise empty string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could also try to use identifier here, as that is an option in the UI too when setting an organization as a creator. For example an ROR identifier could be used. (though admittedly it's more likely for an average user to just input their institute's URL in the url field)

Copy link
Author

@Marie59 Marie59 Dec 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this point I was not sure what to do I used identifier for the ORCID of the creator. Should I replace it by ROR or is something completely different you were suggesting ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per the section above (L228-L251), if the creator is a Person, then we assume the identifier field is an ORCID and that's all good.

In this section, if the creator is an Organization, the identifier field won't represent an ORCID - instead it would (probably) be an ROR identifier. However, it's less likely to be used by users, as ROR isn't as widely known yet.
So for the entity id on this line I would suggest: if the identifier exists, use that, otherwise use the url, or if neither exist, use an empty string.

And in the properties you can include identifier as well as url if the identifier exists in the creator_data.

properties={
"@type": "Organization",
"name": creator_data.get("name", ""), # Default to empty string if name is missing
"url": creator_data.get("url", ""), # Add URL if available, otherwise empty string
},
)
)
# Append the organization entity to the mainEntity
crate.mainEntity.append_to("creator", organization_entity)

# Add CWL workflow entity if exists
crate.mainEntity["subjectOf"] = cwl_wf

# Add tools used in the workflow
self._add_tools(crate)
self._add_steps(crate)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Workflows can have workflow inputs. Should we add them here as well? At least the types of the workflow inputs?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be nice to have, yes.

This can even be done per tool (though more fiddly). There is already some capturing of the inputs and outputs as as formalParameters, but it seems only to happen if data files are included in the crate. There seem to be some helper functions further down the file for this purpose:

def _add_step_parameter_fp(self, step: WorkflowInvocationStep, crate: ROCrate):


def _add_steps(self, crate: ROCrate):
"""
Add workflow steps (HowToStep) to the RO-Crate. These are unique for each tool occurrence.
"""
step_entities = []
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
# Initialize the position as a list with a single element to keep it mutable
position = [1]
self._add_steps_recursive(self.workflow.steps, crate, step_entities, position)
return step_entities

def _add_steps_recursive(self, steps, crate: ROCrate, step_entities, position):
"""
Recursively add HowToStep entities from workflow steps, ensuring that
the position index is maintained across subworkflows.
"""
for step in steps:
if step.type == "tool":
# Create a unique HowToStep entity for each step
step_id = f"step_{position[0]}"
step_description = None
if step.annotations:
annotations_list = [annotation.annotation for annotation in step.annotations if annotation]
step_description = " ".join(annotations_list) if annotations_list else None

# Add HowToStep entity to the crate
step_entity = crate.add(
ContextEntity(
crate,
step_id,
properties={
"@type": "HowToStep",
"position": position[0],
"name": step.tool_id,
"description": step_description,
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
},
)
)

# Append the HowToStep entity to the workflow steps list
step_entities.append(step_entity)
crate.mainEntity.append_to("step", step_entity)

# Increment the position counter
position[0] += 1

# Handle subworkflows recursively
elif step.type == "subworkflow":
subworkflow = step.subworkflow
if subworkflow:
self._add_steps_recursive(subworkflow.steps, crate, step_entities, position)

def _add_tools(self, crate: ROCrate):
tool_entities = []
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
self._add_tools_recursive(self.workflow.steps, crate, tool_entities)

def _add_tools_recursive(self, steps, crate: ROCrate, tool_entities):
"""
Recursively add SoftwareApplication entities from workflow steps, reusing tools when necessary.
"""
for step in steps:
if step.type == "tool":
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
tool_id = step.tool_id
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
tool_version = step.tool_version

# Cache key based on tool ID and version
tool_key = f"{tool_id}:{tool_version}"

# Check if tool entity is already in cache
if tool_key in self.tool_cache:
tool_entity = self.tool_cache[tool_key]
else:
# Create a new tool entity
tool_name = tool_id
tool_description = None
if step.annotations:
annotations_list = [annotation.annotation for annotation in step.annotations if annotation]
tool_description = " ".join(annotations_list) if annotations_list else None

# Retrieve the tool metadata from the toolbox
tool_metadata = self._get_tool_metadata(tool_id)

# Add tool entity to the RO-Crate
tool_entity = crate.add(
ContextEntity(
crate,
tool_id,
Marie59 marked this conversation as resolved.
Show resolved Hide resolved
properties={
"@type": "SoftwareApplication",
"name": tool_name,
"version": tool_version,
"description": tool_description,
"url": "https://toolshed.g2.bx.psu.edu", # URL if relevant
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not always true. Tools could come from multiple toolsheds.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I was wondering if this part was relevant or if I should remove it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a url for each tool would be great - but if we can't get this due to the toolbox limitations, it should be removed.

"citation": tool_metadata["citations"],
"identifier": tool_metadata["xrefs"],
"EDAM operation": tool_metadata["edam_operations"],
},
)
)

# Store the tool entity in the cache
self.tool_cache[tool_key] = tool_entity

# Append the tool entity to the workflow (instrument) and store it in the list
tool_entities.append(tool_entity)
crate.mainEntity.append_to("instrument", tool_entity)
Marie59 marked this conversation as resolved.
Show resolved Hide resolved

# Handle subworkflows recursively
elif step.type == "subworkflow":
subworkflow = step.subworkflow
if subworkflow:
self._add_tools_recursive(subworkflow.steps, crate, tool_entities)

def _get_tool_metadata(self, tool_id: str):
"""
Retrieve the tool metadata (citations, xrefs, EDAM operations) using the ToolBox.

Args:
toolbox (ToolBox): An instance of the Galaxy ToolBox.
tool_id (str): The ID of the tool to retrieve metadata for.

Returns:
dict: A dictionary containing citations, xrefs, and EDAM operations for the tool.
"""
tool = self.toolbox.get_tool(tool_id)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't work, you can't use the toolbox in this context. We can't load up the toolbox in celery workers, that would take too much time and memory. The best solution is to either get that data from a yet to be written toolshed endpoint or load up the tool on demand, neither of these are easy to do or fit into the context of this PR. Maybe you could focus on the other enhancements in this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a shame that the toolbox can't be used in this way. But the most essential metadata about the tools is already captured without using the toolbox, so I think it's ok to remove this function and consider this as an enhancement which could be added in future.

It does mean we would have a greater need for the url on line 369, though, as without the xrefs there's no alternative links to find the tool (and the rest of its metadata). Still not absolutely mandatory to have url, but a definite best practice.

(I wondered if we could retrieve the citation/xref/EDAM metadata from something like job_attrs.txt which is already part of the crate, but it's not included there. Nor is the url.)

if not tool:
return None

# Extracting relevant metadata from the tool object
citations = []
if tool.citations:
for citation in tool.citations:
citations.append(
{
"type": citation.type, # e.g., "doi" or "bibtex"
"value": citation.value, # The actual DOI, BibTeX, etc.
}
)

xrefs = []
if tool.xrefs:
for xref in tool.xrefs:
xrefs.append(
{
"type": xref.type, # e.g., "registry", "repository", etc.
"value": xref.value, # The identifier or link
}
)

# Handling EDAM operations, which are simple values in your XML
edam_operations = []
if tool.edam_operations:
for operation in tool.edam_operations:
edam_operations.append({"value": operation}) # Extract the operation code (e.g., "operation_3482")

return {
"citations": citations, # List of structured citation entries
"xrefs": xrefs, # List of structured xref entries
"edam_operations": edam_operations, # List of structured EDAM operations
}

def _add_create_action(self, crate: ROCrate):
self.create_action = crate.add(
ContextEntity(
Expand Down
22 changes: 22 additions & 0 deletions test/unit/data/model/test_model_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1199,11 +1199,33 @@ def write_composite_file(self, dataset_instance, contents, file_name):
)


class MockTool:
"""Mock class to simulate Galaxy tools with essential metadata for testing."""

def __init__(self, tool_id):
self.tool_id = tool_id
self.citations = [{"type": "doi", "value": "10.1234/example.doi"}]
self.xrefs = [{"type": "registry", "value": "tool_registry_id"}]
self.edam_operations = ["operation_1234"]


class MockToolbox:
"""Mock class for the Galaxy toolbox, which returns tools based on their IDs."""

def get_tool(self, tool_id):
# Returns a MockTool object with basic metadata for testing
return MockTool(tool_id)


def _mock_app(store_by=DEFAULT_OBJECT_STORE_BY):
app = TestApp()
test_object_store_config = TestConfig(store_by=store_by)
app.object_store = test_object_store_config.object_store
app.model.Dataset.object_store = app.object_store

# Add a mocked toolbox attribute for tests requiring tool metadata
app.toolbox = MockToolbox()

return app


Expand Down
Loading