Commit 5f9f41f

Merge branch 'main' into multi-direct-issue

doctrino authored Mar 4, 2025
2 parents d35c520 + a3a4785
Showing 15 changed files with 392 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -9,6 +9,6 @@ Please describe the change you have made.
- [ ] Skip

## Changelog
### Added/Changed/Improved/Removed
### Added/Changed/Improved/Removed/Fixed

- My change.
8 changes: 5 additions & 3 deletions .github/workflows/code-quality.yaml
@@ -2,6 +2,7 @@ name: Check Quality and Test

on:
pull_request:
branches: [main]

env:
# Used for linting, docker images, and docs.
@@ -151,7 +152,6 @@ jobs:
coverageFile: coverage.xml
token: ${{ secrets.GITHUB_TOKEN }}
thresholdAll: 0.6

validate-description:
name: Validate PR description
runs-on: ubuntu-latest
@@ -165,8 +165,10 @@ jobs:
uses: snok/install-poetry@v1
with:
version: 1.5.1
- name: Install dependencies
run: poetry install -E all
- name: Install core dependencies
run: |
poetry config virtualenvs.create false
poetry install -E all
- name: Dump description to file
uses: actions/github-script@v7
with:
6 changes: 3 additions & 3 deletions .github/workflows/release.yaml
@@ -33,13 +33,13 @@ jobs:
run: git describe --tags --abbrev=0 > last_version.txt
- name: Bump Version
run: python dev.py bump --verbose
- name: Create CHANGELOG entry
run: python dev.py changelog
- id: version
name: Read the new version
# Read the version from the cognite/neat/_version.py file
run: echo "version=$(sed -n 's/^__version__ = "\(.*\)"/\1/p' cognite/neat/_version.py)" >> $GITHUB_ENV

- name: Create CHANGELOG entry
if: env.version != '0.0.0'
run: python dev.py changelog
- name: Build package
if: env.version != '0.0.0'
run: poetry build
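For reference, a rough Python equivalent of the sed one-liner in the "Read the new version" step above (illustrative only; the workflow itself uses sed, and "0.0.0" is the sentinel the later steps check before building and publishing):

import re
from pathlib import Path

# Read __version__ from cognite/neat/_version.py, mirroring the sed pattern in the workflow.
text = Path("cognite/neat/_version.py").read_text()
match = re.search(r'^__version__ = "(.*)"', text, flags=re.MULTILINE)
version = match.group(1) if match else "0.0.0"  # later steps are skipped when version == "0.0.0"
print(f"version={version}")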
46 changes: 39 additions & 7 deletions cognite/neat/_graph/extractors/_classic_cdf/_base.py
@@ -8,7 +8,7 @@
from collections.abc import Callable, Iterable, Sequence, Set
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Generic, TypeVar
from typing import Any, Generic, TypeVar, cast

from cognite.client import CogniteClient
from cognite.client.data_classes._base import WriteableCogniteResource
@@ -111,6 +111,8 @@ def __init__(
# Used by the ClassicGraphExtractor to log URIRefs
self._log_urirefs = False
self._uriref_by_external_id: dict[str, URIRef] = {}
self.asset_parent_uri_by_id: dict[int, URIRef] = {}
self.asset_parent_uri_by_external_id: dict[str, URIRef] = {}

def extract(self) -> Iterable[Triple]:
"""Extracts an asset with the given asset_id."""
@@ -165,19 +167,36 @@ def _item2triples(self, item: T_CogniteResource) -> list[Triple]:
item = item.as_write()
dumped = item.dump(self.camel_case)
dumped.pop("id", None)
# We have parentId so we don't need parentExternalId
dumped.pop("parentExternalId", None)

if "metadata" in dumped:
triples.extend(self._metadata_to_triples(id_, dumped.pop("metadata")))

triples.extend(self._item2triples_special_cases(id_, dumped))

parent_renaming = {"parent_external_id": "parent_id", "parentExternalId": "parentId"}
parent_key = set(parent_renaming.keys()) | set(parent_renaming.values())

for key, value in dumped.items():
if value is None or value == []:
continue
values = value if isinstance(value, Sequence) and not isinstance(value, str) else [value]
for raw in values:
triples.append((id_, self.namespace[key], self._as_object(raw, key)))
object_ = self._as_object(raw, key)
if object_ is None:
continue
if key in parent_key:
parent_id = cast(URIRef, object_)
if isinstance(raw, str):
self.asset_parent_uri_by_external_id[raw] = parent_id
elif isinstance(raw, int):
self.asset_parent_uri_by_id[raw] = parent_id
# We add a type triple for the parent so that, for example, the parent
# external ID has its prefix stripped when loading.
triples.append((parent_id, RDF.type, self.namespace[self._get_rdf_type()]))
# Parent external ID must be renamed to parent id to match the data model.
key = parent_renaming.get(key, key)

triples.append((id_, self.namespace[key], object_))
return triples

def _item2triples_special_cases(self, id_: URIRef, dumped: dict[str, Any]) -> list[Triple]:
@@ -186,7 +205,7 @@ def _item2triples_special_cases(self, id_: URIRef, dumped: dict[str, Any]) -> list[Triple]:

@classmethod
def _external_id_as_uri_suffix(cls, external_id: str | None) -> str:
if external_id == "":
if external_id == "" or (isinstance(external_id, str) and external_id.strip() == ""):
warnings.warn(NeatValueWarning(f"Empty external id in {cls._default_rdf_type}"), stacklevel=2)
return "empty"
elif external_id == "\x00":
@@ -223,7 +242,7 @@ def _get_rdf_type(self) -> str:
type_ = f"{self.prefix}{type_}"
return self._SPACE_PATTERN.sub("_", type_)

def _as_object(self, raw: Any, key: str) -> Literal | URIRef:
def _as_object(self, raw: Any, key: str) -> Literal | URIRef | None:
"""Return properly formatted object part of s-p-o triple"""
if key in {"data_set_id", "dataSetId"}:
if self.identifier == "externalId" and self.lookup_dataset_external_id:
@@ -237,13 +256,26 @@ def _as_object(self, raw: Any, key: str) -> Literal | URIRef:
]
else:
return self.namespace[f"{InstanceIdPrefix.data_set}{raw}"]
elif key in {"assetId", "asset_id", "assetIds", "asset_ids", "parentId", "rootId", "parent_id", "root_id"}:
elif key in {"parentId", "parent_id", "parentExternalId", "parent_external_id"}:
if self.identifier == "id" and key in {"parent_id", "parentId"}:
return self.namespace[f"{InstanceIdPrefix.asset}{raw}"]
elif (
self.identifier == "externalId"
and key in {"parent_external_id", "parentExternalId"}
and isinstance(raw, str)
):
return self.namespace[f"{InstanceIdPrefix.asset}{self._external_id_as_uri_suffix(raw)}"]
else:
# Skip it
return None
elif key in {"assetId", "asset_id", "assetIds", "asset_ids", "rootId", "root_id"}:
if self.identifier == "id":
return self.namespace[f"{InstanceIdPrefix.asset}{raw}"]
else:
try:
asset_external_id = self._external_id_as_uri_suffix(self.asset_external_ids_by_id[raw])
except KeyError:
warnings.warn(NeatValueWarning(f"Unknown asset id {raw}"), stacklevel=2)
return Literal("Unknown asset", datatype=XSD.string)
else:
return self.namespace[f"{InstanceIdPrefix.asset}{asset_external_id}"]
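To make the parent handling above concrete, here is a minimal illustrative sketch (not part of the commit) of the triples it produces for a child asset with a parentExternalId; the namespace URI and the "Asset_" value of InstanceIdPrefix.asset are assumptions made for the example:

from rdflib import RDF, Namespace

namespace = Namespace("http://example.org/classic/")  # placeholder namespace for illustration
asset_prefix = "Asset_"                                # assumed value of InstanceIdPrefix.asset

child_uri = namespace[f"{asset_prefix}child-001"]
parent_uri = namespace[f"{asset_prefix}parent-001"]    # built from the raw parentExternalId value

triples = [
    # Type triple for the parent, so the prefix is stripped from its external ID on load.
    (parent_uri, RDF.type, namespace["Asset"]),
    # The parentExternalId key is renamed to parentId before the triple is emitted.
    (child_uri, namespace["parentId"], parent_uri),
]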
44 changes: 43 additions & 1 deletion cognite/neat/_graph/extractors/_classic_cdf/_classic.py
@@ -7,7 +7,7 @@

from cognite.client import CogniteClient
from cognite.client.exceptions import CogniteAPIError
from rdflib import Namespace, URIRef
from rdflib import Literal, Namespace, URIRef

from cognite.neat._constants import CLASSIC_CDF_NAMESPACE, DEFAULT_NAMESPACE, get_default_prefixes_and_namespaces
from cognite.neat._graph.extractors._base import KnowledgeGraphExtractor
@@ -136,6 +136,8 @@ def __init__(
self._extracted_data_sets = False
self._asset_external_ids_by_id: dict[int, str] = {}
self._dataset_external_ids_by_id: dict[int, str] = {}
self._asset_parent_uri_by_id: dict[int, URIRef] = {}
self._asset_parent_uri_by_external_id: dict[str, URIRef] = {}
self.neat_prefix_by_predicate_uri: dict[URIRef, str] = {
self._namespace["dataSetId"]: InstanceIdPrefix.data_set,
self._namespace["assetId"]: InstanceIdPrefix.asset,
@@ -197,6 +199,8 @@ def extract(self) -> Iterable[Triple]:
else:
self._extracted_data_sets = True

yield from self._extract_asset_parent_data_sets()

def get_information_rules(self) -> InformationRules:
# To avoid circular imports
from cognite.neat._rules.importers import ExcelImporter
@@ -288,6 +292,10 @@ def _extract_core_start_nodes(self):
if self._identifier == "id":
self._uris_by_external_id_by_type[core_node.resource_type].update(extractor._uriref_by_external_id)

if isinstance(extractor, AssetsExtractor):
self._asset_parent_uri_by_id.update(extractor.asset_parent_uri_by_id)
self._asset_parent_uri_by_external_id.update(extractor.asset_parent_uri_by_external_id)

def _extract_start_node_relationships(self):
for start_resource_type, source_external_ids in self._source_external_ids_by_type.items():
start_type = start_resource_type.removesuffix("_")
@@ -325,6 +333,10 @@ def _extract_start_node_relationships(self):
# the target nodes.
self._relationship_subject_predicate_type_external_id.extend(extractor._target_triples)

if isinstance(extractor, AssetsExtractor):
self._asset_parent_uri_by_id.update(extractor.asset_parent_uri_by_id)
self._asset_parent_uri_by_external_id.update(extractor.asset_parent_uri_by_external_id)

def _extract_core_end_nodes(self):
for core_node in self._classic_node_types:
target_external_ids = self._target_external_ids_by_type[core_node.resource_type]
@@ -372,6 +384,36 @@ def _extract_data_sets(self):
)
yield from DataSetExtractor(data_set_iterator, **self._extractor_args).extract()

def _extract_asset_parent_data_sets(self):
if self._asset_parent_uri_by_id:
for chunk in self._chunk(
list(self._asset_parent_uri_by_id.keys()), description="Extracting asset parent data sets"
):
assets = self._client.assets.retrieve_multiple(ids=list(chunk), ignore_unknown_ids=True)
for asset in assets:
if asset.data_set_id is None:
continue
object_ = (
Literal(self._lookup_dataset(asset.data_set_id))
if self._identifier == "externalId"
else Literal(asset.data_set_id)
)
yield self._asset_parent_uri_by_id[asset.id], self._namespace.dataSetId, object_
if self._asset_parent_uri_by_external_id:
for chunk in self._chunk(
list(self._asset_parent_uri_by_external_id.keys()), description="Extracting asset parent data sets"
):
assets = self._client.assets.retrieve_multiple(external_ids=list(chunk), ignore_unknown_ids=True)
for asset in assets:
if asset.data_set_id is None:
continue
object_ = (
Literal(self._lookup_dataset(asset.data_set_id))
if self._identifier == "externalId"
else Literal(asset.data_set_id)
)
yield self._asset_parent_uri_by_external_id[asset.external_id], self._namespace.dataSetId, object_

def _extract_with_logging_label_dataset(
self, extractor: ClassicCDFBaseExtractor, resource_type: InstanceIdPrefix | None = None
) -> Iterable[Triple]:
116 changes: 115 additions & 1 deletion cognite/neat/_rules/exporters/_rules2excel.py
@@ -20,10 +20,15 @@
SchemaCompleteness,
SheetRow,
)
from cognite.neat._rules.models.data_types import _DATA_TYPE_BY_DMS_TYPE
from cognite.neat._rules.models.dms import DMSMetadata
from cognite.neat._rules.models.dms._rules import DMSRules
from cognite.neat._rules.models.information import InformationMetadata
from cognite.neat._rules.models.information._rules import InformationRules
from cognite.neat._utils.spreadsheet import find_column_with_value
from cognite.neat._utils.spreadsheet import (
find_column_with_value,
generate_data_validation,
)

from ._base import BaseExporter

@@ -54,6 +59,7 @@ class ExcelExporter(BaseExporter[VerifiedRules, Workbook]):

Style = Literal["none", "minimal", "default", "maximal"]
DumpOptions = Literal["user", "last", "reference"]
_helper_sheet_name: str = "_helper"
_main_header_by_sheet_name: ClassVar[dict[str, str]] = {
"Properties": "Definition of Properties",
"Classes": "Definition of Classes",
Expand All @@ -74,6 +80,7 @@ def __init__(
add_empty_rows: bool = False,
hide_internal_columns: bool = True,
include_properties: Literal["same-space", "all"] = "all",
add_drop_downs: bool = True,
):
self.sheet_prefix = sheet_prefix or ""
if styling not in self.style_options:
@@ -85,6 +92,7 @@
self.add_empty_rows = add_empty_rows
self.hide_internal_columns = hide_internal_columns
self.include_properties = include_properties
self.add_drop_downs = add_drop_downs

@property
def description(self) -> str:
@@ -130,8 +138,114 @@ def export(self, rules: VerifiedRules) -> Workbook:
if column_letter:
ws.column_dimensions[column_letter].hidden = True

# Only add drop downs if the rules are DMSRules
if self.add_drop_downs and isinstance(rules, DMSRules):
self._add_drop_downs(workbook)

return workbook

def _add_drop_downs(self, workbook: Workbook, no_rows: int = 100) -> None:
"""Adds drop down menus to specific columns for fast and accurate data entry.
Args:
workbook: Workbook representation of the Excel file.
no_rows: number of rows to add drop down menus. Defaults to 100*100.
!!! note "Why no_rows=100?"
Maximum number of views per data model is 100, thus this value is set accordingly
!!! note "Why defining individual data validation per desired column?
This is due to the internal working of openpyxl. Adding same validation to
different column leads to unexpected behavior when the openpyxl workbook is exported
as and Excel file. Probably, the validation is not copied to the new column,
but instead reference to the data validation object is added.
"""

self._make_helper_sheet(workbook)

# We need to create individual data validations and cannot re-use the same one due
# to the internals of openpyxl
dv_views = generate_data_validation(self._helper_sheet_name, "A", no_header_rows=0, no_rows=no_rows)
dv_containers = generate_data_validation(self._helper_sheet_name, "b", no_header_rows=0, no_rows=no_rows)
dv_value_types = generate_data_validation(self._helper_sheet_name, "C", no_header_rows=0, no_rows=no_rows)

dv_immutable = generate_data_validation(self._helper_sheet_name, "D", no_header_rows=0, no_rows=3)
dv_nullable = generate_data_validation(self._helper_sheet_name, "D", no_header_rows=0, no_rows=3)
dv_is_list = generate_data_validation(self._helper_sheet_name, "D", no_header_rows=0, no_rows=3)
dv_in_model = generate_data_validation(self._helper_sheet_name, "D", no_header_rows=0, no_rows=3)
dv_used_for = generate_data_validation(self._helper_sheet_name, "E", no_header_rows=0, no_rows=3)

workbook["Properties"].add_data_validation(dv_views)
workbook["Properties"].add_data_validation(dv_containers)
workbook["Properties"].add_data_validation(dv_value_types)
workbook["Properties"].add_data_validation(dv_nullable)
workbook["Properties"].add_data_validation(dv_is_list)
workbook["Properties"].add_data_validation(dv_immutable)
workbook["Views"].add_data_validation(dv_in_model)
workbook["Containers"].add_data_validation(dv_used_for)

# we multiply no_rows by 100 since a view can have at most 100 properties
if column := find_column_with_value(workbook["Properties"], "View"):
dv_views.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Properties"], "Container"):
dv_containers.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Properties"], "Value Type"):
dv_value_types.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Properties"], "Nullable"):
dv_nullable.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Properties"], "Is List"):
dv_is_list.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Properties"], "Immutable"):
dv_immutable.add(f"{column}{3}:{column}{no_rows * 100}")

if column := find_column_with_value(workbook["Views"], "In Model"):
dv_in_model.add(f"{column}{3}:{column}{no_rows}")

if column := find_column_with_value(workbook["Containers"], "Used For"):
dv_used_for.add(f"{column}{3}:{column}{no_rows}")

def _make_helper_sheet(self, workbook: Workbook) -> None:
"""This helper sheet is used as source of data for drop down menus creation
!!! note "Why 100 rows?"
The number of rows is set to 100 since this is the maximum number of views
per data model.
"""
workbook.create_sheet(title=self._helper_sheet_name)

for counter, dtype in enumerate(_DATA_TYPE_BY_DMS_TYPE):
workbook[self._helper_sheet_name].cell(row=counter + 1, column=3, value=dtype)

for i in range(100):
workbook[self._helper_sheet_name].cell(
row=i + 1,
column=1,
value=f'=IF(ISBLANK(Views!A{i + 3}), "", Views!A{i + 3})',
)
workbook[self._helper_sheet_name].cell(
row=i + 1,
column=2,
value=f'=IF(ISBLANK(Containers!A{i + 3}), "", Containers!A{i + 3})',
)
workbook[self._helper_sheet_name].cell(
row=counter + i + 2,
column=3,
value=f'=IF(ISBLANK(Views!A{i + 3}), "", Views!A{i + 3})',
)

for i, value in enumerate([True, False, ""]):
workbook[self._helper_sheet_name].cell(row=i + 1, column=4, value=cast(bool | str, value))

for i, value in enumerate(["node", "edge", "all"]):
workbook[self._helper_sheet_name].cell(row=i + 1, column=5, value=value)

workbook[self._helper_sheet_name].sheet_state = "hidden"

def _write_sheets(
self,
workbook: Workbook,
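A hedged usage sketch of the new add_drop_downs option (illustrative, not part of the commit; dms_rules stands in for an existing DMSRules instance, and the import paths mirror the modules referenced above):

from cognite.neat._rules.exporters._rules2excel import ExcelExporter
from cognite.neat._rules.models.dms._rules import DMSRules

dms_rules: DMSRules = ...  # assumed: a verified DMSRules object obtained elsewhere

exporter = ExcelExporter(add_drop_downs=True)  # flag introduced in this commit (defaults to True)
workbook = exporter.export(dms_rules)          # drop-downs are only added when exporting DMSRules
workbook.save("dms_rules.xlsx")                # the hidden "_helper" sheet backs the drop-down lists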
(The remaining 9 changed files are not shown.)