Skip to content

Commit

Permalink
Fixed non mandatory missing components not autofilled with NaN on dat… (
Browse files Browse the repository at this point in the history
#32)

* Fixed non-mandatory missing components not being autofilled with NaN on datapoints.

* Fixed the error message for missing non-mandatory components. Added tests for missing non-mandatory components.

* Fixed flake errors.

* Re-added readme tests. Pyproject.toml version updated to 1.0.1.

* Refactored fill missing non mandatory components method.

* Refactored fill missing non mandatory components method.

---------

Co-authored-by: Francisco Javier Hernández del Caño <[email protected]>
  • Loading branch information
mla2001 and javihern98 authored Oct 17, 2024
1 parent f789e84 commit 3ab31bd
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "vtlengine"
version = "1.0"
version = "1.0.1"
description = "Run and Validate VTL Scripts"
authors = ["MeaningfulData <[email protected]>"]
license = "AGPL-3.0"
Expand Down
8 changes: 8 additions & 0 deletions src/vtlengine/files/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,16 @@ def _validate_pandas(
) -> pd.DataFrame:
warnings.filterwarnings("ignore", category=FutureWarning)
# Identifier checking

id_names = [comp_name for comp_name, comp in components.items() if comp.role == Role.IDENTIFIER]

missing_columns = [name for name in components.keys() if name not in data.columns.tolist()]
if missing_columns:
for name in missing_columns:
if components[name].nullable is False:
raise SemanticError("0-1-1-10", name=dataset_name, comp_name=name)
data[name] = None

for id_name in id_names:
if data[id_name].isnull().any():
raise SemanticError("0-1-1-4", null_identifier=id_name, name=dataset_name)
Expand Down
205 changes: 205 additions & 0 deletions tests/API/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,47 @@ def test_run_only_persistent(script, data_structures, datapoints, value_domains,
assert result == reference


def test_readme_example():
    """README quick-start check: DS_A := DS_1 * 10 scales the measure column by 10."""
    script = """
    DS_A := DS_1 * 10;
    """

    data_structures = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "Number", "role": "Measure", "nullable": True},
                ],
            }
        ]
    }

    datapoints = {"DS_1": pd.DataFrame({"Id_1": [1, 2, 3], "Me_1": [10, 20, 30]})}

    run_result = run(script=script, data_structures=data_structures, datapoints=datapoints)

    # Build the expected dataset piecewise so each part is easy to inspect.
    expected_components = {
        "Id_1": Component(
            name="Id_1", data_type=DataTypes.Integer, role=Role.IDENTIFIER, nullable=False
        ),
        "Me_1": Component(
            name="Me_1", data_type=DataTypes.Number, role=Role.MEASURE, nullable=True
        ),
    }
    expected_data = pd.DataFrame(
        columns=["Id_1", "Me_1"], index=[0, 1, 2], data=[(1, 100), (2, 200), (3, 300)]
    )

    assert run_result == {
        "DS_A": Dataset(name="DS_A", components=expected_components, data=expected_data)
    }


def test_readme_run():
script = """
DS_A := DS_1 * 10;
Expand Down Expand Up @@ -519,3 +560,167 @@ def test_readme_semantic_error():
# Check output dataset on error message
with pytest.raises(SemanticError, match="DS_A"):
semantic_analysis(script=script, data_structures=data_structures)


def test_non_mandatory_fill_at():
    """A nullable attribute column absent from the datapoints must be autofilled with nulls.

    DS_1 is loaded without ``At_1``; since ``At_1`` is non-mandatory
    (nullable=True), the engine is expected to add it filled with null
    values rather than raising an error.
    """
    script = """
    DS_r := DS_1;
    """

    data_structures = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Id_2", "type": "String", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "String", "role": "Measure", "nullable": True},
                    {"name": "At_1", "type": "String", "role": "Attribute", "nullable": True},
                ],
            }
        ]
    }

    # Note: the "At_1" column is deliberately missing from the input.
    data_df = pd.DataFrame({"Id_1": [1, 1, 2], "Id_2": ['A', 'B', 'A'], "Me_1": ['N', 'N', 'O']})

    datapoints = {"DS_1": data_df}

    run_result = run(script=script, data_structures=data_structures, datapoints=datapoints)

    assert run_result == {
        "DS_r": Dataset(
            name="DS_r",
            components={
                "Id_1": Component(
                    name="Id_1", data_type=DataTypes.Integer, role=Role.IDENTIFIER, nullable=False
                ),
                "Id_2": Component(
                    name="Id_2", data_type=DataTypes.String, role=Role.IDENTIFIER, nullable=False
                ),
                "Me_1": Component(
                    name="Me_1", data_type=DataTypes.String, role=Role.MEASURE, nullable=True
                ),
                "At_1": Component(
                    name="At_1", data_type=DataTypes.String, role=Role.ATTRIBUTE, nullable=True
                ),
            },
            # Single constructor call instead of a redundant DataFrame nested
            # inside another DataFrame(columns=..., index=..., data=...).
            data=pd.DataFrame(
                {
                    "Id_1": [1, 1, 2],
                    "Id_2": ['A', 'B', 'A'],
                    "Me_1": ['N', 'N', 'O'],
                    "At_1": [None, None, None],
                }
            ),
        )
    }


def test_non_mandatory_fill_me():
    """A nullable measure column absent from the datapoints must be autofilled with nulls.

    DS_1 is loaded without ``Me_1``; since ``Me_1`` is non-mandatory
    (nullable=True), the engine is expected to add it filled with null
    values rather than raising an error.
    """
    script = """
    DS_r := DS_1;
    """

    data_structures = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Id_2", "type": "String", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "String", "role": "Measure", "nullable": True},
                    {"name": "At_1", "type": "String", "role": "Attribute", "nullable": True},
                ],
            }
        ]
    }

    # Note: the "Me_1" column is deliberately missing from the input.
    data_df = pd.DataFrame({"Id_1": [1, 1, 2], "Id_2": ['A', 'B', 'A'], "At_1": ['N', 'N', 'O']})

    datapoints = {"DS_1": data_df}

    run_result = run(script=script, data_structures=data_structures, datapoints=datapoints)

    assert run_result == {
        "DS_r": Dataset(
            name="DS_r",
            components={
                "Id_1": Component(
                    name="Id_1", data_type=DataTypes.Integer, role=Role.IDENTIFIER, nullable=False
                ),
                "Id_2": Component(
                    name="Id_2", data_type=DataTypes.String, role=Role.IDENTIFIER, nullable=False
                ),
                "Me_1": Component(
                    name="Me_1", data_type=DataTypes.String, role=Role.MEASURE, nullable=True
                ),
                "At_1": Component(
                    name="At_1", data_type=DataTypes.String, role=Role.ATTRIBUTE, nullable=True
                ),
            },
            # Single constructor call instead of a redundant DataFrame nested
            # inside another DataFrame(columns=..., index=..., data=...).
            data=pd.DataFrame(
                {
                    "Id_1": [1, 1, 2],
                    "Id_2": ['A', 'B', 'A'],
                    "Me_1": [None, None, None],
                    "At_1": ['N', 'N', 'O'],
                }
            ),
        )
    }


def test_mandatory_at_error():
    """A missing mandatory (non-nullable) attribute column must raise SemanticError 0-1-1-10."""
    exception_code = "0-1-1-10"

    script = """
    DS_r := DS_1;
    """

    data_structures = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Id_2", "type": "String", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "String", "role": "Measure", "nullable": True},
                    {"name": "At_1", "type": "String", "role": "Attribute", "nullable": False},
                ],
            }
        ]
    }

    # Note: the mandatory "At_1" column is deliberately missing from the input.
    data_df = pd.DataFrame({"Id_1": [1, 1, 2], "Id_2": ['A', 'B', 'A'], "Me_1": ['N', 'N', 'O']})

    datapoints = {"DS_1": data_df}

    with pytest.raises(SemanticError) as context:
        run(script=script, data_structures=data_structures, datapoints=datapoints)
    # Assert directly with a failure message instead of print-then-assert,
    # so the mismatch is shown in the assertion report itself.
    actual_code = str(context.value.args[1])
    assert actual_code == exception_code, f"{exception_code} != {actual_code}"


def test_mandatory_me_error():
    """A missing mandatory (non-nullable) measure column must raise SemanticError 0-1-1-10."""
    exception_code = "0-1-1-10"

    script = """
    DS_r := DS_1;
    """

    data_structures = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Id_2", "type": "String", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "String", "role": "Measure", "nullable": False},
                    {"name": "At_1", "type": "String", "role": "Attribute", "nullable": True},
                ],
            }
        ]
    }

    # Note: the mandatory "Me_1" column is deliberately missing from the input.
    data_df = pd.DataFrame({"Id_1": [1, 1, 2], "Id_2": ['A', 'B', 'A'], "At_1": ['N', 'N', 'O']})

    datapoints = {"DS_1": data_df}

    with pytest.raises(SemanticError) as context:
        run(script=script, data_structures=data_structures, datapoints=datapoints)
    # Assert directly with a failure message instead of print-then-assert,
    # so the mismatch is shown in the assertion report itself.
    actual_code = str(context.value.args[1])
    assert actual_code == exception_code, f"{exception_code} != {actual_code}"

0 comments on commit 3ab31bd

Please sign in to comment.