From e3905cf3dcd8aacdd9cd2ded3f6956ade955c350 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:41:01 -0700 Subject: [PATCH] [FDS-2127] Update URL validation to use requests.options to verify connectivity (#1472) * Update URL validation to use requests.options to verify connectivity --- README.md | 25 +++++-- schematic/models/validate_attribute.py | 26 +++---- tests/conftest.py | 7 ++ tests/integration/test_validate_attribute.py | 75 ++++++++++++++++++++ tests/test_validation.py | 14 +--- 5 files changed, 116 insertions(+), 31 deletions(-) create mode 100644 tests/integration/test_validate_attribute.py diff --git a/README.md b/README.md index 2849dc4fa..cf1cd96f6 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,28 @@ [![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2FSage-Bionetworks%2Fschematic%2Fbadge%3Fref%3Ddevelop&style=flat)](https://actions-badge.atrox.dev/Sage-Bionetworks/schematic/goto?ref=develop) [![Documentation Status](https://readthedocs.org/projects/sage-schematic/badge/?version=develop)](https://sage-schematic.readthedocs.io/en/develop/?badge=develop) [![PyPI version](https://badge.fury.io/py/schematicpy.svg)](https://badge.fury.io/py/schematicpy) # Table of contents +- [Schematic](#schematic) +- [Table of contents](#table-of-contents) - [Introduction](#introduction) - [Installation](#installation) - [Installation Requirements](#installation-requirements) - [Installation guide for Schematic CLI users](#installation-guide-for-schematic-cli-users) - [Installation guide for developers/contributors](#installation-guide-for-developerscontributors) + - [Development environment setup](#development-environment-setup) + - [Development process instruction](#development-process-instruction) + - [Example For REST API ](#example-for-rest-api-) + - [Use file path of `config.yml` to run API endpoints:](#use-file-path-of-configyml-to-run-api-endpoints) + - [Use content of `config.yml` and `schematic_service_account_creds.json`as an environment variable to run API endpoints:](#use-content-of-configyml-and-schematic_service_account_credsjsonas-an-environment-variable-to-run-api-endpoints) + - [Example For Schematic on mac/linux ](#example-for-schematic-on-maclinux-) + - [Example For Schematic on Windows ](#example-for-schematic-on-windows-) - [Other Contribution Guidelines](#other-contribution-guidelines) - - [Update readthedocs documentation](#update-readthedocs-documentation) + - [Updating readthedocs documentation](#updating-readthedocs-documentation) + - [Update toml file and lock file](#update-toml-file-and-lock-file) + - [Reporting bugs or feature requests](#reporting-bugs-or-feature-requests) - [Command Line Usage](#command-line-usage) - [Testing](#testing) - [Updating Synapse test resources](#updating-synapse-test-resources) -- [Code Style](#code-style) +- [Code style](#code-style) - [Contributors](#contributors) # Introduction @@ -90,13 +101,15 @@ This command will install the dependencies based on what we specify in poetry.lo *Note*: If you won't interact with Synapse, please ignore this section. There are two main configuration files that need to be edited: -config.yml -and [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/v2.3.0-rc/synapseclient/.synapseConfig) +- config.yml +- [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/master/synapseclient/.synapseConfig) Configure .synapseConfig File -Download a copy of the ``.synapseConfig`` file, open the file in the -editor of your choice and edit the `username` and `authtoken` attribute under the `authentication` section +Download a copy of the ``.synapseConfig`` file, open the file in the editor of your +choice and edit the `username` and `authtoken` attribute under the `authentication` +section. **Note:** You must place the file at the root of the project like +`{project_root}/.synapseConfig` in order for any authenticated tests to work. *Note*: You could also visit [configparser](https://docs.python.org/3/library/configparser.html#module-configparser>) doc to see the format that `.synapseConfig` must have. For instance: >[authentication]
username = ABC
authtoken = abc diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 74f5c2db2..a4f79b036 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -1,4 +1,3 @@ -import builtins import logging import re from copy import deepcopy @@ -6,12 +5,11 @@ # allows specifying explicit variable types from typing import Any, Literal, Optional, Union -from urllib import error from urllib.parse import urlparse -from urllib.request import Request, urlopen import numpy as np import pandas as pd +import requests from jsonschema import ValidationError from synapseclient.core.exceptions import SynapseNoCredentialsError @@ -1127,16 +1125,16 @@ def type_validation( def url_validation( self, val_rule: str, - manifest_col: str, + manifest_col: pd.Series, ) -> tuple[list[list[str]], list[list[str]]]: """ Purpose: Validate URL's submitted for a particular attribute in a manifest. Determine if the URL is valid and contains attributes specified in the - schema. + schema. Additionally, the server must be reachable to be deemed as valid. Input: - val_rule: str, Validation rule - - manifest_col: pd.core.series.Series, column for a given + - manifest_col: pd.Series, column for a given attribute in the manifest Output: This function will return errors when the user input value @@ -1154,8 +1152,9 @@ def url_validation( ) if entry_has_value: # Check if a random phrase, string or number was added and - # log the appropriate error. Specifically, Raise an error if the value added is not a string or no part - # of the string can be parsed as a part of a URL. + # log the appropriate error. Specifically, Raise an error if the value + # added is not a string or no part of the string can be parsed as a + # part of a URL. if not isinstance(url, str) or not ( urlparse(url).scheme + urlparse(url).netloc @@ -1186,10 +1185,13 @@ def url_validation( try: # Check that the URL points to a working webpage # if not log the appropriate error. - request = Request(url) - response = urlopen(request) valid_url = True - response_code = response.getcode() + response = requests.options(url, allow_redirects=True) + logger.debug( + "Validated URL [URL: %s, status_code: %s]", + url, + response.status_code, + ) except: valid_url = False url_error = "invalid_url" @@ -1207,7 +1209,7 @@ def url_validation( errors.append(vr_errors) if vr_warnings: warnings.append(vr_warnings) - if valid_url == True: + if valid_url: # If the URL works, check to see if it contains the proper arguments # as specified in the schema. for arg in url_args: diff --git a/tests/conftest.py b/tests/conftest.py index 9ec4a4ef8..9a0b5789d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -142,3 +142,10 @@ def temporary_file_copy(request, helpers: Helpers) -> Generator[str, None, None] # Teardown if os.path.exists(temp_csv_path): os.remove(temp_csv_path) + + +@pytest.fixture(name="dmge", scope="function") +def DMGE(helpers: Helpers) -> DataModelGraphExplorer: + """Fixture to instantiate a DataModelGraphExplorer object.""" + dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") + return dmge diff --git a/tests/integration/test_validate_attribute.py b/tests/integration/test_validate_attribute.py new file mode 100644 index 000000000..36c1f9bab --- /dev/null +++ b/tests/integration/test_validate_attribute.py @@ -0,0 +1,75 @@ +import pandas as pd + +from schematic.models.validate_attribute import ValidateAttribute +from schematic.schemas.data_model_graph import DataModelGraphExplorer + +CHECK_URL_NODE_NAME = "Check URL" +VALIDATION_RULE_URL = "url" + + +class TestValidateAttribute: + """Integration tests for the ValidateAttribute class.""" + + def test_url_validation_valid_url(self, dmge: DataModelGraphExplorer) -> None: + # GIVEN a valid URL: + url = "https://github.com/Sage-Bionetworks/schematic" + + # AND a pd.core.series.Series that contains this URL + content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) + + # AND a validation attribute + validator = ValidateAttribute(dmge=dmge) + + # WHEN the URL is validated + result = validator.url_validation( + val_rule=VALIDATION_RULE_URL, manifest_col=content + ) + + # THEN the result should pass validation + assert result == ([], []) + + def test_url_validation_valid_doi(self, dmge: DataModelGraphExplorer) -> None: + # GIVEN a valid URL: + url = "https://doi.org/10.1158/0008-5472.can-23-0128" + + # AND a pd.core.series.Series that contains this URL + content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) + + # AND a validation attribute + validator = ValidateAttribute(dmge=dmge) + + # WHEN the URL is validated + result = validator.url_validation( + val_rule=VALIDATION_RULE_URL, manifest_col=content + ) + + # THEN the result should pass validation + assert result == ([], []) + + def test_url_validation_invalid_url(self, dmge: DataModelGraphExplorer) -> None: + # GIVEN an invalid URL: + url = "http://googlef.com/" + + # AND a pd.core.series.Series that contains this URL + content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME) + + # AND a validation attribute + validator = ValidateAttribute(dmge=dmge) + + # WHEN the URL is validated + result = validator.url_validation( + val_rule=VALIDATION_RULE_URL, manifest_col=content + ) + + # THEN the result should not pass validation + assert result == ( + [ + [ + "2", + "Check URL", + "For the attribute 'Check URL', on row 2, the URL provided (http://googlef.com/) does not conform to the standards of a URL. Please make sure you are entering a real, working URL as required by the Schema.", + "http://googlef.com/", + ] + ], + [], + ) diff --git a/tests/test_validation.py b/tests/test_validation.py index c2b1268ed..d6b3971be 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,11 +1,7 @@ -import itertools import logging import os import re -from pathlib import Path -import jsonschema -import networkx as nx import pytest from schematic.models.metadata import MetadataModel @@ -13,20 +9,12 @@ from schematic.models.validate_manifest import ValidateManifest from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_json_schema import DataModelJSONSchema -from schematic.schemas.data_model_parser import DataModelParser -from schematic.store.synapse import SynapseStorage from schematic.utils.validate_rules_utils import validation_rule_info logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) -@pytest.fixture(name="dmge") -def DMGE(helpers): - dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") - yield dmge - - def get_metadataModel(helpers, model_name: str): metadataModel = MetadataModel( inputMModelLocation=helpers.get_data_path(model_name), @@ -1075,7 +1063,7 @@ def test_rule_combinations( class TestValidateAttributeObject: - def test_login(self, helpers, dmge): + def test_login(self, dmge: DataModelGraphExplorer) -> None: """ Tests that sequential logins update the view query as necessary """