From e3905cf3dcd8aacdd9cd2ded3f6956ade955c350 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Mon, 26 Aug 2024 10:41:01 -0700
Subject: [PATCH] [FDS-2127] Update URL validation to use requests.options to
verify connectivity (#1472)
* Update URL validation to use requests.options to verify connectivity
---
README.md | 25 +++++--
schematic/models/validate_attribute.py | 26 +++----
tests/conftest.py | 7 ++
tests/integration/test_validate_attribute.py | 75 ++++++++++++++++++++
tests/test_validation.py | 14 +---
5 files changed, 116 insertions(+), 31 deletions(-)
create mode 100644 tests/integration/test_validate_attribute.py
diff --git a/README.md b/README.md
index 2849dc4fa..cf1cd96f6 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,28 @@
[![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2FSage-Bionetworks%2Fschematic%2Fbadge%3Fref%3Ddevelop&style=flat)](https://actions-badge.atrox.dev/Sage-Bionetworks/schematic/goto?ref=develop) [![Documentation Status](https://readthedocs.org/projects/sage-schematic/badge/?version=develop)](https://sage-schematic.readthedocs.io/en/develop/?badge=develop) [![PyPI version](https://badge.fury.io/py/schematicpy.svg)](https://badge.fury.io/py/schematicpy)
# Table of contents
+- [Schematic](#schematic)
+- [Table of contents](#table-of-contents)
- [Introduction](#introduction)
- [Installation](#installation)
- [Installation Requirements](#installation-requirements)
- [Installation guide for Schematic CLI users](#installation-guide-for-schematic-cli-users)
- [Installation guide for developers/contributors](#installation-guide-for-developerscontributors)
+ - [Development environment setup](#development-environment-setup)
+ - [Development process instruction](#development-process-instruction)
+ - [Example For REST API ](#example-for-rest-api-)
+ - [Use file path of `config.yml` to run API endpoints:](#use-file-path-of-configyml-to-run-api-endpoints)
+ - [Use content of `config.yml` and `schematic_service_account_creds.json`as an environment variable to run API endpoints:](#use-content-of-configyml-and-schematic_service_account_credsjsonas-an-environment-variable-to-run-api-endpoints)
+ - [Example For Schematic on mac/linux ](#example-for-schematic-on-maclinux-)
+ - [Example For Schematic on Windows ](#example-for-schematic-on-windows-)
- [Other Contribution Guidelines](#other-contribution-guidelines)
- - [Update readthedocs documentation](#update-readthedocs-documentation)
+ - [Updating readthedocs documentation](#updating-readthedocs-documentation)
+ - [Update toml file and lock file](#update-toml-file-and-lock-file)
+ - [Reporting bugs or feature requests](#reporting-bugs-or-feature-requests)
- [Command Line Usage](#command-line-usage)
- [Testing](#testing)
- [Updating Synapse test resources](#updating-synapse-test-resources)
-- [Code Style](#code-style)
+- [Code style](#code-style)
- [Contributors](#contributors)
# Introduction
@@ -90,13 +101,15 @@ This command will install the dependencies based on what we specify in poetry.lo
*Note*: If you won't interact with Synapse, please ignore this section.
There are two main configuration files that need to be edited:
-config.yml
-and [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/v2.3.0-rc/synapseclient/.synapseConfig)
+- config.yml
+- [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/master/synapseclient/.synapseConfig)
Configure .synapseConfig File
-Download a copy of the ``.synapseConfig`` file, open the file in the
-editor of your choice and edit the `username` and `authtoken` attribute under the `authentication` section
+Download a copy of the ``.synapseConfig`` file, open the file in the editor of your
+choice and edit the `username` and `authtoken` attributes under the `authentication`
+section. **Note:** You must place the file at the root of the project, i.e.
+`{project_root}/.synapseConfig`, in order for any authenticated tests to work.
*Note*: You could also visit [configparser](https://docs.python.org/3/library/configparser.html#module-configparser>) doc to see the format that `.synapseConfig` must have. For instance:
>[authentication]
username = ABC
authtoken = abc
diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py
index 74f5c2db2..a4f79b036 100644
--- a/schematic/models/validate_attribute.py
+++ b/schematic/models/validate_attribute.py
@@ -1,4 +1,3 @@
-import builtins
import logging
import re
from copy import deepcopy
@@ -6,12 +5,11 @@
# allows specifying explicit variable types
from typing import Any, Literal, Optional, Union
-from urllib import error
from urllib.parse import urlparse
-from urllib.request import Request, urlopen
import numpy as np
import pandas as pd
+import requests
from jsonschema import ValidationError
from synapseclient.core.exceptions import SynapseNoCredentialsError
@@ -1127,16 +1125,16 @@ def type_validation(
def url_validation(
self,
val_rule: str,
- manifest_col: str,
+ manifest_col: pd.Series,
) -> tuple[list[list[str]], list[list[str]]]:
"""
Purpose:
Validate URL's submitted for a particular attribute in a manifest.
Determine if the URL is valid and contains attributes specified in the
- schema.
+ schema. Additionally, the server must be reachable to be deemed as valid.
Input:
- val_rule: str, Validation rule
- - manifest_col: pd.core.series.Series, column for a given
+ - manifest_col: pd.Series, column for a given
attribute in the manifest
Output:
This function will return errors when the user input value
@@ -1154,8 +1152,9 @@ def url_validation(
)
if entry_has_value:
# Check if a random phrase, string or number was added and
- # log the appropriate error. Specifically, Raise an error if the value added is not a string or no part
- # of the string can be parsed as a part of a URL.
+ # log the appropriate error. Specifically, Raise an error if the value
+ # added is not a string or no part of the string can be parsed as a
+ # part of a URL.
if not isinstance(url, str) or not (
urlparse(url).scheme
+ urlparse(url).netloc
@@ -1186,10 +1185,13 @@ def url_validation(
try:
# Check that the URL points to a working webpage
# if not log the appropriate error.
- request = Request(url)
- response = urlopen(request)
valid_url = True
- response_code = response.getcode()
+ response = requests.options(url, allow_redirects=True)
+ logger.debug(
+ "Validated URL [URL: %s, status_code: %s]",
+ url,
+ response.status_code,
+ )
except:
valid_url = False
url_error = "invalid_url"
@@ -1207,7 +1209,7 @@ def url_validation(
errors.append(vr_errors)
if vr_warnings:
warnings.append(vr_warnings)
- if valid_url == True:
+ if valid_url:
# If the URL works, check to see if it contains the proper arguments
# as specified in the schema.
for arg in url_args:
diff --git a/tests/conftest.py b/tests/conftest.py
index 9ec4a4ef8..9a0b5789d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,3 +142,10 @@ def temporary_file_copy(request, helpers: Helpers) -> Generator[str, None, None]
# Teardown
if os.path.exists(temp_csv_path):
os.remove(temp_csv_path)
+
+
+@pytest.fixture(name="dmge", scope="function")  # injected as "dmge", rebuilt per test
+def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
+    """Fixture to instantiate a DataModelGraphExplorer object."""
+    dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
+    return dmge
diff --git a/tests/integration/test_validate_attribute.py b/tests/integration/test_validate_attribute.py
new file mode 100644
index 000000000..36c1f9bab
--- /dev/null
+++ b/tests/integration/test_validate_attribute.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+from schematic.models.validate_attribute import ValidateAttribute
+from schematic.schemas.data_model_graph import DataModelGraphExplorer
+
+CHECK_URL_NODE_NAME = "Check URL"
+VALIDATION_RULE_URL = "url"
+
+
+class TestValidateAttribute:
+    """Integration tests for the ValidateAttribute class."""
+
+    def test_url_validation_valid_url(self, dmge: DataModelGraphExplorer) -> None:
+        # GIVEN a well-formed URL (url_validation contacts it with an OPTIONS request):
+        url = "https://github.com/Sage-Bionetworks/schematic"
+
+        # AND a pd.Series named after the "Check URL" attribute that contains this URL
+        content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME)
+
+        # AND a ValidateAttribute instance built from the dmge fixture
+        validator = ValidateAttribute(dmge=dmge)
+
+        # WHEN the column is validated against the "url" rule
+        result = validator.url_validation(
+            val_rule=VALIDATION_RULE_URL, manifest_col=content
+        )
+
+        # THEN both returned lists are empty, i.e. nothing was flagged
+        assert result == ([], [])
+
+    def test_url_validation_valid_doi(self, dmge: DataModelGraphExplorer) -> None:
+        # GIVEN a well-formed DOI link (url_validation follows redirects):
+        url = "https://doi.org/10.1158/0008-5472.can-23-0128"
+
+        # AND a pd.Series named after the "Check URL" attribute that contains this URL
+        content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME)
+
+        # AND a ValidateAttribute instance built from the dmge fixture
+        validator = ValidateAttribute(dmge=dmge)
+
+        # WHEN the column is validated against the "url" rule
+        result = validator.url_validation(
+            val_rule=VALIDATION_RULE_URL, manifest_col=content
+        )
+
+        # THEN both returned lists are empty, i.e. nothing was flagged
+        assert result == ([], [])
+
+    def test_url_validation_invalid_url(self, dmge: DataModelGraphExplorer) -> None:
+        # GIVEN a well-formed URL presumed unreachable (TODO confirm it stays so):
+        url = "http://googlef.com/"
+
+        # AND a pd.Series named after the "Check URL" attribute that contains this URL
+        content = pd.Series(data=[url], name=CHECK_URL_NODE_NAME)
+
+        # AND a ValidateAttribute instance built from the dmge fixture
+        validator = ValidateAttribute(dmge=dmge)
+
+        # WHEN the column is validated against the "url" rule
+        result = validator.url_validation(
+            val_rule=VALIDATION_RULE_URL, manifest_col=content
+        )
+
+        # THEN the first returned list holds one entry for row 2; the second is empty
+        assert result == (
+            [
+                [
+                    "2",
+                    "Check URL",
+                    "For the attribute 'Check URL', on row 2, the URL provided (http://googlef.com/) does not conform to the standards of a URL. Please make sure you are entering a real, working URL as required by the Schema.",
+                    "http://googlef.com/",
+                ]
+            ],
+            [],
+        )
diff --git a/tests/test_validation.py b/tests/test_validation.py
index c2b1268ed..d6b3971be 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -1,11 +1,7 @@
-import itertools
import logging
import os
import re
-from pathlib import Path
-import jsonschema
-import networkx as nx
import pytest
from schematic.models.metadata import MetadataModel
@@ -13,20 +9,12 @@
from schematic.models.validate_manifest import ValidateManifest
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_json_schema import DataModelJSONSchema
-from schematic.schemas.data_model_parser import DataModelParser
-from schematic.store.synapse import SynapseStorage
from schematic.utils.validate_rules_utils import validation_rule_info
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
-@pytest.fixture(name="dmge")
-def DMGE(helpers):
- dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld")
- yield dmge
-
-
def get_metadataModel(helpers, model_name: str):
metadataModel = MetadataModel(
inputMModelLocation=helpers.get_data_path(model_name),
@@ -1075,7 +1063,7 @@ def test_rule_combinations(
class TestValidateAttributeObject:
- def test_login(self, helpers, dmge):
+ def test_login(self, dmge: DataModelGraphExplorer) -> None:
"""
Tests that sequential logins update the view query as necessary
"""