From e9007f3eaf3dd7c322a9fb558aba1b274cc497a7 Mon Sep 17 00:00:00 2001 From: rmadupuri Date: Tue, 24 Sep 2024 15:26:59 -0400 Subject: [PATCH] fix validation tests --- .circleci/config.yml | 2 +- .circleci/install_dependencies.sh | 18 ++--- .circleci/requirements.txt | 10 ++- .circleci/validate_all_studies.sh | 40 ++++++---- .circleci/validate_changed_studies.sh | 111 +++++++++++++++----------- 5 files changed, 103 insertions(+), 78 deletions(-) mode change 100644 => 100755 .circleci/config.yml mode change 100644 => 100755 .circleci/requirements.txt diff --git a/.circleci/config.yml b/.circleci/config.yml old mode 100644 new mode 100755 index 7d994759bb..bb911a8481 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -105,7 +105,7 @@ workflows: workflow_weekly: triggers: - schedule: - cron: "30 14 * * 1" + cron: "0 16 * * 6" filters: branches: only: diff --git a/.circleci/install_dependencies.sh b/.circleci/install_dependencies.sh index 1e24ae3a9c..781e2504ae 100755 --- a/.circleci/install_dependencies.sh +++ b/.circleci/install_dependencies.sh @@ -1,11 +1,15 @@ #!/usr/bin/env bash # This script installs the dependencies to download and validate all studies +# Upgrade pip +echo "Upgrading pip..." +python -m pip install --upgrade pip + # Install python dependencies -cd ~/repo/.circleci -sudo pip install -r requirements.txt +pip install -r requirements.txt # Install and configure Git LFS +echo "Installing Git LFS..." cd ~/ wget https://github.com/git-lfs/git-lfs/releases/download/v2.3.4/git-lfs-linux-amd64-2.3.4.tar.gz tar -xvf git-lfs-linux-amd64-2.3.4.tar.gz @@ -14,17 +18,11 @@ sudo ./install.sh cd ~/repo sudo chown -R circleci .git git lfs install --skip-smudge - -# Clone datahub master branch -cd ~/ -git clone --depth 1 -b master https://github.com/cbioportal/cbioportal.git +rm -rf git-lfs-linux-amd64-2.3.4.tar.gz git-lfs-2.3.4 # Clone cBioPortal core cd ~/ -git clone https://github.com/cBioPortal/cbioportal-core.git -# install validator dependencies -sudo pip install -r cbioportal-core/requirements.txt - +git clone --depth 1 -b main https://github.com/cBioPortal/cbioportal-core.git # Make test reports location cd ~/ diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt old mode 100644 new mode 100755 index a5ed6da257..b1dd353252 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -1,7 +1,9 @@ requests==2.27.1 -Jinja2==3.1.4 -mysqlclient==1.3.13 +Jinja2==3.0.3 +mysqlclient==2.1.0 docker==3.5.0 urllib3==1.26.19 -gitpython==3.1.41 -PyYAML==5.4 \ No newline at end of file +gitpython==3.1.18 +PyYAML==6.0.1 +markupsafe==2.0.1 +dsnparse==0.1.15 \ No newline at end of file diff --git a/.circleci/validate_all_studies.sh b/.circleci/validate_all_studies.sh index bae7660556..61c7c798c9 100755 --- a/.circleci/validate_all_studies.sh +++ b/.circleci/validate_all_studies.sh @@ -1,25 +1,31 @@ #!/usr/bin/env bash # This script runs validation on all public studies. -STUDIES_DIR="public/" +STUDIES_DIRS=("public/" "crdc/gdc/") +GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git" +test_reports_location="$HOME/test-reports" -git remote add upstream git@github.com:cbioportal/datahub.git +git remote add upstream "$GIT_REMOTE_URL" git fetch upstream master -git lfs pull -I "public" - num_studies=${#list_of_study_dirs[@]} -test_reports_location="$HOME/test-reports" -validation_command="$HOME/cbioportal-core/scripts/importer/./validateStudies.py -d $HOME/repo/public/ -p $HOME/repo/.circleci/portalinfo -html $test_reports_location" -echo $'\nExecuting: '; echo $validation_command -if sh -c "$validation_command" ; then - echo "Tests passed successfully" - exit 0 -else - echo "Errors found" - # move errors to ERRORS/ folder: - erred_studies=`grep -rnlz $test_reports_location -e 'Validation status.*Failed' ` - mv $erred_studies $test_reports_location/ERRORS - exit 1 -fi +for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do + git lfs pull -I "STUDIES_DIR" + validation_command="$HOME/cbioportal-core/scripts/importer/./validateStudies.py -d $HOME/repo/$STUDIES_DIR -p $HOME/repo/.circleci/portalinfo -html $test_reports_location" + echo $'\nExecuting: '; echo $validation_command + if sh -c "$validation_command" ; then + echo "Tests passed successfully for $STUDIES_DIR" + EXIT_STATUS=0 + else + echo "Errors found" + # move errors to ERRORS/ folder: + erred_studies=$(grep -rnlz "$test_reports_location" -e 'Failed') + if [ -n "$erred_studies" ]; then + mv $erred_studies $test_reports_location/ERRORS + EXIT_STATUS=1 + fi + fi +done + +exit "$EXIT_STATUS" \ No newline at end of file diff --git a/.circleci/validate_changed_studies.sh b/.circleci/validate_changed_studies.sh index 2ff22164ef..064a888aa9 100755 --- a/.circleci/validate_changed_studies.sh +++ b/.circleci/validate_changed_studies.sh @@ -1,68 +1,76 @@ #!/usr/bin/env bash # This script detects the studies that were changed and triggers the validation accordingly -STUDIES_DIR="public/" +STUDIES_DIRS=("public/" "crdc/gdc/") +REPO_DIR="$HOME/repo/" +TEST_REPORTS_LOCATION="$HOME/test-reports" +ERRORS_DIR="$TEST_REPORTS_LOCATION/ERRORS" +LOG_DIR="$TEST_REPORTS_LOCATION/logs" +VALIDATION_SCRIPT="$HOME/cbioportal-core/scripts/importer/validateStudies.py" +GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git" +MAX_THREADS=7 -git remote add upstream git@github.com:cbioportal/datahub.git +git remote add upstream "$GIT_REMOTE_URL" git fetch upstream master -files_changing=`git diff --name-only --diff-filter=ACMRU upstream/master` +mkdir -p "$LOG_DIR" + +files_changing=$(git diff --name-only --diff-filter=ACMRU upstream/master) list_of_study_dirs=() -for file_changing in $files_changing -do - #echo "file > [$file_changing]" +for file_changing in $files_changing; do # if file is part of studies_dir, store its directory path (except case_lists) - if [[ $file_changing = *$STUDIES_DIR* ]] && [[ $file_changing != *".htm"* ]]; then - echo "study file changing > [$file_changing]" - dir_name=`dirname $file_changing` - # match case_list*, caselist* as a case list dir (actually only case_lists is valid, - # but this is up to validation script to flag): - if [[ $dir_name != *"/case_list"* ]] && [[ $dir_name != *"/caselist"* ]] && [[ $dir_name != *"/archived_files"* ]] && [[ $dir_name != *"/gene_sets"* ]] && [[ $dir_name != *"/normals"* ]]; then - echo "study dir > [$dir_name]" - else - # get parent dir: - dir_name=`dirname $dir_name` - echo "study dir > [$dir_name]" - fi - found_in_list=`echo ${list_of_study_dirs[@]} | grep $dir_name` - if [[ $found_in_list = "" ]]; then - echo "adding to list..." - list_of_study_dirs+=($dir_name) - echo "downloading files from git lfs..." - git lfs pull -I "$dir_name/*" - git lfs pull -I "$dir_name/case_lists/*" + for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do + if [[ $file_changing = *$STUDIES_DIR* ]] && [[ $file_changing != *".htm"* ]]; then + echo "study file changing > [$file_changing]" + dir_name=$(dirname $file_changing) + # match case_list*, caselist* as a case list dir (actually only case_lists is valid, + # but this is up to validation script to flag): + if [[ $dir_name != *"/case_list"* ]] && [[ $dir_name != *"/caselist"* ]] && [[ $dir_name != *"/archived_files"* ]] && [[ $dir_name != *"/gene_sets"* ]] && [[ $dir_name != *"/normals"* ]] && [[ $dir_name != *"/validation_reports"* ]]; then + echo "study dir > [$dir_name]" + else + # get parent dir: + dir_name=`dirname $dir_name` + echo "study dir > [$dir_name]" + fi + if [[ ! " ${list_of_study_dirs[@]} " =~ " $dir_name " ]]; then + echo "adding to list..." + list_of_study_dirs+=("$dir_name") + echo "downloading files from git lfs..." + git lfs pull -I "$dir_name/*" + git lfs pull -I "$dir_name/case_lists/*" + fi fi - fi + done done num_studies=${#list_of_study_dirs[@]} if [[ $num_studies > 0 ]]; then echo $'\n====List of studies:====\n' - list_csv=`echo ${list_of_study_dirs[@]} | tr ' ' ','` - echo $list_csv + list_csv=$(printf "%s," "${list_of_study_dirs[@]}" | sed 's/,$//') + echo "$list_csv" - test_reports_location="$HOME/test-reports" + mkdir -p "$ERRORS_DIR" validation_command="" num=0 - max_threads=7 - break_num=$(($num_studies / $max_threads + 1)) + break_num=$((num_studies / MAX_THREADS + 1)) for study in ${list_csv//,/ } do # append sleep command between commands ((num=num+1)) mod=$(($num % $break_num)) + log_file="$LOG_DIR/$(basename $study).log" # if [ $mod = 0 ] ; then # validation_command="${validation_command} && sleep $((num*2))" # fi # append the first study - if [ "$validation_command" = "" ] ; then - validation_command="($HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study" + if [[ -z "$validation_command" ]] ; then + validation_command="($VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study > $log_file 2>&1" else # run each validation individually in the background if [ $mod = 0 ] ; then - validation_command="${validation_command}) & ($HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study" + validation_command="${validation_command}) & ($VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study > $log_file 2>&1" else - validation_command="${validation_command} ; $HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study" + validation_command="${validation_command} ; $VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study > $log_file 2>&1" fi fi done @@ -73,25 +81,36 @@ if [[ $num_studies > 0 ]]; then while true; do wait -n || { code="$?" - echo "waiting for all processes to finish ...................." + echo -e "waiting for all processes to finish...\n\n" # exit only when all processes finished - if ([[ $code = "127" ]] && exit 0) ; then + if (( code = 127 )); then break fi } - done; - + done + + for log in "$LOG_DIR"/*.log; do + if [[ -f "$log" ]]; then + cat "$log" + echo -e "\n----------------------------------------------------\n" + fi + done + + # Remove the log directory + if [[ -d "$LOG_DIR" ]]; then + rm -rf "$LOG_DIR" + fi + # find all studies with error - erred_studies=`grep -rnlz $test_reports_location -e 'Validation status.*Failed' ` - if [[ $? -eq 0 ]]; then + erred_studies=$(grep -rl "$TEST_REPORTS_LOCATION" -e 'Failed') + if [[ $? -eq 0 ]] && [[ -n "$erred_studies" ]]; then echo $'\n====List of error studies:====\n' - echo $erred_studies - mv $erred_studies $test_reports_location/ERRORS + echo "$erred_studies" + echo "$erred_studies" | xargs -I {} mv {} "$ERRORS_DIR" exit 1 else - echo "All tests passed successfully" - exit 0 + echo "No error studies found." fi else echo "No studies were changed" -fi +fi \ No newline at end of file