Merge pull request #2075 from cBioPortal/fix_ci

Fix validation tests in CI
cBioPortal · Nov 11, 2024 · d0c3dc1 · d0c3dc1
2 parents 975fd3b + d4af0e2
commit d0c3dc1
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 78 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -105,7 +105,7 @@ workflows:
   workflow_weekly:
     triggers:
       - schedule:
-          cron: "30 14 * * 1"
+          cron: "0 16 * * 6"
           filters:
             branches:
               only: 

diff --git a/.circleci/install_dependencies.sh b/.circleci/install_dependencies.sh
@@ -1,11 +1,15 @@
 #!/usr/bin/env bash
 # This script installs the dependencies to download and validate all studies
 
+# Upgrade pip
+echo "Upgrading pip..."
+python -m pip install --upgrade pip
+
 # Install python dependencies
-cd ~/repo/.circleci
-sudo pip install -r requirements.txt
+pip install -r requirements.txt
 
 # Install and configure Git LFS
+echo "Installing Git LFS..."
 cd ~/
 wget https://github.com/git-lfs/git-lfs/releases/download/v2.3.4/git-lfs-linux-amd64-2.3.4.tar.gz
 tar -xvf git-lfs-linux-amd64-2.3.4.tar.gz
@@ -14,17 +18,11 @@ sudo ./install.sh
 cd ~/repo
 sudo chown -R circleci .git
 git lfs install --skip-smudge
-
-# Clone datahub master branch
-cd ~/
-git clone --depth 1 -b master https://github.com/cbioportal/cbioportal.git
+rm -rf ~/git-lfs-linux-amd64-2.3.4.tar.gz ~/git-lfs-2.3.4  # tarball was downloaded under ~/, not ~/repo (the cwd here)
 
 # Clone cBioPortal core
 cd ~/
-git clone https://github.com/cBioPortal/cbioportal-core.git
-# install validator dependencies
-sudo pip install -r cbioportal-core/requirements.txt
-
+git clone --depth 1 -b main https://github.com/cBioPortal/cbioportal-core.git
 
 # Make test reports location
 cd ~/

diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt
@@ -1,7 +1,9 @@
 requests==2.27.1
-Jinja2==3.1.4
-mysqlclient==1.3.13
+Jinja2==3.0.3
+mysqlclient==2.1.0
 docker==3.5.0
 urllib3==1.26.19
-gitpython==3.1.41
-PyYAML==5.4
+gitpython==3.1.18
+PyYAML==6.0.1
+markupsafe==2.0.1
+dsnparse==0.1.15
diff --git a/.circleci/validate_all_studies.sh b/.circleci/validate_all_studies.sh
@@ -1,25 +1,31 @@
 #!/usr/bin/env bash
 # This script runs validation on all public studies.
 
-STUDIES_DIR="public/"
+STUDIES_DIRS=("public/" "crdc/gdc/")
+GIT_REMOTE_URL="[email protected]:cbioportal/datahub.git"
+test_reports_location="$HOME/test-reports"
 
-git remote add upstream [email protected]:cbioportal/datahub.git
+git remote add upstream "$GIT_REMOTE_URL"
 git fetch upstream master
 
-git lfs pull -I "public"
-
 num_studies=${#list_of_study_dirs[@]}
 
-test_reports_location="$HOME/test-reports"
-validation_command="$HOME/cbioportal-core/scripts/importer/./validateStudies.py -d $HOME/repo/public/ -p $HOME/repo/.circleci/portalinfo -html $test_reports_location"
-echo $'\nExecuting: '; echo $validation_command
-if sh -c "$validation_command" ; then
-    echo "Tests passed successfully"
-    exit 0
-else
-    echo "Errors found"
-    # move errors to ERRORS/ folder:
-    erred_studies=`grep -rnlz $test_reports_location -e 'Validation status.*Failed' `
-    mv $erred_studies $test_reports_location/ERRORS
-    exit 1
-fi
+for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do
+    git lfs pull -I "$STUDIES_DIR"
+    validation_command="$HOME/cbioportal-core/scripts/importer/./validateStudies.py -d $HOME/repo/$STUDIES_DIR -p $HOME/repo/.circleci/portalinfo -html $test_reports_location"
+    echo $'\nExecuting: '; echo $validation_command
+    if sh -c "$validation_command" ; then
+        echo "Tests passed successfully for $STUDIES_DIR"
+        EXIT_STATUS=${EXIT_STATUS:-0}  # do not overwrite a failure recorded for an earlier directory
+    else
+        echo "Errors found"
+        EXIT_STATUS=1  # a failed validation fails the job even if no report file matches below
+        # move errors to ERRORS/ folder:
+        erred_studies=$(grep -rnlz "$test_reports_location" -e 'Failed')
+        if [ -n "$erred_studies" ]; then
+            mv $erred_studies $test_reports_location/ERRORS
+        fi
+    fi
+done
+
+exit "${EXIT_STATUS:-0}"
diff --git a/.circleci/validate_changed_studies.sh b/.circleci/validate_changed_studies.sh
@@ -1,68 +1,76 @@
 #!/usr/bin/env bash
 # This script detects the studies that were changed and triggers the validation accordingly
 
-STUDIES_DIR="public/"
+STUDIES_DIRS=("public/" "crdc/gdc/")
+REPO_DIR="$HOME/repo/"
+TEST_REPORTS_LOCATION="$HOME/test-reports"
+ERRORS_DIR="$TEST_REPORTS_LOCATION/ERRORS"
+LOG_DIR="$TEST_REPORTS_LOCATION/logs"
+VALIDATION_SCRIPT="$HOME/cbioportal-core/scripts/importer/validateStudies.py"
+GIT_REMOTE_URL="[email protected]:cbioportal/datahub.git"
+MAX_THREADS=7
 
-git remote add upstream [email protected]:cbioportal/datahub.git
+git remote add upstream "$GIT_REMOTE_URL"
 git fetch upstream master
 
-files_changing=`git diff --name-only --diff-filter=ACMRU upstream/master`
+mkdir -p "$LOG_DIR"
+
+files_changing=$(git diff --name-only --diff-filter=ACMRU upstream/master)
 list_of_study_dirs=()
 
-for file_changing in $files_changing
-do
-    #echo "file > [$file_changing]"
+for file_changing in $files_changing; do
     # if file is part of studies_dir, store its directory path (except case_lists)
-    if [[ $file_changing = *$STUDIES_DIR* ]] && [[ $file_changing != *".htm"* ]]; then
-      echo "study file changing > [$file_changing]"
-      dir_name=`dirname $file_changing`
-      # match case_list*, caselist* as a case list dir (actually only case_lists is valid, 
-      # but this is up to validation script to flag):
-      if [[ $dir_name != *"/case_list"* ]] && [[ $dir_name != *"/caselist"* ]] && [[ $dir_name != *"/archived_files"* ]] && [[ $dir_name != *"/gene_sets"* ]] && [[ $dir_name != *"/normals"* ]]; then
-        echo "study dir > [$dir_name]"
-      else
-        # get parent dir:
-        dir_name=`dirname $dir_name`
-        echo "study dir > [$dir_name]"
-      fi
-      found_in_list=`echo ${list_of_study_dirs[@]} | grep $dir_name`
-      if [[ $found_in_list = "" ]]; then
-        echo "adding to list..."
-        list_of_study_dirs+=($dir_name)
-        echo "downloading files from git lfs..."
-        git lfs pull -I "$dir_name/*"
-        git lfs pull -I "$dir_name/case_lists/*"
+    for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do
+      if [[ $file_changing = *$STUDIES_DIR* ]] && [[ $file_changing != *".htm"* ]]; then
+        echo "study file changing > [$file_changing]"
+        dir_name=$(dirname "$file_changing")
+        # match case_list*, caselist* as a case list dir (actually only case_lists is valid, 
+        # but this is up to validation script to flag):
+        if [[ $dir_name != *"/case_list"* ]] && [[ $dir_name != *"/caselist"* ]] && [[ $dir_name != *"/archived_files"* ]] && [[ $dir_name != *"/gene_sets"* ]] && [[ $dir_name != *"/normals"* ]] && [[ $dir_name != *"/validation_reports"* ]]; then
+          echo "study dir > [$dir_name]"
+        else
+          # get parent dir:
+          dir_name=$(dirname "$dir_name")
+          echo "study dir > [$dir_name]"
+        fi
+        if [[ ! " ${list_of_study_dirs[*]} " =~ " $dir_name " ]]; then
+          echo "adding to list..."
+          list_of_study_dirs+=("$dir_name")
+          echo "downloading files from git lfs..."
+          git lfs pull -I "$dir_name/*"
+          git lfs pull -I "$dir_name/case_lists/*"
+        fi
       fi
-    fi
+    done
 done
 num_studies=${#list_of_study_dirs[@]}
 if [[ $num_studies > 0 ]]; then
   echo $'\n====List of studies:====\n'
-  list_csv=`echo ${list_of_study_dirs[@]} | tr ' ' ','`
-  echo $list_csv
+  list_csv=$(printf "%s," "${list_of_study_dirs[@]}" | sed 's/,$//')
+  echo "$list_csv"
 
-  test_reports_location="$HOME/test-reports"
+  mkdir -p "$ERRORS_DIR"
   validation_command=""
   num=0
-  max_threads=7
-  break_num=$(($num_studies / $max_threads + 1))
+  break_num=$((num_studies / MAX_THREADS + 1))
   for study in ${list_csv//,/ }
   do
       # append sleep command between commands
       ((num=num+1))
       mod=$(($num % $break_num))
+      log_file="$LOG_DIR/$(basename $study).log"
       # if [ $mod = 0 ] ; then
       #   validation_command="${validation_command} && sleep $((num*2))"
       # fi
       # append the first study
-      if [ "$validation_command" = "" ] ; then
-        validation_command="($HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study"
+      if [[ -z "$validation_command" ]] ; then
+        validation_command="($VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study  > $log_file 2>&1"
       else
         # run each validation individually in the background
         if [ $mod = 0 ] ; then
-          validation_command="${validation_command}) & ($HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study"
+          validation_command="${validation_command}) & ($VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study > $log_file 2>&1"
         else
-          validation_command="${validation_command} ; $HOME/cbioportal-core/src/main/resources/scripts/importer/./validateStudies.py -d $HOME/repo/ -l $study -p $HOME/repo/.circleci/portalinfo -html $test_reports_location/$study"
+          validation_command="${validation_command} ; $VALIDATION_SCRIPT -d $REPO_DIR -l $study -p $REPO_DIR/.circleci/portalinfo -html $TEST_REPORTS_LOCATION/$study > $log_file 2>&1"
         fi
       fi
   done
@@ -73,25 +81,36 @@ if [[ $num_studies > 0 ]]; then
   while true; do
     wait -n || {
       code="$?"
-      echo "waiting for all processes to finish ...................."
+      echo -e "waiting for all processes to finish...\n\n"
       # exit only when all processes finished
-      if ([[ $code = "127" ]] && exit 0) ; then
+      if (( code == 127 )); then  # compare, not assign: 127 means wait -n has no child jobs left
         break
       fi
     }
-  done;
-
+  done
+
+  for log in "$LOG_DIR"/*.log; do
+    if [[ -f "$log" ]]; then
+      cat "$log"
+      echo -e "\n----------------------------------------------------\n"
+    fi
+  done
+
+  # Remove the log directory
+  if [[ -d "$LOG_DIR" ]]; then
+    rm -rf "$LOG_DIR"
+  fi
+
   # find all studies with error
-  erred_studies=`grep -rnlz $test_reports_location -e 'Validation status.*Failed' `
-  if [[ $? -eq 0 ]]; then
+  erred_studies=$(grep -rl "$TEST_REPORTS_LOCATION" -e 'Failed')
+  if [[ $? -eq 0 ]] && [[ -n "$erred_studies" ]]; then
     echo $'\n====List of error studies:====\n'
-    echo $erred_studies
-    mv $erred_studies $test_reports_location/ERRORS
+    echo "$erred_studies"
+    echo "$erred_studies" | xargs -I {} mv {} "$ERRORS_DIR"
     exit 1
   else
-    echo "All tests passed successfully"
-    exit 0
+    echo "No error studies found."
   fi
 else
   echo "No studies were changed"
-fi
+fi