Refactor kinship to allow other code to initiate import directly (#730)

* Refactor genetics pipeline to allow standalone import of TSVs that were calculated separately
LabKey · Mar 12, 2024 · adf9436 · adf9436
1 parent d0a9e46
commit adf9436
Show file tree

Hide file tree

Showing 11 changed files with 187 additions and 149 deletions.
diff --git a/ehr/api-src/org/labkey/api/ehr/EHRService.java b/ehr/api-src/org/labkey/api/ehr/EHRService.java
@@ -15,6 +15,7 @@
  */
 package org.labkey.api.ehr;
 
+import org.apache.logging.log4j.Logger;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 import org.labkey.api.data.AbstractTableInfo;
@@ -31,6 +32,7 @@
 import org.labkey.api.ehr.history.*;
 import org.labkey.api.ldk.table.ButtonConfigFactory;
 import org.labkey.api.module.Module;
+import org.labkey.api.pipeline.PipelineJobException;
 import org.labkey.api.query.BatchValidationException;
 import org.labkey.api.query.DetailsURL;
 import org.labkey.api.query.ExprColumn;
@@ -43,6 +45,7 @@
 import org.labkey.api.view.ActionURL;
 import org.labkey.api.view.template.ClientDependency;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Date;
@@ -326,4 +329,14 @@ public EHRQCState getQCState(@NotNull Container c)
     abstract public List<String> ensureStudyQCStates(Container c, final User u, final boolean commitChanges);
 
     abstract public void registerLabWorkOverrides(Module module, String fromType, LabworkType toType);
+
+    /**
+     * The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree.
+     * These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint
+     * that allows other code perform the calculations, save the results as TSVs, and then trigger import here.
+     *
+     * A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import
+     * those results.
+     */
+    abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException;
 }
diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r
@@ -7,47 +7,43 @@
 # This R script will calculate and store inbreeding coefficients for all animals in the colony.  This data will be compared against
 # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed.  This script is designed
 # to run as a daily cron job.
-
-
-options(error = dump.frames);
-library(pedigree);
-library(getopt);
-library(Matrix);
+library(pedigree)
+library(getopt)
+library(Matrix)
 library(dplyr)
 
 spec <- matrix(c(
-'inputFile', '-f', 1, "character"
-), ncol=4, byrow=TRUE);
-opts = getopt(spec, commandArgs(trailingOnly = TRUE));
+  'inputFile', '-f', 1, 'character'
+), ncol=4, byrow=TRUE)
+opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
 
-allPed <- read.table(opts$inputFile);
+allPed <- read.table(opts$inputFile)
 colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender')
 
-is.na(allPed$Id)<-which(allPed$Id=="")
-is.na(allPed$Dam)<-which(allPed$Dam=="")
-is.na(allPed$Sire)<-which(allPed$Sire=="")
-is.na(allPed$Gender)<-which(allPed$Gender=="")
-
-df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE);
-colnames(df)<-c("id", "id parent1", "id parent2")
+allPed$Id[allPed$Id == ""] <- NA
+allPed$Dam[allPed$Dam == ""] <- NA
+allPed$Sire[allPed$Sire == ""] <- NA
+allPed$Gender[allPed$Gender == ""] <- NA
 
-originalIds <-as.data.frame(df[,1,drop=FALSE])
+df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE)
+originalIds <- df$id
+print(paste0('Input IDs: ', nrow(df)))
 
 #this is a function in the pedigree package designed to add missing parents to the dataframe
 #see pedigree package documentation for more detail
-df <- add.Inds(df);
+df <- add.Inds(df)
 ord <- orderPed(df)
 df <- df[order(ord),]
 
 #use an existing package to calculate inbreeding
-ib = calcInbreeding(df);
-
-newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE);
+ib <- calcInbreeding(df)
 
 #only calculate inbreeding for Ids at the center
-newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id)
+newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>%
+  dplyr::filter(Id %in% originalIds)
+
+if (nrow(newRecords) != length(originalIds)) {
+  stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords)))
+}
 
-# write TSV to disk
-print("Output table:");
-print(str(newRecords));
-write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t");
+write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t")
diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r
@@ -7,67 +7,64 @@
 # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony.  This is a large, sparse matrix.
 # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient).  This dataframe is output to a TSV file,
 # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask
-
-
-#options(echo=TRUE);
-options(error = dump.frames);
-library(methods);
-library(kinship2);
-library(getopt);
-library(Matrix);
-library(dplyr);
+library(kinship2)
+library(getopt)
+library(Matrix)
+library(dplyr)
 
 spec <- matrix(c(
-#'containerPath', '-c', 1, "character",
-#'baseUrl', '-b', 1, "character"
-'inputFile', '-f', 1, "character"
-), ncol=4, byrow=TRUE);
-opts = getopt(spec, commandArgs(trailingOnly = TRUE));
+    'inputFile', '-f', 1, 'character'
+), ncol=4, byrow=TRUE)
+opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
 
-allPed <- read.table(opts$inputFile, quote="\"");
-colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species');
+allPed <- read.table(opts$inputFile, quote="\"")
+colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species')
 
-is.na(allPed$Id)<-which(allPed$Id=="")
-is.na(allPed$Dam)<-which(allPed$Dam=="")
-is.na(allPed$Sire)<-which(allPed$Sire=="")
-is.na(allPed$Gender)<-which(allPed$Gender=="")
+allPed$Id[allPed$Id == ""] <- NA
+allPed$Dam[allPed$Dam == ""] <- NA
+allPed$Sire[allPed$Sire == ""] <- NA
+allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown
 
 allPed$Species <- as.character(allPed$Species)
 allPed$Species[is.na(allPed$Species)] <- c('Unknown')
 allPed$Species <- as.factor(allPed$Species)
 
-# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
-# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
-newRecords=NULL
+if (any(allPed$Species == 'Unknown')) {
+    print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown'))
+}
+
+newRecords <- NULL
 for (species in unique(allPed$Species)){
-    print(paste0('processing species: ', species))
-    allRecordsForSpecies <- allPed[allPed$Species == species,]
+    allRecordsForSpecies <- allPed[allPed$Species %in% species,]
+    print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs'))
+    if (nrow(allRecordsForSpecies) == 1) {
+        print('single record, skipping')
+        next
+    }
 
     # Add missing parents for accurate kinship calculations
     fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))
 
     # Kinship is expecting records to be sorted IAW it's own pedigree function
-    recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))
+    recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0))
 
-    temp.kin=kinship(recordsForSpecies)
+    temp.kin <- kinship(recordsForSpecies)
 
     # Add rownames to make matrix symmetric, which is required downstream
     rownames(temp.kin) <- colnames(temp.kin)
 
     # Convert kinship matrix to a triplet list of two ids and their coefficient
-    summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
+    summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix")))
     idList <- rownames(temp.kin)
-    temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
+    temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
 
     # Now filter out parents added for kinship calculation
     temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
     temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))
+    temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE)
 
-    newRecords=rbind(newRecords,temp.tri)
-    print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
+    newRecords <- dplyr::bind_rows(newRecords,temp.tri)
 }
 
 # write TSV to disk
-print("Output table:");
-print(str(newRecords));
-write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
+write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')
diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js
@@ -24,11 +24,15 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
                 xtype: 'checkbox',
                 fieldLabel: 'Is Enabled?',
                 itemId: 'enabled'
+            },{
+                xtype: 'checkbox',
+                fieldLabel: 'Allow Import During Business Hours?',
+                itemId: 'allowImportDuringBusinessHours'
             },{
                 xtype: 'checkbox',
                 fieldLabel: 'Kinship validation?',
                 itemId: 'kinshipValidation',
-                listeners : {
+                listeners: {
                     render: function(c) {
                         Ext4.create('Ext.tip.ToolTip', {
                             target: c.getEl(),
@@ -94,6 +98,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
         this.down('#hourOfDay').setValue(results.hourOfDay);
         this.down('#containerPath').setValue(results.containerPath);
         this.down('#kinshipValidation').setValue(results.kinshipValidation);
+        this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours)
     },
 
     saveData: function(){
@@ -104,7 +109,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
                 containerPath: this.down('#containerPath').getValue(),
                 enabled: this.down('#enabled').getValue(),
                 hourOfDay: this.down('#hourOfDay').getValue(),
-                kinshipValidation: this.down('#kinshipValidation').getValue()
+                kinshipValidation: this.down('#kinshipValidation').getValue(),
+                allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue()
             },
             method : 'POST',
             scope: this,

diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java
@@ -83,6 +83,7 @@
 import org.labkey.api.settings.AppProps;
 import org.labkey.api.study.DatasetTable;
 import org.labkey.api.util.ExceptionUtil;
+import org.labkey.api.util.HtmlString;
 import org.labkey.api.util.HtmlStringBuilder;
 import org.labkey.api.util.PageFlowUtil;
 import org.labkey.api.util.Path;
@@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
                 errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath());
                 return null;
             }
-            GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation());
+            GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isAllowImportDuringBusinessHours());
 
             return new ApiSimpleResponse("success", true);
         }
@@ -759,6 +760,7 @@ public static class ScheduleGeneticCalculationForm
         private int hourOfDay;
 
         private boolean _kinshipValidation;
+        private boolean _allowImportDuringBusinessHours;
 
         public boolean isEnabled()
         {
@@ -799,6 +801,16 @@ public void setKinshipValidation(boolean kinshipValidation)
         {
             _kinshipValidation = kinshipValidation;
         }
+
+        public boolean isAllowImportDuringBusinessHours()
+        {
+            return _allowImportDuringBusinessHours;
+        }
+
+        public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours)
+        {
+            _allowImportDuringBusinessHours = allowImportDuringBusinessHours;
+        }
     }
 
     @RequiresPermission(AdminPermission.class)
@@ -817,6 +829,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
             ret.put("enabled", GeneticCalculationsJob.isEnabled());
             ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay());
             ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation());
+            ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours());
 
             return new ApiSimpleResponse(ret);
         }
@@ -1250,7 +1263,7 @@ public void validateCommand(Object form, Errors errors)
         @Override
         public ModelAndView getConfirmView(Object form, BindException errors)
         {
-            return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony.  Do you want to continue?");
+            return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony.  Do you want to continue?"));
         }
 
         @Override

diff --git a/ehr/src/org/labkey/ehr/EHRServiceImpl.java b/ehr/src/org/labkey/ehr/EHRServiceImpl.java
@@ -50,6 +50,7 @@
 import org.labkey.api.module.ModuleLoader;
 import org.labkey.api.module.ModuleProperty;
 import org.labkey.api.pipeline.PipeRoot;
+import org.labkey.api.pipeline.PipelineJobException;
 import org.labkey.api.pipeline.PipelineService;
 import org.labkey.api.query.BatchValidationException;
 import org.labkey.api.query.DetailsURL;
@@ -78,10 +79,12 @@
 import org.labkey.ehr.history.DefaultObservationsDataSource;
 import org.labkey.ehr.history.DefaultPregnanciesDataSource;
 import org.labkey.ehr.history.LabworkManager;
+import org.labkey.ehr.pipeline.GeneticCalculationsImportTask;
 import org.labkey.ehr.security.EHRSecurityManager;
 import org.labkey.ehr.table.DefaultEHRCustomizer;
 import org.labkey.ehr.table.SNOMEDCodesDisplayColumn;
 
+import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -1061,4 +1064,10 @@ public Map<String, Pair<Module, LabworkType>> getLabWorkOverrides()
     {
         return _labWorkOverrides;
     }
+
+    @Override
+    public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException
+    {
+        GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log);
+    }
 }