LabKey · bbimber · Sep 4, 2023 · Sep 4, 2023 · Sep 4, 2023 · Sep 4, 2023
diff --git a/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java b/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java
@@ -50,4 +50,9 @@ static public void setInstance(EHRDemographicsService instance)
     abstract public AnimalRecord getAnimal(Container c, String id);
 
     abstract public List<AnimalRecord> getAnimals(Container c, Collection<String> ids);
+
+    /**
+     *
+     */
+    abstract public void recalculateForAllIdsInCache(final Container c, final String schema, final String query, final boolean async);
 }
diff --git a/ehr/api-src/org/labkey/api/ehr/EHRService.java b/ehr/api-src/org/labkey/api/ehr/EHRService.java
@@ -15,6 +15,7 @@
  */
 package org.labkey.api.ehr;
 
+import org.apache.logging.log4j.Logger;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 import org.labkey.api.data.AbstractTableInfo;
@@ -31,6 +32,7 @@
 import org.labkey.api.ehr.history.*;
 import org.labkey.api.ldk.table.ButtonConfigFactory;
 import org.labkey.api.module.Module;
+import org.labkey.api.pipeline.PipelineJobException;
 import org.labkey.api.query.BatchValidationException;
 import org.labkey.api.query.DetailsURL;
 import org.labkey.api.query.ExprColumn;
@@ -43,6 +45,7 @@
 import org.labkey.api.view.ActionURL;
 import org.labkey.api.view.template.ClientDependency;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Date;
@@ -324,4 +327,14 @@ public EHRQCState getQCState(@NotNull Container c)
 
     /** The EHR expects certain QC states to exist. This will inspect the current study and create any missing QC states. **/
     abstract public List<String> ensureStudyQCStates(Container c, final User u, final boolean commitChanges);
+
+    /**
+     * The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree.
+     * These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint
+     * that allows other code perform the calculations, save the results as TSVs, and then trigger import here.
+     *
+     * A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import
+     * those results.
+     */
+    abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException;
 }
diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r
@@ -7,47 +7,43 @@
 # This R script will calculate and store inbreeding coefficients for all animals in the colony.  This data will be compared against
 # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed.  This script is designed
 # to run as a daily cron job.
-
-
-options(error = dump.frames);
-library(pedigree);
-library(getopt);
-library(Matrix);
+library(pedigree)
+library(getopt)
+library(Matrix)
 library(dplyr)
 
 spec <- matrix(c(
-'inputFile', '-f', 1, "character"
-), ncol=4, byrow=TRUE);
-opts = getopt(spec, commandArgs(trailingOnly = TRUE));
+  'inputFile', '-f', 1, 'character'
+), ncol=4, byrow=TRUE)
+opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
 
-allPed <- read.table(opts$inputFile);
+allPed <- read.table(opts$inputFile)
 colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender')
 
-is.na(allPed$Id)<-which(allPed$Id=="")
-is.na(allPed$Dam)<-which(allPed$Dam=="")
-is.na(allPed$Sire)<-which(allPed$Sire=="")
-is.na(allPed$Gender)<-which(allPed$Gender=="")
-
-df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE);
-colnames(df)<-c("id", "id parent1", "id parent2")
+allPed$Id[allPed$Id == ""] <- NA
+allPed$Dam[allPed$Dam == ""] <- NA
+allPed$Sire[allPed$Sire == ""] <- NA
+allPed$Gender[allPed$Gender == ""] <- NA
 
-originalIds <-as.data.frame(df[,1,drop=FALSE])
+df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE)
+originalIds <- df$id
+print(paste0('Input IDs: ', nrow(df)))
 
 #this is a function in the pedigree package designed to add missing parents to the dataframe
 #see pedigree package documentation for more detail
-df <- add.Inds(df);
+df <- add.Inds(df)
 ord <- orderPed(df)
 df <- df[order(ord),]
 
 #use an existing package to calculate inbreeding
-ib = calcInbreeding(df);
-
-newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE);
+ib <- calcInbreeding(df)
 
 #only calculate inbreeding for Ids at the center
-newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id)
+newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>%
+  dplyr::filter(Id %in% originalIds)
+
+if (nrow(newRecords) != length(originalIds)) {
+  stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords)))
+}
 
-# write TSV to disk
-print("Output table:");
-print(str(newRecords));
-write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t");
+write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t")
diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r
@@ -7,67 +7,232 @@
 # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony.  This is a large, sparse matrix.
 # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient).  This dataframe is output to a TSV file,
 # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask
-
-
-#options(echo=TRUE);
-options(error = dump.frames);
-library(methods);
-library(kinship2);
-library(getopt);
-library(Matrix);
-library(dplyr);
+library(kinship2)
+library(getopt)
+library(Matrix)
+library(dplyr)
 
 spec <- matrix(c(
-#'containerPath', '-c', 1, "character",
-#'baseUrl', '-b', 1, "character"
-'inputFile', '-f', 1, "character"
-), ncol=4, byrow=TRUE);
-opts = getopt(spec, commandArgs(trailingOnly = TRUE));
+    'inputFile', '-f', 1, 'character',
+    'mergeSpeciesWithHybrids', '-m', 0, 'logical',
+    'performKinshipValidation', '-v', 0, 'logical'
+), ncol=4, byrow=TRUE)
+opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
+
+if (is.null(opts$mergeSpeciesWithHybrids)){
+    opts$mergeSpeciesWithHybrids <- FALSE
+}
 
-allPed <- read.table(opts$inputFile, quote="\"");
-colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species');
+if (is.null(opts$performKinshipValidation)){
+    opts$performKinshipValidation <- FALSE
+}
 
-is.na(allPed$Id)<-which(allPed$Id=="")
-is.na(allPed$Dam)<-which(allPed$Dam=="")
-is.na(allPed$Sire)<-which(allPed$Sire=="")
-is.na(allPed$Gender)<-which(allPed$Gender=="")
+allPed <- read.table(opts$inputFile, quote="\"")
+colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species')
+
+allPed$Id[allPed$Id == ""] <- NA
+allPed$Dam[allPed$Dam == ""] <- NA
+allPed$Sire[allPed$Sire == ""] <- NA
+allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown
 
 allPed$Species <- as.character(allPed$Species)
 allPed$Species[is.na(allPed$Species)] <- c('Unknown')
 allPed$Species <- as.factor(allPed$Species)
 
-# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
-# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
-newRecords=NULL
-for (species in unique(allPed$Species)){
-    print(paste0('processing species: ', species))
-    allRecordsForSpecies <- allPed[allPed$Species == species,]
+if (any(allPed$Species == 'Unknown')) {
+    print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown'))
+}
+
+# The purpose of this function is to handle instances where there was cross-breeding.
+# While this is probably rare, it can occur. When this happens, simply merge the entire species together and process as one unit.
+# This ensures that all relevant ancestors from both species are present
+generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) {
+    hybridParents <- dplyr::bind_rows(
+      merge(allPed, allPed, by.x = 'Dam', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Dam, Species.x, Species.y) %>% rename(Parent = Dam) %>% mutate(ParentType = 'Dam'),
+      merge(allPed, allPed, by.x = 'Sire', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Sire, Species.x, Species.y) %>% rename(Parent = Sire) %>% mutate(ParentType = 'Sire')
+    )
+
+    if (nrow(hybridParents) > 0) {
+        print(paste0('There were ', nrow(hybridParents), ' records with parents of a different species'))
+    }
+
+    if (mergeSpeciesWithHybrids && nrow(hybridParents) > 0) {
+        speciesGroups <- as.list(as.character(unique(allPed$Species)))
+        toMerge <- unique(hybridParents[c('Species.x', 'Species.y')])
+        for (idx in 1:nrow(toMerge)){
+            speciesToCollapse <- as.character(unlist(toMerge[idx,,drop = TRUE]))
+            matchingIdx <- sapply(speciesGroups, function(x){
+                return(length(intersect(speciesToCollapse, x)) > 0)
+            })
+
+            speciesToCollapse <- sort(unique(c(speciesToCollapse, unlist(speciesGroups[matchingIdx]))))
+            speciesGroups <- speciesGroups[!matchingIdx]
+            speciesGroups <- append(speciesGroups, list(speciesToCollapse))
+        }
+
+        print('These species will be merged for processing:')
+        invisible(lapply(speciesGroups, function(x){
+            if (length(x) > 1) {
+                print(x)
+            }
+        }))
+
+        return(speciesGroups)
+    } else {
+        return(unique(allPed$Species[allPed$Species != 'Unknown']))
+    }
+}
+
+validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) {
+    if (nrow(dataToTest) == 0 || nrow(expectedValues) == 0) {
+        return(errorRows)
+    }
+
+    # Generate the reciprocal of relationships as well:
+    if (testReciprocal) {
+        ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient)
+        ret2$Relationship <- sapply(expectedValues$Relationship, function(x){
+            x <- unlist(strsplit(x, split = '/'))
+            if (length(x) == 1) {
+                return(x)
+            }
+
+            return(paste0(x[2], '/', x[1]))
+        })
+        expectedValues <- dplyr::bind_rows(expectedValues, ret2)
+        rm(ret2)
+    }
+
+    dat <- merge(dataToTest, expectedValues, by = c('Id', 'Id2'), all.x = T, all.y = T) %>%
+      filter(!is.na(ExpectedCoefficient)) %>%
+      filter(is.na(coefficient) | coefficient < ExpectedCoefficient)
+
+    if (nrow(dat) == 0) {
+        return(errorRows)
+    }
+
+    if (all(is.null(errorRows))) {
+        return(dat)
+    }
+
+    return(dplyr::bind_rows(errorRows, dat))
+}
+
+validateExpectedKinship <- function(pedDf, dataToTest) {
+    errorRows <- NULL
+
+    # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient
+    self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Relationship = 'Self', ExpectedCoefficient = 0.5)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, self, errorRows, testReciprocal = FALSE)
+    rm(self)
+
+    parentChild <- dplyr::bind_rows(
+      data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Relationship = 'Child/Parent'),
+      data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Relationship = 'Child/Parent')
+    ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, parentChild, errorRows)
+    rm(parentChild)
+
+    grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam'), by.y = c('Id'), all.x = F, all.y = F)
+    grandParentOffspring1 <- dplyr::bind_rows(
+      grandParentOffspring1 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'),
+      grandParentOffspring1 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire')
+    ) %>% mutate(ExpectedCoefficient = 0.125)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring1, errorRows)
+    rm(grandParentOffspring1)
+
+    grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire'), by.y = c('Id'), all.x = F, all.y = F)
+    grandParentOffspring2 <- dplyr::bind_rows(
+      grandParentOffspring2 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'),
+      grandParentOffspring2 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire')
+    ) %>% mutate(ExpectedCoefficient = 0.125)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring2, errorRows)
+    rm(grandParentOffspring2)
+
+    fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam'), all.x = F, all.y = F) %>%
+      select(Id.x, Id.y) %>%
+      rename(Id = Id.x, Id2 = Id.y) %>%
+      filter(Id != Id2) %>%
+      mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, fullSibs, errorRows)
+    rm(fullSibs)
+
+    halfSibs1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf[!is.na(pedDf$Dam),], by = c('Dam'), all.x = F, all.y = F) %>%
+      filter(Sire.x != Sire.y) %>%
+      select(Id.x, Id.y) %>%
+      rename(Id = Id.x, Id2 = Id.y) %>%
+      filter(Id != Id2) %>%
+      mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs1, errorRows)
+    rm(halfSibs1)
+
+    halfSibs2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Sire),], by = c('Sire'), all.x = F, all.y = F) %>%
+      filter(Dam.x != Dam.y) %>%
+      select(Id.x, Id.y) %>%
+      rename(Id = Id.x, Id2 = Id.y) %>%
+      filter(Id != Id2) %>%
+      mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125)
+    errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs2, errorRows)
+    rm(halfSibs2)
+
+    return(errorRows)
+}
+
+speciesToProcess <- generateSpeciesToProcess(allPed, opts$mergeSpeciesWithHybrids)
+
+newRecords <- NULL
+for (speciesSet in speciesToProcess){
+    allRecordsForSpecies <- allPed[allPed$Species %in% speciesSet,]
+    print(paste0('Processing species set: ', paste0(speciesSet, collapse = ','), ', with ', nrow(allRecordsForSpecies), ' IDs'))
+    if (nrow(allRecordsForSpecies) == 1) {
+        print('single record, skipping')
+        newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species))
+        next
+    }
+
+    pctMissingSex <- sum(allRecordsForSpecies$Gender > 2) / nrow(allRecordsForSpecies)
+    if (pctMissingSex > 0.25) {
+        paste0('More than 25% of this species group are missing sex and cannot be processed by fixParents(), skipping')
+        newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species))
+        next
+    }
 
     # Add missing parents for accurate kinship calculations
     fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))
 
     # Kinship is expecting records to be sorted IAW it's own pedigree function
-    recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))
+    recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0))
 
-    temp.kin=kinship(recordsForSpecies)
+    temp.kin <- kinship(recordsForSpecies)
 
     # Add rownames to make matrix symmetric, which is required downstream
     rownames(temp.kin) <- colnames(temp.kin)
 
     # Convert kinship matrix to a triplet list of two ids and their coefficient
-    summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
+    summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix")))
     idList <- rownames(temp.kin)
-    temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
+    temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
 
     # Now filter out parents added for kinship calculation
     temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
     temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))
-
-    newRecords=rbind(newRecords,temp.tri)
-    print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
+    temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE)
+
+    newRecords <- dplyr::bind_rows(newRecords,temp.tri)
+
+    # NOTE: perform per-species to save memory
+    if (opts$performKinshipValidation) {
+        print('Validating coefficients against expected values')
+        errorRows <- validateExpectedKinship(allRecordsForSpecies, temp.tri)
+        if (!all(is.null(errorRows))) {
+            fileName <- paste0('kinshipErrors_', paste0(speciesSet, collapse = '.'), '.txt')
+            print(paste0('There were unexpected kinship values! See the file ', fileName, ' for more information'))
+            write.table(newRecords, file = fileName, row.names = FALSE, quote = FALSE, sep = '\t')
+        } else {
+            print('All coefficients were within expected ranges from predicted values')
+        }
+    }
 }
 
 # write TSV to disk
-print("Output table:");
-print(str(newRecords));
-write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
+write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')