Skip to content

Commit

Permalink
Refactor kinship to allow other code to initiate import directly (#730)
Browse files Browse the repository at this point in the history
* Refactor genetics pipeline to allow standalone import of TSVs that were calculated separately
  • Loading branch information
bbimber authored Mar 12, 2024
1 parent d0a9e46 commit adf9436
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 149 deletions.
13 changes: 13 additions & 0 deletions ehr/api-src/org/labkey/api/ehr/EHRService.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package org.labkey.api.ehr;

import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.labkey.api.data.AbstractTableInfo;
Expand All @@ -31,6 +32,7 @@
import org.labkey.api.ehr.history.*;
import org.labkey.api.ldk.table.ButtonConfigFactory;
import org.labkey.api.module.Module;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.query.BatchValidationException;
import org.labkey.api.query.DetailsURL;
import org.labkey.api.query.ExprColumn;
Expand All @@ -43,6 +45,7 @@
import org.labkey.api.view.ActionURL;
import org.labkey.api.view.template.ClientDependency;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Date;
Expand Down Expand Up @@ -326,4 +329,14 @@ public EHRQCState getQCState(@NotNull Container c)
abstract public List<String> ensureStudyQCStates(Container c, final User u, final boolean commitChanges);

abstract public void registerLabWorkOverrides(Module module, String fromType, LabworkType toType);

/**
* The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree.
* These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint
* that allows other code perform the calculations, save the results as TSVs, and then trigger import here.
*
* A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import
* those results.
*/
abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException;
}
50 changes: 23 additions & 27 deletions ehr/resources/pipeline/kinship/populateInbreeding.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,47 +7,43 @@
# This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against
# the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed
# to run as a daily cron job.


options(error = dump.frames);
library(pedigree);
library(getopt);
library(Matrix);
library(pedigree)
library(getopt)
library(Matrix)
library(dplyr)

spec <- matrix(c(
'inputFile', '-f', 1, "character"
), ncol=4, byrow=TRUE);
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
'inputFile', '-f', 1, 'character'
), ncol=4, byrow=TRUE)
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))

allPed <- read.table(opts$inputFile);
allPed <- read.table(opts$inputFile)
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender')

is.na(allPed$Id)<-which(allPed$Id=="")
is.na(allPed$Dam)<-which(allPed$Dam=="")
is.na(allPed$Sire)<-which(allPed$Sire=="")
is.na(allPed$Gender)<-which(allPed$Gender=="")

df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE);
colnames(df)<-c("id", "id parent1", "id parent2")
allPed$Id[allPed$Id == ""] <- NA
allPed$Dam[allPed$Dam == ""] <- NA
allPed$Sire[allPed$Sire == ""] <- NA
allPed$Gender[allPed$Gender == ""] <- NA

originalIds <-as.data.frame(df[,1,drop=FALSE])
df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE)
originalIds <- df$id
print(paste0('Input IDs: ', nrow(df)))

#this is a function in the pedigree package designed to add missing parents to the dataframe
#see pedigree package documentation for more detail
df <- add.Inds(df);
df <- add.Inds(df)
ord <- orderPed(df)
df <- df[order(ord),]

#use an existing package to calculate inbreeding
ib = calcInbreeding(df);

newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE);
ib <- calcInbreeding(df)

#only calculate inbreeding for Ids at the center
newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id)
newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>%
dplyr::filter(Id %in% originalIds)

if (nrow(newRecords) != length(originalIds)) {
stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords)))
}

# write TSV to disk
print("Output table:");
print(str(newRecords));
write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t");
write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t")
65 changes: 31 additions & 34 deletions ehr/resources/pipeline/kinship/populateKinship.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,67 +7,64 @@
# This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix.
# The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file,
# which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask


#options(echo=TRUE);
options(error = dump.frames);
library(methods);
library(kinship2);
library(getopt);
library(Matrix);
library(dplyr);
library(kinship2)
library(getopt)
library(Matrix)
library(dplyr)

spec <- matrix(c(
#'containerPath', '-c', 1, "character",
#'baseUrl', '-b', 1, "character"
'inputFile', '-f', 1, "character"
), ncol=4, byrow=TRUE);
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
'inputFile', '-f', 1, 'character'
), ncol=4, byrow=TRUE)
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))

allPed <- read.table(opts$inputFile, quote="\"");
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species');
allPed <- read.table(opts$inputFile, quote="\"")
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species')

is.na(allPed$Id)<-which(allPed$Id=="")
is.na(allPed$Dam)<-which(allPed$Dam=="")
is.na(allPed$Sire)<-which(allPed$Sire=="")
is.na(allPed$Gender)<-which(allPed$Gender=="")
allPed$Id[allPed$Id == ""] <- NA
allPed$Dam[allPed$Dam == ""] <- NA
allPed$Sire[allPed$Sire == ""] <- NA
allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown

allPed$Species <- as.character(allPed$Species)
allPed$Species[is.na(allPed$Species)] <- c('Unknown')
allPed$Species <- as.factor(allPed$Species)

# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
newRecords=NULL
if (any(allPed$Species == 'Unknown')) {
print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown'))
}

newRecords <- NULL
for (species in unique(allPed$Species)){
print(paste0('processing species: ', species))
allRecordsForSpecies <- allPed[allPed$Species == species,]
allRecordsForSpecies <- allPed[allPed$Species %in% species,]
print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs'))
if (nrow(allRecordsForSpecies) == 1) {
print('single record, skipping')
next
}

# Add missing parents for accurate kinship calculations
fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))

# Kinship is expecting records to be sorted IAW it's own pedigree function
recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))
recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0))

temp.kin=kinship(recordsForSpecies)
temp.kin <- kinship(recordsForSpecies)

# Add rownames to make matrix symmetric, which is required downstream
rownames(temp.kin) <- colnames(temp.kin)

# Convert kinship matrix to a triplet list of two ids and their coefficient
summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix")))
idList <- rownames(temp.kin)
temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)

# Now filter out parents added for kinship calculation
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))
temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE)

newRecords=rbind(newRecords,temp.tri)
print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
newRecords <- dplyr::bind_rows(newRecords,temp.tri)
}

# write TSV to disk
print("Output table:");
print(str(newRecords));
write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')
10 changes: 8 additions & 2 deletions ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
xtype: 'checkbox',
fieldLabel: 'Is Enabled?',
itemId: 'enabled'
},{
xtype: 'checkbox',
fieldLabel: 'Allow Import During Business Hours?',
itemId: 'allowImportDuringBusinessHours'
},{
xtype: 'checkbox',
fieldLabel: 'Kinship validation?',
itemId: 'kinshipValidation',
listeners : {
listeners: {
render: function(c) {
Ext4.create('Ext.tip.ToolTip', {
target: c.getEl(),
Expand Down Expand Up @@ -94,6 +98,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
this.down('#hourOfDay').setValue(results.hourOfDay);
this.down('#containerPath').setValue(results.containerPath);
this.down('#kinshipValidation').setValue(results.kinshipValidation);
this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours)
},

saveData: function(){
Expand All @@ -104,7 +109,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
containerPath: this.down('#containerPath').getValue(),
enabled: this.down('#enabled').getValue(),
hourOfDay: this.down('#hourOfDay').getValue(),
kinshipValidation: this.down('#kinshipValidation').getValue()
kinshipValidation: this.down('#kinshipValidation').getValue(),
allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue()
},
method : 'POST',
scope: this,
Expand Down
17 changes: 15 additions & 2 deletions ehr/src/org/labkey/ehr/EHRController.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
import org.labkey.api.settings.AppProps;
import org.labkey.api.study.DatasetTable;
import org.labkey.api.util.ExceptionUtil;
import org.labkey.api.util.HtmlString;
import org.labkey.api.util.HtmlStringBuilder;
import org.labkey.api.util.PageFlowUtil;
import org.labkey.api.util.Path;
Expand Down Expand Up @@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath());
return null;
}
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation());
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isAllowImportDuringBusinessHours());

return new ApiSimpleResponse("success", true);
}
Expand Down Expand Up @@ -759,6 +760,7 @@ public static class ScheduleGeneticCalculationForm
private int hourOfDay;

private boolean _kinshipValidation;
private boolean _allowImportDuringBusinessHours;

public boolean isEnabled()
{
Expand Down Expand Up @@ -799,6 +801,16 @@ public void setKinshipValidation(boolean kinshipValidation)
{
_kinshipValidation = kinshipValidation;
}

public boolean isAllowImportDuringBusinessHours()
{
return _allowImportDuringBusinessHours;
}

public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours)
{
_allowImportDuringBusinessHours = allowImportDuringBusinessHours;
}
}

@RequiresPermission(AdminPermission.class)
Expand All @@ -817,6 +829,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
ret.put("enabled", GeneticCalculationsJob.isEnabled());
ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay());
ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation());
ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours());

return new ApiSimpleResponse(ret);
}
Expand Down Expand Up @@ -1250,7 +1263,7 @@ public void validateCommand(Object form, Errors errors)
@Override
public ModelAndView getConfirmView(Object form, BindException errors)
{
return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?");
return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?"));
}

@Override
Expand Down
9 changes: 9 additions & 0 deletions ehr/src/org/labkey/ehr/EHRServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import org.labkey.api.module.ModuleLoader;
import org.labkey.api.module.ModuleProperty;
import org.labkey.api.pipeline.PipeRoot;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.pipeline.PipelineService;
import org.labkey.api.query.BatchValidationException;
import org.labkey.api.query.DetailsURL;
Expand Down Expand Up @@ -78,10 +79,12 @@
import org.labkey.ehr.history.DefaultObservationsDataSource;
import org.labkey.ehr.history.DefaultPregnanciesDataSource;
import org.labkey.ehr.history.LabworkManager;
import org.labkey.ehr.pipeline.GeneticCalculationsImportTask;
import org.labkey.ehr.security.EHRSecurityManager;
import org.labkey.ehr.table.DefaultEHRCustomizer;
import org.labkey.ehr.table.SNOMEDCodesDisplayColumn;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -1061,4 +1064,10 @@ public Map<String, Pair<Module, LabworkType>> getLabWorkOverrides()
{
return _labWorkOverrides;
}

@Override
public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException
{
GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log);
}
}
Loading

0 comments on commit adf9436

Please sign in to comment.