Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor kinship to allow other code to initiate import directly #730

Merged
merged 11 commits into from
Mar 12, 2024
13 changes: 13 additions & 0 deletions ehr/api-src/org/labkey/api/ehr/EHRService.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package org.labkey.api.ehr;

import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.labkey.api.data.AbstractTableInfo;
Expand All @@ -31,6 +32,7 @@
import org.labkey.api.ehr.history.*;
import org.labkey.api.ldk.table.ButtonConfigFactory;
import org.labkey.api.module.Module;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.query.BatchValidationException;
import org.labkey.api.query.DetailsURL;
import org.labkey.api.query.ExprColumn;
Expand All @@ -43,6 +45,7 @@
import org.labkey.api.view.ActionURL;
import org.labkey.api.view.template.ClientDependency;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Date;
Expand Down Expand Up @@ -326,4 +329,14 @@ public EHRQCState getQCState(@NotNull Container c)
abstract public List<String> ensureStudyQCStates(Container c, final User u, final boolean commitChanges);

abstract public void registerLabWorkOverrides(Module module, String fromType, LabworkType toType);

/**
* The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree.
* These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint
* that allows other code perform the calculations, save the results as TSVs, and then trigger import here.
*
* A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import
* those results.
*/
abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException;
}
50 changes: 23 additions & 27 deletions ehr/resources/pipeline/kinship/populateInbreeding.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,47 +7,43 @@
# This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against
# the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed
# to run as a daily cron job.


options(error = dump.frames);
library(pedigree);
library(getopt);
library(Matrix);
library(pedigree)
library(getopt)
library(Matrix)
library(dplyr)

spec <- matrix(c(
'inputFile', '-f', 1, "character"
), ncol=4, byrow=TRUE);
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
'inputFile', '-f', 1, 'character'
), ncol=4, byrow=TRUE)
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))

allPed <- read.table(opts$inputFile);
allPed <- read.table(opts$inputFile)
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender')

is.na(allPed$Id)<-which(allPed$Id=="")
is.na(allPed$Dam)<-which(allPed$Dam=="")
is.na(allPed$Sire)<-which(allPed$Sire=="")
is.na(allPed$Gender)<-which(allPed$Gender=="")

df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE);
colnames(df)<-c("id", "id parent1", "id parent2")
allPed$Id[allPed$Id == ""] <- NA
allPed$Dam[allPed$Dam == ""] <- NA
allPed$Sire[allPed$Sire == ""] <- NA
allPed$Gender[allPed$Gender == ""] <- NA

originalIds <-as.data.frame(df[,1,drop=FALSE])
df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE)
originalIds <- df$id
print(paste0('Input IDs: ', nrow(df)))

#this is a function in the pedigree package designed to add missing parents to the dataframe
#see pedigree package documentation for more detail
df <- add.Inds(df);
df <- add.Inds(df)
ord <- orderPed(df)
df <- df[order(ord),]

#use an existing package to calculate inbreeding
ib = calcInbreeding(df);

newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE);
ib <- calcInbreeding(df)

#only calculate inbreeding for Ids at the center
newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id)
newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>%
dplyr::filter(Id %in% originalIds)

if (nrow(newRecords) != length(originalIds)) {
stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords)))
}

# write TSV to disk
print("Output table:");
print(str(newRecords));
write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t");
write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t")
65 changes: 31 additions & 34 deletions ehr/resources/pipeline/kinship/populateKinship.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,67 +7,64 @@
# This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix.
# The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file,
# which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask


#options(echo=TRUE);
options(error = dump.frames);
library(methods);
library(kinship2);
library(getopt);
library(Matrix);
library(dplyr);
library(kinship2)
library(getopt)
library(Matrix)
library(dplyr)

spec <- matrix(c(
#'containerPath', '-c', 1, "character",
#'baseUrl', '-b', 1, "character"
'inputFile', '-f', 1, "character"
), ncol=4, byrow=TRUE);
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
'inputFile', '-f', 1, 'character'
), ncol=4, byrow=TRUE)
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))

allPed <- read.table(opts$inputFile, quote="\"");
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species');
allPed <- read.table(opts$inputFile, quote="\"")
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species')

is.na(allPed$Id)<-which(allPed$Id=="")
is.na(allPed$Dam)<-which(allPed$Dam=="")
is.na(allPed$Sire)<-which(allPed$Sire=="")
is.na(allPed$Gender)<-which(allPed$Gender=="")
allPed$Id[allPed$Id == ""] <- NA
allPed$Dam[allPed$Dam == ""] <- NA
allPed$Sire[allPed$Sire == ""] <- NA
allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown

allPed$Species <- as.character(allPed$Species)
allPed$Species[is.na(allPed$Species)] <- c('Unknown')
allPed$Species <- as.factor(allPed$Species)

# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
newRecords=NULL
if (any(allPed$Species == 'Unknown')) {
print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown'))
}

newRecords <- NULL
for (species in unique(allPed$Species)){
print(paste0('processing species: ', species))
allRecordsForSpecies <- allPed[allPed$Species == species,]
allRecordsForSpecies <- allPed[allPed$Species %in% species,]
print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs'))
if (nrow(allRecordsForSpecies) == 1) {
print('single record, skipping')
next
}

# Add missing parents for accurate kinship calculations
fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))

# Kinship is expecting records to be sorted IAW it's own pedigree function
recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))
recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0))

temp.kin=kinship(recordsForSpecies)
temp.kin <- kinship(recordsForSpecies)

# Add rownames to make matrix symmetric, which is required downstream
rownames(temp.kin) <- colnames(temp.kin)

# Convert kinship matrix to a triplet list of two ids and their coefficient
summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix")))
idList <- rownames(temp.kin)
temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)

# Now filter out parents added for kinship calculation
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))
temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE)

newRecords=rbind(newRecords,temp.tri)
print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
newRecords <- dplyr::bind_rows(newRecords,temp.tri)
}

# write TSV to disk
print("Output table:");
print(str(newRecords));
write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')
10 changes: 8 additions & 2 deletions ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
xtype: 'checkbox',
fieldLabel: 'Is Enabled?',
itemId: 'enabled'
},{
xtype: 'checkbox',
fieldLabel: 'Allow Import During Business Hours?',
itemId: 'allowImportDuringBusinessHours'
},{
xtype: 'checkbox',
fieldLabel: 'Kinship validation?',
itemId: 'kinshipValidation',
listeners : {
listeners: {
render: function(c) {
Ext4.create('Ext.tip.ToolTip', {
target: c.getEl(),
Expand Down Expand Up @@ -94,6 +98,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
this.down('#hourOfDay').setValue(results.hourOfDay);
this.down('#containerPath').setValue(results.containerPath);
this.down('#kinshipValidation').setValue(results.kinshipValidation);
this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours)
},

saveData: function(){
Expand All @@ -104,7 +109,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
containerPath: this.down('#containerPath').getValue(),
enabled: this.down('#enabled').getValue(),
hourOfDay: this.down('#hourOfDay').getValue(),
kinshipValidation: this.down('#kinshipValidation').getValue()
kinshipValidation: this.down('#kinshipValidation').getValue(),
allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue()
},
method : 'POST',
scope: this,
Expand Down
17 changes: 15 additions & 2 deletions ehr/src/org/labkey/ehr/EHRController.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
import org.labkey.api.settings.AppProps;
import org.labkey.api.study.DatasetTable;
import org.labkey.api.util.ExceptionUtil;
import org.labkey.api.util.HtmlString;
import org.labkey.api.util.HtmlStringBuilder;
import org.labkey.api.util.PageFlowUtil;
import org.labkey.api.util.Path;
Expand Down Expand Up @@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath());
return null;
}
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation());
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isAllowImportDuringBusinessHours());

return new ApiSimpleResponse("success", true);
}
Expand Down Expand Up @@ -759,6 +760,7 @@ public static class ScheduleGeneticCalculationForm
private int hourOfDay;

private boolean _kinshipValidation;
private boolean _allowImportDuringBusinessHours;

public boolean isEnabled()
{
Expand Down Expand Up @@ -799,6 +801,16 @@ public void setKinshipValidation(boolean kinshipValidation)
{
_kinshipValidation = kinshipValidation;
}

public boolean isAllowImportDuringBusinessHours()
{
return _allowImportDuringBusinessHours;
}

public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours)
{
_allowImportDuringBusinessHours = allowImportDuringBusinessHours;
}
}

@RequiresPermission(AdminPermission.class)
Expand All @@ -817,6 +829,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
ret.put("enabled", GeneticCalculationsJob.isEnabled());
ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay());
ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation());
ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours());

return new ApiSimpleResponse(ret);
}
Expand Down Expand Up @@ -1250,7 +1263,7 @@ public void validateCommand(Object form, Errors errors)
@Override
public ModelAndView getConfirmView(Object form, BindException errors)
{
return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?");
return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?"));
}

@Override
Expand Down
9 changes: 9 additions & 0 deletions ehr/src/org/labkey/ehr/EHRServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import org.labkey.api.module.ModuleLoader;
import org.labkey.api.module.ModuleProperty;
import org.labkey.api.pipeline.PipeRoot;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.pipeline.PipelineService;
import org.labkey.api.query.BatchValidationException;
import org.labkey.api.query.DetailsURL;
Expand Down Expand Up @@ -78,10 +79,12 @@
import org.labkey.ehr.history.DefaultObservationsDataSource;
import org.labkey.ehr.history.DefaultPregnanciesDataSource;
import org.labkey.ehr.history.LabworkManager;
import org.labkey.ehr.pipeline.GeneticCalculationsImportTask;
import org.labkey.ehr.security.EHRSecurityManager;
import org.labkey.ehr.table.DefaultEHRCustomizer;
import org.labkey.ehr.table.SNOMEDCodesDisplayColumn;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -1061,4 +1064,10 @@ public Map<String, Pair<Module, LabworkType>> getLabWorkOverrides()
{
return _labWorkOverrides;
}

@Override
public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException
{
GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log);
}
}
Loading