From fe77c6f43d4ecceadd8cddf05a95bfc90bdb7c28 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Mon, 9 Dec 2024 12:32:39 -0500 Subject: [PATCH 01/19] First draft of function to determine tool compatibility --- lib/CXGN/Dataset.pm | 191 +++++++++++++++++++++++++++++++++- lib/SGN/Controller/Dataset.pm | 7 ++ 2 files changed, 197 insertions(+), 1 deletion(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index d48f618267..dff8093759 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -50,6 +50,7 @@ Lukas Mueller package CXGN::Dataset; +use List::Util 'sum'; use Moose; use Moose::Util::TypeConstraints; use Data::Dumper; @@ -330,6 +331,16 @@ has 'include_phenotype_primary_key' => ( default => 0 ); +=head2 tool_compatibility() + +=cut + +has 'tool_compatibility' => ( + isa => 'Maybe[String]', + is => 'rw', + # default => "" +); + has 'breeder_search' => (isa => 'CXGN::BreederSearch', is => 'rw'); sub BUILD { @@ -357,11 +368,12 @@ sub BUILD { $self->years($dataset->{categories}->{years}); $self->breeding_programs($dataset->{categories}->{breeding_programs}); $self->genotyping_protocols($dataset->{categories}->{genotyping_protocols}); - $self->genotyping_projects($dataset->{categories}->{genotyping_projects}); + $self->genotyping_projects($dataset->{categories}->{genotyping_projects}); $self->locations($dataset->{categories}->{locations}); $self->trial_designs($dataset->{categories}->{trial_designs}); $self->trial_types($dataset->{categories}->{trial_types}); $self->category_order($dataset->{category_order}); + $self->tool_compatibility($dataset->{tool_compatibility}); $self->is_live($dataset->{is_live}); $self->is_public($dataset->{is_public}); if ($args->{outliers}) { $self->outliers($args->{outliers})} else { $self->outliers($dataset->{outliers}); } @@ -583,6 +595,7 @@ sub get_dataset_data { $dataref->{category_order} = $self->category_order(); $dataref->{outliers} = $self->outliers() if $self->outliers; $dataref->{outlier_cutoffs} = $self->outlier_cutoffs() if $self->outliers; + $dataref->{tool_compatibility} = $self->tool_compatibility() if $self->tool_compatibility; return $dataref; } @@ -602,6 +615,7 @@ sub _get_dataref { $dataref->{trial_designs} = join(",", @{$self->trial_designs()}) if $self->trial_designs && scalar(@{$self->trial_designs})>0; $dataref->{trial_types} = join(",", @{$self->trial_types()}) if $self->trial_types && scalar(@{$self->trial_types})>0; $dataref->{locations} = join(",", @{$self->locations()}) if $self->locations && scalar(@{$self->locations})>0; + $dataref->{tool_compatibility} = $self->tool_compatibility() if $self->tool_compatibility; return $dataref; } @@ -1187,6 +1201,181 @@ sub retrieve_trial_types { return \@trial_types; } +=head2 retrieve_tool_compatibility + +Returns precalculated tool compatibility as a JSON string, if any. + +=cut + +sub retrieve_tool_compatibility { + my $self = shift; + + if ($self->tool_compatibility) { + return $self->tool_compatibility + } else { + return "(not calculated)"; + } +} + +=head2 store_tool_compatibility + +Uploads a JSON of analysis tools that this dataset can be used with. For example, a dataset with genotype data but no trait phenotypes cannot be used with GWAS. +Note that this function should only ever be called once for a dataset and have the data stored as part of the dataset definition JSON, since retrieving high dimensional phenotype and genotype +data can be time consuming. +Tools that use datasets: + solGS - genotyping data and phenotyping data + PCA - genotyping data + Cluster Analysis - genotyping data and/or phenotyping data + Kinship & Inbreeding - genotyping data + Stability - trials and observed traits, trials must have multiple locations. Traits and accessions need to be represented in all locations, with replicates. Some missing data allowed. + Heritability - trial(s) w/ different designs and observed traits. + Mixed Models - observed traits, fairly flexible + Boxplots - observed traits + GWAS - at least one trial, a genotyping protocol, and observed traits + +=cut + +sub store_tool_compatibility { + my $self = shift; + + my $tool_compatibility = { + 'Boxplotter' => { + 'url' => '/tools/boxplotter', + 'compatible' => 0 + }, + 'Population Structure' => { + 'url' => '/pca/analysis', + 'compatible' => 0 + }, + 'Clustering' => { + 'url' => '/cluster/analysis', + 'types' => [], + 'compatible' => 0 + }, + 'Kinship & Inbreeding' => { + 'url' => '/kinship/analysis', + 'compatible' => 0 + }, + 'Mixed Models' => { + 'url' => '/tools/mixedmodels', + 'compatible' => 0 + }, + 'Stability' => { + 'url' => '/tools/stability', + 'traits' => [], + 'compatible' => 0 + }, + 'Heritability' => { + 'url' => '/tools/heritability', + 'traits' => [], + 'compatible' => 0 + }, + 'solGS' => { + 'url' => '/solgs', + 'traits' => [], + 'compatible' => 0 + }, + 'GWAS' => { + 'url' => '/tools/solgwas', + 'traits' => [], + 'compatible' => 0 + } + }; + + my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref + # listref of listrefs, first index is trialID, second is trial name + my $locations = $self->retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref + # listref of listrefs, first index is locationID, second is location name + my $genotypes = $self->retrieve_genotypes(); # Give it at least 15 seconds! + # listref of hashrefs. Each hashref should describe a stock (accession) genotype measurement, with a list of SNPs/genotypes. + # Relevant hash keys: stock_id, germplasmName, analysisMethod, selected_genotype_hash + my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Unique traits are included in this, so we don't need a call to retrieve_traits + # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list + # of trait observations. Each OU is a stock (plot, accession, etc). Second hashref has all unique traits in the phenotype list. + # Relevant hash keys: observations, trial_id, trial_location_id, germplasm_stock_id, trait_id, trait_name, value + + my $geno_represented_accessions = {}; + foreach my $genotype (@{$genotypes}) { + $geno_represented_accessions->{$genotype->{'stock_id'}} = 1; + } + my $num_genotyped_accessions = scalar(%{$geno_represented_accessions}); + + my $num_markers = [map {$_->{'resultCount'}} @{$genotypes}]; + $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 + + my $obs_by_trait = {}; + my $pheno_represented_accessions = {}; + + foreach my $observation (@{$phenotypes}){ # hash map of count of every trait observation at every location + my $location = $observation->{'trial_location_id'}; + $pheno_represented_accessions->{$observation->{'germplasm_stock_id'}} = 1; + my @traits = map {$_->{'trait_id'}} @{$observation->{'observations'}}; + foreach my $trait (@traits) { + if (!exists($obs_by_trait->{$trait}->{$location})){ + $obs_by_trait->{$trait}->{$location} = 1; + } else { + $obs_by_trait->{$trait}->{$location} += 1; + } + } + } + my $num_phenotyped_accessions = scalar(%{$pheno_represented_accessions}); + + my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both geno and pheno data + + if (scalar @{$phenotypes}) { + $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; + } + + if ($num_genotyped_accessions > 1) { + $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; + $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'Genotype' + } + + if ($num_phenotyped_accessions > 1) { #dont need to go trait by trait for clustering, just need plenty of trait measurements + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'Phenotype'; + + if(@{$self->retrieve_trial_designs()}) { + $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; + } + if ($num_typed_accessions > 1) { + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'GEBV' + } + } + + foreach my $trait (@{$self->retrieve_traits()}){ # For each trait, we need to check for number of observations (plus locations for stability) + my $sum = 0; + my @location_counts = (); + foreach my $location (@{$locations}){ + $sum += $obs_by_trait->{$trait->[0]}->{$location->[0]}; + push @location_counts, $obs_by_trait->{$trait->[0]}->{$location->[0]}; + } + + if ($sum > 0) { # This trait was measured + if (@{$self->trial_designs()}){ + $tool_compatibility->{'Heritability'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; + } + if (scalar(grep {$_ > 0} @location_counts) > 1) { # More than one location had measurements + $tool_compatibility->{'Stability'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; + } + } + + if ($sum > 100 && $num_markers > 100 && $num_typed_accessions > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + $tool_compatibility->{'GWAS'}->{'compatible'} = 1; + push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; + $tool_compatibility->{'solGS'}->{'compatible'} = 1; + push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; + } + } + + return JSON::Any->encode($tool_compatibility); +} + sub get_dataset_definition { my $self = shift; my @criteria; diff --git a/lib/SGN/Controller/Dataset.pm b/lib/SGN/Controller/Dataset.pm index 03d8a52183..a2b27a55a5 100644 --- a/lib/SGN/Controller/Dataset.pm +++ b/lib/SGN/Controller/Dataset.pm @@ -3,6 +3,7 @@ package SGN::Controller::Dataset; use Moose; use CXGN::Dataset; +use Data::Dumper; use strict; use warnings; @@ -26,6 +27,7 @@ sub dataset :Chained('/') Path('dataset') Args(1) { }); }; if ($@) { + print STDERR "Dataset retrieval error: $@ \n"; $c->stash->{template} = 'generic_message.mas'; $c->stash->{message} = "The requested dataset does not exist or has been deleted."; return; @@ -78,6 +80,11 @@ sub dataset :Chained('/') Path('dataset') Args(1) { } $html .= ""; + # print STDERR "=============================\n"; + # my ($phenodata, $unique_traits) = $dataset->retrieve_phenotypes_ref(); + # print STDERR reftype($phenodata); + # print STDERR "\n=============================\n"; + $c->stash->{dataset_name} = $dataset->name(); $c->stash->{dataset_id} = $dataset_id; $c->stash->{dataset_description} = $dataset->description; From 7ae1870549b67fa2362ebf20ebb7038f69f69f55 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Tue, 10 Dec 2024 13:37:17 -0500 Subject: [PATCH 02/19] Polish tool compatibility function; have it correctly work through different genotyping protocols and split results by traits. Store as JSON in database --- lib/CXGN/Dataset.pm | 137 ++++++++++++++++++++++------------ lib/SGN/Controller/Dataset.pm | 3 +- 2 files changed, 89 insertions(+), 51 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index dff8093759..7e90e952a5 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -336,7 +336,7 @@ has 'include_phenotype_primary_key' => ( =cut has 'tool_compatibility' => ( - isa => 'Maybe[String]', + isa => 'Maybe[HashRef]', is => 'rw', # default => "" ); @@ -1241,7 +1241,8 @@ sub store_tool_compatibility { my $tool_compatibility = { 'Boxplotter' => { 'url' => '/tools/boxplotter', - 'compatible' => 0 + 'compatible' => 0, + 'traits' => [] }, 'Population Structure' => { 'url' => '/pca/analysis', @@ -1249,7 +1250,7 @@ sub store_tool_compatibility { }, 'Clustering' => { 'url' => '/cluster/analysis', - 'types' => [], + 'types' => {}, #gets converted to listref later 'compatible' => 0 }, 'Kinship & Inbreeding' => { @@ -1258,7 +1259,8 @@ sub store_tool_compatibility { }, 'Mixed Models' => { 'url' => '/tools/mixedmodels', - 'compatible' => 0 + 'compatible' => 0, + 'traits' => [] }, 'Stability' => { 'url' => '/tools/stability', @@ -1284,96 +1286,133 @@ sub store_tool_compatibility { my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is trialID, second is trial name + my $traits = $self->retrieve_traits(); + my $trial_designs = $self->retrieve_trial_designs(); + my $genotyping_methods = $self->retrieve_genotyping_protocols();# listref of listrefs. First index is + # method ID, second is method name my $locations = $self->retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is locationID, second is location name my $genotypes = $self->retrieve_genotypes(); # Give it at least 15 seconds! # listref of hashrefs. Each hashref should describe a stock (accession) genotype measurement, with a list of SNPs/genotypes. - # Relevant hash keys: stock_id, germplasmName, analysisMethod, selected_genotype_hash - my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Unique traits are included in this, so we don't need a call to retrieve_traits - # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list + # Relevant hash keys: stock_id, germplasmName, analysisMethod, analysisMethodDbId, selected_genotype_hash + my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list # of trait observations. Each OU is a stock (plot, accession, etc). Second hashref has all unique traits in the phenotype list. # Relevant hash keys: observations, trial_id, trial_location_id, germplasm_stock_id, trait_id, trait_name, value - my $geno_represented_accessions = {}; - foreach my $genotype (@{$genotypes}) { - $geno_represented_accessions->{$genotype->{'stock_id'}} = 1; - } - my $num_genotyped_accessions = scalar(%{$geno_represented_accessions}); - - my $num_markers = [map {$_->{'resultCount'}} @{$genotypes}]; - $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 - my $obs_by_trait = {}; my $pheno_represented_accessions = {}; foreach my $observation (@{$phenotypes}){ # hash map of count of every trait observation at every location my $location = $observation->{'trial_location_id'}; $pheno_represented_accessions->{$observation->{'germplasm_stock_id'}} = 1; - my @traits = map {$_->{'trait_id'}} @{$observation->{'observations'}}; - foreach my $trait (@traits) { + my @obs_traits = map {$_->{'trait_id'}} @{$observation->{'observations'}}; + foreach my $trait (@obs_traits) { if (!exists($obs_by_trait->{$trait}->{$location})){ $obs_by_trait->{$trait}->{$location} = 1; } else { - $obs_by_trait->{$trait}->{$location} += 1; + $obs_by_trait->{$trait}->{$location}++; + } + if (!exists($obs_by_trait->{$trait}->{$observation->{'germplasm_stock_id'}})){ + $obs_by_trait->{$trait}->{'accessions'}->{$observation->{'germplasm_stock_id'}} = 1; + } else { + $obs_by_trait->{$trait}->{'accessions'}->{$observation->{'germplasm_stock_id'}} += 1; } } } my $num_phenotyped_accessions = scalar(%{$pheno_represented_accessions}); - my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both geno and pheno data - - if (scalar @{$phenotypes}) { - $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; + my $geno_represented_accessions = {}; #This will store average marker counts for genotype methods and accessions typed by each method + foreach my $genotype (@{$genotypes}) { + $geno_represented_accessions->{$genotype->{'analysisMethodDbId'}}->{'accessions'}->{$genotype->{'stock_id'}} = 1; # each accession genotyped using each method } + foreach my $method (@{$genotyping_methods}){ + my $num_markers = [map {$_->{'resultCount'}} grep {$_->{'analysisMethodDbId'} == $method->[0]} @{$genotypes}]; + $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 + $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; - if ($num_genotyped_accessions > 1) { - $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; - $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; - $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'Genotype' + if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { + $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; + $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; + $tool_compatibility->{'Clustering'}->{'types'}->{'Genotype'} = 1; + } } - if ($num_phenotyped_accessions > 1) { #dont need to go trait by trait for clustering, just need plenty of trait measurements + # my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both pheno and geno data (and the same geno method) + + if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'Phenotype'; - - if(@{$self->retrieve_trial_designs()}) { - $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; - } - if ($num_typed_accessions > 1) { - $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - push @{$tool_compatibility->{'Clustering'}->{'types'}}, 'GEBV' + $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = 1; + foreach my $method (@{$genotyping_methods}){ + if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { # for GEBV clustering, there needs to be enough accessions using the same geno method and which have lots of trait measurements + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; + $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = 1; + } } } + $tool_compatibility->{'Clustering'}->{'types'} = [keys(%{$tool_compatibility->{'Clustering'}->{'types'}})]; #catches edge case where multiple genotype entries were made for multiple genotype protocols - foreach my $trait (@{$self->retrieve_traits()}){ # For each trait, we need to check for number of observations (plus locations for stability) - my $sum = 0; + foreach my $trait (@{$traits}){ # For each trait, we need to check for number of observations (plus locations for stability) + my $total_obs = 0; my @location_counts = (); foreach my $location (@{$locations}){ - $sum += $obs_by_trait->{$trait->[0]}->{$location->[0]}; + $total_obs += $obs_by_trait->{$trait->[0]}->{$location->[0]}; push @location_counts, $obs_by_trait->{$trait->[0]}->{$location->[0]}; } + + my $num_accessions_phenotyped_for_this_trait = scalar(keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}})); + if ($total_obs > 0) { # This trait was measured - if ($sum > 0) { # This trait was measured - if (@{$self->trial_designs()}){ + $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Boxplotter'}->{'traits'}}, $trait->[1]; + + if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait $tool_compatibility->{'Heritability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; } - if (scalar(grep {$_ > 0} @location_counts) > 1) { # More than one location had measurements + if (scalar(grep {$_ > 0} @location_counts) > 1 && $num_accessions_phenotyped_for_this_trait > 1) { # More than one location had measurements, and more than one accession was measured $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } + if(scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1) { + $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Mixed Models'}->{'traits'}}, $trait->[1]; + } } - if ($sum > 100 && $num_markers > 100 && $num_typed_accessions > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling - $tool_compatibility->{'GWAS'}->{'compatible'} = 1; - push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; - $tool_compatibility->{'solGS'}->{'compatible'} = 1; - push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; + foreach my $method (@{$genotyping_methods}){ # There needs to be consistent genotyping protocol for genomic modeling + my $num_accessions_typed_for_this_trait = scalar( grep {exists($geno_represented_accessions->{$method->[0]}->{'accessions'}->{$_})} keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); + if ($total_obs > 100 && $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + $tool_compatibility->{'GWAS'}->{'compatible'} = 1; + push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; + $tool_compatibility->{'solGS'}->{'compatible'} = 1; + push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; + } } } - return JSON::Any->encode($tool_compatibility); + $self->tool_compatibility($tool_compatibility); + + #return $tool_compatibility; + + my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() }); + if (! $row) { + return "The specified dataset does not exist"; + } else { + eval { + $row->name($self->name()); + $row->description($self->description()); + $row->dataset(JSON::Any->encode($self->to_hashref()->{dataset})); + $row->sp_person_id($self->sp_person_id()); + $row->update(); + return $row->sp_dataset_id(); + }; + if ($@) { + return "An error occurred, $@"; + } else { + return undef; + } + } } sub get_dataset_definition { diff --git a/lib/SGN/Controller/Dataset.pm b/lib/SGN/Controller/Dataset.pm index a2b27a55a5..979cd99bc5 100644 --- a/lib/SGN/Controller/Dataset.pm +++ b/lib/SGN/Controller/Dataset.pm @@ -81,8 +81,7 @@ sub dataset :Chained('/') Path('dataset') Args(1) { $html .= ""; # print STDERR "=============================\n"; - # my ($phenodata, $unique_traits) = $dataset->retrieve_phenotypes_ref(); - # print STDERR reftype($phenodata); + # print STDERR Dumper $dataset->store_tool_compatibility(); # print STDERR "\n=============================\n"; $c->stash->{dataset_name} = $dataset->name(); From bedd617679ef8968aaea6dc75c957eae9cac09f0 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Thu, 12 Dec 2024 13:31:43 -0500 Subject: [PATCH 03/19] Tweaks to tool compatibility calculation and adding compatibility to dataset details page. Added button to calculate compatibility if not already stored. --- lib/CXGN/Dataset.pm | 78 +++++------------------------- lib/SGN/Controller/AJAX/Dataset.pm | 29 +++++++++++ mason/dataset/index.mas | 68 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 7e90e952a5..e44b500bcb 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1211,7 +1211,7 @@ sub retrieve_tool_compatibility { my $self = shift; if ($self->tool_compatibility) { - return $self->tool_compatibility + return JSON::Any->encode($self->tool_compatibility); } else { return "(not calculated)"; } @@ -1238,51 +1238,7 @@ Tools that use datasets: sub store_tool_compatibility { my $self = shift; - my $tool_compatibility = { - 'Boxplotter' => { - 'url' => '/tools/boxplotter', - 'compatible' => 0, - 'traits' => [] - }, - 'Population Structure' => { - 'url' => '/pca/analysis', - 'compatible' => 0 - }, - 'Clustering' => { - 'url' => '/cluster/analysis', - 'types' => {}, #gets converted to listref later - 'compatible' => 0 - }, - 'Kinship & Inbreeding' => { - 'url' => '/kinship/analysis', - 'compatible' => 0 - }, - 'Mixed Models' => { - 'url' => '/tools/mixedmodels', - 'compatible' => 0, - 'traits' => [] - }, - 'Stability' => { - 'url' => '/tools/stability', - 'traits' => [], - 'compatible' => 0 - }, - 'Heritability' => { - 'url' => '/tools/heritability', - 'traits' => [], - 'compatible' => 0 - }, - 'solGS' => { - 'url' => '/solgs', - 'traits' => [], - 'compatible' => 0 - }, - 'GWAS' => { - 'url' => '/tools/solgwas', - 'traits' => [], - 'compatible' => 0 - } - }; + my $tool_compatibility = {}; my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is trialID, second is trial name @@ -1331,26 +1287,25 @@ sub store_tool_compatibility { $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { - $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; - $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; - $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - $tool_compatibility->{'Clustering'}->{'types'}->{'Genotype'} = 1; + $tool_compatibility->{'Population Structure'} = ""; + $tool_compatibility->{'Kinship & Inbreeding'} = ""; + $tool_compatibility->{'Clustering'}->{'types'}->{'Genotype'} = ""; } } # my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both pheno and geno data (and the same geno method) if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements - $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = 1; + $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; foreach my $method (@{$genotyping_methods}){ if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { # for GEBV clustering, there needs to be enough accessions using the same geno method and which have lots of trait measurements - $tool_compatibility->{'Clustering'}->{'compatible'} = 1; - $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = 1; + $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; } } } - $tool_compatibility->{'Clustering'}->{'types'} = [keys(%{$tool_compatibility->{'Clustering'}->{'types'}})]; #catches edge case where multiple genotype entries were made for multiple genotype protocols + if (exists $tool_compatibility->{'Clustering'}->{'types'}) { + $tool_compatibility->{'Clustering'}->{'types'} = [keys(%{$tool_compatibility->{'Clustering'}->{'types'}})]; + } foreach my $trait (@{$traits}){ # For each trait, we need to check for number of observations (plus locations for stability) my $total_obs = 0; @@ -1363,19 +1318,15 @@ sub store_tool_compatibility { my $num_accessions_phenotyped_for_this_trait = scalar(keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}})); if ($total_obs > 0) { # This trait was measured - $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; push @{$tool_compatibility->{'Boxplotter'}->{'traits'}}, $trait->[1]; if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait - $tool_compatibility->{'Heritability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; } if (scalar(grep {$_ > 0} @location_counts) > 1 && $num_accessions_phenotyped_for_this_trait > 1) { # More than one location had measurements, and more than one accession was measured - $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } if(scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1) { - $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; push @{$tool_compatibility->{'Mixed Models'}->{'traits'}}, $trait->[1]; } } @@ -1383,18 +1334,14 @@ sub store_tool_compatibility { foreach my $method (@{$genotyping_methods}){ # There needs to be consistent genotyping protocol for genomic modeling my $num_accessions_typed_for_this_trait = scalar( grep {exists($geno_represented_accessions->{$method->[0]}->{'accessions'}->{$_})} keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); if ($total_obs > 100 && $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling - $tool_compatibility->{'GWAS'}->{'compatible'} = 1; push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; - $tool_compatibility->{'solGS'}->{'compatible'} = 1; - push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; + # push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; } } } $self->tool_compatibility($tool_compatibility); - #return $tool_compatibility; - my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() }); if (! $row) { return "The specified dataset does not exist"; @@ -1405,12 +1352,11 @@ sub store_tool_compatibility { $row->dataset(JSON::Any->encode($self->to_hashref()->{dataset})); $row->sp_person_id($self->sp_person_id()); $row->update(); - return $row->sp_dataset_id(); }; if ($@) { return "An error occurred, $@"; } else { - return undef; + return JSON::Any->encode($tool_compatibility); } } } diff --git a/lib/SGN/Controller/AJAX/Dataset.pm b/lib/SGN/Controller/AJAX/Dataset.pm index c2c55c5515..e36118381c 100644 --- a/lib/SGN/Controller/AJAX/Dataset.pm +++ b/lib/SGN/Controller/AJAX/Dataset.pm @@ -389,6 +389,35 @@ sub retrieve_dataset_dimension :Path('/ajax/dataset/retrieve') Args(2) { }; } +sub calc_tool_compatibility :Path('/ajax/dataset/calc_tool_compatibility') Args(1) { + my $self = shift; + my $c = shift; + my $dataset_id = shift; + my $include_phenotype_primary_key = $c->req->param('include_phenotype_primary_key'); + + my $dataset = CXGN::Dataset->new( + { + schema => $c->dbic_schema("Bio::Chado::Schema"), + people_schema => $c->dbic_schema("CXGN::People::Schema"), + sp_dataset_id=> $dataset_id, + include_phenotype_primary_key => $include_phenotype_primary_key, + }); + + my $tool_compatibility; + eval { + $tool_compatibility = $dataset->store_tool_compatibility(); + }; + if ($@){ + $c->stash->{rest} = { + error => "Error calculating tool compatibility:\n$@" + }; + } else { + $c->stash->{rest} = { + tool_compatibility => $tool_compatibility + }; + } +} + sub delete_dataset :Path('/ajax/dataset/delete') Args(1) { my $self = shift; my $c = shift; diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index df32ccc697..672147c13f 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -99,6 +99,15 @@ $dataset_contents => '' % } Select Dataset in Wizard<% $wizard_link %> + Tool Compatibility +
+ +
+
+

@@ -221,6 +230,65 @@ $dataset_contents => '' var DataSet = window.jsMod['dataset_scatterplot'].init(<% $dataset_id %>); DataSet.render(); + function populate_tool_compatibility(json) { + + if (Object.prototype.toString.call(json) == "[object Array]") { + let table = '

' + json.join("

") + '

'; + return table; + } + var keys = []; + for (key in json) { + keys.push( key + populate_tool_compatibility(json[key])) + } + if (keys.length == 0) { + return ""; + } + let table = '
' + keys.join("") + '
'; + return table; + } + + jQuery('#tool-compatibility-calc-button').click(function() { + jQuery('#working_modal').modal("show"); + $.ajax({ + url: '/ajax/dataset/calc_tool_compatibility/' + <% $dataset_id %>, + success: function(response) { + if (response.error) { + jQuery('#working_modal').modal("hide"); + alert("An error occured calculating tool compatibility: \n" + response.error); + jQuery('#predicted-tool-compatibility').text("Error calculating tool compatibility."); + } else { + jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); + jQuery('#working_modal').modal("hide"); + } + }, + error: function(response) { + jQuery('#working_modal').modal("hide"); + alert("An error occurred calculating tool compatibility"); + jQuery('#predicted-tool-compatibility').text("Error calculating tool compatibility."); + } + }); + }); + + $.ajax({ + url: '/ajax/dataset/retrieve/' + <% $dataset_id %> + '/tool_compatibility', + success: function(response) { + if (response.error) { + alert("An error occurred. \n" + response.error); + jQuery('#predicted-tool-compatibility').text("Error retrieving tool compatibility."); + } else { + if (response.tool_compatibility == "(not calculated)") { + jQuery('#predicted-tool-compatibility').text(response.tool_compatibility); + } else { + jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); + } + } + }, + error: function(response) { + alert("An error occurred retrieving tool compatibility"); + jQuery('#predicted-tool-compatibility').text("Error retrieving tool compatibility."); + } + }); + jQuery('#info_table').click(function() { $.ajax({ url: '/ajax/dataset/by_user/<% $dataset_id %>', From 5a51b1a80f41345135efb55287fa68c542eac97d Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Thu, 12 Dec 2024 14:31:47 -0500 Subject: [PATCH 04/19] Tweaks to tool compatibility table display --- mason/dataset/index.mas | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 672147c13f..29a29b2b65 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -100,12 +100,14 @@ $dataset_contents => '' % } Select Dataset in Wizard<% $wizard_link %> Tool Compatibility +Tools present in this table are compatible with your dataset. +The table details the traits and types of analyses that can +be performed. If database elements are updated, such as +phenotypes added to field trials or accessions, you may wish +to recalculate tool compatibility for more accurate results.">
-
+

From b2c9af9d434aa6ac26075ec109298bc043bb864c Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 20 Dec 2024 15:58:08 -0500 Subject: [PATCH 05/19] Added warnings to tool compatibility JSON. Adjust table display to include warning tooltips --- lib/CXGN/Dataset.pm | 80 +++++++++++++++++++++++++++++++++++++++-- mason/dataset/index.mas | 57 ++++++++++++++++++++++------- 2 files changed, 121 insertions(+), 16 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index e44b500bcb..8cdbed98d6 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1238,7 +1238,35 @@ Tools that use datasets: sub store_tool_compatibility { my $self = shift; - my $tool_compatibility = {}; + my $tool_compatibility = { + 'GWAS' => { + 'compatible' => 0 + }, + # 'solGS' => { + # 'compatible' => 0 + # }, + 'Population Structure' => { + 'compatible' => 0 + }, + 'Clustering' => { + 'compatible' => 0 + }, + 'Kinship & Inbreeding' => { + 'compatible' => 0 + }, + 'Stability' => { + 'compatible' => 0 + }, + 'Heritability' => { + 'compatible' => 0 + }, + 'Mixed Models' => { + 'compatible' => 0 + }, + 'Boxplotter' => { + 'compatible' => 0 + } + }; my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is trialID, second is trial name @@ -1287,8 +1315,14 @@ sub store_tool_compatibility { $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { - $tool_compatibility->{'Population Structure'} = ""; - $tool_compatibility->{'Kinship & Inbreeding'} = ""; + if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) < 30) { + $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; + $tool_compatibility->{'Kinship & Inbreeding'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong genotype clustering."} = ""; + } + $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; + $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Genotype'} = ""; } } @@ -1296,13 +1330,21 @@ sub store_tool_compatibility { # my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both pheno and geno data (and the same geno method) if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements + if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) < 30) { + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough trait measurements for strong phenotype clustering."} = ""; + } + $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; foreach my $method (@{$genotyping_methods}){ if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { # for GEBV clustering, there needs to be enough accessions using the same geno method and which have lots of trait measurements + if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) < 30) { + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; + } $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; } } } + if (exists $tool_compatibility->{'Clustering'}->{'types'}) { $tool_compatibility->{'Clustering'}->{'types'} = [keys(%{$tool_compatibility->{'Clustering'}->{'types'}})]; } @@ -1318,15 +1360,31 @@ sub store_tool_compatibility { my $num_accessions_phenotyped_for_this_trait = scalar(keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}})); if ($total_obs > 0) { # This trait was measured + if ($total_obs < 0) { + $tool_compatibility->{'Boxplotter'}->{'warn'}->{"There may not be enough observations of ". $trait->[1]." to get meaningful data."} = ""; + } + $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; push @{$tool_compatibility->{'Boxplotter'}->{'traits'}}, $trait->[1]; if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait + if ($num_accessions_phenotyped_for_this_trait < 30) { + $tool_compatibility->{'Heritability'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to get strong results."} = ""; + } + $tool_compatibility->{'Heritability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; } if (scalar(grep {$_ > 0} @location_counts) > 1 && $num_accessions_phenotyped_for_this_trait > 1) { # More than one location had measurements, and more than one accession was measured + if ($num_accessions_phenotyped_for_this_trait < 30) { + $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to get strong results."} = ""; + } + $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } if(scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1) { + if ($num_accessions_phenotyped_for_this_trait < 30) { + $tool_compatibility->{'Mixed Models'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to build a strong model."} = ""; + } + $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; push @{$tool_compatibility->{'Mixed Models'}->{'traits'}}, $trait->[1]; } } @@ -1334,12 +1392,28 @@ sub store_tool_compatibility { foreach my $method (@{$genotyping_methods}){ # There needs to be consistent genotyping protocol for genomic modeling my $num_accessions_typed_for_this_trait = scalar( grep {exists($geno_represented_accessions->{$method->[0]}->{'accessions'}->{$_})} keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); if ($total_obs > 100 && $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + if ($total_obs < 300) { + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations of ".$trait->[1]." to identify associated loci."} = ""; + } + if ($geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} < 2500) { + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough SNPs genotyped for method ".$method->[1]." to identify associated loci."} = ""; + } + if ($num_accessions_typed_for_this_trait < 300) { + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough accessions both genotyped and assayed for ".$trait->[1]." to identify associated loci."} = ""; + } push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; + $tool_compatibility->{'GWAS'}->{'compatible'} = 1; # push @{$tool_compatibility->{'solGS'}->{'traits'}}, $trait->[1]; } } } + foreach my $tool (keys(%{$tool_compatibility})) { + if (exists($tool_compatibility->{$tool}->{"warn"})){ + $tool_compatibility->{$tool}->{"warn"} = join("\n", keys(%{$tool_compatibility->{$tool}->{"warn"}})); + } + } + $self->tool_compatibility($tool_compatibility); my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() }); diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 29a29b2b65..4ebb3c3c46 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -99,15 +99,17 @@ $dataset_contents => '' % } Select Dataset in Wizard<% $wizard_link %> - Tool Compatibility +to recalculate tool compatibility for more accurate results. +Warning symbols indicate compatibility, but with low sample +sizes. Hover for details.">
-
+

@@ -234,19 +236,45 @@ to recalculate tool compatibility for more accurate results."> function populate_tool_compatibility(json) { - if (Object.prototype.toString.call(json) == "[object Array]") { - let table = '

' + json.join("

") + '

'; - return table; - } - var keys = []; for (key in json) { - keys.push( key + populate_tool_compatibility(json[key])) + if (json[key]['compatible'] == 0) { + var newkey = '' + key + ' '; + delete json[key]['compatible']; + json[newkey] = json[key]; + delete json[key]; + } else { + if ('warn' in json[key]) { + var newkey = '' + key + ' '; + delete json[key]['warn']; + delete json[key]['compatible']; + json[newkey] = json[key]; + delete json[key]; + } else { + var newkey = '' + key + ' '; + delete json[key]['compatible']; + json[newkey] = json[key]; + delete json[key]; + } + } } - if (keys.length == 0) { - return ""; + + function recursive_table_gen(obj) { + if (Object.prototype.toString.call(obj) == "[object Array]") { + let table = '

' + obj.join("

") + '

'; + return table; + } + var keys = []; + for (key in obj) { + keys.push( key + recursive_table_gen(obj[key])) + } + if (keys.length == 0) { + return ""; + } + let table = '
' + keys.join("") + '
'; + return table; } - let table = '
' + keys.join("") + '
'; - return table; + + return recursive_table_gen(json); } jQuery('#tool-compatibility-calc-button').click(function() { @@ -259,6 +287,7 @@ to recalculate tool compatibility for more accurate results."> alert("An error occured calculating tool compatibility: \n" + response.error); jQuery('#predicted-tool-compatibility').text("Error calculating tool compatibility."); } else { + jQuery('#predicted-tool-compatibility').css('height', '300px'); jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); jQuery('#working_modal').modal("hide"); } @@ -281,6 +310,8 @@ to recalculate tool compatibility for more accurate results."> if (response.tool_compatibility == "(not calculated)") { jQuery('#predicted-tool-compatibility').text(response.tool_compatibility); } else { + //console.log(JSON.parse(response.tool_compatibility)); + jQuery('#predicted-tool-compatibility').css('height', '300px'); jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); } } From 458e218eb515993638343b0dd054b31868dbfbef Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Thu, 9 Jan 2025 10:10:03 -0500 Subject: [PATCH 06/19] Tweaks to user instructions and starting to change tool compatibility calculation to not use genotype search --- lib/CXGN/Dataset.pm | 60 ++++++++++++++++++++++++++++------------- mason/dataset/index.mas | 1 - 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 8cdbed98d6..4e762fd277 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -59,6 +59,7 @@ use CXGN::BreederSearch; use CXGN::People::Schema; use CXGN::Phenotypes::PhenotypeMatrix; use CXGN::Genotype::Search; +use CXGN::Genotype::Protocol; use CXGN::Phenotypes::HighDimensionalPhenotypesSearch; =head2 people_schema() @@ -1274,14 +1275,18 @@ sub store_tool_compatibility { my $trial_designs = $self->retrieve_trial_designs(); my $genotyping_methods = $self->retrieve_genotyping_protocols();# listref of listrefs. First index is # method ID, second is method name + print STDERR Dumper $genotyping_methods; + print STDERR "\n"; my $locations = $self->retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is locationID, second is location name - my $genotypes = $self->retrieve_genotypes(); # Give it at least 15 seconds! - # listref of hashrefs. Each hashref should describe a stock (accession) genotype measurement, with a list of SNPs/genotypes. - # Relevant hash keys: stock_id, germplasmName, analysisMethod, analysisMethodDbId, selected_genotype_hash + # my $genotypes = $self->retrieve_genotypes(); # Give it at least 15 seconds! + # # listref of hashrefs. Each hashref should describe a stock (accession) genotype measurement, with a list of SNPs/genotypes. + # # Relevant hash keys: stock_id, germplasmName, analysisMethod, analysisMethodDbId, selected_genotype_hash my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list # of trait observations. Each OU is a stock (plot, accession, etc). Second hashref has all unique traits in the phenotype list. # Relevant hash keys: observations, trial_id, trial_location_id, germplasm_stock_id, trait_id, trait_name, value + my $accessions = $self->retrieve_accessions(); + my $num_accessions = scalar(@{$accessions}); my $obs_by_trait = {}; my $pheno_represented_accessions = {}; @@ -1305,17 +1310,23 @@ sub store_tool_compatibility { } my $num_phenotyped_accessions = scalar(%{$pheno_represented_accessions}); - my $geno_represented_accessions = {}; #This will store average marker counts for genotype methods and accessions typed by each method - foreach my $genotype (@{$genotypes}) { - $geno_represented_accessions->{$genotype->{'analysisMethodDbId'}}->{'accessions'}->{$genotype->{'stock_id'}} = 1; # each accession genotyped using each method - } + # my $geno_represented_accessions = {}; #This will store average marker counts for genotype methods and accessions typed by each method + # foreach my $genotype (@{$genotypes}) { + # $geno_represented_accessions->{$genotype->{'analysisMethodDbId'}}->{'accessions'}->{$genotype->{'stock_id'}} = 1; # each accession genotyped using each method + # } foreach my $method (@{$genotyping_methods}){ - my $num_markers = [map {$_->{'resultCount'}} grep {$_->{'analysisMethodDbId'} == $method->[0]} @{$genotypes}]; - $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 - $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; - - if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { - if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) < 30) { + my $geno_method = CXGN::Genotype::Protocol->new({ + bcs_schema => $self->schema, + nd_protocol_id => $method->[0] + }); + my $num_markers = scalar(@{$geno_method->markers_array}); + # $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 + # $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; + print STDERR "#####################\n"; + print STDERR Dumper $geno_method->marker_name_list; + print STDERR "########################\n"; + if ($num_markers > 1) { + if ($num_accessions < 30) { $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; $tool_compatibility->{'Kinship & Inbreeding'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong genotype clustering."} = ""; @@ -1336,10 +1347,18 @@ sub store_tool_compatibility { $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; foreach my $method (@{$genotyping_methods}){ - if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) > 1) { # for GEBV clustering, there needs to be enough accessions using the same geno method and which have lots of trait measurements - if (scalar(keys(%{$geno_represented_accessions->{$method->[0]}->{'accessions'}})) < 30) { + my $geno_method = CXGN::Genotype::Protocol->new({ + bcs_schema => $self->schema, + nd_protocol_id => $method->[0] + }); + my $num_markers = scalar(@{$geno_method->markers_array}); + if ($num_accessions > 1 && $num_markers > 30) { # for GEBV clustering, there needs to be enough accessions using the same geno method (not checked right here) and which have lots of trait measurements + if ($num_accessions < 30) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; } + if ($num_markers < 1000) { + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotype markers for strong GEBV clustering."} = ""; + } $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; } } @@ -1390,12 +1409,17 @@ sub store_tool_compatibility { } foreach my $method (@{$genotyping_methods}){ # There needs to be consistent genotyping protocol for genomic modeling - my $num_accessions_typed_for_this_trait = scalar( grep {exists($geno_represented_accessions->{$method->[0]}->{'accessions'}->{$_})} keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); - if ($total_obs > 100 && $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + my $geno_method = CXGN::Genotype::Protocol->new({ + bcs_schema => $self->schema, + nd_protocol_id => $method->[0] + }); + my $num_markers = scalar(@{$geno_method->markers_array}); + my $num_accessions_typed_for_this_trait = scalar( keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); + if ($total_obs > 100 && $num_markers > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling if ($total_obs < 300) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations of ".$trait->[1]." to identify associated loci."} = ""; } - if ($geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} < 2500) { + if ($num_markers < 2500) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough SNPs genotyped for method ".$method->[1]." to identify associated loci."} = ""; } if ($num_accessions_typed_for_this_trait < 300) { diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 4ebb3c3c46..7c49443d4a 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -100,7 +100,6 @@ $dataset_contents => '' % } Select Dataset in Wizard<% $wizard_link %> Tool Compatibility retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is locationID, second is location name - # my $genotypes = $self->retrieve_genotypes(); # Give it at least 15 seconds! - # # listref of hashrefs. Each hashref should describe a stock (accession) genotype measurement, with a list of SNPs/genotypes. - # # Relevant hash keys: stock_id, germplasmName, analysisMethod, analysisMethodDbId, selected_genotype_hash my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list # of trait observations. Each OU is a stock (plot, accession, etc). Second hashref has all unique traits in the phenotype list. # Relevant hash keys: observations, trial_id, trial_location_id, germplasm_stock_id, trait_id, trait_name, value my $accessions = $self->retrieve_accessions(); - my $num_accessions = scalar(@{$accessions}); + # my $num_accessions = scalar(@{$accessions}); + my $genotype_counts = {}; + + my @accession_ids = map {$_->[0]} @{$accessions}; + + foreach my $method (@{$genotyping_methods}) { + my $genotype_query = "SELECT COUNT(DISTINCT(stock_id, nd_protocol_id)) FROM stock + JOIN nd_experiment_stock USING(stock_id) + JOIN nd_experiment_genotype USING(nd_experiment_id) + JOIN genotypeprop USING(genotype_id) + JOIN nd_experiment_protocol ON(nd_experiment_genotype.nd_experiment_id=nd_experiment_protocol.nd_experiment_id) + WHERE stock_id IN (SELECT unnest(string_to_array(?, ',')::int[])) AND nd_protocol_id=?;"; + my $h = $self->schema->storage()->dbh()->prepare($genotype_query); + $h->execute(join(", ",@accession_ids), $method->[0]); + + $genotype_counts->{$method->[0]}->{"num_accessions"} = $h->fetchrow_array; + + my $marker_query = "SELECT DISTINCT LENGTH(genotypeprop.value::text) FROM genotypeprop + JOIN nd_experiment_genotype USING(genotype_id) + JOIN nd_experiment_protocol ON(nd_experiment_genotype.nd_experiment_id=nd_experiment_protocol.nd_experiment_id) + WHERE nd_protocol_id=?;"; + $h = $self->schema->storage()->dbh()->prepare($marker_query); + $h->execute($method->[0]); + + $genotype_counts->{$method->[0]}->{"num_markers"} = $h->fetchrow_array; + } my $obs_by_trait = {}; my $pheno_represented_accessions = {}; @@ -1310,21 +1330,9 @@ sub store_tool_compatibility { } my $num_phenotyped_accessions = scalar(%{$pheno_represented_accessions}); - # my $geno_represented_accessions = {}; #This will store average marker counts for genotype methods and accessions typed by each method - # foreach my $genotype (@{$genotypes}) { - # $geno_represented_accessions->{$genotype->{'analysisMethodDbId'}}->{'accessions'}->{$genotype->{'stock_id'}} = 1; # each accession genotyped using each method - # } foreach my $method (@{$genotyping_methods}){ - my $geno_method = CXGN::Genotype::Protocol->new({ - bcs_schema => $self->schema, - nd_protocol_id => $method->[0] - }); - my $num_markers = scalar(@{$geno_method->markers_array}); - # $num_markers = sum(@{$num_markers}) / scalar @{$num_markers}; #average marker size should be large enough to do a GWAS. There is no set minimum since it depends on LD scores but I will say they need at least 100 - # $geno_represented_accessions->{$method->[0]}->{'avg_marker_count'} = $num_markers; - print STDERR "#####################\n"; - print STDERR Dumper $geno_method->marker_name_list; - print STDERR "########################\n"; + my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; + my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; if ($num_markers > 1) { if ($num_accessions < 30) { $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; @@ -1338,8 +1346,6 @@ sub store_tool_compatibility { } } - # my $num_typed_accessions = scalar( grep {exists($geno_represented_accessions->{$_})} keys(%{$pheno_represented_accessions}) ); # number of accessions with both pheno and geno data (and the same geno method) - if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) < 30) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough trait measurements for strong phenotype clustering."} = ""; @@ -1347,11 +1353,8 @@ sub store_tool_compatibility { $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; foreach my $method (@{$genotyping_methods}){ - my $geno_method = CXGN::Genotype::Protocol->new({ - bcs_schema => $self->schema, - nd_protocol_id => $method->[0] - }); - my $num_markers = scalar(@{$geno_method->markers_array}); + my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; + my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; if ($num_accessions > 1 && $num_markers > 30) { # for GEBV clustering, there needs to be enough accessions using the same geno method (not checked right here) and which have lots of trait measurements if ($num_accessions < 30) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; @@ -1409,20 +1412,17 @@ sub store_tool_compatibility { } foreach my $method (@{$genotyping_methods}){ # There needs to be consistent genotyping protocol for genomic modeling - my $geno_method = CXGN::Genotype::Protocol->new({ - bcs_schema => $self->schema, - nd_protocol_id => $method->[0] - }); - my $num_markers = scalar(@{$geno_method->markers_array}); - my $num_accessions_typed_for_this_trait = scalar( keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); - if ($total_obs > 100 && $num_markers > 100 && $num_accessions_typed_for_this_trait > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; + my $num_genotyped_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; + my $num_accessions_phenotyped_for_this_trait = scalar( keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); + if ($total_obs > 100 && $num_markers > 100 && $num_accessions_phenotyped_for_this_trait > 50 && $num_genotyped_accessions > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling if ($total_obs < 300) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations of ".$trait->[1]." to identify associated loci."} = ""; } if ($num_markers < 2500) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough SNPs genotyped for method ".$method->[1]." to identify associated loci."} = ""; } - if ($num_accessions_typed_for_this_trait < 300) { + if ($num_accessions_phenotyped_for_this_trait < 300 || $num_genotyped_accessions < 300) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough accessions both genotyped and assayed for ".$trait->[1]." to identify associated loci."} = ""; } push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; From 11d2e7c705ead01932923109b63eecf8ab2886b5 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 17 Jan 2025 10:47:12 -0500 Subject: [PATCH 08/19] Change tool compatibility to have separate storing and calculating functions. Datasets now have tool compatibility auto calculated on creation. Remove debug comments. --- lib/CXGN/Dataset.pm | 25 +++++++++++++++++-------- lib/SGN/Controller/AJAX/Dataset.pm | 6 ++++-- lib/SGN/Controller/Dataset.pm | 4 ---- mason/dataset/index.mas | 4 ++-- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 603d1ae8d4..e19c355b2f 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1218,11 +1218,12 @@ sub retrieve_tool_compatibility { } } -=head2 store_tool_compatibility +=head2 calculate_tool_compatibility -Uploads a JSON of analysis tools that this dataset can be used with. For example, a dataset with genotype data but no trait phenotypes cannot be used with GWAS. +Creates a hashref of analysis tools that this dataset can be used with. For example, a dataset with genotype data but no trait phenotypes cannot be used with GWAS. Note that this function should only ever be called once for a dataset and have the data stored as part of the dataset definition JSON, since retrieving high dimensional phenotype and genotype data can be time consuming. + Tools that use datasets: solGS - genotyping data and phenotyping data PCA - genotyping data @@ -1236,7 +1237,7 @@ Tools that use datasets: =cut -sub store_tool_compatibility { +sub calculate_tool_compatibility { my $self = shift; my $tool_compatibility = { @@ -1415,7 +1416,7 @@ sub store_tool_compatibility { my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; my $num_genotyped_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; my $num_accessions_phenotyped_for_this_trait = scalar( keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); - if ($total_obs > 100 && $num_markers > 100 && $num_accessions_phenotyped_for_this_trait > 50 && $num_genotyped_accessions > 50) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling + if ($total_obs > 100 && $num_markers > 100 && $num_accessions_phenotyped_for_this_trait > 50 && $num_genotyped_accessions > 50 && scalar(@{$trials}) > 0) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling if ($total_obs < 300) { $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations of ".$trait->[1]." to identify associated loci."} = ""; } @@ -1440,21 +1441,29 @@ sub store_tool_compatibility { $self->tool_compatibility($tool_compatibility); + #return JSON::Any->encode($tool_compatibility); + return $tool_compatibility; +} + +sub update_tool_compatibility { + my $self = shift; + + $self->calculate_tool_compatibility(); + my $row = $self->people_schema()->resultset("SpDataset")->find( { sp_dataset_id => $self->sp_dataset_id() }); if (! $row) { return "The specified dataset does not exist"; } else { eval { - $row->name($self->name()); - $row->description($self->description()); - $row->dataset(JSON::Any->encode($self->to_hashref()->{dataset})); $row->sp_person_id($self->sp_person_id()); + $row->sp_dataset_id($self->sp_dataset_id()); + $row->dataset(JSON::Any->encode($self->to_hashref()->{dataset})); $row->update(); }; if ($@) { return "An error occurred, $@"; } else { - return JSON::Any->encode($tool_compatibility); + return undef; } } } diff --git a/lib/SGN/Controller/AJAX/Dataset.pm b/lib/SGN/Controller/AJAX/Dataset.pm index e36118381c..67e18aed31 100644 --- a/lib/SGN/Controller/AJAX/Dataset.pm +++ b/lib/SGN/Controller/AJAX/Dataset.pm @@ -62,6 +62,7 @@ sub store_dataset :Path('/ajax/dataset/save') Args(0) { } $dataset->store(); + $dataset->update_tool_compatibility(); $c->stash->{rest} = { message => "Stored Dataset Successfully!" }; } @@ -405,7 +406,8 @@ sub calc_tool_compatibility :Path('/ajax/dataset/calc_tool_compatibility') Args( my $tool_compatibility; eval { - $tool_compatibility = $dataset->store_tool_compatibility(); + $dataset->update_tool_compatibility(); + $tool_compatibility = $dataset->tool_compatibility; }; if ($@){ $c->stash->{rest} = { @@ -413,7 +415,7 @@ sub calc_tool_compatibility :Path('/ajax/dataset/calc_tool_compatibility') Args( }; } else { $c->stash->{rest} = { - tool_compatibility => $tool_compatibility + tool_compatibility => JSON::Any->encode($tool_compatibility) }; } } diff --git a/lib/SGN/Controller/Dataset.pm b/lib/SGN/Controller/Dataset.pm index 979cd99bc5..0dc62af819 100644 --- a/lib/SGN/Controller/Dataset.pm +++ b/lib/SGN/Controller/Dataset.pm @@ -80,10 +80,6 @@ sub dataset :Chained('/') Path('dataset') Args(1) { } $html .= ""; - # print STDERR "=============================\n"; - # print STDERR Dumper $dataset->store_tool_compatibility(); - # print STDERR "\n=============================\n"; - $c->stash->{dataset_name} = $dataset->name(); $c->stash->{dataset_id} = $dataset_id; $c->stash->{dataset_description} = $dataset->description; diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 7c49443d4a..940385d454 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -259,8 +259,8 @@ sizes. Hover for details."> function recursive_table_gen(obj) { if (Object.prototype.toString.call(obj) == "[object Array]") { - let table = '

' + obj.join("

") + '

'; - return table; + let table = '

' + obj.join("

") + '

'; + return table; } var keys = []; for (key in obj) { From 91327a7fab53ad671b98872dc23c4126524918b9 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 22 Jan 2025 11:36:03 -0500 Subject: [PATCH 09/19] Added short selenium test to verify tool compatibility on details page --- t/selenium2/breeders/breeder_search.t | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/t/selenium2/breeders/breeder_search.t b/t/selenium2/breeders/breeder_search.t index 80d16855e0..d0dcd266e9 100644 --- a/t/selenium2/breeders/breeder_search.t +++ b/t/selenium2/breeders/breeder_search.t @@ -2,7 +2,7 @@ use strict; use lib 't/lib'; -use Test::More 'tests' => 113; +use Test::More 'tests' => 118; use SGN::Test::WWW::WebDriver; use Selenium::Remote::WDKeys 'KEYS'; @@ -390,16 +390,25 @@ $t->while_logged_in_as("submitter", sub { ok($selected_reloaded_elements =~ /IITA-TMS-IBA011412/, "Verify first column wizard, selected elements, after merging $fourth_list_name and two new elements: accession IITA-TMS-IBA011412"); ok($selected_reloaded_elements =~ /IITA-TMS-IBA30572/, "Verify first column wizard, selected elements, after merging $fourth_list_name and two new elements: accession IITA-TMS-IBA30572"); - # TEST WORKING MIXED MODEL AND DETAILS PAGE FOR DATASET 1 + # TEST WORKING DETAILS PAGE FOR DATASET 1 $t->get_ok('/search/datasets'); sleep(1); - $t->find_element_ok("//a[text()='$dataset_name_1']",'xpath','checking for created dataset on dataset overview page')->click(); - sleep(5); + $t->find_element_ok("//a[text()='$dataset_name_1']",'xpath','Checking for created dataset on dataset overview page')->click(); + sleep(10); my $child_analyses = $t->find_element('dataset_analysis_usage', 'id')->get_text(); - ok($child_analyses eq "(none)", 'checking initial analysis usage'); + ok($child_analyses eq "(none)", 'Checking initial analysis usage'); + sleep(1); + + $t->find_element_ok('predicted-tool-compatibility', 'id', 'Checking for predicted tool compatibility'); sleep(1); + $t->find_element_ok('tool-compatibility-calc-button', 'id', 'Recalculating tool compatibility')->click(); + sleep(2); + my $tool_compatibility = $t->find_element('predicted-tool-compatibility', 'id')->get_text(); + ok($tool_compatibility =~ /Mixed Models/, "Verify expected tool compatibilities"); + ok($tool_compatibility =~ /Boxplotter/, "Verify expected tool compatibilities"); + ok($tool_compatibility =~ /traits/, "Verify expected tool compatibilities"); # DELETE DATASET $t->get_ok('/breeders/search'); From cf6718a381e9c30b49b6807982d502bc668e21bb Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 22 Jan 2025 11:56:09 -0500 Subject: [PATCH 10/19] Appeasing the linter --- lib/CXGN/Dataset.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 566fb9f5a1..21aec32bff 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1464,7 +1464,7 @@ sub update_tool_compatibility { if ($@) { return "An error occurred, $@"; } else { - return undef; + return; } } } From 52fb716d07ce292ec3618a8a9190b6fb5f349ea9 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 29 Jan 2025 11:30:18 -0500 Subject: [PATCH 11/19] Add sample sizes and marker numbers to warning messages --- lib/CXGN/Dataset.pm | 65 +++++++++++++++++++++-------------- lib/SGN/Controller/Dataset.pm | 2 ++ mason/dataset/index.mas | 2 +- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 21aec32bff..925876a613 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -62,6 +62,7 @@ use CXGN::Phenotypes::PhenotypeMatrix; use CXGN::Genotype::Search; use CXGN::Genotype::Protocol; use CXGN::Phenotypes::HighDimensionalPhenotypesSearch; +use CXGN::Trait; =head2 people_schema() @@ -1274,6 +1275,17 @@ sub calculate_tool_compatibility { my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is trialID, second is trial name my $traits = $self->retrieve_traits(); + # my $traits = []; + # foreach my $trait (@{$all_traits}) { #filter for quantitative traits + # my $trait_obj = CXGN::Trait->new({ + # bcs_schema => $self->schema, + # cvterm_id => $trait->[0] + # }); + # if ($trait_obj->categories ne ""){# ??? Not sure how to filter for categorical traits only + # push @{$traits}, $trait; + # } + # } + my $trial_designs = $self->retrieve_trial_designs(); my $genotyping_methods = $self->retrieve_genotyping_protocols();# listref of listrefs. First index is # method ID, second is method name @@ -1337,9 +1349,9 @@ sub calculate_tool_compatibility { my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; if ($num_markers > 1) { if ($num_accessions < 30) { - $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; - $tool_compatibility->{'Kinship & Inbreeding'}->{'warn'}->{"You may not have enough accessions for strong results."} = ""; - $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong genotype clustering."} = ""; + $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions (n=$num_accessions) genotyped for ".$method->[1].", ($num_markers markers) for strong results."} = ""; + $tool_compatibility->{'Kinship & Inbreeding'}->{'warn'}->{"You may not have enough accessions (n=$num_accessions) genotyped for ".$method->[1].", ($num_markers markers) for strong results."} = ""; + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough accessions (n=$num_accessions) genotyped for ".$method->[1].", ($num_markers markers) for strong genotype clustering."} = ""; } $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; @@ -1349,24 +1361,27 @@ sub calculate_tool_compatibility { } if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements - if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) < 30) { - $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough trait measurements for strong phenotype clustering."} = ""; + if (scalar(@{$traits}) < 5) { + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough trait measurements (only ".scalar(@{$traits})." traits) for strong phenotype clustering."} = ""; + } + if ($num_phenotyped_accessions < 30) { + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough phenotyped accessions (n=$num_phenotyped_accessions) for strong phenotype clustering."} = ""; } $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; - foreach my $method (@{$genotyping_methods}){ - my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; - my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; - if ($num_accessions > 1 && $num_markers > 30) { # for GEBV clustering, there needs to be enough accessions using the same geno method (not checked right here) and which have lots of trait measurements - if ($num_accessions < 30) { - $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; - } - if ($num_markers < 1000) { - $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotype markers for strong GEBV clustering."} = ""; - } - $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; - } - } + # foreach my $method (@{$genotyping_methods}){ + # my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; + # my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; + # if ($num_accessions > 1 && $num_markers > 30) { # for GEBV clustering, there needs to be enough accessions using the same geno method (not checked right here) and which have lots of trait measurements + # if ($num_accessions < 30) { + # $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; + # } + # if ($num_markers < 1000) { + # $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotype markers for strong GEBV clustering."} = ""; + # } + # $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; + # } + # } } if (exists $tool_compatibility->{'Clustering'}->{'types'}) { @@ -1385,28 +1400,28 @@ sub calculate_tool_compatibility { if ($total_obs > 0) { # This trait was measured if ($total_obs < 0) { - $tool_compatibility->{'Boxplotter'}->{'warn'}->{"There may not be enough observations of ". $trait->[1]." to get meaningful data."} = ""; + $tool_compatibility->{'Boxplotter'}->{'warn'}->{"There may not be enough observations (n=$total_obs) of ". $trait->[1]." to get meaningful data."} = ""; } $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; push @{$tool_compatibility->{'Boxplotter'}->{'traits'}}, $trait->[1]; if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait if ($num_accessions_phenotyped_for_this_trait < 30) { - $tool_compatibility->{'Heritability'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to get strong results."} = ""; + $tool_compatibility->{'Heritability'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to get strong results."} = ""; } $tool_compatibility->{'Heritability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; } if (scalar(grep {$_ > 0} @location_counts) > 1 && $num_accessions_phenotyped_for_this_trait > 1) { # More than one location had measurements, and more than one accession was measured if ($num_accessions_phenotyped_for_this_trait < 30) { - $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to get strong results."} = ""; + $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to get strong results."} = ""; } $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } if(scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1) { if ($num_accessions_phenotyped_for_this_trait < 30) { - $tool_compatibility->{'Mixed Models'}->{'warn'}->{"There may not be enough accessions phenotyped for ".$trait->[1]." to build a strong model."} = ""; + $tool_compatibility->{'Mixed Models'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to build a strong model."} = ""; } $tool_compatibility->{'Mixed Models'}->{'compatible'} = 1; push @{$tool_compatibility->{'Mixed Models'}->{'traits'}}, $trait->[1]; @@ -1419,13 +1434,13 @@ sub calculate_tool_compatibility { my $num_accessions_phenotyped_for_this_trait = scalar( keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}}) ); if ($total_obs > 100 && $num_markers > 100 && $num_accessions_phenotyped_for_this_trait > 50 && $num_genotyped_accessions > 50 && scalar(@{$trials}) > 0) { # If lots of markers, lots of accessions, and lots of phenotype measurements, then you can do genomic modeling if ($total_obs < 300) { - $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations of ".$trait->[1]." to identify associated loci."} = ""; + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough observations (n=$total_obs) of ".$trait->[1]." to identify associated loci."} = ""; } if ($num_markers < 2500) { - $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough SNPs genotyped for method ".$method->[1]." to identify associated loci."} = ""; + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough SNPs ($num_markers) genotyped for method ".$method->[1]." to identify associated loci."} = ""; } if ($num_accessions_phenotyped_for_this_trait < 300 || $num_genotyped_accessions < 300) { - $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough accessions both genotyped and assayed for ".$trait->[1]." to identify associated loci."} = ""; + $tool_compatibility->{'GWAS'}->{'warn'}->{"There may not be enough accessions (n=$num_genotyped_accessions) both genotyped and assayed for ".$trait->[1]." to identify associated loci."} = ""; } push @{$tool_compatibility->{'GWAS'}->{'traits'}}, $trait->[1]; $tool_compatibility->{'GWAS'}->{'compatible'} = 1; diff --git a/lib/SGN/Controller/Dataset.pm b/lib/SGN/Controller/Dataset.pm index 0dc62af819..da2387f3c4 100644 --- a/lib/SGN/Controller/Dataset.pm +++ b/lib/SGN/Controller/Dataset.pm @@ -80,6 +80,8 @@ sub dataset :Chained('/') Path('dataset') Args(1) { } $html .= ""; + # print STDERR Dumper $dataset->retrieve_traits(); + $c->stash->{dataset_name} = $dataset->name(); $c->stash->{dataset_id} = $dataset_id; $c->stash->{dataset_description} = $dataset->description; diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 3f014e6c46..c06745f369 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -110,7 +110,7 @@ sizes. Hover for details.">
-

+

Analyses using this dataset
From cd453fab6e4cb36b553a34abf265d068ccf0aeab Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 29 Jan 2025 13:34:18 -0500 Subject: [PATCH 12/19] Add correlation tool --- lib/CXGN/Dataset.pm | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 925876a613..431e2f37be 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1269,6 +1269,9 @@ sub calculate_tool_compatibility { }, 'Boxplotter' => { 'compatible' => 0 + }, + 'Correlation' => { + 'compatible' => 0 } }; @@ -1399,11 +1402,14 @@ sub calculate_tool_compatibility { my $num_accessions_phenotyped_for_this_trait = scalar(keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}})); if ($total_obs > 0) { # This trait was measured - if ($total_obs < 0) { + if ($total_obs < 30) { $tool_compatibility->{'Boxplotter'}->{'warn'}->{"There may not be enough observations (n=$total_obs) of ". $trait->[1]." to get meaningful data."} = ""; + $tool_compatibility->{'Correlation'}->{'warn'}->{"There may not be enough observations (n=$total_obs) of ". $trait->[1]." to get meaningful data."} = ""; } $tool_compatibility->{'Boxplotter'}->{'compatible'} = 1; push @{$tool_compatibility->{'Boxplotter'}->{'traits'}}, $trait->[1]; + $tool_compatibility->{'Correlation'}->{'compatible'} = 1; + push @{$tool_compatibility->{'Correlation'}->{'traits'}}, $trait->[1]; if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait if ($num_accessions_phenotyped_for_this_trait < 30) { From 64bb5190729c7295a54fb448740ae1d5ab9cff8e Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 29 Jan 2025 14:56:42 -0500 Subject: [PATCH 13/19] Add data summary to tool compatibility JSON & change details page to show summary box --- lib/CXGN/Dataset.pm | 22 ++++++++++++++++++++-- mason/dataset/index.mas | 38 ++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 431e2f37be..2ae345cb39 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -50,7 +50,6 @@ Lukas Mueller package CXGN::Dataset; -use List::Util 'sum'; use Moose; use Moose::Util::TypeConstraints; use Data::Dumper; @@ -1272,6 +1271,13 @@ sub calculate_tool_compatibility { }, 'Correlation' => { 'compatible' => 0 + }, + 'Data Summary' => { + 'markers per genotyping protocol' => [], + 'number of phenotyped accessions per trait' => [], + 'number of observations per trait' => [], + 'number of genotyped accessions per protocol' => [], + 'trait observations per location' => {} } }; @@ -1350,6 +1356,10 @@ sub calculate_tool_compatibility { foreach my $method (@{$genotyping_methods}){ my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; + + push @{$tool_compatibility->{"Data Summary"}->{"markers per genotyping protocol"}}, $method->[1]." : ".$num_markers; + push @{$tool_compatibility->{"Data Summary"}->{"number of genotyped accessions per protocol"}}, $method->[1]." : ".$num_accessions; + if ($num_markers > 1) { if ($num_accessions < 30) { $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough accessions (n=$num_accessions) genotyped for ".$method->[1].", ($num_markers markers) for strong results."} = ""; @@ -1365,7 +1375,7 @@ sub calculate_tool_compatibility { if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements if (scalar(@{$traits}) < 5) { - $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough trait measurements (only ".scalar(@{$traits})." traits) for strong phenotype clustering."} = ""; + $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough measured traits (only ".scalar(@{$traits}).") for strong phenotype clustering."} = ""; } if ($num_phenotyped_accessions < 30) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough phenotyped accessions (n=$num_phenotyped_accessions) for strong phenotype clustering."} = ""; @@ -1397,9 +1407,14 @@ sub calculate_tool_compatibility { foreach my $location (@{$locations}){ $total_obs += $obs_by_trait->{$trait->[0]}->{$location->[0]}; push @location_counts, $obs_by_trait->{$trait->[0]}->{$location->[0]}; + push @{$tool_compatibility->{"Data Summary"}->{"trait observations per location"}->{$location->[1]}}, $trait->[1]." : ".$obs_by_trait->{$trait->[0]}->{$location->[0]}; } my $num_accessions_phenotyped_for_this_trait = scalar(keys(%{$obs_by_trait->{$trait->[0]}->{'accessions'}})); + + push @{$tool_compatibility->{"Data Summary"}->{"number of phenotyped accessions per trait"}}, $trait->[1]." : ".$num_accessions_phenotyped_for_this_trait; + push @{$tool_compatibility->{"Data Summary"}->{"number of observations per trait"}}, $trait->[1]." : ".$total_obs; + if ($total_obs > 0) { # This trait was measured if ($total_obs < 30) { @@ -1422,6 +1437,9 @@ sub calculate_tool_compatibility { if ($num_accessions_phenotyped_for_this_trait < 30) { $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to get strong results."} = ""; } + if (scalar(grep {$_ < 30} @location_counts) > 1) { + $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough phenotype observations at all trial locations to get strong results."} = ""; + } $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index c06745f369..3659a9eb92 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -236,26 +236,35 @@ sizes. Hover for details."> function populate_tool_compatibility(json) { + var data_summary = {}; + for (key in json) { - if (json[key]['compatible'] == 0) { - var newkey = '' + key + ' '; - delete json[key]['compatible']; - json[newkey] = json[key]; - delete json[key]; + if (key == "Data Summary"){ + var newkey = '' + key + ''; + data_summary[newkey] = json[key]; + delete json[key]; } else { - if ('warn' in json[key]) { - var newkey = '' + key + ' '; - delete json[key]['warn']; + if (json[key]['compatible'] == 0) { + var newkey = '' + key + ' '; delete json[key]['compatible']; json[newkey] = json[key]; delete json[key]; } else { - var newkey = '' + key + ' '; - delete json[key]['compatible']; - json[newkey] = json[key]; - delete json[key]; + if ('warn' in json[key]) { + var newkey = '' + key + ' '; + delete json[key]['warn']; + delete json[key]['compatible']; + json[newkey] = json[key]; + delete json[key]; + } else { + var newkey = '' + key + ' '; + delete json[key]['compatible']; + json[newkey] = json[key]; + delete json[key]; + } } } + } function recursive_table_gen(obj) { @@ -274,7 +283,7 @@ sizes. Hover for details."> return table; } - return recursive_table_gen(json); + return recursive_table_gen(json) + "
" + recursive_table_gen(data_summary); } jQuery('#tool-compatibility-calc-button').click(function() { @@ -311,13 +320,14 @@ sizes. Hover for details."> jQuery('#predicted-tool-compatibility').text(response.tool_compatibility); } else { //console.log(JSON.parse(response.tool_compatibility)); - jQuery('#predicted-tool-compatibility').css('height', '300px'); + jQuery('#predicted-tool-compatibility').css('height', '400px'); jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); } } }, error: function(response) { alert("An error occurred retrieving tool compatibility"); + jQuery('#predicted-tool-compatibility').css('height', '100px'); jQuery('#predicted-tool-compatibility').text("Error retrieving tool compatibility."); } }); From eaacf179d216a901c6055b4eb46dcfcc4f846b3a Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Wed, 29 Jan 2025 15:11:16 -0500 Subject: [PATCH 14/19] Adjusting div size --- mason/dataset/index.mas | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mason/dataset/index.mas b/mason/dataset/index.mas index 3659a9eb92..7c36794414 100644 --- a/mason/dataset/index.mas +++ b/mason/dataset/index.mas @@ -296,7 +296,7 @@ sizes. Hover for details."> alert("An error occured calculating tool compatibility: \n" + response.error); jQuery('#predicted-tool-compatibility').text("Error calculating tool compatibility."); } else { - jQuery('#predicted-tool-compatibility').css('height', '300px'); + jQuery('#predicted-tool-compatibility').css('height', '400px'); jQuery('#predicted-tool-compatibility').html(populate_tool_compatibility(JSON.parse(response.tool_compatibility))); jQuery('#working_modal').modal("hide"); } From f65af9bad966e9cf189b3e143b4de5920e999618 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 31 Jan 2025 11:29:15 -0500 Subject: [PATCH 15/19] Improving stability and heritability criteria --- lib/CXGN/Dataset.pm | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 2ae345cb39..db6aeb3501 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -61,7 +61,7 @@ use CXGN::Phenotypes::PhenotypeMatrix; use CXGN::Genotype::Search; use CXGN::Genotype::Protocol; use CXGN::Phenotypes::HighDimensionalPhenotypesSearch; -use CXGN::Trait; +use CXGN::Trial; =head2 people_schema() @@ -1277,7 +1277,8 @@ sub calculate_tool_compatibility { 'number of phenotyped accessions per trait' => [], 'number of observations per trait' => [], 'number of genotyped accessions per protocol' => [], - 'trait observations per location' => {} + 'trait observations per location' => {}, + 'number of accessions per trial' => [] } }; @@ -1309,6 +1310,21 @@ sub calculate_tool_compatibility { my @accession_ids = map {$_->[0]} @{$accessions}; + my $accessions_in_common = {}; + foreach my $trial (@{$trials}) { + my $trial_obj = CXGN::Trial->new({ + bcs_schema => $self->schema, + trial_id => $trial->[0] + }); + my $current_accessions = $trial_obj->get_accessions(); + push @{$tool_compatibility->{"Data Summary"}->{'number of accessions per trial'}}, $trial->[1]." : ".scalar(@{$current_accessions}); + foreach my $accession (@{$current_accessions}) { + $accessions_in_common->{$accession->{"stock_id"}}++; + } + } + my $num_shared_accessions = scalar(grep {$accessions_in_common->{$_} > 1} keys(%{$accessions_in_common})); + push @{$tool_compatibility->{"Data Summary"}->{'number of accessions per trial'}}, "Shared across all trials : $num_shared_accessions"; + foreach my $method (@{$genotyping_methods}) { my $genotype_query = "SELECT COUNT(DISTINCT(stock_id, nd_protocol_id)) FROM stock JOIN nd_experiment_stock USING(stock_id) @@ -1426,10 +1442,13 @@ sub calculate_tool_compatibility { $tool_compatibility->{'Correlation'}->{'compatible'} = 1; push @{$tool_compatibility->{'Correlation'}->{'traits'}}, $trait->[1]; - if (scalar(@{$trial_designs}) > 0 && $num_accessions_phenotyped_for_this_trait > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait + if ($num_accessions_phenotyped_for_this_trait > 1 && scalar(@{$trials}) > 1){ #the presence of trial designs implies the presence of trials and differences in "environment" or treatment group. We also need to check that multiple accessions were measured for this trait if ($num_accessions_phenotyped_for_this_trait < 30) { $tool_compatibility->{'Heritability'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to get strong results."} = ""; } + if ($num_shared_accessions < 30) { + $tool_compatibility->{'Heritability'}->{'warn'}->{"There may not be enough accessions shared across all trials ($num_shared_accessions) to get strong results."} = ""; + } $tool_compatibility->{'Heritability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Heritability'}->{'traits'}}, $trait->[1]; } @@ -1437,9 +1456,12 @@ sub calculate_tool_compatibility { if ($num_accessions_phenotyped_for_this_trait < 30) { $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough accessions (n=$num_accessions_phenotyped_for_this_trait) phenotyped for ".$trait->[1]." to get strong results."} = ""; } - if (scalar(grep {$_ < 30} @location_counts) > 1) { + if (scalar(grep {$_ < 30} @location_counts) > 1) {#If any of the locations had too few pheno observations $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough phenotype observations at all trial locations to get strong results."} = ""; } + if ($total_obs < $num_accessions_phenotyped_for_this_trait) {# If total observations is lower than number of accessions, accessions were probably not replicated + $tool_compatibility->{'Stability'}->{'warn'}->{"There may not be enough replicated measurements of ".$trait->[1]."."} = ""; + } $tool_compatibility->{'Stability'}->{'compatible'} = 1; push @{$tool_compatibility->{'Stability'}->{'traits'}}, $trait->[1]; } From 9b3df359c59c6eab87f6602d4dffa0ec6f535594 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 31 Jan 2025 11:43:42 -0500 Subject: [PATCH 16/19] Adding check to make sure traits are quantitative only --- lib/CXGN/Dataset.pm | 25 ++++++++++++++----------- lib/SGN/Controller/Dataset.pm | 3 ++- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index db6aeb3501..e6cee89f4b 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1284,21 +1284,24 @@ sub calculate_tool_compatibility { my $trials = $self->retrieve_trials(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is trialID, second is trial name - my $traits = $self->retrieve_traits(); - # my $traits = []; - # foreach my $trait (@{$all_traits}) { #filter for quantitative traits - # my $trait_obj = CXGN::Trait->new({ - # bcs_schema => $self->schema, - # cvterm_id => $trait->[0] - # }); - # if ($trait_obj->categories ne ""){# ??? Not sure how to filter for categorical traits only - # push @{$traits}, $trait; - # } - # } + my $all_traits = $self->retrieve_traits(); + my $traits = []; + foreach my $trait (@{$all_traits}) { #filter for quantitative traits + my $trait_obj = CXGN::Trait->new({ + bcs_schema => $self->schema, + cvterm_id => $trait->[0] + }); + if ($trait_obj->categories eq ""){# ??? Not sure how to filter properly + push @{$traits}, $trait; + } + } my $trial_designs = $self->retrieve_trial_designs(); my $genotyping_methods = $self->retrieve_genotyping_protocols();# listref of listrefs. First index is # method ID, second is method name + # if (scalar(@{$genotyping_methods}) == 0) { + # push @{$genotyping_methods}, $c->config->{default_genotyping_protocol}; + # } my $locations = $self->retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is locationID, second is location name my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list diff --git a/lib/SGN/Controller/Dataset.pm b/lib/SGN/Controller/Dataset.pm index da2387f3c4..902d8414ae 100644 --- a/lib/SGN/Controller/Dataset.pm +++ b/lib/SGN/Controller/Dataset.pm @@ -80,7 +80,8 @@ sub dataset :Chained('/') Path('dataset') Args(1) { } $html .= ""; - # print STDERR Dumper $dataset->retrieve_traits(); + # print STDERR "=======================================================\n"; + # print STDERR "=======================================================\n"; $c->stash->{dataset_name} = $dataset->name(); $c->stash->{dataset_id} = $dataset_id; From b7e25769e1c2671c00bb55668f8be511dd0a9a05 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 31 Jan 2025 12:02:35 -0500 Subject: [PATCH 17/19] Add pheno and geno types to PCA tool compatibility check --- lib/CXGN/Dataset.pm | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index e6cee89f4b..730a6fb3cf 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1386,6 +1386,7 @@ sub calculate_tool_compatibility { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough accessions (n=$num_accessions) genotyped for ".$method->[1].", ($num_markers markers) for strong genotype clustering."} = ""; } $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; + $tool_compatibility->{'Population Structure'}->{'types'}->{'Genotype'} = 1; $tool_compatibility->{'Kinship & Inbreeding'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Genotype'} = ""; @@ -1395,30 +1396,24 @@ sub calculate_tool_compatibility { if ($num_phenotyped_accessions > 1 && scalar(@{$traits}) > 1) { #dont need to go trait by trait for clustering, since all traits are combined to eigenvectors. just need plenty of trait measurements if (scalar(@{$traits}) < 5) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough measured traits (only ".scalar(@{$traits}).") for strong phenotype clustering."} = ""; + $tool_compatibility->{'Population Structure'}->{'warn'}->{"You have only ".scalar(@{$traits})." measured traits, which will limit the number of principal components in a phenotype PCA."} = ""; } if ($num_phenotyped_accessions < 30) { $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough phenotyped accessions (n=$num_phenotyped_accessions) for strong phenotype clustering."} = ""; + $tool_compatibility->{'Population Structure'}->{'warn'}->{"You may not have enough phenotyped accessions (n=$num_phenotyped_accessions) for a strong phenotype PCA."} = ""; } $tool_compatibility->{'Clustering'}->{'compatible'} = 1; $tool_compatibility->{'Clustering'}->{'types'}->{'Phenotype'} = ""; - # foreach my $method (@{$genotyping_methods}){ - # my $num_markers = $genotype_counts->{$method->[0]}->{"num_markers"}; - # my $num_accessions = $genotype_counts->{$method->[0]}->{"num_accessions"}; - # if ($num_accessions > 1 && $num_markers > 30) { # for GEBV clustering, there needs to be enough accessions using the same geno method (not checked right here) and which have lots of trait measurements - # if ($num_accessions < 30) { - # $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotyped accessions for strong GEBV clustering."} = ""; - # } - # if ($num_markers < 1000) { - # $tool_compatibility->{'Clustering'}->{'warn'}->{"You may not have enough genotype markers for strong GEBV clustering."} = ""; - # } - # $tool_compatibility->{'Clustering'}->{'types'}->{'GEBV'} = ""; - # } - # } + $tool_compatibility->{'Population Structure'}->{'compatible'} = 1; + $tool_compatibility->{'Population Structure'}->{'types'}->{'Phenotype'} = ""; } if (exists $tool_compatibility->{'Clustering'}->{'types'}) { $tool_compatibility->{'Clustering'}->{'types'} = [keys(%{$tool_compatibility->{'Clustering'}->{'types'}})]; } + if (exists $tool_compatibility->{'Population Structure'}->{'types'}) { + $tool_compatibility->{'Population Structure'}->{'types'} = [keys(%{$tool_compatibility->{'Population Structure'}->{'types'}})]; + } foreach my $trait (@{$traits}){ # For each trait, we need to check for number of observations (plus locations for stability) my $total_obs = 0; From dfb764d1a77209e13c51fcac09afe695d2e30cff Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Fri, 31 Jan 2025 12:46:03 -0500 Subject: [PATCH 18/19] Adding default genotyping protocol enforcement --- lib/CXGN/Dataset.pm | 13 +++++++++---- lib/SGN/Controller/AJAX/Dataset.pm | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 730a6fb3cf..32dd2474f1 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1240,6 +1240,7 @@ Tools that use datasets: sub calculate_tool_compatibility { my $self = shift; + my $default_genotyping_protocol_name = shift; my $tool_compatibility = { 'GWAS' => { @@ -1299,16 +1300,20 @@ sub calculate_tool_compatibility { my $trial_designs = $self->retrieve_trial_designs(); my $genotyping_methods = $self->retrieve_genotyping_protocols();# listref of listrefs. First index is # method ID, second is method name - # if (scalar(@{$genotyping_methods}) == 0) { - # push @{$genotyping_methods}, $c->config->{default_genotyping_protocol}; - # } + if (scalar(@{$genotyping_methods}) == 0) { + my $geno_method_query = "SELECT nd_protocol_id FROM nd_protocol + WHERE name ilike ?"; + my $h = $self->schema->storage()->dbh()->prepare($geno_method_query); + $h->execute($default_genotyping_protocol_name); + my $default_genotyping_protocol_id = $h->fetchrow_array(); + push @{$genotyping_methods}, [$default_genotyping_protocol_id, $default_genotyping_protocol_name]; + } my $locations = $self->retrieve_locations(); # faster and easier than pulling it out of the phenotypes_ref # listref of listrefs, first index is locationID, second is location name my ($phenotypes, undef) = $self->retrieve_phenotypes_ref(); # Returns data as a listref with two hashrefs. First hashref is a list of all phenotypes in this dataset, which is an observational unit w/ a list # of trait observations. Each OU is a stock (plot, accession, etc). Second hashref has all unique traits in the phenotype list. # Relevant hash keys: observations, trial_id, trial_location_id, germplasm_stock_id, trait_id, trait_name, value my $accessions = $self->retrieve_accessions(); - # my $num_accessions = scalar(@{$accessions}); my $genotype_counts = {}; my @accession_ids = map {$_->[0]} @{$accessions}; diff --git a/lib/SGN/Controller/AJAX/Dataset.pm b/lib/SGN/Controller/AJAX/Dataset.pm index 8e2ab8ad2a..ebf0752ed3 100644 --- a/lib/SGN/Controller/AJAX/Dataset.pm +++ b/lib/SGN/Controller/AJAX/Dataset.pm @@ -62,7 +62,7 @@ sub store_dataset :Path('/ajax/dataset/save') Args(0) { } $dataset->store(); - $dataset->update_tool_compatibility(); + $dataset->update_tool_compatibility($c->config->{default_genotyping_protocol}); $c->stash->{rest} = { message => "Stored Dataset Successfully!" }; } @@ -438,7 +438,7 @@ sub calc_tool_compatibility :Path('/ajax/dataset/calc_tool_compatibility') Args( my $tool_compatibility; eval { - $dataset->update_tool_compatibility(); + $dataset->update_tool_compatibility($c->config->{default_genotyping_protocol}); $tool_compatibility = $dataset->tool_compatibility; }; if ($@){ From a3697a020bbcff6293a80d850f0c188c85229b48 Mon Sep 17 00:00:00 2001 From: ryan-preble Date: Mon, 3 Feb 2025 09:40:27 -0500 Subject: [PATCH 19/19] More POD --- lib/CXGN/Dataset.pm | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lib/CXGN/Dataset.pm b/lib/CXGN/Dataset.pm index 32dd2474f1..59d9a8bc51 100644 --- a/lib/CXGN/Dataset.pm +++ b/lib/CXGN/Dataset.pm @@ -1225,16 +1225,7 @@ Creates a hashref of analysis tools that this dataset can be used with. For exam Note that this function should only ever be called once for a dataset and have the data stored as part of the dataset definition JSON, since retrieving high dimensional phenotype and genotype data can be time consuming. -Tools that use datasets: - solGS - genotyping data and phenotyping data - PCA - genotyping data - Cluster Analysis - genotyping data and/or phenotyping data - Kinship & Inbreeding - genotyping data - Stability - trials and observed traits, trials must have multiple locations. Traits and accessions need to be represented in all locations, with replicates. Some missing data allowed. - Heritability - trial(s) w/ different designs and observed traits. - Mixed Models - observed traits, fairly flexible - Boxplots - observed traits - GWAS - at least one trial, a genotyping protocol, and observed traits +Takes one parameter, passed from Controller: the name of the default genotyping protocol to use as a fallback if none is found in the dataset. =cut @@ -1510,6 +1501,12 @@ sub calculate_tool_compatibility { return $tool_compatibility; } +=head2 update_tool_compatibility + +Recalculates and stores tool compatibility individually without updating other dataset characteristics. Used in a button in the dataset details page. + +=cut + sub update_tool_compatibility { my $self = shift;