From 933086fe8471fae42ff6b7b9152eeb1753fe2421 Mon Sep 17 00:00:00 2001 From: Maxim Kuleshov Date: Wed, 16 Feb 2022 22:53:34 -0500 Subject: [PATCH 1/4] Fix references --- .../RNA_seq_Analysis_Pipeline.ipynb | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb index 1915b169c..c7070d12f 100644 --- a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb +++ b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb @@ -479,7 +479,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Load datasets" + "# Loaded datasets" ] }, { @@ -630,7 +630,7 @@ "source": [ "%%appyter markdown\n", "{% if visualization_method.value == \"PCA\" %}\n", - "Principal Component Analysis (PCA) (Clark et al. 2011) is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into Principal Components (PCs), a set of linearly uncorrelated features which represent the most relevant sources of variance in the data, and subsequently visualized using a scatter plot.\n", + "Principal Component Analysis (PCA) [1] is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into Principal Components (PCs), a set of linearly uncorrelated features which represent the most relevant sources of variance in the data, and subsequently visualized using a scatter plot.\n", "{% endif %}" ] }, @@ -671,7 +671,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Clustergrammer (Fernandez et al. 2017) is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also allows to identify the genes which contribute to the clustering." + "Clustergrammer [2] is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also allows to identify the genes which contribute to the clustering." ] }, { @@ -735,7 +735,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods (Ritchie et al. 2015, Clark et al. 2014), which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation. " + "Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods [3, 4], which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation. 
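As a purely illustrative aside, the kind of table such a signature ends up in can be sketched with a toy per-gene comparison. The snippet below uses a synthetic matrix, made-up sample names, and a plain Welch t-test rather than the limma, characteristic direction, edgeR, or DESeq2 implementations this pipeline actually calls through rpy2, so treat it as a sketch only.

```python
# Minimal sketch of a differential expression signature (illustration only):
# per-gene log2 fold change plus a Welch t-test on a synthetic logCPM-like
# matrix with made-up sample names. The pipeline itself computes signatures
# with limma / characteristic direction / edgeR / DESeq2 instead.
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
expr = pd.DataFrame(
    rng.normal(5, 1, size=(100, 6)),
    index=[f"GENE{i}" for i in range(100)],
    columns=["ctrl_1", "ctrl_2", "ctrl_3", "pert_1", "pert_2", "pert_3"],
)
ctrl = expr[["ctrl_1", "ctrl_2", "ctrl_3"]]
pert = expr[["pert_1", "pert_2", "pert_3"]]

logfc = pert.mean(axis=1) - ctrl.mean(axis=1)  # difference of means on a log-like scale
tstat, pval = stats.ttest_ind(pert, ctrl, axis=1, equal_var=False)  # Welch t-test per gene

signature = pd.DataFrame({"logFC": logfc, "t": tstat, "P.Value": pval}).sort_values("t", ascending=False)
print(signature.head())
```

Ranking genes by the test statistic mirrors how the pipeline orders a signature before the downstream enrichment steps.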
" ] }, { @@ -796,7 +796,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Enrichment analysis is a statistical procedure used to identify biological terms which are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr (Kuleshov et al. 2016) is a web-based application which allows to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results." + "Enrichment analysis is a statistical procedure used to identify biological terms which are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr [5] is a web-based application which allows to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results." ] }, { @@ -921,7 +921,7 @@ " for gene_set_library in libraries:\n", " # Display results\n", " plot_name = \"{}_barchart_{}.png\".format(gene_set_library, label)\n", - " plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type)\n", + " plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type, plot_name=plot_name\n", " figure_counter, notebook_metadata = display_object(figure_counter, \"Enrichment Analysis Results for {} in {}. The figure contains interactive bar charts displaying the results of the pathway enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.\".format(label, gene_set_library), notebook_metadata, saved_filename=plot_name, istable=False)\n", "{% endif %}" ] @@ -1120,45 +1120,46 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "1. Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.\n", + "
\n", + "2. Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.\n", + "
\n", + "3. Ritchie, Matthew E., et al. \"limma powers differential expression analyses for RNA-sequencing and microarray studies.\" Nucleic acids research 43.7 (2015): e47-e47.\n", + "
\n", + "4. Clark, Neil R., et al. \"The characteristic direction: a geometrical approach to identify differentially expressed genes.\" BMC bioinformatics 15.1 (2014): 79.\n", + "
\n", + "5. Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.\n", + "
\n", + "\n", "Agarwal, Vikram, et al. \"Predicting effective microRNA target sites in mammalian mRNAs.\" elife 4 (2015): e05005.\n", "
\n", "Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.\n", "
\n", "Chou, Chih-Hung, et al. \"miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database.\" Nucleic acids research 44.D1 (2016): D239-D247.\n", "
\n", - "Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.\n", - "
\n", - "Clark, Neil R., et al. \"The characteristic direction: a geometrical approach to identify differentially expressed genes.\" BMC bioinformatics 15.1 (2014): 79.\n", - "
\n", "Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.\n", "
\n", "Croft, David, et al. \"The Reactome pathway knowledgebase.\" Nucleic acids research 42.D1 (2014): D472-D477.\n", "
\n", "Duan, Q., et al. \"L1000cds2: Lincs l1000 characteristic direction signatures search engine. NPJ Syst Biol Appl. 2016; 2: 16015.\" (2016).\n", "
\n", - "Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.\n", - "
\n", "Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.\n", "
\n", "Kelder, Thomas, et al. \"WikiPathways: building research communities on biological pathways.\" Nucleic acids research 40.D1 (2012): D1301-D1307.\n", "
\n", - "Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.\n", - "
\n", "Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.\n", "
\n", "Lachmann, Alexander, and Avi Ma'ayan. \"KEA: kinase enrichment analysis.\" Bioinformatics 25.5 (2009): 684-686.\n", "
\n", - "Ritchie, Matthew E., et al. \"limma powers differential expression analyses for RNA-sequencing and microarray studies.\" Nucleic acids research 43.7 (2015): e47-e47.\n", - "
\n", "Wang, Zichen, et al. \"L1000FWD: fireworks visualization of drug-induced transcriptomic signatures.\" Bioinformatics 34.12 (2018): 2150-2152." ] } ], "metadata": { "kernelspec": { - "display_name": "py39", + "display_name": "Python 3", "language": "python", - "name": "py39" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1170,7 +1171,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.9.2" } }, "nbformat": 4, From d196bc6813c1c1688e6f12cd22783f699bd6cc14 Mon Sep 17 00:00:00 2001 From: Maxim Kuleshov Date: Thu, 17 Feb 2022 21:21:36 -0500 Subject: [PATCH 2/4] Generate references from the content --- .../RNA_seq_Analysis_Pipeline.ipynb | 63 +++++++------------ 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb index c7070d12f..b237220f6 100644 --- a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb +++ b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb @@ -580,7 +580,7 @@ "outputs": [], "source": [ "dataset['dataset_metadata'] = meta_df\n", - "\n", + "ref_counter = ['Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.']\n", "table_counter, notebook_metadata = display_object(table_counter, \"Raw RNA-seq expression data. The table displays the first 5 rows of the quantified RNA-seq expression dataset. Rows represent genes, columns represent samples, and values show the number of mapped reads.\", notebook_metadata, \"raw_exp.csv\", dataset[current_dataset].head(), istable=True)\n", "table_counter, notebook_metadata = display_object(table_counter, \"Metadata. The table displays the metadata associated with the samples in the RNA-seq dataset. Rows represent RNA-seq samples, columns represent metadata categories.\", notebook_metadata, \"metadata.csv\", dataset['dataset_metadata'].head(), istable=True)\n", "table_counter, notebook_metadata = display_object(table_counter, \"Sample size for each class. The table displays the number of samples in each class.\", notebook_metadata, \"num_of_samples_in_class.csv\", dataset['dataset_metadata'].reset_index().groupby(meta_class_column_name).count(), istable=True)" @@ -682,7 +682,7 @@ "source": [ "# Run analysis\n", "results['clustergrammer'] = run_clustergrammer(dataset=dataset, meta_class_column_name=meta_class_column_name, nr_genes=clustering_topk, normalization=normalization, z_score=True, gene_list=gene_list_for_clustergrammer)\n", - "\n", + "ref_counter.append('Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.')\n", "# Display results\n", "plot_clustergrammar(results['clustergrammer'])\n", "caption = \"Clustered heatmap plot. The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. 
The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.\"\n", @@ -745,7 +745,7 @@ "outputs": [], "source": [ "signatures = get_signatures(classes, dataset, normalization, diff_gex_method, meta_class_column_name, filter_genes)\n", - "\n", + "ref_counter.extend(['Ritchie, Matthew E., et al. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic acids research 43.7 (2015): e47-e47.', 'Clark, Neil R., et al. The characteristic direction: a geometrical approach to identify differentially expressed genes. BMC bioinformatics 15.1 (2014): 79.'])\n", "for label, signature in signatures.items():\n", " case_label = label.split(\" vs. \")[1]\n", " table_counter, notebook_metadata = display_object(table_counter, \"Differentially expressed genes between {} using {}. The figure displays a browsable table containing the gene expression signature generated from a differential gene expression analysis. Every row of the table represents a gene; the columns display the estimated measures of differential expression.\".format(label, diff_gex_method), notebook_metadata, \"DEG_results_{}.csv\".format(label), signature, istable=True)\n", @@ -806,6 +806,7 @@ "outputs": [], "source": [ "# Loop through signatures\n", + "ref_counter.append('Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.')\n", "results = {}\n", "results['enrichr']= {}\n", "if diff_gex_method == \"characteristic_direction\":\n", @@ -850,7 +851,7 @@ "%%appyter markdown\n", "{% if \"Gene Ontology\" in enrichr_libraries.value %}\n", "# GO Enrichment Analysis\n", - "Gene Ontology (GO) (Ashburner et al. 2000) is a major bioinformatics initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify the biological processes, molecular functions and cellular components which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", + "Gene Ontology (GO) [{{ref_counter|length}}] is a major bioinformatics initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify the biological processes, molecular functions and cellular components which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", "{% endif %}" ] }, @@ -862,7 +863,7 @@ "source": [ "%%appyter code_exec\n", "{% if \"Gene Ontology\" in enrichr_libraries.value %}\n", - "\n", + "ref_counter.append('Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. 
Nature genetics, 25, 25.')\n", "results['go_enrichment'] = {}\n", "for label, signature in signatures.items():\n", " # Run analysis\n", @@ -891,7 +892,7 @@ "%%appyter markdown\n", "{% if \"Pathway\" in enrichr_libraries.value %}\n", "# Pathway Enrichment Analysis\n", - "Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG (Kanehisa et al. 2000), Reactome (Croft et al. 2014) and WikiPathways (Kelder et al. 2012) contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", + "Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG [{{ref_counter|length}}], Reactome [{{ref_counter|length + 1}}] and WikiPathways [{{ref_counter|length + 2}}] contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", "{% endif %}" ] }, @@ -904,6 +905,7 @@ "%%appyter code_exec\n", "{% if \"Pathway\" in enrichr_libraries.value %}\n", "# Initialize results\n", + "ref_counter.extend(['Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.', 'Croft, David, et al. The Reactome pathway knowledgebase. Nucleic acids research 42.D1 (2014): D472-D477.', 'Kelder, Thomas, et al. WikiPathways: building research communities on biological pathways. Nucleic acids research 40.D1 (2012): D1301-D1307.'])\n", "results['pathway_enrichment'] = {}\n", "\n", "# Loop through results\n", @@ -935,7 +937,7 @@ "%%appyter markdown\n", "{% if \"Transcription Factor\" in enrichr_libraries.value %}\n", "# Transcription Factor Enrichment Analysis\n", - "Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA (Lachmann et al. 2010) and ENCODE (Consortium, 2014) contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", + "Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA [{{ref_counter|length}}] and ENCODE [{{ref_counter|length + 1}}] contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", "{% endif %}" ] }, @@ -948,6 +950,7 @@ "%%appyter code_exec\n", "{% if \"Transcription Factor\" in enrichr_libraries.value %}\n", "# Initialize results\n", + "ref_counter.extend(['Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma\\'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. 
Bioinformatics, 26, 2438-2444.', 'ENCODE Consortium (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.'])\n", "results['tf_enrichment'] = {}\n", "\n", "# Loop through results\n", @@ -968,7 +971,7 @@ "%%appyter markdown\n", "{% if \"Kinase\" in enrichr_libraries.value %}\n", "# Kinase Enrichment Analysis\n", - "Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA (Lachmann et al. 2009) contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", + "Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA [{{ref_counter|length}}] contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", "{% endif %}" ] }, @@ -981,6 +984,7 @@ "%%appyter code_exec\n", "{% if \"Kinase\" in enrichr_libraries.value %}\n", "# Initialize results\n", + "ref_counter.append('Lachmann, Alexander, and Avi Ma\\'ayan. \"KEA: kinase enrichment analysis.\" Bioinformatics 25.5 (2009): 684-686. ')\n", "results['kinase_enrichment'] = {}\n", "\n", "# Loop through results\n", @@ -1002,7 +1006,7 @@ "%%appyter markdown\n", "{% if \"miRNA\" in enrichr_libraries.value %}\n", "# miRNA Enrichment Analysis\n", - "microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan (Agarwal et al. 2015) and MiRTarBase (Chou et al. 2016) contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", + "microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan [{{ref_counter|length}}] and MiRTarBase [{{ref_counter|length + 1}}] contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n", "{% endif %}" ] }, @@ -1014,6 +1018,7 @@ "source": [ "%%appyter code_exec\n", "{% if \"miRNA\" in enrichr_libraries.value %}\n", + "ref_counter.extend(['Agarwal, Vikram, et al. Predicting effective microRNA target sites in mammalian mRNAs. elife 4 (2015): e05005.', 'Chou, Chih-Hung, et al. miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database. Nucleic acids research 44.D1 (2016): D239-D247.'])\n", "\n", "results['mirna_enrichment'] = {}\n", "\n", @@ -1036,7 +1041,7 @@ "%%appyter markdown\n", "{% if small_molecule_method.value == \"L1000CDS2\" %}\n", "# L1000CDS2 Query\n", - "L1000CDS2 (Duan et al. 2016) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project. 
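A gene-set query against the service can be sketched roughly as follows; the endpoint, payload keys, and example genes are assumptions taken from the public L1000CDS2 documentation rather than from this notebook's code.

```python
# Hedged sketch of an L1000CDS2 gene-set query. The endpoint, payload fields
# (upGenes/dnGenes, searchMethod "geneSet"), and the "topMeta" result key
# follow the public API docs and are assumptions here; the gene lists are
# made up.
import json
import requests

payload = {
    "data": {
        "upGenes": ["KIAA0907", "KDM5A", "CDC25A", "EGR1", "GADD45B"],
        "dnGenes": ["SCCPDH", "KIF20A", "FZD7", "USP22", "PIP4K2B"],
    },
    "config": {"aggravate": False, "searchMethod": "geneSet", "share": False,
               "combination": False, "db-version": "latest"},
}
response = requests.post(
    "https://maayanlab.cloud/L1000CDS2/query",
    data=json.dumps(payload),
    headers={"content-type": "application/json"},
)
result = response.json()
for hit in result.get("topMeta", [])[:5]:  # top matching LINCS signatures
    print(hit.get("pert_desc"), hit.get("score"))
```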
It is commonly used to identify small molecules which mimic or reverse the effects of a gene expression signature generated from a differential gene expression analysis.\n", + "L1000CDS2 [{{ref_counter|length}}] is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project. It is commonly used to identify small molecules which mimic or reverse the effects of a gene expression signature generated from a differential gene expression analysis.\n", "{% endif %}" ] }, @@ -1049,6 +1054,7 @@ "%%appyter code_exec\n", "{% if small_molecule_method.value == \"L1000CDS2\" %}\n", "# Initialize results\n", + "ref_counter.append('Duan, Q., et al. L1000CDS2: Lincs l1000 characteristic direction signatures search engine. NPJ Syst Biol Appl. 2016; 2: 16015. (2016).')\n", "results['l1000cds2'] = {}\n", "\n", "# Loop through signatures\n", @@ -1071,7 +1077,7 @@ "%%appyter markdown\n", "{% if small_molecule_method.value == \"L1000FWD\" %}\n", "# L1000FWD Query\n", - "L1000FWD (Wang et al. 2018) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project.\n", + "L1000FWD [{{ref_counter|length}}] is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project.\n", "{% endif %}" ] }, @@ -1084,6 +1090,7 @@ "%%appyter code_exec\n", "{% if small_molecule_method.value == \"L1000FWD\" %}\n", "# Initialize results\n", + "ref_counter.append('Wang, Zichen, et al. L1000FWD: fireworks visualization of drug-induced transcriptomic signatures. Bioinformatics 34.12 (2018): 2150-2152.')\n", "results['l1000fwd'] = {}\n", "\n", "# Loop through signatures\n", @@ -1120,38 +1127,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "1. Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.\n", - "
\n", - "2. Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.\n", - "
\n", - "3. Ritchie, Matthew E., et al. \"limma powers differential expression analyses for RNA-sequencing and microarray studies.\" Nucleic acids research 43.7 (2015): e47-e47.\n", - "
\n", - "4. Clark, Neil R., et al. \"The characteristic direction: a geometrical approach to identify differentially expressed genes.\" BMC bioinformatics 15.1 (2014): 79.\n", - "
\n", - "5. Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.\n", - "
\n", + "%%appyter markdown\n", "\n", - "Agarwal, Vikram, et al. \"Predicting effective microRNA target sites in mammalian mRNAs.\" elife 4 (2015): e05005.\n", - "
\n", - "Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.\n", - "
\n", - "Chou, Chih-Hung, et al. \"miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database.\" Nucleic acids research 44.D1 (2016): D239-D247.\n", - "
\n", - "Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.\n", - "
\n", - "Croft, David, et al. \"The Reactome pathway knowledgebase.\" Nucleic acids research 42.D1 (2014): D472-D477.\n", - "
\n", - "Duan, Q., et al. \"L1000cds2: Lincs l1000 characteristic direction signatures search engine. NPJ Syst Biol Appl. 2016; 2: 16015.\" (2016).\n", - "
\n", - "Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.\n", - "
\n", - "Kelder, Thomas, et al. \"WikiPathways: building research communities on biological pathways.\" Nucleic acids research 40.D1 (2012): D1301-D1307.\n", - "
\n", - "Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.\n", - "
\n", - "Lachmann, Alexander, and Avi Ma'ayan. \"KEA: kinase enrichment analysis.\" Bioinformatics 25.5 (2009): 684-686.\n", + "{% for result in results %}\n", + "{{ loop.index }}. {{ result }}\n", "
\n", - "Wang, Zichen, et al. \"L1000FWD: fireworks visualization of drug-induced transcriptomic signatures.\" Bioinformatics 34.12 (2018): 2150-2152." + "{% endfor %}" ] } ], From 89a18b23515571a2118799bea07c72b12e6783f6 Mon Sep 17 00:00:00 2001 From: Maxim Kuleshov Date: Thu, 24 Mar 2022 23:39:37 -0400 Subject: [PATCH 3/4] Clean up --- .../RNA_seq_Analysis_Pipeline.ipynb | 44 +- appyters/Bulk_RNA_seq/utils.py | 537 ++++++++++-------- 2 files changed, 326 insertions(+), 255 deletions(-) diff --git a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb index b237220f6..288798bdd 100644 --- a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb +++ b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb @@ -41,6 +41,7 @@ "import base64 \n", "import json\n", "from pandas.api.types import CategoricalDtype\n", + "from collections import defaultdict\n", "\n", "# Visualization\n", "import plotly\n", @@ -327,7 +328,7 @@ "\n", "{% set enrichr_libraries = MultiChoiceField(\n", " name='enrichr_libraries',\n", - " label='Enrichr Libraries (upto 2)',\n", + " label='Enrichr Libraries',\n", " descriptions='Enrichr libraries to be visualized. Select one or two libraries',\n", " choices=['Gene Ontology',\n", " 'Pathway',\n", @@ -423,10 +424,7 @@ "warnings.filterwarnings('ignore')\n", "random.seed(0)\n", "pandas2ri.activate()\n", - "notebook_metadata = dict()\n", - "notebook_metadata[\"tables\"] = dict()\n", - "notebook_metadata[\"figures\"] = dict()\n", - "notebook_metadata[\"input_parameters\"] = dict()\n", + "notebook_metadata = defaultdict(dict)\n", "if interactive_plot == True:\n", " plot_type='interactive'\n", "else:\n", @@ -444,35 +442,14 @@ "source": [ "%%appyter code_exec\n", "\n", + "ip = dict()\n", "\n", - "notebook_metadata[\"input_parameters\"][\"rnaseq_data_filename\"] = rnaseq_data_filename\n", - "notebook_metadata[\"input_parameters\"][\"meta_data_filename\"] = meta_data_filename\n", - "notebook_metadata[\"input_parameters\"][\"meta_class_column_name\"] = meta_class_column_name\n", - "notebook_metadata[\"input_parameters\"][\"control_name\"] = control_name\n", - "\n", - "notebook_metadata[\"input_parameters\"][\"filter_genes\"] = filter_genes\n", - "notebook_metadata[\"input_parameters\"][\"low_expression_threshold\"] = low_expression_threshold\n", - "notebook_metadata[\"input_parameters\"][\"logCPM_normalization\"] = {{logCPM_normalization.value}}\n", - "notebook_metadata[\"input_parameters\"][\"log_normalization\"] = {{log_normalization.value}}\n", - "notebook_metadata[\"input_parameters\"][\"z_normalization\"] = {{z_normalization.value}}\n", - "notebook_metadata[\"input_parameters\"][\"q_normalization\"] = {{q_normalization.value}}\n", - "\n", - "notebook_metadata[\"input_parameters\"][\"visualization_method\"] = \"{{visualization_method.value}}\"\n", - "notebook_metadata[\"input_parameters\"][\"nr_genes\"] = nr_genes\n", - "notebook_metadata[\"input_parameters\"][\"gene_list_for_clustergrammer\"] = gene_list_for_clustergrammer\n", - "notebook_metadata[\"input_parameters\"][\"clustering_topk\"] = clustering_topk\n", - "\n", - "notebook_metadata[\"input_parameters\"][\"diff_gex_method\"] = diff_gex_method\n", - "notebook_metadata[\"input_parameters\"][\"diff_gex_plot_method\"] = diff_gex_plot_method\n", - "notebook_metadata[\"input_parameters\"][\"pvalue_threshold\"] = pvalue_threshold\n", - "notebook_metadata[\"input_parameters\"][\"logfc_threshold\"] = logfc_threshold\n", - "notebook_metadata[\"input_parameters\"][\"gene_topk\"] = 
gene_topk\n", - "notebook_metadata[\"input_parameters\"][\"enrichr_libraries\"] = enrichr_libraries\n", - "notebook_metadata[\"input_parameters\"][\"nr_genesets\"] = nr_genesets\n", - "notebook_metadata[\"input_parameters\"][\"small_molecule_method\"] = small_molecule_method\n", - "notebook_metadata[\"input_parameters\"][\"l1000_topk\"] = l1000_topk\n", - "notebook_metadata[\"input_parameters\"][\"nr_drugs\"] = nr_drugs\n", - "\n" + "ip |= dict(zip([\"rnaseq_data_filename\", \"meta_data_filename\", \"meta_class_column_name\", \"control_name\"], [rnaseq_data_filename, meta_data_filename, meta_class_column_name, control_name]))\n", + "ip |= dict(zip([\"filter_genes\", \"low_expression_threshold\", \"logCPM_normalization\", \"log_normalization\", \"z_normalization\", \"q_normalization\"], [filter_genes, low_expression_threshold, {{logCPM_normalization.value}}, {{log_normalization.value}}, {{z_normalization.value}}, {{q_normalization.value}}]))\n", + "ip |= dict(zip([\"visualization_method\", \"nr_genes\", \"gene_list_for_clustergrammer\", \"clustering_topk\"], [\"{{visualization_method.value}}\", nr_genes, gene_list_for_clustergrammer, clustering_topk]))\n", + "ip |= dict(zip([\"diff_gex_method\", \"diff_gex_plot_method\", \"pvalue_threshold\", \"logfc_threshold\", \"gene_topk\", \"enrichr_libraries\", \"nr_genesets\", \"small_molecule_method\", \"l1000_topk\", \"nr_drugs\"], [diff_gex_method, diff_gex_plot_method, pvalue_threshold, logfc_threshold, gene_topk, enrichr_libraries, nr_genesets, small_molecule_method, l1000_topk, nr_drugs]))\n", + "\n", + "notebook_metadata[\"input_parameters\"] = ip\n" ] }, { @@ -1112,6 +1089,7 @@ "outputs": [], "source": [ "# save metadata of the notebook as json\n", + "notebook_metadata['references'] = ref_counter\n", "with open(\"notebook_metadata.json\", \"w\") as fw:\n", " json.dump(notebook_metadata, fw)" ] diff --git a/appyters/Bulk_RNA_seq/utils.py b/appyters/Bulk_RNA_seq/utils.py index fdc40d8e5..b6c7ee83e 100644 --- a/appyters/Bulk_RNA_seq/utils.py +++ b/appyters/Bulk_RNA_seq/utils.py @@ -1,27 +1,26 @@ -from rpy2 import robjects -from rpy2.robjects import r, pandas2ri -# Basic libraries -import pandas as pd -import os -import urllib3 -import requests, json -import random -import time +import json +import matplotlib.pyplot as plt import numpy as np -import warnings +import os +import pandas as pd -# Visualization import plotly -from plotly import tools import plotly.express as px import plotly.graph_objs as go -import matplotlib.pyplot as plt; plt.rcdefaults() +import random +import requests +import time +import urllib3 +import warnings +from plotly import tools +from rpy2 import robjects +from rpy2.robjects import r, pandas2ri + from matplotlib import rcParams import IPython from IPython.display import HTML, display, Markdown, IFrame - # Data analysis from itertools import combinations import scipy.spatial.distance as dist @@ -33,76 +32,82 @@ from sklearn.manifold import TSNE from plotly.offline import init_notebook_mode -init_notebook_mode(connected = False) + +plt.rcdefaults() +init_notebook_mode(connected=False) + def check_files(fname): if fname == "": raise IOError - if fname.endswith(".txt") == False and fname.endswith(".csv") ==False and fname.endswith(".tsv")==False: + if fname.endswith(".txt") == False and fname.endswith(".csv") == False and fname.endswith(".tsv") == False: raise IOError + + def check_df(df, col): if col not in df.columns: raise IOError - -def CPM(data): + +def cpm(data): with warnings.catch_warnings(): 
warnings.simplefilter("ignore") - data = (data/data.sum())*10**6 + data = (data / data.sum()) * 10 ** 6 data = data.fillna(0) - + return data -def logCPM(data): + +def log_cpm(data): with warnings.catch_warnings(): warnings.simplefilter("ignore") - data = (data/data.sum())*10**6 + data = (data / data.sum()) * 10 ** 6 data = data.fillna(0) - data = np.log2(data+1) + data = np.log2(data + 1) # Return return data -def log(data): + +def log(data): with warnings.catch_warnings(): warnings.simplefilter("ignore") data = data.fillna(0) - data = np.log2(data+1) + data = np.log2(data + 1) return data -def qnormalization(data): - - X_quantile_norm = quantile_normalize(data) - return X_quantile_norm -def normalize(dataset, current_dataset, logCPM_normalization, log_normalization, z_normalization, q_normalization): + +def normalize(dataset, current_dataset, log_cpm_normalization, log_normalization, z_normalization, q_normalization): normalization = current_dataset - if logCPM_normalization == True: + if log_cpm_normalization: data = dataset[normalization] normalization += '+logCPM' - dataset[normalization] = logCPM(data) - - if log_normalization == True: + dataset[normalization] = log_cpm(data) + + if log_normalization: data = dataset[normalization] normalization += '+log' dataset[normalization] = log(data) - - if z_normalization == True: + + if z_normalization: data = dataset[normalization] - normalization += '+z_norm' + normalization += '+z_norm' dataset[normalization] = data.T.apply(ss.zscore, axis=0).T.dropna() - if q_normalization == True: + if q_normalization: data = dataset[normalization] normalization += '+q_norm' - dataset[normalization] = qnormalization(data) + dataset[normalization] = quantile_normalize(data) return dataset, normalization -def create_download_link(df, title = "Download CSV file: {}", filename = "data.csv"): + +def create_download_link(df, title="Download CSV file: {}", filename="data.csv"): df.to_csv(filename) html = "{}".format(filename, title.format(filename)) return HTML(html) + def display_link(url): raw_html = '%s' % (url, url) return display(HTML(raw_html)) @@ -110,20 +115,19 @@ def display_link(url): def display_object(counter, caption, notebook_metadata=None, saved_filename=None, df=None, istable=True, ishtml=False): object_info = dict() - + if df is not None: - if ishtml == True: + if ishtml: display(HTML(df.to_html(render_links=True, escape=False))) else: display(df) df.to_csv(saved_filename) - - if istable == True: + + if istable: if notebook_metadata is not None: object_info["description"] = caption object_info["file_name"] = saved_filename - notebook_metadata["tables"][str(counter)] = object_info - + notebook_metadata["tables"][str(counter)] = object_info display(Markdown("*Table {}. {}*".format(counter, caption))) else: @@ -131,38 +135,42 @@ def display_object(counter, caption, notebook_metadata=None, saved_filename=None object_info["description"] = caption object_info["file_name"] = saved_filename notebook_metadata["figures"][str(counter)] = object_info - + display(Markdown("*Figure {}. 
{}*".format(counter, caption))) - + counter += 1 if notebook_metadata is not None: return counter, notebook_metadata else: return counter - -def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_genes=2500, filter_samples=True, plot_type='interactive'): + + +def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_genes=2500, filter_samples=True, + plot_type='interactive'): # Get data before_norm = normalization.replace("+z_norm", "").replace("+q_norm", "") top_genes = dataset[before_norm].var(axis=1).sort_values(ascending=False) - + expression_dataframe = dataset[normalization] - + # Filter columns if filter_samples and dataset.get('signature_metadata'): - selected_samples = [sample for samples in list(dataset['signature_metadata'].values())[0].values() for sample in samples] + selected_samples = [sample for samples in list(dataset['signature_metadata'].values())[0].values() for sample in + samples] expression_dataframe = expression_dataframe[selected_samples] # Filter rows expression_dataframe = expression_dataframe.loc[top_genes.index[:nr_genes]] - result=None + result = None axis = None if method == 'PCA': # Run PCA - pca=PCA(n_components=3) + pca = PCA(n_components=3) result = pca.fit(expression_dataframe) # Get Variance - axis = ['PC'+str((i+1))+'('+str(round(e*100, 1))+'% var. explained)' for i, e in enumerate(pca.explained_variance_ratio_)] + axis = ['PC' + str((i + 1)) + '(' + str(round(e * 100, 1)) + '% var. explained)' for i, e in + enumerate(pca.explained_variance_ratio_)] elif method == "UMAP": u = umap.UMAP(n_components=3) result = u.fit_transform(expression_dataframe.T) @@ -184,14 +192,15 @@ def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_ge col.append(B_label) else: col.append('Other') - dataset['dataset_metadata']['Sample Group'] = col - - # Return - results = {'method': method, 'result': result, 'axis': axis, - 'dataset_metadata': dataset['dataset_metadata'][dataset['dataset_metadata'].index == expression_dataframe.columns], - 'nr_genes': nr_genes, - 'normalization': normalization, 'signature_metadata': dataset.get('signature_metadata'), - 'plot_type': plot_type} + dataset['dataset_metadata']['Sample Group'] = col + + # Return + results = {'method': method, 'result': result, 'axis': axis, + 'dataset_metadata': dataset['dataset_metadata'][ + dataset['dataset_metadata'].index == expression_dataframe.columns], + 'nr_genes': nr_genes, + 'normalization': normalization, 'signature_metadata': dataset.get('signature_metadata'), + 'plot_type': plot_type} return results @@ -199,7 +208,7 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo pca_transformed = pca_results['result'] axis = pca_results['axis'] meta_df = pca_results['dataset_metadata'] - + if pca_results["method"] == "PCA": meta_df['x'] = pca_transformed.components_[0] meta_df['y'] = pca_transformed.components_[1] @@ -208,7 +217,7 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo meta_df['x'] = [x[0] for x in pca_transformed] meta_df['y'] = [x[1] for x in pca_transformed] meta_df['z'] = [x[2] for x in pca_transformed] - + fig = px.scatter_3d(meta_df, x='x', y='y', z='z', color=meta_class_column_name) if plot_type == "interactive": fig.show() @@ -223,35 +232,36 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo object_info = dict() object_info["description"] = caption object_info["file_name"] = plot_name - + notebook_metadata["figures"][str(counter)] = 
object_info - + counter += 1 return counter, notebook_metadata -def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM', z_score=True, nr_genes=1500, metadata_cols=None, filter_samples=True,gene_list=None): + +def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM', z_score=True, nr_genes=1500, + metadata_cols=None, filter_samples=True, gene_list=None): # Subset the expression DataFrame using top 800 genes with largest variance data = dataset[normalization] variances = np.var(data, axis=1) srt_idx = variances.argsort()[::-1] - if gene_list == None or len(gene_list) == 0: + if gene_list is None or len(gene_list) == 0: expr_df_sub = data.iloc[srt_idx].iloc[:nr_genes] else: gene_list = gene_list.split("\n") common_gene_list = list(set(gene_list).intersection(set(data.index))) expr_df_sub = data.loc[common_gene_list, :] assert len(expr_df_sub.index) > 0 - + # prettify sample names - sample_names = ['::'.join([y, x]) for x,y in - zip(dataset["dataset_metadata"][meta_class_column_name], expr_df_sub.columns)] + sample_names = ['::'.join([y, x]) for x, y in + zip(dataset["dataset_metadata"][meta_class_column_name], expr_df_sub.columns)] expr_df_sub.columns = sample_names - expr_df_sub.index = ["Gene: "+str(x) for x in expr_df_sub.index] - sample_name = ["Sample: "+x for x in sample_names] + expr_df_sub.index = ["Gene: " + str(x) for x in expr_df_sub.index] + sample_name = ["Sample: " + x for x in sample_names] expr_df_sub.columns = sample_name - - treatment_type = ["Class: "+ x.split("::")[1] for x in sample_names] + treatment_type = ["Class: " + x.split("::")[1] for x in sample_names] new_series = pd.DataFrame(treatment_type).T new_series.columns = expr_df_sub.columns expr_df_sub = pd.concat([new_series, expr_df_sub], axis=0) @@ -265,7 +275,8 @@ def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM', clustergrammer_url = 'https://maayanlab.cloud/clustergrammer/matrix_upload/' r = requests.post(clustergrammer_url, files={'file': open(expr_df_sub_file, 'rb')}).text return r - + + ############################################# ########## 2. 
Plot ############################################# @@ -277,27 +288,41 @@ def plot_clustergrammar(clustergrammer_url): display(IPython.display.IFrame(clustergrammer_url, width="1000", height="1000")) -def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text', color='black', colorscale='Blues', size=8, showscale=False, symmetric_x=False, symmetric_y=False, pad=0.5, hline=False, vline=False, return_trace=False, labels=False, plot_type='interactive', de_type='ma', plot_name="2d_plot.png"): - range_x = [-max(abs(x))-pad, max(abs(x))+pad]if symmetric_x else None - range_y = [-max(abs(y))-pad, max(abs(y))+pad]if symmetric_y else None - trace = go.Scattergl(x=x, y=y, mode='markers', text=text, hoverinfo=hoverinfo, marker={'color': color, 'colorscale': colorscale, 'showscale': showscale, 'size': size}) +def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text', color='black', colorscale='Blues', + size=8, showscale=False, symmetric_x=False, symmetric_y=False, pad=0.5, hline=False, vline=False, + return_trace=False, labels=False, plot_type='interactive', de_type='ma', plot_name="2d_plot.png"): + range_x = [-max(abs(x)) - pad, max(abs(x)) + pad] if symmetric_x else None + range_y = [-max(abs(y)) - pad, max(abs(y)) + pad] if symmetric_y else None + trace = go.Scattergl(x=x, y=y, mode='markers', text=text, hoverinfo=hoverinfo, + marker={'color': color, 'colorscale': colorscale, 'showscale': showscale, 'size': size}) if return_trace: return trace else: if de_type == 'ma': annotations = [ - {'x': 1, 'y': 0.1, 'text':'Down-regulated in '+labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', 'yanchor': 'top'}, - {'x': 1, 'y': 0.9, 'text':'Up-regulated in '+labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', 'yanchor': 'bottom'} + {'x': 1, 'y': 0.1, + 'text': 'Down-regulated in ' + labels[ + -1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', + 'yanchor': 'top'}, + {'x': 1, 'y': 0.9, + 'text': 'Up-regulated in ' + labels[ + -1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', + 'yanchor': 'bottom'} ] if labels else [] elif de_type == 'volcano': annotations = [ - {'x': 0.25, 'y': 1.07, 'text':'Down-regulated in '+labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}, - {'x': 0.75, 'y': 1.07, 'text':'Up-regulated in '+labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'} + {'x': 0.25, 'y': 1.07, + 'text': 'Down-regulated in ' + labels[ + -1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}, + {'x': 0.75, 'y': 1.07, + 'text': 'Up-regulated in ' + labels[ + -1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'} ] if labels else [] - layout = go.Layout(title=title, xaxis={'title': xlab, 'range': range_x}, yaxis={'title': ylab, 'range': range_y}, hovermode='closest', annotations=annotations) + layout = go.Layout(title=title, xaxis={'title': xlab, 'range': range_x}, + yaxis={'title': ylab, 'range': range_y}, hovermode='closest', annotations=annotations) fig = go.Figure(data=[trace], layout=layout) - - if plot_type=='interactive': + + if plot_type == 'interactive': fig.show() else: fig.show(renderer="png") @@ -389,79 +414,93 @@ def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text', } ''') + def get_signatures(classes, dataset, normalization, method, meta_class_column_name, 
filter_genes): - tmp_normalization = normalization.replace("+z_norm+q_norm","").replace("+z_norm","") + tmp_normalization = normalization.replace("+z_norm+q_norm", "").replace("+z_norm", "") raw_expr_df = dataset['rawdata'] expr_df = dataset['rawdata'] - if filter_genes == True: + if filter_genes: expr_df = dataset['rawdata+filter_genes'] - + signatures = dict() for cls1, cls2 in combinations(classes, 2): print(cls1, cls2) - cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls1, :].index.tolist() #control - cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls2,:].index.tolist() #case - + cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name] == cls1, + :].index.tolist() # control + cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name] == cls2, + :].index.tolist() # case + signature_label = " vs. ".join([cls1, cls2]) - + if method == "limma": limma = robjects.r['limma'] - design_dataframe = pd.DataFrame([{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in raw_expr_df.columns]).set_index('index') + design_dataframe = pd.DataFrame( + [{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in + raw_expr_df.columns]).set_index('index') processed_data = {"expression": raw_expr_df, 'design': design_dataframe} - - limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']), pandas2ri.conversion.py2rpy(processed_data['design']), filter_genes=filter_genes)) - + + limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']), + pandas2ri.conversion.py2rpy(processed_data['design']), + filter_genes=filter_genes)) + signature = pd.DataFrame(limma_results[0]) signature.index = limma_results[1] signature = signature.sort_values("t", ascending=False) - + elif method == "characteristic_direction": - signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True) + signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], + dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True) signature = signature.sort_values("CD-coefficient", ascending=False) elif method == "edgeR": edgeR = robjects.r['edgeR'] - - edgeR_results = pandas2ri.conversion.rpy2py(edgeR(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids), pandas2ri.conversion.py2rpy(cls2_sample_ids))) - + + edgeR_results = pandas2ri.conversion.rpy2py( + edgeR(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids), + pandas2ri.conversion.py2rpy(cls2_sample_ids))) + signature = pd.DataFrame(edgeR_results[0]) signature.index = edgeR_results[1] signature = signature.sort_values("logFC", ascending=False) elif method == "DESeq2": # deseq2 receives raw counts DESeq2 = robjects.r['deseq2'] - DESeq2_results = pandas2ri.conversion.rpy2py(DESeq2(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids), pandas2ri.conversion.py2rpy(cls2_sample_ids))) - + DESeq2_results = pandas2ri.conversion.rpy2py( + DESeq2(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), 
pandas2ri.conversion.py2rpy(cls1_sample_ids), + pandas2ri.conversion.py2rpy(cls2_sample_ids))) + signature = pd.DataFrame(DESeq2_results[0]) signature.index = DESeq2_results[1] signature = signature.sort_values("log2FoldChange", ascending=False) - - + signatures[signature_label] = signature - return signatures + return signatures def run_volcano(signature, signature_label, dataset, pvalue_threshold, logfc_threshold, plot_type): color = [] text = [] for index, rowData in signature.iterrows(): - if "AveExpr" in rowData.index: # limma + if "AveExpr" in rowData.index: # limma expr_colname = "AveExpr" pval_colname = "P.Value" logfc_colname = "logFC" - elif "logCPM" in rowData.index: #edgeR + elif "logCPM" in rowData.index: # edgeR expr_colname = "logCPM" pval_colname = "PValue" logfc_colname = "logFC" - elif "baseMean" in rowData.index: #DESeq2 + elif "baseMean" in rowData.index: # DESeq2 expr_colname = "baseMean" pval_colname = "pvalue" logfc_colname = "log2FoldChange" # Text - text.append(''+index+'
Avg Expression = '+str(round(rowData[expr_colname], ndigits=2))+'<br>logFC = '+str(round(rowData[logfc_colname], ndigits=2))+'<br>p = '+'{:.2e}'.format(rowData[pval_colname])+'<br>
FDR = '+'{:.2e}'.format(rowData[pval_colname])) + text.append('' + index + '
Avg Expression = ' + str( + round(rowData[expr_colname], ndigits=2)) + '
logFC = ' + str( + round(rowData[logfc_colname], ndigits=2)) + '
p = ' + '{:.2e}'.format( + rowData[pval_colname]) + '
FDR = ' + '{:.2e}'.format(rowData[pval_colname])) # Color if rowData[pval_colname] < pvalue_threshold: @@ -475,12 +514,13 @@ def run_volcano(signature, signature_label, dataset, pvalue_threshold, logfc_thr else: color.append('black') - volcano_plot_results = {'x': signature[logfc_colname], 'y': -np.log10(signature[pval_colname]), 'text':text, 'color': color, 'signature_label': signature_label, 'plot_type': plot_type} + volcano_plot_results = {'x': signature[logfc_colname], 'y': -np.log10(signature[pval_colname]), 'text': text, + 'color': color, 'signature_label': signature_label, 'plot_type': plot_type} return volcano_plot_results - + def plot_volcano(volcano_plot_results): - spacer = ' '*50 + spacer = ' ' * 50 plot_name = "volcano_plot_{}.png".format(volcano_plot_results['signature_label']) plot_2D_scatter( x=volcano_plot_results['x'], @@ -495,29 +535,32 @@ def plot_volcano(volcano_plot_results): plot_type=volcano_plot_results['plot_type'], de_type='volcano', plot_name=plot_name - ) + ) return plot_name -def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_threshold=1.5, plot_type='interactive'): +def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_threshold=1.5, plot_type='interactive'): # Loop through signature color = [] text = [] for index, rowData in signature.iterrows(): - if "AveExpr" in rowData.index: # limma + if "AveExpr" in rowData.index: # limma expr_colname = "AveExpr" pval_colname = "adj.P.Val" logfc_colname = "logFC" - elif "logCPM" in rowData.index: #edgeR + elif "logCPM" in rowData.index: # edgeR expr_colname = "logCPM" pval_colname = "PValue" logfc_colname = "logFC" - elif "baseMean" in rowData.index: #DESeq2 + elif "baseMean" in rowData.index: # DESeq2 expr_colname = "baseMean" pval_colname = "padj" logfc_colname = "log2FoldChange" # Text - text.append(''+index+'
Avg Expression = '+str(round(rowData[expr_colname], ndigits=2))+'<br>logFC = '+str(round(rowData[logfc_colname], ndigits=2))+'<br>p = '+'{:.2e}'.format(rowData[pval_colname])+'<br>
FDR = '+'{:.2e}'.format(rowData[pval_colname])) + text.append('' + index + '
Avg Expression = ' + str( + round(rowData[expr_colname], ndigits=2)) + '
logFC = ' + str( + round(rowData[logfc_colname], ndigits=2)) + '
p = ' + '{:.2e}'.format( + rowData[pval_colname]) + '
FDR = ' + '{:.2e}'.format(rowData[pval_colname])) # Color if rowData[pval_colname] < pvalue_threshold: @@ -530,12 +573,14 @@ def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_thres else: color.append('black') - + # Return - volcano_plot_results = {'x': signature[expr_colname], 'y': signature[logfc_colname], 'text':text, 'color': color, 'signature_label': signature_label, 'plot_type': plot_type} + volcano_plot_results = {'x': signature[expr_colname], 'y': signature[logfc_colname], 'text': text, 'color': color, + 'signature_label': signature_label, 'plot_type': plot_type} return volcano_plot_results -def plot_maplot(volcano_plot_results): + +def plot_maplot(volcano_plot_results): plot_name = "ma_plot_{}.png".format(volcano_plot_results['signature_label']) plot_2D_scatter( x=volcano_plot_results['x'], @@ -549,17 +594,16 @@ def plot_maplot(volcano_plot_results): labels=volcano_plot_results['signature_label'].split(' vs. '), plot_type=volcano_plot_results['plot_type'], plot_name=plot_name - + ) return plot_name -def run_enrichr(signature, signature_label, geneset_size=500, fc_colname = 'logFC', sort_genes_by='t', ascending=True): - +def run_enrichr(signature, signature_label, geneset_size=500, fc_colname='logFC', sort_genes_by='t', ascending=True): # Sort signature up_signature = signature[signature[fc_colname] > 0].sort_values(sort_genes_by, ascending=ascending) down_signature = signature[signature[fc_colname] < 0].sort_values(sort_genes_by, ascending=ascending) - + # Get genesets genesets = { 'upregulated': up_signature.index[:geneset_size], @@ -567,12 +611,15 @@ def run_enrichr(signature, signature_label, geneset_size=500, fc_colname = 'logF } # Submit to Enrichr - enrichr_ids = {geneset_label: submit_enrichr_geneset(geneset=geneset, label=signature_label+', '+geneset_label+', from Bulk RNA-seq Appyter') for geneset_label, geneset in genesets.items()} + enrichr_ids = {geneset_label: submit_enrichr_geneset(geneset=geneset, + label=signature_label + ', ' + geneset_label + ', from Bulk RNA-seq Appyter') + for geneset_label, geneset in genesets.items()} enrichr_ids['signature_label'] = signature_label return enrichr_ids + def submit_enrichr_geneset(geneset, label=''): - ENRICHR_URL = 'http://maayanlab.cloud/Enrichr/addList' + ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList' genes_str = '\n'.join(geneset) payload = { 'list': (None, genes_str), @@ -586,22 +633,22 @@ def submit_enrichr_geneset(geneset, label=''): return data - def get_enrichr_results(user_list_id, gene_set_libraries, overlappingGenes=True, geneset=None): - ENRICHR_URL = 'http://maayanlab.cloud/Enrichr/enrich' + ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich' query_string = '?userListId=%s&backgroundType=%s' results = [] for gene_set_library, label in gene_set_libraries.items(): response = requests.get( - ENRICHR_URL + - query_string % (user_list_id, gene_set_library) - ) + ENRICHR_URL + + query_string % (user_list_id, gene_set_library) + ) if not response.ok: raise Exception('Error fetching enrichment results') data = json.loads(response.text) resultDataframe = pd.DataFrame(data[gene_set_library], columns=[ - 'rank', 'term_name', 'pvalue', 'zscore', 'combined_score', 'overlapping_genes', 'FDR', 'old_pvalue', 'old_FDR']) + 'rank', 'term_name', 'pvalue', 'zscore', 'combined_score', 'overlapping_genes', 'FDR', 'old_pvalue', + 'old_FDR']) selectedColumns = ['term_name', 'zscore', 'combined_score', 'pvalue', 'FDR'] if not overlappingGenes else [ 'term_name', 'zscore', 'combined_score', 'FDR', 
'pvalue', 'overlapping_genes'] resultDataframe = resultDataframe.loc[:, selectedColumns] @@ -614,16 +661,15 @@ def get_enrichr_results(user_list_id, gene_set_libraries, overlappingGenes=True, return concatenatedDataframe - -def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='interactive', library_type='go', version='2018', sort_results_by='pvalue'): - +def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='interactive', library_type='go', + version='2018', sort_results_by='pvalue'): # Libraries if library_type == 'go': go_version = str(version) libraries = { - 'GO_Biological_Process_'+go_version: 'Gene Ontology Biological Process ('+go_version+' version)', - 'GO_Molecular_Function_'+go_version: 'Gene Ontology Molecular Function ('+go_version+' version)', - 'GO_Cellular_Component_'+go_version: 'Gene Ontology Cellular Component ('+go_version+' version)' + 'GO_Biological_Process_' + go_version: 'Gene Ontology Biological Process (' + go_version + ' version)', + 'GO_Molecular_Function_' + go_version: 'Gene Ontology Molecular Function (' + go_version + ' version)', + 'GO_Cellular_Component_' + go_version: 'Gene Ontology Cellular Component (' + go_version + ' version)' } elif library_type == "pathway": # Libraries @@ -634,7 +680,9 @@ def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type=' } # Get Enrichment Results - enrichment_results = {geneset: get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, geneset=geneset) for geneset in ['upregulated', 'downregulated']} + enrichment_results = { + geneset: get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, + geneset=geneset) for geneset in ['upregulated', 'downregulated']} enrichment_results['signature_label'] = signature_label enrichment_results['plot_type'] = plot_type enrichment_results['sort_results_by'] = sort_results_by @@ -642,8 +690,8 @@ def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type=' # Return return enrichment_results -def get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='tf'): +def get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='tf'): # Libraries if library_type == 'tf': # Libraries @@ -660,39 +708,41 @@ def get_enrichr_result_tables_by_library(enrichr_results, signature_label, libra } elif library_type == "mirna": libraries = { - 'TargetScan_microRNA_2017': 'TargetScan (experimentally validated targets)', - 'miRTarBase_2017': 'miRTarBase (experimentally validated targets)' + 'TargetScan_microRNA_2017': 'TargetScan (experimentally validated targets)', + 'miRTarBase_2017': 'miRTarBase (experimentally validated targets)' } - # Initialize results results = [] # Loop through genesets for geneset in ['upregulated', 'downregulated']: - # Append ChEA results - enrichment_dataframe = get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, geneset=geneset) + enrichment_dataframe = get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, + geneset=geneset) results.append(enrichment_dataframe) # Concatenate results enrichment_dataframe = pd.concat(results) return {'enrichment_dataframe': enrichment_dataframe, 'signature_label': signature_label} - -def plot_library_barchart(enrichr_results, gene_set_library, signature_label, sort_results_by='pvalue', nr_genesets=15, height=400, plot_type='interactive', plot_name="barchart.png"): + +def 
plot_library_barchart(enrichr_results, gene_set_library, signature_label, sort_results_by='pvalue', nr_genesets=15, + height=400, plot_type='interactive', plot_name="barchart.png"): sort_results_by = 'log10P' if sort_results_by == 'pvalue' else 'combined_score' fig = tools.make_subplots(rows=1, cols=2, print_grid=False) for i, geneset in enumerate(['upregulated', 'downregulated']): # Get dataframe enrichment_dataframe = enrichr_results[geneset] - plot_dataframe = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].sort_values(sort_results_by, ascending=False).iloc[:nr_genesets].iloc[::-1] + plot_dataframe = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].sort_values( + sort_results_by, ascending=False).iloc[:nr_genesets].iloc[::-1] # Format n = 7 plot_dataframe['nr_genes'] = [len(genes) for genes in plot_dataframe['overlapping_genes']] - plot_dataframe['overlapping_genes'] = ['
'.join([', '.join(genes[i:i+n]) for i in range(0, len(genes), n)]) for genes in plot_dataframe['overlapping_genes']] + plot_dataframe['overlapping_genes'] = ['<br>
'.join([', '.join(genes[i:i + n]) for i in range(0, len(genes), n)]) + for genes in plot_dataframe['overlapping_genes']] # Get Bar bar = go.Bar( @@ -701,15 +751,17 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so orientation='h', name=geneset.title(), showlegend=False, - hovertext=['{term_name}
P-value: {pvalue:.2}<br>FDR: {FDR:.2}<br>Z-score: {zscore:.3}<br>Combined score: {combined_score:.3}<br>{nr_genes} Genes: {overlapping_genes}<br>'.format(**rowData) for index, rowData in plot_dataframe.iterrows()], + hovertext=[ + '{term_name}<br>P-value: {pvalue:.2}<br>FDR: {FDR:.2}<br>Z-score: {zscore:.3}<br>Combined score: {combined_score:.3}<br>{nr_genes} Genes: {overlapping_genes}<br>
'.format( + **rowData) for index, rowData in plot_dataframe.iterrows()], hoverinfo='text', marker={'color': '#FA8072' if geneset == 'upregulated' else '#87CEFA'} ) - fig.append_trace(bar, 1, i+1) + fig.append_trace(bar, 1, i + 1) # Get text text = go.Scatter( - x=[max(bar['x'])/50 for x in range(len(bar['y']))], + x=[max(bar['x']) / 50 for x in range(len(bar['y']))], y=bar['y'], mode='text', hoverinfo='none', @@ -719,15 +771,17 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so textposition="middle right", textfont={'color': 'black'} ) - fig.append_trace(text, 1, i+1) + fig.append_trace(text, 1, i + 1) # Get annotations labels = signature_label.split(' vs. ') annotations = [ - {'x': 0.25, 'y': 1.06, 'text': 'Up-regulated in ' + - labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}, - {'x': 0.75, 'y': 1.06, 'text': 'Down-regulated in ' + - labels[-1]+'', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'} + {'x': 0.25, 'y': 1.06, + 'text': 'Up-regulated in ' + + labels[-1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}, + {'x': 0.75, 'y': 1.06, + 'text': 'Down-regulated in ' + + labels[-1] + '', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'} ] if signature_label else [] # Get title @@ -735,12 +789,14 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so fig['layout'].update(height=height, title='{}'.format(title), hovermode='closest', annotations=annotations) - fig['layout']['xaxis1'].update(domain=[0, 0.49], title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score') - fig['layout']['xaxis2'].update(domain=[0.51, 1], title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score') + fig['layout']['xaxis1'].update(domain=[0, 0.49], + title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score') + fig['layout']['xaxis2'].update(domain=[0.51, 1], + title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score') fig['layout']['yaxis1'].update(showticklabels=False) fig['layout']['yaxis2'].update(showticklabels=False) fig['layout']['margin'].update(l=0, t=65, r=0, b=35) - if plot_type=='interactive': + if plot_type == 'interactive': fig.show() else: fig.show(renderer='png') @@ -750,41 +806,56 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so def results_table(enrichment_dataframe, source_label, target_label, notebook_metadata, table_counter): - # Get libraries for gene_set_library in enrichment_dataframe['gene_set_library'].unique(): # Get subset - enrichment_dataframe_subset = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].copy() + enrichment_dataframe_subset = enrichment_dataframe[ + enrichment_dataframe['gene_set_library'] == gene_set_library].copy() # Get unique values from source column enrichment_dataframe_subset[source_label] = [x.split('_')[0] for x in enrichment_dataframe_subset['term_name']] - enrichment_dataframe_subset = enrichment_dataframe_subset.sort_values(['FDR', 'pvalue']).rename(columns={'pvalue': 'P-value'}).drop_duplicates(source_label) + enrichment_dataframe_subset = enrichment_dataframe_subset.sort_values(['FDR', 'pvalue']).rename( + columns={'pvalue': 'P-value'}).drop_duplicates(source_label) # Add links and bold for significant results - enrichment_dataframe_subset[source_label] = ['{}'.format(x.split(" ")[0], x) if '-miR-' in x else '{}'.format(x.split(" ")[0], x)for x in 
enrichment_dataframe_subset[source_label]] - + enrichment_dataframe_subset[source_label] = [ + '{}'.format(x.split(" ")[0], + x) if '-miR-' in x else '{}'.format( + x.split(" ")[0], x) for x in enrichment_dataframe_subset[source_label]] + # else: - # enrichment_dataframe_subset[source_label] = ['{x}'.format(**locals()) if '-miR-' in x else '{x}'.format(**locals())for x in enrichment_dataframe_subset[source_label]] - enrichment_dataframe_subset[source_label] = [rowData[source_label].replace('target="_blank">', 'target="_blank">').replace('', '*') if rowData['FDR'] < 0.05 else rowData[source_label] for index, rowData in enrichment_dataframe_subset.iterrows()] + # enrichment_dataframe_subset[source_label] = ['{x}'.format(**locals()) if '-miR-' in x else '{x}'.format(**locals())for x in enrichment_dataframe_subset[source_label]] + enrichment_dataframe_subset[source_label] = [ + rowData[source_label].replace('target="_blank">', 'target="_blank">').replace('', '*') if + rowData['FDR'] < 0.05 else rowData[source_label] for index, rowData in + enrichment_dataframe_subset.iterrows()] # Add rank - enrichment_dataframe_subset['Rank'] = [''+str(x+1)+'' for x in range(len(enrichment_dataframe_subset.index))] + enrichment_dataframe_subset['Rank'] = ['' + str(x + 1) + '' for x in + range(len(enrichment_dataframe_subset.index))] # Add overlapping genes with tooltip - enrichment_dataframe_subset['nr_overlapping_genes'] = [len(x) for x in enrichment_dataframe_subset['overlapping_genes']] - enrichment_dataframe_subset['overlapping_genes'] = [', '.join(x) for x in enrichment_dataframe_subset['overlapping_genes']] - enrichment_dataframe_subset[target_label.title()] = ['{nr_overlapping_genes} {geneset} '.format(**rowData)+target_label+'s' for index, rowData in enrichment_dataframe_subset.iterrows()] + enrichment_dataframe_subset['nr_overlapping_genes'] = [len(x) for x in + enrichment_dataframe_subset['overlapping_genes']] + enrichment_dataframe_subset['overlapping_genes'] = [', '.join(x) for x in + enrichment_dataframe_subset['overlapping_genes']] + enrichment_dataframe_subset[target_label.title()] = [ + '{nr_overlapping_genes} {geneset} '.format(**rowData) + target_label + 's' for index, rowData in + enrichment_dataframe_subset.iterrows()] # enrichment_dataframe[target_label.title()] = ['{nr_overlapping_genes} {geneset} '.format(**rowData)+target_label+'s
{overlapping_genes}
'.format(**rowData) for index, rowData in enrichment_dataframe.iterrows()] # Convert to HTML pd.set_option('max.colwidth', -1) - html_table = enrichment_dataframe_subset.head(50)[['Rank', source_label, 'P-value', 'FDR', target_label.title()]].to_html(escape=False, index=False, classes='w-100') + html_table = enrichment_dataframe_subset.head(50)[ + ['Rank', source_label, 'P-value', 'FDR', target_label.title()]].to_html(escape=False, index=False, + classes='w-100') html_results = '
{}
'.format(html_table) # Add CSS display(HTML('')) - display(HTML('')) + display(HTML( + '')) # Display table display(HTML(html_results)) @@ -792,20 +863,21 @@ def results_table(enrichment_dataframe, source_label, target_label, notebook_met if source_label == "Transcription Factor": additional_description = ". The table contains scrollable tables displaying the results of the Transcription Factor (TF) enrichment analysis generated using Enrichr. Every row represents a TF; significant TFs are highlighted in bold." elif source_label == "Kinase": - additional_description = ". The table contains browsable tables displaying the results of the Protein Kinase (PK) enrichment analysis generated using Enrichr. Every row represents a PK; significant PKs are highlighted in bold." + additional_description = ". The table contains browsable tables displaying the results of the Protein Kinase (PK) enrichment analysis generated using Enrichr. Every row represents a PK; significant PKs are highlighted in bold." elif source_label == "miRNA": additional_description = ". The figure contains browsable tables displaying the results of the miRNA enrichment analysis generated using Enrichr. Every row represents a miRNA; significant miRNAs are highlighted in bold." - filename="Enrichment_analysis_{}_{}.csv".format(source_label, gene_set_library) - table_counter, notebook_metadata = display_object(table_counter, gene_set_library+additional_description, notebook_metadata, saved_filename=filename, istable=True) + filename = "Enrichment_analysis_{}_{}.csv".format(source_label, gene_set_library) + table_counter, notebook_metadata = display_object(table_counter, gene_set_library + additional_description, + notebook_metadata, saved_filename=filename, istable=True) display(create_download_link(enrichment_dataframe_subset, filename=filename)) - + return table_counter, notebook_metadata + def display_table(analysis_results, source_label, notebook_metadata, table_counter): - # Plot Table - return results_table(analysis_results['enrichment_dataframe'].copy(), source_label=source_label, notebook_metadata=notebook_metadata, target_label='target', table_counter=table_counter) - + return results_table(analysis_results['enrichment_dataframe'].copy(), source_label=source_label, + notebook_metadata=notebook_metadata, target_label='target', table_counter=table_counter) def run_l1000cds2(signature, nr_genes=500, signature_label='', plot_type='interactive'): @@ -816,38 +888,46 @@ def run_l1000cds2(signature, nr_genes=500, signature_label='', plot_type='intera upperGenes = lambda genes: [gene.upper() for gene in genes] # Get Data - data = {"upGenes":upperGenes(signature.index[:nr_genes]),"dnGenes":upperGenes(signature.index[-nr_genes:])} + data = {"upGenes": upperGenes(signature.index[:nr_genes]), "dnGenes": upperGenes(signature.index[-nr_genes:])} # Loop through aggravate: for aggravate in [True, False]: # Send to API - config = {"aggravate":aggravate,"searchMethod":"geneSet","share":True,"combination":False,"db-version":"latest"} - r = requests.post('http://maayanlab.cloud/L1000CDS2/query',data=json.dumps({"data":data,"config":config}),headers={'content-type':'application/json'}) + config = {"aggravate": aggravate, "searchMethod": "geneSet", "share": True, "combination": False, + "db-version": "latest"} + r = requests.post('https://maayanlab.cloud/L1000CDS2/query', data=json.dumps({"data": data, "config": config}), + headers={'content-type': 'application/json'}) label = 'mimic' if aggravate else 'reverse' # Add results resGeneSet = 
r.json() if resGeneSet.get('topMeta'): - l1000cds2_dataframe = pd.DataFrame(resGeneSet['topMeta'])[['cell_id', 'pert_desc', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_time', 'pert_time_unit', 'pubchem_id', 'score', 'sig_id']].replace('-666', np.nan) - l1000cds2_results[label] = {'url': 'http://maayanlab.cloud/L1000CDS2/#/result/{}'.format(resGeneSet['shareId']), 'table': l1000cds2_dataframe} + l1000cds2_dataframe = pd.DataFrame(resGeneSet['topMeta'])[ + ['cell_id', 'pert_desc', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_time', 'pert_time_unit', + 'pubchem_id', 'score', 'sig_id']].replace('-666', np.nan) + l1000cds2_results[label] = { + 'url': 'https://maayanlab.cloud/L1000CDS2/#/result/{}'.format(resGeneSet['shareId']), + 'table': l1000cds2_dataframe} else: l1000cds2_results[label] = None l1000cds2_results['plot_type'] = plot_type # Return return l1000cds2_results - -def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, height=300, plot_name="L1000CDS2.png"): + +def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, height=300, plot_name="L1000CDS2.png"): # Check if there are results if not l1000cds2_results['mimic'] or not l1000cds2_results['reverse']: - print('### No results were found.\n This is likely due to the fact that the gene identifiers were not recognized by L1000CDS2. Please note that L1000CDS2 currently only supports HGNC gene symbols (https://www.genenames.org/). If your dataset uses other gene identifier systems, such as Ensembl IDs or Entrez IDs, consider converting them to HGNC. Automated gene identifier conversion is currently under development.') - + print( + '### No results were found.\n This is likely due to the fact that the gene identifiers were not recognized by L1000CDS2. Please note that L1000CDS2 currently only supports HGNC gene symbols (https://www.genenames.org/). If your dataset uses other gene identifier systems, such as Ensembl IDs or Entrez IDs, consider converting them to HGNC. Automated gene identifier conversion is currently under development.') + else: # Bar charts - fig = tools.make_subplots(rows=1, cols=2, print_grid=False); + fig = tools.make_subplots(rows=1, cols=2, print_grid=False) for i, direction in enumerate(['mimic', 'reverse']): - drug_counts = l1000cds2_results[direction]['table'].groupby('pert_desc').size().sort_values(ascending=False).iloc[:nr_drugs].iloc[::-1] + drug_counts = l1000cds2_results[direction]['table'].groupby('pert_desc').size().sort_values( + ascending=False).iloc[:nr_drugs].iloc[::-1] # Get Bar bar = go.Bar( @@ -857,13 +937,13 @@ def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, he name=direction.title(), hovertext=drug_counts.index, hoverinfo='text', - marker={'color': '#FF7F50' if direction=='mimic' else '#9370DB'} + marker={'color': '#FF7F50' if direction == 'mimic' else '#9370DB'} ) - fig.append_trace(bar, 1, i+1) - + fig.append_trace(bar, 1, i + 1) + # Get text text = go.Scatter( - x=[max(bar['x'])/50 for x in range(len(bar['y']))], + x=[max(bar['x']) / 50 for x in range(len(bar['y']))], y=bar['y'], mode='text', hoverinfo='none', @@ -872,19 +952,21 @@ def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, he textposition="middle right", textfont={'color': 'black'} ) - fig.append_trace(text, 1, i+1) + fig.append_trace(text, 1, i + 1) - fig['layout'].update(height=height, title='L1000CDS2 {}| Small Molecule Query
Top small molecules'.format(l1000cds2_results['signature_label']), hovermode='closest') fig['layout']['xaxis1'].update(domain=[0,0.5]) + fig['layout'].update(height=height, + title='L1000CDS2 {}| Small Molecule Query<br>Top small molecules'.format( l1000cds2_results['signature_label']), hovermode='closest') fig['layout']['xaxis1'].update(domain=[0, 0.5]) fig['layout']['xaxis1'].update(title='<br>Count') fig['layout']['xaxis2'].update(title='
Count') - fig['layout']['xaxis2'].update(domain=[0.5,1]) + fig['layout']['xaxis2'].update(domain=[0.5, 1]) fig['layout']['yaxis1'].update(showticklabels=False) fig['layout']['yaxis2'].update(showticklabels=False) fig['layout']['margin'].update(l=10, t=95, r=0, b=45, pad=5) if l1000cds2_results['plot_type'] == 'interactive': - fig.show() + fig.show() else: fig.show(renderer="png") @@ -899,7 +981,7 @@ def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, he display_link(l1000cds2_results['reverse']['url']) return counter, notebook_metadata - + def run_l1000fwd(signature, nr_genes=500, signature_label=''): # Define results l1000fwd_results = {'signature_label': signature_label} @@ -908,7 +990,8 @@ def run_l1000fwd(signature, nr_genes=500, signature_label=''): upperGenes = lambda genes: [gene.upper() for gene in genes] # Get Data - payload = {"up_genes":upperGenes(signature.index[:nr_genes]),"down_genes":upperGenes(signature.index[-nr_genes:])} + payload = {"up_genes": upperGenes(signature.index[:nr_genes]), + "down_genes": upperGenes(signature.index[-nr_genes:])} # Get URL L1000FWD_URL = 'https://maayanlab.cloud/l1000fwd/' @@ -920,43 +1003,53 @@ def run_l1000fwd(signature, nr_genes=500, signature_label=''): else: # Get ID and URL result_id = response.json()['result_id'] - l1000fwd_results['result_url'] = 'https://maayanlab.cloud/l1000fwd/vanilla/result/'+result_id + l1000fwd_results['result_url'] = 'https://maayanlab.cloud/l1000fwd/vanilla/result/' + result_id l1000fwd_results['result_id'] = result_id # Get Top l1000fwd_results['signatures'] = requests.get(L1000FWD_URL + 'result/topn/' + result_id).json() - # Return return l1000fwd_results - + + def plot_l1000fwd(l1000fwd_results, figure_counter, table_counter, notebook_metadata, nr_drugs=7, height=300): - # Check if results if l1000fwd_results['result_url']: # Display IFrame display_link(l1000fwd_results['result_url']) display(IFrame(l1000fwd_results['result_url'], width="1000", height="1000")) - figure_counter, notebook_metadata = display_object(figure_counter, "L1000FWD plot for visualizing top mimickers (red) and top reversers (blue) of {}.".format(l1000fwd_results["signature_label"]), notebook_metadata, saved_filename=l1000fwd_results['result_url'], istable=False) + figure_counter, notebook_metadata = display_object(figure_counter, + "L1000FWD plot for visualizing top mimickers (red) and top reversers (blue) of {}.".format( + l1000fwd_results["signature_label"]), notebook_metadata, + saved_filename=l1000fwd_results['result_url'], istable=False) # Display tables for direction, signature_list in l1000fwd_results['signatures'].items(): - # Fix dataframe - rename_dict = {'sig_id': 'Signature ID', 'pvals': 'P-value', 'qvals': 'FDR', 'zscores': 'Z-score', 'combined_scores': 'Combined Score'} - signature_dataframe = pd.DataFrame(signature_list)[list(rename_dict.keys())].rename(columns=rename_dict).sort_values('P-value').rename_axis('Rank') + rename_dict = {'sig_id': 'Signature ID', 'pvals': 'P-value', 'qvals': 'FDR', 'zscores': 'Z-score', + 'combined_scores': 'Combined Score'} + signature_dataframe = pd.DataFrame(signature_list)[list(rename_dict.keys())].rename( + columns=rename_dict).sort_values('P-value').rename_axis('Rank') signature_dataframe.index = [x + 1 for x in range(len(signature_dataframe.index))] signature_txt = signature_dataframe.to_csv(sep='\t') # Display table pd.set_option('max.colwidth', -1) - signature_dataframe['Signature ID'] = ['{x}'.format(**locals()) for x in signature_dataframe['Signature ID']] + 
signature_dataframe['Signature ID'] = [ + '{x}'.format(**locals()) for x in + signature_dataframe['Signature ID']] table_html = signature_dataframe.to_html(escape=False, classes='w-100') - - display(HTML('
{}
'.format(table_html))) + + display(HTML( + '
{}
'.format( + table_html))) filename = "{} Signatures for {}.csv".format(direction.title(), l1000fwd_results["signature_label"]) - table_counter, notebook_metadata = display_object(table_counter, "L1000FWD Results: {} Signatures of {}".format(direction.title(), l1000fwd_results["signature_label"]), notebook_metadata, saved_filename=filename, istable=True) + table_counter, notebook_metadata = display_object(table_counter, + "L1000FWD Results: {} Signatures of {}".format( + direction.title(), + l1000fwd_results["signature_label"]), + notebook_metadata, saved_filename=filename, istable=True) display(create_download_link(signature_dataframe, filename=filename)) - - return figure_counter, table_counter, notebook_metadata + return figure_counter, table_counter, notebook_metadata From a0ed49972cefc465f17ba919e787e54282b9163b Mon Sep 17 00:00:00 2001 From: Maxim Kuleshov Date: Thu, 31 Mar 2022 19:55:27 -0400 Subject: [PATCH 4/4] Enrichr links; References; Clustergrammer --- .../RNA_seq_Analysis_Pipeline.ipynb | 1 + appyters/Bulk_RNA_seq/appyter.json | 4 ++-- appyters/Bulk_RNA_seq/utils.py | 21 ++++++++++++------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb index 288798bdd..e8d4d6c6a 100644 --- a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb +++ b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb @@ -815,6 +815,7 @@ " enrichr_link_dict[title_down] = dict()\n", " enrichr_link_dict[title_down][\"link\"] = \"link to Enrichr\".format(results['enrichr'][label][\"downregulated\"][\"shortId\"])\n", "\n", + "notebook_metadata['enrichr_links'] = enrichr_link_dict\n", "enrichr_link_df = pd.DataFrame.from_dict(enrichr_link_dict).T\n", "table_counter, notebook_metadata = display_object(table_counter, \"The table displays links to Enrichr containing the results of enrichment analyses generated by analyzing the up-regulated and down-regulated genes from a differential expression analysis. 
By clicking on these links, users can interactively explore and download the enrichment results from the Enrichr website.\", notebook_metadata=notebook_metadata, saved_filename=\"enrichr_links.csv\", df=enrichr_link_df, ishtml=True)" ] diff --git a/appyters/Bulk_RNA_seq/appyter.json b/appyters/Bulk_RNA_seq/appyter.json index dd36cabc2..9ebe8dfc5 100644 --- a/appyters/Bulk_RNA_seq/appyter.json +++ b/appyters/Bulk_RNA_seq/appyter.json @@ -2,7 +2,7 @@ "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json", "name": "Bulk_RNA_seq", "title": "Bulk RNA-seq analysis pipeline", - "version": "0.1.8", + "version": "0.1.9", "description": "Bulk RNA-seq analysis pipeline enables you to analyze and visualize datasets with state-of-the-art algorithms and visualization methods.", "image": "screenshot.png", "authors": [ @@ -16,7 +16,7 @@ }, { "name": "Maxim Kuleshov", - "email": "kuleshov.max.v@gmail.com" + "email": "maxim.kuleshov@mssm.edu" } ], "url": "https://github.com/MaayanLab/appyter-catalog", diff --git a/appyters/Bulk_RNA_seq/utils.py b/appyters/Bulk_RNA_seq/utils.py index b6c7ee83e..12dcad14f 100644 --- a/appyters/Bulk_RNA_seq/utils.py +++ b/appyters/Bulk_RNA_seq/utils.py @@ -924,7 +924,7 @@ def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, he else: # Bar charts - fig = tools.make_subplots(rows=1, cols=2, print_grid=False) + fig = tools.make_subplots(rows=1, cols=2, print_grid=False); for i, direction in enumerate(['mimic', 'reverse']): drug_counts = l1000cds2_results[direction]['table'].groupby('pert_desc').size().sort_values( ascending=False).iloc[:nr_drugs].iloc[::-1] @@ -969,17 +969,22 @@ def plot_l1000cds2(l1000cds2_results, counter, notebook_metadata, nr_drugs=7, he fig.show() else: fig.show(renderer="png") - - counter, notebook_metadata = display_object(counter, "Top {} Mimic/Reverse Small Molecule from L1000CDS2 for {}.".format(nr_drugs, l1000cds2_results['signature_label']), notebook_metadata, saved_filename=plot_name, istable=False) - notebook_metadata["figures"][str(counter-1)]["file_desc"] = [{"name": "Mimic Signature Query Results", "link":l1000cds2_results['mimic']['url']}, - {"name": "Reverse Signature Query Results", "link":l1000cds2_results['reverse']['url']},] - - # Links + + + counter, notebook_metadata = display_object(counter, + "Top {} Mimic/Reverse Small Molecule from L1000CDS2 for {}.".format( + nr_drugs, l1000cds2_results['signature_label']), + notebook_metadata, saved_filename=plot_name, istable=False) + notebook_metadata["figures"][str(counter - 1)]["file_desc"] = [ + {"name": "Mimic Signature Query Results", "link": l1000cds2_results['mimic']['url']}, + {"name": "Reverse Signature Query Results", "link": l1000cds2_results['reverse']['url']}, ] + + # Links display(Markdown(' *Mimic Signature Query Results*:')) display_link(l1000cds2_results['mimic']['url']) display(Markdown(' *Reverse Signature Query Results*:')) display_link(l1000cds2_results['reverse']['url']) - + return counter, notebook_metadata def run_l1000fwd(signature, nr_genes=500, signature_label=''):
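
For readers following the endpoint changes in these patches, the Enrichr round trip used throughout utils.py is a two-step call: POST a newline-separated gene list to Enrichr/addList to obtain a userListId (plus a shortId, which the notebook uses to build its "link to Enrichr" entries), then GET Enrichr/enrich with that userListId and a library name to retrieve the ranked terms. The sketch below is a minimal, self-contained illustration of that pattern under the same payload and column conventions shown in the diff; the gene symbols, the demo label, and the GO_Biological_Process_2018 library are illustrative placeholders, not values taken from the patch.

import json

import pandas as pd
import requests

ENRICHR_ADD_URL = 'https://maayanlab.cloud/Enrichr/addList'
ENRICHR_ENRICH_URL = 'https://maayanlab.cloud/Enrichr/enrich'


def submit_geneset(genes, label=''):
    # Enrichr expects the genes as a single newline-separated string in a
    # multipart form field named 'list'; 'description' is a free-text label.
    payload = {
        'list': (None, '\n'.join(genes)),
        'description': (None, label),
    }
    response = requests.post(ENRICHR_ADD_URL, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    # The reply carries 'userListId' (consumed by the enrich endpoint below)
    # and 'shortId' (used for the shareable Enrichr links in the notebook).
    return json.loads(response.text)


def fetch_enrichment(user_list_id, library):
    # Query one gene-set library for the submitted list and return ranked terms.
    response = requests.get(
        '{}?userListId={}&backgroundType={}'.format(ENRICHR_ENRICH_URL, user_list_id, library))
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    data = json.loads(response.text)
    return pd.DataFrame(
        data[library],
        columns=['rank', 'term_name', 'pvalue', 'zscore', 'combined_score',
                 'overlapping_genes', 'FDR', 'old_pvalue', 'old_FDR'])


if __name__ == '__main__':
    # Placeholder gene symbols; any list of HGNC symbols works here.
    ids = submit_geneset(['STAT3', 'TP53', 'MYC', 'EGFR', 'JUN'],
                         label='demo, upregulated, from Bulk RNA-seq Appyter')
    results = fetch_enrichment(ids['userListId'], 'GO_Biological_Process_2018')
    print(results.head())

Because a single userListId is reused for every library queried, run_enrichr submits each gene set once and the downstream helpers only pass the returned IDs around.

The L1000CDS2 query in run_l1000cds2 follows a similar pattern: the up- and down-regulated gene lists are posted as JSON together with a config block ('aggravate' toggles mimic versus reverse mode), and the reply exposes 'topMeta' (the ranked perturbation signatures) plus a 'shareId' for the interactive result page. A minimal sketch of that call, again with placeholder gene symbols:

import json

import numpy as np
import pandas as pd
import requests

L1000CDS2_URL = 'https://maayanlab.cloud/L1000CDS2/query'


def query_l1000cds2(up_genes, down_genes, aggravate=True):
    # Gene symbols are upper-cased, mirroring the upperGenes helper in utils.py.
    data = {'upGenes': [g.upper() for g in up_genes],
            'dnGenes': [g.upper() for g in down_genes]}
    config = {'aggravate': aggravate, 'searchMethod': 'geneSet', 'share': True,
              'combination': False, 'db-version': 'latest'}
    response = requests.post(L1000CDS2_URL,
                             data=json.dumps({'data': data, 'config': config}),
                             headers={'content-type': 'application/json'})
    result = response.json()
    if not result.get('topMeta'):
        # Usually means the identifiers were not recognized as HGNC symbols.
        return None
    return {
        'table': pd.DataFrame(result['topMeta']).replace('-666', np.nan),
        'url': 'https://maayanlab.cloud/L1000CDS2/#/result/{}'.format(result['shareId']),
    }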