diff --git a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb
index 1915b169..e8d4d6c6 100644
--- a/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb
+++ b/appyters/Bulk_RNA_seq/RNA_seq_Analysis_Pipeline.ipynb
@@ -41,6 +41,7 @@
"import base64 \n",
"import json\n",
"from pandas.api.types import CategoricalDtype\n",
+ "from collections import defaultdict\n",
"\n",
"# Visualization\n",
"import plotly\n",
@@ -327,7 +328,7 @@
"\n",
"{% set enrichr_libraries = MultiChoiceField(\n",
" name='enrichr_libraries',\n",
- " label='Enrichr Libraries (upto 2)',\n",
+ " label='Enrichr Libraries',\n",
" descriptions='Enrichr libraries to be visualized. Select one or two libraries',\n",
" choices=['Gene Ontology',\n",
" 'Pathway',\n",
@@ -423,10 +424,7 @@
"warnings.filterwarnings('ignore')\n",
"random.seed(0)\n",
"pandas2ri.activate()\n",
- "notebook_metadata = dict()\n",
- "notebook_metadata[\"tables\"] = dict()\n",
- "notebook_metadata[\"figures\"] = dict()\n",
- "notebook_metadata[\"input_parameters\"] = dict()\n",
+ "notebook_metadata = defaultdict(dict)\n",
"if interactive_plot == True:\n",
" plot_type='interactive'\n",
"else:\n",
@@ -444,42 +442,21 @@
"source": [
"%%appyter code_exec\n",
"\n",
+ "ip = dict()\n",
"\n",
- "notebook_metadata[\"input_parameters\"][\"rnaseq_data_filename\"] = rnaseq_data_filename\n",
- "notebook_metadata[\"input_parameters\"][\"meta_data_filename\"] = meta_data_filename\n",
- "notebook_metadata[\"input_parameters\"][\"meta_class_column_name\"] = meta_class_column_name\n",
- "notebook_metadata[\"input_parameters\"][\"control_name\"] = control_name\n",
- "\n",
- "notebook_metadata[\"input_parameters\"][\"filter_genes\"] = filter_genes\n",
- "notebook_metadata[\"input_parameters\"][\"low_expression_threshold\"] = low_expression_threshold\n",
- "notebook_metadata[\"input_parameters\"][\"logCPM_normalization\"] = {{logCPM_normalization.value}}\n",
- "notebook_metadata[\"input_parameters\"][\"log_normalization\"] = {{log_normalization.value}}\n",
- "notebook_metadata[\"input_parameters\"][\"z_normalization\"] = {{z_normalization.value}}\n",
- "notebook_metadata[\"input_parameters\"][\"q_normalization\"] = {{q_normalization.value}}\n",
- "\n",
- "notebook_metadata[\"input_parameters\"][\"visualization_method\"] = \"{{visualization_method.value}}\"\n",
- "notebook_metadata[\"input_parameters\"][\"nr_genes\"] = nr_genes\n",
- "notebook_metadata[\"input_parameters\"][\"gene_list_for_clustergrammer\"] = gene_list_for_clustergrammer\n",
- "notebook_metadata[\"input_parameters\"][\"clustering_topk\"] = clustering_topk\n",
- "\n",
- "notebook_metadata[\"input_parameters\"][\"diff_gex_method\"] = diff_gex_method\n",
- "notebook_metadata[\"input_parameters\"][\"diff_gex_plot_method\"] = diff_gex_plot_method\n",
- "notebook_metadata[\"input_parameters\"][\"pvalue_threshold\"] = pvalue_threshold\n",
- "notebook_metadata[\"input_parameters\"][\"logfc_threshold\"] = logfc_threshold\n",
- "notebook_metadata[\"input_parameters\"][\"gene_topk\"] = gene_topk\n",
- "notebook_metadata[\"input_parameters\"][\"enrichr_libraries\"] = enrichr_libraries\n",
- "notebook_metadata[\"input_parameters\"][\"nr_genesets\"] = nr_genesets\n",
- "notebook_metadata[\"input_parameters\"][\"small_molecule_method\"] = small_molecule_method\n",
- "notebook_metadata[\"input_parameters\"][\"l1000_topk\"] = l1000_topk\n",
- "notebook_metadata[\"input_parameters\"][\"nr_drugs\"] = nr_drugs\n",
- "\n"
+ "ip |= dict(zip([\"rnaseq_data_filename\", \"meta_data_filename\", \"meta_class_column_name\", \"control_name\"], [rnaseq_data_filename, meta_data_filename, meta_class_column_name, control_name]))\n",
+ "ip |= dict(zip([\"filter_genes\", \"low_expression_threshold\", \"logCPM_normalization\", \"log_normalization\", \"z_normalization\", \"q_normalization\"], [filter_genes, low_expression_threshold, {{logCPM_normalization.value}}, {{log_normalization.value}}, {{z_normalization.value}}, {{q_normalization.value}}]))\n",
+ "ip |= dict(zip([\"visualization_method\", \"nr_genes\", \"gene_list_for_clustergrammer\", \"clustering_topk\"], [\"{{visualization_method.value}}\", nr_genes, gene_list_for_clustergrammer, clustering_topk]))\n",
+ "ip |= dict(zip([\"diff_gex_method\", \"diff_gex_plot_method\", \"pvalue_threshold\", \"logfc_threshold\", \"gene_topk\", \"enrichr_libraries\", \"nr_genesets\", \"small_molecule_method\", \"l1000_topk\", \"nr_drugs\"], [diff_gex_method, diff_gex_plot_method, pvalue_threshold, logfc_threshold, gene_topk, enrichr_libraries, nr_genesets, small_molecule_method, l1000_topk, nr_drugs]))\n",
+ "\n",
+ "notebook_metadata[\"input_parameters\"] = ip\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Load datasets"
+ "# Loaded datasets"
]
},
{
@@ -580,7 +557,7 @@
"outputs": [],
"source": [
"dataset['dataset_metadata'] = meta_df\n",
- "\n",
+ "ref_counter = ['Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.']\n",
"table_counter, notebook_metadata = display_object(table_counter, \"Raw RNA-seq expression data. The table displays the first 5 rows of the quantified RNA-seq expression dataset. Rows represent genes, columns represent samples, and values show the number of mapped reads.\", notebook_metadata, \"raw_exp.csv\", dataset[current_dataset].head(), istable=True)\n",
"table_counter, notebook_metadata = display_object(table_counter, \"Metadata. The table displays the metadata associated with the samples in the RNA-seq dataset. Rows represent RNA-seq samples, columns represent metadata categories.\", notebook_metadata, \"metadata.csv\", dataset['dataset_metadata'].head(), istable=True)\n",
"table_counter, notebook_metadata = display_object(table_counter, \"Sample size for each class. The table displays the number of samples in each class.\", notebook_metadata, \"num_of_samples_in_class.csv\", dataset['dataset_metadata'].reset_index().groupby(meta_class_column_name).count(), istable=True)"
@@ -630,7 +607,7 @@
"source": [
"%%appyter markdown\n",
"{% if visualization_method.value == \"PCA\" %}\n",
- "Principal Component Analysis (PCA) (Clark et al. 2011) is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into Principal Components (PCs), a set of linearly uncorrelated features which represent the most relevant sources of variance in the data, and subsequently visualized using a scatter plot.\n",
+ "Principal Component Analysis (PCA) [1] is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in RNA-seq datasets. To achieve this, gene expression values are transformed into Principal Components (PCs), a set of linearly uncorrelated features which represent the most relevant sources of variance in the data, and subsequently visualized using a scatter plot.\n",
"{% endif %}"
]
},
@@ -671,7 +648,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Clustergrammer (Fernandez et al. 2017) is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also allows to identify the genes which contribute to the clustering."
+ "Clustergrammer [2] is a web-based tool for visualizing and analyzing high-dimensional data as interactive and hierarchically clustered heatmaps. It is commonly used to explore the similarity between samples in an RNA-seq dataset. In addition to identifying clusters of samples, it also allows to identify the genes which contribute to the clustering."
]
},
{
@@ -682,7 +659,7 @@
"source": [
"# Run analysis\n",
"results['clustergrammer'] = run_clustergrammer(dataset=dataset, meta_class_column_name=meta_class_column_name, nr_genes=clustering_topk, normalization=normalization, z_score=True, gene_list=gene_list_for_clustergrammer)\n",
- "\n",
+ "ref_counter.append('Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.')\n",
"# Display results\n",
"plot_clustergrammar(results['clustergrammer'])\n",
"caption = \"Clustered heatmap plot. The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.\"\n",
@@ -735,7 +712,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods (Ritchie et al. 2015, Clark et al. 2014), which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation. "
+ "Gene expression signatures are alterations in the patterns of gene expression that occur as a result of cellular perturbations such as drug treatments, gene knock-downs or diseases. They can be quantified using differential gene expression (DGE) methods [3, 4], which compare gene expression between two groups of samples to identify genes whose expression is significantly altered in the perturbation. "
]
},
{
@@ -745,7 +722,7 @@
"outputs": [],
"source": [
"signatures = get_signatures(classes, dataset, normalization, diff_gex_method, meta_class_column_name, filter_genes)\n",
- "\n",
+ "ref_counter.extend(['Ritchie, Matthew E., et al. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic acids research 43.7 (2015): e47-e47.', 'Clark, Neil R., et al. The characteristic direction: a geometrical approach to identify differentially expressed genes. BMC bioinformatics 15.1 (2014): 79.'])\n",
"for label, signature in signatures.items():\n",
" case_label = label.split(\" vs. \")[1]\n",
" table_counter, notebook_metadata = display_object(table_counter, \"Differentially expressed genes between {} using {}. The figure displays a browsable table containing the gene expression signature generated from a differential gene expression analysis. Every row of the table represents a gene; the columns display the estimated measures of differential expression.\".format(label, diff_gex_method), notebook_metadata, \"DEG_results_{}.csv\".format(label), signature, istable=True)\n",
@@ -796,7 +773,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Enrichment analysis is a statistical procedure used to identify biological terms which are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr (Kuleshov et al. 2016) is a web-based application which allows to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results."
+ "Enrichment analysis is a statistical procedure used to identify biological terms which are over-represented in a given gene set. These include signaling pathways, molecular functions, diseases, and a wide variety of other biological terms obtained by integrating prior knowledge of gene function from multiple resources. Enrichr [5] is a web-based application which allows to perform enrichment analysis using a large collection of gene-set libraries and various interactive approaches to display enrichment results."
]
},
{
@@ -806,6 +783,7 @@
"outputs": [],
"source": [
"# Loop through signatures\n",
+ "ref_counter.append('Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.')\n",
"results = {}\n",
"results['enrichr']= {}\n",
"if diff_gex_method == \"characteristic_direction\":\n",
@@ -837,6 +815,7 @@
" enrichr_link_dict[title_down] = dict()\n",
" enrichr_link_dict[title_down][\"link\"] = \"link to Enrichr\".format(results['enrichr'][label][\"downregulated\"][\"shortId\"])\n",
"\n",
+ "notebook_metadata['enrichr_links'] = enrichr_link_dict\n",
"enrichr_link_df = pd.DataFrame.from_dict(enrichr_link_dict).T\n",
"table_counter, notebook_metadata = display_object(table_counter, \"The table displays links to Enrichr containing the results of enrichment analyses generated by analyzing the up-regulated and down-regulated genes from a differential expression analysis. By clicking on these links, users can interactively explore and download the enrichment results from the Enrichr website.\", notebook_metadata=notebook_metadata, saved_filename=\"enrichr_links.csv\", df=enrichr_link_df, ishtml=True)"
]
@@ -850,7 +829,7 @@
"%%appyter markdown\n",
"{% if \"Gene Ontology\" in enrichr_libraries.value %}\n",
"# GO Enrichment Analysis\n",
- "Gene Ontology (GO) (Ashburner et al. 2000) is a major bioinformatics initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify the biological processes, molecular functions and cellular components which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
+ "Gene Ontology (GO) [{{ref_counter|length}}] is a major bioinformatics initiative aimed at unifying the representation of gene attributes across all species. It contains a large collection of experimentally validated and predicted associations between genes and biological terms. This information can be leveraged by Enrichr to identify the biological processes, molecular functions and cellular components which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
"{% endif %}"
]
},
@@ -862,7 +841,7 @@
"source": [
"%%appyter code_exec\n",
"{% if \"Gene Ontology\" in enrichr_libraries.value %}\n",
- "\n",
+ "ref_counter.append('Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.')\n",
"results['go_enrichment'] = {}\n",
"for label, signature in signatures.items():\n",
" # Run analysis\n",
@@ -891,7 +870,7 @@
"%%appyter markdown\n",
"{% if \"Pathway\" in enrichr_libraries.value %}\n",
"# Pathway Enrichment Analysis\n",
- "Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG (Kanehisa et al. 2000), Reactome (Croft et al. 2014) and WikiPathways (Kelder et al. 2012) contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
+ "Biological pathways are sequences of interactions between biochemical compounds which play a key role in determining cellular behavior. Databases such as KEGG [{{ref_counter|length}}], Reactome [{{ref_counter|length + 1}}] and WikiPathways [{{ref_counter|length + 2}}] contain a large number of associations between such pathways and genes. This information can be leveraged by Enrichr to identify the biological pathways which are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
"{% endif %}"
]
},
@@ -904,6 +883,7 @@
"%%appyter code_exec\n",
"{% if \"Pathway\" in enrichr_libraries.value %}\n",
"# Initialize results\n",
+ "ref_counter.extend(['Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.', 'Croft, David, et al. The Reactome pathway knowledgebase. Nucleic acids research 42.D1 (2014): D472-D477.', 'Kelder, Thomas, et al. WikiPathways: building research communities on biological pathways. Nucleic acids research 40.D1 (2012): D1301-D1307.'])\n",
"results['pathway_enrichment'] = {}\n",
"\n",
"# Loop through results\n",
@@ -921,7 +901,7 @@
" for gene_set_library in libraries:\n",
" # Display results\n",
" plot_name = \"{}_barchart_{}.png\".format(gene_set_library, label)\n",
- " plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type)\n",
+ " plot_library_barchart(enrichment_results, gene_set_library, enrichment_results['signature_label'], enrichment_results['sort_results_by'], nr_genesets=nr_genesets, plot_type=plot_type, plot_name=plot_name\n",
" figure_counter, notebook_metadata = display_object(figure_counter, \"Enrichment Analysis Results for {} in {}. The figure contains interactive bar charts displaying the results of the pathway enrichment analysis generated using Enrichr. The x axis indicates the -log10(P-value) for each term. Significant terms are highlighted in bold. Additional information about enrichment results is available by hovering over each bar.\".format(label, gene_set_library), notebook_metadata, saved_filename=plot_name, istable=False)\n",
"{% endif %}"
]
@@ -935,7 +915,7 @@
"%%appyter markdown\n",
"{% if \"Transcription Factor\" in enrichr_libraries.value %}\n",
"# Transcription Factor Enrichment Analysis\n",
- "Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA (Lachmann et al. 2010) and ENCODE (Consortium, 2014) contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
+ "Transcription Factors (TFs) are proteins involved in the transcriptional regulation of gene expression. Databases such as ChEA [{{ref_counter|length}}] and ENCODE [{{ref_counter|length + 1}}] contain a large number of associations between TFs and their transcriptional targets. This information can be leveraged by Enrichr to identify the transcription factors whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
"{% endif %}"
]
},
@@ -948,6 +928,7 @@
"%%appyter code_exec\n",
"{% if \"Transcription Factor\" in enrichr_libraries.value %}\n",
"# Initialize results\n",
+ "ref_counter.extend(['Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma\\'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.', 'ENCODE Consortium (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.'])\n",
"results['tf_enrichment'] = {}\n",
"\n",
"# Loop through results\n",
@@ -968,7 +949,7 @@
"%%appyter markdown\n",
"{% if \"Kinase\" in enrichr_libraries.value %}\n",
"# Kinase Enrichment Analysis\n",
- "Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA (Lachmann et al. 2009) contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
+ "Protein kinases are enzymes that modify other proteins by chemically adding phosphate groups. Databases such as KEA [{{ref_counter|length}}] contain a large number of associations between kinases and their substrates. This information can be leveraged by Enrichr to identify the protein kinases whose substrates are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
"{% endif %}"
]
},
@@ -981,6 +962,7 @@
"%%appyter code_exec\n",
"{% if \"Kinase\" in enrichr_libraries.value %}\n",
"# Initialize results\n",
+ "ref_counter.append('Lachmann, Alexander, and Avi Ma\\'ayan. \"KEA: kinase enrichment analysis.\" Bioinformatics 25.5 (2009): 684-686. ')\n",
"results['kinase_enrichment'] = {}\n",
"\n",
"# Loop through results\n",
@@ -1002,7 +984,7 @@
"%%appyter markdown\n",
"{% if \"miRNA\" in enrichr_libraries.value %}\n",
"# miRNA Enrichment Analysis\n",
- "microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan (Agarwal et al. 2015) and MiRTarBase (Chou et al. 2016) contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
+ "microRNAs (miRNAs) are small non-coding RNA molecules which play a key role in the post-transcriptional regulation of gene expression. Databases such as TargetScan [{{ref_counter|length}}] and MiRTarBase [{{ref_counter|length + 1}}] contain a large number of associations between miRNAs and their targets. This information can be leveraged by Enrichr to identify the miRNAs whose targets are over-represented in the up-regulated and down-regulated genes identified by comparing two groups of samples.\n",
"{% endif %}"
]
},
@@ -1014,6 +996,7 @@
"source": [
"%%appyter code_exec\n",
"{% if \"miRNA\" in enrichr_libraries.value %}\n",
+ "ref_counter.extend(['Agarwal, Vikram, et al. Predicting effective microRNA target sites in mammalian mRNAs. elife 4 (2015): e05005.', 'Chou, Chih-Hung, et al. miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database. Nucleic acids research 44.D1 (2016): D239-D247.'])\n",
"\n",
"results['mirna_enrichment'] = {}\n",
"\n",
@@ -1036,7 +1019,7 @@
"%%appyter markdown\n",
"{% if small_molecule_method.value == \"L1000CDS2\" %}\n",
"# L1000CDS2 Query\n",
- "L1000CDS2 (Duan et al. 2016) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project. It is commonly used to identify small molecules which mimic or reverse the effects of a gene expression signature generated from a differential gene expression analysis.\n",
+ "L1000CDS2 [{{ref_counter|length}}] is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project. It is commonly used to identify small molecules which mimic or reverse the effects of a gene expression signature generated from a differential gene expression analysis.\n",
"{% endif %}"
]
},
@@ -1049,6 +1032,7 @@
"%%appyter code_exec\n",
"{% if small_molecule_method.value == \"L1000CDS2\" %}\n",
"# Initialize results\n",
+ "ref_counter.append('Duan, Q., et al. L1000CDS2: Lincs l1000 characteristic direction signatures search engine. NPJ Syst Biol Appl. 2016; 2: 16015. (2016).')\n",
"results['l1000cds2'] = {}\n",
"\n",
"# Loop through signatures\n",
@@ -1071,7 +1055,7 @@
"%%appyter markdown\n",
"{% if small_molecule_method.value == \"L1000FWD\" %}\n",
"# L1000FWD Query\n",
- "L1000FWD (Wang et al. 2018) is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project.\n",
+ "L1000FWD [{{ref_counter|length}}] is a web-based tool for querying gene expression signatures against signatures created from human cell lines treated with over 20,000 small molecules and drugs for the LINCS project.\n",
"{% endif %}"
]
},
@@ -1084,6 +1068,7 @@
"%%appyter code_exec\n",
"{% if small_molecule_method.value == \"L1000FWD\" %}\n",
"# Initialize results\n",
+ "ref_counter.append('Wang, Zichen, et al. L1000FWD: fireworks visualization of drug-induced transcriptomic signatures. Bioinformatics 34.12 (2018): 2150-2152.')\n",
"results['l1000fwd'] = {}\n",
"\n",
"# Loop through signatures\n",
@@ -1105,6 +1090,7 @@
"outputs": [],
"source": [
"# save metadata of the notebook as json\n",
+ "notebook_metadata['references'] = ref_counter\n",
"with open(\"notebook_metadata.json\", \"w\") as fw:\n",
" json.dump(notebook_metadata, fw)"
]
@@ -1120,45 +1106,20 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Agarwal, Vikram, et al. \"Predicting effective microRNA target sites in mammalian mRNAs.\" elife 4 (2015): e05005.\n",
- "
\n",
- "Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S. and Eppig, J.T. (2000) Gene Ontology: tool for the unification of biology. Nature genetics, 25, 25.\n",
- "
\n",
- "Chou, Chih-Hung, et al. \"miRTarBase 2016: updates to the experimentally validated miRNA-target interactions database.\" Nucleic acids research 44.D1 (2016): D239-D247.\n",
- "
\n",
- "Clark, N.R. and Ma’ayan, A. (2011) Introduction to statistical methods to analyze large data sets: principal components analysis. Sci. Signal., 4, tr3-tr3.\n",
- "
\n",
- "Clark, Neil R., et al. \"The characteristic direction: a geometrical approach to identify differentially expressed genes.\" BMC bioinformatics 15.1 (2014): 79.\n",
- "
\n",
- "Consortium, E.P. (2004) The ENCODE (ENCyclopedia of DNA elements) project. Science, 306, 636-640.\n",
- "
\n",
- "Croft, David, et al. \"The Reactome pathway knowledgebase.\" Nucleic acids research 42.D1 (2014): D472-D477.\n",
- "
\n",
- "Duan, Q., et al. \"L1000cds2: Lincs l1000 characteristic direction signatures search engine. NPJ Syst Biol Appl. 2016; 2: 16015.\" (2016).\n",
- "
\n",
- "Fernandez, Nicolas F., et al. \"Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data.\" Scientific data 4 (2017): 170151.\n",
- "
\n",
- "Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic acids research, 28, 27-30.\n",
- "
\n",
- "Kelder, Thomas, et al. \"WikiPathways: building research communities on biological pathways.\" Nucleic acids research 40.D1 (2012): D1301-D1307.\n",
- "
\n",
- "Kuleshov, M.V., Jones, M.R., Rouillard, A.D., Fernandez, N.F., Duan, Q., Wang, Z., Koplev, S., Jenkins, S.L., Jagodnik, K.M. and Lachmann, A. (2016) Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic acids research, 44, W90-W97.\n",
- "
\n",
- "Lachmann, A., Xu, H., Krishnan, J., Berger, S.I., Mazloom, A.R. and Ma'ayan, A. (2010) ChEA: transcription factor regulation inferred from integrating genome-wide ChIP-X experiments. Bioinformatics, 26, 2438-2444.\n",
- "
\n",
- "Lachmann, Alexander, and Avi Ma'ayan. \"KEA: kinase enrichment analysis.\" Bioinformatics 25.5 (2009): 684-686.\n",
- "
\n",
- "Ritchie, Matthew E., et al. \"limma powers differential expression analyses for RNA-sequencing and microarray studies.\" Nucleic acids research 43.7 (2015): e47-e47.\n",
+ "%%appyter markdown\n",
+ "\n",
+ "{% for result in results %}\n",
+ "{{ loop.index }}. {{ result }}\n",
"
\n",
- "Wang, Zichen, et al. \"L1000FWD: fireworks visualization of drug-induced transcriptomic signatures.\" Bioinformatics 34.12 (2018): 2150-2152."
+ "{% endfor %}"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "py39",
+ "display_name": "Python 3",
"language": "python",
- "name": "py39"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1170,7 +1131,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.5"
+ "version": "3.9.2"
}
},
"nbformat": 4,
diff --git a/appyters/Bulk_RNA_seq/appyter.json b/appyters/Bulk_RNA_seq/appyter.json
index dd36cabc..9ebe8dfc 100644
--- a/appyters/Bulk_RNA_seq/appyter.json
+++ b/appyters/Bulk_RNA_seq/appyter.json
@@ -2,7 +2,7 @@
"$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
"name": "Bulk_RNA_seq",
"title": "Bulk RNA-seq analysis pipeline",
- "version": "0.1.8",
+ "version": "0.1.9",
"description": "Bulk RNA-seq analysis pipeline enables you to analyze and visualize datasets with state-of-the-art algorithms and visualization methods.",
"image": "screenshot.png",
"authors": [
@@ -16,7 +16,7 @@
},
{
"name": "Maxim Kuleshov",
- "email": "kuleshov.max.v@gmail.com"
+ "email": "maxim.kuleshov@mssm.edu"
}
],
"url": "https://github.com/MaayanLab/appyter-catalog",
diff --git a/appyters/Bulk_RNA_seq/utils.py b/appyters/Bulk_RNA_seq/utils.py
index fdc40d8e..12dcad14 100644
--- a/appyters/Bulk_RNA_seq/utils.py
+++ b/appyters/Bulk_RNA_seq/utils.py
@@ -1,27 +1,26 @@
-from rpy2 import robjects
-from rpy2.robjects import r, pandas2ri
-# Basic libraries
-import pandas as pd
-import os
-import urllib3
-import requests, json
-import random
-import time
+import json
+import matplotlib.pyplot as plt
import numpy as np
-import warnings
+import os
+import pandas as pd
-# Visualization
import plotly
-from plotly import tools
import plotly.express as px
import plotly.graph_objs as go
-import matplotlib.pyplot as plt; plt.rcdefaults()
+import random
+import requests
+import time
+import urllib3
+import warnings
+from plotly import tools
+from rpy2 import robjects
+from rpy2.robjects import r, pandas2ri
+
from matplotlib import rcParams
import IPython
from IPython.display import HTML, display, Markdown, IFrame
-
# Data analysis
from itertools import combinations
import scipy.spatial.distance as dist
@@ -33,76 +32,82 @@
from sklearn.manifold import TSNE
from plotly.offline import init_notebook_mode
-init_notebook_mode(connected = False)
+
+plt.rcdefaults()
+init_notebook_mode(connected=False)
+
def check_files(fname):
if fname == "":
raise IOError
- if fname.endswith(".txt") == False and fname.endswith(".csv") ==False and fname.endswith(".tsv")==False:
+ if fname.endswith(".txt") == False and fname.endswith(".csv") == False and fname.endswith(".tsv") == False:
raise IOError
+
+
def check_df(df, col):
if col not in df.columns:
raise IOError
-
-def CPM(data):
+
+def cpm(data):
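+ # counts per million (CPM): scale each sample (column) so its counts sum to one million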
with warnings.catch_warnings():
warnings.simplefilter("ignore")
- data = (data/data.sum())*10**6
+ data = (data / data.sum()) * 10 ** 6
data = data.fillna(0)
-
+
return data
-def logCPM(data):
+
+def log_cpm(data):
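+ # CPM scaling followed by a log2(x + 1) transform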
with warnings.catch_warnings():
warnings.simplefilter("ignore")
- data = (data/data.sum())*10**6
+ data = (data / data.sum()) * 10 ** 6
data = data.fillna(0)
- data = np.log2(data+1)
+ data = np.log2(data + 1)
# Return
return data
-def log(data):
+
+def log(data):
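+ # log2(x + 1) transform without CPM scaling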
with warnings.catch_warnings():
warnings.simplefilter("ignore")
data = data.fillna(0)
- data = np.log2(data+1)
+ data = np.log2(data + 1)
return data
-def qnormalization(data):
-
- X_quantile_norm = quantile_normalize(data)
- return X_quantile_norm
-def normalize(dataset, current_dataset, logCPM_normalization, log_normalization, z_normalization, q_normalization):
+
+def normalize(dataset, current_dataset, log_cpm_normalization, log_normalization, z_normalization, q_normalization):
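+ # apply the selected normalizations in order; the returned string records the chain of steps applied (e.g. current_dataset + '+logCPM+z_norm')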
normalization = current_dataset
- if logCPM_normalization == True:
+ if log_cpm_normalization:
data = dataset[normalization]
normalization += '+logCPM'
- dataset[normalization] = logCPM(data)
-
- if log_normalization == True:
+ dataset[normalization] = log_cpm(data)
+
+ if log_normalization:
data = dataset[normalization]
normalization += '+log'
dataset[normalization] = log(data)
-
- if z_normalization == True:
+
+ if z_normalization:
data = dataset[normalization]
- normalization += '+z_norm'
+ normalization += '+z_norm'
dataset[normalization] = data.T.apply(ss.zscore, axis=0).T.dropna()
- if q_normalization == True:
+ if q_normalization:
data = dataset[normalization]
normalization += '+q_norm'
- dataset[normalization] = qnormalization(data)
+ dataset[normalization] = quantile_normalize(data)
return dataset, normalization
-def create_download_link(df, title = "Download CSV file: {}", filename = "data.csv"):
+
+def create_download_link(df, title="Download CSV file: {}", filename="data.csv"):
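+ # write the dataframe to CSV and return an HTML link for downloading it from the notebook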
df.to_csv(filename)
html = "{}".format(filename, title.format(filename))
return HTML(html)
+
def display_link(url):
raw_html = '<a href="%s" target="_blank">%s</a>' % (url, url)
return display(HTML(raw_html))
@@ -110,20 +115,19 @@ def display_link(url):
def display_object(counter, caption, notebook_metadata=None, saved_filename=None, df=None, istable=True, ishtml=False):
object_info = dict()
-
+
if df is not None:
- if ishtml == True:
+ if ishtml:
display(HTML(df.to_html(render_links=True, escape=False)))
else:
display(df)
df.to_csv(saved_filename)
-
- if istable == True:
+
+ if istable:
if notebook_metadata is not None:
object_info["description"] = caption
object_info["file_name"] = saved_filename
- notebook_metadata["tables"][str(counter)] = object_info
-
+ notebook_metadata["tables"][str(counter)] = object_info
display(Markdown("*Table {}. {}*".format(counter, caption)))
else:
@@ -131,38 +135,42 @@ def display_object(counter, caption, notebook_metadata=None, saved_filename=None
object_info["description"] = caption
object_info["file_name"] = saved_filename
notebook_metadata["figures"][str(counter)] = object_info
-
+
display(Markdown("*Figure {}. {}*".format(counter, caption)))
-
+
counter += 1
if notebook_metadata is not None:
return counter, notebook_metadata
else:
return counter
-
-def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_genes=2500, filter_samples=True, plot_type='interactive'):
+
+
+def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_genes=2500, filter_samples=True,
+ plot_type='interactive'):
# Get data
before_norm = normalization.replace("+z_norm", "").replace("+q_norm", "")
top_genes = dataset[before_norm].var(axis=1).sort_values(ascending=False)
-
+
expression_dataframe = dataset[normalization]
-
+
# Filter columns
if filter_samples and dataset.get('signature_metadata'):
- selected_samples = [sample for samples in list(dataset['signature_metadata'].values())[0].values() for sample in samples]
+ selected_samples = [sample for samples in list(dataset['signature_metadata'].values())[0].values() for sample in
+ samples]
expression_dataframe = expression_dataframe[selected_samples]
# Filter rows
expression_dataframe = expression_dataframe.loc[top_genes.index[:nr_genes]]
- result=None
+ result = None
axis = None
if method == 'PCA':
# Run PCA
- pca=PCA(n_components=3)
+ pca = PCA(n_components=3)
result = pca.fit(expression_dataframe)
# Get Variance
- axis = ['PC'+str((i+1))+'('+str(round(e*100, 1))+'% var. explained)' for i, e in enumerate(pca.explained_variance_ratio_)]
+ axis = ['PC' + str((i + 1)) + '(' + str(round(e * 100, 1)) + '% var. explained)' for i, e in
+ enumerate(pca.explained_variance_ratio_)]
elif method == "UMAP":
u = umap.UMAP(n_components=3)
result = u.fit_transform(expression_dataframe.T)
@@ -184,14 +192,15 @@ def run_dimension_reduction(dataset, method='PCA', normalization='logCPM', nr_ge
col.append(B_label)
else:
col.append('Other')
- dataset['dataset_metadata']['Sample Group'] = col
-
- # Return
- results = {'method': method, 'result': result, 'axis': axis,
- 'dataset_metadata': dataset['dataset_metadata'][dataset['dataset_metadata'].index == expression_dataframe.columns],
- 'nr_genes': nr_genes,
- 'normalization': normalization, 'signature_metadata': dataset.get('signature_metadata'),
- 'plot_type': plot_type}
+ dataset['dataset_metadata']['Sample Group'] = col
+
+ # Return
+ results = {'method': method, 'result': result, 'axis': axis,
+ 'dataset_metadata': dataset['dataset_metadata'][
+ dataset['dataset_metadata'].index == expression_dataframe.columns],
+ 'nr_genes': nr_genes,
+ 'normalization': normalization, 'signature_metadata': dataset.get('signature_metadata'),
+ 'plot_type': plot_type}
return results
@@ -199,7 +208,7 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo
pca_transformed = pca_results['result']
axis = pca_results['axis']
meta_df = pca_results['dataset_metadata']
-
+
if pca_results["method"] == "PCA":
meta_df['x'] = pca_transformed.components_[0]
meta_df['y'] = pca_transformed.components_[1]
@@ -208,7 +217,7 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo
meta_df['x'] = [x[0] for x in pca_transformed]
meta_df['y'] = [x[1] for x in pca_transformed]
meta_df['z'] = [x[2] for x in pca_transformed]
-
+
fig = px.scatter_3d(meta_df, x='x', y='y', z='z', color=meta_class_column_name)
if plot_type == "interactive":
fig.show()
@@ -223,35 +232,36 @@ def plot_samples(pca_results, meta_class_column_name, counter, plot_name, notebo
object_info = dict()
object_info["description"] = caption
object_info["file_name"] = plot_name
-
+
notebook_metadata["figures"][str(counter)] = object_info
-
+
counter += 1
return counter, notebook_metadata
-def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM', z_score=True, nr_genes=1500, metadata_cols=None, filter_samples=True,gene_list=None):
+
+def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM', z_score=True, nr_genes=1500,
+ metadata_cols=None, filter_samples=True, gene_list=None):
# Subset the expression DataFrame using top 800 genes with largest variance
data = dataset[normalization]
variances = np.var(data, axis=1)
srt_idx = variances.argsort()[::-1]
- if gene_list == None or len(gene_list) == 0:
+ if gene_list is None or len(gene_list) == 0:
expr_df_sub = data.iloc[srt_idx].iloc[:nr_genes]
else:
gene_list = gene_list.split("\n")
common_gene_list = list(set(gene_list).intersection(set(data.index)))
expr_df_sub = data.loc[common_gene_list, :]
assert len(expr_df_sub.index) > 0
-
+
# prettify sample names
- sample_names = ['::'.join([y, x]) for x,y in
- zip(dataset["dataset_metadata"][meta_class_column_name], expr_df_sub.columns)]
+ sample_names = ['::'.join([y, x]) for x, y in
+ zip(dataset["dataset_metadata"][meta_class_column_name], expr_df_sub.columns)]
expr_df_sub.columns = sample_names
- expr_df_sub.index = ["Gene: "+str(x) for x in expr_df_sub.index]
- sample_name = ["Sample: "+x for x in sample_names]
+ expr_df_sub.index = ["Gene: " + str(x) for x in expr_df_sub.index]
+ sample_name = ["Sample: " + x for x in sample_names]
expr_df_sub.columns = sample_name
-
- treatment_type = ["Class: "+ x.split("::")[1] for x in sample_names]
+ treatment_type = ["Class: " + x.split("::")[1] for x in sample_names]
new_series = pd.DataFrame(treatment_type).T
new_series.columns = expr_df_sub.columns
expr_df_sub = pd.concat([new_series, expr_df_sub], axis=0)
@@ -265,7 +275,8 @@ def run_clustergrammer(dataset, meta_class_column_name, normalization='logCPM',
clustergrammer_url = 'https://maayanlab.cloud/clustergrammer/matrix_upload/'
r = requests.post(clustergrammer_url, files={'file': open(expr_df_sub_file, 'rb')}).text
return r
-
+
+
#############################################
########## 2. Plot
#############################################
@@ -277,27 +288,41 @@ def plot_clustergrammar(clustergrammer_url):
display(IPython.display.IFrame(clustergrammer_url, width="1000", height="1000"))
-def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text', color='black', colorscale='Blues', size=8, showscale=False, symmetric_x=False, symmetric_y=False, pad=0.5, hline=False, vline=False, return_trace=False, labels=False, plot_type='interactive', de_type='ma', plot_name="2d_plot.png"):
- range_x = [-max(abs(x))-pad, max(abs(x))+pad]if symmetric_x else None
- range_y = [-max(abs(y))-pad, max(abs(y))+pad]if symmetric_y else None
- trace = go.Scattergl(x=x, y=y, mode='markers', text=text, hoverinfo=hoverinfo, marker={'color': color, 'colorscale': colorscale, 'showscale': showscale, 'size': size})
+def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text', color='black', colorscale='Blues',
+ size=8, showscale=False, symmetric_x=False, symmetric_y=False, pad=0.5, hline=False, vline=False,
+ return_trace=False, labels=False, plot_type='interactive', de_type='ma', plot_name="2d_plot.png"):
+ range_x = [-max(abs(x)) - pad, max(abs(x)) + pad] if symmetric_x else None
+ range_y = [-max(abs(y)) - pad, max(abs(y)) + pad] if symmetric_y else None
+ trace = go.Scattergl(x=x, y=y, mode='markers', text=text, hoverinfo=hoverinfo,
+ marker={'color': color, 'colorscale': colorscale, 'showscale': showscale, 'size': size})
if return_trace:
return trace
else:
if de_type == 'ma':
annotations = [
- {'x': 1, 'y': 0.1, 'text':'<b>Down-regulated in '+labels[-1]+'</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', 'yanchor': 'top'},
- {'x': 1, 'y': 0.9, 'text':'<b>Up-regulated in '+labels[-1]+'</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right', 'yanchor': 'bottom'}
+ {'x': 1, 'y': 0.1,
+ 'text': '<b>Down-regulated in ' + labels[-1] + '</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right',
+ 'yanchor': 'top'},
+ {'x': 1, 'y': 0.9,
+ 'text': '<b>Up-regulated in ' + labels[-1] + '</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'right',
+ 'yanchor': 'bottom'}
] if labels else []
elif de_type == 'volcano':
annotations = [
- {'x': 0.25, 'y': 1.07, 'text':'<b>Down-regulated in '+labels[-1]+'</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'},
- {'x': 0.75, 'y': 1.07, 'text':'<b>Up-regulated in '+labels[-1]+'</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}
+ {'x': 0.25, 'y': 1.07,
+ 'text': '<b>Down-regulated in ' + labels[-1] + '</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'},
+ {'x': 0.75, 'y': 1.07,
+ 'text': '<b>Up-regulated in ' + labels[-1] + '</b>', 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}
] if labels else []
- layout = go.Layout(title=title, xaxis={'title': xlab, 'range': range_x}, yaxis={'title': ylab, 'range': range_y}, hovermode='closest', annotations=annotations)
+ layout = go.Layout(title=title, xaxis={'title': xlab, 'range': range_x},
+ yaxis={'title': ylab, 'range': range_y}, hovermode='closest', annotations=annotations)
fig = go.Figure(data=[trace], layout=layout)
-
- if plot_type=='interactive':
+
+ if plot_type == 'interactive':
fig.show()
else:
fig.show(renderer="png")
@@ -389,79 +414,93 @@ def plot_2D_scatter(x, y, text='', title='', xlab='', ylab='', hoverinfo='text',
}
''')
+
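+# compute a differential expression signature for every pairwise combination of classes using the selected DGE method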
def get_signatures(classes, dataset, normalization, method, meta_class_column_name, filter_genes):
- tmp_normalization = normalization.replace("+z_norm+q_norm","").replace("+z_norm","")
+ tmp_normalization = normalization.replace("+z_norm+q_norm", "").replace("+z_norm", "")
raw_expr_df = dataset['rawdata']
expr_df = dataset['rawdata']
- if filter_genes == True:
+ if filter_genes:
expr_df = dataset['rawdata+filter_genes']
-
+
signatures = dict()
for cls1, cls2 in combinations(classes, 2):
print(cls1, cls2)
- cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls1, :].index.tolist() #control
- cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls2,:].index.tolist() #case
-
+ cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name] == cls1,
+ :].index.tolist() # control
+ cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name] == cls2,
+ :].index.tolist() # case
+
signature_label = " vs. ".join([cls1, cls2])
-
+
if method == "limma":
limma = robjects.r['limma']
- design_dataframe = pd.DataFrame([{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in raw_expr_df.columns]).set_index('index')
+ design_dataframe = pd.DataFrame(
+ [{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in
+ raw_expr_df.columns]).set_index('index')
processed_data = {"expression": raw_expr_df, 'design': design_dataframe}
-
- limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']), pandas2ri.conversion.py2rpy(processed_data['design']), filter_genes=filter_genes))
-
+
+ limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']),
+ pandas2ri.conversion.py2rpy(processed_data['design']),
+ filter_genes=filter_genes))
+
signature = pd.DataFrame(limma_results[0])
signature.index = limma_results[1]
signature = signature.sort_values("t", ascending=False)
-
+
elif method == "characteristic_direction":
- signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True)
+ signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids],
+ dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True)
signature = signature.sort_values("CD-coefficient", ascending=False)
elif method == "edgeR":
edgeR = robjects.r['edgeR']
-
- edgeR_results = pandas2ri.conversion.rpy2py(edgeR(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids), pandas2ri.conversion.py2rpy(cls2_sample_ids)))
-
+
+ edgeR_results = pandas2ri.conversion.rpy2py(
+ edgeR(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids),
+ pandas2ri.conversion.py2rpy(cls2_sample_ids)))
+
signature = pd.DataFrame(edgeR_results[0])
signature.index = edgeR_results[1]
signature = signature.sort_values("logFC", ascending=False)
elif method == "DESeq2":
# deseq2 receives raw counts
DESeq2 = robjects.r['deseq2']
- DESeq2_results = pandas2ri.conversion.rpy2py(DESeq2(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids), pandas2ri.conversion.py2rpy(cls2_sample_ids)))
-
+ DESeq2_results = pandas2ri.conversion.rpy2py(
+ DESeq2(pandas2ri.conversion.py2rpy(expr_df.loc[:, cls1_sample_ids+cls2_sample_ids]), pandas2ri.conversion.py2rpy(cls1_sample_ids),
+ pandas2ri.conversion.py2rpy(cls2_sample_ids)))
+
signature = pd.DataFrame(DESeq2_results[0])
signature.index = DESeq2_results[1]
signature = signature.sort_values("log2FoldChange", ascending=False)
-
-
+
signatures[signature_label] = signature
- return signatures
+ return signatures
def run_volcano(signature, signature_label, dataset, pvalue_threshold, logfc_threshold, plot_type):
color = []
text = []
for index, rowData in signature.iterrows():
- if "AveExpr" in rowData.index: # limma
+ if "AveExpr" in rowData.index: # limma
expr_colname = "AveExpr"
pval_colname = "P.Value"
logfc_colname = "logFC"
- elif "logCPM" in rowData.index: #edgeR
+ elif "logCPM" in rowData.index: # edgeR
expr_colname = "logCPM"
pval_colname = "PValue"
logfc_colname = "logFC"
- elif "baseMean" in rowData.index: #DESeq2
+ elif "baseMean" in rowData.index: # DESeq2
expr_colname = "baseMean"
pval_colname = "pvalue"
logfc_colname = "log2FoldChange"
# Text
- text.append('<b>'+index+'</b><br>Avg Expression = '+str(round(rowData[expr_colname], ndigits=2))+'<br>logFC = '+str(round(rowData[logfc_colname], ndigits=2))+'<br>p = '+'{:.2e}'.format(rowData[pval_colname])+'<br>FDR = '+'{:.2e}'.format(rowData[pval_colname]))
+ text.append('<b>' + index + '</b><br>Avg Expression = ' + str(
+ round(rowData[expr_colname], ndigits=2)) + '<br>logFC = ' + str(
+ round(rowData[logfc_colname], ndigits=2)) + '<br>p = ' + '{:.2e}'.format(
+ rowData[pval_colname]) + '<br>FDR = ' + '{:.2e}'.format(rowData[pval_colname]))
# Color
if rowData[pval_colname] < pvalue_threshold:
@@ -475,12 +514,13 @@ def run_volcano(signature, signature_label, dataset, pvalue_threshold, logfc_thr
else:
color.append('black')
- volcano_plot_results = {'x': signature[logfc_colname], 'y': -np.log10(signature[pval_colname]), 'text':text, 'color': color, 'signature_label': signature_label, 'plot_type': plot_type}
+ volcano_plot_results = {'x': signature[logfc_colname], 'y': -np.log10(signature[pval_colname]), 'text': text,
+ 'color': color, 'signature_label': signature_label, 'plot_type': plot_type}
return volcano_plot_results
-
+
def plot_volcano(volcano_plot_results):
- spacer = ' '*50
+ spacer = ' ' * 50
plot_name = "volcano_plot_{}.png".format(volcano_plot_results['signature_label'])
plot_2D_scatter(
x=volcano_plot_results['x'],
@@ -495,29 +535,32 @@ def plot_volcano(volcano_plot_results):
plot_type=volcano_plot_results['plot_type'],
de_type='volcano',
plot_name=plot_name
- )
+ )
return plot_name
-def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_threshold=1.5, plot_type='interactive'):
+def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_threshold=1.5, plot_type='interactive'):
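+ # build hover text and point colors for an MA plot (average expression vs. logFC)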
# Loop through signature
color = []
text = []
for index, rowData in signature.iterrows():
- if "AveExpr" in rowData.index: # limma
+ if "AveExpr" in rowData.index: # limma
expr_colname = "AveExpr"
pval_colname = "adj.P.Val"
logfc_colname = "logFC"
- elif "logCPM" in rowData.index: #edgeR
+ elif "logCPM" in rowData.index: # edgeR
expr_colname = "logCPM"
pval_colname = "PValue"
logfc_colname = "logFC"
- elif "baseMean" in rowData.index: #DESeq2
+ elif "baseMean" in rowData.index: # DESeq2
expr_colname = "baseMean"
pval_colname = "padj"
logfc_colname = "log2FoldChange"
# Text
- text.append('<b>'+index+'</b><br>Avg Expression = '+str(round(rowData[expr_colname], ndigits=2))+'<br>logFC = '+str(round(rowData[logfc_colname], ndigits=2))+'<br>p = '+'{:.2e}'.format(rowData[pval_colname])+'<br>FDR = '+'{:.2e}'.format(rowData[pval_colname]))
+ text.append('<b>' + index + '</b><br>Avg Expression = ' + str(
+ round(rowData[expr_colname], ndigits=2)) + '<br>logFC = ' + str(
+ round(rowData[logfc_colname], ndigits=2)) + '<br>p = ' + '{:.2e}'.format(
+ rowData[pval_colname]) + '<br>FDR = ' + '{:.2e}'.format(rowData[pval_colname]))
# Color
if rowData[pval_colname] < pvalue_threshold:
@@ -530,12 +573,14 @@ def run_maplot(signature, signature_label='', pvalue_threshold=0.05, logfc_thres
else:
color.append('black')
-
+
# Return
- volcano_plot_results = {'x': signature[expr_colname], 'y': signature[logfc_colname], 'text':text, 'color': color, 'signature_label': signature_label, 'plot_type': plot_type}
+ volcano_plot_results = {'x': signature[expr_colname], 'y': signature[logfc_colname], 'text': text, 'color': color,
+ 'signature_label': signature_label, 'plot_type': plot_type}
return volcano_plot_results
-def plot_maplot(volcano_plot_results):
+
+def plot_maplot(volcano_plot_results):
plot_name = "ma_plot_{}.png".format(volcano_plot_results['signature_label'])
plot_2D_scatter(
x=volcano_plot_results['x'],
@@ -549,17 +594,16 @@ def plot_maplot(volcano_plot_results):
labels=volcano_plot_results['signature_label'].split(' vs. '),
plot_type=volcano_plot_results['plot_type'],
plot_name=plot_name
-
+
)
return plot_name
-def run_enrichr(signature, signature_label, geneset_size=500, fc_colname = 'logFC', sort_genes_by='t', ascending=True):
-
+def run_enrichr(signature, signature_label, geneset_size=500, fc_colname='logFC', sort_genes_by='t', ascending=True):
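+ # split the signature into up- and down-regulated gene sets and submit each to Enrichr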
# Sort signature
up_signature = signature[signature[fc_colname] > 0].sort_values(sort_genes_by, ascending=ascending)
down_signature = signature[signature[fc_colname] < 0].sort_values(sort_genes_by, ascending=ascending)
-
+
# Get genesets
genesets = {
'upregulated': up_signature.index[:geneset_size],
@@ -567,12 +611,15 @@ def run_enrichr(signature, signature_label, geneset_size=500, fc_colname = 'logF
}
# Submit to Enrichr
- enrichr_ids = {geneset_label: submit_enrichr_geneset(geneset=geneset, label=signature_label+', '+geneset_label+', from Bulk RNA-seq Appyter') for geneset_label, geneset in genesets.items()}
+ enrichr_ids = {geneset_label: submit_enrichr_geneset(geneset=geneset,
+ label=signature_label + ', ' + geneset_label + ', from Bulk RNA-seq Appyter')
+ for geneset_label, geneset in genesets.items()}
enrichr_ids['signature_label'] = signature_label
return enrichr_ids
+
def submit_enrichr_geneset(geneset, label=''):
- ENRICHR_URL = 'http://maayanlab.cloud/Enrichr/addList'
+ ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
genes_str = '\n'.join(geneset)
payload = {
'list': (None, genes_str),
@@ -586,22 +633,22 @@ def submit_enrichr_geneset(geneset, label=''):
return data
-
def get_enrichr_results(user_list_id, gene_set_libraries, overlappingGenes=True, geneset=None):
- ENRICHR_URL = 'http://maayanlab.cloud/Enrichr/enrich'
+ ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
query_string = '?userListId=%s&backgroundType=%s'
results = []
for gene_set_library, label in gene_set_libraries.items():
response = requests.get(
- ENRICHR_URL +
- query_string % (user_list_id, gene_set_library)
- )
+ ENRICHR_URL +
+ query_string % (user_list_id, gene_set_library)
+ )
if not response.ok:
raise Exception('Error fetching enrichment results')
data = json.loads(response.text)
resultDataframe = pd.DataFrame(data[gene_set_library], columns=[
- 'rank', 'term_name', 'pvalue', 'zscore', 'combined_score', 'overlapping_genes', 'FDR', 'old_pvalue', 'old_FDR'])
+ 'rank', 'term_name', 'pvalue', 'zscore', 'combined_score', 'overlapping_genes', 'FDR', 'old_pvalue',
+ 'old_FDR'])
selectedColumns = ['term_name', 'zscore', 'combined_score', 'pvalue', 'FDR'] if not overlappingGenes else [
'term_name', 'zscore', 'combined_score', 'FDR', 'pvalue', 'overlapping_genes']
resultDataframe = resultDataframe.loc[:, selectedColumns]
@@ -614,16 +661,15 @@ def get_enrichr_results(user_list_id, gene_set_libraries, overlappingGenes=True,
return concatenatedDataframe
-
-def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='interactive', library_type='go', version='2018', sort_results_by='pvalue'):
-
+def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='interactive', library_type='go',
+ version='2018', sort_results_by='pvalue'):
# Libraries
if library_type == 'go':
go_version = str(version)
libraries = {
- 'GO_Biological_Process_'+go_version: 'Gene Ontology Biological Process ('+go_version+' version)',
- 'GO_Molecular_Function_'+go_version: 'Gene Ontology Molecular Function ('+go_version+' version)',
- 'GO_Cellular_Component_'+go_version: 'Gene Ontology Cellular Component ('+go_version+' version)'
+ 'GO_Biological_Process_' + go_version: 'Gene Ontology Biological Process (' + go_version + ' version)',
+ 'GO_Molecular_Function_' + go_version: 'Gene Ontology Molecular Function (' + go_version + ' version)',
+ 'GO_Cellular_Component_' + go_version: 'Gene Ontology Cellular Component (' + go_version + ' version)'
}
elif library_type == "pathway":
# Libraries
@@ -634,7 +680,9 @@ def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='
}
# Get Enrichment Results
- enrichment_results = {geneset: get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, geneset=geneset) for geneset in ['upregulated', 'downregulated']}
+ enrichment_results = {
+ geneset: get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries,
+ geneset=geneset) for geneset in ['upregulated', 'downregulated']}
enrichment_results['signature_label'] = signature_label
enrichment_results['plot_type'] = plot_type
enrichment_results['sort_results_by'] = sort_results_by
@@ -642,8 +690,8 @@ def get_enrichr_results_by_library(enrichr_results, signature_label, plot_type='
# Return
return enrichment_results
-def get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='tf'):
+def get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='tf'):
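+ # query Enrichr with the up- and down-regulated gene sets against the selected TF, kinase or miRNA libraries and return one combined table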
# Libraries
if library_type == 'tf':
# Libraries
@@ -660,39 +708,41 @@ def get_enrichr_result_tables_by_library(enrichr_results, signature_label, libra
}
elif library_type == "mirna":
libraries = {
- 'TargetScan_microRNA_2017': 'TargetScan (experimentally validated targets)',
- 'miRTarBase_2017': 'miRTarBase (experimentally validated targets)'
+ 'TargetScan_microRNA_2017': 'TargetScan (experimentally validated targets)',
+ 'miRTarBase_2017': 'miRTarBase (experimentally validated targets)'
}
-
# Initialize results
results = []
# Loop through genesets
for geneset in ['upregulated', 'downregulated']:
-
        # Append enrichment results for this gene set
- enrichment_dataframe = get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries, geneset=geneset)
+ enrichment_dataframe = get_enrichr_results(enrichr_results[geneset]['userListId'], gene_set_libraries=libraries,
+ geneset=geneset)
results.append(enrichment_dataframe)
# Concatenate results
enrichment_dataframe = pd.concat(results)
return {'enrichment_dataframe': enrichment_dataframe, 'signature_label': signature_label}
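
# Illustrative usage (sketch, reusing the assumed `enrichr_results` and `signature_label` from above):
#   tf_table = get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='tf')
#   mirna_table = get_enrichr_result_tables_by_library(enrichr_results, signature_label, library_type='mirna')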
-
-def plot_library_barchart(enrichr_results, gene_set_library, signature_label, sort_results_by='pvalue', nr_genesets=15, height=400, plot_type='interactive', plot_name="barchart.png"):
+
+def plot_library_barchart(enrichr_results, gene_set_library, signature_label, sort_results_by='pvalue', nr_genesets=15,
+ height=400, plot_type='interactive', plot_name="barchart.png"):
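    # Draw two horizontal bar charts side by side (up-regulated genes on the left, down-regulated
    # on the right) for the selected Enrichr library, showing the top `nr_genesets` terms ranked
    # by -log10 p-value or by combined score.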
sort_results_by = 'log10P' if sort_results_by == 'pvalue' else 'combined_score'
fig = tools.make_subplots(rows=1, cols=2, print_grid=False)
for i, geneset in enumerate(['upregulated', 'downregulated']):
# Get dataframe
enrichment_dataframe = enrichr_results[geneset]
- plot_dataframe = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].sort_values(sort_results_by, ascending=False).iloc[:nr_genesets].iloc[::-1]
+ plot_dataframe = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].sort_values(
+ sort_results_by, ascending=False).iloc[:nr_genesets].iloc[::-1]
# Format
n = 7
plot_dataframe['nr_genes'] = [len(genes) for genes in plot_dataframe['overlapping_genes']]
-        plot_dataframe['overlapping_genes'] = ['<br>'.join([', '.join(genes[i:i+n]) for i in range(0, len(genes), n)]) for genes in plot_dataframe['overlapping_genes']]
+        plot_dataframe['overlapping_genes'] = ['<br>'.join([', '.join(genes[i:i + n]) for i in range(0, len(genes), n)])
+                                               for genes in plot_dataframe['overlapping_genes']]
# Get Bar
bar = go.Bar(
@@ -701,15 +751,17 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so
orientation='h',
name=geneset.title(),
showlegend=False,
-            hovertext=['{term_name}<br>P-value: {pvalue:.2}<br>FDR: {FDR:.2}<br>Z-score: {zscore:.3}<br>Combined score: {combined_score:.3}<br>{nr_genes} Genes: {overlapping_genes}<br>'.format(**rowData) for index, rowData in plot_dataframe.iterrows()],
+            hovertext=[
+                '{term_name}<br>P-value: {pvalue:.2}<br>FDR: {FDR:.2}<br>Z-score: {zscore:.3}<br>Combined score: {combined_score:.3}<br>{nr_genes} Genes: {overlapping_genes}<br>'.format(
+                    **rowData) for index, rowData in plot_dataframe.iterrows()],
hoverinfo='text',
marker={'color': '#FA8072' if geneset == 'upregulated' else '#87CEFA'}
)
- fig.append_trace(bar, 1, i+1)
+ fig.append_trace(bar, 1, i + 1)
# Get text
text = go.Scatter(
- x=[max(bar['x'])/50 for x in range(len(bar['y']))],
+ x=[max(bar['x']) / 50 for x in range(len(bar['y']))],
y=bar['y'],
mode='text',
hoverinfo='none',
@@ -719,15 +771,17 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so
textposition="middle right",
textfont={'color': 'black'}
)
- fig.append_trace(text, 1, i+1)
+ fig.append_trace(text, 1, i + 1)
# Get annotations
labels = signature_label.split(' vs. ')
annotations = [
-        {'x': 0.25, 'y': 1.06, 'text': 'Up-regulated in ' +
-            labels[-1], 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'},
-        {'x': 0.75, 'y': 1.06, 'text': 'Down-regulated in ' +
-            labels[-1], 'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}
+        {'x': 0.25, 'y': 1.06,
+         'text': 'Up-regulated in ' + labels[-1],
+         'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'},
+        {'x': 0.75, 'y': 1.06,
+         'text': 'Down-regulated in ' + labels[-1],
+         'showarrow': False, 'xref': 'paper', 'yref': 'paper', 'xanchor': 'center'}
] if signature_label else []
# Get title
@@ -735,12 +789,14 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so
fig['layout'].update(height=height, title='{}'.format(title),
hovermode='closest', annotations=annotations)
- fig['layout']['xaxis1'].update(domain=[0, 0.49], title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score')
- fig['layout']['xaxis2'].update(domain=[0.51, 1], title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score')
+ fig['layout']['xaxis1'].update(domain=[0, 0.49],
+ title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score')
+ fig['layout']['xaxis2'].update(domain=[0.51, 1],
+ title='-log10P' if sort_results_by == 'log10P' else 'Enrichment score')
fig['layout']['yaxis1'].update(showticklabels=False)
fig['layout']['yaxis2'].update(showticklabels=False)
fig['layout']['margin'].update(l=0, t=65, r=0, b=35)
- if plot_type=='interactive':
+ if plot_type == 'interactive':
fig.show()
else:
fig.show(renderer='png')
@@ -750,41 +806,56 @@ def plot_library_barchart(enrichr_results, gene_set_library, signature_label, so
def results_table(enrichment_dataframe, source_label, target_label, notebook_metadata, table_counter):
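    # For each Enrichr library in `enrichment_dataframe`, build a ranked, hyperlinked summary table
    # of enriched terms and their overlapping genes; `notebook_metadata` and `table_counter` are
    # presumably used to record and number the displayed tables.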
-
# Get libraries
for gene_set_library in enrichment_dataframe['gene_set_library'].unique():
# Get subset
- enrichment_dataframe_subset = enrichment_dataframe[enrichment_dataframe['gene_set_library'] == gene_set_library].copy()
+ enrichment_dataframe_subset = enrichment_dataframe[
+ enrichment_dataframe['gene_set_library'] == gene_set_library].copy()
# Get unique values from source column
enrichment_dataframe_subset[source_label] = [x.split('_')[0] for x in enrichment_dataframe_subset['term_name']]
- enrichment_dataframe_subset = enrichment_dataframe_subset.sort_values(['FDR', 'pvalue']).rename(columns={'pvalue': 'P-value'}).drop_duplicates(source_label)
+ enrichment_dataframe_subset = enrichment_dataframe_subset.sort_values(['FDR', 'pvalue']).rename(
+ columns={'pvalue': 'P-value'}).drop_duplicates(source_label)
# Add links and bold for significant results
-        enrichment_dataframe_subset[source_label] = ['<a href="...{}" target="_blank">{}</a>'.format(x.split(" ")[0], x) if '-miR-' in x else '<a href="...{}" target="_blank">{}</a>'.format(x.split(" ")[0], x) for x in enrichment_dataframe_subset[source_label]]
-
+        enrichment_dataframe_subset[source_label] = [
+            '<a href="...{}" target="_blank">{}</a>'.format(x.split(" ")[0], x) if '-miR-' in x
+            else '<a href="...{}" target="_blank">{}</a>'.format(x.split(" ")[0], x)
+            for x in enrichment_dataframe_subset[source_label]]
+
# else:
- # enrichment_dataframe_subset[source_label] = ['{x}'.format(**locals()) if '-miR-' in x else '{x}'.format(**locals())for x in enrichment_dataframe_subset[source_label]]
-        enrichment_dataframe_subset[source_label] = [rowData[source_label].replace('target="_blank">', 'target="_blank"><b>').replace('</a>', '*</a></b>') if rowData['FDR'] < 0.05 else rowData[source_label] for index, rowData in enrichment_dataframe_subset.iterrows()]
+ # enrichment_dataframe_subset[source_label] = ['{x}'.format(**locals()) if '-miR-' in x else '{x}'.format(**locals())for x in enrichment_dataframe_subset[source_label]]
+        enrichment_dataframe_subset[source_label] = [
+            rowData[source_label].replace('target="_blank">', 'target="_blank"><b>').replace('</a>', '*</a></b>') if
+            rowData['FDR'] < 0.05 else rowData[source_label] for index, rowData in
+            enrichment_dataframe_subset.iterrows()]
# Add rank
-        enrichment_dataframe_subset['Rank'] = [str(x+1) for x in range(len(enrichment_dataframe_subset.index))]
+        enrichment_dataframe_subset['Rank'] = [str(x + 1) for x in
+                                               range(len(enrichment_dataframe_subset.index))]
# Add overlapping genes with tooltip
- enrichment_dataframe_subset['nr_overlapping_genes'] = [len(x) for x in enrichment_dataframe_subset['overlapping_genes']]
- enrichment_dataframe_subset['overlapping_genes'] = [', '.join(x) for x in enrichment_dataframe_subset['overlapping_genes']]
- enrichment_dataframe_subset[target_label.title()] = ['{nr_overlapping_genes} {geneset} '.format(**rowData)+target_label+'s' for index, rowData in enrichment_dataframe_subset.iterrows()]
+ enrichment_dataframe_subset['nr_overlapping_genes'] = [len(x) for x in
+ enrichment_dataframe_subset['overlapping_genes']]
+ enrichment_dataframe_subset['overlapping_genes'] = [', '.join(x) for x in
+ enrichment_dataframe_subset['overlapping_genes']]
+ enrichment_dataframe_subset[target_label.title()] = [
+ '{nr_overlapping_genes} {geneset} '.format(**rowData) + target_label + 's' for index, rowData in
+ enrichment_dataframe_subset.iterrows()]
# enrichment_dataframe[target_label.title()] = ['{nr_overlapping_genes} {geneset} '.format(**rowData)+target_label+'s