diff --git a/appyters/Kinase_TF_Modules/Kinase-TF-Modules.ipynb b/appyters/Kinase_TF_Modules/Kinase-TF-Modules.ipynb new file mode 100644 index 00000000..4be79470 --- /dev/null +++ b/appyters/Kinase_TF_Modules/Kinase-TF-Modules.ipynb @@ -0,0 +1,939 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%%appyter init\n", + "from appyter import magic\n", + "magic.init(lambda _=globals: _())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "# Kinase - Transcription Factor Module Pairwise Analysis\n", + "\n", + "This Appyter predicts shared modules of kinases, inferred from phosphoproteomic data through KEA3, and transcription factors, inferred from transcriptomic data from ChEA3, that are differentially active across two groups of samples. It first ranks the kinases and transcription factors and then scores pairs of kinases and transcription factors on an individual sample level per each group. These scores are binarized based on an additional threshold which only takes into account the top percentage of pair scores and with which clustermaps are created. Clusters or modules of kinases and transcription factors in these groups are then compared and those that appear across multiple of the heatmaps, representing different directions of activation, are retained and their relationship is displayed as a bipartite network." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter hide_code\n", + "\n", + "{% do SectionField(\n", + " name='primary',\n", + " title='Kinase - Transcription Factor Pairwise Module Analysis',\n", + " img='k-tf-logo.png'\n", + ") %}\n", + "\n", + "\n", + "{% set chea_up_file = FileField(\n", + " name='chea_up',\n", + " label='ChEA3 Enrichment Results from Upregulated RNA-seq expression vectors',\n", + " description='Transcription Factors as the rows, samples as the columns',\n", + " default='ChEA3_pancan_top500_baseline_normalized.tsv',\n", + " required=True,\n", + " examples={\n", + " 'ChEA3_pancan_top500_baseline_normalized.tsv': 'https://appyters.maayanlab.cloud/storage/Kinase_TF_Modules/ChEA3_10_cancer_type_meanRank_df_top500_baseline_normalized.tsv',\n", + " },\n", + " section='primary',\n", + ") %}\n", + "\n", + "{% set chea_down_file = FileField(\n", + " name='chea_down',\n", + " label='ChEA3 Enrichment Results from Downregulated RNA-seq expression vectors',\n", + " description='Transcription Factors as the rows, samples as the columns',\n", + " default='ChEA3_pancan_bot500_baseline_normalized.tsv',\n", + " required=True,\n", + " examples={\n", + " 'ChEA3_pancan_bot500_baseline_normalized.tsv': 'https://appyters.maayanlab.cloud/storage/Kinase_TF_Modules/ChEA3_10_cancer_type_meanRank_df_bot500_baseline_normalized.tsv',\n", + " },\n", + " section='primary',\n", + ") %}\n", + "\n", + "\n", + "{% set kea_up_file = FileField(\n", + " name='kea_up',\n", + " label='KEA3 Enrichment Results from upregulated phosphorylated proteins',\n", + " description='Kinases as the rows, samples as the columns',\n", + " default='KEA3_pancan_top500_baseline_normalized.tsv',\n", + " required=True,\n", + " examples={\n", + " 'KEA3_pancan_top500_baseline_normalized.tsv': 'https://appyters.maayanlab.cloud/storage/Kinase_TF_Modules/KEA3_V2_10_cancer_type_meanRank_df_top500_baseline_normalized.tsv',\n", + " },\n", + " section='primary',\n", + ") %}\n", + "\n", + "\n", + "{% set kea_down_file = FileField(\n", + " name='kea_down',\n", + " label='KEA3 Enrichment Results from downregulated phosphorylated proteins',\n", + " description='Kinases as the rows, samples as the columns',\n", + " default='KEA3_pancan_bot500_baseline_normalized.tsv',\n", + " required=True,\n", + " examples={\n", + " 'KEA3_pancan_bot500_baseline_normalized.tsv': 'https://appyters.maayanlab.cloud/storage/Kinase_TF_Modules/KEA3_V2_10_cancer_type_meanRank_df_bot500_baseline_normalized.tsv',\n", + " },\n", + " section='primary',\n", + ") %}\n", + "\n", + "{% set group_annotations_file = FileField(\n", + " name='group_annotations_file',\n", + " label='Pairwie group assigment for each sample. Column with annotations should be named group',\n", + " description='Each row should be a sample and an annotation.',\n", + " default='pancan_immune_subtypes.tsv',\n", + " required=True,\n", + " examples={\n", + " 'pancan_immune_subtypes.tsv': 'https://appyters.maayanlab.cloud/storage/Kinase_TF_Modules/pancan_immune_subtypes.tsv',\n", + " },\n", + " section='primary',\n", + ") %}\n", + "\n", + "{% set ascending = BoolField(\n", + " name='ascending',\n", + " label='ChEA3 and KEA3 Enrichment results are in ascending order',\n", + " default=False,\n", + " section='primary',\n", + ") %}\n", + "\n", + "{% set rank_threshold =\n", + " IntField(\n", + " name='rank_threshold', \n", + " label='Rank Threshold',\n", + " description=\"Threshold for rank of kinase/TF for pair to be included in the analysis\",\n", + " default=30,\n", + " section='primary'\n", + " ) %}\n", + "\n", + "{% set vis_threshold =\n", + " FloatField(\n", + " name='vis_threshold', \n", + " label='Rank Threshold',\n", + " description=\"Threshold for the top percentage of kinase-transcription factor pairs to be visualized and clustered.\",\n", + " default=0.005,\n", + " step=0.001,\n", + " section='primary'\n", + " ) %}\n", + "\n", + "{% set cluster_threshold =\n", + " IntField(\n", + " name='cluster_threshold', \n", + " label='Cluster size threshold',\n", + " description=\"Threshold for size of a cluster of kinases and transcription factors for it to be considered.\",\n", + " default=10,\n", + " section='primary'\n", + " ) %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib as mpl\n", + "import seaborn as sns\n", + "import networkx as nx\n", + "import pyvis\n", + "from IPython.display import display, FileLink, HTML, Markdown, IFrame\n", + "\n", + "\n", + "def read_table(filename):\n", + " if filename.endswith('.tsv') or filename.endswith('.tsv.gz'):\n", + " return pd.read_csv(filename, sep='\\t', index_col=0)\n", + " elif filename.endswith('.csv') or filename.endswith('.csv.gz'):\n", + " return pd.read_csv(filename, sep=',', index_col=0)\n", + " elif filename.endswith('.gct') or filename.endswith('.gct.gz'):\n", + " return pd.read_csv(filename, sep='\\t', index_col=0, skiprows=2)\n", + " else:\n", + " return pd.read_table(filename, sep=None, engine='python', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "## Load Files\n", + "Load the [ChEA3](https://maayanlab.cloud/chea3/) and [KEA3](https://maayanlab.cloud/kea3/) files and convert to transcription factor and kinase enrichment ranks, respectively. \n", + "Additionally load annotation file which defines the pairwise groups which will be compared in the analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_eval\n", + "\n", + "chea_up = read_table({{ chea_up_file }})\n", + "chea_down = read_table({{ chea_down_file }})\n", + "kea_up = read_table({{ kea_up_file }})\n", + "kea_down = read_table({{ kea_down_file }})\n", + "\n", + "{% if ascending.raw_value %}\n", + "top_tf = chea_up.rank(ascending=False).T\n", + "bottom_tf = chea_down.rank(ascending=False).T\n", + "\n", + "top_kinase = kea_up.rank(ascending=False).T\n", + "bottom_kinase = kea_down.rank(ascending=False).T\n", + "{% else %}\n", + "top_tf = chea_up.rank(ascending=True).T\n", + "bottom_tf = chea_down.rank(ascending=True).T\n", + "\n", + "top_kinase = kea_up.rank(ascending=True).T\n", + "bottom_kinase = kea_down.rank(ascending=True).T\n", + "{% endif %}\n", + "\n", + "group_annotations = read_table({{ group_annotations_file }})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "Parse group annotation and identify the available samples from the transcription factor and kinase ranking files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "groups = list(group_annotations[\"group\"].dropna().unique())\n", + "if len(groups) < 2:\n", + " raise RuntimeError(\"Two groups were not recognized in the annotations file\")\n", + "elif len(groups) > 2:\n", + " print(\"More than two groups were identifed in the provided annotations file. Proceeding with the first two appearing groups.\")\n", + "\n", + "label1, label2 = groups[0], groups[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "available_samples = set(top_kinase.index.values).intersection(set(top_tf.index.values), set(bottom_tf.index.values), set(bottom_kinase.index.values))\n", + "\n", + "group1 = list(group_annotations[group_annotations['group'] == label1].index.values)\n", + "group2 = list(group_annotations[group_annotations['group'] == label2].index.values)\n", + "\n", + "group1 = list(set(group1).intersection(available_samples))\n", + "group2 = list(set(group2).intersection(available_samples))\n", + "\n", + "if len(group1) == 0 or len(group2) == 0:\n", + " raise RuntimeError(\"No samples identified for the given groups. Please ensure samples are in the index of you annotations file.\")\n", + "\n", + "print(f'{label1}:', len(group1))\n", + "print(f'{label2}:', len(group2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "## Score kinase - transcription factor pairs\n", + "On a patient and group level, score kinase - transcription factor pairs according to the input rank threshold. If a kinase or transcription factor do not meet the rank threshold, then the pair recieves a score of 0, otherwise the score is the sum of the noramlzied ranks of the given kinase and transcription factor. If this process is running for an extended period of time, consider raising the rank threshold." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def melt_tf_kinase(kinase_df, tf_df, rank_threshold, fname):\n", + " # output (values are z-scored scores)\n", + " # |kinase_1|kinase_2\n", + " # tf_1|score|score\n", + " # tf_2|score|score\n", + " \n", + " df_kinase_melt_norm = kinase_df.melt(var_name='kinase', value_name='rank', ignore_index=False)\n", + " # |patient|kinase|rank|\n", + "\n", + " df_tf_melt_norm = tf_df.melt(var_name='tf', value_name='rank', ignore_index=False)\n", + " # |patient|tf|rank|\n", + "\n", + " df_goal = pd.merge(left=df_kinase_melt_norm, left_index=True, right=df_tf_melt_norm, right_index=True, suffixes=('_kinase', '_tf'))\n", + " # |patient|tf|rank_tf|kinase|rank_kinase|\n", + "\n", + " # filter\n", + " df_goal = df_goal[((df_goal.rank_tf < rank_threshold) & (df_goal.rank_kinase < rank_threshold))]\n", + "\n", + " # compute scores\n", + " df_goal['score'] = ((rank_threshold + 1) - df_goal['rank_tf'])/rank_threshold + ((rank_threshold + 1) -df_goal['rank_kinase'])/rank_threshold\n", + "\n", + " # aggregate score across all patients\n", + " df_agg = df_goal.groupby(['tf', 'kinase'])['score'].sum().reset_index()\n", + "\n", + " # zscore normalize scores across all pairs\n", + " df_agg['score_norm'] = np.log10(df_agg['score'])\n", + "\n", + " # turn into tf by kinase matrix of normalized scores\n", + " df_score_norm = df_agg.pivot(index='kinase', columns='tf', values='score_norm')\n", + " df_score_norm.to_csv(f\"{fname}.tsv\", sep='\\t')\n", + " return df_score_norm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "rank_threshold = {{ rank_threshold }}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label1_clean = \"\".join(x for x in label1 if x.isalnum())\n", + "top_kinase_top_tf_gr1 = melt_tf_kinase(top_kinase.loc[group1], top_tf.loc[group1], rank_threshold, f'top_kinase_top_tf_{label1_clean}')\n", + "bottom_kinase_bottom_tf_gr1 = melt_tf_kinase(bottom_kinase.loc[group1], bottom_tf.loc[group1], rank_threshold, f'bottom_kinase_bottom_tf_{label1_clean}')\n", + "top_kinase_bottom_tf_gr1 = melt_tf_kinase(top_kinase.loc[group1], bottom_tf.loc[group1], rank_threshold, f'top_kinase_bottom_tf_{label1_clean}')\n", + "bottom_kinase_top_tf_gr1 = melt_tf_kinase(bottom_kinase.loc[group1], top_tf.loc[group1], rank_threshold, f'bottom_kinase_top_tf_{label1_clean}')\n", + "\n", + "display(FileLink(f\"top_kinase_top_tf_{label1_clean}.tsv\", result_html_prefix=str(f'up kinase - up transcription factor {label1} scores matrix: ')))\n", + "display(FileLink(f\"bottom_kinase_bottom_tf_{label1_clean}.tsv\", result_html_prefix=str(f'down kinase - down transcription factor {label1} scores matrix: ')))\n", + "display(FileLink(f\"top_kinase_bottom_tf_{label1_clean}.tsv\", result_html_prefix=str(f'up kinase - down transcription factor {label1} scores matrix: ')))\n", + "display(FileLink(f\"bottom_kinase_top_tf_{label1_clean}.tsv\", result_html_prefix=str(f'down kinase - up transcription factor {label1} scores matrix: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label2_clean = \"\".join(x for x in label2 if x.isalnum())\n", + "top_kinase_top_tf_gr2 = melt_tf_kinase(top_kinase.loc[group2], top_tf.loc[group2], rank_threshold, f'top_kinase_top_tf_{label2_clean}')\n", + "bottom_kinase_bottom_tf_gr2 = melt_tf_kinase(bottom_kinase.loc[group2], bottom_tf.loc[group2], rank_threshold, f'bottom_kinase_bottom_tf_{label2_clean}')\n", + "top_kinase_bottom_tf_gr2 = melt_tf_kinase(top_kinase.loc[group2], bottom_tf.loc[group2], rank_threshold, f'top_kinase_bottom_tf_{label2_clean}')\n", + "bottom_kinase_top_tf_gr2 = melt_tf_kinase(bottom_kinase.loc[group2], top_tf.loc[group2], rank_threshold, f'bottom_kinase_top_tf_{label2_clean}')\n", + "\n", + "display(FileLink(f\"top_kinase_top_tf_{label2_clean}.tsv\", result_html_prefix=str(f'up kinase - up transcription factor {label2} scores matrix: ')))\n", + "display(FileLink(f\"bottom_kinase_bottom_tf_{label2_clean}.tsv\", result_html_prefix=str(f'down kinase - down transcription factor {label2} scores matrix: ')))\n", + "display(FileLink(f\"top_kinase_bottom_tf_{label2_clean}.tsv\", result_html_prefix=str(f'up kinase - down transcription factor {label2} scores matrix: ')))\n", + "display(FileLink(f\"bottom_kinase_top_tf_{label2_clean}.tsv\", result_html_prefix=str(f'down kinase - up transcription factor {label2} scores matrix: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "## Visualize kinase and transcription factor pairs and identify modules\n", + "Create a clustermap of the top percentage of kinase-transcription factor pairs appearing in one or both of groups based on the visualization threshold. Identify modules of kinases and transcription factors based on the threshold for cluster/module size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_heatmap(gr2_df, gr1_df, threshold, fname): \n", + " pairings_and_scores_gr2 = []\n", + " for tf in gr2_df.columns.values:\n", + " for (kinase, score) in gr2_df[tf].items():\n", + " if not (pd.isna(score)):\n", + " pairings_and_scores_gr2.append((score, (kinase, tf), 'gr2'))\n", + "\n", + " pairings_and_scores_gr1 = []\n", + " for tf in gr1_df.columns.values:\n", + " for (kinase, score) in gr1_df[tf].items():\n", + " if not (pd.isna(score)):\n", + " pairings_and_scores_gr1.append((score, (kinase, tf), 'gr1'))\n", + " \n", + " top_1_threshold_gr2 = int(len(pairings_and_scores_gr2)*threshold)\n", + " top_1_threshold_gr1 = int(len(pairings_and_scores_gr1)*threshold)\n", + "\n", + " max_val_count = 0\n", + " max_vals_gr2 = []\n", + " while max_val_count < top_1_threshold_gr2:\n", + " val_to_add = max(pairings_and_scores_gr2)\n", + " max_vals_gr2.append(val_to_add)\n", + " pairings_and_scores_gr2.remove(val_to_add)\n", + " max_val_count += 1\n", + "\n", + " max_val_count = 0\n", + " max_vals_gr1 = []\n", + " while max_val_count < top_1_threshold_gr1:\n", + " val_to_add = max(pairings_and_scores_gr1)\n", + " max_vals_gr1.append(val_to_add)\n", + " pairings_and_scores_gr1.remove(val_to_add)\n", + " max_val_count += 1 \n", + "\n", + " max_vals = max_vals_gr1 + max_vals_gr2\n", + "\n", + " top_1_df = pd.DataFrame(max_vals)\n", + " top_1_df.columns = ['score', 'pairing', 'type']\n", + "\n", + " binary_vals = dict()\n", + " for (index, pair) in top_1_df['pairing'].items():\n", + " if pair not in binary_vals:\n", + " binary_vals[pair] = []\n", + " binary_vals[pair].append(top_1_df['type'][index])\n", + "\n", + " both = 0\n", + " gr1 = 0\n", + " gr2 = 0\n", + " new_scores = []\n", + " for pair in binary_vals:\n", + " if len(binary_vals[pair]) == 2:\n", + " new_scores.append((1, pair))\n", + " both += 1\n", + " elif binary_vals[pair] == ['gr1']:\n", + " new_scores.append((2, pair))\n", + " gr1 += 1\n", + " else:\n", + " new_scores.append((-2, pair))\n", + " gr2 += 1\n", + "\n", + " new_df = pd.DataFrame(new_scores, columns = ['score', 'pair'])\n", + "\n", + " #index = kinase\n", + " #columns = tfs\n", + "\n", + " kinases = []\n", + " tfs = []\n", + " for (index, pair) in new_df['pair'].items():\n", + " kinases.append(pair[0])\n", + " tfs.append(pair[1])\n", + " new_df['Kinases'] = kinases\n", + " new_df['Transcription Factors'] = tfs\n", + "\n", + " new_df_pivot = new_df.pivot_table(index = 'Kinases', columns = 'Transcription Factors', values = 'score')\n", + " new_df_pivot\n", + "\n", + " cmap = mpl.colors.ListedColormap(['blue', 'gray','red'])\n", + " kws = dict(cbar_kws=dict(ticks=[-2.666667, 0, 2.666667], orientation='horizontal'))\n", + " sns.set(font_scale = 1.0)\n", + "\n", + " g = sns.clustermap(new_df_pivot.fillna(0.), \n", + " mask=np.isnan(new_df_pivot),\n", + " method = 'complete',\n", + " cmap = cmap,\n", + " figsize=(18, 18),\n", + " xticklabels=True, \n", + " yticklabels=True,\n", + " vmin = -4,\n", + " vmax = 4,\n", + " **kws)\n", + " \n", + " g.savefig(f\"{fname}.png\")\n", + " g.savefig(f\"{fname}.jpg\")\n", + " g.savefig(f\"{fname}.svg\")\n", + " \n", + " mat = g.data2d\n", + "\n", + " x0, _y0, _w, _h = g.cbar_pos\n", + " g.ax_cbar.set_position([x0, 0.9, g.ax_row_dendrogram.get_position().width, 0.02])\n", + " g.ax_cbar.tick_params(axis='x', length=10)\n", + " g.ax_cbar.set_xticklabels([label2.replace('Mesenchymal', 'M'), 'Both', label1.replace('Mesenchymal', 'M')], fontsize = 20) ## Legend [gr2, Both, gr1]\n", + "\n", + " ax = g.ax_heatmap\n", + " ax.set_xlabel(\"Transcription Factors\", fontsize = 30, loc = 'center')\n", + " ax.set_ylabel(\"Kinases\", fontsize = 30, loc = 'center')\n", + "\n", + " for a in g.ax_row_dendrogram.collections:\n", + " a.set_linewidth(3)\n", + "\n", + " for a in g.ax_col_dendrogram.collections:\n", + " a.set_linewidth(3)\n", + "\n", + " kinase = list(new_df_pivot.index.values[g.dendrogram_row.reordered_ind])\n", + " tfs = list(new_df_pivot.columns.values[g.dendrogram_col.reordered_ind])\n", + " return mat, kinases, tfs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def is_valid_move(matrix, visited, row, col, value):\n", + " rows, cols = len(matrix), len(matrix[0])\n", + " return 0 <= row < rows and 0 <= col < cols and matrix[row][col] == value and not visited[row][col]\n", + "\n", + "def dfs(matrix, visited, row, col, island_indices):\n", + " visited[row][col] = True\n", + " island_indices.append((row, col))\n", + "\n", + " directions = [(0, 1), (1, 0), (0, -1), (-1, 0)] # right, down, left, up\n", + "\n", + " for dr, dc in directions:\n", + " new_row, new_col = row + dr, col + dc\n", + " if is_valid_move(matrix, visited, new_row, new_col, matrix[row][col]):\n", + " dfs(matrix, visited, new_row, new_col, island_indices)\n", + "\n", + "def find_islands(matrix, value, min_island_size):\n", + " rows, cols = len(matrix), len(matrix[0])\n", + " visited = [[False] * cols for _ in range(rows)]\n", + " islands = []\n", + " for row in range(rows):\n", + " for col in range(cols):\n", + " if matrix[row][col] == value and not visited[row][col]:\n", + " island_indices = []\n", + " dfs(matrix, visited, row, col, island_indices)\n", + "\n", + " if len(island_indices) >= min_island_size:\n", + " islands.append(island_indices)\n", + " return islands\n", + "\n", + "\n", + "def find_all_clusters(matrix, min_island_size, kinases, tfs):\n", + " label1_clusters = find_islands(matrix, 2, min_island_size)\n", + " label1_modules = {}\n", + " for i, clus in enumerate(label1_clusters):\n", + " clus_module = {'kinases': [], 'tfs': []}\n", + " for idx in clus:\n", + " clus_module['kinases'].append(kinases[idx[0]])\n", + " clus_module['tfs'].append(tfs[idx[1]])\n", + " clus_module['kinases'] = list(set(clus_module['kinases']))\n", + " clus_module['tfs'] = list(set(clus_module['tfs']))\n", + " label1_modules[f\"{label1}-{i + 1}\"] = clus_module\n", + "\n", + " label2_clusters = find_islands(matrix, -2, min_island_size)\n", + " label2_modules = {}\n", + " for i, clus in enumerate(label2_clusters):\n", + " clus_module = {'kinases': [], 'tfs': []}\n", + " for idx in clus:\n", + " clus_module['kinases'].append(kinases[idx[0]])\n", + " clus_module['tfs'].append(tfs[idx[1]])\n", + " clus_module['kinases'] = list(set(clus_module['kinases']))\n", + " clus_module['tfs'] = list(set(clus_module['tfs']))\n", + " label2_modules[f\"{label2}-{i + 1}\"] = clus_module\n", + "\n", + " shared_clusters = find_islands(matrix, 1, min_island_size)\n", + " shared_modules = {}\n", + " for i, clus in enumerate(shared_clusters):\n", + " clus_module = {'kinases': [], 'tfs': []}\n", + " for idx in clus:\n", + " clus_module['kinases'].append(kinases[idx[0]])\n", + " clus_module['tfs'].append(tfs[idx[1]])\n", + " clus_module['kinases'] = list(set(clus_module['kinases']))\n", + " clus_module['tfs'] = list(set(clus_module['tfs']))\n", + " shared_modules[f\"{label2}-{i + 1}\"] = clus_module\n", + "\n", + " return {label1: label1_modules, label2: label2_modules, 'both': shared_modules}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter eval_code\n", + "\n", + "vis_threshold = {{ vis_threshold }}\n", + "cluster_threshold = {{ cluster_threshold }}\n", + "identified_clusters = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "# Top ranked kinases - Top ranked transcription factors\n", + "Comparing top ranked kinases and transcription factors based on upregulated phosphosites and downregulated genes, respectively. Since their regulation mirrors each other, this is inferred as activation in the next step of the analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mat, kinases, tfs = create_heatmap(top_kinase_top_tf_gr2, top_kinase_top_tf_gr1, vis_threshold, \"top-kinase-top-tf\")\n", + "toptopres = find_all_clusters(mat.fillna(0).values, 10, kinases, tfs)\n", + "identified_clusters['top-top'] = toptopres\n", + "display(FileLink(f\"top-kinase-top-tf.png\", result_html_prefix=str(f'Download Figure PNG: ')))\n", + "display(FileLink(f\"top-kinase-top-tf.jpg\", result_html_prefix=str(f'Download Figure JPEG: ')))\n", + "display(FileLink(f\"top-kinase-top-tf.svg\", result_html_prefix=str(f'Download Figure SVG: ')))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "# Bottom ranked kinases - Bottom ranked transcription factors\n", + "Comparing top ranked kinases and transcription factors based on upregulated phosphosites and downregulated genes, respectively. Since their regulation mirrors each other, this is inferred as activation in the next step of the analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mat, kinases, tfs = create_heatmap(bottom_kinase_bottom_tf_gr2, bottom_kinase_bottom_tf_gr1, vis_threshold, \"bottom-kinase-bottom-tf\")\n", + "botbotres = find_all_clusters(mat.fillna(0).values, 10, kinases, tfs)\n", + "identified_clusters['bot-bot'] = botbotres\n", + "display(FileLink(f\"bottom-kinase-bottom-tf.png\", result_html_prefix=str(f'Download Figure PNG: ')))\n", + "display(FileLink(f\"bottom-kinase-bottom-tf.jpg\", result_html_prefix=str(f'Download Figure JPEG: ')))\n", + "display(FileLink(f\"bottom-kinase-bottom-tf.svg\", result_html_prefix=str(f'Download Figure SVG: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "# Bottom ranked kinases - Top ranked transcription factors\n", + "Comparing top ranked kinases and transcription factors based on upregulated phosphosites and downregulated genes, respectively. Since their regulation opposes each other, this is inferred as inhibition in the next step of the analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mat, kinases, tfs = create_heatmap(bottom_kinase_top_tf_gr2, bottom_kinase_top_tf_gr1, vis_threshold, \"bottom-kinase-top-tf\")\n", + "bottopres = find_all_clusters(mat.fillna(0).values, 10, kinases, tfs)\n", + "identified_clusters['bot-top'] = botbotres\n", + "display(FileLink(f\"bottom-kinase-top-tf.png\", result_html_prefix=str(f'Download Figure PNG: ')))\n", + "display(FileLink(f\"bottom-kinase-top-tf.jpg\", result_html_prefix=str(f'Download Figure JPEG: ')))\n", + "display(FileLink(f\"bottom-kinase-top-tf.svg\", result_html_prefix=str(f'Download Figure SVG: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "# Top ranked kinases - Bottom ranked transcription factors\n", + "Comparing top ranked kinases and transcription factors based on upregulated phosphosites and downregulated genes, respectively. Since their regulation opposes each other, this is inferred as inhibition in the next step of the analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mat, kinases, tfs = create_heatmap(top_kinase_bottom_tf_gr2, top_kinase_bottom_tf_gr1, vis_threshold, \"top-kinase-bottom-tf\")\n", + "topbotres = find_all_clusters(mat.fillna(0).values, 10, kinases, tfs)\n", + "identified_clusters['top-bot'] = botbotres\n", + "display(FileLink(f\"top-kinase-bottom-tf.png\", result_html_prefix=str(f'Download Figure PNG: ')))\n", + "display(FileLink(f\"top-kinase-bottom-tf.jpg\", result_html_prefix=str(f'Download Figure JPEG: ')))\n", + "display(FileLink(f\"top-kinase-bottom-tf.svg\", result_html_prefix=str(f'Download Figure SVG: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "## Identify modules appearing across heatmaps\n", + "Kinase and transcription factor groups are identified from those appearing in modules across the above four visualizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusters = identified_clusters\n", + "clus_dict = {0: 'top-top', 1: 'bot-bot', 2: 'top-bot', 3: 'bot-top'}\n", + "\n", + "def find_overlaps_kinase(kinases_to_check):\n", + " check_set = set(kinases_to_check)\n", + " results = {}\n", + " for i, c in enumerate(clusters):\n", + " for key in clusters[c]:\n", + " for n in clusters[c][key]:\n", + " if len(clusters[c][key][n]) > 0:\n", + " overlap = set(clusters[c][key][n]['kinases']).intersection(check_set)\n", + " if len(overlap) > 3:\n", + " results[clus_dict[i] + key+str(n)] = list(overlap)\n", + " overlap_all = []\n", + " for key in results:\n", + " if len(results[key]) > len(overlap_all):\n", + " overlap_all = results[key]\n", + " overlap_all = set(overlap_all)\n", + " for key in results:\n", + " overlap_all = overlap_all.intersection(set(results[key]))\n", + " valid = len(results.keys()) >= 2\n", + " return frozenset(overlap_all), valid\n", + "\n", + "def find_overlaps_tf(kinases_to_check):\n", + " check_set = set(kinases_to_check)\n", + " results = {}\n", + " for i, c in enumerate(clusters):\n", + " for key in clusters[c]:\n", + " for n in clusters[c][key]:\n", + " if len(clusters[c][key][n]) > 0:\n", + " overlap = set(clusters[c][key][n]['tfs']).intersection(check_set)\n", + " if len(overlap) > 3:\n", + " results[clus_dict[i] + key+str(n)] = list(overlap)\n", + " overlap_all = []\n", + " for key in results:\n", + " if len(results[key]) > len(overlap_all):\n", + " overlap_all = results[key]\n", + " overlap_all = set(overlap_all)\n", + " for key in results:\n", + " overlap_all = overlap_all.intersection(set(results[key]))\n", + " valid = len(results.keys()) >= 2\n", + " return frozenset(overlap_all), valid\n", + "\n", + " \n", + "def sublist(lst1, lst2):\n", + " return len(set(lst1).intersection(set(lst2))) == len(lst1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set_of_clusters_kinases = set()\n", + "\n", + "print('Kinase Clusters:')\n", + "for i, c in enumerate(clusters):\n", + " for key in clusters[c]:\n", + " for n in clusters[c][key]:\n", + " res, valid = find_overlaps_kinase(clusters[c][key][n]['kinases'])\n", + " if valid:\n", + " set_of_clusters_kinases.add(res)\n", + "\n", + "for clus in set_of_clusters_kinases:\n", + " if len(clus) > 2:\n", + " clus_str = ''\n", + " for c in clus:\n", + " clus_str += c + ' '\n", + " print(clus_str)\n", + "\n", + "sorted_kinases = sorted(list(set_of_clusters_kinases), key=lambda x: len(x))\n", + "kinase_list = []\n", + "for kinase_gr in sorted_kinases:\n", + " if len(kinase_gr) > 2:\n", + " kinase_list.append(' '.join(kinase_gr))\n", + "\n", + "set_of_clusters_tfs = set()\n", + "\n", + "print('\\nTFs Clusters:')\n", + "for i, c in enumerate(clusters):\n", + " for key in clusters[c]:\n", + " for n in clusters[c][key]:\n", + " res, valid = find_overlaps_tf(clusters[c][key][n]['tfs'])\n", + " if valid:\n", + " set_of_clusters_tfs.add(res)\n", + "\n", + "for clus in set_of_clusters_tfs:\n", + " if len(clus) > 2:\n", + " clus_str = ''\n", + " for c in clus:\n", + " clus_str += c + ' '\n", + " print(clus_str)\n", + "\n", + "sorted_tfs = sorted(list(set_of_clusters_tfs), key=lambda x: len(x))\n", + "tf_list = []\n", + "for tf_gr in sorted_tfs:\n", + " if len(tf_gr) > 2:\n", + " tf_list.append(' '.join(tf_gr))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "## Create Biparite Figure with idenfied kinase and transcription factor modules and their relationships\n", + "Kinase and transcription factor module relationships are drawn from heatmaps above where upregulated kinases and upregulated transcription factors as well as downregulated kinases and downregulated transcription factors are inferred as activation and upregulated transcription factors and downregulated kinases as well as downregulated transcription factors and upregulated kinases are inferred as inhibition." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "G = nx.Graph()\n", + "\n", + "G.add_nodes_from(kinase_list, bipartite=0)\n", + "G.add_nodes_from(tf_list, bipartite=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in identified_clusters:\n", + " for label in identified_clusters[i]:\n", + " for clus in identified_clusters[i][label]:\n", + " clus_kinases = identified_clusters[i][label][clus]['kinases']\n", + " k_grs = []\n", + " for k_gr in kinase_list: \n", + " if len(set(k_gr.split()).intersection(clus_kinases)) == len(k_gr.split()):\n", + " k_grs.append(k_gr)\n", + " clus_tfs = identified_clusters[i][label][clus]['tfs']\n", + " tf_grs = []\n", + " for tf_gr in tf_list:\n", + " if len(set(tf_gr.split()).intersection(clus_tfs)) == len(tf_gr.split()):\n", + " tf_grs.append(tf_gr)\n", + "\n", + " for k_gr in k_grs:\n", + " for tf_gr in tf_grs:\n", + " if label == label1:\n", + " if i == 'top-top' or i =='bot-bot':\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='red', shape='-|>')\n", + " else:\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='red', shape='|-|')\n", + " elif label == label2:\n", + " if i == 'top-top' or i =='bot-bot':\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='blue', shape='-|>')\n", + " else:\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='blue', shape='|-|')\n", + " elif label == 'both':\n", + " if i == 'top-top' or i =='bot-bot':\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='grey', shape='-|>')\n", + " else:\n", + " G.add_edge(k_gr, tf_gr, attr=f\"{label}-{i}\", color='grey', shape='|-|')\n", + "G.remove_nodes_from(list(nx.isolates(G))) # remove nodes with no connections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "edges = G.edges()\n", + "colors = [G[u][v]['color'] for u,v in edges]\n", + "shapes = [G[u][v]['shape'] for u,v in edges]\n", + "fig = plt.figure(1, figsize=(20, 10), dpi=300)\n", + "pos = nx.drawing.layout.bipartite_layout(G, kinase_list, align='vertical')\n", + "nx.draw_networkx_nodes(G, pos, node_size=300, margins=[.2, .1], alpha=.5, )\n", + "nx.draw_networkx_labels(G, pos, bbox = dict(facecolor=\"white\", edgecolor='black', boxstyle='round,pad=1'))\n", + "for edge in G.edges(data=True):\n", + " y_diff = np.abs(pos[edge[0]][1] - pos[edge[1]][1])\n", + " x_diff = np.abs(pos[edge[0]][0] - pos[edge[1]][0])\n", + " dist = np.sqrt(np.square(y_diff) + np.square(x_diff))\n", + "\n", + " min_target_margin = dist * 64\n", + "\n", + " min_target_margin += np.log10(1/(y_diff +.01)) * len(edge[1]) * .2\n", + "\n", + " if y_diff < .1:\n", + " min_target_margin += len(edge[1]) * 1.5\n", + "\n", + " if y_diff > .5 :\n", + " min_target_margin -= y_diff * 35\n", + " \n", + " nx.draw_networkx_edges(G, pos, edgelist=[(edge[0],edge[1])], width= 1.5, arrows=True, arrowstyle=edge[2]['shape'], edge_color=edge[2]['color'], min_source_margin=-150, min_target_margin=min_target_margin)\n", + "\n", + "plt.tight_layout()\n", + "limits = plt.axis(\"off\")\n", + "json_adjacency = nx.adjacency_data(G)\n", + "with open(f'{label1_clean}vs{label2_clean}_adjacency.json', 'w') as f:\n", + " json.dump(json_adjacency, f)\n", + "nx.write_adjlist(G, f'{label1_clean}vs{label2_clean}_adjaceny.tsv', delimiter='\\t')\n", + "fig.savefig(f'{label1_clean}vs{label2_clean}_network.png')\n", + "fig.savefig(f'{label1_clean}vs{label2_clean}_network.jpg')\n", + "fig.savefig(f'{label1_clean}vs{label2_clean}_network.svg')\n", + "plt.show()\n", + "display(FileLink(f\"{label1_clean}vs{label2_clean}_adjacency.json\", result_html_prefix=str('Download adjacency netowrk data JSON: ')))\n", + "display(FileLink(f\"{label1_clean}vs{label2_clean}_adjaceny.tsv\", result_html_prefix=str('Download adjacency netowrk data TSV: ')))\n", + "display(FileLink(f'{label1_clean}vs{label2_clean}_network.png', result_html_prefix=str('Download Figue PNG: ')))\n", + "display(FileLink(f'{label1_clean}vs{label2_clean}_network.jpg', result_html_prefix=str('Download Figue JPEG: ')))\n", + "display(FileLink(f'{label1_clean}vs{label2_clean}_network.svg', result_html_prefix=str('Download Figue SVG: ')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = pyvis.network.Network(notebook=True, cdn_resources='remote')\n", + "g.from_nx(G)\n", + "g.save_graph(f'{label1_clean}vs{label2_clean}.html')\n", + "display(FileLink(f\"{label1_clean}vs{label2_clean}.html\", result_html_prefix=str('Download network html: ')))\n", + "g.show(f'{label1_clean}vs{label2_clean}.html')" + ] + } + ], + "metadata": { + "file_extension": ".py", + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + }, + "mimetype": "text/x-python", + "name": "python", + "npconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/appyters/Kinase_TF_Modules/README.md b/appyters/Kinase_TF_Modules/README.md new file mode 100644 index 00000000..38dcc3a5 --- /dev/null +++ b/appyters/Kinase_TF_Modules/README.md @@ -0,0 +1,8 @@ +# Kinase - Transcription Factor Module Pairwise Analysis + +This Appyter predicts shared modules of kinases, inferred from phosphoproteomic data through [KEA3](https://maayanlab.cloud/kea3/) [1], and transcription factors, inferred from transcriptomic data from [ChEA3](https://maayanlab.cloud/chea3/) [2], that are differentially active across two groups of samples. It first ranks the kinases and transcription factors and then scores pairs of kinases and transcription factors on an individual sample level per each group. These scores are binarized based on an additional threshold which only takes into account the top percentage of pair scores and with which clustermaps are created. Clusters or modules of kinases and transcription factors in these groups are then compared and those that appear across multiple of the heatmaps, representing different directions of activation, are retained and their relationship is displayed as a bipartite network. + +[1] Kuleshov MV, Xie Z, London ABK, Yang J, Evangelista JE, Lachmann A, Shu I, Torre D, Ma'ayan A. KEA3: improved kinase enrichment analysis via data integration. Nucleic Acids Res. 2021 Jul 2;49(W1):W304-W316. doi: 10.1093/nar/gkab359. + +[2] Keenan AB, Torre D, Lachmann A, Leong AK, Wojciechowicz ML, Utti V, Jagodnik KM, Kropiwnicki E, Wang Z, Ma'ayan A. ChEA3: transcription factor enrichment analysis by orthogonal omics integration. Nucleic Acids Res. 2019 Jul 2;47(W1):W212-W224. doi: 10.1093/nar/gkz446. + diff --git a/appyters/Kinase_TF_Modules/appyter.json b/appyters/Kinase_TF_Modules/appyter.json new file mode 100644 index 00000000..cb20d5f0 --- /dev/null +++ b/appyters/Kinase_TF_Modules/appyter.json @@ -0,0 +1,29 @@ +{ + "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json", + "name": "Kinase_TF_Modules", + "title": "Kinase Transcription Factor Module Analysis", + "version": "0.0.1", + "description": "Utilize paired phosphoproteomic and transcriptomic data to identify differentially activated kinase and transcription modules.", + "image": "thumbnail.png", + "authors": [ + { + "name": "Giacomo Marino", + "email": "giacomo.marino@mssm.edu" + } + ], + "url": "https://github.com/MaayanLab/appyter-catalog", + "tags": [ + "rna-seq", "phospho", "chea3", "kea3", "kinase", "tf", "transcription factor", "modules" + ], + "license": "CC-BY-NC-SA-4.0", + "public": false, + "appyter": { + "file": "Kinase-TF-Modules.ipynb", + "profile": "biojupies", + "extras": [ + "toc", + "hide-code", + "toggle-code" + ] + } +} diff --git a/appyters/Kinase_TF_Modules/requirements.txt b/appyters/Kinase_TF_Modules/requirements.txt new file mode 100644 index 00000000..2a533b6b --- /dev/null +++ b/appyters/Kinase_TF_Modules/requirements.txt @@ -0,0 +1,9 @@ +appyter @ git+https://github.com/Maayanlab/appyter +fastcluster +matplotlib +networkx +numpy +pandas +pyvis +scipy +seaborn \ No newline at end of file diff --git a/appyters/Kinase_TF_Modules/static/k-tf-logo.png b/appyters/Kinase_TF_Modules/static/k-tf-logo.png new file mode 100644 index 00000000..8db791a4 Binary files /dev/null and b/appyters/Kinase_TF_Modules/static/k-tf-logo.png differ diff --git a/appyters/Kinase_TF_Modules/static/screenshot.png b/appyters/Kinase_TF_Modules/static/screenshot.png new file mode 100644 index 00000000..729fcb44 Binary files /dev/null and b/appyters/Kinase_TF_Modules/static/screenshot.png differ diff --git a/appyters/Kinase_TF_Modules/static/thumbnail.png b/appyters/Kinase_TF_Modules/static/thumbnail.png new file mode 100644 index 00000000..f565357a Binary files /dev/null and b/appyters/Kinase_TF_Modules/static/thumbnail.png differ