Skip to content

Commit

Permalink
add notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
sky1ove committed Sep 27, 2024
1 parent 770cb9b commit 0ccfa4a
Show file tree
Hide file tree
Showing 28 changed files with 59,734 additions and 0 deletions.
857 changes: 857 additions & 0 deletions nbs/04a_Plot_heatmap_logo_CDDM.ipynb

Large diffs are not rendered by default.

452 changes: 452 additions & 0 deletions nbs/04b_Plot_heatmap_logo_PSPA.ipynb

Large diffs are not rendered by default.

1,569 changes: 1,569 additions & 0 deletions nbs/05_Hierarchical_clustering.ipynb

Large diffs are not rendered by default.

543 changes: 543 additions & 0 deletions nbs/06_AA_feature.ipynb

Large diffs are not rendered by default.

1,961 changes: 1,961 additions & 0 deletions nbs/07a_Combine_psp_ochoa.ipynb

Large diffs are not rendered by default.

487 changes: 487 additions & 0 deletions nbs/07b_Plot_distribution.ipynb

Large diffs are not rendered by default.

391 changes: 391 additions & 0 deletions nbs/07c_Enrichment_ATM.ipynb

Large diffs are not rendered by default.

2,829 changes: 2,829 additions & 0 deletions nbs/08_AUCDF.ipynb

Large diffs are not rendered by default.

6,643 changes: 6,643 additions & 0 deletions nbs/09a_ML_embedding.ipynb

Large diffs are not rendered by default.

14,472 changes: 14,472 additions & 0 deletions nbs/09b_ML_training.ipynb

Large diffs are not rendered by default.

4,888 changes: 4,888 additions & 0 deletions nbs/09c_ML_analysis.ipynb

Large diffs are not rendered by default.

5,304 changes: 5,304 additions & 0 deletions nbs/09d_ML_predict.ipynb

Large diffs are not rendered by default.

383 changes: 383 additions & 0 deletions nbs/others_02_CPTAC_IDmapping.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,383 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CPTAC ID mapping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"BCM CPTAC phosphoproteomics data identify each site by Ensembl ID + site, but many web tools use IDs of the form gene name + site, so we have to map each Ensembl ID to the gene name that those web tools use."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from katlas.core import *\n",
"\n",
"from tqdm import tqdm\n",
"\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class CPTAC_ID:\n",
"    \"\"\"Map BCM CPTAC phosphosite identifiers (Ensembl ID + site) to gene-name based IDs.\"\"\"\n",
"\n",
"    @staticmethod\n",
"    def list_cancer():\n",
"        \"\"\"Return the cancer cohort codes used to build the download URLs in `get_id`.\"\"\"\n",
"        return ['HNSCC','GBM','COAD','CCRCC','LSCC','BRCA','UCEC','LUAD','PDAC','OV']\n",
"\n",
"    @staticmethod\n",
"    def get_id(cancer, is_Tumor=True):\n",
"        \"\"\"\n",
"        Download one cohort's phosphosite table and annotate every site with its gene name.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        cancer : str\n",
"            Cohort code such as 'GBM'; lower-cased for the mapping file URL and\n",
"            upper-cased for the phosphoproteomics file URL.\n",
"        is_Tumor : bool, default True\n",
"            True loads the `..._Tumor.txt` table, False the `..._Normal.txt` table.\n",
"\n",
"        Returns\n",
"        -------\n",
"        pandas.DataFrame or None\n",
"            Columns: gene, site, site_seq, protein, gene_name, gene_site, ENSP_site.\n",
"            None when the phosphoproteomics table cannot be read; the error is printed\n",
"            instead of raised so a loop over cohorts keeps going.\n",
"        \"\"\"\n",
"        # The tumor and normal files differ only in this suffix\n",
"        sample_type = 'Tumor' if is_Tumor else 'Normal'\n",
"\n",
"        ID_URL = f\"https://zenodo.org/records/8196130/files/bcm-{cancer.lower()}-mapping-gencode.v34.basic.annotation-mapping.txt.gz\"\n",
"        DATA_URL = f\"https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/{cancer.upper()}/{cancer.upper()}_phospho_site_abundance_log2_reference_intensity_normalized_{sample_type}.txt\"\n",
"\n",
"        # Load ID data: one row per unique (protein, gene, gene_name) combination\n",
"        ref = (\n",
"            pd.read_csv(ID_URL, compression='gzip', sep='\\t')[['protein','gene','gene_name']]\n",
"            .drop_duplicates()\n",
"            .reset_index(drop=True)\n",
"        )\n",
"\n",
"        # Load CPTAC phosphoproteomics data\n",
"        try:\n",
"            raw = pd.read_csv(DATA_URL, sep='\\t')\n",
"        except Exception as e:\n",
"            # Some cohort / sample-type files cannot be downloaded; report and skip them\n",
"            print(f'{cancer} ({sample_type}): {e}')\n",
"            return None\n",
"\n",
"        # `idx` is a '|'-delimited identifier; split it once and reuse the pieces\n",
"        idx_parts = raw.idx.str.split('|')\n",
"        info = pd.DataFrame({'gene': idx_parts.str[0],\n",
"                             'site': idx_parts.str[2],\n",
"                             'site_seq': idx_parts.str[3]})\n",
"        # print(raw.columns[1:]) # patient ID\n",
"\n",
"        print(f'the {cancer} dataset length is: {info.shape[0]}')\n",
"\n",
"        # Merge ensembl ID with gene name; 'gene' is the only column the two frames share.\n",
"        # NOTE(review): `ref` can hold several proteins per gene, so this join repeats a site\n",
"        # once per protein and the row count grows after mapping. If field 1 of `idx` is the\n",
"        # protein ID, joining on it as well would avoid that -- confirm before changing.\n",
"        info = info.merge(ref, how='left', on='gene')\n",
"        print(f'after id mapping, the length is {info.shape[0]}')\n",
"\n",
"        print(f'{info.gene_name.isna().sum()} sites does not have a mapped gene name')\n",
"\n",
"        # LinkedOmics\n",
"        info['gene_site'] = info['gene_name'] + '_' + info['site']\n",
"\n",
"        # LinkedOmicsKB: protein ID with everything after the first '.' removed\n",
"        info['ENSP_site'] = info['protein'].str.split('.').str[0] + '_' + info['site']\n",
"\n",
"        return info"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cancer_list = CPTAC_ID.list_cancer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC', 'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cancer_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"the HNSCC dataset length is: 55270\n",
"after id mapping, the length is 214151\n",
"0 sites does not have a mapped gene name\n",
"the GBM dataset length is: 63410\n",
"after id mapping, the length is 261115\n",
"0 sites does not have a mapped gene name\n",
"the COAD dataset length is: 35487\n",
"after id mapping, the length is 130147\n",
"0 sites does not have a mapped gene name\n",
"the CCRCC dataset length is: 54238\n",
"after id mapping, the length is 213737\n",
"0 sites does not have a mapped gene name\n",
"the LSCC dataset length is: 65481\n",
"after id mapping, the length is 249575\n",
"0 sites does not have a mapped gene name\n",
"the BRCA dataset length is: 49871\n",
"after id mapping, the length is 175637\n",
"0 sites does not have a mapped gene name\n",
"the UCEC dataset length is: 64977\n",
"after id mapping, the length is 250006\n",
"0 sites does not have a mapped gene name\n",
"the LUAD dataset length is: 61705\n",
"after id mapping, the length is 236430\n",
"0 sites does not have a mapped gene name\n",
"the PDAC dataset length is: 50220\n",
"after id mapping, the length is 195218\n",
"0 sites does not have a mapped gene name\n",
"the OV dataset length is: 37334\n",
"after id mapping, the length is 129441\n",
"0 sites does not have a mapped gene name\n",
"the HNSCC dataset length is: 55267\n",
"after id mapping, the length is 214140\n",
"0 sites does not have a mapped gene name\n",
"GBMHTTP Error 403: Forbidden\n",
"the COAD dataset length is: 35485\n",
"after id mapping, the length is 130141\n",
"0 sites does not have a mapped gene name\n",
"the CCRCC dataset length is: 53152\n",
"after id mapping, the length is 209188\n",
"0 sites does not have a mapped gene name\n",
"the LSCC dataset length is: 65468\n",
"after id mapping, the length is 249541\n",
"0 sites does not have a mapped gene name\n",
"BRCAHTTP Error 403: Forbidden\n",
"the UCEC dataset length is: 55170\n",
"after id mapping, the length is 211970\n",
"0 sites does not have a mapped gene name\n",
"the LUAD dataset length is: 61702\n",
"after id mapping, the length is 236425\n",
"0 sites does not have a mapped gene name\n",
"the PDAC dataset length is: 50190\n",
"after id mapping, the length is 195092\n",
"0 sites does not have a mapped gene name\n",
"the OV dataset length is: 36582\n",
"after id mapping, the length is 127006\n",
"0 sites does not have a mapped gene name\n"
]
}
],
"source": [
"# Download every cohort twice: once for tumor samples, once for normal samples\n",
"tumor = [CPTAC_ID.get_id(cancer,is_Tumor=True) for cancer in cancer_list]\n",
"normal = [CPTAC_ID.get_id(cancer,is_Tumor=False) for cancer in cancer_list]\n",
"# get_id returns None when a file could not be downloaded; keep only the real tables\n",
"all_list = [table for table in tumor+normal if table is not None]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stack all per-cohort tables into one frame with a fresh 0..n-1 index\n",
"all_df = pd.concat(objs=all_list, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One row per unique LinkedOmics ID (gene name + site); the first occurrence is kept\n",
"linkedomics = all_df.drop_duplicates(subset='gene_site', ignore_index=True)\n",
"# One row per unique LinkedOmicsKB ID (ENSP + site); the first occurrence is kept\n",
"linkedomicsKB = all_df.drop_duplicates(subset='ENSP_site', ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gene</th>\n",
" <th>site</th>\n",
" <th>site_seq</th>\n",
" <th>protein</th>\n",
" <th>gene_name</th>\n",
" <th>gene_site</th>\n",
" <th>ENSP_site</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ENSG00000003056.8</td>\n",
" <td>S267</td>\n",
" <td>DDQLGEESEERDDHL</td>\n",
" <td>ENSP00000000412.3</td>\n",
" <td>M6PR</td>\n",
" <td>M6PR_S267</td>\n",
" <td>ENSP00000000412_S267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ENSG00000048028.11</td>\n",
" <td>S1053</td>\n",
" <td>PPTIRPNSPYDLCSR</td>\n",
" <td>ENSP00000003302.4</td>\n",
" <td>USP28</td>\n",
" <td>USP28_S1053</td>\n",
" <td>ENSP00000003302_S1053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ENSG00000004776.13</td>\n",
" <td>S16</td>\n",
" <td>PSWLRRASAPLPGLS</td>\n",
" <td>ENSP00000004982.3</td>\n",
" <td>HSPB6</td>\n",
" <td>HSPB6_S16</td>\n",
" <td>ENSP00000004982_S16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gene site site_seq protein gene_name \\\n",
"0 ENSG00000003056.8 S267 DDQLGEESEERDDHL ENSP00000000412.3 M6PR \n",
"1 ENSG00000048028.11 S1053 PPTIRPNSPYDLCSR ENSP00000003302.4 USP28 \n",
"2 ENSG00000004776.13 S16 PSWLRRASAPLPGLS ENSP00000004982.3 HSPB6 \n",
"\n",
" gene_site ENSP_site \n",
"0 M6PR_S267 ENSP00000000412_S267 \n",
"1 USP28_S1053 ENSP00000003302_S1053 \n",
"2 HSPB6_S16 ENSP00000004982_S16 "
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linkedomics.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write both ID tables as parquet files in the working directory\n",
"for id_table, out_path in ((linkedomics, 'linkedomics_ID.parquet'),\n",
"                           (linkedomicsKB, 'linkedomicsKB_ID.parquet')):\n",
"    id_table.to_parquet(out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(126602, 7)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linkedomics.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(489994, 7)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linkedomicsKB.shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 0ccfa4a

Please sign in to comment.