-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,383 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# CPTAC ID mapping" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The BCM CPTAC phosphoproteomics data identify each site by Ensembl ID + site, but many web tools use IDs of the form gene name + site, so we map each Ensembl ID to the gene name that matches the web tool's format." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Setup" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from katlas.core import *\n", | ||
"\n", | ||
"from tqdm import tqdm\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"from matplotlib import pyplot as plt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"class CPTAC_ID:\n", | ||
" @staticmethod\n", | ||
" def list_cancer():\n", | ||
" return ['HNSCC','GBM','COAD','CCRCC','LSCC','BRCA','UCEC','LUAD','PDAC','OV']\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def get_id(cancer, is_Tumor=True):\n", | ||
" \"\"\"\n", | ||
" Fetches the data from the given URL and returns a DataFrame\n", | ||
" \"\"\"\n", | ||
" \n", | ||
" ID_URL = f\"https://zenodo.org/records/8196130/files/bcm-{cancer.lower()}-mapping-gencode.v34.basic.annotation-mapping.txt.gz\"\n", | ||
" if is_Tumor:\n", | ||
" DATA_URL = f\"https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/{cancer.upper()}/{cancer.upper()}_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt\"\n", | ||
" \n", | ||
" else:\n", | ||
" DATA_URL = f\"https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/{cancer.upper()}/{cancer.upper()}_phospho_site_abundance_log2_reference_intensity_normalized_Normal.txt\"\n", | ||
"\n", | ||
" \n", | ||
" # Load ID data\n", | ||
" ref = pd.read_csv(ID_URL, compression='gzip', sep='\\t')\\\n", | ||
" [['protein','gene','gene_name']]\\\n", | ||
" .drop_duplicates().reset_index(drop=True)\n", | ||
" \n", | ||
" \n", | ||
" # Load CPTAC phosphoproteomics data\n", | ||
" try:\n", | ||
" raw = pd.read_csv(DATA_URL, sep='\\t')\n", | ||
" except Exception as e:\n", | ||
" print(f'{cancer}{e}')\n", | ||
" else:\n", | ||
" info = pd.DataFrame({'gene':raw.idx.str.split('|').str[0],\n", | ||
" 'site':raw.idx.str.split('|').str[2],\n", | ||
" 'site_seq':raw.idx.str.split('|').str[3]})\n", | ||
" # print(raw.columns[1:]) # patient ID\n", | ||
"\n", | ||
" print(f'the {cancer} dataset length is: {info.shape[0]}')\n", | ||
"\n", | ||
" # Merge ensembl ID with gene name\n", | ||
" info = info.merge(ref,'left')\n", | ||
" print(f'after id mapping, the length is {info.shape[0]}')\n", | ||
"\n", | ||
" print(f'{info.gene_name.isna().sum()} sites does not have a mapped gene name')\n", | ||
" \n", | ||
" # LinkedOmics\n", | ||
" info['gene_site'] = info['gene_name'] + '_' + info['site']\n", | ||
" \n", | ||
" # LinkedOmicsKB\n", | ||
" info['ENSP_site'] = info['protein'].str.split('.').str[0] + '_' + info['site']\n", | ||
"\n", | ||
" return info" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"cancer_list = CPTAC_ID.list_cancer()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC', 'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']" | ||
] | ||
}, | ||
"execution_count": null, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"cancer_list" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"the HNSCC dataset length is: 55270\n", | ||
"after id mapping, the length is 214151\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the GBM dataset length is: 63410\n", | ||
"after id mapping, the length is 261115\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the COAD dataset length is: 35487\n", | ||
"after id mapping, the length is 130147\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the CCRCC dataset length is: 54238\n", | ||
"after id mapping, the length is 213737\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the LSCC dataset length is: 65481\n", | ||
"after id mapping, the length is 249575\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the BRCA dataset length is: 49871\n", | ||
"after id mapping, the length is 175637\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the UCEC dataset length is: 64977\n", | ||
"after id mapping, the length is 250006\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the LUAD dataset length is: 61705\n", | ||
"after id mapping, the length is 236430\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the PDAC dataset length is: 50220\n", | ||
"after id mapping, the length is 195218\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the OV dataset length is: 37334\n", | ||
"after id mapping, the length is 129441\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the HNSCC dataset length is: 55267\n", | ||
"after id mapping, the length is 214140\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"GBMHTTP Error 403: Forbidden\n", | ||
"the COAD dataset length is: 35485\n", | ||
"after id mapping, the length is 130141\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the CCRCC dataset length is: 53152\n", | ||
"after id mapping, the length is 209188\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the LSCC dataset length is: 65468\n", | ||
"after id mapping, the length is 249541\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"BRCAHTTP Error 403: Forbidden\n", | ||
"the UCEC dataset length is: 55170\n", | ||
"after id mapping, the length is 211970\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the LUAD dataset length is: 61702\n", | ||
"after id mapping, the length is 236425\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the PDAC dataset length is: 50190\n", | ||
"after id mapping, the length is 195092\n", | ||
"0 sites does not have a mapped gene name\n", | ||
"the OV dataset length is: 36582\n", | ||
"after id mapping, the length is 127006\n", | ||
"0 sites does not have a mapped gene name\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"tumor = [CPTAC_ID.get_id(cancer,is_Tumor=True) for cancer in cancer_list]\n", | ||
"normal = [CPTAC_ID.get_id(cancer,is_Tumor=False) for cancer in cancer_list]\n", | ||
"all_list = tumor+normal" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"all_df = pd.concat(all_list,ignore_index=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"linkedomics = all_df.drop_duplicates('gene_site').reset_index(drop=True)\n", | ||
"linkedomicsKB = all_df.drop_duplicates('ENSP_site').reset_index(drop=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>gene</th>\n", | ||
" <th>site</th>\n", | ||
" <th>site_seq</th>\n", | ||
" <th>protein</th>\n", | ||
" <th>gene_name</th>\n", | ||
" <th>gene_site</th>\n", | ||
" <th>ENSP_site</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>ENSG00000003056.8</td>\n", | ||
" <td>S267</td>\n", | ||
" <td>DDQLGEESEERDDHL</td>\n", | ||
" <td>ENSP00000000412.3</td>\n", | ||
" <td>M6PR</td>\n", | ||
" <td>M6PR_S267</td>\n", | ||
" <td>ENSP00000000412_S267</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>ENSG00000048028.11</td>\n", | ||
" <td>S1053</td>\n", | ||
" <td>PPTIRPNSPYDLCSR</td>\n", | ||
" <td>ENSP00000003302.4</td>\n", | ||
" <td>USP28</td>\n", | ||
" <td>USP28_S1053</td>\n", | ||
" <td>ENSP00000003302_S1053</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>ENSG00000004776.13</td>\n", | ||
" <td>S16</td>\n", | ||
" <td>PSWLRRASAPLPGLS</td>\n", | ||
" <td>ENSP00000004982.3</td>\n", | ||
" <td>HSPB6</td>\n", | ||
" <td>HSPB6_S16</td>\n", | ||
" <td>ENSP00000004982_S16</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" gene site site_seq protein gene_name \\\n", | ||
"0 ENSG00000003056.8 S267 DDQLGEESEERDDHL ENSP00000000412.3 M6PR \n", | ||
"1 ENSG00000048028.11 S1053 PPTIRPNSPYDLCSR ENSP00000003302.4 USP28 \n", | ||
"2 ENSG00000004776.13 S16 PSWLRRASAPLPGLS ENSP00000004982.3 HSPB6 \n", | ||
"\n", | ||
" gene_site ENSP_site \n", | ||
"0 M6PR_S267 ENSP00000000412_S267 \n", | ||
"1 USP28_S1053 ENSP00000003302_S1053 \n", | ||
"2 HSPB6_S16 ENSP00000004982_S16 " | ||
] | ||
}, | ||
"execution_count": null, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"linkedomics[:3]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"linkedomics.to_parquet('linkedomics_ID.parquet')\n", | ||
"linkedomicsKB.to_parquet('linkedomicsKB_ID.parquet')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"(126602, 7)" | ||
] | ||
}, | ||
"execution_count": null, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"linkedomics.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"(489994, 7)" | ||
] | ||
}, | ||
"execution_count": null, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"linkedomicsKB.shape" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |