from pathlib import Path
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, progress
# Start a local Dask cluster: 4 worker processes with 2 threads each.
# `memory_limit` applies to each worker, not to the cluster as a whole.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="32GB",
)
client  # display the cluster summary
2022-06-22 14:19:38,777 - distributed.diskutils - INFO - Found stale lock file and directory '/n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-bvxablb0', purging
2022-06-22 14:19:38,806 - distributed.diskutils - INFO - Found stale lock file and directory '/n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-tlv8wp0b', purging
Client-dfe4d0ce-f257-11ec-814d-149ecf16877d
<tr>
<td style="text-align: left;"><strong>Connection method:</strong> Cluster object</td>
<td style="text-align: left;"><strong>Cluster type:</strong> distributed.LocalCluster</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard: </strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
</td>
<td style="text-align: left;"></td>
</tr>
</table>
<details>
<summary style="margin-bottom: 20px;"><h3 style="display: inline;">Cluster Info</h3></summary>
<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output">
<div style="width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;">
</div>
<div style="margin-left: 48px;">
<h3 style="margin-bottom: 0px; margin-top: 0px;">LocalCluster</h3>
<p style="color: #9D9D9D; margin-bottom: 0px;">6467eec9</p>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Dashboard:</strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
</td>
<td style="text-align: left;">
<strong>Workers:</strong> 4
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Total threads:</strong> 8
</td>
<td style="text-align: left;">
<strong>Total memory:</strong> 119.21 GiB
</td>
</tr>
<tr>
<td style="text-align: left;"><strong>Status:</strong> running</td>
<td style="text-align: left;"><strong>Using processes:</strong> True</td>
</table>
<details>
<summary style="margin-bottom: 20px;">
<h3 style="display: inline;">Scheduler Info</h3>
</summary>
<div style="">
<div>
<div style="width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;"> </div>
<div style="margin-left: 48px;">
<h3 style="margin-bottom: 0px;">Scheduler</h3>
<p style="color: #9D9D9D; margin-bottom: 0px;">Scheduler-e7379a1d-1481-40b2-a909-d3b6be83fd8c</p>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Comm:</strong> tcp://127.0.0.1:33892
</td>
<td style="text-align: left;">
<strong>Workers:</strong> 4
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard:</strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
</td>
<td style="text-align: left;">
<strong>Total threads:</strong> 8
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Started:</strong> Just now
</td>
<td style="text-align: left;">
<strong>Total memory:</strong> 119.21 GiB
</td>
</tr>
</table>
</div>
</div>
<details style="margin-left: 48px;">
<summary style="margin-bottom: 20px;">
<h3 style="display: inline;">Workers</h3>
</summary>
<div style="margin-bottom: 20px;">
<div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
<div style="margin-left: 48px;">
<details>
<summary>
<h4 style="margin-bottom: 0px; display: inline;">Worker: 0</h4>
</summary>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Comm: </strong> tcp://127.0.0.1:42837
</td>
<td style="text-align: left;">
<strong>Total threads: </strong> 2
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard: </strong> <a href="http://127.0.0.1:43415/status" target="_blank">http://127.0.0.1:43415/status</a>
</td>
<td style="text-align: left;">
<strong>Memory: </strong> 29.80 GiB
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Nanny: </strong> tcp://127.0.0.1:33033
</td>
<td style="text-align: left;"></td>
</tr>
<tr>
<td colspan="2" style="text-align: left;">
<strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-91bw34nx
</td>
</tr>
</table>
</details>
</div>
</div>
<div style="margin-bottom: 20px;">
<div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
<div style="margin-left: 48px;">
<details>
<summary>
<h4 style="margin-bottom: 0px; display: inline;">Worker: 1</h4>
</summary>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Comm: </strong> tcp://127.0.0.1:46733
</td>
<td style="text-align: left;">
<strong>Total threads: </strong> 2
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard: </strong> <a href="http://127.0.0.1:45956/status" target="_blank">http://127.0.0.1:45956/status</a>
</td>
<td style="text-align: left;">
<strong>Memory: </strong> 29.80 GiB
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Nanny: </strong> tcp://127.0.0.1:41014
</td>
<td style="text-align: left;"></td>
</tr>
<tr>
<td colspan="2" style="text-align: left;">
<strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-f1t_hati
</td>
</tr>
</table>
</details>
</div>
</div>
<div style="margin-bottom: 20px;">
<div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
<div style="margin-left: 48px;">
<details>
<summary>
<h4 style="margin-bottom: 0px; display: inline;">Worker: 2</h4>
</summary>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Comm: </strong> tcp://127.0.0.1:36788
</td>
<td style="text-align: left;">
<strong>Total threads: </strong> 2
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard: </strong> <a href="http://127.0.0.1:35840/status" target="_blank">http://127.0.0.1:35840/status</a>
</td>
<td style="text-align: left;">
<strong>Memory: </strong> 29.80 GiB
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Nanny: </strong> tcp://127.0.0.1:38395
</td>
<td style="text-align: left;"></td>
</tr>
<tr>
<td colspan="2" style="text-align: left;">
<strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-o986ch3t
</td>
</tr>
</table>
</details>
</div>
</div>
<div style="margin-bottom: 20px;">
<div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
<div style="margin-left: 48px;">
<details>
<summary>
<h4 style="margin-bottom: 0px; display: inline;">Worker: 3</h4>
</summary>
<table style="width: 100%; text-align: left;">
<tr>
<td style="text-align: left;">
<strong>Comm: </strong> tcp://127.0.0.1:34655
</td>
<td style="text-align: left;">
<strong>Total threads: </strong> 2
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Dashboard: </strong> <a href="http://127.0.0.1:38084/status" target="_blank">http://127.0.0.1:38084/status</a>
</td>
<td style="text-align: left;">
<strong>Memory: </strong> 29.80 GiB
</td>
</tr>
<tr>
<td style="text-align: left;">
<strong>Nanny: </strong> tcp://127.0.0.1:39735
</td>
<td style="text-align: left;"></td>
</tr>
<tr>
<td colspan="2" style="text-align: left;">
<strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-ams7d6y9
</td>
</tr>
</table>
</details>
</div>
</div>
</details>
</details>
</div>
</div>
Papermill parameters:
`DEPMAP_MODELING_DF`
: The path to the full DepMap modeling data set.
# Default for the notebook parameter: path to the full DepMap modeling data CSV.
# The empty string stands for "not provided" and is rejected by the check that follows.
DEPMAP_MODELING_DF: str = ""
# Parameters
# NOTE(review): this assignment overrides the default above; presumably it is the
# value injected by Papermill at run time — confirm before editing by hand.
DEPMAP_MODELING_DF = "../modeling_data/depmap-modeling-data.csv"
# Fail fast when no path was supplied. An explicit raise is used instead of
# `assert` because assertions are skipped when Python runs with `-O`.
if not DEPMAP_MODELING_DF:
    raise ValueError("No path provided for the modeling data.")

depmap_modeling_df_path = Path(DEPMAP_MODELING_DF)
if not depmap_modeling_df_path.exists():
    raise FileNotFoundError(f"Could not find '{depmap_modeling_df_path}'")

# Preview the first 200 rows with pandas before working with the full file.
pd.read_csv(depmap_modeling_df_path, low_memory=False, nrows=200)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
sgrna | replicate_id | lfc | p_dna_batch | genome_alignment | hugo_symbol | screen | multiple_hits_on_gene | sgrna_target_chr | sgrna_target_pos | ... | any_deleterious | any_tcga_hotspot | any_cosmic_hotspot | is_mutated | copy_number | lineage | lineage_subtype | primary_or_metastasis | is_male | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAACCTGCGGCGGTCGCCA | OVR3_c905R1 | -0.299958 | CRISPR_C6596666.sample | chr8_66505451_- | VXN | sanger | True | 8 | 66505451 | ... | NaN | NaN | NaN | False | 1.139595 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
1 | AACAGCACACCGGCCCCGT | OVR3_c905R1 | 0.267092 | CRISPR_C6596666.sample | chrX_156009834_- | IL9R | sanger | True | X | 156009834 | ... | NaN | NaN | NaN | False | 0.656377 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
2 | AACCTCCGGACTCCTCAGC | OVR3_c905R1 | 0.550477 | CRISPR_C6596666.sample | chr7_39609658_- | YAE1 | sanger | True | 7 | 39609658 | ... | NaN | NaN | NaN | False | 0.923715 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
3 | AACTCAAACTGACGCCGAA | OVR3_c905R1 | -0.391922 | CRISPR_C6596666.sample | chr1_117623388_- | TENT5C | sanger | True | 1 | 117623388 | ... | NaN | NaN | NaN | False | 1.352975 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
4 | AACTGACCTTGAAACGCTG | OVR3_c905R1 | -1.562577 | CRISPR_C6596666.sample | chr16_66933623_+ | CIAO2B | sanger | True | 16 | 66933623 | ... | NaN | NaN | NaN | False | 1.157211 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | TGAGCTGGCAATGCTAGAT | OVR3_c905R1 | 0.565344 | CRISPR_C6596666.sample | chrX_155774116_- | SPRY3 | sanger | True | X | 155774116 | ... | NaN | NaN | NaN | False | 0.656377 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
196 | TGATGGAGCGAATCAGATG | OVR3_c905R1 | -0.204959 | CRISPR_C6596666.sample | chr16_66934065_+ | CIAO2B | sanger | True | 16 | 66934065 | ... | NaN | NaN | NaN | False | 1.157211 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
197 | TGCACTTATGTGTGCCGCC | OVR3_c905R1 | 0.650650 | CRISPR_C6596666.sample | chrX_156003692_- | IL9R | sanger | True | X | 156003692 | ... | NaN | NaN | NaN | False | 0.656377 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
198 | TGCTAGGACCCAACTGAGC | OVR3_c905R1 | -0.517796 | CRISPR_C6596666.sample | chr10_46580364_+ | SYT15 | sanger | True | 10 | 46580364 | ... | NaN | NaN | NaN | False | 0.752471 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
199 | TGGAAAGTTGCCTCGTCCG | OVR3_c905R1 | -0.218348 | CRISPR_C6596666.sample | chr1_117622978_- | TENT5C | sanger | True | 1 | 117622978 | ... | NaN | NaN | NaN | False | 1.352975 | ovary | ovary_adenocarcinoma | metastasis | False | 60 |
200 rows × 25 columns
# Read the full modeling data CSV as a Dask data frame, pinning the dtype of
# five columns explicitly (three as float64, two as object).
# NOTE(review): presumably these columns are pinned because Dask's dtype
# inference fails on them — confirm.
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path,
    low_memory=False,
    dtype={
        **dict.fromkeys(["age", "counts_final", "counts_initial"], "float64"),
        **dict.fromkeys(["p_dna_batch", "primary_or_metastasis"], "object"),
    },
)
depmap_modeling_df.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
sgrna | replicate_id | lfc | p_dna_batch | genome_alignment | hugo_symbol | screen | multiple_hits_on_gene | sgrna_target_chr | sgrna_target_pos | ... | any_deleterious | any_tcga_hotspot | any_cosmic_hotspot | is_mutated | copy_number | lineage | lineage_subtype | primary_or_metastasis | is_male | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAACCTGCGGCGGTCGCCA | OVR3_c905R1 | -0.299958 | CRISPR_C6596666.sample | chr8_66505451_- | VXN | sanger | True | 8 | 66505451 | ... | NaN | NaN | NaN | False | 1.139595 | ovary | ovary_adenocarcinoma | metastasis | False | 60.0 |
1 | AACAGCACACCGGCCCCGT | OVR3_c905R1 | 0.267092 | CRISPR_C6596666.sample | chrX_156009834_- | IL9R | sanger | True | X | 156009834 | ... | NaN | NaN | NaN | False | 0.656377 | ovary | ovary_adenocarcinoma | metastasis | False | 60.0 |
2 | AACCTCCGGACTCCTCAGC | OVR3_c905R1 | 0.550477 | CRISPR_C6596666.sample | chr7_39609658_- | YAE1 | sanger | True | 7 | 39609658 | ... | NaN | NaN | NaN | False | 0.923715 | ovary | ovary_adenocarcinoma | metastasis | False | 60.0 |
3 | AACTCAAACTGACGCCGAA | OVR3_c905R1 | -0.391922 | CRISPR_C6596666.sample | chr1_117623388_- | TENT5C | sanger | True | 1 | 117623388 | ... | NaN | NaN | NaN | False | 1.352975 | ovary | ovary_adenocarcinoma | metastasis | False | 60.0 |
4 | AACTGACCTTGAAACGCTG | OVR3_c905R1 | -1.562577 | CRISPR_C6596666.sample | chr16_66933623_+ | CIAO2B | sanger | True | 16 | 66933623 | ... | NaN | NaN | NaN | False | 1.157211 | ovary | ovary_adenocarcinoma | metastasis | False | 60.0 |
5 rows × 25 columns
# Display the column names of the Dask data frame.
depmap_modeling_df.columns
Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
'sgrna_target_pos', 'depmap_id', 'counts_final', 'counts_initial',
'rna_expr', 'num_mutations', 'any_deleterious', 'any_tcga_hotspot',
'any_cosmic_hotspot', 'is_mutated', 'copy_number', 'lineage',
'lineage_subtype', 'primary_or_metastasis', 'is_male', 'age'],
dtype='object')
# Running tally of failed data checks; each check below adds to it when it fails.
FAILED_CHECKS: int = 0
Check that specific columns exist (prevents some really bonehead discoveries later on...).
# Columns that must be present in the modeling data.
cols_that_should_exist = [
    "depmap_id", "sgrna", "hugo_symbol", "lfc",
    "screen", "num_mutations", "is_mutated", "lineage",
    "counts_final", "p_dna_batch", "primary_or_metastasis",
]
# Keep the required names that are absent from the data frame, in list order.
missing_cols = [
    name for name in cols_that_should_exist if name not in depmap_modeling_df.columns
]
if missing_cols:
    print(f"Some columns ({len(missing_cols)}) that should be present are not 😦")
    print(f" missing columns: {', '.join(missing_cols)}")
    # Counts as a single failed check regardless of how many columns are missing.
    FAILED_CHECKS += 1
Check that specific columns have no missing (`NA`) values.
# Columns that must not contain any missing values.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]
# Select the columns *before* calling `isna()` so the NA mask is only built for
# the columns being checked rather than for every column in the data frame.
# The result is a boolean Series indexed by column name (True = has missing values).
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()
# Cast to a built-in int so FAILED_CHECKS stays a plain Python integer.
num_missed_checks = int(na_checks.sum())
if num_missed_checks > 0:
    # Each column with missing values counts as one failed check.
    FAILED_CHECKS += num_missed_checks
    print(na_checks[na_checks])
na_checks
depmap_id False
sgrna False
hugo_symbol False
lfc False
screen False
num_mutations False
is_mutated False
lineage False
dtype: bool
Check that all combinations of cell line, sgRNA, and experimental replicate only appear once.
# Every (cell line, sgRNA, replicate) combination should appear exactly once.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]
# Count the rows per combination and keep only those that occur more than once.
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)
if len(ct_df) > 0:
    # The message describes what this check detects: duplicated combinations of
    # the grouping columns (not sgRNAs with multiple targets).
    print("There are duplicated (cell line, sgRNA, replicate) combinations.")
    print(ct_df.head(20))
    FAILED_CHECKS += 1

# Fail if any of the checks accumulated in FAILED_CHECKS did not pass.
if FAILED_CHECKS > 0:
    raise Exception(f"There were {FAILED_CHECKS} failed checks.")
# Record the run environment for reproducibility: date, Python/IPython versions,
# machine details, hostname, Git branch, and versions of the imported packages.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m
Last updated: 2022-06-22
Python implementation: CPython
Python version : 3.10.4
IPython version : 8.4.0
Compiler : GCC 10.3.0
OS : Linux
Release : 3.10.0-1160.45.1.el7.x86_64
Machine : x86_64
Processor : x86_64
CPU cores : 32
Architecture: 64bit
Hostname: compute-a-16-152.o2.rc.hms.harvard.edu
Git branch: per-lineage
dask : 2022.6.0
pandas: 1.4.2
numpy : 1.22.4