Skip to content

Latest commit

 

History

History
1023 lines (884 loc) · 26.5 KB

045_check-depmap-modeling-data_exec.md

File metadata and controls

1023 lines (884 loc) · 26.5 KB

Check of modeling data

from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, progress
# Start a local Dask cluster (4 worker processes x 2 threads, with a
# per-worker memory limit) and display its summary.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="32GB",
)
client
2022-06-22 14:19:38,777 - distributed.diskutils - INFO - Found stale lock file and directory '/n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-bvxablb0', purging
2022-06-22 14:19:38,806 - distributed.diskutils - INFO - Found stale lock file and directory '/n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-tlv8wp0b', purging

Client

Client-dfe4d0ce-f257-11ec-814d-149ecf16877d

    <table style="width: 100%; text-align: left;">
    <tr>

        <td style="text-align: left;"><strong>Connection method:</strong> Cluster object</td>
        <td style="text-align: left;"><strong>Cluster type:</strong> distributed.LocalCluster</td>

    </tr>


        <tr>
            <td style="text-align: left;">
                <strong>Dashboard: </strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
            </td>
            <td style="text-align: left;"></td>
        </tr>


    </table>


        <details>
        <summary style="margin-bottom: 20px;"><h3 style="display: inline;">Cluster Info</h3></summary>
        <div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output">
<div style="width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;">
</div>
<div style="margin-left: 48px;">
    <h3 style="margin-bottom: 0px; margin-top: 0px;">LocalCluster</h3>
    <p style="color: #9D9D9D; margin-bottom: 0px;">6467eec9</p>
    <table style="width: 100%; text-align: left;">
        <tr>
            <td style="text-align: left;">
                <strong>Dashboard:</strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
            </td>
            <td style="text-align: left;">
                <strong>Workers:</strong> 4
            </td>
        </tr>
        <tr>
            <td style="text-align: left;">
                <strong>Total threads:</strong> 8
            </td>
            <td style="text-align: left;">
                <strong>Total memory:</strong> 119.21 GiB
            </td>
        </tr>

        <tr>
<td style="text-align: left;"><strong>Status:</strong> running</td>
<td style="text-align: left;"><strong>Using processes:</strong> True</td>
        </tr>
    </table>

    <details>
        <summary style="margin-bottom: 20px;">
            <h3 style="display: inline;">Scheduler Info</h3>
        </summary>

        <div style="">
<div>
    <div style="width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;"> </div>
    <div style="margin-left: 48px;">
        <h3 style="margin-bottom: 0px;">Scheduler</h3>
        <p style="color: #9D9D9D; margin-bottom: 0px;">Scheduler-e7379a1d-1481-40b2-a909-d3b6be83fd8c</p>
        <table style="width: 100%; text-align: left;">
            <tr>
                <td style="text-align: left;">
                    <strong>Comm:</strong> tcp://127.0.0.1:33892
                </td>
                <td style="text-align: left;">
                    <strong>Workers:</strong> 4
                </td>
            </tr>
            <tr>
                <td style="text-align: left;">
                    <strong>Dashboard:</strong> <a href="http://127.0.0.1:8787/status" target="_blank">http://127.0.0.1:8787/status</a>
                </td>
                <td style="text-align: left;">
                    <strong>Total threads:</strong> 8
                </td>
            </tr>
            <tr>
                <td style="text-align: left;">
                    <strong>Started:</strong> Just now
                </td>
                <td style="text-align: left;">
                    <strong>Total memory:</strong> 119.21 GiB
                </td>
            </tr>
        </table>
    </div>
</div>

<details style="margin-left: 48px;">
    <summary style="margin-bottom: 20px;">
        <h3 style="display: inline;">Workers</h3>
    </summary>


    <div style="margin-bottom: 20px;">
        <div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
        <div style="margin-left: 48px;">
        <details>
            <summary>
                <h4 style="margin-bottom: 0px; display: inline;">Worker: 0</h4>
            </summary>
            <table style="width: 100%; text-align: left;">
                <tr>
                    <td style="text-align: left;">
                        <strong>Comm: </strong> tcp://127.0.0.1:42837
                    </td>
                    <td style="text-align: left;">
                        <strong>Total threads: </strong> 2
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Dashboard: </strong> <a href="http://127.0.0.1:43415/status" target="_blank">http://127.0.0.1:43415/status</a>
                    </td>
                    <td style="text-align: left;">
                        <strong>Memory: </strong> 29.80 GiB
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Nanny: </strong> tcp://127.0.0.1:33033
                    </td>
                    <td style="text-align: left;"></td>
                </tr>
                <tr>
                    <td colspan="2" style="text-align: left;">
                        <strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-91bw34nx
                    </td>
                </tr>





            </table>
        </details>
        </div>
    </div>

    <div style="margin-bottom: 20px;">
        <div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
        <div style="margin-left: 48px;">
        <details>
            <summary>
                <h4 style="margin-bottom: 0px; display: inline;">Worker: 1</h4>
            </summary>
            <table style="width: 100%; text-align: left;">
                <tr>
                    <td style="text-align: left;">
                        <strong>Comm: </strong> tcp://127.0.0.1:46733
                    </td>
                    <td style="text-align: left;">
                        <strong>Total threads: </strong> 2
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Dashboard: </strong> <a href="http://127.0.0.1:45956/status" target="_blank">http://127.0.0.1:45956/status</a>
                    </td>
                    <td style="text-align: left;">
                        <strong>Memory: </strong> 29.80 GiB
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Nanny: </strong> tcp://127.0.0.1:41014
                    </td>
                    <td style="text-align: left;"></td>
                </tr>
                <tr>
                    <td colspan="2" style="text-align: left;">
                        <strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-f1t_hati
                    </td>
                </tr>





            </table>
        </details>
        </div>
    </div>

    <div style="margin-bottom: 20px;">
        <div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
        <div style="margin-left: 48px;">
        <details>
            <summary>
                <h4 style="margin-bottom: 0px; display: inline;">Worker: 2</h4>
            </summary>
            <table style="width: 100%; text-align: left;">
                <tr>
                    <td style="text-align: left;">
                        <strong>Comm: </strong> tcp://127.0.0.1:36788
                    </td>
                    <td style="text-align: left;">
                        <strong>Total threads: </strong> 2
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Dashboard: </strong> <a href="http://127.0.0.1:35840/status" target="_blank">http://127.0.0.1:35840/status</a>
                    </td>
                    <td style="text-align: left;">
                        <strong>Memory: </strong> 29.80 GiB
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Nanny: </strong> tcp://127.0.0.1:38395
                    </td>
                    <td style="text-align: left;"></td>
                </tr>
                <tr>
                    <td colspan="2" style="text-align: left;">
                        <strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-o986ch3t
                    </td>
                </tr>





            </table>
        </details>
        </div>
    </div>

    <div style="margin-bottom: 20px;">
        <div style="width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;"> </div>
        <div style="margin-left: 48px;">
        <details>
            <summary>
                <h4 style="margin-bottom: 0px; display: inline;">Worker: 3</h4>
            </summary>
            <table style="width: 100%; text-align: left;">
                <tr>
                    <td style="text-align: left;">
                        <strong>Comm: </strong> tcp://127.0.0.1:34655
                    </td>
                    <td style="text-align: left;">
                        <strong>Total threads: </strong> 2
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Dashboard: </strong> <a href="http://127.0.0.1:38084/status" target="_blank">http://127.0.0.1:38084/status</a>
                    </td>
                    <td style="text-align: left;">
                        <strong>Memory: </strong> 29.80 GiB
                    </td>
                </tr>
                <tr>
                    <td style="text-align: left;">
                        <strong>Nanny: </strong> tcp://127.0.0.1:39735
                    </td>
                    <td style="text-align: left;"></td>
                </tr>
                <tr>
                    <td colspan="2" style="text-align: left;">
                        <strong>Local directory: </strong> /n/data1/hms/dbmi/park/Cook/speclet/munge/dask-worker-space/worker-ams7d6y9
                    </td>
                </tr>





            </table>
        </details>
        </div>
    </div>


</details>
    </details>
</div>
</div>

Papermill parameters:

DEPMAP_MODELING_DF: The path to the full DepMap modeling data set.

# Default value; the assignment under "# Parameters" below overrides it.
DEPMAP_MODELING_DF: str = ""
# Parameters
DEPMAP_MODELING_DF = "../modeling_data/depmap-modeling-data.csv"
# Validate with an explicit exception instead of `assert`, because assertions
# are stripped when Python runs with `-O` and the check would silently vanish.
if not DEPMAP_MODELING_DF:
    raise ValueError("No path provided for the modeling data.")
depmap_modeling_df_path = Path(DEPMAP_MODELING_DF)

# Fail early with a clear message if the data file is not where it should be.
if not depmap_modeling_df_path.exists():
    raise FileNotFoundError(f"Could not find '{depmap_modeling_df_path}'")
# Preview only the first 200 rows with plain pandas as a quick look at the file.
pd.read_csv(depmap_modeling_df_path, low_memory=False, nrows=200)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sgrna replicate_id lfc p_dna_batch genome_alignment hugo_symbol screen multiple_hits_on_gene sgrna_target_chr sgrna_target_pos ... any_deleterious any_tcga_hotspot any_cosmic_hotspot is_mutated copy_number lineage lineage_subtype primary_or_metastasis is_male age
0 AAACCTGCGGCGGTCGCCA OVR3_c905R1 -0.299958 CRISPR_C6596666.sample chr8_66505451_- VXN sanger True 8 66505451 ... NaN NaN NaN False 1.139595 ovary ovary_adenocarcinoma metastasis False 60
1 AACAGCACACCGGCCCCGT OVR3_c905R1 0.267092 CRISPR_C6596666.sample chrX_156009834_- IL9R sanger True X 156009834 ... NaN NaN NaN False 0.656377 ovary ovary_adenocarcinoma metastasis False 60
2 AACCTCCGGACTCCTCAGC OVR3_c905R1 0.550477 CRISPR_C6596666.sample chr7_39609658_- YAE1 sanger True 7 39609658 ... NaN NaN NaN False 0.923715 ovary ovary_adenocarcinoma metastasis False 60
3 AACTCAAACTGACGCCGAA OVR3_c905R1 -0.391922 CRISPR_C6596666.sample chr1_117623388_- TENT5C sanger True 1 117623388 ... NaN NaN NaN False 1.352975 ovary ovary_adenocarcinoma metastasis False 60
4 AACTGACCTTGAAACGCTG OVR3_c905R1 -1.562577 CRISPR_C6596666.sample chr16_66933623_+ CIAO2B sanger True 16 66933623 ... NaN NaN NaN False 1.157211 ovary ovary_adenocarcinoma metastasis False 60
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195 TGAGCTGGCAATGCTAGAT OVR3_c905R1 0.565344 CRISPR_C6596666.sample chrX_155774116_- SPRY3 sanger True X 155774116 ... NaN NaN NaN False 0.656377 ovary ovary_adenocarcinoma metastasis False 60
196 TGATGGAGCGAATCAGATG OVR3_c905R1 -0.204959 CRISPR_C6596666.sample chr16_66934065_+ CIAO2B sanger True 16 66934065 ... NaN NaN NaN False 1.157211 ovary ovary_adenocarcinoma metastasis False 60
197 TGCACTTATGTGTGCCGCC OVR3_c905R1 0.650650 CRISPR_C6596666.sample chrX_156003692_- IL9R sanger True X 156003692 ... NaN NaN NaN False 0.656377 ovary ovary_adenocarcinoma metastasis False 60
198 TGCTAGGACCCAACTGAGC OVR3_c905R1 -0.517796 CRISPR_C6596666.sample chr10_46580364_+ SYT15 sanger True 10 46580364 ... NaN NaN NaN False 0.752471 ovary ovary_adenocarcinoma metastasis False 60
199 TGGAAAGTTGCCTCGTCCG OVR3_c905R1 -0.218348 CRISPR_C6596666.sample chr1_117622978_- TENT5C sanger True 1 117622978 ... NaN NaN NaN False 1.352975 ovary ovary_adenocarcinoma metastasis False 60

200 rows × 25 columns

# Column types declared explicitly rather than left to dask's type inference.
# NOTE(review): presumably these columns contain missing or mixed values
# further into the file than the inference sample reaches -- confirm.
dtype_overrides = {
    "age": "float64",
    "p_dna_batch": "object",
    "primary_or_metastasis": "object",
    "counts_final": "float64",
    "counts_initial": "float64",
}
# Lazily read the full modeling data set as a dask data frame.
depmap_modeling_df = dd.read_csv(
    depmap_modeling_df_path, dtype=dtype_overrides, low_memory=False
)
depmap_modeling_df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
sgrna replicate_id lfc p_dna_batch genome_alignment hugo_symbol screen multiple_hits_on_gene sgrna_target_chr sgrna_target_pos ... any_deleterious any_tcga_hotspot any_cosmic_hotspot is_mutated copy_number lineage lineage_subtype primary_or_metastasis is_male age
0 AAACCTGCGGCGGTCGCCA OVR3_c905R1 -0.299958 CRISPR_C6596666.sample chr8_66505451_- VXN sanger True 8 66505451 ... NaN NaN NaN False 1.139595 ovary ovary_adenocarcinoma metastasis False 60.0
1 AACAGCACACCGGCCCCGT OVR3_c905R1 0.267092 CRISPR_C6596666.sample chrX_156009834_- IL9R sanger True X 156009834 ... NaN NaN NaN False 0.656377 ovary ovary_adenocarcinoma metastasis False 60.0
2 AACCTCCGGACTCCTCAGC OVR3_c905R1 0.550477 CRISPR_C6596666.sample chr7_39609658_- YAE1 sanger True 7 39609658 ... NaN NaN NaN False 0.923715 ovary ovary_adenocarcinoma metastasis False 60.0
3 AACTCAAACTGACGCCGAA OVR3_c905R1 -0.391922 CRISPR_C6596666.sample chr1_117623388_- TENT5C sanger True 1 117623388 ... NaN NaN NaN False 1.352975 ovary ovary_adenocarcinoma metastasis False 60.0
4 AACTGACCTTGAAACGCTG OVR3_c905R1 -1.562577 CRISPR_C6596666.sample chr16_66933623_+ CIAO2B sanger True 16 66933623 ... NaN NaN NaN False 1.157211 ovary ovary_adenocarcinoma metastasis False 60.0

5 rows × 25 columns

# Show every column name (the table previews elide the middle columns with "...").
depmap_modeling_df.columns
Index(['sgrna', 'replicate_id', 'lfc', 'p_dna_batch', 'genome_alignment',
       'hugo_symbol', 'screen', 'multiple_hits_on_gene', 'sgrna_target_chr',
       'sgrna_target_pos', 'depmap_id', 'counts_final', 'counts_initial',
       'rna_expr', 'num_mutations', 'any_deleterious', 'any_tcga_hotspot',
       'any_cosmic_hotspot', 'is_mutated', 'copy_number', 'lineage',
       'lineage_subtype', 'primary_or_metastasis', 'is_male', 'age'],
      dtype='object')

Basic checks

# Running tally of failed checks; each check increments it on failure and the
# notebook raises at the end if it is non-zero.
FAILED_CHECKS = 0

Check that specific columns exist (prevents some really bonehead discoveries later on...).

# Columns that are expected to be present in the modeling data.
cols_that_should_exist = [
    "depmap_id", "sgrna", "hugo_symbol", "lfc", "screen",
    "num_mutations", "is_mutated", "lineage",
    "counts_final", "p_dna_batch", "primary_or_metastasis",
]

# Collect the expected columns that are absent, keeping the order above.
available_cols = set(depmap_modeling_df.columns)
missing_cols = [c for c in cols_that_should_exist if c not in available_cols]
if missing_cols:
    # The whole column check counts as a single failed check.
    FAILED_CHECKS += 1
    print(f"Some columns ({len(missing_cols)}) that should be present are not 😦")
    print("  missing columns: " + ", ".join(missing_cols))

Check that specific columns have no missing (NA) values.

# Columns that must not contain any missing (NA) values.
cols_without_na = [
    "depmap_id",
    "sgrna",
    "hugo_symbol",
    "lfc",
    "screen",
    "num_mutations",
    "is_mutated",
    "lineage",
]

# Select the columns of interest *before* calling `isna()` so the NA mask is
# only built for these columns rather than for every column in the data frame.
# The result is a boolean Series: True for each column with at least one NA.
na_checks = depmap_modeling_df[cols_without_na].isna().any().compute()
# Cast to a built-in int so `FAILED_CHECKS` stays a plain Python integer.
num_missed_checks = int(na_checks.sum())

# Each column containing NA values counts as one failed check.
if num_missed_checks > 0:
    FAILED_CHECKS += num_missed_checks
    print(na_checks[na_checks])
na_checks
depmap_id        False
sgrna            False
hugo_symbol      False
lfc              False
screen           False
num_mutations    False
is_mutated       False
lineage          False
dtype: bool

Check that all combinations of cell line, sgRNA, and experimental replicate only appear once.

# Every (cell line, sgRNA, replicate) combination should appear exactly once.
grp_cols = ["depmap_id", "sgrna", "replicate_id"]
# Keep only the grouping columns, add a helper column `n` to count rows, and
# retain the combinations that occur more than once.
ct_df = (
    depmap_modeling_df[grp_cols]
    .assign(n=1)
    .groupby(grp_cols)
    .count()
    .query("n > 1")
    .compute()
)

if not ct_df.empty:
    # The message describes what is actually checked: duplicated rows for a
    # (depmap_id, sgrna, replicate_id) combination.
    print(
        "Some combinations of cell line, sgRNA, and replicate appear more than once."
    )
    print(ct_df.head(20))
    FAILED_CHECKS += 1
# Stop the notebook with an error if any of the checks above failed.
if FAILED_CHECKS > 0:
    raise Exception(f"There were {FAILED_CHECKS} failed checks.")

# Record the run date, Python/IPython versions, machine details, hostname,
# git branch, and the versions of the imported packages for reproducibility.
%load_ext watermark
%watermark -d -u -v -iv -b -h -m
Last updated: 2022-06-22

Python implementation: CPython
Python version       : 3.10.4
IPython version      : 8.4.0

Compiler    : GCC 10.3.0
OS          : Linux
Release     : 3.10.0-1160.45.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 32
Architecture: 64bit

Hostname: compute-a-16-152.o2.rc.hms.harvard.edu

Git branch: per-lineage

dask  : 2022.6.0
pandas: 1.4.2
numpy : 1.22.4