# example_config_file.yaml
# Example configuration file (forked from csbio/CG-TARGET).
---
# Settings in this section are grouped under "Required_arguments";
# every key below is nested one level beneath it.
Required_arguments :
  # Directory in which to store results.
  output_folder : /path/to/all/output
  # Table of chemical genetic (CG) interaction scores with
  # "screen_name" and "expt_id" columns for the conditions
  # and "Strain_ID" and "Barcode" columns for the array strains.
  # CG scores are stored in a column named "score".
  # NOTE(review): this comment previously read "store", presumably
  # a typo for "score" - confirm against the data table format.
  cg_data_table : /path/to/cg_data_table
  # Table with "Strain_ID" and "Barcode" columns that map to
  # other info for the array.
  cg_row_info_table : /path/to/cg_row_info_table
  # Table with "screen_name" and "expt_id" columns that map to
  # other info for the conditions.
  cg_col_info_table : /path/to/cg_col_info_table
  # For matching the CG array to the GI array: column in
  # cg_row_info_table that matches a column in gi_array_info_table.
  # MUST CONTAIN SYSTEMATIC NAMES FOR THE SPECIES IN QUESTION.
  cg_to_gi_match_col : ORF
  # Name of the genetic interaction (GI) dataset to use. See all
  # available options by running the "gi_datasets.r" command.
  gi_dataset_name : SGA_DMA_array_no-supps_sig-only
  # Similarity measure used between CG and GI data.
  # Options: "cg_norm_cosine" and "cosine".
  # NOTE(review): the value below ("cg-norm-dot") is not one of the
  # options listed above - confirm which names are accepted.
  cg_gi_similarity_measure : cg-norm-dot
  # Define a cutoff on the score from cg_gi_similarity_measure,
  # above which the query gene is deemed a "driver" of predictions
  # to gene sets it is in.
  driver_cutoff : 2
  # Name to uniquely identify the type of gene sets used as targets.
  # See all available options by running "gene_sets.r".
  gene_set_name : S-cerevisiae_GO-BP
  # Set the minimum and maximum allowable sizes of the gene sets.
  # This occurs after the gene set annotations are restricted to
  # the set of gene targets.
  min_term_size : 4
  max_term_size : 200
  # Choose "1" to sample from the actual data for each array or "2"
  # to sample from normal distributions with mean and standard
  # deviation calculated for each array in the chemical-genetic
  # interaction dataset. Choose "0" (or set
  # "num_per-array_resampled_profiles" to 0) to forgo generating the
  # resampled profiles and rely only on per-condition randomization
  # for p-value computation.
  per-array_resampling_scheme : 1
  per-array_resampling_seed : 54321
  # A good number is 10000.
  num_per-array_resampled_profiles : 10000
  per-condition_randomization_seed : 12345
  # This one takes the longest - a good number is 10000.
  num_per-condition_randomizations : 10000
  # How do you want to estimate the false discovery rate (FDR)?
  # Choose "1" to estimate FDR using the entire dataset (using
  # resampled and negative control profiles, whichever are
  # available), or choose "2" to estimate FDR on a per-profile
  # basis (better for smaller screens, uses the Benjamini-
  # Hochberg correction).
  FDR_estimation_scheme : 2
# Settings in this section are grouped under "Options"; each
# sub-section below is a nested mapping holding its own keys.
Options :
  ################################################################
  per-array_randomization :
    # Column in cg_col_info_table that denotes which samples to
    # include in the dataset used to generate resampled profiles.
    # I recommend using a column that has "TRUE" for rows not
    # belonging to positive or negative controls.
    per-array_randomization_include_column : not_pos_neg_control
  ################################################################
  gene_target_prediction_visualization :
    # Which columns from cg_col_info_table should be shown in
    # the visualization CDT?
    cg_col_table_vis_columns : screen_name,expt_id,name,lane,index_tag,index_tag_well
    # Which columns from gi_query_info_table should be shown in
    # the visualization CDT?
    gi_col_table_vis_columns : query_genename,query_strain_only,array_type_temp
  ################################################################
  gene_set_target_prediction :
    # Column that indicates which conditions are "negative
    # controls," aka experimental controls (such as DMSO).
    negative_control_column : control?
    # Column in cg_col_info_table that contains interpretable
    # names for the conditions.
    condition_name_column : name
  ################################################################
  results_export :
    # Column that indicates which conditions are considered
    # as "treatment" conditions, as opposed to negative or
    # positive controls. The conditions for which this column
    # is "TRUE" will be exported to the "treatment" table
    # after FDR estimation, while those that are not
    # specified as negative controls will be exported to the
    # "split-out" table after FDR estimation.
    # If left empty, the default value is everything that is
    # not a negative control (or a resampled profile).
    treatment_column : not_pos_neg_control
...