-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdvc.yaml
95 lines (84 loc) · 3.36 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
stages:
  # Cleans the raw training data.
  # deps: the training data folder and the cleaning script itself.
  # outs: the `prepared` folder under the configured data folder.
  # NOTE(review): every path in this file uses backslash separators, which
  # presumably ties the pipeline to Windows - confirm before running elsewhere.
  clean_data:
    cmd: python src\data\clean_data.py
    deps:
      - ${data_source.data_folders}\training
      - src\data\clean_data.py
    # Path-related params are intentionally left untracked in every stage:
    # tracking them clutters the experiments dashboard with path entries.
    # They could be filtered out there, but in practice a user usually changes
    # only the base data folder and keeps the inner project structure the same.
    # params:
    #   - data_source.data_folders
    #   - data_source.training_data_folder.folder
    #   - data_source.training_data_folder.train
    #   - data_source.prepared.folder
    #   - data_source.prepared.clean_train
    outs:
      - ${data_source.data_folders}\prepared

  # Builds training features from the prepared data.
  # deps: the `prepared` folder, the feature script and the feature_eng code.
  # outs: the `feature` folder under the configured data folder.
  generate_training_features:
    # -W ignore: suppress all Python warnings while the script runs.
    cmd: python -W ignore src\data\generate_training_features.py
    deps:
      - ${data_source.data_folders}\prepared
      - src\data\generate_training_features.py
      - src\models\feature_eng
    params:
      # Each active entry is a specific key tracked from params.yaml.
      # - data_source.data_folders
      # - data_source.prepared
      - featurize
      # Alternative kept for reference: track keys from a custom params file.
      # - src\data\train_params.yaml:
      #     - train_test_split
      #     # - training_data
      #     # - test_data
      #     - pipeline_type
      - train_test_split
      # - pipeline_type
      - model.model_type
      - model.logistic_reg.pipeline_type
      - model.decision_tree.pipeline_type
      - model.bagging_decision_tree.pipeline_type
      - model.extra_decision_tree.pipeline_type
      - model.random_forest.pipeline_type
      - model.xgboost.pipeline_type
    outs:
      - ${data_source.data_folders}\feature\

  # Trains the model from the generated features.
  # deps: the training script and the `feature` folder.
  # Tracks the selected model type and the hyper-parameters of every model.
  train_model:
    cmd: python -W ignore src\data\train_model.py
    deps:
      - src\data\train_model.py
      - ${data_source.data_folders}\feature\
    params:
      # - data_source.data_folders
      # - src\data\train_params.yaml:
      #     - model
      - model.model_type
      - model.logistic_reg.hyper_params
      - model.decision_tree.hyper_params
      - model.bagging_decision_tree.hyper_params
      - model.extra_decision_tree.hyper_params
      - model.random_forest.hyper_params
      - model.xgboost.hyper_params
    # The model folder is currently not declared as an output.
    # outs:
    #   - ${data_source.data_folders}\model
    metrics:
      - ${data_source.data_folders}\model\metrics\metrics.json:
          # cache: false keeps the metrics file out of the DVC cache.
          cache: false
  # Disabled stage, kept for reference. Uncomment to evaluate the trained model.
  # evaluate_model:
  #   cmd: python src\data\eval.py
  #   deps:
  #     - ${data_source.data_folders}\model
  #     - src\data\eval.py
  #   # outs:
  #   #   - ${data_source.data_folders}\eval\plots:
  #   #       cache: false
  #   metrics:
  #     - ${data_source.data_folders}\eval\metrics\metrics.json:
  #         # With cache: false, DVC does not cache this output; if desired,
  #         # it can be versioned with Git instead.
  #         cache: false
  #   plots:
  #     # - ${data_source.data_folders}\eval\plots\confusion_matrix.png:
  #     - data\eval\plots\confusion_matrix.png:
  #         cache: true
  #     # - ${data_source.data_folders}\eval\plots\roc_curve.png:
  #     - data\eval\plots\roc_curve.png:
  #         cache: true
  #     # - ${data_source.data_folders}\eval\plots\pr_rc_curve.png:
  #     - data\eval\plots\pr_rc_curve.png:
  #         cache: true