#!/bin/bash
# usage : data.sh $languages
# Transform (tokenize, lowercase and remove accents, load codes and vocab, learn and apply BPE tokenization,
# binarize...) the data contained in the text files into a pth file understandable by the framework:
# this can take a long time depending on the dataset size, nCodes and shuf_n_samples
set -e
# languages
lgs=$1
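# Example invocation (the IFS split further below accepts a comma- or space-separated list):
#   data.sh "de,en,fr"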
# path where processed files will be stored
OUTPATH=/content/processed
# If parallel data is available and you need to preprocess it
PARA=True
# Whether to process monolingual data (if the monolingual data is unavailable and this is
# left set to True, the parallel data will be used to build the monolingual data)
MONO=True
# folder containing the parallel data
PARA_PATH=/content/data/para
# folder containing the monolingual data (here it reuses the parallel data folder; see MONO above)
MONO_PATH=/content/data/para
# whether all languages should share the same vocabulary (leave to True)
SAME_VOCAB=True
# When independent monolingual and parallel data are both available, setting this to True adds the parallel data to the monolingual data.
add_para_data_to_mono_data=False
# Learn nCodes BPE codes on the training data
nCodes=20000
# Number of random samples (shuffled lines) of the training data used to learn BPE
shuf_n_samples=10000
# Preferably a power of two
threads_for_tokenizer=16
# Percentage of data to use as test data (%)
test_size=10
# Percentage of data to use as validation data (%)
val_size=10
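# With test_size=10 and val_size=10, the remaining ~80% of the data is presumably kept for training.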
# tools paths
TOOLS_PATH=tools
TOKENIZE=$TOOLS_PATH/tokenizer_our.sh
LOWER_REMOVE_ACCENT=$TOOLS_PATH/lowercase_and_remove_accent.py
FASTBPE=$TOOLS_PATH/fastBPE/fast
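# For reference only; typical fastBPE usage (the actual calls presumably happen in the build_meta_data scripts invoked below):
#   $FASTBPE learnbpe $nCodes <train files> > codes      # learn BPE codes
#   $FASTBPE applybpe <output> <input> codes [vocab]     # apply BPE codes to a file
#   $FASTBPE getvocab <BPE-encoded files> > vocab        # extract the vocabulary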
#PROCESSED_FILE=../scripts/build_meta_data_multixlm.sh
# The n_samples parameter is optional; when it is not passed or when it exceeds the dataset size, the whole dataset is used
n_samples=-1
# If you don't have other data to fine-tune your model on a specific sub-task, specify the percentage
# of the sub-task data to consider, or -1 to ignore it.
#sub_tasks=en-fr:10,de-en:-1,de-fr:-1
# If you want the sub-tasks to be built from all pair combinations of your languages, set the three dots:
sub_tasks=...
# Value applied to every auto-generated sub-task (percentage of data to consider, or -1)
tasks_n_samples=-1
##############################################
function abrev() {
    # todo : return an abbreviation of the language name; for now the argument is returned unchanged
    result=$1
}
if [ "$sub_tasks" = "..." ]; then
    sub_tasks=""
    IFS=', ' read -r -a langs_array <<< "$lgs"
    # todo : sort the array in alphabetical order
    array_length=${#langs_array[*]}
    for (( i=0; i<$array_length; ++i)); do
        for (( j=$(($i+1)); j<$array_length; ++j)); do
            abrev ${langs_array[$i]}
            a=$result
            abrev ${langs_array[$j]}
            b=$result
            sub_tasks=$sub_tasks,$a-$b:$tasks_n_samples
        done
    done
    # Remove the leading comma
    sub_tasks=$(echo $sub_tasks | cut -c2-)
fi
echo $sub_tasks
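# Example (hypothetical): with lgs="de,en,fr" and tasks_n_samples=-1, the loop above prints
# sub_tasks as "de-en:-1,de-fr:-1,en-fr:-1".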
# create output path
mkdir -p $OUTPATH
# avoid permission error
chmod +x $FASTBPE
chmod +x $TOOLS_PATH/mosesdecoder/scripts/tokenizer/*.perl
echo "======================="
echo "Processed"
echo "======================="
### option 1 : data in different folders, one named after each $pair ###
#chmod +x ../scripts/build_meta_data_multixlm.sh
#. ../scripts/build_meta_data_multixlm.sh $sub_tasks $n_samples $add_para_data_to_mono_data
## OR ##
### option 2 : data in the same folder ###
chmod +x ../scripts/build_meta_data_monoxlm.sh
. ../scripts/build_meta_data_monoxlm.sh $sub_tasks $n_samples $add_para_data_to_mono_data
# todo : make things dynamic like this
#chmod +x $PROCESSED_FILE
#$PROCESSED_FILE
echo "======================="
echo "End"
echo "======================="