forked from NVIDIA/OpenSeq2Seq
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_wmt16_en_dt.sh
More file actions
executable file
·45 lines (34 loc) · 1.43 KB
/
get_wmt16_en_dt.sh
File metadata and controls
executable file
·45 lines (34 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/bin/bash
#
# Fetches data into a "wmt16_en_dt" directory that sits next to this script by
# running bin/data/wmt16_en_de.sh from a fresh clone of google/seq2seq.
set -e # Exit on error

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Scratch directory for the seq2seq checkout. The EXIT trap removes it on every
# exit path, so a failed clone or download does not leave it behind.
TEMP_DIR="$(mktemp -d)"
trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT

OUTPUT_DIR="$SCRIPT_DIR/wmt16_en_dt"
echo "Downloading data to $OUTPUT_DIR ..."
# -p: do not abort (under set -e) when the directory survives from a prior run.
mkdir -p "$OUTPUT_DIR"
# Exported for the child script started below.
# NOTE(review): presumably wmt16_en_de.sh reads OUTPUT_DIR from its environment
# to decide where to write - confirm against that script.
export OUTPUT_DIR

cd "$TEMP_DIR"
# Only the current working tree is needed to run the script, so skip history.
git clone --depth 1 https://github.com/google/seq2seq
cd seq2seq
./bin/data/wmt16_en_de.sh
# File names of the training files, listed pairwise: element i of
# train_list_en and element i of train_list_de share a stem and differ only in
# the language suffix (.en / .de). Both arrays are built from one list of stems
# so they cannot drift out of step.
train_list_en=()
train_list_de=()
for _stem in \
  "train.tok.clean.bpe.32000" \
  "train.tok.bpe.32000" \
  "train.tok.clean" \
  "train.tok" \
  "train.clean" \
  "train"
do
  train_list_en+=("${_stem}.en")
  train_list_de+=("${_stem}.de")
done
unset _stem
#######################################
# Shuffle two files in place, feeding shuf the same random source for both.
# The first file is used as --random-source for both runs; the intent is that
# two files with the same number of lines end up in the same new line order,
# so line N of one still corresponds to line N of the other.
# Arguments:
#   $1 - first file; also supplies the random bytes for both shuffles
#   $2 - second file, shuffled with the same random bytes
# Outputs:
#   Rewrites both files; uses "<file>.shuffled" next to each as scratch.
# Returns:
#   0 on success, 1 as soon as any shuf or mv step fails.
#######################################
shuffle_parallel_pair() {
  local first=$1
  local second=$2

  # Both shuffles must complete before either original is replaced: $first is
  # still being read as the random source during the second run.
  shuf --random-source="$first" -- "$first" > "${first}.shuffled" || return 1
  shuf --random-source="$first" -- "$second" > "${second}.shuffled" || return 1
  mv -- "${first}.shuffled" "$first" || return 1
  mv -- "${second}.shuffled" "$second" || return 1
}

# Shuffle every (en, de) pair; index i selects matching entries of both lists.
list_size=${#train_list_en[@]}
for (( i = 0; i < list_size; i++ )); do
  shuffle_parallel_pair \
    "${OUTPUT_DIR}/${train_list_en[i]}" \
    "${OUTPUT_DIR}/${train_list_de[i]}"
done
# Step out of the scratch tree before deleting it, then remove the checkout.
cd "$SCRIPT_DIR"
# ${TEMP_DIR:?} aborts instead of running rm -rf on an empty or unset path;
# "--" stops option parsing in case the path ever starts with a dash.
rm -rf -- "${TEMP_DIR:?}"