-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_data.py
51 lines (40 loc) · 1.7 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import argparse
import os
from src.data.preprocessors.czech_merlin_data_preprocessor import CzechMerlinDataPreprocessor
from src.data.preprocessors.english_write_and_improve_data_preprocessor import EnglishWriteAndImproveDataPreprocessor
from src.data.preprocessors.german_merlin_data_preprocessor import GermanMerlinDataPreprocessor
from src.data.preprocessors.italian_merlin_data_preprocessor import ItalianMerlinDataPreprocessor
from src.data.preprocessors.portuguese_cople2_preprocessor import PortugueseCople2DataPreprocessor
from src.data.preprocessors.spanish_cedel2_data_preprocessor import SpanishCedel2DataPreprocessor
from src.utils import fix_seed
def main(args: argparse.Namespace) -> None:
    """Prepare data and save them in tsv format.

    Seeds the run through ``fix_seed``, makes sure the ``data/raw``
    directory exists, then instantiates every dataset preprocessor in turn
    and calls its ``to_tsv()`` method. The class name and the instance are
    printed before each export, followed by an empty line.

    :param args: (argparse.Namespace) Parsed command-line arguments; only
        ``args.seed`` is read here.

    NOTE(review): the output files are presumably named ``[lang]_[dataset]``
    and written under ``data/raw``; that is decided inside each
    preprocessor's ``to_tsv()``, which is not visible from this function.
    Confirm against the preprocessor classes.
    """
    fix_seed(seed=args.seed)

    dp_raw = os.path.join("data", "raw")
    # os.mkdir would raise FileNotFoundError when the parent "data" directory
    # is missing; makedirs creates the whole chain, and exist_ok=True removes
    # the check-then-create race of a separate os.path.exists() test.
    os.makedirs(dp_raw, exist_ok=True)

    for preprocessor_cls in (
        CzechMerlinDataPreprocessor,
        EnglishWriteAndImproveDataPreprocessor,
        GermanMerlinDataPreprocessor,
        ItalianMerlinDataPreprocessor,
        PortugueseCople2DataPreprocessor,
        SpanishCedel2DataPreprocessor,
    ):
        print(preprocessor_cls.__name__)
        # Keep class and instance under separate names instead of rebinding
        # the loop variable.
        preprocessor = preprocessor_cls()
        print(preprocessor)
        preprocessor.to_tsv()
        print()
if __name__ == "__main__":
    # Command-line entry point: the only option is the integer random seed
    # (defaults to 42), which is handed to main() via the parsed namespace.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--seed", default=42, type=int, help="Random seed.")
    args = arg_parser.parse_args()
    main(args)