# europarl.py
########################################################################
#
# Functions for downloading the Europarl data-set from the internet
# and loading it into memory. This data-set is used for translation
# between English and most European languages.
#
# http://www.statmt.org/europarl/
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the variable data_dir with the desired storage directory.
# 2) Determine the language-code to use e.g. "da" for Danish.
# 3) Call maybe_download_and_extract() to download the data-set
# if it is not already located in the given data_dir.
# 4) Call load_data(english=True) and load_data(english=False),
# passing the same language_code to both, to load the two data-files.
# 5) Use the returned data in your own program.
#
# Format:
# The Europarl data-set contains millions of text-pairs between English
# and most European languages. The data is stored in two text-files.
# The data is returned as lists of strings by the load_data() function.
#
# The currently supported languages and their codes are as follows:
#
# bg - Bulgarian
# cs - Czech
# da - Danish
# de - German
# el - Greek
# es - Spanish
# et - Estonian
# fi - Finnish
# fr - French
# hu - Hungarian
# it - Italian
# lt - Lithuanian
# lv - Latvian
# nl - Dutch
# pl - Polish
# pt - Portuguese
# ro - Romanian
# sk - Slovak
# sl - Slovene
# sv - Swedish
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################
import os
import download
########################################################################
# Directory that the data-set is downloaded to and loaded from.
# Set this before you start calling any of the functions below,
# because they read this variable each time they are called.
data_dir = "data/europarl/"
# Base-URL for the data-sets on the internet. The archive-name
# "<language_code>-en.tgz" is appended to this to form the download-URL.
data_url = "http://www.statmt.org/europarl/v7/"
########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.
def maybe_download_and_extract(language_code="da"):
    """
    Download and extract the Europarl data-set if the data-file doesn't
    already exist in data_dir. The data-set is for translating between
    English and the given language-code (e.g. 'da' for Danish, see the
    list of available language-codes above).

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.
        See list of available codes above.

    :return:
        Nothing.
    """

    # Create the full URL for the file with this data-set. With the
    # default data_url and language_code this is:
    # http://www.statmt.org/europarl/v7/da-en.tgz
    url = data_url + language_code + "-en.tgz"

    # Hand the URL and the target directory to the download-module,
    # which performs the actual download and extraction.
    download.maybe_download_and_extract(url=url, download_dir=data_dir)
def load_data(english=True, language_code="da", start="", end=""):
    """
    Load the data-file for either the English-language texts or
    for the other language (e.g. "da" for Danish).

    All lines of the data-file are returned as a list of strings.
    The file is looked up in the directory given by the module-level
    variable data_dir and is decoded as UTF-8.

    :param english:
        Boolean whether to load the data-file for
        English (True) or the other language (False).

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.
        See list of available codes above.

    :param start:
        Prepend each line with this text e.g. "ssss " to indicate start of line.

    :param end:
        Append each line with this text e.g. " eeee" to indicate end of line.

    :return:
        List of strings with all the lines of the data-file.
        Leading and trailing whitespace (including the newline) is
        stripped from each line before start and end are added.
    """

    if english:
        # Load the English data.
        filename = "europarl-v7.{0}-en.en".format(language_code)
    else:
        # Load the other language.
        filename = "europarl-v7.{0}-en.{0}".format(language_code)

    # Full path for the data-file.
    path = os.path.join(data_dir, filename)

    # Open and read all the contents of the data-file.
    with open(path, encoding="utf-8") as file:
        # For each line in the file: strip leading and trailing
        # whitespace, prepend the start-text and append the end-text.
        texts = [start + line.strip() + end for line in file]

    return texts
########################################################################