From 0a5d647a3066e544a29da6910d97a15a2297f54f Mon Sep 17 00:00:00 2001
From: majoe
Date: Sun, 20 Feb 2022 20:12:41 +0100
Subject: [PATCH 1/2] Use new Google Drive link for IWSLT dataset

All IWSLT datasets are now hosted on Google Drive. The language pairs are
also no longer provided as separate archives, but together in a single
archive. The IWSLT datadep was updated with the new Google Drive link, and
the post_fetch_method was adapted to extract the nested language-pair
archives.

---
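
For reference, a minimal sketch of what one registration produced by this
patch boils down to, using the Google Drive URL, checksum and nested-archive
layout from the diff below. The de-en pair and the shortened description are
illustrative only, and the real post_fetch_method additionally flattens the
extracted folder and cleans the .xml/.tags files:

    using DataDeps, Fetch

    register(DataDep(
        "IWSLT2016 de-en",
        "IWSLT 2016 TED talk translation task, from de to en.",
        "https://drive.google.com/file/d/1l5y6Giag9aRPwGtuZHswh3w5v3qEz8D8/",
        "425a3688e0faff00ed4d6d04f1664d1edbd9932e5b17a73680aa81a70f03e2d6";
        # Fetch.jl's gdownload handles the Google Drive download page,
        # which a plain HTTP fetch cannot follow.
        fetch_method = gdownload,
        post_fetch_method = fn -> begin
            # Unpack the outer archive, then the nested per-pair archive.
            unpack(fn)
            unpack(joinpath("2016-01", "texts", "de", "en", "de-en.tgz"))
        end,
    ))

With such a registration in place, datadep"IWSLT2016 de-en" resolves to the
cached directory and only triggers the Google Drive download on first use.
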
 src/datasets/translate/iwslt.jl     |   1 +
 src/datasets/translate/iwslt2016.jl | 111 ++++++++++++----------
 2 files changed, 50 insertions(+), 62 deletions(-)

diff --git a/src/datasets/translate/iwslt.jl b/src/datasets/translate/iwslt.jl
index 2f38d1ca..f9a3dd0b 100644
--- a/src/datasets/translate/iwslt.jl
+++ b/src/datasets/translate/iwslt.jl
@@ -1,4 +1,5 @@
 module IWSLT
+using Fetch
 using DataDeps
 
 using ..Datasets: Dataset
diff --git a/src/datasets/translate/iwslt2016.jl b/src/datasets/translate/iwslt2016.jl
index 355f8ce3..c44d2489 100644
--- a/src/datasets/translate/iwslt2016.jl
+++ b/src/datasets/translate/iwslt2016.jl
@@ -1,74 +1,61 @@
 using LightXML
 
 function iwslt2016_init()
-    for (lang, checksum) ∈ zip(("ar", "cs", "fr", "de"), (("0e7dd1c836f66f0e68c45c3dea6312dd6a1f2e8c93bcf03982f842644319ff4c",
-                                                           "bf10a15077b4d25cc3e8272e61429d6e85f75a0a55f180a10d01abfcdb3debc9"),
-                                                          ("d6d5a4767f6afc96c59630583d0cfe2b23ba24a48a33c16b7fdb76017ee62ab3",
-                                                           "f38dcf1407afa224324dbed02424fa2c30042d10a5cc387d02df1369eec3f68e"),
-                                                          ("132bc5524c1f7500aadb84a4e05a0c3dd15cc5b527d4d4af402fd98582299231",
-                                                           "b70aca9675966fcbdfb8349086848638f6711e47c669f5654971859d10266398"),
-                                                          ("7e21dd345e9192180f36d7816f84a77eafd6b85e45432d90d0970f06e8c772ea",
-                                                           "13c037b8a5dce7fb6199eeedc6b0460c0c75082db8eeda21902acb373ba9ba14")))
+
+    # Helper function for extracting nested archives
+    function extract_lang_archive(src, dst, fn)
+        # Unpack outer archive and construct path to the archive for the language pair
+        unpack(fn)
+        archivename = "2016-01"
+        innerdir = "$(src)-$(dst)"
+        langarchive = joinpath(archivename, "texts", "$(src)", "$(dst)", innerdir * ".tgz")
+
+        # Unpack language pair archive
+        unpack(langarchive)
+        innerfiles = readdir(innerdir)
+        mv.(joinpath.(innerdir, innerfiles), innerfiles)
+
+        for f ∈ innerfiles
+            if occursin(".xml", f)
+                clean_xml(f)
+            elseif occursin(".tags", f)
+                clean_tag(f)
+            end
+        end
+        rm(innerdir)
+        rm(archivename; recursive=true)
+    end
+
+    archivehash = "425a3688e0faff00ed4d6d04f1664d1edbd9932e5b17a73680aa81a70f03e2d6"
+
+    message = (src, dst) -> """
+    The IWSLT 2016 TED talk translation task
+
+    These are the data sets for the MT tasks of the evaluation campaigns of IWSLT. They are parallel data sets used for building and testing MT systems. They are publicly available through the WIT3 website wit3.fbk.eu, see release: 2016-01.
+
+    Data are crawled from the TED website and carry the respective licensing conditions (for training, tuning and testing MT systems).
+    Approximately, for each language pair, training sets include 2,000 talks, 200K sentences and 4M tokens per side, while each dev and test sets 10-15 talks, 1.0K-1.5K sentences and 20K-30K tokens per side. In each edition, the training sets of previous editions are re-used and updated with new talks added to the TED repository in the meanwhile.
+
+    from $(src) to $(dst)
+    """
+
+    for lang ∈ ("ar", "cs", "fr", "de")
         register(DataDep(
             "IWSLT2016 $(lang)-en",
-            """
-            The IWSLT 2016 TED talk translation task
-
-            These are the data sets for the MT tasks of the evaluation campaigns of IWSLT. They are parallel data sets used for building and testing MT systems. They are publicly available through the WIT3 website wit3.fbk.eu, see release: 2016-01.
-
-            Data are crawled from the TED website and carry the respective licensing conditions (for training, tuning and testing MT systems).
-            Approximately, for each language pair, training sets include 2,000 talks, 200K sentences and 4M tokens per side, while each dev and test sets 10-15 talks, 1.0K-1.5K sentences and 20K-30K tokens per side. In each edition, the training sets of previous editions are re-used and updated with new talks added to the TED repository in the meanwhile.
-
-            from $(lang) to en
-            """,
-            "https://wit3.fbk.eu/archive/2016-01//texts/$lang/en/$(lang)-en.tgz",
-            checksum[1];
-            post_fetch_method = fn -> begin
-                unpack(fn)
-                innerdir = "$(lang)-en"
-                innerfiles = readdir(innerdir)
-                mv.(joinpath.(innerdir, innerfiles), innerfiles)
-
-                for f ∈ innerfiles
-                    if occursin(".xml", f)
-                        clean_xml(f)
-                    elseif occursin(".tags", f)
-                        clean_tag(f)
-                    end
-                end
-                rm(innerdir)
-            end
+            message(lang, "en"),
+            "https://drive.google.com/file/d/1l5y6Giag9aRPwGtuZHswh3w5v3qEz8D8/",
+            archivehash;
+            fetch_method=gdownload,
+            post_fetch_method = fn -> extract_lang_archive(lang, "en", fn)
         ))
 
         register(DataDep(
             "IWSLT2016 en-$(lang)",
-            """
-            The IWSLT 2016 TED talk translation task
-
-            These are the data sets for the MT tasks of the evaluation campaigns of IWSLT. They are parallel data sets used for building and testing MT systems. They are publicly available through the WIT3 website wit3.fbk.eu, see release: 2016-01.
-
-            Data are crawled from the TED website and carry the respective licensing conditions (for training, tuning and testing MT systems).
-            Approximately, for each language pair, training sets include 2,000 talks, 200K sentences and 4M tokens per side, while each dev and test sets 10-15 talks, 1.0K-1.5K sentences and 20K-30K tokens per side. In each edition, the training sets of previous editions are re-used and updated with new talks added to the TED repository in the meanwhile.
-
-            from en to $(lang)
-            """,
-            "https://wit3.fbk.eu/archive/2016-01//texts/en/$lang/en-$(lang).tgz",
-            checksum[2];
-            post_fetch_method = fn -> begin
-                unpack(fn)
-                innerdir = "en-$(lang)"
-                innerfiles = readdir(innerdir)
-                mv.(joinpath.(innerdir, innerfiles), innerfiles)
-
-                for f ∈ innerfiles
-                    if occursin(".xml", f)
-                        clean_xml(f)
-                    elseif occursin(".tag", f)
-                        clean_tag(f)
-                    end
-                end
-                rm(innerdir)
-            end
+            message("en", lang),
+            "https://drive.google.com/file/d/1l5y6Giag9aRPwGtuZHswh3w5v3qEz8D8/",
+            archivehash;
+            fetch_method=gdownload,
+            post_fetch_method = fn -> extract_lang_archive("en", lang, fn)
         ))
     end
 end

From 33eeeadd1791a87056dd3fda4dbe61e5e6d31cec Mon Sep 17 00:00:00 2001
From: Markus Jörg
Date: Mon, 21 Feb 2022 08:54:08 +0100
Subject: [PATCH 2/2] Fix wrong error message in IWSLT tunefile

When the requested file was not found in tunefile, an error was thrown
which should have included the list of available files in the datadep.
The wrong datadep string was used here, which triggered the download of
the fr-en language pair.

---
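
For reference, a minimal sketch of the behaviour being fixed, assuming the
IWSLT2016 datadeps from the previous patch are registered; the value of p is
illustrative:

    using DataDeps

    p = "de-en"  # the pair the user actually requested

    # Before: the pair name was baked into the string literal, so building
    # the error message always resolved (and, on first use, downloaded)
    # fr-en, regardless of the requested pair.
    readdir(datadep"IWSLT2016 fr-en/")

    # After: the datadep name is built from p at run time, matching how the
    # rest of tunefile resolves paths (e.g. @datadep_str "IWSLT2016 $p/$srcf").
    readdir(@datadep_str "IWSLT2016 $p/")
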
 src/datasets/translate/iwslt2016.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/translate/iwslt2016.jl b/src/datasets/translate/iwslt2016.jl
index c44d2489..36fb4da7 100644
--- a/src/datasets/translate/iwslt2016.jl
+++ b/src/datasets/translate/iwslt2016.jl
@@ -116,7 +116,7 @@ function tunefile(iw::IWSLT2016, dev, year; tedx = false)
     !(srcf ∈ readdir(@datadep_str "IWSLT2016 $p")) &&
         error("""no such file: $srcf,
                only have the following:
-               $(join(filter(x->occursin(".en.txt", x), readdir(datadep"IWSLT2016 fr-en/")), "\n"))""")
+               $(join(filter(x->occursin(".$(iw.src).txt", x), readdir(datadep"IWSLT2016 $p/")), "\n"))""")
 
     src = @datadep_str "IWSLT2016 $p/$srcf"
     ref = @datadep_str "IWSLT2016 $p/$reff"