From 5540af27edc156b88eb4e76ba3213b2bd9982a94 Mon Sep 17 00:00:00 2001 From: The TensorFlow Datasets Authors Date: Tue, 7 Jun 2022 12:58:17 -0700 Subject: [PATCH] Adding val and test splits to the WIT dataset. PiperOrigin-RevId: 453504846 --- .../vision_language/wit/checksums.tsv | 10 ++++++ .../wit/dummy_data/test-00000-of-00005.tsv | 2 ++ .../wit/dummy_data/val-00000-of-00005.tsv | 3 ++ .../vision_language/wit/wit.py | 31 +++++++++++++------ .../vision_language/wit/wit_test.py | 8 ++++- 5 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 tensorflow_datasets/vision_language/wit/dummy_data/test-00000-of-00005.tsv create mode 100644 tensorflow_datasets/vision_language/wit/dummy_data/val-00000-of-00005.tsv diff --git a/tensorflow_datasets/vision_language/wit/checksums.tsv b/tensorflow_datasets/vision_language/wit/checksums.tsv index bd00515ab9c..36f4708b77c 100644 --- a/tensorflow_datasets/vision_language/wit/checksums.tsv +++ b/tensorflow_datasets/vision_language/wit/checksums.tsv @@ -1,3 +1,8 @@ +https://storage.googleapis.com/gresearch/wit/wit_v1.test.all-00000-of-00005.tsv.gz 32291578 aceef0d6624cfcab9d3fb7bc4c8a18424e6052b906fdacb3dc29936ff8eb1b4e wit_v1.test.all-00000-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.test.all-00001-of-00005.tsv.gz 32153532 055acbaf68e516d8f5e5647280bfb50764f606a1daa2d39a3a2a1849455b0b84 wit_v1.test.all-00001-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.test.all-00002-of-00005.tsv.gz 31963406 df0ebb07ffacf80c51d01729b17ef4e826a6fb15895b71fd8478d358fb66e711 wit_v1.test.all-00002-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.test.all-00003-of-00005.tsv.gz 32038958 92073ea8905c1248046f9cb1feca5d7165c6eee4f7240027da08cb7609416b48 wit_v1.test.all-00003-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.test.all-00004-of-00005.tsv.gz 32133564 1286e1c47e9c65fc7df5e316a4ad2a19c05ecc2433c13026f5642aed0fad4069 wit_v1.test.all-00004-of-00005.tsv.gz https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00000-of-00010.tsv.gz 2672819495 1fdd379b55e559fa6d0884aa3c57066bb1f206b183b5b4ce6a8128f486f2e8b3 wit_v1.train.all-00000-of-00010.tsv.gz https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00001-of-00010.tsv.gz 2667931762 2fb22ceab0cd33168367fd6d268c8d803982cfe924b5d01cdf43457c32591f27 wit_v1.train.all-00001-of-00010.tsv.gz https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00002-of-00010.tsv.gz 2669251466 316fd4471585df14c33425a199e1fbb843ea8ae1f42b2800ddea1d959d403dcd wit_v1.train.all-00002-of-00010.tsv.gz @@ -8,3 +13,8 @@ https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00006-of-00010.tsv https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00007-of-00010.tsv.gz 2669891774 b3728292b163f98858ff4c1f9f619259376ed063f9f36f0bdd66169982c40187 wit_v1.train.all-00007-of-00010.tsv.gz https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00008-of-00010.tsv.gz 2669091199 6104064c2981696c2a91f1aa35d952aae488ea07c2263cdc0c66b202cdb43170 wit_v1.train.all-00008-of-00010.tsv.gz https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-00009-of-00010.tsv.gz 2670659115 3388614d12905c9a1ddb4c27445899a1f73ed75d12eaa36dd112e481330ecfaa wit_v1.train.all-00009-of-00010.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.val.all-00000-of-00005.tsv.gz 40332966 f625ae640cbe2c56a8aabf95c54812d7422741056542f6dd2c695ff871cce4e5 wit_v1.val.all-00000-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.val.all-00001-of-00005.tsv.gz 40092981 cb5b1db242272bfe7e2d7f4647a555c51c2f5693c30c754de29f909f10cd752b wit_v1.val.all-00001-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.val.all-00002-of-00005.tsv.gz 39643004 dde515eec141e5ecd5f50d95b98343584acc4b197ec7f6931806a2c71021820d wit_v1.val.all-00002-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.val.all-00003-of-00005.tsv.gz 39958628 a8dd52a8473fce19ce3bae4a47e4a47dd2a1195bf7013a2d43e502dc21801846 wit_v1.val.all-00003-of-00005.tsv.gz +https://storage.googleapis.com/gresearch/wit/wit_v1.val.all-00004-of-00005.tsv.gz 39943476 57d0ee8b13fb5ff2bdea220db124e76c3264b3970770f0ed47866b0e3dc516d2 wit_v1.val.all-00004-of-00005.tsv.gz diff --git a/tensorflow_datasets/vision_language/wit/dummy_data/test-00000-of-00005.tsv b/tensorflow_datasets/vision_language/wit/dummy_data/test-00000-of-00005.tsv new file mode 100644 index 00000000000..3a2628809bd --- /dev/null +++ b/tensorflow_datasets/vision_language/wit/dummy_data/test-00000-of-00005.tsv @@ -0,0 +1,2 @@ +language page_url image_url page_title section_title hierarchical_section_title caption_reference_description caption_attribution_description caption_alt_text_description mime_type original_height original_width is_main_image attribution_passes_lang_id page_changed_recently context_page_description context_section_description +it https://it.wikipedia.org/wiki/Nube_dei_Cani_da_Caccia_I https://upload.wikimedia.org/wikipedia/commons/1/15/M94_group.gif Nube dei Cani da Caccia I Nube dei Cani da Caccia I Mappa del Canes Venatici I Cloud Italiano: Gruppo di M94 image/gif 600 640 false true false "La Nube dei Cani da Caccia I, un'ampia struttura costituita sostanzialmente da una nube di galassie, un amorfo filamento della lunghezza di oltre 10 Megaparsec. diff --git a/tensorflow_datasets/vision_language/wit/dummy_data/val-00000-of-00005.tsv b/tensorflow_datasets/vision_language/wit/dummy_data/val-00000-of-00005.tsv new file mode 100644 index 00000000000..11220daed53 --- /dev/null +++ b/tensorflow_datasets/vision_language/wit/dummy_data/val-00000-of-00005.tsv @@ -0,0 +1,3 @@ +language page_url image_url page_title section_title hierarchical_section_title caption_reference_description caption_attribution_description caption_alt_text_description mime_type original_height original_width is_main_image attribution_passes_lang_id page_changed_recently context_page_description context_section_description +nl https://nl.wikipedia.org/wiki/Vernajoul http://upload.wikimedia.org/wikipedia/commons/8/84/Map_commune_FR_insee_code_09329.png Vernajoul Geografie Vernajoul / Geografie Français : Carte des communes françaises: Vernajoul English: Map commune FR insee code 09329.png Detailkaart van de gemeente image/png 605 756 false false false Vernajoul is een gemeente in het Franse departement Ariège en telt 661 inwoners. De plaats maakt deel uit van het arrondissement Foix. "De oppervlakte van Vernajoul bedraagt 9,0 km², de bevolkingsdichtheid is 73,4 inwoners per km². +De onderstaande kaart toont de ligging van Vernajoul met de belangrijkste infrastructuur en aangrenzende gemeenten." diff --git a/tensorflow_datasets/vision_language/wit/wit.py b/tensorflow_datasets/vision_language/wit/wit.py index 7c1b79d3cab..cb8b6a49a06 100644 --- a/tensorflow_datasets/vision_language/wit/wit.py +++ b/tensorflow_datasets/vision_language/wit/wit.py @@ -43,10 +43,11 @@ class Wit(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for wit dataset.""" - VERSION = tfds.core.Version("1.0.0") + VERSION = tfds.core.Version("1.1.0") RELEASE_NOTES = { "1.0.0": "Initial release. It loads the WIT dataset from " - "https://storage.googleapis.com/gresearch/wit/" + "https://storage.googleapis.com/gresearch/wit/", + "1.1.0": "Added `val` and `test` splits." } def _info(self) -> tfds.core.DatasetInfo: @@ -81,19 +82,32 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" wit_homepage = "https://storage.googleapis.com/gresearch/wit/" - wit_urls_to_download = [ + wit_train_urls_to_download = [ os.path.join(wit_homepage, f"wit_v1.train.all-0000{i}-of-00010.tsv.gz") for i in range(10) ] + wit_val_urls_to_download = [ + os.path.join(wit_homepage, f"wit_v1.val.all-0000{i}-of-00005.tsv.gz") + for i in range(5) + ] + wit_test_urls_to_download = [ + os.path.join(wit_homepage, f"wit_v1.test.all-0000{i}-of-00005.tsv.gz") + for i in range(5) + ] - path = dl_manager.download_and_extract({"wit": wit_urls_to_download}) + paths_per_split = dl_manager.download_and_extract({ + "train": wit_train_urls_to_download, + "val": wit_val_urls_to_download, + "test": wit_test_urls_to_download + }) - # The WIT dataset does not contain any validation or test split. return { - "train": self._generate_examples(path), + "train": self._generate_examples(paths_per_split["train"]), + "val": self._generate_examples(paths_per_split["val"]), + "test": self._generate_examples(paths_per_split["test"]), } - def _generate_examples(self, path): + def _generate_examples(self, filepaths): """Yields examples.""" beam = tfds.core.lazy_imports.apache_beam @@ -148,7 +162,6 @@ def _read_rows(filename): for i, row in enumerate(csv_reader): yield filename, i, row - wit_filepaths = path["wit"] - return (beam.Create(wit_filepaths) + return (beam.Create(filepaths) | beam.FlatMap(_read_rows) | beam.Map(_process_example)) diff --git a/tensorflow_datasets/vision_language/wit/wit_test.py b/tensorflow_datasets/vision_language/wit/wit_test.py index 8bfd319d8ae..1c5708114d5 100644 --- a/tensorflow_datasets/vision_language/wit/wit_test.py +++ b/tensorflow_datasets/vision_language/wit/wit_test.py @@ -24,9 +24,15 @@ class WitTest(tfds.testing.DatasetBuilderTestCase): DATASET_CLASS = wit.Wit SPLITS = { 'train': 3, + 'val': 1, + 'test': 1, } - DL_EXTRACT_RESULT = {'wit': ['train-00000-of-00009.tsv']} + DL_EXTRACT_RESULT = { + 'train': ['train-00000-of-00009.tsv'], + 'val': ['val-00000-of-00005.tsv'], + 'test': ['test-00000-of-00005.tsv'] + } if __name__ == '__main__':