Commit

style
lhoestq committed Nov 28, 2024
1 parent 5888fe0 commit fae39eb
Showing 5 changed files with 46 additions and 31 deletions.
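All five diffs apply the same mechanical change: multi-item `with` statements that used to break lines inside a context manager's call parentheses are rewritten as parenthesized context-manager lists, one manager per line, with a trailing comma. This is the form recent versions of black emit; the syntax was formalized in Python 3.10, though CPython 3.9's PEG parser already accepts it. A minimal sketch of the before/after, with illustrative file names rather than code from this commit:

    # before: the line break falls inside a call's parentheses
    with open("a.txt") as fa, open(
        "b.txt"
    ) as fb:
        ...

    # after: one pair of parentheses wraps the whole manager list
    with (
        open("a.txt") as fa,
        open("b.txt") as fb,
    ):
        ...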
8 changes: 5 additions & 3 deletions src/datasets/load.py
@@ -242,9 +242,11 @@ def __reduce__(self):  # to make dynamically created class pickable, see _Initia
 def get_dataset_builder_class(
     dataset_module: "DatasetModule", dataset_name: Optional[str] = None
 ) -> Type[DatasetBuilder]:
-    with lock_importable_file(
-        dataset_module.importable_file_path
-    ) if dataset_module.importable_file_path else nullcontext():
+    with (
+        lock_importable_file(dataset_module.importable_file_path)
+        if dataset_module.importable_file_path
+        else nullcontext()
+    ):
         builder_cls = import_main_class(dataset_module.module_path)
         if dataset_module.builder_configs_parameters.builder_configs:
             dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
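Beyond the reformatting, the hunk above shows a handy pattern: `contextlib.nullcontext()` serves as a no-op manager, so the same `with` block runs whether or not there is a file to lock. A minimal sketch of the idea; `acquire_lock` below is a hypothetical stand-in, not the helper from `datasets`:

    from contextlib import nullcontext
    from filelock import FileLock

    def acquire_lock(path):  # hypothetical stand-in for lock_importable_file
        return FileLock(path + ".lock")

    def do_import(importable_file_path=None):
        # Lock only when a path is given; otherwise enter a do-nothing context.
        with acquire_lock(importable_file_path) if importable_file_path else nullcontext():
            ...  # the real code imports the builder class here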
41 changes: 24 additions & 17 deletions tests/test_arrow_dataset.py
@@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names

             self.assertIsNotNone(dset[0])
@@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names

             self.assertIsNotNone(dset[0])
@@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset,
+        ):
             self.assertDictEqual(
                 dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
             )
@@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self):
         info1 = DatasetInfo(description="Dataset1")
         info2 = DatasetInfo(description="Dataset2")
         with tempfile.TemporaryDirectory() as tmp_dir:
-            with Dataset.from_dict(data1, info=info1).map(
-                cache_file_name=os.path.join(tmp_dir, "d1.arrow")
-            ) as dset1, Dataset.from_dict(data2, info=info2).map(
-                cache_file_name=os.path.join(tmp_dir, "d2.arrow")
-            ) as dset2, Dataset.from_dict(data3) as dset3:
+            with (
+                Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1,
+                Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2,
+                Dataset.from_dict(data3) as dset3,
+            ):
                 with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
                     self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
                     self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
@@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path):
 )
 def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):
     method, args, kwargs = method_and_params
-    with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file(
-        arrow_file, in_memory=in_memory
-    ) as reference_dataset:
+    with (
+        Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,
+        Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,
+    ):
         out = getattr(dataset, method)(*args, **kwargs)
         dataset = out if out is not None else dataset
         pickled_dataset = pickle.dumps(dataset)
7 changes: 4 additions & 3 deletions tests/test_py_utils.py
@@ -116,9 +116,10 @@ class Foo:
     ],
 )
 def test_map_nested_num_proc(iterable_length, num_proc, expected_num_proc):
-    with patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, patch(
-        "datasets.parallel.parallel.Pool"
-    ) as mock_multiprocessing_pool:
+    with (
+        patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested,
+        patch("datasets.parallel.parallel.Pool") as mock_multiprocessing_pool,
+    ):
         data_struct = {f"{i}": i for i in range(iterable_length)}
         _ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16)
         if expected_num_proc == 1:
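The test hunks stack several `unittest.mock.patch` context managers, and the parenthesized form gives each patch its own line. A small self-contained sketch of the same technique; the patch targets here are standard-library names chosen for illustration:

    import os
    from unittest.mock import patch

    with (
        patch("os.path.exists") as mock_exists,
        patch("os.remove") as mock_remove,
    ):
        mock_exists.return_value = True
        assert os.path.exists("missing.txt")  # answered by the mock
        mock_exists.assert_called_once_with("missing.txt")
        mock_remove.assert_not_called()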
16 changes: 10 additions & 6 deletions tests/test_search.py
@@ -88,9 +88,11 @@ def test_add_elasticsearch_index(self):
         from elasticsearch import Elasticsearch

         dset: Dataset = self._create_dummy_dataset()
-        with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
-            "elasticsearch.client.IndicesClient.create"
-        ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
+        with (
+            patch("elasticsearch.Elasticsearch.search") as mocked_search,
+            patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
+            patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
+        ):
             mocked_index_create.return_value = {"acknowledged": True}
             mocked_bulk.return_value([(True, None)] * 30)
             mocked_search.return_value = {"hits": {"hits": [{"_score": 1, "_id": 29}]}}
@@ -198,9 +200,11 @@ class ElasticSearchIndexTest(TestCase):
     def test_elasticsearch(self):
         from elasticsearch import Elasticsearch

-        with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
-            "elasticsearch.client.IndicesClient.create"
-        ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
+        with (
+            patch("elasticsearch.Elasticsearch.search") as mocked_search,
+            patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
+            patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
+        ):
             es_client = Elasticsearch()
             mocked_index_create.return_value = {"acknowledged": True}
             index = ElasticSearchIndex(es_client=es_client)
5 changes: 3 additions & 2 deletions tests/test_upstream_hub.py
@@ -242,8 +242,9 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
         with temporary_repo() as ds_name:
             self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
             num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
-            with patch("datasets.config.MAX_SHARD_SIZE", "16KB"), patch(
-                "datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1
+            with (
+                patch("datasets.config.MAX_SHARD_SIZE", "16KB"),
+                patch("datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1),
             ):
                 local_ds.push_to_hub(ds_name, token=self._token)
                 hub_ds = load_dataset(ds_name, download_mode="force_redownload")
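Note the two-argument form of `patch` in this last hunk: it replaces a module attribute with a given value instead of a `MagicMock`, and restores it on exit. A minimal sketch of the same idea; the `Config` class is a hypothetical stand-in for `datasets.config`:

    from unittest.mock import patch

    class Config:  # hypothetical stand-in for a config module
        MAX_SHARD_SIZE = "500MB"

    with patch.object(Config, "MAX_SHARD_SIZE", "16KB"):
        assert Config.MAX_SHARD_SIZE == "16KB"  # swapped inside the block
    assert Config.MAX_SHARD_SIZE == "500MB"  # restored afterwards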
