From 596ee153016a39fad2a1cb187b584e7263eb4e8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:26:57 +0200 Subject: [PATCH] Create rule S6983 (#3973) * Create rule S6983 * Address review comments * Review comments 2 * Add tags --------- Co-authored-by: ghislainpiot Co-authored-by: Ghislain Piot --- rules/S6983/metadata.json | 2 + rules/S6983/python/metadata.json | 25 ++++++++++++ rules/S6983/python/rule.adoc | 70 ++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 rules/S6983/metadata.json create mode 100644 rules/S6983/python/metadata.json create mode 100644 rules/S6983/python/rule.adoc diff --git a/rules/S6983/metadata.json b/rules/S6983/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S6983/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S6983/python/metadata.json b/rules/S6983/python/metadata.json new file mode 100644 index 00000000000..10e9be93bd2 --- /dev/null +++ b/rules/S6983/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "The \"num_workers\" parameter should be specified for \"torch.utils.data.DataLoader\"", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "2min" + }, + "tags": [ + "pytorch", + "machine-learning" + ], + "defaultSeverity": "Minor", + "ruleSpecification": "RSPEC-6983", + "sqKey": "S6983", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "targeted", + "code": { + "impacts": { + "RELIABILITY": "LOW" + }, + "attribute": "COMPLETE" + } +} diff --git a/rules/S6983/python/rule.adoc b/rules/S6983/python/rule.adoc new file mode 100644 index 00000000000..83c001778d5 --- /dev/null +++ b/rules/S6983/python/rule.adoc @@ -0,0 +1,70 @@ +This rule raises an issue when a `torch.utils.data.Dataloader` is instantiated without specifying the `num_workers` parameter. + +== Why is this an issue? + +In the PyTorch library, the data loaders are used to provide an interface where common operations such as batching can be implemented. +It is also possible to parallelize the data loading process by using multiple worker processes. +This can improve performance by increasing the number of batches being fetched in parallel, at the cost of higher memory usage. +This performance increase can also be attributed to avoiding the Global Interpreter Lock (GIL) in the Python interpreter. + + +== How to fix it +Specify the `num_workers` parameter when instantiating the `torch.utils.data.Dataloader` object. + +The default value of `0` will use the main process to load the data, and might be faster for small datasets that can fit completely in memory. + +For larger datasets, it is recommended to use a value of `1` or higher to parallelize the data loading process. + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +from torch.utils.data import DataLoader +from torchvision import datasets +from torchvision.transforms import ToTensor + +train_dataset = datasets.MNIST(root='data', train=True, transform=ToTensor()) +train_data_loader = DataLoader(train_dataset, batch_size=32)# Noncompliant: the num_workers parameter is not specified +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +from torch.utils.data import DataLoader +from torchvision import datasets +from torchvision.transforms import ToTensor + +train_dataset = datasets.MNIST(root='data', train=True, transform=ToTensor()) +train_data_loader = DataLoader(train_dataset, batch_size=32, num_workers=4) +---- + +== Resources +=== Documentation + +* PyTorch documentation - https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading[Single- and Multi-process Data Loading] + +* PyTorch documentation - https://pytorch.org/tutorials/beginner/basics/data_tutorial.html[Datasets and DataLoaders] + +ifdef::env-github,rspecator-view[] + +(visible only on this page) + +== Implementation specification + + +=== Message + +Primary : Specify the `num_workers` parameter. + +=== Issue location + +Primary : Name of the instantiation + +=== Quickfix + +Fill in with the default parameter + +endif::env-github,rspecator-view[] \ No newline at end of file