[Feature] Seed workers in TensorDict.map #562

Merged: 43 commits (seeding-pool into main) on Nov 24, 2023

Commits (43):
6bd248d  init (vmoens, Nov 21, 2023)
237dddf  init (vmoens, Nov 21, 2023)
2216e4e  amend (vmoens, Nov 21, 2023)
b607c0e  edit docstring (vmoens, Nov 21, 2023)
aa42433  maxtasksperchild (vmoens, Nov 22, 2023)
152612c  fix (vmoens, Nov 22, 2023)
8ca93b6  TestMap.fn (vmoens, Nov 22, 2023)
7c63518  timeout in get (vmoens, Nov 22, 2023)
01b9ee0  processes=num_workers (vmoens, Nov 22, 2023)
9ca19c7  amend (vmoens, Nov 22, 2023)
80ee04f  amend (vmoens, Nov 22, 2023)
2a4658e  amend (vmoens, Nov 22, 2023)
9bf4f48  amend (vmoens, Nov 22, 2023)
82b668e  amend (vmoens, Nov 22, 2023)
47f4bf7  amend (vmoens, Nov 22, 2023)
c8006d2  amend (vmoens, Nov 22, 2023)
e42f403  amend (vmoens, Nov 22, 2023)
988feda  amend (vmoens, Nov 22, 2023)
f6fdbc6  amend (vmoens, Nov 22, 2023)
b48db6f  amend (vmoens, Nov 22, 2023)
11ff0a0  amend (vmoens, Nov 22, 2023)
1c70cc4  amend (vmoens, Nov 22, 2023)
8723bde  amend (vmoens, Nov 22, 2023)
28921aa  amend (vmoens, Nov 22, 2023)
7548cd2  amend (vmoens, Nov 22, 2023)
bb168e2  amend (vmoens, Nov 22, 2023)
975f85b  amend (vmoens, Nov 22, 2023)
14666e6  amend (vmoens, Nov 22, 2023)
2b490ff  amend (vmoens, Nov 22, 2023)
70cc6ee  amend (vmoens, Nov 22, 2023)
4f5737e  amend (vmoens, Nov 22, 2023)
838ed0e  amend (vmoens, Nov 22, 2023)
3c4d96a  amend (vmoens, Nov 22, 2023)
0d336e1  amend (vmoens, Nov 22, 2023)
b92cea6  amend (vmoens, Nov 22, 2023)
4435f38  amend (vmoens, Nov 22, 2023)
1a2e65e  amend (vmoens, Nov 22, 2023)
4edd592  amend (vmoens, Nov 22, 2023)
5223dd1  amend (vmoens, Nov 23, 2023)
3272e80  Merge remote-tracking branch 'origin/main' into seeding-pool (vmoens, Nov 23, 2023)
2fa557f  amend (vmoens, Nov 23, 2023)
401f055  amend (vmoens, Nov 23, 2023)
a70e95e  fix (vmoens, Nov 24, 2023)
tensordict/base.py (52 additions, 9 deletions)
@@ -33,6 +33,7 @@
     _GENERIC_NESTED_ERR,
     _is_tensorclass,
     _KEY_ERROR,
+    _proc_init,
     _shape,
     _split_tensordict,
     _td_fields,
@@ -2910,10 +2911,12 @@ def map(
         self,
         fn: Callable,
         dim: int = 0,
-        num_workers: int = None,
-        chunksize: int = None,
-        num_chunks: int = None,
-        pool: mp.Pool = None,
+        num_workers: int | None = None,
+        chunksize: int | None = None,
+        num_chunks: int | None = None,
+        pool: mp.Pool | None = None,
+        generator: torch.Generator | None = None,
+        max_tasks_per_child: int | None = None,
     ):
         """Maps a function to splits of the tensordict across one dimension.

@@ -2938,16 +2941,42 @@
             of workers. For very large tensordicts, such large chunks
             may not fit in memory for the operation to be done and
             more chunks may be needed to make the operation practically
-            doable. This argument is exclusive with num_chunks.
+            doable. This argument is exclusive with ``num_chunks``.
         num_chunks (int, optional): the number of chunks to split the tensordict
             into. If none is provided, the number of chunks will equate the number
             of workers. For very large tensordicts, such large chunks
             may not fit in memory for the operation to be done and
             more chunks may be needed to make the operation practically
-            doable. This argument is exclusive with chunksize.
+            doable. This argument is exclusive with ``chunksize``.
         pool (mp.Pool, optional): a multiprocess Pool instance to use
             to execute the job. If none is provided, a pool will be created
             within the ``map`` method.
+        generator (torch.Generator, optional): a generator to use for seeding.
+            A base seed will be generated from it, and each worker
+            of the pool will be seeded with that base seed incremented
+            by a unique integer from ``0`` to ``num_workers - 1``. If no
+            generator is provided, a random integer will be used as the seed.
+            To work with unseeded workers, a pool should be created separately
+            and passed to :meth:`map` directly.
+
+            .. note::
+                Caution should be taken when providing a low-valued seed, as
+                this can cause autocorrelation between experiments. For example,
+                if 8 workers are requested and the seed is 4, the worker seeds
+                will range from 4 to 11; if the seed is 5, they will range
+                from 5 to 12. These two experiments share 7 of their 8 seeds,
+                which can have unexpected effects on the results.
+
+            .. note::
+                The goal of seeding the workers is to give each worker an
+                independent seed, NOT to make results reproducible across calls
+                of the `map` method. In other words, two experiments may and
+                probably will return different results, as it is impossible to
+                know which worker will pick which job. However, we can make sure
+                that each worker has a different seed and that the pseudo-random
+                operations on each will be uncorrelated.
+        max_tasks_per_child (int, optional): the maximum number of jobs picked
+            by every child process. Defaults to ``None``, i.e., no restriction
+            on the number of jobs.

         Examples:
             >>> import torch

@@ -2976,7 +3005,21 @@
         if pool is None:
             if num_workers is None:
                 num_workers = mp.cpu_count()  # Get the number of CPU cores
-            with mp.Pool(num_workers) as pool:
+            if generator is None:
+                generator = torch.Generator()
+            seed = (
+                torch.empty((), dtype=torch.int64).random_(generator=generator).item()
+            )
+
+            queue = mp.Queue(maxsize=num_workers)
+            for i in range(num_workers):
+                queue.put(i)
+            with mp.Pool(
+                processes=num_workers,
+                initializer=_proc_init,
+                initargs=(seed, queue),
+                maxtasksperchild=max_tasks_per_child,
+            ) as pool:
                 return self.map(
                     fn, dim=dim, chunksize=chunksize, num_chunks=num_chunks, pool=pool
                 )
@@ -2989,8 +3032,8 @@

         self_split = _split_tensordict(self, chunksize, num_chunks, num_workers, dim)
         chunksize = 1
-        out = pool.imap(fn, self_split, chunksize)
-        out = torch.cat(list(out), dim)
+        imap = pool.imap(fn, self_split, chunksize)
+        out = torch.cat(list(imap), dim)
         return out

     # Functorch compatibility
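For orientation, here is a minimal sketch of how the new `generator` argument might be exercised from user code. The `add_noise` function and the toy tensordict are illustrative assumptions, not part of this diff:

```python
import torch
from tensordict import TensorDict

def add_noise(td):
    # runs in a worker process; torch is already seeded by _proc_init
    # with base_seed + worker_id, so draws are uncorrelated across workers
    td["noise"] = torch.randn(td.shape)
    return td

if __name__ == "__main__":
    td = TensorDict({"x": torch.zeros(100)}, batch_size=[100])
    gen = torch.Generator()
    gen.manual_seed(0)
    # a base seed is drawn from `gen`, and each of the 4 workers is seeded
    # with base_seed + worker_id; which worker picks which chunk stays
    # nondeterministic
    out = td.map(add_noise, num_workers=4, generator=gen, chunksize=1)
```

As the docstring stresses, this seeds the workers' random streams but does not make `map` reproducible across calls, since job-to-worker assignment is not controlled.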
tensordict/memmap.py (1 addition, 1 deletion)
@@ -537,7 +537,7 @@ def __getitem__(self, item):
                     "isn't supported at the moment."
                 ) from err
             raise
-        if out.data_ptr() == self.data_ptr():
+        if out.storage().data_ptr() == self.storage().data_ptr():
             out = MemoryMappedTensor(out)
             out._handler = self._handler
             out._filename = self._filename
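The memmap.py fix matters because `Tensor.data_ptr()` points at a view's first element, so it differs from the base tensor's pointer whenever indexing produces an offset view, while `storage().data_ptr()` identifies the shared buffer. A quick plain-torch illustration (not part of the diff):

```python
import torch

base = torch.arange(10)
view = base[2:]  # same underlying buffer, offset start

# data_ptr() moves with the view's storage offset
assert view.data_ptr() != base.data_ptr()
# storage().data_ptr() identifies the shared underlying buffer
assert view.storage().data_ptr() == base.storage().data_ptr()
```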
tensordict/utils.py (10 additions, 1 deletion)
@@ -7,7 +7,6 @@
 import collections
 import dataclasses
 import inspect
-
 import math
 import os

@@ -50,6 +49,7 @@
 from torch import Tensor
 from torch._C import _disabled_torch_function_impl
 from torch.nn.parameter import _ParameterMeta
+from torch.utils.data._utils.worker import _generate_state

 if TYPE_CHECKING:
     from tensordict.memmap_deprec import MemmapTensor as _MemmapTensor

@@ -1722,3 +1722,12 @@ def _legacy_lazy(func):
     )
     func.LEGACY = True
     return func
+
+
+# Process initializer for map
+def _proc_init(base_seed, queue):
+    worker_id = queue.get(timeout=10)
+    seed = base_seed + worker_id
+    torch.manual_seed(seed)
+    np_seed = _generate_state(base_seed, worker_id)
+    np.random.seed(np_seed)
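`_proc_init` hands each pool worker a unique id through the queue and derives per-library seeds from it; `_generate_state` is the helper torch's DataLoader workers use to turn a base seed and worker id into a NumPy-compatible seed state. A self-contained sketch of the same pattern, with a hypothetical `draw` task:

```python
import multiprocessing as mp

import numpy as np
import torch
from torch.utils.data._utils.worker import _generate_state

def _init(base_seed, queue):
    # each worker pops a distinct id, so no two workers share a torch seed
    worker_id = queue.get(timeout=10)
    torch.manual_seed(base_seed + worker_id)
    # NumPy seeds must fit 32 bits; _generate_state mixes (base_seed, worker_id)
    # into a valid seed sequence instead of risking overflow
    np.random.seed(_generate_state(base_seed, worker_id))

def draw(_):
    return torch.randint(0, 1000, ()).item()

if __name__ == "__main__":
    num_workers = 4
    queue = mp.Queue(maxsize=num_workers)
    for i in range(num_workers):
        queue.put(i)
    with mp.Pool(num_workers, initializer=_init, initargs=(17, queue)) as pool:
        # each draw comes from a per-worker seeded stream; which worker
        # serves which task is still nondeterministic
        print(pool.map(draw, range(num_workers)))
```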
test/test_tensordict.py (103 additions, 0 deletions)

@@ -6546,6 +6546,109 @@ def test_modules(self, as_module):
     assert y._tensor.shape[0] == param_batch

+
+class TestMap:
+    """Tests for TensorDict.map that are independent of tensordict's type."""
+
+    @classmethod
+    def get_rand_incr(cls, td):
+        # torch
+        td["r"] = td["r"] + torch.randint(0, 100, ()).item()
+        # numpy
+        td["s"] = td["s"] + np.random.randint(0, 100, ()).item()
+        return td
+
+    def test_map_seed(self):
+        pytest.skip(
+            reason="Using max_tasks_per_child is unstable and can cause multiple processes to start over even though all jobs are completed",
+        )
+
+        if mp.get_start_method(allow_none=True) is None:
+            mp.set_start_method("spawn")
+        td = TensorDict(
+            {
+                "r": torch.zeros(20, dtype=torch.int),
+                "s": torch.zeros(20, dtype=torch.int),
+                "c": torch.arange(20),
+            },
+            batch_size=[20],
+        )
+        generator = torch.Generator()
+        # we use 4 workers with max 5 items each,
+        # making sure that no worker does more than any other.
+        generator.manual_seed(0)
+        td_out_0 = td.map(
+            TestMap.get_rand_incr,
+            num_workers=4,
+            generator=generator,
+            chunksize=1,
+            max_tasks_per_child=5,
+        )
+        print("got 1")
+        generator.manual_seed(0)
+        td_out_1 = td.map(
+            TestMap.get_rand_incr,
+            num_workers=4,
+            generator=generator,
+            chunksize=1,
+            max_tasks_per_child=5,
+        )
+        print("got 2")
+        # we cannot know which worker picks which job, but since their seed
+        # offsets span 0 to 3 and each produces one number per job, we can
+        # check that those numbers are exactly what we were expecting.
+        assert (td_out_0["r"].sort().values == td_out_1["r"].sort().values).all(), (
+            td_out_0["r"].sort().values,
+            td_out_1["r"].sort().values,
+        )
+        assert (td_out_0["s"].sort().values == td_out_1["s"].sort().values).all(), (
+            td_out_0["s"].sort().values,
+            td_out_1["s"].sort().values,
+        )
+
+    def test_map_seed_single(self):
+        # A cheap version of the previous test
+        if mp.get_start_method(allow_none=True) is None:
+            mp.set_start_method("spawn")
+        td = TensorDict(
+            {
+                "r": torch.zeros(20, dtype=torch.int),
+                "s": torch.zeros(20, dtype=torch.int),
+                "c": torch.arange(20),
+            },
+            batch_size=[20],
+        )
+        generator = torch.Generator()
+        # a single worker picks up every job, so both calls are seeded
+        # identically and must produce the same numbers.
+        generator.manual_seed(0)
+        td_out_0 = td.map(
+            TestMap.get_rand_incr,
+            num_workers=1,
+            generator=generator,
+            chunksize=1,
+        )
+        print("got 1")
+        generator.manual_seed(0)
+        td_out_1 = td.map(
+            TestMap.get_rand_incr,
+            num_workers=1,
+            generator=generator,
+            chunksize=1,
+        )
+        print("got 2")
+        # check that the numbers are exactly what we were expecting.
+        assert (td_out_0["r"].sort().values == td_out_1["r"].sort().values).all(), (
+            td_out_0["r"].sort().values,
+            td_out_1["r"].sort().values,
+        )
+        assert (td_out_0["s"].sort().values == td_out_1["s"].sort().values).all(), (
+            td_out_0["s"].sort().values,
+            td_out_1["s"].sort().values,
+        )
+

 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
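As a numeric footnote to the docstring's caution about low-valued seeds: with 8 workers, two runs whose base seeds differ by 1 share 7 of their 8 worker seeds (illustrative arithmetic only):

```python
# worker seeds are base_seed + offset for offsets 0..num_workers-1
exp_a = set(range(4, 4 + 8))  # base seed 4 -> worker seeds 4..11
exp_b = set(range(5, 5 + 8))  # base seed 5 -> worker seeds 5..12
print(sorted(exp_a & exp_b))  # 7 overlapping seeds: [5, 6, 7, 8, 9, 10, 11]
```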