mosaicml · mattyding · Dec 3, 2024 · Nov 30, 2024 · Nov 30, 2024 · Dec 3, 2024
@@ -485,6 +485,8 @@ def profile_packing(
             tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
             gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
             tmp_path = gathered_paths[local_rank_zero]
+            if stream_config.get('remote') is None:
+                stream_config['remote'] = stream_config['local']
             stream_config['local'] = tmp_path
 
     # Determine the packing_ratio values we'll try

diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py
@@ -178,10 +178,9 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
     local_dir = str(tmp_path / 'local')
     with MDSWriter(out=remote_dir, columns=columns, compression=None) as out:
         out.write({'prompt': 'HELLO', 'response': 'WORLD'})
-    cfg = DictConfig({
+
+    base_cfg = {
         'dataset': {
-            'remote': remote_dir,
-            'local': local_dir,
             'packing_ratio': 'auto',
             'max_seq_len': 200,
             'decoder_only_format': True,
@@ -194,28 +193,57 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
         'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0,
-    })
-
-    loader = build_finetuning_dataloader(
-        **cfg,
-        tokenizer=tokenizer,
-        device_batch_size=6,
-    ).dataloader
+    }
 
-    batch_ix = 0
-    for _ in loader:
-        batch_ix += 1
-        if batch_ix >= 3:
-            break
+    remote_cfg = DictConfig({
+        **base_cfg,
+        'dataset': {
+            **base_cfg['dataset'],
+            'remote': remote_dir,
+            'local': local_dir,
+        },
+    })
 
-    assert isinstance(loader, DataLoader)
-    assert isinstance(loader.dataset, StreamingFinetuningDataset)
-    assert loader.dataset.packing_ratio is not None
-    assert isinstance(loader.batch_size, int)
-    assert loader.dataset.packing_ratio == int(loader.batch_size / 6)
+    streams_cfg = DictConfig({
+        **base_cfg,
+        'dataset': {
+            **base_cfg['dataset'],
+            'streams': {
+                'stream_with_remote': {
+                    'remote': remote_dir,
+                    'local': local_dir,
+                },
+                'stream_without_remote': {
+                    'local': remote_dir,
+                },
+            },
+        },
+    })
 
-    state_dict = loader.dataset.state_dict(num_samples=2, from_beginning=False)
-    assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio
+    for cfg in [remote_cfg, streams_cfg]:
+        loader = build_finetuning_dataloader(
+            **cfg,
+            tokenizer=tokenizer,
+            device_batch_size=6,
+        ).dataloader
+
+        batch_ix = 0
+        for _ in loader:
+            batch_ix += 1
+            if batch_ix >= 3:
+                break
+
+        assert isinstance(loader, DataLoader)
+        assert isinstance(loader.dataset, StreamingFinetuningDataset)
+        assert loader.dataset.packing_ratio is not None
+        assert isinstance(loader.batch_size, int)
+        assert loader.dataset.packing_ratio == int(loader.batch_size / 6)
+
+        state_dict = loader.dataset.state_dict(
+            num_samples=2,
+            from_beginning=False,
+        )
+        assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio
 
 
 @pytest.mark.parametrize('packing_ratio', ['auto', 2.0])