Skip to content

Commit

Permalink
remove debug logging; remove home directory test due to docker issue
Browse files Browse the repository at this point in the history
  • Loading branch information
cyruszhang committed Feb 26, 2025
1 parent 2514b1f commit 28a969a
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 31 deletions.
4 changes: 1 addition & 3 deletions data_juicer/core/data/load_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def load_data(self, **kwargs):
f'Tried: {possible_paths}. '
f'Current working directory: {os.getcwd()}')

logger.error(f'Using resolved path for loading ray dataset: {path}')
logger.info(f'Using resolved path for loading ray dataset: {path}')
try:
dataset = RayDataset.read_json(path)
return RayDataset(dataset, dataset_path=path, cfg=self.cfg)
Expand Down Expand Up @@ -285,8 +285,6 @@ class DefaultLocalDataLoadStrategy(DefaultDataLoadStrategy):
}

def load_data(self, **kwargs):
logger.info(f'kwargs: {kwargs}')

# Get config values with defaults
text_keys = getattr(self.cfg, 'text_keys',
['text']) # Default to ['text']
Expand Down
2 changes: 0 additions & 2 deletions data_juicer/core/data/ray_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg):
for key in [cfg.video_key, cfg.image_key, cfg.audio_key]:
if key in columns:
path_keys.append(key)
logger.error(f'path_keys: {path_keys}')
if len(path_keys) > 0:
dataset_dir = os.path.dirname(dataset_path)
logger.error(f'dataset_dir: {dataset_dir}')
Expand All @@ -63,7 +62,6 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg):
path_keys=path_keys),
batch_format='pyarrow',
zero_copy_batch=True)
logger.error(dataset.limit(1).take())
return dataset


Expand Down
28 changes: 2 additions & 26 deletions tests/core/test_dataload_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import os
import os.path as osp
import json
import tempfile
import shutil
import uuid

Expand Down Expand Up @@ -312,8 +311,8 @@ def test_relative_path_resolution(self):
self.assertEqual(result[1]['text'], "Today is Monday and it's a happy day!")

@TEST_TAG('ray')
def test_home_and_workdir_resolution(self):
"""Test path resolution for home directory ('~') and work_dir"""
def test_workdir_resolution(self):
"""Test path resolution for work_dir"""
test_filename = 'test_resolution.jsonl'

# Create test file in work_dir
Expand All @@ -322,7 +321,6 @@ def test_home_and_workdir_resolution(self):
for item in self.test_data:
f.write(json.dumps(item, ensure_ascii=False).rstrip() + '\n')

# Test 1: work_dir resolution
strategy = RayLocalJsonDataLoadStrategy({
'path': test_filename # relative to work_dir
}, self.cfg)
Expand All @@ -331,28 +329,6 @@ def test_home_and_workdir_resolution(self):
result = list(dataset.get(2))
self.assertEqual(len(result), 2)
self.assertEqual(result[0]['text'], 'hello world')

# Test 2: home directory resolution
home_dir = osp.expanduser('~')
home_path = osp.join(home_dir, test_filename)

# Move test file to home directory
shutil.copy2(work_path, home_path)

try:
strategy = RayLocalJsonDataLoadStrategy({
'path': test_filename
}, self.cfg)

dataset = strategy.load_data()
result = list(dataset.get(2))
self.assertEqual(len(result), 2)
self.assertEqual(result[0]['text'], 'hello world')

finally:
# Clean up home directory test file
if osp.exists(home_path):
os.remove(home_path)


if __name__ == '__main__':
Expand Down

0 comments on commit 28a969a

Please sign in to comment.