Commit: Add project.env
AjayP13 committed Feb 1, 2024
1 parent fa83b65 · commit 4706732
Showing 8 changed files with 57 additions and 38 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "DataDreamer"
version = "0.3.0"
version = "0.4.0"
description = "Prompt. Generate Synthetic Data. Train & Align Models."
license = "MIT"
authors= [
scripts/project.env (11 changes: 11 additions & 0 deletions)
@@ -0,0 +1,11 @@
+export PROJECT_JOB_NAME="test" # This makes run.sh run with `pytest` instead of `python3`
+export PROJECT_DATA=~/.datadreamer_dev/ # Where project dependencies will be installed and stored
+export PROJECT_DISABLE_TUNNEL=1 # Disables certain dependencies that are not required
+
+# API Keys and Tokens
+# export HUGGING_FACE_HUB_TOKEN="your huggingface_hub token" # (optional) Some tests require a Hugging Face Hub token
+# export OPENAI_API_KEY="your_openai_api_key" # (optional) Some tests require an OpenAI API key
+
+# You can un-comment the line below to make subsequent runs faster
+# after project dependencies have been installed.
+export PROJECT_SKIP_INSTALL_REQS=1 # Skip installing reqs
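
For context, a minimal Python sketch of how a script could consume these variables once the file has been sourced (the variable names come from the file above; the consuming logic is illustrative):

import os

# Assumes the file was sourced first, e.g.: source scripts/project.env
job_name = os.environ.get("PROJECT_JOB_NAME", "run")  # "test" switches run.sh to pytest
data_dir = os.path.expanduser(os.environ.get("PROJECT_DATA", "~/.datadreamer_dev/"))
skip_install = os.environ.get("PROJECT_SKIP_INSTALL_REQS") == "1"

if not skip_install:
    print(f"Installing project dependencies into {data_dir} ...")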
src/steps/data_sources/csv_data_source.py (1 change: 1 addition & 0 deletions)
@@ -26,6 +26,7 @@ class CSVDataSource(DataSource):
**config_kwargs: Additional keyword arguments to pass to
:py:func:`datasets.load_dataset`.
"""
+
def __init__(
self,
name: str,
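
For reference, a minimal usage sketch of CSVDataSource based on the docstring above; the data_files keyword (forwarded to datasets.load_dataset) and the surrounding DataDreamer session are assumptions based on the library's documented usage pattern:

from datadreamer import DataDreamer
from datadreamer.steps import CSVDataSource

with DataDreamer("./output"):
    # Extra keyword arguments such as data_files are passed through to
    # datasets.load_dataset(), per the docstring above.
    csv_data = CSVDataSource("My CSV Data", data_files="data.csv")
    print(csv_data.output)  # the loaded rows as an output dataset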
src/steps/data_sources/hf_dataset_data_source.py (1 change: 1 addition & 0 deletions)
@@ -6,6 +6,7 @@
from ..step_operations import _INTERNAL_STEP_OPERATION_KEY
from .data_source import DataSource

+
class HFDatasetDataSource(DataSource):
"""Loads a Hugging Face :py:class:`~datasets.Dataset` from a local path. See
:py:func:`datasets.load_from_disk` for more details.
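
A similar sketch for HFDatasetDataSource, which wraps datasets.load_from_disk(); the dataset_path parameter name is an assumption based on the class description:

from datadreamer import DataDreamer
from datadreamer.steps import HFDatasetDataSource

with DataDreamer("./output"):
    # Loads a dataset previously written with Dataset.save_to_disk("./saved_dataset").
    hf_data = HFDatasetDataSource("My HF Data", dataset_path="./saved_dataset")
    print(hf_data.output)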
src/steps/data_sources/json_data_source.py (2 changes: 2 additions & 0 deletions)
@@ -7,6 +7,7 @@
from ..step_operations import _INTERNAL_STEP_OPERATION_KEY
from .data_source import DataSource

+
class JSONDataSource(DataSource):
"""Loads a JSON dataset from a local path. See :py:func:`datasets.load_dataset` for
more details.
@@ -25,6 +26,7 @@ class JSONDataSource(DataSource):
**config_kwargs: Additional keyword arguments to pass to
:py:func:`datasets.load_dataset`.
"""
+
def __init__(
self,
name: str,
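
A matching sketch for JSONDataSource; field is one example of a **config_kwargs entry forwarded to datasets.load_dataset(), as the docstring describes (the keyword and file name are illustrative):

from datadreamer import DataDreamer
from datadreamer.steps import JSONDataSource

with DataDreamer("./output"):
    # field="rows" tells the underlying JSON loader to read the list nested
    # under the "rows" key of data.json (illustrative).
    json_data = JSONDataSource("My JSON Data", data_files="data.json", field="rows")
    print(json_data.output)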
src/steps/data_sources/text_data_source.py (1 change: 1 addition & 0 deletions)
@@ -26,6 +26,7 @@ class TextDataSource(DataSource):
**config_kwargs: Additional keyword arguments to pass to
:py:func:`datasets.load_dataset`.
"""
+
def __init__(
self,
name: str,
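
And one for TextDataSource, where the underlying "text" loader yields one row per line of the input file (the data_files keyword is assumed, as above):

from datadreamer import DataDreamer
from datadreamer.steps import TextDataSource

with DataDreamer("./output"):
    # Each row carries a single "text" column holding one line of corpus.txt.
    text_data = TextDataSource("My Text Data", data_files="corpus.txt")
    print(text_data.output)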
src/steps/step.py (73 changes: 37 additions & 36 deletions)
@@ -147,7 +147,7 @@ def __init__( # noqa: C901
log_level: The logging level to use (:py:data:`~logging.DEBUG`, :py:data:`~logging.INFO`, etc.).
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
-background: Whether to run the operation in the background.
+background: Whether to run the operation in the background.
"""
# Get the cls_name
cls_name = self.__class__.__name__
@@ -651,7 +651,7 @@ def pickle(self, value: Any, *args: Any, **kwargs: Any) -> bytes:
"""Pickle a value so it can be stored in a row produced by this step. See
:doc:`create your own steps
<pages/advanced_usage/creating_a_new_datadreamer_.../step>` for more details.
Args:
value: The value to pickle.
*args: The args to pass to :py:meth:`~dill.dumps`.
Expand All @@ -664,7 +664,7 @@ def pickle(self, value: Any, *args: Any, **kwargs: Any) -> bytes:
return _pickle(value, *args, **kwargs)

def unpickle(self, value: bytes) -> Any:
"""Unpickle a value that was stored in a row produced by this step with
"""Unpickle a value that was stored in a row produced by this step with
:py:meth:`~Step.pickle`. See :doc:`create your own steps
<pages/advanced_usage/creating_a_new_datadreamer_.../step>` for more details.
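
A small sketch of the pickle/unpickle round trip these two docstrings describe; the DataSource step exists only to obtain a step instance to call the helpers on (its name and data are illustrative):

from datadreamer import DataDreamer
from datadreamer.steps import DataSource

with DataDreamer("./output"):
    step = DataSource("Raw", data={"x": [1, 2, 3]})
    blob = step.pickle({"a", "b"})  # bytes that can be stored in a row
    value = step.unpickle(blob)     # round-trips back to {"a", "b"}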
@@ -836,9 +836,9 @@ def output(self) -> OutputDataset | OutputIterableDataset:

@property
def dataset_path(self) -> str:
"""The path to the step's output dataset on disk in HuggingFace
"""The path to the step's output dataset on disk in HuggingFace
:py:class:`~datasets.Dataset` format if the step has been saved to disk.
"""
"""
assert not DataDreamer.is_running_in_memory(), (
"This step's dataset has not been saved to disk. DataDreamer is running"
" in-memory."
@@ -952,7 +952,7 @@ def select(
) -> "Step":
"""Select rows from the step's output by their indices. See
:py:meth:`~datasets.Dataset.select` for more details.
Args:
indices: The indices of the rows to select.
name: The name of the operation.
@@ -963,7 +963,7 @@
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -997,7 +997,7 @@ def select_columns(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1032,7 +1032,7 @@ def take(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1056,7 +1056,7 @@ def skip(
) -> "Step":
"""Skip the first ``n`` rows from the step's output. See
:py:meth:`~datasets.IterableDataset.skip` for more details.
Args:
n: The number of rows to skip.
name: The name of the operation.
@@ -1067,7 +1067,7 @@
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1091,10 +1091,10 @@ def shuffle(
) -> "Step":
"""Shuffle the rows of the step's output. See
:py:meth:`~datasets.IterableDataset.shuffle` for more details.
Args:
seed: The random seed to use for shuffling the step's output.
-buffer_size: The buffer size to use for shuffling the dataset, if the step's
+buffer_size: The buffer size to use for shuffling the dataset, if the step's
output is an :py:class:`~datadreamer.datasets.OutputIterableDataset`.
name: The name of the operation.
lazy: Whether to run the operation lazily.
@@ -1104,7 +1104,7 @@
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1129,7 +1129,7 @@ def sort(
) -> "Step":
"""Sort the rows of the step's output. See
:py:meth:`~datasets.Dataset.sort` for more details.
Args:
column_names: The names of the columns to sort by.
reverse: Whether to sort in reverse order.
@@ -1141,7 +1141,7 @@
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1175,7 +1175,7 @@ def add_item(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1206,7 +1206,7 @@ def map(
"""
Apply a function to the step's output. See
:py:meth:`~datasets.Dataset.map` for more details.
Args:
function: The function to apply to rows of the step's output.
with_indices: Whether to pass the indices of the rows to the function.
@@ -1225,7 +1225,7 @@ def map(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1282,7 +1282,7 @@ def filter(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1328,7 +1328,7 @@ def rename_column(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1363,7 +1363,7 @@ def rename_columns(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1398,7 +1398,7 @@ def remove_columns(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1430,7 +1430,7 @@ def splits(
and 1.0 and represent the proportion of the dataset to include in the
training split. If an int, should be the number of rows to include in
the training split.
validation_size: The size of the validation split. If a float, should be
between 0.0 and 1.0 and represent the proportion of the dataset to
include in the validation split. If an int, should be the number of rows
@@ -1439,7 +1439,7 @@
1.0 and represent the proportion of the dataset to include in the test
split. If an int, should be the number of rows to include in the test
split.
-stratify_by_column: The name of the column to use to stratify equally
+stratify_by_column: The name of the column to use to stratify equally
between splits.
name: The name of the operation.
progress_interval: How often to log progress in seconds.
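
A sketch of splits() with fractional sizes, per the argument descriptions above (the return value is assumed to hold one step per split):

from datadreamer import DataDreamer
from datadreamer.steps import DataSource

with DataDreamer("./output"):
    step = DataSource("Numbers", data={"n": list(range(100))})
    # 80/10/10 split; floats are proportions of the dataset, ints are row counts.
    split_steps = step.splits(train_size=0.8, validation_size=0.1, test_size=0.1)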
@@ -1486,7 +1486,7 @@ def shard(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1518,7 +1518,7 @@ def reverse(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1547,7 +1547,7 @@ def save(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1578,7 +1578,7 @@ def copy(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -1624,7 +1624,7 @@ def export_to_dict(
writer_batch_size: The batch size to use if saving to disk.
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
Returns:
The step's output as a dictionary.
"""
@@ -1746,7 +1746,7 @@ def export_to_json(
save_num_shards: The number of shards on disk to save the dataset into.
to_json_kwargs: Additional keyword arguments to pass to
:py:meth:`~datasets.Dataset.to_json`.
Returns:
The path to the JSON file or a dictionary of paths if creating splits.
"""
@@ -1814,7 +1814,7 @@ def export_to_csv(
save_num_shards: The number of shards on disk to save the dataset into.
to_csv_kwargs: Additional keyword arguments to pass to
:py:meth:`~datasets.Dataset.to_csv`.
Returns:
The path to the CSV file or a dictionary of paths if creating splits.
"""
@@ -1879,7 +1879,7 @@ def export_to_hf_dataset(
writer_batch_size: The batch size to use if saving to disk.
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
Returns:
The step's output as a Hugging Face :py:class:`~datasets.Dataset` or
:py:class:`~datasets.DatasetDict` if creating splits.
@@ -1964,7 +1964,7 @@ def publish_to_hf_hub( # noqa: C901
when publishing).
kwargs: Additional keyword arguments to pass to
:py:meth:`~datasets.Dataset.push_to_hub`.
Returns:
The URL to the published dataset.
"""
@@ -2303,7 +2303,7 @@ def concat(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -2343,7 +2343,7 @@ def zipped(
save_num_proc: The number of processes to use if saving to disk.
save_num_shards: The number of shards on disk to save the dataset into.
background: Whether to run the operation in the background.
Returns:
A new step with the operation applied.
"""
@@ -2449,6 +2449,7 @@ class SuperStep(Step): # pragma: no cover
See :doc:`create your own steps
<pages/advanced_usage/creating_a_new_datadreamer_.../step>` for more details.
"""
+
@property
def output(self) -> OutputDataset | OutputIterableDataset:
return super().output