
Commit

Update throughput numbers, add logging_configured() util function (#81)
epwalsh authored Nov 5, 2024
1 parent bec0a3c commit e27ba74
Showing 5 changed files with 33 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added `retries` field to `BeakerLaunchConfig`.
- Allow running on Augusta cluster with existing train scripts.
+- Added `olmo_core.utils.logging_configured()` function to check if logging has been configured.

## [v1.6.0](https://github.com/allenai/OLMo-core/releases/tag/v1.6.0) - 2024-11-01

4 changes: 2 additions & 2 deletions README.md
@@ -39,8 +39,8 @@ Throughput numbers from these scripts with various different configuration setti

| Model size | Model arch | Context length | Precision | Throughput[^1] | Training script | Commandline overrides                                    |
| :--------: | :--------: | :------------: | :-------: | -----------: | :----------- | :-------- |
-| **1B** | OLMo-1124 | 4096 | BF16 | 44,000 TPS | `OLMo-1B.py` | |
-| | | 4096 | BF16/FP8[^2] | 51,000 TPS | `OLMo-1B.py` | `--model.float8_config.enabled=true` |
+| **1B** | OLMo-1124 | 4096 | BF16 | 55,000 TPS | `OLMo-1B.py` | |
+| | | 4096 | BF16/FP8[^2] | 65,000 TPS | `OLMo-1B.py` | `--model.float8_config.enabled=true` |
| **7B** | OLMo-1124 | 4096 | BF16 | 10,000 TPS | `OLMo-7B.py` | |
| | | 4096 | BF16/FP8 | 13,000 TPS | `OLMo-7B.py` | `--model.float8_config.enabled=true` |
| **8B** | Llama | 4096 | BF16 | 9,500 TPS | `Llama-8B.py` | |
16 changes: 15 additions & 1 deletion src/olmo_core/distributed/utils.py
@@ -2,6 +2,7 @@
Distributed helpers, most of which work in a non-distributed context as well for API unity.
"""

+import logging
import os
from datetime import timedelta
from typing import List, Optional, TypeVar
@@ -13,7 +14,7 @@

from ..config import StrEnum
from ..exceptions import OLMoConfigurationError, OLMoEnvironmentError
-from ..utils import get_default_device, move_to_device, set_env_var
+from ..utils import get_default_device, logging_configured, move_to_device, set_env_var

OLMO_SHARED_FS_ENV_VAR = "OLMO_SHARED_FS"
OLMO_FS_LOCAL_RANK_ENV_VAR = "FS_LOCAL_RANK"
@@ -23,6 +24,9 @@
BEAKER_HOSTNAME_ENV_VAR = "BEAKER_NODE_HOSTNAME"


+log = logging.getLogger(__name__)


def init_distributed(backend: str = "nccl", timeout: timedelta = timedelta(minutes=30)):
"""
Initialize the distributed process group with the given backend(s) and check/set the
@@ -100,6 +104,16 @@ def init_distributed(backend: str = "nccl", timeout: timedelta = timedelta(minut

    validate_env_vars()

+    msg = (
+        f"Global rank {get_rank()} "
+        f"= local rank {get_local_rank()} "
+        f"= file system local rank {get_fs_local_rank()}"
+    )
+    if logging_configured():
+        log.warning(msg)
+    else:
+        print(msg)


def validate_env_vars():
"""
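The block added to `init_distributed()` reuses the module's log-if-configured, otherwise-print pattern, so the rank mapping is visible even when the caller never set up logging. A minimal launch sketch under stated assumptions: `setup_logging()` is the configuration helper referenced in the new docstring further down, calling it with no arguments is an assumption, and a `torchrun`-style launcher is expected to provide the usual rank environment variables.

```python
# Hypothetical launch script; the import paths follow this diff, but
# calling setup_logging() with no arguments is an assumption.
from olmo_core.distributed.utils import init_distributed
from olmo_core.utils import setup_logging

setup_logging()                    # attach handlers first...
init_distributed(backend="nccl")   # ...so the rank message goes through log.warning()
```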
17 changes: 14 additions & 3 deletions src/olmo_core/utils.py
@@ -171,19 +171,18 @@ def has_flash_attn() -> bool:


def set_env_var(name: str, value: str, override: bool = False, secret: bool = False):
-    global _LOGGING_CONFIGURED
    value_str = "****" if secret else value
    if name in os.environ:
        if override and os.environ[name] != value:
            msg = f"Overriding env var '{name}' to '{value_str}'"
-            if _LOGGING_CONFIGURED:
+            if logging_configured():
                log.warning(msg)
            else:
                print(msg)
            os.environ[name] = value
    else:
        msg = f"Setting env var '{name}' to '{value_str}'"
-        if _LOGGING_CONFIGURED:
+        if logging_configured():
            log.info(msg)
        else:
            print(msg)
@@ -314,6 +313,18 @@ def local_rank0_filter(record: logging.LogRecord) -> int:
_LOGGING_CONFIGURED = True


+def logging_configured() -> bool:
+    """
+    Returns ``True`` if logging has been configured (like with :func:`setup_logging()`),
+    otherwise returns ``False``.
+    """
+    if _LOGGING_CONFIGURED:
+        return True
+    else:
+        # Otherwise check if the root logger has any handlers.
+        return len(logging.getLogger().handlers) > 0


def excepthook(exctype, value, traceback):
"""
Used to patch ``sys.excepthook`` in order to log exceptions. Use :func:`install_excepthook()`
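A short sketch of how the new `logging_configured()` helper behaves; the `announce()` wrapper is hypothetical, and only `logging_configured` itself comes from this diff. Before any handler is attached the check fails and the caller falls back to `print()`; once the root logger gains a handler (for example via `logging.basicConfig`), the check passes even if `setup_logging()` was never called.

```python
import logging

from olmo_core.utils import logging_configured

log = logging.getLogger(__name__)


def announce(msg: str) -> None:
    # Same pattern as set_env_var() above: route through the logging
    # system only when a handler is configured, otherwise print().
    if logging_configured():
        log.info(msg)
    else:
        print(msg)


announce("no handlers yet")              # falls back to print()
logging.basicConfig(level=logging.INFO)  # root logger now has a handler
announce("handlers attached")            # emitted via log.info()
```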
2 changes: 1 addition & 1 deletion src/scripts/train/OLMo-1B.py
@@ -38,7 +38,7 @@ def build_trainer_config(common: CommonComponents) -> TrainerConfig:
    return (
        TrainerConfig(
            save_folder=common.save_folder,
-            rank_microbatch_size=4 * 4096,
+            rank_microbatch_size=8 * 4096,
            save_overwrite=True,
            metrics_collect_interval=10,
            cancel_check_interval=1,
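Assuming `rank_microbatch_size` counts tokens (the factor of 4096 matches the context length in the README table), this change doubles the per-rank micro-batch from 16,384 to 32,768 tokens, consistent with the raised 1B throughput numbers above.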
