Skip to content

Commit

Permalink
Print the reminder for the illegal memory error in the AutoBatchSize under tf (#4283)
Browse files Browse the repository at this point in the history

#3822 added a reminder for the illegal memory error. However, this
reminder is only needed for tf. This PR moves the illegal memory
reminder from base class AutoBatchSize to the inherited class under tf.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Enhanced `AutoBatchSize` class to initialize the batch size from an
    environment variable, improving user guidance on memory management with
    TensorFlow.
- **Bug Fixes**
  - Removed redundant logging during initialization to streamline the
    process when GPU resources are available.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Yi-FanLi and pre-commit-ci[bot] authored Oct 31, 2024
1 parent cdad312 commit 0d13911
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
16 changes: 16 additions & 0 deletions deepmd/tf/utils/batch_size.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import os

from packaging.version import (
Version,
)
Expand All @@ -11,9 +13,23 @@
OutOfMemoryError,
)
from deepmd.utils.batch_size import AutoBatchSize as AutoBatchSizeBase
from deepmd.utils.batch_size import (
log,
)


class AutoBatchSize(AutoBatchSizeBase):
def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None:
    """Construct the TF-specific auto batch size helper.

    Delegates all state setup to the base class, then — when the user has
    not pinned a batch size via the ``DP_INFER_BATCH_SIZE`` environment
    variable and a GPU is available — logs a reminder about a known
    TensorFlow illegal-memory-access issue and how to work around it.

    Parameters
    ----------
    initial_batch_size : int, default: 1024
        initial batch size to try (forwarded to the base class)
    factor : float, default: 2.0
        growth factor for the batch size search (forwarded to the base class)
    """
    super().__init__(initial_batch_size, factor)
    # A value <= 0 (including the default when the variable is unset)
    # means the user has not fixed the inference batch size.
    env_batch_size = int(os.environ.get("DP_INFER_BATCH_SIZE", 0))
    if env_batch_size <= 0 and self.is_gpu_available():
        log.info(
            "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. "
            "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. "
            "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). "
        )

def is_gpu_available(self) -> bool:
"""Check if GPU is available.
Expand Down
5 changes: 0 additions & 5 deletions deepmd/utils/batch_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None:
self.maximum_working_batch_size = initial_batch_size
if self.is_gpu_available():
self.minimal_not_working_batch_size = 2**31
log.info(
"If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. "
"To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. "
"The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). "
)
else:
self.minimal_not_working_batch_size = (
self.maximum_working_batch_size + 1
Expand Down

0 comments on commit 0d13911

Please sign in to comment.