Skip to content

Commit

Permalink
fix gpu_num
Browse files Browse the repository at this point in the history
  • Loading branch information
jq committed May 28, 2024
1 parent 0b9a5ee commit b02a785
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 18 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/make_wheel_macOS_x86.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,4 @@ bazel build \

bazel-bin/build_pip_pkg artifacts $NIGHTLY_FLAG

# Setting DYLD_LIBRARY_PATH to help delocate finding tensorflow after the rpath invalidation
export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$(python -c 'import configure; print(configure.get_tf_shared_lib_dir())')
delocate-wheel -w wheelhouse -v --ignore-missing-dependencies artifacts/*.whl

4 changes: 2 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ jobs:
strategy:
matrix:
# TODO: add back 'windows-latest' when it can be compiled.
os: ['self-hosted-gpu']
os: ['ubuntu-20.04']
py-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
tf-version: ['2.11.0', '2.15.1']
tf-need-cuda: ['1']
Expand All @@ -124,7 +124,7 @@ jobs:
- tf-version: '2.15.1'
py-version: '3.8'
fail-fast: false
runs-on: ${{ matrix.os }}
runs-on: [self-hosted, Linux, X64]
steps:
- name: clear cache folder
run: rm -rf /usr/share/dotnet /opt/ghc "/usr/local/share/boost"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
- dataset: [movielens/1m-ratings](https://www.tensorflow.org/datasets/catalog/movielens#movielens1m-ratings)
- model: DNN
- Running mode: Graph mode and Keras, using Horovod AllToAll Embedding for model-parameter parallelism

- Enable GPU support with `python3 -m pip install tensorflow[and-cuda]`
## start train:
By default, this shell script starts a training task with N workers, where N is the number of GPUs on the local machine.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,13 @@
],
dtype=tf.int32,
name='user_occupation_label'),
'raw_user_age':
'bucketized_user_age':
tf.TensorSpec(shape=[
None,
1,
], dtype=tf.int32, name='raw_user_age'),
],
dtype=tf.int32,
name='bucketized_user_age'),
'movie_id':
tf.TensorSpec(shape=[
None,
Expand Down Expand Up @@ -126,7 +128,7 @@
'input_tensor': None,
'pretreated_tensor': None,
},
'raw_user_age': {
'bucketized_user_age': {
'code': 106,
'dtype': tf.int32,
'dim': 1,
Expand Down Expand Up @@ -398,8 +400,8 @@ def call(self, features):
def get_dataset(batch_size=1):
ds = tfds.load("movielens/1m-ratings",
split="train",
data_dir="/dataset",
download=False)
data_dir="~/dataset",
download=True)
features = ds.map(
lambda x: {
"movie_id":
Expand All @@ -412,8 +414,8 @@ def get_dataset(batch_size=1):
tf.cast(x["user_gender"], tf.int32),
"user_occupation_label":
tf.cast(x["user_occupation_label"], tf.int32),
"raw_user_age":
tf.cast(x["raw_user_age"], tf.int32),
"bucketized_user_age":
tf.cast(x["bucketized_user_age"], tf.int32),
"timestamp":
tf.cast(x["timestamp"] - 880000000, tf.int32),
})
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
rm -rf ./export_dir
gpu_num = nvidia-smi --query-gpu=name --format=csv,noheader | wc -l
horovodrun -np $gpu_num python movielens-1m-keras-with-horovod.py --mode="train" --model_dir="./model_dir" --export_dir="./export_dir"
gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
export gpu_num
horovodrun -np $gpu_num python movielens-1m-keras-with-horovod.py --mode="train" --model_dir="./model_dir" --export_dir="./export_dir"
3 changes: 0 additions & 3 deletions pytest.ini

This file was deleted.

0 comments on commit b02a785

Please sign in to comment.