From babb4282985b851ac97e7464ad0c60a1cae73124 Mon Sep 17 00:00:00 2001
From: Shuhei Iitsuka <tushuhei@google.com>
Date: Fri, 11 Nov 2022 11:32:16 +0900
Subject: [PATCH] Add out_span option for better GPU utilization (#90)

---
 scripts/build_model.py |   5 +-
 scripts/train.py       | 142 +++++++++++++++++++++++++++--------------
 tests/test_train.py    |  74 +++++++++++++++++++--
 3 files changed, 167 insertions(+), 54 deletions(-)

diff --git a/scripts/build_model.py b/scripts/build_model.py
index 339befda..f27c2064 100644
--- a/scripts/build_model.py
+++ b/scripts/build_model.py
@@ -34,7 +34,10 @@ def rollup(weights_filename: str,
   """
   decision_trees: typing.Dict[str, float] = dict()
   with open(weights_filename) as f:
-    for row in f:
+    for row in f.readlines():
+      row = row.strip()
+      if not row:
+        continue
       feature = row.split('\t')[0]
       score = float(row.split('\t')[1])
       decision_trees.setdefault(feature, 0)
diff --git a/scripts/train.py b/scripts/train.py
index 82ea956a..ee69bdf4 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -30,6 +30,12 @@
   import numpy as jnp  # type: ignore
 
 EPS = np.finfo(float).eps  # type: np.floating[typing.Any]
+DEFAULT_OUTPUT_NAME = 'weights.txt'
+DEFAULT_LOG_NAME = 'train.log'
+DEFAULT_FEATURE_THRES = 10
+DEFAULT_ITERATION = 10000
+DEFAULT_OUT_SPAN = 100
+ArgList = typing.Optional[typing.List[str]]
 
 
 class Result(NamedTuple):
@@ -168,6 +174,7 @@ def fit(X_train: npt.NDArray[np.bool_],
         iters: int,
         weights_filename: str,
         log_filename: str,
+        out_span: int,
         chunk_size: typing.Optional[int] = None) -> typing.Dict[int, float]:
   """Trains an AdaBoost classifier.
 
@@ -180,8 +187,9 @@ def fit(X_train: npt.NDArray[np.bool_],
     iters (int): A number of training iterations.
     weights_filename (str): A file path to write the learned weights.
     log_filename (str): A file path to log the accuracy along with training.
-    chunk_size (Optional[int]): A chunk size to split training entries into chunks for memory reduction
-      when calculating AdaBoost's weighted training error.
+    out_span (int): Iteration span to output metrics and weights.
+    chunk_size (Optional[int]): A chunk size to split training entries for
+      memory efficiency.
 
   Returns:
     phi (Dict[int, float]): Learned child classifiers.
@@ -189,11 +197,13 @@ def fit(X_train: npt.NDArray[np.bool_],
   with open(weights_filename, 'w') as f:
     f.write('')
   with open(log_filename, 'w') as f:
-    f.write('train_accuracy\ttrain_precision\ttrain_recall\ttrain_fscore\t'
-            'test_accuracy\ttest_precision\ttest_recall\ttest_fscore\n')
+    f.write(
+        'iter\ttrain_accuracy\ttrain_precision\ttrain_recall\ttrain_fscore\t'
+        'test_accuracy\ttest_precision\ttest_recall\ttest_fscore\n')
   print('Outputting learned weights to %s ...' % (weights_filename))
 
   phis: typing.Dict[int, float] = dict()
+  phi_buffer: typing.List[typing.Tuple[str, float]] = []
 
   assert (X_train.shape[1] == X_test.shape[1]
          ), 'Training and test entries should have the same number of features.'
@@ -212,38 +222,19 @@ def fit(X_train: npt.NDArray[np.bool_],
   N_train, M_train = X_train.shape
   w = jnp.ones(N_train) / N_train
   YX_train = Y_train[:, None] ^ X_train
-  for t in range(iters):
-    print('=== %s ===' % (t))
-    if chunk_size is None:
-      res: npt.NDArray[np.float64] = w.dot(YX_train)
-    else:
-      res = np.zeros(M_train)
-      for i in range(0, N_train, chunk_size):
-        YX_train_chunk = YX_train[i:i + chunk_size]
-        w_chunk = w[i:i + chunk_size]
-        res += w_chunk.dot(YX_train_chunk)
-    err = 0.5 - jnp.abs(res - 0.5)
-    m_best = int(err.argmin())
-    pol_best = res[m_best] < 0.5
-    err_min = err[m_best]
-    print('min error:\t%.5f' % err_min)
-    print('best tree:\t%d' % m_best)
-    print()
-    alpha = jnp.log((1 - err_min) / (err_min + EPS))
-    phis.setdefault(m_best, 0)
-    phis[m_best] += alpha if pol_best else -alpha
-    miss = YX_train[:, m_best]
-    if not pol_best:
-      miss = ~(miss)
-    w = w * jnp.exp(alpha * miss)
-    w = w / w.sum()
+
+  def output_progress(t: int) -> None:
     with open(weights_filename, 'a') as f:
-      feature = features[m_best] if m_best < len(features) else 'BIAS'
-      f.write('%s\t%.3f\n' % (feature, alpha if pol_best else -alpha))
+      f.write('\n'.join('%s\t%.3f' % p for p in phi_buffer) + '\n')
+    phi_buffer.clear()
     pred_train = jit(pred)(phis, X_train) if jax_ready else pred(phis, X_train)
     pred_test = jit(pred)(phis, X_test) if jax_ready else pred(phis, X_test)
     metrics_train = get_metrics(pred_train, Y_train)
     metrics_test = get_metrics(pred_test, Y_test)
+    print('=== %s ===' % t)
+    print('min error:\t%.5f' % err_min)
+    print('best tree:\t%d' % m_best)
+    print()
     print('train accuracy:\t%.5f' % metrics_train.accuracy)
     print('train prec.:\t%.5f' % metrics_train.precision)
     print('train recall:\t%.5f' % metrics_train.recall)
@@ -255,7 +246,8 @@ def fit(X_train: npt.NDArray[np.bool_],
     print('test fscore:\t%.5f' % metrics_test.fscore)
     print()
     with open(log_filename, 'a') as f:
-      f.write('%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n' % (
+      f.write('%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n' % (
+          t,
           metrics_train.accuracy,
           metrics_train.precision,
           metrics_train.recall,
@@ -265,51 +257,103 @@ def fit(X_train: npt.NDArray[np.bool_],
           metrics_test.recall,
           metrics_test.fscore,
       ))
+
+  for t in range(iters):
+    if chunk_size is None:
+      res: npt.NDArray[np.float64] = w.dot(YX_train)
+    else:
+      res = np.zeros(M_train)
+      for i in range(0, N_train, chunk_size):
+        YX_train_chunk = YX_train[i:i + chunk_size]
+        w_chunk = w[i:i + chunk_size]
+        res += w_chunk.dot(YX_train_chunk)
+    err = 0.5 - jnp.abs(res - 0.5)
+    m_best = int(err.argmin())
+    pol_best = res[m_best] < 0.5
+    err_min: float = err[m_best]
+
+    alpha: float = jnp.log((1 - err_min) / (err_min + EPS))
+    phis.setdefault(m_best, 0)
+    phis[m_best] += alpha if pol_best else -alpha
+    miss = YX_train[:, m_best]
+    if not pol_best:
+      miss = ~(miss)
+    w = w * jnp.exp(alpha * miss)
+    w = w / w.sum()
+    feature = features[m_best] if m_best < len(features) else 'BIAS'
+    phi_buffer.append((feature, alpha if pol_best else -alpha))
+    if (t + 1) % out_span == 0:
+      output_progress(t + 1)
+  if len(phi_buffer) > 0:
+    output_progress(t + 1)
+
   return phis
 
 
-def parse_args() -> argparse.Namespace:
+def parse_args(test: ArgList = None) -> argparse.Namespace:
+  """Parses commandline arguments.
+
+  Args:
+    test (typing.Optional[typing.List[str]], optional): Commandline args for
+      testing. Defaults to None.
+
+  Returns:
+    argparse.Namespace: Parsed data of args.
+  """
   parser = argparse.ArgumentParser(description=__doc__)
   parser.add_argument(
       'encoded_train_data', help='File path for the encoded training data.')
   parser.add_argument(
       '-o',
       '--output',
-      help='Output file path for the learned weights. (default: weights.txt)',
-      default='weights.txt')
+      help=f'Output file path for the learned weights. (default: {DEFAULT_OUTPUT_NAME})',
+      type=str,
+      default=DEFAULT_OUTPUT_NAME)
   parser.add_argument(
       '--log',
-      help='Output file path for the training log. (default: train.log)',
-      default='train.log')
+      help=f'Output file path for the training log. (default: {DEFAULT_LOG_NAME})',
+      type=str,
+      default=DEFAULT_LOG_NAME)
   parser.add_argument(
       '--feature-thres',
-      help='Threshold value of the minimum feature frequency. (default: 10)',
-      default=10)
+      help=f'Threshold value of the minimum feature frequency. (default: {DEFAULT_FEATURE_THRES})',
+      type=int,
+      default=DEFAULT_FEATURE_THRES)
   parser.add_argument(
       '--iter',
-      help='Number of iterations for training. (default: 10000)',
-      default=10000)
+      help=f'Number of iterations for training. (default: {DEFAULT_ITERATION})',
+      type=int,
+      default=DEFAULT_ITERATION)
+  parser.add_argument(
+      '--out-span',
+      help=f'Iteration span to output metrics and weights. (default: {DEFAULT_OUT_SPAN})',
+      type=int,
+      default=DEFAULT_OUT_SPAN)
   parser.add_argument(
       '--chunk-size',
-      help='A chunk size to split training entries into chunks for memory reduction when calculating AdaBoost\'s weighted training error.'
-  )
-
-  return parser.parse_args()
+      type=int,
+      help='A chunk size to split training entries for memory efficiency. (default: None)',
+      default=None)
+  if test is None:
+    return parser.parse_args()
+  else:
+    return parser.parse_args(test)
 
 
 def main() -> None:
   args = parse_args()
-  train_data_filename = args.encoded_train_data
-  weights_filename = args.output
-  log_filename = args.log
+  train_data_filename: str = args.encoded_train_data
+  weights_filename: str = args.output
+  log_filename: str = args.log
   feature_thres = int(args.feature_thres)
   iterations = int(args.iter)
+  out_span = int(args.out_span)
   chunk_size = int(args.chunk_size) if args.chunk_size is not None else None
 
   X, Y, features = preprocess(train_data_filename, feature_thres)
   X_train, X_test, Y_train, Y_test = split_dataset(X, Y)
   fit(X_train, Y_train, X_test, Y_test, features, iterations, weights_filename,
-      log_filename, chunk_size)
+      log_filename, out_span, chunk_size)
 
   print('Training done. Export the model by passing %s to build_model.py' %
         (weights_filename))
diff --git a/tests/test_train.py b/tests/test_train.py
index a8ea617f..f810ca41 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Tests the training script."""
 
+import math
 import os
 import sys
 import unittest
@@ -35,6 +36,51 @@
     os.path.join(os.path.dirname(__file__), 'train_test.log'))
 
 
+class TestArgParse(unittest.TestCase):
+
+  def test_cmdargs_invalid_option(self) -> None:
+    cmdargs = ['-v']
+    with self.assertRaises(SystemExit) as cm:
+      train.parse_args(cmdargs)
+    self.assertEqual(cm.exception.code, 2)
+
+  def test_cmdargs_help(self) -> None:
+    cmdargs = ['-h']
+    with self.assertRaises(SystemExit) as cm:
+      train.parse_args(cmdargs)
+    self.assertEqual(cm.exception.code, 0)
+
+  def test_cmdargs_no_data(self) -> None:
+    with self.assertRaises(SystemExit) as cm:
+      train.parse_args([])
+    self.assertEqual(cm.exception.code, 2)
+
+  def test_cmdargs_default(self) -> None:
+    cmdargs = ['encoded.txt']
+    output = train.parse_args(cmdargs)
+    self.assertEqual(output.encoded_train_data, 'encoded.txt')
+    self.assertEqual(output.output, train.DEFAULT_OUTPUT_NAME)
+    self.assertEqual(output.log, train.DEFAULT_LOG_NAME)
+    self.assertEqual(output.feature_thres, train.DEFAULT_FEATURE_THRES)
+    self.assertEqual(output.iter, train.DEFAULT_ITERATION)
+    self.assertEqual(output.out_span, train.DEFAULT_OUT_SPAN)
+    self.assertEqual(output.chunk_size, None)
+
+  def test_cmdargs_full(self) -> None:
+    cmdargs = [
+        'encoded.txt', '-o', 'out.txt', '--log', 'foo.log', '--feature-thres',
+        '100', '--iter', '10', '--chunk-size', '1000', '--out-span', '50'
+    ]
+    output = train.parse_args(cmdargs)
+    self.assertEqual(output.encoded_train_data, 'encoded.txt')
+    self.assertEqual(output.output, 'out.txt')
+    self.assertEqual(output.log, 'foo.log')
+    self.assertEqual(output.feature_thres, 100)
+    self.assertEqual(output.iter, 10)
+    self.assertEqual(output.chunk_size, 1000)
+    self.assertEqual(output.out_span, 50)
+
+
 class TestTrain(unittest.TestCase):
 
   def setUp(self) -> None:
@@ -148,13 +194,33 @@ def test_fit(self) -> None:
         True,
     ])
     features = ['a', 'b', 'c']
-    iters = 1
-    train.fit(X, Y, X, Y, features, iters, WEIGHTS_FILE_PATH, LOG_FILE_PATH)
+    iters = 5
+    out_span = 2
+    train.fit(X, Y, X, Y, features, iters, WEIGHTS_FILE_PATH, LOG_FILE_PATH,
+              out_span)
     with open(WEIGHTS_FILE_PATH) as f:
-      weights = f.read().splitlines()
-    top_feature = weights[0].split('\t')[0]
+      weights = [
+          line.split('\t') for line in f.read().splitlines() if line.strip()
+      ]
+    top_feature = weights[0][0]
     self.assertEqual(
         top_feature, 'b', msg='The most effective feature should be selected.')
+    self.assertEqual(
+        len(weights),
+        iters,
+        msg='The number of lines should equal to the iteration count.')
+
+    with open(LOG_FILE_PATH) as f:
+      log = [line.split('\t') for line in f.read().splitlines() if line.strip()]
+    self.assertEqual(
+        len(log),
+        math.ceil(iters / out_span) + 1,
+        msg='The number of lines should equal to the ceil of iteration / out_span plus one for the header'
+    )
+    self.assertEqual(
+        len(set(len(line) for line in log)),
+        1,
+        msg='The header and the body should have the same number of columns.')
 
   def tearDown(self) -> None:
     os.remove(WEIGHTS_FILE_PATH)