From 1c78b221ff3541a74934915237d9f4756509696b Mon Sep 17 00:00:00 2001
From: jineetd <35962652+jineetd@users.noreply.github.com>
Date: Fri, 10 Nov 2023 02:09:20 -0500
Subject: [PATCH] Add train scores for ludwig in the create function handler.
 (#1342)

In a previous commit, we added changes to display the train scores and
train times for XGBoost. This commit adds the same reporting to the Ludwig
and Sklearn integrations.

---------

Co-authored-by: Jineet Desai
Co-authored-by: Andy Xu
---
 docs/source/overview/concepts.rst             | 4 ++--
 .../source/reference/ai/model-forecasting.rst | 4 ++--
 docs/source/reference/databases/github.rst    | 2 +-
 evadb/executor/create_function_executor.py    | 20 +++++++++++++++++--
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/docs/source/overview/concepts.rst b/docs/source/overview/concepts.rst
index f2905b640f..4b559e24c6 100644
--- a/docs/source/overview/concepts.rst
+++ b/docs/source/overview/concepts.rst
@@ -92,6 +92,6 @@ After registering ``MnistImageClassifier`` function, you can call the function i
 AI-Centric Query Optimization
 -----------------------------
 
-EvaDB optimizes the AI queries to save money spent on running models and reduce query execution time. It contains a novel `Cascades-style query optimizer `__ tailored for AI queries.
+EvaDB optimizes the AI queries to save money spent on running models and reduce query execution time. It contains a novel `Cascades-style query optimizer `__ tailored for AI queries.
 
-Query optimization has powered SQL database systems for several decades. It is the bridge that connects the declarative query language to efficient query execution on hardware. EvaDB accelerates AI queries using a collection of optimizations detailed in the :ref:`optimizations` page.
\ No newline at end of file
+Query optimization has powered SQL database systems for several decades. It is the bridge that connects the declarative query language to efficient query execution on hardware. EvaDB accelerates AI queries using a collection of optimizations detailed in the :ref:`optimizations` page.
diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst
index 610461223e..48f644e690 100644
--- a/docs/source/reference/ai/model-forecasting.rst
+++ b/docs/source/reference/ai/model-forecasting.rst
@@ -58,7 +58,7 @@ EvaDB's default forecast framework is `statsforecast `_.
-      - If LIBRARY is `statsforecast`, we can select one of ARIMA, ting, ETS, Theta. The default is ARIMA. Check `Automatic Forecasting `_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs `_ for details.
+      - If LIBRARY is `statsforecast`, we can select one of ARIMA, CES, ETS, Theta. The default is ARIMA. Check `Automatic Forecasting `_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs `_ for details.
     * - AUTO (str, default: 'T')
       - If set to 'T', it enables automatic hyperparameter optimization. Must be set to 'T' for `statsforecast` library. One may set this parameter to `false` if LIBRARY is `neuralforecast` for faster (but less reliable) results.
     * - Frequency (str, default: 'auto')
@@ -90,4 +90,4 @@ Below is an example query with `neuralforecast` with `trend` column as exogenous
    PREDICT 'y'
    LIBRARY 'neuralforecast'
    AUTO 'f'
-   FREQUENCY 'M';
\ No newline at end of file
+   FREQUENCY 'M';
diff --git a/docs/source/reference/databases/github.rst b/docs/source/reference/databases/github.rst
index 71cc9e546b..14aaa9fd8e 100644
--- a/docs/source/reference/databases/github.rst
+++ b/docs/source/reference/databases/github.rst
@@ -19,7 +19,7 @@ Required:
 
 Optional:
 
-* ``github_token`` is not required for public repositories. However, the rate limit is lower without a valid github_token. Check the `Rate limits page `_ to learn more about how to check your rate limit status. Check `Managing your personal access tokens page `_ to learn how to create personal access tokens.
+* ``github_token`` is not required for public repositories. However, the rate limit is lower without a valid github_token. Check the `Rate limits page `_ to learn more about how to check your rate limit status. Check `Managing your personal access tokens page `_ to learn how to create personal access tokens.
 
 Create Connection
 -----------------
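The executor diff below applies one pattern to each training handler: record wall-clock time around the training call, compute a score for the fitted model, and append both to the tuple the handler returns so that exec() can report them. Here is a minimal, self-contained sketch of that pattern, mirroring the Sklearn branch of the patch; the dataset and variable names are illustrative stand-ins, not EvaDB's API, and (as in the patch) the reported score is computed on the training data itself:

import time

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Illustrative stand-in for the frames aggregated from the child plan.
X, y = make_regression(n_samples=200, n_features=4, noise=0.1, random_state=0)

model = LinearRegression()
start_time = int(time.time())  # wall-clock seconds before training
model.fit(X, y)
train_time = int(time.time()) - start_time  # elapsed whole seconds
score = model.score(X, y)  # R^2 score, here on the training data

print("Validation Score: " + str(score))
print("Training time: " + str(train_time) + " secs.")

Note that int(time.time()) truncates to whole seconds, so a sub-second fit reports a training time of 0; the sketch keeps the patch's convention rather than switching to a finer-grained clock such as time.monotonic().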
diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py
index cee6035f68..dc57bbc283 100644
--- a/evadb/executor/create_function_executor.py
+++ b/evadb/executor/create_function_executor.py
@@ -18,6 +18,7 @@
 import os
 import pickle
 import re
+import time
 from pathlib import Path
 from typing import Dict, List
 
@@ -125,6 +126,7 @@ def handle_ludwig_function(self):
         aggregated_batch.drop_column_alias()
 
         arg_map = {arg.key: arg.value for arg in self.node.metadata}
+        start_time = int(time.time())
         auto_train_results = auto_train(
             dataset=aggregated_batch.frames,
             target=arg_map["predict"],
             tune_for_memory=arg_map.get("tune_for_memory", False),
             output_directory=self.db.catalog().get_configuration_catalog_value(
                 "tmp_dir"
             ),
         )
+        train_time = int(time.time()) - start_time
         model_path = os.path.join(
             self.db.catalog().get_configuration_catalog_value("model_dir"),
             self.node.name,
         )
         auto_train_results.best_model.save(model_path)
+        best_score = auto_train_results.experiment_analysis.best_result["metric_score"]
         self.node.metadata.append(
             FunctionMetadataCatalogEntry("model_path", model_path)
         )
@@ -151,6 +155,8 @@
             self.node.function_type,
             io_list,
             self.node.metadata,
+            best_score,
+            train_time,
         )
 
     def handle_sklearn_function(self):
@@ -178,7 +184,10 @@
         model = LinearRegression()
         Y = aggregated_batch.frames[arg_map["predict"]]
         aggregated_batch.frames.drop([arg_map["predict"]], axis=1, inplace=True)
+        start_time = int(time.time())
         model.fit(X=aggregated_batch.frames, y=Y)
+        train_time = int(time.time()) - start_time
+        score = model.score(X=aggregated_batch.frames, y=Y)
         model_path = os.path.join(
             self.db.catalog().get_configuration_catalog_value("model_dir"),
             self.node.name,
         )
@@ -200,6 +209,8 @@
             self.node.function_type,
             io_list,
             self.node.metadata,
+            score,
+            train_time,
         )
 
     def convert_to_numeric(self, x):
@@ -241,9 +252,11 @@ def handle_xgboost_function(self):
             "estimator_list": ["xgboost"],
             "task": arg_map.get("task", DEFAULT_XGBOOST_TASK),
         }
+        start_time = int(time.time())
         model.fit(
             dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings
         )
+        train_time = int(time.time()) - start_time
         model_path = os.path.join(
             self.db.catalog().get_configuration_catalog_value("model_dir"),
             self.node.name,
         )
@@ -260,7 +273,6 @@
         impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix()
Path(f"{self.function_dir}/xgboost.py").absolute().as_posix() io_list = self._resolve_function_io(None) best_score = model.best_loss - train_time = model.best_config_train_time return ( self.node.name, impl_path, @@ -638,6 +650,8 @@ def exec(self, *args, **kwargs): function_type, io_list, metadata, + best_score, + train_time, ) = self.handle_ludwig_function() elif string_comparison_case_insensitive(self.node.function_type, "Sklearn"): ( @@ -646,6 +660,8 @@ def exec(self, *args, **kwargs): function_type, io_list, metadata, + best_score, + train_time, ) = self.handle_sklearn_function() elif string_comparison_case_insensitive(self.node.function_type, "XGBoost"): ( @@ -688,7 +704,7 @@ def exec(self, *args, **kwargs): [ msg, "Validation Score: " + str(best_score), - "Training time: " + str(train_time), + "Training time: " + str(train_time) + " secs.", ] ) )