fix: use numpy std with Bessel's correction (ddof=1) and add a test

dreamquark-ai · Dec 27, 2021 · 3adaf4c · 3adaf4c
1 parent 49bd61b
commit 3adaf4c
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 79 deletions.
diff --git a/pretraining_example.ipynb b/pretraining_example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -54,17 +54,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading file...\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "out.parent.mkdir(parents=True, exist_ok=True)\n",
     "if out.exists():\n",
@@ -83,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -108,31 +100,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "39 73\n",
-      " State-gov 9\n",
-      " Bachelors 16\n",
-      " 13 16\n",
-      " Never-married 7\n",
-      " Adm-clerical 15\n",
-      " Not-in-family 6\n",
-      " White 5\n",
-      " Male 2\n",
-      " 2174 119\n",
-      " 0 92\n",
-      " 40 94\n",
-      " United-States 42\n",
-      " <=50K 2\n",
-      "Set 3\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "nunique = train.nunique()\n",
     "types = train.dtypes\n",
@@ -160,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -175,7 +145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -198,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -207,18 +177,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/work/pytorch_tabnet/abstract_model.py:75: UserWarning: Device used : cpu\n",
-      "  warnings.warn(f\"Device used : {self.device}\")\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# TabNetPretrainer\n",
     "unsupervised_model = TabNetPretrainer(\n",
@@ -242,38 +203,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "max_epochs = 2 if not os.getenv(\"CI\", False) else 2"
+    "max_epochs = 1000 if not os.getenv(\"CI\", False) else 2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {
     "scrolled": false
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "epoch 0  | loss: 6.48655 | val_0_unsup_loss_numpy: 2.0507700443267822|  0:00:07s\n",
-      "epoch 1  | loss: 1.61586 | val_0_unsup_loss_numpy: 1.2413300275802612|  0:00:15s\n",
-      "Stop training because you reached max_epochs = 2 with best_epoch = 1 and best_val_0_unsup_loss_numpy = 1.2413300275802612\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/work/pytorch_tabnet/callbacks.py:172: UserWarning: Best weights from best epoch are automatically used!\n",
-      "  warnings.warn(wrn_msg)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "unsupervised_model.fit(\n",
     "    X_train=X_train,\n",

diff --git a/pytorch_tabnet/metrics.py b/pytorch_tabnet/metrics.py
@@ -53,7 +53,7 @@ def UnsupervisedLoss(y_pred, embedded_x, obf_vars, eps=1e-9):
 def UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars, eps=1e-9):
     errors = y_pred - embedded_x
     reconstruction_errors = np.multiply(errors, obf_vars) ** 2
-    batch_stds = np.std(embedded_x, axis=0) ** 2 + eps
+    batch_stds = np.std(embedded_x, axis=0, ddof=1) ** 2 + eps
     features_loss = np.matmul(reconstruction_errors, 1 / batch_stds)
     # compute the number of obfuscated variables to reconstruct
     nb_reconstructed_variables = np.sum(obf_vars, axis=1)
@@ -64,7 +64,6 @@ def UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars, eps=1e-9):
     return loss
 
 
-
 @dataclass
 class UnsupMetricContainer:
     """Container holding a list of metrics.

diff --git a/tests/unsupervised_loss.py b/tests/unsupervised_loss.py
@@ -0,0 +1,25 @@
+import numpy as np
+import torch
+from pytorch_tabnet.metrics import UnsupervisedLoss, UnsupervisedLossNumpy
+
+torch.set_printoptions(precision=10)
+
+
+def test_equal_losses():
+    y_pred = np.random.uniform(low=-2, high=2, size=(20, 100))
+    embedded_x = np.random.uniform(low=-2, high=2, size=(20, 100))
+    obf_vars = np.random.choice([0, 1], size=(20, 100), replace=True)
+
+    numpy_loss = UnsupervisedLossNumpy(
+        y_pred=y_pred,
+        embedded_x=embedded_x,
+        obf_vars=obf_vars
+    )
+
+    torch_loss = UnsupervisedLoss(
+        y_pred=torch.tensor(y_pred, dtype=torch.float64),
+        embedded_x=torch.tensor(embedded_x, dtype=torch.float64),
+        obf_vars=torch.tensor(obf_vars, dtype=torch.float64)
+    )
+
+    assert np.isclose(numpy_loss, torch_loss.detach().numpy())