From ec561baf27db921d39d9ce5c9b1f2faf54f018b5 Mon Sep 17 00:00:00 2001 From: wbenbihi Date: Fri, 19 Aug 2022 12:35:04 +0800 Subject: [PATCH] [ADD][DOC] Notebook 02 -Split Train/Test demonstration --- notebooks/02 - Prepare Data.ipynb | 753 +++++++++++++++++++++++++++++- 1 file changed, 739 insertions(+), 14 deletions(-) diff --git a/notebooks/02 - Prepare Data.ipynb b/notebooks/02 - Prepare Data.ipynb index 156ca80..e8ff777 100644 --- a/notebooks/02 - Prepare Data.ipynb +++ b/notebooks/02 - Prepare Data.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -36,6 +36,8 @@ "import sys\n", "import re\n", "import json\n", + "import random\n", + "import itertools\n", "sys.path.append(os.path.join('..'))\n", "\n", "# Import Type Hints\n", @@ -92,12 +94,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "# Define Limbs\n", "\n", + "NUM_JOINTS = 16\n", + "\n", "# Joint Identifiers\n", "# RIGHT LOWER BODY: 0 - r ankle, 1 - r knee, 2 - r hip, \n", "# LEFT LOWER BODY : 3 - l hip, 4 - l knee, 5 - l ankle, \n", @@ -206,18 +210,10 @@ " lines += [\n", " (\n", " [\n", - " [\n", - " joints[limb_joints[0]][0], \n", - " joints[limb_joints[1]][0]\n", - " ], \n", - " [\n", - " joints[limb_joints[0]][1], \n", - " joints[limb_joints[1]][1]\n", - " ]\n", + " [joints[limb_joints[0]][0], joints[limb_joints[1]][0]], \n", + " [joints[limb_joints[0]][1], joints[limb_joints[1]][1]]\n", " ],\n", - " {\n", - " 'color':limb_colors[limb_name]\n", - " }\n", + " {'color':limb_colors[limb_name]}\n", " )\n", " for limb_name, limb_joints in limbs.items()\n", " if joints.get(limb_joints[0]) and joints.get(limb_joints[1])\n", @@ -248,7 +244,10 @@ "):\n", " image_path = os.path.join(IMAGE_PATH, datapoint.source_image)\n", " image = plt.imread(image_path)\n", - " joints = {j.id: (j.x, j.y) for j in datapoint.joints}\n", + " if 
isinstance(datapoint.joints, list):\n", + " joints = {j.id: (j.x, j.y) for j in datapoint.joints}\n", + " else:\n", + " joints = datapoint.joints\n", " plot_sample_with_joint(\n", " image=image, \n", " joints=joints, \n", @@ -491,6 +490,732 @@ { "cell_type": "markdown", "metadata": {}, + "source": [ + "We need to prepare the data where each record is a list of relevant information. For our purpose, we will use the following fields `source_image`, `bbox`, `joints`, `center`, `scale` to produce a table like array\n", + "\n", + "Headers:\n", + "- Static columns *(8 columns)*\n", + " - `source_image` Image to retrieve\n", + " - Needed to load the input image\n", + " - `scale` Scale of the person on the image\n", + " - Needed to filter image where the person might be too small\n", + " - `bbox_tl_x` Top Left BBox Corner X Coordinate\n", + " - `bbox_tl_y` Top Left BBox Corner Y Coordinate\n", + " - `bbox_br_x` Bottom Right BBox Corner X Coordinate\n", + " - `bbox_br_y` Bottom Right BBox Corner Y Coordinate\n", + " - `center_x` Person Center X Coordinate\n", + " - `center_y` Person Center Y Coordinate\n", + "- Dynamic columns *(3 x NUM_JOINTS)*\n", + " - `joint_{JOINT_ID}_x` Joint n°JOINT_ID X Coordinate\n", + " - `joint_{JOINT_ID}_y` Joint n°JOINT_ID Y Coordinate\n", + " - `joint_{JOINT_ID}_visible` Joint n°JOINT_ID Visibility as 0/1 flag\n", + " - ...\n", + "\n", + "For the MPII dataset given 16 joints we will generate a `(8 + 16 x 3) = 56 columns` table" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{6: HTFPersonJoint(x=356, y=290, id=6, visible=True),\n", + " 7: HTFPersonJoint(x=314, y=209, id=7, visible=True),\n", + " 8: HTFPersonJoint(x=315, y=213, id=8, visible=False),\n", + " 9: HTFPersonJoint(x=286, y=100, id=9, visible=False),\n", + " 0: HTFPersonJoint(x=325, y=349, id=0, visible=True),\n", + " 1: HTFPersonJoint(x=341, y=327, id=1, visible=True),\n", + " 2: HTFPersonJoint(x=326, 
y=302, id=2, visible=True),\n", + " 3: HTFPersonJoint(x=385, y=277, id=3, visible=True),\n", + " 4: HTFPersonJoint(x=409, y=302, id=4, visible=True),\n", + " 5: HTFPersonJoint(x=399, y=377, id=5, visible=True),\n", + " 10: HTFPersonJoint(x=318, y=325, id=10, visible=True),\n", + " 11: HTFPersonJoint(x=280, y=288, id=11, visible=True),\n", + " 12: HTFPersonJoint(x=267, y=231, id=12, visible=True),\n", + " 13: HTFPersonJoint(x=360, y=187, id=13, visible=True),\n", + " 14: HTFPersonJoint(x=406, y=218, id=14, visible=True),\n", + " 15: HTFPersonJoint(x=374, y=258, id=15, visible=True)}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Ensure datapoints' joints are in dict format\n", + "_ = [datapoint.convert_joint(dict) for datapoint in htf_obj]\n", + "# Now the 'joints' property is a dictionary\n", + "sample.joints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a `prepare_datapoint` as follows to generate such a table:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_datapoint(datapoint: HTFPersonDatapoint):\n", + " return [\n", + " *[\n", + " datapoint.source_image,\n", + " datapoint.scale,\n", + " datapoint.bbox.top_left.x,\n", + " datapoint.bbox.top_left.y,\n", + " datapoint.bbox.bottom_right.x,\n", + " datapoint.bbox.bottom_right.y,\n", + " datapoint.center.x,\n", + " datapoint.center.y,\n", + " ],\n", + " *itertools.chain(*[\n", + " [\n", + " datapoint.joints.get(index).x if index in datapoint.joints else None,\n", + " datapoint.joints.get(index).y if index in datapoint.joints else None,\n", + " int(datapoint.joints.get(index).visible) if index in datapoint.joints else None,\n", + " ]\n", + " for index in range(NUM_JOINTS)\n", + " ])\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Dataset shape (28883, 56)\n" + ] + } + ], + "source": [ + "# We generate the dataset\n", + "table_dataset = [prepare_datapoint(datapoint) for datapoint in htf_obj]\n", + "array_dataset = np.array(table_dataset)\n", + "# Print Shape\n", + "print(\"Dataset shape\", array_dataset.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**WARNING: Data Leakage**\n", + "\n", + "Obviously we will need to split this sample in a sub train/validation datasets. We could use a basic 80/20 random selection on the whole dataset, but we do not advise this approach as it might lead to data leakage. Indeed, many samples share the same image *(e.g 2 different person on the same image)*. To avoid this we will apply a random 80/20 selection on the source image to avoid this issue\n", + "\n", + "You need to be aware that the number of image do not represent exactly the sample size of your subsets since multiple person might be display on a single image" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image number having sufficient data: 17408\n", + "Estimated image on train set: 13926\n", + "Estimated image on validation set: 3481\n" + ] + } + ], + "source": [ + "print(\"Image number having sufficient data:\", len(set(array_dataset[:, 0])))\n", + "print(\"Estimated image on train set:\", int(0.8*len(set(array_dataset[:, 0]))))\n", + "print(\"Estimated image on validation set:\", int(0.2*len(set(array_dataset[:, 0]))))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "image_names = set(array_dataset[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/yt/n2trg5vn7q1gldwvjlgl8gc00000gn/T/ipykernel_47621/3915166120.py:2: DeprecationWarning: Sampling 
from a set deprecated\n", + "since Python 3.9 and will be removed in a subsequent version.\n", + " train_images = random.sample(image_names, int(0.8*(len(image_names))))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sizes valid: True\n", + "Set sizes [ 5695 23188]\n", + "Validation Set Size (5695, 56)\n", + "Train Set Size (23188, 56)\n" + ] + } + ], + "source": [ + "# We sample images for Train Set\n", + "train_images = random.sample(image_names, int(0.8*(len(image_names))))\n", + "# We Check images are selected only one time\n", + "print(\"Sizes valid:\", len(train_images) == len(set(train_images)))\n", + "# We select the index\n", + "train_idx = np.isin(array_dataset[:, 0], train_images)\n", + "validation_idx = np.logical_not(train_idx)\n", + "print(\"Set sizes\", np.bincount(train_idx))\n", + "# Generate Sets\n", + "train_array = array_dataset[train_idx]\n", + "validation_array = array_dataset[validation_idx]\n", + "print(\"Validation Set Size\", validation_array.shape)\n", + "print(\"Train Set Size\", train_array.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute Headers\n", + "headers = [\n", + " *[\n", + " 'image',\n", + " 'scale',\n", + " 'bbox_tl_x',\n", + " 'bbox_tl_y',\n", + " 'bbox_br_x',\n", + " 'bbox_br_y',\n", + " 'center_x',\n", + " 'center_y',\n", + " \n", + " ],\n", + " *itertools.chain(*[(f\"joint_{i}_X\", f\"joint_{i}_Y\", f\"joint_{i}_visible\") for i in range(NUM_JOINTS)])\n", + "]\n", + "# Visualize as DataFrame \n", + "train_df = pd.DataFrame.from_records(train_array, columns=headers)\n", + "validation_df = pd.DataFrame.from_records(validation_array, columns=headers)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "def sanitize_dataset(df: pd.DataFrame):\n", + " # Convert Coordinates as Int and fill NaN values with -1\n", + " df[\n", + " list(itertools.chain(*[ 
[f\"joint_{i}_X\", f\"joint_{i}_Y\"] for i in range(NUM_JOINTS)]))\n", + " ] = df.filter(\n", + " itertools.chain(*[ [f\"joint_{i}_X\", f\"joint_{i}_Y\"] for i in range(NUM_JOINTS)])\n", + " ).fillna(-1).astype(int)\n", + " # Fill NaN visibility values with 0 and Cast as Int\n", + " df[\n", + " [f\"joint_{i}_visible\" for i in range(NUM_JOINTS)]\n", + " ] = df.filter(\n", + " [f\"joint_{i}_visible\" for i in range(NUM_JOINTS)]\n", + " ).fillna(0).astype(int)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imagescalebbox_tl_xbbox_tl_ybbox_br_xbbox_br_ycenter_xcenter_yjoint_0_Xjoint_0_Y...joint_12_visiblejoint_13_Xjoint_13_Yjoint_13_visiblejoint_14_Xjoint_14_Yjoint_14_visiblejoint_15_Xjoint_15_Yjoint_15_visible
0015601864.jpg3.021046627627706706594257620394...1692185169324016883131
1015601864.jpg2.472117841841902902952222895293...09242061101320319552631
2015599452.jpg5.641276607607752752619329-1-1...0719299171151615454661
3015599452.jpg6.071051903903107010701010412-1-1...1114526911226475110964331
4015599452.jpg5.7281622727186186133315-1-1...1262511264231-1-10
..................................................................
23183084761779.jpg1.189877287287324324330208362350...1288177127919902681940
23184084761779.jpg0.900880333333361361372216457324...0349171133419303191940
23185084761779.jpg0.950352400400428428437207483292...0403179140320503861980
23186084761779.jpg0.905662212212244244230216-1-1...0214179119020211902010
23187084761779.jpg2.66572900767662338-1-1...1-1-10-1-10-1-10
\n", + "

23188 rows × 56 columns

\n", + "
" + ], + "text/plain": [ + " image scale bbox_tl_x bbox_tl_y bbox_br_x bbox_br_y \\\n", + "0 015601864.jpg 3.021046 627 627 706 706 \n", + "1 015601864.jpg 2.472117 841 841 902 902 \n", + "2 015599452.jpg 5.641276 607 607 752 752 \n", + "3 015599452.jpg 6.071051 903 903 1070 1070 \n", + "4 015599452.jpg 5.728162 27 27 186 186 \n", + "... ... ... ... ... ... ... \n", + "23183 084761779.jpg 1.189877 287 287 324 324 \n", + "23184 084761779.jpg 0.900880 333 333 361 361 \n", + "23185 084761779.jpg 0.950352 400 400 428 428 \n", + "23186 084761779.jpg 0.905662 212 212 244 244 \n", + "23187 084761779.jpg 2.665729 0 0 76 76 \n", + "\n", + " center_x center_y joint_0_X joint_0_Y ... joint_12_visible \\\n", + "0 594 257 620 394 ... 1 \n", + "1 952 222 895 293 ... 0 \n", + "2 619 329 -1 -1 ... 0 \n", + "3 1010 412 -1 -1 ... 1 \n", + "4 133 315 -1 -1 ... 1 \n", + "... ... ... ... ... ... ... \n", + "23183 330 208 362 350 ... 1 \n", + "23184 372 216 457 324 ... 0 \n", + "23185 437 207 483 292 ... 0 \n", + "23186 230 216 -1 -1 ... 0 \n", + "23187 62 338 -1 -1 ... 1 \n", + "\n", + " joint_13_X joint_13_Y joint_13_visible joint_14_X joint_14_Y \\\n", + "0 692 185 1 693 240 \n", + "1 924 206 1 1013 203 \n", + "2 719 299 1 711 516 \n", + "3 1145 269 1 1226 475 \n", + "4 26 251 1 26 423 \n", + "... ... ... ... ... ... \n", + "23183 288 177 1 279 199 \n", + "23184 349 171 1 334 193 \n", + "23185 403 179 1 403 205 \n", + "23186 214 179 1 190 202 \n", + "23187 -1 -1 0 -1 -1 \n", + "\n", + " joint_14_visible joint_15_X joint_15_Y joint_15_visible \n", + "0 1 688 313 1 \n", + "1 1 955 263 1 \n", + "2 1 545 466 1 \n", + "3 1 1096 433 1 \n", + "4 1 -1 -1 0 \n", + "... ... ... ... ... 
\n", + "23183 0 268 194 0 \n", + "23184 0 319 194 0 \n", + "23185 0 386 198 0 \n", + "23186 1 190 201 0 \n", + "23187 0 -1 -1 0 \n", + "\n", + "[23188 rows x 56 columns]" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sanitize_train_df = sanitize_dataset(train_df)\n", + "sanitize_validation_df = sanitize_dataset(validation_df)\n", + "sanitize_train_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is only for demonstration purpose, but this methodology is the one applied to train the model. Once frozen, the train/validation image list will be used as Model Metadata to ensure which image as been seen by the neural network during training. The next step would be to export the given set and use it to build a `tf.Dataset`. The next notebook will tackle this step" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 23188 entries, 0 to 23187\n", + "Data columns (total 56 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 image 23188 non-null object \n", + " 1 scale 23188 non-null float64\n", + " 2 bbox_tl_x 23188 non-null int64 \n", + " 3 bbox_tl_y 23188 non-null int64 \n", + " 4 bbox_br_x 23188 non-null int64 \n", + " 5 bbox_br_y 23188 non-null int64 \n", + " 6 center_x 23188 non-null int64 \n", + " 7 center_y 23188 non-null int64 \n", + " 8 joint_0_X 23188 non-null int64 \n", + " 9 joint_0_Y 23188 non-null int64 \n", + " 10 joint_0_visible 23188 non-null int64 \n", + " 11 joint_1_X 23188 non-null int64 \n", + " 12 joint_1_Y 23188 non-null int64 \n", + " 13 joint_1_visible 23188 non-null int64 \n", + " 14 joint_2_X 23188 non-null int64 \n", + " 15 joint_2_Y 23188 non-null int64 \n", + " 16 joint_2_visible 23188 non-null int64 \n", + " 17 joint_3_X 23188 non-null int64 \n", + " 18 joint_3_Y 
23188 non-null int64 \n", + " 19 joint_3_visible 23188 non-null int64 \n", + " 20 joint_4_X 23188 non-null int64 \n", + " 21 joint_4_Y 23188 non-null int64 \n", + " 22 joint_4_visible 23188 non-null int64 \n", + " 23 joint_5_X 23188 non-null int64 \n", + " 24 joint_5_Y 23188 non-null int64 \n", + " 25 joint_5_visible 23188 non-null int64 \n", + " 26 joint_6_X 23188 non-null int64 \n", + " 27 joint_6_Y 23188 non-null int64 \n", + " 28 joint_6_visible 23188 non-null int64 \n", + " 29 joint_7_X 23188 non-null int64 \n", + " 30 joint_7_Y 23188 non-null int64 \n", + " 31 joint_7_visible 23188 non-null int64 \n", + " 32 joint_8_X 23188 non-null int64 \n", + " 33 joint_8_Y 23188 non-null int64 \n", + " 34 joint_8_visible 23188 non-null int64 \n", + " 35 joint_9_X 23188 non-null int64 \n", + " 36 joint_9_Y 23188 non-null int64 \n", + " 37 joint_9_visible 23188 non-null int64 \n", + " 38 joint_10_X 23188 non-null int64 \n", + " 39 joint_10_Y 23188 non-null int64 \n", + " 40 joint_10_visible 23188 non-null int64 \n", + " 41 joint_11_X 23188 non-null int64 \n", + " 42 joint_11_Y 23188 non-null int64 \n", + " 43 joint_11_visible 23188 non-null int64 \n", + " 44 joint_12_X 23188 non-null int64 \n", + " 45 joint_12_Y 23188 non-null int64 \n", + " 46 joint_12_visible 23188 non-null int64 \n", + " 47 joint_13_X 23188 non-null int64 \n", + " 48 joint_13_Y 23188 non-null int64 \n", + " 49 joint_13_visible 23188 non-null int64 \n", + " 50 joint_14_X 23188 non-null int64 \n", + " 51 joint_14_Y 23188 non-null int64 \n", + " 52 joint_14_visible 23188 non-null int64 \n", + " 53 joint_15_X 23188 non-null int64 \n", + " 54 joint_15_Y 23188 non-null int64 \n", + " 55 joint_15_visible 23188 non-null int64 \n", + "dtypes: float64(1), int64(54), object(1)\n", + "memory usage: 9.9+ MB\n" + ] + } + ], + "source": [ + "sanitize_train_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ],