From e018131252ce87a0fcd091e49b2e1f3f0a6a8508 Mon Sep 17 00:00:00 2001
From: Alex Barros <alexbarros@users.noreply.github.com>
Date: Tue, 22 Nov 2022 10:12:42 -0300
Subject: [PATCH] feat: report comparison example (#1160)

* feat: add report comparison example
---
 examples/report_comparison/comparison.ipynb | 120 ++++++++++++++++++++
 examples/report_comparison/comparison.py    |  20 ++++
 2 files changed, 140 insertions(+)
 create mode 100644 examples/report_comparison/comparison.ipynb
 create mode 100644 examples/report_comparison/comparison.py

diff --git a/examples/report_comparison/comparison.ipynb b/examples/report_comparison/comparison.ipynb
new file mode 100644
index 000000000..bada1575f
--- /dev/null
+++ b/examples/report_comparison/comparison.ipynb
@@ -0,0 +1,120 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54253ce4",
+   "metadata": {
+    "cell_style": "center"
+   },
+   "outputs": [],
+   "source": [
+    "# Installed packages\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Our package\n",
+    "from pandas_profiling import ProfileReport\n",
+    "from pandas_profiling.utils.cache import cache_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "134987f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the Titanic Dataset\n",
+    "file_name = cache_file(\n",
+    "    \"titanic.csv\",\n",
+    "    \"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv\",\n",
+    ")\n",
+    "df = pd.read_csv(file_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21dda1c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate a profiling report for each of two random 50% samples of the Titanic dataset\n",
+    "profile1 = ProfileReport(df.sample(frac=0.5))\n",
+    "profile2 = ProfileReport(df.sample(frac=0.5))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe83402f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compare the two profiles and generate a comparison report\n",
+    "comparison = profile1.compare(profile2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ad00f47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display the HTML comparison report in an iframe\n",
+    "comparison"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/report_comparison/comparison.py b/examples/report_comparison/comparison.py
new file mode 100644
index 000000000..cc5cdc14a
--- /dev/null
+++ b/examples/report_comparison/comparison.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_file
+
+if __name__ == "__main__":
+    # Load the Titanic dataset; cache_file is given the file name and its source URL
+    file_name = cache_file(
+        "titanic.csv",
+        "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
+    )
+    df = pd.read_csv(file_name)
+
+    # Profile two independent random 50% samples (no seed is set, so they vary per run)
+    profile1 = ProfileReport(df.sample(frac=0.5))
+    profile2 = ProfileReport(df.sample(frac=0.5))
+
+    # Compare the two profiles and write the comparison report to an HTML file
+    comparison = profile1.compare(profile2)
+    comparison.to_file("profile_comparison.html")