From e018131252ce87a0fcd091e49b2e1f3f0a6a8508 Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Tue, 22 Nov 2022 10:12:42 -0300 Subject: [PATCH] feat: report comparison example (#1160) * feat: add report comparison example --- examples/report_comparison/comparison.ipynb | 120 ++++++++++++++++++++ examples/report_comparison/comparison.py | 20 ++++ 2 files changed, 140 insertions(+) create mode 100644 examples/report_comparison/comparison.ipynb create mode 100644 examples/report_comparison/comparison.py diff --git a/examples/report_comparison/comparison.ipynb b/examples/report_comparison/comparison.ipynb new file mode 100644 index 000000000..bada1575f --- /dev/null +++ b/examples/report_comparison/comparison.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "54253ce4", + "metadata": { + "cell_style": "center" + }, + "outputs": [], + "source": [ + "# Installed packages\n", + "import pandas as pd\n", + "\n", + "# Our package\n", + "from pandas_profiling import ProfileReport\n", + "from pandas_profiling.utils.cache import cache_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134987f5", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the Titanic Dataset\n", + "file_name = cache_file(\n", + " \"titanic.csv\",\n", + " \"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv\",\n", + ")\n", + "df = pd.read_csv(file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21dda1c6", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the Profiling Report from 2 samples from titanic dataset\n", + "profile1 = ProfileReport(df.sample(frac=0.5))\n", + "profile2 = ProfileReport(df.sample(frac=0.5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe83402f", + "metadata": {}, + "outputs": [], + "source": [ + "# compare the profiles and generate a comparison profile\n", + "comparison = profile1.compare(profile2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad00f47", + "metadata": {}, + "outputs": [], + "source": [ + "# display the html profile in an iframe\n", + "comparison" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/report_comparison/comparison.py b/examples/report_comparison/comparison.py new file mode 100644 index 000000000..cc5cdc14a --- /dev/null +++ b/examples/report_comparison/comparison.py @@ -0,0 +1,20 @@ +import pandas as pd + +from pandas_profiling import ProfileReport +from pandas_profiling.utils.cache import cache_file + +if __name__ == "__main__": + # Read the Titanic Dataset + file_name = cache_file( + "titanic.csv", + "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv", + ) + df = pd.read_csv(file_name) + + # Generate the Profiling Report from 2 samples from titanic dataset + profile1 = ProfileReport(df.sample(frac=0.5)) + profile2 = ProfileReport(df.sample(frac=0.5)) + + # compare the profiles and generate a comparison profile + comparison = profile1.compare(profile2) + comparison.to_file("profile_comparison.html")