diff --git a/src/arche/__init__.py b/src/arche/__init__.py
index 109f08f..85543f2 100755
--- a/src/arche/__init__.py
+++ b/src/arche/__init__.py
@@ -5,11 +5,21 @@
 SH_URL = "https://app.scrapinghub.com/p"  # noqa
 
 from arche.arche import Arche
+from arche.tools import dataframe
 from arche.tools.schema import basic_json_schema
 import numpy as np
 import pandas as pd
 import plotly.io as pio
 
+# Show DataFrames in notebooks with arche's HTML repr. Importing
+# `arche.tools.dataframe` above registers `display.render_links`, so the
+# option can be set here.
+pd.DataFrame._repr_html_ = dataframe._repr_html_
+# NOTE(review): -1 is presumably the "no width limit" value for the targeted
+# pandas version, keeping long URLs untruncated - confirm before upgrading.
+pd.set_option("display.max_colwidth", -1)
+pd.set_option("display.render_links", True)
+
 pio.renderers.default = "notebook_connected+jupyterlab"
 
 __all__ = ["basic_json_schema", "Arche", "np", "pd"]
diff --git a/src/arche/tools/dataframe.py b/src/arche/tools/dataframe.py
new file mode 100644
index 0000000..d1e93f3
--- /dev/null
+++ b/src/arche/tools/dataframe.py
@@ -0,0 +1,76 @@
+from io import StringIO
+from typing import Optional
+
+from pandas._config import get_option, config
+from pandas.io.formats import format as fmt
+
+
+pc_render_links_doc = """
+: bool
+    This sets if URLs in DataFrame should be rendered as clickable anchors.
+"""
+
+# Register the `display.render_links` option (default True) at import time so
+# that `_repr_html_` below can read it through `get_option`.
+with config.config_prefix("display"):
+    config.register_option(
+        "render_links", True, pc_render_links_doc, validator=config.is_bool
+    )
+
+
+def _repr_html_(self) -> Optional[str]:
+    """
+    Return an HTML representation of the DataFrame, mainly for IPython notebooks.
+
+    When ``self._info_repr()`` is true, the ``self.info()`` summary is returned
+    inside ``<pre>`` tags. Otherwise, if the ``display.notebook_repr_html``
+    option is enabled, the frame is rendered as an HTML table honouring the
+    ``display.max_rows``, ``display.min_rows``, ``display.max_columns``,
+    ``display.show_dimensions`` and ``display.render_links`` options.
+
+    Returns:
+        The HTML string, or ``None`` when ``display.notebook_repr_html`` is off.
+    """
+    if self._info_repr():
+        buf = StringIO("")
+        self.info(buf=buf)
+        # The summary starts with "<class ...>"; escape that first "<" and ">"
+        # so the browser shows it as text instead of parsing it as a tag.
+        val = buf.getvalue().replace("<", r"&lt;", 1)
+        val = val.replace(">", r"&gt;", 1)
+        return "<pre>" + val + "</pre>"
+
+    if get_option("display.notebook_repr_html"):
+        max_rows = get_option("display.max_rows")
+        min_rows = get_option("display.min_rows")
+        max_cols = get_option("display.max_columns")
+        show_dimensions = get_option("display.show_dimensions")
+        render_links = get_option("display.render_links")
+
+        formatter = fmt.DataFrameFormatter(
+            self,
+            columns=None,
+            col_space=None,
+            na_rep="NaN",
+            formatters=None,
+            float_format=None,
+            sparsify=None,
+            justify=None,
+            index_names=True,
+            header=True,
+            index=True,
+            bold_rows=True,
+            escape=True,
+            max_rows=max_rows,
+            min_rows=min_rows,
+            max_cols=max_cols,
+            show_dimensions=show_dimensions,
+            decimal=".",
+            table_id=None,
+            render_links=render_links,
+        )
+        # The rendered HTML is read back from the formatter's own buffer.
+        formatter.to_html(notebook=True)
+        return formatter.buf.getvalue()
+    else:
+        return None
diff --git a/tests/tools/test_dataframe_html_output.py b/tests/tools/test_dataframe_html_output.py
new file mode 100644
index 0000000..3c806e9
--- /dev/null
+++ b/tests/tools/test_dataframe_html_output.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import pytest
+
+
+@pytest.fixture()
+def df_with_urls():
+    pd.set_option("display.notebook_repr_html", True)
+    data = {"col1": [1, 2], "col2": ["http://foo.com", "https://bar.com"]}
+    return pd.DataFrame(data)
+
+
+def test_df_has_clickable_urls(df_with_urls):
+    html = df_with_urls._repr_html_()
+
+    assert '<a href="http://foo.com" target="_blank">http://foo.com</a>' in html
+    assert '<a href="https://bar.com" target="_blank">https://bar.com</a>' in html
+
+
+def test_derivaded_df_has_clickable_urls(df_with_urls):
+    html = df_with_urls.head()._repr_html_()
+    assert '<a href="http://foo.com" target="_blank">http://foo.com</a>' in html
+    assert '<a href="https://bar.com" target="_blank">https://bar.com</a>' in html
+
+
+def test_arche_df_does_not_add_links_if_no_url_found():
+    df = pd.DataFrame({"col1": [1, 2], "col2": ["foo", "bar"]})
+    html = df._repr_html_()
+    assert "&lt;class 'pandas.core.frame.DataFrame'&gt;\n" in html
+    assert "