diff --git a/contrib/cache/dataframe.py b/contrib/cache/dataframe.py
index 415e6549073f0..b4ec9fbd4521c 100644
--- a/contrib/cache/dataframe.py
+++ b/contrib/cache/dataframe.py
@@ -129,7 +129,7 @@ def set(self, key, value, timeout=None):
                 value.to_hdf(tmp, 'df')
                 metadata['format'] = 'hdf'
                 metadata['read_args'] = {'key': 'df'}
-            except ImportError:
+            except Exception:
                 # PyTables is not installed, so fallback to pickle
                 pickle.dump(value, f, pickle.HIGHEST_PROTOCOL)
                 metadata['format'] = 'pickle'
diff --git a/contrib/connectors/pandas/models.py b/contrib/connectors/pandas/models.py
index 6b899346b1571..9af950c77b888 100644
--- a/contrib/connectors/pandas/models.py
+++ b/contrib/connectors/pandas/models.py
@@ -7,7 +7,9 @@
 from datetime import datetime
 import hashlib
 import logging
+from io import BytesIO
 from past.builtins import basestring
+import requests
 try:
     from urllib.parse import urlparse
 except ImportError:
@@ -18,7 +20,7 @@
     is_string_dtype, is_numeric_dtype, is_datetime64_any_dtype)
 from sqlalchemy import (
-    Column, Integer, String, ForeignKey, Text, or_
+    Column, Integer, String, ForeignKey, Text, and_, or_
 )
 import sqlalchemy as sa
 from sqlalchemy.orm import backref, relationship
@@ -105,6 +107,14 @@ def data(self):
                  'filterable', 'groupby')
         return {s: getattr(self, s) for s in attrs}
 
+    def get_perm(self):
+        if self.datasource:
+            return ('{parent_name}.[{obj.expression}]'
+                    '(id:{obj.id})').format(
+                obj=self,
+                parent_name=self.datasource.full_name)
+        return None
+
 
 class PandasMetric(Model, BaseMetric):
 
     """
@@ -124,8 +134,7 @@ class PandasMetric(Model, BaseMetric):
     source = Column(Text)
     expression = Column(Text)
 
-    @property
-    def perm(self):
+    def get_perm(self):
         if self.datasource:
             return ('{parent_name}.[{obj.metric_name}]'
                     '(id:{obj.id})').format(
@@ -165,6 +174,8 @@ class PandasDatasource(Model, BaseDatasource):
 
     name = Column(String(100), nullable=False)
     source_url = Column(String(1000), nullable=False)
+    source_auth = Column(JSONType)
+    source_parameters = Column(JSONType)
     format = Column(ChoiceType(FORMATS), nullable=False)
     additional_parameters = Column(JSONType)
 
@@ -296,7 +307,9 @@ def get_empty_dataframe(self):
 
     @property
     def cache_key(self):
-        source = {'source_url': self.source_url}
+        source = {'source_url': self.source_url,
+                  'source_auth': self.source_auth}
+        source.update(self.source_parameters or {})
         source.update(self.pandas_read_parameters)
         s = str([(k, source[k]) for k in sorted(source.keys())])
         return hashlib.md5(s.encode('utf-8')).hexdigest()
@@ -313,7 +326,25 @@ def get_dataframe(self):
         cache_key = self.cache_key
         self.df = dataframe_cache.get(cache_key)
         if not isinstance(self.df, pd.DataFrame):
-            self.df = self.pandas_read_method(self.source_url, **self.pandas_read_parameters)
+            if isinstance(self.source_url, basestring) and self.source_url[:4] == 'http':
+                # Use requests to retrieve remote data so we can handle authentication
+                auth = self.source_auth
+                url = self.source_url
+                if isinstance(auth, (tuple, list)):
+                    response = requests.get(url, params=self.source_parameters,
+                                            auth=tuple(auth))
+                elif auth:
+                    response = requests.get(url, params=self.source_parameters,
+                                            headers={'Authorization': auth})
+                else:
+                    response = requests.get(url, params=self.source_parameters)
+                response.raise_for_status()
+                data = BytesIO(response.content)
+            else:
+                # Local file, so just use Pandas directly
+                data = self.source_url
+            # Read the dataframe from the response
+            self.df = self.pandas_read_method(data, **self.pandas_read_parameters)
             # read_html returns a list of DataFrames
             if (isinstance(self.df, list) and
@@ -763,7 +794,6 @@ def get_metadata(self):
         """Build the metadata for the table and merge it in"""
         df = self.get_dataframe()
-        metrics = []
         any_date_col = None
         dbcols = (
             db.session.query(PandasColumn)
@@ -773,10 +803,20 @@
         dbcols = {dbcol.column_name: dbcol for dbcol in dbcols}
         for col in df.columns:
             dbcol = dbcols.get(col, None)
+
             if not dbcol:
-                dbcol = PandasColumn(column_name=str(col), type=df.dtypes[col].name)
-                dbcol.groupby = dbcol.is_string
-                dbcol.filterable = dbcol.is_string
+                dtype = df.dtypes[col].name
+                # Pandas defaults columns where all values are None to a dtype of
+                # float with all values as NaN, but if we can't correctly infer
+                # the dtype it is better to assume object so we don't
+                # create large numbers of unwanted Metrics
+                if self.df[col].isnull().all():
+                    dtype = 'object'
+                dbcol = PandasColumn(column_name=str(col), type=dtype)
+                # Only treat `object` as string if we have some data
+                if self.df[col].notnull().any():
+                    dbcol.groupby = dbcol.is_string
+                    dbcol.filterable = dbcol.is_string
             dbcol.sum = dbcol.is_num
             dbcol.avg = dbcol.is_num
             dbcol.min = dbcol.is_num or dbcol.is_dttm
@@ -786,70 +826,86 @@
             if not any_date_col and dbcol.is_time:
                 any_date_col = col
-            if dbcol.sum:
-                metrics.append(PandasMetric(
-                    metric_name='sum__' + dbcol.column_name,
-                    verbose_name='sum__' + dbcol.column_name,
-                    metric_type='sum',
-                    source=dbcol.column_name,
-                    expression='sum'
-                ))
-            if dbcol.avg:
-                metrics.append(PandasMetric(
-                    metric_name='avg__' + dbcol.column_name,
-                    verbose_name='avg__' + dbcol.column_name,
-                    metric_type='avg',
-                    source=dbcol.column_name,
-                    expression='mean'
-                ))
-            if dbcol.max:
-                metrics.append(PandasMetric(
-                    metric_name='max__' + dbcol.column_name,
-                    verbose_name='max__' + dbcol.column_name,
-                    metric_type='max',
-                    source=dbcol.column_name,
-                    expression='max'
-                ))
-            if dbcol.min:
-                metrics.append(PandasMetric(
-                    metric_name='min__' + dbcol.column_name,
-                    verbose_name='min__' + dbcol.column_name,
-                    metric_type='min',
-                    source=dbcol.column_name,
-                    expression='min'
-                ))
-            if dbcol.count_distinct:
-                metrics.append(PandasMetric(
-                    metric_name='count_distinct__' + dbcol.column_name,
-                    verbose_name='count_distinct__' + dbcol.column_name,
-                    metric_type='count_distinct',
-                    source=dbcol.column_name,
-                    expression='nunique'
-                ))
-            dbcol.type = df.dtypes[col].name
-
-        metrics.append(PandasMetric(
-            metric_name='count',
-            verbose_name='count',
-            metric_type='count',
-            source=None,
-            expression="count"
-        ))
-        dbmetrics = (
-            db.session.query(PandasMetric)
-            .filter(PandasMetric.datasource == self)
-            .filter(or_(PandasMetric.metric_name == metric.metric_name
-                        for metric in metrics)))
-        dbmetrics = {metric.metric_name: metric for metric in dbmetrics}
-        for metric in metrics:
-            metric.pandas_datasource_id = self.id
-            if not dbmetrics.get(metric.metric_name, None):
-                db.session.add(metric)
         if not self.main_dttm_col:
             self.main_dttm_col = any_date_col
+        db.session.merge(self)
         db.session.commit()
 
 
+def reconcile_column_metrics(mapper, connection, target):
+    """
+    Create or delete PandasMetrics to match the metric attributes
+    specified on a PandasColumn
+    """
+    metrics_table = PandasMetric.__table__
+    for metric_type in ('sum', 'avg', 'max', 'min', 'count_distinct'):
+        # Set up the metric attributes
+        metric_name = metric_type + '__' + target.column_name
+        verbose_name = metric_name
+        source = target.column_name
+        if metric_type == 'avg':
+            expression = 'mean'
+        elif metric_type == 'count_distinct':
+            expression = 'nunique'
+        else:
+            expression = metric_type
+
+        if getattr(target, metric_type):
+            # Create the metric if it doesn't already exist
+            result = connection.execute(
+                metrics_table
+                .select()
+                .where(
+                    and_(
+                        metrics_table.c.pandas_datasource_id == target.pandas_datasource_id,
+                        metrics_table.c.metric_name == metric_name)))
+            if not result.rowcount:
+                connection.execute(
+                    metrics_table.insert(),
+                    pandas_datasource_id=target.pandas_datasource_id,
+                    metric_name=metric_name,
+                    verbose_name=verbose_name,
+                    source=source,
+                    expression=expression)
+        else:
+            # Delete the metric if it exists and hasn't been customized
+            connection.execute(
+                metrics_table
+                .delete()
+                .where(
+                    and_(
+                        metrics_table.c.pandas_datasource_id == target.pandas_datasource_id,
+                        metrics_table.c.metric_name == metric_name,
+                        metrics_table.c.verbose_name == verbose_name,
+                        metrics_table.c.source == source,
+                        metrics_table.c.expression == expression)))
+
+
+def reconcile_metric_column(mapper, connection, target):
+    """
+    Clear the metric attribute on a PandasColumn if the
+    corresponding PandasMetric is deleted
+    """
+    column_table = PandasColumn.__table__
+    try:
+        metric_type, column_name = target.metric_name.split('__', 1)
+        if metric_type in column_table.c:
+            connection.execute(
+                column_table
+                .update()
+                .values(**{metric_type: False})
+                .where(
+                    and_(
+                        column_table.c.pandas_datasource_id == target.pandas_datasource_id,
+                        column_table.c.column_name == column_name)))
+    except ValueError:
+        # Metric name doesn't contain __
+        pass
+
+
+sa.event.listen(PandasColumn, 'after_insert', reconcile_column_metrics)
+sa.event.listen(PandasColumn, 'after_update', reconcile_column_metrics)
+sa.event.listen(PandasMetric, 'before_delete', reconcile_metric_column)
 sa.event.listen(PandasDatasource, 'after_insert', set_perm)
 sa.event.listen(PandasDatasource, 'after_update', set_perm)
diff --git a/contrib/connectors/pandas/views.py b/contrib/connectors/pandas/views.py
index f7a4bb30b2670..0d514e4530935 100644
--- a/contrib/connectors/pandas/views.py
+++ b/contrib/connectors/pandas/views.py
@@ -1,16 +1,18 @@
 """Views used by the SqlAlchemy connector"""
+import json
 import logging
 
 from past.builtins import basestring
 
 from flask import Markup, flash, redirect
 from flask_appbuilder import CompactCRUDMixin, expose
+from flask_appbuilder.fieldwidgets import BS3TextFieldWidget, BS3TextAreaFieldWidget
 from flask_appbuilder.models.sqla.interface import SQLAInterface
 import sqlalchemy as sa
 
 from flask_babel import lazy_gettext as _
 from flask_babel import gettext as __
-from wtforms import SelectField
+from wtforms import SelectField, StringField, validators
 
 from superset import appbuilder, db, utils, security, sm
 from superset.utils import has_access
@@ -34,6 +36,34 @@ def process_data(self, value):
         super(ChoiceTypeSelectField, self).process_data(value)
 
 
+class JSONField(StringField):
+    """
+    JSON field for WTForms that converts between the form string data
+    and a dictionary representation, with validation
+
+    See https://gist.github.com/dukebody/dcc371bf286534d546e9
+    """
+    def _value(self):
+        return json.dumps(self.data) if self.data else ''
+
+    def process_formdata(self, valuelist):
+        if valuelist:
+            try:
+                self.data = json.loads(valuelist[0])
+            except ValueError:
+                raise ValueError('This field contains invalid JSON')
+        else:
+            self.data = None
+
+    def pre_validate(self, form):
+        super(JSONField, self).pre_validate(form)
+        if self.data:
+            try:
+                json.dumps(self.data)
+            except TypeError:
+                raise ValueError('This field contains invalid JSON')
+
+
 class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView):  # noqa
     datamodel = SQLAInterface(PandasColumn)
@@ -42,16 +72,15 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView):  # noqa
     add_title = _('Add Column')
     edit_title = _('Edit Column')
 
-    can_delete = False
     list_widget = ListWidgetWithCheckboxes
     edit_columns = [
         'column_name', 'verbose_name', 'description', 'type', 'groupby', 'filterable',
-        'datasource', 'count_distinct', 'sum', 'min', 'max']
+        'datasource', 'count_distinct', 'sum', 'avg', 'min', 'max']
     add_columns = edit_columns
     list_columns = [
         'column_name', 'verbose_name', 'type', 'groupby', 'filterable',
-        'count_distinct', 'sum', 'min', 'max']
+        'count_distinct', 'sum', 'avg', 'min', 'max']
     page_size = 500
     description_columns = {
         'is_dttm': _(
@@ -76,6 +105,7 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView):  # noqa
         'datasource': _("Datasource"),
         'count_distinct': _("Count Distinct"),
         'sum': _("Sum"),
+        'avg': _("Average"),
         'min': _("Min"),
         'max': _("Max"),
         'type': _('Type'),
@@ -85,7 +115,7 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView):  # noqa
 appbuilder.add_view_no_menu(PandasColumnInlineView)
 
 
-class PandasMetricInlineView(CompactCRUDMixin, SupersetModelView):  # noqa
+class PandasMetricInlineView(CompactCRUDMixin, SupersetModelView, DeleteMixin):  # noqa
     datamodel = SQLAInterface(PandasMetric)
 
     list_title = _('List Metrics')
@@ -147,28 +177,52 @@ def post_update(self, metric):
 class PandasDatasourceModelView(DatasourceModelView, DeleteMixin):  # noqa
     datamodel = SQLAInterface(PandasDatasource)
 
-    list_title = _('List Pandas Datasources')
-    show_title = _('Show Pandas Datasource')
-    add_title = _('Add Pandas Datasource')
-    edit_title = _('Edit Pandas Datasource')
+    list_title = _('List File Datasources')
+    show_title = _('Show File Datasource')
+    add_title = _('Add File Datasource')
+    edit_title = _('Edit File Datasource')
 
     list_columns = [
         'link', 'changed_by_', 'modified']
     order_columns = [
         'link', 'changed_on_']
-    add_columns = ['name', 'source_url', 'format']
+    add_columns = ['name', 'source_url', 'source_auth', 'source_parameters',
+                   'format', 'additional_parameters']
     add_form_extra_fields = {
-        'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS)
+        'source_auth': JSONField(
+            _('Source Credentials'),
+            [validators.optional(), validators.length(max=100)],
+            widget=BS3TextFieldWidget(),
+            description=(
+                "Credentials required to access the raw data, if required. "
" + "Can be either a username and password in the form " + "'[\"username\", \"password\"]' which will be authenticated " + "using HTTP Basic Auth, or a string which will be used as " + "an Authorization header")), + 'source_parameters': JSONField( + _("Additional Query Parameters"), + [validators.optional(), validators.length(max=500)], + widget=BS3TextAreaFieldWidget(), + description=( + "A JSON-formatted dictionary of additional parameters " + "used to request the remote file")), + 'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS), + 'additional_parameters': JSONField( + _("Additional Read Parameters"), + [validators.optional(), validators.length(max=500)], + widget=BS3TextAreaFieldWidget(), + description=( + "A JSON-formatted dictionary of additional parameters " + "passed to the Pandas read function")), } edit_columns = [ - 'name', 'source_url', 'format', + 'name', 'source_url', 'source_auth', 'source_parameters', + 'format', 'additional_parameters', 'filter_select_enabled', 'slices', 'fetch_values_predicate', 'description', 'owner', 'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout'] - edit_form_extra_fields = { - 'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS) - } + edit_form_extra_fields = add_form_extra_fields show_columns = edit_columns + ['perm'] related_views = [PandasColumnInlineView, PandasMetricInlineView] base_order = ('changed_on', 'desc') @@ -191,10 +245,6 @@ class PandasDatasourceModelView(DatasourceModelView, DeleteMixin): # noqa "The URL used to access the raw data"), 'format': _( "The format of the raw data, e.g. csv"), - 'additional_parameters': _( - "A JSON-formatted dictionary of additional parameters " - "passed to the Pandas read_* function, " - "see https://pandas.pydata.org/pandas-docs/stable/api.html#input-output"), # NOQA 'description': Markup( "Supports " "markdown"), @@ -225,7 +275,6 @@ class PandasDatasourceModelView(DatasourceModelView, DeleteMixin): # noqa 'name': _("Name"), 'source_url': _("Source URL"), 'format': _("Format"), - 'additional_parameters': _("Additional Read Parameters"), 'fetch_values_predicate': _('Fetch Values Predicate'), 'owner': _("Owner"), 'main_dttm_col': _("Main Datetime Column"), @@ -282,8 +331,8 @@ def edit(self, pk): appbuilder.add_view( PandasDatasourceModelView, - "Pandas Datasources", - label=__("Pandas Datasources"), + "File Datasources", + label=__("File Datasources"), category="Sources", category_label=__("Sources"), icon='fa-file',) diff --git a/docs/files.rst b/docs/files.rst new file mode 100644 index 0000000000000..7b0040e4cdc37 --- /dev/null +++ b/docs/files.rst @@ -0,0 +1,107 @@ +Files +===== + +Superset can visualize data contained in flat files or outputted by REST APIs +using Pandas. This page clarifies the features and limitations of using File +Datasources with Superset. + +File Datasources do not use files uploaded directly to the Superset server. The +files are stored elsewhere and downloaded (and cached) by Superset when required. +This approach offers two important advantages over uploaded files: + +1. If the remote file is updated, then Superset will automatically download +and use the new data when the datasource timeout expires. If the file had +been uploaded manually then each change in the source data would require a +new manual upload. +2. Data can be provided by a REST API rather than an actual file. This provides +a method to use Superset to visualize current data stored in other systems +without needing to manually extract it first. + +.. 
+.. note::
+   A File Datasource downloads the full content of the source URL
+   and then performs all filtering, grouping and aggregation locally
+   on the Superset server. File Datasources that access large
+   volumes of data may impact server performance and affect other users.
+   Server administrators should ensure that adequate memory and CPU
+   resources are available before enabling the File Datasource.
+
+Installation
+''''''''''''
+
+File Datasources are not enabled by default. To enable them, the systems
+administrator should update the `superset_config.py` or other configuration
+file to include:
+
+.. code-block:: python
+
+    # Include additional data sources
+    ADDITIONAL_MODULE_DS_MAP = {
+        'contrib.connectors.pandas.models': ['PandasDatasource'],
+    }
+
+Supported Formats
+'''''''''''''''''
+
+File Datasources use the `Pandas library <https://pandas.pydata.org/>`_
+directly. With a default installation of Superset, Pandas can read the
+following formats:
+
+* csv
+* html
+* json
+* Microsoft Excel
+* Stata
+
+If the appropriate dependencies have also been installed then the following
+additional formats are supported:
+
+* HDF5 (if PyTables is installed: `pip install tables`)
+* Feather (if Feather is installed: `pip install feather-format`)
+
+See the `Pandas Dependencies
+<https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies>`_
+documentation for more information.
+
+Adding a File Datasource
+''''''''''''''''''''''''
+
+When you add a new File Datasource you need to provide the following information:
+
+* Source URL: the URL that the file to be visualized can be downloaded from.
+  This can be a file hosted on another server or on a file sharing platform
+  such as Dropbox, Google Drive or Amazon S3. It can also be the URL of a
+  REST API endpoint.
+* Source Credentials: if the Source URL requires authentication then specify
+  the credentials to be used. Credentials entered as ``["username", "password"]`` -
+  i.e. a username and password enclosed in quotation marks, separated by a
+  comma and surrounded by square brackets - will be treated as a username and
+  password pair, and the File Datasource will use HTTP Basic Auth to
+  authenticate to the remote server. Text in any other format will be passed
+  to the remote server as an ``Authorization`` header. Typically this is used
+  with API tokens issued by the remote server. Remote servers that require
+  authentication should also use an HTTPS Source URL.
+* Source Parameters: a JSON-formatted dictionary of additional query parameters
+  that are passed to the remote server. This field will not be required for
+  file downloads, but is useful for specifying requests against REST APIs.
+* Format: the format of the data returned by the remote server.
+* Read Parameters: a JSON-formatted dictionary of additional parameters that
+  are passed to Pandas when the file retrieved from the remote server is read
+  into a DataFrame.
+
+Aggregations
+''''''''''''
+
+Common aggregations can be defined and used in Superset.
+The simplest use case is to use the checkbox matrix exposed in your
+datasource's edit view (``Sources -> File Datasources ->
+[your datasource] -> Edit -> [tab] List Datasource Column``).
+Clicking the ``GroupBy`` and ``Filterable`` checkboxes will make the column
+appear in the related dropdowns while in explore view. Checking
+``Count Distinct``, ``Min``, ``Max``, ``Average`` or ``Sum`` will result in
+creating new metrics that will appear in the ``List Metrics`` tab.
+You can create your own aggregations manually from the ``List Metrics`` tab
+for more complex cases.
+
+Post-Aggregations
+'''''''''''''''''
+
+File Datasources allow post-aggregation in Superset. Any Metric that has been
+defined for the Datasource can be used as part of a Result Filter to limit
+the returned data.
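As a rough illustration of the Read Parameters field documented above (this sketch is not part of the patch itself), the JSON entered in that field is expected to be parsed and forwarded as keyword arguments to the Pandas read function that matches the chosen format; the ``sep`` and ``skiprows`` values and the file name below are hypothetical examples.

    import json

    import pandas as pd

    # Hypothetical JSON entered in the "Additional Read Parameters" field
    additional_parameters = json.loads('{"sep": ";", "skiprows": 1}')

    # The connector passes the parsed dictionary as keyword arguments to the
    # pandas read_* function for the selected format (read_csv for csv)
    df = pd.read_csv('local_copy.csv', **additional_parameters)
    print(df.head())

The same pattern applies to the other supported formats, using the keyword arguments of the corresponding Pandas read function.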
diff --git a/docs/index.rst b/docs/index.rst
index eba2e94516930..5a3b831630f28 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,10 +25,10 @@ intelligence web application endorsed by the ASF.
 
 
 Overview
-=======================================
+========
 
 Features
---------- 
+--------
 
 - A rich set of data visualizations
 - An easy-to-use interface for exploring and visualizing data
@@ -61,7 +61,7 @@ Features
 
 
 Contents
---------- 
+--------
 
 .. toctree::
     :maxdepth: 2
@@ -74,6 +74,7 @@ Contents
     videos
     gallery
     druid
+    files
     faq
 
diff --git a/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py b/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
index 5f45bbe6fc887..b8545d457bf9a 100644
--- a/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
+++ b/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
@@ -37,6 +37,8 @@ def upgrade():
         sa.Column('perm', sa.String(length=1000), nullable=True),
         sa.Column('name', sa.String(length=100), nullable=False),
         sa.Column('source_url', sa.String(length=1000), nullable=False),
+        sa.Column('source_auth', sqlalchemy_utils.types.json.JSONType(), nullable=True),
+        sa.Column('source_parameters', sqlalchemy_utils.types.json.JSONType(), nullable=True),
         sa.Column('format', sqlalchemy_utils.types.choice.ChoiceType(FORMATS), nullable=False),
         sa.Column('additional_parameters', sqlalchemy_utils.types.json.JSONType(), nullable=True),
         sa.Column('user_id', sa.Integer(), nullable=True),
diff --git a/superset/migrations/versions/ff40cbbdde10_.py b/superset/migrations/versions/ff40cbbdde10_.py
index a026c9d52c56c..917d31317bc89 100644
--- a/superset/migrations/versions/ff40cbbdde10_.py
+++ b/superset/migrations/versions/ff40cbbdde10_.py
@@ -8,7 +8,7 @@
 
 # revision identifiers, used by Alembic.
 revision = 'ff40cbbdde10'
-down_revision = ('f959a6652acd', 'b2cd059e8803')
+down_revision = ('4736ec66ce19', 'b2cd059e8803')
 
 from alembic import op
 import sqlalchemy as sa
diff --git a/superset/views/core.py b/superset/views/core.py
index ef0cbf5844b42..862397a2e5583 100755
--- a/superset/views/core.py
+++ b/superset/views/core.py
@@ -1336,6 +1336,9 @@ def checkbox(self, model_view, id_, attr, value):
             'TableColumnInlineView':
                 ConnectorRegistry.sources['table'].column_class,
         }
+        if 'pandas' in ConnectorRegistry.sources:
+            model = ConnectorRegistry.sources['pandas'].column_class
+            modelview_to_model['PandasColumnInlineView'] = model
         model = modelview_to_model[model_view]
         obj = db.session.query(model).filter_by(id=id_).first()
         if obj:
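For reference, here is a minimal sketch of how the Source Credentials handling added to PandasDatasource.get_dataframe above is expected to behave, using the requests and pandas libraries directly. It illustrates the intended behaviour under stated assumptions rather than reproducing the patch; the URL, credentials and query parameters are hypothetical.

    from io import BytesIO

    import pandas as pd
    import requests

    # Hypothetical datasource settings, mirroring the new model columns
    source_url = 'https://example.com/api/report'   # hypothetical endpoint
    source_auth = ['alice', 's3cr3t']                # list/tuple -> HTTP Basic Auth
    source_parameters = {'year': '2017'}             # extra query string parameters

    if isinstance(source_auth, (tuple, list)):
        # A two-element list/tuple is passed to requests as (username, password)
        response = requests.get(source_url, params=source_parameters,
                                auth=tuple(source_auth))
    elif source_auth:
        # Any other truthy value is sent verbatim as an Authorization header
        response = requests.get(source_url, params=source_parameters,
                                headers={'Authorization': source_auth})
    else:
        response = requests.get(source_url, params=source_parameters)
    response.raise_for_status()

    # The response body is wrapped in BytesIO and handed to the Pandas reader
    df = pd.read_csv(BytesIO(response.content))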