diff --git a/contrib/cache/dataframe.py b/contrib/cache/dataframe.py
index 415e6549073f0..b4ec9fbd4521c 100644
--- a/contrib/cache/dataframe.py
+++ b/contrib/cache/dataframe.py
@@ -129,7 +129,7 @@ def set(self, key, value, timeout=None):
value.to_hdf(tmp, 'df')
metadata['format'] = 'hdf'
metadata['read_args'] = {'key': 'df'}
- except ImportError:
+ except Exception:
# PyTables is not installed, so fallback to pickle
pickle.dump(value, f, pickle.HIGHEST_PROTOCOL)
metadata['format'] = 'pickle'
diff --git a/contrib/connectors/pandas/models.py b/contrib/connectors/pandas/models.py
index 6b899346b1571..9af950c77b888 100644
--- a/contrib/connectors/pandas/models.py
+++ b/contrib/connectors/pandas/models.py
@@ -7,7 +7,9 @@
from datetime import datetime
import hashlib
import logging
+from io import BytesIO
from past.builtins import basestring
+import requests
try:
from urllib.parse import urlparse
except ImportError:
@@ -18,7 +20,7 @@
is_string_dtype, is_numeric_dtype, is_datetime64_any_dtype)
from sqlalchemy import (
- Column, Integer, String, ForeignKey, Text, or_
+ Column, Integer, String, ForeignKey, Text, and_, or_
)
import sqlalchemy as sa
from sqlalchemy.orm import backref, relationship
@@ -105,6 +107,14 @@ def data(self):
'filterable', 'groupby')
return {s: getattr(self, s) for s in attrs}
+ def get_perm(self):
+ if self.datasource:
+ return ('{parent_name}.[{obj.expression}]'
+ '(id:{obj.id})').format(
+ obj=self,
+ parent_name=self.datasource.full_name)
+ return None
+
class PandasMetric(Model, BaseMetric):
"""
@@ -124,8 +134,7 @@ class PandasMetric(Model, BaseMetric):
source = Column(Text)
expression = Column(Text)
- @property
- def perm(self):
+ def get_perm(self):
if self.datasource:
return ('{parent_name}.[{obj.metric_name}]'
'(id:{obj.id})').format(
@@ -165,6 +174,8 @@ class PandasDatasource(Model, BaseDatasource):
name = Column(String(100), nullable=False)
source_url = Column(String(1000), nullable=False)
+ source_auth = Column(JSONType)
+ source_parameters = Column(JSONType)
format = Column(ChoiceType(FORMATS), nullable=False)
additional_parameters = Column(JSONType)
@@ -296,7 +307,9 @@ def get_empty_dataframe(self):
@property
def cache_key(self):
- source = {'source_url': self.source_url}
+ source = {'source_url': self.source_url,
+ 'source_auth': self.source_auth}
+ source.update(self.source_parameters or {})
source.update(self.pandas_read_parameters)
s = str([(k, source[k]) for k in sorted(source.keys())])
return hashlib.md5(s.encode('utf-8')).hexdigest()
@@ -313,7 +326,25 @@ def get_dataframe(self):
cache_key = self.cache_key
self.df = dataframe_cache.get(cache_key)
if not isinstance(self.df, pd.DataFrame):
- self.df = self.pandas_read_method(self.source_url, **self.pandas_read_parameters)
+ if isinstance(self.source_url, basestring) and self.source_url[:4] == 'http':
+ # Use requests to retrieve remote data so we can handle authentication
+ auth = self.source_auth
+ url = self.source_url
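+ # A list or tuple is treated as a (username, password) pair for HTTP
+ # Basic Auth; any other truthy value is sent verbatim in the
+ # Authorization header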
+ if isinstance(auth, (tuple, list)):
+ response = requests.get(url, params=self.source_parameters,
+ auth=tuple(auth))
+ elif auth:
+ response = requests.get(url, params=self.source_parameters,
+ headers={'Authorization': auth})
+ else:
+ response = requests.get(url, params=self.source_parameters)
+ response.raise_for_status()
+ data = BytesIO(response.content)
+ else:
+ # Local file, so just use Pandas directly
+ data = self.source_url
+ # Read the dataframe from the response
+ self.df = self.pandas_read_method(data, **self.pandas_read_parameters)
# read_html returns a list of DataFrames
if (isinstance(self.df, list) and
@@ -763,7 +794,6 @@ def get_metadata(self):
"""Build the metadata for the table and merge it in"""
df = self.get_dataframe()
- metrics = []
any_date_col = None
dbcols = (
db.session.query(PandasColumn)
@@ -773,10 +803,20 @@ def get_metadata(self):
dbcols = {dbcol.column_name: dbcol for dbcol in dbcols}
for col in df.columns:
dbcol = dbcols.get(col, None)
+
if not dbcol:
- dbcol = PandasColumn(column_name=str(col), type=df.dtypes[col].name)
- dbcol.groupby = dbcol.is_string
- dbcol.filterable = dbcol.is_string
+ dtype = df.dtypes[col].name
+ # Pandas defaults columns where all values are None to a float
+ # dtype with all values as NaN, but if we can't correctly infer
+ # the dtype it is better to assume object so that we don't
+ # create large numbers of unwanted metrics
+ if self.df[col].isnull().all():
+ dtype = 'object'
+ dbcol = PandasColumn(column_name=str(col), type=dtype)
+ # Only treat `object` as string if we have some data
+ if self.df[col].notnull().any():
+ dbcol.groupby = dbcol.is_string
+ dbcol.filterable = dbcol.is_string
dbcol.sum = dbcol.is_num
dbcol.avg = dbcol.is_num
dbcol.min = dbcol.is_num or dbcol.is_dttm
@@ -786,70 +826,86 @@ def get_metadata(self):
if not any_date_col and dbcol.is_time:
any_date_col = col
- if dbcol.sum:
- metrics.append(PandasMetric(
- metric_name='sum__' + dbcol.column_name,
- verbose_name='sum__' + dbcol.column_name,
- metric_type='sum',
- source=dbcol.column_name,
- expression='sum'
- ))
- if dbcol.avg:
- metrics.append(PandasMetric(
- metric_name='avg__' + dbcol.column_name,
- verbose_name='avg__' + dbcol.column_name,
- metric_type='avg',
- source=dbcol.column_name,
- expression='mean'
- ))
- if dbcol.max:
- metrics.append(PandasMetric(
- metric_name='max__' + dbcol.column_name,
- verbose_name='max__' + dbcol.column_name,
- metric_type='max',
- source=dbcol.column_name,
- expression='max'
- ))
- if dbcol.min:
- metrics.append(PandasMetric(
- metric_name='min__' + dbcol.column_name,
- verbose_name='min__' + dbcol.column_name,
- metric_type='min',
- source=dbcol.column_name,
- expression='min'
- ))
- if dbcol.count_distinct:
- metrics.append(PandasMetric(
- metric_name='count_distinct__' + dbcol.column_name,
- verbose_name='count_distinct__' + dbcol.column_name,
- metric_type='count_distinct',
- source=dbcol.column_name,
- expression='nunique'
- ))
- dbcol.type = df.dtypes[col].name
-
- metrics.append(PandasMetric(
- metric_name='count',
- verbose_name='count',
- metric_type='count',
- source=None,
- expression="count"
- ))
- dbmetrics = (
- db.session.query(PandasMetric)
- .filter(PandasMetric.datasource == self)
- .filter(or_(PandasMetric.metric_name == metric.metric_name
- for metric in metrics)))
- dbmetrics = {metric.metric_name: metric for metric in dbmetrics}
- for metric in metrics:
- metric.pandas_datasource_id = self.id
- if not dbmetrics.get(metric.metric_name, None):
- db.session.add(metric)
if not self.main_dttm_col:
self.main_dttm_col = any_date_col
+
db.session.merge(self)
db.session.commit()
+
+
+def reconcile_column_metrics(mapper, connection, target):
+ """
+ Create or delete PandasMetrics to match the metric attributes
+ specified on a PandasColumn
+ """
+ metrics_table = PandasMetric.__table__
+ for metric_type in ('sum', 'avg', 'max', 'min', 'count_distinct'):
+ # Set up the metric attributes
+ metric_name = metric_type + '__' + target.column_name
+ verbose_name = metric_name
+ source = target.column_name
+ if metric_type == 'avg':
+ expression = 'mean'
+ elif metric_type == 'count_distinct':
+ expression = 'nunique'
+ else:
+ expression = metric_type
+
+ if getattr(target, metric_type):
+ # Create the metric if it doesn't already exist
+ result = connection.execute(
+ metrics_table
+ .select()
+ .where(
+ and_(
+ metrics_table.c.pandas_datasource_id == target.pandas_datasource_id,
+ metrics_table.c.metric_name == metric_name)))
+ if not result.fetchone():
+ connection.execute(
+ metrics_table.insert(),
+ pandas_datasource_id=target.pandas_datasource_id,
+ metric_name=metric_name,
+ verbose_name=verbose_name,
+ source=source,
+ expression=expression)
+ else:
+ # Delete the metric if it exists and hasn't been customized
+ connection.execute(
+ metrics_table
+ .delete()
+ .where(
+ and_(
+ metrics_table.c.pandas_datasource_id == target.pandas_datasource_id,
+ metrics_table.c.metric_name == metric_name,
+ metrics_table.c.verbose_name == verbose_name,
+ metrics_table.c.source == source,
+ metrics_table.c.expression == expression)))
+
+
+def reconcile_metric_column(mapper, connection, target):
+ """
+ Clear the metric attribute on a PandasColumn if the
+ corresponding PandasMetric is deleted
+ """
+ column_table = PandasColumn.__table__
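+ # Metric names created by reconcile_column_metrics follow the
+ # '<metric_type>__<column_name>' naming convention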
+ try:
+ metric_type, column_name = target.metric_name.split('__', 1)
+ if metric_type in column_table.c:
+ connection.execute(
+ column_table
+ .update()
+ .values(**{metric_type: False})
+ .where(
+ and_(
+ column_table.c.pandas_datasource_id == target.pandas_datasource_id,
+ column_table.c.column_name == column_name)))
+ except ValueError:
+ # Metric name doesn't contain __
+ pass
+
+
+sa.event.listen(PandasColumn, 'after_insert', reconcile_column_metrics)
+sa.event.listen(PandasColumn, 'after_update', reconcile_column_metrics)
+sa.event.listen(PandasMetric, 'before_delete', reconcile_metric_column)
sa.event.listen(PandasDatasource, 'after_insert', set_perm)
sa.event.listen(PandasDatasource, 'after_update', set_perm)
diff --git a/contrib/connectors/pandas/views.py b/contrib/connectors/pandas/views.py
index f7a4bb30b2670..0d514e4530935 100644
--- a/contrib/connectors/pandas/views.py
+++ b/contrib/connectors/pandas/views.py
@@ -1,16 +1,18 @@
"""Views used by the SqlAlchemy connector"""
+import json
import logging
from past.builtins import basestring
from flask import Markup, flash, redirect
from flask_appbuilder import CompactCRUDMixin, expose
+from flask_appbuilder.fieldwidgets import BS3TextFieldWidget, BS3TextAreaFieldWidget
from flask_appbuilder.models.sqla.interface import SQLAInterface
import sqlalchemy as sa
from flask_babel import lazy_gettext as _
from flask_babel import gettext as __
-from wtforms import SelectField
+from wtforms import SelectField, StringField, validators
from superset import appbuilder, db, utils, security, sm
from superset.utils import has_access
@@ -34,6 +36,34 @@ def process_data(self, value):
super(ChoiceTypeSelectField, self).process_data(value)
+class JSONField(StringField):
+ """
+ JSON field for WTForms that converts between the form string data
+ and a dictionary representation, with validation
+
+ See https://gist.github.com/dukebody/dcc371bf286534d546e9
+ """
+ def _value(self):
+ return json.dumps(self.data) if self.data else ''
+
+ def process_formdata(self, valuelist):
+ if valuelist:
+ try:
+ self.data = json.loads(valuelist[0])
+ except ValueError:
+ raise ValueError('This field contains invalid JSON')
+ else:
+ self.data = None
+
+ def pre_validate(self, form):
+ super(JSONField, self).pre_validate(form)
+ if self.data:
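+ # Ensure the parsed data can be re-serialized, i.e. it only
+ # contains JSON-compatible types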
+ try:
+ json.dumps(self.data)
+ except TypeError:
+ raise ValueError('This field contains invalid JSON')
+
+
class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView): # noqa
datamodel = SQLAInterface(PandasColumn)
@@ -42,16 +72,15 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView): # noqa
add_title = _('Add Column')
edit_title = _('Edit Column')
- can_delete = False
list_widget = ListWidgetWithCheckboxes
edit_columns = [
'column_name', 'verbose_name', 'description',
'type', 'groupby', 'filterable',
- 'datasource', 'count_distinct', 'sum', 'min', 'max']
+ 'datasource', 'count_distinct', 'sum', 'avg', 'min', 'max']
add_columns = edit_columns
list_columns = [
'column_name', 'verbose_name', 'type', 'groupby', 'filterable',
- 'count_distinct', 'sum', 'min', 'max']
+ 'count_distinct', 'sum', 'avg', 'min', 'max']
page_size = 500
description_columns = {
'is_dttm': _(
@@ -76,6 +105,7 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView): # noqa
'datasource': _("Datasource"),
'count_distinct': _("Count Distinct"),
'sum': _("Sum"),
+ 'avg': _("Average"),
'min': _("Min"),
'max': _("Max"),
'type': _('Type'),
@@ -85,7 +115,7 @@ class PandasColumnInlineView(CompactCRUDMixin, SupersetModelView): # noqa
appbuilder.add_view_no_menu(PandasColumnInlineView)
-class PandasMetricInlineView(CompactCRUDMixin, SupersetModelView): # noqa
+class PandasMetricInlineView(CompactCRUDMixin, SupersetModelView, DeleteMixin): # noqa
datamodel = SQLAInterface(PandasMetric)
list_title = _('List Metrics')
@@ -147,28 +177,52 @@ def post_update(self, metric):
class PandasDatasourceModelView(DatasourceModelView, DeleteMixin): # noqa
datamodel = SQLAInterface(PandasDatasource)
- list_title = _('List Pandas Datasources')
- show_title = _('Show Pandas Datasource')
- add_title = _('Add Pandas Datasource')
- edit_title = _('Edit Pandas Datasource')
+ list_title = _('List File Datasources')
+ show_title = _('Show File Datasource')
+ add_title = _('Add File Datasource')
+ edit_title = _('Edit File Datasource')
list_columns = [
'link', 'changed_by_', 'modified']
order_columns = [
'link', 'changed_on_']
- add_columns = ['name', 'source_url', 'format']
+ add_columns = ['name', 'source_url', 'source_auth', 'source_parameters',
+ 'format', 'additional_parameters']
add_form_extra_fields = {
- 'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS)
+ 'source_auth': JSONField(
+ _('Source Credentials'),
+ [validators.optional(), validators.length(max=100)],
+ widget=BS3TextFieldWidget(),
+ description=(
+ "Credentials required to access the raw data, if required. "
+ "Can be either a username and password in the form "
+ "'[\"username\", \"password\"]' which will be authenticated "
+ "using HTTP Basic Auth, or a string which will be used as "
+ "an Authorization header")),
+ 'source_parameters': JSONField(
+ _("Additional Query Parameters"),
+ [validators.optional(), validators.length(max=500)],
+ widget=BS3TextAreaFieldWidget(),
+ description=(
+ "A JSON-formatted dictionary of additional parameters "
+ "used to request the remote file")),
+ 'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS),
+ 'additional_parameters': JSONField(
+ _("Additional Read Parameters"),
+ [validators.optional(), validators.length(max=500)],
+ widget=BS3TextAreaFieldWidget(),
+ description=(
+ "A JSON-formatted dictionary of additional parameters "
+ "passed to the Pandas read function")),
}
edit_columns = [
- 'name', 'source_url', 'format',
+ 'name', 'source_url', 'source_auth', 'source_parameters',
+ 'format', 'additional_parameters',
'filter_select_enabled', 'slices',
'fetch_values_predicate',
'description', 'owner',
'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
- edit_form_extra_fields = {
- 'format': ChoiceTypeSelectField(_('Format'), choices=FORMATS)
- }
+ edit_form_extra_fields = add_form_extra_fields
show_columns = edit_columns + ['perm']
related_views = [PandasColumnInlineView, PandasMetricInlineView]
base_order = ('changed_on', 'desc')
@@ -191,10 +245,6 @@ class PandasDatasourceModelView(DatasourceModelView, DeleteMixin): # noqa
"The URL used to access the raw data"),
'format': _(
"The format of the raw data, e.g. csv"),
- 'additional_parameters': _(
- "A JSON-formatted dictionary of additional parameters "
- "passed to the Pandas read_* function, "
- "see https://pandas.pydata.org/pandas-docs/stable/api.html#input-output"), # NOQA
'description': Markup(
"Supports "
"markdown"),
@@ -225,7 +275,6 @@ class PandasDatasourceModelView(DatasourceModelView, DeleteMixin): # noqa
'name': _("Name"),
'source_url': _("Source URL"),
'format': _("Format"),
- 'additional_parameters': _("Additional Read Parameters"),
'fetch_values_predicate': _('Fetch Values Predicate'),
'owner': _("Owner"),
'main_dttm_col': _("Main Datetime Column"),
@@ -282,8 +331,8 @@ def edit(self, pk):
appbuilder.add_view(
PandasDatasourceModelView,
- "Pandas Datasources",
- label=__("Pandas Datasources"),
+ "File Datasources",
+ label=__("File Datasources"),
category="Sources",
category_label=__("Sources"),
icon='fa-file',)
diff --git a/docs/files.rst b/docs/files.rst
new file mode 100644
index 0000000000000..7b0040e4cdc37
--- /dev/null
+++ b/docs/files.rst
@@ -0,0 +1,107 @@
+Files
+=====
+
+Superset can use Pandas to visualize data contained in flat files or returned
+by REST APIs. This page describes the features and limitations of File
+Datasources in Superset.
+
+File Datasources do not use files uploaded directly to the Superset server. The
+files are stored elsewhere and downloaded (and cached) by Superset when required.
+This approach offers two important advantages over uploaded files:
+
+1. If the remote file is updated, then Superset will automatically download
+and use the new data when the datasource's cache timeout expires. If the file
+had been uploaded manually, each change in the source data would require a
+new manual upload.
+2. Data can be provided by a REST API rather than an actual file. This makes
+it possible to use Superset to visualize current data held in other systems
+without having to extract it manually first.
+
+.. note::
+ A File Datasource downloads the full content of the source URL
+ and then performs all filtering, grouping and aggregation locally
+ on the Superset server. File Datasources that access large
+ volumes of data may impact server performance and affect other users.
+ Server administrators should ensure that adequate memory and CPU
+ resources are available before enabling the File Datasource.
+
+Installation
+''''''''''''
+
+File Datasources are not enabled by default. To enable them, the system
+administrator should update ``superset_config.py`` or another configuration
+file to include::
+
+ # Include additional data sources
+ ADDITIONAL_MODULE_DS_MAP = {
+ 'contrib.connectors.pandas.models': ['PandasDatasource'],
+ }
+
+Supported Formats
+'''''''''''''''''
+
+File Datasources use the `Pandas library <https://pandas.pydata.org/>`_
+directly. With a default installation of Superset, Pandas can read the
+following formats:
+
+* csv
+* html
+* json
+* Microsoft Excel
+* Stata
+
+If the appropriate dependencies have also been installed then the following
+additional formats are supported:
+
+* HDF5 (if PyTables is installed: `pip install tables`)
+* Feather (if Feather is installed: `pip install feather-format`)
+
+See the `Pandas Dependencies <https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies>`_
+documentation for more information.
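+
+As a rough sketch of how this works (the names below are illustrative and not
+part of Superset's API), the datasource's Format selects the matching
+``pandas.read_*`` function, and any additional read parameters are passed
+through to it as keyword arguments; the connector itself does this via its
+``pandas_read_method`` and ``pandas_read_parameters`` attributes::
+
+    import pandas as pd
+
+    # Illustrative mapping from a datasource Format to a Pandas reader
+    READERS = {
+        'csv': pd.read_csv,
+        'html': pd.read_html,
+        'json': pd.read_json,
+        'excel': pd.read_excel,
+        'stata': pd.read_stata,
+    }
+
+    def read_source(source, fmt, read_parameters=None):
+        # Note: pd.read_html returns a list of DataFrames rather than one
+        return READERS[fmt](source, **(read_parameters or {}))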
+
+Adding a File Datasource
+''''''''''''''''''''''''
+
+When you add a new File Datasource you need to provide the following information:
+
+* Source URL: the URL from which the file to be visualized can be downloaded.
+This can be a file hosted on another server or on a file sharing platform
+such as Dropbox, Google Drive or Amazon S3. It can also be the URL of a REST
+API endpoint.
+* Source Credentials: if the Source URL requires authentication then specify
+the credentials to be used (see the example below). Credentials entered as
+``["username", "password"]`` - i.e. as a valid username and password enclosed
+in quotation marks, separated by a comma and surrounded by square brackets -
+will be treated as a separate username and password, and the File Datasource
+will use HTTP Basic Auth to authenticate to the remote server. Text in any
+other format will be passed to the remote server as an ``Authorization``
+header. Typically this is used with API tokens issued by the remote server.
+Remote servers that require authentication should also use an HTTPS Source URL.
+* Source Parameters: a JSON-formatted dictionary of additional query parameters
+that are passed to the remote server. This field is not needed for simple file
+downloads, but is useful for specifying requests against REST APIs.
+* Format: the format of the data returned by the remote server.
+* Read Parameters: a JSON-formatted dictionary of additional parameters that
+are passed to Pandas when the file retrieved from the remote server is read
+into a DataFrame.
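+
+For example, a File Datasource that reads a JSON feed from a hypothetical
+REST API might be configured with values along these lines (the URL, token
+and parameters are purely illustrative)::
+
+    Source URL:         https://api.example.com/v1/sales
+    Source Credentials: "Token abc123"
+    Source Parameters:  {"start_date": "2017-01-01", "limit": 5000}
+    Format:             json
+    Read Parameters:    {"orient": "records"}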
+
+Aggregations
+''''''''''''
+
+Common aggregations can be defined and used in Superset.
+The first and simplest use case is to use the checkbox matrix exposed in your
+datasource's edit view (``Sources -> File Datasources ->
+[your datasource] -> Edit -> [tab] List Datasource Column``).
+Clicking the ``GroupBy`` and ``Filterable`` checkboxes will make the column
+appear in the related dropdowns while in explore view. Checking
+``Count Distinct``, ``Min``, ``Max``, ``Average`` or ``Sum`` will result in creating
+new metrics that will appear in the ``List Metrics`` tab.
+You can create your own aggregations manually from the ``List Metrics`` tab for
+more complex cases.
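+
+For example, checking ``Sum`` and ``Count Distinct`` for a column named
+``sales`` creates metrics equivalent to::
+
+    sum__sales             (expression: sum)
+    count_distinct__sales  (expression: nunique)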
+
+Post-Aggregations
+'''''''''''''''''
+
+File Datasources allow post-aggregation in Superset. Any Metric that has been
+defined for the Datasource can be used as part of a Result Filter to limit
+the returned data.
diff --git a/docs/index.rst b/docs/index.rst
index eba2e94516930..5a3b831630f28 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,10 +25,10 @@ intelligence web application
endorsed by the ASF.
Overview
-=======================================
+========
Features
----------
+--------
- A rich set of data visualizations
- An easy-to-use interface for exploring and visualizing data
@@ -61,7 +61,7 @@ Features
Contents
----------
+--------
.. toctree::
:maxdepth: 2
@@ -74,6 +74,7 @@ Contents
videos
gallery
druid
+ files
faq
diff --git a/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py b/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
index 5f45bbe6fc887..b8545d457bf9a 100644
--- a/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
+++ b/superset/migrations/versions/b2cd059e8803_add_pandasdatasource.py
@@ -37,6 +37,8 @@ def upgrade():
sa.Column('perm', sa.String(length=1000), nullable=True),
sa.Column('name', sa.String(length=100), nullable=False),
sa.Column('source_url', sa.String(length=1000), nullable=False),
+ sa.Column('source_auth', sqlalchemy_utils.types.json.JSONType(), nullable=True),
+ sa.Column('source_parameters', sqlalchemy_utils.types.json.JSONType(), nullable=True),
sa.Column('format', sqlalchemy_utils.types.choice.ChoiceType(FORMATS), nullable=False),
sa.Column('additional_parameters', sqlalchemy_utils.types.json.JSONType(), nullable=True),
sa.Column('user_id', sa.Integer(), nullable=True),
diff --git a/superset/migrations/versions/ff40cbbdde10_.py b/superset/migrations/versions/ff40cbbdde10_.py
index a026c9d52c56c..917d31317bc89 100644
--- a/superset/migrations/versions/ff40cbbdde10_.py
+++ b/superset/migrations/versions/ff40cbbdde10_.py
@@ -8,7 +8,7 @@
# revision identifiers, used by Alembic.
revision = 'ff40cbbdde10'
-down_revision = ('f959a6652acd', 'b2cd059e8803')
+down_revision = ('4736ec66ce19', 'b2cd059e8803')
from alembic import op
import sqlalchemy as sa
diff --git a/superset/views/core.py b/superset/views/core.py
index ef0cbf5844b42..862397a2e5583 100755
--- a/superset/views/core.py
+++ b/superset/views/core.py
@@ -1336,6 +1336,9 @@ def checkbox(self, model_view, id_, attr, value):
'TableColumnInlineView':
ConnectorRegistry.sources['table'].column_class,
}
+ if 'pandas' in ConnectorRegistry.sources:
+ model = ConnectorRegistry.sources['pandas'].column_class
+ modelview_to_model['PandasColumnInlineView'] = model
model = modelview_to_model[model_view]
obj = db.session.query(model).filter_by(id=id_).first()
if obj: