Infer types. Smart defaults for the visualize window. Basic implement…

…ation. (#1134) * Implement smart suggestions for the visualize flow. * Address JS comments. * Implement caravel dataframe wrapper.
apache · Sep 23, 2016 · df89bec · df89bec
1 parent fc921d6
commit df89bec
Show file tree

Hide file tree

Showing 5 changed files with 218 additions and 48 deletions.
diff --git a/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx b/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx
@@ -72,7 +72,7 @@ class ResultSet extends React.Component {
           <div className="ResultSet">
             <Table
               data={results.data}
-              columns={results.columns}
+              columns={results.columns.map((col) => col.name)}
               sortable
               className="table table-condensed table-bordered"
               filterBy={this.state.searchText}

diff --git a/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx b/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx
@@ -26,10 +26,25 @@ class VisualizeModal extends React.Component {
       columns: {},
       hints: [],
     };
+    // update columns if possible
+    this.setStateFromProps();
+  }
+  componentWillMount() {
+    this.setStateFromProps();
   }
   componentDidMount() {
     this.validate();
   }
+  setStateFromProps() {
+    if (!this.props.query || !this.props.query.results.columns) {
+      return;
+    }
+    const columns = {};
+    this.props.query.results.columns.forEach((col) => {
+      columns[col.name] = col;
+    });
+    this.setState({ columns });
+  }
   validate() {
     const hints = [];
     const cols = this.mergedColumns();
@@ -67,8 +82,8 @@ class VisualizeModal extends React.Component {
     const columns = Object.assign({}, this.state.columns);
     if (this.props.query && this.props.query.results.columns) {
       this.props.query.results.columns.forEach((col) => {
-        if (columns[col] === undefined) {
-          columns[col] = {};
+        if (columns[col.name] === undefined) {
+          columns[col.name] = col;
         }
       });
     }
@@ -88,39 +103,39 @@ class VisualizeModal extends React.Component {
     this.setState({ datasourceName: event.target.value });
     this.validate();
   }
-  changeCheckbox(attr, col, event) {
+  changeCheckbox(attr, columnName, event) {
     let columns = this.mergedColumns();
-    const column = Object.assign({}, columns[col], { [attr]: event.target.checked });
-    columns = Object.assign({}, columns, { [col]: column });
+    const column = Object.assign({}, columns[columnName], { [attr]: event.target.checked });
+    columns = Object.assign({}, columns, { [columnName]: column });
     this.setState({ columns }, this.validate);
   }
-  changeAggFunction(col, option) {
+  changeAggFunction(columnName, option) {
     let columns = this.mergedColumns();
     const val = (option) ? option.value : null;
-    const column = Object.assign({}, columns[col], { agg: val });
-    columns = Object.assign({}, columns, { [col]: column });
+    const column = Object.assign({}, columns[columnName], { agg: val });
+    columns = Object.assign({}, columns, { [columnName]: column });
     this.setState({ columns }, this.validate);
   }
   render() {
     if (!(this.props.query)) {
       return <div />;
     }
     const tableData = this.props.query.results.columns.map((col) => ({
-      column: col,
+      column: col.name,
       is_dimension: (
         <input
           type="checkbox"
-          onChange={this.changeCheckbox.bind(this, 'is_dim', col)}
-          checked={(this.state.columns[col]) ? this.state.columns[col].is_dim : false}
+          onChange={this.changeCheckbox.bind(this, 'is_dim', col.name)}
+          checked={(this.state.columns[col.name]) ? this.state.columns[col.name].is_dim : false}
           className="form-control"
         />
       ),
       is_date: (
         <input
           type="checkbox"
           className="form-control"
-          onChange={this.changeCheckbox.bind(this, 'is_date', col)}
-          checked={(this.state.columns[col]) ? this.state.columns[col].is_date : false}
+          onChange={this.changeCheckbox.bind(this, 'is_date', col.name)}
+          checked={(this.state.columns[col.name]) ? this.state.columns[col.name].is_date : false}
         />
       ),
       agg_func: (
@@ -132,8 +147,8 @@ class VisualizeModal extends React.Component {
             { value: 'avg', label: 'AVG(x)' },
             { value: 'count_distinct', label: 'COUNT(DISTINCT x)' },
           ]}
-          onChange={this.changeAggFunction.bind(this, col)}
-          value={(this.state.columns[col]) ? this.state.columns[col].agg : null}
+          onChange={this.changeAggFunction.bind(this, col.name)}
+          value={(this.state.columns[col.name]) ? this.state.columns[col.name].agg : null}
         />
       ),
     }));

diff --git a/caravel/dataframe.py b/caravel/dataframe.py
@@ -0,0 +1,113 @@
+""" Caravel wrapper around pandas.DataFrame.
+
+TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
+                 dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
+TODO(bkyryliuk): recognize integer encoded enums.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import pandas as pd
+import numpy as np
+
+
+INFER_COL_TYPES_THRESHOLD = 95
+INFER_COL_TYPES_SAMPLE_SIZE = 100
+
+
+# http://pandas.pydata.org/pandas-docs/stable/internals.html#
+# subclassing-pandas-data-structures
+class CaravelDataFrame(object):
+    def __init__(self, df):
+        self.__df = df.where((pd.notnull(df)), None)
+
+    @property
+    def size(self):
+        return len(self.__df.index)
+
+    @property
+    def data(self):
+        return self.__df.to_dict(orient='records')
+
+    @property
+    def columns_dict(self):
+        """Provides metadata about columns for data visualization.
+
+        :return: dict, with the fields name, type, is_date, is_dim and agg.
+        """
+        if self.__df.empty:
+            return None
+
+        columns = []
+        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
+        sample = self.__df
+        if sample_size:
+            sample = self.__df.sample(sample_size)
+        for col in self.__df.dtypes.keys():
+            column = {
+                'name': col,
+                'type': self.__df.dtypes[col].name,
+                'is_date': is_date(self.__df.dtypes[col]),
+                'is_dim': is_dimension(self.__df.dtypes[col], col),
+            }
+            agg = agg_func(self.__df.dtypes[col], col)
+            if agg_func:
+                column['agg'] = agg
+
+            if column['type'] == 'object':
+                # check if encoded datetime
+                if (datetime_conversion_rate(sample[col]) >
+                        INFER_COL_TYPES_THRESHOLD):
+                    column.update({
+                        'type': 'datetime_string',
+                        'is_date': True,
+                        'is_dim': False,
+                        'agg': None
+                    })
+            # 'agg' is optional attribute
+            if not column['agg']:
+                column.pop('agg', None)
+            columns.append(column)
+
+        return columns
+
+
+# It will give false positives on the numbers that are stored as strings.
+# It is hard to distinguish integer numbers and timestamps
+def datetime_conversion_rate(data_series):
+    success = 0
+    total = 0
+    for value in data_series:
+        total = total + 1
+        try:
+            pd.to_datetime(value)
+            success = success + 1
+        except Exception:
+            continue
+    return 100 * success / total
+
+
+def is_date(dtype):
+    return dtype.name.startswith('datetime')
+
+
+def is_dimension(dtype, column_name):
+    if is_id(column_name):
+        return False
+    return dtype == np.object or dtype == np.bool
+
+
+def is_id(column_name):
+    return column_name.startswith('id') or column_name.endswith('id')
+
+
+def agg_func(dtype, column_name):
+    # consider checking for key substring too.
+    if is_id(column_name):
+        return 'count_distinct'
+    if np.issubdtype(dtype, np.number):
+        return 'sum'
+    return None
diff --git a/caravel/sql_lab.py b/caravel/sql_lab.py
@@ -2,10 +2,10 @@
 from datetime import datetime
 import pandas as pd
 import logging
+import numpy
 import time
-import json
 
-from caravel import app, cache, db, models, utils
+from caravel import app, db, models, utils, dataframe
 
 QueryStatus = models.QueryStatus
 
@@ -114,45 +114,40 @@ def handle_error(msg):
             time.sleep(1)
             polled = cursor.poll()
 
-    columns = None
-    data = None
+    cdf = None
     if result_proxy.cursor:
-        columns = [col[0] for col in result_proxy.cursor.description]
+        column_names = [col[0] for col in result_proxy.cursor.description]
         data = result_proxy.fetchall()
-        df = pd.DataFrame(data, columns=columns)
-        df = df.where((pd.notnull(df)), None)
-        # TODO consider generating tuples instead of dicts to send
-        # less data through the wire. The command bellow does that,
-        # but we'd need to align on the client side.
-        # data = df.values.tolist()
-        data = df.to_dict(orient='records')
+        cdf = dataframe.CaravelDataFrame(
+            pd.DataFrame(data, columns=column_names))
+    # TODO consider generating tuples instead of dicts to send
+    # less data through the wire. The command bellow does that,
+    # but we'd need to align on the client side.
+    # data = df.values.tolist()
 
     query.rows = result_proxy.rowcount
     query.progress = 100
     query.status = QueryStatus.SUCCESS
-    if query.rows == -1 and data:
+    if query.rows == -1 and cdf:
         # Presto doesn't provide result_proxy.row_count
-        query.rows = len(data)
-
-    # CTAs queries result in 1 cell having the # of the added rows.
+        query.rows = cdf.size
     if query.select_as_cta:
         query.select_sql = '{}'.format(database.select_star(
             query.tmp_table_name, limit=query.limit))
-
     query.end_time = utils.now_as_float()
     session.commit()
 
-    payload = {
-        'query_id': query.id,
-        'status': query.status,
-        'data': [],
-    }
-    if query.status == models.QueryStatus.SUCCESS:
-        payload['data'] = data
-        payload['columns'] = columns
-    else:
-        payload['error'] = query.error_message
     if return_results:
+        payload = {
+            'query_id': query.id,
+            'status': query.status,
+            'data': [],
+        }
+        if query.status == models.QueryStatus.SUCCESS:
+            payload['data'] = cdf.data if cdf else []
+            payload['columns'] = cdf.columns_dict if cdf else []
+        else:
+            payload['error'] = query.error_message
         return payload
     '''
     # Hack testing using a kv store for results

diff --git a/tests/celery_tests.py b/tests/celery_tests.py
@@ -14,7 +14,7 @@
 import pandas as pd
 
 import caravel
-from caravel import app, appbuilder, db, models, sql_lab, utils
+from caravel import app, appbuilder, db, models, sql_lab, utils, dataframe
 
 from .base_tests import CaravelTestCase
 
@@ -34,6 +34,7 @@ class CeleryConfig(object):
 
 
 class UtilityFunctionTests(CaravelTestCase):
+
     # TODO(bkyryliuk): support more cases in CTA function.
     def test_create_table_as(self):
         select_query = "SELECT * FROM outer_space;"
@@ -193,8 +194,8 @@ def test_run_sync_query(self):
         result2 = self.run_sql(
             1, sql_where, tmp_table='tmp_table_2', cta='true')
         self.assertEqual(QueryStatus.SUCCESS, result2['query']['state'])
-        self.assertIsNone(result2['data'])
-        self.assertIsNone(result2['columns'])
+        self.assertEqual([], result2['data'])
+        self.assertEqual([], result2['columns'])
         query2 = self.get_query_by_id(result2['query']['serverId'])
 
         # Check the data in the tmp table.
@@ -208,8 +209,8 @@ def test_run_sync_query(self):
         result3 = self.run_sql(
             1, sql_empty_result, tmp_table='tmp_table_3', cta='true',)
         self.assertEqual(QueryStatus.SUCCESS, result3['query']['state'])
-        self.assertIsNone(result3['data'])
-        self.assertIsNone(result3['columns'])
+        self.assertEqual([], result3['data'])
+        self.assertEqual([], result3['columns'])
 
         query3 = self.get_query_by_id(result3['query']['serverId'])
         self.assertEqual(QueryStatus.SUCCESS, query3.status)
@@ -250,6 +251,52 @@ def test_run_async_query(self):
         self.assertEqual(True, query1.select_as_cta)
         self.assertEqual(True, query1.select_as_cta_used)
 
+    def test_get_columns_dict(self):
+        main_db = db.session.query(models.Database).filter_by(
+            database_name='main').first()
+        df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
+        cdf = dataframe.CaravelDataFrame(df)
+        if main_db.sqlalchemy_uri.startswith('sqlite'):
+            self.assertEqual(
+                [{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
+                  'is_dim': False},
+                 {'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
+                  'is_dim': False},
+                 {'agg': 'sum', 'is_date': False, 'type': 'int64',
+                  'name': 'epoch_ms', 'is_dim': False},
+                 {'agg': 'sum', 'is_date': False, 'type': 'int64',
+                  'name': 'epoch_s', 'is_dim': False},
+                 {'is_date': True, 'type': 'datetime_string', 'name': 'string0',
+                  'is_dim': False},
+                 {'is_date': False, 'type': 'object',
+                  'name': 'string1', 'is_dim': True},
+                 {'is_date': True, 'type': 'datetime_string', 'name': 'string2',
+                  'is_dim': False},
+                 {'is_date': False, 'type': 'object',
+                  'name': 'string3', 'is_dim': True}]
+                , cdf.columns_dict
+            )
+        else:
+            self.assertEqual(
+                [{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
+                  'is_dim': False},
+                 {'is_date': True, 'type': 'datetime64[ns]',
+                  'name': 'ds2', 'is_dim': False},
+                 {'agg': 'sum', 'is_date': False, 'type': 'int64',
+                  'name': 'epoch_ms', 'is_dim': False},
+                 {'agg': 'sum', 'is_date': False, 'type': 'int64',
+                  'name': 'epoch_s', 'is_dim': False},
+                 {'is_date': True, 'type': 'datetime_string', 'name': 'string0',
+                  'is_dim': False},
+                 {'is_date': False, 'type': 'object',
+                  'name': 'string1', 'is_dim': True},
+                 {'is_date': True, 'type': 'datetime_string', 'name': 'string2',
+                  'is_dim': False},
+                 {'is_date': False, 'type': 'object',
+                  'name': 'string3', 'is_dim': True}]
+                , cdf.columns_dict
+            )
+
 
 if __name__ == '__main__':
     unittest.main()