From a0bcdedf238e9cdbc81e33f2dbca93ff73b3e3e2 Mon Sep 17 00:00:00 2001
From: Naoki Kanazawa
Date: Sat, 16 Sep 2023 10:43:09 +0900
Subject: [PATCH] Optimize dataframe performance

- Avoid add_data calls in data processing. Creating a dataframe from bulk data
  is much faster than appending entries one by one.
- Group and sort by model_id instead of model_name. String search is slow in
  pandas.
- Replace the groupby in the format subroutine with the Python native groupby
  function, which is faster than the pandas groupby.
---
 .../composite_curve_analysis.py              |   6 +-
 .../curve_analysis/curve_analysis.py         | 165 ++++++++++++------
 qiskit_experiments/curve_analysis/utils.py   | 141 ++++++---------
 3 files changed, 171 insertions(+), 141 deletions(-)

diff --git a/qiskit_experiments/curve_analysis/composite_curve_analysis.py b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
index f3cb14ecc7..de3898e316 100644
--- a/qiskit_experiments/curve_analysis/composite_curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
@@ -231,7 +231,8 @@ def _create_figures(
         """
         for analysis in self.analyses():
             sub_data = curve_data[curve_data.model_name.str.endswith(f"_{analysis.name}")]
-            for model_name, data in list(sub_data.groupby("model_name")):
+            for model_id, data in list(sub_data.groupby("model_id")):
+                model_name = analysis._models[model_id]._name
                 # Plot raw data scatters
                 if analysis.options.plot_raw_data:
                     raw_data = data.filter(like="processed", axis="index")
@@ -370,7 +371,8 @@ def _run_analysis(
             fit_curves = []
             formatted = curve_data.filter(like="formatted", axis="index")
             columns = list(curve_data.columns)
-            for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+            for i, sub_data in list(formatted.groupby("model_id")):
+                name = analysis._models[i]._name
                 xval = sub_data.xval.to_numpy()
                 if len(xval) == 0:
                     # If data is empty, skip drawing this model.
diff --git a/qiskit_experiments/curve_analysis/curve_analysis.py b/qiskit_experiments/curve_analysis/curve_analysis.py
index a96bb287ab..4ddd06cdec 100644
--- a/qiskit_experiments/curve_analysis/curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/curve_analysis.py
@@ -17,10 +17,11 @@
 
 from typing import Dict, List, Tuple, Union, Optional
 from functools import partial
+from itertools import groupby
+from operator import itemgetter
 
 import lmfit
 import numpy as np
-import pandas as pd
 from uncertainties import unumpy as unp
 
 from qiskit_experiments.framework import ExperimentData, AnalysisResultData
@@ -161,30 +162,80 @@ def _run_data_processing(
         Raises:
             DataProcessorError: When key for x values is not found in the metadata.
             DataProcessorError: When data_subfit_map information for a fit model is missing.
+            ValueError: When input data has incomplete metadata to specify fit model.
         """
+        opt = self.options
+
         # Create table
-        x_key = self.options.x_key
-        filter_kwargs = self.options.filter_data
-        out = ScatterTable()
-        to_process = []
-        for idx, datum in enumerate(raw_data):
+        if opt.filter_data:
+            to_process = [d for d in raw_data if opt.filter_data.items() <= d["metadata"].items()]
+        else:
+            to_process = raw_data
+
+        # This must align with ScatterTable columns. Use struct array.
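+        # Preallocating one struct array sized to the filtered data and filling it
+        # in place lets the ScatterTable below be built from bulk data in a single
+        # call, instead of appending rows through add_entry for every circuit result.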
+        dtypes = np.dtype(
+            [
+                ("xval", float),
+                ("yval", float),
+                ("yerr", float),
+                ("model_name", "U30"),  # builtin str is U0 which is zero-length unicode in numpy
+                ("model_id", int),
+                ("shots", int),
+            ]
+        )
+        table_data = np.empty(len(to_process), dtype=dtypes)
+        for idx, datum in enumerate(to_process):
             metadata = datum["metadata"].copy()
-            if filter_kwargs and not filter_kwargs.items() <= datum["metadata"].items():
-                # This data is not a part of this analysis.
-                continue
+            # Get xval from metadata
             try:
-                xval = metadata.pop(x_key)
+                xval = metadata.pop(opt.x_key)
             except KeyError as ex:
                 raise DataProcessorError(
-                    f"X value key {x_key} is not defined in the circuit metadata."
+                    f"X value key {opt.x_key} is not defined in the circuit metadata."
                 ) from ex
-            out.add_entry(
-                index=f"processed-{idx:04d}",
-                xval=xval,
-                shots=datum.get("shots", pd.NA),
-                **metadata,
-            )
-            to_process.append(datum)
+            # Classify fit model
+            if len(self._models) == 1:
+                m_id = 0
+                m_name = self._models[0]._name
+            else:
+                for i, model in enumerate(self._models):
+                    try:
+                        model_spec = self.options.data_subfit_map[model._name]
+                    except KeyError as ex:
+                        raise DataProcessorError(
+                            f"Mapping to data for the fit model {model._name} is not provided."
+                        ) from ex
+                    if model_spec.items() <= metadata.items():
+                        m_id = i
+                        m_name = model._name
+                        break
+                else:
+                    raise ValueError(f"Experiment data {datum} doesn't belong to any fit model.")
+            table_data[idx]["xval"] = xval
+            table_data[idx]["shots"] = datum.get("shots", -1)
+            table_data[idx]["model_id"] = m_id
+            table_data[idx]["model_name"] = m_name
+
+            # Add extra metadata
+            add_key = metadata.keys() - table_data.dtype.fields
+            if add_key:
+                # Add missing keys to struct array
+                # This code is lengthy but faster than merge_arrays function
+                add_dtypes = []
+                for k in add_key:
+                    if isinstance(metadata[k], str):
+                        new_dtype = "U30"
+                    else:
+                        new_dtype = type(metadata[k])
+                    add_dtypes.append((k, new_dtype))
+                new_table_data = np.empty(
+                    len(to_process), dtype=sum((table_data.dtype.descr, add_dtypes), [])
+                )
+                for k in table_data.dtype.fields:
+                    new_table_data[k] = table_data[k]
+                table_data = new_table_data
+            for k, v in metadata.items():
+                table_data[idx][k] = v
 
         # Compute y value
         if not self.options.data_processor:
@@ -194,29 +245,13 @@
                 "data_processor analysis options."
             )
         processed_values = self.options.data_processor(to_process)
-        out.yval = unp.nominal_values(processed_values)
-        out.yerr = unp.std_devs(processed_values)
-
-        # Set model names and ids
-        if len(self._models) == 1:
-            out.model_name = self._models[0]._name
-            out.model_id = 0
-        else:
-            for i, model in enumerate(self._models):
-                try:
-                    model_spec = self.options.data_subfit_map[model._name]
-                except KeyError as ex:
-                    raise DataProcessorError(
-                        f"Mapping to data for the fit model {model._name} is not provided."
-                    ) from ex
-                conds = [out[k] == v for k, v in model_spec.items()]
-                if len(conds) >= 2:
-                    rows = np.logical_and(*conds)
-                else:
-                    rows = conds[0]
-                out.loc[rows, "model_name"] = model._name
-                out.loc[rows, "model_id"] = i
+        table_data["yval"] = unp.nominal_values(processed_values).flatten()
+        table_data["yerr"] = unp.std_devs(processed_values).flatten()
+
+        out = ScatterTable(
+            data=table_data,
+            index=[f"processed-{i:04d}" for i in range(len(to_process))],
+        )
         return out
 
     def _format_data(
@@ -237,19 +272,39 @@
             "sample": sample_average,
         }
 
-        # Data is automatically sorted by dataframe groupby operation
-        grouped_by_model = curve_data.groupby(["model_name", "xval"], as_index=False)
-
-        # Conventional approach to average over the grouped dataframe is
-        # to use .apply method with a callback, but this operation just internally
-        # run python loop for each grouped dataframe, and thus not performant.
-        # This directly manipulates the dataframe values and roughly 4x faster.
-        averaged = map(
-            averaging_methods[self.options.average_method],
-            list(grouped_by_model),
+        columns = list(curve_data.columns)
+        sort_by = itemgetter(
+            columns.index("model_id"),
+            columns.index("xval"),
         )
+        # Use python native groupby method on ndarray. This is more performant than pandas one.
+        average = averaging_methods[self.options.average_method]
+        formatted = []
+        for (mid, xv), g in groupby(sorted(curve_data.values, key=sort_by), key=sort_by):
+            g_values = np.array(list(g))
+            g_dict = dict(zip(columns, g_values.T))
+            avg_yval, avg_yerr, shots = average(g_dict["yval"], g_dict["yerr"], g_dict["shots"])
+            averaged = dict.fromkeys(columns)
+            averaged["xval"] = xv
+            averaged["yval"] = avg_yval
+            averaged["yerr"] = avg_yerr
+            averaged["model_id"] = mid
+            averaged["shots"] = shots
+            for k, v in g_dict.items():
+                if averaged[k] is not None:
+                    continue
+                if len(g_values) == 1:
+                    averaged[k] = v[0]
+                else:
+                    unique = set(v)
+                    if len(unique) == 1:
+                        averaged[k] = next(iter(unique))
+                    else:
+                        averaged[k] = list(unique)
+            formatted.append(list(averaged.values()))
+
         return curve_data.append_list_values(
-            other=list(averaged),
+            other=formatted,
             prefix="formatted",
         )
@@ -398,7 +453,8 @@ def _create_figures(
         Returns:
             A list of figures.
         """
-        for model_name, data in list(curve_data.groupby("model_name")):
+        for model_id, data in list(curve_data.groupby("model_id")):
+            model_name = self._models[model_id]._name
             # Plot raw data scatters
             if self.options.plot_raw_data:
                 raw_data = data.filter(like="processed", axis="index")
@@ -467,14 +523,15 @@ def _run_analysis(
             fit_curves = []
             formatted = curve_data.filter(like="formatted", axis="index")
             columns = list(curve_data.columns)
-            for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+            for i, sub_data in list(formatted.groupby("model_id")):
+                name = self._models[i]._name
                 xval = sub_data.xval.to_numpy()
                 if len(xval) == 0:
                     # If data is empty, skip drawing this model.
                     # This is the case when fit model exist but no data to fit is provided.
                     continue
                 # Compute X, Y values with fit parameters.
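+                # The fitted curve is evaluated on a dense, evenly spaced grid spanning
+                # the measured x range, rather than only at the measured points.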
-                xval_fit = np.linspace(np.min(xval), np.max(xval), num=100)
+                xval_fit = np.linspace(np.min(xval), np.max(xval), num=100, dtype=float)
                 yval_fit = eval_with_uncertainties(
                     x=xval_fit,
                     model=self._models[i],
diff --git a/qiskit_experiments/curve_analysis/utils.py b/qiskit_experiments/curve_analysis/utils.py
index 21af92d8c2..66a8855254 100644
--- a/qiskit_experiments/curve_analysis/utils.py
+++ b/qiskit_experiments/curve_analysis/utils.py
@@ -18,7 +18,6 @@
 import asteval
 import lmfit
 import numpy as np
-import pandas as pd
 from qiskit.utils.deprecation import deprecate_func
 from qiskit.utils import detach_prefix
 from uncertainties import UFloat, wrap as wrap_function
@@ -225,123 +224,95 @@ def eval_with_uncertainties(
 
 
 def shot_weighted_average(
-    group: Tuple[Tuple[str, float], pd.DataFrame],
-) -> List:
+    yvals: np.ndarray,
+    yerrs: np.ndarray,
+    shots: np.ndarray,
+) -> Tuple[float, float, float]:
     """Compute shot based variance and weighted average of the categorized data frame.
 
     Sample is weighted by the shot number.
 
     Args:
-        group: Data frame grouped by the model name and x value.
+        yvals: Y values to average.
+        yerrs: Y errors to average.
+        shots: Number of shots used to obtain Y value and error.
 
     Returns:
-        A single row of the average.
+        Averaged Y value, Y error, and total shots.
     """
-    (model_name, xval), grouped_df = group
-
-    if len(grouped_df) == 1:
-        return grouped_df.iloc[0].to_list()
-
-    values_dict = dict(zip(grouped_df.columns, grouped_df.values.T))
-    weights = np.array(values_dict["shots"]) / np.sum(np.array(values_dict["shots"]))
-
-    out = dict.fromkeys(values_dict.keys())
-    out["xval"] = xval
-    out["yval"] = np.sum(weights * values_dict["yval"])
-    out["yerr"] = np.sqrt(np.sum(weights**2 * values_dict["yerr"] ** 2))
-    out["model_name"] = model_name
-    out["model_id"] = values_dict["model_id"][0]
-    out["shots"] = np.sum(values_dict["shots"])
-
-    # Process extra columns. Use set operation to aggregate metadata.
-    for extra in grouped_df.columns[6:]:
-        unique_values = set(values_dict[extra])
-        if len(unique_values) == 1:
-            out[extra] = next(iter(unique_values))
-        else:
-            out[extra] = list(unique_values)
-    return list(out.values())
+    if len(yvals) == 1:
+        return yvals[0], yerrs[0], shots[0]
+
+    if np.any(shots < -1):
+        # Shot number is unknown
+        return np.mean(yvals), np.nan, -1
+
+    total_shots = np.sum(shots)
+    weights = shots / total_shots
+
+    avg_yval = np.sum(weights * yvals)
+    avg_yerr = np.sqrt(np.sum(weights**2 * yerrs**2))
+
+    return avg_yval, avg_yerr, total_shots
 
 
 def inverse_weighted_variance(
-    group: Tuple[Tuple[str, float], pd.DataFrame],
-) -> List:
+    yvals: np.ndarray,
+    yerrs: np.ndarray,
+    shots: np.ndarray,
+) -> Tuple[float, float, int]:
     """Compute inverse weighted variance and weighted average of the categorized data frame.
 
     Sample is weighted by the inverse of the data variance.
 
     Args:
-        group: Data frame grouped by the model name and x value.
+        yvals: Y values to average.
+        yerrs: Y errors to average.
+        shots: Number of shots used to obtain Y value and error.
 
     Returns:
-        A single row of the average.
+        Averaged Y value, Y error, and total shots.
""" - (model_name, xval), grouped_df = group - - if len(grouped_df) == 1: - return grouped_df.iloc[0].to_list() + if len(yvals) == 1: + return yvals[0], yerrs[0], shots[0] - values_dict = dict(zip(grouped_df.columns, grouped_df.values.T)) - weights = 1 / values_dict["yerr"] ** 2 + total_shots = np.sum(shots) if all(shots > 0) else -1 + weights = 1 / yerrs**2 yvar = 1 / np.sum(weights) - out = dict.fromkeys(values_dict.keys()) - out["xval"] = xval - out["yval"] = yvar * np.sum(weights * values_dict["yval"]) - out["yerr"] = np.sqrt(yvar) - out["model_name"] = model_name - out["model_id"] = values_dict["model_id"][0] - out["shots"] = np.sum(values_dict["shots"]) - - # Process extra columns. Use set operation to aggregate metadata. - for extra in grouped_df.columns[6:]: - unique_values = set(values_dict[extra]) - if len(unique_values) == 1: - out[extra] = next(iter(unique_values)) - else: - out[extra] = list(unique_values) - return list(out.values()) + avg_yval = yvar * np.sum(weights * yvals) + avg_yerr = np.sqrt(yvar) + + return avg_yval, avg_yerr, total_shots +# pylint: disable=unused-argument def sample_average( - group: Tuple[Tuple[str, float], pd.DataFrame], -) -> List: + yvals: np.ndarray, + yerrs: np.ndarray, + shots: np.ndarray, +) -> Tuple[float, float, int]: """Compute sample based variance and average of the categorized data frame. Original variance of the data is ignored and variance is computed with the y values. Args: - group: Data frame grouped by the model name and x value. + yvals: Y values to average. + yerrs: Y errors to average (ignored). + shots: Number of shots used to obtain Y value and error. Returns: - A single row of the average. + Averaged Y value, Y error, and total shots. """ - (model_name, xval), grouped_df = group - - if len(grouped_df) == 1: - out = grouped_df.iloc[0].copy() - out.yerr = 0.0 # Because there is only 1 sample - return out.to_list() - - values_dict = dict(zip(grouped_df.columns, grouped_df.values.T)) - ymean = np.mean(values_dict["yval"]) - - out = dict.fromkeys(values_dict.keys()) - out["xval"] = xval - out["yval"] = ymean - out["yerr"] = np.sqrt(np.mean((ymean - values_dict["yval"]) ** 2) / len(grouped_df)) - out["model_name"] = model_name - out["model_id"] = values_dict["model_id"][0] - out["shots"] = np.sum(values_dict["shots"]) - - # Process extra columns. Use set operation to aggregate metadata. - for extra in grouped_df.columns[6:]: - unique_values = set(values_dict[extra]) - if len(unique_values) == 1: - out[extra] = next(iter(unique_values)) - else: - out[extra] = list(unique_values) - return list(out.values()) + if len(yvals) == 1: + return yvals[0], 0.0, shots[0] + + total_shots = np.sum(shots) if all(shots > 0) else -1 + + avg_yval = np.mean(yvals) + avg_yerr = np.sqrt(np.mean((avg_yval - yvals) ** 2) / len(yvals)) + + return avg_yval, avg_yerr, total_shots @deprecate_func(