From a0bcdedf238e9cdbc81e33f2dbca93ff73b3e3e2 Mon Sep 17 00:00:00 2001
From: Naoki Kanazawa
Date: Sat, 16 Sep 2023 10:43:09 +0900
Subject: [PATCH] Optimize dataframe performance

- Avoid add_data calls in data processing. Creating a dataframe from bulk data
  is much faster than appending entries one by one.
- Group and sort by model_id instead of model_name. String search is slow in
  pandas.
- Replace the groupby in the format subroutine with the Python native groupby
  function, which is faster than the pandas groupby.
---
 .../composite_curve_analysis.py              |   6 +-
 .../curve_analysis/curve_analysis.py         | 165 ++++++++++++------
 qiskit_experiments/curve_analysis/utils.py   | 141 ++++++---------
 3 files changed, 171 insertions(+), 141 deletions(-)

diff --git a/qiskit_experiments/curve_analysis/composite_curve_analysis.py b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
index f3cb14ecc7..de3898e316 100644
--- a/qiskit_experiments/curve_analysis/composite_curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/composite_curve_analysis.py
@@ -231,7 +231,8 @@ def _create_figures(
         """
         for analysis in self.analyses():
             sub_data = curve_data[curve_data.model_name.str.endswith(f"_{analysis.name}")]
-            for model_name, data in list(sub_data.groupby("model_name")):
+            for model_id, data in list(sub_data.groupby("model_id")):
+                model_name = analysis._models[model_id]._name
                 # Plot raw data scatters
                 if analysis.options.plot_raw_data:
                     raw_data = data.filter(like="processed", axis="index")
@@ -370,7 +371,8 @@ def _run_analysis(
             fit_curves = []
             formatted = curve_data.filter(like="formatted", axis="index")
             columns = list(curve_data.columns)
-            for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+            for i, sub_data in list(formatted.groupby("model_id")):
+                name = analysis._models[i]._name
                 xval = sub_data.xval.to_numpy()
                 if len(xval) == 0:
                     # If data is empty, skip drawing this model.
diff --git a/qiskit_experiments/curve_analysis/curve_analysis.py b/qiskit_experiments/curve_analysis/curve_analysis.py
index a96bb287ab..4ddd06cdec 100644
--- a/qiskit_experiments/curve_analysis/curve_analysis.py
+++ b/qiskit_experiments/curve_analysis/curve_analysis.py
@@ -17,10 +17,11 @@
 
 from typing import Dict, List, Tuple, Union, Optional
 from functools import partial
+from itertools import groupby
+from operator import itemgetter
 
 import lmfit
 import numpy as np
-import pandas as pd
 from uncertainties import unumpy as unp
 
 from qiskit_experiments.framework import ExperimentData, AnalysisResultData
@@ -161,30 +162,80 @@ def _run_data_processing(
         Raises:
             DataProcessorError: When key for x values is not found in the metadata.
             DataProcessorError: When data_subfit_map information for a fit model is missing.
+            ValueError: When input data has incomplete metadata to specify fit model.
         """
+        opt = self.options
+
         # Create table
-        x_key = self.options.x_key
-        filter_kwargs = self.options.filter_data
-        out = ScatterTable()
-        to_process = []
-        for idx, datum in enumerate(raw_data):
+        if opt.filter_data:
+            to_process = [d for d in raw_data if opt.filter_data.items() <= d["metadata"].items()]
+        else:
+            to_process = raw_data
+
+        # This must align with ScatterTable columns. Use struct array.
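+        # Preallocating one struct array sized to the filtered data and filling it
+        # in place lets the ScatterTable below be built from bulk data in a single
+        # call, instead of appending rows through add_entry for every circuit result.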
+        dtypes = np.dtype(
+            [
+                ("xval", float),
+                ("yval", float),
+                ("yerr", float),
+                ("model_name", "U30"),  # builtin str is U0 which is zero-length unicode in numpy
+                ("model_id", int),
+                ("shots", int),
+            ]
+        )
+        table_data = np.empty(len(to_process), dtype=dtypes)
+        for idx, datum in enumerate(to_process):
             metadata = datum["metadata"].copy()
-            if filter_kwargs and not filter_kwargs.items() <= datum["metadata"].items():
-                # This data is not a part of this analysis.
-                continue
+            # Get xval from metadata
             try:
-                xval = metadata.pop(x_key)
+                xval = metadata.pop(opt.x_key)
             except KeyError as ex:
                 raise DataProcessorError(
-                    f"X value key {x_key} is not defined in the circuit metadata."
+                    f"X value key {opt.x_key} is not defined in the circuit metadata."
                 ) from ex
-            out.add_entry(
-                index=f"processed-{idx:04d}",
-                xval=xval,
-                shots=datum.get("shots", pd.NA),
-                **metadata,
-            )
-            to_process.append(datum)
+            # Classify fit model
+            if len(self._models) == 1:
+                m_id = 0
+                m_name = self._models[0]._name
+            else:
+                for i, model in enumerate(self._models):
+                    try:
+                        model_spec = self.options.data_subfit_map[model._name]
+                    except KeyError as ex:
+                        raise DataProcessorError(
+                            f"Mapping to data for the fit model {model._name} is not provided."
+                        ) from ex
+                    if model_spec.items() <= metadata.items():
+                        m_id = i
+                        m_name = model._name
+                        break
+                else:
+                    raise ValueError(f"Experiment data {datum} doesn't belong to any fit model.")
+            table_data[idx]["xval"] = xval
+            table_data[idx]["shots"] = datum.get("shots", -1)
+            table_data[idx]["model_id"] = m_id
+            table_data[idx]["model_name"] = m_name
+
+            # Add extra metadata
+            add_key = metadata.keys() - table_data.dtype.fields
+            if add_key:
+                # Add missing keys to struct array
+                # This code is lengthy but faster than merge_arrays function
+                add_dtypes = []
+                for k in add_key:
+                    if isinstance(metadata[k], str):
+                        new_dtype = "U30"
+                    else:
+                        new_dtype = type(metadata[k])
+                    add_dtypes.append((k, new_dtype))
+                new_table_data = np.empty(
+                    len(to_process), dtype=sum((table_data.dtype.descr, add_dtypes), [])
+                )
+                for k in table_data.dtype.fields:
+                    new_table_data[k] = table_data[k]
+                table_data = new_table_data
+            for k, v in metadata.items():
+                table_data[idx][k] = v
 
         # Compute y value
         if not self.options.data_processor:
@@ -194,29 +245,13 @@
                 "data_processor analysis options."
             )
         processed_values = self.options.data_processor(to_process)
-        out.yval = unp.nominal_values(processed_values)
-        out.yerr = unp.std_devs(processed_values)
-
-        # Set model names and ids
-        if len(self._models) == 1:
-            out.model_name = self._models[0]._name
-            out.model_id = 0
-        else:
-            for i, model in enumerate(self._models):
-                try:
-                    model_spec = self.options.data_subfit_map[model._name]
-                except KeyError as ex:
-                    raise DataProcessorError(
-                        f"Mapping to data for the fit model {model._name} is not provided."
-                    ) from ex
-                conds = [out[k] == v for k, v in model_spec.items()]
-                if len(conds) >= 2:
-                    rows = np.logical_and(*conds)
-                else:
-                    rows = conds[0]
-                out.loc[rows, "model_name"] = model._name
-                out.loc[rows, "model_id"] = i
+        table_data["yval"] = unp.nominal_values(processed_values).flatten()
+        table_data["yerr"] = unp.std_devs(processed_values).flatten()
+
+        out = ScatterTable(
+            data=table_data,
+            index=[f"processed-{i:04d}" for i in range(len(to_process))],
+        )
         return out
 
     def _format_data(
@@ -237,19 +272,39 @@
             "sample": sample_average,
         }
 
-        # Data is automatically sorted by dataframe groupby operation
-        grouped_by_model = curve_data.groupby(["model_name", "xval"], as_index=False)
-
-        # Conventional approach to average over the grouped dataframe is
-        # to use .apply method with a callback, but this operation just internally
-        # run python loop for each grouped dataframe, and thus not performant.
-        # This directly manipulates the dataframe values and roughly 4x faster.
-        averaged = map(
-            averaging_methods[self.options.average_method],
-            list(grouped_by_model),
+        columns = list(curve_data.columns)
+        sort_by = itemgetter(
+            columns.index("model_id"),
+            columns.index("xval"),
         )
+        # Use python native groupby method on ndarray. This is more performant than pandas one.
+        average = averaging_methods[self.options.average_method]
+        formatted = []
+        for (mid, xv), g in groupby(sorted(curve_data.values, key=sort_by), key=sort_by):
+            g_values = np.array(list(g))
+            g_dict = dict(zip(columns, g_values.T))
+            avg_yval, avg_yerr, shots = average(g_dict["yval"], g_dict["yerr"], g_dict["shots"])
+            averaged = dict.fromkeys(columns)
+            averaged["xval"] = xv
+            averaged["yval"] = avg_yval
+            averaged["yerr"] = avg_yerr
+            averaged["model_id"] = mid
+            averaged["shots"] = shots
+            for k, v in g_dict.items():
+                if averaged[k] is not None:
+                    continue
+                if len(g_values) == 1:
+                    averaged[k] = v[0]
+                else:
+                    unique = set(v)
+                    if len(unique) == 1:
+                        averaged[k] = next(iter(unique))
+                    else:
+                        averaged[k] = list(unique)
+            formatted.append(list(averaged.values()))
+
         return curve_data.append_list_values(
-            other=list(averaged),
+            other=formatted,
             prefix="formatted",
         )
@@ -398,7 +453,8 @@ def _create_figures(
         Returns:
             A list of figures.
         """
-        for model_name, data in list(curve_data.groupby("model_name")):
+        for model_id, data in list(curve_data.groupby("model_id")):
+            model_name = self._models[model_id]._name
             # Plot raw data scatters
             if self.options.plot_raw_data:
                 raw_data = data.filter(like="processed", axis="index")
@@ -467,14 +523,15 @@ def _run_analysis(
             fit_curves = []
             formatted = curve_data.filter(like="formatted", axis="index")
             columns = list(curve_data.columns)
-            for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+            for i, sub_data in list(formatted.groupby("model_id")):
+                name = self._models[i]._name
                 xval = sub_data.xval.to_numpy()
                 if len(xval) == 0:
                     # If data is empty, skip drawing this model.
                     # This is the case when fit model exist but no data to fit is provided.
                     continue
                 # Compute X, Y values with fit parameters.
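+                # The fitted curve is evaluated on a dense, evenly spaced grid spanning
+                # the measured x range, rather than only at the measured points.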
-                xval_fit = np.linspace(np.min(xval), np.max(xval), num=100)
+                xval_fit = np.linspace(np.min(xval), np.max(xval), num=100, dtype=float)
                 yval_fit = eval_with_uncertainties(
                     x=xval_fit,
                     model=self._models[i],
diff --git a/qiskit_experiments/curve_analysis/utils.py b/qiskit_experiments/curve_analysis/utils.py
index 21af92d8c2..66a8855254 100644
--- a/qiskit_experiments/curve_analysis/utils.py
+++ b/qiskit_experiments/curve_analysis/utils.py
@@ -18,7 +18,6 @@
 import asteval
 import lmfit
 import numpy as np
-import pandas as pd
 from qiskit.utils.deprecation import deprecate_func
 from qiskit.utils import detach_prefix
 from uncertainties import UFloat, wrap as wrap_function
@@ -225,123 +224,95 @@ def eval_with_uncertainties(
 
 
 def shot_weighted_average(
-    group: Tuple[Tuple[str, float], pd.DataFrame],
-) -> List:
+    yvals: np.ndarray,
+    yerrs: np.ndarray,
+    shots: np.ndarray,
+) -> Tuple[float, float, float]:
     """Compute shot based variance and weighted average of the categorized data frame.
 
     Sample is weighted by the shot number.
 
     Args:
-        group: Data frame grouped by the model name and x value.
+        yvals: Y values to average.
+        yerrs: Y errors to average.
+        shots: Number of shots used to obtain Y value and error.
 
     Returns:
-        A single row of the average.
+        Averaged Y value, Y error, and total shots.
     """
-    (model_name, xval), grouped_df = group
-
-    if len(grouped_df) == 1:
-        return grouped_df.iloc[0].to_list()
-
-    values_dict = dict(zip(grouped_df.columns, grouped_df.values.T))
-    weights = np.array(values_dict["shots"]) / np.sum(np.array(values_dict["shots"]))
-
-    out = dict.fromkeys(values_dict.keys())
-    out["xval"] = xval
-    out["yval"] = np.sum(weights * values_dict["yval"])
-    out["yerr"] = np.sqrt(np.sum(weights**2 * values_dict["yerr"] ** 2))
-    out["model_name"] = model_name
-    out["model_id"] = values_dict["model_id"][0]
-    out["shots"] = np.sum(values_dict["shots"])
-
-    # Process extra columns. Use set operation to aggregate metadata.
-    for extra in grouped_df.columns[6:]:
-        unique_values = set(values_dict[extra])
-        if len(unique_values) == 1:
-            out[extra] = next(iter(unique_values))
-        else:
-            out[extra] = list(unique_values)
-    return list(out.values())
+    if len(yvals) == 1:
+        return yvals[0], yerrs[0], shots[0]
+
+    if np.any(shots < -1):
+        # Shot number is unknown
+        return np.mean(yvals), np.nan, -1
+
+    total_shots = np.sum(shots)
+    weights = shots / total_shots
+
+    avg_yval = np.sum(weights * yvals)
+    avg_yerr = np.sqrt(np.sum(weights**2 * yerrs**2))
+
+    return avg_yval, avg_yerr, total_shots
 
 
 def inverse_weighted_variance(
-    group: Tuple[Tuple[str, float], pd.DataFrame],
-) -> List:
+    yvals: np.ndarray,
+    yerrs: np.ndarray,
+    shots: np.ndarray,
+) -> Tuple[float, float, int]:
     """Compute inverse weighted variance and weighted average of the categorized data frame.
 
     Sample is weighted by the inverse of the data variance.
 
     Args:
-        group: Data frame grouped by the model name and x value.
+        yvals: Y values to average.
+        yerrs: Y errors to average.
+        shots: Number of shots used to obtain Y value and error.
 
     Returns:
-        A single row of the average.
+        Averaged Y value, Y error, and total shots.
""" - (model_name, xval), grouped_df = group - - if len(grouped_df) == 1: - return grouped_df.iloc[0].to_list() + if len(yvals) == 1: + return yvals[0], yerrs[0], shots[0] - values_dict = dict(zip(grouped_df.columns, grouped_df.values.T)) - weights = 1 / values_dict["yerr"] ** 2 + total_shots = np.sum(shots) if all(shots > 0) else -1 + weights = 1 / yerrs**2 yvar = 1 / np.sum(weights) - out = dict.fromkeys(values_dict.keys()) - out["xval"] = xval - out["yval"] = yvar * np.sum(weights * values_dict["yval"]) - out["yerr"] = np.sqrt(yvar) - out["model_name"] = model_name - out["model_id"] = values_dict["model_id"][0] - out["shots"] = np.sum(values_dict["shots"]) - - # Process extra columns. Use set operation to aggregate metadata. - for extra in grouped_df.columns[6:]: - unique_values = set(values_dict[extra]) - if len(unique_values) == 1: - out[extra] = next(iter(unique_values)) - else: - out[extra] = list(unique_values) - return list(out.values()) + avg_yval = yvar * np.sum(weights * yvals) + avg_yerr = np.sqrt(yvar) + + return avg_yval, avg_yerr, total_shots +# pylint: disable=unused-argument def sample_average( - group: Tuple[Tuple[str, float], pd.DataFrame], -) -> List: + yvals: np.ndarray, + yerrs: np.ndarray, + shots: np.ndarray, +) -> Tuple[float, float, int]: """Compute sample based variance and average of the categorized data frame. Original variance of the data is ignored and variance is computed with the y values. Args: - group: Data frame grouped by the model name and x value. + yvals: Y values to average. + yerrs: Y errors to average (ignored). + shots: Number of shots used to obtain Y value and error. Returns: - A single row of the average. + Averaged Y value, Y error, and total shots. """ - (model_name, xval), grouped_df = group - - if len(grouped_df) == 1: - out = grouped_df.iloc[0].copy() - out.yerr = 0.0 # Because there is only 1 sample - return out.to_list() - - values_dict = dict(zip(grouped_df.columns, grouped_df.values.T)) - ymean = np.mean(values_dict["yval"]) - - out = dict.fromkeys(values_dict.keys()) - out["xval"] = xval - out["yval"] = ymean - out["yerr"] = np.sqrt(np.mean((ymean - values_dict["yval"]) ** 2) / len(grouped_df)) - out["model_name"] = model_name - out["model_id"] = values_dict["model_id"][0] - out["shots"] = np.sum(values_dict["shots"]) - - # Process extra columns. Use set operation to aggregate metadata. - for extra in grouped_df.columns[6:]: - unique_values = set(values_dict[extra]) - if len(unique_values) == 1: - out[extra] = next(iter(unique_values)) - else: - out[extra] = list(unique_values) - return list(out.values()) + if len(yvals) == 1: + return yvals[0], 0.0, shots[0] + + total_shots = np.sum(shots) if all(shots > 0) else -1 + + avg_yval = np.mean(yvals) + avg_yerr = np.sqrt(np.mean((avg_yval - yvals) ** 2) / len(yvals)) + + return avg_yval, avg_yerr, total_shots @deprecate_func(