Optimize dataframe performance
- Avoid per-entry add_entry calls in data processing. Creating a dataframe from the bulk data at once is much faster than appending entries one by one.
- Group and sort by model_id instead of model_name, since string matching is slow in pandas.
- Replace the pandas groupby in the format subroutine with Python's native groupby function, which is faster.
nkanazawa1989 committed Sep 16, 2023
1 parent 7b114de commit a0bcded
Showing 3 changed files with 171 additions and 141 deletions.
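
As a rough illustration of the first bullet, the sketch below times row-by-row growth against one bulk construction. This is a standalone micro-benchmark, not part of the commit; the size, columns, and values are invented:

import timeit

import numpy as np
import pandas as pd

N = 1_000

def one_by_one() -> pd.DataFrame:
    # Old pattern: grow the table one entry at a time.
    # Every enlargement copies data, so this is quadratic overall.
    df = pd.DataFrame(columns=["xval", "yval"])
    for i in range(N):
        df.loc[len(df)] = [float(i), float(i) ** 0.5]
    return df

def bulk() -> pd.DataFrame:
    # New pattern: fill a preallocated struct array, then build the
    # frame once from the complete block of data.
    data = np.empty(N, dtype=np.dtype([("xval", float), ("yval", float)]))
    data["xval"] = np.arange(N, dtype=float)
    data["yval"] = np.sqrt(data["xval"])
    return pd.DataFrame(data)

print("row by row:", timeit.timeit(one_by_one, number=1))
print("bulk      :", timeit.timeit(bulk, number=1))

On typical hardware the bulk construction is dramatically faster, which is the effect the commit exploits.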
6 changes: 4 additions & 2 deletions qiskit_experiments/curve_analysis/composite_curve_analysis.py
@@ -231,7 +231,8 @@ def _create_figures(
         """
         for analysis in self.analyses():
             sub_data = curve_data[curve_data.model_name.str.endswith(f"_{analysis.name}")]
-            for model_name, data in list(sub_data.groupby("model_name")):
+            for model_id, data in list(sub_data.groupby("model_id")):
+                model_name = analysis._models[model_id]._name
                 # Plot raw data scatters
                 if analysis.options.plot_raw_data:
                     raw_data = data.filter(like="processed", axis="index")
@@ -370,7 +371,8 @@ def _run_analysis(
             fit_curves = []
             formatted = curve_data.filter(like="formatted", axis="index")
             columns = list(curve_data.columns)
-            for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+            for i, sub_data in list(formatted.groupby("model_id")):
+                name = analysis._models[i]._name
                 xval = sub_data.xval.to_numpy()
                 if len(xval) == 0:
                     # If data is empty, skip drawing this model.
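
The model_name to model_id change above swaps string group keys for integers. A minimal sketch of the difference (table contents are invented; only the relative timing matters):

import timeit

import numpy as np
import pandas as pd

names = np.array(["model_alpha", "model_beta"])  # hypothetical model names
ids = np.random.default_rng(1234).integers(2, size=100_000)
df = pd.DataFrame(
    {
        "model_name": names[ids],  # string labels
        "model_id": ids,           # equivalent integer labels
        "yval": np.random.default_rng(42).normal(size=ids.size),
    }
)

# Grouping by the integer column avoids hashing and comparing strings;
# the model name can be looked up from the id afterwards, as the diff does.
print("by name:", timeit.timeit(lambda: list(df.groupby("model_name")), number=20))
print("by id  :", timeit.timeit(lambda: list(df.groupby("model_id")), number=20))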
165 changes: 111 additions & 54 deletions qiskit_experiments/curve_analysis/curve_analysis.py
@@ -17,10 +17,11 @@
 
 from typing import Dict, List, Tuple, Union, Optional
 from functools import partial
+from itertools import groupby
+from operator import itemgetter
 
 import lmfit
 import numpy as np
-import pandas as pd
 from uncertainties import unumpy as unp
 
 from qiskit_experiments.framework import ExperimentData, AnalysisResultData
@@ -161,30 +162,80 @@ def _run_data_processing(
         Raises:
             DataProcessorError: When key for x values is not found in the metadata.
             DataProcessorError: When data_subfit_map information for a fit model is missing.
+            ValueError: When input data has incomplete metadata to specify fit model.
         """
-        x_key = self.options.x_key
-        filter_kwargs = self.options.filter_data
+        opt = self.options
 
-        out = ScatterTable()
-        to_process = []
-        for idx, datum in enumerate(raw_data):
+        # Create table
+        if opt.filter_data:
+            to_process = [d for d in raw_data if opt.filter_data.items() <= d["metadata"].items()]
+        else:
+            to_process = raw_data
+
+        # This must align with ScatterTable columns. Use struct array.
+        dtypes = np.dtype(
+            [
+                ("xval", float),
+                ("yval", float),
+                ("yerr", float),
+                ("model_name", "U30"),  # builtin str is U0 which is zero-length unicode in numpy
+                ("model_id", int),
+                ("shots", int),
+            ]
+        )
+        table_data = np.empty(len(to_process), dtype=dtypes)
+        for idx, datum in enumerate(to_process):
             metadata = datum["metadata"].copy()
-            if filter_kwargs and not filter_kwargs.items() <= datum["metadata"].items():
-                # This data is not a part of this analysis.
-                continue
+            # Get xval from metadata
             try:
-                xval = metadata.pop(x_key)
+                xval = metadata.pop(opt.x_key)
             except KeyError as ex:
                 raise DataProcessorError(
-                    f"X value key {x_key} is not defined in the circuit metadata."
+                    f"X value key {opt.x_key} is not defined in the circuit metadata."
                 ) from ex
-            out.add_entry(
-                index=f"processed-{idx:04d}",
-                xval=xval,
-                shots=datum.get("shots", pd.NA),
-                **metadata,
-            )
-            to_process.append(datum)
+            # Classify fit model
+            if len(self._models) == 1:
+                m_id = 0
+                m_name = self._models[0]._name
+            else:
+                for i, model in enumerate(self._models):
+                    try:
+                        model_spec = self.options.data_subfit_map[model._name]
+                    except KeyError as ex:
+                        raise DataProcessorError(
+                            f"Mapping to data for the fit model {model._name} is not provided."
+                        ) from ex
+                    if model_spec.items() <= metadata.items():
+                        m_id = i
+                        m_name = model._name
+                        break
+                else:
+                    raise ValueError(f"Experiment data {datum} doesn't belong to any fit model.")
+            table_data[idx]["xval"] = xval
+            table_data[idx]["shots"] = datum.get("shots", -1)
+            table_data[idx]["model_id"] = m_id
+            table_data[idx]["model_name"] = m_name
+
+            # Add extra metadata
+            add_key = metadata.keys() - table_data.dtype.fields
+            if add_key:
+                # Add missing keys to struct array
+                # This code is lengthy but faster than merge_arrays function
+                add_dtypes = []
+                for k in add_key:
+                    if isinstance(metadata[k], str):
+                        new_dtype = "U30"
+                    else:
+                        new_dtype = type(metadata[k])
+                    add_dtypes.append((k, new_dtype))
+                new_table_data = np.empty(
+                    len(to_process), dtype=sum((table_data.dtype.descr, add_dtypes), [])
+                )
+                for k in table_data.dtype.fields:
+                    new_table_data[k] = table_data[k]
+                table_data = new_table_data
+            for k, v in metadata.items():
+                table_data[idx][k] = v
 
         # Compute y value
         if not self.options.data_processor:
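
For reference, the structured-array trick from the hunk above in isolation (a self-contained sketch; the field names and the extra qubit key are invented, and ScatterTable is left out):

import numpy as np
import pandas as pd

# Fixed columns known up front, mirroring the dtype in the diff.
table = np.empty(3, dtype=np.dtype([("xval", float), ("yval", float), ("model_id", int)]))

# Widening the dtype for extra metadata keys: allocate a new struct array
# with the old field description plus the new fields, then copy columns over.
wider = np.empty(len(table), dtype=table.dtype.descr + [("qubit", int)])
for field in table.dtype.fields:
    wider[field] = table[field]
table = wider

table["xval"] = [0.1, 0.2, 0.3]
table["yval"] = [1.0, 0.9, 0.8]
table["model_id"] = 0
table["qubit"] = 5

# The dataframe is built once from the complete struct array.
df = pd.DataFrame(table, index=[f"processed-{i:04d}" for i in range(len(table))])
print(df)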
@@ -194,29 +245,13 @@
                 "data_processor analysis options."
             )
         processed_values = self.options.data_processor(to_process)
-        out.yval = unp.nominal_values(processed_values)
-        out.yerr = unp.std_devs(processed_values)
+        table_data["yval"] = unp.nominal_values(processed_values).flatten()
+        table_data["yerr"] = unp.std_devs(processed_values).flatten()
 
-        # Set model names and ids
-        if len(self._models) == 1:
-            out.model_name = self._models[0]._name
-            out.model_id = 0
-        else:
-            for i, model in enumerate(self._models):
-                try:
-                    model_spec = self.options.data_subfit_map[model._name]
-                except KeyError as ex:
-                    raise DataProcessorError(
-                        f"Mapping to data for the fit model {model._name} is not provided."
-                    ) from ex
-                conds = [out[k] == v for k, v in model_spec.items()]
-                if len(conds) >= 2:
-                    rows = np.logical_and(*conds)
-                else:
-                    rows = conds[0]
-                out.loc[rows, "model_name"] = model._name
-                out.loc[rows, "model_id"] = i
+        out = ScatterTable(
+            data=table_data,
+            index=[f"processed-{i:04d}" for i in range(len(to_process))],
+        )
         return out
 
     def _format_data(
@@ -237,19 +272,39 @@
             "sample": sample_average,
         }
 
-        # Data is automatically sorted by dataframe groupby operation
-        grouped_by_model = curve_data.groupby(["model_name", "xval"], as_index=False)
-
-        # Conventional approach to average over the grouped dataframe is
-        # to use .apply method with a callback, but this operation just internally
-        # run python loop for each grouped dataframe, and thus not performant.
-        # This directly manipulates the dataframe values and roughly 4x faster.
-        averaged = map(
-            averaging_methods[self.options.average_method],
-            list(grouped_by_model),
-        )
+        columns = list(curve_data.columns)
+        sort_by = itemgetter(
+            columns.index("model_id"),
+            columns.index("xval"),
+        )
+        # Use python native groupby method on ndarray. This is more performant than pandas one.
+        average = averaging_methods[self.options.average_method]
+        formatted = []
+        for (mid, xv), g in groupby(sorted(curve_data.values, key=sort_by), key=sort_by):
+            g_values = np.array(list(g))
+            g_dict = dict(zip(columns, g_values.T))
+            avg_yval, avg_yerr, shots = average(g_dict["yval"], g_dict["yerr"], g_dict["shots"])
+            averaged = dict.fromkeys(columns)
+            averaged["xval"] = xv
+            averaged["yval"] = avg_yval
+            averaged["yerr"] = avg_yerr
+            averaged["model_id"] = mid
+            averaged["shots"] = shots
+            for k, v in g_dict.items():
+                if averaged[k] is not None:
+                    continue
+                if len(g_values) == 1:
+                    averaged[k] = v[0]
+                else:
+                    unique = set(v)
+                    if len(unique) == 1:
+                        averaged[k] = next(iter(unique))
+                    else:
+                        averaged[k] = list(unique)
+            formatted.append(list(averaged.values()))
 
         return curve_data.append_list_values(
-            other=list(averaged),
+            other=formatted,
             prefix="formatted",
         )

@@ -398,7 +453,8 @@ def _create_figures(
         Returns:
             A list of figures.
         """
-        for model_name, data in list(curve_data.groupby("model_name")):
+        for model_id, data in list(curve_data.groupby("model_id")):
+            model_name = self._models[model_id]._name
             # Plot raw data scatters
             if self.options.plot_raw_data:
                 raw_data = data.filter(like="processed", axis="index")
@@ -467,14 +523,15 @@ def _run_analysis(
         fit_curves = []
         formatted = curve_data.filter(like="formatted", axis="index")
         columns = list(curve_data.columns)
-        for (i, name), sub_data in list(formatted.groupby(["model_id", "model_name"])):
+        for i, sub_data in list(formatted.groupby("model_id")):
+            name = self._models[i]._name
             xval = sub_data.xval.to_numpy()
             if len(xval) == 0:
                 # If data is empty, skip drawing this model.
                 # This is the case when fit model exist but no data to fit is provided.
                 continue
             # Compute X, Y values with fit parameters.
-            xval_fit = np.linspace(np.min(xval), np.max(xval), num=100)
+            xval_fit = np.linspace(np.min(xval), np.max(xval), num=100, dtype=float)
             yval_fit = eval_with_uncertainties(
                 x=xval_fit,
                 model=self._models[i],
