Skip to content

Commit

Permalink
Datasets as numpy arrays (qiskit-community#26)
Browse files Browse the repository at this point in the history
* data has been converted into np.array other than ad-hoc

* fix wine

* fix ad_hoc, style, lint

* fix breast cancer

* fix digits

* fix digits 2

* fix readme test/page

* fix gaussian

* adding sample_total back

* test_iris is completed

* fix notebook, fix warning, enhance ad_hoc

* fix style, remove unused

* remove unused

* fix notebook imports

Co-authored-by: Anton Dekusar <[email protected]>
  • Loading branch information
beichensinn and adekusar-drl authored Apr 1, 2021
1 parent 6dd2dac commit 8ba8738
Show file tree
Hide file tree
Showing 21 changed files with 247 additions and 2,299 deletions.
15 changes: 6 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,21 @@ be classified.
from qiskit.algorithms.optimizers import COBYLA
from qiskit.circuit.library import TwoLocal
from qiskit_machine_learning.algorithms import VQC
from qiskit_machine_learning.datasets import wine, features_and_labels
from qiskit_machine_learning.datasets import wine
from qiskit_machine_learning.circuit.library import RawFeatureVector

seed = 1376
algorithm_globals.random_seed = seed

# Use Wine data set for training and test data
feature_dim = 4 # dimension of each data point
# sample_train, training_input, test_input, class_labels
training_size = 12
test_size = 4
_, training_input, test_input, class_labels = wine(training_size=training_size,
test_size=test_size,
n=feature_dim)

# prepare features and labels
training_features, train_labels, _ = features_and_labels(training_input, class_labels)
test_features, test_labels, _ = features_and_labels(test_input, class_labels)
# training features, training labels, test features, test labels as np.array,
# one hot encoding for labels
training_features, training_labels, test_features, test_labels = \
wine(training_size=training_size, test_size=test_size, n=feature_dim)

feature_map = RawFeatureVector(feature_dimension=feature_dim)
var_form = TwoLocal(feature_map.num_qubits, ['ry', 'rz'], 'cz', reps=3)
Expand All @@ -75,7 +72,7 @@ be classified.
seed_simulator=seed,
seed_transpiler=seed)
)
vqc.fit(training_features, train_labels)
vqc.fit(training_features, training_labels)

score = vqc.score(test_features, test_labels)
print('Testing accuracy: {:0.2f}'.format(score))
Expand Down
107 changes: 51 additions & 56 deletions docs/tutorials/03_quantum_kernel.ipynb

Large diffs are not rendered by default.

17 changes: 1 addition & 16 deletions qiskit_machine_learning/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,6 @@
gaussian
iris
wine
get_feature_dimension
get_num_classes
split_dataset_to_data_and_labels
map_label_to_class_name
reduce_dim_to_via_pca
features_and_labels
"""

Expand All @@ -46,9 +40,6 @@
from .gaussian import gaussian
from .iris import iris
from .wine import wine
from .dataset_helper import (get_feature_dimension, get_num_classes,
split_dataset_to_data_and_labels,
map_label_to_class_name, reduce_dim_to_via_pca, features_and_labels)

__all__ = [
'ad_hoc_data',
Expand All @@ -57,11 +48,5 @@
'digits',
'gaussian',
'iris',
'wine',
'get_feature_dimension',
'get_num_classes',
'split_dataset_to_data_and_labels',
'map_label_to_class_name',
'reduce_dim_to_via_pca',
'features_and_labels'
'wine'
]
16 changes: 14 additions & 2 deletions qiskit_machine_learning/datasets/ad_hoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
from qiskit.utils import algorithm_globals
from qiskit.exceptions import MissingOptionalLibraryError

from qiskit_machine_learning.datasets.dataset_helper import features_and_labels_transform

def ad_hoc_data(training_size, test_size, n, gap, plot_data=False):

def ad_hoc_data(training_size, test_size, n, gap, plot_data=False,
one_hot=True, include_sample_total=False):
""" returns ad hoc dataset """
class_labels = [r'A', r'B']
count = 0
Expand Down Expand Up @@ -267,7 +270,16 @@ def ad_hoc_data(training_size, test_size, n, gap, plot_data=False):
ax_1.scatter(x_2, y_2, z_2, c='#683FC8')
plt.show()

return sample_total, training_input, test_input, class_labels
training_feature_array, training_label_array = features_and_labels_transform(
training_input, class_labels, one_hot)
test_feature_array, test_label_array = features_and_labels_transform(
test_input, class_labels, one_hot)

if include_sample_total:
return (training_feature_array, training_label_array, test_feature_array, test_label_array,
sample_total)
else:
return training_feature_array, training_label_array, test_feature_array, test_label_array


def sample_ad_hoc_data(sample_total, test_size, n):
Expand Down
9 changes: 7 additions & 2 deletions qiskit_machine_learning/datasets/breast_cancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from qiskit.exceptions import MissingOptionalLibraryError
from .dataset_helper import features_and_labels_transform


def breast_cancer(training_size, test_size, n, plot_data=False):
def breast_cancer(training_size, test_size, n, plot_data=False, one_hot=True):
""" returns breast cancer dataset """
class_labels = [r'A', r'B']
data, target = datasets.load_breast_cancer(return_X_y=True)
Expand Down Expand Up @@ -50,6 +51,10 @@ def breast_cancer(training_size, test_size, n, plot_data=False):
for k, key in enumerate(class_labels)}
test_input = {key: (sample_test[label_test == k, :])[:test_size]
for k, key in enumerate(class_labels)}
training_feature_array, training_label_array = features_and_labels_transform(
training_input, class_labels, one_hot)
test_feature_array, test_label_array = features_and_labels_transform(
test_input, class_labels, one_hot)

if plot_data:
try:
Expand All @@ -66,4 +71,4 @@ def breast_cancer(training_size, test_size, n, plot_data=False):
plt.title("PCA dim. reduced Breast cancer dataset")
plt.show()

return sample_train, training_input, test_input, class_labels
return training_feature_array, training_label_array, test_feature_array, test_label_array
158 changes: 25 additions & 133 deletions qiskit_machine_learning/datasets/dataset_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,137 +12,11 @@

""" Data set helper """

import operator
from copy import deepcopy
from typing import Dict, List, Tuple

import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder


def get_num_classes(dataset):
"""
Check number of classes in a given dataset
Args:
dataset(dict): key is the class name and value is the data.
Returns:
int: number of classes
"""
return len(list(dataset.keys()))


def get_feature_dimension(dataset):
"""
Check feature dimension of a given dataset
Args:
dataset(dict): key is the class name and value is the data.
Returns:
int: feature dimension, -1 denotes no data in the dataset.
Raises:
TypeError: invalid data set
"""
if not isinstance(dataset, dict):
raise TypeError("Dataset is not formatted as a dict. Please check it.")

feature_dim = -1
for v in dataset.values():
if not isinstance(v, np.ndarray):
v = np.asarray(v)
return v.shape[1]

return feature_dim


# pylint: disable=invalid-name
def split_dataset_to_data_and_labels(dataset, class_names=None):
"""
Split dataset to data and labels numpy array
If `class_names` is given, use it as the class name to label mapping;
otherwise create the mapping from the sorted keys of the dataset.
Args:
dataset (dict): {'A': numpy.ndarray, 'B': numpy.ndarray, ...}
class_names (dict): class name of dataset, {class_name: label}
Returns:
Union(tuple(list, dict), list):
List contains two arrays of numpy.ndarray type
where the array at index 0 is data, an NxD array, and at
index 1 it is labels, an Nx1 array, containing values in range
0 to K-1, where K is the number of classes. The dict is a map
{str: int}, mapping class name to label. The tuple of list, dict is returned
when `class_names` is None, otherwise just the list is returned.
Raises:
KeyError: data set invalid
"""
data = []
labels = []
if class_names is None:
sorted_classes_name = sorted(list(dataset.keys()))
class_to_label = {k: idx for idx, k in enumerate(sorted_classes_name)}
else:
class_to_label = class_names
sorted_label = sorted(class_to_label.items(), key=operator.itemgetter(1))
for class_name, _ in sorted_label:
values = dataset[class_name]
for value in values:
data.append(value)
try:
labels.append(class_to_label[class_name])
except Exception as ex: # pylint: disable=broad-except
raise KeyError('The dataset has different class names to '
'the training data. error message: {}'.format(ex)) from ex
data = np.asarray(data)
labels = np.asarray(labels)
if class_names is None:
return [data, labels], class_to_label
else:
return [data, labels]


def map_label_to_class_name(predicted_labels, label_to_class):
"""
Helper converts labels (numeric) to class name (string)
Args:
predicted_labels (numpy.ndarray): Nx1 array
label_to_class (dict or list): a mapping from label (numeric) to class name (str)
Returns:
list: predicted class names of each datum
"""

if not isinstance(predicted_labels, np.ndarray):
predicted_labels = np.asarray([predicted_labels])

predicted_class_names = []

for predicted_label in predicted_labels:
predicted_class_names.append(label_to_class[predicted_label])
return predicted_class_names


def reduce_dim_to_via_pca(x, dim):
"""
Reduce the data dimension via pca
Args:
x (numpy.ndarray): NxD array
dim (int): the targeted dimension D'
Returns:
numpy.ndarray: NxD' array
"""
x_reduced = PCA(n_components=dim).fit_transform(x)
return x_reduced


def discretize_and_truncate(data, bounds, num_qubits, return_data_grid_elements=False,
Expand Down Expand Up @@ -250,24 +124,42 @@ def discretize_and_truncate(data, bounds, num_qubits, return_data_grid_elements=
return data, data_grid


def features_and_labels(dataset: Dict[str, np.ndarray], class_labels: List[str]
) -> Tuple[np.ndarray, np.ndarray, OneHotEncoder]:
def features_and_labels_transform(dataset: Dict[str, np.ndarray],
class_labels: List[str],
one_hot: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
"""
Converts a dataset into arrays of features and labels.
Args:
dataset: A dictionary in the format of {'A': numpy.ndarray, 'B': numpy.ndarray, ...}
class_labels: A list of classes in the dataset
one_hot (bool): if True, return one-hot encoded labels; if False, return integer-encoded labels
Returns:
A tuple of features as np.ndarray, labels as np.ndarray, and a one hot encoder.
A tuple of features as np.ndarray and labels as np.ndarray
"""
features = np.concatenate(list(dataset.values()))
encoder = preprocessing.OneHotEncoder()
encoder.fit(np.array(class_labels).reshape(-1, 1))

raw_labels = []
for category in dataset.keys():
num_samples = dataset[category].shape[0]
raw_labels += [category] * num_samples
labels = np.array(encoder.transform(np.array(raw_labels).reshape(-1, 1)).todense())
return features, labels, encoder

if not raw_labels:
# no labels, empty dataset
labels = np.zeros((0, len(class_labels)))
return features, labels

if one_hot:
encoder = preprocessing.OneHotEncoder()
encoder.fit(np.array(class_labels).reshape(-1, 1))
labels = encoder.transform(np.array(raw_labels).reshape(-1, 1))
if not isinstance(labels, np.ndarray):
labels = np.array(labels.todense())
else:
encoder = preprocessing.LabelEncoder()
encoder.fit(np.array(class_labels))
labels = encoder.transform(np.array(raw_labels))

return features, labels
9 changes: 7 additions & 2 deletions qiskit_machine_learning/datasets/digits.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from qiskit.exceptions import MissingOptionalLibraryError
from .dataset_helper import features_and_labels_transform


def digits(training_size, test_size, n, plot_data=False):
def digits(training_size, test_size, n, plot_data=False, one_hot=True):
""" returns digits dataset """
class_labels = [r'A', r'B', r'C', r'D', r'E', r'F', r'G', r'H', r'I', r'J']
data = datasets.load_digits()
Expand Down Expand Up @@ -51,6 +52,10 @@ def digits(training_size, test_size, n, plot_data=False):
for k, key in enumerate(class_labels)}
test_input = {key: (sample_test[label_test == k, :])[:test_size]
for k, key in enumerate(class_labels)}
training_feature_array, training_label_array = features_and_labels_transform(
training_input, class_labels, one_hot)
test_feature_array, test_label_array = features_and_labels_transform(
test_input, class_labels, one_hot)

if plot_data:
try:
Expand All @@ -67,4 +72,4 @@ def digits(training_size, test_size, n, plot_data=False):
plt.title("PCA dim. reduced Digits dataset")
plt.show()

return sample_train, training_input, test_input, class_labels
return training_feature_array, training_label_array, test_feature_array, test_label_array
Loading

0 comments on commit 8ba8738

Please sign in to comment.