diff --git a/nn_utils.py b/nn_utils.py
new file mode 100644
index 0000000..3dddc3b
--- /dev/null
+++ b/nn_utils.py
@@ -0,0 +1,531 @@
+import json
+import os
+import pickle as pkl
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import Callable
+
+import h5py as h5
+import numpy as np
+import tensorflow as tf
+import zstd
+from HGQ.bops import trace_minmax
+from keras.layers import Dense
+from keras.src.layers.convolutional.base_conv import Conv
+from keras.src.saving.legacy import hdf5_format
+from matplotlib import pyplot as plt
+from tensorflow import keras
+from tqdm.auto import tqdm
+
+
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    """JSON encoder that also accepts NumPy scalars and arrays."""
+
+    def default(self, obj):
+        # Logs may carry NumPy values of any width, not only float32.
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.bool_):
+            return bool(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return super().default(obj)
+
+
+class SaveTopN(keras.callbacks.Callback):
+    """Keep the weights of the ``n`` best epochs, ranked by a user-defined metric.
+
+    ``best`` and ``weight_paths`` are kept sorted best-first, so the last slot
+    always holds the entry that is evicted next. Unused slots hold the
+    ``'/dev/null'`` placeholder.
+
+    Args:
+        metric_fn: Maps the epoch logs (plus an ``'epoch'`` key) to the ranking metric.
+        n: Number of checkpoints to keep.
+        path: Directory that receives the checkpoints; created if missing.
+        side: ``'max'`` if a larger metric is better, ``'min'`` if a smaller one is.
+        fname_format: ``str.format`` template for the file name, filled from the
+            logs plus the ``epoch`` and ``metric`` keys.
+        cond_fn: Epochs whose logs make this return ``False`` are skipped.
+
+    Raises:
+        ValueError: If ``side`` is neither ``'max'`` nor ``'min'``.
+    """
+
+    _EMPTY = '/dev/null'  # placeholder path marking an unused slot
+
+    def __init__(
+        self,
+        metric_fn: Callable[[dict], float],
+        n: int,
+        path: str | Path,
+        side: str = 'max',
+        fname_format='epoch={epoch}-metric={metric:.4e}.h5',
+        cond_fn: Callable[[dict], bool] = lambda x: True,
+    ):
+        super().__init__()
+        if side not in ('max', 'min'):
+            raise ValueError(f"side must be 'max' or 'min', got {side!r}")
+        self.n = n
+        self.metric_fn = metric_fn
+        self.path = Path(path)
+        self.fname_format = fname_format
+        os.makedirs(path, exist_ok=True)
+        self.weight_paths = np.full(n, self._EMPTY, dtype=object)
+        if side == 'max':
+            self.best = np.full(n, -np.inf)
+            self.side = np.greater
+        else:
+            self.best = np.full(n, np.inf)
+            self.side = np.less
+        self.cond = cond_fn
+
+    def _sort_best_first(self):
+        """Re-sort ``best`` and ``weight_paths`` so that index 0 is the best entry."""
+        idx = np.argsort(self.best)
+        if self.side is np.greater:
+            idx = idx[::-1]
+        self.best = self.best[idx]
+        self.weight_paths = self.weight_paths[idx]
+
+    def on_epoch_end(self, epoch, logs=None):
+        """Save this epoch's weights if its metric beats the worst kept checkpoint."""
+        assert isinstance(logs, dict)
+        assert isinstance(self.model, keras.models.Model)
+        logs = logs.copy()
+        logs['epoch'] = epoch
+        if not self.cond(logs):
+            return
+        metric = self.metric_fn(logs)
+        if not self.side(metric, self.best[-1]):
+            return
+
+        evicted = self.weight_paths[-1]
+        # Never unlink the placeholder: as root that would delete the real /dev/null.
+        if evicted != self._EMPTY:
+            try:
+                os.remove(evicted)
+            except OSError:
+                pass  # best effort; the file may already be gone
+        logs['metric'] = metric
+        fname = self.path / self.fname_format.format(**logs)
+        self.best[-1] = metric
+        self.weight_paths[-1] = fname
+        self.model.save_weights(fname)
+        # Store the logs next to the weights so the checkpoint can be ranked later.
+        with h5.File(fname, 'r+') as f:
+            f.attrs['train_log'] = json.dumps(logs, cls=NumpyFloatValuesEncoder)
+        self._sort_best_first()
+
+    def rename_ckpts(self, dataset, bsz=65536):
+        """Re-rank and rename the kept checkpoints using freshly traced BOPs.
+
+        Each checkpoint is loaded, ``trace_minmax`` is run on ``dataset``, its
+        result is stored under ``logs['bops']``, and the metric and file name
+        are recomputed from the updated logs. The model's current weights are
+        restored afterwards, even if an error interrupts the loop.
+
+        Args:
+            dataset: Data passed to ``trace_minmax``.
+            bsz: Batch size passed to ``trace_minmax``.
+        """
+        assert self.weight_paths[0] != self._EMPTY, 'No checkpoints to rename'
+        assert isinstance(self.model, keras.models.Model)
+
+        # Snapshot the live weights in memory; the loop below overwrites them.
+        weight_buf = BytesIO()
+        with h5.File(weight_buf, 'w') as f:
+            hdf5_format.save_weights_to_hdf5_group(f, self.model)
+        weight_buf.seek(0)
+
+        try:
+            for i, path in enumerate(tqdm(self.weight_paths, desc='Renaming checkpoints')):
+                if path == self._EMPTY:
+                    continue
+                self.model.load_weights(path)
+                bops = trace_minmax(self.model, dataset, bsz=bsz, verbose=False)
+                with h5.File(path, 'r+') as f:
+                    logs = json.loads(f.attrs['train_log'])  # type: ignore
+                    logs['bops'] = bops
+                    metric = self.metric_fn(logs)
+                    logs['metric'] = metric
+                    f.attrs['train_log'] = json.dumps(logs, cls=NumpyFloatValuesEncoder)
+                self.best[i] = metric
+                new_fname = self.path / self.fname_format.format(**logs)
+                os.rename(path, new_fname)
+                self.weight_paths[i] = new_fname
+            # Keep the best-first order that on_epoch_end relies on (descending for 'max').
+            self._sort_best_first()
+        finally:
+            with h5.File(weight_buf, 'r') as f:
+                hdf5_format.load_weights_from_hdf5_group_by_name(f, self.model)
+
+
+class PBarCallback(tf.keras.callbacks.Callback):
+    """Show training progress as a single tqdm bar that advances once per epoch.
+
+    Args:
+        metric: ``str.format`` template filled from the epoch logs and shown as
+            the bar description; every key it references must be in the logs.
+    """
+
+    def __init__(self, metric='loss: {loss:.2f}/{val_loss:.2f}'):
+        super().__init__()
+        self.pbar = None
+        self.template = metric
+
+    def on_epoch_begin(self, epoch, logs=None):
+        # Created lazily because the epoch count is read from ``self.params``.
+        if self.pbar is None:
+            self.pbar = tqdm(total=self.params['epochs'], unit='epoch')
+
+    def on_epoch_end(self, epoch, logs=None):
+        assert isinstance(self.pbar, tqdm)
+        assert isinstance(logs, dict)
+        self.pbar.update(1)
+        string = self.template.format(**logs)
+        if 'bops' in logs:
+            string += f' - BOPs: {logs["bops"]:,.0f}'
+        self.pbar.set_description(string)
+
+    def on_train_end(self, logs=None):
+        if self.pbar is not None:
+            self.pbar.close()
+            # Drop the closed bar so a later fit() call gets a fresh one.
+            self.pbar = None
+
+
+def plot_history(histry: dict, metrics=('loss', 'val_loss'), ylabel='Loss', logy=False):
+    """Plot the selected training curves against the epoch index.
+
+    Args:
+        histry: Maps metric name to its per-epoch values.
+        metrics: Keys of ``histry`` to draw.
+        ylabel: Label of the y axis.
+        logy: Use a logarithmic y axis.
+
+    Returns:
+        The ``(fig, ax)`` pair from ``plt.subplots``.
+    """
+    fig, ax = plt.subplots()
+    for metric in metrics:
+        ax.plot(histry[metric], label=metric)
+    ax.set_xlabel('Epoch')
+    ax.set_ylabel(ylabel)
+    if logy:
+        ax.set_yscale('log')
+    ax.legend()
+    return fig, ax
+
+
+def save_model(model: keras.models.Model, path: str):
+    """Save ``model`` to ``path`` and its training history next to it.
+
+    The history goes to ``path`` with its suffix replaced by ``.history``; an
+    empty dict is stored when the model has no history.
+    """
+    model.save(path)
+    history = model.history.history if model.history is not None else {}
+    save_history(history, Path(path).with_suffix('.history'))
+
+
+def load_model(path: str, co=None):
+    """Load a model written by ``save_model`` together with its history.
+
+    Args:
+        path: Path of the saved model.
+        co: Optional ``custom_objects`` forwarded to ``keras.models.load_model``.
+
+    Returns:
+        The ``(model, history)`` pair.
+    """
+    model: keras.Model = keras.models.load_model(path, custom_objects=co)  # type: ignore
+    history: dict[str, list] = load_history(Path(path).with_suffix('.history'))
+    return model, history
+
+
+def save_history(history, path):
+    """Pickle ``history`` and write it zstd-compressed to ``path``."""
+    with open(path, 'wb') as f:
+        f.write(zstd.compress(pkl.dumps(history)))
+
+
+def load_history(path):
+    """Read a history written by ``save_history``.
+
+    NOTE: this unpickles the file, so only load files from a trusted source.
+    """
+    with open(path, 'rb') as f:
+        return pkl.loads(zstd.decompress(f.read()))
+
+
+def absorb_batchNorm(model_target, model_original):
+    """Copy weights from ``model_original`` into ``model_target``, folding BatchNormalization.
+
+    Layers are matched by name and nested ``Functional`` models are handled
+    recursively. A Dense/Conv layer whose first outbound layer in the original
+    model is a BatchNormalization gets that normalization folded into its
+    kernel and bias. Any other layer with a kernel just receives the original
+    weights, keeping whatever extra trailing weights the target layer has.
+    """
+    for layer in model_target.layers:
+        if layer.__class__.__name__ == 'Functional':
+            absorb_batchNorm(layer, model_original.get_layer(layer.name))
+            continue
+
+        if isinstance(layer, (Dense, Conv)):
+            source = model_original.get_layer(layer.name)
+            nodes = source._outbound_nodes
+            if len(nodes) > 0 and isinstance(nodes[0].outbound_layer, keras.layers.BatchNormalization):
+                bn = nodes[0].outbound_layer
+                # NOTE(review): assumes the BatchNormalization has both scale and center enabled.
+                _gamma, _beta, _mu, _var = bn.get_weights()
+                # Use the layer's own epsilon instead of assuming the 1e-3 default.
+                std = np.sqrt(bn.epsilon + _var)
+                _ratio = _gamma / std
+                _bias = _beta - _gamma * _mu / std
+
+                k, *_b = source.get_weights()
+                b = _b[0] if _b else np.zeros(layer.output_shape[-1])
+                # Scale along the last axis of kernel and bias.
+                nk = k * _ratio
+                nb = b * _ratio + _bias
+                # Weights after kernel and bias in the target layer are left untouched.
+                extras = layer.get_weights()[2:]
+                layer.set_weights([nk, nb, *extras])
+                continue
+
+        if not hasattr(layer, 'kernel'):
+            continue
+        # Skip layers whose weights are all '_bw' variables.
+        if all('_bw' in w.name for w in layer.weights):
+            continue
+        weights = layer.get_weights()
+        new_weights = model_original.get_layer(layer.name).get_weights()
+        # Original weights first, then the target's own trailing weights, cut to size.
+        merged = [*new_weights, *weights[len(new_weights):]]
+        layer.set_weights(merged[: len(weights)])
+
+
+def set_seed(seed):
+    """Seed NumPy, TensorFlow and ``random``, and enable deterministic TensorFlow ops."""
+    np.random.seed(seed)
+    tf.random.set_seed(seed)
+    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only reaches child processes.
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    random.seed(seed)
+
+    tf.config.experimental.enable_op_determinism()
+
+
+def get_best_ckpt(save_path: Path, take_min=False):
+    """Return the checkpoint in ``save_path`` with the best stored metric.
+
+    The metric is read from the ``train_log`` attribute of each ``*.h5`` file.
+
+    Args:
+        save_path: Directory holding the checkpoints.
+        take_min: Pick the smallest metric instead of the largest.
+
+    Raises:
+        IndexError: If ``save_path`` contains no ``*.h5`` file.
+    """
+    ckpts = list(save_path.glob('*.h5'))
+    if not ckpts:
+        raise IndexError(f'No *.h5 checkpoint found in {save_path}')
+
+    def rank(ckpt: Path):
+        with h5.File(ckpt, 'r') as f:
+            log = json.loads(f.attrs['train_log'])  # type: ignore
+        return log['metric']
+
+    pick = min if take_min else max
+    return pick(ckpts, key=rank)
+
+
+class PeratoFront(keras.callbacks.Callback):
+    """Checkpoint every epoch that lies on the Pareto front of several metrics.
+
+    A new epoch is dropped if some kept checkpoint is at least as good on every
+    metric; kept checkpoints that the new epoch matches or beats on every metric
+    are deleted.
+
+    Args:
+        path: Directory that receives the checkpoints; created if missing.
+        fname_format: ``str.format`` template for the file name, filled from the
+            logs plus the ``epoch`` key.
+        metrics_names: Keys of the logs that span the front.
+        sides: One factor per metric, multiplied onto it before comparing so that
+            larger is better (presumably ``1`` to maximize and ``-1`` to minimize).
+        cond_fn: Epochs whose logs make this return ``False`` are skipped.
+    """
+
+    def __init__(
+        self,
+        path: str | Path,
+        fname_format: str,
+        metrics_names: list[str],
+        sides: list[int],
+        cond_fn: Callable[[dict], bool] = lambda x: True,
+    ):
+        super().__init__()
+        self.path = Path(path)
+        self.fname_format = fname_format
+        os.makedirs(path, exist_ok=True)
+        self.paths = []
+        self.metrics = []
+        self.metric_names = metrics_names
+        self.sides = np.array(sides)
+        self.cond_fn = cond_fn
+
+    def on_epoch_end(self, epoch, logs=None):
+        """Save this epoch's weights unless a kept checkpoint dominates it."""
+        assert isinstance(self.model, keras.models.Model)
+        assert isinstance(logs, dict)
+
+        logs = logs.copy()
+        logs['epoch'] = epoch
+
+        if not self.cond_fn(logs):
+            return
+        new_metrics = np.array([logs[metric_name] for metric_name in self.metric_names])
+        _new_metrics = self.sides * new_metrics
+        _rm_idx = []
+        for i, old_metrics in enumerate(self.metrics):
+            _old_metrics = self.sides * old_metrics
+            if np.all(_new_metrics <= _old_metrics):
+                return
+            if np.all(_new_metrics >= _old_metrics):
+                _rm_idx.append(i)
+        # Pop from the back so the remaining indices stay valid.
+        for i in reversed(_rm_idx):
+            self.metrics.pop(i)
+            try:
+                os.remove(self.paths.pop(i))
+            except FileNotFoundError:
+                pass  # already removed; nothing left to clean up
+
+        path = self.path / self.fname_format.format(**logs)
+        self.metrics.append(new_metrics)
+        self.paths.append(path)
+        self.model.save_weights(path)
+
+        with h5.File(path, 'r+') as f:
+            f.attrs['train_log'] = json.dumps(logs, cls=NumpyFloatValuesEncoder)
+
+    def rename_ckpts(self, dataset, bsz=65536):
+        """Add freshly traced BOPs to every kept checkpoint and rename it accordingly.
+
+        The model's current weights are restored afterwards, even if an error
+        interrupts the loop.
+
+        Args:
+            dataset: Data passed to ``trace_minmax``.
+            bsz: Batch size passed to ``trace_minmax``.
+        """
+        assert isinstance(self.model, keras.models.Model)
+
+        # Snapshot the live weights in memory; the loop below overwrites them.
+        weight_buf = BytesIO()
+        with h5.File(weight_buf, 'w') as f:
+            hdf5_format.save_weights_to_hdf5_group(f, self.model)
+        weight_buf.seek(0)
+
+        try:
+            for i, path in enumerate(tqdm(self.paths, desc='Renaming checkpoints')):
+                self.model.load_weights(path)
+                bops = trace_minmax(self.model, dataset, bsz=bsz, verbose=False)
+                with h5.File(path, 'r+') as f:
+                    logs = json.loads(f.attrs['train_log'])  # type: ignore
+                    logs['bops'] = bops
+                    f.attrs['train_log'] = json.dumps(logs, cls=NumpyFloatValuesEncoder)
+                metrics = np.array([logs[metric_name] for metric_name in self.metric_names])
+                self.metrics[i] = metrics
+                new_fname = self.path / self.fname_format.format(**logs)
+                os.rename(path, new_fname)
+                self.paths[i] = new_fname
+        finally:
+            with h5.File(weight_buf, 'r') as f:
+                hdf5_format.load_weights_from_hdf5_group_by_name(f, self.model)
+
+
+class BetaScheduler(keras.callbacks.Callback):
+    """Set the ``beta`` of every layer that has one at the start of each epoch.
+
+    Args:
+        beta_fn: Maps the epoch index to the beta value for that epoch.
+    """
+
+    def __init__(self, beta_fn: Callable[[int], float]):
+        super().__init__()
+        self.beta_fn = beta_fn
+
+    def on_epoch_begin(self, epoch, logs=None):
+        assert isinstance(self.model, keras.models.Model)
+
+        beta = self.beta_fn(epoch)
+        for layer in self.model.layers:
+            if hasattr(layer, 'beta'):
+                layer.beta.assign(keras.backend.constant(beta, dtype=keras.backend.floatx()))
+
+    def on_epoch_end(self, epoch, logs=None):
+        # Record the beta of this epoch in the logs.
+        assert isinstance(logs, dict)
+        logs['beta'] = self.beta_fn(epoch)
+
+    @classmethod
+    def from_config(cls, config):
+        """Build the scheduler from ``config.beta`` and ``config.train.epochs``."""
+        return cls(get_schedule(config.beta, config.train.epochs))
+
+
+def get_schedule(beta_conf, total_epochs):
+    """Build a piecewise beta schedule from ``beta_conf.intervals``.
+
+    Each interval provides ``epochs`` (its first epoch), ``betas`` (the values at
+    its start and end) and ``interpolation`` (``'linear'`` or ``'log'``). An
+    interval ends where the next one starts; the last ends at ``total_epochs``.
+
+    Args:
+        beta_conf: Object with an ``intervals`` sequence as described above.
+        total_epochs: Epoch at which the last interval ends.
+
+    Returns:
+        A function mapping an epoch to its beta as a ``float``. Epochs before the
+        first interval get its start value; epochs from ``total_epochs`` on get
+        the end value of the last interval.
+    """
+    epochs = []
+    betas = []
+    interpolations = []
+    for block in beta_conf.intervals:
+        epochs.append(block.epochs)
+        betas.append(block.betas)
+        interpolation = block.interpolation
+        assert interpolation in ['linear', 'log']
+        interpolations.append(interpolation == 'log')
+    epochs = np.array(epochs + [total_epochs])
+    assert np.all(np.diff(epochs) >= 0)
+    betas = np.array(betas)
+    interpolations = np.array(interpolations)
+
+    def schedule(epoch):
+        if epoch >= total_epochs:
+            return float(betas[-1, -1])
+        if epoch < epochs[0]:
+            # searchsorted would give -1 here and wrap around to the last interval.
+            return float(betas[0, 0])
+        idx = np.searchsorted(epochs, epoch, side='right') - 1
+        beta0, beta1 = betas[idx]
+        epoch0, epoch1 = epochs[idx], epochs[idx + 1]
+        if interpolations[idx]:
+            # 'log': geometric interpolation between the two end points.
+            beta = beta0 * (beta1 / beta0) ** ((epoch - epoch0) / (epoch1 - epoch0))
+        else:
+            beta = beta0 + (beta1 - beta0) * (epoch - epoch0) / (epoch1 - epoch0)
+        return float(beta)
+
+    return schedule
diff --git a/part4.1_HG_quantization.ipynb b/part4.1_HG_quantization.ipynb new file mode 100644 index 0000000..d5d3dc1 --- /dev/null +++ b/part4.1_HG_quantization.ipynb @@ -0,0 +1,504 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 4: HG Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import keras\n", + "from keras.utils import to_categorical\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "seed = 0\n", + "np.random.seed(seed)\n", + "import tensorflow as tf\n", + "\n", + "tf.random.set_seed(seed)\n", + "\n", + "os.environ['PATH'] = os.environ['XILINX_VIVADO'] + '/bin:' + os.environ['PATH']" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Fetch the jet tagging dataset from Open ML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If you haven't finished part 1 already, uncomment the following lines to download, process, and save the dataset\n", + "\n", + "# le = LabelEncoder()\n", + "# y = le.fit_transform(y)\n", + "# y = to_categorical(y, 5)\n", + "# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "# # print(y[:5])\n", + "# scaler = StandardScaler()\n", + "# X_train_val = scaler.fit_transform(X_train_val)\n", + "# X_test = scaler.transform(X_test)\n", + "# np.save('X_train_val.npy', X_train_val)\n", + "# np.save('X_test.npy', X_test)\n", + "# np.save('y_train_val.npy', y_train_val)\n", + "# np.save('y_test.npy', y_test)\n", + "# np.save('classes.npy', le.classes_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_val = np.load('X_train_val.npy')\n", + "X_test = np.load('X_test.npy')\n", + "y_train_val = np.load('y_train_val.npy')\n", + "y_test = np.load('y_test.npy')\n", + "classes = np.load('classes.npy', allow_pickle=True)\n", + "\n", + "# Convert everything to tf.Tensor to avoid casting\n", + "with tf.device('/cpu:0'): # type: ignore\n", + " _X_train_val = tf.convert_to_tensor(X_train_val, dtype=tf.float32)\n", + " # We don't make explicit y categorical tensor:\n", + " # Use SparseCategoricalCrossentropy loss instead.\n", + " _y_train_val = tf.convert_to_tensor(np.argmax(y_train_val, axis=1), dtype=tf.int32)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Construct a model\n", + "This time we're going to use HGQ layers.\n", + "\n", + "HGQ is \"High Granularity Quantization\" for heterogeneous quantization at arbitrary granularity, up to per-weight and per-activation level.\n", + "\n", + "https://github.com/calad0i/HGQ\n", 
+ "\n", + "Depending on the specific task, HGQ can achieve more than 10x resource savings comparing to QKeras. (For example, on this dataset and requiring an accuracy of around 0.72~0.74)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from keras.models import Sequential\n", + "from keras.optimizers import Adam\n", + "from keras.losses import SparseCategoricalCrossentropy\n", + "from HGQ.layers import HQuantize, HDense, HActivation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For any layer that needs to be quantized (i.e., layers that perform the actual computation), add a `H` in front of the layer name. For example, `HDense`, `HConv2D`, `HActivation`, etc.\n", + "\n", + "HGQ requires the input number to be quantized. To achieve it, you can simply add a `HQuantizer` layer at the beginning of the model. You may refer to https://calad0i.github.io/HGQ/ for full documentation.\n", + "\n", + "As all quantization bitwidths are learnt, you don't need to specify them. Instead, for each `H-` layer, you need to specify the `beta` parameter that controls the trade-off between accuracy and resource savings. The higher the `beta`, the more aggressive the quantization will be." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "beta = 3e-6\n", + "# The bigger the beta, the smaller the models is, at the cost of accuracy.\n", + "\n", + "model = Sequential(\n", + " [\n", + " HQuantize(beta=beta),\n", + " HDense(64, activation='relu', beta=beta),\n", + " HDense(32, activation='relu', beta=beta),\n", + " HDense(32, activation='relu', beta=beta),\n", + " HDense(5, beta=beta),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train sparse\n", + "\n", + "No need to do anything. Unstructured sparsity comes for free with HGQ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# This is a empty code cell, you don't need to put anything here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the model\n", + "We'll use the same settings as the model for part 1: Adam optimizer with categorical crossentropy loss.\n", + "\n", + "However, we can skip the softmax layer in the model by adding `from_logits=True` to the loss function. `Softmax` is expensive in hardware, so we want to avoid it if possible.\n", + "\n", + "For any HGQ model, it's essential to use `ResetMinMax` callback to reset the quantization ranges after each epoch. This is because the ranges are calculated based on the data seen so far, and we want to make sure they are recalculated after each epoch.\n", + "\n", + "It is recommended to use the `FreeBOPs` callback to monitor the number of (effective) bits operations in the model. This is a good proxy for ressource usage in FPGA (BOPs ~ 55*DSPs+LUTs) for **post place&route resource**. Notice that CSynth tends to overestimate at least by a factor of 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from HGQ import ResetMinMax, FreeBOPs\n", + "from keras.callbacks import LearningRateScheduler\n", + "from keras.experimental import CosineDecay\n", + "from nn_utils import PBarCallback\n", + "\n", + "_sched = CosineDecay(2e-2, 200)\n", + "sched = LearningRateScheduler(_sched)\n", + "pbar = PBarCallback(metric='loss: {loss:.3f}/{val_loss:.3f} - acc: {accuracy:.3f}/{val_accuracy:.3f}')\n", + "\n", + "callbacks = [ResetMinMax(), FreeBOPs(), pbar, sched]\n", + "\n", + "# ResetMinMax: necessary callback for all HGQ models\n", + "# FreeBOPs: recommended callback\n", + "# pbar: progress bar callback, useful when the number of epochs is high\n", + "# sched: learning rate scheduler. Cosine decay in this case." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notice\n", + "\n", + "- Due to the stochasticness of surrogate gradient on the individual bitwidth, it is recommended to train the model with a large batchsize over more epochs.\n", + "\n", + "- HGQ is jit-compiled for many parts. The first epoch will take longer to compile.\n", + "\n", + "- We train for 200 epochs here, which takes ~1min on a 3070-maxq GPU, similar to the time taken part 4.\n", + "\n", + "- Parameters used in this tutorial are not optimized for the best performance. Please refer to [HGQ-demos](https://github.com/calad0i/HGQ-demos) for more advanced examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train = True\n", + "if train:\n", + " opt = Adam(learning_rate=0)\n", + " loss = SparseCategoricalCrossentropy(from_logits=True)\n", + " model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])\n", + "\n", + " model.fit(\n", + " _X_train_val,\n", + " _y_train_val,\n", + " batch_size=16384,\n", + " epochs=200,\n", + " validation_split=0.25,\n", + " shuffle=True,\n", + " callbacks=callbacks,\n", + " verbose=0, # type: ignore\n", + " )\n", + " model.save('model_3.1/model.h5')\n", + "else:\n", + " from keras.models import load_model\n", + "\n", + " # No need to use custom_objects as the custom layers are already registered\n", + " model: keras.Model = load_model('model_3.1/model.h5') # type: ignore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare for conversion\n", + "\n", + "HGQ model cannot be converted to hls4ml model directly, and we need to convert it to a proxy model first. The proxy model also serves as a bit-accurate emulator of the hls4ml model that takes numerical overflow into account.\n", + "\n", + "To convert to a proxy model, we need to set appropriate ranges of the model internal variables. This is done by using the `trace_minmax` function. 
You can add a scalar factor `cover_range` to the ranges to make the model more robust to numerical overflow. `trace_minmax` also returns the exact (effective) BOPs of the model (the number provided during training is approximated).\n", + "\n", + "If you keep all parameters the same and everything goes correctly, total BOPs of the model should be around 6500. This means, after running place&route (or vsynth), the model should take around 6500 LUTs, i.e., DSPs*55+LUTs used should be around 6500." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from HGQ import trace_minmax, to_proxy_model\n", + "\n", + "trace_minmax(model, X_train_val)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check that the model is indeed sparse without explicit pruning or `l1` regularization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for layer in model.layers:\n", + " if layer._has_kernel:\n", + " k = layer.fused_qkernel.numpy()\n", + " print(f'{layer.name}: {np.mean(k==0):.2%} sparsity')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, convert the model to a proxy model using the `to_proxy_model` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "proxy = to_proxy_model(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import hls4ml\n", + "import plotting\n", + "\n", + "hls_model = hls4ml.converters.convert_from_keras_model(proxy, output_dir='model_3.1/hls4ml_prj', part='xcu250-figd2104-2L-e')\n", + "hls_model.compile()\n", + "\n", + "X_test = np.ascontiguousarray(X_test)\n", + "y_keras = model.predict(X_test, batch_size=16384, verbose=0)\n", + "y_proxy = proxy.predict(X_test, batch_size=16384, verbose=0)\n", + "y_hls = hls_model.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check bit-accuracy\n", + "If you are unlucky, `y_keras` and `y_hls` will not fully match due to numerical overflow (for a few entries). However, `y_keras` and `y_proxy` should match perfectly. (Sometime mismatch could also happen - only due to machine precision limit.\n", + "\n", + "For newer nvidia GPUs, TF32 is enabled by default (fp32 with reduced mantissa bits), which could cause this issue). This will make this issue more prevalent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.mean(y_keras == y_hls), np.mean(y_proxy == y_hls)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The plotting script assumes 0-1 range for the predictions.\n", + "y_keras_softmax = tf.nn.softmax(y_keras).numpy()\n", + "y_hls_softmax = tf.nn.softmax(y_hls).numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "from sklearn.metrics import accuracy_score\n", + "from keras.models import load_model\n", + "\n", + "model_ref = load_model('model_1/KERAS_check_best_model.h5')\n", + "y_ref = model_ref.predict(X_test, batch_size=1024, verbose=0)\n", + "\n", + "print(\"Accuracy baseline: {}\".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_ref, axis=1))))\n", + "print(\"Accuracy pruned, quantized: {}\".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_keras, axis=1))))\n", + "print(\"Accuracy hls4ml: {}\".format(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hls, axis=1))))\n", + "\n", + "fig, ax = plt.subplots(figsize=(9, 9))\n", + "_ = plotting.makeRoc(y_test, y_ref, classes)\n", + "plt.gca().set_prop_cycle(None) # reset the colors\n", + "_ = plotting.makeRoc(y_test, y_keras_softmax, classes, linestyle='--')\n", + "plt.gca().set_prop_cycle(None) # reset the colors\n", + "_ = plotting.makeRoc(y_test, y_hls_softmax, classes, linestyle=':')\n", + "\n", + "from matplotlib.lines import Line2D\n", + "\n", + "lines = [Line2D([0], [0], ls='-'), Line2D([0], [0], ls='--'), Line2D([0], [0], ls=':')]\n", + "from matplotlib.legend import Legend\n", + "\n", + "leg = Legend(ax, lines, labels=['baseline', 'pruned, quantized', 'hls4ml'], loc='lower right', frameon=False)\n", + "ax.add_artist(leg)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Synthesize\n", + "Now let's synthesize this quantized, pruned model.\n", + "\n", + "**The synthesis will take a while**\n", + "\n", + "While the C-Synthesis is running, we can monitor the progress looking at the log file by opening a terminal from the notebook home, and executing:\n", + "\n", + "`tail -f model_3.1/hls4ml_prj/vivado_hls.log`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notice\n", + "\n", + "- For `vivado_hls`, adding a inline recursive pragma can greatly reduce the latency of the model (up to 50% for HGQ models). You can comment this cell out for comparison.\n", + "- For `vitis`, pipelining and inlining cannot co-exist, this comment out this cell if you want to use `vitis`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('model_3.1/hls4ml_prj/firmware/myproject.cpp', 'r+') as f:\n", + " code = f.read()\n", + " f.seek(0)\n", + " code = code.replace('#pragma HLS PIPELINE', '#pragma HLS PIPELINE\\n #pragma HLS INLINE RECURSIVE', 1)\n", + " f.write(code)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hls_model.build(csim=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check the reports\n", + "Print out the reports generated by Vivado HLS. Pay attention to the Utilization Estimates' section in particular this time.\n", + "\n", + "## Notice\n", + "We strip away the softmax layer compare to part 4, which takes 3~5 cycles to compute. The overall latency could be comparable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hls4ml.report.read_vivado_report('model_3.1/hls4ml_prj')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the report for the model trained in part 4. 
You should notice that the resource usage is significantly lower than that of the model trained in part 4.\n", + "\n", + "**Note you need to have trained and synthesized the model from part 4**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hls4ml.report.read_vivado_report('model_3/hls4ml_prj')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NB\n", + "Note as well that the Vivado HLS `csynth` resource estimates tend to _overestimate_ on-chip resource usage. Running the subsequent stages of FPGA compilation reveals the more realistic resource usage. You can run the next step, 'logic synthesis', with `hls_model.build(synth=True, vsynth=True)`, but we skipped it in this tutorial in the interest of time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}