coreylowman · nkoppel · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023
diff --git a/src/optim/adam/mod.rs b/src/optim/adam/mod.rs
@@ -12,7 +12,10 @@ use crate::{
     tensor_ops::Device,
 };
 
-use super::{Optimizer, OptimizerUpdateError, UnusedTensors, WeightDecay};
+use super::{
+    optimizer::{deserialize_weight_decay, serialize_weight_decay},
+    Optimizer, OptimizerUpdateError, SerializeWithModel, UnusedTensors, WeightDecay,
+};
 
 /// Configuration of hyperparameters for [Adam].
 ///
@@ -163,6 +166,116 @@ impl<M: TensorCollection<E, D>, D: Device<E>, E: Dtype> Optimizer<M, D, E> for A
     }
 }
 
+/// Used internally to serialize/deserialize Adam optimizers: holds the config values, `t` and both moments.
+#[derive(Clone)]
+pub struct AdamSerializer<M> {
+    lr: f64, // copied from `AdamConfig::lr`
+    betas: [f64; 2], // visited as the two scalars named "beta0" and "beta1"
+    eps: f64, // copied from `AdamConfig::eps`
+    weight_decay: (u64, f64), // (tag, value) pair as produced by `serialize_weight_decay`
+    t: i32, // copied from the optimizer's `t` field
+    moment1: M, // the optimizer's `moment1`, converted to the structure of the model
+    moment2: M, // the optimizer's `moment2`, converted to the structure of the model
+}
+
+impl<M: TensorCollection<E, D>, E: Dtype, D: Device<E>> TensorCollection<E, D>
+    for AdamSerializer<M>
+{
+    type To<E2: Dtype, D2: Device<E2>> = AdamSerializer<M::To<E2, D2>>; // same wrapper around the converted model type
+
+    fn iter_tensors<V: ModuleVisitor<Self, E, D>>(
+        visitor: &mut V,
+    ) -> Result<Option<Self::To<V::E2, V::D2>>, V::Err> {
+        visitor.visit_fields( // 1st argument: the fields to visit; 2nd: closure building the struct from the visited values
+            (
+                ( // the six config scalars are grouped in their own nested tuple
+                    Self::scalar(
+                        "lr",
+                        |s| &s.lr,
+                        |s| &mut s.lr,
+                        ScalarOptions::from_default(1e-3),
+                    ),
+                    Self::scalar(
+                        "beta0",
+                        |s| &s.betas[0],
+                        |s| &mut s.betas[0],
+                        ScalarOptions::from_default(0.9),
+                    ),
+                    Self::scalar(
+                        "beta1",
+                        |s| &s.betas[1],
+                        |s| &mut s.betas[1],
+                        ScalarOptions::from_default(0.99), // NOTE(review): confirm this matches AdamConfig's default for betas[1] (Adam conventionally uses 0.999)
+                    ),
+                    Self::scalar(
+                        "eps",
+                        |s| &s.eps,
+                        |s| &mut s.eps,
+                        ScalarOptions::from_default(1e-8),
+                    ),
+                    Self::scalar(
+                        "wd_tag", // tag half of the (tag, value) weight decay pair
+                        |s| &s.weight_decay.0,
+                        |s| &mut s.weight_decay.0,
+                        ScalarOptions::from_default(0), // tag 0 is what `serialize_weight_decay` emits for `None`
+                    ),
+                    Self::scalar(
+                        "wd_val", // value half of the (tag, value) weight decay pair
+                        |s| &s.weight_decay.1,
+                        |s| &mut s.weight_decay.1,
+                        ScalarOptions::from_default(0.0),
+                    ),
+                ),
+                Self::scalar("t", |s| &s.t, |s| &mut s.t, ScalarOptions::from_default(0)),
+                Self::module("moment1", |s| &s.moment1, |s| &mut s.moment1), // moments are visited as nested collections of type `M`
+                Self::module("moment2", |s| &s.moment2, |s| &mut s.moment2),
+            ),
+            |((lr, beta0, beta1, eps, wd_tag, wd_val), t, moment1, moment2)| AdamSerializer { // pattern mirrors the tuple nesting above
+                lr,
+                betas: [beta0, beta1],
+                eps,
+                weight_decay: (wd_tag, wd_val),
+                t,
+                moment1,
+                moment2,
+            },
+        )
+    }
+}
+
+impl<M: TensorCollection<E, D, To<E, D> = M> + Clone, E: Dtype, D: Device<E>>
+    SerializeWithModel<M, E, D> for Adam<M, E, D>
+{
+    type Serializer = AdamSerializer<M>;
+
+    fn try_to_serializer(&self, model: &M) -> Result<AdamSerializer<M>, D::Err> { // copies the config, `t` and both moments into an `AdamSerializer`
+        Ok(AdamSerializer {
+            lr: self.cfg.lr,
+            betas: self.cfg.betas,
+            eps: self.cfg.eps,
+            weight_decay: serialize_weight_decay(self.cfg.weight_decay), // Option<WeightDecay> -> (tag, value)
+            t: self.t,
+            moment1: self.moment1.try_to_serializer(model)?, // errors from converting a moment propagate via `?`
+            moment2: self.moment2.try_to_serializer(model)?,
+        })
+    }
+
+    fn try_from_serializer(serializer: &AdamSerializer<M>, model: &M) -> Result<Self, D::Err> { // inverse of `try_to_serializer`
+        Ok(Adam {
+            cfg: AdamConfig {
+                lr: serializer.lr,
+                betas: serializer.betas,
+                eps: serializer.eps,
+                weight_decay: deserialize_weight_decay(serializer.weight_decay), // panics if the tag is not 0, 1 or 2
+            },
+            t: serializer.t,
+            moment1: Gradients::try_from_serializer(&serializer.moment1, model)?, // moments are rebuilt against the tensors of `model`
+            moment2: Gradients::try_from_serializer(&serializer.moment2, model)?,
+            marker: Default::default(),
+        })
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/src/optim/mod.rs b/src/optim/mod.rs
@@ -33,12 +33,14 @@
 mod adam;
 mod optimizer;
 mod rmsprop;
+mod serialize;
 mod sgd;
 
 pub use adam::{Adam, AdamConfig, AdamKernel};
 pub use optimizer::{Momentum, WeightDecay};
 pub use optimizer::{Optimizer, OptimizerUpdateError, UnusedTensors};
 pub use rmsprop::{RMSprop, RMSpropConfig, RMSpropKernel};
+pub use serialize::SerializeWithModel;
 pub use sgd::{Sgd, SgdConfig, SgdKernel};
 
 pub mod prelude {

diff --git a/src/optim/optimizer.rs b/src/optim/optimizer.rs
@@ -33,6 +33,23 @@ pub(super) fn weight_decay_to_cuda(wd: Option<WeightDecay>) -> (WeightDecayType,
     }
 }
 
+pub(super) fn serialize_weight_decay(wd: Option<WeightDecay>) -> (u64, f64) { // encodes `wd` as a (tag, value) pair
+    match wd {
+        None => (0, 0.0), // tag 0: no weight decay; 0.0 only fills the value slot
+        Some(WeightDecay::L2(x)) => (1, x), // tag 1: L2
+        Some(WeightDecay::Decoupled(x)) => (2, x), // tag 2: Decoupled
+    }
+}
+
+pub(super) fn deserialize_weight_decay(serialized: (u64, f64)) -> Option<WeightDecay> { // inverse of `serialize_weight_decay`
+    match serialized.0 {
+        0 => None, // the value half is ignored for tag 0
+        1 => Some(WeightDecay::L2(serialized.1)),
+        2 => Some(WeightDecay::Decoupled(serialized.1)),
+        _ => panic!("Improperly serialized Weight Decay!"), // any tag other than 0, 1 or 2 panics
+    }
+}
+
 /// Momentum used for [super::Sgd] and others
 #[derive(Debug, Clone, Copy)]
 pub enum Momentum {
@@ -54,14 +71,31 @@ pub(super) enum MomentumType {
 }
 
 #[cfg(feature = "cuda")]
-pub(super) fn momentum_to_cuda(wd: Option<Momentum>) -> (MomentumType, f64) {
-    match wd {
+pub(super) fn momentum_to_cuda(momentum: Option<Momentum>) -> (MomentumType, f64) {
+    match momentum {
         None => (MomentumType::None, Default::default()),
         Some(Momentum::Classic(x)) => (MomentumType::Classic, x),
         Some(Momentum::Nesterov(x)) => (MomentumType::Nesterov, x),
     }
 }
 
+pub(super) fn serialize_momentum(momentum: Option<Momentum>) -> (u64, f64) { // encodes `momentum` as a (tag, value) pair
+    match momentum {
+        None => (0, 0.0), // tag 0: no momentum; 0.0 only fills the value slot
+        Some(Momentum::Classic(x)) => (1, x), // tag 1: Classic
+        Some(Momentum::Nesterov(x)) => (2, x), // tag 2: Nesterov
+    }
+}
+
+pub(super) fn deserialize_momentum(serialized: (u64, f64)) -> Option<Momentum> { // inverse of `serialize_momentum`
+    match serialized.0 {
+        0 => None, // the value half is ignored for tag 0
+        1 => Some(Momentum::Classic(serialized.1)),
+        2 => Some(Momentum::Nesterov(serialized.1)),
+        _ => panic!("Improperly serialized Momentum!"), // any tag other than 0, 1 or 2 panics
+    }
+}
+
 /// All optimizers must implement the update function, which takes a `M`
 /// and updates all of its parameters.
 ///

diff --git a/src/optim/serialize.rs b/src/optim/serialize.rs
@@ -0,0 +1,188 @@
+#[cfg(any(feature = "safetensors", feature = "numpy"))]
+use std::path::Path;
+
+use crate::{
+    nn::tensor_collection::*,
+    shapes::{Dtype, Shape},
+    tensor::{Gradients, Tensor},
+    tensor_ops::Device,
+};
+
+#[cfg(feature = "numpy")]
+use crate::{
+    nn::{LoadFromNpz, SaveToNpz},
+    tensor::{numpy::NpzError, NumpyDtype},
+};
+
+#[cfg(feature = "numpy")]
+use zip::{result::ZipResult, ZipArchive, ZipWriter};
+
+#[cfg(feature = "safetensors")]
+use crate::{
+    nn::{LoadFromSafetensors, SaveToSafetensors},
+    tensor::safetensors::SafeDtype,
+};
+
+impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for &Gradients<E, D> { // read side: the visitor used by `Gradients::try_to_serializer`
+    type Viewer = ViewTensorRef;
+    type Err = D::Err;
+    type E2 = E;
+    type D2 = D;
+
+    fn visit<S: Shape>(
+        &mut self,
+        opts: TensorOptions<S, E, D>,
+        t: &Tensor<S, E, D>, // a tensor of the model being walked
+    ) -> Result<Option<Tensor<S, Self::E2, Self::D2>>, Self::Err> {
+        if opts.do_gradient_update {
+            Ok(Some(self.get(t))) // emit the tensor stored in the Gradients for `t`
+        } else {
+            Ok(Some(t.device.zeros_like(&t.shape))) // nothing is looked up for such tensors; presumably a zero tensor shaped like `t`
+        }
+    }
+}
+
+impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for Gradients<E, D> { // write side: the visitor used by `Gradients::try_from_serializer`
+    type Viewer = (ViewTensorRef, ViewTensorRef); // two collections are walked together: (data, model)
+    type Err = D::Err;
+    type E2 = E;
+    type D2 = D;
+
+    fn visit<S: Shape>(
+        &mut self,
+        opts: TensorOptions<S, E, D>,
+        (grad, t): (&Tensor<S, E, D>, &Tensor<S, E, D>), // `grad` comes from the serialized data, `t` from the model
+    ) -> Result<Option<Tensor<S, Self::E2, Self::D2>>, Self::Err> {
+        if opts.do_gradient_update {
+            self.get_or_alloc_mut(t)?.clone_from(&grad.data); // store `grad`'s data in the entry belonging to `t`
+        }
+        Ok(None) // no tensor is produced; the result accumulates in `self`
+    }
+}
+
+impl<M, E: Dtype, D: Device<E>> SerializeWithModel<M, E, D> for Gradients<E, D>
+where
+    M: TensorCollection<E, D, To<E, D> = M> + Clone,
+{
+    type Serializer = M; // gradients are serialized as a collection with the model's own type
+
+    fn try_to_serializer(&self, model: &M) -> Result<M, <D>::Err> {
+        let mut f = self; // `&Gradients` is itself the visitor, so a mutable binding of the reference is needed
+        let out = M::iter_tensors(&mut RecursiveWalker {
+            m: model,
+            f: &mut f,
+        })?;
+
+        Ok(out.unwrap()) // presumably always `Some`, since the `&Gradients` visitor returns `Some` for every tensor — TODO confirm
+    }
+
+    fn try_from_serializer(data: &M, model: &M) -> Result<Self, <D>::Err> {
+        let mut out = Gradients::leaky(); // filled in by the walk below
+        M::iter_tensors(&mut RecursiveWalker {
+            m: (data, model), // pairs each serialized tensor with the matching model tensor
+            f: &mut out,
+        })?;
+        Ok(out)
+    }
+}
+
+pub trait SerializeWithModel<M, E: Dtype, D: Device<E>>: Sized // converts `Self` to/from a serializable form, using a model for structure
+where
+    M: TensorCollection<E, D> + Clone,
+{
+    type Serializer: TensorCollection<E, D>; // intermediate form that the save/load methods below operate on
+
+    /// Fallible version of [SerializeWithModel::to_serializer]
+    fn try_to_serializer(&self, model: &M) -> Result<Self::Serializer, D::Err>;
+
+    /// Fallible version of [SerializeWithModel::from_serializer]
+    fn try_from_serializer(serializer: &Self::Serializer, model: &M) -> Result<Self, D::Err>;
+
+    /// Converts the data of `self` to the structure of `model` so that its data can be serialized.
+    ///
+    /// # Panics
+    /// This function may panic if `self` does not contain a tensor corresponding to a trainable
+    /// tensor in `model`.
+    fn to_serializer(&self, model: &M) -> Self::Serializer {
+        self.try_to_serializer(model).unwrap() // also panics if the fallible version returns an error
+    }
+
+    /// Creates an instance of `Self` from the tensors in `serializer`, where each tensor in the
+    /// result is associated with a corresponding tensor in `model`.
+    fn from_serializer(serializer: &Self::Serializer, model: &M) -> Self {
+        Self::try_from_serializer(serializer, model).unwrap() // panics if the fallible version returns an error
+    }
+
+    /// See [SaveToNpz::save].
+    #[cfg(feature = "numpy")]
+    fn save<P: AsRef<Path>>(&self, path: P, model: &M) -> ZipResult<()>
+    where
+        E: NumpyDtype,
+    {
+        self.to_serializer(model).save(path)
+    }
+
+    /// See [SaveToNpz::write].
+    #[cfg(feature = "numpy")]
+    fn write<W>(&self, w: &mut ZipWriter<W>, model: &M) -> ZipResult<()>
+    where
+        W: std::io::Write + std::io::Seek,
+        E: NumpyDtype,
+    {
+        self.to_serializer(model).write(w)
+    }
+
+    /// See [LoadFromNpz::load].
+    #[cfg(feature = "numpy")]
+    fn load<P: AsRef<Path>>(&mut self, path: P, model: &M) -> Result<(), NpzError>
+    where
+        E: NumpyDtype,
+    {
+        let mut serializer = self.to_serializer(model); // build a serializer from the current state, then load the file into it
+        serializer.load(path)?;
+        *self = Self::from_serializer(&serializer, model); // `self` is only replaced after a successful load
+        Ok(())
+    }
+
+    /// See [LoadFromNpz::read].
+    #[cfg(feature = "numpy")]
+    fn read<R>(&mut self, r: &mut ZipArchive<R>, model: &M) -> Result<(), NpzError>
+    where
+        R: std::io::Read + std::io::Seek,
+        E: NumpyDtype,
+    {
+        let mut serializer = self.to_serializer(model); // build a serializer from the current state, then read the archive into it
+        serializer.read(r)?;
+        *self = Self::from_serializer(&serializer, model); // `self` is only replaced after a successful read
+        Ok(())
+    }
+
+    /// See [SaveToSafetensors::save_safetensors].
+    #[cfg(feature = "safetensors")]
+    fn save_safetensors<P: AsRef<Path>>(
+        &self,
+        path: P,
+        model: &M,
+    ) -> Result<(), safetensors::SafeTensorError>
+    where
+        E: SafeDtype,
+    {
+        self.to_serializer(model).save_safetensors(path)
+    }
+
+    /// See [LoadFromSafetensors::load_safetensors].
+    #[cfg(feature = "safetensors")]
+    fn load_safetensors<P: AsRef<Path>>(
+        &mut self,
+        path: P,
+        model: &M,
+    ) -> Result<(), crate::tensor::safetensors::Error>
+    where
+        E: SafeDtype,
+    {
+        let mut serializer = self.to_serializer(model); // build a serializer from the current state, then load the file into it
+        serializer.load_safetensors(path)?;
+        *self = Self::from_serializer(&serializer, model); // `self` is only replaced after a successful load
+        Ok(())
+    }
+}