From d3ae5b2613ce0bb4d7f3d486c9c96d4431e385a9 Mon Sep 17 00:00:00 2001
From: Linmiao Xu
Date: Sat, 2 Dec 2023 17:50:32 -0800
Subject: [PATCH] Dual NNUE with L1-128 smallnet

Credit goes to @mstembera for:
- writing the code enabling dual NNUE:
  https://github.com/official-stockfish/Stockfish/pull/4898
- the idea of trying L1-128 trained exclusively on high simple eval positions

The L1-128 smallnet is:
- epoch 399 of a single-stage training from scratch
- trained only on positions from filtered data with high material difference
  - defined by abs(simple_eval) > 1000

```yaml
experiment-name: 128--S1-only-hse-v2
training-dataset:
- /data/hse/S3/dfrc99-16tb7p-eval-filt-v2.min.high-simple-eval-1k.binpack
- /data/hse/S3/leela96-filt-v2.min.high-simple-eval-1k.binpack
- /data/hse/S3/test80-apr2022-16tb7p.min.high-simple-eval-1k.binpack
- /data/hse/S7/test60-2020-2tb7p.v6-3072.high-simple-eval-1k.binpack
- /data/hse/S7/test60-novdec2021-12tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test77-nov2021-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test77-dec2021-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test77-jan2022-2tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test78-jantomay2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test78-juntosep2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test79-apr2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test79-may2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack

# T80 2022
- /data/hse/S7/test80-may2022-16tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jun2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jul2022-16tb7p.v6-dd.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-aug2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-sep2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-oct2022-16tb7p.v6-dd.high-simple-eval-1k.binpack
- /data/hse/S7/test80-nov2022-16tb7p-v6-dd.min.high-simple-eval-1k.binpack

# T80 2023
- /data/hse/S7/test80-jan2023-3of3-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-feb2023-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-mar2023-2tb7p.v6-sk16.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-apr2023-2tb7p-filter-v6-sk16.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-may2023-2tb7p.v6.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jun2023-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jul2023-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-aug2023-2tb7p.v6.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-sep2023-2tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test80-oct2023-2tb7p.high-simple-eval-1k.binpack

start-from-engine-test-net: False
nnue-pytorch-branch: linrock/nnue-pytorch/L1-128
engine-test-branch: linrock/Stockfish/L1-128-nolazy
engine-base-branch: linrock/Stockfish/L1-128
num-epochs: 500
lambda: 1.0
```

Experiment yaml configs converted to easy_train.sh commands with:
https://github.com/linrock/nnue-tools/blob/4339954/yaml_easy_train.py

Binpacks interleaved at training time with:
https://github.com/official-stockfish/nnue-pytorch/pull/259

Data filtered for high simple eval positions with:
https://github.com/linrock/nnue-data/blob/32d6a68/filter_high_simple_eval_plain.py
https://github.com/linrock/Stockfish/blob/61dbfe/src/tools/transform.cpp#L626-L655

Training data can be found at:
https://robotmoon.com/nnue-training-data/
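For illustration only, a minimal sketch of the "high simple eval" criterion described above: a position is kept when its raw material imbalance exceeds 1000 internal units. The struct and helper names are hypothetical and the piece values are assumed Stockfish-style values; the actual filters are the two scripts linked above.

```cpp
#include <array>
#include <cstdlib>

// Hypothetical material summary for one position (sketch only).
struct MaterialCount {
    std::array<int, 5> us{};    // P, N, B, R, Q counts for the side to move
    std::array<int, 5> them{};  // same counts for the opponent
};

// Material-only evaluation from the side to move's point of view,
// using assumed Stockfish-style piece values.
int simple_eval(const MaterialCount& m) {
    constexpr std::array<int, 5> value = {208, 781, 825, 1276, 2538};
    int v = 0;
    for (int pt = 0; pt < 5; ++pt)
        v += value[pt] * (m.us[pt] - m.them[pt]);
    return v;
}

// A position qualifies for the smallnet training data when the
// material difference alone is already large.
bool high_simple_eval(const MaterialCount& m) {
    return std::abs(simple_eval(m)) > 1000;
}
```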
Local elo at 25k nodes per move of L1-128 smallnet (nnue-only eval)
vs. L1-128 trained on standard S1 data:
nn-epoch399.nnue : -318.1 +/- 2.1

Passed STC:
https://tests.stockfishchess.org/tests/view/6574cb9d95ea6ba1fcd49e3b
LLR: 2.93 (-2.94,2.94) <0.00,2.00>
Total: 62432 W: 15875 L: 15521 D: 31036
Ptnml(0-2): 177, 7331, 15872, 7633, 203

Passed LTC:
https://tests.stockfishchess.org/tests/view/6575da2d4d789acf40aaac6e
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 64830 W: 16118 L: 15738 D: 32974
Ptnml(0-2): 43, 7129, 17697, 7497, 49

Bench: 1438336

Co-Authored-By: mstembera <5421953+mstembera@users.noreply.github.com>
---
 src/Makefile                        |  32 +++---
 src/evaluate.cpp                    | 159 +++++++++++++++-------------
 src/evaluate.h                      |   5 +-
 src/nnue/evaluate_nnue.cpp          | 139 ++++++++++++++++--------
 src/nnue/evaluate_nnue.h            |  19 ++--
 src/nnue/nnue_accumulator.h         |   3 +-
 src/nnue/nnue_architecture.h        |  38 ++++---
 src/nnue/nnue_feature_transformer.h |  65 ++++++------
 src/position.cpp                    |  18 ++--
 src/position.h                      |   6 +-
 src/uci.cpp                         |   3 +-
 src/ucioption.cpp                   |   4 +-
 12 files changed, 293 insertions(+), 198 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 660b41e7edb..e6de514e568 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -806,7 +806,7 @@ help:
 	@echo "help > Display architecture details"
 	@echo "profile-build > standard build with profile-guided optimization"
 	@echo "build > skip profile-guided optimization"
-	@echo "net > Download the default nnue net"
+	@echo "net > Download the default nnue nets"
 	@echo "strip > Strip executable"
 	@echo "install > Install executable"
 	@echo "clean > Clean up"
@@ -922,16 +922,7 @@ profileclean:
 	@rm -f stockfish.res
 	@rm -f ./-lstdc++.res
 
-# set up shell variables for the net stuff
-netvariables:
-	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
-	$(eval nnuedownloadurl1 := https://tests.stockfishchess.org/api/nn/$(nnuenet))
-	$(eval nnuedownloadurl2 := https://github.com/official-stockfish/networks/raw/master/$(nnuenet))
-	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
-	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
-
-# evaluation network (nnue)
-net: netvariables
+define fetch_network
 	@echo "Default net: $(nnuenet)"
 	@if [ "x$(curl_or_wget)" = "x" ]; then \
 		echo "Neither curl nor wget is installed.
Install one of these tools unless the net has been downloaded manually"; \ @@ -966,7 +957,24 @@ net: netvariables if [ "$(nnuenet)" = "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ echo "Network validated"; break; \ fi; \ - fi; \ + fi; +endef + +# set up shell variables for the net stuff +define netvariables +$(eval nnuenet := $(shell grep $(1) evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/')) +$(eval nnuedownloadurl1 := https://tests.stockfishchess.org/api/nn/$(nnuenet)) +$(eval nnuedownloadurl2 := https://github.com/official-stockfish/networks/raw/master/$(nnuenet)) +$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) +$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) +endef + +# evaluation network (nnue) +net: + $(call netvariables, EvalFileDefaultNameBig) + $(call fetch_network) + $(call netvariables, EvalFileDefaultNameSmall) + $(call fetch_network) format: $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file diff --git a/src/evaluate.cpp b/src/evaluate.cpp index bda7132a1ff..cca808df242 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include "incbin/incbin.h" #include "misc.h" #include "nnue/evaluate_nnue.h" +#include "nnue/nnue_architecture.h" #include "position.h" #include "thread.h" #include "types.h" @@ -44,11 +46,15 @@ // const unsigned int gEmbeddedNNUESize; // the size of the embedded file // Note that this does not work in Microsoft Visual Studio. #if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF) -INCBIN(EmbeddedNNUE, EvalFileDefaultName); +INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig); +INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall); #else -const unsigned char gEmbeddedNNUEData[1] = {0x0}; -const unsigned char* const gEmbeddedNNUEEnd = &gEmbeddedNNUEData[1]; -const unsigned int gEmbeddedNNUESize = 1; +const unsigned char gEmbeddedNNUEBigData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUEBigEnd = &gEmbeddedNNUEBigData[1]; +const unsigned int gEmbeddedNNUEBigSize = 1; +const unsigned char gEmbeddedNNUESmallData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUESmallEnd = &gEmbeddedNNUESmallData[1]; +const unsigned int gEmbeddedNNUESmallSize = 1; #endif @@ -56,7 +62,9 @@ namespace Stockfish { namespace Eval { -std::string currentEvalFileName = "None"; +std::string currentEvalFileName[2] = {"None", "None"}; +const std::string EvFiles[2] = {"EvalFile", "EvalFileSmall"}; +const std::string EvFileNames[2] = {EvalFileDefaultNameBig, EvalFileDefaultNameSmall}; // Tries to load a NNUE network at startup time, or when the engine // receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue" @@ -67,84 +75,92 @@ std::string currentEvalFileName = "None"; // variable to have the engine search in a special directory in their distro. 
void NNUE::init() { - std::string eval_file = std::string(Options["EvalFile"]); - if (eval_file.empty()) - eval_file = EvalFileDefaultName; + for (NetSize netSize : {Big, Small}) + { + std::string eval_file = std::string(Options[EvFiles[netSize]]); + if (eval_file.empty()) + eval_file = EvFileNames[netSize]; #if defined(DEFAULT_NNUE_DIRECTORY) - std::vector dirs = {"", "", CommandLine::binaryDirectory, - stringify(DEFAULT_NNUE_DIRECTORY)}; + std::vector dirs = {"", "", CommandLine::binaryDirectory, + stringify(DEFAULT_NNUE_DIRECTORY)}; #else - std::vector dirs = {"", "", CommandLine::binaryDirectory}; + std::vector dirs = {"", "", CommandLine::binaryDirectory}; #endif - for (const std::string& directory : dirs) - if (currentEvalFileName != eval_file) + for (const std::string& directory : dirs) { - if (directory != "") - { - std::ifstream stream(directory + eval_file, std::ios::binary); - if (NNUE::load_eval(eval_file, stream)) - currentEvalFileName = eval_file; - } - - if (directory == "" && eval_file == EvalFileDefaultName) + if (currentEvalFileName[netSize] != eval_file) { - // C++ way to prepare a buffer for a memory stream - class MemoryBuffer: public std::basic_streambuf { - public: - MemoryBuffer(char* p, size_t n) { - setg(p, p, p + n); - setp(p, p + n); - } - }; - - MemoryBuffer buffer( - const_cast(reinterpret_cast(gEmbeddedNNUEData)), - size_t(gEmbeddedNNUESize)); - (void) gEmbeddedNNUEEnd; // Silence warning on unused variable - - std::istream stream(&buffer); - if (NNUE::load_eval(eval_file, stream)) - currentEvalFileName = eval_file; + if (directory != "") + { + std::ifstream stream(directory + eval_file, std::ios::binary); + if (NNUE::load_eval(eval_file, stream, netSize)) + currentEvalFileName[netSize] = eval_file; + } + + if (directory == "" && eval_file == EvFileNames[netSize]) + { + // C++ way to prepare a buffer for a memory stream + class MemoryBuffer: public std::basic_streambuf { + public: + MemoryBuffer(char* p, size_t n) { + setg(p, p, p + n); + setp(p, p + n); + } + }; + + MemoryBuffer buffer( + const_cast(reinterpret_cast( + netSize == Small ? gEmbeddedNNUESmallData : gEmbeddedNNUEBigData)), + size_t(netSize == Small ? 
gEmbeddedNNUESmallSize : gEmbeddedNNUEBigSize)); + (void) gEmbeddedNNUEBigEnd; // Silence warning on unused variable + (void) gEmbeddedNNUESmallEnd; + + std::istream stream(&buffer); + if (NNUE::load_eval(eval_file, stream, netSize)) + currentEvalFileName[netSize] = eval_file; + } } } + } } // Verifies that the last net used was loaded successfully void NNUE::verify() { - std::string eval_file = std::string(Options["EvalFile"]); - if (eval_file.empty()) - eval_file = EvalFileDefaultName; - - if (currentEvalFileName != eval_file) + for (NetSize netSize : {Big, Small}) { + std::string eval_file = std::string(Options[EvFiles[netSize]]); + if (eval_file.empty()) + eval_file = EvFileNames[netSize]; - std::string msg1 = - "Network evaluation parameters compatible with the engine must be available."; - std::string msg2 = "The network file " + eval_file + " was not loaded successfully."; - std::string msg3 = "The UCI option EvalFile might need to specify the full path, " - "including the directory name, to the network file."; - std::string msg4 = "The default net can be downloaded from: " - "https://tests.stockfishchess.org/api/nn/" - + std::string(EvalFileDefaultName); - std::string msg5 = "The engine will be terminated now."; - - sync_cout << "info string ERROR: " << msg1 << sync_endl; - sync_cout << "info string ERROR: " << msg2 << sync_endl; - sync_cout << "info string ERROR: " << msg3 << sync_endl; - sync_cout << "info string ERROR: " << msg4 << sync_endl; - sync_cout << "info string ERROR: " << msg5 << sync_endl; - - exit(EXIT_FAILURE); - } + if (currentEvalFileName[netSize] != eval_file) + { + std::string msg1 = + "Network evaluation parameters compatible with the engine must be available."; + std::string msg2 = "The network file " + eval_file + " was not loaded successfully."; + std::string msg3 = "The UCI option EvalFile might need to specify the full path, " + "including the directory name, to the network file."; + std::string msg4 = "The default net can be downloaded from: " + "https://tests.stockfishchess.org/api/nn/" + + std::string(EvFileNames[netSize]); + std::string msg5 = "The engine will be terminated now."; + + sync_cout << "info string ERROR: " << msg1 << sync_endl; + sync_cout << "info string ERROR: " << msg2 << sync_endl; + sync_cout << "info string ERROR: " << msg3 << sync_endl; + sync_cout << "info string ERROR: " << msg4 << sync_endl; + sync_cout << "info string ERROR: " << msg5 << sync_endl; + + exit(EXIT_FAILURE); + } - sync_cout << "info string NNUE evaluation using " << eval_file << sync_endl; + sync_cout << "info string NNUE evaluation using " << eval_file << sync_endl; + } } } - // Returns a static, purely materialistic evaluation of the position from // the point of view of the given color. It can be divided by PawnValue to get // an approximation of the material advantage on the board in terms of pawns. 
@@ -163,18 +179,19 @@ Value Eval::evaluate(const Position& pos) { int v; Color stm = pos.side_to_move(); int shuffling = pos.rule50_count(); - int simpleEval = simple_eval(pos, stm) + (int(pos.key() & 7) - 3); - - bool lazy = std::abs(simpleEval) >= RookValue + KnightValue + 16 * shuffling * shuffling - + std::abs(pos.this_thread()->bestValue) - + std::abs(pos.this_thread()->rootSimpleEval); + int simpleEval = simple_eval(pos, stm); + bool lazy = std::abs(simpleEval) > 2300; if (lazy) v = simpleEval; else { - int nnueComplexity; - Value nnue = NNUE::evaluate(pos, true, &nnueComplexity); + bool smallNet = std::abs(simpleEval) > 1100; + + int nnueComplexity; + + Value nnue = smallNet ? NNUE::evaluate(pos, true, &nnueComplexity) + : NNUE::evaluate(pos, true, &nnueComplexity); int optimism = pos.this_thread()->optimism[stm]; @@ -217,7 +234,7 @@ std::string Eval::trace(Position& pos) { ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); Value v; - v = NNUE::evaluate(pos, false); + v = NNUE::evaluate(pos, false); v = pos.side_to_move() == WHITE ? v : -v; ss << "NNUE evaluation " << 0.01 * UCI::to_cp(v) << " (white side)\n"; diff --git a/src/evaluate.h b/src/evaluate.h index 0a7ec61a3cf..3ead6b763dc 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -34,12 +34,13 @@ std::string trace(Position& pos); int simple_eval(const Position& pos, Color c); Value evaluate(const Position& pos); -extern std::string currentEvalFileName; +extern std::string currentEvalFileName[2]; // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. -#define EvalFileDefaultName "nn-b1e55edbea57.nnue" +#define EvalFileDefaultNameBig "nn-b1e55edbea57.nnue" +#define EvalFileDefaultNameSmall "nn-c01dc0ffeede.nnue" namespace NNUE { diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 14e2fec15a4..004e28dfb41 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -40,14 +40,18 @@ namespace Stockfish::Eval::NNUE { // Input feature converter -LargePagePtr featureTransformer; +LargePagePtr> + featureTransformerBig; +LargePagePtr> + featureTransformerSmall; // Evaluation function -AlignedPtr network[LayerStacks]; +AlignedPtr> networkBig[LayerStacks]; +AlignedPtr> networkSmall[LayerStacks]; -// Evaluation function file name -std::string fileName; -std::string netDescription; +// Evaluation function file names +std::string fileName[2]; +std::string netDescription[2]; namespace Detail { @@ -91,11 +95,20 @@ bool write_parameters(std::ostream& stream, const T& reference) { // Initialize the evaluation function parameters -static void initialize() { +static void initialize(NetSize netSize) { - Detail::initialize(featureTransformer); - for (std::size_t i = 0; i < LayerStacks; ++i) - Detail::initialize(network[i]); + if (netSize == Small) + { + Detail::initialize(featureTransformerSmall); + for (std::size_t i = 0; i < LayerStacks; ++i) + Detail::initialize(networkSmall[i]); + } + else + { + Detail::initialize(featureTransformerBig); + for (std::size_t i = 0; i < LayerStacks; ++i) + Detail::initialize(networkBig[i]); + } } // Read network header @@ -122,39 +135,57 @@ static bool write_header(std::ostream& stream, std::uint32_t hashValue, const st } // Read network parameters -static bool read_parameters(std::istream& stream) { +static bool read_parameters(std::istream& stream, NetSize netSize) { 
std::uint32_t hashValue; - if (!read_header(stream, &hashValue, &netDescription)) + if (!read_header(stream, &hashValue, &netDescription[netSize])) + return false; + if (hashValue != HashValue[netSize]) return false; - if (hashValue != HashValue) + if (netSize == Big && !Detail::read_parameters(stream, *featureTransformerBig)) return false; - if (!Detail::read_parameters(stream, *featureTransformer)) + if (netSize == Small && !Detail::read_parameters(stream, *featureTransformerSmall)) return false; for (std::size_t i = 0; i < LayerStacks; ++i) - if (!Detail::read_parameters(stream, *(network[i]))) + { + if (netSize == Big && !Detail::read_parameters(stream, *(networkBig[i]))) return false; + if (netSize == Small && !Detail::read_parameters(stream, *(networkSmall[i]))) + return false; + } return stream && stream.peek() == std::ios::traits_type::eof(); } // Write network parameters -static bool write_parameters(std::ostream& stream) { +static bool write_parameters(std::ostream& stream, NetSize netSize) { - if (!write_header(stream, HashValue, netDescription)) + if (!write_header(stream, HashValue[netSize], netDescription[netSize])) return false; - if (!Detail::write_parameters(stream, *featureTransformer)) + if (netSize == Big && !Detail::write_parameters(stream, *featureTransformerBig)) + return false; + if (netSize == Small && !Detail::write_parameters(stream, *featureTransformerSmall)) return false; for (std::size_t i = 0; i < LayerStacks; ++i) - if (!Detail::write_parameters(stream, *(network[i]))) + { + if (netSize == Big && !Detail::write_parameters(stream, *(networkBig[i]))) + return false; + if (netSize == Small && !Detail::write_parameters(stream, *(networkSmall[i]))) return false; + } return bool(stream); } void hint_common_parent_position(const Position& pos) { - featureTransformer->hint_common_access(pos); + + int simpleEval = simple_eval(pos, pos.side_to_move()); + if (abs(simpleEval) > 1100) + featureTransformerSmall->hint_common_access(pos); + else + featureTransformerBig->hint_common_access(pos); } // Evaluation function. Perform differential calculation. +template Value evaluate(const Position& pos, bool adjusted, int* complexity) { // We manually align the arrays on the stack because with gcc < 9.3 @@ -165,19 +196,28 @@ Value evaluate(const Position& pos, bool adjusted, int* complexity) { #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType - transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + transformedFeaturesUnaligned[FeatureTransformer < Small ? TransformedFeatureDimensionsSmall + : TransformedFeatureDimensionsBig, + nullptr + > ::BufferSize + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer < Net_Size == Small ? TransformedFeatureDimensionsSmall + : TransformedFeatureDimensionsBig, + nullptr > ::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); const int bucket = (pos.count() - 1) / 4; - const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto psqt = Net_Size == Small + ? 
featureTransformerSmall->transform(pos, transformedFeatures, bucket) + : featureTransformerBig->transform(pos, transformedFeatures, bucket); + const auto positional = Net_Size == Small ? networkSmall[bucket]->propagate(transformedFeatures) + : networkBig[bucket]->propagate(transformedFeatures); if (complexity) *complexity = std::abs(psqt - positional) / OutputScale; @@ -190,6 +230,9 @@ Value evaluate(const Position& pos, bool adjusted, int* complexity) { return static_cast((psqt + positional) / OutputScale); } +template Value evaluate(const Position& pos, bool adjusted, int* complexity); +template Value evaluate(const Position& pos, bool adjusted, int* complexity); + struct NnueEvalTrace { static_assert(LayerStacks == PSQTBuckets); @@ -205,13 +248,14 @@ static NnueEvalTrace trace_evaluate(const Position& pos) { constexpr uint64_t alignment = CacheLineSize; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - TransformedFeatureType - transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + TransformedFeatureType transformedFeaturesUnaligned + [FeatureTransformer::BufferSize + + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); @@ -220,8 +264,8 @@ static NnueEvalTrace trace_evaluate(const Position& pos) { t.correctBucket = (pos.count() - 1) / 4; for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) { - const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto materialist = featureTransformerBig->transform(pos, transformedFeatures, bucket); + const auto positional = networkBig[bucket]->propagate(transformedFeatures); t.psqt[bucket] = static_cast(materialist / OutputScale); t.positional[bucket] = static_cast(positional / OutputScale); @@ -310,7 +354,7 @@ std::string trace(Position& pos) { // We estimate the value of each piece by doing a differential evaluation from // the current base eval, simulating the removal of the piece from its square. - Value base = evaluate(pos); + Value base = evaluate(pos); base = pos.side_to_move() == WHITE ? base : -base; for (File f = FILE_A; f <= FILE_H; ++f) @@ -325,16 +369,16 @@ std::string trace(Position& pos) { auto st = pos.state(); pos.remove_piece(sq); - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->accumulatorBig.computed[WHITE] = false; + st->accumulatorBig.computed[BLACK] = false; - Value eval = evaluate(pos); + Value eval = evaluate(pos); eval = pos.side_to_move() == WHITE ? 
eval : -eval; v = base - eval; pos.put_piece(pc, sq); - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->accumulatorBig.computed[WHITE] = false; + st->accumulatorBig.computed[BLACK] = false; } writeSquare(f, r, pc, v); @@ -379,24 +423,24 @@ std::string trace(Position& pos) { // Load eval, from a file stream or a memory stream -bool load_eval(std::string name, std::istream& stream) { +bool load_eval(const std::string name, std::istream& stream, NetSize netSize) { - initialize(); - fileName = name; - return read_parameters(stream); + initialize(netSize); + fileName[netSize] = name; + return read_parameters(stream, netSize); } // Save eval, to a file stream or a memory stream -bool save_eval(std::ostream& stream) { +bool save_eval(std::ostream& stream, NetSize netSize) { - if (fileName.empty()) + if (fileName[netSize].empty()) return false; - return write_parameters(stream); + return write_parameters(stream, netSize); } // Save eval, to a file given by its name -bool save_eval(const std::optional& filename) { +bool save_eval(const std::optional& filename, NetSize netSize) { std::string actualFilename; std::string msg; @@ -405,7 +449,8 @@ bool save_eval(const std::optional& filename) { actualFilename = filename.value(); else { - if (currentEvalFileName != EvalFileDefaultName) + if (currentEvalFileName[netSize] + != (netSize == Small ? EvalFileDefaultNameSmall : EvalFileDefaultNameBig)) { msg = "Failed to export a net. " "A non-embedded net can only be saved if the filename is specified"; @@ -413,11 +458,11 @@ bool save_eval(const std::optional& filename) { sync_cout << msg << sync_endl; return false; } - actualFilename = EvalFileDefaultName; + actualFilename = (netSize == Small ? EvalFileDefaultNameSmall : EvalFileDefaultNameBig); } std::ofstream stream(actualFilename, std::ios_base::binary); - bool saved = save_eval(stream); + bool saved = save_eval(stream, netSize); msg = saved ? 
"Network saved successfully to " + actualFilename : "Failed to export a net"; diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h index f80aa398bba..fabfb5693f9 100644 --- a/src/nnue/evaluate_nnue.h +++ b/src/nnue/evaluate_nnue.h @@ -39,9 +39,11 @@ class Position; namespace Stockfish::Eval::NNUE { // Hash value of evaluation function structure -constexpr std::uint32_t HashValue = - FeatureTransformer::get_hash_value() ^ Network::get_hash_value(); - +constexpr std::uint32_t HashValue[2] = { + FeatureTransformer::get_hash_value() + ^ Network::get_hash_value(), + FeatureTransformer::get_hash_value() + ^ Network::get_hash_value()}; // Deleter for automating release of memory area template @@ -67,12 +69,13 @@ template using LargePagePtr = std::unique_ptr>; std::string trace(Position& pos); -Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr); -void hint_common_parent_position(const Position& pos); +template +Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr); +void hint_common_parent_position(const Position& pos); -bool load_eval(std::string name, std::istream& stream); -bool save_eval(std::ostream& stream); -bool save_eval(const std::optional& filename); +bool load_eval(const std::string name, std::istream& stream, NetSize netSize); +bool save_eval(std::ostream& stream, NetSize netSize); +bool save_eval(const std::optional& filename, NetSize netSize); } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index f6d705243ef..0b05d00da28 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -29,8 +29,9 @@ namespace Stockfish::Eval::NNUE { // Class that holds the result of affine transformation of input features +template struct alignas(CacheLineSize) Accumulator { - std::int16_t accumulation[2][TransformedFeatureDimensions]; + std::int16_t accumulation[2][Size]; std::int32_t psqtAccumulation[2][PSQTBuckets]; bool computed[2]; }; diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 92445704296..949f2d8687f 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -37,14 +37,28 @@ namespace Stockfish::Eval::NNUE { // Input features used in evaluation function using FeatureSet = Features::HalfKAv2_hm; +enum NetSize { + Big, + Small +}; + // Number of input feature dimensions after conversion -constexpr IndexType TransformedFeatureDimensions = 2560; -constexpr IndexType PSQTBuckets = 8; -constexpr IndexType LayerStacks = 8; +constexpr IndexType TransformedFeatureDimensionsBig = 2560; +constexpr int L2Big = 15; +constexpr int L3Big = 32; + +constexpr IndexType TransformedFeatureDimensionsSmall = 128; +constexpr int L2Small = 15; +constexpr int L3Small = 32; + +constexpr IndexType PSQTBuckets = 8; +constexpr IndexType LayerStacks = 8; +template struct Network { - static constexpr int FC_0_OUTPUTS = 15; - static constexpr int FC_1_OUTPUTS = 32; + static constexpr IndexType TransformedFeatureDimensions = L1; + static constexpr int FC_0_OUTPUTS = L2; + static constexpr int FC_1_OUTPUTS = L3; Layers::AffineTransformSparseInput fc_0; Layers::SqrClippedReLU ac_sqr_0; @@ -84,13 +98,13 @@ struct Network { std::int32_t propagate(const TransformedFeatureType* transformedFeatures) { struct alignas(CacheLineSize) Buffer { - alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out; - alignas(CacheLineSize) decltype(ac_sqr_0)::OutputType + alignas(CacheLineSize) typename decltype(fc_0)::OutputBuffer 
fc_0_out; + alignas(CacheLineSize) typename decltype(ac_sqr_0)::OutputType ac_sqr_0_out[ceil_to_multiple(FC_0_OUTPUTS * 2, 32)]; - alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out; - alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out; - alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out; - alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out; + alignas(CacheLineSize) typename decltype(ac_0)::OutputBuffer ac_0_out; + alignas(CacheLineSize) typename decltype(fc_1)::OutputBuffer fc_1_out; + alignas(CacheLineSize) typename decltype(ac_1)::OutputBuffer ac_1_out; + alignas(CacheLineSize) typename decltype(fc_2)::OutputBuffer fc_2_out; Buffer() { std::memset(this, 0, sizeof(*this)); } }; @@ -108,7 +122,7 @@ struct Network { ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_sqr_0_out); ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out); std::memcpy(buffer.ac_sqr_0_out + FC_0_OUTPUTS, buffer.ac_0_out, - FC_0_OUTPUTS * sizeof(decltype(ac_0)::OutputType)); + FC_0_OUTPUTS * sizeof(typename decltype(ac_0)::OutputType)); fc_1.propagate(buffer.ac_sqr_0_out, buffer.fc_1_out); ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out); fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out); diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2008cf25f1d..9a162ac9853 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -186,11 +186,6 @@ static constexpr int BestRegisterCount() { return 1; } - -static constexpr int NumRegs = - BestRegisterCount(); -static constexpr int NumPsqtRegs = - BestRegisterCount(); #if defined(__GNUC__) #pragma GCC diagnostic pop #endif @@ -198,6 +193,8 @@ static constexpr int NumPsqtRegs = // Input feature converter +template StateInfo::*accPtr> class FeatureTransformer { private: @@ -205,6 +202,11 @@ class FeatureTransformer { static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; #ifdef VECTOR + static constexpr int NumRegs = + BestRegisterCount(); + static constexpr int NumPsqtRegs = + BestRegisterCount(); + static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); @@ -253,8 +255,8 @@ class FeatureTransformer { update_accumulator(pos); const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; - const auto& accumulation = pos.state()->accumulator.accumulation; - const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation; const auto psqt = (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]) @@ -323,7 +325,7 @@ class FeatureTransformer { // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; int gain = FeatureSet::refresh_cost(pos); - while (st->previous && !st->accumulator.computed[Perspective]) + while (st->previous && !(st->*accPtr).computed[Perspective]) { // This governs when a full feature refresh is needed and how many // updates are better than just one full refresh. @@ -381,7 +383,7 @@ class FeatureTransformer { for (; i >= 0; --i) { - states_to_update[i]->accumulator.computed[Perspective] = true; + (states_to_update[i]->*accPtr).computed[Perspective] = true; const StateInfo* end_state = i == 0 ? 
computed_st : states_to_update[i - 1]; @@ -402,9 +404,9 @@ class FeatureTransformer { assert(states_to_update[0]); auto accIn = - reinterpret_cast(&st->accumulator.accumulation[Perspective][0]); + reinterpret_cast(&(st->*accPtr).accumulation[Perspective][0]); auto accOut = reinterpret_cast( - &states_to_update[0]->accumulator.accumulation[Perspective][0]); + &(states_to_update[0]->*accPtr).accumulation[Perspective][0]); const IndexType offsetR0 = HalfDimensions * removed[0][0]; auto columnR0 = reinterpret_cast(&weights[offsetR0]); @@ -428,10 +430,10 @@ class FeatureTransformer { vec_add_16(columnR0[k], columnR1[k])); } - auto accPsqtIn = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][0]); + auto accPsqtIn = + reinterpret_cast(&(st->*accPtr).psqtAccumulation[Perspective][0]); auto accPsqtOut = reinterpret_cast( - &states_to_update[0]->accumulator.psqtAccumulation[Perspective][0]); + &(states_to_update[0]->*accPtr).psqtAccumulation[Perspective][0]); const IndexType offsetPsqtR0 = PSQTBuckets * removed[0][0]; auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); @@ -463,7 +465,7 @@ class FeatureTransformer { { // Load accumulator auto accTileIn = reinterpret_cast( - &st->accumulator.accumulation[Perspective][j * TileHeight]); + &(st->*accPtr).accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_load(&accTileIn[k]); @@ -489,7 +491,7 @@ class FeatureTransformer { // Store accumulator auto accTileOut = reinterpret_cast( - &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); + &(states_to_update[i]->*accPtr).accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) vec_store(&accTileOut[k], acc[k]); } @@ -499,7 +501,7 @@ class FeatureTransformer { { // Load accumulator auto accTilePsqtIn = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + &(st->*accPtr).psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_load_psqt(&accTilePsqtIn[k]); @@ -525,8 +527,8 @@ class FeatureTransformer { // Store accumulator auto accTilePsqtOut = reinterpret_cast( - &states_to_update[i] - ->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + &(states_to_update[i]->*accPtr) + .psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&accTilePsqtOut[k], psqt[k]); } @@ -535,13 +537,12 @@ class FeatureTransformer { #else for (IndexType i = 0; states_to_update[i]; ++i) { - std::memcpy(states_to_update[i]->accumulator.accumulation[Perspective], - st->accumulator.accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); + std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective], + (st->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType)); for (std::size_t k = 0; k < PSQTBuckets; ++k) - states_to_update[i]->accumulator.psqtAccumulation[Perspective][k] = - st->accumulator.psqtAccumulation[Perspective][k]; + (states_to_update[i]->*accPtr).psqtAccumulation[Perspective][k] = + (st->*accPtr).psqtAccumulation[Perspective][k]; st = states_to_update[i]; @@ -551,10 +552,10 @@ class FeatureTransformer { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] -= weights[offset + j]; + (st->*accPtr).accumulation[Perspective][j] -= weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - 
st->accumulator.psqtAccumulation[Perspective][k] -= + (st->*accPtr).psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } @@ -564,10 +565,10 @@ class FeatureTransformer { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] += weights[offset + j]; + (st->*accPtr).accumulation[Perspective][j] += weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[Perspective][k] += + (st->*accPtr).psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } } @@ -586,7 +587,7 @@ class FeatureTransformer { // Refresh the accumulator // Could be extracted to a separate function because it's done in 2 places, // but it's unclear if compilers would correctly handle register allocation. - auto& accumulator = pos.state()->accumulator; + auto& accumulator = pos.state()->*accPtr; accumulator.computed[Perspective] = true; FeatureSet::IndexList active; FeatureSet::append_active_indices(pos, active); @@ -663,12 +664,12 @@ class FeatureTransformer { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. // Fast early exit. - if (pos.state()->accumulator.computed[Perspective]) + if ((pos.state()->*accPtr).computed[Perspective]) return; auto [oldest_st, _] = try_find_computed_accumulator(pos); - if (oldest_st->accumulator.computed[Perspective]) + if ((oldest_st->*accPtr).computed[Perspective]) { // Only update current position accumulator to minimize work. StateInfo* states_to_update[2] = {pos.state(), nullptr}; @@ -685,7 +686,7 @@ class FeatureTransformer { auto [oldest_st, next] = try_find_computed_accumulator(pos); - if (oldest_st->accumulator.computed[Perspective]) + if ((oldest_st->*accPtr).computed[Perspective]) { if (next == nullptr) return; diff --git a/src/position.cpp b/src/position.cpp index 4fba3c234b7..ddc31888422 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -684,10 +684,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { ++st->pliesFromNull; // Used by NNUE - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; - auto& dp = st->dirtyPiece; - dp.dirty_num = 1; + st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = + st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false; + auto& dp = st->dirtyPiece; + dp.dirty_num = 1; Color us = sideToMove; Color them = ~us; @@ -964,15 +964,15 @@ void Position::do_null_move(StateInfo& newSt) { assert(!checkers()); assert(&newSt != st); - std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); + std::memcpy(&newSt, st, offsetof(StateInfo, accumulatorBig)); newSt.previous = st; st = &newSt; - st->dirtyPiece.dirty_num = 0; - st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->dirtyPiece.dirty_num = 0; + st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() + st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = + st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false; if (st->epSquare != SQ_NONE) { diff --git a/src/position.h b/src/position.h index 7e0c3eefd77..34b53f4a558 100644 --- a/src/position.h +++ b/src/position.h @@ -27,6 +27,7 @@ #include "bitboard.h" #include "nnue/nnue_accumulator.h" +#include 
"nnue/nnue_architecture.h" #include "types.h" namespace Stockfish { @@ -57,8 +58,9 @@ struct StateInfo { int repetition; // Used by NNUE - Eval::NNUE::Accumulator accumulator; - DirtyPiece dirtyPiece; + Eval::NNUE::Accumulator accumulatorBig; + Eval::NNUE::Accumulator accumulatorSmall; + DirtyPiece dirtyPiece; }; diff --git a/src/uci.cpp b/src/uci.cpp index 8e93eee6dc5..be902277984 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -37,6 +37,7 @@ #include "misc.h" #include "movegen.h" #include "nnue/evaluate_nnue.h" +#include "nnue/nnue_architecture.h" #include "position.h" #include "search.h" #include "thread.h" @@ -320,7 +321,7 @@ void UCI::loop(int argc, char* argv[]) { std::string f; if (is >> std::skipws >> f) filename = f; - Eval::NNUE::save_eval(filename); + Eval::NNUE::save_eval(filename, Eval::NNUE::Big); } else if (token == "--help" || token == "help" || token == "--license" || token == "license") sync_cout diff --git a/src/ucioption.cpp b/src/ucioption.cpp index 087882f11b8..f8cbcc53077 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -82,7 +82,9 @@ void init(OptionsMap& o) { o["SyzygyProbeDepth"] << Option(1, 1, 100); o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); - o["EvalFile"] << Option(EvalFileDefaultName, on_eval_file); + o["EvalFile"] << Option(EvalFileDefaultNameBig, on_eval_file); + // Enable this after fishtest workers support EvalFileSmall + // o["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, on_eval_file); }