From d3ae5b2613ce0bb4d7f3d486c9c96d4431e385a9 Mon Sep 17 00:00:00 2001
From: Linmiao Xu
Date: Sat, 2 Dec 2023 17:50:32 -0800
Subject: [PATCH] Dual NNUE with L1-128 smallnet

Credit goes to @mstembera for:
- writing the code enabling dual NNUE:
  https://github.com/official-stockfish/Stockfish/pull/4898
- the idea of trying L1-128 trained exclusively on high simple eval positions

The L1-128 smallnet is:
- epoch 399 of a single-stage training from scratch
- trained only on positions from filtered data with high material difference
  - defined by abs(simple_eval) > 1000

```yaml
experiment-name: 128--S1-only-hse-v2
training-dataset:
- /data/hse/S3/dfrc99-16tb7p-eval-filt-v2.min.high-simple-eval-1k.binpack
- /data/hse/S3/leela96-filt-v2.min.high-simple-eval-1k.binpack
- /data/hse/S3/test80-apr2022-16tb7p.min.high-simple-eval-1k.binpack
- /data/hse/S7/test60-2020-2tb7p.v6-3072.high-simple-eval-1k.binpack
- /data/hse/S7/test60-novdec2021-12tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test77-nov2021-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test77-dec2021-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test77-jan2022-2tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test78-jantomay2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test78-juntosep2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test79-apr2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test79-may2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack

# T80 2022
- /data/hse/S7/test80-may2022-16tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jun2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jul2022-16tb7p.v6-dd.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-aug2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-sep2022-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-oct2022-16tb7p.v6-dd.high-simple-eval-1k.binpack
- /data/hse/S7/test80-nov2022-16tb7p-v6-dd.min.high-simple-eval-1k.binpack

# T80 2023
- /data/hse/S7/test80-jan2023-3of3-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-feb2023-16tb7p-filter-v6-dd.min-mar2023.unmin.high-simple-eval-1k.binpack
- /data/hse/S7/test80-mar2023-2tb7p.v6-sk16.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-apr2023-2tb7p-filter-v6-sk16.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-may2023-2tb7p.v6.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jun2023-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-jul2023-2tb7p.v6-3072.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-aug2023-2tb7p.v6.min.high-simple-eval-1k.binpack
- /data/hse/S7/test80-sep2023-2tb7p.high-simple-eval-1k.binpack
- /data/hse/S7/test80-oct2023-2tb7p.high-simple-eval-1k.binpack

start-from-engine-test-net: False
nnue-pytorch-branch: linrock/nnue-pytorch/L1-128
engine-test-branch: linrock/Stockfish/L1-128-nolazy
engine-base-branch: linrock/Stockfish/L1-128
num-epochs: 500
lambda: 1.0
```

Experiment yaml configs converted to easy_train.sh commands with:
https://github.com/linrock/nnue-tools/blob/4339954/yaml_easy_train.py

Binpacks interleaved at training time with:
https://github.com/official-stockfish/nnue-pytorch/pull/259

Data filtered for high simple eval positions with:
https://github.com/linrock/nnue-data/blob/32d6a68/filter_high_simple_eval_plain.py
https://github.com/linrock/Stockfish/blob/61dbfe/src/tools/transform.cpp#L626-L655

Training data can be found at:
https://robotmoon.com/nnue-training-data/
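For illustration only, a minimal sketch of the "high simple eval" criterion described above: a position is kept when its raw material imbalance exceeds 1000 internal units. The struct and helper names are hypothetical and the piece values are assumed Stockfish-style values; the actual filters are the two scripts linked above.

```cpp
#include <array>
#include <cstdlib>

// Hypothetical material summary for one position (sketch only).
struct MaterialCount {
    std::array<int, 5> us{};    // P, N, B, R, Q counts for the side to move
    std::array<int, 5> them{};  // same counts for the opponent
};

// Material-only evaluation from the side to move's point of view,
// using assumed Stockfish-style piece values.
int simple_eval(const MaterialCount& m) {
    constexpr std::array<int, 5> value = {208, 781, 825, 1276, 2538};
    int v = 0;
    for (int pt = 0; pt < 5; ++pt)
        v += value[pt] * (m.us[pt] - m.them[pt]);
    return v;
}

// A position qualifies for the smallnet training data when the
// material difference alone is already large.
bool high_simple_eval(const MaterialCount& m) {
    return std::abs(simple_eval(m)) > 1000;
}
```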
Local elo at 25k nodes per move of L1-128 smallnet (nnue-only eval)
vs. L1-128 trained on standard S1 data:
nn-epoch399.nnue : -318.1 +/- 2.1

Passed STC:
https://tests.stockfishchess.org/tests/view/6574cb9d95ea6ba1fcd49e3b
LLR: 2.93 (-2.94,2.94) <0.00,2.00>
Total: 62432 W: 15875 L: 15521 D: 31036
Ptnml(0-2): 177, 7331, 15872, 7633, 203

Passed LTC:
https://tests.stockfishchess.org/tests/view/6575da2d4d789acf40aaac6e
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 64830 W: 16118 L: 15738 D: 32974
Ptnml(0-2): 43, 7129, 17697, 7497, 49

Bench: 1438336

Co-Authored-By: mstembera <5421953+mstembera@users.noreply.github.com>
---
 src/Makefile                        |  32 +++---
 src/evaluate.cpp                    | 159 +++++++++++++++-------------
 src/evaluate.h                      |   5 +-
 src/nnue/evaluate_nnue.cpp          | 139 ++++++++++++++++--------
 src/nnue/evaluate_nnue.h            |  19 ++--
 src/nnue/nnue_accumulator.h         |   3 +-
 src/nnue/nnue_architecture.h        |  38 ++++---
 src/nnue/nnue_feature_transformer.h |  65 ++++++------
 src/position.cpp                    |  18 ++--
 src/position.h                      |   6 +-
 src/uci.cpp                         |   3 +-
 src/ucioption.cpp                   |   4 +-
 12 files changed, 293 insertions(+), 198 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 660b41e7edb..e6de514e568 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -806,7 +806,7 @@ help:
 	@echo "help > Display architecture details"
 	@echo "profile-build > standard build with profile-guided optimization"
 	@echo "build > skip profile-guided optimization"
-	@echo "net > Download the default nnue net"
+	@echo "net > Download the default nnue nets"
 	@echo "strip > Strip executable"
 	@echo "install > Install executable"
 	@echo "clean > Clean up"
@@ -922,16 +922,7 @@ profileclean:
 	@rm -f stockfish.res
 	@rm -f ./-lstdc++.res
 
-# set up shell variables for the net stuff
-netvariables:
-	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
-	$(eval nnuedownloadurl1 := https://tests.stockfishchess.org/api/nn/$(nnuenet))
-	$(eval nnuedownloadurl2 := https://github.com/official-stockfish/networks/raw/master/$(nnuenet))
-	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
-	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
-
-# evaluation network (nnue)
-net: netvariables
+define fetch_network
 	@echo "Default net: $(nnuenet)"
 	@if [ "x$(curl_or_wget)" = "x" ]; then \
 		echo "Neither curl nor wget is installed.
Install one of these tools unless the net has been downloaded manually"; \ @@ -966,7 +957,24 @@ net: netvariables if [ "$(nnuenet)" = "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ echo "Network validated"; break; \ fi; \ - fi; \ + fi; +endef + +# set up shell variables for the net stuff +define netvariables +$(eval nnuenet := $(shell grep $(1) evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/')) +$(eval nnuedownloadurl1 := https://tests.stockfishchess.org/api/nn/$(nnuenet)) +$(eval nnuedownloadurl2 := https://github.com/official-stockfish/networks/raw/master/$(nnuenet)) +$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) +$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) +endef + +# evaluation network (nnue) +net: + $(call netvariables, EvalFileDefaultNameBig) + $(call fetch_network) + $(call netvariables, EvalFileDefaultNameSmall) + $(call fetch_network) format: $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file diff --git a/src/evaluate.cpp b/src/evaluate.cpp index bda7132a1ff..cca808df242 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include "incbin/incbin.h" #include "misc.h" #include "nnue/evaluate_nnue.h" +#include "nnue/nnue_architecture.h" #include "position.h" #include "thread.h" #include "types.h" @@ -44,11 +46,15 @@ // const unsigned int gEmbeddedNNUESize; // the size of the embedded file // Note that this does not work in Microsoft Visual Studio. #if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF) -INCBIN(EmbeddedNNUE, EvalFileDefaultName); +INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig); +INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall); #else -const unsigned char gEmbeddedNNUEData[1] = {0x0}; -const unsigned char* const gEmbeddedNNUEEnd = &gEmbeddedNNUEData[1]; -const unsigned int gEmbeddedNNUESize = 1; +const unsigned char gEmbeddedNNUEBigData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUEBigEnd = &gEmbeddedNNUEBigData[1]; +const unsigned int gEmbeddedNNUEBigSize = 1; +const unsigned char gEmbeddedNNUESmallData[1] = {0x0}; +const unsigned char* const gEmbeddedNNUESmallEnd = &gEmbeddedNNUESmallData[1]; +const unsigned int gEmbeddedNNUESmallSize = 1; #endif @@ -56,7 +62,9 @@ namespace Stockfish { namespace Eval { -std::string currentEvalFileName = "None"; +std::string currentEvalFileName[2] = {"None", "None"}; +const std::string EvFiles[2] = {"EvalFile", "EvalFileSmall"}; +const std::string EvFileNames[2] = {EvalFileDefaultNameBig, EvalFileDefaultNameSmall}; // Tries to load a NNUE network at startup time, or when the engine // receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue" @@ -67,84 +75,92 @@ std::string currentEvalFileName = "None"; // variable to have the engine search in a special directory in their distro. 
void NNUE::init() { - std::string eval_file = std::string(Options["EvalFile"]); - if (eval_file.empty()) - eval_file = EvalFileDefaultName; + for (NetSize netSize : {Big, Small}) + { + std::string eval_file = std::string(Options[EvFiles[netSize]]); + if (eval_file.empty()) + eval_file = EvFileNames[netSize]; #if defined(DEFAULT_NNUE_DIRECTORY) - std::vector dirs = {"", "", CommandLine::binaryDirectory, - stringify(DEFAULT_NNUE_DIRECTORY)}; + std::vector dirs = {"", "", CommandLine::binaryDirectory, + stringify(DEFAULT_NNUE_DIRECTORY)}; #else - std::vector dirs = {"", "", CommandLine::binaryDirectory}; + std::vector dirs = {"", "", CommandLine::binaryDirectory}; #endif - for (const std::string& directory : dirs) - if (currentEvalFileName != eval_file) + for (const std::string& directory : dirs) { - if (directory != "") - { - std::ifstream stream(directory + eval_file, std::ios::binary); - if (NNUE::load_eval(eval_file, stream)) - currentEvalFileName = eval_file; - } - - if (directory == "" && eval_file == EvalFileDefaultName) + if (currentEvalFileName[netSize] != eval_file) { - // C++ way to prepare a buffer for a memory stream - class MemoryBuffer: public std::basic_streambuf { - public: - MemoryBuffer(char* p, size_t n) { - setg(p, p, p + n); - setp(p, p + n); - } - }; - - MemoryBuffer buffer( - const_cast(reinterpret_cast(gEmbeddedNNUEData)), - size_t(gEmbeddedNNUESize)); - (void) gEmbeddedNNUEEnd; // Silence warning on unused variable - - std::istream stream(&buffer); - if (NNUE::load_eval(eval_file, stream)) - currentEvalFileName = eval_file; + if (directory != "") + { + std::ifstream stream(directory + eval_file, std::ios::binary); + if (NNUE::load_eval(eval_file, stream, netSize)) + currentEvalFileName[netSize] = eval_file; + } + + if (directory == "" && eval_file == EvFileNames[netSize]) + { + // C++ way to prepare a buffer for a memory stream + class MemoryBuffer: public std::basic_streambuf { + public: + MemoryBuffer(char* p, size_t n) { + setg(p, p, p + n); + setp(p, p + n); + } + }; + + MemoryBuffer buffer( + const_cast(reinterpret_cast( + netSize == Small ? gEmbeddedNNUESmallData : gEmbeddedNNUEBigData)), + size_t(netSize == Small ? 
gEmbeddedNNUESmallSize : gEmbeddedNNUEBigSize)); + (void) gEmbeddedNNUEBigEnd; // Silence warning on unused variable + (void) gEmbeddedNNUESmallEnd; + + std::istream stream(&buffer); + if (NNUE::load_eval(eval_file, stream, netSize)) + currentEvalFileName[netSize] = eval_file; + } } } + } } // Verifies that the last net used was loaded successfully void NNUE::verify() { - std::string eval_file = std::string(Options["EvalFile"]); - if (eval_file.empty()) - eval_file = EvalFileDefaultName; - - if (currentEvalFileName != eval_file) + for (NetSize netSize : {Big, Small}) { + std::string eval_file = std::string(Options[EvFiles[netSize]]); + if (eval_file.empty()) + eval_file = EvFileNames[netSize]; - std::string msg1 = - "Network evaluation parameters compatible with the engine must be available."; - std::string msg2 = "The network file " + eval_file + " was not loaded successfully."; - std::string msg3 = "The UCI option EvalFile might need to specify the full path, " - "including the directory name, to the network file."; - std::string msg4 = "The default net can be downloaded from: " - "https://tests.stockfishchess.org/api/nn/" - + std::string(EvalFileDefaultName); - std::string msg5 = "The engine will be terminated now."; - - sync_cout << "info string ERROR: " << msg1 << sync_endl; - sync_cout << "info string ERROR: " << msg2 << sync_endl; - sync_cout << "info string ERROR: " << msg3 << sync_endl; - sync_cout << "info string ERROR: " << msg4 << sync_endl; - sync_cout << "info string ERROR: " << msg5 << sync_endl; - - exit(EXIT_FAILURE); - } + if (currentEvalFileName[netSize] != eval_file) + { + std::string msg1 = + "Network evaluation parameters compatible with the engine must be available."; + std::string msg2 = "The network file " + eval_file + " was not loaded successfully."; + std::string msg3 = "The UCI option EvalFile might need to specify the full path, " + "including the directory name, to the network file."; + std::string msg4 = "The default net can be downloaded from: " + "https://tests.stockfishchess.org/api/nn/" + + std::string(EvFileNames[netSize]); + std::string msg5 = "The engine will be terminated now."; + + sync_cout << "info string ERROR: " << msg1 << sync_endl; + sync_cout << "info string ERROR: " << msg2 << sync_endl; + sync_cout << "info string ERROR: " << msg3 << sync_endl; + sync_cout << "info string ERROR: " << msg4 << sync_endl; + sync_cout << "info string ERROR: " << msg5 << sync_endl; + + exit(EXIT_FAILURE); + } - sync_cout << "info string NNUE evaluation using " << eval_file << sync_endl; + sync_cout << "info string NNUE evaluation using " << eval_file << sync_endl; + } } } - // Returns a static, purely materialistic evaluation of the position from // the point of view of the given color. It can be divided by PawnValue to get // an approximation of the material advantage on the board in terms of pawns. 
@@ -163,18 +179,19 @@ Value Eval::evaluate(const Position& pos) { int v; Color stm = pos.side_to_move(); int shuffling = pos.rule50_count(); - int simpleEval = simple_eval(pos, stm) + (int(pos.key() & 7) - 3); - - bool lazy = std::abs(simpleEval) >= RookValue + KnightValue + 16 * shuffling * shuffling - + std::abs(pos.this_thread()->bestValue) - + std::abs(pos.this_thread()->rootSimpleEval); + int simpleEval = simple_eval(pos, stm); + bool lazy = std::abs(simpleEval) > 2300; if (lazy) v = simpleEval; else { - int nnueComplexity; - Value nnue = NNUE::evaluate(pos, true, &nnueComplexity); + bool smallNet = std::abs(simpleEval) > 1100; + + int nnueComplexity; + + Value nnue = smallNet ? NNUE::evaluate(pos, true, &nnueComplexity) + : NNUE::evaluate(pos, true, &nnueComplexity); int optimism = pos.this_thread()->optimism[stm]; @@ -217,7 +234,7 @@ std::string Eval::trace(Position& pos) { ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); Value v; - v = NNUE::evaluate(pos, false); + v = NNUE::evaluate(pos, false); v = pos.side_to_move() == WHITE ? v : -v; ss << "NNUE evaluation " << 0.01 * UCI::to_cp(v) << " (white side)\n"; diff --git a/src/evaluate.h b/src/evaluate.h index 0a7ec61a3cf..3ead6b763dc 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -34,12 +34,13 @@ std::string trace(Position& pos); int simple_eval(const Position& pos, Color c); Value evaluate(const Position& pos); -extern std::string currentEvalFileName; +extern std::string currentEvalFileName[2]; // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. -#define EvalFileDefaultName "nn-b1e55edbea57.nnue" +#define EvalFileDefaultNameBig "nn-b1e55edbea57.nnue" +#define EvalFileDefaultNameSmall "nn-c01dc0ffeede.nnue" namespace NNUE { diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 14e2fec15a4..004e28dfb41 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -40,14 +40,18 @@ namespace Stockfish::Eval::NNUE { // Input feature converter -LargePagePtr featureTransformer; +LargePagePtr> + featureTransformerBig; +LargePagePtr> + featureTransformerSmall; // Evaluation function -AlignedPtr network[LayerStacks]; +AlignedPtr> networkBig[LayerStacks]; +AlignedPtr> networkSmall[LayerStacks]; -// Evaluation function file name -std::string fileName; -std::string netDescription; +// Evaluation function file names +std::string fileName[2]; +std::string netDescription[2]; namespace Detail { @@ -91,11 +95,20 @@ bool write_parameters(std::ostream& stream, const T& reference) { // Initialize the evaluation function parameters -static void initialize() { +static void initialize(NetSize netSize) { - Detail::initialize(featureTransformer); - for (std::size_t i = 0; i < LayerStacks; ++i) - Detail::initialize(network[i]); + if (netSize == Small) + { + Detail::initialize(featureTransformerSmall); + for (std::size_t i = 0; i < LayerStacks; ++i) + Detail::initialize(networkSmall[i]); + } + else + { + Detail::initialize(featureTransformerBig); + for (std::size_t i = 0; i < LayerStacks; ++i) + Detail::initialize(networkBig[i]); + } } // Read network header @@ -122,39 +135,57 @@ static bool write_header(std::ostream& stream, std::uint32_t hashValue, const st } // Read network parameters -static bool read_parameters(std::istream& stream) { +static bool read_parameters(std::istream& stream, NetSize netSize) { 
std::uint32_t hashValue; - if (!read_header(stream, &hashValue, &netDescription)) + if (!read_header(stream, &hashValue, &netDescription[netSize])) + return false; + if (hashValue != HashValue[netSize]) return false; - if (hashValue != HashValue) + if (netSize == Big && !Detail::read_parameters(stream, *featureTransformerBig)) return false; - if (!Detail::read_parameters(stream, *featureTransformer)) + if (netSize == Small && !Detail::read_parameters(stream, *featureTransformerSmall)) return false; for (std::size_t i = 0; i < LayerStacks; ++i) - if (!Detail::read_parameters(stream, *(network[i]))) + { + if (netSize == Big && !Detail::read_parameters(stream, *(networkBig[i]))) return false; + if (netSize == Small && !Detail::read_parameters(stream, *(networkSmall[i]))) + return false; + } return stream && stream.peek() == std::ios::traits_type::eof(); } // Write network parameters -static bool write_parameters(std::ostream& stream) { +static bool write_parameters(std::ostream& stream, NetSize netSize) { - if (!write_header(stream, HashValue, netDescription)) + if (!write_header(stream, HashValue[netSize], netDescription[netSize])) return false; - if (!Detail::write_parameters(stream, *featureTransformer)) + if (netSize == Big && !Detail::write_parameters(stream, *featureTransformerBig)) + return false; + if (netSize == Small && !Detail::write_parameters(stream, *featureTransformerSmall)) return false; for (std::size_t i = 0; i < LayerStacks; ++i) - if (!Detail::write_parameters(stream, *(network[i]))) + { + if (netSize == Big && !Detail::write_parameters(stream, *(networkBig[i]))) + return false; + if (netSize == Small && !Detail::write_parameters(stream, *(networkSmall[i]))) return false; + } return bool(stream); } void hint_common_parent_position(const Position& pos) { - featureTransformer->hint_common_access(pos); + + int simpleEval = simple_eval(pos, pos.side_to_move()); + if (abs(simpleEval) > 1100) + featureTransformerSmall->hint_common_access(pos); + else + featureTransformerBig->hint_common_access(pos); } // Evaluation function. Perform differential calculation. +template Value evaluate(const Position& pos, bool adjusted, int* complexity) { // We manually align the arrays on the stack because with gcc < 9.3 @@ -165,19 +196,28 @@ Value evaluate(const Position& pos, bool adjusted, int* complexity) { #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType - transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + transformedFeaturesUnaligned[FeatureTransformer < Small ? TransformedFeatureDimensionsSmall + : TransformedFeatureDimensionsBig, + nullptr + > ::BufferSize + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer < Net_Size == Small ? TransformedFeatureDimensionsSmall + : TransformedFeatureDimensionsBig, + nullptr > ::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); const int bucket = (pos.count() - 1) / 4; - const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto psqt = Net_Size == Small + ? 
featureTransformerSmall->transform(pos, transformedFeatures, bucket) + : featureTransformerBig->transform(pos, transformedFeatures, bucket); + const auto positional = Net_Size == Small ? networkSmall[bucket]->propagate(transformedFeatures) + : networkBig[bucket]->propagate(transformedFeatures); if (complexity) *complexity = std::abs(psqt - positional) / OutputScale; @@ -190,6 +230,9 @@ Value evaluate(const Position& pos, bool adjusted, int* complexity) { return static_cast((psqt + positional) / OutputScale); } +template Value evaluate(const Position& pos, bool adjusted, int* complexity); +template Value evaluate(const Position& pos, bool adjusted, int* complexity); + struct NnueEvalTrace { static_assert(LayerStacks == PSQTBuckets); @@ -205,13 +248,14 @@ static NnueEvalTrace trace_evaluate(const Position& pos) { constexpr uint64_t alignment = CacheLineSize; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - TransformedFeatureType - transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + TransformedFeatureType transformedFeaturesUnaligned + [FeatureTransformer::BufferSize + + alignment / sizeof(TransformedFeatureType)]; auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + alignas(alignment) TransformedFeatureType + transformedFeatures[FeatureTransformer::BufferSize]; #endif ASSERT_ALIGNED(transformedFeatures, alignment); @@ -220,8 +264,8 @@ static NnueEvalTrace trace_evaluate(const Position& pos) { t.correctBucket = (pos.count() - 1) / 4; for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) { - const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto materialist = featureTransformerBig->transform(pos, transformedFeatures, bucket); + const auto positional = networkBig[bucket]->propagate(transformedFeatures); t.psqt[bucket] = static_cast(materialist / OutputScale); t.positional[bucket] = static_cast(positional / OutputScale); @@ -310,7 +354,7 @@ std::string trace(Position& pos) { // We estimate the value of each piece by doing a differential evaluation from // the current base eval, simulating the removal of the piece from its square. - Value base = evaluate(pos); + Value base = evaluate(pos); base = pos.side_to_move() == WHITE ? base : -base; for (File f = FILE_A; f <= FILE_H; ++f) @@ -325,16 +369,16 @@ std::string trace(Position& pos) { auto st = pos.state(); pos.remove_piece(sq); - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->accumulatorBig.computed[WHITE] = false; + st->accumulatorBig.computed[BLACK] = false; - Value eval = evaluate(pos); + Value eval = evaluate(pos); eval = pos.side_to_move() == WHITE ? 
eval : -eval; v = base - eval; pos.put_piece(pc, sq); - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->accumulatorBig.computed[WHITE] = false; + st->accumulatorBig.computed[BLACK] = false; } writeSquare(f, r, pc, v); @@ -379,24 +423,24 @@ std::string trace(Position& pos) { // Load eval, from a file stream or a memory stream -bool load_eval(std::string name, std::istream& stream) { +bool load_eval(const std::string name, std::istream& stream, NetSize netSize) { - initialize(); - fileName = name; - return read_parameters(stream); + initialize(netSize); + fileName[netSize] = name; + return read_parameters(stream, netSize); } // Save eval, to a file stream or a memory stream -bool save_eval(std::ostream& stream) { +bool save_eval(std::ostream& stream, NetSize netSize) { - if (fileName.empty()) + if (fileName[netSize].empty()) return false; - return write_parameters(stream); + return write_parameters(stream, netSize); } // Save eval, to a file given by its name -bool save_eval(const std::optional& filename) { +bool save_eval(const std::optional& filename, NetSize netSize) { std::string actualFilename; std::string msg; @@ -405,7 +449,8 @@ bool save_eval(const std::optional& filename) { actualFilename = filename.value(); else { - if (currentEvalFileName != EvalFileDefaultName) + if (currentEvalFileName[netSize] + != (netSize == Small ? EvalFileDefaultNameSmall : EvalFileDefaultNameBig)) { msg = "Failed to export a net. " "A non-embedded net can only be saved if the filename is specified"; @@ -413,11 +458,11 @@ bool save_eval(const std::optional& filename) { sync_cout << msg << sync_endl; return false; } - actualFilename = EvalFileDefaultName; + actualFilename = (netSize == Small ? EvalFileDefaultNameSmall : EvalFileDefaultNameBig); } std::ofstream stream(actualFilename, std::ios_base::binary); - bool saved = save_eval(stream); + bool saved = save_eval(stream, netSize); msg = saved ? 
"Network saved successfully to " + actualFilename : "Failed to export a net"; diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h index f80aa398bba..fabfb5693f9 100644 --- a/src/nnue/evaluate_nnue.h +++ b/src/nnue/evaluate_nnue.h @@ -39,9 +39,11 @@ class Position; namespace Stockfish::Eval::NNUE { // Hash value of evaluation function structure -constexpr std::uint32_t HashValue = - FeatureTransformer::get_hash_value() ^ Network::get_hash_value(); - +constexpr std::uint32_t HashValue[2] = { + FeatureTransformer::get_hash_value() + ^ Network::get_hash_value(), + FeatureTransformer::get_hash_value() + ^ Network::get_hash_value()}; // Deleter for automating release of memory area template @@ -67,12 +69,13 @@ template using LargePagePtr = std::unique_ptr>; std::string trace(Position& pos); -Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr); -void hint_common_parent_position(const Position& pos); +template +Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr); +void hint_common_parent_position(const Position& pos); -bool load_eval(std::string name, std::istream& stream); -bool save_eval(std::ostream& stream); -bool save_eval(const std::optional& filename); +bool load_eval(const std::string name, std::istream& stream, NetSize netSize); +bool save_eval(std::ostream& stream, NetSize netSize); +bool save_eval(const std::optional& filename, NetSize netSize); } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index f6d705243ef..0b05d00da28 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -29,8 +29,9 @@ namespace Stockfish::Eval::NNUE { // Class that holds the result of affine transformation of input features +template struct alignas(CacheLineSize) Accumulator { - std::int16_t accumulation[2][TransformedFeatureDimensions]; + std::int16_t accumulation[2][Size]; std::int32_t psqtAccumulation[2][PSQTBuckets]; bool computed[2]; }; diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 92445704296..949f2d8687f 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -37,14 +37,28 @@ namespace Stockfish::Eval::NNUE { // Input features used in evaluation function using FeatureSet = Features::HalfKAv2_hm; +enum NetSize { + Big, + Small +}; + // Number of input feature dimensions after conversion -constexpr IndexType TransformedFeatureDimensions = 2560; -constexpr IndexType PSQTBuckets = 8; -constexpr IndexType LayerStacks = 8; +constexpr IndexType TransformedFeatureDimensionsBig = 2560; +constexpr int L2Big = 15; +constexpr int L3Big = 32; + +constexpr IndexType TransformedFeatureDimensionsSmall = 128; +constexpr int L2Small = 15; +constexpr int L3Small = 32; + +constexpr IndexType PSQTBuckets = 8; +constexpr IndexType LayerStacks = 8; +template struct Network { - static constexpr int FC_0_OUTPUTS = 15; - static constexpr int FC_1_OUTPUTS = 32; + static constexpr IndexType TransformedFeatureDimensions = L1; + static constexpr int FC_0_OUTPUTS = L2; + static constexpr int FC_1_OUTPUTS = L3; Layers::AffineTransformSparseInput fc_0; Layers::SqrClippedReLU ac_sqr_0; @@ -84,13 +98,13 @@ struct Network { std::int32_t propagate(const TransformedFeatureType* transformedFeatures) { struct alignas(CacheLineSize) Buffer { - alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out; - alignas(CacheLineSize) decltype(ac_sqr_0)::OutputType + alignas(CacheLineSize) typename decltype(fc_0)::OutputBuffer 
fc_0_out; + alignas(CacheLineSize) typename decltype(ac_sqr_0)::OutputType ac_sqr_0_out[ceil_to_multiple(FC_0_OUTPUTS * 2, 32)]; - alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out; - alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out; - alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out; - alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out; + alignas(CacheLineSize) typename decltype(ac_0)::OutputBuffer ac_0_out; + alignas(CacheLineSize) typename decltype(fc_1)::OutputBuffer fc_1_out; + alignas(CacheLineSize) typename decltype(ac_1)::OutputBuffer ac_1_out; + alignas(CacheLineSize) typename decltype(fc_2)::OutputBuffer fc_2_out; Buffer() { std::memset(this, 0, sizeof(*this)); } }; @@ -108,7 +122,7 @@ struct Network { ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_sqr_0_out); ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out); std::memcpy(buffer.ac_sqr_0_out + FC_0_OUTPUTS, buffer.ac_0_out, - FC_0_OUTPUTS * sizeof(decltype(ac_0)::OutputType)); + FC_0_OUTPUTS * sizeof(typename decltype(ac_0)::OutputType)); fc_1.propagate(buffer.ac_sqr_0_out, buffer.fc_1_out); ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out); fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out); diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2008cf25f1d..9a162ac9853 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -186,11 +186,6 @@ static constexpr int BestRegisterCount() { return 1; } - -static constexpr int NumRegs = - BestRegisterCount(); -static constexpr int NumPsqtRegs = - BestRegisterCount(); #if defined(__GNUC__) #pragma GCC diagnostic pop #endif @@ -198,6 +193,8 @@ static constexpr int NumPsqtRegs = // Input feature converter +template StateInfo::*accPtr> class FeatureTransformer { private: @@ -205,6 +202,11 @@ class FeatureTransformer { static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; #ifdef VECTOR + static constexpr int NumRegs = + BestRegisterCount(); + static constexpr int NumPsqtRegs = + BestRegisterCount(); + static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); @@ -253,8 +255,8 @@ class FeatureTransformer { update_accumulator(pos); const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; - const auto& accumulation = pos.state()->accumulator.accumulation; - const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation; const auto psqt = (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]) @@ -323,7 +325,7 @@ class FeatureTransformer { // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; int gain = FeatureSet::refresh_cost(pos); - while (st->previous && !st->accumulator.computed[Perspective]) + while (st->previous && !(st->*accPtr).computed[Perspective]) { // This governs when a full feature refresh is needed and how many // updates are better than just one full refresh. @@ -381,7 +383,7 @@ class FeatureTransformer { for (; i >= 0; --i) { - states_to_update[i]->accumulator.computed[Perspective] = true; + (states_to_update[i]->*accPtr).computed[Perspective] = true; const StateInfo* end_state = i == 0 ? 
computed_st : states_to_update[i - 1]; @@ -402,9 +404,9 @@ class FeatureTransformer { assert(states_to_update[0]); auto accIn = - reinterpret_cast(&st->accumulator.accumulation[Perspective][0]); + reinterpret_cast(&(st->*accPtr).accumulation[Perspective][0]); auto accOut = reinterpret_cast( - &states_to_update[0]->accumulator.accumulation[Perspective][0]); + &(states_to_update[0]->*accPtr).accumulation[Perspective][0]); const IndexType offsetR0 = HalfDimensions * removed[0][0]; auto columnR0 = reinterpret_cast(&weights[offsetR0]); @@ -428,10 +430,10 @@ class FeatureTransformer { vec_add_16(columnR0[k], columnR1[k])); } - auto accPsqtIn = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][0]); + auto accPsqtIn = + reinterpret_cast(&(st->*accPtr).psqtAccumulation[Perspective][0]); auto accPsqtOut = reinterpret_cast( - &states_to_update[0]->accumulator.psqtAccumulation[Perspective][0]); + &(states_to_update[0]->*accPtr).psqtAccumulation[Perspective][0]); const IndexType offsetPsqtR0 = PSQTBuckets * removed[0][0]; auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); @@ -463,7 +465,7 @@ class FeatureTransformer { { // Load accumulator auto accTileIn = reinterpret_cast( - &st->accumulator.accumulation[Perspective][j * TileHeight]); + &(st->*accPtr).accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_load(&accTileIn[k]); @@ -489,7 +491,7 @@ class FeatureTransformer { // Store accumulator auto accTileOut = reinterpret_cast( - &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); + &(states_to_update[i]->*accPtr).accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) vec_store(&accTileOut[k], acc[k]); } @@ -499,7 +501,7 @@ class FeatureTransformer { { // Load accumulator auto accTilePsqtIn = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + &(st->*accPtr).psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_load_psqt(&accTilePsqtIn[k]); @@ -525,8 +527,8 @@ class FeatureTransformer { // Store accumulator auto accTilePsqtOut = reinterpret_cast( - &states_to_update[i] - ->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + &(states_to_update[i]->*accPtr) + .psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&accTilePsqtOut[k], psqt[k]); } @@ -535,13 +537,12 @@ class FeatureTransformer { #else for (IndexType i = 0; states_to_update[i]; ++i) { - std::memcpy(states_to_update[i]->accumulator.accumulation[Perspective], - st->accumulator.accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); + std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective], + (st->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType)); for (std::size_t k = 0; k < PSQTBuckets; ++k) - states_to_update[i]->accumulator.psqtAccumulation[Perspective][k] = - st->accumulator.psqtAccumulation[Perspective][k]; + (states_to_update[i]->*accPtr).psqtAccumulation[Perspective][k] = + (st->*accPtr).psqtAccumulation[Perspective][k]; st = states_to_update[i]; @@ -551,10 +552,10 @@ class FeatureTransformer { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] -= weights[offset + j]; + (st->*accPtr).accumulation[Perspective][j] -= weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - 
st->accumulator.psqtAccumulation[Perspective][k] -= + (st->*accPtr).psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } @@ -564,10 +565,10 @@ class FeatureTransformer { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] += weights[offset + j]; + (st->*accPtr).accumulation[Perspective][j] += weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[Perspective][k] += + (st->*accPtr).psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } } @@ -586,7 +587,7 @@ class FeatureTransformer { // Refresh the accumulator // Could be extracted to a separate function because it's done in 2 places, // but it's unclear if compilers would correctly handle register allocation. - auto& accumulator = pos.state()->accumulator; + auto& accumulator = pos.state()->*accPtr; accumulator.computed[Perspective] = true; FeatureSet::IndexList active; FeatureSet::append_active_indices(pos, active); @@ -663,12 +664,12 @@ class FeatureTransformer { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. // Fast early exit. - if (pos.state()->accumulator.computed[Perspective]) + if ((pos.state()->*accPtr).computed[Perspective]) return; auto [oldest_st, _] = try_find_computed_accumulator(pos); - if (oldest_st->accumulator.computed[Perspective]) + if ((oldest_st->*accPtr).computed[Perspective]) { // Only update current position accumulator to minimize work. StateInfo* states_to_update[2] = {pos.state(), nullptr}; @@ -685,7 +686,7 @@ class FeatureTransformer { auto [oldest_st, next] = try_find_computed_accumulator(pos); - if (oldest_st->accumulator.computed[Perspective]) + if ((oldest_st->*accPtr).computed[Perspective]) { if (next == nullptr) return; diff --git a/src/position.cpp b/src/position.cpp index 4fba3c234b7..ddc31888422 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -684,10 +684,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { ++st->pliesFromNull; // Used by NNUE - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; - auto& dp = st->dirtyPiece; - dp.dirty_num = 1; + st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = + st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false; + auto& dp = st->dirtyPiece; + dp.dirty_num = 1; Color us = sideToMove; Color them = ~us; @@ -964,15 +964,15 @@ void Position::do_null_move(StateInfo& newSt) { assert(!checkers()); assert(&newSt != st); - std::memcpy(&newSt, st, offsetof(StateInfo, accumulator)); + std::memcpy(&newSt, st, offsetof(StateInfo, accumulatorBig)); newSt.previous = st; st = &newSt; - st->dirtyPiece.dirty_num = 0; - st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() - st->accumulator.computed[WHITE] = false; - st->accumulator.computed[BLACK] = false; + st->dirtyPiece.dirty_num = 0; + st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() + st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = + st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false; if (st->epSquare != SQ_NONE) { diff --git a/src/position.h b/src/position.h index 7e0c3eefd77..34b53f4a558 100644 --- a/src/position.h +++ b/src/position.h @@ -27,6 +27,7 @@ #include "bitboard.h" #include "nnue/nnue_accumulator.h" +#include 
"nnue/nnue_architecture.h" #include "types.h" namespace Stockfish { @@ -57,8 +58,9 @@ struct StateInfo { int repetition; // Used by NNUE - Eval::NNUE::Accumulator accumulator; - DirtyPiece dirtyPiece; + Eval::NNUE::Accumulator accumulatorBig; + Eval::NNUE::Accumulator accumulatorSmall; + DirtyPiece dirtyPiece; }; diff --git a/src/uci.cpp b/src/uci.cpp index 8e93eee6dc5..be902277984 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -37,6 +37,7 @@ #include "misc.h" #include "movegen.h" #include "nnue/evaluate_nnue.h" +#include "nnue/nnue_architecture.h" #include "position.h" #include "search.h" #include "thread.h" @@ -320,7 +321,7 @@ void UCI::loop(int argc, char* argv[]) { std::string f; if (is >> std::skipws >> f) filename = f; - Eval::NNUE::save_eval(filename); + Eval::NNUE::save_eval(filename, Eval::NNUE::Big); } else if (token == "--help" || token == "help" || token == "--license" || token == "license") sync_cout diff --git a/src/ucioption.cpp b/src/ucioption.cpp index 087882f11b8..f8cbcc53077 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -82,7 +82,9 @@ void init(OptionsMap& o) { o["SyzygyProbeDepth"] << Option(1, 1, 100); o["Syzygy50MoveRule"] << Option(true); o["SyzygyProbeLimit"] << Option(7, 0, 7); - o["EvalFile"] << Option(EvalFileDefaultName, on_eval_file); + o["EvalFile"] << Option(EvalFileDefaultNameBig, on_eval_file); + // Enable this after fishtest workers support EvalFileSmall + // o["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, on_eval_file); }