From 63d036203cfdf773d4f16b4beac7b55558850c52 Mon Sep 17 00:00:00 2001 From: Dragorn421 Date: Sun, 3 Sep 2023 23:59:53 +0200 Subject: [PATCH 01/27] Turn dfsdemo into a text file reader instead of a module sound file player --- examples/dfsdemo/Makefile | 2 - examples/dfsdemo/dfsdemo.c | 126 ++++++------------ ...E_MOD_XM_IT_S3M_FILES_HERE => dirhint.txt} | 0 examples/dfsdemo/filesystem/dorian_gray.txt | 14 ++ examples/dfsdemo/filesystem/libdragon.txt | 2 + .../dfsdemo/filesystem/my_sub/another.txt | 4 + 6 files changed, 63 insertions(+), 85 deletions(-) rename examples/dfsdemo/filesystem/{PLACE_MOD_XM_IT_S3M_FILES_HERE => dirhint.txt} (100%) create mode 100644 examples/dfsdemo/filesystem/dorian_gray.txt create mode 100644 examples/dfsdemo/filesystem/libdragon.txt create mode 100644 examples/dfsdemo/filesystem/my_sub/another.txt diff --git a/examples/dfsdemo/Makefile b/examples/dfsdemo/Makefile index 903f3437e1..fa9b37f0ff 100644 --- a/examples/dfsdemo/Makefile +++ b/examples/dfsdemo/Makefile @@ -7,8 +7,6 @@ include $(N64_INST)/include/n64.mk OBJS = $(BUILD_DIR)/dfsdemo.o dfsdemo.z64: N64_ROM_TITLE = "DragonFS Demo" -dfsdemo.z64: N64_CFLAGS += -I$(N64_ROOTDIR)/include -dfsdemo.z64: N64_LDFLAGS += -L$(N64_ROOTDIR)/lib -lmikmod dfsdemo.z64: $(BUILD_DIR)/dfsdemo.dfs $(BUILD_DIR)/dfsdemo.dfs: $(wildcard filesystem/*) diff --git a/examples/dfsdemo/dfsdemo.c b/examples/dfsdemo/dfsdemo.c index d8061557a8..0549f61ef2 100644 --- a/examples/dfsdemo/dfsdemo.c +++ b/examples/dfsdemo/dfsdemo.c @@ -1,18 +1,13 @@ #include #include +#include #include #include #include #include -#include #define MAX_LIST 20 -// Hint linker on it is OK to put these on .data section -// This is ugly, there must be a better way? -MIKMODAPI extern UWORD md_mode __attribute__((section (".data"))); -MIKMODAPI extern UWORD md_mixfreq __attribute__((section (".data"))); - typedef struct { uint32_t type; @@ -216,25 +211,12 @@ void display_dir(direntry_t *list, int cursor, int page, int max, int count) int main(void) { - /* Initialize audio and video */ - audio_init(44100,2); + /* Initialize video */ console_init(); /* Initialize key detection */ controller_init(); - MikMod_RegisterAllDrivers(); - MikMod_RegisterAllLoaders(); - - md_mode |= DMODE_16BITS; - md_mode |= DMODE_SOFT_MUSIC; - md_mode |= DMODE_SOFT_SNDFX; - //md_mode |= DMODE_STEREO; - - md_mixfreq = audio_get_frequency(); - - MikMod_Init(""); - if(dfs_init( DFS_DEFAULT_LOCATION ) != DFS_ESUCCESS) { printf("Filesystem failed to start!\n"); @@ -274,80 +256,58 @@ int main(void) if(keys.c[0].C_right && list[cursor].type == DT_REG) { - /* Module playing loop */ - MODULE *module = NULL; - /* Concatenate to make file */ char path[512]; strcpy( path, dir ); strcat( path, list[cursor].filename ); - module = Player_Load(path, 256, 0); - - /* Ensure that first part of module doesn't get cut off */ - audio_write_silence(); - audio_write_silence(); - - if(module) - { - char c = '-'; - int sw = 0; - - Player_Start(module); - - while(1) - { - if(sw == 5) - { - console_clear(); - display_dir(list, cursor, page, MAX_LIST, count); - - sw = 0; - switch(c) - { - case '-': - c = '\\'; - break; - case '\\': - c = '|'; - break; - case '|': - c = '/'; - break; - case '/': - c = '-'; - break; + FILE* f; + f = fopen(path, "r"); + if (f == NULL) { + printf("Failed to open %s\n", path); + } else { + printf("Hold A to scroll\n"); + char buf[1024]; + size_t nread; + while ((nread = fread(buf, sizeof(buf[0]), sizeof(buf) - 1, f)) != 0) { + buf[nread] = '\0'; + char* s = buf; + while (s != NULL) { + 
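/* Split the decoded chunk into lines: strchr() finds the next newline,
+                        so the demo can print one line at a time, render it,
+                        and wait for input before continuing. */
+                     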
char* s_next_line = strchr(s, '\n'); + if (s_next_line == NULL) { + printf("%s", s); + s = NULL; + } else { + printf("%.*s\n", s_next_line - s, s); + console_render(); + s = s_next_line + 1; + + wait_ms(100); + controller_scan(); + while (!get_keys_pressed().c[0].A) { + wait_ms(10); + controller_scan(); + } } - - printf("\n\n\n%c Playing module", c); - console_render(); - } - else - { - sw++; } + } + if (ferror(f)) { + printf("Error while reading %s\n", path); + } - MikMod_Update(); - - controller_scan(); - struct controller_data keys = get_keys_down(); - - if(keys.c[0].C_left || !Player_Active()) - { - /* End playback */ - audio_write_silence(); - audio_write_silence(); - audio_write_silence(); - audio_write_silence(); + fclose(f); + } - break; - } - } - - Player_Stop(); - Player_Free(module); + printf("Press B to quit\n"); + console_render(); + controller_scan(); + while (!get_keys_down().c[0].B) { + wait_ms(10); + controller_scan(); } + + continue; } if(keys.c[0].L) diff --git a/examples/dfsdemo/filesystem/PLACE_MOD_XM_IT_S3M_FILES_HERE b/examples/dfsdemo/filesystem/dirhint.txt similarity index 100% rename from examples/dfsdemo/filesystem/PLACE_MOD_XM_IT_S3M_FILES_HERE rename to examples/dfsdemo/filesystem/dirhint.txt diff --git a/examples/dfsdemo/filesystem/dorian_gray.txt b/examples/dfsdemo/filesystem/dorian_gray.txt new file mode 100644 index 0000000000..7436ecbedb --- /dev/null +++ b/examples/dfsdemo/filesystem/dorian_gray.txt @@ -0,0 +1,14 @@ +THE PICTURE OF DORIAN GRAY. + + +CHAPTER I. + +THE studio was filled with the rich odour of roses, and when the light summer wind stirred amidst the trees of the garden there came through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn. + +From the corner of the divan of Persian saddlebags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able to bear the burden of a beauty so flame-like as theirs; and now and then the fantastic shadows of birds in flight flitted across the long tussore-silk curtains that were stretched in front of the huge window, producing a kind of momentary Japanese effect, and making him think of those pallid jade-faced painters of Tokio who, through the medium of an art that is necessarily immobile, seek to convey the sense of swiftness and motion. The sullen murmur of the bees shouldering their way through the long unmown grass, or circling with monotonous insistence round the dusty gilt horns of the straggling woodbine, seemed to make the stillness more oppressive. The dim roar of London was like the bourdon note of a distant organ. + +In the centre of the room, clamped to an upright easel, stood the full-length portrait of a young man of extraordinary personal beauty, and in front of it, some little distance away, was sitting the artist himself, Basil Hallward, whose sudden disappearance some years ago caused, at the time, such public excitement, and gave rise to so many strange conjectures. + +As the painter looked at the gracious and comely form he had so skilfully mirrored in his art, a smile of pleasure passed across his face, and seemed about to linger there. But he suddenly started up, and, closing his eyes, placed his fingers upon the lids, as though he sought to imprison within his brain some curious dream from which he feared he might awake. 
+
+"It is your best work, Basil, the best thing you have ever done," said Lord Henry, languidly. "You must certainly send it next year to the Grosvenor. The Academy is too large and too vulgar.
diff --git a/examples/dfsdemo/filesystem/libdragon.txt b/examples/dfsdemo/filesystem/libdragon.txt
new file mode 100644
index 0000000000..0d7a222d02
--- /dev/null
+++ b/examples/dfsdemo/filesystem/libdragon.txt
@@ -0,0 +1,2 @@
+Libdragon is an open-source SDK for Nintendo 64.
+It aims for a complete N64 programming experience while providing programmers with a modern approach to programming and debugging.
diff --git a/examples/dfsdemo/filesystem/my_sub/another.txt b/examples/dfsdemo/filesystem/my_sub/another.txt
new file mode 100644
index 0000000000..6edcb5b0be
--- /dev/null
+++ b/examples/dfsdemo/filesystem/my_sub/another.txt
@@ -0,0 +1,4 @@
+Hello
+on
+four
+lines!

From d74706b5962ecbbf4c9faa00075a2843eeebcabe Mon Sep 17 00:00:00 2001
From: Dragorn421
Date: Mon, 4 Sep 2023 17:25:40 +0200
Subject: [PATCH 02/27] Move mikmod build in separate script (#432)

Co-authored-by: Giovanni Bajo
---
 build.sh              | 16 ----------------
 tools/build-mikmod.sh | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 16 deletions(-)
 create mode 100755 tools/build-mikmod.sh

diff --git a/build.sh b/build.sh
index 52be801dcf..9f82d51058 100755
--- a/build.sh
+++ b/build.sh
@@ -31,26 +31,10 @@ makeWithParams(){
 JOBS="${JOBS:-$(getconf _NPROCESSORS_ONLN)}"
 JOBS="${JOBS:-1}" # If getconf returned nothing, default to 1
 
-# Specify where to get libmikmod from and where to put it
-LIBMIKMOD_REPO=https://github.com/networkfusion/libmikmod.git
-LIBMIKMOD_COMMIT=738b1e8b11b470360b1b919680d1d88429d9d174
-LIBMIKMOD_DIR=/tmp/libmikmod
-
 # Clean, build, and install libdragon + tools
 makeWithParams clobber
 makeWithParams install tools-install
 
-# Remove the cloned libmikmod repo if it already exists
-[ -d "$LIBMIKMOD_DIR" ] && rm -Rf $LIBMIKMOD_DIR
-# Clone, compile, and install libmikmod
-git clone $LIBMIKMOD_REPO $LIBMIKMOD_DIR
-pushd $LIBMIKMOD_DIR/n64
-git checkout $LIBMIKMOD_COMMIT
-makeWithParams
-makeWithParams install
-popd
-rm -Rf $LIBMIKMOD_DIR
-
 # Build examples and tests - libdragon must be already installed at this point,
 # so first clobber the build to make sure that everything works against the
 # installed version rather than using local artifacts.
diff --git a/tools/build-mikmod.sh b/tools/build-mikmod.sh
new file mode 100755
index 0000000000..7758f21933
--- /dev/null
+++ b/tools/build-mikmod.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+# This script downloads and builds MikMod, a music library for playing
+# different module files. Notice that, albeit ported to N64, it is a
+# CPU-only port, so it will use lots of CPU time to play the music.
+# It is basically kept here for backward compatibility with old code
+# using it. New code should default to the new mixer library
+# with its XM64/WAV64 support for music files.
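+#
+# Note: the script expects N64_INST to point at your toolchain prefix;
+# the variable is forwarded through sudo for the install step below.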
+ +# Bash strict mode http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +makeWithParams(){ + make -j"${JOBS}" "$@" +} + +sudoMakeWithParams(){ + make -j"${JOBS}" "$@" || \ + sudo env N64_INST="$N64_INST" \ + make -j"${JOBS}" "$@" +} + +# Limit the number of make jobs to the number of CPUs +JOBS="${JOBS:-$(getconf _NPROCESSORS_ONLN)}" +JOBS="${JOBS:-1}" # If getconf returned nothing, default to 1 + +# Specify where to get libmikmod from and where to put it +LIBMIKMOD_REPO=https://github.com/networkfusion/libmikmod.git +LIBMIKMOD_COMMIT=738b1e8b11b470360b1b919680d1d88429d9d174 +LIBMIKMOD_DIR=/tmp/libmikmod + +# Remove the cloned libmikmod repo if it already exists +[ -d "$LIBMIKMOD_DIR" ] && rm -Rf $LIBMIKMOD_DIR +# Clone, compile, and install libmikmod +git clone $LIBMIKMOD_REPO $LIBMIKMOD_DIR +pushd $LIBMIKMOD_DIR/n64 +git checkout $LIBMIKMOD_COMMIT +makeWithParams +sudoMakeWithParams install +popd +rm -Rf $LIBMIKMOD_DIR From 09ac283e2d8cfff099274eabef9d37c1ba221a28 Mon Sep 17 00:00:00 2001 From: Giovanni Bajo Date: Sun, 24 Sep 2023 14:17:07 +0200 Subject: [PATCH 03/27] dma: add PI registers definitions --- include/dma.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/dma.h b/include/dma.h index cc23295f55..5108043ccc 100644 --- a/include/dma.h +++ b/include/dma.h @@ -12,6 +12,12 @@ extern "C" { #endif +#define PI_DRAM_ADDR ((volatile uint32_t*)0xA4600000) ///< PI DMA: DRAM address register +#define PI_CART_ADDR ((volatile uint32_t*)0xA4600004) ///< PI DMA: cartridge address register +#define PI_RD_LEN ((volatile uint32_t*)0xA4600008) ///< PI DMA: read length register +#define PI_WR_LEN ((volatile uint32_t*)0xA460000C) ///< PI DMA: write length register +#define PI_STATUS ((volatile uint32_t*)0xA4600010) ///< PI: status register + void dma_write_raw_async(const void *ram_address, unsigned long pi_address, unsigned long len); void dma_write(const void * ram_address, unsigned long pi_address, unsigned long len); From d35ad7b55e5769245a34d9526a7fe17056cc041e Mon Sep 17 00:00:00 2001 From: Giovanni Bajo Date: Mon, 25 Sep 2023 01:31:46 +0200 Subject: [PATCH 04/27] testrom: avoid redefining rand() symbol --- tests/testrom.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/testrom.c b/tests/testrom.c index 96ddb75bce..d00af90106 100644 --- a/tests/testrom.c +++ b/tests/testrom.c @@ -24,6 +24,7 @@ typedef struct { typedef void (*TestFunc)(TestContext *ctx); +#define ABS(n) ((n) < 0 ? -(n) : (n)) #define PPCAT2(n,x) n ## x #define PPCAT(n,x) PPCAT2(n,x) @@ -57,7 +58,7 @@ typedef void (*TestFunc)(TestContext *ctx); // Fair and fast random generation (using xorshift32, with explicit seed) static uint32_t rand_state = 1; -static uint32_t rand(void) { +static uint32_t myrand(void) { uint32_t x = rand_state; x ^= x << 13; x ^= x >> 7; @@ -71,8 +72,8 @@ static uint32_t rand(void) { // RANDN(n): generate a random number from 0 to n-1 #define RANDN(n) ({ \ __builtin_constant_p((n)) ? \ - (rand()%(n)) : \ - (uint32_t)(((uint64_t)rand() * (n)) >> 32); \ + (myrand()%(n)) : \ + (uint32_t)(((uint64_t)myrand() * (n)) >> 32); \ }) // ASSERT(cond, msg): fail the test if the condition is false (with log message) From 98a19d84a0fd5313ca9c73bac662244dcc841548 Mon Sep 17 00:00:00 2001 From: Giovanni Bajo Date: Sun, 24 Sep 2023 14:18:07 +0200 Subject: [PATCH 05/27] Add new asset library This library handles loading asset files from filesystems. 
It allows either one-time loads or incremental loads via a FILE* interface,
and it handles transparent decompression in both cases.
It implements two compressed formats: a very fast one (based on LZ4),
and a slower one (based on LZH5) that compresses files more.
---
 Makefile                        |    6 +-
 include/asset.h                 |   88 +++
 include/libdragon.h             |    1 +
 src/asset.c                     |  282 ++++++++
 src/asset_internal.h            |   23 +
 src/compress/lz4_dec.c          |  296 ++++++++
 src/compress/lz4_dec_internal.h |   68 ++
 src/compress/lzh5.c             | 1164 +++++++++++++++++++++++++++++++
 src/compress/lzh5_internal.h    |   44 ++
 src/compress/ringbuf.c          |   64 ++
 src/compress/ringbuf_internal.h |   56 ++
 11 files changed, 2090 insertions(+), 2 deletions(-)
 create mode 100644 include/asset.h
 create mode 100644 src/asset.c
 create mode 100644 src/asset_internal.h
 create mode 100644 src/compress/lz4_dec.c
 create mode 100644 src/compress/lz4_dec_internal.h
 create mode 100644 src/compress/lzh5.c
 create mode 100644 src/compress/lzh5_internal.h
 create mode 100644 src/compress/ringbuf.c
 create mode 100644 src/compress/ringbuf_internal.h

diff --git a/Makefile b/Makefile
index faedeed54c..5737585ce0 100755
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,9 @@ libdragon.a: $(BUILD_DIR)/n64sys.o $(BUILD_DIR)/interrupt.o $(BUILD_DIR)/backtra
 	$(BUILD_DIR)/debug.o $(BUILD_DIR)/debugcpp.o $(BUILD_DIR)/usb.o $(BUILD_DIR)/libcart/cart.o $(BUILD_DIR)/fatfs/ff.o \
 	$(BUILD_DIR)/fatfs/ffunicode.o $(BUILD_DIR)/rompak.o $(BUILD_DIR)/dragonfs.o \
 	$(BUILD_DIR)/audio.o $(BUILD_DIR)/display.o $(BUILD_DIR)/surface.o \
-	$(BUILD_DIR)/console.o $(BUILD_DIR)/joybus.o \
-	$(BUILD_DIR)/controller.o $(BUILD_DIR)/rtc.o \
+	$(BUILD_DIR)/console.o $(BUILD_DIR)/asset.o \
+	$(BUILD_DIR)/compress/lzh5.o $(BUILD_DIR)/compress/lz4_dec.o $(BUILD_DIR)/compress/ringbuf.o \
+	$(BUILD_DIR)/joybus.o $(BUILD_DIR)/controller.o $(BUILD_DIR)/rtc.o \
 	$(BUILD_DIR)/eeprom.o $(BUILD_DIR)/eepromfs.o $(BUILD_DIR)/mempak.o \
 	$(BUILD_DIR)/tpak.o $(BUILD_DIR)/graphics.o $(BUILD_DIR)/rdp.o \
 	$(BUILD_DIR)/rsp.o $(BUILD_DIR)/rsp_crash.o \
@@ -92,6 +93,7 @@ install: install-mk libdragon
 	install -Cv -m 0644 include/interrupt.h $(INSTALLDIR)/mips64-elf/include/interrupt.h
 	install -Cv -m 0644 include/dma.h $(INSTALLDIR)/mips64-elf/include/dma.h
 	install -Cv -m 0644 include/dragonfs.h $(INSTALLDIR)/mips64-elf/include/dragonfs.h
+	install -Cv -m 0644 include/asset.h $(INSTALLDIR)/mips64-elf/include/asset.h
 	install -Cv -m 0644 include/audio.h $(INSTALLDIR)/mips64-elf/include/audio.h
 	install -Cv -m 0644 include/surface.h $(INSTALLDIR)/mips64-elf/include/surface.h
 	install -Cv -m 0644 include/display.h $(INSTALLDIR)/mips64-elf/include/display.h
diff --git a/include/asset.h b/include/asset.h
new file mode 100644
index 0000000000..45a4bcf880
--- /dev/null
+++ b/include/asset.h
@@ -0,0 +1,88 @@
+/**
+ * @file asset.h
+ * @brief Asset Subsystem
+ * @ingroup asset
+ */
+#ifndef __LIBDRAGON_ASSET_H
+#define __LIBDRAGON_ASSET_H
+
+/**
+ * @defgroup asset Asset Subsystem
+ * @ingroup libdragon
+ * @brief Interfaces for loading assets from ROM or other supports
+ *
+ * The asset subsystem is in charge of loading assets. Typically, assets
+ * will be loaded from ROM, but other options might be possible (like SD
+ * cards).
+ *
+ * Asset filenames are always prefixed with a filesystem identifier which
+ * has a syntax similar to a URL. For instance, to load a file from ROM
+ * through the DragonFS filesystem, use a filename like "rom:/myfile.txt".
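+ *
+ * For example, a minimal one-shot load might look like this (the file
+ * name is hypothetical):
+ *
+ * @code{.c}
+ * int sz;
+ * char *text = asset_load("rom:/myfile.txt", &sz);
+ * // ... use the sz bytes pointed to by text ...
+ * free(text);
+ * @endcode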
+ * + * While it is possible to simply open asset files using fopen, which supports + * the filesystem prefix as well, the asset subsystem provides a few helpers + * around asset compression. + * + * Assets can be optionally compressed using the mkasset tool. Asset compression + * is done on a per-file basis (similar to how "gzip" works), and decompression + * is transparent to the user. The asset subsystem will automatically detect + * a compressed file and decompress it during loading. + * + * The main functions for loading assets are #asset_load and #asset_fopen. + * #asset_load loads the entire file into memory in one go, and it is useful + * for small files or in general files that has to fully keep in RAM as-is. + * The asset is transparently decompressed if needed. + * + * Some files might require parsing during loading, and in that case, + * #asset_fopen is provided. It returns a FILE* so that any kind of file + * operation can be performed on it, with transparent decompression. + * Since it is not possible to seek in a compressed file, the FILE* returned + * by #asset_fopen will assert on seek, even if the file is not compressed + * (so that the user code will be ready for adding compression at any time). + * + * If you know that the file will never be compressed and you absolutely need + * to freely seek, simply use the standard fopen() function. + * + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Load an asset file (possibly uncompressing it) + * + * This function loads a file from a file system (eg: from ROM or SD). + * If the file was compressed using the mkasset tool, it will be + * automatically uncompressed. + * + * @param fn Filename to load (including filesystem prefix) + * @param sz Pointer to an integer where the size of the file will be stored + * @return void* Pointer to the loaded file (must be freed with free() when done) + */ +void *asset_load(const char *fn, int *sz); + +/** + * @brief Open an asset file for reading (with transparent decompression) + * + * This function opens a file from a file system (eg: from ROM or SD). + * If the file was compressed using the mkasset tool, it will be + * automatically uncompressed as it is being read. + * + * Note that since the file can be optionally compressed, the returned + * FILE* cannot be rewinded. It must be read sequentially, or seeked forward. + * Seeking backward is not supported. 
+ * + * @param fn Filename to open (including filesystem prefix) + * @param sz If not NULL, pointer to an integer where the size of the file will be stored + * @return FILE* FILE pointer to use with standard C functions (fread, fclose) + */ +FILE *asset_fopen(const char *fn, int *sz); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/libdragon.h b/include/libdragon.h index 74ace1cfb5..5995bb3d83 100755 --- a/include/libdragon.h +++ b/include/libdragon.h @@ -37,6 +37,7 @@ #include "display.h" #include "dma.h" #include "dragonfs.h" +#include "asset.h" #include "eeprom.h" #include "eepromfs.h" #include "graphics.h" diff --git a/src/asset.c b/src/asset.c new file mode 100644 index 0000000000..17e1aa9280 --- /dev/null +++ b/src/asset.c @@ -0,0 +1,282 @@ +#include "asset.h" +#include "asset_internal.h" +#include "compress/lzh5_internal.h" +#include "compress/lz4_dec_internal.h" +#include +#include +#include +#include + +#ifdef N64 +#include +#include "debug.h" +#include "n64sys.h" +#include "dma.h" +#include "dragonfs.h" +#else +#include +#include +#define memalign(a, b) malloc(b) +#define assertf(x, ...) assert(x) +#endif + +FILE *must_fopen(const char *fn) +{ + FILE *f = fopen(fn, "rb"); + if (!f) { + // File not found. + int errnum = errno; + if (errnum == EINVAL) { + if (!strstr(fn, ":/")) { + // A common mistake is to forget the filesystem prefix. + // Try to give a hint if that's the case. + assertf(f, "File not found: %s\n" + "Did you forget the filesystem prefix? (e.g. \"rom:/\")\n", fn); + return NULL; + } else if (strstr(fn, "rom:/")) { + // Another common mistake is to forget to initialize the rom filesystem. + // Suggest that if the filesystem prefix is "rom:/". + assertf(f, "File not found: %s\n" + "Did you forget to call dfs_init(), or did it return an error?\n", fn); + return NULL; + } + } + assertf(f, "error opening file %s: %s\n", fn, strerror(errnum)); + } + return f; +} + +void *asset_load(const char *fn, int *sz) +{ + uint8_t *s; int size; + FILE *f = must_fopen(fn); + + // Check if file is compressed + asset_header_t header; + fread(&header, 1, sizeof(asset_header_t), f); + if (!memcmp(header.magic, ASSET_MAGIC, 3)) { + if (header.version != '2') { + assertf(0, "unsupported asset version: %c\nMake sure to rebuild libdragon tools and your assets", header.version); + return NULL; + } + + #ifndef N64 + header.algo = __builtin_bswap16(header.algo); + header.flags = __builtin_bswap16(header.flags); + header.cmp_size = __builtin_bswap32(header.cmp_size); + header.orig_size = __builtin_bswap32(header.orig_size); + #endif + + switch (header.algo) { + case 2: { + size = header.orig_size; + s = memalign(16, size); + assertf(s, "asset_load: out of memory"); + int n = decompress_lz5h_full(f, s, size); (void)n; + assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size); + } break; + case 1: { + size = header.orig_size; + int bufsize = size + LZ4_DECOMPRESS_INPLACE_MARGIN(header.cmp_size); + int cmp_offset = bufsize - header.cmp_size; + if (cmp_offset & 1) { + cmp_offset++; + bufsize++; + } + if (bufsize & 15) { + // In case we need to call invalidate (see below), we need an aligned buffer + bufsize += 16 - (bufsize & 15); + } + + s = memalign(16, bufsize); + assertf(s, "asset_load: out of memory"); + int n; + + #ifdef N64 + if (strncmp(fn, "rom:/", 5) == 0) { + // Invalid the portion of the buffer where we are going to load + // the compressed data. 
This is needed in case the buffer returned + // by memalign happens to be in cached already. + int align_cmp_offset = cmp_offset & ~15; + data_cache_hit_invalidate(s+align_cmp_offset, bufsize-align_cmp_offset); + + // Loading from ROM. This is a common enough situation that we want to optimize it. + // Start an asynchronous DMA transfer, so that we can start decompressing as the + // data flows in. + uint32_t addr = dfs_rom_addr(fn+5) & 0x1FFFFFFF; + dma_read_async(s+cmp_offset, addr+16, header.cmp_size); + + // Run the decompression racing with the DMA. + n = decompress_lz4_full_mem(s+cmp_offset, header.cmp_size, s, size, true); (void)n; + #else + if (false) { + #endif + } else { + // Standard loading via stdio. We have to wait for the whole file to be read. + fread(s+cmp_offset, 1, header.cmp_size, f); + + // Run the decompression. + n = decompress_lz4_full_mem(s+cmp_offset, header.cmp_size, s, size, false); (void)n; + } + assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size); + void *ptr = realloc(s, size); (void)ptr; + assertf(s == ptr, "asset: realloc moved the buffer"); // guaranteed by newlib + } break; + default: + assertf(0, "asset: unsupported compression algorithm: %d", header.algo); + return NULL; + } + } else { + // Allocate a buffer big enough to hold the file. + // We force a 16-byte alignment for the buffer so that it's cacheline aligned. + // This might or might not be useful, but if a binary file is laid out so that it + // matters, at least we guarantee that. + fseek(f, 0, SEEK_END); + size = ftell(f); + s = memalign(16, size); + + fseek(f, 0, SEEK_SET); + fread(s, 1, size, f); + } + + fclose(f); + if (sz) *sz = size; + return s; +} + +#ifdef N64 + +typedef struct { + FILE *fp; + bool seeked; +} cookie_none_t; + +static fpos_t seekfn_none(void *c, fpos_t pos, int whence) +{ + cookie_none_t *cookie = c; + + // SEEK_CUR with pos=0 is used as ftell() + if (whence == SEEK_CUR && pos == 0) + return ftell(cookie->fp); + + cookie->seeked = true; + return -1; +} + +static int readfn_none(void *c, char *buf, int sz) +{ + cookie_none_t *cookie = c; + assertf(!cookie->seeked, "Cannot seek in file opened via asset_fopen (it might be compressed)"); + return fread(buf, 1, sz, cookie->fp); +} + +static int closefn_none(void *c) +{ + cookie_none_t *cookie = c; + fclose(cookie->fp); cookie->fp = NULL; + free(cookie); + return 0; +} + +typedef struct { + FILE *fp; + int pos; + bool seeked; + ssize_t (*read)(void *state, void *buf, size_t len); + uint8_t state[] alignas(8); +} cookie_lha_t; + +static int readfn_lha(void *c, char *buf, int sz) +{ + cookie_lha_t *cookie = (cookie_lha_t*)c; + assertf(!cookie->seeked, "Cannot seek in file opened via asset_fopen (it might be compressed)"); + int n = cookie->read(cookie->state, (uint8_t*)buf, sz); + cookie->pos += n; + return n; +} + +static fpos_t seekfn_lha(void *c, fpos_t pos, int whence) +{ + cookie_lha_t *cookie = (cookie_lha_t*)c; + + // SEEK_CUR with pos=0 is used as ftell() + if (whence == SEEK_CUR && pos == 0) + return cookie->pos; + + // We should really have an assert here but unfortunately newlib's fclose + // also issue a fseek (backward...) as part of a fflush. So we delay the actual + // assert until the next read (if any), which is better than nothing. 
+ cookie->seeked = true; + return -1; +} + +static int closefn_lha(void *c) +{ + cookie_lha_t *cookie = (cookie_lha_t*)c; + fclose(cookie->fp); cookie->fp = NULL; + free(cookie); + return 0; +} + +FILE *asset_fopen(const char *fn, int *sz) +{ + FILE *f = must_fopen(fn); + + // We use buffering on the outer file created by funopen, so we don't + // actually need buffering on the underlying one. + setbuf(f, NULL); + + // Check if file is compressed + asset_header_t header; + fread(&header, 1, sizeof(asset_header_t), f); + if (!memcmp(header.magic, ASSET_MAGIC, 3)) { + if (header.version != '2') { + assertf(0, "unsupported asset version: %c\nMake sure to rebuild libdragon tools and your assets", header.version); + return NULL; + } + + if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) { // for mkasset running on PC + header.algo = __builtin_bswap16(header.algo); + header.flags = __builtin_bswap16(header.flags); + header.cmp_size = __builtin_bswap32(header.cmp_size); + header.orig_size = __builtin_bswap32(header.orig_size); + } + + cookie_lha_t *cookie; + switch (header.algo) { + case 1: + cookie = malloc(sizeof(cookie_lha_t) + DECOMPRESS_LZ4_STATE_SIZE); + decompress_lz4_init(cookie->state, f); + cookie->read = decompress_lz4_read; + break; + case 2: + cookie = malloc(sizeof(cookie_lha_t) + DECOMPRESS_LZ5H_STATE_SIZE); + decompress_lz5h_init(cookie->state, f); + cookie->read = decompress_lz5h_read; + break; + default: + assertf(0, "unsupported compression algorithm: %d", header.algo); + return NULL; + } + + cookie->fp = f; + cookie->pos = 0; + cookie->seeked = false; + if (sz) *sz = header.orig_size; + return funopen(cookie, readfn_lha, NULL, seekfn_lha, closefn_lha); + } + + // Not compressed. Return a wrapped FILE* without the seeking capability, + // so that it matches the behavior of the compressed file. 
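+    // (This also keeps calling code honest: it cannot grow a hidden
+    // dependency on seeking that would break the day the asset gets
+    // compressed.)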
+ if (sz) { + fseek(f, 0, SEEK_END); + *sz = ftell(f); + } + fseek(f, 0, SEEK_SET); + cookie_none_t *cookie = malloc(sizeof(cookie_none_t)); + cookie->fp = f; + cookie->seeked = false; + return funopen(cookie, readfn_none, NULL, seekfn_none, closefn_none); +} + +#endif /* N64 */ diff --git a/src/asset_internal.h b/src/asset_internal.h new file mode 100644 index 0000000000..0b662251c5 --- /dev/null +++ b/src/asset_internal.h @@ -0,0 +1,23 @@ +#ifndef __LIBDRAGON_ASSET_INTERNAL_H +#define __LIBDRAGON_ASSET_INTERNAL_H + +#include +#include + +#define ASSET_MAGIC "DCA" ///< Magic compressed asset header + +/** @brief Header of a compressed asset */ +typedef struct { + char magic[3]; ///< Magic header + uint8_t version; ///< Version of the asset header + uint16_t algo; ///< Compression algorithm + uint16_t flags; ///< Flags (unused for now) + uint32_t cmp_size; ///< Compressed size in bytes + uint32_t orig_size; ///< Original size in bytes +} asset_header_t; + +_Static_assert(sizeof(asset_header_t) == 16, "invalid sizeof(asset_header_t)"); + +FILE *must_fopen(const char *fn); + +#endif diff --git a/src/compress/lz4_dec.c b/src/compress/lz4_dec.c new file mode 100644 index 0000000000..24b1b217cc --- /dev/null +++ b/src/compress/lz4_dec.c @@ -0,0 +1,296 @@ +#include +#include +#include +#include +#include +#include "lz4_dec_internal.h" +#include "ringbuf_internal.h" +#include "../utils.h" + +#define MIN_MATCH_SIZE 4 +#define MIN_OFFSET 1 +#define MAX_OFFSET 0xffff +#define HISTORY_SIZE 65536 +#define LITERALS_RUN_LEN 15 +#define MATCH_RUN_LEN 15 + +#define LZ4ULTRA_HEADER_SIZE 4 +#define LZ4ULTRA_MAX_HEADER_SIZE 7 +#define LZ4ULTRA_FRAME_SIZE 4 + +#define LZ4ULTRA_ENCODE_ERR (-1) + +#define LZ4ULTRA_DECODE_OK 0 +#define LZ4ULTRA_DECODE_ERR_FORMAT (-1) +#define LZ4ULTRA_DECODE_ERR_SUM (-2) + +/* Compression flags */ +#define LZ4ULTRA_FLAG_FAVOR_RATIO (1<<0) /**< 1 to compress with the best ratio, 0 to trade some compression ratio for extra decompression speed */ +#define LZ4ULTRA_FLAG_RAW_BLOCK (1<<1) /**< 1 to emit raw block */ +#define LZ4ULTRA_FLAG_INDEP_BLOCKS (1<<2) /**< 1 if blocks are independent, 0 if using inter-block back references */ +#define LZ4ULTRA_FLAG_LEGACY_FRAMES (1<<3) /**< 1 if using the legacy frames format, 0 if using the modern lz4 frame format */ + +#if defined(__GNUC__) || defined(__clang__) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +#define LZ4ULTRA_DECOMPRESSOR_BUILD_LEN(__len) { \ + unsigned int byte; \ + do { \ + if (unlikely(pInBlock >= pInBlockEnd)) return -1; \ + if (dma_race) wait_dma(pInBlock+1); \ + byte = (unsigned int)*pInBlock++; \ + __len += byte; \ + } while (unlikely(byte == 255)); \ +} + +#ifdef N64 +#include "dma.h" +#endif + +static void wait_dma(const void *pIn) { + #ifdef N64 + static void *ptr; static bool finished = false; + if (pIn == NULL) { + finished = false; + ptr = NULL; + return; + } + if (finished) return; + while (ptr < pIn) { + // Check if DMA is finished + if (!(*PI_STATUS & 1)) { + finished = true; + return; + } + // Read current DMA position. Ignore partial cachelines as they + // would create coherency problems if accessed by the CPU. 
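+        // (*PI_DRAM_ADDR holds the physical RDRAM address the PI is
+        // currently writing; masking with ~0xF rounds it down to a
+        // 16-byte cacheline boundary, and OR-ing 0x80000000 maps it
+        // into KSEG0 so it can be compared with the cached CPU pointer.)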
+ ptr = (void*)((*PI_DRAM_ADDR & ~0xF) | 0x80000000); + } + #endif +} + +/** + * Decompress one data block + * + * @param pInBlock pointer to compressed data + * @param nBlockSize size of compressed data, in bytes + * @param pOutData pointer to output decompression buffer (previously decompressed bytes + room for decompressing this block) + * @param nOutDataOffset starting index of where to store decompressed bytes in output buffer (and size of previously decompressed bytes) + * @param nBlockMaxSize total size of output decompression buffer, in bytes + * + * @return size of decompressed data in bytes, or -1 for error + */ +int decompress_lz4_full_mem(const unsigned char *pInBlock, int nBlockSize, unsigned char *pOutData, int nBlockMaxSize, bool dma_race) { + const unsigned char *pInBlockEnd = pInBlock + nBlockSize; + unsigned char *pCurOutData = pOutData; + const unsigned char *pOutDataEnd = pCurOutData + nBlockMaxSize; + const unsigned char *pOutDataFastEnd = pOutDataEnd - 18; + + if (dma_race) wait_dma(NULL); + while (likely(pInBlock < pInBlockEnd)) { + if (dma_race) wait_dma(pInBlock+1); + const unsigned int token = (unsigned int)*pInBlock++; + unsigned int nLiterals = ((token & 0xf0) >> 4); + + if (nLiterals != LITERALS_RUN_LEN && pCurOutData <= pOutDataFastEnd && (pInBlock + 16) <= pInBlockEnd) { + if (dma_race) wait_dma(pInBlock+16); + memcpy(pCurOutData, pInBlock, 16); + } + else { + if (likely(nLiterals == LITERALS_RUN_LEN)) + LZ4ULTRA_DECOMPRESSOR_BUILD_LEN(nLiterals); + + if (unlikely((pInBlock + nLiterals) > pInBlockEnd)) return -1; + if (unlikely((pCurOutData + nLiterals) > pOutDataEnd)) return -1; + + if (dma_race) wait_dma(pInBlock+nLiterals); + memcpy(pCurOutData, pInBlock, nLiterals); + } + + pInBlock += nLiterals; + pCurOutData += nLiterals; + + if (likely((pInBlock + 2) <= pInBlockEnd)) { + unsigned int nMatchOffset; + + if (dma_race) wait_dma(pInBlock+2); + nMatchOffset = (unsigned int)*pInBlock++; + nMatchOffset |= ((unsigned int)*pInBlock++) << 8; + + unsigned int nMatchLen = (token & 0x0f); + + nMatchLen += MIN_MATCH_SIZE; + if (nMatchLen != (MATCH_RUN_LEN + MIN_MATCH_SIZE) && nMatchOffset >= 8 && pCurOutData <= pOutDataFastEnd) { + const unsigned char *pSrc = pCurOutData - nMatchOffset; + + if (unlikely(pSrc < pOutData)) return -1; + + memcpy(pCurOutData, pSrc, 8); + memcpy(pCurOutData + 8, pSrc + 8, 8); + memcpy(pCurOutData + 16, pSrc + 16, 2); + + pCurOutData += nMatchLen; + } + else { + if (likely(nMatchLen == (MATCH_RUN_LEN + MIN_MATCH_SIZE))) + LZ4ULTRA_DECOMPRESSOR_BUILD_LEN(nMatchLen); + + if (unlikely((pCurOutData + nMatchLen) > pOutDataEnd)) return -1; + + const unsigned char *pSrc = pCurOutData - nMatchOffset; + if (unlikely(pSrc < pOutData)) return -1; + + if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) <= pOutDataFastEnd) { + const unsigned char *pCopySrc = pSrc; + unsigned char *pCopyDst = pCurOutData; + const unsigned char *pCopyEndDst = pCurOutData + nMatchLen; + + do { + memcpy(pCopyDst, pCopySrc, 16); + pCopySrc += 16; + pCopyDst += 16; + } while (pCopyDst < pCopyEndDst); + + pCurOutData += nMatchLen; + } + else { + while (nMatchLen--) { + *pCurOutData++ = *pSrc++; + } + } + } + } + } + + return (int)(pCurOutData - pOutData); +} + +/** + * @brief Fast-access state of the LZ4 algorithm (streaming version). + * + * See the LZ4 block format for a better understanding of the fields. 
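*
+ * (Each LZ4 sequence begins with a token byte: the high nibble is the
+ * literal count and the low nibble the match length, biased by
+ * MIN_MATCH_SIZE; both saturate at 15 and are then extended with extra
+ * length bytes.)
+ 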
+ */ +typedef struct lz4dec_faststate_s { + uint8_t token; ///< Current token + int lit_len; ///< Number of literals to copy + int match_len; ///< Number of bytes to copy from the ring buffer + int match_off; ///< Offset in the ring buffer to copy from + int fsm_state; ///< Current state of the streaming state machine +} lz4dec_faststate_t; + +/** + * @brief State of the LZ4 algorithm (streaming version). + */ +typedef struct lz4dec_state_s { + uint8_t buf[128] __attribute__((aligned(8))); ///< File buffer + FILE *fp; ///< File pointer to read from + int buf_idx; ///< Current index in the file buffer + int buf_size; ///< Size of the file buffer + bool eof; ///< True if we reached the end of the file + lz4dec_faststate_t st; ///< Fast-access state + decompress_ringbuf_t ringbuf; ///< Ring buffer +} lz4dec_state_t; + +#ifdef N64 +_Static_assert(sizeof(lz4dec_state_t) == DECOMPRESS_LZ4_STATE_SIZE, "decompress_lz4_state_t size mismatch"); +#endif + +static void lz4_refill(lz4dec_state_t *lz4) +{ + lz4->buf_size = fread(lz4->buf, 1, sizeof(lz4->buf), lz4->fp); + lz4->buf_idx = 0; + lz4->eof = (lz4->buf_size == 0); +} + +static uint8_t lz4_readbyte(lz4dec_state_t *lz4) +{ + if (lz4->buf_idx >= lz4->buf_size) + lz4_refill(lz4); + return lz4->buf[lz4->buf_idx++]; +} + +static void lz4_read(lz4dec_state_t *lz4, void *buf, size_t len) +{ + while (len > 0) { + int n = MIN(len, lz4->buf_size - lz4->buf_idx); + memcpy(buf, lz4->buf + lz4->buf_idx, n); + buf += n; + len -= n; + lz4->buf_idx += n; + if (lz4->buf_idx >= lz4->buf_size) + lz4_refill(lz4); + } +} + +void decompress_lz4_init(void *state, FILE *fp) +{ + lz4dec_state_t *lz4 = (lz4dec_state_t*)state; + lz4->fp = fp; + lz4->eof = false; + lz4->buf_idx = 0; + lz4->buf_size = 0; + memset(&lz4->st, 0, sizeof(lz4->st)); + __ringbuf_init(&lz4->ringbuf); +} + +ssize_t decompress_lz4_read(void *state, void *buf, size_t len) +{ + lz4dec_state_t *lz4 = (lz4dec_state_t*)state; + lz4dec_faststate_t st = lz4->st; + void *buf_orig = buf; + int n; + + while (!lz4->eof && len > 0) { + switch (st.fsm_state) { + case 0: // read token + st.token = lz4_readbyte(lz4); + st.lit_len = ((st.token & 0xf0) >> 4); + if (unlikely(st.lit_len == LITERALS_RUN_LEN)) { + uint8_t byte; + do { + byte = lz4_readbyte(lz4); + st.lit_len += byte; + } while (unlikely(byte == 255)); + } + st.fsm_state = 1; + case 1: // literals + n = MIN(st.lit_len, len); + lz4_read(lz4, buf, n); + __ringbuf_write(&lz4->ringbuf, buf, n); + buf += n; + len -= n; + st.lit_len -= n; + if (st.lit_len) + break; + st.match_off = lz4_readbyte(lz4); + st.match_off |= ((uint16_t)lz4_readbyte(lz4)) << 8; + st.match_len = (st.token & 0x0f); + if (unlikely(st.match_len == MATCH_RUN_LEN)) { + uint8_t byte; + do { + byte = lz4_readbyte(lz4); + st.match_len += byte; + } while (unlikely(byte == 255)); + } + st.match_len += MIN_MATCH_SIZE; + st.fsm_state = 2; + case 2: // match + n = MIN(st.match_len, len); + __ringbuf_copy(&lz4->ringbuf, st.match_off, buf, n); + buf += n; + len -= n; + st.match_len -= n; + if (st.match_len) + break; + st.fsm_state = 0; + } + } + + lz4->st = st; + return buf - buf_orig; +} diff --git a/src/compress/lz4_dec_internal.h b/src/compress/lz4_dec_internal.h new file mode 100644 index 0000000000..cdaf5e821b --- /dev/null +++ b/src/compress/lz4_dec_internal.h @@ -0,0 +1,68 @@ +#ifndef LIBDRAGON_COMPRESS_LZ4_DEC_INTERNAL_H +#define LIBDRAGON_COMPRESS_LZ4_DEC_INTERNAL_H + +#include +#include +#include + +/** + * @brief Calculate the margin required for in-place decompression. 
+ *
+ * It is possible to perform in-place decompression of LZ4 data: to do so,
+ * allocate a buffer large enough to hold the decompressed data, plus some
+ * margin calculated through this function. Then, read the compressed
+ * data at the end of the buffer. Finally, call #decompress_lz4_full_mem.
+ *
+ * Example:
+ *
+ * @code{.c}
+ * // Allocate a buffer large enough to hold the decompressed data,
+ * // plus the in-place margin.
+ * int buf_size = decompressed_size + LZ4_DECOMPRESS_INPLACE_MARGIN(compressed_size);
+ * void *buf = malloc(buf_size);
+ *
+ * // Read compressed data at the end of the buffer
+ * fread(buf + buf_size - compressed_size, 1, compressed_size, fp);
+ *
+ * // Decompress
+ * decompress_lz4_full_mem(
+ *      buf + buf_size - compressed_size, compressed_size,
+ *      buf, decompressed_size,
+ *      false);
+ * @endcode
+ */
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressed_size)  (((compressed_size) >> 8) + 32)
+
+/**
+ * @brief Decompress a block of LZ4 data (mem to mem).
+ *
+ * This function runs an LZ4 decompressor on a block of data, from memory to
+ * memory.
+ *
+ * LZ4 is much faster than PI DMA. To benefit even more from this, it is possible
+ * to actually run this function in parallel with the DMA transfer, "racing"
+ * with it. If called with @p dma_race set to true, the function will assume
+ * that the source buffer is currently being DMAed into memory, and will
+ * throttle itself to never read past the current DMA position.
+ *
+ * In addition to this, it is possible to in-place decompress a block of data.
+ * See #LZ4_DECOMPRESS_INPLACE_MARGIN for more information.
+ *
+ * @param src       Pointer to source buffer (compressed data)
+ * @param src_size  Size of the compressed data in bytes
+ * @param dst       Pointer to destination buffer (decompressed data)
+ * @param dst_size  Size of the destination buffer in bytes
+ * @param dma_race  If true, the source data is currently being DMA'd.
+ * @return int      Number of bytes decompressed, or -1 on error.
+ */
+int decompress_lz4_full_mem(const unsigned char *src, int src_size,
+    unsigned char *dst, int dst_size, bool dma_race);
+
+
+#define DECOMPRESS_LZ4_STATE_SIZE  (16552)
+
+void decompress_lz4_init(void *state, FILE *fp);
+ssize_t decompress_lz4_read(void *state, void *buf, size_t len);
+
+
+#endif
diff --git a/src/compress/lzh5.c b/src/compress/lzh5.c
new file mode 100644
index 0000000000..985643ad4f
--- /dev/null
+++ b/src/compress/lzh5.c
@@ -0,0 +1,1164 @@
+// Decoder for algorithm -lh5- of the LZH family.
+// This code comes from https://github.com/fragglet/lhasa
+// and has been turned into a single file header with only
+// the -lh5- algo.
+// This was also modified to allow for full streaming decompression
+// (up to 1 byte at a time). Before, the code would decompress one
+// internal LHA block at a time, writing an unpredictable number of
+// bytes into the output buffer.
+// This file is ISC Licensed.
+
+#include "lzh5_internal.h"
+
+//////////////////////// bit_stream_reader.c
+
+/*
+
+Copyright (c) 2011, 2012, Simon Howard
+
+Permission to use, copy, modify, and/or distribute this software
+for any purpose with or without fee is hereby granted, provided
+that the above copyright notice and this permission notice appear
+in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +#include + +// +// Data structure used to read bits from an input source as a stream. +// +// This file is designed to be #included by other source files to +// make a complete decoder. +// + +typedef struct { + + // File pointer to read from. + + FILE *fp; + + // Internal cache of bytes read from the input stream. + + uint8_t buf[128] __attribute__((aligned(8))); + int buf_idx; + int buf_size; + + // Bits from the input stream that are waiting to be read. + + uint64_t bit_buffer; + int bits; + +} BitStreamReader; + +// Initialize bit stream reader structure. + +static void bit_stream_reader_init(BitStreamReader *reader, FILE *fp) +{ + reader->fp = fp; + reader->buf_idx = 0; + reader->buf_size = 0; + reader->bits = 0; + reader->bit_buffer = 0; +} + +// Refill the bit buffer with other 64 bits from the input stream. + +static int refill_bits(BitStreamReader *reader) +{ + if (reader->buf_idx >= reader->buf_size) { + reader->buf_size = fread(reader->buf, 1, sizeof(reader->buf), reader->fp); + reader->buf_idx = 0; + } + + reader->bit_buffer = *(uint64_t*)(&reader->buf[reader->buf_idx]); + if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + reader->bit_buffer = __builtin_bswap64(reader->bit_buffer); + reader->bits = (reader->buf_size - reader->buf_idx) * 8; + if (reader->bits > 64) + reader->bits = 64; + reader->buf_idx += 8; + return reader->buf_size > 0; +} + +// Internal continuation of read_bits +// Returns -1 for failure. + +__attribute__((noinline)) +static int __read_bits2(BitStreamReader *reader, + unsigned int n, int result) +{ + if (!refill_bits(reader)) + return -1; + result |= reader->bit_buffer >> (64 - n); + reader->bit_buffer <<= n; + reader->bits -= n; + return result; +} + +// Read multiple bits from the input stream. +// Returns -1 for failure. + +__attribute__((noinline)) +static int read_bits(BitStreamReader *reader, + unsigned int n) +{ + int result = reader->bit_buffer >> (64 - n); + reader->bit_buffer <<= n; + reader->bits -= n; + if (__builtin_expect(reader->bits >= 0, 1)) { + return result; + } + return __read_bits2(reader, -reader->bits, result); +} + + +// Read a bit from the input stream. +// Returns -1 for failure. +static int read_bit(BitStreamReader *reader) +{ + return read_bits(reader, 1); +} + + +static uint64_t peek_bits(BitStreamReader *reader, int *n) +{ + *n = reader->bits; + return reader->bit_buffer; +} + +static int skip_bits(BitStreamReader *reader, int n) +{ + reader->bit_buffer <<= n; + reader->bits -= n; + if (__builtin_expect(reader->bits <= 0, 0)) { + refill_bits(reader); + if (reader->bits < 0) + return -1; + } + return 0; +} + + +//////////////////////// tree_decode.c +typedef uint16_t TreeElement; + +/* + +Copyright (c) 2011, 2012, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +// Common tree decoding code. +// +// A recurring feature used by the different LHA algorithms is to +// encode a set of codes, which have varying bit lengths. This is +// implemented using a binary tree, stored inside an array of +// elements. +// +// This file is implemented as a "template" file to be #include-d by +// other files. The typedef for TreeElement must be defined before +// include. + + +// Upper bit is set in a node value to indicate a leaf. + +#define TREE_NODE_LEAF (TreeElement) (1 << (sizeof(TreeElement) * 8 - 1)) + +// Structure used to hold data needed to build the tree. + +typedef struct { + // The tree data and its size (must not be exceeded) + + TreeElement *tree; + unsigned int tree_len; + + // Counter used to allocate entries from the tree. + // Every time a new node is allocated, this increase by 2. + + unsigned int tree_allocated; + + // The next tree entry. + // As entries are allocated sequentially, the range from + // next_entry..tree_allocated-1 constitutes the indices into + // the tree that are available to be filled in. By the + // end of the tree build, next_entry should = tree_allocated. + + unsigned int next_entry; +} TreeBuildData; + +// Initialize all elements of the given tree to a good initial state. + +static void init_tree(TreeElement *tree, size_t tree_len) +{ + unsigned int i; + + for (i = 0; i < tree_len; ++i) { + tree[i] = TREE_NODE_LEAF; + } +} + +// Set tree to always decode to a single code. + +static void set_tree_single(TreeElement *tree, TreeElement code) +{ + tree[0] = (TreeElement) code | TREE_NODE_LEAF; +} + +// "Expand" the list of queue entries. This generates a new child +// node at each of the entries currently in the queue, adding the +// children of those nodes into the queue to replace them. +// The effect of this is to add an extra level to the tree, and +// to increase the tree depth of the indices in the queue. + +static void expand_queue(TreeBuildData *build) +{ + unsigned int end_offset; + unsigned int new_nodes; + + // Sanity check that there is enough space in the tree for + // all the new nodes. + + new_nodes = (build->tree_allocated - build->next_entry) * 2; + + if (build->tree_allocated + new_nodes > build->tree_len) { + return; + } + + // Go through all entries currently in the allocated range, and + // allocate a subnode for each. + + end_offset = build->tree_allocated; + + while (build->next_entry < end_offset) { + build->tree[build->next_entry] = build->tree_allocated; + build->tree_allocated += 2; + ++build->next_entry; + } +} + +// Read the next entry from the queue of entries waiting to be used. + +static unsigned int read_next_entry(TreeBuildData *build) +{ + unsigned int result; + + // Sanity check. + + if (build->next_entry >= build->tree_allocated) { + return 0; + } + + result = build->next_entry; + ++build->next_entry; + + return result; +} + +// Add all codes to the tree that have the specified length. +// Returns non-zero if there are any entries in code_lengths[] still +// waiting to be added to the tree. 
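+// (build_tree() below calls this once per code length, level by level,
+// so a code of length N ends up as a leaf exactly N levels deep.)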
+ +static int add_codes_with_length(TreeBuildData *build, + uint8_t *code_lengths, + unsigned int num_code_lengths, + unsigned int code_len) +{ + unsigned int i; + unsigned int node; + int codes_remaining; + + codes_remaining = 0; + + for (i = 0; i < num_code_lengths; ++i) { + + // Does this code belong at this depth in the tree? + + if (code_lengths[i] == code_len) { + node = read_next_entry(build); + + build->tree[node] = (TreeElement) i | TREE_NODE_LEAF; + } + + // More work to be done after this pass? + + else if (code_lengths[i] > code_len) { + codes_remaining = 1; + } + } + + return codes_remaining; +} + +// Build a tree, given the specified array of codes indicating the +// required depth within the tree at which each code should be +// located. + +static void build_tree(TreeElement *tree, size_t tree_len, + uint8_t *code_lengths, unsigned int num_code_lengths) +{ + TreeBuildData build; + unsigned int code_len; + + build.tree = tree; + build.tree_len = tree_len; + + // Start with a single entry in the queue - the root node + // pointer. + + build.next_entry = 0; + + // We always have the root ... + + build.tree_allocated = 1; + + // Iterate over each possible code length. + // Note: code_len == 0 is deliberately skipped over, as 0 + // indicates "not used". + + code_len = 0; + + do { + // Advance to the next code length by allocating extra + // nodes to the tree - the slots waiting in the queue + // will now be one level deeper in the tree (and the + // codes 1 bit longer). + + expand_queue(&build); + ++code_len; + + // Add all codes that have this length. + + } while (add_codes_with_length(&build, code_lengths, + num_code_lengths, code_len)); +} + +/* +static void display_tree(TreeElement *tree, unsigned int node, int offset) +{ + unsigned int i; + + if (node & TREE_NODE_LEAF) { + for (i = 0; i < offset; ++i) putchar(' '); + printf("leaf %i\n", node & ~TREE_NODE_LEAF); + } else { + for (i = 0; i < offset; ++i) putchar(' '); + printf("0 ->\n"); + display_tree(tree, tree[node], offset + 4); + for (i = 0; i < offset; ++i) putchar(' '); + printf("1 ->\n"); + display_tree(tree, tree[node + 1], offset + 4); + } +} +*/ + +// Read bits from the input stream, traversing the specified tree +// from the root node until we reach a leaf. The leaf value is +// returned. + +static int read_from_tree(BitStreamReader *reader, TreeElement *tree) +{ + TreeElement code; + int bit; + uint64_t bits=0; int n=0, used=0; + + // Start from root. + + code = tree[0]; + + while ((code & TREE_NODE_LEAF) == 0) { + + if (used == n) { + if (skip_bits(reader, used) < 0) + return -1; + bits = peek_bits(reader, &n); + used = 0; + } + + bit = bits >> 63; + bits <<= 1; + used++; + + code = tree[code + (unsigned int) bit]; + } + + if (skip_bits(reader, used) < 0) + return -1; + + // Mask off leaf bit to get the plain code. + + return (int) (code & ~TREE_NODE_LEAF); +} + + + + +//////////////////////// lh5_decoder.c + +/* + +Copyright (c) 2011, 2012, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +// +// Decoder for the -lh5- algorithm. +// +// This is the "new" algorithm that appeared in LHA v2, replacing +// the older -lh1-. -lh4- seems to be identical to -lh5-. +// + +// 16 KiB history ring buffer: + +#define HISTORY_BITS 14 /* 2^14 = 16384 */ + +// Number of bits to encode HISTORY_BITS: + +#define OFFSET_BITS 4 + +// Name of the variable for the encoder: + +#define DECODER_NAME lha_lh5_decoder + + +//////////////////////// lh_new_decoder.c + + +/* + +Copyright (c) 2011, 2012, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +// Decoder for "new-style" LHA algorithms, used with LHA v2 and onwards +// (-lh4-, -lh5-, -lh6-, -lh7-). +// +// This file is designed to be a template. It is #included by other +// files to generate an optimized decoder. + +#include +#include +#include +#include + + +// Threshold for copying. The first copy code starts from here. + +#define COPY_THRESHOLD 3 /* bytes */ + +// Ring buffer containing history has a size that is a power of two. +// The number of bits is specified. + +#define RING_BUFFER_SIZE (1 << HISTORY_BITS) + +// Required size of the output buffer. At most, a single call to read() +// might result in a copy of the entire ring buffer. + +#define OUTPUT_BUFFER_SIZE RING_BUFFER_SIZE + +// Number of different command codes. 0-255 range are literal byte +// values, while higher values indicate copy from history. + +#define NUM_CODES 510 + +// Number of possible codes in the "temporary table" used to encode the +// codes table. + +#define MAX_TEMP_CODES 20 + +typedef struct _LHANewDecoder { + // Input bit stream. + + BitStreamReader bit_stream_reader; + + // Number of commands remaining before we start a new block. + + unsigned int block_remaining; + + // Table used for the code tree. + + TreeElement code_tree[NUM_CODES * 2]; + + // Table used to encode the offset tree, used to read offsets + // into the history buffer. This same table is also used to + // encode the temp-table, which is bigger; hence the size. + + TreeElement offset_tree[MAX_TEMP_CODES * 2]; +} LHANewDecoder; + + +typedef struct _LHANewDecoderPartial { + // Decoder + + LHANewDecoder decoder; + + // Ring buffer of past data. Used for position-based copies. + + uint8_t ringbuf[RING_BUFFER_SIZE]; + unsigned int ringbuf_pos; + int ringbuf_copy_pos; + int ringbuf_copy_count; + + int decoded_bytes; + +} LHANewDecoderPartial; + + +// Initialize the history ring buffer. 
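+// (It is pre-filled with spaces: in the LHA format, copies may reference
+// history before any byte has been decoded, and must then read as 0x20.)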
+ +static void init_ring_buffer(LHANewDecoderPartial *decoder) +{ + memset(decoder->ringbuf, ' ', RING_BUFFER_SIZE); + decoder->ringbuf_pos = 0; + decoder->ringbuf_copy_pos = 0; + decoder->ringbuf_copy_count = 0; +} + +static int lha_lh_new_init(LHANewDecoder *decoder, FILE *fp) +{ + // Initialize input stream reader. + + bit_stream_reader_init(&decoder->bit_stream_reader, fp); + + // First read starts the first block. + + decoder->block_remaining = 0; + + // Initialize tree tables to a known state. + + init_tree(decoder->code_tree, NUM_CODES * 2); + init_tree(decoder->offset_tree, MAX_TEMP_CODES * 2); + + return 1; +} + +static int lha_lh_new_init_partial(LHANewDecoderPartial *decoder, FILE *fp) +{ + lha_lh_new_init(&decoder->decoder, fp); + + // Initialize data structures. + + init_ring_buffer(decoder); + + decoder->decoded_bytes = 0; + + return 1; +} + +// Read a length value - this is normally a value in the 0-7 range, but +// sometimes can be longer. + +static int read_length_value(LHANewDecoder *decoder) +{ + int i, len; + + len = read_bits(&decoder->bit_stream_reader, 3); + + if (len < 0) { + return -1; + } + + if (len == 7) { + // Read more bits to extend the length until we reach a '0'. + + for (;;) { + i = read_bit(&decoder->bit_stream_reader); + + if (i < 0) { + return -1; + } else if (i == 0) { + break; + } + + ++len; + } + } + + return len; +} + +// Read the values from the input stream that define the temporary table +// used for encoding the code table. + +static int read_temp_table(LHANewDecoder *decoder) +{ + int i, j, n, len, code; + uint8_t code_lengths[MAX_TEMP_CODES]; + + // How many codes? + + n = read_bits(&decoder->bit_stream_reader, 5); + + if (n < 0) { + return 0; + } + + // n=0 is a special case, meaning only a single code that + // is of zero length. + + if (n == 0) { + code = read_bits(&decoder->bit_stream_reader, 5); + + if (code < 0) { + return 0; + } + + set_tree_single(decoder->offset_tree, code); + return 1; + } + + // Enforce a hard limit on the number of codes. + + if (n > MAX_TEMP_CODES) { + n = MAX_TEMP_CODES; + } + + // Read the length of each code. + + for (i = 0; i < n; ++i) { + len = read_length_value(decoder); + + if (len < 0) { + return 0; + } + + code_lengths[i] = len; + + // After the first three lengths, there is a 2-bit + // field to allow skipping over up to a further three + // lengths. Not sure of the reason for this ... + + if (i == 2) { + len = read_bits(&decoder->bit_stream_reader, 2); + + if (len < 0) { + return 0; + } + + for (j = 0; j < len; ++j) { + ++i; + code_lengths[i] = 0; + } + } + } + + build_tree(decoder->offset_tree, MAX_TEMP_CODES * 2, code_lengths, n); + + return 1; +} + +// Code table codes can indicate that a sequence of codes should be +// skipped over. The number to skip is Huffman-encoded. Given a skip +// range (0-2), this reads the number of codes to skip over. + +static int read_skip_count(LHANewDecoder *decoder, int skiprange) +{ + int result; + + // skiprange=0 => 1 code. + + if (skiprange == 0) { + result = 1; + } + + // skiprange=1 => 3-18 codes. + + else if (skiprange == 1) { + result = read_bits(&decoder->bit_stream_reader, 4); + + if (result < 0) { + return -1; + } + + result += 3; + } + + // skiprange=2 => 20+ codes. + + else { + result = read_bits(&decoder->bit_stream_reader, 9); + + if (result < 0) { + return -1; + } + + result += 20; + } + + return result; +} + +static int read_code_table(LHANewDecoder *decoder) +{ + int i, j, n, skip_count, code; + uint8_t code_lengths[NUM_CODES]; + + // How many codes? 
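+	// (9 bits is enough here: NUM_CODES is 510.)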
+ + n = read_bits(&decoder->bit_stream_reader, 9); + + if (n < 0) { + return 0; + } + + // n=0 implies a single code of zero length; all inputs + // decode to the same code. + + if (n == 0) { + code = read_bits(&decoder->bit_stream_reader, 9); + + if (code < 0) { + return 0; + } + + set_tree_single(decoder->code_tree, code); + + return 1; + } + + if (n > NUM_CODES) { + n = NUM_CODES; + } + + // Read the length of each code. + // The lengths are encoded using the temp-table previously read; + // offset_tree is reused temporarily to hold it. + + i = 0; + + while (i < n) { + code = read_from_tree(&decoder->bit_stream_reader, + decoder->offset_tree); + + if (code < 0) { + return 0; + } + + // The code that was read can have different meanings. + // If in the range 0-2, it indicates that a number of + // codes are unused and should be skipped over. + // Values greater than two represent a frequency count. + + if (code <= 2) { + skip_count = read_skip_count(decoder, code); + + if (skip_count < 0) { + return 0; + } + + for (j = 0; j < skip_count && i < n; ++j) { + code_lengths[i] = 0; + ++i; + } + } else { + code_lengths[i] = code - 2; + ++i; + } + } + + build_tree(decoder->code_tree, NUM_CODES * 2, code_lengths, n); + + return 1; +} + +static int read_offset_table(LHANewDecoder *decoder) +{ + int i, n, len, code; + uint8_t code_lengths[HISTORY_BITS]; + + // How many codes? + + n = read_bits(&decoder->bit_stream_reader, OFFSET_BITS); + + if (n < 0) { + return 0; + } + + // n=0 is a special case, meaning only a single code that + // is of zero length. + + if (n == 0) { + code = read_bits(&decoder->bit_stream_reader, OFFSET_BITS); + + if (code < 0) { + return 0; + } + + set_tree_single(decoder->offset_tree, code); + return 1; + } + + // Enforce a hard limit on the number of codes. + + if (n > HISTORY_BITS) { + n = HISTORY_BITS; + } + + // Read the length of each code. + + for (i = 0; i < n; ++i) { + len = read_length_value(decoder); + + if (len < 0) { + return 0; + } + + code_lengths[i] = len; + } + + build_tree(decoder->offset_tree, MAX_TEMP_CODES * 2, code_lengths, n); + + return 1; +} + +// Start reading a new block from the input stream. + +static int start_new_block(LHANewDecoder *decoder) +{ + int len; + + // Read length of new block (in commands). + + len = read_bits(&decoder->bit_stream_reader, 16); + + if (len < 0) { + return 0; + } + + decoder->block_remaining = (size_t) len; + + // Read the temporary decode table, used to encode the codes table. + // The position table data structure is reused for this. + + if (!read_temp_table(decoder)) { + return 0; + } + + // Read the code table; this is encoded *using* the temp table. + + if (!read_code_table(decoder)) { + return 0; + } + + // Read the offset table. + + if (!read_offset_table(decoder)) { + return 0; + } + + return 1; +} + +// Read the next code from the input stream. Returns the code, or -1 if +// an error occurred. + +static int read_code(LHANewDecoder *decoder) +{ + return read_from_tree(&decoder->bit_stream_reader, decoder->code_tree); +} + +// Read an offset distance from the input stream. +// Returns the code, or -1 if an error occurred. + +static int read_offset_code(LHANewDecoder *decoder) +{ + int bits, result; + + bits = read_from_tree(&decoder->bit_stream_reader, + decoder->offset_tree); + + if (bits < 0) { + return -1; + } + + // The code read indicates the length of the offset in bits. 
+	//
+	// The returned value looks like this:
+	//   bits = 0 -> 0
+	//   bits = 1 -> 1
+	//   bits = 2 -> 1x
+	//   bits = 3 -> 1xx
+	//   bits = 4 -> 1xxx
+	//            etc.
+
+	if (bits == 0) {
+		return 0;
+	} else if (bits == 1) {
+		return 1;
+	} else {
+		result = read_bits(&decoder->bit_stream_reader, bits - 1);
+
+		if (result < 0) {
+			return -1;
+		}
+
+		return result + (1 << (bits - 1));
+	}
+}
+
+// Add a byte value to the output stream.
+
+static void output_byte(LHANewDecoderPartial *decoder, uint8_t *buf,
+                        size_t *buf_len, uint8_t b)
+{
+	if (buf) buf[*buf_len] = b;
+	++*buf_len;
+
+	decoder->ringbuf[decoder->ringbuf_pos] = b;
+	decoder->ringbuf_pos = (decoder->ringbuf_pos + 1) % RING_BUFFER_SIZE;
+}
+
+// Copy a block from the history buffer.
+
+static void set_copy_from_history(LHANewDecoderPartial *decoder, size_t count)
+{
+	int offset;
+
+	offset = read_offset_code(&decoder->decoder);
+
+	if (offset < 0) {
+		return;
+	}
+
+	decoder->ringbuf_copy_pos = decoder->ringbuf_pos + RING_BUFFER_SIZE - (unsigned int) offset - 1;
+	while (decoder->ringbuf_copy_pos < 0)
+		decoder->ringbuf_copy_pos += RING_BUFFER_SIZE;
+	while (decoder->ringbuf_copy_pos >= RING_BUFFER_SIZE)
+		decoder->ringbuf_copy_pos -= RING_BUFFER_SIZE;
+
+	decoder->ringbuf_copy_count = count;
+}
+
+static size_t lha_lh_new_read_partial(LHANewDecoderPartial *decoder, uint8_t *buf, int sz)
+{
+	size_t result = 0;
+	int code;
+
+	while (sz > 0) {
+		if (decoder->ringbuf_copy_count > 0) {
+			// Calculate number of bytes that we can copy in sequence without reaching the end of a buffer
+			int wn = sz < decoder->ringbuf_copy_count ? sz : decoder->ringbuf_copy_count;
+			wn = wn < RING_BUFFER_SIZE - decoder->ringbuf_copy_pos ? wn : RING_BUFFER_SIZE - decoder->ringbuf_copy_pos;
+			wn = wn < RING_BUFFER_SIZE - decoder->ringbuf_pos ? wn : RING_BUFFER_SIZE - decoder->ringbuf_pos;
+
+			if (!buf) {
+				// If buf is NULL, we're just skipping data
+				decoder->ringbuf_pos += wn;
+				decoder->ringbuf_copy_count -= wn;
+				decoder->ringbuf_copy_pos += wn;
+				sz -= wn;
+				result += wn;
+				decoder->ringbuf_copy_pos %= RING_BUFFER_SIZE;
+				decoder->ringbuf_pos %= RING_BUFFER_SIZE;
+				continue;
+			}
+
+			// Check if there's an overlap in the ring buffer between read and write pos, in which
+			// case we need to copy byte by byte.
+			if (decoder->ringbuf_pos < decoder->ringbuf_copy_pos ||
+			    decoder->ringbuf_pos > decoder->ringbuf_copy_pos+7) {
+				while (wn >= 8) {
+					// Copy 8 bytes at a time, using an unaligned memory access (LDL/LDR/SDL/SDR)
+					typedef uint64_t u_uint64_t __attribute__((aligned(1)));
+					uint64_t value = *(u_uint64_t*)&decoder->ringbuf[decoder->ringbuf_copy_pos];
+					*(u_uint64_t*)&buf[result] = value;
+					*(u_uint64_t*)&decoder->ringbuf[decoder->ringbuf_pos] = value;
+
+					decoder->ringbuf_copy_pos += 8;
+					decoder->ringbuf_pos += 8;
+					decoder->ringbuf_copy_count -= 8;
+					result += 8;
+					sz -= 8;
+					wn -= 8;
+				}
+			}
+
+			// Finish copying the remaining bytes
+			while (wn > 0) {
+				uint8_t value = decoder->ringbuf[decoder->ringbuf_copy_pos];
+				buf[result] = value;
+				decoder->ringbuf[decoder->ringbuf_pos] = value;
+
+				decoder->ringbuf_copy_pos += 1;
+				decoder->ringbuf_pos += 1;
+				decoder->ringbuf_copy_count -= 1;
+				result += 1;
+				sz -= 1;
+				wn -= 1;
+			}
+			decoder->ringbuf_copy_pos %= RING_BUFFER_SIZE;
+			decoder->ringbuf_pos %= RING_BUFFER_SIZE;
+			continue;
+		}
+
+
+		// Start of new block?
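+		// (A block starts with a 16-bit command count followed by freshly
+		// encoded Huffman tables; see start_new_block() above.)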
+		while (decoder->decoder.block_remaining == 0) {
+			if (!start_new_block(&decoder->decoder)) {
+				goto end;
+			}
+		}
+
+		--decoder->decoder.block_remaining;
+
+		// Read next command from input stream.
+
+		code = read_code(&decoder->decoder);
+
+		if (code < 0) {
+			return 0;
+		}
+
+		// The code may be either a literal byte value or a copy command.
+
+		if (code < 256) {
+			output_byte(decoder, buf, &result, (uint8_t) code);
+			sz--;
+		} else {
+			set_copy_from_history(decoder, code - 256 + COPY_THRESHOLD);
+		}
+	}
+
+end:
+	decoder->decoded_bytes += result;
+	return result;
+}
+
+static size_t lha_lh_new_read_full(LHANewDecoder *decoder, uint8_t *buf, int sz)
+{
+	uint8_t *buf_orig = buf;
+	int code;
+
+	while (sz > 0) {
+		// Start of new block?
+		while (decoder->block_remaining == 0) {
+			if (!start_new_block(decoder)) {
+				goto end;
+			}
+		}
+		--decoder->block_remaining;
+
+		// Read next command from input stream.
+		code = read_code(decoder);
+
+		if (code < 0) {
+			return 0;
+		}
+
+		// The code may be either a literal byte value or a copy command.
+		if (code < 256) {
+			*buf++ = (uint8_t) code;
+			sz--;
+		} else {
+			int count = code - 256 + COPY_THRESHOLD;
+			int offset = read_offset_code(decoder);
+
+			if (offset < 0) {
+				return 0;
+			}
+			uint8_t *src = buf - offset - 1;
+
+			count = count < sz ? count : sz;
+			sz -= count;
+
+			if (offset > 7) {
+				while (count >= 8) {
+					typedef uint64_t u_uint64_t __attribute__((aligned(1)));
+					*(u_uint64_t*)buf = *(u_uint64_t*)src;
+					buf += 8;
+					src += 8;
+					count -= 8;
+				}
+			}
+			while (count > 0) {
+				*buf++ = *src++;
+				count--;
+			}
+		}
+	}
+
+end:
+	return buf - buf_orig;
+}
+
+
+/*************************************************
+ * Libdragon API
+ *************************************************/
+
+_Static_assert(sizeof(LHANewDecoderPartial) == DECOMPRESS_LZ5H_STATE_SIZE, "LZH5 state size is wrong");
+
+void decompress_lz5h_init(void *state, FILE *fp)
+{
+	LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state;
+	lha_lh_new_init_partial(decoder, fp);
+}
+
+ssize_t decompress_lz5h_read(void *state, void *buf, size_t len)
+{
+	LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state;
+	return lha_lh_new_read_partial(decoder, buf, len);
+}
+
+int decompress_lz5h_pos(void *state) {
+	LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state;
+	return decoder->decoded_bytes;
+}
+
+size_t decompress_lz5h_full(FILE *fp, void *buf, size_t len)
+{
+	LHANewDecoder decoder;
+	lha_lh_new_init(&decoder, fp);
+	return lha_lh_new_read_full(&decoder, buf, len);
+}
diff --git a/src/compress/lzh5_internal.h b/src/compress/lzh5_internal.h
new file mode 100644
index 0000000000..7554504db5
--- /dev/null
+++ b/src/compress/lzh5_internal.h
@@ -0,0 +1,44 @@
+#ifndef LIBDRAGON_COMPRESS_LZH5_H
+#define LIBDRAGON_COMPRESS_LZH5_H
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Size of the LZ5H decompression state.
+ *
+ * Note that this can still be allocated on the stack, as the stack size
+ * configured by libdragon is 64KB.
+ */
+#define DECOMPRESS_LZ5H_STATE_SIZE 18688
+
+void decompress_lz5h_init(void *state, FILE *fp);
+ssize_t decompress_lz5h_read(void *state, void *buf, size_t len);
+int decompress_lz5h_pos(void *state);
+
+/**
+ * @brief Decompress a full LZ5H file into a buffer.
+ *
+ * This function decompresses a full LZH5 file into a memory buffer.
+ * The caller should provide a buffer large enough to hold the entire
+ * file, or the function will fail.
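+ *
+ * A minimal sketch of the intended usage, assuming the decompressed size
+ * is known in advance (e.g. from a container header; the file name and
+ * orig_size are illustrative only):
+ *
+ * @code{.c}
+ * FILE *fp = fopen("rom:/music.dat", "rb");
+ * void *buf = malloc(orig_size);
+ * size_t n = decompress_lz5h_full(fp, buf, orig_size);
+ * @endcode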
+ *
+ * This function is about 50% faster than using #decompress_lz5h_read,
+ * as it can assume that the whole decoded file will always be available
+ * during decoding.
+ *
+ * @param fp File pointer to the compressed file
+ * @param buf Buffer to decompress into
+ * @param len Length of the buffer
+ */
+size_t decompress_lz5h_full(FILE *fp, void *buf, size_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/compress/ringbuf.c b/src/compress/ringbuf.c
new file mode 100644
index 0000000000..03362adf3b
--- /dev/null
+++ b/src/compress/ringbuf.c
@@ -0,0 +1,64 @@
+#include "ringbuf_internal.h"
+#include "../utils.h"
+
+void __ringbuf_init(decompress_ringbuf_t *ringbuf)
+{
+	ringbuf->ringbuf_pos = 0;
+}
+
+void __ringbuf_write(decompress_ringbuf_t *ringbuf, uint8_t *src, int count)
+{
+	while (count > 0) {
+		int n = MIN(count, RING_BUFFER_SIZE - ringbuf->ringbuf_pos);
+		memcpy(ringbuf->ringbuf + ringbuf->ringbuf_pos, src, n);
+		ringbuf->ringbuf_pos += n;
+		ringbuf->ringbuf_pos &= RING_BUFFER_SIZE-1;
+		src += n;
+		count -= n;
+	}
+}
+
+void __ringbuf_copy(decompress_ringbuf_t *ringbuf, int copy_offset, uint8_t *dst, int count)
+{
+	int ringbuf_copy_pos = (ringbuf->ringbuf_pos - copy_offset) & (RING_BUFFER_SIZE-1);
+	int dst_pos = 0;
+	while (count > 0) {
+		int wn = count;
+		wn = wn < RING_BUFFER_SIZE - ringbuf_copy_pos     ? wn : RING_BUFFER_SIZE - ringbuf_copy_pos;
+		wn = wn < RING_BUFFER_SIZE - ringbuf->ringbuf_pos ? wn : RING_BUFFER_SIZE - ringbuf->ringbuf_pos;
+		count -= wn;
+
+		// Check if there's an overlap in the ring buffer between read and write pos, in which
+		// case we need to copy byte by byte.
+		if (ringbuf->ringbuf_pos < ringbuf_copy_pos ||
+		    ringbuf->ringbuf_pos > ringbuf_copy_pos+7) {
+			while (wn >= 8) {
+				// Copy 8 bytes at a time, using an unaligned memory access (LDL/LDR/SDL/SDR)
+				typedef uint64_t u_uint64_t __attribute__((aligned(1)));
+				uint64_t value = *(u_uint64_t*)&ringbuf->ringbuf[ringbuf_copy_pos];
+				*(u_uint64_t*)&dst[dst_pos] = value;
+				*(u_uint64_t*)&ringbuf->ringbuf[ringbuf->ringbuf_pos] = value;
+
+				ringbuf_copy_pos += 8;
+				ringbuf->ringbuf_pos += 8;
+				dst_pos += 8;
+				wn -= 8;
+			}
+		}
+
+		// Finish copying the remaining bytes
+		while (wn > 0) {
+			uint8_t value = ringbuf->ringbuf[ringbuf_copy_pos];
+			dst[dst_pos] = value;
+			ringbuf->ringbuf[ringbuf->ringbuf_pos] = value;
+
+			ringbuf_copy_pos += 1;
+			ringbuf->ringbuf_pos += 1;
+			dst_pos += 1;
+			wn -= 1;
+		}
+
+		ringbuf_copy_pos %= RING_BUFFER_SIZE;
+		ringbuf->ringbuf_pos %= RING_BUFFER_SIZE;
+	}
+}
diff --git a/src/compress/ringbuf_internal.h b/src/compress/ringbuf_internal.h
new file mode 100644
index 0000000000..5050ed2d2f
--- /dev/null
+++ b/src/compress/ringbuf_internal.h
@@ -0,0 +1,56 @@
+#ifndef LIBDRAGON_COMPRESS_RINGBUF_INTERNAL_H
+#define LIBDRAGON_COMPRESS_RINGBUF_INTERNAL_H
+
+#include <stdint.h>
+
+/// Size of the ring buffer in bytes. This happens to work for both lz4 and lzh5
+#ifndef RING_BUFFER_SIZE
+#define RING_BUFFER_SIZE  (16 * 1024)
+#endif
+
+/**
+ * @brief A ring buffer used for streaming decompression.
+ */
+typedef struct {
+	uint8_t ringbuf[RING_BUFFER_SIZE];   ///< The ring buffer itself
+	unsigned int ringbuf_pos;            ///< Current write position in the ring buffer
+} decompress_ringbuf_t;
+
+
+void __ringbuf_init(decompress_ringbuf_t *ringbuf);
+
+inline void __ringbuf_writebyte(decompress_ringbuf_t *ringbuf, uint8_t byte)
+{
+	ringbuf->ringbuf[ringbuf->ringbuf_pos++] = byte;
+	ringbuf->ringbuf_pos &= (RING_BUFFER_SIZE - 1);
+}
+
+/**
+ * @brief Write an array of bytes into the ring buffer.
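+ *
+ * The write wraps around the end of the ring buffer transparently, so
+ * count may exceed the space left before the wrap point.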
+ *
+ * @param ringbuf The ring buffer to write to.
+ * @param src The source array to write from.
+ * @param count The number of bytes to write.
+ */
+void __ringbuf_write(decompress_ringbuf_t *ringbuf, uint8_t *src, int count);
+
+/**
+ * @brief Extract data from the ring buffer, updating it at the same time
+ *
+ * This function is used to implement a typical match-copy of LZ algorithms.
+ * Given the ring buffer and the position to copy from, it will copy the
+ * specified number of bytes into the destination buffer, while also
+ * updating the ring buffer with the copied data.
+ *
+ * It correctly handles overlaps, so if copy_offset is 1 and count is 100,
+ * the last character in the ring buffer will be copied 100 times to the
+ * output (and to the ring buffer itself).
+ *
+ * @param ringbuf The ring buffer
+ * @param copy_offset Offset to copy from, relative to the current position.
+ * @param dst Destination buffer
+ * @param count Number of bytes to copy
+ */
+void __ringbuf_copy(decompress_ringbuf_t *ringbuf, int copy_offset, uint8_t *dst, int count);
+
+#endif

From 064d76c9801d5b5a67d1a314d26425174feee907 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Tue, 26 Sep 2023 23:41:05 +0200
Subject: [PATCH 06/27] Add new asset library

This library handles loading asset files from filesystems. It allows
either one-time loads or incremental loads via a FILE* interface, and it
handles transparent decompression in both cases.

It implements two compressed formats: a very fast one (based on LZ4) and
a slower one (based on LZH5) that compresses files more tightly.
---
 include/asset.h                 |  37 +++++++++
 src/asset.c                     | 131 ++++++++++++--------------------
 src/asset_internal.h            |  14 ++++
 src/audio/ym64.c                |   6 +-
 src/compress/lz4_dec.c          |  56 ++++++++++++++
 src/compress/lz4_dec_internal.h |   2 +-
 src/compress/lzh5.c             |  25 ++++--
 src/compress/lzh5_internal.h    |  22 +++---
 tools/audioconv64/conv_ym64.c   |  14 ++--
 9 files changed, 198 insertions(+), 109 deletions(-)

diff --git a/include/asset.h b/include/asset.h
index 45a4bcf880..6c2867cc4c 100644
--- a/include/asset.h
+++ b/include/asset.h
@@ -47,10 +47,47 @@
 
 #include <stdio.h>
 
+#ifdef N64
+#include "debug.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/**
+ * @brief Enable a non-default compression level
+ *
+ * This function must be called if any asset that will be loaded uses
+ * a non-default compression level. The default compression level is 1,
+ * for which no initialization is required.
+ *
+ * Currently, only level 2 requires initialization. If you have any assets
+ * compressed with level 2, you must call this function before loading them.
+ *
+ * @code{.c}
+ *      asset_init_compression(2);
+ *
+ *      // Load an asset that might use level 2 compression
+ *      sprite_t *hero = sprite_load("rom:/hero.sprite");
+ * @endcode
+ *
+ * @param level Compression level to initialize
+ *
+ * @see #asset_load
+ * @hideinitializer
+ */
+#define asset_init_compression(level) ({ \
+    switch (level) { \
+    case 1: break; \
+    case 2: { \
+        extern void __asset_init_compression_lvl2(void); \
+        __asset_init_compression_lvl2(); \
+    }   break; \
+    default: assertf(0, "Unsupported compression level: %d", level); \
+    } \
+})
+
 /**
  * @brief Load an asset file (possibly uncompressing it)
  *
diff --git a/src/asset.c b/src/asset.c
index 17e1aa9280..d6d6e402aa 100644
--- a/src/asset.c
+++ b/src/asset.c
@@ -20,6 +20,31 @@
 #define assertf(x, ...) assert(x)
 #endif
 
+/**
+ * @brief Compression algorithms
+ *
+ * Only level 1 (LZ4) is always initialized. The other algorithm (LZH5)
+ * must be initialized manually via #asset_init_compression.
+ */
+static asset_compression_t algos[2] = {
+    {
+        .state_size = DECOMPRESS_LZ4_STATE_SIZE,
+        .decompress_init = decompress_lz4_init,
+        .decompress_read = decompress_lz4_read,
+        .decompress_full = decompress_lz4_full,
+    }
+};
+
+void __asset_init_compression_lvl2(void)
+{
+    algos[1] = (asset_compression_t){
+        .state_size = DECOMPRESS_LZH5_STATE_SIZE,
+        .decompress_init = decompress_lzh5_init,
+        .decompress_read = decompress_lzh5_read,
+        .decompress_full = decompress_lzh5_full,
+    };
+}
+
 FILE *must_fopen(const char *fn)
 {
     FILE *f = fopen(fn, "rb");
@@ -67,65 +92,13 @@ void *asset_load(const char *fn, int *sz)
         header.orig_size = __builtin_bswap32(header.orig_size);
         #endif
 
-        switch (header.algo) {
-        case 2: {
-            size = header.orig_size;
-            s = memalign(16, size);
-            assertf(s, "asset_load: out of memory");
-            int n = decompress_lz5h_full(f, s, size); (void)n;
-            assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size);
-        }   break;
-        case 1: {
-            size = header.orig_size;
-            int bufsize = size + LZ4_DECOMPRESS_INPLACE_MARGIN(header.cmp_size);
-            int cmp_offset = bufsize - header.cmp_size;
-            if (cmp_offset & 1) {
-                cmp_offset++;
-                bufsize++;
-            }
-            if (bufsize & 15) {
-                // In case we need to call invalidate (see below), we need an aligned buffer
-                bufsize += 16 - (bufsize & 15);
-            }
-
-            s = memalign(16, bufsize);
-            assertf(s, "asset_load: out of memory");
-            int n;
-
-            #ifdef N64
-            if (strncmp(fn, "rom:/", 5) == 0) {
-                // Invalid the portion of the buffer where we are going to load
-                // the compressed data. This is needed in case the buffer returned
-                // by memalign happens to be in cached already.
-                int align_cmp_offset = cmp_offset & ~15;
-                data_cache_hit_invalidate(s+align_cmp_offset, bufsize-align_cmp_offset);
-
-                // Loading from ROM. This is a common enough situation that we want to optimize it.
-                // Start an asynchronous DMA transfer, so that we can start decompressing as the
-                // data flows in.
-                uint32_t addr = dfs_rom_addr(fn+5) & 0x1FFFFFFF;
-                dma_read_async(s+cmp_offset, addr+16, header.cmp_size);
-
-                // Run the decompression racing with the DMA.
-                n = decompress_lz4_full_mem(s+cmp_offset, header.cmp_size, s, size, true); (void)n;
-            #else
-            if (false) {
-            #endif
-            } else {
-                // Standard loading via stdio. We have to wait for the whole file to be read.
-                fread(s+cmp_offset, 1, header.cmp_size, f);
-
-                // Run the decompression.
-                n = decompress_lz4_full_mem(s+cmp_offset, header.cmp_size, s, size, false); (void)n;
-            }
-            assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size);
-            void *ptr = realloc(s, size); (void)ptr;
-            assertf(s == ptr, "asset: realloc moved the buffer"); // guaranteed by newlib
-        }   break;
-        default:
-            assertf(0, "asset: unsupported compression algorithm: %d", header.algo);
-            return NULL;
-        }
+        assertf(header.algo >= 1 && header.algo <= 2,
+            "unsupported compression algorithm: %d", header.algo);
+        assertf(algos[header.algo-1].decompress_full,
+            "asset: compression level %d not initialized. Call asset_init_compression(%d) at initialization time", header.algo, header.algo);
-
+        size = header.orig_size;
+        s = algos[header.algo-1].decompress_full(fn, f, header.cmp_size, size);
     } else {
         // Allocate a buffer big enough to hold the file.
         // We force a 16-byte alignment for the buffer so that it's cacheline aligned.
@@ -184,20 +157,20 @@ typedef struct {
     bool seeked;
     ssize_t (*read)(void *state, void *buf, size_t len);
     uint8_t state[] alignas(8);
-} cookie_lha_t;
+} cookie_cmp_t;
 
-static int readfn_lha(void *c, char *buf, int sz)
+static int readfn_cmp(void *c, char *buf, int sz)
 {
-    cookie_lha_t *cookie = (cookie_lha_t*)c;
+    cookie_cmp_t *cookie = (cookie_cmp_t*)c;
     assertf(!cookie->seeked, "Cannot seek in file opened via asset_fopen (it might be compressed)");
     int n = cookie->read(cookie->state, (uint8_t*)buf, sz);
     cookie->pos += n;
     return n;
 }
 
-static fpos_t seekfn_lha(void *c, fpos_t pos, int whence)
+static fpos_t seekfn_cmp(void *c, fpos_t pos, int whence)
 {
-    cookie_lha_t *cookie = (cookie_lha_t*)c;
+    cookie_cmp_t *cookie = (cookie_cmp_t*)c;
 
     // SEEK_CUR with pos=0 is used as ftell()
     if (whence == SEEK_CUR && pos == 0)
@@ -210,9 +183,9 @@
         return -1;
 }
 
-static int closefn_lha(void *c)
+static int closefn_cmp(void *c)
 {
-    cookie_lha_t *cookie = (cookie_lha_t*)c;
+    cookie_cmp_t *cookie = (cookie_cmp_t*)c;
     fclose(cookie->fp); cookie->fp = NULL;
     free(cookie);
     return 0;
@@ -242,28 +215,22 @@ FILE *asset_fopen(const char *fn, int *sz)
         header.orig_size = __builtin_bswap32(header.orig_size);
     }
 
-    cookie_lha_t *cookie;
-    switch (header.algo) {
-    case 1:
-        cookie = malloc(sizeof(cookie_lha_t) + DECOMPRESS_LZ4_STATE_SIZE);
-        decompress_lz4_init(cookie->state, f);
-        cookie->read = decompress_lz4_read;
-        break;
-    case 2:
-        cookie = malloc(sizeof(cookie_lha_t) + DECOMPRESS_LZ5H_STATE_SIZE);
-        decompress_lz5h_init(cookie->state, f);
-        cookie->read = decompress_lz5h_read;
-        break;
-    default:
-        assertf(0, "unsupported compression algorithm: %d", header.algo);
-        return NULL;
-    }
+    cookie_cmp_t *cookie;
+
+    assertf(header.algo >= 1 && header.algo <= 2,
+        "unsupported compression algorithm: %d", header.algo);
+    assertf(algos[header.algo-1].decompress_init,
+        "asset: compression level %d not initialized. Call asset_init_compression(%d) at initialization time", header.algo, header.algo);
+
+    cookie = malloc(sizeof(cookie_cmp_t) + algos[header.algo-1].state_size);
+    cookie->read = algos[header.algo-1].decompress_read;
+    algos[header.algo-1].decompress_init(cookie->state, f);
 
     cookie->fp = f;
     cookie->pos = 0;
     cookie->seeked = false;
    if (sz) *sz = header.orig_size;
-    return funopen(cookie, readfn_lha, NULL, seekfn_lha, closefn_lha);
+    return funopen(cookie, readfn_cmp, NULL, seekfn_cmp, closefn_cmp);
 }
 
     // Not compressed. Return a wrapped FILE* without the seeking capability,
diff --git a/src/asset_internal.h b/src/asset_internal.h
index 0b662251c5..c17339dbfc 100644
--- a/src/asset_internal.h
+++ b/src/asset_internal.h
@@ -18,6 +18,20 @@ typedef struct {
 
 _Static_assert(sizeof(asset_header_t) == 16, "invalid sizeof(asset_header_t)");
 
+/** @brief A decompression algorithm used by the asset library */
+typedef struct {
+    int state_size;         ///< Size of the decompression state
+
+    /** @brief Initialize the decompression state */
+    void (*decompress_init)(void *state, FILE *fp);
+    /** @brief Partially read a decompressed file from a state */
+    ssize_t (*decompress_read)(void *state, void *buf, size_t len);
+
+    /** @brief Decompress a full file in one go */
+    void* (*decompress_full)(const char *fn, FILE *fp, size_t cmp_size, size_t len);
+} asset_compression_t;
+
+
 FILE *must_fopen(const char *fn);
 
 #endif
diff --git a/src/audio/ym64.c b/src/audio/ym64.c
index da4199a4eb..23a5869c04 100644
--- a/src/audio/ym64.c
+++ b/src/audio/ym64.c
@@ -29,7 +29,7 @@ _Static_assert(sizeof(ym5header) == 22, "invalid header size");
 
 static int ymread(ym64player_t *player, void *buf, int sz) {
 	if (player->decoder)
-		return lha_lh_new_read(player->decoder, buf, sz);
+		return decompress_lzh5_read(player->decoder, buf, sz);
 	return fread(buf, 1, sz, player->f);
 }
 
@@ -128,9 +128,9 @@ void ym64player_open(ym64player_t *player, const char *fn, ym64player_songinfo_t
 
 		// Initialize decompressor and re-read the header (this time, it will
 		// be decompressed and we should find a valid YM header).
-		player->decoder = (LHANewDecoder*)malloc(sizeof(LHANewDecoder));
+		player->decoder = malloc(DECOMPRESS_LZH5_STATE_SIZE);
 		offset = 0;
-		lha_lh_new_init(player->decoder, lha_callback, (void*)player->f);
+		decompress_lzh5_init(player->decoder, player->f);
 		_ymread(head, 12);
 	}
 
diff --git a/src/compress/lz4_dec.c b/src/compress/lz4_dec.c
index 24b1b217cc..a15ea9ff3b 100644
--- a/src/compress/lz4_dec.c
+++ b/src/compress/lz4_dec.c
@@ -7,6 +7,13 @@
 #include "ringbuf_internal.h"
 #include "../utils.h"
 
+#ifdef N64
+#include <malloc.h>
+#include "debug.h"
+#include "dragonfs.h"
+#include "n64sys.h"
+#endif
+
 #define MIN_MATCH_SIZE  4
 #define MIN_OFFSET 1
 #define MAX_OFFSET 0xffff
@@ -170,6 +177,55 @@ int decompress_lz4_full_mem(const unsigned char *pInBlock, int nBlockSize, unsig
     return (int)(pCurOutData - pOutData);
 }
 
+void* decompress_lz4_full(const char *fn, FILE *fp, size_t cmp_size, size_t size)
+{
+    int bufsize = size + LZ4_DECOMPRESS_INPLACE_MARGIN(cmp_size);
+    int cmp_offset = bufsize - cmp_size;
+    if (cmp_offset & 1) {
+        cmp_offset++;
+        bufsize++;
+    }
+    if (bufsize & 15) {
+        // In case we need to call invalidate (see below), we need an aligned buffer
+        bufsize += 16 - (bufsize & 15);
+    }
+
+    void *s = memalign(16, bufsize);
+    assertf(s, "asset_load: out of memory");
+    int n;
+
+    #ifdef N64
+    if (fn && strncmp(fn, "rom:/", 5) == 0) {
+        // Invalidate the portion of the buffer where we are going to load
+        // the compressed data. This is needed in case the buffer returned
+        // by memalign happens to be in cache already.
+        int align_cmp_offset = cmp_offset & ~15;
+        data_cache_hit_invalidate(s+align_cmp_offset, bufsize-align_cmp_offset);
+
+        // Loading from ROM. This is a common enough situation that we want to optimize it.
+        // Start an asynchronous DMA transfer, so that we can start decompressing as the
+        // data flows in.
+        uint32_t addr = dfs_rom_addr(fn+5) & 0x1FFFFFFF;
+        dma_read_async(s+cmp_offset, addr+16, cmp_size);
+
+        // Run the decompression racing with the DMA.
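+        // (The last 'true' argument distinguishes this DMA-racing path from
+        // the fully-buffered path below, which passes 'false' after reading
+        // the whole file with fread.)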
+ n = decompress_lz4_full_mem(s+cmp_offset, cmp_size, s, size, true); (void)n; + #else + if (false) { + #endif + } else { + // Standard loading via stdio. We have to wait for the whole file to be read. + fread(s+cmp_offset, 1, cmp_size, fp); + + // Run the decompression. + n = decompress_lz4_full_mem(s+cmp_offset, cmp_size, s, size, false); (void)n; + } + assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size); + void *ptr = realloc(s, size); (void)ptr; + assertf(s == ptr, "asset: realloc moved the buffer"); // guaranteed by newlib + return ptr; +} + /** * @brief Fast-access state of the LZ4 algorithm (streaming version). * diff --git a/src/compress/lz4_dec_internal.h b/src/compress/lz4_dec_internal.h index cdaf5e821b..f6dd18caf5 100644 --- a/src/compress/lz4_dec_internal.h +++ b/src/compress/lz4_dec_internal.h @@ -63,6 +63,6 @@ int decompress_lz4_full_mem(const unsigned char *src, int src_size, void decompress_lz4_init(void *state, FILE *fp); ssize_t decompress_lz4_read(void *state, void *buf, size_t len); - +void* decompress_lz4_full(const char *fn, FILE *fp, size_t cmp_size, size_t size); #endif diff --git a/src/compress/lzh5.c b/src/compress/lzh5.c index 985643ad4f..01f05b5137 100644 --- a/src/compress/lzh5.c +++ b/src/compress/lzh5.c @@ -10,6 +10,13 @@ #include "lzh5_internal.h" +#ifdef N64 +#include +#include "debug.h" +#else +#include +#endif + //////////////////////// bit_stream_reader.c /* @@ -1137,28 +1144,34 @@ static size_t lha_lh_new_read_full(LHANewDecoder *decoder, uint8_t *buf, int sz) * Libdragon API *************************************************/ -_Static_assert(sizeof(LHANewDecoderPartial) == DECOMPRESS_LZ5H_STATE_SIZE, "LZH5 state size is wrong"); +_Static_assert(sizeof(LHANewDecoderPartial) == DECOMPRESS_LZH5_STATE_SIZE, "LZH5 state size is wrong"); -void decompress_lz5h_init(void *state, FILE *fp) +void decompress_lzh5_init(void *state, FILE *fp) { LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state; lha_lh_new_init_partial(decoder, fp); } -ssize_t decompress_lz5h_read(void *state, void *buf, size_t len) +ssize_t decompress_lzh5_read(void *state, void *buf, size_t len) { LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state; return lha_lh_new_read_partial(decoder, buf, len); } -int decompress_lz5h_pos(void *state) { +int decompress_lzh5_pos(void *state) { LHANewDecoderPartial *decoder = (LHANewDecoderPartial *)state; return decoder->decoded_bytes; } -size_t decompress_lz5h_full(FILE *fp, void *buf, size_t len) +void* decompress_lzh5_full(const char *fn, FILE *fp, size_t cmp_size, size_t size) { + void *s = memalign(16, size); + assertf(s, "asset_load: out of memory"); + LHANewDecoder decoder; lha_lh_new_init(&decoder, fp); - return lha_lh_new_read_full(&decoder, buf, len); + int n = lha_lh_new_read_full(&decoder, s, size); (void)n; + assertf(n == size, "asset: decompression error on file %s: corrupted? (%d/%d)", fn, n, size); + + return s; } diff --git a/src/compress/lzh5_internal.h b/src/compress/lzh5_internal.h index 7554504db5..26a059c7a5 100644 --- a/src/compress/lzh5_internal.h +++ b/src/compress/lzh5_internal.h @@ -9,33 +9,35 @@ extern "C" { #endif /** - * @brief Size of the LZ5H decompression state. + * @brief Size of the LZH5 decompression state. * * Note that this can still be allocated on the stack, as the stack size * configured by libdragon is 64KB. 
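+ *
+ * (Most of this size is the 16 KiB history ring buffer used by the LZH5
+ * algorithm; the remainder holds the Huffman decoding tables.)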
 */
-#define DECOMPRESS_LZ5H_STATE_SIZE 18688
+#define DECOMPRESS_LZH5_STATE_SIZE 18688
 
-void decompress_lz5h_init(void *state, FILE *fp);
-ssize_t decompress_lz5h_read(void *state, void *buf, size_t len);
-int decompress_lz5h_pos(void *state);
+void decompress_lzh5_init(void *state, FILE *fp);
+ssize_t decompress_lzh5_read(void *state, void *buf, size_t len);
+int decompress_lzh5_pos(void *state);
 
 /**
- * @brief Decompress a full LZ5H file into a buffer.
+ * @brief Decompress a full LZH5 file into a buffer.
  *
  * This function decompresses a full LZH5 file into a memory buffer.
  * The caller should provide a buffer large enough to hold the entire
  * file, or the function will fail.
  *
- * This function is about 50% faster than using #decompress_lz5h_read,
+ * This function is about 50% faster than using #decompress_lzh5_read,
  * as it can assume that the whole decoded file will always be available
  * during decoding.
  *
+ * @param fn        Filename of the file being decompressed, if known
  * @param fp        File pointer to the compressed file
- * @param buf       Buffer to decompress into
- * @param len       Length of the buffer
+ * @param cmp_size  Length of the compressed file
+ * @param size      Length of the file after decompression
+ * @return          Buffer that contains the decompressed file
  */
-size_t decompress_lz5h_full(FILE *fp, void *buf, size_t len);
+void* decompress_lzh5_full(const char *fn, FILE *fp, size_t cmp_size, size_t size);
 
 #ifdef __cplusplus
 }
diff --git a/tools/audioconv64/conv_ym64.c b/tools/audioconv64/conv_ym64.c
index 385f1c12fb..51f1378481 100644
--- a/tools/audioconv64/conv_ym64.c
+++ b/tools/audioconv64/conv_ym64.c
@@ -12,10 +12,14 @@
  *
  */
 
+#define assertf(x, ...) assert(x)
+#define memalign(a, b) malloc(b)
+
 #include "../../src/audio/lzh5.h"		// LZH5 decompression
 #include "lzh5_compress.h"				// LZH5 compression
 #include "lzh5_compress.c"
 
+
 bool flag_ym_compress = false;
 
 typedef struct __attribute__((packed)) {
@@ -46,15 +50,11 @@ _Static_assert(sizeof(ym5header) == 22, "invalid ym5header size");
 
 static FILE *ym_f;
 static bool ym_compressed;
-static LHANewDecoder ym_decoder;
-
-static size_t lha_callback(void *buf, size_t buf_len, void *user_data) {
-	return fread(buf, 1, buf_len, ym_f);
-}
+static uint8_t alignas(8) ym_decoder[DECOMPRESS_LZH5_STATE_SIZE];
 
 static void ymread(void *buf, int sz) {
 	if (ym_compressed) {
-		lha_lh_new_read(&ym_decoder, buf, sz);
+		decompress_lzh5_read(ym_decoder, buf, sz);
 		return;
 	}
 	fread(buf, 1, sz, ym_f);
@@ -155,7 +155,7 @@ int ym_convert(const char *infn, const char *outfn) {
 		// https://github.com/fragglet/lhasa, stored in lz5h.h.
 		fseek(ym_f, head[0]+2, SEEK_SET);
 		ym_compressed = true;
-		lha_lh_new_init(&ym_decoder, lha_callback, NULL);
+		decompress_lzh5_init(ym_decoder, ym_f);
 		ymread(head, 12);
 	}

From f9f2bafb470c400e0b5dfb7bf62743891ef8cfe9 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Sun, 24 Sep 2023 22:58:00 +0200
Subject: [PATCH 07/27] asset.h: improve documentation

---
 include/asset.h | 40 +++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/include/asset.h b/include/asset.h
index 6c2867cc4c..c8f425ade8 100644
--- a/include/asset.h
+++ b/include/asset.h
@@ -43,6 +43,26 @@
  * If you know that the file will never be compressed and you absolutely need
  * to freely seek, simply use the standard fopen() function.
  *
+ * ## Asset compression
+ *
+ * To compress your own data files, you can use the mkasset tool.
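+ *
+ * (mkasset also selects the compression level; libdragon tooling
+ * compresses at level 1 by default, as described below.)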
+ *
+ * There are currently two compression levels:
+ *
+ * * Level 1: this is based on LZ4 by Yann Collet. It is extremely fast and
+ *   produces reasonable compression ratios. It is so fast at decompression
+ *   that our implementation is typically faster at loading and decompressing
+ *   a compressed asset than at loading the same asset uncompressed. Libdragon
+ *   tools will compress at level 1 by default.
+ * * Level 2: this is based on LZH5 by Haruhiko Okumura, part of the LHA archiver.
+ *   It is slower than LZ4, but it produces better compression ratios. It has
+ *   been measured to beat gzip/zlib for small files like those typically used
+ *   on N64. Level 2 should be selected when data must be squeezed at the
+ *   maximum ratio, at the expense of loading speed.
+ *
+ * To minimize text size and RAM usage, only the decompression code for level 1
+ * is compiled by default. If you need to use level 2, you must call
+ * #asset_init_compression(2).
  */
 
 #include <stdio.h>
 
@@ -95,8 +115,8 @@ extern "C" {
  * If the file was compressed using the mkasset tool, it will be
  * automatically uncompressed.
  *
- * @param fn Filename to load (including filesystem prefix)
- * @param sz Pointer to an integer where the size of the file will be stored
+ * @param fn Filename to load (including filesystem prefix, eg: "rom:/foo.dat")
+ * @param sz If not NULL, this will be filled with the uncompressed size of the loaded file
  * @return void* Pointer to the loaded file (must be freed with free() when done)
  */
 void *asset_load(const char *fn, int *sz);
@@ -108,12 +128,18 @@ void *asset_load(const char *fn, int *sz);
  * If the file was compressed using the mkasset tool, it will be
  * automatically uncompressed as it is being read.
  *
- * Note that since the file can be optionally compressed, the returned
- * FILE* cannot be rewinded. It must be read sequentially, or seeked forward.
- * Seeking backward is not supported.
+ * Note that since the file might be compressed, the returned
+ * FILE* cannot be rewound, nor seeked backward, as that would be impossible
+ * to do efficiently on a compressed file. Seeking forward is supported and is
+ * simulated by reading (decompressing) and discarding data.
 *
- * @param fn Filename to open (including filesystem prefix)
- * @param sz If not NULL, pointer to an integer where the size of the file will be stored
+ * This behavior of the returned file is also enforced for non-compressed
+ * assets, so that the code is ready to switch to compressed assets if
+ * required. If you need random access to an uncompressed file, simply use
+ * the standard fopen() function.
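+ *
+ * A minimal usage sketch (the file name is illustrative only):
+ *
+ * @code{.c}
+ * FILE *f = asset_fopen("rom:/level1.dat", NULL);
+ * char buf[128];
+ * fread(buf, 1, sizeof(buf), f);   // transparently decompressed if needed
+ * fclose(f);
+ * @endcode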
+ *
+ * @param fn Filename to load (including filesystem prefix, eg: "rom:/foo.dat")
+ * @param sz If not NULL, this will be filled with the uncompressed size of the loaded file
  * @return FILE* FILE pointer to use with standard C functions (fread, fclose)
  */
 FILE *asset_fopen(const char *fn, int *sz);

From cec126f07f352128eb0e37e9905ecda0d5cedfdc Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Tue, 26 Sep 2023 23:46:36 +0200
Subject: [PATCH 08/27] lzh5: remove old LZH5 implementation (local to the
 audio library)

---
 doxygen-public.conf                           |    2 +-
 examples/audioplayer/audioplayer.c            |    1 -
 include/ym64.h                                |    6 +-
 src/audio/lzh5.h                              | 1294 -----------------
 src/audio/ym64.c                              |    9 +-
 tools/audioconv64/conv_ym64.c                 |   10 +-
 tools/{audioconv64 => common}/lzh5_compress.c |    0
 tools/{audioconv64 => common}/lzh5_compress.h |    0
 8 files changed, 11 insertions(+), 1311 deletions(-)
 delete mode 100644 src/audio/lzh5.h
 rename tools/{audioconv64 => common}/lzh5_compress.c (100%)
 rename tools/{audioconv64 => common}/lzh5_compress.h (100%)

diff --git a/doxygen-public.conf b/doxygen-public.conf
index de5df0338b..24eb697027 100644
--- a/doxygen-public.conf
+++ b/doxygen-public.conf
@@ -905,7 +905,7 @@ RECURSIVE = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE = ./src/audio/libxm/ ./src/audio/lzh5.h ./src/fatfs/
+EXCLUDE = ./src/audio/libxm/ ./src/compress/lzh5.c ./src/fatfs/
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
diff --git a/examples/audioplayer/audioplayer.c b/examples/audioplayer/audioplayer.c
index 67bc25ec28..346aaac5e6 100644
--- a/examples/audioplayer/audioplayer.c
+++ b/examples/audioplayer/audioplayer.c
@@ -4,7 +4,6 @@
 // We need to show lots of internal details of the module which are not
 // exposed via public API, so include the internal header file.
 #include "../../src/audio/libxm/xm_internal.h"
-#include "../../src/audio/lzh5.h"
 
 #define CLAMP(x, min, max) ((x) < (min) ? (min) : ((x) > (max) ? (max) : (x)))
 
diff --git a/include/ym64.h b/include/ym64.h
index 66dbeecc3e..b2726b4212 100644
--- a/include/ym64.h
+++ b/include/ym64.h
@@ -11,10 +11,6 @@ extern "C" {
 #include "mixer.h"
 #include "ay8910.h"
 
-/// @cond
-typedef struct _LHANewDecoder LHANewDecoder;
-/// @endcond
-
 /**
  * @file ym64.h
 * @brief Player for the .YM64 module format (Arkos Tracker 2)
@@ -63,7 +59,7 @@ typedef struct {
 	waveform_t wave;          ///< waveform for playback with the mixer
 
 	FILE *f;                  ///< Open file handle
-	LHANewDecoder *decoder;   ///< Optional LHA decoder (compressed YM files)
+	void *decoder;            ///< Optional LHA decoder (compressed YM files)
 	int start_off;            ///< Starting offset of the first audio frame
 
 	AY8910 ay;                ///< AY8910 emulator
diff --git a/src/audio/lzh5.h b/src/audio/lzh5.h
deleted file mode 100644
index 0394e2b090..0000000000
--- a/src/audio/lzh5.h
+++ /dev/null
@@ -1,1294 +0,0 @@
-// Decoder for algorithm -lh5- of the LZH family.
-// This code comes from https://github.com/fragglet/lhasa
-// and has been turned into a single file header with only
-// the -lh5- algo.
-// This was also modified to allow for full streaming decompression
-// (up to 1 byte at a time). Before, the code would decompress one
-// internal LHA block at a time, writing a non predictable number of
-// bytes in the output buffer.
-// This file is ISC Licensed.
- -#ifndef LZH5_H -#define LZH5_H - -//////////////////////// public/lha_decoder.h - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -#ifndef LHASA_PUBLIC_LHA_DECODER_H -#define LHASA_PUBLIC_LHA_DECODER_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @file lha_decoder.h - * - * @brief Raw LHA data decoder. - * - * This file defines the interface to the decompression code, which can - * be used to decompress the raw compressed data from an LZH file. - * - * Implementations of the various compression algorithms used in LZH - * archives are provided - these are represented by the - * @ref LHADecoderType structure, and can be retrieved using the - * @ref lha_decoder_for_name function. One of these can then be passed to - * the @ref lha_decoder_new function to create a @ref LHADecoder structure - * and decompress the data. - */ - -/** - * Opaque type representing a type of decoder. - * - * This is an implementation of the decompression code for one of the - * algorithms used in LZH archive files. Pointers to these structures are - * retrieved by using the @ref lha_decoder_for_name function. - */ - -typedef struct _LHADecoderType LHADecoderType; - -/** - * Opaque type representing an instance of a decoder. - * - * This is a decoder structure being used to decompress a stream of - * compressed data. Instantiated using the @ref lha_decoder_new - * function and freed using the @ref lha_decoder_free function. - */ - -typedef struct _LHADecoder LHADecoder; - -/** - * Callback function invoked when a decoder wants to read more compressed - * data. - * - * @param buf Pointer to the buffer in which to store the data. - * @param buf_len Size of the buffer, in bytes. - * @param user_data Extra pointer to pass to the decoder. - * @return Number of bytes read. - */ - -typedef size_t (*LHADecoderCallback)(void *buf, size_t buf_len, - void *user_data); - -/** - * Callback function used for monitoring decode progress. - * The callback is invoked for every block processed (block size depends on - * decode algorithm). - * - * @param num_blocks Number of blocks processed so far. - * @param total_blocks Total number of blocks to process. - * @param callback_data Extra user-specified data passed to the callback. - */ - -typedef void (*LHADecoderProgressCallback)(unsigned int num_blocks, - unsigned int total_blocks, - void *callback_data); - -/** - * Get the decoder type for the specified name. - * - * @param name String identifying the decoder type, for - * example, "-lh1-". - * @return Pointer to the decoder type, or NULL if there - * is no decoder type for the specified name. - */ - -LHADecoderType *lha_decoder_for_name(char *name); - -/** - * Allocate a new decoder for the specified type. - * - * @param dtype The decoder type. 
- * @param callback Callback function for the decoder to call to read - * more compressed data. - * @param callback_data Extra data to pass to the callback function. - * @param stream_length Length of the uncompressed data, in bytes. When - * this point is reached, decompression will stop. - * @return Pointer to the new decoder, or NULL for failure. - */ - -LHADecoder *lha_decoder_new(LHADecoderType *dtype, - LHADecoderCallback callback, - void *callback_data, - size_t stream_length); - -/** - * Free a decoder. - * - * @param decoder The decoder to free. - */ - -void lha_decoder_free(LHADecoder *decoder); - -/** - * Set a callback function to monitor decode progress. - * - * @param decoder The decoder. - * @param callback Callback function to monitor decode progress. - * @param callback_data Extra data to pass to the decoder. - */ - -void lha_decoder_monitor(LHADecoder *decoder, - LHADecoderProgressCallback callback, - void *callback_data); - -/** - * Decode (decompress) more data. - * - * @param decoder The decoder. - * @param buf Pointer to buffer to store decompressed data. - * @param buf_len Size of the buffer, in bytes. - * @return Number of bytes decompressed. - */ - -size_t lha_decoder_read(LHADecoder *decoder, uint8_t *buf, size_t buf_len); - -/** - * Get the current 16-bit CRC of the decompressed data. - * - * This should be called at the end of decompression to check that the - * data was extracted correctly, and the value compared against the CRC - * from the file header. - * - * @param decoder The decoder. - * @return 16-bit CRC of the data decoded so far. - */ - -uint16_t lha_decoder_get_crc(LHADecoder *decoder); - -/** - * Get the count of the number of bytes decoded. - * - * This should be called at the end of decompression, and the value - * compared against the file length from the file header. - * - * @param decoder The decoder. - * @return The number of decoded bytes. - */ - -size_t lha_decoder_get_length(LHADecoder *decoder); - -#ifdef __cplusplus -} -#endif - -#endif /* #ifndef LHASA_LHA_DECODER_H */ - - - -//////////////////////// lha_decoder.h - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -#ifndef LHASA_LHA_DECODER_H -#define LHASA_LHA_DECODER_H - -struct _LHADecoderType { - - /** - * Callback function to initialize the decoder. - * - * @param extra_data Pointer to the extra data area allocated for - * the decoder. - * @param callback Callback function to invoke to read more - * compressed data. - * @param callback_data Extra pointer to pass to the callback. - * @return Non-zero for success. - */ - - int (*init)(void *extra_data, - LHADecoderCallback callback, - void *callback_data); - - /** - * Callback function to free the decoder. - * - * @param extra_data Pointer to the extra data area allocated for - * the decoder. 
- */ - - void (*free)(void *extra_data); - - /** - * Callback function to read (ie. decompress) data from the - * decoder. - * - * @param extra_data Pointer to the decoder's custom data. - * @param buf Pointer to the buffer in which to store - * the decompressed data. The buffer is - * at least 'max_read' bytes in size. - * @return Number of bytes decompressed. - */ - - size_t (*read)(void *extra_data, uint8_t *buf); - - /** Number of bytes of extra data to allocate for the decoder. */ - - size_t extra_size; - - /** Maximum number of bytes that might be put into the buffer by - a single call to read() */ - - size_t max_read; - - /** Block size. Used for calculating number of blocks for - progress bar. */ - - size_t block_size; -}; - -struct _LHADecoder { - - /** Type of decoder (algorithm) */ - - LHADecoderType *dtype; - - /** Callback function to monitor decoder progress. */ - - LHADecoderProgressCallback progress_callback; - void *progress_callback_data; - - /** Last announced block position, for progress callback. */ - - unsigned int last_block, total_blocks; - - /** Current position in the decode stream, and total length. */ - - size_t stream_pos, stream_length; - - /** Output buffer, containing decoded data not yet returned. */ - - unsigned int outbuf_pos, outbuf_len; - uint8_t *outbuf; - - /** If true, the decoder read() function returned zero. */ - - unsigned int decoder_failed; - - /** Current CRC of the output stream. */ - - uint16_t crc; -}; - -#endif /* #ifndef LHASA_LHA_DECODER_H */ - - -//////////////////////// bit_stream_reader.c - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -#include - -// -// Data structure used to read bits from an input source as a stream. -// -// This file is designed to be #included by other source files to -// make a complete decoder. -// - -typedef struct { - - // Callback function to invoke to read more data from the - // input stream. - - LHADecoderCallback callback; - void *callback_data; - - // Bits from the input stream that are waiting to be read. - - uint32_t bit_buffer; - unsigned int bits; - -} BitStreamReader; - -// Initialize bit stream reader structure. - -static void bit_stream_reader_init(BitStreamReader *reader, - LHADecoderCallback callback, - void *callback_data) -{ - reader->callback = callback; - reader->callback_data = callback_data; - - reader->bits = 0; - reader->bit_buffer = 0; -} - -// Return the next n bits waiting to be read from the input stream, -// without removing any. Returns -1 for failure. - -static int peek_bits(BitStreamReader *reader, - unsigned int n) -{ - uint8_t buf[4]; - unsigned int fill_bytes; - size_t bytes; - - if (n == 0) { - return 0; - } - - // If there are not enough bits in the buffer to satisfy this - // request, we need to fill up the buffer with more bits. 
- - while (reader->bits < n) { - - // Maximum number of bytes we can fill? - - fill_bytes = (32 - reader->bits) / 8; - - // Read from input and fill bit_buffer. - - memset(buf, 0, sizeof(buf)); - bytes = reader->callback(buf, fill_bytes, - reader->callback_data); - - // End of file? - - if (bytes == 0) { - return -1; - } - - reader->bit_buffer |= (uint32_t) buf[0] << (24 - reader->bits); - reader->bit_buffer |= (uint32_t) buf[1] << (16 - reader->bits); - reader->bit_buffer |= (uint32_t) buf[2] << (8 - reader->bits); - reader->bit_buffer |= (uint32_t) buf[3]; - - reader->bits += bytes * 8; - } - - return (signed int) (reader->bit_buffer >> (32 - n)); -} - -// Read a bit from the input stream. -// Returns -1 for failure. - -static int read_bits(BitStreamReader *reader, - unsigned int n) -{ - int result; - - result = peek_bits(reader, n); - - if (result >= 0) { - reader->bit_buffer <<= n; - reader->bits -= n; - } - - return result; -} - - -// Read a bit from the input stream. -// Returns -1 for failure. - -static int read_bit(BitStreamReader *reader) -{ - return read_bits(reader, 1); -} - - -//////////////////////// tree_decode.c -typedef uint16_t TreeElement; - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -// Common tree decoding code. -// -// A recurring feature used by the different LHA algorithms is to -// encode a set of codes, which have varying bit lengths. This is -// implemented using a binary tree, stored inside an array of -// elements. -// -// This file is implemented as a "template" file to be #include-d by -// other files. The typedef for TreeElement must be defined before -// include. - - -// Upper bit is set in a node value to indicate a leaf. - -#define TREE_NODE_LEAF (TreeElement) (1 << (sizeof(TreeElement) * 8 - 1)) - -// Structure used to hold data needed to build the tree. - -typedef struct { - // The tree data and its size (must not be exceeded) - - TreeElement *tree; - unsigned int tree_len; - - // Counter used to allocate entries from the tree. - // Every time a new node is allocated, this increase by 2. - - unsigned int tree_allocated; - - // The next tree entry. - // As entries are allocated sequentially, the range from - // next_entry..tree_allocated-1 constitutes the indices into - // the tree that are available to be filled in. By the - // end of the tree build, next_entry should = tree_allocated. - - unsigned int next_entry; -} TreeBuildData; - -// Initialize all elements of the given tree to a good initial state. - -static void init_tree(TreeElement *tree, size_t tree_len) -{ - unsigned int i; - - for (i = 0; i < tree_len; ++i) { - tree[i] = TREE_NODE_LEAF; - } -} - -// Set tree to always decode to a single code. 
- -static void set_tree_single(TreeElement *tree, TreeElement code) -{ - tree[0] = (TreeElement) code | TREE_NODE_LEAF; -} - -// "Expand" the list of queue entries. This generates a new child -// node at each of the entries currently in the queue, adding the -// children of those nodes into the queue to replace them. -// The effect of this is to add an extra level to the tree, and -// to increase the tree depth of the indices in the queue. - -static void expand_queue(TreeBuildData *build) -{ - unsigned int end_offset; - unsigned int new_nodes; - - // Sanity check that there is enough space in the tree for - // all the new nodes. - - new_nodes = (build->tree_allocated - build->next_entry) * 2; - - if (build->tree_allocated + new_nodes > build->tree_len) { - return; - } - - // Go through all entries currently in the allocated range, and - // allocate a subnode for each. - - end_offset = build->tree_allocated; - - while (build->next_entry < end_offset) { - build->tree[build->next_entry] = build->tree_allocated; - build->tree_allocated += 2; - ++build->next_entry; - } -} - -// Read the next entry from the queue of entries waiting to be used. - -static unsigned int read_next_entry(TreeBuildData *build) -{ - unsigned int result; - - // Sanity check. - - if (build->next_entry >= build->tree_allocated) { - return 0; - } - - result = build->next_entry; - ++build->next_entry; - - return result; -} - -// Add all codes to the tree that have the specified length. -// Returns non-zero if there are any entries in code_lengths[] still -// waiting to be added to the tree. - -static int add_codes_with_length(TreeBuildData *build, - uint8_t *code_lengths, - unsigned int num_code_lengths, - unsigned int code_len) -{ - unsigned int i; - unsigned int node; - int codes_remaining; - - codes_remaining = 0; - - for (i = 0; i < num_code_lengths; ++i) { - - // Does this code belong at this depth in the tree? - - if (code_lengths[i] == code_len) { - node = read_next_entry(build); - - build->tree[node] = (TreeElement) i | TREE_NODE_LEAF; - } - - // More work to be done after this pass? - - else if (code_lengths[i] > code_len) { - codes_remaining = 1; - } - } - - return codes_remaining; -} - -// Build a tree, given the specified array of codes indicating the -// required depth within the tree at which each code should be -// located. - -static void build_tree(TreeElement *tree, size_t tree_len, - uint8_t *code_lengths, unsigned int num_code_lengths) -{ - TreeBuildData build; - unsigned int code_len; - - build.tree = tree; - build.tree_len = tree_len; - - // Start with a single entry in the queue - the root node - // pointer. - - build.next_entry = 0; - - // We always have the root ... - - build.tree_allocated = 1; - - // Iterate over each possible code length. - // Note: code_len == 0 is deliberately skipped over, as 0 - // indicates "not used". - - code_len = 0; - - do { - // Advance to the next code length by allocating extra - // nodes to the tree - the slots waiting in the queue - // will now be one level deeper in the tree (and the - // codes 1 bit longer). - - expand_queue(&build); - ++code_len; - - // Add all codes that have this length. 
- - } while (add_codes_with_length(&build, code_lengths, - num_code_lengths, code_len)); -} - -/* -static void display_tree(TreeElement *tree, unsigned int node, int offset) -{ - unsigned int i; - - if (node & TREE_NODE_LEAF) { - for (i = 0; i < offset; ++i) putchar(' '); - printf("leaf %i\n", node & ~TREE_NODE_LEAF); - } else { - for (i = 0; i < offset; ++i) putchar(' '); - printf("0 ->\n"); - display_tree(tree, tree[node], offset + 4); - for (i = 0; i < offset; ++i) putchar(' '); - printf("1 ->\n"); - display_tree(tree, tree[node + 1], offset + 4); - } -} -*/ - -// Read bits from the input stream, traversing the specified tree -// from the root node until we reach a leaf. The leaf value is -// returned. - -static int read_from_tree(BitStreamReader *reader, TreeElement *tree) -{ - TreeElement code; - int bit; - - // Start from root. - - code = tree[0]; - - while ((code & TREE_NODE_LEAF) == 0) { - - bit = read_bit(reader); - - if (bit < 0) { - return -1; - } - - code = tree[code + (unsigned int) bit]; - } - - // Mask off leaf bit to get the plain code. - - return (int) (code & ~TREE_NODE_LEAF); -} - - - - -//////////////////////// lh5_decoder.c - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -// -// Decoder for the -lh5- algorithm. -// -// This is the "new" algorithm that appeared in LHA v2, replacing -// the older -lh1-. -lh4- seems to be identical to -lh5-. -// - -// 16 KiB history ring buffer: - -#define HISTORY_BITS 14 /* 2^14 = 16384 */ - -// Number of bits to encode HISTORY_BITS: - -#define OFFSET_BITS 4 - -// Name of the variable for the encoder: - -#define DECODER_NAME lha_lh5_decoder - - -//////////////////////// lh_new_decoder.c - - -/* - -Copyright (c) 2011, 2012, Simon Howard - -Permission to use, copy, modify, and/or distribute this software -for any purpose with or without fee is hereby granted, provided -that the above copyright notice and this permission notice appear -in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - */ - -// Decoder for "new-style" LHA algorithms, used with LHA v2 and onwards -// (-lh4-, -lh5-, -lh6-, -lh7-). -// -// This file is designed to be a template. It is #included by other -// files to generate an optimized decoder. - -#include -#include -#include -#include - - -// Threshold for copying. The first copy code starts from here. 
- -#define COPY_THRESHOLD 3 /* bytes */ - -// Ring buffer containing history has a size that is a power of two. -// The number of bits is specified. - -#define RING_BUFFER_SIZE (1 << HISTORY_BITS) - -// Required size of the output buffer. At most, a single call to read() -// might result in a copy of the entire ring buffer. - -#define OUTPUT_BUFFER_SIZE RING_BUFFER_SIZE - -// Number of different command codes. 0-255 range are literal byte -// values, while higher values indicate copy from history. - -#define NUM_CODES 510 - -// Number of possible codes in the "temporary table" used to encode the -// codes table. - -#define MAX_TEMP_CODES 20 - -typedef struct _LHANewDecoder { - // Input bit stream. - - BitStreamReader bit_stream_reader; - - // Ring buffer of past data. Used for position-based copies. - - uint8_t ringbuf[RING_BUFFER_SIZE]; - unsigned int ringbuf_pos; - int ringbuf_copy_pos; - int ringbuf_copy_count; - - // Number of commands remaining before we start a new block. - - unsigned int block_remaining; - - // Table used for the code tree. - - TreeElement code_tree[NUM_CODES * 2]; - - // Table used to encode the offset tree, used to read offsets - // into the history buffer. This same table is also used to - // encode the temp-table, which is bigger; hence the size. - - TreeElement offset_tree[MAX_TEMP_CODES * 2]; -} LHANewDecoder; - -// Initialize the history ring buffer. - -static void init_ring_buffer(LHANewDecoder *decoder) -{ - memset(decoder->ringbuf, ' ', RING_BUFFER_SIZE); - decoder->ringbuf_pos = 0; - decoder->ringbuf_copy_pos = 0; - decoder->ringbuf_copy_count = 0; -} - -static int __attribute__((unused)) lha_lh_new_init(LHANewDecoder *decoder, LHADecoderCallback callback, - void *callback_data) -{ - // Initialize input stream reader. - - bit_stream_reader_init(&decoder->bit_stream_reader, - callback, callback_data); - - // Initialize data structures. - - init_ring_buffer(decoder); - - // First read starts the first block. - - decoder->block_remaining = 0; - - // Initialize tree tables to a known state. - - init_tree(decoder->code_tree, NUM_CODES * 2); - init_tree(decoder->offset_tree, MAX_TEMP_CODES * 2); - - return 1; -} - -// Read a length value - this is normally a value in the 0-7 range, but -// sometimes can be longer. - -static int read_length_value(LHANewDecoder *decoder) -{ - int i, len; - - len = read_bits(&decoder->bit_stream_reader, 3); - - if (len < 0) { - return -1; - } - - if (len == 7) { - // Read more bits to extend the length until we reach a '0'. - - for (;;) { - i = read_bit(&decoder->bit_stream_reader); - - if (i < 0) { - return -1; - } else if (i == 0) { - break; - } - - ++len; - } - } - - return len; -} - -// Read the values from the input stream that define the temporary table -// used for encoding the code table. - -static int read_temp_table(LHANewDecoder *decoder) -{ - int i, j, n, len, code; - uint8_t code_lengths[MAX_TEMP_CODES]; - - // How many codes? - - n = read_bits(&decoder->bit_stream_reader, 5); - - if (n < 0) { - return 0; - } - - // n=0 is a special case, meaning only a single code that - // is of zero length. - - if (n == 0) { - code = read_bits(&decoder->bit_stream_reader, 5); - - if (code < 0) { - return 0; - } - - set_tree_single(decoder->offset_tree, code); - return 1; - } - - // Enforce a hard limit on the number of codes. - - if (n > MAX_TEMP_CODES) { - n = MAX_TEMP_CODES; - } - - // Read the length of each code. 
- - for (i = 0; i < n; ++i) { - len = read_length_value(decoder); - - if (len < 0) { - return 0; - } - - code_lengths[i] = len; - - // After the first three lengths, there is a 2-bit - // field to allow skipping over up to a further three - // lengths. Not sure of the reason for this ... - - if (i == 2) { - len = read_bits(&decoder->bit_stream_reader, 2); - - if (len < 0) { - return 0; - } - - for (j = 0; j < len; ++j) { - ++i; - code_lengths[i] = 0; - } - } - } - - build_tree(decoder->offset_tree, MAX_TEMP_CODES * 2, code_lengths, n); - - return 1; -} - -// Code table codes can indicate that a sequence of codes should be -// skipped over. The number to skip is Huffman-encoded. Given a skip -// range (0-2), this reads the number of codes to skip over. - -static int read_skip_count(LHANewDecoder *decoder, int skiprange) -{ - int result; - - // skiprange=0 => 1 code. - - if (skiprange == 0) { - result = 1; - } - - // skiprange=1 => 3-18 codes. - - else if (skiprange == 1) { - result = read_bits(&decoder->bit_stream_reader, 4); - - if (result < 0) { - return -1; - } - - result += 3; - } - - // skiprange=2 => 20+ codes. - - else { - result = read_bits(&decoder->bit_stream_reader, 9); - - if (result < 0) { - return -1; - } - - result += 20; - } - - return result; -} - -static int read_code_table(LHANewDecoder *decoder) -{ - int i, j, n, skip_count, code; - uint8_t code_lengths[NUM_CODES]; - - // How many codes? - - n = read_bits(&decoder->bit_stream_reader, 9); - - if (n < 0) { - return 0; - } - - // n=0 implies a single code of zero length; all inputs - // decode to the same code. - - if (n == 0) { - code = read_bits(&decoder->bit_stream_reader, 9); - - if (code < 0) { - return 0; - } - - set_tree_single(decoder->code_tree, code); - - return 1; - } - - if (n > NUM_CODES) { - n = NUM_CODES; - } - - // Read the length of each code. - // The lengths are encoded using the temp-table previously read; - // offset_tree is reused temporarily to hold it. - - i = 0; - - while (i < n) { - code = read_from_tree(&decoder->bit_stream_reader, - decoder->offset_tree); - - if (code < 0) { - return 0; - } - - // The code that was read can have different meanings. - // If in the range 0-2, it indicates that a number of - // codes are unused and should be skipped over. - // Values greater than two represent a frequency count. - - if (code <= 2) { - skip_count = read_skip_count(decoder, code); - - if (skip_count < 0) { - return 0; - } - - for (j = 0; j < skip_count && i < n; ++j) { - code_lengths[i] = 0; - ++i; - } - } else { - code_lengths[i] = code - 2; - ++i; - } - } - - build_tree(decoder->code_tree, NUM_CODES * 2, code_lengths, n); - - return 1; -} - -static int read_offset_table(LHANewDecoder *decoder) -{ - int i, n, len, code; - uint8_t code_lengths[HISTORY_BITS]; - - // How many codes? - - n = read_bits(&decoder->bit_stream_reader, OFFSET_BITS); - - if (n < 0) { - return 0; - } - - // n=0 is a special case, meaning only a single code that - // is of zero length. - - if (n == 0) { - code = read_bits(&decoder->bit_stream_reader, OFFSET_BITS); - - if (code < 0) { - return 0; - } - - set_tree_single(decoder->offset_tree, code); - return 1; - } - - // Enforce a hard limit on the number of codes. - - if (n > HISTORY_BITS) { - n = HISTORY_BITS; - } - - // Read the length of each code. 
- - for (i = 0; i < n; ++i) { - len = read_length_value(decoder); - - if (len < 0) { - return 0; - } - - code_lengths[i] = len; - } - - build_tree(decoder->offset_tree, MAX_TEMP_CODES * 2, code_lengths, n); - - return 1; -} - -// Start reading a new block from the input stream. - -static int start_new_block(LHANewDecoder *decoder) -{ - int len; - - // Read length of new block (in commands). - - len = read_bits(&decoder->bit_stream_reader, 16); - - if (len < 0) { - return 0; - } - - decoder->block_remaining = (size_t) len; - - // Read the temporary decode table, used to encode the codes table. - // The position table data structure is reused for this. - - if (!read_temp_table(decoder)) { - return 0; - } - - // Read the code table; this is encoded *using* the temp table. - - if (!read_code_table(decoder)) { - return 0; - } - - // Read the offset table. - - if (!read_offset_table(decoder)) { - return 0; - } - - return 1; -} - -// Read the next code from the input stream. Returns the code, or -1 if -// an error occurred. - -static int read_code(LHANewDecoder *decoder) -{ - return read_from_tree(&decoder->bit_stream_reader, decoder->code_tree); -} - -// Read an offset distance from the input stream. -// Returns the code, or -1 if an error occurred. - -static int read_offset_code(LHANewDecoder *decoder) -{ - int bits, result; - - bits = read_from_tree(&decoder->bit_stream_reader, - decoder->offset_tree); - - if (bits < 0) { - return -1; - } - - // The code read indicates the length of the offset in bits. - // - // The returned value looks like this: - // bits = 0 -> 0 - // bits = 1 -> 1 - // bits = 2 -> 1x - // bits = 3 -> 1xx - // bits = 4 -> 1xxx - // etc. - - if (bits == 0) { - return 0; - } else if (bits == 1) { - return 1; - } else { - result = read_bits(&decoder->bit_stream_reader, bits - 1); - - if (result < 0) { - return -1; - } - - return result + (1 << (bits - 1)); - } -} - -// Add a byte value to the output stream. - -static void output_byte(LHANewDecoder *decoder, uint8_t *buf, - size_t *buf_len, uint8_t b) -{ - buf[*buf_len] = b; - ++*buf_len; - - decoder->ringbuf[decoder->ringbuf_pos] = b; - decoder->ringbuf_pos = (decoder->ringbuf_pos + 1) % RING_BUFFER_SIZE; -} - -// Copy a block from the history buffer. - -static void set_copy_from_history(LHANewDecoder *decoder, uint8_t *buf, size_t count) -{ - int offset; - - offset = read_offset_code(decoder); - - if (offset < 0) { - return; - } - - decoder->ringbuf_copy_pos = decoder->ringbuf_pos + RING_BUFFER_SIZE - (unsigned int) offset - 1; - decoder->ringbuf_copy_count = count; -} - -static size_t __attribute__((unused)) lha_lh_new_read(LHANewDecoder *decoder, uint8_t *buf, int sz) -{ - size_t result = 0; - int code; - - while (sz > 0) { - - if (decoder->ringbuf_copy_count > 0) { - output_byte(decoder, buf, &result, - decoder->ringbuf[decoder->ringbuf_copy_pos++ % RING_BUFFER_SIZE]); - decoder->ringbuf_copy_count--; - sz--; - continue; - } - - - // Start of new block? - while (decoder->block_remaining == 0) { - if (!start_new_block(decoder)) { - return 0; - } - } - - --decoder->block_remaining; - - // Read next command from input stream. - - code = read_code(decoder); - - if (code < 0) { - return 0; - } - - // The code may be either a literal byte value or a copy command. 
- - if (code < 256) { - output_byte(decoder, buf, &result, (uint8_t) code); - sz--; - } else { - set_copy_from_history(decoder, buf, code - 256 + COPY_THRESHOLD); - } - } - - return result; -} - -#endif /* LZH5_H */ diff --git a/src/audio/ym64.c b/src/audio/ym64.c index 23a5869c04..2ecda3dd4d 100644 --- a/src/audio/ym64.c +++ b/src/audio/ym64.c @@ -6,13 +6,15 @@ #include "ym64.h" #include "ay8910.h" -#include "lzh5.h" +#include "../compress/lzh5_internal.h" #include "samplebuffer.h" #include "debug.h" +#include "asset_internal.h" #include "utils.h" #include #include #include +#include /** @brief Header of a YM5 file */ typedef struct __attribute__((packed)) { @@ -33,11 +35,6 @@ static int ymread(ym64player_t *player, void *buf, int sz) { return fread(buf, 1, sz, player->f); } -static unsigned int lha_callback(void *buf, size_t buf_len, void *user_data) { - FILE* f = (FILE*)user_data; - return fread(buf, 1, buf_len, f); -} - static void ym_wave_read(void *ctx, samplebuffer_t *sbuf, int wpos, int wlen, bool seeking) { ym64player_t *player = (ym64player_t*)ctx; diff --git a/tools/audioconv64/conv_ym64.c b/tools/audioconv64/conv_ym64.c index 51f1378481..c236530f01 100644 --- a/tools/audioconv64/conv_ym64.c +++ b/tools/audioconv64/conv_ym64.c @@ -15,9 +15,11 @@ #define assertf(x, ...) assert(x) #define memalign(a, b) malloc(b) -#include "../../src/audio/lzh5.h" // LZH5 decompression -#include "lzh5_compress.h" // LZH5 compression -#include "lzh5_compress.c" +#include "../../src/compress/lzh5_internal.h" // LZH5 decompression +#include "../../src/compress/lzh5.c" +#include "../common/lzh5_compress.h" // LZH5 compression +#include "../common/lzh5_compress.c" +#include bool flag_ym_compress = false; @@ -152,7 +154,7 @@ int ym_convert(const char *infn, const char *outfn) { // Initialize LHA decompression, and read back the now uncompressed header. // Decompression is performed via a minimal version of - // https://github.com/fragglet/lhasa, stored in lz5h.h. + // https://github.com/fragglet/lhasa, stored in lzh5.h. 
 	fseek(ym_f, head[0]+2, SEEK_SET);
 	ym_compressed = true;
 	decompress_lzh5_init(ym_decoder, ym_f);
diff --git a/tools/audioconv64/lzh5_compress.c b/tools/common/lzh5_compress.c
similarity index 100%
rename from tools/audioconv64/lzh5_compress.c
rename to tools/common/lzh5_compress.c
diff --git a/tools/audioconv64/lzh5_compress.h b/tools/common/lzh5_compress.h
similarity index 100%
rename from tools/audioconv64/lzh5_compress.h
rename to tools/common/lzh5_compress.h

From ccacff2dfd1d8a191ecaa4c83a4653c4c3dcce9c Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Sun, 24 Sep 2023 23:17:34 +0200
Subject: [PATCH 09/27] tools: print tool name during compilation

---
 tools/Makefile             | 3 +++
 tools/audioconv64/Makefile | 6 ++++--
 tools/dumpdfs/Makefile     | 2 ++
 tools/mkdfs/Makefile       | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tools/Makefile b/tools/Makefile
index 277ec9acdf..b6c6372066 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -20,15 +20,18 @@ clean:
 	$(MAKE) -C audioconv64 clean
 
 chksum64: chksum64.c
+	@echo "    [TOOL] chksum64"
 	gcc -o chksum64 chksum64.c
 
 n64tool: n64tool.c
+	@echo "    [TOOL] n64tool"
 	gcc -o n64tool n64tool.c
 
 n64sym: n64sym.c
 	gcc -O2 -o n64sym n64sym.c
 
 ed64romconfig: ed64romconfig.c
+	@echo "    [TOOL] ed64romconfig"
 	gcc -o ed64romconfig ed64romconfig.c
 
 .PHONY: dumpdfs
diff --git a/tools/audioconv64/Makefile b/tools/audioconv64/Makefile
index 7c6a5c16ce..abc4dd838b 100644
--- a/tools/audioconv64/Makefile
+++ b/tools/audioconv64/Makefile
@@ -1,11 +1,13 @@
 INSTALLDIR = $(N64_INST)
 CFLAGS = -std=gnu11 -MMD -O2 -Wall -Wno-unused-result -Werror -I../../include
 LDFLAGS += -lm
+SRC = audioconv64.c
 
 all: audioconv64
 
-audioconv64: audioconv64.c
-	$(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
+audioconv64: $(SRC)
+	@echo "    [TOOL] audioconv64"
+	$(CC) $(CFLAGS) $(SRC) $(LDFLAGS) -o $@
 
 install: audioconv64
 	install -m 0755 audioconv64 $(INSTALLDIR)/bin
diff --git a/tools/dumpdfs/Makefile b/tools/dumpdfs/Makefile
index b7c39a913d..111a9eea03 100644
--- a/tools/dumpdfs/Makefile
+++ b/tools/dumpdfs/Makefile
@@ -4,6 +4,8 @@ CFLAGS = -std=gnu99 -O2 -Wall -Wno-unused-result -Wno-pointer-to-int-cast -Wno-i
 all: dumpdfs
 
 dumpdfs: dumpdfs.c
+	@echo "    [TOOL] dumpdfs"
+	$(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
 
 install: dumpdfs
 	install -m 0755 dumpdfs $(INSTALLDIR)/bin
diff --git a/tools/mkdfs/Makefile b/tools/mkdfs/Makefile
index a13fd0feb8..89fe7e10b2 100644
--- a/tools/mkdfs/Makefile
+++ b/tools/mkdfs/Makefile
@@ -4,6 +4,8 @@ CFLAGS = -std=gnu99 -O2 -Wall -Werror -I../../include
 all: mkdfs
 
 mkdfs: mkdfs.c
+	@echo "    [TOOL] mkdfs"
+	$(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
 
 install: mkdfs
 	install -m 0755 mkdfs $(INSTALLDIR)/bin

From 470cb606ee955d2bb76e44965b61b4be3e0199f1 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Sun, 24 Sep 2023 23:43:28 +0200
Subject: [PATCH 10/27] tools: add simple library to write big-endian binary
 files.

This will be used to simplify all tools writing binary files that are
now byteswapping data in memory (and often causing weird bugs because
of that).
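As an illustration of the API (a minimal sketch, not part of the patch:
write_header() and the field layout are hypothetical, while w16(),
w32_placeholder(), w32_at() and walign() are the helpers added below),
a tool would write a big-endian header and back-patch a size field like
this:

    #include "binout.h"

    // Sketch: emit a big-endian header, reserving a 32-bit size field
    // that is patched once the payload has been written.
    void write_header(FILE *out, uint32_t payload_size)
    {
        w16(out, 1);                          // version, MSB first
        w16(out, 0);                          // flags
        int size_pos = w32_placeholder(out);  // reserve 4 bytes, remember offset
        // ... write payload ...
        walign(out, 16);                      // zero-pad to a 16-byte boundary
        w32_at(out, size_pos, payload_size);  // seek back and patch the size
    }

Each helper writes the most significant byte first, so the output no
longer depends on the endianness of the host machine.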
---
 tools/common/binout.c | 50 +++++++++++++++++++++++++++++++++++++++++++
 tools/common/binout.h | 25 ++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 tools/common/binout.c
 create mode 100644 tools/common/binout.h

diff --git a/tools/common/binout.c b/tools/common/binout.c
new file mode 100644
index 0000000000..f1c83de847
--- /dev/null
+++ b/tools/common/binout.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+
+void w8(FILE *f, uint8_t v)
+{
+    fputc(v, f);
+}
+
+void w16(FILE *f, uint16_t v)
+{
+    w8(f, v >> 8);
+    w8(f, v & 0xff);
+}
+
+void w32(FILE *f, uint32_t v)
+{
+    w16(f, v >> 16);
+    w16(f, v & 0xffff);
+}
+
+int w32_placeholder(FILE *f)
+{
+    int pos = ftell(f);
+    w32(f, 0);
+    return pos;
+}
+
+void w32_at(FILE *f, int pos, uint32_t v)
+{
+    int cur = ftell(f);
+    assert(cur >= 0); // fail on pipes
+    fseek(f, pos, SEEK_SET);
+    w32(f, v);
+    fseek(f, cur, SEEK_SET);
+}
+
+void walign(FILE *f, int align)
+{
+    int pos = ftell(f);
+    assert(pos >= 0); // fail on pipes
+    while (pos++ % align) w8(f, 0);
+}
+
+void wpad(FILE *f, int size)
+{
+    while (size--) {
+        w8(f, 0);
+    }
+}
diff --git a/tools/common/binout.h b/tools/common/binout.h
new file mode 100644
index 0000000000..91005f577b
--- /dev/null
+++ b/tools/common/binout.h
@@ -0,0 +1,25 @@
+#ifndef COMMON_BINOUT_H
+#define COMMON_BINOUT_H
+
+/**
+ * @file binout.h
+ * @brief Helper to write binary big-endian data to a file
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#define BITCAST_F2I(f) ({ uint32_t __i; memcpy(&__i, &(f), 4); __i; })
+
+void w8(FILE *f, uint8_t v);
+void w16(FILE *f, uint16_t v);
+void w32(FILE *f, uint32_t v);
+#define wf32(f, v) w32(f, BITCAST_F2I(v))
+
+int w32_placeholder(FILE *f);
+void w32_at(FILE *f, int pos, uint32_t v);
+void walign(FILE *f, int align);
+void wpad(FILE *f, int size);
+
+#endif

From 5ef5c6554ec9088c135f4ab96e470fb04368130a Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Sun, 24 Sep 2023 23:43:11 +0200
Subject: [PATCH 11/27] tools: add asset compression library

This library includes common code for asset compression. It relies on
the LZH5 compression library (previously moved from the audio library),
plus a vendored copy of the LZ4 compressor.
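For context, a minimal sketch of how a tool is expected to drive the
library (the command-line wrapper is hypothetical; asset_compress() and
DEFAULT_COMPRESSION come from the new assetcomp.h below; level 0 stores
the data uncompressed, 1 selects LZ4HC, 2 selects LZH5):

    #include <stdio.h>
    #include <stdbool.h>
    #include "assetcomp.h"

    int main(int argc, char *argv[])
    {
        if (argc != 3) {
            fprintf(stderr, "usage: %s <input> <output>\n", argv[0]);
            return 1;
        }
        // Compress with the default algorithm (currently LZ4HC).
        if (!asset_compress(argv[1], argv[2], DEFAULT_COMPRESSION))
            return 1;
        return 0;
    }

The output file starts with a small "DCA2" container header that records
the algorithm plus the compressed and decompressed sizes, so that the
runtime loader can pick the matching decompressor.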
---
 tools/common/assetcomp.c |  103 ++
 tools/common/assetcomp.h |    8 +
 tools/common/lz4.c       | 2751 ++++++++++++++++++++++++++++++++++++++
 tools/common/lz4.h       |  862 ++++++++++++
 tools/common/lz4hc.c     | 1637 +++++++++++++++++++++++
 tools/common/lz4hc.h     |  413 ++++++
 6 files changed, 5774 insertions(+)
 create mode 100644 tools/common/assetcomp.c
 create mode 100644 tools/common/assetcomp.h
 create mode 100644 tools/common/lz4.c
 create mode 100644 tools/common/lz4.h
 create mode 100644 tools/common/lz4hc.c
 create mode 100644 tools/common/lz4hc.h

diff --git a/tools/common/assetcomp.c b/tools/common/assetcomp.c
new file mode 100644
index 0000000000..59540f720d
--- /dev/null
+++ b/tools/common/assetcomp.c
@@ -0,0 +1,103 @@
+#define _GNU_SOURCE
+#include
+#include
+#include
+
+#include "../common/binout.h"
+#include "../common/lzh5_compress.c"
+#undef MIN
+#undef MAX
+#include "../../src/asset.c"
+#include "../../src/compress/lzh5.c"
+#include "../../src/compress/lz4_dec.c"
+#include "../../src/compress/ringbuf.c"
+#undef MIN
+#undef MAX
+#undef LZ4_DECOMPRESS_INPLACE_MARGIN
+
+#ifndef LZ4_SRC_INCLUDED
+#define LZ4_DISTANCE_MAX 16384
+#include "../common/lz4.c"
+#endif
+#include "../common/lz4hc.c"
+#undef MIN
+#undef MAX
+
+
+bool asset_compress(const char *infn, const char *outfn, int compression)
+{
+    // Make sure the file exists before calling asset_load,
+    // which would just assert.
+    FILE *in = fopen(infn, "rb");
+    if (!in) {
+        fprintf(stderr, "error opening input file: %s\n", infn);
+        return false;
+    }
+    fclose(in);
+
+    int sz;
+    uint8_t *data = asset_load(infn, &sz);
+
+    switch (compression) {
+    case 0: { // none
+        FILE *out = fopen(outfn, "wb");
+        if (!out) {
+            fprintf(stderr, "error opening output file: %s\n", outfn);
+            return false;
+        }
+        fwrite(data, 1, sz, out);
+        fclose(out);
+    } break;
+    case 2: { // lzh5
+        char *tmpfn = NULL;
+        asprintf(&tmpfn, "%s.tmp", outfn);
+        FILE *out = fopen(tmpfn, "wb");
+        if (!out) {
+            fprintf(stderr, "error opening output file: %s\n", tmpfn);
+            return false;
+        }
+        fwrite(data, 1, sz, out);
+        fclose(out);
+
+        in = fopen(tmpfn, "rb");
+        out = fopen(outfn, "wb");
+        fwrite("DCA2", 1, 4, out);
+        w16(out, 2); // algo
+        w16(out, 0); // flags
+        int w_cmp_size = w32_placeholder(out); // cmp_size
+        int w_dec_size = w32_placeholder(out); // dec_size
+
+        unsigned int crc, dsize, csize;
+        lzh5_init(LZHUFF5_METHOD_NUM);
+        lzh5_encode(in, out, &crc, &csize, &dsize);
+
+        w32_at(out, w_cmp_size, csize);
+        w32_at(out, w_dec_size, dsize);
+
+        fclose(in);
+        fclose(out);
+        remove(tmpfn);
+        free(tmpfn);
+    } break;
+    case 1: { // lz4hc
+        int cmp_max_size = LZ4_COMPRESSBOUND(sz);
+        void *output = malloc(cmp_max_size);
+        int cmp_size = LZ4_compress_HC((char*)data, output, sz, cmp_max_size, LZ4HC_CLEVEL_MAX);
+        assert(cmp_size <= cmp_max_size);
+
+        FILE *out = fopen(outfn, "wb");
+        fwrite("DCA2", 1, 4, out);
+        w16(out, 1); // algo
+        w16(out, 0); // flags
+        w32(out, cmp_size); // cmp_size
+        w32(out, sz); // dec_size
+        fwrite(output, 1, cmp_size, out);
+        fclose(out);
+        free(output);
+    } break;
+    default:
+        assert(0);
+    }
+
+    return true;
+}
diff --git a/tools/common/assetcomp.h b/tools/common/assetcomp.h
new file mode 100644
index 0000000000..7d1960ff62
--- /dev/null
+++ b/tools/common/assetcomp.h
@@ -0,0 +1,8 @@
+#ifndef COMMON_ASSETCOMP_H
+#define COMMON_ASSETCOMP_H
+
+#define DEFAULT_COMPRESSION 1
+
+bool asset_compress(const char *infn, const char *outfn, int compression);
+
+#endif
diff --git a/tools/common/lz4.c b/tools/common/lz4.c
new file mode 100644
index 0000000000..0982f9529c
--- 
/dev/null +++ b/tools/common/lz4.c @@ -0,0 +1,2751 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2020, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how stateless compression functions like `LZ4_compress_default()` + * allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#define LZ4_STATIC_LINKING_ONLY /* LZ4_DISTANCE_MAX */ +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +# include /* only present in VS2005+ */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) +#else +# define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +# define LZ4_ALIGN_TEST 1 +#endif + + +/*-************************************ +* Memory routines +**************************************/ + +/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level LZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable LZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : LZ4_createStream, LZ4_freeStream, + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated) + * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, + * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) + * - lz4frame, lz4file : All LZ4F_* functions + */ +#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +# define ALLOC(s) lz4_error_memory_allocation_is_disabled +# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +# define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(LZ4_USER_MEMORY_FUNCTIONS) +/* memory management functions can be customized by user project. + * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +# define ALLOC(s) LZ4_malloc(s) +# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) +# define FREEMEM(p) LZ4_free(p) +#else +# include /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,s) +# define FREEMEM(p) free(p) +#endif + +#if ! 
LZ4_FREESTANDING +# include /* memset, memcpy */ +#endif +#if !defined(LZ4_memset) +# define LZ4_memset(p,v,s) memset((p),(v),(s)) +#endif +#define MEM_INIT(p,v,s) LZ4_memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<=1) +# include +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include + static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment -1)) == 0; +} + + +/*-************************************ +* Types +**************************************/ +#include +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. 
+ */ +#if !defined(LZ4_memcpy) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +# else +# define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +# endif +#endif + +#if !defined(LZ4_memmove) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4_memmove __builtin_memmove +# else +# define LZ4_memmove memmove +# endif +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef struct { U16 u16; } __attribute__((packed)) LZ4_unalign16; +typedef struct { U32 u32; } __attribute__((packed)) LZ4_unalign32; +typedef struct { reg_t uArch; } __attribute__((packed)) LZ4_unalignST; + +static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign16*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign32*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalignST*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign16*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign32*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d= 16. 
*/ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d= dstPtr + MINMATCH + * - there is at least 8 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1936) /* MSVC 2022 ver 17.6 or earlier */ +# pragma warning(push) +# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1936) /* MSVC 2022 ver 17.6 or earlier */ +# pragma warning(pop) +#endif + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* +* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. +* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics +* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. +****************************************************************************************************/ +# if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. 
*/ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +# else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +# endif +# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +# else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +# else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +# else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +#endif +# endif + } else /* 32 bits */ { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +# else + val >>= 8; + val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | + (val + 0x00FF0000)) >> 24; + return (unsigned)val ^ 3; +# endif + } + } +} + + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } + + +/*-**************************************** +* Internal Definitions, used only in Tests +*******************************************/ +#if defined (__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize); +#if defined (__cplusplus) +} +#endif + +/*-****************************** +* Compression functions +********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +/* LZ4_putPosition*() : only used in byPtr mode */ +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType) +{ + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); (void)tableType; + hashTable[h] = p; +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); (void)tableType; + { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void +LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType + || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) + || ((tableType == byU32) && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. + */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time. 
+ * The following conditions are presumed already validated: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = + (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with indexes in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL : + (dictDirective == usingDictCtx) ? + dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) assert(inputSize= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSizehashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } } + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ + assert(matchIndex < current); + if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) + && (matchIndex+LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + filledIp = ip; + while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == 
limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + int len = (int)(litLength - RUN_MASK); + *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + /* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less than filledIp + * we have positions in the hash table beyond the current position. This is + * a problem if we reuse the hash table. So we have to remove these positions + * from the hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. 
*/ + assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + { U32 const h = LZ4_hashPosition(ip-2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip-2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip-2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1/*token*/; + lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */
+ }
+ }
+ DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun);
+ if (lastRun >= RUN_MASK) {
+ size_t accumulator = lastRun - RUN_MASK;
+ *op++ = RUN_MASK << ML_BITS;
+ for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+ *op++ = (BYTE) accumulator;
+ } else {
+ *op++ = (BYTE)(lastRun<<ML_BITS);
+ }
+ LZ4_memcpy(op, anchor, lastRun);
+ ip = anchor + lastRun;
+ op += lastRun;
+ }
+
+ if (outputDirective == fillOutput) {
+ *inputConsumed = (int) (((const char*)ip)-source);
+ }
+ result = (int)(((char*)op) - dest);
+ assert(result > 0);
+ DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result);
+ return result;
+}
+
+/** LZ4_compress_generic() :
+ * inlined, to ensure branches are decided at compilation time;
+ * takes care of src == (NULL, 0)
+ * and forwards the rest to LZ4_compress_generic_validated */
+LZ4_FORCE_INLINE int LZ4_compress_generic(
+ LZ4_stream_t_internal* const cctx,
+ const char* const src,
+ char* const dst,
+ const int srcSize,
+ int *inputConsumed, /* only written when outputDirective == fillOutput */
+ const int dstCapacity,
+ const limitedOutput_directive outputDirective,
+ const tableType_t tableType,
+ const dict_directive dictDirective,
+ const dictIssue_directive dictIssue,
+ const int acceleration)
+{
+ DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i",
+ srcSize, dstCapacity);
+
+ if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */
+ if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */
+ if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */
+ DEBUGLOG(5, "Generating an empty block");
+ assert(outputDirective == notLimited || dstCapacity >= 1);
+ assert(dst != NULL);
+ dst[0] = 0;
+ if (outputDirective == fillOutput) {
+ assert (inputConsumed != NULL);
+ *inputConsumed = 0;
+ }
+ return 1;
+ }
+ assert(src != NULL);
+
+ return LZ4_compress_generic_validated(cctx, src, dst, srcSize,
+ inputConsumed, /* only written into if outputDirective == fillOutput */
+ dstCapacity, outputDirective,
+ tableType, dictDirective, dictIssue, acceleration);
+}
+
+
+int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+ LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse;
+ assert(ctx != NULL);
+ if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT;
+ if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX;
+ if (maxOutputSize >= LZ4_compressBound(inputSize)) {
+ if (inputSize < LZ4_64Klimit) {
+ return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration);
+ } else {
+ const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32;
+ return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration);
+ }
+ } else {
+ if (inputSize < LZ4_64Klimit) {
+ return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration);
+ } else {
+ const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32;
+ return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration);
+ }
+ }
+}
+
+/**
+ * LZ4_compress_fast_extState_fastReset() :
+ * A variant of LZ4_compress_fast_extState().
+ *
+ * Using this variant avoids an expensive initialization step. 
It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* const ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. 
*/ +static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); + } } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} +#endif + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { return NULL; } + if (size < sizeof(LZ4_stream_t)) { return NULL; } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} +#endif + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not 
just continue it with prepareTable()
+ * to avoid any risk of generating overflowing matchIndex
+ * when compressing using this dictionary */
+ LZ4_resetStream(LZ4_dict);
+
+ /* We always increment the offset by 64 KB, since, if the dict is longer,
+ * we truncate it to the last 64k, and if it's shorter, we still want to
+ * advance by a whole window length so we can provide the guarantee that
+ * there are only valid offsets in the window, which allows an optimization
+ * in LZ4_compress_fast_continue() where it uses noDictIssue even when the
+ * dictionary isn't a full 64k. */
+ dict->currentOffset += 64 KB;
+
+ if (dictSize < (int)HASH_UNIT) {
+ return 0;
+ }
+
+ if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
+ dict->dictionary = p;
+ dict->dictSize = (U32)(dictEnd - p);
+ dict->tableType = (U32)tableType;
+ idx32 = dict->currentOffset - dict->dictSize;
+
+ while (p <= dictEnd-HASH_UNIT) {
+ U32 const h = LZ4_hashPosition(p, tableType);
+ LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType);
+ p+=3; idx32+=3;
+ }
+
+ return (int)dict->dictSize;
+}
+
+void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream)
+{
+ const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL :
+ &(dictionaryStream->internal_donotuse);
+
+ DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)",
+ workingStream, dictionaryStream,
+ dictCtx != NULL ? dictCtx->dictSize : 0);
+
+ if (dictCtx != NULL) {
+ /* If the current offset is zero, we will never look in the
+ * external dictionary context, since there is no value a table
+ * entry can take that indicate a miss. In that case, we need
+ * to bump the offset to something non-zero.
+ */
+ if (workingStream->internal_donotuse.currentOffset == 0) {
+ workingStream->internal_donotuse.currentOffset = 64 KB;
+ }
+
+ /* Don't actually attach an empty dictionary.
+ */
+ if (dictCtx->dictSize == 0) {
+ dictCtx = NULL;
+ }
+ }
+ workingStream->internal_donotuse.dictCtx = dictCtx;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize)
+{
+ assert(nextSize >= 0);
+ if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */
+ /* rescale hash table */
+ U32 const delta = LZ4_dict->currentOffset - 64 KB;
+ const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+ int i;
+ DEBUGLOG(4, "LZ4_renormDictT");
+ for (i=0; i<LZ4_HASH_SIZE_U32; i++) {
+ if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+ else LZ4_dict->hashTable[i] -= delta;
+ }
+ LZ4_dict->currentOffset = 64 KB;
+ if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+ LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+ }
+}
+
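+/* Usage sketch (illustrative only, not part of upstream lz4): typical
+ * streaming compression with a double buffer, so the previous block stays
+ * in memory and serves as the dictionary for the next one. BLOCK_SIZE and
+ * the read_next_block()/write_block() helpers are hypothetical.
+ *
+ * LZ4_stream_t ls;
+ * LZ4_initStream(&ls, sizeof(ls));
+ * char inBuf[2][BLOCK_SIZE];
+ * int idx = 0;
+ * for (;;) {
+ * int const n = read_next_block(inBuf[idx], BLOCK_SIZE);
+ * if (n == 0) break;
+ * char dst[LZ4_COMPRESSBOUND(BLOCK_SIZE)];
+ * int const csize = LZ4_compress_fast_continue(&ls, inBuf[idx], dst, n, (int)sizeof(dst), 1);
+ * write_block(dst, csize);
+ * idx ^= 1; // previous block must remain valid at the same address
+ * }
+ *
+ * Decompression replays the blocks with LZ4_decompress_safe_continue()
+ * under the same buffer discipline.
+ */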
+
+int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream,
+ const char* source, char* dest,
+ int inputSize, int maxOutputSize,
+ int acceleration)
+{
+ const tableType_t tableType = byU32;
+ LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse;
+ const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL;
+
+ DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize);
+
+ LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */
+ if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT;
+ if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX;
+
+ /* invalidate tiny dictionaries */
+ if ( (streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */
+ && (dictEnd != source) /* prefix mode */
+ && (inputSize > 0) /* tolerance : don't lose history, in case next invocation would use prefix mode */
+ && (streamPtr->dictCtx == NULL) /* usingDictCtx */
+ ) {
+ DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary);
+ /* remove dictionary existence from history, to employ faster prefix mode */
+ streamPtr->dictSize = 0;
+ streamPtr->dictionary = (const BYTE*)source;
+ dictEnd = source;
+ }
+
+ /* Check overlapping input/dictionary space */
+ { const char* const sourceEnd = source + inputSize;
+ if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) {
+ streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+ if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+ if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+ streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize;
+ }
+ }
+
+ /* prefix mode : source data follows dictionary */
+ if (dictEnd == source) {
+ if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+ return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration);
+ else
+ return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration);
+ }
+
+ /* external dictionary mode */
+ { int result;
+ if (streamPtr->dictCtx) {
+ /* We depend here on the fact that dictCtx'es (produced by
+ * LZ4_loadDict) guarantee that their tables contain no references
+ * to offsets between dictCtx->currentOffset - 64 KB and
+ * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe
+ * to use noDictIssue even when the dict isn't a full 64 KB.
+ */
+ if (inputSize > 4 KB) {
+ /* For compressing large blobs, it is faster to pay the setup
+ * cost to copy the dictionary's tables into the active context,
+ * so that the compression loop is only looking into one table. 
+ */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable, + * one can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, safeBuffer); + + if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? 
(a) : (b) )
+
+
+/* variant for decompress_unsafe()
+ * does not know end of input
+ * presumes input is well formed
+ * note : will consume at least one byte */
+static size_t read_long_length_no_check(const BYTE** pp)
+{
+ size_t b, l = 0;
+ do { b = **pp; (*pp)++; l += b; } while (b==255);
+ DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1)
+ return l;
+}
+
+/* core decoder variant for LZ4_decompress_fast*()
+ * for legacy support only : these entry points are deprecated.
+ * - Presumes input is correctly formed (no defense vs malformed inputs)
+ * - Does not know input size (presume input buffer is "large enough")
+ * - Decompress a full block (only)
+ * @return : nb of bytes read from input.
+ * Note : this variant is not optimized for speed, just for maintenance.
+ * the goal is to remove support of decompress_fast*() variants by v2.0
+**/
+LZ4_FORCE_INLINE int
+LZ4_decompress_unsafe_generic(
+ const BYTE* const istart,
+ BYTE* const ostart,
+ int decompressedSize,
+
+ size_t prefixSize,
+ const BYTE* const dictStart, /* only if dict==usingExtDict */
+ const size_t dictSize /* note: =0 if dictStart==NULL */
+ )
+{
+ const BYTE* ip = istart;
+ BYTE* op = (BYTE*)ostart;
+ BYTE* const oend = ostart + decompressedSize;
+ const BYTE* const prefixStart = ostart - prefixSize;
+
+ DEBUGLOG(5, "LZ4_decompress_unsafe_generic");
+ if (dictStart == NULL) assert(dictSize == 0);
+
+ while (1) {
+ /* start new sequence */
+ unsigned token = *ip++;
+
+ /* literals */
+ { size_t ll = token >> ML_BITS;
+ if (ll==15) {
+ /* long literal length */
+ ll += read_long_length_no_check(&ip);
+ }
+ if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */
+ LZ4_memmove(op, ip, ll); /* support in-place decompression */
+ op += ll;
+ ip += ll;
+ if ((size_t)(oend-op) < MFLIMIT) {
+ if (op==oend) break; /* end of block */
+ DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op);
+ /* incorrect end of block :
+ * last match must start at least MFLIMIT==12 bytes before end of output block */
+ return -1;
+ } }
+
+ /* match */
+ { size_t ml = token & 15;
+ size_t const offset = LZ4_readLE16(ip);
+ ip+=2;
+
+ if (ml==15) {
+ /* long literal length */
+ ml += read_long_length_no_check(&ip);
+ }
+ ml += MINMATCH;
+
+ if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */
+
+ { const BYTE* match = op - offset;
+
+ /* out of range */
+ if (offset > (size_t)(op - prefixStart) + dictSize) {
+ DEBUGLOG(6, "offset out of range");
+ return -1;
+ }
+
+ /* check special case : extDict */
+ if (offset > (size_t)(op - prefixStart)) {
+ /* extDict scenario */
+ const BYTE* const dictEnd = dictStart + dictSize;
+ const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart));
+ size_t const extml = (size_t)(dictEnd - extMatch);
+ if (extml > ml) {
+ /* match entirely within extDict */
+ LZ4_memmove(op, extMatch, ml);
+ op += ml;
+ ml = 0;
+ } else {
+ /* match split between extDict & prefix */
+ LZ4_memmove(op, extMatch, extml);
+ op += extml;
+ ml -= extml;
+ }
+ match = prefixStart;
+ }
+
+ /* match copy - slow variant, supporting overlap copy */
+ { size_t u;
+ for (u=0; u<ml; u++) {
+ op[u] = match[u];
+ } } }
+ op += ml;
+ if ((size_t)(oend-op) < MFLIMIT) {
+ if (op==oend) break; /* end of block */
+ DEBUGLOG(5, "invalid: match ends at distance %zi from end of block", oend-op);
+ /* incorrect end of block :
+ * last match must stop at least MFLIMIT==12 bytes before end of output block */
+ return -1;
+ } }
+ }
+
+ DEBUGLOG(5, "LZ4_decompress_unsafe_generic: decoded %i bytes", decompressedSize);
+ return (int)(ip - istart);
+}
+
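+/* Worked example of the byte-sum length encoding used by the length
+ * readers here: a literal run of 300 bytes stores 15 in the token's high
+ * nibble, followed by extension bytes 255 and 30, since 15 + 255 + 30 = 300.
+ * Decoding keeps accumulating bytes until one is != 255. Match lengths use
+ * the same scheme, offset by MINMATCH. */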
+**/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t +read_variable_length(const BYTE** ip, const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length)<8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; + } + } while (s==255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize==0)) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if (unlikely(srcSize==0)) { return -1; } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable branch. + * note : fast loop may show a regression for some client arm chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, cpy); + ip += length; op = cpy; + } else { + cpy = op+length; + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + DEBUGLOG(6, " offset = %zu", offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { + DEBUGLOG(6, "Error : offset outside buffers"); + goto _output_error; + } + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + if ( checkOffset && (unlikely(match + dictSize < lowPrefix)) ) { + DEBUGLOG(6, "Error : pos=%zi, offset=%zi => outside buffers", op-lowPrefix, op-match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + 
LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((ip < shortiend) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. */ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). 
+ * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((ip+length != iend) || (cpy > oend)) { + DEBUGLOG(6, "should have been last run of literals") + DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); + goto _output_error; + } + } + LZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if (length == ML_MASK) { + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, 
restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + + /* Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. 
*/ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} +#endif + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). 
+ * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*) dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 int +LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); +} + 
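+/* Usage sketch (illustrative only, not part of upstream lz4): decoding one
+ * block that was compressed against an external dictionary. Names below are
+ * hypothetical.
+ *
+ * int decode_with_dict(const char* src, int srcSize,
+ * char* dst, int dstCapacity,
+ * const char* dict, int dictSize)
+ * {
+ * // Returns the decompressed size, or a negative value on malformed input.
+ * return LZ4_decompress_safe_usingDict(src, dst, srcSize, dstCapacity,
+ * dict, dictSize);
+ * }
+ *
+ * When dict+dictSize == dst, the faster prefix paths above are selected
+ * automatically; otherwise the external-dictionary decoder is used.
+ */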
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + (size_t)dictSize, NULL, 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} +#endif + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/tools/common/lz4.h b/tools/common/lz4.h new file mode 100644 index 0000000000..f85b0389a0 --- /dev/null +++ b/tools/common/lz4.h @@ -0,0 +1,862 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2020, Yann Collet. 
+
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://www.lz4.org
+ - LZ4 source repository : https://github.com/lz4/lz4
+*/
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef LZ4_H_2983827168210
+#define LZ4_H_2983827168210
+
+/* --- Dependency --- */
+#include <stddef.h> /* size_t */
+
+
+/**
+ Introduction
+
+ LZ4 is a lossless compression algorithm, providing compression speed >500 MB/s per core,
+ scalable with multi-cores CPU. It features an extremely fast decoder, with speed in
+ multiple GB/s per core, typically reaching RAM speed limits on multi-core systems.
+
+ The LZ4 compression library provides in-memory compression and decompression functions.
+ It gives full buffer control to the user.
+ Compression can be done in:
+ - a single step (described as Simple Functions)
+ - a single step, reusing a context (described in Advanced Functions)
+ - unbounded multiple steps (described as Streaming compression)
+
+ lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md).
+ Decompressing such a compressed block requires additional metadata.
+ Exact metadata depends on exact decompression function.
+ For the typical case of LZ4_decompress_safe(),
+ metadata includes block's compressed size, and maximum bound of decompressed size.
+ Each application is free to encode and pass such metadata in whichever way it wants.
+
+ lz4.h only handles blocks; it cannot generate Frames.
+
+ Blocks are different from Frames (doc/lz4_Frame_format.md).
+ Frames bundle both blocks and metadata in a specified manner.
+ Embedding metadata is required for compressed data to be self-contained and portable.
+ Frame format is delivered through a companion API, declared in lz4frame.h.
+ The `lz4` CLI can only manage frames.
+*/
+
+/*^***************************************************************
+* Export parameters
+*****************************************************************/
+/*
+* LZ4_DLL_EXPORT :
+* Enable exporting of functions when building a Windows DLL
+* LZ4LIB_VISIBILITY :
+* Control library symbols visibility. 
+*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*! LZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. + * + * - LZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * LZ4_memcpy, LZ4_memmove, LZ4_memset. + * - It only enables LZ4/HC functions which don't use heap. + * All LZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) +# define LZ4_HEAPMODE 0 +# define LZ4HC_HEAPMODE 0 +# define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +# if !defined(LZ4_memcpy) +# error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +# endif +# if !defined(LZ4_memset) +# error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +# endif +# if !defined(LZ4_memmove) +# error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +# endif +#elif ! defined(LZ4_FREESTANDING) +# define LZ4_FREESTANDING 0 +#endif + + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 4 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ + + +/*-************************************ +* Tuning parameter +**************************************/ +#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_DEFAULT 14 +#define LZ4_MEMORY_USAGE_MAX 20 + +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; ) + * Increasing memory usage improves compression ratio, at the cost of speed. + * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#endif + +#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) +# error "LZ4_MEMORY_USAGE is too small !" +#endif + +#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) +# error "LZ4_MEMORY_USAGE is too large !" +#endif + +/*-************************************ +* Simple Functions +**************************************/ +/*! 
LZ4_compress_default() :
+ *  Compresses 'srcSize' bytes from buffer 'src'
+ *  into already allocated 'dst' buffer of size 'dstCapacity'.
+ *  Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize).
+ *  It also runs faster, so it's a recommended setting.
+ *  If the function cannot compress 'src' into a more limited 'dst' budget,
+ *  compression stops *immediately*, and the function result is zero.
+ *  In which case, 'dst' content is undefined (invalid).
+ *      srcSize : max supported value is LZ4_MAX_INPUT_SIZE.
+ *      dstCapacity : size of buffer 'dst' (which must be already allocated)
+ * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity)
+ *           or 0 if compression fails
+ * Note : This function is protected against buffer overflow scenarios (it never writes outside the 'dst' buffer, nor reads outside the 'source' buffer).
+ */
+LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity);
+
+/*! LZ4_decompress_safe() :
+ * @compressedSize : is the exact complete size of the compressed block.
+ * @dstCapacity : is the size of destination buffer (which must be already allocated),
+ *                and is an upper bound of decompressed size.
+ * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
+ *           If the destination buffer is not large enough, decoding will stop and output an error code (negative value).
+ *           If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ * Note 1 : This function is protected against malicious data packets :
+ *          it will never write outside the 'dst' buffer, nor read outside the 'source' buffer,
+ *          even if the compressed block is maliciously modified to order the decoder to do these actions.
+ *          In such a case, the decoder stops immediately, and considers the compressed block malformed.
+ * Note 2 : compressedSize and dstCapacity must be provided to the function; the compressed block does not contain them.
+ *          The implementation is free to send / store / derive this information in whichever way is most beneficial.
+ *          If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead.
+ */
+LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity);
+
+
+/*-************************************
+*  Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*! LZ4_compressBound() :
+    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible).
+    This function is primarily useful for memory allocation purposes (destination buffer size).
+    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+    Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize)
+        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+        return : maximum output size in a "worst case" scenario
+                 or 0, if input size is incorrect (too large or negative)
+*/
+LZ4LIB_API int LZ4_compressBound(int inputSize);
+
+/*! LZ4_compress_fast() :
+    Same as LZ4_compress_default(), but allows selection of an "acceleration" factor.
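+
+    For illustration, a complete single-step round trip with the simple
+    functions above might look like the following sketch (buffer names are
+    illustrative, error handling is elided, and <stdlib.h>/<string.h> are
+    assumed to be included):
+
+        const char* text    = "hello hello hello";
+        int   srcSize       = (int)strlen(text) + 1;
+        int   bound         = LZ4_compressBound(srcSize);
+        char* comp          = (char*)malloc((size_t)bound);
+        char* back          = (char*)malloc((size_t)srcSize);
+        int   cSize         = LZ4_compress_default(text, comp, srcSize, bound);
+        int   dSize         = LZ4_decompress_safe(comp, back, cSize, srcSize);
+        // expect cSize > 0 and dSize == srcSize; both must be checked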
+
+    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+    It's a trade-off. It can be fine-tuned, with each successive value providing roughly +~3% to speed.
+    An acceleration value of "1" is the same as regular LZ4_compress_default().
+    Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c).
+    Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c).
+*/
+LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+
+/*! LZ4_compress_fast_extState() :
+ *  Same as LZ4_compress_fast(), using an externally allocated memory space for its state.
+ *  Use LZ4_sizeofState() to know how much memory must be allocated,
+ *  and allocate it on 8-byte boundaries (using `malloc()` typically).
+ *  Then, provide this buffer as `void* state` to the compression function.
+ */
+LZ4LIB_API int LZ4_sizeofState(void);
+LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+
+/*! LZ4_compress_destSize() :
+ *  Reverses the logic : compresses as much data as possible from 'src' buffer
+ *  into an already allocated buffer 'dst', of size >= 'targetDstSize'.
+ *  This function either compresses the entire 'src' content into 'dst' if it's large enough,
+ *  or fills the 'dst' buffer completely with as much data as possible from 'src'.
+ *  note: the acceleration parameter is fixed to "default".
+ *
+ * *srcSizePtr : will be modified to indicate how many bytes were read from 'src' to fill 'dst'.
+ *               New value is necessarily <= input value.
+ * @return : Nb bytes written into 'dst' (necessarily <= targetDstSize)
+ *           or 0 if compression fails.
+ *
+ * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+):
+ *        the produced compressed content could, in specific circumstances,
+ *        need to be decompressed into a destination buffer larger
+ *        by at least 1 byte than the content to decompress.
+ *        If an application uses `LZ4_compress_destSize()`,
+ *        it's highly recommended to update liblz4 to v1.9.2 or better.
+ *        If this can't be done or ensured,
+ *        the receiving decompression function should provide
+ *        a dstCapacity which is > decompressedSize, by at least 1 byte.
+ *        See https://github.com/lz4/lz4/issues/859 for details
+ */
+LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize);
+
+
+/*! LZ4_decompress_safe_partial() :
+ *  Decompress an LZ4 compressed block, of size 'srcSize' at position 'src',
+ *  into destination buffer 'dst' of size 'dstCapacity'.
+ *  Up to 'targetOutputSize' bytes will be decoded.
+ *  The function stops decoding on reaching this objective.
+ *  This can be useful to boost performance
+ *  whenever only the beginning of a block is required.
+ *
+ * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize)
+ *           If the source stream is detected malformed, the function returns a negative result.
+ *
+ *   Note 1 : @return can be < targetOutputSize, if the compressed block contains less data.
+ *
+ *   Note 2 : targetOutputSize must be <= dstCapacity
+ *
+ *   Note 3 : this function effectively stops decoding on reaching targetOutputSize,
+ *            so dstCapacity is kind of redundant.
+ *            This is because in older versions of this function,
+ *            decoding operation would still write complete sequences.
+ * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * Thankfully, this is no longer necessary. + * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/** + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. 
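+ *
+ *  For illustration, that pairing might look like the following sketch
+ *  (buffer names are illustrative, error handling elided):
+ *      LZ4_stream_t* s = LZ4_createStream();
+ *      LZ4_loadDict(s, dict, dictSize);
+ *      int cSize = LZ4_compress_fast_continue(s, src, dst, srcSize,
+ *                                             LZ4_compressBound(srcSize), 1);
+ *      LZ4_freeStream(s);
+ *  with the decoder later calling
+ *      LZ4_decompress_safe_usingDict(dst, out, cSize, outCap, dict, dictSize);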
+ *  Dictionaries are useful for better compression of small data (KB range).
+ *  While LZ4 accepts any input as a dictionary,
+ *  results are generally better when using Zstandard's Dictionary Builder.
+ *  Loading a size of 0 is allowed, and is the same as reset.
+ * @return : loaded dictionary size, in bytes (necessarily <= 64 KB)
+ */
+LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*! LZ4_compress_fast_continue() :
+ *  Compress 'src' content using data from previously compressed blocks, for better compression ratio.
+ *  'dst' buffer must be already allocated.
+ *  If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+ *
+ * @return : size of compressed block
+ *           or 0 if there is an error (typically, cannot fit into 'dst').
+ *
+ *  Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block.
+ *           Each block has precise boundaries.
+ *           Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata.
+ *           It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together.
+ *
+ *  Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory !
+ *
+ *  Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB.
+ *           Make sure that buffers are separated by at least one byte.
+ *           This construction ensures that each block only depends on the previous block.
+ *
+ *  Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB.
+ *
+ *  Note 5 : After an error, the stream status is undefined (invalid); it can only be reset or freed.
+ */
+LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+/*! LZ4_saveDict() :
+ *  If the last 64KB of data cannot be guaranteed to remain available at its current memory location,
+ *  save it into a safer place (char* safeBuffer).
+ *  This is schematically equivalent to a memcpy() followed by LZ4_loadDict(),
+ *  but is much faster, because LZ4_saveDict() doesn't need to rebuild tables.
+ * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error.
+ */
+LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize);
+
+
+/*-**********************************************
+*  Streaming Decompression Functions
+*  Bufferless synchronous API
+************************************************/
+typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* tracking context */
+
+/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
+ *  creation / destruction of streaming decompression tracking context.
+ *  A tracking context can be re-used multiple times.
+ */
+#if !defined(RC_INVOKED)   /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */
+#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+LZ4LIB_API int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */
+#endif
+
+/*! LZ4_setStreamDecode() :
+ *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
+ *  Use this function to start decompression of a new stream of blocks.
+ *  A dictionary can optionally be set. Use NULL or size 0 for a reset order.
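+ *
+ *  For illustration, the compression-side streaming loop documented above
+ *  could be organised as a double-buffer sketch (readChunk()/writeBlock()
+ *  are assumed caller-provided helpers, error handling elided):
+ *      LZ4_stream_t* s = LZ4_createStream();
+ *      static char inBuf[2][64 * 1024];
+ *      static char dst[LZ4_COMPRESSBOUND(64 * 1024)];
+ *      for (int idx = 0;; idx ^= 1) {
+ *          int n = readChunk(inBuf[idx], (int)sizeof inBuf[idx]);
+ *          if (n <= 0) break;
+ *          int cSize = LZ4_compress_fast_continue(s, inBuf[idx], dst,
+ *                                                 n, (int)sizeof dst, 1);
+ *          writeBlock(dst, cSize);   // each block is decoded separately later
+ *      }
+ *      LZ4_freeStream(s);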
+ * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_safe_continue() : + * This decoding function allows decompression of consecutive blocks in "streaming" mode. + * The difference with the usual independent blocks is that + * new blocks are allowed to find references into former blocks. + * A block is an unsplittable entity, and must be presented entirely to the decompression function. + * LZ4_decompress_safe_continue() only accepts one block at a time. + * It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * + * @LZ4_streamDecode : decompression state, tracking the position in memory of past data + * @compressedSize : exact complete size of one compressed block. + * @dstCapacity : size of destination buffer (which must be already allocated), + * must be an upper bound of decompressed size. + * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and unmodified + * at the memory position where they were previously decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). 
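+ *
+ *  As an illustration (a sketch; readBlock()/useData() are assumed
+ *  caller-provided helpers, error handling elided), a decode loop that keeps
+ *  the previous 64KB available by alternating two output buffers could be:
+ *      LZ4_streamDecode_t* d = LZ4_createStreamDecode();
+ *      LZ4_setStreamDecode(d, NULL, 0);
+ *      static char outBuf[2][64 * 1024];
+ *      static char src[LZ4_COMPRESSBOUND(64 * 1024)];
+ *      int cSize;
+ *      for (int idx = 0; (cSize = readBlock(src)) > 0; idx ^= 1) {
+ *          int n = LZ4_decompress_safe_continue(d, src, outBuf[idx],
+ *                                               cSize, (int)sizeof outBuf[idx]);
+ *          if (n < 0) break;             // malformed block
+ *          useData(outBuf[idx], n);      // previous 64KB stays in place
+ *      }
+ *      LZ4_freeStreamDecode(d);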
+ * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int +LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, char* dst, + int srcSize, int dstCapacity); + + +/*! LZ4_decompress_safe_usingDict() : + * Works the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue() + * However, it's stateless: it doesn't need any LZ4_streamDecode_t state. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_usingDict(const char* src, char* dst, + int srcSize, int dstCapacity, + const char* dictStart, int dictSize); + +/*! LZ4_decompress_safe_partial_usingDict() : + * Behaves the same as LZ4_decompress_safe_partial() + * with the added ability to specify a memory segment for past data. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int +LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, + int compressedSize, + int targetOutputSize, int maxOutputSize, + const char* dictStart, int dictSize); + +#endif /* LZ4_H_2983827168210 */ + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! 
LZ4_attach_dictionary() : + * This is an experimental API that allows + * efficient use of a static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDict() should + * be expected to work. + * + * Alternatively, the provided dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the first compression call on the stream. + */ +LZ4LIB_STATIC_API void +LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); + + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. + * Buffer size must feature some margin, hence be larger than final size. + * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. 
+ * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. + * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef int8_t LZ4_i8; + typedef uint8_t LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. 
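+ *
+ *  For illustration (a sketch; src/dst/srcSize/dstCap are illustrative and
+ *  error handling is elided), static allocation would look like:
+ *      LZ4_stream_t ctx;
+ *      LZ4_stream_t* s = LZ4_initStream(&ctx, sizeof(ctx));  // NULL on failure
+ *      if (s != NULL)
+ *          (void)LZ4_compress_fast_extState(s, src, dst, srcSize, dstCap, 1);
+ *  as described for LZ4_initStream() just below.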
+**/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE ((1UL << LZ4_MEMORY_USAGE) + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead +**/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. +**/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! 
Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/*! Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. 
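+ *
+ *  As a sketch of the substitution recommended just below (names are
+ *  illustrative; originalSize is known to the caller, exactly as it was
+ *  for LZ4_decompress_fast(); error handling elided):
+ *      int n = LZ4_decompress_safe_partial(src, dst,
+ *                      srcSize,        // compressed size: now required
+ *                      originalSize,   // decode target
+ *                      originalSize);  // dst capacity
+ *      // n == originalSize on success, negative on malformed input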
+ * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + + +#endif /* LZ4_H_98237428734687 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/tools/common/lz4hc.c b/tools/common/lz4hc.c new file mode 100644 index 0000000000..651f190a09 --- /dev/null +++ b/tools/common/lz4hc.c @@ -0,0 +1,1637 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Copyright (C) 2011-2020, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */ + + +/* ************************************* +* Tuning Parameter +***************************************/ + +/*! HEAPMODE : + * Select how stateless HC compression functions like `LZ4_compress_HC()` + * allocate memory for their workspace: + * in stack (0:fastest), or in heap (1:default, requires malloc()). + * Since workspace is rather large, heap mode is recommended. +**/ +#ifndef LZ4HC_HEAPMODE +# define LZ4HC_HEAPMODE 1 +#endif + + +/*=== Dependency ===*/ +#define LZ4_HC_STATIC_LINKING_ONLY +#include "lz4hc.h" + + +/*=== Common definitions ===*/ +#if defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif +#if defined (__clang__) +# pragma clang diagnostic ignored "-Wunused-function" +#endif + +#define LZ4_COMMONDEFS_ONLY +#ifndef LZ4_SRC_INCLUDED +#include "lz4.c" /* LZ4_count, constants, mem */ +#endif + + +/*=== Enums ===*/ +typedef enum { noDictCtx, usingDictCtxHc } dictCtx_directive; + + +/*=== Constants ===*/ +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_OPT_NUM (1<<12) + + +/*=== Macros ===*/ +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) +#define MAX(a,b) ( (a) > (b) ? 
(a) : (b) ) +#define HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-LZ4HC_HASH_LOG)) +#define DELTANEXTMAXD(p) chainTable[(p) & LZ4HC_MAXD_MASK] /* flexible, LZ4HC_MAXD dependent */ +#define DELTANEXTU16(table, pos) table[(U16)(pos)] /* faster */ +/* Make fields passed to, and updated by LZ4HC_encodeSequence explicit */ +#define UPDATABLE(ip, op, anchor) &ip, &op, &anchor + +#define LZ4HC_HASHSIZE 4 +static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); } + + +/************************************** +* HC Compression +**************************************/ +static void LZ4HC_clearTables (LZ4HC_CCtx_internal* hc4) +{ + MEM_INIT(hc4->hashTable, 0, sizeof(hc4->hashTable)); + MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); +} + +static void LZ4HC_init_internal (LZ4HC_CCtx_internal* hc4, const BYTE* start) +{ + size_t const bufferSize = (size_t)(hc4->end - hc4->prefixStart); + size_t newStartingOffset = bufferSize + hc4->dictLimit; + DEBUGLOG(5, "LZ4HC_init_internal"); + assert(newStartingOffset >= bufferSize); /* check overflow */ + if (newStartingOffset > 1 GB) { + LZ4HC_clearTables(hc4); + newStartingOffset = 0; + } + newStartingOffset += 64 KB; + hc4->nextToUpdate = (U32)newStartingOffset; + hc4->prefixStart = start; + hc4->end = start; + hc4->dictStart = start; + hc4->dictLimit = (U32)newStartingOffset; + hc4->lowLimit = (U32)newStartingOffset; +} + + +/* Update chains up to ip (excluded) */ +LZ4_FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip) +{ + U16* const chainTable = hc4->chainTable; + U32* const hashTable = hc4->hashTable; + const BYTE* const prefixPtr = hc4->prefixStart; + U32 const prefixIdx = hc4->dictLimit; + U32 const target = (U32)(ip - prefixPtr) + prefixIdx; + U32 idx = hc4->nextToUpdate; + assert(ip >= prefixPtr); + assert(target >= prefixIdx); + + while (idx < target) { + U32 const h = LZ4HC_hashPtr(prefixPtr+idx-prefixIdx); + size_t delta = idx - hashTable[h]; + if (delta>LZ4_DISTANCE_MAX) delta = LZ4_DISTANCE_MAX; + DELTANEXTU16(chainTable, idx) = (U16)delta; + hashTable[h] = idx; + idx++; + } + + hc4->nextToUpdate = target; +} + +/** LZ4HC_countBack() : + * @return : negative value, nb of common bytes before ip/match */ +LZ4_FORCE_INLINE +int LZ4HC_countBack(const BYTE* const ip, const BYTE* const match, + const BYTE* const iMin, const BYTE* const mMin) +{ + int back = 0; + int const min = (int)MAX(iMin - ip, mMin - match); + assert(min <= 0); + assert(ip >= iMin); assert((size_t)(ip-iMin) < (1U<<31)); + assert(match >= mMin); assert((size_t)(match - mMin) < (1U<<31)); + while ( (back > min) + && (ip[back-1] == match[back-1]) ) + back--; + return back; +} + +#if defined(_MSC_VER) +# define LZ4HC_rotl32(x,r) _rotl(x,r) +#else +# define LZ4HC_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + + +static U32 LZ4HC_rotatePattern(size_t const rotate, U32 const pattern) +{ + size_t const bitsToRotate = (rotate & (sizeof(pattern) - 1)) << 3; + if (bitsToRotate == 0) return pattern; + return LZ4HC_rotl32(pattern, (int)bitsToRotate); +} + +/* LZ4HC_countPattern() : + * pattern32 must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) */ +static unsigned +LZ4HC_countPattern(const BYTE* ip, const BYTE* const iEnd, U32 const pattern32) +{ + const BYTE* const iStart = ip; + reg_t const pattern = (sizeof(pattern)==8) ? 
+        (reg_t)pattern32 + (((reg_t)pattern32) << (sizeof(pattern)*4)) : pattern32;
+
+    while (likely(ip < iEnd-(sizeof(pattern)-1))) {
+        reg_t const diff = LZ4_read_ARCH(ip) ^ pattern;
+        if (!diff) { ip+=sizeof(pattern); continue; }
+        ip += LZ4_NbCommonBytes(diff);
+        return (unsigned)(ip - iStart);
+    }
+
+    if (LZ4_isLittleEndian()) {
+        reg_t patternByte = pattern;
+        while ((ip < iEnd) && (*ip == (BYTE)patternByte)) {
+            ip++; patternByte >>= 8;
+        }
+    } else {  /* big endian */
+        U32 bitOffset = (sizeof(pattern)*8) - 8;
+        while (ip < iEnd) {
+            BYTE const byte = (BYTE)(pattern >> bitOffset);
+            if (*ip != byte) break;
+            ip ++; bitOffset -= 8;
+    }   }
+
+    return (unsigned)(ip - iStart);
+}
+
+/* LZ4HC_reverseCountPattern() :
+ * pattern must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!)
+ * read using natural platform endianness */
+static unsigned
+LZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern)
+{
+    const BYTE* const iStart = ip;
+
+    while (likely(ip >= iLow+4)) {
+        if (LZ4_read32(ip-4) != pattern) break;
+        ip -= 4;
+    }
+    {   const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianness */
+        while (likely(ip>iLow)) {
+            if (ip[-1] != *bytePtr) break;
+            ip--; bytePtr--;
+    }   }
+    return (unsigned)(iStart - ip);
+}
+
+/* LZ4HC_protectDictEnd() :
+ * Checks if the match is in the last 3 bytes of the dictionary, so reading the
+ * 4 byte MINMATCH would overflow.
+ * @returns true if the match index is okay.
+ */
+static int LZ4HC_protectDictEnd(U32 const dictLimit, U32 const matchIndex)
+{
+    return ((U32)((dictLimit - 1) - matchIndex) >= 3);
+}
+
+typedef enum { rep_untested, rep_not, rep_confirmed } repeat_state_e;
+typedef enum { favorCompressionRatio=0, favorDecompressionSpeed } HCfavor_e;
+
+typedef struct {
+    int off;
+    int len;
+} LZ4HC_match_t;
+
+LZ4_FORCE_INLINE LZ4HC_match_t
+LZ4HC_InsertAndGetWiderMatch (
+        LZ4HC_CCtx_internal* const hc4,
+        const BYTE* const ip,
+        const BYTE* const iLowLimit, const BYTE* const iHighLimit,
+        int longest,
+        const BYTE** startpos,
+        const int maxNbAttempts,
+        const int patternAnalysis, const int chainSwap,
+        const dictCtx_directive dict,
+        const HCfavor_e favorDecSpeed)
+{
+    U16* const chainTable = hc4->chainTable;
+    U32* const hashTable = hc4->hashTable;
+    const LZ4HC_CCtx_internal * const dictCtx = hc4->dictCtx;
+    const BYTE* const prefixPtr = hc4->prefixStart;
+    const U32 prefixIdx = hc4->dictLimit;
+    const U32 ipIndex = (U32)(ip - prefixPtr) + prefixIdx;
+    const int withinStartDistance = (hc4->lowLimit + (LZ4_DISTANCE_MAX + 1) > ipIndex);
+    const U32 lowestMatchIndex = (withinStartDistance) ?
hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX; + const BYTE* const dictStart = hc4->dictStart; + const U32 dictIdx = hc4->lowLimit; + const BYTE* const dictEnd = dictStart + prefixIdx - dictIdx; + int const lookBackLength = (int)(ip-iLowLimit); + int nbAttempts = maxNbAttempts; + U32 matchChainPos = 0; + U32 const pattern = LZ4_read32(ip); + U32 matchIndex; + repeat_state_e repeat = rep_untested; + size_t srcPatternLength = 0; + int offset = 0; + + DEBUGLOG(7, "LZ4HC_InsertAndGetWiderMatch"); + assert(startpos != NULL); + *startpos = ip; /* in case there is no solution */ + /* First Match */ + LZ4HC_Insert(hc4, ip); /* insert all prior positions up to ip (excluded) */ + matchIndex = hashTable[LZ4HC_hashPtr(ip)]; + DEBUGLOG(7, "First candidate match for pos %u found at index %u / %u (lowestMatchIndex)", + ipIndex, matchIndex, lowestMatchIndex); + + while ((matchIndex>=lowestMatchIndex) && (nbAttempts>0)) { + int matchLength=0; + nbAttempts--; + assert(matchIndex < ipIndex); + if (favorDecSpeed && (ipIndex - matchIndex < 8)) { + /* do nothing: + * favorDecSpeed intentionally skips matches with offset < 8 */ + } else if (matchIndex >= prefixIdx) { /* within current Prefix */ + const BYTE* const matchPtr = prefixPtr + (matchIndex - prefixIdx); + assert(matchPtr < ip); + assert(longest >= 1); + if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - lookBackLength + longest - 1)) { + if (LZ4_read32(matchPtr) == pattern) { + int const back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, prefixPtr) : 0; + matchLength = MINMATCH + (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + offset = (int)(ipIndex - matchIndex); + *startpos = ip + back; + DEBUGLOG(7, "Found match of len=%i within prefix, offset=%i, back=%i", longest, offset, -back); + } } } + } else { /* lowestMatchIndex <= matchIndex < dictLimit : within Ext Dict */ + const BYTE* const matchPtr = dictStart + (matchIndex - dictIdx); + assert(matchIndex >= dictIdx); + if ( likely(matchIndex <= prefixIdx - 4) + && (LZ4_read32(matchPtr) == pattern) ) { + int back = 0; + const BYTE* vLimit = ip + (prefixIdx - matchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + matchLength = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + if ((ip+matchLength == vLimit) && (vLimit < iHighLimit)) + matchLength += LZ4_count(ip+matchLength, prefixPtr, iHighLimit); + back = lookBackLength ? 
LZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0; + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + offset = (int)(ipIndex - matchIndex); + *startpos = ip + back; + DEBUGLOG(7, "Found match of len=%i within dict, offset=%i, back=%i", longest, offset, -back); + } } } + + if (chainSwap && matchLength==longest) { /* better match => select a better chain */ + assert(lookBackLength==0); /* search forward only */ + if (matchIndex + (U32)longest <= ipIndex) { + int const kTrigger = 4; + U32 distanceToNextMatch = 1; + int const end = longest - MINMATCH + 1; + int step = 1; + int accel = 1 << kTrigger; + int pos; + for (pos = 0; pos < end; pos += step) { + U32 const candidateDist = DELTANEXTU16(chainTable, matchIndex + (U32)pos); + step = (accel++ >> kTrigger); + if (candidateDist > distanceToNextMatch) { + distanceToNextMatch = candidateDist; + matchChainPos = (U32)pos; + accel = 1 << kTrigger; + } } + if (distanceToNextMatch > 1) { + if (distanceToNextMatch > matchIndex) break; /* avoid overflow */ + matchIndex -= distanceToNextMatch; + continue; + } } } + + { U32 const distNextMatch = DELTANEXTU16(chainTable, matchIndex); + if (patternAnalysis && distNextMatch==1 && matchChainPos==0) { + U32 const matchCandidateIdx = matchIndex-1; + /* may be a repeated pattern */ + if (repeat == rep_untested) { + if ( ((pattern & 0xFFFF) == (pattern >> 16)) + & ((pattern & 0xFF) == (pattern >> 24)) ) { + DEBUGLOG(7, "Repeat pattern detected, char %02X", pattern >> 24); + repeat = rep_confirmed; + srcPatternLength = LZ4HC_countPattern(ip+sizeof(pattern), iHighLimit, pattern) + sizeof(pattern); + } else { + repeat = rep_not; + } } + if ( (repeat == rep_confirmed) && (matchCandidateIdx >= lowestMatchIndex) + && LZ4HC_protectDictEnd(prefixIdx, matchCandidateIdx) ) { + const int extDict = matchCandidateIdx < prefixIdx; + const BYTE* const matchPtr = extDict ? dictStart + (matchCandidateIdx - dictIdx) : prefixPtr + (matchCandidateIdx - prefixIdx); + if (LZ4_read32(matchPtr) == pattern) { /* good candidate */ + const BYTE* const iLimit = extDict ? dictEnd : iHighLimit; + size_t forwardPatternLength = LZ4HC_countPattern(matchPtr+sizeof(pattern), iLimit, pattern) + sizeof(pattern); + if (extDict && matchPtr + forwardPatternLength == iLimit) { + U32 const rotatedPattern = LZ4HC_rotatePattern(forwardPatternLength, pattern); + forwardPatternLength += LZ4HC_countPattern(prefixPtr, iHighLimit, rotatedPattern); + } + { const BYTE* const lowestMatchPtr = extDict ? 
dictStart : prefixPtr; + size_t backLength = LZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern); + size_t currentSegmentLength; + if (!extDict + && matchPtr - backLength == prefixPtr + && dictIdx < prefixIdx) { + U32 const rotatedPattern = LZ4HC_rotatePattern((U32)(-(int)backLength), pattern); + backLength += LZ4HC_reverseCountPattern(dictEnd, dictStart, rotatedPattern); + } + /* Limit backLength not go further than lowestMatchIndex */ + backLength = matchCandidateIdx - MAX(matchCandidateIdx - (U32)backLength, lowestMatchIndex); + assert(matchCandidateIdx - backLength >= lowestMatchIndex); + currentSegmentLength = backLength + forwardPatternLength; + /* Adjust to end of pattern if the source pattern fits, otherwise the beginning of the pattern */ + if ( (currentSegmentLength >= srcPatternLength) /* current pattern segment large enough to contain full srcPatternLength */ + && (forwardPatternLength <= srcPatternLength) ) { /* haven't reached this position yet */ + U32 const newMatchIndex = matchCandidateIdx + (U32)forwardPatternLength - (U32)srcPatternLength; /* best position, full pattern, might be followed by more match */ + if (LZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) + matchIndex = newMatchIndex; + else { + /* Can only happen if started in the prefix */ + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; + } + } else { + U32 const newMatchIndex = matchCandidateIdx - (U32)backLength; /* farthest position in current segment, will find a match of length currentSegmentLength + maybe some back */ + if (!LZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) { + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; + } else { + matchIndex = newMatchIndex; + if (lookBackLength==0) { /* no back possible */ + size_t const maxML = MIN(currentSegmentLength, srcPatternLength); + if ((size_t)longest < maxML) { + assert(prefixPtr - prefixIdx + matchIndex != ip); + if ((size_t)(ip - prefixPtr) + prefixIdx - matchIndex > LZ4_DISTANCE_MAX) break; + assert(maxML < 2 GB); + longest = (int)maxML; + offset = (int)(ipIndex - matchIndex); + *startpos = ip; + DEBUGLOG(7, "Found repeat pattern match of len=%i, offset=%i", longest, offset); + } + { U32 const distToNextPattern = DELTANEXTU16(chainTable, matchIndex); + if (distToNextPattern > matchIndex) break; /* avoid overflow */ + matchIndex -= distToNextPattern; + } } } } } + continue; + } } + } } /* PA optimization */ + + /* follow current chain */ + matchIndex -= DELTANEXTU16(chainTable, matchIndex + matchChainPos); + + } /* while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) */ + + if ( dict == usingDictCtxHc + && nbAttempts > 0 + && ipIndex - lowestMatchIndex < LZ4_DISTANCE_MAX) { + size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->prefixStart) + dictCtx->dictLimit; + U32 dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)]; + assert(dictEndOffset <= 1 GB); + matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset; + if (dictMatchIndex>0) DEBUGLOG(7, "dictEndOffset = %zu, dictMatchIndex = %u => relative matchIndex = %i", dictEndOffset, dictMatchIndex, (int)dictMatchIndex - (int)dictEndOffset); + while (ipIndex - matchIndex <= LZ4_DISTANCE_MAX && nbAttempts--) { + const BYTE* const matchPtr = dictCtx->prefixStart - dictCtx->dictLimit + dictMatchIndex; + + if (LZ4_read32(matchPtr) == pattern) { + int mlt; + int back = 0; + const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex); + if (vLimit > iHighLimit) 
vLimit = iHighLimit; + mlt = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->prefixStart) : 0; + mlt -= back; + if (mlt > longest) { + longest = mlt; + offset = (int)(ipIndex - matchIndex); + *startpos = ip + back; + DEBUGLOG(7, "found match of length %i within extDictCtx", longest); + } } + + { U32 const nextOffset = DELTANEXTU16(dictCtx->chainTable, dictMatchIndex); + dictMatchIndex -= nextOffset; + matchIndex -= nextOffset; + } } } + + { LZ4HC_match_t md; + assert(longest >= 0); + md.len = longest; + md.off = offset; + return md; + } +} + +LZ4_FORCE_INLINE LZ4HC_match_t +LZ4HC_InsertAndFindBestMatch(LZ4HC_CCtx_internal* const hc4, /* Index table will be updated */ + const BYTE* const ip, const BYTE* const iLimit, + const int maxNbAttempts, + const int patternAnalysis, + const dictCtx_directive dict) +{ + const BYTE* uselessPtr = ip; + DEBUGLOG(7, "LZ4HC_InsertAndFindBestMatch"); + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + return LZ4HC_InsertAndGetWiderMatch(hc4, ip, ip, iLimit, MINMATCH-1, &uselessPtr, maxNbAttempts, patternAnalysis, 0 /*chainSwap*/, dict, favorCompressionRatio); +} + +/* LZ4HC_encodeSequence() : + * @return : 0 if ok, + * 1 if buffer issue detected */ +LZ4_FORCE_INLINE int LZ4HC_encodeSequence ( + const BYTE** _ip, + BYTE** _op, + const BYTE** _anchor, + int matchLength, + int offset, + limitedOutput_directive limit, + BYTE* oend) +{ +#define ip (*_ip) +#define op (*_op) +#define anchor (*_anchor) + + size_t length; + BYTE* const token = op++; + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 6) + static const BYTE* start = NULL; + static U32 totalCost = 0; + U32 const pos = (start==NULL) ? 0 : (U32)(anchor - start); + U32 const ll = (U32)(ip - anchor); + U32 const llAdd = (ll>=15) ? ((ll-15) / 255) + 1 : 0; + U32 const mlAdd = (matchLength>=19) ? 
((matchLength-19) / 255) + 1 : 0; + U32 const cost = 1 + llAdd + ll + 2 + mlAdd; + if (start==NULL) start = anchor; /* only works for single segment */ + /* g_debuglog_enable = (pos >= 2228) & (pos <= 2262); */ + DEBUGLOG(6, "pos:%7u -- literals:%4u, match:%4i, offset:%5i, cost:%4u + %5u", + pos, + (U32)(ip - anchor), matchLength, offset, + cost, totalCost); + totalCost += cost; +#endif + + /* Encode Literal length */ + length = (size_t)(ip - anchor); + LZ4_STATIC_ASSERT(notLimited == 0); + /* Check output limit */ + if (limit && ((op + (length / 255) + length + (2 + 1 + LASTLITERALS)) > oend)) { + DEBUGLOG(6, "Not enough room to write %i literals (%i bytes remaining)", + (int)length, (int)(oend - op)); + return 1; + } + if (length >= RUN_MASK) { + size_t len = length - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for(; len >= 255 ; len -= 255) *op++ = 255; + *op++ = (BYTE)len; + } else { + *token = (BYTE)(length << ML_BITS); + } + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + length); + op += length; + + /* Encode Offset */ + assert(offset <= LZ4_DISTANCE_MAX ); + assert(offset > 0); + LZ4_writeLE16(op, (U16)(offset)); op += 2; + + /* Encode MatchLength */ + assert(matchLength >= MINMATCH); + length = (size_t)matchLength - MINMATCH; + if (limit && (op + (length / 255) + (1 + LASTLITERALS) > oend)) { + DEBUGLOG(6, "Not enough room to write match length"); + return 1; /* Check output limit */ + } + if (length >= ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for(; length >= 510 ; length -= 510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length -= 255; *op++ = 255; } + *op++ = (BYTE)length; + } else { + *token += (BYTE)(length); + } + + /* Prepare next loop */ + ip += matchLength; + anchor = ip; + + return 0; +} +#undef ip +#undef op +#undef anchor + +LZ4_FORCE_INLINE int LZ4HC_compress_hashChain ( + LZ4HC_CCtx_internal* const ctx, + const char* const source, + char* const dest, + int* srcSizePtr, + int const maxOutputSize, + int maxNbAttempts, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + const int inputSize = *srcSizePtr; + const int patternAnalysis = (maxNbAttempts > 128); /* levels 9+ */ + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + + BYTE* optr = (BYTE*) dest; + BYTE* op = (BYTE*) dest; + BYTE* oend = op + maxOutputSize; + + const BYTE* start0; + const BYTE* start2 = NULL; + const BYTE* start3 = NULL; + LZ4HC_match_t m0, m1, m2, m3; + const LZ4HC_match_t nomatch = {0, 0}; + + /* init */ + DEBUGLOG(5, "LZ4HC_compress_hashChain (dict?=>%i)", dict); + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (inputSize < LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* Main Loop */ + while (ip <= mflimit) { + m1 = LZ4HC_InsertAndFindBestMatch(ctx, ip, matchlimit, maxNbAttempts, patternAnalysis, dict); + if (m1.len encode ML1 immediately */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m1.len, m1.off, limit, oend)) goto _dest_overflow; + continue; + } + + if (start0 < ip) { /* first match was skipped at least once */ + if (start2 < ip + m0.len) { /* squeezing ML1 between ML0(original ML1) and ML2 */ + ip = start0; m1 = m0; /* restore initial Match1 */ + } } + + /* Here, start0==ip */ + if ((start2 - ip) < 3) { /* First Match too small : 
removed */ + ip = start2; + m1 = m2; + goto _Search2; + } + +_Search3: + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = m1.len; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; + if (ip+new_ml > start2 + m2.len - MINMATCH) + new_ml = (int)(start2 - ip) + m2.len - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + m2.len -= correction; + } + } + + if (start2 + m2.len <= mflimit) { + m3 = LZ4HC_InsertAndGetWiderMatch(ctx, + start2 + m2.len - 3, start2, matchlimit, m2.len, &start3, + maxNbAttempts, patternAnalysis, 0, dict, favorCompressionRatio); + } else { + m3 = nomatch; /* do not search further */ + } + + if (m3.len <= m2.len) { /* No better match => encode ML1 and ML2 */ + /* ip & ref are known; Now for ml */ + if (start2 < ip+m1.len) m1.len = (int)(start2 - ip); + /* Now, encode 2 sequences */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m1.len, m1.off, limit, oend)) + goto _dest_overflow; + ip = start2; + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m2.len, m2.off, limit, oend)) { + m1 = m2; + goto _dest_overflow; + } + continue; + } + + if (start3 < ip+m1.len+3) { /* Not enough space for match 2 : remove it */ + if (start3 >= (ip+m1.len)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + if (start2 < ip+m1.len) { + int correction = (int)(ip+m1.len - start2); + start2 += correction; + m2.len -= correction; + if (m2.len < MINMATCH) { + start2 = start3; + m2 = m3; + } + } + + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m1.len, m1.off, limit, oend)) goto _dest_overflow; + ip = start3; + m1 = m3; + + start0 = start2; + m0 = m2; + goto _Search2; + } + + start2 = start3; + m2 = m3; + goto _Search3; + } + + /* + * OK, now we have 3 ascending matches; + * let's write the first one ML1. + * ip & ref are known; Now decide ml. 
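+     * (Descriptive note: per the code below, if ML1 overlaps the start of
+     * ML2, ML1 is either truncated at start2, or extended up to OPTIMAL_ML
+     * with start2 pushed forward and ML2 shrunk by the same amount, so the
+     * two encoded sequences abut without losing match bytes.)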
+ */ + if (start2 < ip+m1.len) { + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + if (m1.len > OPTIMAL_ML) m1.len = OPTIMAL_ML; + if (ip + m1.len > start2 + m2.len - MINMATCH) + m1.len = (int)(start2 - ip) + m2.len - MINMATCH; + correction = m1.len - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + m2.len -= correction; + } + } else { + m1.len = (int)(start2 - ip); + } + } + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m1.len, m1.off, limit, oend)) goto _dest_overflow; + + /* ML2 becomes ML1 */ + ip = start2; m1 = m2; + + /* ML3 becomes ML2 */ + start2 = start3; m2 = m3; + + /* let's find a new ML3 */ + goto _Search3; + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) return 0; + /* adapt lastRunSize to fill 'dest' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRunSize); + op += lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + return (int) (((char*)op)-dest); + +_dest_overflow: + if (limit == fillOutput) { + /* Assumption : ip, anchor, ml and ref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing"); + op = optr; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(m1.len >= 0); + if ((size_t)m1.len > maxMlSize) m1.len = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + m1.len >= MFLIMIT) { + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), m1.len, m1.off, notLimited, oend); + } } + goto _last_literals; + } + /* compression failed */ + return 0; +} + + +static int LZ4HC_compress_optimal( LZ4HC_CCtx_internal* ctx, + const char* const source, char* dst, + int* srcSizePtr, int dstCapacity, + int const nbSearches, size_t sufficient_len, + const limitedOutput_directive limit, int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed); + + +LZ4_FORCE_INLINE int +LZ4HC_compress_generic_internal ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + typedef enum { lz4hc, lz4opt } lz4hc_strat_e; + typedef struct { + lz4hc_strat_e strat; + int nbSearches; + U32 targetLength; + } cParams_t; + static const 
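+    /* one row per compression level : levels 3-9 use the hash-chain parser
+     * with a roughly doubling search depth, levels 10+ switch to the
+     * optimal parser */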
cParams_t clTable[LZ4HC_CLEVEL_MAX+1] = { + { lz4hc, 2, 16 }, /* 0, unused */ + { lz4hc, 2, 16 }, /* 1, unused */ + { lz4hc, 2, 16 }, /* 2, unused */ + { lz4hc, 4, 16 }, /* 3 */ + { lz4hc, 8, 16 }, /* 4 */ + { lz4hc, 16, 16 }, /* 5 */ + { lz4hc, 32, 16 }, /* 6 */ + { lz4hc, 64, 16 }, /* 7 */ + { lz4hc, 128, 16 }, /* 8 */ + { lz4hc, 256, 16 }, /* 9 */ + { lz4opt, 96, 64 }, /*10==LZ4HC_CLEVEL_OPT_MIN*/ + { lz4opt, 512,128 }, /*11 */ + { lz4opt,16384,LZ4_OPT_NUM }, /* 12==LZ4HC_CLEVEL_MAX */ + }; + + DEBUGLOG(5, "LZ4HC_compress_generic_internal(src=%p, srcSize=%d)", + src, *srcSizePtr); + + if (limit == fillOutput && dstCapacity < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size (too large or negative) */ + + ctx->end += *srcSizePtr; + if (cLevel < 1) cLevel = LZ4HC_CLEVEL_DEFAULT; /* note : convention is different from lz4frame, maybe something to review */ + cLevel = MIN(LZ4HC_CLEVEL_MAX, cLevel); + { cParams_t const cParam = clTable[cLevel]; + HCfavor_e const favor = ctx->favorDecSpeed ? favorDecompressionSpeed : favorCompressionRatio; + int result; + + if (cParam.strat == lz4hc) { + result = LZ4HC_compress_hashChain(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, limit, dict); + } else { + assert(cParam.strat == lz4opt); + result = LZ4HC_compress_optimal(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, cParam.targetLength, limit, + cLevel == LZ4HC_CLEVEL_MAX, /* ultra mode */ + dict, favor); + } + if (result <= 0) ctx->dirty = 1; + return result; + } +} + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock); + +static int +LZ4HC_compress_generic_noDictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + assert(ctx->dictCtx == NULL); + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, noDictCtx); +} + +static int +LZ4HC_compress_generic_dictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + const size_t position = (size_t)(ctx->end - ctx->prefixStart) + (ctx->dictLimit - ctx->lowLimit); + assert(ctx->dictCtx != NULL); + if (position >= 64 KB) { + ctx->dictCtx = NULL; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else if (position == 0 && *srcSizePtr > 4 KB) { + LZ4_memcpy(ctx, ctx->dictCtx, sizeof(LZ4HC_CCtx_internal)); + LZ4HC_setExternalDict(ctx, (const BYTE *)src); + ctx->compressionLevel = (short)cLevel; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, usingDictCtxHc); + } +} + +static int +LZ4HC_compress_generic ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + if (ctx->dictCtx == NULL) { + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_dictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } +} + + +int LZ4_sizeofStateHC(void) { return (int)sizeof(LZ4_streamHC_t); } + +static size_t 
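+/* computes the alignment requirement of LZ4_streamHC_t portably,
+ * via the classic "struct { char c; T t; }" padding probe */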
LZ4_streamHC_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_streamHC_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_streamHC_t); +#else + return 1; /* effectively disabled */ +#endif +} + +/* state is presumed correctly initialized, + * in which case its size and alignment have already been validate */ +int LZ4_compress_HC_extStateHC_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4HC_CCtx_internal* const ctx = &((LZ4_streamHC_t*)state)->internal_donotuse; + if (!LZ4_isAligned(state, LZ4_streamHC_t_alignment())) return 0; + LZ4_resetStreamHC_fast((LZ4_streamHC_t*)state, compressionLevel); + LZ4HC_init_internal (ctx, (const BYTE*)src); + if (dstCapacity < LZ4_compressBound(srcSize)) + return LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, limitedOutput); + else + return LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, notLimited); +} + +int LZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + return LZ4_compress_HC_extStateHC_fastReset(state, src, dst, srcSize, dstCapacity, compressionLevel); +} + +int LZ4_compress_HC(const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + int cSize; +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)ALLOC(sizeof(LZ4_streamHC_t)); + if (statePtr==NULL) return 0; +#else + LZ4_streamHC_t state; + LZ4_streamHC_t* const statePtr = &state; +#endif + DEBUGLOG(5, "LZ4_compress_HC") + cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel); +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + FREEMEM(statePtr); +#endif + return cSize; +} + +/* state is presumed sized correctly (>= sizeof(LZ4_streamHC_t)) */ +int LZ4_compress_HC_destSize(void* state, const char* source, char* dest, int* sourceSizePtr, int targetDestSize, int cLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + LZ4HC_init_internal(&ctx->internal_donotuse, (const BYTE*) source); + LZ4_setCompressionLevel(ctx, cLevel); + return LZ4HC_compress_generic(&ctx->internal_donotuse, source, dest, sourceSizePtr, targetDestSize, cLevel, fillOutput); +} + + + +/************************************** +* Streaming Functions +**************************************/ +/* allocation */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamHC_t* LZ4_createStreamHC(void) +{ + LZ4_streamHC_t* const state = + (LZ4_streamHC_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamHC_t)); + if (state == NULL) return NULL; + LZ4_setCompressionLevel(state, LZ4HC_CLEVEL_DEFAULT); + return state; +} + +int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr) +{ + DEBUGLOG(4, "LZ4_freeStreamHC(%p)", LZ4_streamHCPtr); + if (!LZ4_streamHCPtr) return 0; /* support free on NULL */ + FREEMEM(LZ4_streamHCPtr); + return 0; +} +#endif + + +LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size) +{ + LZ4_streamHC_t* const LZ4_streamHCPtr = (LZ4_streamHC_t*)buffer; + DEBUGLOG(4, "LZ4_initStreamHC(%p, %u)", buffer, (unsigned)size); + /* check conditions */ + if (buffer == NULL) return NULL; + if (size < sizeof(LZ4_streamHC_t)) return NULL; + if (!LZ4_isAligned(buffer, LZ4_streamHC_t_alignment())) return NULL; + /* 
init */ + { LZ4HC_CCtx_internal* const hcstate = &(LZ4_streamHCPtr->internal_donotuse); + MEM_INIT(hcstate, 0, sizeof(*hcstate)); } + LZ4_setCompressionLevel(LZ4_streamHCPtr, LZ4HC_CLEVEL_DEFAULT); + return LZ4_streamHCPtr; +} + +/* just a stub */ +void LZ4_resetStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_resetStreamHC_fast (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + LZ4HC_CCtx_internal* const s = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(5, "LZ4_resetStreamHC_fast(%p, %d)", LZ4_streamHCPtr, compressionLevel); + if (s->dirty) { + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + } else { + assert(s->end >= s->prefixStart); + s->dictLimit += (U32)(s->end - s->prefixStart); + s->prefixStart = NULL; + s->end = NULL; + s->dictCtx = NULL; + } + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_setCompressionLevel(LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + DEBUGLOG(5, "LZ4_setCompressionLevel(%p, %d)", LZ4_streamHCPtr, compressionLevel); + if (compressionLevel < 1) compressionLevel = LZ4HC_CLEVEL_DEFAULT; + if (compressionLevel > LZ4HC_CLEVEL_MAX) compressionLevel = LZ4HC_CLEVEL_MAX; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (short)compressionLevel; +} + +void LZ4_favorDecompressionSpeed(LZ4_streamHC_t* LZ4_streamHCPtr, int favor) +{ + LZ4_streamHCPtr->internal_donotuse.favorDecSpeed = (favor!=0); +} + +/* LZ4_loadDictHC() : + * LZ4_streamHCPtr is presumed properly initialized */ +int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* dictionary, int dictSize) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(4, "LZ4_loadDictHC(ctx:%p, dict:%p, dictSize:%d)", LZ4_streamHCPtr, dictionary, dictSize); + assert(LZ4_streamHCPtr != NULL); + if (dictSize > 64 KB) { + dictionary += (size_t)dictSize - 64 KB; + dictSize = 64 KB; + } + /* need a full initialization, there are bad side-effects when using resetFast() */ + { int const cLevel = ctxPtr->compressionLevel; + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, cLevel); + } + LZ4HC_init_internal (ctxPtr, (const BYTE*)dictionary); + ctxPtr->end = (const BYTE*)dictionary + dictSize; + if (dictSize >= LZ4HC_HASHSIZE) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); + return dictSize; +} + +void LZ4_attach_HC_dictionary(LZ4_streamHC_t *working_stream, const LZ4_streamHC_t *dictionary_stream) { + working_stream->internal_donotuse.dictCtx = dictionary_stream != NULL ? 
&(dictionary_stream->internal_donotuse) : NULL; +} + +/* compression */ + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock) +{ + DEBUGLOG(4, "LZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock); + if (ctxPtr->end >= ctxPtr->prefixStart + 4) + LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + + /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + ctxPtr->dictLimit += (U32)(ctxPtr->end - ctxPtr->prefixStart); + ctxPtr->prefixStart = newBlock; + ctxPtr->end = newBlock; + ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ + + /* cannot reference an extDict and a dictCtx at the same time */ + ctxPtr->dictCtx = NULL; +} + +static int +LZ4_compressHC_continue_generic (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int dstCapacity, + limitedOutput_directive limit) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(5, "LZ4_compressHC_continue_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)", + LZ4_streamHCPtr, src, *srcSizePtr, limit); + assert(ctxPtr != NULL); + /* auto-init if forgotten */ + if (ctxPtr->prefixStart == NULL) LZ4HC_init_internal (ctxPtr, (const BYTE*) src); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->prefixStart) + ctxPtr->dictLimit > 2 GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->prefixStart); + if (dictSize > 64 KB) dictSize = 64 KB; + LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE*)src != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE*)src); + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) src + *srcSizePtr; + const BYTE* const dictBegin = ctxPtr->dictStart; + const BYTE* const dictEnd = ctxPtr->dictStart + (ctxPtr->dictLimit - ctxPtr->lowLimit); + if ((sourceEnd > dictBegin) && ((const BYTE*)src < dictEnd)) { + if (sourceEnd > dictEnd) sourceEnd = dictEnd; + ctxPtr->lowLimit += (U32)(sourceEnd - ctxPtr->dictStart); + ctxPtr->dictStart += (U32)(sourceEnd - ctxPtr->dictStart); + /* invalidate dictionary is it's too small */ + if (ctxPtr->dictLimit - ctxPtr->lowLimit < LZ4HC_HASHSIZE) { + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + } } } + + return LZ4HC_compress_generic (ctxPtr, src, dst, srcSizePtr, dstCapacity, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int srcSize, int dstCapacity) +{ + DEBUGLOG(5, "LZ4_compress_HC_continue"); + if (dstCapacity < LZ4_compressBound(srcSize)) + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, limitedOutput); + else + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, notLimited); +} + +int LZ4_compress_HC_continue_destSize (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int* srcSizePtr, int targetDestSize) +{ + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, src, dst, srcSizePtr, targetDestSize, fillOutput); +} + + + +/* LZ4_saveDictHC : + * save history content + * into a user-provided buffer + * which is then used to continue compression + */ +int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize) +{ + 
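+    /* copies up to the last 64 KB of history into safeBuffer, then rebases
+     * the stream indexes so that future blocks keep referencing the saved
+     * dictionary at its new address */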
LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = (int)(streamPtr->end - streamPtr->prefixStart); + DEBUGLOG(5, "LZ4_saveDictHC(%p, %p, %d)", LZ4_streamHCPtr, safeBuffer, dictSize); + assert(prefixSize >= 0); + if (dictSize > 64 KB) dictSize = 64 KB; + if (dictSize < 4) dictSize = 0; + if (dictSize > prefixSize) dictSize = prefixSize; + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) + LZ4_memmove(safeBuffer, streamPtr->end - dictSize, (size_t)dictSize); + { U32 const endIndex = (U32)(streamPtr->end - streamPtr->prefixStart) + streamPtr->dictLimit; + streamPtr->end = (safeBuffer == NULL) ? NULL : (const BYTE*)safeBuffer + dictSize; + streamPtr->prefixStart = (const BYTE*)safeBuffer; + streamPtr->dictLimit = endIndex - (U32)dictSize; + streamPtr->lowLimit = endIndex - (U32)dictSize; + streamPtr->dictStart = streamPtr->prefixStart; + if (streamPtr->nextToUpdate < streamPtr->dictLimit) + streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} + + +/*************************************************** +* Deprecated Functions +***************************************************/ + +/* These functions currently generate deprecation warnings */ + +/* Wrappers for deprecated compression functions */ +int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_withStateHC (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2_withStateHC (void* state, const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, LZ4_compressBound(srcSize)); } +int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, maxDstSize); } + + +/* Deprecated streaming functions */ +int LZ4_sizeofStreamStateHC(void) { return sizeof(LZ4_streamHC_t); } + +/* state is presumed correctly sized, aka >= sizeof(LZ4_streamHC_t) + * @return : 0 on success, !=0 if error */ +int LZ4_resetStreamStateHC(void* state, char* inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_initStreamHC(state, sizeof(*hc4)); + if (hc4 == NULL) return 1; /* init failed */ + LZ4HC_init_internal (&hc4->internal_donotuse, 
(const BYTE*)inputBuffer); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_createHC (const char* inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_createStreamHC(); + if (hc4 == NULL) return NULL; /* not enough memory */ + LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + return hc4; +} + +int LZ4_freeHC (void* LZ4HC_Data) +{ + if (!LZ4HC_Data) return 0; /* support free on NULL */ + FREEMEM(LZ4HC_Data); + return 0; +} +#endif + +int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, 0, cLevel, notLimited); +} + +int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int dstCapacity, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, dstCapacity, cLevel, limitedOutput); +} + +char* LZ4_slideInputBufferHC(void* LZ4HC_Data) +{ + LZ4HC_CCtx_internal* const s = &((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse; + const BYTE* const bufferStart = s->prefixStart - s->dictLimit + s->lowLimit; + LZ4_resetStreamHC_fast((LZ4_streamHC_t*)LZ4HC_Data, s->compressionLevel); + /* ugly conversion trick, required to evade (const char*) -> (char*) cast-qual warning :( */ + return (char*)(uptrval)bufferStart; +} + + +/* ================================================ + * LZ4 Optimal parser (levels [LZ4HC_CLEVEL_OPT_MIN - LZ4HC_CLEVEL_MAX]) + * ===============================================*/ +typedef struct { + int price; + int off; + int mlen; + int litlen; +} LZ4HC_optimal_t; + +/* price in bytes */ +LZ4_FORCE_INLINE int LZ4HC_literalsPrice(int const litlen) +{ + int price = litlen; + assert(litlen >= 0); + if (litlen >= (int)RUN_MASK) + price += 1 + ((litlen-(int)RUN_MASK) / 255); + return price; +} + + +/* requires mlen >= MINMATCH */ +LZ4_FORCE_INLINE int LZ4HC_sequencePrice(int litlen, int mlen) +{ + int price = 1 + 2 ; /* token + 16-bit offset */ + assert(litlen >= 0); + assert(mlen >= MINMATCH); + + price += LZ4HC_literalsPrice(litlen); + + if (mlen >= (int)(ML_MASK+MINMATCH)) + price += 1 + ((mlen-(int)(ML_MASK+MINMATCH)) / 255); + + return price; +} + + + +LZ4_FORCE_INLINE LZ4HC_match_t +LZ4HC_FindLongerMatch(LZ4HC_CCtx_internal* const ctx, + const BYTE* ip, const BYTE* const iHighLimit, + int minLen, int nbSearches, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + LZ4HC_match_t const match0 = { 0 , 0 }; + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + ** so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + LZ4HC_match_t md = LZ4HC_InsertAndGetWiderMatch(ctx, ip, ip, iHighLimit, minLen, &ip, nbSearches, 1 /*patternAnalysis*/, 1 /*chainSwap*/, dict, favorDecSpeed); + if (md.len <= minLen) return match0; + if (favorDecSpeed) { + if ((md.len>18) & (md.len<=36)) md.len=18; /* favor shortcut */ + } + return md; +} + + +static int LZ4HC_compress_optimal ( LZ4HC_CCtx_internal* ctx, + const char* const source, + char* dst, + int* srcSizePtr, + int dstCapacity, + int const nbSearches, + size_t sufficient_len, + const limitedOutput_directive limit, + int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + int retval = 0; +#define TRAILING_LITERALS 3 +#if 
defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4HC_optimal_t* const opt = (LZ4HC_optimal_t*)ALLOC(sizeof(LZ4HC_optimal_t) * (LZ4_OPT_NUM + TRAILING_LITERALS)); +#else + LZ4HC_optimal_t opt[LZ4_OPT_NUM + TRAILING_LITERALS]; /* ~64 KB, which is a bit large for stack... */ +#endif + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + BYTE* op = (BYTE*) dst; + BYTE* opSaved = (BYTE*) dst; + BYTE* oend = op + dstCapacity; + int ovml = MINMATCH; /* overflow - last sequence */ + int ovoff = 0; + + /* init */ +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + if (opt == NULL) goto _return_label; +#endif + DEBUGLOG(5, "LZ4HC_compress_optimal(dst=%p, dstCapa=%u)", dst, (unsigned)dstCapacity); + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1; + + /* Main Loop */ + while (ip <= mflimit) { + int const llen = (int)(ip - anchor); + int best_mlen, best_off; + int cur, last_match_pos = 0; + + LZ4HC_match_t const firstMatch = LZ4HC_FindLongerMatch(ctx, ip, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + if (firstMatch.len==0) { ip++; continue; } + + if ((size_t)firstMatch.len > sufficient_len) { + /* good enough solution : immediate encoding */ + int const firstML = firstMatch.len; + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), firstML, firstMatch.off, limit, oend) ) { /* updates ip, op and anchor */ + ovml = firstML; + ovoff = firstMatch.off; + goto _dest_overflow; + } + continue; + } + + /* set prices for first positions (literals) */ + { int rPos; + for (rPos = 0 ; rPos < MINMATCH ; rPos++) { + int const cost = LZ4HC_literalsPrice(llen + rPos); + opt[rPos].mlen = 1; + opt[rPos].off = 0; + opt[rPos].litlen = llen + rPos; + opt[rPos].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + rPos, cost, opt[rPos].litlen); + } } + /* set prices using initial match */ + { int mlen = MINMATCH; + int const matchML = firstMatch.len; /* necessarily < sufficient_len < LZ4_OPT_NUM */ + int const offset = firstMatch.off; + assert(matchML < LZ4_OPT_NUM); + for ( ; mlen <= matchML ; mlen++) { + int const cost = LZ4HC_sequencePrice(llen, mlen); + opt[mlen].mlen = mlen; + opt[mlen].off = offset; + opt[mlen].litlen = llen; + opt[mlen].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i) -- initial setup", + mlen, cost, mlen); + } } + last_match_pos = firstMatch.len; + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + + /* check further positions */ + for (cur = 1; cur < last_match_pos; cur++) { + const BYTE* const curPtr = ip + cur; + LZ4HC_match_t newMatch; + + if (curPtr > mflimit) break; + DEBUGLOG(7, "rPos:%u[%u] vs [%u]%u", + cur, opt[cur].price, opt[cur+1].price, cur+1); + if (fullUpdate) { + /* not useful to search here if next position has same (or lower) cost */ + if ( (opt[cur+1].price <= opt[cur].price) + /* in some cases, next position has same cost, 
but cost rises sharply after, so a small match would still be beneficial */ + && (opt[cur+MINMATCH].price < opt[cur].price + 3/*min seq price*/) ) + continue; + } else { + /* not useful to search here if next position has same (or lower) cost */ + if (opt[cur+1].price <= opt[cur].price) continue; + } + + DEBUGLOG(7, "search at rPos:%u", cur); + if (fullUpdate) + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + else + /* only test matches of minimum length; slightly faster, but misses a few bytes */ + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, last_match_pos - cur, nbSearches, dict, favorDecSpeed); + if (!newMatch.len) continue; + + if ( ((size_t)newMatch.len > sufficient_len) + || (newMatch.len + cur >= LZ4_OPT_NUM) ) { + /* immediate encoding */ + best_mlen = newMatch.len; + best_off = newMatch.off; + last_match_pos = cur + 1; + goto encode; + } + + /* before match : set price with literals at beginning */ + { int const baseLitlen = opt[cur].litlen; + int litlen; + for (litlen = 1; litlen < MINMATCH; litlen++) { + int const price = opt[cur].price - LZ4HC_literalsPrice(baseLitlen) + LZ4HC_literalsPrice(baseLitlen+litlen); + int const pos = cur + litlen; + if (price < opt[pos].price) { + opt[pos].mlen = 1; /* literal */ + opt[pos].off = 0; + opt[pos].litlen = baseLitlen+litlen; + opt[pos].price = price; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", + pos, price, opt[pos].litlen); + } } } + + /* set prices using match at position = cur */ + { int const matchML = newMatch.len; + int ml = MINMATCH; + + assert(cur + newMatch.len < LZ4_OPT_NUM); + for ( ; ml <= matchML ; ml++) { + int const pos = cur + ml; + int const offset = newMatch.off; + int price; + int ll; + DEBUGLOG(7, "testing price rPos %i (last_match_pos=%i)", + pos, last_match_pos); + if (opt[cur].mlen == 1) { + ll = opt[cur].litlen; + price = ((cur > ll) ? 
opt[cur - ll].price : 0) + + LZ4HC_sequencePrice(ll, ml); + } else { + ll = 0; + price = opt[cur].price + LZ4HC_sequencePrice(0, ml); + } + + assert((U32)favorDecSpeed <= 1); + if (pos > last_match_pos+TRAILING_LITERALS + || price <= opt[pos].price - (int)favorDecSpeed) { + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i)", + pos, price, ml); + assert(pos < LZ4_OPT_NUM); + if ( (ml == matchML) /* last pos of last match */ + && (last_match_pos < pos) ) + last_match_pos = pos; + opt[pos].mlen = ml; + opt[pos].off = offset; + opt[pos].litlen = ll; + opt[pos].price = price; + } } } + /* complete following positions with literals */ + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + } /* for (cur = 1; cur <= last_match_pos; cur++) */ + + assert(last_match_pos < LZ4_OPT_NUM + TRAILING_LITERALS); + best_mlen = opt[last_match_pos].mlen; + best_off = opt[last_match_pos].off; + cur = last_match_pos - best_mlen; + +encode: /* cur, last_match_pos, best_mlen, best_off must be set */ + assert(cur < LZ4_OPT_NUM); + assert(last_match_pos >= 1); /* == 1 when only one candidate */ + DEBUGLOG(6, "reverse traversal, looking for shortest path (last_match_pos=%i)", last_match_pos); + { int candidate_pos = cur; + int selected_matchLength = best_mlen; + int selected_offset = best_off; + while (1) { /* from end to beginning */ + int const next_matchLength = opt[candidate_pos].mlen; /* can be 1, means literal */ + int const next_offset = opt[candidate_pos].off; + DEBUGLOG(7, "pos %i: sequence length %i", candidate_pos, selected_matchLength); + opt[candidate_pos].mlen = selected_matchLength; + opt[candidate_pos].off = selected_offset; + selected_matchLength = next_matchLength; + selected_offset = next_offset; + if (next_matchLength > candidate_pos) break; /* last match elected, first match to encode */ + assert(next_matchLength > 0); /* can be 1, means literal */ + candidate_pos -= next_matchLength; + } } + + /* encode all recorded sequences in order */ + { int rPos = 0; /* relative position (to ip) */ + while (rPos < last_match_pos) { + int const ml = opt[rPos].mlen; + int const offset = opt[rPos].off; + if (ml == 1) { ip++; rPos++; continue; } /* literal; note: can end up with several literals, in which case, skip them */ + rPos += ml; + assert(ml >= MINMATCH); + assert((offset >= 1) && (offset <= LZ4_DISTANCE_MAX)); + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, offset, limit, oend) ) { /* updates ip, op and anchor */ + ovml = ml; + ovoff = offset; + goto _dest_overflow; + } } } + } /* while (ip <= mflimit) */ + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) { /* Check output limit */ + retval = 0; + goto _return_label; + } + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } + 
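+        /* at this point lastRunSize (shrunk above when limit==fillOutput),
+         * together with its token and extra length bytes, fits in dst */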
DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRunSize); + op += lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + retval = (int) ((char*)op-dst); + goto _return_label; + +_dest_overflow: +if (limit == fillOutput) { + /* Assumption : ip, anchor, ovml and ovref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing (only %i bytes remaining)", (int)(oend-1-opSaved)); + op = opSaved; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(ovml >= 0); + if ((size_t)ovml > maxMlSize) ovml = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + ovml >= MFLIMIT) { + DEBUGLOG(6, "Space to end : %i + ml (%i)", (int)((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1), ovml); + DEBUGLOG(6, "Before : ip = %p, anchor = %p", ip, anchor); + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ovml, ovoff, notLimited, oend); + DEBUGLOG(6, "After : ip = %p, anchor = %p", ip, anchor); + } } + goto _last_literals; +} +_return_label: +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + FREEMEM(opt); +#endif + return retval; +} diff --git a/tools/common/lz4hc.h b/tools/common/lz4hc.h new file mode 100644 index 0000000000..e937acfefd --- /dev/null +++ b/tools/common/lz4hc.h @@ -0,0 +1,413 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Header File + Copyright (C) 2011-2020, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#ifndef LZ4_HC_H_19834876238432 +#define LZ4_HC_H_19834876238432 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* --- Dependency --- */ +/* note : lz4hc requires lz4.h/lz4.c for compilation */ +#include "lz4.h" /* stddef, LZ4LIB_API, LZ4_DEPRECATED */ + + +/* --- Useful constants --- */ +#define LZ4HC_CLEVEL_MIN 3 +#define LZ4HC_CLEVEL_DEFAULT 9 +#define LZ4HC_CLEVEL_OPT_MIN 10 +#define LZ4HC_CLEVEL_MAX 12 + + +/*-************************************ + * Block Compression + **************************************/ +/*! LZ4_compress_HC() : + * Compress data from `src` into `dst`, using the powerful but slower "HC" algorithm. + * `dst` must be already allocated. + * Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h") + * Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h") + * `compressionLevel` : any value between 1 and LZ4HC_CLEVEL_MAX will work. + * Values > LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX. + * @return : the number of bytes written into 'dst' + * or 0 if compression fails. + */ +LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel); + + +/* Note : + * Decompression functions are provided within "lz4.h" (BSD license) + */ + + +/*! LZ4_compress_HC_extStateHC() : + * Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`. + * `state` size is provided by LZ4_sizeofStateHC(). + * Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() should do properly). + */ +LZ4LIB_API int LZ4_sizeofStateHC(void); +LZ4LIB_API int LZ4_compress_HC_extStateHC(void* stateHC, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); + + +/*! LZ4_compress_HC_destSize() : v1.9.0+ + * Will compress as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided in 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr is updated to indicate how much bytes were read from `src` + */ +LZ4LIB_API int LZ4_compress_HC_destSize(void* stateHC, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize, + int compressionLevel); + + +/*-************************************ + * Streaming Compression + * Bufferless synchronous API + **************************************/ + typedef union LZ4_streamHC_u LZ4_streamHC_t; /* incomplete type (defined later) */ + +/*! LZ4_createStreamHC() and LZ4_freeStreamHC() : + * These functions create and release memory for LZ4 HC streaming state. + * Newly created states are automatically initialized. + * A same state can be used multiple times consecutively, + * starting with LZ4_resetStreamHC_fast() to start a new stream of blocks. + */ +LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void); +LZ4LIB_API int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); + +/* + These functions compress data in successive blocks of any size, + using previous blocks as dictionary, to improve compression ratio. + One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks. + There is an exception for ring buffers, which can be smaller than 64 KB. 
+ Ring-buffer scenario is automatically detected and handled within LZ4_compress_HC_continue(). + + Before starting compression, state must be allocated and properly initialized. + LZ4_createStreamHC() does both, though compression level is set to LZ4HC_CLEVEL_DEFAULT. + + Selecting the compression level can be done with LZ4_resetStreamHC_fast() (starts a new stream) + or LZ4_setCompressionLevel() (anytime, between blocks in the same stream) (experimental). + LZ4_resetStreamHC_fast() only works on states which have been properly initialized at least once, + which is automatically the case when state is created using LZ4_createStreamHC(). + + After reset, a first "fictional block" can be designated as initial dictionary, + using LZ4_loadDictHC() (Optional). + + Invoke LZ4_compress_HC_continue() to compress each successive block. + The number of blocks is unlimited. + Previous input blocks, including initial dictionary when present, + must remain accessible and unmodified during compression. + + It's allowed to update compression level anytime between blocks, + using LZ4_setCompressionLevel() (experimental). + + 'dst' buffer should be sized to handle worst case scenarios + (see LZ4_compressBound(), it ensures compression success). + In case of failure, the API does not guarantee recovery, + so the state _must_ be reset. + To ensure compression success + whenever `dst` buffer size cannot be made >= LZ4_compressBound(), + consider using LZ4_compress_HC_continue_destSize(). + + Whenever previous input blocks can't be preserved unmodified in-place during compression of next blocks, + it's possible to copy the last blocks into a more stable memory space, using LZ4_saveDictHC(). + Return value of LZ4_saveDictHC() is the size of dictionary effectively saved into 'safeBuffer' (<= 64 KB) + + After completing a streaming compression, + it's possible to start a new stream of blocks, using the same LZ4_streamHC_t state, + just by resetting it, using LZ4_resetStreamHC_fast(). +*/ + +LZ4LIB_API void LZ4_resetStreamHC_fast(LZ4_streamHC_t* streamHCPtr, int compressionLevel); /* v1.9.0+ */ +LZ4LIB_API int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); + +LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, + const char* src, char* dst, + int srcSize, int maxDstSize); + +/*! LZ4_compress_HC_continue_destSize() : v1.9.0+ + * Similar to LZ4_compress_HC_continue(), + * but will read as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided into 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr will be updated to indicate how much bytes were read from `src`. + * Note that this function may not consume the entire input. + */ +LZ4LIB_API int LZ4_compress_HC_continue_destSize(LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize); + +LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); + + + +/*^********************************************** + * !!!!!! STATIC LINKING ONLY !!!!!! + ***********************************************/ + +/*-****************************************************************** + * PRIVATE DEFINITIONS : + * Do not use these definitions directly. + * They are merely exposed to allow static allocation of `LZ4_streamHC_t`. + * Declare an `LZ4_streamHC_t` directly, rather than any type below. 
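+ * For example, a statically allocated state could look like this
+ * (illustrative sketch, not part of the original header):
+ *     LZ4_streamHC_t state;
+ *     LZ4_initStreamHC(&state, sizeof(state));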
+ * Even then, only do so in the context of static linking, as definitions may change between versions. + ********************************************************************/ + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<= LZ4HC_CLEVEL_OPT_MIN. + */ +LZ4LIB_STATIC_API void LZ4_favorDecompressionSpeed( + LZ4_streamHC_t* LZ4_streamHCPtr, int favor); + +/*! LZ4_resetStreamHC_fast() : v1.9.0+ + * When an LZ4_streamHC_t is known to be in a internally coherent state, + * it can often be prepared for a new compression with almost no work, only + * sometimes falling back to the full, expensive reset that is always required + * when the stream is in an indeterminate state (i.e., the reset performed by + * LZ4_resetStreamHC()). + * + * LZ4_streamHCs are guaranteed to be in a valid state when: + * - returned from LZ4_createStreamHC() + * - reset by LZ4_resetStreamHC() + * - memset(stream, 0, sizeof(LZ4_streamHC_t)) + * - the stream was in a valid state and was reset by LZ4_resetStreamHC_fast() + * - the stream was in a valid state and was then used in any compression call + * that returned success + * - the stream was in an indeterminate state and was used in a compression + * call that fully reset the state (LZ4_compress_HC_extStateHC()) and that + * returned success + * + * Note: + * A stream that was last used in a compression call that returned an error + * may be passed to this function. However, it will be fully reset, which will + * clear any existing history and settings from the context. + */ +LZ4LIB_STATIC_API void LZ4_resetStreamHC_fast( + LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel); + +/*! LZ4_compress_HC_extStateHC_fastReset() : + * A variant of LZ4_compress_HC_extStateHC(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStreamHC_fast() for a definition of + * "correctly initialized"). From a high level, the difference is that this + * function initializes the provided state with a call to + * LZ4_resetStreamHC_fast() while LZ4_compress_HC_extStateHC() starts with a + * call to LZ4_resetStreamHC(). + */ +LZ4LIB_STATIC_API int LZ4_compress_HC_extStateHC_fastReset ( + void* state, + const char* src, char* dst, + int srcSize, int dstCapacity, + int compressionLevel); + +/*! LZ4_attach_HC_dictionary() : + * This is an experimental API that allows for the efficient use of a + * static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_streamHC_t into a + * working LZ4_streamHC_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDictHC() should + * be expected to work. + * + * Alternatively, the provided dictionary stream pointer may be NULL, in which + * case any existing dictionary stream is unset. + * + * A dictionary should only be attached to a stream without any history (i.e., + * a stream that has just been reset). + * + * The dictionary will remain attached to the working stream only for the + * current stream session. Calls to LZ4_resetStreamHC(_fast) will remove the + * dictionary context association from the working stream. 
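+ * A typical sequence might be (illustrative; variable names are assumed):
+ *     LZ4_loadDictHC(&dict_stream, dict_buf, dict_size);
+ *     LZ4_resetStreamHC_fast(&work_stream, compressionLevel);
+ *     LZ4_attach_HC_dictionary(&work_stream, &dict_stream);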
The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the lifetime of the stream session. + */ +LZ4LIB_STATIC_API void LZ4_attach_HC_dictionary( + LZ4_streamHC_t *working_stream, + const LZ4_streamHC_t *dictionary_stream); + +#if defined (__cplusplus) +} +#endif + +#endif /* LZ4_HC_SLO_098092834 */ +#endif /* LZ4_HC_STATIC_LINKING_ONLY */ From 27fe1f6772e3105b970dacab3c147fefc65ff5a4 Mon Sep 17 00:00:00 2001 From: Giovanni Bajo Date: Sun, 24 Sep 2023 23:18:53 +0200 Subject: [PATCH 12/27] tools: add mkasset tool to compress assets --- tools/Makefile | 8 +++- tools/mkasset/.gitignore | 3 ++ tools/mkasset/Makefile | 20 ++++++++++ tools/mkasset/mkasset.c | 84 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tools/mkasset/.gitignore create mode 100644 tools/mkasset/Makefile create mode 100644 tools/mkasset/mkasset.c diff --git a/tools/Makefile b/tools/Makefile index b6c6372066..0fb4b0ce11 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,6 +1,6 @@ INSTALLDIR ?= $(N64_INST) -all: chksum64 dumpdfs ed64romconfig mkdfs mksprite n64tool n64sym audioconv64 +all: chksum64 dumpdfs ed64romconfig mkdfs mksprite n64tool n64sym audioconv64 mkasset .PHONY: install install: all @@ -9,6 +9,7 @@ install: all $(MAKE) -C dumpdfs install $(MAKE) -C mkdfs install $(MAKE) -C mksprite install + $(MAKE) -C mkasset install $(MAKE) -C audioconv64 install .PHONY: clean @@ -17,6 +18,7 @@ clean: $(MAKE) -C dumpdfs clean $(MAKE) -C mkdfs clean $(MAKE) -C mksprite clean + $(MAKE) -C mkasset clean $(MAKE) -C audioconv64 clean chksum64: chksum64.c @@ -46,6 +48,10 @@ mkdfs: mksprite: $(MAKE) -C mksprite +.PHONY: mkasset +mkasset: + $(MAKE) -C mkasset + .PHONY: audioconv64 audioconv64: $(MAKE) -C audioconv64 diff --git a/tools/mkasset/.gitignore b/tools/mkasset/.gitignore new file mode 100644 index 0000000000..d9d4055432 --- /dev/null +++ b/tools/mkasset/.gitignore @@ -0,0 +1,3 @@ +mkasset +mkasset.exe + diff --git a/tools/mkasset/Makefile b/tools/mkasset/Makefile new file mode 100644 index 0000000000..d2ef88ff88 --- /dev/null +++ b/tools/mkasset/Makefile @@ -0,0 +1,20 @@ +INSTALLDIR = $(N64_INST) +CFLAGS += -std=gnu99 -O2 -Wall -Werror -Wno-unused-result -I../../include -MMD + +all: mkasset + +mkasset: mkasset.c ../common/assetcomp.c + @echo " [TOOL] mkasset" + $(CC) $(CFLAGS) -o $@ mkasset.c ../common/assetcomp.c + +install: mkasset + install -m 0755 mkasset $(INSTALLDIR)/bin + +.PHONY: clean install + +-include $(wildcard *.d) + +clean: + rm -f mkasset + rm -f *.d + diff --git a/tools/mkasset/mkasset.c b/tools/mkasset/mkasset.c new file mode 100644 index 0000000000..3aaa0b145a --- /dev/null +++ b/tools/mkasset/mkasset.c @@ -0,0 +1,84 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "../common/binout.c" +#include "../common/assetcomp.h" + +bool flag_verbose = false; + +void print_args(char * name) +{ + fprintf(stderr, "%s -- Libdragon asset compression tool\n\n", name); + fprintf(stderr, "This tool can be used to compress/decompress arbitrary asset files in a format\n"); + fprintf(stderr, "that can be loaded by the libdragon library. 
From 5eb51dec2509d82918adb01263a9b1596d0c7758 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:05:20 +0200
Subject: [PATCH 13/27] lzh5_compress: fix MinGW compilation by adding missing
 include

---
 tools/common/lzh5_compress.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/common/lzh5_compress.c b/tools/common/lzh5_compress.c
index f0c74bc8b9..d69cb857e2 100644
--- a/tools/common/lzh5_compress.c
+++ b/tools/common/lzh5_compress.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #undef DEBUG
 
From b4d16f6df2252a1a0c2996f538e370213d8b291c Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:39:51 +0200
Subject: [PATCH 14/27] debugcpp.h: fix pasto

---
 include/debugcpp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/debugcpp.h b/include/debugcpp.h
index b0ff4459f0..bd9e0311c9 100644
--- a/include/debugcpp.h
+++ b/include/debugcpp.h
@@ -19,7 +19,7 @@
     #define timer_init() ({ __debug_init_cpp(); timer_init(); })
     #define display_init(a,b,c,d,e) ({ __debug_init_cpp(); display_init(a,b,c,d,e); })
     #define debug_init_isviewer() ({ __debug_init_cpp(); debug_init_isviewer(); })
-    #define debug_init_usblog() ({ __debug_init_cpp(); debug_init_isviewer(); })
+    #define debug_init_usblog() ({ __debug_init_cpp(); debug_init_usblog(); })
     ///@endcond
 #endif
 
From b5e0c792e5b2e8d85c1c3a1ce7a4f7c7a5bef3c1 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:40:12 +0200
Subject: [PATCH 15/27] cop1.h: add C1_ENABLE_MASK / C1_CAUSE_MASK to group
 bits

---
 include/cop1.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/cop1.h b/include/cop1.h
index 6976235277..b7fb456acc 100644
--- a/include/cop1.h
+++ b/include/cop1.h
@@ -26,6 +26,7 @@
 #define C1_ENABLE_OVERFLOW       0x00000200  ///< Enable overflow exception
 #define C1_ENABLE_DIV_BY_0       0x00000400  ///< Enable division by zero exception
 #define C1_ENABLE_INVALID_OP     0x00000800  ///< Enable invalid operation exception
+#define C1_ENABLE_MASK           0x00000F80  ///< Mask for all enable bits
 
 #define C1_CAUSE_INEXACT_OP      0x00001000  ///< Triggered inexact operation exception
 #define C1_CAUSE_UNDERFLOW       0x00002000  ///< Triggered underflow exception
@@ -33,6 +34,7 @@
 #define C1_CAUSE_DIV_BY_0        0x00008000  ///< Triggered division by zero exception
 #define C1_CAUSE_INVALID_OP      0x00010000  ///< Triggered invalid operation exception
 #define C1_CAUSE_NOT_IMPLEMENTED 0x00020000  ///< Triggered not implemented exception
+#define C1_CAUSE_MASK            0x0003F000  ///< Mask for all cause bits
 
 #define C1_FCR31_FS (1<<24) ///< Flush denormals to zero/min
 
From a728166b8180348228440b45ac92a2d5609caa8b Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:43:42 +0200
Subject: [PATCH 16/27] regsinternal.h: add missing include

---
 include/regsinternal.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/regsinternal.h b/include/regsinternal.h
index 49aaaba0e4..4bb3a8f149 100644
--- a/include/regsinternal.h
+++ b/include/regsinternal.h
@@ -6,6 +6,8 @@
 #ifndef __LIBDRAGON_REGSINTERNAL_H
 #define __LIBDRAGON_REGSINTERNAL_H
 
+#include <stdint.h>
+
 /**
  * @defgroup lowlevel Low Level Hardware Interfaces
  * @ingroup libdragon
From 9898e15fd8e3f8e39894793a28f3dc50deb6b674 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:45:17 +0200
Subject: [PATCH 17/27] entrypoint.S: simplify readability of initial SP
 constant

---
 src/entrypoint.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/entrypoint.S b/src/entrypoint.S
index df6cae04f5..ec5bb39c70 100644
--- a/src/entrypoint.S
+++ b/src/entrypoint.S
@@ -36,7 +36,7 @@ _start:
 	li t0, 0x7C0000
 
 .Lset_sp:
-	li t1, 0x7FFFFFF0
+	li t1, 0x80000000 - 0x10    /* sp = KSEG0 + memsize - 0x10 */
 	addu sp,t0,t1               /* init stack */
 	la gp, _gp                  /* init data pointer */
 	li v0, 8
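
A quick sanity check of the new form, worked through editorially (same value as before): the assembler folds the constant expression, so t1 = 0x80000000 - 0x10 = 0x7FFFFFF0, exactly the literal it replaces. With the fallback memory size loaded just above (t0 = 0x7C0000), the following addu yields sp = 0x7C0000 + 0x7FFFFFF0 = 0x807BFFF0, that is, sixteen bytes below the top of that memory area in KSEG0.
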
From bb2c36e2cc35b18eeee01e5daa9e6c025a60e9bb Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:45:46 +0200
Subject: [PATCH 18/27] interrupt: move static variable to local scope

---
 src/interrupt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/interrupt.c b/src/interrupt.c
index 5873ac0d57..8ebb18b2a0 100644
--- a/src/interrupt.c
+++ b/src/interrupt.c
@@ -170,8 +170,6 @@
 /** @brief Tick at which the pre-NMI was triggered */
 static uint32_t __prenmi_tick;
 
-static int last_cart_interrupt_count = 0;
-
 /**
  * @brief Call each callback in a linked list of callbacks
  *
@@ -344,6 +342,7 @@ void __CART_handler(void)
        to do so, the console freezes because the interrupt will retrigger
        continuously. Since a freeze is always bad for debugging, try to
        detect it, and show a proper assertion screen. */
+    static int last_cart_interrupt_count = 0;
     if (!(C0_CAUSE() & C0_INTERRUPT_CART))
         last_cart_interrupt_count = 0;
     else
From 73a63bcd816f2c48a52f40f3639ffbaf2658c4af Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:46:00 +0200
Subject: [PATCH 19/27] inthandler: fix reference to reg_block_t in comment

---
 src/inthandler.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/inthandler.S b/src/inthandler.S
index 029fa6ff27..e06b8aa9de 100644
--- a/src/inthandler.S
+++ b/src/inthandler.S
@@ -28,7 +28,7 @@ inthandler:
 # So we keep 0-31 empty, and we start saving GPRs from 32, and then FPR. See
 # the other macros to see the actual layout.
 #
-# *NOTE*: this layout is also exposed in C via regblock_t in exception.h
+# *NOTE*: this layout is also exposed in C via reg_block_t in exception.h
 # Please keep in sync!
 #define EXC_STACK_SIZE (544+32)
 #define STACK_GPR 32
From d41bf29dc37c90747fe96ff9ed5d67de4ca179dd Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:48:29 +0200
Subject: [PATCH 20/27] exception: avoid using __builtin_isnan as well

It does seem to misbehave depending on compilation flags, so it seems better
to manually check for bit patterns rather than playing further cat and mouse
games with the compiler.
---
 src/exception.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/exception.c b/src/exception.c
index e9c9494926..3d7f6c7260 100644
--- a/src/exception.c
+++ b/src/exception.c
@@ -182,7 +182,7 @@ void __exception_dump_fpr(exception_t* ex, void (*cb)(void *arg, const char *reg
         // Open GCC bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66462
         if ((fpr32 & 0x7F800000) == 0 && (fpr32 & 0x007FFFFF) != 0)
             singlep = "<Denormal>";
-        else if (__builtin_isnan(f))
+        else if ((fpr32 & 0x7F800000) == 0x7F800000 && (fpr32 & 0x007FFFFF) != 0)
             singlep = "<NaN>";
         else if (__builtin_isinf(f))
             singlep = (f < 0) ? "<-Inf>" : "<+Inf>";
@@ -191,7 +191,7 @@ void __exception_dump_fpr(exception_t* ex, void (*cb)(void *arg, const char *reg
 
         if ((fpr64 & 0x7FF0000000000000ull) == 0 && (fpr64 & 0x000FFFFFFFFFFFFFull) != 0)
             doublep = "<Denormal>";
-        else if (__builtin_isnan(g))
+        else if ((fpr64 & 0x7FF0000000000000ull) == 0x7FF0000000000000ull && (fpr64 & 0x000FFFFFFFFFFFFFull) != 0)
             doublep = "<NaN>";
         else if (__builtin_isinf(g))
             doublep = (g < 0) ? "<-Inf>" : "<+Inf>";
@@ -202,6 +202,7 @@ void __exception_dump_fpr(exception_t* ex, void (*cb)(void *arg, const char *reg
     }
 }
 
+#ifndef NDEBUG
 static void debug_exception(exception_t* ex) {
     debugf("\n\n******* CPU EXCEPTION *******\n");
     __exception_dump_header(stderr, ex);
@@ -225,6 +226,7 @@ static void debug_exception(exception_t* ex) {
         debugf("\n");
     }
 }
+#endif
 
 /**
  * @brief Default exception handler.
@@ -234,6 +236,7 @@ static void debug_exception(exception_t* ex) {
  * of all GPR/FPR registers. It then calls abort() to abort execution.
  */
 void exception_default_handler(exception_t* ex) {
+    #ifndef NDEBUG
     static bool backtrace_exception = false;
 
     // Write immediately as much data as we can to the debug spew. This is the
@@ -250,6 +253,7 @@ void exception_default_handler(exception_t* ex) {
 
     // Run the inspector
     __inspector_exception(ex);
+    #endif
 
     abort();
 }
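
For readers verifying the magic numbers above (an editorial sketch, not part of the patch): an IEEE-754 single is NaN when all eight exponent bits are set and the mantissa is non-zero, and denormal when the exponent is all zero and the mantissa is non-zero. In miniature:

    #include <stdint.h>
    #include <stdbool.h>

    static bool f32_is_nan(uint32_t bits) {
        return (bits & 0x7F800000) == 0x7F800000 && (bits & 0x007FFFFF) != 0;
    }
    static bool f32_is_denormal(uint32_t bits) {
        return (bits & 0x7F800000) == 0 && (bits & 0x007FFFFF) != 0;
    }
    /* Examples: f32_is_nan(0x7FC00000) is true (quiet NaN);
       f32_is_denormal(0x00000001) is true (smallest denormal);
       both are false for 0x3F800000 (1.0f). */
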
From 8dc4e4a8738d5c88812bcb857feb367e5bd02b21 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:48:45 +0200
Subject: [PATCH 21/27] n64.mk: fix quoting of current directory in command
 line

---
 n64.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/n64.mk b/n64.mk
index 3249a0a5a2..f62dc72ad3 100644
--- a/n64.mk
+++ b/n64.mk
@@ -37,7 +37,7 @@ N64_AUDIOCONV = $(N64_BINDIR)/audioconv64
 
 N64_C_AND_CXX_FLAGS =  -march=vr4300 -mtune=vr4300 -I$(N64_INCLUDEDIR)
 N64_C_AND_CXX_FLAGS += -falign-functions=32   # NOTE: if you change this, also change backtrace() in backtrace.c
-N64_C_AND_CXX_FLAGS += -ffunction-sections -fdata-sections -g -ffile-prefix-map=$(CURDIR)=
+N64_C_AND_CXX_FLAGS += -ffunction-sections -fdata-sections -g -ffile-prefix-map="$(CURDIR)"=
 N64_C_AND_CXX_FLAGS += -DN64 -O2 -Wall -Werror -Wno-error=deprecated-declarations -fdiagnostics-color=always
 N64_CFLAGS = $(N64_C_AND_CXX_FLAGS) -std=gnu99
 N64_CXXFLAGS = $(N64_C_AND_CXX_FLAGS)
From 9cfe59feb479ff7d985807f11e195159605dfa49 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:48:55 +0200
Subject: [PATCH 22/27] Improve compilation with NDEBUG

Currently, an NDEBUG build is broken because of compilation and linking
errors. It is currently not tested by CI. This commit fixes a few issues
and makes sure it works.

Going forward, we should probably cover this with CI if we want to make
sure it always works.
---
 include/rsp.h   | 17 +++++++++++++--
 src/debug.c     | 11 ++++++++++-
 src/inspector.c |  2 ++
 src/rsp.c       |  6 ++++++
 4 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/include/rsp.h b/include/rsp.h
index 27f14b6862..80fa19cfb3 100644
--- a/include/rsp.h
+++ b/include/rsp.h
@@ -1,5 +1,5 @@
 /**
- * @defgroup rsp RSP interface
+ * @defgroup rsp RSP: vector coprocessor
  * @ingroup lowlevel
  * @brief RSP basic library and command queue
  *
@@ -157,6 +157,7 @@
 
 #include <stdint.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -468,9 +469,13 @@ void rsp_read_data(void* data, unsigned long size, unsigned int dmem_offset);
  *
  * @see #rsp_crashf
  */
+#ifndef NDEBUG
 #define rsp_crash()  ({ \
     __rsp_crash(__FILE__, __LINE__, __func__, NULL); \
-})
+})
+#else
+#define rsp_crash()  abort()
+#endif
 
 /**
  * @brief Abort the program showing a RSP crash screen with a symptom message.
@@ -487,9 +492,13 @@ void rsp_read_data(void* data, unsigned long size, unsigned int dmem_offset);
  *
  * @see #rsp_crash
  */
+#ifndef NDEBUG
 #define rsp_crashf(msg, ...) ({ \
     __rsp_crash(__FILE__, __LINE__, __func__, msg, ##__VA_ARGS__); \
 })
+#else
+#define rsp_crashf(msg, ...) abort()
+#endif
 
 /**
  * @brief Create a loop that waits for some condition that is related to RSP,
@@ -549,9 +558,13 @@ void run_ucode(void) {
 // Internal function used by rsp_crash and rsp_crashf. These are not part
 // of the public API of rsp.h. Do not call them directly.
 /// @cond
+#ifndef NDEBUG
 void __rsp_crash(const char *file, int line, const char *func, const char *msg, ...)
    __attribute__((noreturn, format(printf, 4, 5)));
 void __rsp_check_assert(const char *file, int line, const char *func);
+#else
+static inline void __rsp_check_assert(const char *file, int line, const char *func) {}
+#endif
 /// @endcond
 
 #ifdef __cplusplus
diff --git a/src/debug.c b/src/debug.c
index 4d40a45623..4a0bb560b5 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -2,7 +2,7 @@
 /**
  * @file debug.c
  * @brief Debugging Support
  */
-
+#ifndef NDEBUG
 #include 
 #include 
 #include 
@@ -643,3 +643,12 @@ void debug_backtrace(void)
 {
     __debug_backtrace(stderr, false);
 }
+#else
+
+#include <stdlib.h>
+
+void debug_assert_func(...) {
+    abort();
+}
+
+#endif
diff --git a/src/inspector.c b/src/inspector.c
index 2b01060824..19a325b02c 100644
--- a/src/inspector.c
+++ b/src/inspector.c
@@ -544,6 +544,7 @@ void __inspector_cppexception(const char *exctype, const char *what) {
     __builtin_unreachable();
 }
 
+#ifndef NDEBUG
 __attribute__((constructor))
 void __inspector_init(void) {
     // Register SYSCALL 0x1 for assertion failures
@@ -553,3 +554,4 @@ void __inspector_init(void) {
     }
     register_syscall_handler(handler, 0x00001, 0x00002);
 }
+#endif
diff --git a/src/rsp.c b/src/rsp.c
index 341045f8b3..15f304e3ce 100644
--- a/src/rsp.c
+++ b/src/rsp.c
@@ -39,7 +39,9 @@ static void __SP_DMA_wait(void)
 
 static void rsp_interrupt(void)
 {
+    #ifndef NDEBUG
     __rsp_check_assert(__FILE__, __LINE__, __func__);
+    #endif
 }
 
 void rsp_init(void)
@@ -168,6 +170,7 @@ void rsp_run(void)
     rsp_wait();
 }
 
+#ifndef NDEBUG
 /// @cond
 // Check if the RSP has hit an internal assert, and call rsp_crash if so.
 // This function is invoked by #RSP_WAIT_LOOP while waiting for the RSP
@@ -194,7 +197,9 @@ void __rsp_check_assert(const char *file, int line, const char *func)
     }
 }
 /// @endcond
+#endif /* NDEBUG */
 
+#ifndef NDEBUG
 /// @cond
 // RSP crash handler implementation
 __attribute__((noreturn, format(printf, 4, 5)))
@@ -403,5 +408,6 @@ void __rsp_crash(const char *file, int line, const char *func, const char *msg,
     abort();
 }
 /// @endcond
+#endif /* NDEBUG */
 
 extern inline void rsp_run_async(void);
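
The recurring pattern in this patch, reduced to a miniature for clarity (editorial sketch with hypothetical names, not code from the tree): keep the API shape identical in both builds, and let the release build degrade to a plain abort():

    #include <stdlib.h>

    #ifndef NDEBUG
    void my_crash_impl(const char *file, int line);     /* rich crash screen */
    #define my_crash() my_crash_impl(__FILE__, __LINE__)
    #else
    #define my_crash() abort()                          /* no debug machinery linked in */
    #endif
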
From 851a35fd5175fed744229ec93f154ae94c32882a Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:53:20 +0200
Subject: [PATCH 23/27] system.c: improve errno usage in open()

Currently, open() is a bit sloppy in its errno usage; specifically, it does
not allow the filesystem open call to set it. Change it so that filesystem
implementations can now set errno to propagate a detailed error to the
caller.

Also remove a weird global errno redefinition; it is not clear why it was
ever added, but everything seems fine without it.
---
 src/system.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/system.c b/src/system.c
index 406a7ac5f8..61391947f6 100644
--- a/src/system.c
+++ b/src/system.c
@@ -19,8 +19,6 @@
 #include "system.h"
 #include "n64sys.h"
 
-#undef errno
-
 /**
  * @defgroup system newlib Interface Hooks
  * @brief System hooks to provide low level threading and filesystem functionality to newlib.
@@ -95,11 +93,6 @@
  */
 char *__env[1] = { 0 };
 
-/**
- * @brief Definition of errno, as it's defined as extern across stdlib
- */
-int errno __attribute__((weak));
-
 /**
  * @brief Assert function pointer (initialized at startup)
  */
@@ -892,10 +885,15 @@ int open( const char *file, int flags, ... )
 
     if( mapping < 0 )
     {
-        errno = ENOMEM;
+        errno = EINVAL;
         return -1;
     }
 
+    /* Clear errno so we can check whether the fs->open() call sets it.
+       This is for backward compatibility, because we used not to require
+       errno to be set. */
+    errno = 0;
+
     /* Cast away const from the file name.  open used to mistakenly
        take a char * instead of a const char *, and we don't want
        to break existing code for filesystem_t.open,
@@ -915,14 +913,15 @@ int open( const char *file, int flags, ... )
         else
         {
             /* Couldn't open for some reason */
-            errno = EPERM;
+            if( errno == 0 )
+                errno = ENOENT;
             return -1;
         }
     }
 
     /* No file handles available */
-    errno = ENOMEM;
+    errno = ENFILE;
     return -1;
 }
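
With this in place, callers get a meaningful errno instead of a blanket EPERM. An illustrative sketch of what user code can now do ("rom:/file.dat" is a placeholder path, and ENOENT assumes the mounted filesystem reports a missing file):

    #include <stdio.h>
    #include <errno.h>
    #include <string.h>

    void report_open(void)
    {
        FILE *f = fopen("rom:/file.dat", "rb");
        if (f == NULL) {
            /* errno now comes from the filesystem implementation,
               e.g. ENOENT for a missing file */
            printf("fopen failed: %s\n", strerror(errno));
            return;
        }
        fclose(f);
    }
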
From 56e1fe95c32a29368313958a138ba9d3529d5c62 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:42:48 +0200
Subject: [PATCH 24/27] dragonfs: improve management of error codes

This commit does three things that are related:

* Rename DFS_ENOMEM to DFS_ENFILE to more closely match the equivalent
  POSIX error (ENFILE)
* Add dfs_strerror() to simplify reporting errors produced by dfs
* Correctly set errno on fopen() failure, so that callers can find out
  why a fopen() failed
---
 include/dragonfs.h | 12 ++++++++++--
 src/dragonfs.c     | 43 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/include/dragonfs.h b/include/dragonfs.h
index ff6383c61c..81f0212657 100644
--- a/include/dragonfs.h
+++ b/include/dragonfs.h
@@ -48,12 +48,18 @@
 #define DFS_ENOFILE -2
 /** @brief Bad filesystem */
 #define DFS_EBADFS -3
-/** @brief No memory for operation */
-#define DFS_ENOMEM -4
+/** @brief Too many open files */
+#define DFS_ENFILE -4
 /** @brief Invalid file handle */
 #define DFS_EBADHANDLE -5
 /** @} */
 
+/** @cond */
+// Deprecated naming
+#define DFS_ENOMEM -4
+/** @endcond */
+
+
 /**
  * @brief Macro to extract the file type from a DragonFS file flag
  *
@@ -96,6 +102,8 @@
 int dfs_eof(uint32_t handle);
 int dfs_size(uint32_t handle);
 uint32_t dfs_rom_addr(const char *path);
+
+const char *dfs_strerror(int error);
 
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/dragonfs.c b/src/dragonfs.c
index 17976d513d..ee99da8672 100644
--- a/src/dragonfs.c
+++ b/src/dragonfs.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include <errno.h>
 #include "libdragon.h"
 #include "system.h"
 #include "dfsinternal.h"
@@ -14,6 +15,7 @@
 
 /**
  * @defgroup dfs DragonFS
+ * @ingroup asset
  * @brief DragonFS filesystem implementation and newlib hooks.
  *
  * DragonFS is a read only ROM filesystem for the N64. It provides an interface
@@ -38,9 +40,15 @@
  * simultaneously.
  *
 * When DFS is initialized, it will register itself with newlib using 'rom:/' as a prefix.
- * Files can be accessed either with standard POSIX functions and the 'rom:/' prefix or
- * with DFS API calls and no prefix. Files can be opened using both sets of API calls
- * simultaneously as long as no more than four files are open at any one time.
+ * Files can be accessed either with standard POSIX functions (open, fopen) using the 'rom:/'
+ * prefix or the lower-level DFS API calls without prefix. In most cases, it is not necessary
+ * to use the DFS API directly, given that the standard C functions are more comprehensive.
+ * Files can be opened using both sets of API calls simultaneously as long as no more than
+ * four files are open at any one time.
+ *
+ * DragonFS does not support file compression; if you want to compress your assets,
+ * use the asset API (#asset_load / #asset_fopen).
+ *
  * @{
  */
@@ -766,7 +774,7 @@ int dfs_open(const char * const path)
 
     if(!file)
     {
-        return DFS_ENOMEM;
+        return DFS_ENFILE;
    }
 
     /* Try to find file */
@@ -1130,8 +1138,17 @@ static void *__open( char *name, int flags )
     /* We disregard flags here */
     int handle = dfs_open( name );
 
-    if (handle <= 0)
+    if (handle <= 0) {
+        switch (handle) {
+            case DFS_EBADINPUT:  errno = EINVAL; break;
+            case DFS_ENOFILE:    errno = ENOENT; break;
+            case DFS_EBADFS:     errno = ENODEV; break;
+            case DFS_ENFILE:     errno = ENFILE; break;
+            case DFS_EBADHANDLE: errno = EBADF;  break;
+            default:             errno = EPERM;  break;
+        }
         return NULL;
+    }
 
     return (void *)handle;
 }
@@ -1386,4 +1403,20 @@ int dfs_init(uint32_t base_fs_loc)
     return DFS_ESUCCESS;
 }
 
+/**
+ * @brief Convert DFS error code into an error string
+ */
+const char *dfs_strerror(int error)
+{
+    switch (error) {
+        case DFS_ESUCCESS:   return "Success";
+        case DFS_EBADFS:     return "Bad filesystem";
+        case DFS_ENOFILE:    return "File not found";
+        case DFS_EBADINPUT:  return "Invalid argument";
+        case DFS_ENFILE:     return "No free file handles";
+        case DFS_EBADHANDLE: return "Bad file handle";
+        default:             return "Unknown error";
+    }
+}
+
 /** @} */
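
An illustrative sketch of how the new helper is meant to be used from application code (not part of the patch; error handling shortened):

    #include <stdio.h>
    #include <libdragon.h>

    void show_file_size(const char *path)
    {
        int fh = dfs_open(path);    /* e.g. "hello.txt", no rom:/ prefix */
        if (fh < 0) {
            printf("dfs_open(%s): %s\n", path, dfs_strerror(fh));
            return;
        }
        printf("%s: %d bytes\n", path, dfs_size(fh));
        dfs_close(fh);
    }
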
From a1d5c45cc6833099d19cc74eb2866482fbb8e910 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Mon, 25 Sep 2023 00:50:33 +0200
Subject: [PATCH 25/27] xm64/ym64: use must_fopen

---
 src/audio/xm64.c | 4 ++--
 src/audio/ym64.c | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/audio/xm64.c b/src/audio/xm64.c
index 5faabf247e..beced0821a 100644
--- a/src/audio/xm64.c
+++ b/src/audio/xm64.c
@@ -6,6 +6,7 @@
 #include 
 #include "wav64internal.h"
+#include "asset_internal.h"
 #include "libxm/xm.h"
 #include "libxm/xm_internal.h"
 #include 
@@ -95,8 +96,7 @@ void xm64player_open(xm64player_t *player, const char *fn) {
     // No pending seek at the moment, we start from beginning anyway.
     player->seek.patidx = -1;
 
-    player->fh = fopen(fn, "rb");
-    assertf(player->fh, "Cannot open file: %s", fn);
+    player->fh = must_fopen(fn);
 
     // Load the XM context
     int sample_rate = audio_get_frequency();
diff --git a/src/audio/ym64.c b/src/audio/ym64.c
index 2ecda3dd4d..796c88c37f 100644
--- a/src/audio/ym64.c
+++ b/src/audio/ym64.c
@@ -103,8 +103,7 @@ static void ym_wave_read(void *ctx, samplebuffer_t *sbuf, int wpos, int wlen, bo
 
 void ym64player_open(ym64player_t *player, const char *fn, ym64player_songinfo_t *info) {
     memset(player, 0, sizeof(*player));
 
-    player->f = fopen(fn, "rb");
-    assertf(player->f != NULL, "Cannot open file: %s", fn);
+    player->f = must_fopen(fn);
 
     int offset = 0;
     int _ymread(void *buf, int sz) {
From 97d0c4112cf5f6c736e714bb7cc64d53ed3b2fe0 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo
Date: Tue, 26 Sep 2023 22:34:29 +0200
Subject: [PATCH 26/27] compress: fix tabs/spaces mixup

---
 src/compress/ringbuf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compress/ringbuf.c b/src/compress/ringbuf.c
index 03362adf3b..46369ffde9 100644
--- a/src/compress/ringbuf.c
+++ b/src/compress/ringbuf.c
@@ -23,9 +23,9 @@ void __ringbuf_copy(decompress_ringbuf_t *ringbuf, int copy_offset, uint8_t *dst
     int ringbuf_copy_pos = (ringbuf->ringbuf_pos - copy_offset) & (RING_BUFFER_SIZE-1);
     int dst_pos = 0;
     while (count > 0) {
-        int wn = count;
+        int wn = count;
         wn = wn < RING_BUFFER_SIZE - ringbuf_copy_pos ? wn : RING_BUFFER_SIZE - ringbuf_copy_pos;
-        wn = wn < RING_BUFFER_SIZE - ringbuf->ringbuf_pos ? wn : RING_BUFFER_SIZE - ringbuf->ringbuf_pos;
+        wn = wn < RING_BUFFER_SIZE - ringbuf->ringbuf_pos ? wn : RING_BUFFER_SIZE - ringbuf->ringbuf_pos;
         count -= wn;
 
         // Check if there's an overlap in the ring buffer between read and write pos, in which
From e1db7bbb2a86ae25a11b7a603a2c21085f00e6f7 Mon Sep 17 00:00:00 2001
From: Christopher Bonhage
Date: Tue, 26 Sep 2023 09:15:08 -0400
Subject: [PATCH 27/27] Fix Make 3.8 compatibility issues in examples Makefile

---
 examples/Makefile | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/Makefile b/examples/Makefile
index 633ed4514b..e0a82524f4 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,16 +1,14 @@
 EXAMPLES = audioplayer cpptest ctest dfsdemo eepromfstest mixertest mptest mputest rspqdemo spritemap test timers vrutest vtest ucodetest
 
-# Populated by evaluating the EXAMPLE_template macro for each example
-all:
-clean:
+all: $(EXAMPLES)
+clean: $(foreach example,$(EXAMPLES),$(example)-clean)
+.PHONY: all clean
 
-define EXAMPLE_template =
+define EXAMPLE_template
 $(1):
 	$$(MAKE) -C $(1)
 
 $(1)-clean:
 	$$(MAKE) -C $(1) clean
-all: $(1)
-clean: $(1)-clean
 .PHONY: $(1) $(1)-clean
 endef
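
Why this matters (editorial note): GNU Make only learned the optional assignment token after "define" in version 3.82, so under Make 3.81 the old "define EXAMPLE_template =" line created a variable whose name literally ended in " =", and the per-example rules silently expanded to nothing. The template is then presumably instantiated further down the file with the standard eval/call idiom (the exact line is outside this hunk):

    $(foreach example,$(EXAMPLES),$(eval $(call EXAMPLE_template,$(example))))
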