diff --git a/Makefile b/Makefile
index 789c92666e..bc56130c20 100755
--- a/Makefile
+++ b/Makefile
@@ -70,6 +70,7 @@ install-mk: n64.mk
 install: install-mk libdragon
 	install -Cv -m 0644 libdragon.a $(INSTALLDIR)/mips64-elf/lib/libdragon.a
 	install -Cv -m 0644 n64.ld $(INSTALLDIR)/mips64-elf/lib/n64.ld
+	install -Cv -m 0644 rsp.ld $(INSTALLDIR)/mips64-elf/lib/rsp.ld
 	install -Cv -m 0644 header $(INSTALLDIR)/mips64-elf/lib/header
 	install -Cv -m 0644 libdragonsys.a $(INSTALLDIR)/mips64-elf/lib/libdragonsys.a
 	install -Cv -m 0644 include/pputils.h $(INSTALLDIR)/mips64-elf/include/pputils.h
diff --git a/include/pputils.h b/include/pputils.h
index ec18d13278..ec3d4b2a09 100644
--- a/include/pputils.h
+++ b/include/pputils.h
@@ -63,6 +63,42 @@
 #define __PPCAT2(n,x) n ## x
 #define __PPCAT(n,x) __PPCAT2(n,x)
 
+// __CALL_FOREACH_BIS: like __CALL_FOREACH, but it can be used from within the expansion of a
+// __CALL_FOREACH (a macro is not re-expanded inside its own expansion). Handles 0 to 31 items.
+#define __FEB_0(_call, ...)
+#define __FEB_1(_call, x)       _call(x)
+#define __FEB_2(_call, x, ...)  _call(x) __FEB_1(_call, __VA_ARGS__)
+#define __FEB_3(_call, x, ...)  _call(x) __FEB_2(_call, __VA_ARGS__)
+#define __FEB_4(_call, x, ...)  _call(x) __FEB_3(_call, __VA_ARGS__)
+#define __FEB_5(_call, x, ...)  _call(x) __FEB_4(_call, __VA_ARGS__)
+#define __FEB_6(_call, x, ...)  _call(x) __FEB_5(_call, __VA_ARGS__)
+#define __FEB_7(_call, x, ...)  _call(x) __FEB_6(_call, __VA_ARGS__)
+#define __FEB_8(_call, x, ...)  _call(x) __FEB_7(_call, __VA_ARGS__)
+#define __FEB_9(_call, x, ...)  _call(x) __FEB_8(_call, __VA_ARGS__)
+#define __FEB_10(_call, x, ...) _call(x) __FEB_9(_call, __VA_ARGS__)
+#define __FEB_11(_call, x, ...) _call(x) __FEB_10(_call, __VA_ARGS__)
+#define __FEB_12(_call, x, ...) _call(x) __FEB_11(_call, __VA_ARGS__)
+#define __FEB_13(_call, x, ...) _call(x) __FEB_12(_call, __VA_ARGS__)
+#define __FEB_14(_call, x, ...) _call(x) __FEB_13(_call, __VA_ARGS__)
+#define __FEB_15(_call, x, ...) _call(x) __FEB_14(_call, __VA_ARGS__)
+#define __FEB_16(_call, x, ...) _call(x) __FEB_15(_call, __VA_ARGS__)
+#define __FEB_17(_call, x, ...) _call(x) __FEB_16(_call, __VA_ARGS__)
+#define __FEB_18(_call, x, ...) _call(x) __FEB_17(_call, __VA_ARGS__)
+#define __FEB_19(_call, x, ...) _call(x) __FEB_18(_call, __VA_ARGS__)
+#define __FEB_20(_call, x, ...) _call(x) __FEB_19(_call, __VA_ARGS__)
+#define __FEB_21(_call, x, ...) _call(x) __FEB_20(_call, __VA_ARGS__)
+#define __FEB_22(_call, x, ...) _call(x) __FEB_21(_call, __VA_ARGS__)
+#define __FEB_23(_call, x, ...) _call(x) __FEB_22(_call, __VA_ARGS__)
+#define __FEB_24(_call, x, ...) _call(x) __FEB_23(_call, __VA_ARGS__)
+#define __FEB_25(_call, x, ...) _call(x) __FEB_24(_call, __VA_ARGS__)
+#define __FEB_26(_call, x, ...) _call(x) __FEB_25(_call, __VA_ARGS__)
+#define __FEB_27(_call, x, ...) _call(x) __FEB_26(_call, __VA_ARGS__)
+#define __FEB_28(_call, x, ...) _call(x) __FEB_27(_call, __VA_ARGS__)
+#define __FEB_29(_call, x, ...) _call(x) __FEB_28(_call, __VA_ARGS__)
+#define __FEB_30(_call, x, ...) _call(x) __FEB_29(_call, __VA_ARGS__)
+#define __FEB_31(_call, x, ...) _call(x) __FEB_30(_call, __VA_ARGS__)
+#define __CALL_FOREACH_BIS(fn, ...)  __GET_33RD_ARG("ignored", ##__VA_ARGS__, __FEB_31, __FEB_30, __FEB_29, __FEB_28, __FEB_27, __FEB_26, __FEB_25, __FEB_24, __FEB_23, __FEB_22, __FEB_21, __FEB_20, __FEB_19, __FEB_18, __FEB_17, __FEB_16, __FEB_15, __FEB_14, __FEB_13, __FEB_12, __FEB_11, __FEB_10, __FEB_9, __FEB_8, __FEB_7, __FEB_6, __FEB_5, __FEB_4, __FEB_3, __FEB_2, __FEB_1, __FEB_0)(fn, ##__VA_ARGS__)
+
 /// @endcond
 
 #endif
diff --git a/include/rsp.inc b/include/rsp.inc
index 37b2e6c789..326694ff70 100644
--- a/include/rsp.inc
+++ b/include/rsp.inc
@@ -1160,11 +1160,20 @@ makeMxc2Op mfc2, 0x0
         .align 4
     V_SHIFT:    .half 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1
 
-    .macro setup_vsll vshiftreg
-        .set noat
-        la $1,%lo(V_SHIFT)
-        lqv \vshiftreg,0,  0,$1
-        .set at
+    .macro setup_vsll vshiftreg, emitload
+        .ifnb \emitload
+            .ifgt \emitload
+                .set noat
+                la $1,%lo(V_SHIFT)
+                lqv \vshiftreg,0,  0,$1
+                .set at
+            .endif
+        .else
+            .set noat
+            la $1,%lo(V_SHIFT)
+            lqv \vshiftreg,0,  0,$1
+            .set at
+        .endif
 
         .macro vsll vdstreg, vsrcreg, qty
             .if (\qty == 7)
@@ -1241,11 +1250,21 @@ makeMxc2Op mfc2, 0x0
         .align 4
     V_SHIFT8:   .half 0x8000, 0x4000, 0x2000, 0x1000, 0x800, 0x400, 0x200, 0x100
 
-    .macro setup_vsll8 vshiftreg
-        .set noat
-        la $1,%lo(V_SHIFT8)
-        lqv \vshiftreg,0,  0,$1
-        .set at
+
+    .macro setup_vsll8 vshiftreg, emitload
+        .ifnb \emitload
+            .ifgt \emitload
+                .set noat
+                la $1,%lo(V_SHIFT8)
+                lqv \vshiftreg,0,  0,$1
+                .set at
+            .endif
+        .else
+            .set noat
+            la $1,%lo(V_SHIFT8)
+            lqv \vshiftreg,0,  0,$1
+            .set at
+        .endif
 
         .macro vsll8 vdstreg, vsrcreg, qty
             .if (\qty == 15)
@@ -1504,6 +1523,24 @@ makeMxc2Op mfc2, 0x0
         lui $1, \code
         .set at
     .endm
+    .macro assert_ge v0, v1, code
+        blt \v0, \v1, assertion_failed      # fail when first < second (signed compare)
+        .set noat
+        lui $1, \code                       # assertion code goes into the upper 16 bits of $1
+        .set at
+    .endm
+    .macro assert_gt v0, v1, code
+        ble \v0, \v1, assertion_failed      # fail when first <= second (signed compare)
+        .set noat
+        lui $1, \code                       # assertion code goes into the upper 16 bits of $1
+        .set at
+    .endm
+    .macro assert_lt v0, v1, code
+        bge \v0, \v1, assertion_failed      # fail when first >= second (signed compare)
+        .set noat
+        lui $1, \code                       # assertion code goes into the upper 16 bits of $1
+        .set at
+    .endm
 
 #else
     .macro assert code
@@ -1512,6 +1549,12 @@ makeMxc2Op mfc2, 0x0
     .endm
     .macro assert_ne v0, v1, code
     .endm
+    .macro assert_ge v0, v1, code       # empty body: emits no code in this branch
+    .endm
+    .macro assert_gt v0, v1, code       # empty body: emits no code in this branch
+    .endm
+    .macro assert_lt v0, v1, code       # empty body: emits no code in this branch
+    .endm
 #endif
 
 #endif /* RSP_INC */
diff --git a/include/rsp_queue.inc b/include/rsp_queue.inc
index ab9cac2f29..6466ee8a0c 100644
--- a/include/rsp_queue.inc
+++ b/include/rsp_queue.inc
@@ -167,6 +167,26 @@ _RSPQ_SAVED_STATE_END:
 # The total command size needs to be specified as well.
 #define CMD_ADDR(offset, cmdsize) (%lo(RSPQ_DMEM_BUFFER) + (offset) - (cmdsize)) (rspq_dmem_buf_ptr)
 
+# This register is initialized to zero any time a command is called
+#define vzero    $v00
+
+# These registers are initialized with the constant data (powers of two)
+# that the vector shift macros need in order to work.
+#define vshift   $v30
+#define vshift8  $v31
+
+# We also define direct access to small constants, as they can be useful in some
+# calculations.
+#define K1       vshift,e(7)
+#define K2       vshift,e(6)
+#define K4       vshift,e(5)
+#define K8       vshift,e(4)
+#define K16      vshift,e(3)
+#define K32      vshift,e(2)
+#define K64      vshift,e(1)
+#define K128     vshift,e(0)
+
+
 ########################################################
 #
 # The following is the actual implementation of the rsp engine.
@@ -185,6 +205,11 @@ _RSPQ_SAVED_STATE_END:
     .data
 _data_start:
 
+# Data for vector shift registers.
+# We put this at the top of the DMEM as we need an absolute address to save one opcode.
+    vsll_data
+    vsll8_data
+
 # Overlay tables. See rsp_overlay_t in rsp.c
 RSPQ_OVERLAY_TABLE:           .ds.b RSPQ_OVERLAY_TABLE_SIZE
 RSPQ_OVERLAY_DESCRIPTORS:     .ds.b (RSPQ_OVERLAY_DESC_SIZE * RSPQ_MAX_OVERLAY_COUNT)
@@ -232,6 +257,10 @@ _ovl_data_start:
 
     .text
 
+    # Just declare the shift macros, without emitting code. We will be emitting it later
+    setup_vsll      vshift, 0
+    setup_vsll8    vshift8, 0
+
     .globl _start
 _start:
     li rspq_dmem_buf_ptr, 0
@@ -381,6 +410,11 @@ rspq_execute_command:
     lw a3, %lo(RSPQ_DMEM_BUFFER) + 0xC (rspq_dmem_buf_ptr)
     add rspq_dmem_buf_ptr, rspq_cmd_size
 
+    # Initialize vzero, vshift, vshift8.
+    vxor vzero, vzero,0
+    lqv vshift,  0x00,zero
+    lqv vshift8, 0x10,zero
+
     # Jump to command. Set ra to the loop function, so that commands can 
     # either do "j RSPQ_Loop" or "jr ra" (or a tail call) to get back to the main loop
     sll cmd_desc, 2
diff --git a/n64.mk b/n64.mk
index a87517d476..426820e64e 100644
--- a/n64.mk
+++ b/n64.mk
@@ -107,7 +107,8 @@ $(BUILD_DIR)/%.o: $(SOURCE_DIR)/%.S
 		DATASECTION="$(basename $@).data"; \
 		BINARY="$(basename $@).elf"; \
 		echo "    [RSP] $<"; \
-		$(N64_CC) $(RSPASFLAGS) -nostartfiles -Wl,-Ttext=0x1000 -Wl,-Tdata=0x0 -Wl,-e0x1000 -o $$BINARY $<; \
+		$(N64_CC) $(RSPASFLAGS) -nostartfiles -Wl,-Trsp.ld -Wl,--gc-sections -o $@ $<; \
+		mv "$@" $$BINARY; \
 		$(N64_OBJCOPY) -O binary -j .text $$BINARY $$TEXTSECTION.bin; \
 		$(N64_OBJCOPY) -O binary -j .data $$BINARY $$DATASECTION.bin; \
 		$(N64_OBJCOPY) -I binary -O elf32-bigmips -B mips4300 \
diff --git a/rsp.ld b/rsp.ld
new file mode 100644
index 0000000000..e1490f0aa3
--- /dev/null
+++ b/rsp.ld
@@ -0,0 +1,50 @@
+/* 
+ * rsp.ld: Linker script for rsp ucode. 
+ */
+
+OUTPUT_FORMAT ("elf32-bigmips", "elf32-bigmips", "elf32-littlemips")
+OUTPUT_ARCH (mips)
+ENTRY (_start)
+
+MEMORY
+{
+    /* This is the layout in ROM. */
+    rom_dmem : ORIGIN = 0x0000, LENGTH = 0x1000
+    rom_imem : ORIGIN = 0x1000, LENGTH = 0x1000
+
+    /* This is a workaround to make ld place text symbols at the correct addresses (0x0 - 0x1000).
+       The RSP technically uses a Harvard architecture (https://en.wikipedia.org/wiki/Harvard_architecture),
+       which means that it uses different address spaces for instruction and data accesses.
+       Because ld is not designed for such architectures, we need to place the data section somewhere different,
+       since it would otherwise overlap the text section. As a workaround, we place it at 0xA4000000 (the
+       address at which the VR4300 sees DMEM through its uncached KSEG1 segment; physical 0x04000000).
+       Because the RSP only uses the lower 12 bits of any address, this works out fine (as long as we
+       always wrap data addresses in "%lo()").
+       Note that this is not actually required to run the ucode correctly (instruction addresses above 0x1000 are truncated anyway),
+       but it makes debugging with gdb a lot easier (e.g. using this fork of cen64 https://github.com/lambertjamesd/cen64).
+     */
+    ram_data : ORIGIN = 0xA4000000, LENGTH = 0x1000
+    ram_text : ORIGIN = 0x00000000, LENGTH = 0x1000
+}
+
+SECTIONS
+{
+    .text : { 
+        KEEP(*(.text))              /* KEEP: exempt from --gc-sections removal */
+        *(.text.*)
+    } > ram_text AT > rom_imem      /* addressed in ram_text, load address in rom_imem */
+
+    .data : { 
+        KEEP(*(.data))              /* KEEP: exempt from --gc-sections removal */
+        *(.data.*)
+    } > ram_data AT > rom_dmem      /* addressed in ram_data, load address in rom_dmem */
+
+    . = ALIGN(8);                   /* round the location counter up to a multiple of 8 */
+
+    .bss  : { 
+        KEEP(*(.bss))               /* KEEP: exempt from --gc-sections removal */
+        *(.bss.*)
+    } > ram_data AT > rom_dmem      /* addressed in ram_data, load address in rom_dmem */
+
+    /DISCARD/ : { *(.MIPS.abiflags) }   /* leave the ABI flags section out of the output */
+}
diff --git a/src/audio/rsp_mixer.S b/src/audio/rsp_mixer.S
index 851ebda31b..9023658ddb 100644
--- a/src/audio/rsp_mixer.S
+++ b/src/audio/rsp_mixer.S
@@ -190,9 +190,6 @@ VCONST_1:
 	#define k_alpha     v_const1.e1
 	#define k_1malpha   v_const1.e2
 
-	vsll_data
-	vsll8_data
-
 	.align 4
 BANNER0:    .ascii "Dragon RSP Audio"
 BANNER1:    .ascii " Coded by Rasky "
@@ -270,9 +267,6 @@ OUTPUT_AREA:     .dcb.w MAX_SAMPLES_PER_LOOP*2
 
 
 command_exec:
-	setup_vsll  v_shift
-	setup_vsll8 v_shift8
-
 	#define samples_left    t4
 	#define outptr          s8
 
diff --git a/src/display.c b/src/display.c
index 25ee37777a..bf06051128 100644
--- a/src/display.c
+++ b/src/display.c
@@ -137,7 +137,7 @@ static void __write_dram_register( void const * const dram_val )
 {
     volatile uint32_t *reg_base = (uint32_t *)REGISTER_BASE;
 
-    reg_base[1] = (uint32_t)dram_val;
+    reg_base[1] = PhysicalAddr(dram_val);
     MEMORY_BARRIER();
 }
 
@@ -169,6 +169,7 @@ static void __display_callback()
     /* Least significant bit of the current line register indicates
        if the currently displayed field is odd or even. */
     bool field = reg_base[4] & 1;
+    bool interlaced = reg_base[0] & (1<<6);
 
     /* Check if the next buffer is ready to be displayed, otherwise just
        leave up the current frame */
@@ -178,7 +179,7 @@ static void __display_callback()
         ready_mask &= ~(1 << next);
     }
 
-    __write_dram_register(__safe_buffer[now_showing] + (!field ? __width * __bitdepth : 0));
+    __write_dram_register(__safe_buffer[now_showing] + (interlaced && !field ? __width * __bitdepth : 0));
 }
 
 void display_init( resolution_t res, bitdepth_t bit, uint32_t num_buffers, gamma_t gamma, antialias_t aa )
@@ -327,7 +328,7 @@ void display_init( resolution_t res, bitdepth_t bit, uint32_t num_buffers, gamma
        to avoid confusing the VI chip with in-frame modifications. */
     if ( __is_vi_active() ) { __wait_for_vblank(); }
 
-    registers[1] = (uintptr_t) __safe_buffer[0];
+    registers[1] = PhysicalAddr(__safe_buffer[0]);
     __write_registers( registers );
 
     enable_interrupts();
diff --git a/src/utils.h b/src/utils.h
index f506d7c6ac..5310fec2ed 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -5,8 +5,11 @@
  * Misc utilities functions and macros. Internal header.
  */
 
+/** Exchange the values of two lvalues of the same type (each argument is expanded more than once). */
+#define SWAP(a, b) ({ __typeof__(a) _swap_tmp = (a); (a) = (b); (b) = _swap_tmp; })
 #define MAX(a,b)  ({ typeof(a) _a = a; typeof(b) _b = b; _a > _b ? _a : _b; })
 #define MIN(a,b)  ({ typeof(a) _a = a; typeof(b) _b = b; _a < _b ? _a : _b; })
+#define CLAMP(x, min, max) (MIN(MAX((x), (min)), (max)))  /* raise x to at least min, then cap it at max */
 
 /** Round n up to the next multiple of d */
 #define ROUND_UP(n, d) ({ \
diff --git a/tests/testrom.c b/tests/testrom.c
index e39853bbc6..c8cdcf829c 100644
--- a/tests/testrom.c
+++ b/tests/testrom.c
@@ -18,8 +18,8 @@
 
 typedef struct {
 	int result;
-	char *log;
-	int logleft;
+	char *log; char *err;    // next write position in the log buffer / in the error buffer
+	int logleft, errleft;    // bytes still available at each of those positions
 } TestContext;
 
 typedef void (*TestFunc)(TestContext *ctx);
@@ -30,10 +30,15 @@ typedef void (*TestFunc)(TestContext *ctx);
 // LOG(msg, ...): log something that will be displayed if the test fails.
 #define LOG(msg, ...)  ({ \
 	int __n = snprintf(ctx->log, ctx->logleft, msg, ##__VA_ARGS__); \
-	fwrite(ctx->log, 1, __n, stderr); \
 	ctx->log += __n; ctx->logleft -= __n; \
 })
 
+// ERR(msg, ...): append an error message (just before failing the test). snprintf reports the untruncated length, so the advance is clamped to the space left.
+#define ERR(msg, ...)  ({ \
+	int __n = snprintf(ctx->err, ctx->errleft, msg, ##__VA_ARGS__), __max = ctx->errleft > 0 ? ctx->errleft - 1 : 0; \
+	__n = __n < 0 ? 0 : (__n > __max ? __max : __n); ctx->err += __n; ctx->errleft -= __n; \
+})
+
 // DEFER(stmt): execute "stmt" statement when the current lexical block exits.
 // This is useful in tests to execute cleanup functions even if the test fails
 // through ASSERT macros.
@@ -44,8 +49,8 @@ typedef void (*TestFunc)(TestContext *ctx);
 
 // SKIP: skip execution of the test.
 #define SKIP(msg, ...) ({ \
-	LOG("TEST SKIPPED:\n"); \
-	LOG(msg "\n", ##__VA_ARGS__); \
+	ERR("TEST SKIPPED:\n"); \
+	ERR(msg "\n", ##__VA_ARGS__); \
 	ctx->result = TEST_SKIPPED; \
 	return; \
 })
@@ -60,6 +65,9 @@ static uint32_t rand(void) {
 	return rand_state = x;
 }
 
+// SRAND(n): seed the random number generator; a seed of zero is replaced by 1
+#define SRAND(n) ({ rand_state = (n); if (rand_state == 0) { rand_state = 1; } })
+
 // RANDN(n): generate a random number from 0 to n-1
 #define RANDN(n) ({ \
 	__builtin_constant_p((n)) ? \
@@ -70,9 +78,9 @@ static uint32_t rand(void) {
 // ASSERT(cond, msg): fail the test if the condition is false (with log message)
 #define ASSERT(cond, msg, ...) ({ \
 	if (!(cond)) { \
-		LOG("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
-		LOG("%s\n", #cond); \
-		LOG(msg "\n", ##__VA_ARGS__); \
+		ERR("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
+		ERR("%s\n", #cond); \
+		ERR(msg "\n", ##__VA_ARGS__); \
 		ctx->result = TEST_FAILED; \
 		return; \
 	} \
@@ -82,9 +90,9 @@ static uint32_t rand(void) {
 #define ASSERT_EQUAL_HEX(_a, _b, msg, ...) ({ \
 	uint64_t a = _a; uint64_t b = _b; \
 	if (a != b) { \
-		LOG("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
-		LOG("%s != %s (0x%llx != 0x%llx)\n", #_a, #_b, a, b); \
-		LOG(msg "\n", ##__VA_ARGS__); \
+		ERR("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
+		ERR("%s != %s (0x%llx != 0x%llx)\n", #_a, #_b, a, b); \
+		ERR(msg "\n", ##__VA_ARGS__); \
 		ctx->result = TEST_FAILED; \
 		return; \
 	} \
@@ -95,9 +103,9 @@ static uint32_t rand(void) {
 #define ASSERT_EQUAL_UNSIGNED(_a, _b, msg, ...) ({ \
 	uint64_t a = _a; uint64_t b = _b; \
 	if (a != b) { \
-		LOG("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
-		LOG("%s != %s (%llu != %llu)\n", #_a, #_b, a, b); \
-		LOG(msg "\n", ##__VA_ARGS__); \
+		ERR("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
+		ERR("%s != %s (%llu != %llu)\n", #_a, #_b, a, b); \
+		ERR(msg "\n", ##__VA_ARGS__); \
 		ctx->result = TEST_FAILED; \
 		return; \
 	} \
@@ -107,9 +115,9 @@ static uint32_t rand(void) {
 #define ASSERT_EQUAL_SIGNED(_a, _b, msg, ...) ({ \
 	int64_t a = _a; int64_t b = _b; \
 	if (a != b) { \
-		LOG("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
-		LOG("%s != %s (%lld != %lld)\n", #_a, #_b, a, b); \
-		LOG(msg "\n", ##__VA_ARGS__); \
+		ERR("ASSERTION FAILED (%s:%d):\n", __FILE__, __LINE__); \
+		ERR("%s != %s (%lld != %lld)\n", #_a, #_b, a, b); \
+		ERR(msg "\n", ##__VA_ARGS__); \
 		ctx->result = TEST_FAILED; \
 		return; \
 	} \
@@ -135,9 +143,9 @@ int assert_equal_mem(TestContext *ctx, const char *file, int line, const uint8_t
 			hexdump(dumpa, a, len, i-2, 5);
 			hexdump(dumpb, b, len, i-2, 5);
 
-			LOG("ASSERTION FAILED (%s:%d):\n", file, line); \
-			LOG("[%s] != [%s]\n", dumpa, dumpb);
-			LOG("     ^^              ^^  idx: %d\n", i);
+			ERR("ASSERTION FAILED (%s:%d):\n", file, line);  // file/line are supplied by the caller
+			ERR("[%s] != [%s]\n", dumpa, dumpb);
+			ERR("     ^^              ^^  idx: %d\n", i);
 			return 0;
 		}
 	}
@@ -149,7 +157,7 @@ int assert_equal_mem(TestContext *ctx, const char *file, int line, const uint8_t
 #define ASSERT_EQUAL_MEM(_a, _b, _len, msg, ...) ({ \
 	const uint8_t *a = (_a); const uint8_t *b = (_b); int len = (_len); \
 	if (!assert_equal_mem(ctx, __FILE__, __LINE__, a, b, len)) { \
-		LOG(msg "\n", ##__VA_ARGS__); \
+		ERR(msg "\n", ##__VA_ARGS__); \
 		ctx->result = TEST_FAILED; \
 		return; \
 	} \
@@ -250,7 +258,7 @@ int main() {
 	const int NUM_TESTS = sizeof(tests) / sizeof(tests[0]);
 	uint32_t start = TICKS_READ();
 	for (int i=0; i < NUM_TESTS; i++) {
-		static char logbuf[16384];
+		static char logbuf[16384], errbuf[4096];
 
 		printf("%-59s", tests[i].name);
 		fflush(stdout);
@@ -269,6 +277,8 @@ int main() {
 		TestContext ctx;
 		ctx.log = logbuf;
 		ctx.logleft = sizeof(logbuf);
+		ctx.err = errbuf;
+		ctx.errleft = sizeof(errbuf);
 		ctx.result = TEST_SUCCESS;
 		rand_state = 1; // reset to be fully reproducible
 
@@ -299,9 +309,12 @@ int main() {
 		if (ctx.result == TEST_FAILED) {
 			failures++;
 			printf("FAIL\n\n");
-
 			if (ctx.log != logbuf) {
-				printf("%s\n\n", logbuf);
+				debugf("%s\n", logbuf);
+			}
+			if (ctx.err != errbuf) {
+				printf("%s\n", errbuf);
+				debugf("%s\n", errbuf);
 			}
 		} else if (ctx.result == TEST_SKIPPED) {
 			skipped++;