Skip to content

Commit

Permalink
ipl3: compress the two stages via shrinkler
Browse files Browse the repository at this point in the history
This is an experiment that turned out to be too slow. gldemo boot
goes from 100ms to 381ms, and that's probably because shrinkler code
runs from DMEM in stage0 (and we don't have alternatives in that
situation).
  • Loading branch information
rasky committed Dec 11, 2023
1 parent d63d5b0 commit 6911f24
Show file tree
Hide file tree
Showing 17 changed files with 621 additions and 156 deletions.
31 changes: 21 additions & 10 deletions boot/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ N64_READELF = $(N64_GCCPREFIX_TRIPLET)readelf
N64_ROOTDIR = $(N64_INST)
N64_INCLUDEDIR = $(N64_ROOTDIR)/mips64-elf/include
N64_LIBDIR = $(N64_ROOTDIR)/mips64-elf/lib
N64_MKASSET = $(N64_ROOTDIR)/bin/mkasset

N64_CFLAGS = -march=vr4300 -mtune=vr4300 -MMD
N64_CFLAGS += -DN64 -Os -Wall -Werror -Wno-error=deprecated-declarations -fdiagnostics-color=always
Expand All @@ -34,18 +35,19 @@ N64_CFLAGS += -mabi=32 -mgp32 -mfp32 -msingle-float # Can't compile for 64bit AB
# Flags passed to $(N64_CC) when assembling .S files.
# The second line must append with `+=`: a plain `=` would replace the first
# assignment and silently drop -mtune/-march and -Wa,--fatal-warnings.
N64_ASFLAGS = -mtune=vr4300 -march=vr4300 -Wa,--fatal-warnings
N64_ASFLAGS += -mabi=32 -mgp32 -mfp32 -msingle-float -G0
N64_RSPASFLAGS = -march=mips1 -mabi=32 -Wa,--fatal-warnings
N64_LDFLAGS = -Wl,-T$(IPL3_LDSCRIPT) -Wl,-Map=build/ipl3.map -Wl,--gc-sections
N64_LDFLAGS = -Wl,-T$(IPL3_LDSCRIPT).1.ld -Wl,-Map=build/ipl3.map -Wl,--gc-sections

OBJS = build/ipl3.o build/minidragon.o build/rdram.o build/loader.o build/ique_trampoline.o build/entropy.o
OBJS = build/ipl3.o build/stage0.o build/minidragon.o build/rdram.o build/loader.o build/ique_trampoline.o build/entropy.o build/header.o build/boot_trampoline.o

ifeq ($(PROD),1)
N64_ASPPFLAGS += -DNDEBUG -DPROD
N64_CFLAGS += -DNDEBUG -DPROD
IPL3_ROM=bin/ipl3_prod.z64
IPL3_LDSCRIPT=ipl3_prod.ld
IPL3_LDSCRIPT=ipl3_prod
else
OBJS += build/debug.o build/boot_trampoline.o
OBJS+=build/debug.o
IPL3_ROM=bin/ipl3_dev.z64
IPL3_LDSCRIPT=ipl3_dev.ld
IPL3_LDSCRIPT=ipl3_dev
endif

all: $(IPL3_ROM)
Expand All @@ -65,13 +67,22 @@ build/%.o: %.c
build/%.o: %.S
@echo " [AS] $@"
@mkdir -p build
$(N64_CC) -c $(N64_ASFLAGS) -o $@ $<
$(N64_CC) -c $(N64_ASFLAGS) $(N64_ASPPFLAGS) -o $@ $<

%.z64: build/ipl3.elf
%.z64: build/ipl3.elf $(IPL3_LDSCRIPT).2.ld
@echo " [Z64] $@"
$(N64_OBJCOPY) -O binary $< $@

build/ipl3.elf: $(IPL3_LDSCRIPT) $(OBJS)
$(N64_OBJCOPY) -O binary -j .text.stage1 $< build/stage1.bin
$(N64_OBJCOPY) -O binary -j .text.stage2 $< build/stage2.bin
$(N64_OBJCOPY) -O binary -j .text.debug $< build/debug.bin
$(N64_MKASSET) -o build -c 3 --raw build/stage1.bin
$(N64_MKASSET) -o build -c 3 --raw build/stage2.bin
$(N64_CC) $(N64_CFLAGS) -Wl,-T$(IPL3_LDSCRIPT).2.ld -Wl,-Map=build/ipl3.compressed.map \
-o build/ipl3.compressed.elf \
stage0_bins.S header.c build/stage0.o build/ique_trampoline.o build/boot_trampoline.o
$(N64_READELF) --wide --sections build/ipl3.compressed.elf | grep .text
$(N64_OBJCOPY) -O binary build/ipl3.compressed.elf $@

build/ipl3.elf: $(IPL3_LDSCRIPT).1.ld $(OBJS)
@echo " [LD] $@"
$(N64_CC) $(N64_CFLAGS) $(N64_LDFLAGS) -o $@ $(filter %.o,$^)
$(N64_READELF) --wide --sections $@ | grep .text
Expand Down
5 changes: 4 additions & 1 deletion boot/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ void usb_init(void);
__attribute__((far))
void _usb_print(int ssize, const char *string, int nargs, ...);

#define debugf(s, ...) _usb_print(__builtin_strlen(s), s " ", __COUNT_VARARGS(__VA_ARGS__), ##__VA_ARGS__)
// debugf(s, ...): forward a message and its variadic arguments to _usb_print().
// `s` must be a string literal: it is concatenated with " " and stored in a
// static array placed in the ".rodata.debug" section, and that array is what
// gets passed as the string. The size argument is __builtin_strlen(s), i.e.
// the length of the literal without the appended space.
// The body is a GNU statement expression, so the macro can be used wherever
// an expression is expected.
#define debugf(s, ...) ({ \
static const char __s[] __attribute__((section(".rodata.debug"))) = s " "; \
_usb_print(__builtin_strlen(s), __s, __COUNT_VARARGS(__VA_ARGS__), ##__VA_ARGS__); \
})

#else

Expand Down
28 changes: 28 additions & 0 deletions boot/header.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <stdint.h>

// Layout of the 64-byte ROM header. The struct is packed so that no padding
// is inserted: `gamecode` sits at the unaligned offset 0x3B, which would
// otherwise be rounded up and break the 64-byte size checked below.
typedef struct __attribute__((packed)) {
uint32_t pi_dom1_config;    // offset 0x00
uint32_t clock_rate;        // offset 0x04
uint32_t boot_address;      // offset 0x08
uint32_t sdk_version;       // offset 0x0C
uint64_t checksum;          // offset 0x10
uint64_t reserved1;         // offset 0x18
char title[20];             // offset 0x20
char reserved2[7];          // offset 0x34
uint32_t gamecode;          // offset 0x3B (unaligned, hence "packed")
uint8_t rom_version;        // offset 0x3F
} rom_header_t;

// Compile-time guard: the fields above must add up to exactly 64 bytes.
_Static_assert(sizeof(rom_header_t) == 64, "invalid sizeof(rom_header_t)");

// ROM header instance, emitted into the ".header" input section. The "used"
// attribute forces the object to be emitted even though no code references
// it. Fields without an initializer below are zero-initialized.
__attribute__((section(".header"), used))
const rom_header_t header = {
// Standard PI DOM1 config
.pi_dom1_config = 0x80371240,
// Our IPL3 does not use this field directly. We set it mainly
// for iQue, so that the special iQue trampoline is run,
// which then jumps to our IPL3.
.boot_address = 0x80000400,
// Default title name
.title = "Libdragon ",
};
96 changes: 38 additions & 58 deletions boot/ipl3.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,6 @@
#include "entropy.h"
#include "loader.h"

__attribute__((section(".banner"), used))
const char banner[32] = " Libdragon IPL3 " " Coded by Rasky ";

// These registers contain boot flags passed by IPL2. Define them globally
// during the first stage of IPL3, so that the registers are not reused.
register uint32_t ipl2_romType asm ("s3");
Expand All @@ -89,34 +86,6 @@ register uint32_t ipl2_resetType asm ("s5");
register uint32_t ipl2_romSeed asm ("s6");
register uint32_t ipl2_version asm ("s7");

typedef struct __attribute__((packed)) {
uint32_t pi_dom1_config;
uint32_t clock_rate;
uint32_t boot_address;
uint32_t sdk_version;
uint64_t checksum;
uint64_t reserved1;
char title[20];
char reserved2[7];
uint32_t gamecode;
uint8_t rom_version;
} rom_header_t;

_Static_assert(sizeof(rom_header_t) == 64, "invalid sizeof(rom_header_t)");

__attribute__((section(".header"), used))
const rom_header_t header = {
// Standard PI DOM1 config
.pi_dom1_config = 0x80371240,
// Our IPL3 does not use directly this field. We do set it
// mainly for iQue, so that the special iQue trampoline is run,
// which jumps to our IPL3.
.boot_address = 0x80000400,
// Default title name
.title = "Libdragon ",
};


#if 0
void memtest(int memsize)
{
Expand Down Expand Up @@ -221,8 +190,8 @@ static void mem_bank_init(int chip_id, bool last)
// If this is the last memory bank, don't do anything.
// We keep the RSP DMA idle to be able to quickly load
// the loader into it. We will clear this later.
if (last)
return;
// if (last)
// return;

uint32_t base = chip_id*1024*1024;
int size = 2*1024*1024;
Expand All @@ -238,25 +207,19 @@ static void mem_bank_init(int chip_id, bool last)
if (chip_id == 0 && ipl2_resetType != 0) {
base += 0x400;
size -= 0x400;
} else if (last) {
// If this is the last memory bank, we need to clear the
// last 2 MiB of RDRAM. This is where the loader will be copied,
// so avoid touching the last 32 KiB.
size -= TOTAL_RESERVED_SIZE;
}
rsp_bzero_async(base, size);
}

// This function is placed by the linker script immediately below the stage1()
// function. We just change the stack pointer here, as very first thing.
__attribute__((noreturn, section(".stage1.pre")))
void stage1pre(void)
{
// Move the stack to the data cache. Notice that RAM is not initialized
// yet but we don't care: if sp points to a cached location, it will
// just use the cache for that.
asm ("li $sp, %0"::"i"(STACK1_TOP));
__builtin_unreachable(); // avoid function epilog, we don't need it
}

__attribute__((noreturn, section(".stage1")))
void stage1(void)
__attribute__((section(".stage1")))
int stage1(void)
{
asm("tne $0, $0, 0x10");
// Clear IMEM (contains IPL2). We don't need it anymore, and we can
// instead use IMEM as a zero-buffer for RSP DMA.
rsp_clear_mem((uint32_t)SP_IMEM, 4096);
Expand All @@ -267,14 +230,20 @@ void stage1(void)

entropy_add(C0_COUNT());
C0_WRITE_CAUSE(0);
C0_WRITE_COUNT(0);
// C0_WRITE_COUNT(0);
C0_WRITE_COMPARE(0);
C0_WRITE_WATCHLO(0);

// Clear D/I-cache, useful after warm boot. Maybe not useful for cold
// boots, but the manual says that the cache state is invalid at boot,
// so a reset won't hurt.
// cop0_clear_cache();

int memsize;
bool bbplayer = (*MI_VERSION & 0xF0) == 0xB0;
if (!bbplayer) {
memsize = rdram_init(mem_bank_init);
memsize = 8<<20;
} else {
// iQue OS put the memory size in a special location. This is the
// amount of memory that the OS has assigned to the application, so it
Expand All @@ -288,10 +257,12 @@ void stage1(void)
memsize = 0x7C0000;
}

// Clear D/I-cache, useful after warm boot. Maybe not useful for cold
// boots, but the manual says that the cache state is invalid at boot,
// so a reset won't hurt.
cop0_clear_cache();
// Prepare TLB for stage2. We separate these from TLBWI because of COP0 hazards.
C0_WRITE_PAGEMASK(0x03 << 13); // 16 KiB / 0x4000
C0_WRITE_ENTRYHI(LOADER_VADDR);
C0_WRITE_ENTRYLO0(((memsize - 16*1024) >> 6) | 0x7);
C0_WRITE_ENTRYLO1_ZERO();
C0_WRITE_INDEX_ZERO();

// Fill boot information at beginning of DMEM. The rest of IMEM has been
// cleared by now anyway. Notice that we also store BSS in IMEM, so the
Expand All @@ -304,6 +275,20 @@ void stage1(void)
// Perform a memtest
// memtest(memsize);


// Clear the last 2 MiB of RDRAM. This is where the loader was just
// copied, so make sure not to step over the loader itself.
// NOTE: this wouldn't be necessary if we played games with cache, but
// that would be largely emulator unfriendly, and it seems not worth to
// break most emulators for a minor performance gain.
// rsp_bzero_async(memsize-2*1024*1024, 2*1024*1024-TOTAL_RESERVED_SIZE);

// Enable TLB for stage 2
C0_TLBWI();
C0_WRITE_COUNT(0);

return memsize;
#if 0
// Copy the IPL3 stage2 (loader.c) from DMEM to the end of RDRAM.
extern uint32_t __stage2_start[]; extern int __stage2_size;
int stage2_size = (int)&__stage2_size;
Expand All @@ -317,15 +302,10 @@ void stage1(void)
while (*PI_STATUS & 1) {}
#endif

// Clear the last 2 MiB of RDRAM. This is where the loader was just
// copied, so make sure not to step over the loader itself.
// NOTE: this wouldn't be necessary if we played games with cache, but
// that would be largely emulator unfriendly, and it seems not worth to
// break most emulators for a minor performance gain.
rsp_bzero_async(memsize-2*1024*1024, 2*1024*1024-TOTAL_RESERVED_SIZE);

// Jump to stage 2 in RDRAM.
MEMORY_BARRIER();
asm("move $sp, %0"::"r"(STACK2_TOP(memsize, stage2_size)));
goto *rdram_stage2;
#endif
}
38 changes: 23 additions & 15 deletions boot/ipl3_dev.ld → boot/ipl3_dev.1.ld
Original file line number Diff line number Diff line change
Expand Up @@ -7,41 +7,49 @@ MEMORY
{
rom : ORIGIN = 0xB0000000, LENGTH = 65536
dmem : ORIGIN = 0xA4000000, LENGTH = 4096
tlb : ORIGIN = 0xE0000000, LENGTH = 16*1024
}

/* Start of IPL3 in ROM (after trampoline) */
__ipl3rombase = 0xB0001040;

SECTIONS {
.text.prologue 0xB0000000 : {
KEEP(*(.header))
KEEP(*(.text.boot_trampoline))
KEEP(*(.text.ique_trampoline))
} > rom

.text.dmem 0xA4000040 : AT ( 0xB0001040 ) {
.text.stage0 0xA4000040 : AT ( __ipl3rombase ) {
*(.stage0)
} > dmem

.text.stage1 0xA4000040 + SIZEOF(.text.stage0) : {
__stage1 = .;
LONG(0x3044d236); /* special dummy opcode to simplify hash cracking; this is not needed in dev mode, but we keep it in case it matters */
KEEP(*(.stage1.pre))
KEEP(*(.stage1))
. = ALIGN(16);
KEEP(*(.banner))
build/ipl3.o (.text .text.* .data)
build/rdram.o (.text .text.* .data)
build/minidragon.o (.text .text.* .data)
build/entropy.o (.text .text.* .data)
build/ipl3.o
build/rdram.o
build/minidragon.o
build/entropy.o
*(.bss)
. = ALIGN(8);
} > dmem

__stage2_size = __stage2_end - __stage2_start;

.text.rom 0xB0001040 + SIZEOF ( .text.dmem ) : {
.text.stage2 0xE0000000 : AT( 0xB0002000 ) {
__stage2 = .;
__stage2_start = .;
KEEP(build/loader.o(.text.loader))
KEEP(build/loader.o(.text .text.*))
KEEP(build/loader.o(.data .data.*))
KEEP(build/loader.o)
. = ALIGN(8);
__stage2_end = .;
} > tlb

__stage2_size = __stage2_end - __stage2_start;

.text.debug 0xB0003000 : {
build/debug.o (.text .text.* .data)
*(.rodata)
*(.rodata.*)
*(.rodata.debug)
} > rom

/DISCARD/ : { *(.MIPS.abiflags) }
Expand Down
38 changes: 38 additions & 0 deletions boot/ipl3_dev.2.ld
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
OUTPUT_FORMAT ("elf32-bigmips", "elf32-bigmips", "elf32-littlemips")
OUTPUT_ARCH (mips)

/* Memory regions available to the second (compressed) link:
     rom  - 64 KiB starting at 0xB0000000
     dmem - 4 KiB starting at 0xA4000000 */
MEMORY
{
rom : ORIGIN = 0xB0000000, LENGTH = 65536
dmem : ORIGIN = 0xA4000000, LENGTH = 4096
}

/* Output layout for the second (compressed) link.
   NOTE(review): the .stage1bin / .stage2bin / .debugbin input sections are
   presumably provided by stage0_bins.S (the binaries produced from the first
   link) - confirm against that file. */
SECTIONS {
/* Start of ROM (0xB0000000): the header followed by the two trampolines.
   KEEP prevents these input sections from being garbage-collected. */
.text.prologue 0xB0000000 : {
KEEP(*(.header))
KEEP(*(.text.boot_trampoline))
KEEP(*(.text.ique_trampoline))
} > rom

/* stage0 is linked to run at 0xA4000040 (inside the dmem region), while
   its load address is set to 0xB0001040 via AT. */
.text.stage0 0xA4000040 : AT ( 0xB0001040 ) {
*(.stage0)
} > dmem

/* The next three output sections have no explicit address, so they are
   placed in the dmem region right after .text.stage0, in this order. */
.text.banner : {
*(.banner)
} > dmem

.text.stage1 : {
*(.stage1bin)
} > dmem

.text.stage2 : {
*(.stage2bin)
} > dmem

/* Debug payload, placed at the fixed address 0xB0003000 in the rom region. */
.text.debug 0xB0003000 : {
*(.debugbin)
} > rom

/* Drop the .MIPS.abiflags sections from the output. */
/DISCARD/ : { *(.MIPS.abiflags) }
}
Loading

0 comments on commit 6911f24

Please sign in to comment.