From 956f8a8cc833a6e3575698dddc18825643a84c06 Mon Sep 17 00:00:00 2001
From: gvl610 <giangvinhloc610@gmail.com>
Date: Wed, 3 Jul 2024 13:56:00 +0700
Subject: [PATCH] Optimize cache and memory access

This commit includes several AVR-specific optimization tricks.
I spent a lot of time optimizing every line in these functions, looking for instruction-level optimizations using godbolt.
A large portion of that time was spent in GDB, since I forgot a bunch of checks and variable changes while optimizing these functions.
Don't be stupid like me.
---
 build.sh   |  16 ++-
 main.c     | 346 ++++++++++++++---------------------------------------
 main_sim.c | 342 ++++++++++++++--------------------------------------
 3 files changed, 194 insertions(+), 510 deletions(-)

diff --git a/build.sh b/build.sh
index eca0bcc..7d80e6c 100755
--- a/build.sh
+++ b/build.sh
@@ -23,6 +23,14 @@ sim_opt() {
     gcc -I. -march=native -Ofast -funsigned-char -funsigned-bitfields -fpack-struct -fshort-enums -Wall -Wstrict-prototypes -std=gnu99 main_sim.c if_sim.c -o sim
 }
 
+clean() {
+    # Remove simulator build outputs; -f ignores files that do not exist
+    rm -f sim dump.bin
+
+    # Remove AVR build outputs; -f also tolerates globs that match nothing
+    rm -f *.o *.lst *.map *.elf
+}
+
 # Check for help flag
 if [[ "$1" == "-h" || "$1" == "--help" || "$1" == "help" ]]; then
     echo "Usage: $0 (sim|sim_opt|avr)"
@@ -42,8 +50,12 @@ else
         if [[ "$1" == "avr" ]]; then
             avr
         else
-            echo "Invalid argument: '$1'"
-            exit 1
+            if [[ "$1" == "clean" ]]; then
+                clean
+            else
+                echo "Invalid argument: '$1'"
+                exit 1
+            fi
         fi
     fi
 fi
diff --git a/main.c b/main.c
index 7d261a1..de70621 100644
--- a/main.c
+++ b/main.c
@@ -34,6 +34,7 @@
 #include <avr/interrupt.h>
 #include <util/atomic.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <string.h>
 #include "uart.h"
 #include "spi.h"
@@ -55,8 +56,8 @@ struct MiniRV32IMAState *core;
  *
  * Each cache has a 512 bytes buffer, a tag (the sector number of that 512-bytes
  * block in RAM), and a "dirty" flag (see below). In our implementation, we use
- * an uint16_t age score variables. For each data read, we will +2 to the age
- * score, and for each data write, we will +1 to the age score.
+ * a uint16_t age score variable. For each data read, we add 1 to the age
+ * score, and for each data write, we also add 1 to the age score.
  *
  * Cache will be invaild by LRU mechanism: when there is a read/write request to
  * an address that is not currently in the cache pool, a least recently used
@@ -74,13 +75,16 @@ struct MiniRV32IMAState *core;
  * icache, it will get invalid, and its age score will be reset to 0. If pc
  * moves to an address that is currently in another dcache, that dcache will
  * be used for both icache and dcache, and its age score will be set to 0xFFFF.
+ *
+ * IMO, I don't think a cache will often be assigned as both icache and dcache
+ * at the same time. It might only happen in self-modifying code.
  */
 
 struct cache {
-    UInt8 buf[SD_BLOCK_LEN];
-    UInt32 tag;
+    uint8_t buf[SD_BLOCK_LEN]; // Cached contents of one SD_BLOCK_LEN-byte sector
+    uint32_t tag; // Sector number held in buf; read_buf() compares it against ofs / SD_BLOCK_LEN
     uint16_t age;
-    UInt8 flag;
+    bool flag; // False is dirty. Why not the opposite? AVR takes 2 instructions to store a 1 and just 1 to store a 0. NOTE(review): a zero-initialized entry therefore reads as dirty - confirm the init code sets flag = true.
 };
 
 struct cache pool[3];
@@ -410,58 +414,76 @@ static Int32 HandleOtherCSRRead( UInt8 * image, UInt16 csrno )
 
 /*
  * flag:
- * 0 -> data fetch, calculate sector
- * 1 -> data fetch, next sector
- * 2 -> instruction fetch, calculate sector
- * 3 -> instruction fetch, next sector
+ * false -> data fetch, calculate sector
+ * true -> instruction fetch, calculate sector
+ *
+ * return: cache buffer's address
+ *
+ * quick note: storing a 0 takes just a single instruction, but storing a 1 takes 2 on AVR
  */
 
-uint8_t read_buf(uint32_t ofs, uint8_t flag) {
-    static uint32_t s;
-
-    /*
-     * Some operations might involve read/write bytes that are located between 2 sectors on
-     * the SD card. In that case, we have to fetch 2 continuous sectors at a time. Since
-     * we already know the last sector number, we can just read the n + 1 sector and skip
-     * the division (which is a pain on AVR). The flag parameter is used for this. When
-     * flag is 0, we will calculate the sector number. Else, we will just fetch the n + 1
-     * sector.
-     */
-    if (flag % 2 == 0) {
-        // Calculate sector num
-        // Dividing on AVR is a pain, so we should avoid that if we can
-        s = ofs / SD_BLOCK_LEN;
-    } else {
-        // sector num = last sector + 1
-        ++s;
-    }
+uint8_t *read_buf(uint32_t ofs, bool flag, bool write) {
+    // Calculate sector num
+    // Dividing on AVR is a pain, so we should avoid that if we can
+    uint32_t s = ofs / SD_BLOCK_LEN;
 
     // Check if the requested sector exists in the pool
     // We have 3 caches, so we can use if. If you implement more than 3 caches, you should
     // use for loop
     uint8_t ret = 0;
-    if (s == pool[2].tag) {
-        ret = 2;
+    if (s == pool[0].tag) {
+        ret = 0;
+        
+        // Add age
+        if (pool[0].age <= 0xFFFE) {
+            pool[0].age += 1;
+        }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[0].flag = false; // false = dirty
+        }
     } else if (s == pool[1].tag) {
         ret = 1;
-    } else if (s == pool[0].tag) {
-        ret = 0;
+        
+        // Add age
+        if (pool[1].age <= 0xFFFE) {
+            pool[1].age += 1;
+        }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[1].flag = false;
+        }
+    } else if (s == pool[2].tag) {
+        ret = 2;
+
+        // Add age
+        if (pool[2].age <= 0xFFFE) {
+            pool[2].age += 1;
+        }
+
+        // Set dirty flag if needed
+        // If you think this can be optimized to pool[2].flag = write, think again!
+        if (write) {
+            pool[2].flag = false;
+        }
     } else {
         uint8_t lru = 2;
-        
-        if (flag > 1) {
+
+        if (flag) {
             // If icache miss -> invaild old icache
             if (pool[0].age == 0xFFFF) {
-                pool[0].age = 0;
                 lru = 0;
+                pool[0].age = 0;
                 goto continue_without_finding_lru;
             } else if (pool[1].age == 0xFFFF) {
-                pool[1].age = 0;
                 lru = 1;
+                pool[1].age = 0;
                 goto continue_without_finding_lru;
             } else {
-                pool[2].age = 0;
                 lru = 2;
+                pool[2].age = 0;
                 goto continue_without_finding_lru;
             }
         }
@@ -479,7 +501,7 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
 continue_without_finding_lru:
         // Check if LRU cache if dirty
         uint8_t token;
-        if (pool[lru].flag == 1) {
+        if (!pool[lru].flag) { // false = dirty
             // Dirty -> flush to SD
             uint8_t t = 0;
 cache_write:
@@ -503,12 +525,12 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
             }
 
             // Clear dirty flag
-            pool[lru].flag = 0;
+            pool[lru].flag = true; // true = not dirty
         }
 
         // Set new properties
         pool[lru].tag = s;
-        if (flag > 1) {
+        if (flag) {
             // icache
             pool[lru].age = 0xFFFF;
 #ifdef ENABLE_CACHE_STAT
@@ -516,11 +538,16 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
 #endif
         } else {
             // dcache
-            pool[lru].age = 0;
+            pool[lru].age = 1; // Already +1 for the access
 #ifdef ENABLE_CACHE_STAT
             dcache_miss++;
 #endif
         }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[lru].flag = false;
+        }
         
         // Fetch new sector into cache
         uint8_t t = 0;
@@ -543,207 +570,53 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
             }
         }
 
-        // Return the cache index
-        return lru;
+        // Return buffer address (buf decays to uint8_t *; &buf would be uint8_t (*)[SD_BLOCK_LEN], an incompatible pointer type)
+        return pool[lru].buf;
     }
 
     // Cache hit
 #ifdef ENABLE_CACHE_STAT
-    if (flag > 1) {
+    if (flag) {
         icache_hit++;
     } else {
         dcache_hit++;
     }
 #endif
 
-    // Return the cache index
-    return ret;
+    // Return buffer address (buf decays to uint8_t *; &buf would be uint8_t (*)[SD_BLOCK_LEN], an incompatible pointer type)
+    return pool[ret].buf;
 }
 
 // Memory access functions
 static uint32_t loadi(uint32_t ofs) {
     // Load instruction from icache
-    //uint32_t result;
-    uint8_t id = read_buf(ofs, 2);
-
-    // This will never happend, since RISC-V instructions are aligned on 32-bit boundaries,
-    // so they will probably never split between 512-bytes sectors.
-    // Removing this saves about 4 instructions in each loadi execution, and even more when
-    // considering the programming space. How much will this boost the performance?
-    /*if (r >= 509) {
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and get the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            ((uint8_t *)&result)[i] = pool[id].buf[r + i];
-        }
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 3);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            ((uint8_t *)&result)[i + j] = pool[id].buf[j];
-        }
-
-        //UART_puthex32(result);
-        //UART_pputs("\r\n");
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1];
-    ((uint8_t *)&result)[2] = pool[id].buf[r + 2];
-    ((uint8_t *)&result)[3] = pool[id].buf[r + 3]; // MSB
-
-    // Return result
-    return result;*/
-
-    // Return result
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    return *(uint32_t*)&pool[id].buf[(uint16_t)(ofs % 512)];
-}
-
-void addage(uint8_t id, uint8_t score) {
-    if (pool[id].age <= (0xFFFF - score)) {
-        pool[id].age += score;
-    }
+    uint16_t r = ofs % 512; // Don't inline this. Clearly specify type for compiler to optimize. Else it will be seen as uint32_t
+    //return *(uint32_t*)&pool[id].buf[(uint16_t)(ofs % 512)];
+    return *(uint32_t*)(read_buf(ofs, true, false) + r);
 }
 
 static uint32_t load4(uint32_t ofs) {
     uint16_t r = ofs % 512; // Don't inline this
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see loadi()
-    /*if (r >= 509) {
-        uint32_t result;
-
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and get the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            ((uint8_t *)&result)[i] = pool[id].buf[r + i];
-        }
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 1);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            ((uint8_t *)&result)[i + j] = pool[id].buf[j];
-        }
-
-        // Increase age score
-        addage(id, 2);
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1];
-    ((uint8_t *)&result)[2] = pool[id].buf[r + 2];
-    ((uint8_t *)&result)[3] = pool[id].buf[r + 3]; // MSB
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return result;*/
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return *(uint32_t*)&pool[id].buf[r];
+    //return *(uint32_t*)&pool[id].buf[r];
+    return *(uint32_t*)(read_buf(ofs, false, false) + r);
 }
 static uint16_t load2(uint32_t ofs) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, same reason as loadi() and load4()
-    /*if (r == 511) {
-	    uint16_t result;
-
-        // LSB located in nth sector
-        ((uint8_t *)&result)[0] = pool[id].buf[511];
-
-        // MSB located in n + 1 sector
-        id = read_buf(ofs, 1);
-        ((uint8_t *)&result)[1] = pool[id].buf[0];
-        
-        // Increase age score
-        addage(id, 2);
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1]; // MSB
-    
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return result;*/
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return *(uint16_t *)&pool[id].buf[r];
+    //return *(uint16_t *)&pool[id].buf[r];
+    return *(uint16_t*)(read_buf(ofs, false, false) + r);
 }
 static uint8_t load1(uint32_t ofs) {
-    uint8_t id = read_buf(ofs, 0);
-
-    // Increase age score
-    addage(id, 2);
-
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    return pool[id].buf[(uint16_t)(ofs % 512)];
+    uint16_t r = ofs % 512;
+    //return pool[id].buf[(uint16_t)(ofs % 512)];
+    return *(uint8_t*)(read_buf(ofs, false, false) + r);
 }
 
 static uint32_t store4(uint32_t ofs, uint32_t val) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see load4()
-    /*if (r >= 509) {
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and change the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            pool[id].buf[r + i] = ((uint8_t *)&val)[i];
-        }
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 1);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            pool[id].buf[j] = ((uint8_t *)&val)[i + j];
-        }
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Increase age score
-        addage(id, 1);
-
-        // Return result
-        return val;
-    }*/
-
-    /*pool[id].buf[r]     = ((uint8_t *)&val)[0]; // LSB
-    pool[id].buf[r + 1] = ((uint8_t *)&val)[1];
-    pool[id].buf[r + 2] = ((uint8_t *)&val)[2];
-    pool[id].buf[r + 3] = ((uint8_t *)&val)[3]; // MSB*/
 
     // Store
-    *(uint32_t *)&pool[id].buf[r] = val;
-
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint32_t *)&pool[id].buf[r] = val;
+    *(uint32_t *)(read_buf(ofs, false, true) + r) = val;
 
     // Return result
     return val;
@@ -751,60 +624,23 @@ static uint32_t store4(uint32_t ofs, uint32_t val) {
 
 static uint16_t store2(uint32_t ofs, uint16_t val) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see load2
-    /*if (r == 511) {
-        // LSB located in the nth sector
-        pool[id].buf[511] = ((uint8_t *)&val)[0];
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // MSB located in the n + 1 sector
-        id = read_buf(ofs, 1);
-        pool[id].buf[0] = ((uint8_t *)&val)[1];
-        
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Increase age score
-        addage(id, 1);
-
-        // Return result
-        return val;
-    }*/
-
-    /*pool[id].buf[r]     = ((uint8_t *)&val)[0]; // LSB
-    pool[id].buf[r + 1] = ((uint8_t *)&val)[1]; // MSB*/
 
     // Store
-    *(uint16_t *)&pool[id].buf[r] = val;
-    
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint16_t *)&pool[id].buf[r] = val;
+    *(uint16_t *)(read_buf(ofs, false, true) + r) = val;
 
     // Return result
     return val;
 }
 
 static uint8_t store1(uint32_t ofs, uint8_t val) {
-    uint8_t id = read_buf(ofs, 0);
+    uint16_t r = ofs % 512;
 
     // Store
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    pool[id].buf[(uint16_t)(ofs % 512)] = val;
-    
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint8_t *)&pool[id].buf[r] = val;
+    *(uint8_t *)(read_buf(ofs, false, true) + r) = val;
 
-    // Return result
+    // Return results
     return val;
 }
 
diff --git a/main_sim.c b/main_sim.c
index ad657fe..01b1c67 100644
--- a/main_sim.c
+++ b/main_sim.c
@@ -9,6 +9,7 @@
 #define ENABLE_CACHE_STAT
 
 // Headers
+#include <stdbool.h>
 #include <time.h>
 #include "if_sim.h"
 #include "uart.h"
@@ -33,8 +34,8 @@ struct MiniRV32IMAState *core;
  *
  * Each cache has a 512 bytes buffer, a tag (the sector number of that 512-bytes
  * block in RAM), and a "dirty" flag (see below). In our implementation, we use
- * an uint16_t age score variables. For each data read, we will +2 to the age
- * score, and for each data write, we will +1 to the age score.
+ * a uint16_t age score variable. For each data read, we add 1 to the age
+ * score, and for each data write, we also add 1 to the age score.
  *
  * Cache will be invaild by LRU mechanism: when there is a read/write request to
  * an address that is not currently in the cache pool, a least recently used
@@ -52,13 +53,16 @@ struct MiniRV32IMAState *core;
  * icache, it will get invalid, and its age score will be reset to 0. If pc
  * moves to an address that is currently in another dcache, that dcache will
  * be used for both icache and dcache, and its age score will be set to 0xFFFF.
+ *
+ * IMO, I don't think a cache will often be assigned as both icache and dcache
+ * at the same time. It might only happen in self-modifying code.
  */
 
 struct cache {
     uint8_t buf[SD_BLOCK_LEN];
     uint32_t tag;
     uint16_t age;
-    uint8_t flag;
+    bool flag; // False is dirty. Why not the opposite? AVR takes 2 instructions to store a 1 and just 1 to store a 0. NOTE(review): a zero-initialized entry therefore reads as dirty - confirm the init code sets flag = true.
 };
 
 struct cache pool[3];
@@ -340,58 +344,76 @@ static int32_t HandleOtherCSRRead( uint8_t * image, uint16_t csrno )
 
 /*
  * flag:
- * 0 -> data fetch, calculate sector
- * 1 -> data fetch, next sector
- * 2 -> instruction fetch, calculate sector
- * 3 -> instruction fetch, next sector
+ * false -> data fetch, calculate sector
+ * true -> instruction fetch, calculate sector
+ *
+ * return: cache buffer's address
+ *
+ * quick note: storing a 0 takes just a single instruction, but storing a 1 takes 2 on AVR
  */
 
-uint8_t read_buf(uint32_t ofs, uint8_t flag) {
-    static uint32_t s;
-
-    /*
-     * Some operations might involve read/write bytes that are located between 2 sectors on
-     * the SD card. In that case, we have to fetch 2 continuous sectors at a time. Since
-     * we already know the last sector number, we can just read the n + 1 sector and skip
-     * the division (which is a pain on AVR). The flag parameter is used for this. When
-     * flag is 0, we will calculate the sector number. Else, we will just fetch the n + 1
-     * sector.
-     */
-    if (flag % 2 == 0) {
-        // Calculate sector num
-        // Dividing on AVR is a pain, so we should avoid that if we can
-        s = ofs / SD_BLOCK_LEN;
-    } else {
-        // sector num = last sector + 1
-        ++s;
-    }
+uint8_t *read_buf(uint32_t ofs, bool flag, bool write) {
+    // Calculate sector num
+    // Dividing on AVR is a pain, so we should avoid that if we can
+    uint32_t s = ofs / SD_BLOCK_LEN;
 
     // Check if the requested sector exists in the pool
     // We have 3 caches, so we can use if. If you implement more than 3 caches, you should
     // use for loop
     uint8_t ret = 0;
-    if (s == pool[2].tag) {
-        ret = 2;
+    if (s == pool[0].tag) {
+        ret = 0;
+        
+        // Add age
+        if (pool[0].age <= 0xFFFE) {
+            pool[0].age += 1;
+        }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[0].flag = false; // false = dirty
+        }
     } else if (s == pool[1].tag) {
         ret = 1;
-    } else if (s == pool[0].tag) {
-        ret = 0;
+        
+        // Add age
+        if (pool[1].age <= 0xFFFE) {
+            pool[1].age += 1;
+        }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[1].flag = false;
+        }
+    } else if (s == pool[2].tag) {
+        ret = 2;
+
+        // Add age
+        if (pool[2].age <= 0xFFFE) {
+            pool[2].age += 1;
+        }
+
+        // Set dirty flag if needed
+        // If you think this can be optimized to pool[2].flag = write, think again!
+        if (write) {
+            pool[2].flag = false;
+        }
     } else {
         uint8_t lru = 2;
-        
-        if (flag > 1) {
+
+        if (flag) {
             // If icache miss -> invaild old icache
             if (pool[0].age == 0xFFFF) {
-                pool[0].age = 0;
                 lru = 0;
+                pool[0].age = 0;
                 goto continue_without_finding_lru;
             } else if (pool[1].age == 0xFFFF) {
-                pool[1].age = 0;
                 lru = 1;
+                pool[1].age = 0;
                 goto continue_without_finding_lru;
             } else {
-                pool[2].age = 0;
                 lru = 2;
+                pool[2].age = 0;
                 goto continue_without_finding_lru;
             }
         }
@@ -409,7 +431,7 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
 continue_without_finding_lru:
         // Check if LRU cache if dirty
         uint8_t token;
-        if (pool[lru].flag == 1) {
+        if (!pool[lru].flag) { // false = dirty
             // Dirty -> flush to SD
             uint8_t t = 0;
 cache_write:
@@ -433,12 +455,12 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
             }
 
             // Clear dirty flag
-            pool[lru].flag = 0;
+            pool[lru].flag = true; // true = not dirty
         }
 
         // Set new properties
         pool[lru].tag = s;
-        if (flag > 1) {
+        if (flag) {
             // icache
             pool[lru].age = 0xFFFF;
 #ifdef ENABLE_CACHE_STAT
@@ -446,11 +468,16 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
 #endif
         } else {
             // dcache
-            pool[lru].age = 0;
+            pool[lru].age = 1; // Already +1 for the access
 #ifdef ENABLE_CACHE_STAT
             dcache_miss++;
 #endif
         }
+
+        // Set dirty flag if needed
+        if (write) {
+            pool[lru].flag = false;
+        }
         
         // Fetch new sector into cache
         uint8_t t = 0;
@@ -473,207 +500,53 @@ uint8_t read_buf(uint32_t ofs, uint8_t flag) {
             }
         }
 
-        // Return the cache index
-        return lru;
+        // Return buffer address (buf decays to uint8_t *; &buf would be uint8_t (*)[SD_BLOCK_LEN], an incompatible pointer type)
+        return pool[lru].buf;
     }
 
     // Cache hit
 #ifdef ENABLE_CACHE_STAT
-    if (flag > 1) {
+    if (flag) {
         icache_hit++;
     } else {
         dcache_hit++;
     }
 #endif
 
-    // Return the cache index
-    return ret;
+    // Return buffer address (buf decays to uint8_t *; &buf would be uint8_t (*)[SD_BLOCK_LEN], an incompatible pointer type)
+    return pool[ret].buf;
 }
 
 // Memory access functions
 static uint32_t loadi(uint32_t ofs) {
     // Load instruction from icache
-    //uint32_t result;
-    uint8_t id = read_buf(ofs, 2);
-
-    // This will never happend, since RISC-V instructions are aligned on 32-bit boundaries,
-    // so they will probably never split between 512-bytes sectors.
-    // Removing this saves about 4 instructions in each loadi execution, and even more when
-    // considering the programming space. How much will this boost the performance?
-    /*if (r >= 509) {
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and get the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            ((uint8_t *)&result)[i] = pool[id].buf[r + i];
-        }
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 3);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            ((uint8_t *)&result)[i + j] = pool[id].buf[j];
-        }
-
-        //UART_puthex32(result);
-        //UART_pputs("\r\n");
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1];
-    ((uint8_t *)&result)[2] = pool[id].buf[r + 2];
-    ((uint8_t *)&result)[3] = pool[id].buf[r + 3]; // MSB
-
-    // Return result
-    return result;*/
-
-    // Return result
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    return *(uint32_t*)&pool[id].buf[(uint16_t)(ofs % 512)];
-}
-
-void addage(uint8_t id, uint8_t score) {
-    if (pool[id].age <= (0xFFFF - score)) {
-        pool[id].age += score;
-    }
+    uint16_t r = ofs % 512; // Don't inline this. Clearly specify type for compiler to optimize. Else it will be seen as uint32_t
+    //return *(uint32_t*)&pool[id].buf[(uint16_t)(ofs % 512)];
+    return *(uint32_t*)(read_buf(ofs, true, false) + r);
 }
 
 static uint32_t load4(uint32_t ofs) {
     uint16_t r = ofs % 512; // Don't inline this
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see loadi()
-    /*if (r >= 509) {
-        uint32_t result;
-
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and get the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            ((uint8_t *)&result)[i] = pool[id].buf[r + i];
-        }
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 1);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            ((uint8_t *)&result)[i + j] = pool[id].buf[j];
-        }
-
-        // Increase age score
-        addage(id, 2);
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1];
-    ((uint8_t *)&result)[2] = pool[id].buf[r + 2];
-    ((uint8_t *)&result)[3] = pool[id].buf[r + 3]; // MSB
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return result;*/
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return *(uint32_t*)&pool[id].buf[r];
+    //return *(uint32_t*)&pool[id].buf[r];
+    return *(uint32_t*)(read_buf(ofs, false, false) + r);
 }
 static uint16_t load2(uint32_t ofs) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, same reason as loadi() and load4()
-    /*if (r == 511) {
-	    uint16_t result;
-
-        // LSB located in nth sector
-        ((uint8_t *)&result)[0] = pool[id].buf[511];
-
-        // MSB located in n + 1 sector
-        id = read_buf(ofs, 1);
-        ((uint8_t *)&result)[1] = pool[id].buf[0];
-        
-        // Increase age score
-        addage(id, 2);
-
-        return result;
-    }*/
-
-    /*((uint8_t *)&result)[0] = pool[id].buf[r];     // LSB
-    ((uint8_t *)&result)[1] = pool[id].buf[r + 1]; // MSB
-    
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return result;*/
-
-    // Increase age score
-    addage(id, 2);
-
-    // Return result
-    return *(uint16_t *)&pool[id].buf[r];
+    //return *(uint16_t *)&pool[id].buf[r];
+    return *(uint16_t*)(read_buf(ofs, false, false) + r);
 }
 static uint8_t load1(uint32_t ofs) {
-    uint8_t id = read_buf(ofs, 0);
-
-    // Increase age score
-    addage(id, 2);
-
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    return pool[id].buf[(uint16_t)(ofs % 512)];
+    uint16_t r = ofs % 512;
+    //return pool[id].buf[(uint16_t)(ofs % 512)];
+    return *(uint8_t*)(read_buf(ofs, false, false) + r);
 }
 
 static uint32_t store4(uint32_t ofs, uint32_t val) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see load4()
-    /*if (r >= 509) {
-        // 1 - 3 bytes are in nth sector, and the others in n + 1 sector
-        // Read the nth sector and change the bytes in that sector
-        uint8_t i = 0;
-        for (; i < SD_BLOCK_LEN - r; i++) {
-            pool[id].buf[r + i] = ((uint8_t *)&val)[i];
-        }
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Read the next sector and get the remaining bytes
-        id = read_buf(ofs, 1);
-        for (uint8_t j = 0; j < r - 508; j++) {
-            pool[id].buf[j] = ((uint8_t *)&val)[i + j];
-        }
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Increase age score
-        addage(id, 1);
-
-        // Return result
-        return val;
-    }*/
-
-    /*pool[id].buf[r]     = ((uint8_t *)&val)[0]; // LSB
-    pool[id].buf[r + 1] = ((uint8_t *)&val)[1];
-    pool[id].buf[r + 2] = ((uint8_t *)&val)[2];
-    pool[id].buf[r + 3] = ((uint8_t *)&val)[3]; // MSB*/
 
     // Store
-    *(uint32_t *)&pool[id].buf[r] = val;
-
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint32_t *)&pool[id].buf[r] = val;
+    *(uint32_t *)(read_buf(ofs, false, true) + r) = val;
 
     // Return result
     return val;
@@ -681,60 +554,23 @@ static uint32_t store4(uint32_t ofs, uint32_t val) {
 
 static uint16_t store2(uint32_t ofs, uint16_t val) {
     uint16_t r = ofs % 512;
-    uint8_t id = read_buf(ofs, 0);
-
-    // Won't happend, see load2
-    /*if (r == 511) {
-        // LSB located in the nth sector
-        pool[id].buf[511] = ((uint8_t *)&val)[0];
-
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // MSB located in the n + 1 sector
-        id = read_buf(ofs, 1);
-        pool[id].buf[0] = ((uint8_t *)&val)[1];
-        
-        // Set "dirty" flag
-        pool[id].flag = 1;
-
-        // Increase age score
-        addage(id, 1);
-
-        // Return result
-        return val;
-    }*/
-
-    /*pool[id].buf[r]     = ((uint8_t *)&val)[0]; // LSB
-    pool[id].buf[r + 1] = ((uint8_t *)&val)[1]; // MSB*/
 
     // Store
-    *(uint16_t *)&pool[id].buf[r] = val;
-    
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint16_t *)&pool[id].buf[r] = val;
+    *(uint16_t *)(read_buf(ofs, false, true) + r) = val;
 
     // Return result
     return val;
 }
 
 static uint8_t store1(uint32_t ofs, uint8_t val) {
-    uint8_t id = read_buf(ofs, 0);
+    uint16_t r = ofs % 512;
 
     // Store
-    // ofs % 512 needs to be cast to uint16_t, or more instructions will be generated
-    pool[id].buf[(uint16_t)(ofs % 512)] = val;
-    
-    // Set "dirty" flag
-    pool[id].flag = 1;
-
-    // Increase age score
-    addage(id, 1);
+    //*(uint8_t *)&pool[id].buf[r] = val;
+    *(uint8_t *)(read_buf(ofs, false, true) + r) = val;
 
-    // Return result
+    // Return results
     return val;
 }