diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 6a5177d641b..c111f631f9c 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -1149,8 +1149,9 @@ if (BUILD_TESTS AND
       append_property_string(TARGET unit_tests LINK_FLAGS "-Wl,${ld_entry_flag},_start")
     endif ()
     if (NOT ANDROID) # everything is inside Bionic on Android
-      target_link_libraries(unit_tests c dl m pthread)
+      target_link_libraries(unit_tests dl m pthread)
     endif ()
+    target_link_libraries(unit_tests drmemfuncs)
     set_preferred_base_start_and_end(unit_tests ${preferred_base} ON)
   else (UNIX)
     # Just like drinjectlib (see above) we need libc before ntdll
diff --git a/core/arch/riscv64/memfuncs.asm b/core/arch/riscv64/memfuncs.asm
index f44751831c2..c7662ed9d9d 100644
--- a/core/arch/riscv64/memfuncs.asm
+++ b/core/arch/riscv64/memfuncs.asm
@@ -41,33 +41,184 @@ START_FILE
 #ifdef UNIX
 /* Private memcpy.
+ * Optimized with loop unrolling and
+ * branchless sequences.
  */
 DECLARE_FUNC(memcpy)
 GLOBAL_LABEL(memcpy:)
-/* TODO i#3544: Naive version, optimize it. */
-        mv       t1, ARG1
-        beqz     ARG3, 2f
-1:      lbu      t2, 0(ARG2)
-        addi     ARG2, ARG2, 1
-        sb       t2, 0(t1)
-        addi     t1, t1, 1
-        addi     ARG3, ARG3, -1
-        bnez     ARG3, 1b
-2:      ret
+        li       t6, 32
+        mv       t0, ARG1 /* Save dst for return. */
+copy32_:
+        /* While size is at least 32, use 4 ld/sd pairs
+         * to copy 4*8=32 bytes per iteration.
+         * Once size drops below 32, jump to copy_remain and
+         * use the other optimized copy paths.
+         */
+        blt      ARG3, t6, copy_remain
+        ld       t1, 0(ARG2)
+        ld       t2, 8(ARG2)
+        ld       t3, 16(ARG2)
+        ld       t4, 24(ARG2)
+        sd       t1, 0(ARG1)
+        sd       t2, 8(ARG1)
+        sd       t3, 16(ARG1)
+        sd       t4, 24(ARG1)
+        addi     ARG3, ARG3, -32
+        addi     ARG1, ARG1, 32
+        addi     ARG2, ARG2, 32
+        j        copy32_
+copy_remain:
+        add      a6, ARG2, ARG3 /* a6 = src + size */
+        add      a7, ARG1, ARG3 /* a7 = dst + size */
+        li       t6, 8
+        bge      ARG3, t6, copy8_32
+        li       t6, 4
+        bge      ARG3, t6, copy4_8
+        bgtz     ARG3, copy0_4
+        j        copyexit
+copy0_4:
+        /* 0 < size < 4:
+         * If the size is 1 or 2,
+         * this does some redundant copies to avoid branches.
+         */
+        srli     t4, ARG3, 1
+        add      t5, t4, ARG1
+        add      t4, t4, ARG2
+        lbu      t1, 0(ARG2)
+        lbu      t2, -1(a6)
+        lbu      t3, 0(t4)
+        sb       t1, 0(ARG1)
+        sb       t2, -1(a7)
+        sb       t3, 0(t5)
+        j        copyexit
+copy4_8:
+        /* 4 <= size < 8:
+         * There is at least 1 byte of overlap
+         * between the two 4-byte copies.
+         * We do this to avoid further branches.
+         */
+        lwu      t1, 0(ARG2)
+        lwu      t2, -4(a6)
+        sw       t1, 0(ARG1)
+        sw       t2, -4(a7)
+        j        copyexit
+copy8_32:
+        /* 8 <= size < 32: */
+        /* Copy the first 8 bytes and the last 8 bytes.
+         * They overlap when size < 16.
+         */
+        ld       t1, 0(ARG2)
+        ld       t2, -8(a6)
+        sd       t1, 0(ARG1)
+        sd       t2, -8(a7)
+        /* If size > 16, the intermediate bytes (src[8:size-9])
+         * have not been copied yet.
+         */
+        li       t6, 16
+        ble      ARG3, t6, copyexit
+        ld       t1, 8(ARG2)
+        sd       t1, 8(ARG1)
+        /* If size > 24, the intermediate bytes (src[16:size-9])
+         * have not been copied yet.
+         */
+        li       t6, 24
+        ble      ARG3, t6, copyexit
+        ld       t1, 16(ARG2)
+        sd       t1, 16(ARG1)
+copyexit:
+        mv       a0, t0 /* Restore original dst as return value. */
+        ret
 END_FUNC(memcpy)
 
 /* Private memset.
+ * Optimized with loop unrolling and
+ * branchless sequences.
  */
 DECLARE_FUNC(memset)
 GLOBAL_LABEL(memset:)
-/* TODO i#3544: Naive version, optimize it. */
-        mv       t1, ARG1
-        beqz     ARG3, 2f
-1:      sb       ARG2, 0(t1)
-        addi     t1, t1, 1
-        addi     ARG3, ARG3, -1
-        bnez     ARG3, 1b
-2:      ret
+        li       t6, 32
+        mv       t0, ARG1 /* Save dst for return. */
+
+        /* Duplicate the single byte value across the whole 8-byte register. */
+        andi     ARG2, ARG2, 0xff
+        mv       t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+set32_:
+        /* While size is at least 32, use 4 sd instructions
+         * to write 4*8=32 bytes per iteration.
+         * Once size drops below 32, jump to set_remain and
+         * use the other optimized paths.
+         */
+        blt      ARG3, t6, set_remain
+        sd       t1, 0(ARG1)
+        sd       t1, 8(ARG1)
+        sd       t1, 16(ARG1)
+        sd       t1, 24(ARG1)
+        addi     ARG3, ARG3, -32
+        addi     ARG1, ARG1, 32
+        j        set32_
+set_remain:
+        add      a6, ARG1, ARG3 /* a6 = dst + size */
+        li       t6, 8
+        bge      ARG3, t6, set8_32
+        li       t6, 4
+        bge      ARG3, t6, set4_8
+        bgtz     ARG3, set0_4
+        j        setexit
+set0_4:
+        /* 0 < size < 4:
+         * If the size is 1 or 2,
+         * this does some redundant writes to avoid branches.
+         */
+        srli     t4, ARG3, 1
+        add      t4, t4, ARG1
+        sb       t1, 0(ARG1)
+        sb       t1, -1(a6)
+        sb       t1, 0(t4)
+        j        setexit
+set4_8:
+        /* 4 <= size < 8:
+         * There is at least 1 byte of overlap
+         * between the two 4-byte writes.
+         * We do this to avoid further branches.
+         */
+        sw       t1, 0(ARG1)
+        sw       t1, -4(a6)
+        j        setexit
+set8_32:
+        /* 8 <= size < 32: */
+        /* Write the first 8 bytes and the last 8 bytes.
+         * They overlap when size < 16.
+         */
+        sd       t1, 0(ARG1)
+        sd       t1, -8(a6)
+        /* If size > 16, the intermediate bytes (dst[8:size-9])
+         * have not been written yet.
+         */
+        li       t6, 16
+        ble      ARG3, t6, setexit
+        sd       t1, 8(ARG1)
+        /* If size > 24, the intermediate bytes (dst[16:size-9])
+         * have not been written yet.
+         */
+        li       t6, 24
+        ble      ARG3, t6, setexit
+        sd       t1, 16(ARG1)
+setexit:
+        mv       a0, t0 /* Restore original dst as return value. */
+        ret
 END_FUNC(memset)
 
 /* See x86.asm notes about needing these to avoid gcc invoking *_chk */
diff --git a/core/io.c b/core/io.c
index 003712928b6..14a4cb32e27 100644
--- a/core/io.c
+++ b/core/io.c
@@ -919,6 +919,7 @@ test_sscanf_all_specs(void)
 # endif
 
 typedef void (*memcpy_t)(void *dst, const void *src, size_t n);
+typedef void (*memset_t)(void *dst, int val, size_t n);
 
 static void
 test_memcpy_offset_size(size_t src_offset, size_t dst_offset, size_t size)
@@ -1004,9 +1005,7 @@ test_our_memset(void)
 static void
 our_memcpy_vs_libc(void)
 {
-    /* Compare our memcpy with libc memcpy.
-     * XXX: Should compare on more sizes, especially small ones.
-     */
+    /* Compare our memcpy with libc memcpy across a range of sizes. */
     size_t alloc_size = 20 * 1024;
     int loop_count = 100 * 1000;
     void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
@@ -1018,27 +1017,74 @@ our_memcpy_vs_libc(void)
     memset(src, -1, alloc_size);
     memset(dst, 0, alloc_size);
 
-    our_memcpy_start = query_time_millis();
+    size_t tests_size[] = { 1, 4, 128, 512, 8192, alloc_size };
+    size_t j;
+    for (j = 0; j < sizeof(tests_size) / sizeof(tests_size[0]); j++) {
+        our_memcpy_start = query_time_millis();
+        for (i = 0; i < loop_count; i++) {
+            memcpy(src, dst, tests_size[j]);
+        }
+        our_memcpy_end = query_time_millis();
+
+        libc_memcpy_start = query_time_millis();
+        for (i = 0; i < loop_count; i++) {
+            glibc_memcpy(src, dst, tests_size[j]);
+        }
+        libc_memcpy_end = query_time_millis();
+
+        our_memcpy_time = our_memcpy_end - our_memcpy_start;
+        libc_memcpy_time = libc_memcpy_end - libc_memcpy_start;
+        print_file(STDERR,
+                   "our_memcpy_time: size=" UINT64_FORMAT_STRING
+                   " time=" UINT64_FORMAT_STRING "\n",
+                   (uint64)tests_size[j], our_memcpy_time);
+        print_file(STDERR,
+                   "libc_memcpy_time: size=" UINT64_FORMAT_STRING
+                   " time=" UINT64_FORMAT_STRING "\n",
+                   (uint64)tests_size[j], libc_memcpy_time);
+    }
+    /* We could assert that we're not too much slower, but that's a recipe for
+     * flaky failures when the suite is run on shared VMs or in parallel.
+     */
+
+    global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
+    global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));
+}
+
+static void
+our_memset_vs_libc(void)
+{
+    /* Compare our memset with libc memset. */
+    size_t alloc_size = 20 * 1024;
+    int loop_count = 100 * 1000;
+    void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
+    void *dst = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
+    int i;
+    memset_t glibc_memset = (memset_t)dlsym(RTLD_NEXT, "memset");
+    uint64 our_memset_start, our_memset_end, our_memset_time;
+    uint64 libc_memset_start, libc_memset_end, libc_memset_time;
+
+    our_memset_start = query_time_millis();
     for (i = 0; i < loop_count; i++) {
-        memcpy(src, dst, alloc_size);
+        memset(src, -1, alloc_size);
+        memset(dst, 0, alloc_size);
     }
-    our_memcpy_end = query_time_millis();
+    our_memset_end = query_time_millis();
 
-    libc_memcpy_start = query_time_millis();
+    libc_memset_start = query_time_millis();
     for (i = 0; i < loop_count; i++) {
-        glibc_memcpy(src, dst, alloc_size);
+        glibc_memset(src, -1, alloc_size);
+        glibc_memset(dst, 0, alloc_size);
     }
-    libc_memcpy_end = query_time_millis();
+    libc_memset_end = query_time_millis();
 
     global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
     global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));
-    our_memcpy_time = our_memcpy_end - our_memcpy_start;
-    libc_memcpy_time = libc_memcpy_end - libc_memcpy_start;
-    print_file(STDERR, "our_memcpy_time: " UINT64_FORMAT_STRING "\n", our_memcpy_time);
-    print_file(STDERR, "libc_memcpy_time: " UINT64_FORMAT_STRING "\n", libc_memcpy_time);
-    /* We could assert that we're not too much slower, but that's a recipe for
-     * flaky failures when the suite is run on shared VMs or in parallel.
-     */
+
+    our_memset_time = our_memset_end - our_memset_start;
+    libc_memset_time = libc_memset_end - libc_memset_start;
+    print_file(STDERR, "our_memset_time: " UINT64_FORMAT_STRING "\n", our_memset_time);
+    print_file(STDERR, "libc_memset_time: " UINT64_FORMAT_STRING "\n", libc_memset_time);
 }
 
 # endif /* UNIX */
@@ -1187,6 +1233,7 @@ unit_test_io(void)
     /* memset tests */
     test_our_memset();
+    our_memset_vs_libc();
 
 # endif /* UNIX */
 
     /* XXX: add more tests */
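
Note (not part of the patch): the 8 <= size < 32 paths in both routines rely on the same overlapping-store trick: write the first and last 8-byte words (which may overlap), then write the words at offsets 8 and 16 only when size exceeds 16 and 24. The stand-alone C sketch below is only a sanity check of that coverage argument; it is hypothetical illustration code, not anything generated by or required for the patch.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Check that the stores issued on the 8 <= n < 32 path touch every byte:
     * an 8-byte store at offset 0, an 8-byte store at offset n-8, and 8-byte
     * stores at offsets 8 and 16 only when n > 16 and n > 24 respectively.
     */
    int
    main(void)
    {
        size_t n, i;
        for (n = 8; n < 32; n++) {
            char covered[32];
            memset(covered, 0, sizeof(covered));
            memset(covered, 1, 8);          /* sd at offset 0 */
            memset(covered + n - 8, 1, 8);  /* sd at offset n-8 (may overlap) */
            if (n > 16)
                memset(covered + 8, 1, 8);  /* sd at offset 8 */
            if (n > 24)
                memset(covered + 16, 1, 8); /* sd at offset 16 */
            for (i = 0; i < n; i++)
                assert(covered[i] == 1);
        }
        printf("8..31-byte store coverage ok\n");
        return 0;
    }

The 4 <= size < 8 and 0 < size < 4 paths follow the same idea with 4-byte and 1-byte accesses (first, last, and the size/2 byte), so only the size-1 and size-2 cases actually write any byte more than once.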