diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 6a5177d641b..c111f631f9c 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -1149,8 +1149,9 @@ if (BUILD_TESTS AND
       append_property_string(TARGET unit_tests LINK_FLAGS "-Wl,${ld_entry_flag},_start")
     endif ()
     if (NOT ANDROID) # everything is inside Bionic on Android
-      target_link_libraries(unit_tests c dl m pthread)
+      target_link_libraries(unit_tests dl m pthread)
     endif ()
+    target_link_libraries(unit_tests drmemfuncs)
     set_preferred_base_start_and_end(unit_tests ${preferred_base} ON)
   else (UNIX)
     # Just like drinjectlib (see above) we need libc before ntdll
diff --git a/core/arch/riscv64/memfuncs.asm b/core/arch/riscv64/memfuncs.asm
index f44751831c2..c7662ed9d9d 100644
--- a/core/arch/riscv64/memfuncs.asm
+++ b/core/arch/riscv64/memfuncs.asm
@@ -41,33 +41,184 @@ START_FILE
 #ifdef UNIX
 /* Private memcpy.
+ * Optimized with loop unrolling and
+ * branchless sequences.
  */
 DECLARE_FUNC(memcpy)
 GLOBAL_LABEL(memcpy:)
-/* TODO i#3544: Naive version, optimize it. */
-        mv       t1, ARG1
-        beqz     ARG3, 2f
-1:      lbu      t2, 0(ARG2)
-        addi     ARG2, ARG2, 1
-        sb       t2, 0(t1)
-        addi     t1, t1, 1
-        addi     ARG3, ARG3, -1
-        bnez     ARG3, 1b
-2:      ret
+        li       t6, 32
+        mv       t0, ARG1 /* Save dst for return. */
+copy32_:
+        /* While size is at least 32, use 4 ld/sd pairs
+         * to copy 4*8=32 bytes per iteration.
+         * Once size drops below 32, jump to copy_remain and
+         * use the other optimized copy paths.
+         */
+        blt      ARG3, t6, copy_remain
+        ld       t1, 0(ARG2)
+        ld       t2, 8(ARG2)
+        ld       t3, 16(ARG2)
+        ld       t4, 24(ARG2)
+        sd       t1, 0(ARG1)
+        sd       t2, 8(ARG1)
+        sd       t3, 16(ARG1)
+        sd       t4, 24(ARG1)
+        addi     ARG3, ARG3, -32
+        addi     ARG1, ARG1, 32
+        addi     ARG2, ARG2, 32
+        j        copy32_
+copy_remain:
+        add      a6, ARG2, ARG3 /* a6 = src + size */
+        add      a7, ARG1, ARG3 /* a7 = dst + size */
+        li       t6, 8
+        bge      ARG3, t6, copy8_32
+        li       t6, 4
+        bge      ARG3, t6, copy4_8
+        bgtz     ARG3, copy0_4
+        j        copyexit
+copy0_4:
+        /* 0 < size < 4:
+         * If the size is 1 or 2,
+         * this does some redundant copies to avoid branches.
+         */
+        srli     t4, ARG3, 1
+        add      t5, t4, ARG1
+        add      t4, t4, ARG2
+        lbu      t1, 0(ARG2)
+        lbu      t2, -1(a6)
+        lbu      t3, 0(t4)
+        sb       t1, 0(ARG1)
+        sb       t2, -1(a7)
+        sb       t3, 0(t5)
+        j        copyexit
+copy4_8:
+        /* 4 <= size < 8:
+         * There is at least 1 byte of overlap
+         * between the two 4-byte copies.
+         * We do this to avoid further branches.
+         */
+        lwu      t1, 0(ARG2)
+        lwu      t2, -4(a6)
+        sw       t1, 0(ARG1)
+        sw       t2, -4(a7)
+        j        copyexit
+copy8_32:
+        /* 8 <= size < 32: */
+        /* Copy the first 8 bytes and the last 8 bytes.
+         * They overlap when size < 16.
+         */
+        ld       t1, 0(ARG2)
+        ld       t2, -8(a6)
+        sd       t1, 0(ARG1)
+        sd       t2, -8(a7)
+        /* If size > 16, the intermediate bytes (src[8:size-9])
+         * have not been copied yet.
+         */
+        li       t6, 16
+        ble      ARG3, t6, copyexit
+        ld       t1, 8(ARG2)
+        sd       t1, 8(ARG1)
+        /* If size > 24, the intermediate bytes (src[16:size-9])
+         * have not been copied yet.
+         */
+        li       t6, 24
+        ble      ARG3, t6, copyexit
+        ld       t1, 16(ARG2)
+        sd       t1, 16(ARG1)
+copyexit:
+        mv       a0, t0 /* Restore original dst as return value. */
+        ret
 END_FUNC(memcpy)
 
 /* Private memset.
+ * Optimized with loop unrolling and
+ * branchless sequences.
  */
 DECLARE_FUNC(memset)
 GLOBAL_LABEL(memset:)
-/* TODO i#3544: Naive version, optimize it. */
-        mv       t1, ARG1
-        beqz     ARG3, 2f
-1:      sb       ARG2, 0(t1)
-        addi     t1, t1, 1
-        addi     ARG3, ARG3, -1
-        bnez     ARG3, 1b
-2:      ret
+        li       t6, 32
+        mv       t0, ARG1 /* Save dst for return. */
+
+        /* Duplicate the single byte value across the whole 8-byte register. */
+        andi     ARG2, ARG2, 0xff
+        mv       t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+        slli     ARG2, ARG2, 8
+        or       t1, t1, ARG2
+set32_:
+        /* While size is at least 32, use 4 sd instructions
+         * to write 4*8=32 bytes per iteration.
+         * Once size drops below 32, jump to set_remain and
+         * use the other optimized paths.
+         */
+        blt      ARG3, t6, set_remain
+        sd       t1, 0(ARG1)
+        sd       t1, 8(ARG1)
+        sd       t1, 16(ARG1)
+        sd       t1, 24(ARG1)
+        addi     ARG3, ARG3, -32
+        addi     ARG1, ARG1, 32
+        j        set32_
+set_remain:
+        add      a6, ARG1, ARG3 /* a6 = dst + size */
+        li       t6, 8
+        bge      ARG3, t6, set8_32
+        li       t6, 4
+        bge      ARG3, t6, set4_8
+        bgtz     ARG3, set0_4
+        j        setexit
+set0_4:
+        /* 0 < size < 4:
+         * If the size is 1 or 2,
+         * this does some redundant writes to avoid branches.
+         */
+        srli     t4, ARG3, 1
+        add      t4, t4, ARG1
+        sb       t1, 0(ARG1)
+        sb       t1, -1(a6)
+        sb       t1, 0(t4)
+        j        setexit
+set4_8:
+        /* 4 <= size < 8:
+         * There is at least 1 byte of overlap
+         * between the two 4-byte writes.
+         * We do this to avoid further branches.
+         */
+        sw       t1, 0(ARG1)
+        sw       t1, -4(a6)
+        j        setexit
+set8_32:
+        /* 8 <= size < 32: */
+        /* Write the first 8 bytes and the last 8 bytes.
+         * They overlap when size < 16.
+         */
+        sd       t1, 0(ARG1)
+        sd       t1, -8(a6)
+        /* If size > 16, the intermediate bytes (dst[8:size-9])
+         * have not been written yet.
+         */
+        li       t6, 16
+        ble      ARG3, t6, setexit
+        sd       t1, 8(ARG1)
+        /* If size > 24, the intermediate bytes (dst[16:size-9])
+         * have not been written yet.
+         */
+        li       t6, 24
+        ble      ARG3, t6, setexit
+        sd       t1, 16(ARG1)
+setexit:
+        mv       a0, t0 /* Restore original dst as return value. */
+        ret
 END_FUNC(memset)
 
 /* See x86.asm notes about needing these to avoid gcc invoking *_chk */
diff --git a/core/io.c b/core/io.c
index 003712928b6..14a4cb32e27 100644
--- a/core/io.c
+++ b/core/io.c
@@ -919,6 +919,7 @@ test_sscanf_all_specs(void)
 # endif
 
 typedef void (*memcpy_t)(void *dst, const void *src, size_t n);
+typedef void (*memset_t)(void *dst, int val, size_t n);
 
 static void
 test_memcpy_offset_size(size_t src_offset, size_t dst_offset, size_t size)
@@ -1004,9 +1005,7 @@ test_our_memset(void)
 static void
 our_memcpy_vs_libc(void)
 {
-    /* Compare our memcpy with libc memcpy.
-     * XXX: Should compare on more sizes, especially small ones.
-     */
+    /* Compare our memcpy with libc memcpy across a range of sizes. */
     size_t alloc_size = 20 * 1024;
     int loop_count = 100 * 1000;
     void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
@@ -1018,27 +1017,74 @@ our_memcpy_vs_libc(void)
     memset(src, -1, alloc_size);
     memset(dst, 0, alloc_size);
 
-    our_memcpy_start = query_time_millis();
+    size_t tests_size[] = { 1, 4, 128, 512, 8192, alloc_size };
+    size_t j;
+    for (j = 0; j < sizeof(tests_size) / sizeof(tests_size[0]); j++) {
+        our_memcpy_start = query_time_millis();
+        for (i = 0; i < loop_count; i++) {
+            memcpy(src, dst, tests_size[j]);
+        }
+        our_memcpy_end = query_time_millis();
+
+        libc_memcpy_start = query_time_millis();
+        for (i = 0; i < loop_count; i++) {
+            glibc_memcpy(src, dst, tests_size[j]);
+        }
+        libc_memcpy_end = query_time_millis();
+
+        our_memcpy_time = our_memcpy_end - our_memcpy_start;
+        libc_memcpy_time = libc_memcpy_end - libc_memcpy_start;
+        print_file(STDERR,
+                   "our_memcpy_time: size=" UINT64_FORMAT_STRING
+                   " time=" UINT64_FORMAT_STRING "\n",
+                   (uint64)tests_size[j], our_memcpy_time);
+        print_file(STDERR,
+                   "libc_memcpy_time: size=" UINT64_FORMAT_STRING
+                   " time=" UINT64_FORMAT_STRING "\n",
+                   (uint64)tests_size[j], libc_memcpy_time);
+    }
+    /* We could assert that we're not too much slower, but that's a recipe for
+     * flaky failures when the suite is run on shared VMs or in parallel.
+     */
+
+    global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
+    global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));
+}
+
+static void
+our_memset_vs_libc(void)
+{
+    /* Compare our memset with libc memset. */
+    size_t alloc_size = 20 * 1024;
+    int loop_count = 100 * 1000;
+    void *src = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
+    void *dst = global_heap_alloc(alloc_size HEAPACCT(ACCT_OTHER));
+    int i;
+    memset_t glibc_memset = (memset_t)dlsym(RTLD_NEXT, "memset");
+    uint64 our_memset_start, our_memset_end, our_memset_time;
+    uint64 libc_memset_start, libc_memset_end, libc_memset_time;
+
+    our_memset_start = query_time_millis();
     for (i = 0; i < loop_count; i++) {
-        memcpy(src, dst, alloc_size);
+        memset(src, -1, alloc_size);
+        memset(dst, 0, alloc_size);
     }
-    our_memcpy_end = query_time_millis();
+    our_memset_end = query_time_millis();
 
-    libc_memcpy_start = query_time_millis();
+    libc_memset_start = query_time_millis();
     for (i = 0; i < loop_count; i++) {
-        glibc_memcpy(src, dst, alloc_size);
+        glibc_memset(src, -1, alloc_size);
+        glibc_memset(dst, 0, alloc_size);
     }
-    libc_memcpy_end = query_time_millis();
+    libc_memset_end = query_time_millis();
 
     global_heap_free(src, alloc_size HEAPACCT(ACCT_OTHER));
     global_heap_free(dst, alloc_size HEAPACCT(ACCT_OTHER));
-    our_memcpy_time = our_memcpy_end - our_memcpy_start;
-    libc_memcpy_time = libc_memcpy_end - libc_memcpy_start;
-    print_file(STDERR, "our_memcpy_time: " UINT64_FORMAT_STRING "\n", our_memcpy_time);
-    print_file(STDERR, "libc_memcpy_time: " UINT64_FORMAT_STRING "\n", libc_memcpy_time);
-    /* We could assert that we're not too much slower, but that's a recipe for
-     * flaky failures when the suite is run on shared VMs or in parallel.
-     */
+
+    our_memset_time = our_memset_end - our_memset_start;
+    libc_memset_time = libc_memset_end - libc_memset_start;
+    print_file(STDERR, "our_memset_time: " UINT64_FORMAT_STRING "\n", our_memset_time);
+    print_file(STDERR, "libc_memset_time: " UINT64_FORMAT_STRING "\n", libc_memset_time);
 }
 
 # endif /* UNIX */
@@ -1187,6 +1233,7 @@ unit_test_io(void)
     /* memset tests */
     test_our_memset();
+    our_memset_vs_libc();
 
 # endif /* UNIX */
 
     /* XXX: add more tests */
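
Note (not part of the patch): the 8 <= size < 32 paths in both routines rely on the same overlapping-store trick: write the first and last 8-byte words (which may overlap), then write the words at offsets 8 and 16 only when size exceeds 16 and 24. The stand-alone C sketch below is only a sanity check of that coverage argument; it is hypothetical illustration code, not anything generated by or required for the patch.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Check that the stores issued on the 8 <= n < 32 path touch every byte:
     * an 8-byte store at offset 0, an 8-byte store at offset n-8, and 8-byte
     * stores at offsets 8 and 16 only when n > 16 and n > 24 respectively.
     */
    int
    main(void)
    {
        size_t n, i;
        for (n = 8; n < 32; n++) {
            char covered[32];
            memset(covered, 0, sizeof(covered));
            memset(covered, 1, 8);          /* sd at offset 0 */
            memset(covered + n - 8, 1, 8);  /* sd at offset n-8 (may overlap) */
            if (n > 16)
                memset(covered + 8, 1, 8);  /* sd at offset 8 */
            if (n > 24)
                memset(covered + 16, 1, 8); /* sd at offset 16 */
            for (i = 0; i < n; i++)
                assert(covered[i] == 1);
        }
        printf("8..31-byte store coverage ok\n");
        return 0;
    }

The 4 <= size < 8 and 0 < size < 4 paths follow the same idea with 4-byte and 1-byte accesses (first, last, and the size/2 byte), so only the size-1 and size-2 cases actually write any byte more than once.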