Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't use the heap when scanning for utf8 strings ##bin #23020

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 91 additions & 6 deletions libr/bin/bfile.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* radare2 - LGPL - Copyright 2009-2024 - pancake, nibble, dso */

#define NEW_MASTER 0
#include <r_bin.h>
#include <r_hash.h>
#include "i/private.h"
Expand Down Expand Up @@ -101,8 +102,15 @@ static void print_string(RBinFile *bf, RBinString *string, int raw, PJ *pj) {
}
}

// this is always ok
// TODO: this code must be implemented in RSearch as options for the strings mode
static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) {
static int string_scan_range(R_NULLABLE RList *list, RBinFile *bf, int min, const ut64 from, const ut64 to, int type, int raw, RBinSection *section) {
R_RETURN_VAL_IF_FAIL (bf, -1);
#if R2_USE_NEW_ABI
int utf_list_size = 0;
int *utf_list = NULL;
int *utf_freq = NULL;
#endif
RBin *bin = bf->rbin;
const bool strings_nofp = bin->strings_nofp;
ut8 tmp[64]; // temporal buffer to encode characters in utf8 form
Expand All @@ -120,9 +128,6 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
maxstr = R_STRING_SCAN_BUFFER_SIZE;
}

// if list is null it means its gonna dump
R_RETURN_VAL_IF_FAIL (bf, -1);

if (type == -1) {
type = R_STRING_TYPE_DETECT;
}
Expand Down Expand Up @@ -312,15 +317,20 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
// back up past the \0 to the last char just in case it starts a wide string
needle -= 2;
}
// TODO: allow the user to filter strings by type at scan time, this is, dont expect utf32 or utf16 strings
if (runes >= min) {
const char *tmpstr = r_strbuf_get (sb);
size_t tmplen = r_strbuf_length (sb);
// reduce false positives
#if R2_USE_NEW_ABI
int j, num_blocks, *block_list;
#else
int j, num_blocks;
#endif
int *freq_list = NULL, expected_ascii, actual_ascii, num_chars;
if (str_type == R_STRING_TYPE_ASCII) {
for (j = 0; j < tmplen; j++) {
char ch = tmpstr[j];
const char ch = tmpstr[j];
if (ch != '\n' && ch != '\r' && ch != '\t') {
if (!IS_PRINTABLE (ch)) {
continue;
Expand All @@ -332,13 +342,68 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
case R_STRING_TYPE_UTF8:
case R_STRING_TYPE_WIDE:
case R_STRING_TYPE_WIDE32:
#if R2_USE_NEW_ABI
if (tmplen > utf_list_size) {
// not here
int newsize = tmplen + 128;
int *a = realloc (utf_list, sizeof (int) * newsize);
int *b = realloc (freq_list, sizeof (int) * newsize);
if (a && b) {
utf_list_size = newsize;
utf_list = a;
utf_freq = b;
} else {
R_LOG_ERROR ("Cannot allocate %d", tmplen);
return 0;
}
}
// freq_list = (str_type == R_STRING_TYPE_WIDE || str_type == R_STRING_TYPE_WIDE32)? utf_freq: NULL;
freq_list = (str_type == R_STRING_TYPE_WIDE)? utf_freq: NULL;
num_blocks = r_utf_block_list2 ((const ut8*)tmpstr, tmplen - 1, utf_list, freq_list);
if (freq_list) {
// not here
num_chars = 0;
actual_ascii = 0;
for (j = 0; j < num_blocks; j++) {
num_chars += freq_list[j];
if (!utf_list[j]) { // ASCII
actual_ascii += freq_list[j];
}
}
expected_ascii = num_blocks ? num_chars / num_blocks : 0;
if (actual_ascii > expected_ascii) {
ascii_only = true;
if (str_type == R_STRING_TYPE_UTF8) {
str_type = R_STRING_TYPE_ASCII;
R_LOG_DEBUG ("ascii string miss identified as utf8");
break;
}
needle = str_start;
continue;
}
}
if (num_blocks > R_STRING_MAX_UNI_BLOCKS) {
needle++;
continue;
}
#else
num_blocks = 0;
int *block_list = r_utf_block_list ((const ut8*)tmpstr, tmplen - 1,
str_type == R_STRING_TYPE_WIDE? &freq_list: NULL);
if (block_list) {
for (j = 0; block_list[j] != -1; j++) {
num_blocks++;
}
#if NEW_MASTER
if (num_blocks > 0) {
num_blocks--;
}
#if 0
for (j = 0; block_list[j] != -1 && block_list[j] < 200; j++) {
num_blocks++;
}
#endif
#endif
}
if (freq_list) {
num_chars = 0;
Expand All @@ -353,16 +418,30 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
expected_ascii = num_blocks ? num_chars / num_blocks : 0;
if (actual_ascii > expected_ascii) {
ascii_only = true;
#if NEW_MASTER
if (str_type == R_STRING_TYPE_UTF8) {
str_type = R_STRING_TYPE_ASCII;
R_LOG_DEBUG ("ascii string miss identified as utf8");
}
R_FREE (block_list);
if (str_start > needle) {
needle = str_start;
continue;
}
break;
#else
needle = str_start;
R_FREE (block_list);
continue;
#endif
}
}
R_FREE (block_list);
if (num_blocks > R_STRING_MAX_UNI_BLOCKS) {
needle++;
continue;
}
#endif
}
RBinString *bs = R_NEW0 (RBinString);
if (!bs) {
Expand All @@ -378,6 +457,7 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
break;
}
// TODO: move into adjust_offset
#if 1
switch (str_type) {
case R_STRING_TYPE_WIDE:
if (str_start - from > 1) {
Expand All @@ -396,6 +476,7 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
}
break;
}
#endif
if (!s) {
if (section) {
s = section;
Expand All @@ -409,7 +490,7 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
}
ut64 baddr = bf->loadaddr && bf->bo? bf->bo->baddr: bf->loadaddr;
bs->paddr = str_start + baddr;
bs->vaddr = str_start - pdelta + vdelta + baddr;
bs->vaddr = str_start + baddr + vdelta - pdelta;
bs->string = r_strbuf_drain (sb);
sb = r_strbuf_new ("");
if (strings_nofp) {
Expand Down Expand Up @@ -445,6 +526,10 @@ static int string_scan_range(RList *list, RBinFile *bf, int min, const ut64 from
pj_free (pj);
}
r_strbuf_free (sb);
#if R2_USE_NEW_ABI
free (utf_list);
free (utf_freq);
#endif
return bf->string_count;
}

Expand Down
5 changes: 4 additions & 1 deletion libr/include/r_util/r_utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ R_API char *r_utf16_to_utf8_l(const wchar_t *wc, int len);
R_API const char *r_utf_block_name(int idx);
R_API wchar_t *r_utf8_to_utf16_l(const char *cstring, int len);
R_API int r_utf_block_idx(RRune ch);
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list);
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list); // XXX deprecate
#if R2_USE_NEW_ABI
R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list);
#endif
R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen);
#define r_utf16_to_utf8(wc) r_utf16_to_utf8_l ((wchar_t *)wc, -1)
#define r_utf8_to_utf16(cstring) r_utf8_to_utf16_l ((char *)cstring, -1)
Expand Down
87 changes: 84 additions & 3 deletions libr/util/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -766,11 +766,92 @@ R_API int r_utf_block_idx(RRune ch) {
return R_UTF_BLOCKS_COUNT - 1; /* index for "No_Block" */
}

#if R2_USE_NEW_ABI
// Collect the distinct Unicode blocks used by a UTF-8 buffer, writing the
// results into caller-provided arrays instead of allocating them.
//
// str:       bytes to inspect (expected to be UTF-8 encoded)
// len:       number of bytes of str to inspect; must be >= 0
// list:      output array, receives the block indexes in first-seen order
//            followed by a -1 terminator; needs room for (len + 1) ints
// freq_list: optional output array (may be NULL); entry i receives how many
//            runes fell in block list[i], followed by a -1 terminator; needs
//            the same capacity as list
//
// Returns the number of block indexes stored in list (terminator excluded),
// or 0 when there is nothing to scan or an argument is missing.
R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list) {
	R_RETURN_VAL_IF_FAIL (len >= 0, 0);
	if (!str || !list || len < 1) {
		return 0;
	}
	// per-block rune counters; a zero entry means "block not seen yet"
	int block_freq[R_UTF_BLOCKS_COUNT] = {0};
	int num_blocks = 0;
	const ut8 *str_ptr = str;
	const ut8 *str_end = str + len;
	while (str_ptr < str_end) {
		RRune ch;
		const int runesize = r_utf8_decode (str_ptr, str_end - str_ptr, &ch);
		if (runesize < 0) {
			// decoder reported an error, nothing to account for
			break;
		}
		// an undecodable sequence is accounted as one rune of the last
		// block index ("No_Block"), through the same bookkeeping as any
		// other rune so it is never listed twice nor reported with a zero
		// frequency
		const int block_idx = runesize
			? r_utf_block_idx (ch)
			: R_UTF_BLOCKS_COUNT - 1;
		if (!block_freq[block_idx]) {
			list[num_blocks++] = block_idx;
		}
		block_freq[block_idx]++;
		if (!runesize) {
			// scanning stops at the first sequence that cannot be decoded
			break;
		}
		str_ptr += runesize;
	}
	list[num_blocks] = -1;
	if (freq_list) {
		int i;
		for (i = 0; i < num_blocks; i++) {
			freq_list[i] = block_freq[list[i]];
		}
		freq_list[num_blocks] = -1;
	}
	return num_blocks;
}
#else
// Compatibility implementation of r_utf_block_list2 built on top of
// r_utf_block_list: the heap arrays returned by that function are copied
// into the caller-provided buffers and then released.
//
// str:       bytes to inspect (expected to be UTF-8 encoded)
// len:       number of bytes of str to inspect
// list:      output array, receives the block indexes followed by a -1
//            terminator; needs room for (len + 1) ints
// freq_list: optional output array (may be NULL); receives one counter per
//            entry of list followed by a -1 terminator
//
// Returns the number of block indexes stored in list (terminator excluded),
// or 0 when list is NULL or r_utf_block_list yields no result.
R_API int r_utf_block_list2(const ut8 *str, int len, int *list, int *freq_list) {
	if (!list) {
		return 0;
	}
	// Request frequencies only when the caller wants them. The result goes
	// into a local pointer: handing out the address of the freq_list
	// parameter would drop the caller's buffer and leak the returned array.
	int *heap_freq = NULL;
	int *block_list = r_utf_block_list (str, len, freq_list? &heap_freq: NULL);
	if (!block_list) {
		// NOTE(review): heap_freq is presumably left NULL on failure -- confirm
		free (heap_freq);
		return 0;
	}
	int num_blocks = 0;
	while (block_list[num_blocks] != -1) {
		list[num_blocks] = block_list[num_blocks];
		if (heap_freq) {
			// assumes the frequency array has one entry per block entry,
			// as the consumers in this codebase index both in lockstep
			freq_list[num_blocks] = heap_freq[num_blocks];
		}
		num_blocks++;
	}
	list[num_blocks] = -1;
	if (heap_freq) {
		freq_list[num_blocks] = -1;
	}
	free (heap_freq);
	free (block_list);
	return num_blocks;
}
#endif

/* str must be UTF8-encoded */
// R2_600 DEPRECATE THIS
R_API int *r_utf_block_list(const ut8 *str, int len, int **freq_list) {
if (!str) {
return NULL;
}
if (len < 0) {
len = strlen ((const char *)str);
}
Expand Down
21 changes: 17 additions & 4 deletions test/db/cmd/cmd_iz
Original file line number Diff line number Diff line change
Expand Up @@ -542,12 +542,25 @@ RUN

NAME=ascii substring detection (#14499)
FILE=bins/pe/Reborn_Stub-strings.exe
CMDS=izzq~pomf
EXPECT=<<EOF
0x489d8a 53 26 http://pomf.cat/upload.php
0x489edd 39 19 https://a.pomf.cat/
0x48b9ed 53 26 http://pomf.cat/upload.php
0x48ba22 40 19 https://a.pomf.cat/
EOF
RUN

# NOTE=R2_600 - broken with newabi because we now find more utf32 strings
NAME=ascii substring detection
FILE=bins/pe/Reborn_Stub-strings.exe
BROKEN=1
CMDS=izz~pomf
EXPECT=<<EOF
7130 0x00087f8a 0x00489d8a 26 53 .text utf16le http://pomf.cat/upload.php
7135 0x000880dd 0x00489edd 19 39 .text utf16le https://a.pomf.cat/
7298 0x00089bed 0x0048b9ed 26 53 .text utf16le http://pomf.cat/upload.php
7299 0x00089c22 0x0048ba22 19 40 .text utf16le https://a.pomf.cat/
7140 0x00087f8a 0x00489d8a 26 53 .text utf16le http://pomf.cat/upload.php
7145 0x000880dd 0x00489edd 19 39 .text utf16le https://a.pomf.cat/
7308 0x00089bed 0x0048b9ed 26 53 .text utf16le http://pomf.cat/upload.php
7309 0x00089c22 0x0048ba22 19 40 .text utf16le https://a.pomf.cat/
EOF
RUN

Expand Down
Loading
Loading