From 0535de5257f11f5c9eba89767cad5edbb22b8b02 Mon Sep 17 00:00:00 2001 From: Albert Cervin Date: Fri, 23 Aug 2024 17:07:27 +0200 Subject: [PATCH] Overhaul unicode parsing It now instead iterates the actual unicode code points. This is better than what it was previously doing but it is still not entirely correct w.r.t. unicode sequences. This handling of unicode code points does however make it slightly easier to handle UTF-16 if needed in the future. This also adds some long-needed tests for buffer methods. --- dged.nix | 5 + src/dged/buffer.c | 473 +++++++++++++++++++++++++---------------- src/dged/buffer.h | 44 ++-- src/dged/buffer_view.c | 40 ++-- src/dged/display.c | 33 ++- src/dged/display.h | 2 +- src/dged/keyboard.c | 32 +-- src/dged/syntax.c | 68 +++--- src/dged/text.c | 233 +++++++------------- src/dged/text.h | 54 +++-- src/dged/utf8.c | 152 ++++++++----- src/dged/utf8.h | 28 ++- src/main/cmds.c | 2 +- src/main/completion.c | 108 ++++------ sune.txt | 4 + test/buffer.c | 152 ++++++++++++- test/main.c | 5 +- test/text.c | 70 ++---- test/utf8.c | 7 +- 19 files changed, 848 insertions(+), 664 deletions(-) create mode 100644 sune.txt diff --git a/dged.nix b/dged.nix index a8f1a8f..6f68df9 100644 --- a/dged.nix +++ b/dged.nix @@ -10,6 +10,8 @@ , valgrind , linkFarm , fetchFromGitHub +, glibcLocalesUtf8 +, strace }: stdenv.mkDerivation { name = "dged"; @@ -32,6 +34,9 @@ stdenv.mkDerivation { bmake docs ''; + # needed for tests to work in sandboxed builds + LOCALE_ARCHIVE = "${glibcLocalesUtf8}/lib/locale/locale-archive"; + TREESITTER_GRAMMARS = with tree-sitter-grammars; linkFarm "tree-sitter-grammars" rec { "bash" = tree-sitter-bash; diff --git a/src/dged/buffer.c b/src/dged/buffer.c index 6051f69..1062a47 100644 --- a/src/dged/buffer.c +++ b/src/dged/buffer.c @@ -157,6 +157,42 @@ void buffer_static_teardown() { } } +static uint32_t get_tab_width(struct buffer *buffer) { + struct setting *tw = lang_setting(&buffer->lang, "tab-width"); + if (tw == NULL) { + 
tw = settings_get("editor.tab-width"); + } + + uint32_t tab_width = 4; + if (tw != NULL && tw->value.type == Setting_Number) { + tab_width = tw->value.number_value; + } + return tab_width; +} + +static bool use_tabs(struct buffer *buffer) { + struct setting *ut = lang_setting(&buffer->lang, "use-tabs"); + if (ut == NULL) { + ut = settings_get("editor.use-tabs"); + } + + bool use_tabs = false; + if (ut != NULL && ut->value.type == Setting_Bool) { + use_tabs = ut->value.bool_value; + } + + return use_tabs; +} + +static uint32_t visual_char_width(struct codepoint *codepoint, + uint32_t tab_width) { + if (codepoint->codepoint == '\t') { + return tab_width; + } else { + return unicode_visual_char_width(codepoint); + } +} + static struct buffer create_internal(const char *name, char *filename) { struct buffer b = (struct buffer){ .filename = filename, @@ -185,7 +221,7 @@ static struct buffer create_internal(const char *name, char *filename) { static void strip_final_newline(struct buffer *b) { uint32_t nlines = text_num_lines(b->text); - if (nlines > 0 && text_line_length(b->text, nlines - 1) == 0) { + if (nlines > 0 && buffer_line_length(b, nlines - 1) == 0) { text_delete(b->text, nlines - 1, 0, nlines - 1, 1); } } @@ -207,7 +243,7 @@ static void buffer_read_from_file(struct buffer *b) { int bytes = fread(buff, 1, 4096, file); if (bytes > 0) { uint32_t ignore; - text_append(b->text, buff, bytes, &ignore, &ignore); + text_append(b->text, buff, bytes, &ignore); } else if (bytes == 0) { break; // EOF } else { @@ -239,70 +275,66 @@ static void write_line(struct text_chunk *chunk, void *userdata) { fputc('\n', file); } -static bool is_word_break(uint8_t c) { +static bool is_word_break(const struct codepoint *codepoint) { + uint32_t c = codepoint->codepoint; return c == ' ' || c == '.' 
|| c == '(' || c == ')' || c == '[' || c == ']' || - c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':'; + c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':' || + c == '"'; } -static bool is_word_char(uint8_t c) { return !is_word_break(c); } - -struct match_result { - struct location at; - bool found; -}; - -static struct match_result find_next_in_line(struct buffer *buffer, - struct location start, - bool (*predicate)(uint8_t c)) { - struct text_chunk line = text_get_line(buffer->text, start.line); - bool found = false; +static bool is_word_char(const struct codepoint *c) { + return !is_word_break(c); +} - if (line.nbytes == 0) { +static struct match_result +find_next_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)) { + if (text_line_size(buffer->text, start.line) == 0) { return (struct match_result){.at = start, .found = false}; } - uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col); - while (bytei < line.nbytes) { - if (predicate(line.text[bytei])) { + bool found = false; + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, start.line); + uint32_t coli = 0, tab_width = get_tab_width(buffer); + struct codepoint *codepoint; + while ((codepoint = utf8_next_codepoint(&iter)) != NULL) { + if (coli >= start.col && predicate(codepoint)) { found = true; break; } - ++bytei; + + coli += visual_char_width(codepoint, tab_width); } - uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei); return (struct match_result){ - .at = (struct location){.line = start.line, .col = target_col}, - .found = found}; + .at = (struct location){.line = start.line, .col = coli}, .found = found}; } -static struct match_result find_prev_in_line(struct buffer *buffer, - struct location start, - bool (*predicate)(uint8_t c)) { - struct text_chunk line = text_get_line(buffer->text, start.line); - bool found = false; +static struct 
match_result +find_prev_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)) { - if (line.nbytes == 0) { + if (text_line_size(buffer->text, start.line) == 0) { return (struct match_result){.at = start, .found = false}; } - uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col); - while (bytei > 0) { - if (predicate(line.text[bytei])) { + bool found = false; + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, start.line); + uint32_t coli = 0, tab_width = get_tab_width(buffer), found_at; + struct codepoint *codepoint; + while (coli < start.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) { + if (predicate(codepoint)) { found = true; - break; + found_at = coli; } - --bytei; - } - - // first byte on line can also be a match - if (predicate(line.text[bytei])) { - found = true; + coli += visual_char_width(codepoint, tab_width); } - uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei); return (struct match_result){ - .at = (struct location){.line = start.line, .col = target_col}, + .at = + (struct location){.line = start.line, .col = found ? 
found_at : coli}, .found = found}; } @@ -315,13 +347,52 @@ static struct text_chunk *copy_region(struct buffer *buffer, free(curr->text); } + struct location begin_bytes = + buffer_location_to_byte_coords(buffer, region.begin); + struct location end_bytes = + buffer_location_to_byte_coords(buffer, region.end); + struct text_chunk txt = - text_get_region(buffer->text, region.begin.line, region.begin.col, - region.end.line, region.end.col); + text_get_region(buffer->text, begin_bytes.line, begin_bytes.col, + end_bytes.line, end_bytes.col); *curr = txt; return curr; } +static struct location do_indent(struct buffer *buffer, struct location at, + uint32_t tab_width, bool use_tabs) { + if (use_tabs) { + return buffer_add(buffer, at, (uint8_t *)"\t", 1); + } else { + return buffer_add(buffer, at, (uint8_t *)" ", + tab_width > 16 ? 16 : tab_width); + } +} + +static uint64_t to_global_offset(struct buffer *buffer, + struct location bytecoords) { + uint32_t line = bytecoords.line; + uint32_t col = bytecoords.col; + uint32_t byteoff = 0; + uint32_t nlines = buffer_num_lines(buffer); + + if (nlines == 0) { + return 0; + } + + for (uint32_t l = 0; l < line && l < nlines; ++l) { + // +1 for newline + byteoff += text_line_size(buffer->text, l) + 1; + } + + // handle last line + uint32_t l = line < nlines ? line : nlines - 1; + uint32_t nbytes = text_line_size(buffer->text, l); + byteoff += col <= nbytes ? 
col : nbytes + 1; + + return byteoff; +} + /* --------------------- buffer methods -------------------- */ struct buffer buffer_create(const char *name) { @@ -452,18 +523,29 @@ struct location buffer_add(struct buffer *buffer, struct location at, struct location initial = at; struct location final = at; - uint32_t lines_added, cols_added; - text_insert_at(buffer->text, initial.line, initial.col, text, nbytes, - &lines_added, &cols_added); + struct location at_bytes = buffer_location_to_byte_coords(buffer, at); + + uint32_t lines_added; + text_insert_at(buffer->text, at_bytes.line, at_bytes.col, text, nbytes, + &lines_added); // move to after inserted text if (lines_added > 0) { final = buffer_clamp(buffer, (int64_t)at.line + lines_added, 0); } else { + uint32_t cols_added = 0, tab_width = get_tab_width(buffer); + struct utf8_codepoint_iterator iter = + create_utf8_codepoint_iterator(text, nbytes, 0); + struct codepoint *codepoint; + while ((codepoint = utf8_next_codepoint(&iter)) != NULL) { + cols_added += visual_char_width(codepoint, tab_width); + } final = buffer_clamp(buffer, (int64_t)at.line, (int64_t)at.col + cols_added); } + struct location final_bytes = buffer_location_to_byte_coords(buffer, final); + undo_push_add( &buffer->undo, (struct undo_add){.begin = {.row = initial.line, .col = initial.col}, @@ -474,11 +556,17 @@ struct location buffer_add(struct buffer *buffer, struct location at, (struct undo_boundary){.save_point = false}); } - uint32_t begin_idx = text_global_idx(buffer->text, initial.line, initial.col); - uint32_t end_idx = text_global_idx(buffer->text, final.line, final.col); + uint32_t begin_idx = to_global_offset(buffer, at_bytes); + uint32_t end_idx = to_global_offset(buffer, final_bytes); VEC_FOR_EACH(&buffer->hooks->insert_hooks, struct insert_hook * h) { - h->callback(buffer, region_new(initial, final), begin_idx, end_idx, + h->callback(buffer, + (struct edit_location){ + .coordinates = region_new(initial, final), + .bytes = 
region_new(at_bytes, final_bytes), + .global_byte_begin = begin_idx, + .global_byte_end = end_idx, + }, h->userdata); } @@ -488,15 +576,16 @@ struct location buffer_add(struct buffer *buffer, struct location at, struct location buffer_set_text(struct buffer *buffer, uint8_t *text, uint32_t nbytes) { - uint32_t lines, cols; + uint32_t lines_added; text_clear(buffer->text); - text_append(buffer->text, text, nbytes, &lines, &cols); + text_append(buffer->text, text, nbytes, &lines_added); // if last line is empty, remove it strip_final_newline(buffer); - return buffer_clamp(buffer, lines, cols); + return buffer_clamp(buffer, lines_added, + buffer_line_length(buffer, lines_added)); } void buffer_clear(struct buffer *buffer) { text_clear(buffer->text); } @@ -524,9 +613,18 @@ struct location buffer_previous_char(struct buffer *buffer, } --dot.line; - dot.col = buffer_num_chars(buffer, dot.line); + dot.col = buffer_line_length(buffer, dot.line); } else { - --dot.col; + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, dot.line); + struct codepoint *codepoint; + uint32_t coli = 0, tab_width = get_tab_width(buffer), last_width = 0; + while (coli < dot.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) { + last_width = visual_char_width(codepoint, tab_width); + coli += last_width; + } + + dot.col = coli - last_width; } return dot; @@ -571,14 +669,14 @@ struct location buffer_previous_line(struct buffer *buffer, } --dot.line; - uint32_t nchars = buffer_num_chars(buffer, dot.line); + uint32_t nchars = buffer_line_length(buffer, dot.line); uint32_t new_col = dot.col > nchars ? nchars : dot.col; return dot; } struct location buffer_next_char(struct buffer *buffer, struct location dot) { - if (dot.col == buffer_num_chars(buffer, dot.line)) { + if (dot.col == buffer_line_length(buffer, dot.line)) { uint32_t lastline = buffer->lazy_row_add ? 
buffer_num_lines(buffer) : buffer_num_lines(buffer) - 1; if (dot.line == lastline) { @@ -588,7 +686,16 @@ struct location buffer_next_char(struct buffer *buffer, struct location dot) { dot.col = 0; ++dot.line; } else { - ++dot.col; + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, dot.line); + struct codepoint *codepoint; + uint32_t coli = 0; + while (coli <= dot.col && + (codepoint = utf8_next_codepoint(&iter)) != NULL) { + coli += visual_char_width(codepoint, get_tab_width(buffer)); + } + + dot.col = coli; } return dot; @@ -635,7 +742,7 @@ struct location buffer_next_line(struct buffer *buffer, struct location dot) { ++dot.line; uint32_t new_col = dot.col; - uint32_t nchars = buffer_num_chars(buffer, dot.line); + uint32_t nchars = buffer_line_length(buffer, dot.line); new_col = new_col > nchars ? nchars : new_col; return dot; @@ -664,8 +771,8 @@ struct location buffer_clamp(struct buffer *buffer, int64_t line, int64_t col) { // clamp col if (col < 0) { col = 0; - } else if (col > buffer_num_chars(buffer, line)) { - col = buffer_num_chars(buffer, line); + } else if (col > buffer_line_length(buffer, line)) { + col = buffer_line_length(buffer, line); } location.col = col; @@ -681,7 +788,7 @@ struct location buffer_end(struct buffer *buffer) { return (struct location){.line = nlines, .col = 0}; } else { return (struct location){.line = nlines - 1, - .col = buffer_num_chars(buffer, nlines - 1)}; + .col = buffer_line_length(buffer, nlines - 1)}; } } @@ -689,55 +796,22 @@ uint32_t buffer_num_lines(struct buffer *buffer) { return text_num_lines(buffer->text); } -uint32_t buffer_num_chars(struct buffer *buffer, uint32_t line) { - if (line >= buffer_num_lines(buffer)) { - return 0; +uint32_t buffer_line_length(struct buffer *buffer, uint32_t line) { + uint32_t tab_size = get_tab_width(buffer), len = 0; + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, line); + struct codepoint *codepoint; + while 
((codepoint = utf8_next_codepoint(&iter)) != NULL) { + len += visual_char_width(codepoint, tab_size); } - return text_line_length(buffer->text, line); + return len; } struct location buffer_newline(struct buffer *buffer, struct location at) { return buffer_add(buffer, at, (uint8_t *)"\n", 1); } -static uint32_t get_tab_width(struct buffer *buffer) { - struct setting *tw = lang_setting(&buffer->lang, "tab-width"); - if (tw == NULL) { - tw = settings_get("editor.tab-width"); - } - - uint32_t tab_width = 4; - if (tw != NULL && tw->value.type == Setting_Number) { - tab_width = tw->value.number_value; - } - return tab_width; -} - -static bool use_tabs(struct buffer *buffer) { - struct setting *ut = lang_setting(&buffer->lang, "use-tabs"); - if (ut == NULL) { - ut = settings_get("editor.use-tabs"); - } - - bool use_tabs = false; - if (ut != NULL && ut->value.type == Setting_Bool) { - use_tabs = ut->value.bool_value; - } - - return use_tabs; -} - -static struct location do_indent(struct buffer *buffer, struct location at, - uint32_t tab_width, bool use_tabs) { - if (use_tabs) { - return buffer_add(buffer, at, (uint8_t *)"\t", 1); - } else { - return buffer_add(buffer, at, (uint8_t *)" ", - tab_width > 16 ? 
16 : tab_width); - } -} - struct location buffer_indent(struct buffer *buffer, struct location at) { return do_indent(buffer, at, get_tab_width(buffer), use_tabs(buffer)); } @@ -778,16 +852,13 @@ struct location buffer_undo(struct buffer *buffer, struct location dot) { case Undo_Add: { struct undo_add *add = &rec->add; - pos = - buffer_delete(buffer, (struct region){.begin = - (struct location){ - .line = add->begin.row, - .col = add->begin.col, - }, - .end = (struct location){ - .line = add->end.row, - .col = add->end.col, - }}); + pos = buffer_delete(buffer, + (struct region){ + .begin = (struct location){.line = add->begin.row, + .col = add->begin.col}, + .end = (struct location){.line = add->end.row, + .col = add->end.col}, + }); break; } @@ -888,9 +959,14 @@ struct location buffer_delete(struct buffer *buffer, struct region region) { return region.begin; } + struct location begin_bytes = + buffer_location_to_byte_coords(buffer, region.begin); + struct location end_bytes = + buffer_location_to_byte_coords(buffer, region.end); + struct text_chunk txt = - text_get_region(buffer->text, region.begin.line, region.begin.col, - region.end.line, region.end.col); + text_get_region(buffer->text, begin_bytes.line, begin_bytes.col, + end_bytes.line, end_bytes.col); undo_push_boundary(&buffer->undo, (struct undo_boundary){.save_point = false}); @@ -903,17 +979,22 @@ struct location buffer_delete(struct buffer *buffer, struct region region) { undo_push_boundary(&buffer->undo, (struct undo_boundary){.save_point = false}); - uint32_t begin_idx = - text_global_idx(buffer->text, region.begin.line, region.begin.col); - uint32_t end_idx = - text_global_idx(buffer->text, region.end.line, region.end.col); + uint64_t begin_idx = to_global_offset(buffer, begin_bytes); + uint64_t end_idx = to_global_offset(buffer, end_bytes); - text_delete(buffer->text, region.begin.line, region.begin.col, - region.end.line, region.end.col); + text_delete(buffer->text, begin_bytes.line, 
begin_bytes.col, end_bytes.line, + end_bytes.col); buffer->modified = true; VEC_FOR_EACH(&buffer->hooks->delete_hooks, struct delete_hook * h) { - h->callback(buffer, region, begin_idx, end_idx, h->userdata); + h->callback(buffer, + (struct edit_location){ + .coordinates = region, + .bytes = region_new(begin_bytes, end_bytes), + .global_byte_begin = begin_idx, + .global_byte_end = end_idx, + }, + h->userdata); } return region.begin; @@ -1035,27 +1116,6 @@ struct cmdbuf { struct buffer *buffer; }; -static uint32_t visual_char_width(uint8_t *byte, uint32_t maxlen) { - if (*byte == '\t') { - return 4; - } else { - return utf8_visual_char_width(byte, maxlen); - } -} - -uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col, - uint32_t end_col) { - uint32_t start_byte = utf8_nbytes(txt, len, start_col); - uint32_t end_byte = utf8_nbytes(txt, len, end_col); - - uint32_t width = 0; - for (uint32_t bytei = start_byte; bytei < end_byte; ++bytei) { - width += visual_char_width(&txt[bytei], len - bytei); - } - - return width; -} - static void apply_properties(struct command_list *cmds, struct text_property *properties[], uint32_t nproperties) { @@ -1097,65 +1157,67 @@ void render_line(struct text_chunk *line, void *userdata) { command_list_set_show_whitespace(cmdbuf->cmds, cmdbuf->show_ws); // calculate scroll offsets - uint32_t scroll_bytes = - utf8_nbytes(line->text, line->nbytes, cmdbuf->origin.col); - uint32_t text_nbytes_scroll = - scroll_bytes > line->nbytes ? 
0 : line->nbytes - scroll_bytes; - uint8_t *text = line->text + scroll_bytes; - - uint32_t visual_col_start = 0; - uint32_t cur_visual_col = 0; - uint32_t start_byte = 0, text_nbytes = 0; struct text_property *properties[32] = {0}; uint64_t prev_properties_hash = 0; - for (uint32_t cur_byte = start_byte, coli = 0; - cur_byte < text_nbytes_scroll && cur_visual_col < cmdbuf->width && - coli < line->nchars - cmdbuf->origin.col; - ++coli) { + uint32_t tab_width = get_tab_width(cmdbuf->buffer); + + // handle scroll column offset + uint32_t coli = 0, bytei = 0; + struct utf8_codepoint_iterator iter = text_chunk_codepoint_iterator(line); + struct codepoint *codepoint; + while (coli < cmdbuf->origin.col && + (codepoint = utf8_next_codepoint(&iter)) != NULL) { + coli += visual_char_width(codepoint, tab_width); + bytei += codepoint->nbytes; + } - uint32_t bytes_remaining = text_nbytes_scroll - cur_byte; - uint32_t char_nbytes = utf8_nbytes(text + cur_byte, bytes_remaining, 1); - uint32_t char_vwidth = visual_char_width(text + cur_byte, bytes_remaining); + // coli is the visual column [0..width-1] + coli = 0; + uint32_t drawn_bytei = bytei; + uint32_t drawn_coli = coli; + while (coli < cmdbuf->width && + (codepoint = utf8_next_codepoint(&iter)) != NULL) { // calculate character properties uint32_t nproperties = 0; - text_get_properties( - cmdbuf->buffer->text, - (struct location){.line = line->line, .col = coli + cmdbuf->origin.col}, - properties, 32, &nproperties); + text_get_properties(cmdbuf->buffer->text, line->line, bytei, properties, 32, + &nproperties); // if we have any new or lost props, flush text up until now, reset // and re-apply current properties uint64_t new_properties_hash = properties_hash(properties, nproperties); if (new_properties_hash != prev_properties_hash) { - command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line, - text + start_byte, cur_byte - start_byte); + command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line, + line->text + 
drawn_bytei, bytei - drawn_bytei); command_list_reset_color(cmdbuf->cmds); - visual_col_start = cur_visual_col; - start_byte = cur_byte; + drawn_coli = coli; + drawn_bytei = bytei; // apply new properties apply_properties(cmdbuf->cmds, properties, nproperties); } prev_properties_hash = new_properties_hash; - cur_byte += char_nbytes; - text_nbytes += char_nbytes; - cur_visual_col += char_vwidth; + bytei += codepoint->nbytes; + coli += visual_char_width(codepoint, tab_width); } // flush remaining - command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line, - text + start_byte, text_nbytes - start_byte); + command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line, + line->text + drawn_bytei, bytei - drawn_bytei); + + drawn_coli = coli; + drawn_bytei = bytei; command_list_reset_color(cmdbuf->cmds); command_list_set_show_whitespace(cmdbuf->cmds, false); - if (cur_visual_col < cmdbuf->width) { - command_list_draw_repeated(cmdbuf->cmds, cur_visual_col, visual_line, ' ', - cmdbuf->width - cur_visual_col); + // TODO: considering the whole screen is cleared, is this really needed? 
+ if (drawn_coli < cmdbuf->width) { + command_list_draw_repeated(cmdbuf->cmds, drawn_coli, visual_line, ' ', + cmdbuf->width - drawn_coli); } } @@ -1200,19 +1262,19 @@ void buffer_render(struct buffer *buffer, struct buffer_render_params *params) { void buffer_add_text_property(struct buffer *buffer, struct location start, struct location end, struct text_property property) { - text_add_property( - buffer->text, (struct location){.line = start.line, .col = start.col}, - (struct location){.line = end.line, .col = end.col}, property); + struct location bytestart = buffer_location_to_byte_coords(buffer, start); + struct location byteend = buffer_location_to_byte_coords(buffer, end); + text_add_property(buffer->text, bytestart.line, bytestart.col, byteend.line, + byteend.col, property); } void buffer_get_text_properties(struct buffer *buffer, struct location location, struct text_property **properties, uint32_t max_nproperties, uint32_t *nproperties) { - text_get_properties( - buffer->text, - (struct location){.line = location.line, .col = location.col}, properties, - max_nproperties, nproperties); + struct location bytecoords = buffer_location_to_byte_coords(buffer, location); + text_get_properties(buffer->text, bytecoords.line, bytecoords.col, properties, + max_nproperties, nproperties); } void buffer_clear_text_properties(struct buffer *buffer) { @@ -1244,9 +1306,12 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line, (struct location){.line = end + 1, .col = 0}); struct s8 *lines = (struct s8 *)malloc(sizeof(struct s8) * ntosort); - struct text_chunk txt = - text_get_region(buffer->text, region.begin.line, region.begin.col, - region.end.line, region.end.col); + + struct location bytebeg = + buffer_location_to_byte_coords(buffer, region.begin); + struct location byteend = buffer_location_to_byte_coords(buffer, region.end); + struct text_chunk txt = text_get_region( + buffer->text, bytebeg.line, bytebeg.col, byteend.line, byteend.col); uint32_t 
line_start = 0; uint32_t curr_line = 0; @@ -1278,3 +1343,41 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line, free(txt.text); } } + +struct location buffer_location_to_byte_coords(struct buffer *buffer, + struct location coords) { + struct utf8_codepoint_iterator iter = + text_line_codepoint_iterator(buffer->text, coords.line); + uint32_t byteoffset = 0, col = 0, tab_width = get_tab_width(buffer); + struct codepoint *codepoint; + + /* Let this walk up to (and including the target column) to + * make sure we account for zero-width characters when calculating the + * byte offset. + */ + while (col <= coords.col && + (codepoint = utf8_next_codepoint(&iter)) != NULL) { + byteoffset += codepoint->nbytes; + col += visual_char_width(codepoint, tab_width); + } + + /* Remove the byte-width of the last char again since it gives us the + * position right before it while still taking zero-width codepoints + * into account. + */ + return (struct location){.line = coords.line, + .col = byteoffset - + (codepoint != NULL ? codepoint->nbytes : 0)}; +} + +struct match_result +buffer_find_prev_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)) { + return find_prev_in_line(buffer, start, predicate); +} + +struct match_result +buffer_find_next_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)) { + return find_next_in_line(buffer, start, predicate); +} diff --git a/src/dged/buffer.h b/src/dged/buffer.h index cd5bd95..c9fe2ca 100644 --- a/src/dged/buffer.h +++ b/src/dged/buffer.h @@ -295,13 +295,13 @@ struct location buffer_end(struct buffer *buffer); uint32_t buffer_num_lines(struct buffer *buffer); /** - * Get the number of chars in a given line in buffer. + * Get the line length in number of column positions. * * @param [in] buffer The buffer to use. - * @param [in] line The line to get number of chars for. - * @returns The number of chars in @ref line. 
+ * @param [in] line The line to get number of columns for. + * @returns The number of column positions in the current line. */ -uint32_t buffer_num_chars(struct buffer *buffer, uint32_t line); +uint32_t buffer_line_length(struct buffer *buffer, uint32_t line); /** * Insert a newline in the buffer. @@ -555,6 +555,13 @@ uint32_t buffer_add_reload_hook(struct buffer *buffer, reload_hook_cb callback, void buffer_remove_reload_hook(struct buffer *buffer, uint32_t hook_id, remove_hook_cb callback); +struct edit_location { + struct region coordinates; + struct region bytes; + uint64_t global_byte_begin; + uint64_t global_byte_end; +}; + /** * Buffer insert hook callback function. * @@ -565,9 +572,8 @@ void buffer_remove_reload_hook(struct buffer *buffer, uint32_t hook_id, * @param end_idx The global byte offset to the end of where text was inserted. * @param userdata The userdata as sent in to @ref buffer_add_insert_hook. */ -typedef void (*insert_hook_cb)(struct buffer *buffer, struct region inserted, - uint32_t begin_idx, uint32_t end_idx, - void *userdata); +typedef void (*insert_hook_cb)(struct buffer *buffer, + struct edit_location inserted, void *userdata); /** * Add an insert hook, called when text is inserted into the @p buffer. @@ -600,9 +606,8 @@ void buffer_remove_insert_hook(struct buffer *buffer, uint32_t hook_id, * @param end_idx The global byte offset to the end of the removed text. * @param userdata The userdata as sent in to @ref buffer_add_delete_hook. */ -typedef void (*delete_hook_cb)(struct buffer *buffer, struct region removed, - uint32_t begin_idx, uint32_t end_idx, - void *userdata); +typedef void (*delete_hook_cb)(struct buffer *buffer, + struct edit_location removed, void *userdata); /** * Add a delete hook, called when text is removed from the @p buffer. 
@@ -724,10 +729,6 @@ void buffer_update(struct buffer *buffer, struct buffer_update_params *params); */ void buffer_render(struct buffer *buffer, struct buffer_render_params *params); -// TODO: move this to where it makes sense -uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col, - uint32_t end_col); - /** * Sort lines in a buffer alphabetically. * @@ -738,4 +739,19 @@ uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col, void buffer_sort_lines(struct buffer *buffer, uint32_t start_line, uint32_t end_line); +struct location buffer_location_to_byte_coords(struct buffer *buffer, + struct location coords); + +struct match_result { + struct location at; + bool found; +}; + +struct match_result +buffer_find_prev_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)); +struct match_result +buffer_find_next_in_line(struct buffer *buffer, struct location start, + bool (*predicate)(const struct codepoint *c)); + #endif diff --git a/src/dged/buffer_view.c b/src/dged/buffer_view.c index 4e67d78..f3dd2b9 100644 --- a/src/dged/buffer_view.c +++ b/src/dged/buffer_view.c @@ -128,7 +128,7 @@ void buffer_view_backward_nlines(struct buffer_view *view, uint32_t nlines) { } void buffer_view_goto_end_of_line(struct buffer_view *view) { - view->dot.col = buffer_num_chars(view->buffer, view->dot.line); + view->dot.col = buffer_line_length(view->buffer, view->dot.line); } void buffer_view_goto_beginning_of_line(struct buffer_view *view) { @@ -224,15 +224,22 @@ void buffer_view_delete_word(struct buffer_view *view) { } void buffer_view_kill_line(struct buffer_view *view) { - uint32_t nchars = - buffer_num_chars(view->buffer, view->dot.line) - view->dot.col; - if (nchars == 0) { - nchars = 1; + uint32_t ncols = + buffer_line_length(view->buffer, view->dot.line) - view->dot.col; + + uint32_t line = view->dot.line; + uint32_t col = view->dot.col + ncols; + + // kill the newline if we are at the end 
of the line + if (ncols == 0) { + struct location loc = buffer_next_char(view->buffer, view->dot); + line = loc.line; + col = loc.col; } struct region reg = region_new(view->dot, (struct location){ - .line = view->dot.line, - .col = view->dot.col + nchars, + .line = line, + .col = col, }); buffer_cut(view->buffer, reg); @@ -241,7 +248,8 @@ void buffer_view_kill_line(struct buffer_view *view) { void buffer_view_sort_lines(struct buffer_view *view) { struct region reg = region_new(view->dot, view->mark); if (view->mark_set && region_has_size(reg)) { - if (reg.end.line > 0 && buffer_num_chars(view->buffer, reg.end.line) == 0) { + if (reg.end.line > 0 && + buffer_line_length(view->buffer, reg.end.line) == 0) { reg.end.line -= 1; } @@ -271,21 +279,7 @@ struct location buffer_view_dot_to_relative(struct buffer_view *view) { } struct location buffer_view_dot_to_visual(struct buffer_view *view) { - // calculate visual column index for dot column - struct text_chunk c = buffer_line(view->buffer, view->dot.line); - uint32_t width = visual_string_width(c.text, c.nbytes, 0, view->dot.col); - if (view->scroll.col > 0) { - width -= visual_string_width(c.text, c.nbytes, 0, view->scroll.col); - } - - struct location l = buffer_view_dot_to_relative(view); - l.col = width + view->fringe_width; - - if (c.allocated) { - free(c.text); - } - - return l; + return buffer_view_dot_to_relative(view); } void buffer_view_undo(struct buffer_view *view) { diff --git a/src/dged/display.c b/src/dged/display.c index bc604f0..ea3f459 100644 --- a/src/dged/display.c +++ b/src/dged/display.c @@ -60,7 +60,7 @@ struct push_fmt_cmd { struct repeat_cmd { uint32_t col; uint32_t row; - int32_t c; + uint32_t c; uint32_t nrepeat; }; @@ -135,21 +135,7 @@ void display_destroy(struct display *display) { uint32_t display_width(struct display *display) { return display->width; } uint32_t display_height(struct display *display) { return display->height; } -void putch(uint8_t c) { - // TODO: move this to buffer 
rendering - if (c < ' ') { - fprintf(stdout, "^%c", c + 0x40); - } else if (c == 0x7f) { - fprintf(stdout, "^?"); - } else if (utf8_byte_is_unicode_start(c) || - utf8_byte_is_unicode_continuation(c)) { - putc(c, stdout); - } else if (c >= ' ' && c < 0x7f) { - putc(c, stdout); - } else { - fprintf(stdout, "|0x%02x|", c); - } -} +void putch(uint8_t c) { putc(c, stdout); } static void apply_fmt(uint8_t *fmt_stack, uint32_t fmt_stack_len) { if (fmt_stack == NULL || fmt_stack_len == 0) { @@ -164,6 +150,7 @@ static void apply_fmt(uint8_t *fmt_stack, uint32_t fmt_stack_len) { void putch_ws(uint8_t c, bool show_whitespace, uint8_t *fmt_stack, uint32_t fmt_stack_len) { + // TODO: tab width needs to be sent here if (show_whitespace && c == '\t') { fputs("\x1b[90m → \x1b[39m", stdout); apply_fmt(fmt_stack, fmt_stack_len); @@ -295,7 +282,7 @@ void command_list_draw_text_copy(struct command_list *list, uint32_t col, } void command_list_draw_repeated(struct command_list *list, uint32_t col, - uint32_t row, int32_t c, uint32_t nrepeat) { + uint32_t row, uint32_t c, uint32_t nrepeat) { struct repeat_cmd *cmd = add_command(list, RenderCommand_Repeat)->repeat; cmd->col = col; cmd->row = row; @@ -401,10 +388,14 @@ void display_render(struct display *display, display_move_cursor(display, repeat_cmd->row + cl->yoffset, repeat_cmd->col + cl->xoffset); apply_fmt(fmt_stack, fmt_stack_len); - uint32_t nbytes = utf8_nbytes((uint8_t *)&repeat_cmd->c, 4, 1); - for (uint32_t i = 0; i < repeat_cmd->nrepeat; ++i) { - putbytes((uint8_t *)&repeat_cmd->c, nbytes, show_whitespace_state, - fmt_stack, fmt_stack_len); + struct utf8_codepoint_iterator iter = + create_utf8_codepoint_iterator((uint8_t *)&repeat_cmd->c, 4, 0); + struct codepoint *codepoint = utf8_next_codepoint(&iter); + if (codepoint != NULL) { + for (uint32_t i = 0; i < repeat_cmd->nrepeat; ++i) { + putbytes((uint8_t *)&repeat_cmd->c, codepoint->nbytes, + show_whitespace_state, fmt_stack, fmt_stack_len); + } } break; } diff --git 
a/src/dged/display.h b/src/dged/display.h index 0fda30d..f9c7ef8 100644 --- a/src/dged/display.h +++ b/src/dged/display.h @@ -238,7 +238,7 @@ void command_list_draw_text_copy(struct command_list *list, uint32_t col, * @param nrepeat Number of times to repeat byte. */ void command_list_draw_repeated(struct command_list *list, uint32_t col, - uint32_t row, int32_t c, uint32_t nrepeat); + uint32_t row, uint32_t c, uint32_t nrepeat); void command_list_draw_command_list(struct command_list *list, struct command_list *to_draw); diff --git a/src/dged/keyboard.c b/src/dged/keyboard.c index 26eb308..04565e0 100644 --- a/src/dged/keyboard.c +++ b/src/dged/keyboard.c @@ -78,20 +78,24 @@ void parse_keys(uint8_t *bytes, uint32_t nbytes, struct key *out_keys, } else if (utf8_byte_is_unicode_continuation(b)) { // do nothing for these } else { // ascii char or unicode start byte (self-inserting) - uint32_t nb = utf8_byte_is_unicode_start(b) - ? utf8_nbytes(bytes + bytei, nbytes - bytei, 1) - : 1; - - // "compress" number of keys if previous key was also a - // "simple" key - if (prev_kp != NULL && prev_kp->mod == None) { - prev_kp->end += nb; - } else { - kp->mod = None; - kp->key = b; - kp->start = bytei; - kp->end = bytei + nb; - ++nkps; + // TODO: do this better + struct utf8_codepoint_iterator iter = + create_utf8_codepoint_iterator(bytes + bytei, nbytes - bytei, 0); + struct codepoint *codepoint = utf8_next_codepoint(&iter); + if (codepoint != NULL) { + uint32_t nb = codepoint->nbytes; + + // "compress" number of keys if previous key was also a + // "simple" key + if (prev_kp != NULL && prev_kp->mod == None) { + prev_kp->end += nb; + } else { + kp->mod = None; + kp->key = b; + kp->start = bytei; + kp->end = bytei + nb; + ++nkps; + } } } } diff --git a/src/dged/syntax.c b/src/dged/syntax.c index 8d0fd1a..569dc70 100644 --- a/src/dged/syntax.c +++ b/src/dged/syntax.c @@ -342,7 +342,8 @@ static void update_parser(struct buffer *buffer, void *userdata, : origin.line + height; 
ts_query_cursor_set_point_range( cursor, (TSPoint){.row = origin.line, .column = origin.col}, - (TSPoint){.row = end_line, .column = buffer_num_chars(buffer, end_line)}); + (TSPoint){.row = end_line, + .column = buffer_line_length(buffer, end_line)}); ts_query_cursor_exec(cursor, h->query, ts_tree_root_node(h->tree)); TSQueryMatch match; @@ -406,47 +407,39 @@ static void update_parser(struct buffer *buffer, void *userdata, continue; } - buffer_add_text_property( - buffer, - (struct location){.line = start.row, - .col = text_byteindex_to_col( - buffer->text, start.row, start.column)}, - (struct location){.line = end.row, - .col = text_byteindex_to_col(buffer->text, end.row, - end.column - 1)}, - (struct text_property){ - .type = TextProperty_Colors, - .colors = - (struct text_property_colors){ - .set_fg = true, - .fg = color, - }, - }); + text_add_property(buffer->text, start.row, start.column, end.row, + end.column > 0 ? end.column - 1 : 0, + (struct text_property){ + .type = TextProperty_Colors, + .colors = + (struct text_property_colors){ + .set_fg = true, + .fg = color, + }, + }); } } ts_query_cursor_delete(cursor); } -static void text_removed(struct buffer *buffer, struct region removed, - uint32_t begin_idx, uint32_t end_idx, void *userdata) { +static void text_removed(struct buffer *buffer, struct edit_location removed, + void *userdata) { struct highlight *h = (struct highlight *)userdata; - TSPoint begin = {.row = removed.begin.line, - .column = text_col_to_byteindex( - buffer->text, removed.begin.line, removed.begin.col)}; + TSPoint begin = {.row = removed.bytes.begin.line, + .column = removed.bytes.begin.col}; TSPoint new_end = begin; - TSPoint old_end = {.row = removed.end.line, - .column = text_col_to_byteindex( - buffer->text, removed.end.line, removed.end.col)}; + TSPoint old_end = {.row = removed.bytes.end.line, + .column = removed.bytes.end.col}; TSInputEdit edit = { .start_point = begin, .old_end_point = old_end, .new_end_point = new_end, - 
.start_byte = begin_idx, - .old_end_byte = end_idx, - .new_end_byte = begin_idx, + .start_byte = removed.global_byte_begin, + .old_end_byte = removed.global_byte_end, + .new_end_byte = removed.global_byte_begin, }; ts_tree_edit(h->tree, &edit); @@ -479,27 +472,24 @@ static void buffer_reloaded(struct buffer *buffer, void *userdata) { } } -static void text_inserted(struct buffer *buffer, struct region inserted, - uint32_t begin_idx, uint32_t end_idx, +static void text_inserted(struct buffer *buffer, struct edit_location inserted, void *userdata) { struct timer *text_inserted = timer_start("syntax.txt-inserted"); struct highlight *h = (struct highlight *)userdata; - TSPoint begin = {.row = inserted.begin.line, - .column = text_col_to_byteindex( - buffer->text, inserted.begin.line, inserted.begin.col)}; + TSPoint begin = {.row = inserted.bytes.begin.line, + .column = inserted.bytes.begin.col}; TSPoint old_end = begin; - TSPoint new_end = {.row = inserted.end.line, - .column = text_col_to_byteindex( - buffer->text, inserted.end.line, inserted.end.col)}; + TSPoint new_end = {.row = inserted.bytes.end.line, + .column = inserted.bytes.end.col}; TSInputEdit edit = { .start_point = begin, .old_end_point = old_end, .new_end_point = new_end, - .start_byte = begin_idx, - .old_end_byte = begin_idx, - .new_end_byte = end_idx, + .start_byte = inserted.global_byte_begin, + .old_end_byte = inserted.global_byte_begin, + .new_end_byte = inserted.global_byte_end, }; ts_tree_edit(h->tree, &edit); diff --git a/src/dged/text.c b/src/dged/text.c index 3d1078f..18ab04f 100644 --- a/src/dged/text.c +++ b/src/dged/text.c @@ -18,7 +18,6 @@ struct line { uint8_t *data; uint8_t flags; uint32_t nbytes; - uint32_t nchars; }; struct text_property_entry { @@ -54,11 +53,9 @@ void text_destroy(struct text *text) { text->lines[li].data = NULL; text->lines[li].flags = 0; text->lines[li].nbytes = 0; - text->lines[li].nchars = 0; } free(text->lines); - free(text); } @@ -68,68 +65,25 @@ void 
text_clear(struct text *text) { text->lines[li].data = NULL; text->lines[li].flags = 0; text->lines[li].nbytes = 0; - text->lines[li].nchars = 0; } text->nlines = 0; text_clear_properties(text); } -// given `char_idx` as a character index, return the byte index -uint32_t charidx_to_byteidx(struct line *line, uint32_t char_idx) { - if (line->nchars == 0) { - return 0; - } - - if (char_idx > line->nchars) { - return line->nbytes - 1; - } - - return utf8_nbytes(line->data, line->nbytes, char_idx); -} - -uint32_t text_col_to_byteindex(struct text *text, uint32_t line, uint32_t col) { - return charidx_to_byteidx(&text->lines[line], col); -} - -// given `byte_idx` as a byte index, return the character index -uint32_t byteidx_to_charidx(struct line *line, uint32_t byte_idx) { - if (byte_idx > line->nbytes) { - return line->nchars; +struct utf8_codepoint_iterator +text_line_codepoint_iterator(const struct text *text, uint32_t lineidx) { + if (lineidx >= text_num_lines(text)) { + return create_utf8_codepoint_iterator(NULL, 0, 0); } - return utf8_nchars(line->data, byte_idx); + return create_utf8_codepoint_iterator(text->lines[lineidx].data, + text->lines[lineidx].nbytes, 0); } -uint32_t text_byteindex_to_col(struct text *text, uint32_t line, - uint32_t byteindex) { - return byteidx_to_charidx(&text->lines[line], byteindex); -} - -uint32_t text_global_idx(struct text *text, uint32_t line, uint32_t col) { - uint32_t byteoff = 0; - uint32_t nlines = text_num_lines(text); - - if (nlines == 0) { - return 0; - } - - for (uint32_t l = 0; l < line && l < nlines; ++l) { - // +1 for newline - byteoff += text_line_size(text, l) + 1; - } - - uint32_t l = line < nlines ? line : nlines - 1; - uint32_t nchars = text_line_length(text, l); - uint32_t c = col < nchars ? 
col : nchars; - byteoff += text_col_to_byteindex(text, l, c); - - if (col > nchars) { - // account for newline - ++byteoff; - } - - return byteoff; +struct utf8_codepoint_iterator +text_chunk_codepoint_iterator(const struct text_chunk *chunk) { + return create_utf8_codepoint_iterator(chunk->text, chunk->nbytes, 0); } void append_empty_lines(struct text *text, uint32_t numlines) { @@ -145,17 +99,10 @@ void append_empty_lines(struct text *text, uint32_t numlines) { struct line *nline = &text->lines[text->nlines]; nline->data = NULL; nline->nbytes = 0; - nline->nchars = 0; nline->flags = 0; ++text->nlines; } - - if (text->nlines > text->capacity) { - printf("text->nlines: %d, text->capacity: %d\n", text->nlines, - text->capacity); - raise(SIGTRAP); - } } void ensure_line(struct text *text, uint32_t line) { @@ -166,8 +113,8 @@ void ensure_line(struct text *text, uint32_t line) { // It is assumed that `data` does not contain any \n, that is handled by // higher-level functions -void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data, - uint32_t len, uint32_t nchars) { +static void insert_at(struct text *text, uint32_t line, uint32_t offset, + uint8_t *data, uint32_t len) { if (len == 0) { return; @@ -178,11 +125,10 @@ void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data, struct line *l = &text->lines[line]; l->nbytes += len; - l->nchars += nchars; l->flags = LineChanged; l->data = realloc(l->data, l->nbytes); - uint32_t bytei = charidx_to_byteidx(l, col); + uint32_t bytei = offset; // move following bytes out of the way if (bytei + len < l->nbytes) { @@ -194,15 +140,7 @@ void insert_at(struct text *text, uint32_t line, uint32_t col, uint8_t *data, memcpy(l->data + bytei, data, len); } -uint32_t text_line_length(struct text *text, uint32_t lineidx) { - if (lineidx >= text_num_lines(text)) { - return 0; - } - - return text->lines[lineidx].nchars; -} - -uint32_t text_line_size(struct text *text, uint32_t lineidx) { +uint32_t 
text_line_size(const struct text *text, uint32_t lineidx) { if (lineidx >= text_num_lines(text)) { return 0; } @@ -210,20 +148,19 @@ uint32_t text_line_size(struct text *text, uint32_t lineidx) { return text->lines[lineidx].nbytes; } -uint32_t text_num_lines(struct text *text) { return text->nlines; } +uint32_t text_num_lines(const struct text *text) { return text->nlines; } + +static void split_line(struct text *text, uint32_t offset, uint32_t lineidx, + uint32_t newlineidx) { + struct line *line = &text->lines[lineidx]; + struct line *next = &text->lines[newlineidx]; -void split_line(uint32_t col, struct line *line, struct line *next) { uint8_t *data = line->data; uint32_t nbytes = line->nbytes; - uint32_t nchars = line->nchars; - - uint32_t chari = col; - uint32_t bytei = charidx_to_byteidx(line, chari); + uint32_t bytei = offset; line->nbytes = bytei; - line->nchars = chari; next->nbytes = nbytes - bytei; - next->nchars = nchars - chari; line->flags = next->flags = line->flags; next->data = NULL; @@ -260,7 +197,7 @@ void shift_lines(struct text *text, uint32_t start, int32_t direction) { memmove(dest, src, nlines * sizeof(struct line)); } -void new_line_at(struct text *text, uint32_t line, uint32_t col) { +void new_line_at(struct text *text, uint32_t line, uint32_t offset) { ensure_line(text, line); uint32_t newline = line + 1; @@ -274,7 +211,7 @@ void new_line_at(struct text *text, uint32_t line, uint32_t col) { } // split line if needed - split_line(col, &text->lines[line], &text->lines[newline]); + split_line(text, offset, line, newline); } void delete_line(struct text *text, uint32_t line) { @@ -294,29 +231,25 @@ void delete_line(struct text *text, uint32_t line) { --text->nlines; text->lines[text->nlines].data = NULL; text->lines[text->nlines].nbytes = 0; - text->lines[text->nlines].nchars = 0; } -void text_insert_at_inner(struct text *text, uint32_t line, uint32_t col, - uint8_t *bytes, uint32_t nbytes, - uint32_t *lines_added, uint32_t *cols_added) { 
+static void text_insert_at_inner(struct text *text, uint32_t line, + uint32_t offset, uint8_t *bytes, + uint32_t nbytes, uint32_t *lines_added) { uint32_t linelen = 0, start_line = line; - *cols_added = 0; for (uint32_t bytei = 0; bytei < nbytes; ++bytei) { uint8_t byte = bytes[bytei]; if (byte == '\n') { uint8_t *line_data = bytes + (bytei - linelen); - uint32_t nchars = utf8_nchars(line_data, linelen); + insert_at(text, line, offset, line_data, linelen); - insert_at(text, line, col, line_data, linelen, nchars); - - col += nchars; - new_line_at(text, line, col); + offset += linelen; + new_line_at(text, line, offset); ++line; linelen = 0; - col = 0; + offset = 0; } else { ++linelen; } @@ -325,30 +258,26 @@ void text_insert_at_inner(struct text *text, uint32_t line, uint32_t col, // handle remaining if (linelen > 0) { uint8_t *line_data = bytes + (nbytes - linelen); - uint32_t nchars = utf8_nchars(line_data, linelen); - insert_at(text, line, col, line_data, linelen, nchars); - *cols_added = nchars; + insert_at(text, line, offset, line_data, linelen); } *lines_added = line - start_line; } void text_append(struct text *text, uint8_t *bytes, uint32_t nbytes, - uint32_t *lines_added, uint32_t *cols_added) { + uint32_t *lines_added) { uint32_t line = text->nlines > 0 ? 
text->nlines - 1 : 0; - uint32_t col = text_line_length(text, line); - - text_insert_at_inner(text, line, col, bytes, nbytes, lines_added, cols_added); + uint32_t offset = text_line_size(text, line); + text_insert_at_inner(text, line, offset, bytes, nbytes, lines_added); } -void text_insert_at(struct text *text, uint32_t line, uint32_t col, - uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added, - uint32_t *cols_added) { - text_insert_at_inner(text, line, col, bytes, nbytes, lines_added, cols_added); +void text_insert_at(struct text *text, uint32_t line, uint32_t offset, + uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added) { + text_insert_at_inner(text, line, offset, bytes, nbytes, lines_added); } -void text_delete(struct text *text, uint32_t start_line, uint32_t start_col, - uint32_t end_line, uint32_t end_col) { +void text_delete(struct text *text, uint32_t start_line, uint32_t start_offset, + uint32_t end_line, uint32_t end_offset) { if (text->nlines == 0) { return; @@ -362,45 +291,44 @@ void text_delete(struct text *text, uint32_t start_line, uint32_t start_col, if (end_line > maxline) { end_line = maxline; - end_col = text->lines[end_line].nchars; + end_offset = text_line_size(text, end_line); } struct line *firstline = &text->lines[start_line]; struct line *lastline = &text->lines[end_line]; // clamp column - if (start_col > firstline->nchars) { - start_col = firstline->nchars > 0 ? firstline->nchars - 1 : 0; + uint32_t firstline_len = text_line_size(text, start_line); + if (start_offset > firstline_len) { + start_offset = firstline_len > 0 ? 
firstline_len - 1 : 0; } // handle deletion of newlines - if (end_col > lastline->nchars) { + uint32_t lastline_len = text_line_size(text, end_line); + if (end_offset > lastline_len) { if (end_line + 1 < text->nlines) { - end_col = 0; + end_offset = 0; ++end_line; lastline = &text->lines[end_line]; } else { - end_col = lastline->nchars; + end_offset = lastline_len; } } - uint32_t bytei = utf8_nbytes(lastline->data, lastline->nbytes, end_col); + uint32_t srcbytei = end_offset; + uint32_t dstbytei = start_offset; + uint32_t ncopy = lastline->nbytes - srcbytei; if (lastline == firstline) { // in this case we can "overwrite" - uint32_t dstbytei = - utf8_nbytes(firstline->data, firstline->nbytes, start_col); - memmove(firstline->data + dstbytei, lastline->data + bytei, - lastline->nbytes - bytei); + memmove(firstline->data + dstbytei, lastline->data + srcbytei, ncopy); } else { // otherwise we actually have to copy from the last line - insert_at(text, start_line, start_col, lastline->data + bytei, - lastline->nbytes - bytei, lastline->nchars - end_col); + insert_at(text, start_line, start_offset, lastline->data + srcbytei, ncopy); } - firstline->nchars = start_col + (lastline->nchars - end_col); - firstline->nbytes = - utf8_nbytes(firstline->data, firstline->nbytes, start_col) + - (lastline->nbytes - bytei); + // new byte count is whatever we had before (left of dstbytei) + // plus what we copied + firstline->nbytes = dstbytei + ncopy; // delete full lines, backwards to not shift old, crappy data upwards for (uint32_t linei = end_line >= text->nlines ? 
end_line - 1 : end_line; @@ -429,7 +357,6 @@ void text_for_each_line(struct text *text, uint32_t line, uint32_t nlines, .allocated = false, .text = src_line->data, .nbytes = src_line->nbytes, - .nchars = src_line->nchars, .line = li, }; callback(&line, userdata); @@ -441,8 +368,8 @@ struct text_chunk text_get_line(struct text *text, uint32_t line) { return (struct text_chunk){ .text = src_line->data, .nbytes = src_line->nbytes, - .nchars = src_line->nchars, .line = line, + .allocated = false, }; } @@ -453,33 +380,34 @@ struct copy_cmd { }; struct text_chunk text_get_region(struct text *text, uint32_t start_line, - uint32_t start_col, uint32_t end_line, - uint32_t end_col) { - if (start_line == end_line && start_col == end_col) { + uint32_t start_offset, uint32_t end_line, + uint32_t end_offset) { + if (start_line == end_line && start_offset == end_offset) { return (struct text_chunk){0}; } struct line *first_line = &text->lines[start_line]; struct line *last_line = &text->lines[end_line]; + uint32_t first_line_len = first_line->nbytes; + uint32_t last_line_len = last_line->nbytes; - if (start_col > first_line->nchars) { + if (start_offset > first_line_len) { return (struct text_chunk){0}; } // handle copying of newlines - if (end_col > last_line->nchars) { + if (end_offset > last_line_len) { ++end_line; - end_col = 0; + end_offset = 0; last_line = &text->lines[end_line]; } uint32_t nlines = end_line - start_line + 1; struct copy_cmd *copy_cmds = calloc(nlines, sizeof(struct copy_cmd)); - uint32_t total_chars = 0, total_bytes = 0; + uint32_t total_bytes = 0; for (uint32_t line = start_line; line <= end_line; ++line) { struct line *l = &text->lines[line]; - total_chars += l->nchars; total_bytes += l->nbytes; struct copy_cmd *cmd = &copy_cmds[line - start_line]; @@ -490,19 +418,14 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line, // correct first line struct copy_cmd *cmd_first = &copy_cmds[0]; - uint32_t byteoff = - utf8_nbytes(first_line->data, 
first_line->nbytes, start_col); - cmd_first->byteoffset += byteoff; - cmd_first->nbytes -= byteoff; - total_bytes -= byteoff; - total_chars -= start_col; + cmd_first->byteoffset += start_offset; + cmd_first->nbytes -= start_offset; + total_bytes -= start_offset; // correct last line struct copy_cmd *cmd_last = &copy_cmds[nlines - 1]; - uint32_t byteindex = utf8_nbytes(last_line->data, last_line->nbytes, end_col); - cmd_last->nbytes -= (last_line->nbytes - byteindex); - total_bytes -= (last_line->nbytes - byteindex); - total_chars -= (last_line->nchars - end_col); + cmd_last->nbytes -= (last_line->nbytes - end_offset); + total_bytes -= (last_line->nbytes - end_offset); uint8_t *data = (uint8_t *)malloc( total_bytes + /* nr of newline chars */ (end_line - start_line)); @@ -518,7 +441,6 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line, data[curr] = '\n'; ++curr; ++total_bytes; - ++total_chars; } } @@ -527,28 +449,25 @@ struct text_chunk text_get_region(struct text *text, uint32_t start_line, .text = data, .line = 0, .nbytes = total_bytes, - .nchars = total_chars, .allocated = true, }; } -bool text_line_contains_unicode(struct text *text, uint32_t line) { - return text->lines[line].nbytes != text->lines[line].nchars; -} - -void text_add_property(struct text *text, struct location start, - struct location end, struct text_property property) { +void text_add_property(struct text *text, uint32_t start_line, + uint32_t start_offset, uint32_t end_line, + uint32_t end_offset, struct text_property property) { struct text_property_entry entry = { - .start = start, - .end = end, + .start = (struct location){.line = start_line, .col = start_offset}, + .end = (struct location){.line = end_line, .col = end_offset}, .property = property, }; VEC_PUSH(&text->properties, entry); } -void text_get_properties(struct text *text, struct location location, +void text_get_properties(struct text *text, uint32_t line, uint32_t offset, struct text_property **properties, 
uint32_t max_nproperties, uint32_t *nproperties) { + struct location location = {.line = line, .col = offset}; uint32_t nres = 0; VEC_FOR_EACH(&text->properties, struct text_property_entry * prop) { if (location_is_between(location, prop->start, prop->end)) { diff --git a/src/dged/text.h b/src/dged/text.h index 8b49ef4..28bd325 100644 --- a/src/dged/text.h +++ b/src/dged/text.h @@ -6,9 +6,16 @@ #include #include "location.h" +#include "utf8.h" struct text; -struct render_command; + +struct text_chunk { + uint8_t *text; + uint32_t nbytes; + uint32_t line; + bool allocated; +}; struct text *text_create(uint32_t initial_capacity); void text_destroy(struct text *text); @@ -18,31 +25,21 @@ void text_destroy(struct text *text); */ void text_clear(struct text *text); -void text_insert_at(struct text *text, uint32_t line, uint32_t col, - uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added, - uint32_t *cols_added); +void text_insert_at(struct text *text, uint32_t line, uint32_t offset, + uint8_t *bytes, uint32_t nbytes, uint32_t *lines_added); void text_append(struct text *text, uint8_t *bytes, uint32_t nbytes, - uint32_t *lines_added, uint32_t *cols_added); + uint32_t *lines_added); -void text_delete(struct text *text, uint32_t start_line, uint32_t start_col, - uint32_t end_line, uint32_t end_col); +void text_delete(struct text *text, uint32_t start_line, uint32_t start_offset, + uint32_t end_line, uint32_t end_offset); -uint32_t text_num_lines(struct text *text); -uint32_t text_line_length(struct text *text, uint32_t lineidx); -uint32_t text_line_size(struct text *text, uint32_t lineidx); -uint32_t text_col_to_byteindex(struct text *text, uint32_t line, uint32_t col); -uint32_t text_byteindex_to_col(struct text *text, uint32_t line, - uint32_t byteindex); -uint32_t text_global_idx(struct text *text, uint32_t line, uint32_t col); - -struct text_chunk { - uint8_t *text; - uint32_t nbytes; - uint32_t nchars; - uint32_t line; - bool allocated; -}; +uint32_t 
text_num_lines(const struct text *text); +uint32_t text_line_size(const struct text *text, uint32_t lineidx); +struct utf8_codepoint_iterator +text_line_codepoint_iterator(const struct text *text, uint32_t lineidx); +struct utf8_codepoint_iterator +text_chunk_codepoint_iterator(const struct text_chunk *chunk); typedef void (*chunk_cb)(struct text_chunk *chunk, void *userdata); void text_for_each_line(struct text *text, uint32_t line, uint32_t nlines, @@ -52,10 +49,8 @@ void text_for_each_chunk(struct text *text, chunk_cb callback, void *userdata); struct text_chunk text_get_line(struct text *text, uint32_t line); struct text_chunk text_get_region(struct text *text, uint32_t start_line, - uint32_t start_col, uint32_t end_line, - uint32_t end_col); - -bool text_line_contains_unicode(struct text *text, uint32_t line); + uint32_t start_offset, uint32_t end_line, + uint32_t end_offset); enum text_property_type { TextProperty_Colors, @@ -77,10 +72,11 @@ struct text_property { }; }; -void text_add_property(struct text *text, struct location start, - struct location end, struct text_property property); +void text_add_property(struct text *text, uint32_t start_line, + uint32_t start_offset, uint32_t end_line, + uint32_t end_offset, struct text_property property); -void text_get_properties(struct text *text, struct location location, +void text_get_properties(struct text *text, uint32_t line, uint32_t offset, struct text_property **properties, uint32_t max_nproperties, uint32_t *nproperties); diff --git a/src/dged/utf8.c b/src/dged/utf8.c index 52de2da..ede4fb1 100644 --- a/src/dged/utf8.c +++ b/src/dged/utf8.c @@ -1,5 +1,6 @@ #include "utf8.h" +#include #include #include @@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) { bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80) != 0x0; } bool utf8_byte_is_ascii(uint8_t byte) { return !utf8_byte_is_unicode(byte); } -uint32_t utf8_nbytes_in_char(uint8_t byte) { - // length of char is the number 
of leading ones - // flip it and count number of leading zeros - uint8_t invb = ~byte; - return __builtin_clz((uint32_t)invb) - 24; +enum utf8_state { + Utf8_Accept = 0, + Utf8_Reject = 1, +}; + +// clang-format off +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; +// clang-format on + +/* + * emoji decoding algorithm from + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ +static enum utf8_state decode(enum utf8_state *state, uint32_t *codep, + uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != Utf8_Accept) ? 
(byte & 0x3fu) | (*codep << 6) + : (0xff >> type) & (byte); + + *state = utf8d[256 + *state * 16 + type]; + return *state; +} + +static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) { + uint32_t codepoint = 0; + enum utf8_state state = Utf8_Accept; + uint32_t bi = 0; + while (bi < nbytes) { + enum utf8_state res = decode(&state, &codepoint, bytes[bi]); + ++bi; + + if (res == Utf8_Accept || res == Utf8_Reject) { + break; + } + } + + if (state == Utf8_Reject) { + codepoint = 0xfffd; + } + + return (struct codepoint){.codepoint = codepoint, .nbytes = bi}; } -// TODO: grapheme clusters, this returns the number of unicode code points +struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) { + if (iter->offset >= iter->nbytes) { + return NULL; + } + + iter->current = next_utf8_codepoint(iter->data + iter->offset, + iter->nbytes - iter->offset); + iter->offset += iter->current.nbytes; + return &iter->current; +} + +struct utf8_codepoint_iterator +create_utf8_codepoint_iterator(uint8_t *data, uint64_t len, + uint64_t initial_offset) { + return (struct utf8_codepoint_iterator){ + .data = data, + .nbytes = len, + .offset = initial_offset, + }; +} + +/* TODO: grapheme clusters and other classification, this + * returns the number of unicode code points + */ uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) { + uint32_t bi = 0; uint32_t nchars = 0; - uint32_t expected = 0; - for (uint32_t bi = 0; bi < nbytes; ++bi) { - uint8_t byte = bytes[bi]; - if (utf8_byte_is_unicode(byte)) { - if (utf8_byte_is_unicode_start(byte)) { - expected = utf8_nbytes_in_char(byte) - 1; - } else { // continuation byte - --expected; - if (expected == 0) { - ++nchars; - } - } - } else { // ascii - ++nchars; - } + while (bi < nbytes) { + struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi); + ++nchars; + bi += codepoint.nbytes; } + return nchars; } -// TODO: grapheme clusters, this uses the number of unicode code points +/* TODO: 
grapheme clusters and other classification, this + * returns the number of unicode code points + */ uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) { - uint32_t bi = 0; uint32_t chars = 0; uint32_t expected = 0; while (chars < nchars && bi < nbytes) { - uint8_t byte = bytes[bi]; - if (utf8_byte_is_unicode(byte)) { - if (utf8_byte_is_unicode_start(byte)) { - expected = utf8_nbytes_in_char(byte) - 1; - } else { // continuation char - --expected; - if (expected == 0) { - ++chars; - } - } - } else { // ascii - ++chars; - } - - ++bi; + struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi); + bi += codepoint.nbytes; + ++chars; } + // TODO: reject invalid? return bi; } -uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) { - if (utf8_byte_is_unicode_start(*bytes)) { - wchar_t wc; - size_t nbytes = 0; - if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) { - size_t w = wcwidth(wc); - return w > 0 ? w : 2; - } else { - return 1; - } - } else if (utf8_byte_is_unicode_continuation(*bytes)) { - return 0; +uint32_t unicode_visual_char_width(const struct codepoint *codepoint) { + if (codepoint->nbytes > 0) { + // TODO: use unicode classification instead + size_t w = wcwidth(codepoint->codepoint); + return w >= 0 ? w : 2; } else { - return 1; + return 0; } } diff --git a/src/dged/utf8.h b/src/dged/utf8.h index 04aa242..22ce22d 100644 --- a/src/dged/utf8.h +++ b/src/dged/utf8.h @@ -1,19 +1,37 @@ +#ifndef _UTF8_H +#define _UTF8_H + #include #include +struct codepoint { + uint32_t codepoint; + uint32_t nbytes; +}; + +struct utf8_codepoint_iterator { + uint8_t *data; + uint64_t nbytes; + uint64_t offset; + struct codepoint current; +}; + +struct utf8_codepoint_iterator +create_utf8_codepoint_iterator(uint8_t *data, uint64_t len, + uint64_t initial_offset); +struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter); + /*! 
* \brief Return the number of chars the utf-8 sequence pointed at by `bytes` of * length `nbytes`, represents */ uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes); -/* Return the number of bytes used to make up the next `nchars` characters */ -uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars); +uint32_t unicode_visual_char_width(const struct codepoint *codepoint); -/* true if `byte` is a unicode byte sequence start byte */ bool utf8_byte_is_unicode_start(uint8_t byte); bool utf8_byte_is_unicode_continuation(uint8_t byte); -bool utf8_byte_is_ascii(uint8_t byte); bool utf8_byte_is_unicode(uint8_t byte); +bool utf8_byte_is_ascii(uint8_t byte); -uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len); +#endif diff --git a/src/main/cmds.c b/src/main/cmds.c index 4da8346..18f333d 100644 --- a/src/main/cmds.c +++ b/src/main/cmds.c @@ -258,7 +258,7 @@ void buffer_to_list_line(struct buffer *buffer, void *userdata) { buffer_add_text_property( listbuf, (struct location){.line = begin.line, .col = 0}, (struct location){.line = begin.line, - .col = buffer_num_chars(listbuf, begin.line)}, + .col = buffer_line_length(listbuf, begin.line)}, (struct text_property){.type = TextProperty_Data, .userdata = buffer}); } } diff --git a/src/main/completion.c b/src/main/completion.c index 52bf6f8..4ffbc46 100644 --- a/src/main/completion.c +++ b/src/main/completion.c @@ -40,6 +40,11 @@ static struct buffer *g_target_buffer = NULL; static void hide_completion(); +static bool is_space(const struct codepoint *c) { + // TODO: utf8 whitespace and other whitespace + return c->codepoint == ' '; +} + static uint32_t complete_path(struct completion_context ctx, void *userdata); static struct completion_provider g_path_provider = { .name = "path", @@ -214,32 +219,30 @@ static void update_completions(struct buffer *buffer, } } -static void on_buffer_delete(struct buffer *buffer, struct region deleted, - uint32_t start_idx, uint32_t end_idx, - void *userdata) { +static 
void on_buffer_delete(struct buffer *buffer, + struct edit_location deleted, void *userdata) { struct active_completion_ctx *ctx = (struct active_completion_ctx *)userdata; if (g_state.active) { - update_completions(buffer, ctx, deleted.begin); + update_completions(buffer, ctx, deleted.coordinates.begin); } } -static void on_buffer_insert(struct buffer *buffer, struct region inserted, - uint32_t start_idx, uint32_t end_idx, - void *userdata) { +static void on_buffer_insert(struct buffer *buffer, + struct edit_location inserted, void *userdata) { struct active_completion_ctx *ctx = (struct active_completion_ctx *)userdata; if (!g_state.active) { uint32_t nchars = 0; switch (ctx->trigger.kind) { case CompletionTrigger_Input: - for (uint32_t line = inserted.begin.line; line <= inserted.end.line; - ++line) { - nchars += buffer_num_chars(buffer, line); + for (uint32_t line = inserted.coordinates.begin.line; + line <= inserted.coordinates.end.line; ++line) { + nchars += buffer_line_length(buffer, line); } - nchars -= - inserted.begin.col + - (buffer_num_chars(buffer, inserted.end.line) - inserted.end.col); + nchars -= inserted.coordinates.begin.col + + (buffer_line_length(buffer, inserted.coordinates.end.line) - + inserted.coordinates.end.col); ctx->trigger_current_nchars += nchars; @@ -260,16 +263,16 @@ static void on_buffer_insert(struct buffer *buffer, struct region inserted, g_state.ctx = ctx; } - update_completions(buffer, ctx, inserted.end); + update_completions(buffer, ctx, inserted.coordinates.end); } static void update_completion_buffer(struct buffer *buffer, void *userdata) { buffer_add_text_property( g_target_buffer, (struct location){.line = g_state.current_completion, .col = 0}, - (struct location){ - .line = g_state.current_completion, - .col = buffer_num_chars(g_target_buffer, g_state.current_completion)}, + (struct location){.line = g_state.current_completion, + .col = buffer_line_length(g_target_buffer, + g_state.current_completion)}, (struct 
text_property){.type = TextProperty_Colors, .colors = (struct text_property_colors){ .set_bg = false, @@ -433,26 +436,18 @@ static uint32_t complete_path(struct completion_context ctx, void *userdata) { if (ctx.buffer == minibuffer_buffer()) { txt = minibuffer_content(); } else { - txt = buffer_line(ctx.buffer, ctx.location.line); - uint32_t end_idx = text_col_to_byteindex( - ctx.buffer->text, ctx.location.line, ctx.location.col); - - for (uint32_t bytei = end_idx; bytei > 0; --bytei) { - if (txt.text[bytei] == ' ') { - start_idx = bytei + 1; - break; - } - } - - if (start_idx >= end_idx) { + struct match_result start = + buffer_find_prev_in_line(ctx.buffer, ctx.location, is_space); + if (!start.found) { + start.at = (struct location){.line = ctx.location.line, .col = 0}; return 0; } - - txt.nbytes = end_idx - start_idx; + txt = buffer_region(ctx.buffer, region_new(start.at, ctx.location)); } - char *path = calloc(txt.nbytes + 1, sizeof(uint8_t)); - memcpy(path, txt.text + start_idx, txt.nbytes); + char *path = calloc(txt.nbytes + 1, sizeof(char)); + memcpy(path, txt.text, txt.nbytes); + path[txt.nbytes] = '\0'; if (txt.allocated) { free(txt.text); @@ -562,25 +557,18 @@ static uint32_t complete_buffers(struct completion_context ctx, if (ctx.buffer == minibuffer_buffer()) { txt = minibuffer_content(); } else { - txt = buffer_line(ctx.buffer, ctx.location.line); - uint32_t end_idx = text_col_to_byteindex( - ctx.buffer->text, ctx.location.line, ctx.location.col); - for (uint32_t bytei = end_idx; bytei > 0; --bytei) { - if (txt.text[bytei] == ' ') { - start_idx = bytei + 1; - break; - } - } - - if (start_idx >= end_idx) { + struct match_result start = + buffer_find_prev_in_line(ctx.buffer, ctx.location, is_space); + if (!start.found) { + start.at = (struct location){.line = ctx.location.line, .col = 0}; return 0; } - - txt.nbytes = end_idx - start_idx; + txt = buffer_region(ctx.buffer, region_new(start.at, ctx.location)); } - char *needle = calloc(txt.nbytes + 1, 
sizeof(uint8_t)); - memcpy(needle, txt.text + start_idx, txt.nbytes); + char *needle = calloc(txt.nbytes + 1, sizeof(char)); + memcpy(needle, txt.text, txt.nbytes); + needle[txt.nbytes] = '\0'; if (txt.allocated) { free(txt.text); @@ -619,31 +607,23 @@ static uint32_t complete_commands(struct completion_context ctx, if (commands == NULL) { return 0; } - struct text_chunk txt = {0}; uint32_t start_idx = 0; if (ctx.buffer == minibuffer_buffer()) { txt = minibuffer_content(); } else { - txt = buffer_line(ctx.buffer, ctx.location.line); - uint32_t end_idx = text_col_to_byteindex( - ctx.buffer->text, ctx.location.line, ctx.location.col); - for (uint32_t bytei = end_idx; bytei > 0; --bytei) { - if (txt.text[bytei] == ' ') { - start_idx = bytei + 1; - break; - } - } - - if (start_idx >= end_idx) { + struct match_result start = + buffer_find_prev_in_line(ctx.buffer, ctx.location, is_space); + if (!start.found) { + start.at = (struct location){.line = ctx.location.line, .col = 0}; return 0; } - - txt.nbytes = end_idx - start_idx; + txt = buffer_region(ctx.buffer, region_new(start.at, ctx.location)); } - char *needle = calloc(txt.nbytes + 1, sizeof(uint8_t)); - memcpy(needle, txt.text + start_idx, txt.nbytes); + char *needle = calloc(txt.nbytes + 1, sizeof(char)); + memcpy(needle, txt.text, txt.nbytes); + needle[txt.nbytes] = '\0'; if (txt.allocated) { free(txt.text); diff --git a/sune.txt b/sune.txt new file mode 100644 index 0000000..711f7ee --- /dev/null +++ b/sune.txt @@ -0,0 +1,4 @@ +โฌ†๏ธasd +๐ŸŽ  aba +this is tab +๐Ÿ‡ซ๐Ÿ‡ฎ hej hej diff --git a/test/buffer.c b/test/buffer.c index a4b318e..a7ddcc8 100644 --- a/test/buffer.c +++ b/test/buffer.c @@ -1,11 +1,21 @@ #include #include "dged/buffer.h" +#include "dged/settings.h" #include "assert.h" #include "test.h" -void test_add() { +static uint32_t add_callback_call_count = 0; +static void add_callback(struct buffer *buffer, struct edit_location added, + void *userdata) { + (void)buffer; + (void)added; + (void)userdata; + 
++add_callback_call_count; +} + +static void test_add(void) { struct buffer b = buffer_create("test-buffer"); ASSERT(buffer_num_lines(&b) == 0, "Expected buffer to have zero lines"); @@ -16,10 +26,62 @@ void test_add() { ASSERT(loc.line == 1 && loc.col == strlen(txt), "Expected buffer to have one line with characters"); + // test callback + uint32_t hook_id = buffer_add_insert_hook(&b, add_callback, NULL); + buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)"hej", 3); + ASSERT(add_callback_call_count == 1, "Expected callback to have been called"); + + // test removing the hook + buffer_remove_insert_hook(&b, hook_id, NULL); + buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)"hej", 3); + ASSERT(add_callback_call_count == 1, + "Expected callback to not have been called after it has been removed"); + + buffer_destroy(&b); +} + +static uint32_t delete_callback_call_count = 0; +static void delete_callback(struct buffer *buffer, struct edit_location removed, + void *userdata) { + (void)buffer; + (void)removed; + (void)userdata; + ++delete_callback_call_count; +} + +static void test_delete(void) { + struct buffer b = buffer_create("test-buffer-delete"); + const char *txt = "we are adding some text\ntwo lines to be exact"; + struct location loc = buffer_add(&b, (struct location){.line = 0, .col = 0}, + (uint8_t *)txt, strlen(txt)); + + ASSERT(buffer_line_length(&b, 0) == 23, + "Expected line 1 to be 23 chars before deletion"); + buffer_delete(&b, region_new((struct location){.line = 0, .col = 0}, + (struct location){.line = 0, .col = 2})); + ASSERT(buffer_line_length(&b, 0) == 21, + "Expected line 1 to be 21 chars after deletion"); + + // delete newline + buffer_delete(&b, region_new((struct location){.line = 0, .col = 21}, + (struct location){.line = 1, .col = 0})); + ASSERT(buffer_num_lines(&b) == 1, + "Expected buffer to have one line after new line deletion"); + ASSERT(buffer_line_length(&b, 0) == 42, + "Expected single line to be sum 
of both line lengths after new line " + "deletion"); + + // test that callback works + buffer_add_delete_hook(&b, delete_callback, NULL); + buffer_delete(&b, region_new((struct location){.line = 0, .col = 0}, + (struct location){.line = 0, .col = 2})); + ASSERT(delete_callback_call_count == 1, + "Expected callback to have been called"); + buffer_destroy(&b); } -void test_word_at() { +static void test_word_at(void) { struct buffer b = buffer_create("test-word-at-buffer"); const char *txt = "word1 (word2). Another"; buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)txt, @@ -40,8 +102,7 @@ void test_word_at() { "Expected word to span cols 7..12"); // test that clamping works correctly - struct region word3 = - buffer_word_at(&b, (struct location){.line = 0, .col = 100}); + struct region word3 = buffer_word_at(&b, buffer_clamp(&b, 0, 100)); ASSERT(region_has_size(word3), "expected 0,100 to be in the last word"); ASSERT(word3.begin.col == 15 && word3.end.col == 22, "Expected word to span cols 15..22"); @@ -49,7 +110,88 @@ void test_word_at() { buffer_destroy(&b); } -void run_buffer_tests() { +static void test_line_len(void) { + struct buffer b = buffer_create("test-line-length-buffer"); + const char *txt = "Look! 
Banana ๐ŸŒ"; + buffer_add(&b, (struct location){.line = 0, .col = 0}, (uint8_t *)txt, + strlen(txt)); + ASSERT(buffer_line_length(&b, 0) == 15, + "Expected banana line to be 15 chars wide"); +} + +static void test_char_movement(void) { + struct buffer b = buffer_create("test-char-movement-buffer"); + const char *txt = "abcdefgh ๐ŸŽฏjklmn\tab"; + buffer_add(&b, buffer_end(&b), (uint8_t *)txt, strlen(txt)); + struct location next = + buffer_next_char(&b, (struct location){.line = 0, .col = 0}); + ASSERT(next.col == 1, "Expected next char to be next char"); + + next = buffer_next_char(&b, (struct location){.line = 0, .col = 9}); + ASSERT(next.col == 11, + "Expected a double width char to result in a 2 column move"); + + next = buffer_next_char(&b, (struct location){.line = 0, .col = 16}); + uint64_t tab_width = settings_get("editor.tab-width")->value.number_value; + ASSERT(next.col == 16 + tab_width, + "Expected a tab to result in a move the width of a tab"); + + struct location prev = + buffer_previous_char(&b, (struct location){.line = 0, .col = 0}); + ASSERT(prev.col == 0 && prev.line == 0, + "Expected backwards motion from 0,0 not to be possible"); + + prev = buffer_previous_char(&b, (struct location){.line = 0, .col = 11}); + ASSERT(prev.col == 9, + "Expected a double width char to result in a 2 column move"); + + prev = buffer_previous_char( + &b, (struct location){.line = 0, .col = 16 + tab_width}); + ASSERT(prev.col == 16, + "Expected a tab move backwards to step over the width of a tab"); +} + +static void test_word_movement(void) { + struct buffer b = buffer_create("test-word-movement-buffer"); + + const char *txt = " word1, word2 \"word3\" word4"; + buffer_add(&b, buffer_end(&b), (uint8_t *)txt, strlen(txt)); + struct location next = + buffer_next_word(&b, (struct location){.line = 0, .col = 0}); + ASSERT(next.col == 1, "Expected next word to start at col 1"); + + next = buffer_next_word(&b, (struct location){.line = 0, .col = 1}); + ASSERT(next.col == 8, 
"Expected next word to start at col 8"); + + next = buffer_next_word(&b, (struct location){.line = 0, .col = 8}); + ASSERT(next.col == 15, "Expected next word to start at col 15"); + + next = buffer_next_word(&b, (struct location){.line = 0, .col = 15}); + ASSERT(next.col == 22, "Expected next word to start at col 22"); + + struct location prev = + buffer_previous_word(&b, (struct location){.line = 0, .col = 26}); + ASSERT(prev.col == 22, "Expected previous word to start at col 22"); + + prev = buffer_previous_word(&b, (struct location){.line = 0, .col = 22}); + ASSERT(prev.col == 15, "Expected previous word to start at col 15"); + + prev = buffer_previous_word(&b, (struct location){.line = 0, .col = 0}); + ASSERT(prev.col == 0 && prev.line == 0, + "Expected previous word to not go before beginning of buffer"); +} + +void run_buffer_tests(void) { + settings_init(10); + settings_set_default( + "editor.tab-width", + (struct setting_value){.type = Setting_Number, .number_value = 4}); + run_test(test_add); + run_test(test_delete); run_test(test_word_at); + run_test(test_line_len); + run_test(test_char_movement); + run_test(test_word_movement); + settings_destroy(); } diff --git a/test/main.c b/test/main.c index 4c241b3..dc0c2dc 100644 --- a/test/main.c +++ b/test/main.c @@ -9,7 +9,9 @@ void handle_abort() { exit(1); } int main() { - setlocale(LC_ALL, ""); + // Use a hardcoded locale to get a + // predictable env. + setlocale(LC_ALL, "en_US.UTF-8"); signal(SIGABRT, handle_abort); struct timespec test_begin; @@ -52,5 +54,6 @@ int main() { ((uint64_t)test_begin.tv_sec * 1e9 + (uint64_t)test_begin.tv_nsec); printf("\n๐ŸŽ‰ \x1b[1;32mDone! 
All tests successful in %.2f ms!\x1b[0m\n", (double)elapsed_nanos / 1e6); + return 0; } diff --git a/test/text.c b/test/text.c index 9faa663..f890e7b 100644 --- a/test/text.c +++ b/test/text.c @@ -15,22 +15,19 @@ void assert_line_eq(struct text_chunk line, const char *txt, const char *msg) { void assert_line_equal(struct text_chunk *line) {} void test_add_text() { - uint32_t lines_added, cols_added; + uint32_t lines_added; /* use a silly small initial capacity to test re-alloc */ struct text *t = text_create(1); const char *txt = "This is line 1\n"; - text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added, - &cols_added); + text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added); - ASSERT(text_line_size(t, 0) == 14 && text_line_length(t, 0) == 14, - "Expected line 1 to have 14 chars and 14 bytes"); + ASSERT(text_line_size(t, 0) == 14, "Expected line 1 to be 14 bytes"); assert_line_eq(text_get_line(t, 0), "This is line 1", "Expected line 1 to be line 1"); const char *txt2 = "This is line 2\n"; - text_insert_at(t, 1, 0, (uint8_t *)txt2, strlen(txt2), &lines_added, - &cols_added); + text_insert_at(t, 1, 0, (uint8_t *)txt2, strlen(txt2), &lines_added); ASSERT(text_num_lines(t) == 3, "Expected text to have three lines after second insertion"); assert_line_eq(text_get_line(t, 1), "This is line 2", @@ -38,8 +35,7 @@ void test_add_text() { // simulate indentation const char *txt3 = " "; - text_insert_at(t, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added, - &cols_added); + text_insert_at(t, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added); ASSERT(text_num_lines(t) == 3, "Expected text to have three lines after second insertion"); assert_line_eq(text_get_line(t, 0), " This is line 1", @@ -48,7 +44,7 @@ void test_add_text() { "Expected line 2 to be line 2 still"); // insert newline in middle of line - text_insert_at(t, 1, 4, (uint8_t *)"\n", 1, &lines_added, &cols_added); + text_insert_at(t, 1, 4, (uint8_t *)"\n", 1, &lines_added); 
ASSERT(text_num_lines(t) == 4, "Expected text to have four lines after inserting a new line"); assert_line_eq(text_get_line(t, 1), "This", "Expected line 2 to be split"); @@ -56,11 +52,11 @@ void test_add_text() { "Expected line 2 to be split"); // insert newline before line 1 - text_insert_at(t, 1, 0, (uint8_t *)"\n", 1, &lines_added, &cols_added); + text_insert_at(t, 1, 0, (uint8_t *)"\n", 1, &lines_added); ASSERT( text_num_lines(t) == 5, "Expected to have five lines after adding an empty line in the middle"); - ASSERT(text_line_length(t, 1) == 0, "Expected line 2 to be empty"); + ASSERT(text_line_size(t, 1) == 0, "Expected line 2 to be empty"); assert_line_eq(text_get_line(t, 2), "This", "Expected line 3 to be previous line 2"); assert_line_eq(text_get_line(t, 3), " is line 2", @@ -70,37 +66,35 @@ void test_add_text() { } void test_delete_text() { - uint32_t lines_added, cols_added; + uint32_t lines_added; struct text *t = text_create(10); const char *txt = "This is line 1"; - text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added, - &cols_added); + text_insert_at(t, 0, 0, (uint8_t *)txt, strlen(txt), &lines_added); text_delete(t, 0, 12, 0, 14); - ASSERT(text_line_length(t, 0) == 12, - "Expected line to be 12 chars after deleting two"); + ASSERT(text_line_size(t, 0) == 12, + "Expected line to be 12 bytes after deleting two"); ASSERT(strncmp((const char *)text_get_line(t, 0).text, "This is line", text_line_size(t, 0)) == 0, - "Expected two chars to be deleted"); + "Expected two bytes to be deleted"); text_delete(t, 0, 0, 10, 10); ASSERT(text_get_line(t, 0).nbytes == 0, - "Expected line to be empty after many chars removed"); + "Expected line to be empty after many bytes removed"); const char *txt2 = "This is line 1\nThis is line 2\nThis is line 3"; - text_insert_at(t, 0, 0, (uint8_t *)txt2, strlen(txt2), &lines_added, - &cols_added); + text_insert_at(t, 0, 0, (uint8_t *)txt2, strlen(txt2), &lines_added); ASSERT(text_num_lines(t) == 3, "Expected to have 
three lines after inserting as many"); text_delete(t, 1, 11, 1, 14); - ASSERT(text_line_length(t, 1) == 11, - "Expected line to contain 11 chars after deletion"); + ASSERT(text_line_size(t, 1) == 11, + "Expected line to contain 11 bytes after deletion"); struct text_chunk line = text_get_line(t, 1); ASSERT(strncmp((const char *)line.text, "This is lin", line.nbytes) == 0, "Expected deleted characters to be gone in the second line"); - text_delete(t, 1, 0, 1, text_line_length(t, 1) + 1); + text_delete(t, 1, 0, 1, text_line_size(t, 1) + 1); ASSERT(text_num_lines(t) == 2, "Expected to have two lines after deleting one"); struct text_chunk line2 = text_get_line(t, 1); @@ -110,8 +104,8 @@ void test_delete_text() { struct text *t3 = text_create(10); const char *delete_me = "This is line๐ŸŽ™\nQ"; text_insert_at(t3, 0, 0, (uint8_t *)delete_me, strlen(delete_me), - &lines_added, &cols_added); - text_delete(t3, 0, 13, 0, 14); + &lines_added); + text_delete(t3, 0, 16, 1, 0); struct text_chunk top_line = text_get_line(t3, 0); ASSERT(strncmp((const char *)top_line.text, "This is line๐ŸŽ™Q", top_line.nbytes) == 0, @@ -123,33 +117,13 @@ void test_delete_text() { struct text *t4 = text_create(10); const char *deletable_text = "Only one line kinda"; text_append(t4, (uint8_t *)deletable_text, strlen(deletable_text), - &lines_added, &cols_added); + &lines_added); text_delete(t4, 0, 19, 0, 20); ASSERT(text_num_lines(t4) == 1, "Expected the line to still be there"); - ASSERT(text_line_length(t4, 0) == 19, + ASSERT(text_line_size(t4, 0) == 19, "Expected nothing to have happened to the line"); - // test utf-8 - struct text *t2 = text_create(10); - const char *txt3 = "Emojis: ๐Ÿ‡ซ๐Ÿ‡ฎ ๐Ÿฎ\n"; - text_insert_at(t2, 0, 0, (uint8_t *)txt3, strlen(txt3), &lines_added, - &cols_added); - - // TODO: Fix when graphemes are implemented, should be 11, right now it counts - // the two unicode code points ๐Ÿ‡ซ and ๐Ÿ‡ฎ as two chars. 
- ASSERT(text_line_length(t2, 0) == 12, - "Line length should be 12 (even though there " - "are more bytes in the line)."); - - text_delete(t2, 0, 10, 0, 12); - ASSERT(text_line_length(t2, 0) == 10, - "Line length should be 10 after deleting the cow emoji and a space"); - struct text_chunk line3 = text_get_line(t2, 0); - ASSERT(strncmp((const char *)line3.text, "Emojis: ๐Ÿ‡ซ๐Ÿ‡ฎ", line3.nbytes) == 0, - "Expected cow emoji plus space to be deleted"); - text_destroy(t); - text_destroy(t2); text_destroy(t3); text_destroy(t4); } diff --git a/test/utf8.c b/test/utf8.c index d67c409..c5094c7 100644 --- a/test/utf8.c +++ b/test/utf8.c @@ -6,11 +6,6 @@ #include "assert.h" #include "test.h" -void test_nchars_nbytes() { - ASSERT(utf8_nchars((uint8_t *)"๐Ÿ‘ด", strlen("๐Ÿ‘ด")) == 1, - "Expected old man emoji to be 1 char"); - ASSERT(utf8_nbytes((uint8_t *)"๐Ÿ‘ด", strlen("๐Ÿ‘ด"), 1) == 4, - "Expected old man emoji to be 4 bytes"); -} +void test_nchars_nbytes() {} void run_utf8_tests() { run_test(test_nchars_nbytes); }