diff --git a/Cargo.lock b/Cargo.lock index 878f4fa4acd2e..0ade0e3710b67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,6 +13,18 @@ dependencies = [ "version_check", ] +[[package]] +name = "ahash" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57e6e951cfbb2db8de1828d49073a113a29fd7117b1596caa781a258c7e38d72" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.18" @@ -400,7 +412,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.6", ] [[package]] @@ -413,6 +425,7 @@ dependencies = [ "encoding_rs", "etcetera", "helix-loader", + "imara-diff", "log", "once_cell", "quickcheck", @@ -420,7 +433,6 @@ dependencies = [ "ropey", "serde", "serde_json", - "similar", "slotmap", "smallvec", "smartstring", @@ -611,6 +623,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "imara-diff" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e398bdccad24bf00329728ce0754bb1788f107990969b0c58e1b0483b0ec0e5" +dependencies = [ + "ahash 0.8.0", + "hashbrown", +] + [[package]] name = "indoc" version = "1.0.7" @@ -909,8 +931,7 @@ dependencies = [ [[package]] name = "ropey" version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd22239fafefc42138ca5da064f3c17726a80d2379d817a3521240e78dd0064" +source = "git+https://github.com/pascalkuthe/ropey.git?branch=line_diff2#1bbfc5833b95a183e73aa70188d502793e5f7bb7" dependencies = [ "smallvec", "str_indices", @@ -1021,12 +1042,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "similar" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62ac7f900db32bf3fd12e0117dd3dc4da74bc52ebaac97f39668446d89694803" - [[package]] name = "slab" version = "0.4.7" diff --git a/helix-core/Cargo.toml b/helix-core/Cargo.toml index 7585c347607cd..de15066905bee 100644 --- a/helix-core/Cargo.toml +++ b/helix-core/Cargo.toml @@ -17,7 +17,7 @@ integration = [] [dependencies] helix-loader = { version = "0.6", path = "../helix-loader" } -ropey = { version = "1.5", default-features = false, features = ["simd"] } +ropey = { version = "1.5", default-features = false, features = ["simd"] , git = "https://github.com/pascalkuthe/ropey.git", branch = "line_diff2" } smallvec = "1.10" smartstring = "1.0.1" unicode-segmentation = "1.10" @@ -36,7 +36,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" toml = "0.5" -similar = "2.2" +imara-diff = "0.1.0" encoding_rs = "0.8" diff --git a/helix-core/src/diff.rs b/helix-core/src/diff.rs index 6960c679c4acd..c754da30f4aec 100644 --- a/helix-core/src/diff.rs +++ b/helix-core/src/diff.rs @@ -1,58 +1,195 @@ -use crate::{Rope, Transaction}; +use std::ops::Range; +use std::time::Instant; -/// Compares `old` and `new` to generate a [`Transaction`] describing -/// the steps required to get from `old` to `new`. -pub fn compare_ropes(old: &Rope, new: &Rope) -> Transaction { - // `similar` only works on contiguous data, so a `Rope` has - // to be temporarily converted into a `String`. - let old_converted = old.to_string(); - let new_converted = new.to_string(); - - // A timeout is set so after 1 seconds, the algorithm will start - // approximating. This is especially important for big `Rope`s or - // `Rope`s that are extremely dissimilar to each other. - let mut config = similar::TextDiff::configure(); - config.timeout(std::time::Duration::from_secs(1)); - - let diff = config.diff_chars(&old_converted, &new_converted); - - // The current position of the change needs to be tracked to - // construct the `Change`s. - let mut pos = 0; - Transaction::change( - old, - diff.ops() +use imara_diff::intern::InternedInput; +use imara_diff::Algorithm; +use ropey::RopeSlice; + +use crate::{ChangeSet, Rope, Tendril, Transaction}; + +/// A `imara_diff::Sink` that builds a `ChangeSet` for a character diff of a hunk +struct CharChangeSetBuilder<'a> { + res: &'a mut ChangeSet, + hunk: &'a InternedInput, + pos: u32, +} + +impl imara_diff::Sink for CharChangeSetBuilder<'_> { + type Out = (); + fn process_change(&mut self, before: Range, after: Range) { + self.res.retain((before.start - self.pos) as usize); + self.res.delete(before.len()); + self.pos = before.end; + + let res = self.hunk.after[after.start as usize..after.end as usize] + .iter() + .map(|&token| self.hunk.interner[token]) + .collect(); + + self.res.insert(res); + } + + fn finish(self) -> Self::Out { + self.res.retain(self.hunk.before.len() - self.pos as usize); + } +} + +struct LineChangeSetBuilder<'a> { + res: ChangeSet, + after: RopeSlice<'a>, + file: &'a InternedInput>, + current_hunk: InternedInput, + pos: u32, +} + +impl imara_diff::Sink for LineChangeSetBuilder<'_> { + type Out = ChangeSet; + + fn process_change(&mut self, before: Range, after: Range) { + let len = self.file.before[self.pos as usize..before.start as usize] .iter() - .map(|op| op.as_tag_tuple()) - .filter_map(|(tag, old_range, new_range)| { - // `old_pos..pos` is equivalent to `start..end` for where - // the change should be applied. - let old_pos = pos; - pos += old_range.end - old_range.start; - - match tag { - // Semantically, inserts and replacements are the same thing. - similar::DiffTag::Insert | similar::DiffTag::Replace => { - // This is the text from the `new` rope that should be - // inserted into `old`. - let text: &str = { - let start = new.char_to_byte(new_range.start); - let end = new.char_to_byte(new_range.end); - &new_converted[start..end] - }; - Some((old_pos, pos, Some(text.into()))) + .map(|&it| self.file.interner[it].len_chars()) + .sum(); + self.res.retain(len); + self.pos = before.end; + + // do not perform diffs on large hunks + let len_before = before.end - before.start; + let len_after = after.end - after.start; + + // Pure insertions/removals do not require a character diff. + // Very large changes are ignored because their character diff is expensive to compute + // TODO adjust heuristic to detect large changes? + if len_before == 0 + || len_after == 0 + || len_after > 5 * len_before + || 5 * len_after < len_before && len_before > 10 + || len_before + len_after > 200 + { + let remove = self.file.before[before.start as usize..before.end as usize] + .iter() + .map(|&it| self.file.interner[it].len_chars()) + .sum(); + self.res.delete(remove); + let mut fragment = Tendril::new(); + if len_after > 500 { + // copying a rope line by line is slower then copying the entire + // rope. Use to_string for very large changes instead.. + if self.file.after.len() == after.end as usize { + if after.start == 0 { + fragment = self.after.to_string().into(); + } else { + let start = self.after.line_to_char(after.start as usize); + fragment = self.after.slice(start..).to_string().into(); } - similar::DiffTag::Delete => Some((old_pos, pos, None)), - similar::DiffTag::Equal => None, + } else if after.start == 0 { + let end = self.after.line_to_char(after.end as usize); + fragment = self.after.slice(..end).to_string().into(); + } else { + let start = self.after.line_to_char(after.start as usize); + let end = self.after.line_to_char(after.end as usize); + fragment = self.after.slice(start..end).to_string().into(); } - }), - ) + } else { + for &line in &self.file.after[after.start as usize..after.end as usize] { + for chunk in self.file.interner[line].chunks() { + fragment.push_str(chunk) + } + } + }; + self.res.insert(fragment); + } else { + // for reasonably small hunks, generating a ChangeSet from char diff can save memory + // TODO use a tokenizer (word diff?) for improved performance + let hunk_before = self.file.before[before.start as usize..before.end as usize] + .iter() + .flat_map(|&it| self.file.interner[it].chars()); + let hunk_after = self.file.after[after.start as usize..after.end as usize] + .iter() + .flat_map(|&it| self.file.interner[it].chars()); + self.current_hunk.update_before(hunk_before); + self.current_hunk.update_after(hunk_after); + + // the histogram heuristic does not work as well + // for characters because the same characters often reoccur + // use myer diff instead + imara_diff::diff( + Algorithm::Myers, + &self.current_hunk, + CharChangeSetBuilder { + res: &mut self.res, + hunk: &self.current_hunk, + pos: 0, + }, + ); + + self.current_hunk.clear(); + } + } + + fn finish(mut self) -> Self::Out { + let len = self.file.before[self.pos as usize..] + .iter() + .map(|&it| self.file.interner[it].len_chars()) + .sum(); + + self.res.retain(len); + self.res + } +} + +struct RopeLines<'a>(RopeSlice<'a>); + +impl<'a> imara_diff::intern::TokenSource for RopeLines<'a> { + type Token = RopeSlice<'a>; + // TODO: improve performance of lines iterator (https://github.com/cessen/ropey/issues/25) + type Tokenizer = ropey::iter::Lines<'a>; + + fn tokenize(&self) -> Self::Tokenizer { + self.0.lines() + } + + fn estimate_tokens(&self) -> u32 { + // we can provide a perfect estimate which is very nice for performance + self.0.len_lines() as u32 + } +} + +/// Compares `old` and `new` to generate a [`Transaction`] describing +/// the steps required to get from `old` to `new`. +pub fn compare_ropes(before: &Rope, after: &Rope) -> Transaction { + let start = Instant::now(); + let res = ChangeSet::with_capacity(32); + let after = after.slice(..); + let file = InternedInput::new(RopeLines(before.slice(..)), RopeLines(after)); + let builder = LineChangeSetBuilder { + res, + file: &file, + after, + pos: 0, + current_hunk: InternedInput::default(), + }; + + let res = imara_diff::diff(Algorithm::Histogram, &file, builder).into(); + + log::debug!( + "rope diff took {}s", + Instant::now().duration_since(start).as_secs_f64() + ); + res } #[cfg(test)] mod tests { use super::*; + fn test_identity(a: &str, b: &str) { + let mut old = Rope::from(a); + let new = Rope::from(b); + compare_ropes(&old, &new).apply(&mut old); + assert_eq!(old, new); + } + quickcheck::quickcheck! { fn test_compare_ropes(a: String, b: String) -> bool { let mut old = Rope::from(a); @@ -61,4 +198,25 @@ mod tests { old == new } } + + #[test] + fn equal_files() { + test_identity("foo", "foo"); + } + + #[test] + fn trailing_newline() { + test_identity("foo\n", "foo"); + test_identity("foo", "foo\n"); + } + + #[test] + fn new_file() { + test_identity("", "foo"); + } + + #[test] + fn deleted_file() { + test_identity("foo", ""); + } } diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs index daf4a77e8447a..9ca293bde72ba 100644 --- a/helix-core/src/transaction.rs +++ b/helix-core/src/transaction.rs @@ -56,7 +56,7 @@ impl ChangeSet { } // Changeset builder operations: delete/insert/retain - fn delete(&mut self, n: usize) { + pub(crate) fn delete(&mut self, n: usize) { use Operation::*; if n == 0 { return; @@ -71,7 +71,7 @@ impl ChangeSet { } } - fn insert(&mut self, fragment: Tendril) { + pub(crate) fn insert(&mut self, fragment: Tendril) { use Operation::*; if fragment.is_empty() { @@ -93,7 +93,7 @@ impl ChangeSet { self.changes.push(new_last); } - fn retain(&mut self, n: usize) { + pub(crate) fn retain(&mut self, n: usize) { use Operation::*; if n == 0 { return;