-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
significantly improve treesitter performance while editing large files #4716
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,16 +7,19 @@ use crate::{ | |
Rope, RopeSlice, Tendril, | ||
}; | ||
|
||
use ahash::RandomState; | ||
use arc_swap::{ArcSwap, Guard}; | ||
use bitflags::bitflags; | ||
use hashbrown::raw::RawTable; | ||
use slotmap::{DefaultKey as LayerId, HopSlotMap}; | ||
|
||
use std::{ | ||
borrow::Cow, | ||
cell::RefCell, | ||
collections::{HashMap, VecDeque}, | ||
fmt, | ||
mem::replace, | ||
hash::{Hash, Hasher}, | ||
mem::{replace, transmute}, | ||
path::Path, | ||
str::FromStr, | ||
sync::Arc, | ||
|
@@ -769,30 +772,38 @@ impl Syntax { | |
// Convert the changeset into tree sitter edits. | ||
let edits = generate_edits(old_source, changeset); | ||
|
||
// This table allows inverse indexing of `layers`. | ||
// That is by hashing a `Layer` you can find | ||
// the `LayerId` of an existing equivalent `Layer` in `layers`. | ||
// | ||
// It is used to determine if a new layer exists for an injection | ||
// or if an existing layer needs to be updated. | ||
let mut layers_table = RawTable::with_capacity(self.layers.len()); | ||
let layers_hasher = RandomState::new(); | ||
// Use the edits to update all layers markers | ||
if !edits.is_empty() { | ||
fn point_add(a: Point, b: Point) -> Point { | ||
if b.row > 0 { | ||
Point::new(a.row.saturating_add(b.row), b.column) | ||
} else { | ||
Point::new(0, a.column.saturating_add(b.column)) | ||
} | ||
fn point_add(a: Point, b: Point) -> Point { | ||
if b.row > 0 { | ||
Point::new(a.row.saturating_add(b.row), b.column) | ||
} else { | ||
Point::new(0, a.column.saturating_add(b.column)) | ||
} | ||
fn point_sub(a: Point, b: Point) -> Point { | ||
if a.row > b.row { | ||
Point::new(a.row.saturating_sub(b.row), a.column) | ||
} else { | ||
Point::new(0, a.column.saturating_sub(b.column)) | ||
} | ||
} | ||
fn point_sub(a: Point, b: Point) -> Point { | ||
if a.row > b.row { | ||
Point::new(a.row.saturating_sub(b.row), a.column) | ||
} else { | ||
Point::new(0, a.column.saturating_sub(b.column)) | ||
} | ||
} | ||
|
||
for layer in self.layers.values_mut() { | ||
// The root layer always covers the whole range (0..usize::MAX) | ||
if layer.depth == 0 { | ||
layer.flags = LayerUpdateFlags::MODIFIED; | ||
continue; | ||
} | ||
for (layer_id, layer) in self.layers.iter_mut() { | ||
// The root layer always covers the whole range (0..usize::MAX) | ||
if layer.depth == 0 { | ||
layer.flags = LayerUpdateFlags::MODIFIED; | ||
continue; | ||
} | ||
|
||
if !edits.is_empty() { | ||
for range in &mut layer.ranges { | ||
// Roughly based on https://github.com/tree-sitter/tree-sitter/blob/ddeaa0c7f534268b35b4f6cb39b52df082754413/lib/src/subtree.c#L691-L720 | ||
for edit in edits.iter().rev() { | ||
|
@@ -857,6 +868,12 @@ impl Syntax { | |
} | ||
} | ||
} | ||
|
||
let hash = layers_hasher.hash_one(layer); | ||
// Safety: insert_no_grow is unsafe because it assumes that the table | ||
// has enough capacity to hold additional elements. | ||
// This is always the case as we reserved enough capacity above. | ||
unsafe { layers_table.insert_no_grow(hash, layer_id) }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I usually use
layers_table.insert(hash, layer_id, |_| unreachable!("table should be large enough"));
layers_table.insert(hash, layer_id, |rehashed_layer| hasher.hash_one(self.layers[rehashed_layer]))); |
||
} | ||
|
||
PARSER.with(|ts_parser| { | ||
|
@@ -981,27 +998,23 @@ impl Syntax { | |
let depth = layer.depth + 1; | ||
// TODO: can't inline this since matches borrows self.layers | ||
for (config, ranges) in injections { | ||
// Find an existing layer | ||
let layer = self | ||
.layers | ||
.iter_mut() | ||
.find(|(_, layer)| { | ||
layer.depth == depth && // TODO: track parent id instead | ||
layer.config.language == config.language && layer.ranges == ranges | ||
let new_layer = LanguageLayer { | ||
tree: None, | ||
config, | ||
depth, | ||
ranges, | ||
flags: LayerUpdateFlags::empty(), | ||
}; | ||
|
||
// Find an identical existing layer | ||
let layer = layers_table | ||
.get(layers_hasher.hash_one(&new_layer), |&it| { | ||
self.layers[it] == new_layer | ||
}) | ||
.map(|(id, _layer)| id); | ||
.copied(); | ||
|
||
// ...or insert a new one. | ||
let layer_id = layer.unwrap_or_else(|| { | ||
self.layers.insert(LanguageLayer { | ||
tree: None, | ||
config, | ||
depth, | ||
ranges, | ||
// set the modified flag to ensure the layer is parsed | ||
flags: LayerUpdateFlags::empty(), | ||
}) | ||
}); | ||
let layer_id = layer.unwrap_or_else(|| self.layers.insert(new_layer)); | ||
|
||
queue.push_back(layer_id); | ||
} | ||
|
@@ -1138,6 +1151,34 @@ pub struct LanguageLayer { | |
flags: LayerUpdateFlags, | ||
} | ||
|
||
/// This PartialEq implementation only checks if that | ||
/// two layers are theoretically identical (meaning they highlight the same text range with the same language). | ||
/// It does not check whether the layers have the same internal treesitter | ||
/// state. | ||
impl PartialEq for LanguageLayer { | ||
fn eq(&self, other: &Self) -> bool { | ||
self.depth == other.depth | ||
&& self.config.language == other.config.language | ||
&& self.ranges == other.ranges | ||
} | ||
} | ||
|
||
/// Hash implementation belongs to PartialEq implementation above. | ||
/// See its documentation for details. | ||
impl Hash for LanguageLayer { | ||
fn hash<H: Hasher>(&self, state: &mut H) { | ||
self.depth.hash(state); | ||
// The transmute is necessary here because tree_sitter::Language does not derive Hash at the moment. | ||
// However it does use #[repr] transparent so the transmute here is safe | ||
// as `Language` (which `Grammar` is an alias for) is just a newtype wrapper around a (thin) pointer. | ||
// This is also compatible with the PartialEq implementation of language | ||
// as that is just a pointer comparison. | ||
let language: *const () = unsafe { transmute(self.config.language) }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
fn main() {
let a = vec![1];
let ptr = &a as *const _ as *const ();
println!("{:?}", ptr);
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Though hmm, since it's a wrapper type it might not be the same thing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah its not the same, because Grammar is a Newtype wrapper around a pointer: struct Grammar(*const ()); You can definitely implement this with pointer cast but at that point you are re-implementing let language_ptr: *const Grammar = &self.config.language as *const Grammar;
let raw_language_ptr: *const *const () = language as *const *const ();
let language = *raw_language_ptr; I think this is one of those rare cases where transmute is the cleanest option. I use it only to get to the inner value of the newtype wrapper here (which is |
||
language.hash(state); | ||
self.ranges.hash(state); | ||
} | ||
} | ||
|
||
impl LanguageLayer { | ||
pub fn tree(&self) -> &Tree { | ||
// TODO: no unwrap | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to pull in
RawTable
? Why not justHashMap::with_capacity()
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The point of the
RawTable
is not thecapacity
/insert_no_grow
. That just a nice bonus.A
HashMap<K,V>
is essentially aRawTable<(K,V)>
internally. However we want aLanguageLayer -> LayerId
map here so using a normalHashMap<LanguageLayer, LayerId>
would require that we clone allLanguageLayer
s (we can not store references intoSyntax::layers
because it is mutated later).With a
RawTable
we can just store theLayerId
asRawTable<LayerId>
.When we need access to the original keys to resolve hash collisions during lookup later we can just use the
LayerId
to index back intoself.layers
.IndexMap
does essentially the same thing internally (it just doesn't fit well here because we want to keep using aHopSlotMap
whereasindexmap
forces us to use essentially aVec
).I also use the exact same strategy with
RawTable
for the interning inimara-diff
(to avoid pulling inindexmap
which offers a lot more functionality). So we will depend onhashbrown
anyway (in fact we already do so as a dependy ofunicode-linebreak
)