Skip to content

Commit

Permalink
perf(codegen): reduce size of LineOffsetTable (#4643)
Browse files Browse the repository at this point in the history
`LineOffsetTables` records mappings from byte offset to line and column numbers (with column numbers measured in UTF-16 code units).

Most lines contain only ASCII characters, and for these lines the number of bytes from the start of the line is exactly equal to the UTF-16 column number, so no column lookup table is required.

Reduce the data stored for each line from 32 bytes to 8 bytes by storing the column offset lookup tables separately, since only the rare lines which contain non-ASCII characters need one.

Additionally, store column lookup tables as a `Box<[u32]>` instead of `Vec<u32>` to reduce the size of `ColumnOffsets` by 8 bytes.
  • Loading branch information
overlookmotel committed Aug 6, 2024
1 parent 9f8f299 commit 8dd76e4
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 26 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/oxc_codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ oxc_allocator = { workspace = true }
oxc_syntax = { workspace = true, features = ["to_js_string"] }
oxc_sourcemap = { workspace = true }
oxc_mangler = { workspace = true }
oxc_index = { workspace = true }

bitflags = { workspace = true }
nonmax = { workspace = true }
once_cell = { workspace = true }
daachorse = { workspace = true }
rustc-hash = { workspace = true }
Expand Down
108 changes: 82 additions & 26 deletions crates/oxc_codegen/src/sourcemap_builder.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use std::sync::Arc;

use nonmax::NonMaxU32;

use oxc_index::{Idx, IndexVec};
use oxc_span::Span;
use oxc_syntax::identifier::{LS, PS};

Expand All @@ -9,16 +12,54 @@ const LS_OR_PS_SECOND: u8 = 0x80;
const LS_THIRD: u8 = 0xA8;
const PS_THIRD: u8 = 0xA9;

/// Line offset table
/// Index into vec of `ColumnOffsets`.
///
/// Newtype around `NonMaxU32`, used as the key type of the `IndexVec` which holds
/// the `ColumnOffsets` for lines that need a column lookup table.
// NOTE(review): presumably `NonMaxU32` is used so `Option<ColumnOffsetsId>` needs no
// extra space beyond a `u32` - confirm against `nonmax` docs.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct ColumnOffsetsId(NonMaxU32);

impl Idx for ColumnOffsetsId {
    /// Create a `ColumnOffsetsId` from a `usize` index.
    ///
    /// # Panics
    /// Panics if `idx` is not less than `u32::MAX`.
    #[allow(clippy::cast_possible_truncation)]
    fn from_usize(idx: usize) -> Self {
        assert!(idx < u32::MAX as usize);
        // Cast cannot truncate - `idx` was checked to be below `u32::MAX` above
        let raw = idx as u32;
        // SAFETY: `idx` is below `u32::MAX`, so `raw` is a legal value for `NonMaxU32`
        let inner = unsafe { NonMaxU32::new_unchecked(raw) };
        Self(inner)
    }

    /// Get this ID as a `usize`, suitable for indexing.
    fn index(self) -> usize {
        let Self(inner) = self;
        inner.get() as usize
    }
}

/// Line offset tables.
///
/// Used for tracking lines and columns from byte offsets via binary search.
///
/// Code is adapted from [esbuild](https://github.com/evanw/esbuild/blob/cc74e6042a9f573bf58e1e3f165ebda70af4ad3b/internal/js_printer/js_printer.go#L4806-L4808)
///
/// Most lines of source code will not contain Unicode chars, so optimize storage for this common case.
///
/// Each line is represented by a `Line`.
/// Where a line is entirely ASCII, translating byte offset to UTF-16 column is simple,
/// given the byte offset of start of line. A column lookup table isn't needed for that line.
/// In this case, `Line::column_offsets_id` is `None`.
/// For rare lines which do contain Unicode chars, we store column offsets in a `ColumnOffsets` which
/// is stored in a separate `IndexVec`. `Line::column_offsets_id` contains index for that line's `ColumnOffsets`.
/// Storing column offset info which is rarely used in a separate structure keeps `Line` as small as possible.
#[derive(Debug, Default)]
pub struct LineOffsetTables {
/// One entry per source line, pushed in source order.
/// Searched by `byte_offset_to_start_of_line` to find the line containing a byte offset.
lines: Vec<Line>,
/// Column lookup tables, only for lines in which a Unicode char was found.
/// Indexed by the `ColumnOffsetsId` stored in `Line::column_offsets_id`.
column_offsets: IndexVec<ColumnOffsetsId, ColumnOffsets>,
}

#[derive(Debug)]
pub struct LineOffsetTable {
columns: Option<Vec<u32>>,
byte_offset_to_first: u32,
pub struct Line {
byte_offset_to_start_of_line: u32,
column_offsets_id: Option<ColumnOffsetsId>,
}

/// Column lookup table for a single line in which a Unicode char was found.
///
/// Byte offsets on the line before `byte_offset_to_first` are used as the column directly.
/// Byte offsets at or after it are translated to a column via `columns`.
#[derive(Debug)]
pub struct ColumnOffsets {
/// Byte offset, relative to start of the line, at which the first Unicode char was found.
byte_offset_to_first: u32,
/// Column lookup table, indexed by
/// `(byte offset from start of line) - byte_offset_to_first`.
/// Stored as a boxed slice rather than a `Vec` to keep this struct smaller.
columns: Box<[u32]>,
}

#[allow(clippy::struct_field_names)]
Expand All @@ -27,7 +68,7 @@ pub struct SourcemapBuilder {
original_source: Arc<str>,
last_generated_update: usize,
last_position: Option<u32>,
line_offset_tables: Vec<LineOffsetTable>,
line_offset_tables: LineOffsetTables,
sourcemap_builder: oxc_sourcemap::SourceMapBuilder,
generated_line: u32,
generated_column: u32,
Expand All @@ -40,7 +81,7 @@ impl Default for SourcemapBuilder {
original_source: "".into(),
last_generated_update: 0,
last_position: None,
line_offset_tables: vec![],
line_offset_tables: LineOffsetTables::default(),
sourcemap_builder: oxc_sourcemap::SourceMapBuilder::default(),
generated_line: 0,
generated_column: 0,
Expand Down Expand Up @@ -97,17 +138,19 @@ impl SourcemapBuilder {
fn search_original_line_and_column(&mut self, position: u32) -> (u32, u32) {
let result = self
.line_offset_tables
.partition_point(|table| table.byte_offset_to_start_of_line <= position)
as u32;
.lines
.partition_point(|table| table.byte_offset_to_start_of_line <= position);
let original_line = if result > 0 { result - 1 } else { 0 };
let line = &self.line_offset_tables[original_line as usize];
let line = &self.line_offset_tables.lines[original_line];
let mut original_column = position - line.byte_offset_to_start_of_line;
if original_column >= line.byte_offset_to_first {
if let Some(cols) = &line.columns {
original_column = cols[(original_column - line.byte_offset_to_first) as usize];
if let Some(column_offsets_id) = line.column_offsets_id {
let column_offsets = &self.line_offset_tables.column_offsets[column_offsets_id];
if original_column >= column_offsets.byte_offset_to_first {
original_column = column_offsets.columns
[(original_column - column_offsets.byte_offset_to_first) as usize];
}
}
(original_line, original_column)
(original_line as u32, original_column)
}

#[allow(clippy::cast_possible_truncation)]
Expand Down Expand Up @@ -170,8 +213,9 @@ impl SourcemapBuilder {
self.last_generated_update = output.len();
}

fn generate_line_offset_tables(content: &str) -> Vec<LineOffsetTable> {
let mut tables = vec![];
fn generate_line_offset_tables(content: &str) -> LineOffsetTables {
let mut lines = vec![];
let mut column_offsets = IndexVec::new();

// Process content line-by-line.
// For each line, start by assuming line will be entirely ASCII, and read byte-by-byte.
Expand All @@ -181,12 +225,9 @@ impl SourcemapBuilder {
// At end of line, go back to top of outer loop, and again assume ASCII for next line.
let mut line_byte_offset = 0;
'lines: loop {
tables.push(LineOffsetTable {
columns: None,
// `usize::MAX` so `original_column >= line.byte_offset_to_first` check in
// `search_original_line_and_column` fails if line is all ASCII
byte_offset_to_first: u32::MAX,
lines.push(Line {
byte_offset_to_start_of_line: line_byte_offset,
column_offsets_id: None,
});

let remaining = &content.as_bytes()[line_byte_offset as usize..];
Expand All @@ -209,11 +250,12 @@ impl SourcemapBuilder {
}
_ => {
// Unicode char found.
// Create `columns` Vec, and set `byte_offset_to_first`.
let table = tables.iter_mut().last().unwrap();
table.byte_offset_to_first = byte_offset_from_line_start;
table.columns = Some(vec![]);
let columns = table.columns.as_mut().unwrap();
// Set `column_offsets_id` for line and create `columns` Vec.
let line = lines.iter_mut().last().unwrap();
line.column_offsets_id =
Some(ColumnOffsetsId::from_usize(column_offsets.len()));

let mut columns = vec![];

// Loop through rest of line char-by-char.
// `chunk_byte_offset` in this loop is byte offset from start of this 1st
Expand Down Expand Up @@ -256,13 +298,27 @@ impl SourcemapBuilder {
// Line break found.
// `chunk_byte_offset` is now the offset of *end* of the line break.
line_byte_offset += chunk_byte_offset;

// Record column offsets
column_offsets.push(ColumnOffsets {
byte_offset_to_first: byte_offset_from_line_start,
columns: columns.into_boxed_slice(),
});

// Revert back to outer loop for next line
continue 'lines;
}

// EOF.
// One last column entry for EOF position.
columns.push(column);

// Record column offsets
column_offsets.push(ColumnOffsets {
byte_offset_to_first: byte_offset_from_line_start,
columns: columns.into_boxed_slice(),
});

break 'lines;
}
};
Expand All @@ -277,7 +333,7 @@ impl SourcemapBuilder {
break;
}

tables
LineOffsetTables { lines, column_offsets }
}
}

Expand Down

0 comments on commit 8dd76e4

Please sign in to comment.