Skip to content

Commit

Permalink
Merge pull request #2754 from Shopify/vs/specialize_source
Browse files Browse the repository at this point in the history
Specialize code unit computing for ASCII only sources
  • Loading branch information
kddnewton authored May 3, 2024
2 parents 994316d + 4099316 commit c290b09
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 11 deletions.
6 changes: 4 additions & 2 deletions ext/prism/extension.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ ID rb_option_id_frozen_string_literal;
ID rb_option_id_line;
ID rb_option_id_scopes;
ID rb_option_id_version;
ID rb_prism_source_id_for;

/******************************************************************************/
/* IO of Ruby code */
Expand Down Expand Up @@ -599,8 +600,7 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod

VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
VALUE offsets = rb_ary_new();
VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets };
VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
VALUE source = rb_funcall(rb_cPrismSource, rb_prism_source_id_for, 3, source_string, LONG2NUM(parser.start_line), offsets);

parse_lex_data_t parse_lex_data = {
.source = source,
Expand Down Expand Up @@ -1379,6 +1379,8 @@ Init_prism(void) {
rb_option_id_scopes = rb_intern_const("scopes");
rb_option_id_version = rb_intern_const("version");

rb_prism_source_id_for = rb_intern("for");

/**
* The version of the prism library.
*/
Expand Down
6 changes: 3 additions & 3 deletions lib/prism/ffi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ def lex_common(string, code, options) # :nodoc:
buffer.read
end

Serialize.load_tokens(Source.new(code), serialized)
Serialize.load_tokens(Source.for(code), serialized)
end

def parse_common(string, code, options) # :nodoc:
Expand All @@ -329,7 +329,7 @@ def parse_comments_common(string, code, options) # :nodoc:
LibRubyParser::PrismBuffer.with do |buffer|
LibRubyParser.pm_serialize_parse_comments(buffer.pointer, string.pointer, string.length, dump_options(options))

source = Source.new(code)
source = Source.for(code)
loader = Serialize::Loader.new(source, buffer.read)

loader.load_header
Expand All @@ -343,7 +343,7 @@ def parse_lex_common(string, code, options) # :nodoc:
LibRubyParser::PrismBuffer.with do |buffer|
LibRubyParser.pm_serialize_parse_lex(buffer.pointer, string.pointer, string.length, dump_options(options))

source = Source.new(code)
source = Source.for(code)
loader = Serialize::Loader.new(source, buffer.read)

tokens = loader.load_tokens
Expand Down
2 changes: 1 addition & 1 deletion lib/prism/lex_compat.rb
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ def result
# We sort by location to compare against Ripper's output
tokens.sort_by!(&:location)

Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.new(source))
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
end
end

Expand Down
41 changes: 41 additions & 0 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ module Prism
# conjunction with locations to allow them to resolve line numbers and source
# ranges.
class Source
# Factory for source objects; callers should reach for this rather than `new`.
# When the given source code consists solely of ASCII characters, the
# specialized (and faster) `ASCIISource` is instantiated. Otherwise the
# receiver's own `new` is used, producing a regular source object.
def self.for(source, start_line = 1, offsets = [])
  if source.ascii_only?
    ASCIISource.new(source, start_line, offsets)
  else
    new(source, start_line, offsets)
  end
end

# The source code that this source object represents.
attr_reader :source

Expand Down Expand Up @@ -111,6 +119,39 @@ def find_line(byte_offset)
end
end

# A Prism::Source subclass for source code made up exclusively of ASCII
# characters. Because every character in such a source occupies exactly one
# byte, offset conversions can be answered without inspecting the source
# text. Sources containing multibyte characters are represented by
# Prism::Source itself.
class ASCIISource < Source
  # Convert a byte offset into a character offset. In an ASCII-only source
  # the two are the same number, so the argument is returned as-is.
  def character_offset(byte_offset)
    byte_offset
  end

  # Convert a byte offset into a column measured in characters: the distance
  # in bytes from the start of the line that contains the offset.
  def character_column(byte_offset)
    line_begin = line_start(byte_offset)
    byte_offset - line_begin
  end

  # Convert a byte offset into an offset from the start of the file measured
  # in code units. The byte offset is returned unchanged; the encoding
  # argument is accepted but never consulted.
  def code_units_offset(byte_offset, encoding)
    byte_offset
  end

  # Convert a byte offset into a column measured in code units. The result is
  # computed directly from the start of the line instead of going through
  # `code_units_offset`; the encoding argument is accepted but never
  # consulted.
  def code_units_column(byte_offset, encoding)
    line_begin = line_start(byte_offset)
    byte_offset - line_begin
  end
end

# This represents a location in the source.
class Location
# A Source object that is used to determine more information from the given
Expand Down
14 changes: 14 additions & 0 deletions rbi/prism/parse_result.rbi
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ class Prism::Source
def code_units_column(byte_offset, encoding); end
end

# Signatures for the ASCII-only specialization of Prism::Source.
#
# The superclass is spelled with its full name on purpose: a compact
# `class Prism::ASCIISource` definition does not open the `Prism` lexical
# scope, so a bare `Source` would be looked up at the top level instead of
# resolving to `Prism::Source`.
class Prism::ASCIISource < Prism::Source
  # Character offset for the given byte offset.
  sig { params(byte_offset: Integer).returns(Integer) }
  def character_offset(byte_offset); end

  # Column in characters for the given byte offset.
  sig { params(byte_offset: Integer).returns(Integer) }
  def character_column(byte_offset); end

  # Offset from the start of the file, in code units, for the given byte
  # offset.
  sig { params(byte_offset: Integer, encoding: Encoding).returns(Integer) }
  def code_units_offset(byte_offset, encoding); end

  # Column in code units for the given byte offset.
  sig { params(byte_offset: Integer, encoding: Encoding).returns(Integer) }
  def code_units_column(byte_offset, encoding); end
end

class Prism::Location
sig { returns(Prism::Source) }
def source; end
Expand Down
7 changes: 7 additions & 0 deletions sig/prism/parse_result.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ module Prism
def code_units_column: (Integer byte_offset, Encoding encoding) -> Integer
end

# Specialization of Source for source code that contains only ASCII
# characters. It redeclares the offset and column conversions below.
class ASCIISource < Source
# Character offset for the given byte offset.
def character_offset: (Integer byte_offset) -> Integer
# Column in characters for the given byte offset.
def character_column: (Integer byte_offset) -> Integer
# Offset in code units for the given byte offset; takes the target encoding.
def code_units_offset: (Integer byte_offset, Encoding encoding) -> Integer
# Column in code units for the given byte offset; takes the target encoding.
def code_units_column: (Integer byte_offset, Encoding encoding) -> Integer
end

class Location
attr_reader source: Source
attr_reader start_offset: Integer
Expand Down
3 changes: 1 addition & 2 deletions templates/ext/prism/api_node.c.erb
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ pm_source_new(const pm_parser_t *parser, rb_encoding *encoding) {
rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index]));
}

VALUE source_argv[] = { source_string, LONG2NUM(parser->start_line), offsets };
return rb_class_new_instance(3, source_argv, rb_cPrismSource);
return rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(parser->start_line), offsets);
}

typedef struct pm_node_stack_node {
Expand Down
4 changes: 2 additions & 2 deletions templates/lib/prism/dsl.rb.erb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module Prism
# The DSL module provides a set of methods that can be used to create prism
# nodes in a more concise manner. For example, instead of writing:
#
# source = Prism::Source.new("[1]")
# source = Prism::Source.for("[1]")
#
# Prism::ArrayNode.new(
# [
Expand All @@ -20,7 +20,7 @@ module Prism
#
# you could instead write:
#
# source = Prism::Source.new("[1]")
# source = Prism::Source.for("[1]")
#
# ArrayNode(
# IntegerNode(Prism::IntegerBaseFlags::DECIMAL, 1, Location(source, 1, 1)), source),
Expand Down
2 changes: 1 addition & 1 deletion templates/lib/prism/serialize.rb.erb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ module Prism
# Deserialize the AST represented by the given string into a parse result.
def self.load(input, serialized)
input = input.dup
source = Source.new(input)
source = Source.for(input)
loader = Loader.new(source, serialized)
result = loader.load_result

Expand Down

0 comments on commit c290b09

Please sign in to comment.