diff --git a/ext/prism/extension.c b/ext/prism/extension.c index 7b3f8944782..84872914c42 100644 --- a/ext/prism/extension.c +++ b/ext/prism/extension.c @@ -32,6 +32,7 @@ ID rb_option_id_frozen_string_literal; ID rb_option_id_line; ID rb_option_id_scopes; ID rb_option_id_version; +ID rb_prism_source_id_for; /******************************************************************************/ /* IO of Ruby code */ @@ -599,8 +600,7 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input)); VALUE offsets = rb_ary_new(); - VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets }; - VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource); + VALUE source = rb_funcall(rb_cPrismSource, rb_prism_source_id_for, 3, source_string, LONG2NUM(parser.start_line), offsets); parse_lex_data_t parse_lex_data = { .source = source, @@ -1379,6 +1379,8 @@ Init_prism(void) { rb_option_id_scopes = rb_intern_const("scopes"); rb_option_id_version = rb_intern_const("version"); + rb_prism_source_id_for = rb_intern_const("for"); + /** * The version of the prism library.
*/ diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb index 2014ccea31c..cec4b9d6306 100644 --- a/lib/prism/ffi.rb +++ b/lib/prism/ffi.rb @@ -317,7 +317,7 @@ def lex_common(string, code, options) # :nodoc: buffer.read end - Serialize.load_tokens(Source.new(code), serialized) + Serialize.load_tokens(Source.for(code), serialized) end def parse_common(string, code, options) # :nodoc: @@ -329,7 +329,7 @@ def parse_comments_common(string, code, options) # :nodoc: LibRubyParser::PrismBuffer.with do |buffer| LibRubyParser.pm_serialize_parse_comments(buffer.pointer, string.pointer, string.length, dump_options(options)) - source = Source.new(code) + source = Source.for(code) loader = Serialize::Loader.new(source, buffer.read) loader.load_header @@ -343,7 +343,7 @@ def parse_lex_common(string, code, options) # :nodoc: LibRubyParser::PrismBuffer.with do |buffer| LibRubyParser.pm_serialize_parse_lex(buffer.pointer, string.pointer, string.length, dump_options(options)) - source = Source.new(code) + source = Source.for(code) loader = Serialize::Loader.new(source, buffer.read) tokens = loader.load_tokens diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb index f199af1883c..4f8e443a3ba 100644 --- a/lib/prism/lex_compat.rb +++ b/lib/prism/lex_compat.rb @@ -861,7 +861,7 @@ def result # We sort by location to compare against Ripper's output tokens.sort_by!(&:location) - Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.new(source)) + Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source)) end end diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index ff8b1dc8bf3..e8d77172283 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -5,6 +5,14 @@ module Prism # conjunction with locations to allow them to resolve line numbers and source # ranges. 
class Source + # Create a new source object with the given source code. This method should + # be used instead of `new` and it will return either a `Source` or a + # specialized and more performant `ASCIISource` if no multibyte characters + # are present in the source code. + def self.for(source, start_line = 1, offsets = []) + source.ascii_only? ? ASCIISource.new(source, start_line, offsets) : new(source, start_line, offsets) + end + # The source code that this source object represents. attr_reader :source @@ -111,6 +119,39 @@ def find_line(byte_offset) end end + # Specialized version of Prism::Source for source code that includes ASCII + # characters only. This class is used to apply performance optimizations that + # cannot be applied to sources that include multibyte characters. Sources that + # include multibyte characters are represented by the Prism::Source class. + class ASCIISource < Source + # Return the character offset for the given byte offset. + def character_offset(byte_offset) + byte_offset + end + + # Return the column number in characters for the given byte offset. + def character_column(byte_offset) + byte_offset - line_start(byte_offset) + end + + # Returns the offset from the start of the file for the given byte offset + # counting in code units for the given encoding. + # + # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the + # concept of code units that differs from the number of characters in other + # encodings, it is not captured here. + def code_units_offset(byte_offset, encoding) + byte_offset + end + + # Specialized version of `code_units_column` that does not depend on + # `code_units_offset`, which is a more expensive operation. This is + # essentially the same as `Prism::Source#column`. + def code_units_column(byte_offset, encoding) + byte_offset - line_start(byte_offset) + end + end + # This represents a location in the source.
class Location # A Source object that is used to determine more information from the given diff --git a/rbi/prism/parse_result.rbi b/rbi/prism/parse_result.rbi index 8e9129c28df..61d125c331e 100644 --- a/rbi/prism/parse_result.rbi +++ b/rbi/prism/parse_result.rbi @@ -44,6 +44,20 @@ class Prism::Source def code_units_column(byte_offset, encoding); end end +class Prism::ASCIISource < Prism::Source + sig { params(byte_offset: Integer).returns(Integer) } + def character_offset(byte_offset); end + + sig { params(byte_offset: Integer).returns(Integer) } + def character_column(byte_offset); end + + sig { params(byte_offset: Integer, encoding: Encoding).returns(Integer) } + def code_units_offset(byte_offset, encoding); end + + sig { params(byte_offset: Integer, encoding: Encoding).returns(Integer) } + def code_units_column(byte_offset, encoding); end +end + class Prism::Location sig { returns(Prism::Source) } def source; end diff --git a/sig/prism/parse_result.rbs b/sig/prism/parse_result.rbs index f520479aab7..c475fa597b9 100644 --- a/sig/prism/parse_result.rbs +++ b/sig/prism/parse_result.rbs @@ -19,6 +19,13 @@ module Prism def code_units_column: (Integer byte_offset, Encoding encoding) -> Integer end + class ASCIISource < Source + def character_offset: (Integer byte_offset) -> Integer + def character_column: (Integer byte_offset) -> Integer + def code_units_offset: (Integer byte_offset, Encoding encoding) -> Integer + def code_units_column: (Integer byte_offset, Encoding encoding) -> Integer + end + class Location attr_reader source: Source attr_reader start_offset: Integer diff --git a/templates/ext/prism/api_node.c.erb b/templates/ext/prism/api_node.c.erb index 419236ef782..0e3e4d63cc2 100644 --- a/templates/ext/prism/api_node.c.erb +++ b/templates/ext/prism/api_node.c.erb @@ -76,8 +76,7 @@ pm_source_new(const pm_parser_t *parser, rb_encoding *encoding) { rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index])); } - VALUE source_argv[] = { source_string,
LONG2NUM(parser->start_line), offsets }; - return rb_class_new_instance(3, source_argv, rb_cPrismSource); + return rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(parser->start_line), offsets); } typedef struct pm_node_stack_node { diff --git a/templates/lib/prism/dsl.rb.erb b/templates/lib/prism/dsl.rb.erb index 8dbb540952d..eff0d1c4fcf 100644 --- a/templates/lib/prism/dsl.rb.erb +++ b/templates/lib/prism/dsl.rb.erb @@ -2,7 +2,7 @@ module Prism # The DSL module provides a set of methods that can be used to create prism # nodes in a more concise manner. For example, instead of writing: # - # source = Prism::Source.new("[1]") + # source = Prism::Source.for("[1]") # # Prism::ArrayNode.new( # [ @@ -20,7 +20,7 @@ module Prism # # you could instead write: # - # source = Prism::Source.new("[1]") + # source = Prism::Source.for("[1]") # # ArrayNode( # IntegerNode(Prism::IntegerBaseFlags::DECIMAL, 1, Location(source, 1, 1)), source), diff --git a/templates/lib/prism/serialize.rb.erb b/templates/lib/prism/serialize.rb.erb index c31a319e5f7..29ae5356ba4 100644 --- a/templates/lib/prism/serialize.rb.erb +++ b/templates/lib/prism/serialize.rb.erb @@ -19,7 +19,7 @@ module Prism # Deserialize the AST represented by the given string into a parse result. def self.load(input, serialized) input = input.dup - source = Source.new(input) + source = Source.for(input) loader = Loader.new(source, serialized) result = loader.load_result