Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Parser state #231

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Steepfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ target :lib do
check "lib/lrama/grammar/parameterizing_rule_resolver.rb"
check "lib/lrama/grammar/parameterizing_rule_rhs_builder.rb"
check "lib/lrama/grammar/parameterizing_rules"
check "lib/lrama/grammar/parser_state.rb"
check "lib/lrama/grammar/percent_code.rb"
check "lib/lrama/grammar/precedence.rb"
check "lib/lrama/grammar/printer.rb"
Expand Down
10 changes: 8 additions & 2 deletions lib/lrama/grammar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
require "lrama/grammar/code"
require "lrama/grammar/counter"
require "lrama/grammar/error_token"
require "lrama/grammar/parser_state"
require "lrama/grammar/percent_code"
require "lrama/grammar/precedence"
require "lrama/grammar/printer"
Expand All @@ -20,7 +21,7 @@
module Lrama
# Grammar is the result of parsing an input grammar file
class Grammar
attr_reader :percent_codes, :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
attr_reader :percent_codes, :parser_states, :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
attr_accessor :union, :expect,
:printers, :error_tokens,
:lex_param, :parse_param, :initial_action,
Expand All @@ -35,6 +36,7 @@ def initialize(rule_counter)
@percent_codes = []
@printers = []
@error_tokens = []
@parser_states = []
@symbols = []
@types = []
@rule_builders = []
Expand Down Expand Up @@ -63,6 +65,10 @@ def add_error_token(ident_or_tags:, token_code:, lineno:)
@error_tokens << ErrorToken.new(ident_or_tags: ident_or_tags, token_code: token_code, lineno: lineno)
end

# Builds a ParserState from the given id and state list and records it in
# @parser_states.
#
# state_id:   forwarded to ParserState.new as state_id:
# state_list: forwarded to ParserState.new as state_list:
#
# Returns @parser_states (the collection the new entry was appended to).
def add_parser_state(state_id, state_list)
  parser_state = ParserState.new(state_id: state_id, state_list: state_list)
  @parser_states.push(parser_state)
end

def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false)
if token_id && (sym = @symbols.find {|s| s.token_id == token_id })
if replace
Expand Down Expand Up @@ -195,7 +201,7 @@ def find_symbol_by_id(id)
end

def find_symbol_by_id!(id)
find_symbol_by_id(id) || (raise "Symbol not found: #{id}")
find_symbol_by_id(id) || (raise "Symbol not found: #{id.s_value}")
end

def find_symbol_by_number!(number)
Expand Down
144 changes: 144 additions & 0 deletions lib/lrama/grammar/parser_state.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
module Lrama
  class Grammar
    # Renders the C source fragments for one named parser state: the enum of
    # its values, a value-to-name table, the stack variables, and the
    # push/pop/set macros operating on that stack.
    #
    # state_id and every element of state_list only need to respond to
    # #s_value (presumably they are lexer tokens -- confirm against the caller).
    class ParserState
      attr_reader :state_id, :state_list

      # state_id:   object whose #s_value is the name of this parser state
      # state_list: objects whose #s_value are the values the state can take
      def initialize(state_id:, state_list:)
        @state_id = state_id
        @state_list = state_list
      end

      # Name of the parser state, as written in state_id.
      def state_name
        state_id.s_value
      end

      # C enum tag for this parser state.
      def enum_name
        format("yyparser_state_%s", state_name)
      end

      # C typedef name for the enum.
      def enum_type
        enum_name + "_t"
      end

      # Name of the C array mapping enum values to their printable names.
      def enum_name_table_name
        enum_name + "_names"
      end

      # Common prefix of the C stack variables (<prefix>_a, _b, _p).
      # Currently the same string as enum_name.
      def stack_prefix
        format("yyparser_state_%s", state_name)
      end

      # Name of the C variable holding the current stack capacity.
      def states_stack_size_name
        stack_prefix + "_stacksize"
      end

      # Name of the C macro that turns an enum value into its name.
      def state_name_macro
        format("YY_STATE_%s_NAME", state_name.upcase)
      end

      # Name of the C macro that yields the name of the value on top of the stack.
      def current_state_name_macro
        format("YY_CURRENT_STATE_%s_NAME", state_name.upcase)
      end

      # Enumerator list, one enumerator per entry of state_list.
      def enum_body
        enum_numbers.join(",\n ")
      end

      # C string literals for each state value, terminated by YY_NULLPTR.
      # Returns an Array of String.
      def int_to_name
        quoted = state_list.map { |state| %("#{state.s_value}") }
        quoted.push("YY_NULLPTR")
      end

      # C code declaring the enum, its typedef, the name table, the
      # value-to-name function and the two *_NAME macros.
      def enum_definition
        tag = enum_name
        type = enum_type
        table = enum_name_table_name
        name_macro = state_name_macro

        <<~ENUM
          enum #{tag}
          {
          #{enum_body}
          };
          typedef enum #{tag} #{type};

          static const char *const #{table}[] = {
          #{int_to_name.join(", ")}
          };

          YY_ATTRIBUTE_UNUSED
          static const char *
          #{tag}_name (#{type} num)
          {
          return #{table}[num];
          }

          # define #{name_macro}(value) #{tag}_name (value)
          # define #{current_state_name_macro} #{name_macro} (*#{stack_prefix}_p)
        ENUM
      end

      # C code declaring the stack-size variable and the stack itself
      # (backing array, bottom pointer, top pointer).
      def states_stacks
        stack = stack_prefix
        size = states_stack_size_name

        <<~STACKS
          /* Current size of state stack size */
          YYPTRDIFF_T #{size} = YYINITDEPTH;

          /* The parser state stack (#{stack}): array, bottom, top. */
          int #{stack}_a[YYINITDEPTH];
          int *#{stack}_b = #{stack}_a;
          int *#{stack}_p = #{stack}_b;
        STACKS
      end

      # C macros YYPUSH_STATE_<NAME>, YYPOP_STATE_<NAME>, YYSET_STATE_<NAME>
      # and YY_STATE_<NAME> for this parser state. The doubled backslashes in
      # the heredoc become single backslashes (C line continuations, and the
      # "\n" escapes inside the C string literals) in the generated text.
      def states_functions
        name = state_name
        upper = name.upcase
        stack = stack_prefix
        size = states_stack_size_name
        name_macro = state_name_macro

        <<~FUNC
          # define YYPUSH_STATE_#{upper}(value) \\
          do \\
          { \\
          if (#{stack}_b + #{size} - 1 <= #{stack}_p) \\
          YYSTATE_STACK_INCREASE (#{stack}_a, #{stack}_b, #{stack}_p, #{size}, "#{name}"); \\
          YYDPRINTF ((stderr, "Push %s to #{name}\\n", #{name_macro} (yyparser_state_ ## value))); \\
          *++#{stack}_p = yyparser_state_ ## value; \\
          } \\
          while (0)

          # define YYPOP_STATE_#{upper}() \\
          do \\
          { \\
          YYDPRINTF ((stderr, "Pop #{name}\\n")); \\
          if (#{stack}_p != #{stack}_b) \\
          { \\
          #{stack}_p -= 1; \\
          } \\
          else \\
          { \\
          YYDPRINTF ((stderr, "Try to pop empty #{name} stack\\n")); \\
          } \\
          } \\
          while (0)

          # define YYSET_STATE_#{upper}(value) \\
          do \\
          { \\
          YYDPRINTF ((stderr, "Set %s to #{name}\\n", #{name_macro} (yyparser_state_ ## value))); \\
          *#{stack}_p = yyparser_state_ ## value; \\
          } \\
          while (0)

          # define YY_STATE_#{upper} #{stack}_p
        FUNC
      end

      # C code that frees the stack when its bottom pointer no longer points
      # at the initial backing array.
      def states_clean_up_stack
        stack = stack_prefix

        <<~CODE
          if (#{stack}_b != #{stack}_a)
          YYSTACK_FREE (#{stack}_b);
        CODE
      end

      private

      # Enumerator identifiers, in state_list order.
      def enum_numbers
        state_list.map { |state| format("yyparser_state_%s", state.s_value) }
      end
    end
  end
end
7 changes: 4 additions & 3 deletions lib/lrama/grammar/reference.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ module Lrama
class Grammar
# type: :dollar or :at
# name: String (e.g. $$, $foo, $expr.right)
# index: Integer (e.g. $1)
# number: Integer (e.g. $1)
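# index: Integer, 1-origin position in the rule's rhs counting every token,
#        including parser state tokens that cannot be referred to (derived from number)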
# ex_tag: "$<tag>1" (Optional)
class Reference < Struct.new(:type, :name, :index, :ex_tag, :first_column, :last_column, keyword_init: true)
class Reference < Struct.new(:type, :name, :number, :index, :ex_tag, :first_column, :last_column, keyword_init: true)
def value
name || index
name || number
end
end
end
Expand Down
69 changes: 66 additions & 3 deletions lib/lrama/grammar/rule_builder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ def process_rhs(parameterizing_resolver)
@parameterizing_rules = @parameterizing_rules + parameterizing.build
@replaced_rhs << parameterizing.build_token
end
when Lrama::Lexer::Token::ParserStatePop
process_parser_state_token(token, "parser_state_pop_", "YYPOP_STATE_#{token.s_value.upcase}();", i, parameterizing_resolver)
when Lrama::Lexer::Token::ParserStatePush
process_parser_state_token(token, "parser_state_push_", "YYPUSH_STATE_#{token.s_value.upcase}(#{token.state.s_value});", i, parameterizing_resolver)
when Lrama::Lexer::Token::ParserStateSet
process_parser_state_token(token, "parser_state_set_", "YYSET_STATE_#{token.s_value.upcase}(#{token.state.s_value});", i, parameterizing_resolver)
when Lrama::Lexer::Token::UserCode
prefix = token.referred ? "@" : "$@"
new_token = Lrama::Lexer::Token::Ident.new(s_value: prefix + @midrule_action_counter.increment.to_s)
Expand All @@ -138,29 +144,53 @@ def process_rhs(parameterizing_resolver)
end
end

# Replaces a parser-state token in the rhs with a generated identifier and
# registers a derived rule whose lhs is that identifier and whose action is
# the given C code.
#
# token:    the parser-state token; its s_value and location are used
# prefix:   String prepended to the generated identifier's name
# code:     String used as the s_value of the derived rule's user code
# position_in_original_rule_rhs: passed through to the derived RuleBuilder
# parameterizing_resolver:       passed through to RuleBuilder#setup_rules
#
# Returns @rule_builders_for_derived_rules.
def process_parser_state_token(token, prefix, code, position_in_original_rule_rhs, parameterizing_resolver)
  # The counter suffix keeps generated names distinct from each other.
  base_name = prefix + token.s_value
  placeholder = Lrama::Lexer::Token::Ident.new(s_value: base_name + @midrule_action_counter.increment.to_s)
  action = Lrama::Lexer::Token::UserCode.new(s_value: code, location: token.location)

  @replaced_rhs.push(placeholder)

  derived = RuleBuilder.new(
    @rule_counter,
    @midrule_action_counter,
    position_in_original_rule_rhs,
    skip_preprocess_references: true
  )
  derived.lhs = placeholder
  derived.user_code = action
  derived.complete_input
  derived.setup_rules(parameterizing_resolver)

  @rule_builders_for_derived_rules.push(derived)
end

def numberize_references
# Bison n'th component is 1-origin
(rhs + [user_code]).compact.each.with_index(1) do |token, i|
next unless token.is_a?(Lrama::Lexer::Token::UserCode)

token.references.each do |ref|
# Derive number reference index from named reference
ref_name = ref.name
if ref_name && ref_name != '$'
if lhs.referred_by?(ref_name)
ref.name = '$'
else
candidates = rhs.each_with_index.select {|token, i| token.referred_by?(ref_name) }
candidates = referable_tokens.each_with_index.select {|token, i| token.referred_by?(ref_name) }

raise "Referring symbol `#{ref_name}` is duplicated. #{token}" if candidates.size >= 2
raise "Referring symbol `#{ref_name}` is not found. #{token}" unless referring_symbol = candidates.first

ref.index = referring_symbol[1] + 1
ref.number = referring_symbol[1] + 1
end
end

if ref.number
# Remapping number reference index to include non referable tokens
# TODO: Is it better to separate "number" of reference from actual "index" (Grammar::Reference)?
ref.index = number_to_index[ref.number]

if !ref.index
raise "Can not refer to not exist component. $#{ref.number}"
end
end

# TODO: Need to check index of @ too?
next if ref.type == :at

if ref.index
# TODO: Prohibit $0 even so Bison allows it?
# See: https://www.gnu.org/software/bison/manual/html_node/Actions.html
Expand All @@ -171,6 +201,39 @@ def numberize_references
end
end

# Tells whether a rhs token can be the target of a reference.
# Parser state pop/push/set tokens cannot; every other token can.
#
# Returns true or false.
def referable_token?(token)
  case token
  when Lrama::Lexer::Token::ParserStatePop,
       Lrama::Lexer::Token::ParserStatePush,
       Lrama::Lexer::Token::ParserStateSet
    false
  else
    true
  end
end

# The rhs tokens that can be the target of a reference, in rhs order.
def referable_tokens
  rhs.select { |candidate| referable_token?(candidate) }
end

# Lookup table from a reference number (position among referable tokens)
# to the 1-origin position of that token in rhs. Slot 0 holds 0; slot n
# holds the rhs position of the n-th referable token.
#
# The table is computed once and cached in @number_to_index.
def number_to_index
  @number_to_index ||= rhs.each.with_index(1).each_with_object([0]) do |(token, position), table|
    table << position if referable_token?(token)
  end
end

def flush_user_code
if c = @user_code
@rhs << c
Expand Down
4 changes: 4 additions & 0 deletions lib/lrama/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class Lexer
%empty
%code
%rule
%parser-state-push
%parser-state-pop
%parser-state-set
%parser-state
)

def initialize(text)
Expand Down
3 changes: 3 additions & 0 deletions lib/lrama/lexer/token.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
require 'lrama/lexer/token/char'
require 'lrama/lexer/token/ident'
require 'lrama/lexer/token/instantiate_rule'
require 'lrama/lexer/token/parser_state_pop'
require 'lrama/lexer/token/parser_state_push'
require 'lrama/lexer/token/parser_state_set'
require 'lrama/lexer/token/tag'
require 'lrama/lexer/token/user_code'

Expand Down
8 changes: 8 additions & 0 deletions lib/lrama/lexer/token/parser_state_pop.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
module Lrama
  class Lexer
    class Token
      # Token marking a parser-state "pop" inside a rule's rhs.
      # RuleBuilder#process_rhs turns it into a `YYPOP_STATE_<NAME>();`
      # action, where NAME is this token's upcased s_value.
      class ParserStatePop < Token
      end
    end
  end
end
9 changes: 9 additions & 0 deletions lib/lrama/lexer/token/parser_state_push.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module Lrama
  class Lexer
    class Token
      # Token marking a parser-state "push" inside a rule's rhs.
      # RuleBuilder#process_rhs turns it into a `YYPUSH_STATE_<NAME>(<value>);`
      # action, where NAME is this token's upcased s_value and value is
      # state.s_value.
      class ParserStatePush < Token
        # The value to push; an object responding to #s_value.
        attr_reader :state
        attr_writer :state
      end
    end
  end
end
9 changes: 9 additions & 0 deletions lib/lrama/lexer/token/parser_state_set.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module Lrama
  class Lexer
    class Token
      # Token marking a parser-state "set" inside a rule's rhs.
      # RuleBuilder#process_rhs turns it into a `YYSET_STATE_<NAME>(<value>);`
      # action, where NAME is this token's upcased s_value and value is
      # state.s_value.
      class ParserStateSet < Token
        # The value to set; an object responding to #s_value.
        attr_reader :state
        attr_writer :state
      end
    end
  end
end
4 changes: 2 additions & 2 deletions lib/lrama/lexer/token/user_code.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def scan_reference(scanner)
return Lrama::Grammar::Reference.new(type: :dollar, name: "$", ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1
tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
return Lrama::Grammar::Reference.new(type: :dollar, index: Integer(scanner[2]), ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
return Lrama::Grammar::Reference.new(type: :dollar, number: Integer(scanner[2]), index: Integer(scanner[2]), ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets)
tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
return Lrama::Grammar::Reference.new(type: :dollar, name: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
Expand All @@ -51,7 +51,7 @@ def scan_reference(scanner)
when scanner.scan(/@\$/) # @$
return Lrama::Grammar::Reference.new(type: :at, name: "$", first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@(\d+)/) # @1
return Lrama::Grammar::Reference.new(type: :at, index: Integer(scanner[1]), first_column: start, last_column: scanner.pos - 1)
return Lrama::Grammar::Reference.new(type: :at, number: Integer(scanner[1]), index: Integer(scanner[1]), first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets)
return Lrama::Grammar::Reference.new(type: :at, name: scanner[1], first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets)
Expand Down
Loading