From 5e3df7f4d78ddc1fc01abe866f3c8e283c06315a Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Mon, 28 Oct 2013 03:49:51 +0100 Subject: [PATCH 01/54] experiment with JSON scanner --- lib/coderay/scanners/json.rb | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/lib/coderay/scanners/json.rb b/lib/coderay/scanners/json.rb index b09970c2..0d514cfa 100644 --- a/lib/coderay/scanners/json.rb +++ b/lib/coderay/scanners/json.rb @@ -14,7 +14,7 @@ class JSON < Scanner ESCAPE = / [bfnrt\\"\/] /x # :nodoc: UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: - KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /x + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx protected @@ -37,40 +37,41 @@ def scan_tokens encoder, options when :initial if match = scan(/ \s+ /x) encoder.text_token match, :space - elsif match = scan(/"/) - state = check(/#{KEY}/o) ? :key : :string - encoder.begin_group state + elsif match = scan(/ " (?=#{KEY}) /ox) + state = :key + encoder.begin_group :key + encoder.text_token match, :delimiter + elsif match = scan(/ " /x) + state = :string + encoder.begin_group :string encoder.text_token match, :delimiter elsif match = scan(/ [:,\[{\]}] /x) encoder.text_token match, :operator elsif match = scan(/ true | false | null /x) encoder.text_token match, :value + elsif match = scan(/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x) + encoder.text_token match, :float elsif match = scan(/ -? (?: 0 | [1-9]\d* ) /x) - if scan(/ \.\d+ (?:[eE][-+]?\d+)? | [eE][-+]? 
\d+ /x) - match << matched - encoder.text_token match, :float - else - encoder.text_token match, :integer - end + encoder.text_token match, :integer else encoder.text_token getch, :error end when :string, :key - if match = scan(/[^\\"]+/) + if match = scan(/ [^\\"]+ /x) encoder.text_token match, :content - elsif match = scan(/"/) + elsif match = scan(/ " /x) encoder.text_token match, :delimiter encoder.end_group state state = :initial - elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /ox) encoder.text_token match, :char - elsif match = scan(/\\./m) + elsif match = scan(/ \\. /mx) encoder.text_token match, :content - elsif match = scan(/ \\ | $ /x) + elsif match = scan(/ \\ /x) encoder.end_group state - encoder.text_token match, :error unless match.empty? state = :initial + encoder.text_token match, :error else raise_inspect "else case \" reached; %p not handled." % peek(1), encoder end From 8dc6d8b6ac7d02ef7067a36c082a114c83606c9e Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Mon, 28 Oct 2013 03:59:52 +0100 Subject: [PATCH 02/54] ws --- lib/coderay/scanners/json.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/coderay/scanners/json.rb b/lib/coderay/scanners/json.rb index 0d514cfa..cb61960b 100644 --- a/lib/coderay/scanners/json.rb +++ b/lib/coderay/scanners/json.rb @@ -80,6 +80,7 @@ def scan_tokens encoder, options raise_inspect 'Unknown state: %p' % [state], encoder end + end if options[:keep_state] From 615ac9604cf9f37009fa38e4320552c8735b4386 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 04:32:59 +0100 Subject: [PATCH 03/54] add alternative JSON scanners --- lib/coderay/scanners/json.rb | 34 ++++---- lib/coderay/scanners/json1.rb | 100 ++++++++++++++++++++++++ lib/coderay/scanners/json2.rb | 131 +++++++++++++++++++++++++++++++ lib/coderay/scanners/json3.rb | 143 ++++++++++++++++++++++++++++++++++ lib/coderay/scanners/json4.rb | 143 
++++++++++++++++++++++++++++++++++ 5 files changed, 533 insertions(+), 18 deletions(-) create mode 100644 lib/coderay/scanners/json1.rb create mode 100644 lib/coderay/scanners/json2.rb create mode 100644 lib/coderay/scanners/json3.rb create mode 100644 lib/coderay/scanners/json4.rb diff --git a/lib/coderay/scanners/json.rb b/lib/coderay/scanners/json.rb index cb61960b..b09970c2 100644 --- a/lib/coderay/scanners/json.rb +++ b/lib/coderay/scanners/json.rb @@ -14,7 +14,7 @@ class JSON < Scanner ESCAPE = / [bfnrt\\"\/] /x # :nodoc: UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: - KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /x protected @@ -37,41 +37,40 @@ def scan_tokens encoder, options when :initial if match = scan(/ \s+ /x) encoder.text_token match, :space - elsif match = scan(/ " (?=#{KEY}) /ox) - state = :key - encoder.begin_group :key - encoder.text_token match, :delimiter - elsif match = scan(/ " /x) - state = :string - encoder.begin_group :string + elsif match = scan(/"/) + state = check(/#{KEY}/o) ? :key : :string + encoder.begin_group state encoder.text_token match, :delimiter elsif match = scan(/ [:,\[{\]}] /x) encoder.text_token match, :operator elsif match = scan(/ true | false | null /x) encoder.text_token match, :value - elsif match = scan(/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x) - encoder.text_token match, :float elsif match = scan(/ -? (?: 0 | [1-9]\d* ) /x) - encoder.text_token match, :integer + if scan(/ \.\d+ (?:[eE][-+]?\d+)? | [eE][-+]? 
\d+ /x) + match << matched + encoder.text_token match, :float + else + encoder.text_token match, :integer + end else encoder.text_token getch, :error end when :string, :key - if match = scan(/ [^\\"]+ /x) + if match = scan(/[^\\"]+/) encoder.text_token match, :content - elsif match = scan(/ " /x) + elsif match = scan(/"/) encoder.text_token match, :delimiter encoder.end_group state state = :initial - elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /ox) + elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox) encoder.text_token match, :char - elsif match = scan(/ \\. /mx) + elsif match = scan(/\\./m) encoder.text_token match, :content - elsif match = scan(/ \\ /x) + elsif match = scan(/ \\ | $ /x) encoder.end_group state + encoder.text_token match, :error unless match.empty? state = :initial - encoder.text_token match, :error else raise_inspect "else case \" reached; %p not handled." % peek(1), encoder end @@ -80,7 +79,6 @@ def scan_tokens encoder, options raise_inspect 'Unknown state: %p' % [state], encoder end - end if options[:keep_state] diff --git a/lib/coderay/scanners/json1.rb b/lib/coderay/scanners/json1.rb new file mode 100644 index 00000000..c2f75b95 --- /dev/null +++ b/lib/coderay/scanners/json1.rb @@ -0,0 +1,100 @@ +module CodeRay +module Scanners + + # Scanner for JSON (JavaScript Object Notation). + class JSON1 < Scanner + + register_for :json1 + file_extension 'json1' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + until eos? 
+ + case state + + when :initial + if match = scan(/ \s+ /x) + encoder.text_token match, :space + elsif match = scan(/ " (?=#{KEY}) /ox) + state = :key + encoder.begin_group :key + encoder.text_token match, :delimiter + elsif match = scan(/ " /x) + state = :string + encoder.begin_group :string + encoder.text_token match, :delimiter + elsif match = scan(/ [:,\[{\]}] /x) + encoder.text_token match, :operator + elsif match = scan(/ true | false | null /x) + encoder.text_token match, :value + elsif match = scan(/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x) + encoder.text_token match, :float + elsif match = scan(/ -? (?: 0 | [1-9]\d* ) /x) + encoder.text_token match, :integer + else + encoder.text_token getch, :error + end + + when :string, :key + if match = scan(/ [^\\"]+ /x) + encoder.text_token match, :content + elsif match = scan(/ " /x) + encoder.text_token match, :delimiter + encoder.end_group state + state = :initial + elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /ox) + encoder.text_token match, :char + elsif match = scan(/ \\. /mx) + encoder.text_token match, :content + elsif match = scan(/ \\ /x) + encoder.end_group state + state = :initial + encoder.text_token match, :error + else + raise_inspect "else case \" reached; %p not handled." % peek(1), encoder + end + + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + + end + +end +end diff --git a/lib/coderay/scanners/json2.rb b/lib/coderay/scanners/json2.rb new file mode 100644 index 00000000..14bbe670 --- /dev/null +++ b/lib/coderay/scanners/json2.rb @@ -0,0 +1,131 @@ +module CodeRay +module Scanners + + class RuleBasedScanner2 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@states ||= {} + + @@rules = [] + + instance_eval(&block) + + for name in names + @@states[name] = @@rules + end + + @@rules = nil + end + + def token pattern, *actions + @@rules << [pattern, *actions] + end + + def push_group name + [:begin_group, name] + end + + def pop_group + [:end_group] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON2 < RuleBasedScanner2 + + register_for :json2 + file_extension 'json2' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ " (?=#{KEY}) /x, push_group(:key), :delimiter + token %r/ " /x, push_group(:string), :delimiter + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x, :float + token %r/ -? (?: 0 | [1-9]\d* ) /x, :integer + end + + state :string, :key do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop_group + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, pop_group, :error + + # token %r/$/, end_group + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? 
state + encoder.begin_group state + end + + states = [state] + + until eos? + for pattern, *actions in @@states[state] + if match = scan(pattern) + for action in actions + case action + when Symbol + encoder.text_token match, action + when Array + case action.first + when :begin_group + encoder.begin_group action.last + state = action.last + states << state + when :end_group + encoder.end_group states.pop + state = states.last + end + end + end + + break + end + end && encoder.text_token(getch, :error) + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? state + encoder.end_group state + end + + encoder + end + + end + +end +end diff --git a/lib/coderay/scanners/json3.rb b/lib/coderay/scanners/json3.rb new file mode 100644 index 00000000..a79f5135 --- /dev/null +++ b/lib/coderay/scanners/json3.rb @@ -0,0 +1,143 @@ +module CodeRay +module Scanners + + class RuleBasedScanner3 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code << " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def token pattern, *actions + @@code << " #{'els' unless @@first}if match = scan(#{pattern.inspect})\n" + + for action in actions + case action + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Array + case action.first + when :begin_group + @@code << " p 'begin_group %p' % [#{action.last.inspect}]\n" if $DEBUG + @@code << " state = #{action.last.inspect}\n" + @@code << " states << #{action.last.inspect}\n" + @@code << " encoder.begin_group #{action.last.inspect}\n" + when :end_group + @@code << " p 'end_group %p' % [states.last]\n" if $DEBUG + @@code << " 
encoder.end_group states.pop\n" + @@code << " state = states.last\n" + end + end + end + + @@first = false + end + + def push_group name + [:begin_group, name] + end + + def pop_group + [:end_group] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON3 < RuleBasedScanner3 + + register_for :json3 + file_extension 'json3' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ " (?=#{KEY}) /x, push_group(:key), :delimiter + token %r/ " /x, push_group(:string), :delimiter + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + token %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop_group + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, pop_group, :error + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + RUBY + + # puts scan_tokens_code + class_eval scan_tokens_code + + end + +end +end diff --git a/lib/coderay/scanners/json4.rb b/lib/coderay/scanners/json4.rb new file mode 100644 index 00000000..31602189 --- /dev/null +++ b/lib/coderay/scanners/json4.rb @@ -0,0 +1,143 @@ +module CodeRay +module Scanners + + class RuleBasedScanner4 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code << " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def token pattern, *actions + @@code << " #{'els' unless @@first}if match = scan(#{pattern.inspect})\n" + + for action in actions + case action + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Array + case action.first + when :push + @@code << " p 'push %p' % [#{action.last.inspect}]\n" if $DEBUG + @@code << " state = #{action.last.inspect}\n" + @@code << " states << state\n" + @@code << " encoder.begin_group state\n" + when :pop + @@code << " p 'pop %p' % [states.last]\n" if $DEBUG + @@code << " encoder.end_group states.pop\n" + @@code << " state = states.last\n" + end + end + end + + @@first = false + end + + def push state + [:push, state] + end + + def pop + [:pop] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON4 < RuleBasedScanner4 + + register_for :json4 + file_extension 'json4' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. 
)* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ " (?=#{KEY}) /x, push(:key), :delimiter + token %r/ " /x, push(:string), :delimiter + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + token %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, :error, pop + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + RUBY + + # puts scan_tokens_code + class_eval scan_tokens_code + + end + +end +end From 300ccd3a622e1ec16802de11070f5c6a3c733248 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 12:25:11 +0100 Subject: [PATCH 04/54] no need to modify file_extension --- lib/coderay/scanners/json1.rb | 2 +- lib/coderay/scanners/json2.rb | 2 +- lib/coderay/scanners/json3.rb | 2 +- lib/coderay/scanners/json4.rb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/coderay/scanners/json1.rb b/lib/coderay/scanners/json1.rb index c2f75b95..d44f6baa 100644 --- a/lib/coderay/scanners/json1.rb +++ b/lib/coderay/scanners/json1.rb @@ -5,7 +5,7 @@ module Scanners class JSON1 < Scanner register_for :json1 - file_extension 'json1' + file_extension 'json' KINDS_NOT_LOC = [ :float, :char, :content, :delimiter, diff --git a/lib/coderay/scanners/json2.rb b/lib/coderay/scanners/json2.rb index 14bbe670..6d7adc82 100644 --- a/lib/coderay/scanners/json2.rb +++ b/lib/coderay/scanners/json2.rb @@ -37,7 +37,7 @@ def pop_group class JSON2 < RuleBasedScanner2 register_for :json2 - file_extension 'json2' + file_extension 'json' KINDS_NOT_LOC = [ :float, :char, :content, :delimiter, diff --git a/lib/coderay/scanners/json3.rb b/lib/coderay/scanners/json3.rb index a79f5135..cf0c1f02 100644 --- a/lib/coderay/scanners/json3.rb +++ b/lib/coderay/scanners/json3.rb @@ -59,7 +59,7 @@ def pop_group class JSON3 < RuleBasedScanner3 register_for :json3 - file_extension 'json3' + file_extension 'json' KINDS_NOT_LOC = [ :float, :char, :content, :delimiter, diff --git a/lib/coderay/scanners/json4.rb b/lib/coderay/scanners/json4.rb index 31602189..5cb3afbd 100644 --- a/lib/coderay/scanners/json4.rb +++ b/lib/coderay/scanners/json4.rb @@ -59,7 +59,7 @@ def pop class JSON4 < RuleBasedScanner4 register_for :json4 - file_extension 'json4' + file_extension 'json' KINDS_NOT_LOC = [ :float, :char, :content, :delimiter, From 
f4f0db4715da2fb5d1454ffc843d7acc67c78923 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 12:26:31 +0100 Subject: [PATCH 05/54] add variant tasks like rake test:scanner:json:2 --- rake_tasks/test.rake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake index b15b9993..9653d3e2 100644 --- a/rake_tasks/test.rake +++ b/rake_tasks/test.rake @@ -48,6 +48,11 @@ Please rename or remove it and run again to use the GitHub repository: task lang => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}" end + (1..4).each do |i| + task "#{lang}:#{i}" => :update_scanner_suite do + ruby "./test/scanners/suite.rb #{lang}:#{i}" + end + end end end From f1ea4287f42b6c70ac037d7ce7be1d6737b50094 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 12:26:49 +0100 Subject: [PATCH 06/54] this seems obsolete --- lib/coderay/scanners/java_script.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/coderay/scanners/java_script.rb b/lib/coderay/scanners/java_script.rb index 9eb0a0a1..5e278137 100644 --- a/lib/coderay/scanners/java_script.rb +++ b/lib/coderay/scanners/java_script.rb @@ -100,7 +100,6 @@ def scan_tokens encoder, options # TODO: scan over nested tags xml_scanner.tokenize match, :tokens => encoder value_expected = false - next elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) value_expected = true From b01a3fbb6a00c93b06db4f35ea8ecc6760daed9e Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 12:27:24 +0100 Subject: [PATCH 07/54] add SKIP_UPDATE_SCANNER_SUITE switch --- rake_tasks/test.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake index 9653d3e2..2b25fbf6 100644 --- a/rake_tasks/test.rake +++ b/rake_tasks/test.rake @@ -37,7 +37,7 @@ Please rename or remove it and run again to use the GitHub repository: else puts 'Downloading scanner test suite...' 
sh 'git clone https://github.com/rubychan/coderay-scanner-tests.git test/scanners/' - end + end unless ENV['SKIP_UPDATE_SCANNER_SUITE'] end namespace :scanner do From 2499b1e5e94a35bb1a0754a5c3f774e2751b2f4b Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 21 Mar 2015 12:28:04 +0100 Subject: [PATCH 08/54] first version of RuleBasedScanner for JavaScript --- lib/coderay/scanners/_map.rb | 1 + lib/coderay/scanners/java_script4.rb | 368 +++++++++++++++++++++++++++ 2 files changed, 369 insertions(+) create mode 100644 lib/coderay/scanners/java_script4.rb diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index a240298d..441ccc65 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -10,6 +10,7 @@ module Scanners :eruby => :erb, :irb => :ruby, :javascript => :java_script, + :javascript4 => :java_script4, :js => :java_script, :pascal => :delphi, :patch => :diff, diff --git a/lib/coderay/scanners/java_script4.rb b/lib/coderay/scanners/java_script4.rb new file mode 100644 index 00000000..10aa709f --- /dev/null +++ b/lib/coderay/scanners/java_script4.rb @@ -0,0 +1,368 @@ +module CodeRay +module Scanners + + class RuleBasedScanner5 < Scanner + + CheckIf = Struct.new :callback + + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code << " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def token *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !item.is_a?(CheckIf) } + preconditions = pattern_and_actions[0..index - 1] if index > 0 + pattern = pattern_and_actions[index] or raise 'I need a pattern!' + actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' 
+ end + + precondition_expression = '' + if preconditions + for precondition in preconditions + case precondition + when CheckIf + callback = make_callback(precondition.callback) + case precondition.callback.arity + when 0 + arguments = '' + when 1 + arguments = '(state)' + else + raise "I got %p arguments for precondition: %p, but I only know how to evaluate 0..1" % [precondition.callback.arity, callback] + end + precondition_expression << "#{callback}#{arguments} && " + else + raise "I don't know how to evaluate this precondition: %p" % [precondition] + end + end + end + + case pattern + when Regexp + pattern_expression = pattern.inspect + when Proc + pattern_expression = make_callback(pattern).to_s + else + raise "I don't know how to evaluate this pattern: %p" % [pattern] + end + + @@code << " #{'els' unless @@first}if #{precondition_expression}match = scan(#{pattern_expression})\n" + + for action in actions + case action + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Array + case action.first + when :push + case action.last + when Symbol + @@code << " p 'push %p' % [#{action.last.inspect}]\n" if $DEBUG + @@code << " state = #{action.last.inspect}\n" + when Proc + callback = make_callback(action.last) + case action.last.arity + when 0 + arguments = '' + when 1 + arguments = '(match)' + else + raise "I got %p arguments for push: %p, but I only know how to evaluate 0..1" % [action.last.arity, callback] + end + @@code << " p 'push %p' % [#{callback}]\n" if $DEBUG + @@code << " state = #{callback}#{arguments}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.last] + end + @@code << " states << state\n" + @@code << " encoder.begin_group state\n" + when :pop + @@code << " p 'pop %p' % [states.last]\n" if $DEBUG + @@code << " encoder.end_group states.pop\n" + @@code << " state = states.last\n" + end + when Proc + callback = 
make_callback(action) + case action.arity + when 0 + arguments = '' + when 1 + arguments = '(match)' + when 2 + arguments = '(match, encoder)' + else + raise "I got %p arguments for action: %p, but I only know how to evaluate 0..2" % [action.arity, callback] + end + @@code << " p 'calling %p'\n" % [callback] if $DEBUG + @@code << " #{callback}#{arguments}\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @@first = false + end + + def push state = nil, &block + raise 'push requires a state or a block; got nothing' unless state || block + [:push, state || block] + end + + def pop + [:pop] + end + + def check_if &callback + CheckIf.new callback + end + + protected + + def make_callback block + @callbacks ||= {} + + base_name = "__callback_line_#{block.source_location.last}" + name = base_name + counter = 'a' + while @callbacks.key?(name) + name = "#{base_name}_#{counter}" + counter.succ! + end + + @callbacks[name] = define_method(name, &block) + end + end + end + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript4 < RuleBasedScanner5 + + register_for :java_script4 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. 
+ RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). + add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + state :initial do + token %r/ \s+ | \\\n /x, :space, -> (match) do + @value_expected = true if !@value_expected && match.index(?\n) + end + + token %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx, :comment, -> (match) do + @value_expected = true + # state = :open_multi_line_comment if self[1] + end + + # elsif check(/\.?\d/) + token %r/0[xX][0-9A-Fa-f]+/, :hex, -> { @key_expected = @value_expected = false } + token %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, -> { @key_expected = @value_expected = false } + token %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, -> { @key_expected = @value_expected = false } + token %r/\d+/, :integer, -> { @key_expected = @value_expected = false } + + token check_if { @value_expected }, %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + @value_expected = false + end + + token %r/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x, :operator, -> (match) do + @value_expected = true + last_operator = match[-1] + @key_expected = (last_operator == ?{) || (last_operator == ?,) + @function_expected = false + end + + token %r/ [)\]}]+ /x, :operator, -> { @function_expected = @key_expected = @value_expected = false } + + token %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, -> (match, encoder) do + kind = IDENT_KIND[match] + @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif @function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif @key_expected && check(/\s*:/) + kind = :key + end + end + @function_expected = (kind == :keyword) && (match == 'function') + @key_expected = false + encoder.text_token match, kind + end + + token %r/["']/, push { |match| + @key_expected && check(KEY_CHECK_PATTERN[match]) ? 
:key : :string + }, :delimiter, -> (match) { @string_delimiter = match } + + token check_if { @value_expected }, %r/\//, push(:regexp), :delimiter, -> { @string_delimiter = '/' } + + token %r/ \/ /x, :operator, -> { @value_expected = true; @key_expected = false } + end + + state :string, :regexp, :key do + token -> { STRING_CONTENT_PATTERN[@string_delimiter] }, :content + + token %r/\//, :delimiter, -> (match, encoder) do + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end, -> do + @string_delimiter = nil + @key_expected = @value_expected = false + end, pop + + token %r/["']/, :delimiter, -> do + @string_delimiter = nil + @key_expected = @value_expected = false + end, pop + + token check_if { |state| state != :regexp }, %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox, -> (match, encoder) do + if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + end + + token check_if { |state| state == :regexp }, %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox, :char + token %r/\\./m, :content + token %r/ \\ /x, pop, :error, -> (match, encoder) do + @string_delimiter = nil + @key_expected = @value_expected = false + end + end + + state :open_multi_line_comment do + token %r! .*? \*/ !mx, :initial # don't consume! + token %r/ .+ /mx, :comment, -> { @value_expected = true } + + # if match = scan(%r! .*? \*/ !mx) + # state = :initial + # else + # match = scan(%r! 
.+ !mx) + # end + # value_expected = true + # encoder.text_token match, :comment if match + end + + protected + + def setup + @state = :initial + end + + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options#{ def_line = __LINE__; nil } + state, @string_delimiter = options[:state] || @state + if @string_delimiter + encoder.begin_group state + end + + @value_expected = true + @key_expected = false + @function_expected = false + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state, string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + RUBY + + # puts scan_tokens_code + class_eval scan_tokens_code, __FILE__, def_line + + protected + + def reset_instance + super + @xml_scanner.reset if defined? @xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end From 647e9c06e7316659e1348d5ef66ad825b08fd464 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 22 Mar 2015 14:17:59 +0100 Subject: [PATCH 09/54] more work on DSL scanner for JavaScript --- lib/coderay/scanners/_map.rb | 3 + lib/coderay/scanners/java_script1.rb | 238 ++++++++++++++++++++++++++ lib/coderay/scanners/java_script2.rb | 240 +++++++++++++++++++++++++++ lib/coderay/scanners/java_script3.rb | 239 ++++++++++++++++++++++++++ lib/coderay/scanners/java_script4.rb | 152 ++++++++++------- 5 files changed, 812 insertions(+), 60 deletions(-) create mode 100644 lib/coderay/scanners/java_script1.rb create mode 100644 lib/coderay/scanners/java_script2.rb create mode 100644 lib/coderay/scanners/java_script3.rb diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index 441ccc65..8fc505aa 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb 
@@ -10,6 +10,9 @@ module Scanners :eruby => :erb, :irb => :ruby, :javascript => :java_script, + :javascript1 => :java_script1, + :javascript2 => :java_script2, + :javascript3 => :java_script3, :javascript4 => :java_script4, :js => :java_script, :pascal => :delphi, diff --git a/lib/coderay/scanners/java_script1.rb b/lib/coderay/scanners/java_script1.rb new file mode 100644 index 00000000..4fe59bad --- /dev/null +++ b/lib/coderay/scanners/java_script1.rb @@ -0,0 +1,238 @@ +# like java_script.rb +# - but uses instance instead of local variables for flags +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript1 < Scanner + + register_for :java_script1 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). 
+ add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + protected + + def setup + @state = :initial + end + + def scan_tokens encoder, options + + state, @string_delimiter = options[:state] || @state + if @string_delimiter + encoder.begin_group state + end + + @value_expected = true + @key_expected = false + @function_expected = false + + until eos? + + case state + + when :initial + + if match = scan(/ \s+ | \\\n /x) + @value_expected = true if !@value_expected && match.index(?\n) + encoder.text_token match, :space + + elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx) + @value_expected = true + encoder.text_token match, :comment + state = :open_multi_line_comment if self[1] + + elsif check(/\.?\d/) + @key_expected = @value_expected = false + if match = scan(/0[xX][0-9A-Fa-f]+/) + encoder.text_token match, :hex + elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/) + encoder.text_token match, :octal + elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + encoder.text_token match, :float + elsif match = scan(/\d+/) + encoder.text_token match, :integer + end + + elsif @value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim) + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + @value_expected = false + + elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) + @value_expected = true + last_operator = match[-1] + @key_expected = (last_operator == ?{) || (last_operator == ?,) + @function_expected = false + encoder.text_token match, :operator + + elsif match = scan(/ [)\]}]+ /x) + @function_expected = @key_expected = @value_expected = false + encoder.text_token match, :operator + + elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x) + kind = IDENT_KIND[match] + @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif @function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif @key_expected && check(/\s*:/) + kind = :key + end + end + @function_expected = (kind == :keyword) && (match == 'function') + @key_expected = false + encoder.text_token match, kind + + elsif match = scan(/["']/) + if @key_expected && check(KEY_CHECK_PATTERN[match]) + state = :key + else + state = :string + end + encoder.begin_group state + @string_delimiter = match + encoder.text_token match, :delimiter + + elsif @value_expected && (match = scan(/\//)) + encoder.begin_group :regexp + state 
= :regexp + @string_delimiter = '/' + encoder.text_token match, :delimiter + + elsif match = scan(/ \/ /x) + @value_expected = true + @key_expected = false + encoder.text_token match, :operator + + else + encoder.text_token getch, :error + + end + + when :string, :regexp, :key + if match = scan(STRING_CONTENT_PATTERN[@string_delimiter]) + encoder.text_token match, :content + elsif match = scan(/["'\/]/) + encoder.text_token match, :delimiter + if state == :regexp + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + encoder.end_group state + @string_delimiter = nil + @key_expected = @value_expected = false + state = :initial + elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)) + if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + encoder.text_token match, :char + elsif match = scan(/\\./m) + encoder.text_token match, :content + elsif match = scan(/ \\ | $ /x) + encoder.end_group state + encoder.text_token match, :error unless match.empty? + @string_delimiter = nil + @key_expected = @value_expected = false + state = :initial + else + raise_inspect "else case #{@string_delimiter} reached; %p not handled." % peek(1), encoder + end + + when :open_multi_line_comment + if match = scan(%r! .*? \*/ !mx) + state = :initial + else + match = scan(%r! .+ !mx) + end + @value_expected = true + encoder.text_token match, :comment if match + + else + #:nocov: + raise_inspect 'Unknown state: %p' % [state], encoder + #:nocov: + + end + + end + + if options[:keep_state] + @state = state, @string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + + protected + + def reset_instance + super + @xml_scanner.reset if defined? 
@xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/scanners/java_script2.rb b/lib/coderay/scanners/java_script2.rb new file mode 100644 index 00000000..42fa6409 --- /dev/null +++ b/lib/coderay/scanners/java_script2.rb @@ -0,0 +1,240 @@ +# like java_script.rb +# - but uses instance instead of local variables for flags +# - but uses the same rule logic as java_script4.rb +# - also uses states array push/pop +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript2 < Scanner + + register_for :java_script2 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). 
+ add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + protected + + def setup + @state = :initial + end + + def scan_tokens encoder, options + + state, @string_delimiter = options[:state] || @state + if @string_delimiter + encoder.begin_group state + end + + @value_expected = true + @key_expected = false + @function_expected = false + + states = [state] + + until eos? + + case state + + when :initial + + if match = scan(/ \s+ | \\\n /x) + encoder.text_token match, :space + @value_expected = true if !@value_expected && match.index(?\n) + + elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx) + encoder.text_token match, :comment + @value_expected = true + # state = :open_multi_line_comment if self[1] + + elsif check(/\.?\d/) + if match = scan(/0[xX][0-9A-Fa-f]+/) + encoder.text_token match, :hex + elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/) + encoder.text_token match, :octal + elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + encoder.text_token match, :float + elsif match = scan(/\d+/) + encoder.text_token match, :integer + end + @key_expected = @value_expected = false + + elsif @value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim) + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + @value_expected = false + + elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) + encoder.text_token match, :operator + @value_expected = true + @key_expected = /[{,]$/ === match + @function_expected = false + + elsif match = scan(/ [)\]}]+ /x) + encoder.text_token match, :operator + @function_expected = @key_expected = @value_expected = false + + elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x) + kind = IDENT_KIND[match] + @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif @function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif @key_expected && check(/\s*:/) + kind = :key + end + end + encoder.text_token match, kind + @function_expected = (kind == :keyword) && (match == 'function') + @key_expected = false + + elsif match = scan(/["']/) + state = (@key_expected && check(KEY_CHECK_PATTERN[match])) ? 
:key : :string + states << state + encoder.begin_group state + @string_delimiter = match + encoder.text_token match, :delimiter + + elsif @value_expected && (match = scan(/\//)) + state = :regexp + states << state + encoder.begin_group state + @string_delimiter = '/' + encoder.text_token match, :delimiter + + elsif match = scan(/ \/ /x) + @value_expected = true + @key_expected = false + encoder.text_token match, :operator + + else + encoder.text_token getch, :error + + end + + when :string, :regexp, :key + if match = scan(STRING_CONTENT_PATTERN[@string_delimiter]) + encoder.text_token match, :content + elsif match = scan(/["'\/]/) + encoder.text_token match, :delimiter + if match == '/' + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + @string_delimiter = nil + @key_expected = @value_expected = false + encoder.end_group states.pop + state = states.last + elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)) + if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + encoder.text_token match, :char + elsif match = scan(/\\./m) + encoder.text_token match, :content + elsif match = scan(/ \\ | $ /x) + encoder.end_group states.pop + state = states.last + encoder.text_token match, :error unless match.empty? + @string_delimiter = nil + @key_expected = @value_expected = false + else + raise_inspect "else case #{@string_delimiter} reached; %p not handled." % peek(1), encoder + end + + # when :open_multi_line_comment + # if match = scan(%r! .*? \*/ !mx) + # states.pop + # state = states.last + # else + # match = scan(%r! 
.+ !mx) + # end + # @value_expected = true + # encoder.text_token match, :comment if match + + else + #:nocov: + raise_inspect 'Unknown state: %p' % [state], encoder + #:nocov: + + end + + end + + if options[:keep_state] + @state = state, @string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + + protected + + def reset_instance + super + @xml_scanner.reset if defined? @xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/scanners/java_script3.rb b/lib/coderay/scanners/java_script3.rb new file mode 100644 index 00000000..9492967c --- /dev/null +++ b/lib/coderay/scanners/java_script3.rb @@ -0,0 +1,239 @@ +# like java_script.rb +# - but uses the same rule logic as java_script4.rb +# - also uses states array push/pop +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript3 < Scanner + + register_for :java_script3 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). 
+ add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). + add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + protected + + def setup + @state = :initial + end + + def scan_tokens encoder, options + + state, string_delimiter = options[:state] || @state + if string_delimiter + encoder.begin_group state + end + + value_expected = true + key_expected = false + function_expected = false + + states = [state] + + until eos? + + case state + + when :initial + + if match = scan(/ \s+ | \\\n /x) + encoder.text_token match, :space + value_expected = true if !value_expected && match.index(?\n) + + elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx) + encoder.text_token match, :comment + value_expected = true + # state = :open_multi_line_comment if self[1] + + elsif check(/\.?\d/) + if match = scan(/0[xX][0-9A-Fa-f]+/) + encoder.text_token match, :hex + elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/) + encoder.text_token match, :octal + elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + encoder.text_token match, :float + elsif match = scan(/\d+/) + encoder.text_token match, :integer + end + key_expected = value_expected = false + + elsif value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim) + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + value_expected = false + + elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) + encoder.text_token match, :operator + value_expected = true + key_expected = /[{,]$/ === match + function_expected = false + + elsif match = scan(/ [)\]}]+ /x) + encoder.text_token match, :operator + function_expected = key_expected = value_expected = false + + elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x) + kind = IDENT_KIND[match] + value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif key_expected && check(/\s*:/) + kind = :key + end + end + encoder.text_token match, kind + function_expected = (kind == :keyword) && (match == 'function') + key_expected = false + + elsif match = scan(/["']/) + state = (key_expected && check(KEY_CHECK_PATTERN[match])) ? 
:key : :string + states << state + encoder.begin_group state + string_delimiter = match + encoder.text_token match, :delimiter + + elsif value_expected && (match = scan(/\//)) + state = :regexp + states << state + encoder.begin_group state + string_delimiter = '/' + encoder.text_token match, :delimiter + + elsif match = scan(/ \/ /x) + value_expected = true + key_expected = false + encoder.text_token match, :operator + + else + encoder.text_token getch, :error + + end + + when :string, :regexp, :key + if match = scan(STRING_CONTENT_PATTERN[string_delimiter]) + encoder.text_token match, :content + elsif match = scan(/["'\/]/) + encoder.text_token match, :delimiter + if match == '/' + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + string_delimiter = nil + key_expected = value_expected = false + encoder.end_group states.pop + state = states.last + elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)) + if string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + encoder.text_token match, :char + elsif match = scan(/\\./m) + encoder.text_token match, :content + elsif match = scan(/ \\ | $ /x) + encoder.end_group states.pop + state = states.last + encoder.text_token match, :error unless match.empty? + string_delimiter = nil + key_expected = value_expected = false + else + raise_inspect "else case #{string_delimiter} reached; %p not handled." % peek(1), encoder + end + + # when :open_multi_line_comment + # if match = scan(%r! .*? \*/ !mx) + # states.pop + # state = states.last + # else + # match = scan(%r! 
.+ !mx) + # end + # value_expected = true + # encoder.text_token match, :comment if match + + else + #:nocov: + raise_inspect 'Unknown state: %p' % [state], encoder + #:nocov: + + end + + end + + if options[:keep_state] + @state = state, string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + + protected + + def reset_instance + super + @xml_scanner.reset if defined? @xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/scanners/java_script4.rb b/lib/coderay/scanners/java_script4.rb index 10aa709f..4b9601f3 100644 --- a/lib/coderay/scanners/java_script4.rb +++ b/lib/coderay/scanners/java_script4.rb @@ -1,9 +1,10 @@ +# TODO: string_delimiter should be part of the state: push(:regexp, '/'), check_if -> (state, delimiter) { … } module CodeRay module Scanners class RuleBasedScanner5 < Scanner - CheckIf = Struct.new :callback + CheckIf = Struct.new :condition class << self attr_accessor :states @@ -22,7 +23,18 @@ def state *names, &block @@code << " \n" end - def token *pattern_and_actions + def on? pattern + pattern_expression = pattern.inspect + @@code << " #{'els' unless @@first}if check(#{pattern_expression})\n" + + @@first = true + yield + @@code << " end\n" + + @@first = false + end + + def on *pattern_and_actions if index = pattern_and_actions.find_index { |item| !item.is_a?(CheckIf) } preconditions = pattern_and_actions[0..index - 1] if index > 0 pattern = pattern_and_actions[index] or raise 'I need a pattern!' 
@@ -34,16 +46,23 @@ def token *pattern_and_actions for precondition in preconditions case precondition when CheckIf - callback = make_callback(precondition.callback) - case precondition.callback.arity - when 0 - arguments = '' - when 1 - arguments = '(state)' + case precondition.condition + when Proc + callback = make_callback(precondition.condition) + case precondition.condition.arity + when 0 + arguments = '' + when 1 + arguments = '(state)' + else + raise "I got %p arguments for precondition: %p, but I only know how to evaluate 0..1" % [precondition.condition.arity, callback] + end + precondition_expression << "#{callback}#{arguments} && " + when Symbol + precondition_expression << "#{precondition.condition} && " else - raise "I got %p arguments for precondition: %p, but I only know how to evaluate 0..1" % [precondition.callback.arity, callback] + raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] end - precondition_expression << "#{callback}#{arguments} && " else raise "I don't know how to evaluate this precondition: %p" % [precondition] end @@ -51,6 +70,8 @@ def token *pattern_and_actions end case pattern + # when String + # pattern_expression = pattern when Regexp pattern_expression = pattern.inspect when Proc @@ -127,8 +148,8 @@ def pop [:pop] end - def check_if &callback - CheckIf.new callback + def check_if value = nil, &callback + CheckIf.new value || callback end protected @@ -201,37 +222,35 @@ class JavaScript4 < RuleBasedScanner5 } # :nodoc: state :initial do - token %r/ \s+ | \\\n /x, :space, -> (match) do - @value_expected = true if !@value_expected && match.index(?\n) - end + # on %r/ [ \t]* \n \s* /x, :space, -> { @value_expected = true } + # on %r/ [ \t]+ | \\\n /x, :space + on %r/ \s+ | \\\n /x, :space, -> (match) { @value_expected = true if !@value_expected && match.index(?\n) } - token %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx, :comment, -> (match) do - @value_expected = true + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, -> { @value_expected = true } # state = :open_multi_line_comment if self[1] - end - # elsif check(/\.?\d/) - token %r/0[xX][0-9A-Fa-f]+/, :hex, -> { @key_expected = @value_expected = false } - token %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, -> { @key_expected = @value_expected = false } - token %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, -> { @key_expected = @value_expected = false } - token %r/\d+/, :integer, -> { @key_expected = @value_expected = false } + on? %r/\.?\d/ do + on %r/0[xX][0-9A-Fa-f]+/, :hex, -> { @key_expected = @value_expected = false } + on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, -> { @key_expected = @value_expected = false } + on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, -> { @key_expected = @value_expected = false } + on %r/\d+/, :integer, -> { @key_expected = @value_expected = false } + end - token check_if { @value_expected }, %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do + on check_if(:@value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do # TODO: scan over nested tags xml_scanner.tokenize match, :tokens => encoder @value_expected = false end - token %r/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x, :operator, -> (match) do + on %r/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x, :operator, -> (match) do @value_expected = true - last_operator = match[-1] - @key_expected = (last_operator == ?{) || (last_operator == ?,) + @key_expected = /[{,]$/ === match @function_expected = false end - token %r/ [)\]}]+ /x, :operator, -> { @function_expected = @key_expected = @value_expected = false } + on %r/ [)\]}]+ /x, :operator, -> { @function_expected = @key_expected = @value_expected = false } - token %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, -> (match, encoder) do + on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, -> 
(match, encoder) do kind = IDENT_KIND[match] @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] # TODO: labels @@ -246,37 +265,47 @@ class JavaScript4 < RuleBasedScanner5 kind = :key end end + encoder.text_token match, kind @function_expected = (kind == :keyword) && (match == 'function') @key_expected = false - encoder.text_token match, kind end - token %r/["']/, push { |match| + on %r/["']/, push { |match| + @string_delimiter = match @key_expected && check(KEY_CHECK_PATTERN[match]) ? :key : :string - }, :delimiter, -> (match) { @string_delimiter = match } + }, :delimiter - token check_if { @value_expected }, %r/\//, push(:regexp), :delimiter, -> { @string_delimiter = '/' } + on check_if(:@value_expected), %r/\//, push(:regexp), :delimiter, -> { @string_delimiter = '/' } - token %r/ \/ /x, :operator, -> { @value_expected = true; @key_expected = false } + on %r/ \/ /x, :operator, -> { @value_expected = true; @key_expected = false } end state :string, :regexp, :key do - token -> { STRING_CONTENT_PATTERN[@string_delimiter] }, :content + on -> { STRING_CONTENT_PATTERN[@string_delimiter] }, :content + # on 'STRING_CONTENT_PATTERN[@string_delimiter]', :content - token %r/\//, :delimiter, -> (match, encoder) do - modifiers = scan(/[gim]+/) - encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? - end, -> do - @string_delimiter = nil - @key_expected = @value_expected = false - end, pop + # on %r/\//, :delimiter, -> (match, encoder) do + # modifiers = scan(/[gim]+/) + # encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? 
+ # @string_delimiter = nil + # @key_expected = @value_expected = false + # end, pop + # + # on %r/["']/, :delimiter, -> do + # @string_delimiter = nil + # @key_expected = @value_expected = false + # end, pop - token %r/["']/, :delimiter, -> do + on %r/["'\/]/, :delimiter, -> (match, encoder) do + if match == '/' + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end @string_delimiter = nil @key_expected = @value_expected = false end, pop - token check_if { |state| state != :regexp }, %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox, -> (match, encoder) do + on check_if { |state| state != :regexp }, %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox, -> (match, encoder) do if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") encoder.text_token match, :content else @@ -284,26 +313,26 @@ class JavaScript4 < RuleBasedScanner5 end end - token check_if { |state| state == :regexp }, %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox, :char - token %r/\\./m, :content - token %r/ \\ /x, pop, :error, -> (match, encoder) do + on check_if { |state| state == :regexp }, %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox, :char + on %r/\\./m, :content + on %r/ \\ /x, pop, :error, -> do @string_delimiter = nil @key_expected = @value_expected = false end end - state :open_multi_line_comment do - token %r! .*? \*/ !mx, :initial # don't consume! - token %r/ .+ /mx, :comment, -> { @value_expected = true } - - # if match = scan(%r! .*? \*/ !mx) - # state = :initial - # else - # match = scan(%r! .+ !mx) - # end - # value_expected = true - # encoder.text_token match, :comment if match - end + # state :open_multi_line_comment do + # on %r! .*? \*/ !mx, :initial # don't consume! + # on %r/ .+ /mx, :comment, -> { @value_expected = true } + # + # # if match = scan(%r! .*? \*/ !mx) + # # state = :initial + # # else + # # match = scan(%r! 
.+ !mx) + # # end + # # value_expected = true + # # encoder.text_token match, :comment if match + # end protected @@ -348,7 +377,10 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } end RUBY - # puts scan_tokens_code + if ENV['PUTS'] + puts scan_tokens_code + puts "callbacks: #{@callbacks.size}" + end class_eval scan_tokens_code, __FILE__, def_line protected From 63c9f26af69bad4f0e5407c02bde51a64d44ac60 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Wed, 25 Mar 2015 00:06:45 +0100 Subject: [PATCH 10/54] finally, a version that is fast without eval! --- lib/coderay/scanners/_map.rb | 1 + lib/coderay/scanners/java_script5.rb | 399 +++++++++++++++++++++++++++ rake_tasks/test.rake | 2 +- 3 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 lib/coderay/scanners/java_script5.rb diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index 8fc505aa..61079d53 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -14,6 +14,7 @@ module Scanners :javascript2 => :java_script2, :javascript3 => :java_script3, :javascript4 => :java_script4, + :javascript5 => :java_script5, :js => :java_script, :pascal => :delphi, :patch => :diff, diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb new file mode 100644 index 00000000..491c6c20 --- /dev/null +++ b/lib/coderay/scanners/java_script5.rb @@ -0,0 +1,399 @@ +# TODO: string_delimiter should be part of the state: push(:regexp, '/'), check_if -> (state, delimiter) { … } +module CodeRay +module Scanners + + class RuleBasedScanner6 < Scanner + + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state + Pop = Class.new + CheckIf = Struct.new :condition + ValueSetter = Struct.new :targets, :value + + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code 
<< " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def on? pattern + pattern_expression = pattern.inspect + @@code << " #{'els' unless @@first}if check(#{pattern_expression})\n" + + @@first = true + yield + @@code << " end\n" + + @@first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !item.is_a?(CheckIf) } + preconditions = pattern_and_actions[0..index - 1] if index > 0 + pattern = pattern_and_actions[index] or raise 'I need a pattern!' + actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' + end + + precondition_expression = '' + if preconditions + for precondition in preconditions + case precondition + when CheckIf + case precondition.condition + when Proc + precondition_expression << "#{make_callback(precondition.condition)} && " + when Symbol + precondition_expression << "#{precondition.condition} && " + else + raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] + end + else + raise "I don't know how to evaluate this precondition: %p" % [precondition] + end + end + end + + case pattern + when String + raise + pattern_expression = pattern + when Regexp + pattern_expression = pattern.inspect + when Proc + pattern_expression = make_callback(pattern) + else + raise "I don't know how to evaluate this pattern: %p" % [pattern] + end + + @@code << " #{'els' unless @@first}if #{precondition_expression}match = scan(#{pattern_expression})\n" + + for action in actions + case action + when String + raise + @@code << " p 'evaluate #{action.inspect}'\n" if $DEBUG + @@code << " #{action}\n" + + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @@code << " encoder.text_token match, 
#{make_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @@code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index do |kind, i| + @@code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + end + + when Push + case action.state + when String + raise + @@code << " p 'push %p' % [#{action.state}]\n" if $DEBUG + @@code << " state = #{action.state}\n" + when Symbol + @@code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @@code << " state = #{action.state.inspect}\n" + when Proc + @@code << " state = #{make_callback(action.state)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + @@code << " states << state\n" + @@code << " encoder.begin_group state\n" + when Pop + @@code << " p 'pop %p' % [states.last]\n" if $DEBUG + @@code << " encoder.end_group states.pop\n" + @@code << " state = states.last\n" + + when ValueSetter + case action.value + when Proc + @@code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + else + @@code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Proc + @@code << " #{make_callback(action)}\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @@first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block + end + + def pop + Pop.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def flag_on *flags + ValueSetter.new Array(flags), true + end + + def flag_off *flags + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + 
ValueSetter.new [flag], value || callback + end + + def unset *flags + ValueSetter.new Array(flags), nil + end + + protected + + def make_callback block + @callbacks ||= {} + + base_name = "__callback_line_#{block.source_location.last}" + name = base_name + counter = 'a' + while @callbacks.key?(name) + name = "#{base_name}_#{counter}" + counter.succ! + end + + @callbacks[name] = define_method(name, &block) + + arguments = block.parameters.map(&:last) + + if arguments.empty? + name + else + "#{name}(#{arguments.join(', ')})" + end + end + end + end + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript5 < RuleBasedScanner6 + + register_for :java_script5 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). 
+ add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + state :initial do + on %r/ \s+ | \\\n /x, :space, set(:value_expected) { |match, value_expected| value_expected || match.index(?\n) } + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, flag_off(:value_expected) + # state = :open_multi_line_comment if self[1] + + on? %r/\.?\d/ do + on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:key_expected, :value_expected) + on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:key_expected, :value_expected) + on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:key_expected, :value_expected) + on %r/\d+/, :integer, flag_off(:key_expected, :value_expected) + end + + on check_if(:value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + end, flag_off(:value_expected) + + on %r/ [-+*=<>?:;,!&^|(\[{~%]++ (??:;,!&^|(\[{~%]*+ (?<=[{,]) /x, :operator, flag_on(:value_expected, :key_expected), flag_off(:function_expected) + on %r/ [)\]}]+ /x, :operator, flag_off(:function_expected, :key_expected, :value_expected) + + on %r/ function (?![A-Za-z_0-9$]) /x, :keyword, flag_on(:function_expected), flag_off(:key_expected, :value_expected) + on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, kind { |match, function_expected, key_expected| + kind = IDENT_KIND[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif function_expected + kind = 
:function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif key_expected && check(/\s*:/) + kind = :key + end + end + + kind + }, flag_off(:function_expected, :key_expected), set(:value_expected) { |match| KEYWORDS_EXPECTING_VALUE[match] } + + on %r/["']/, push { |match, key_expected| key_expected && check(KEY_CHECK_PATTERN[match]) ? :key : :string }, :delimiter, set(:string_delimiter) { |match| match } + on check_if(:value_expected), %r/\//, push(:regexp), :delimiter + + on %r/\//, :operator, flag_on(:value_expected), flag_off(:key_expected) + end + + state :string, :key do + on -> (string_delimiter) { STRING_CONTENT_PATTERN[string_delimiter] }, :content + on %r/["']/, :delimiter, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, kind { |match, string_delimiter| + string_delimiter == "'" && !(match == "\\\\" || match == "\\'") ? :content : :char + } + on %r/ \\. /mx, :content + on %r/ \\ /x, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop, :error + end + + state :regexp do + on STRING_CONTENT_PATTERN['/'], :content + on %r/(\/)([gim]+)?/, groups(:delimiter, :modifier), flag_off(:key_expected, :value_expected), pop + on %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + on %r/\\./m, :content + on %r/ \\ /x, pop, :error, flag_off(:key_expected, :value_expected) + end + + # state :open_multi_line_comment do + # on %r! .*? \*/ !mx, :initial # don't consume! + # on %r/ .+ /mx, :comment, -> { value_expected = true } + # + # # if match = scan(%r! .*? \*/ !mx) + # # state = :initial + # # else + # # match = scan(%r! 
.+ !mx) + # # end + # # value_expected = true + # # encoder.text_token match, :comment if match + # end + + protected + + def setup + @state = :initial + end + + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options#{ def_line = __LINE__; nil } + state, string_delimiter = options[:state] || @state + if string_delimiter + encoder.begin_group state + end + + value_expected = true + key_expected = false + function_expected = false + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state, string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + RUBY + + if ENV['PUTS'] + puts scan_tokens_code + puts "callbacks: #{@callbacks.size}" + end + class_eval scan_tokens_code, __FILE__, def_line + + protected + + def reset_instance + super + @xml_scanner.reset if defined? @xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake index 2b25fbf6..b96e86d9 100644 --- a/rake_tasks/test.rake +++ b/rake_tasks/test.rake @@ -48,7 +48,7 @@ Please rename or remove it and run again to use the GitHub repository: task lang => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}" end - (1..4).each do |i| + (1..5).each do |i| task "#{lang}:#{i}" => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}:#{i}" end From e6753b9a96f697f8b35311f4315eda2cb0006e61 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Wed, 25 Mar 2015 00:06:52 +0100 Subject: [PATCH 11/54] cleanup .gitignore --- .gitignore | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index deed1a27..888553ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,9 @@ -.DS_Store -.*~ 
+.* coverage pkg spec/reports doc Gemfile.lock -.rvmrc -.ruby-gemset -.ruby-version test/executable/source.rb.html test/executable/source.rb.json test/scanners From 6eaa589d054ac515e041c0efeff0cdc6f36bfeea Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Wed, 25 Mar 2015 22:44:59 +0100 Subject: [PATCH 12/54] use instance variable instead of class variable --- lib/coderay/scanners/java_script5.rb | 70 ++++++++++++++-------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index 491c6c20..bc5ce9ad 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -15,28 +15,28 @@ class << self attr_accessor :states def state *names, &block - @@code ||= "" + @code ||= "" - @@code << "when #{names.map(&:inspect).join(', ')}\n" + @code << "when #{names.map(&:inspect).join(', ')}\n" - @@first = true + @first = true instance_eval(&block) - @@code << " else\n" - # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" - @@code << " encoder.text_token getch, :error\n" - @@code << " end\n" - @@code << " \n" + @code << " else\n" + # @code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @code << " encoder.text_token getch, :error\n" + @code << " end\n" + @code << " \n" end def on? 
pattern pattern_expression = pattern.inspect - @@code << " #{'els' unless @@first}if check(#{pattern_expression})\n" + @code << " #{'els' unless @first}if check(#{pattern_expression})\n" - @@first = true + @first = true yield - @@code << " end\n" + @code << " end\n" - @@first = false + @first = false end def on *pattern_and_actions @@ -77,69 +77,69 @@ def on *pattern_and_actions raise "I don't know how to evaluate this pattern: %p" % [pattern] end - @@code << " #{'els' unless @@first}if #{precondition_expression}match = scan(#{pattern_expression})\n" + @code << " #{'els' unless @first}if #{precondition_expression}match = scan(#{pattern_expression})\n" for action in actions case action when String raise - @@code << " p 'evaluate #{action.inspect}'\n" if $DEBUG - @@code << " #{action}\n" + @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << " #{action}\n" when Symbol - @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG - @@code << " encoder.text_token match, #{action.inspect}\n" + @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @code << " encoder.text_token match, #{action.inspect}\n" when Kind case action.token_kind when Proc - @@code << " encoder.text_token match, #{make_callback(action.token_kind)}\n" + @code << " encoder.text_token match, #{make_callback(action.token_kind)}\n" else raise "I don't know how to evaluate this kind: %p" % [action.token_kind] end when Groups - @@code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG action.token_kinds.each_with_index do |kind, i| - @@code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + @code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" end when Push case action.state when String raise - @@code << " p 'push %p' % [#{action.state}]\n" if $DEBUG - @@code << 
" state = #{action.state}\n" + @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << " state = #{action.state}\n" when Symbol - @@code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG - @@code << " state = #{action.state.inspect}\n" + @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << " state = #{action.state.inspect}\n" when Proc - @@code << " state = #{make_callback(action.state)}\n" + @code << " state = #{make_callback(action.state)}\n" else raise "I don't know how to evaluate this push state: %p" % [action.state] end - @@code << " states << state\n" - @@code << " encoder.begin_group state\n" + @code << " states << state\n" + @code << " encoder.begin_group state\n" when Pop - @@code << " p 'pop %p' % [states.last]\n" if $DEBUG - @@code << " encoder.end_group states.pop\n" - @@code << " state = states.last\n" + @code << " p 'pop %p' % [states.last]\n" if $DEBUG + @code << " encoder.end_group states.pop\n" + @code << " state = states.last\n" when ValueSetter case action.value when Proc - @@code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" else - @@code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" + @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" end when Proc - @@code << " #{make_callback(action)}\n" + @code << " #{make_callback(action)}\n" else raise "I don't know how to evaluate this action: %p" % [action] end end - @@first = false + @first = false end def groups *token_kinds @@ -356,7 +356,7 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } case state -#{ @@code.chomp.gsub(/^/, ' ') } +#{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder From 5e954e2522a0ec46460086c8bce6975183b366b7 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Fri, 3 Apr 2015 13:02:33 +0200 Subject: [PATCH 13/54] add check_unless --- 
lib/coderay/scanners/java_script5.rb | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index bc5ce9ad..1b9ae5c9 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -8,7 +8,9 @@ class RuleBasedScanner6 < Scanner Kind = Struct.new :token_kind Push = Struct.new :state Pop = Class.new - CheckIf = Struct.new :condition + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check ValueSetter = Struct.new :targets, :value class << self @@ -40,7 +42,7 @@ def on? pattern end def on *pattern_and_actions - if index = pattern_and_actions.find_index { |item| !item.is_a?(CheckIf) } + if index = pattern_and_actions.find_index { |item| !item.is_a?(Check) } preconditions = pattern_and_actions[0..index - 1] if index > 0 pattern = pattern_and_actions[index] or raise 'I need a pattern!' actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' 
@@ -59,6 +61,15 @@ def on *pattern_and_actions else raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] end + when CheckUnless + case precondition.condition + when Proc + precondition_expression << "!#{make_callback(precondition.condition)} && " + when Symbol + precondition_expression << "!#{precondition.condition} && " + else + raise "I don't know how to evaluate this check_unless precondition: %p" % [precondition.condition] + end else raise "I don't know how to evaluate this precondition: %p" % [precondition] end @@ -163,6 +174,10 @@ def check_if value = nil, &callback CheckIf.new value || callback end + def check_unless value = nil, &callback + CheckUnless.new value || callback + end + def flag_on *flags ValueSetter.new Array(flags), true end From 0cd3e62ee4aeec80e749cd88b36ea367ac79ffd4 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Fri, 3 Apr 2015 16:23:25 +0200 Subject: [PATCH 14/54] add DSL CSS scanner --- lib/coderay/scanners/css2.rb | 125 +++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 lib/coderay/scanners/css2.rb diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb new file mode 100644 index 00000000..a072c9eb --- /dev/null +++ b/lib/coderay/scanners/css2.rb @@ -0,0 +1,125 @@ +require_relative 'java_script5' + +module CodeRay +module Scanners + + class CSS2 < RuleBasedScanner6 + + register_for :css2 + + KINDS_NOT_LOC = [ + :comment, + :class, :pseudo_class, :tag, + :id, :directive, + :key, :value, :operator, :color, :float, :string, + :error, :important, :type, + ] # :nodoc: + + module RE # :nodoc: + Hex = /[0-9a-fA-F]/ + Unicode = /\\#{Hex}{1,6}\b/ # differs from standard because it allows uppercase hex too + Escape = /#{Unicode}|\\[^\n0-9a-fA-F]/ + NMChar = /[-_a-zA-Z0-9]/ + NMStart = /[_a-zA-Z]/ + String1 = /(")((?:[^\n\\"]+|\\\n|#{Escape})+)?(")?/ # TODO: buggy regexp + String2 = /(')((?:[^\n\\']+|\\\n|#{Escape})+)?(')?/ # TODO: buggy 
regexp + String = /#{String1}|#{String2}/ + + HexColor = /#(?:#{Hex}{6}|#{Hex}{3})/ + + Num = /-?(?:[0-9]*\.[0-9]+|[0-9]+)n?/ + Name = /#{NMChar}+/ + Ident = /-?#{NMStart}#{NMChar}*/ + AtKeyword = /@#{Ident}/ + Percentage = /#{Num}%/ + + reldimensions = %w[em ex px] + absdimensions = %w[in cm mm pt pc] + Unit = Regexp.union(*(reldimensions + absdimensions + %w[s dpi dppx deg])) + + Dimension = /#{Num}#{Unit}/ + + Function = /((?:url|alpha|attr|counters?)\()((?:[^)\n]|\\\))+)?(\))?/ + + Id = /(?!#{HexColor}\b(?!-))##{Name}/ + Class = /\.#{Name}/ + PseudoClass = /::?#{Ident}/ + AttributeSelector = /(\[)([^\]]+)?(\])?/ + end + + protected + + def setup + @state = :initial + @value_expected = false + @block = false + end + + state :initial do + on %r/\s+/, :space + + on check_if(:block), check_if(:value_expected), %r/(?>#{RE::Ident})(?!\()/x, :value + on check_if(:block), %r/(?>#{RE::Ident})(?!\()/x, :key + + on check_unless(:block), %r/(?>#{RE::Ident})(?!\()|\*/x, :tag + on check_unless(:block), RE::Class, :class + on check_unless(:block), RE::Id, :id + on check_unless(:block), RE::PseudoClass, :pseudo_class + # TODO: Improve highlighting inside of attribute selectors. + on check_unless(:block), RE::AttributeSelector, groups(:operator, :attribute_name, :operator) + on check_unless(:block), %r/(@media)(\s+)?(#{RE::Ident})?(\s+)?(\{)?/, groups(:directive, :space, :type, :space, :operator) + + on %r/\/\*(?:.*?\*\/|\z)/m, :comment + on %r/\{/, :operator, flag_off(:value_expected), flag_on(:block) + on %r/\}/, :operator, flag_off(:value_expected), flag_off(:block) + on RE::String1, push(:string), groups(:delimiter, :content, :delimiter), pop + on RE::String2, push(:string), groups(:delimiter, :content, :delimiter), pop + on RE::Function, push(:function), groups(:delimiter, :content, :delimiter), pop + on %r/(?: #{RE::Dimension} | #{RE::Percentage} | #{RE::Num} )/x, :float + on RE::HexColor, :color + on %r/! 
*important/, :important + on %r/(?:rgb|hsl)a?\([^()\n]*\)?/, :color + on RE::AtKeyword, :directive + on %r/:/, :operator, flag_on(:value_expected) + on %r/;/, :operator, flag_off(:value_expected) + on %r/ [+>~,.=()\/] /x, :operator + end + + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options#{ def_line = __LINE__; nil } + states = Array(options[:state] || @state).dup + value_expected = @value_expected + block = @block + + until eos? + + case state + +#{ @code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = states + @value_expected = value_expected + @block = block + end + + encoder + end + RUBY + + if ENV['PUTS'] + puts scan_tokens_code + puts "callbacks: #{@callbacks.size}" + end + class_eval scan_tokens_code, __FILE__, def_line + + end + +end +end From e8bef1034bedcc4ae1698657ea349b455edff58d Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Fri, 3 Apr 2015 16:30:33 +0200 Subject: [PATCH 15/54] move RuleBasedScanner into own file --- lib/coderay.rb | 3 + lib/coderay/rule_based_scanner.rb | 221 +++++++++++++++++++++++++++ lib/coderay/scanners/css2.rb | 4 +- lib/coderay/scanners/java_script5.rb | 220 +------------------------- 4 files changed, 226 insertions(+), 222 deletions(-) create mode 100644 lib/coderay/rule_based_scanner.rb diff --git a/lib/coderay.rb b/lib/coderay.rb index f759ed63..14e45aa7 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -153,6 +153,9 @@ def self.coderay_path *path autoload :Encoders, coderay_path('encoder') autoload :Styles, coderay_path('style') + # DSL Scanner + autoload :RuleBasedScanner, coderay_path('rule_based_scanner') + # convenience access and reusable Encoder/Scanner pair autoload :Duo, coderay_path('duo') diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb new file mode 100644 index 00000000..670cb152 --- /dev/null +++ b/lib/coderay/rule_based_scanner.rb @@ -0,0 +1,221 @@ +module 
CodeRay + module Scanners + class RuleBasedScanner < Scanner + + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state + Pop = Class.new + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check + ValueSetter = Struct.new :targets, :value + + class << self + attr_accessor :states + + def state *names, &block + @code ||= "" + + @code << "when #{names.map(&:inspect).join(', ')}\n" + + @first = true + instance_eval(&block) + @code << " else\n" + # @code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @code << " encoder.text_token getch, :error\n" + @code << " end\n" + @code << " \n" + end + + def on? pattern + pattern_expression = pattern.inspect + @code << " #{'els' unless @first}if check(#{pattern_expression})\n" + + @first = true + yield + @code << " end\n" + + @first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !item.is_a?(Check) } + preconditions = pattern_and_actions[0..index - 1] if index > 0 + pattern = pattern_and_actions[index] or raise 'I need a pattern!' + actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' 
+ end + + precondition_expression = '' + if preconditions + for precondition in preconditions + case precondition + when CheckIf + case precondition.condition + when Proc + precondition_expression << "#{make_callback(precondition.condition)} && " + when Symbol + precondition_expression << "#{precondition.condition} && " + else + raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] + end + when CheckUnless + case precondition.condition + when Proc + precondition_expression << "!#{make_callback(precondition.condition)} && " + when Symbol + precondition_expression << "!#{precondition.condition} && " + else + raise "I don't know how to evaluate this check_unless precondition: %p" % [precondition.condition] + end + else + raise "I don't know how to evaluate this precondition: %p" % [precondition] + end + end + end + + case pattern + when String + raise + pattern_expression = pattern + when Regexp + pattern_expression = pattern.inspect + when Proc + pattern_expression = make_callback(pattern) + else + raise "I don't know how to evaluate this pattern: %p" % [pattern] + end + + @code << " #{'els' unless @first}if #{precondition_expression}match = scan(#{pattern_expression})\n" + + for action in actions + case action + when String + raise + @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << " #{action}\n" + + when Symbol + @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @code << " encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @code << " encoder.text_token match, #{make_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index do |kind, i| + @code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + end + + 
when Push + case action.state + when String + raise + @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << " state = #{action.state}\n" + when Symbol + @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << " state = #{action.state.inspect}\n" + when Proc + @code << " state = #{make_callback(action.state)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + @code << " states << state\n" + @code << " encoder.begin_group state\n" + when Pop + @code << " p 'pop %p' % [states.last]\n" if $DEBUG + @code << " encoder.end_group states.pop\n" + @code << " state = states.last\n" + + when ValueSetter + case action.value + when Proc + @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + else + @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Proc + @code << " #{make_callback(action)}\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block + end + + def pop + Pop.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def check_unless value = nil, &callback + CheckUnless.new value || callback + end + + def flag_on *flags + ValueSetter.new Array(flags), true + end + + def flag_off *flags + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + ValueSetter.new [flag], value || callback + end + + def unset *flags + ValueSetter.new Array(flags), nil + end + + protected + + def make_callback block + @callbacks ||= {} + + base_name = "__callback_line_#{block.source_location.last}" + name = base_name + counter = 'a' + while @callbacks.key?(name) + name = 
"#{base_name}_#{counter}" + counter.succ! + end + + @callbacks[name] = define_method(name, &block) + + arguments = block.parameters.map(&:last) + + if arguments.empty? + name + else + "#{name}(#{arguments.join(', ')})" + end + end + end + end + end +end \ No newline at end of file diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb index a072c9eb..93b890ae 100644 --- a/lib/coderay/scanners/css2.rb +++ b/lib/coderay/scanners/css2.rb @@ -1,9 +1,7 @@ -require_relative 'java_script5' - module CodeRay module Scanners - class CSS2 < RuleBasedScanner6 + class CSS2 < RuleBasedScanner register_for :css2 diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index 1b9ae5c9..e337cc5f 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -2,228 +2,10 @@ module CodeRay module Scanners - class RuleBasedScanner6 < Scanner - - Groups = Struct.new :token_kinds - Kind = Struct.new :token_kind - Push = Struct.new :state - Pop = Class.new - Check = Struct.new :condition - CheckIf = Class.new Check - CheckUnless = Class.new Check - ValueSetter = Struct.new :targets, :value - - class << self - attr_accessor :states - - def state *names, &block - @code ||= "" - - @code << "when #{names.map(&:inspect).join(', ')}\n" - - @first = true - instance_eval(&block) - @code << " else\n" - # @code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" - @code << " encoder.text_token getch, :error\n" - @code << " end\n" - @code << " \n" - end - - def on? pattern - pattern_expression = pattern.inspect - @code << " #{'els' unless @first}if check(#{pattern_expression})\n" - - @first = true - yield - @code << " end\n" - - @first = false - end - - def on *pattern_and_actions - if index = pattern_and_actions.find_index { |item| !item.is_a?(Check) } - preconditions = pattern_and_actions[0..index - 1] if index > 0 - pattern = pattern_and_actions[index] or raise 'I need a pattern!' 
- actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' - end - - precondition_expression = '' - if preconditions - for precondition in preconditions - case precondition - when CheckIf - case precondition.condition - when Proc - precondition_expression << "#{make_callback(precondition.condition)} && " - when Symbol - precondition_expression << "#{precondition.condition} && " - else - raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] - end - when CheckUnless - case precondition.condition - when Proc - precondition_expression << "!#{make_callback(precondition.condition)} && " - when Symbol - precondition_expression << "!#{precondition.condition} && " - else - raise "I don't know how to evaluate this check_unless precondition: %p" % [precondition.condition] - end - else - raise "I don't know how to evaluate this precondition: %p" % [precondition] - end - end - end - - case pattern - when String - raise - pattern_expression = pattern - when Regexp - pattern_expression = pattern.inspect - when Proc - pattern_expression = make_callback(pattern) - else - raise "I don't know how to evaluate this pattern: %p" % [pattern] - end - - @code << " #{'els' unless @first}if #{precondition_expression}match = scan(#{pattern_expression})\n" - - for action in actions - case action - when String - raise - @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG - @code << " #{action}\n" - - when Symbol - @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG - @code << " encoder.text_token match, #{action.inspect}\n" - when Kind - case action.token_kind - when Proc - @code << " encoder.text_token match, #{make_callback(action.token_kind)}\n" - else - raise "I don't know how to evaluate this kind: %p" % [action.token_kind] - end - when Groups - @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG - action.token_kinds.each_with_index do |kind, i| - @code << " 
encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" - end - - when Push - case action.state - when String - raise - @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG - @code << " state = #{action.state}\n" - when Symbol - @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG - @code << " state = #{action.state.inspect}\n" - when Proc - @code << " state = #{make_callback(action.state)}\n" - else - raise "I don't know how to evaluate this push state: %p" % [action.state] - end - @code << " states << state\n" - @code << " encoder.begin_group state\n" - when Pop - @code << " p 'pop %p' % [states.last]\n" if $DEBUG - @code << " encoder.end_group states.pop\n" - @code << " state = states.last\n" - - when ValueSetter - case action.value - when Proc - @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" - else - @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" - end - - when Proc - @code << " #{make_callback(action)}\n" - - else - raise "I don't know how to evaluate this action: %p" % [action] - end - end - - @first = false - end - - def groups *token_kinds - Groups.new token_kinds - end - - def kind token_kind = nil, &block - Kind.new token_kind || block - end - - def push state = nil, &block - raise 'push requires a state or a block; got nothing' unless state || block - Push.new state || block - end - - def pop - Pop.new - end - - def check_if value = nil, &callback - CheckIf.new value || callback - end - - def check_unless value = nil, &callback - CheckUnless.new value || callback - end - - def flag_on *flags - ValueSetter.new Array(flags), true - end - - def flag_off *flags - ValueSetter.new Array(flags), false - end - - def set flag, value = nil, &callback - ValueSetter.new [flag], value || callback - end - - def unset *flags - ValueSetter.new Array(flags), nil - end - - protected - - def make_callback block - @callbacks ||= {} - - base_name = 
"__callback_line_#{block.source_location.last}" - name = base_name - counter = 'a' - while @callbacks.key?(name) - name = "#{base_name}_#{counter}" - counter.succ! - end - - @callbacks[name] = define_method(name, &block) - - arguments = block.parameters.map(&:last) - - if arguments.empty? - name - else - "#{name}(#{arguments.join(', ')})" - end - end - end - end - # Scanner for JavaScript. # # Aliases: +ecmascript+, +ecma_script+, +javascript+ - class JavaScript5 < RuleBasedScanner6 + class JavaScript5 < RuleBasedScanner register_for :java_script5 file_extension 'js' From 235e01b4077a33ccc82e646d7f5992dd41b6646e Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Wed, 22 Apr 2015 00:56:03 +0200 Subject: [PATCH 16/54] add push/pop state, working on C scanner --- lib/coderay/rule_based_scanner.rb | 35 +++++++-- lib/coderay/scanners/c2.rb | 126 ++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 lib/coderay/scanners/c2.rb diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 670cb152..ac6a623c 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -6,10 +6,13 @@ class RuleBasedScanner < Scanner Kind = Struct.new :token_kind Push = Struct.new :state Pop = Class.new + PushState = Struct.new :state + PopState = Class.new Check = Struct.new :condition CheckIf = Class.new Check CheckUnless = Class.new Check ValueSetter = Struct.new :targets, :value + Continue = Class.new class << self attr_accessor :states @@ -101,7 +104,7 @@ def on *pattern_and_actions when Kind case action.token_kind when Proc - @code << " encoder.text_token match, #{make_callback(action.token_kind)}\n" + @code << " encoder.text_token match, kind = #{make_callback(action.token_kind)}\n" else raise "I don't know how to evaluate this kind: %p" % [action.token_kind] end @@ -111,7 +114,7 @@ def on *pattern_and_actions @code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if 
self[#{i + 1}]\n" end - when Push + when Push, PushState case action.state when String raise @@ -126,16 +129,22 @@ def on *pattern_and_actions raise "I don't know how to evaluate this push state: %p" % [action.state] end @code << " states << state\n" - @code << " encoder.begin_group state\n" - when Pop + @code << " encoder.begin_group state\n" if action.is_a? Push + when Pop, PopState @code << " p 'pop %p' % [states.last]\n" if $DEBUG - @code << " encoder.end_group states.pop\n" + if action.is_a? Pop + @code << " encoder.end_group states.pop\n" + else + @code << " states.pop\n" + end @code << " state = states.last\n" when ValueSetter case action.value when Proc @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} = #{action.value}\n" else @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" end @@ -143,6 +152,9 @@ def on *pattern_and_actions when Proc @code << " #{make_callback(action)}\n" + when Continue + @code << " next\n" + else raise "I don't know how to evaluate this action: %p" % [action] end @@ -168,6 +180,15 @@ def pop Pop.new end + def push_state state = nil, &block + raise 'push_state requires a state or a block; got nothing' unless state || block + PushState.new state || block + end + + def pop_state + PopState.new + end + def check_if value = nil, &callback CheckIf.new value || callback end @@ -192,6 +213,10 @@ def unset *flags ValueSetter.new Array(flags), nil end + def continue + Continue.new + end + protected def make_callback block diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb new file mode 100644 index 00000000..d21e032c --- /dev/null +++ b/lib/coderay/scanners/c2.rb @@ -0,0 +1,126 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C2 < RuleBasedScanner + + register_for :c2 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, flag_off(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line) + on %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected| match =~ /[;\{\}]/ || case_expected && match =~ /:/ }, flag_off(:case_expected) + + on %r/ (?: case | default ) \b /x, :keyword, flag_on(:case_expected), flag_off(:label_expected) + on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match| + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + }, set(:label_expected) { |kind| kind == :label } + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, flag_off(:label_expected) + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? 
'? /x, :char, flag_off(:label_expected) + on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:label_expected) + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:label_expected) + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, flag_off(:label_expected) + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:label_expected) + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, flag_on(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected) + on %r/ \# [ \t]* \w* /x, :preprocessor, flag_on(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop, flag_off(:label_expected) + on %r/ \\ /x, pop, :error, flag_off(:label_expected) + on %r/ $ /x, pop, flag_off(:label_expected), continue + end + + state :include_expected do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state + on %r/ \s*? \n \s* /x, :space, pop_state + on %r/\s+/, :space + on %r//, pop_state, continue # TODO: add otherwise method for this + end + + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options#{ def_line = __LINE__; nil } + state = :initial + label_expected = true + case_expected = false + label_expected_before_preproc_line = nil + in_preproc_line = false + + states = [state] + + until eos? + last_pos = pos + case state +#{ @code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + + raise_inspect 'nothing was consumed! 
states = %p' % [states], encoder if pos == last_pos + end + + if state == :string + encoder.end_group :string + end + + encoder + end + RUBY + + if ENV['PUTS'] + puts scan_tokens_code + puts "callbacks: #{@callbacks.size}" + end + class_eval scan_tokens_code, __FILE__, def_line + end +end +end From 7dcbf8a62e36c34d94528db7444419de46205a0a Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:20:02 +0100 Subject: [PATCH 17/54] Debug encoder should count tokens for better inspection --- lib/coderay/encoders/debug.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/coderay/encoders/debug.rb b/lib/coderay/encoders/debug.rb index f4db3301..6b680fc9 100644 --- a/lib/coderay/encoders/debug.rb +++ b/lib/coderay/encoders/debug.rb @@ -15,9 +15,12 @@ class Debug < Encoder register_for :debug + attr_reader :size + FILE_EXTENSION = 'raydebug' def text_token text, kind + @size += 1 if kind == :space @out << text else @@ -43,6 +46,13 @@ def end_line kind @out << ']' end + protected + + def setup options + super + @size = 0 + end + end end From e6d46f903a1478411696d1ed249e772096e64237 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:20:10 +0100 Subject: [PATCH 18/54] just show the array --- lib/coderay/encoders/debug_lint.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/coderay/encoders/debug_lint.rb b/lib/coderay/encoders/debug_lint.rb index a4eba2c7..497d8c5d 100644 --- a/lib/coderay/encoders/debug_lint.rb +++ b/lib/coderay/encoders/debug_lint.rb @@ -29,7 +29,7 @@ def begin_group kind end def end_group kind - raise Lint::IncorrectTokenGroupNesting, 'We are inside %s, not %p (end_group)' % [@opened.reverse.map(&:inspect).join(' < '), kind] if @opened.last != kind + raise Lint::IncorrectTokenGroupNesting, 'We are inside %p, not %p (end_group)' % [@opened.reverse, kind] if @opened.last != kind @opened.pop super end @@ -40,7 +40,7 @@ def begin_line kind end def end_line kind - raise 
Lint::IncorrectTokenGroupNesting, 'We are inside %s, not %p (end_line)' % [@opened.reverse.map(&:inspect).join(' < '), kind] if @opened.last != kind + raise Lint::IncorrectTokenGroupNesting, 'We are inside %p, not %p (end_line)' % [@opened.reverse, kind] if @opened.last != kind @opened.pop super end From 61a9d9689be7720a8345b7ff35f08041134cc15e Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:21:25 +0100 Subject: [PATCH 19/54] scanner tweaks --- lib/coderay/rule_based_scanner.rb | 10 ++++++---- lib/coderay/scanners/c2.rb | 2 +- lib/coderay/scanners/css2.rb | 5 +++-- lib/coderay/scanners/java_script5.rb | 4 ++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index ac6a623c..9239cf62 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -219,18 +219,20 @@ def continue protected - def make_callback block + def callbacks @callbacks ||= {} - + end + + def make_callback block base_name = "__callback_line_#{block.source_location.last}" name = base_name counter = 'a' - while @callbacks.key?(name) + while callbacks.key?(name) name = "#{base_name}_#{counter}" counter.succ! 
end - @callbacks[name] = define_method(name, &block) + callbacks[name] = define_method(name, &block) arguments = block.parameters.map(&:last) diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb index d21e032c..a5d4d096 100644 --- a/lib/coderay/scanners/c2.rb +++ b/lib/coderay/scanners/c2.rb @@ -118,7 +118,7 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } if ENV['PUTS'] puts scan_tokens_code - puts "callbacks: #{@callbacks.size}" + puts "callbacks: #{callbacks.size}" end class_eval scan_tokens_code, __FILE__, def_line end diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb index 93b890ae..edb9ca3f 100644 --- a/lib/coderay/scanners/css2.rb +++ b/lib/coderay/scanners/css2.rb @@ -88,6 +88,7 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } states = Array(options[:state] || @state).dup value_expected = @value_expected block = @block + state = states.last until eos? @@ -112,8 +113,8 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } RUBY if ENV['PUTS'] - puts scan_tokens_code - puts "callbacks: #{@callbacks.size}" + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" end class_eval scan_tokens_code, __FILE__, def_line diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index e337cc5f..ae68cef0 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -174,8 +174,8 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } RUBY if ENV['PUTS'] - puts scan_tokens_code - puts "callbacks: #{@callbacks.size}" + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" end class_eval scan_tokens_code, __FILE__, def_line From c274a9024e0a16cae529431b109d364f6cc3a65e Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:26:43 +0100 Subject: [PATCH 20/54] fix comment --- lib/coderay/scanners/lua.rb | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/lib/coderay/scanners/lua.rb b/lib/coderay/scanners/lua.rb index fb1e45a7..81d7dae4 100644 --- a/lib/coderay/scanners/lua.rb +++ b/lib/coderay/scanners/lua.rb @@ -76,7 +76,7 @@ def scan_tokens(encoder, options) encoder.text_token(match, :comment) elsif match = scan(/\[=*\[/) # [[ long (possibly multiline) string ]] - num_equals = match.count("=") # Number must match for comment end + num_equals = match.count("=") # Number must match for string end encoder.begin_group(:string) encoder.text_token(match, :delimiter) state = :long_string From 36af5cab11f12499d86d999373c6ce2b525e145c Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:29:22 +0100 Subject: [PATCH 21/54] add explicit pattern method; make pattern optional --- lib/coderay/rule_based_scanner.rb | 63 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 9239cf62..f354c367 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -2,6 +2,7 @@ module CodeRay module Scanners class RuleBasedScanner < Scanner + Pattern = Struct.new :pattern Groups = Struct.new :token_kinds Kind = Struct.new :token_kind Push = Struct.new :state @@ -43,53 +44,51 @@ def on? pattern end def on *pattern_and_actions - if index = pattern_and_actions.find_index { |item| !item.is_a?(Check) } - preconditions = pattern_and_actions[0..index - 1] if index > 0 - pattern = pattern_and_actions[index] or raise 'I need a pattern!' - actions = pattern_and_actions[index + 1..-1] or raise 'I need actions!' + if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) } + conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!' + actions = pattern_and_actions[index..-1] or raise 'I need actions!' 
+ else + raise "invalid rule structure: #{pattern_and_actions.map(&:class)}" end - precondition_expression = '' - if preconditions - for precondition in preconditions - case precondition + condition_expressions = [] + if conditions + for condition in conditions + case condition when CheckIf - case precondition.condition + case condition.condition when Proc - precondition_expression << "#{make_callback(precondition.condition)} && " + condition_expressions << "#{make_callback(condition.condition)}" when Symbol - precondition_expression << "#{precondition.condition} && " + condition_expressions << "#{condition.condition}" else - raise "I don't know how to evaluate this check_if precondition: %p" % [precondition.condition] + raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition] end when CheckUnless - case precondition.condition + case condition.condition when Proc - precondition_expression << "!#{make_callback(precondition.condition)} && " + condition_expressions << "!#{make_callback(condition.condition)}" when Symbol - precondition_expression << "!#{precondition.condition} && " + condition_expressions << "!#{condition.condition}" else - raise "I don't know how to evaluate this check_unless precondition: %p" % [precondition.condition] + raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition] end + when Pattern + case condition.pattern + when Proc + condition_expressions << "match = scan(#{make_callback(condition.pattern)})" + else + raise "I don't know how to evaluate this pattern: %p" % [condition.pattern] + end + when Regexp + condition_expressions << "match = scan(#{condition.inspect})" else - raise "I don't know how to evaluate this precondition: %p" % [precondition] + raise "I don't know how to evaluate this pattern/condition: %p" % [condition] end end end - case pattern - when String - raise - pattern_expression = pattern - when Regexp - pattern_expression = pattern.inspect - when Proc - 
pattern_expression = make_callback(pattern) - else - raise "I don't know how to evaluate this pattern: %p" % [pattern] - end - - @code << " #{'els' unless @first}if #{precondition_expression}match = scan(#{pattern_expression})\n" + @code << " #{'els' unless @first}if #{condition_expressions.join(' && ')}\n" for action in actions case action @@ -167,6 +166,10 @@ def groups *token_kinds Groups.new token_kinds end + def pattern pattern = nil, &block + Pattern.new pattern || block + end + def kind token_kind = nil, &block Kind.new token_kind || block end From 40f1fa7bc46a5fd8addba230afb21f072a7e1215 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:32:30 +0100 Subject: [PATCH 22/54] Push and Pop take optional group argument now --- lib/coderay/rule_based_scanner.rb | 40 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index f354c367..2154fb75 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -5,8 +5,8 @@ class RuleBasedScanner < Scanner Pattern = Struct.new :pattern Groups = Struct.new :token_kinds Kind = Struct.new :token_kind - Push = Struct.new :state - Pop = Class.new + Push = Struct.new :state, :group + Pop = Struct.new :group PushState = Struct.new :state PopState = Class.new Check = Struct.new :condition @@ -128,11 +128,35 @@ def on *pattern_and_actions raise "I don't know how to evaluate this push state: %p" % [action.state] end @code << " states << state\n" - @code << " encoder.begin_group state\n" if action.is_a? Push + if action.is_a? 
Push + if action.state == action.group + @code << " encoder.begin_group state\n" + else + case action.state + when Symbol + @code << " p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + @code << " encoder.begin_group #{action.group.inspect}\n" + when Proc + @code << " encoder.begin_group #{make_callback(action.group)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + end + end when Pop, PopState @code << " p 'pop %p' % [states.last]\n" if $DEBUG if action.is_a? Pop - @code << " encoder.end_group states.pop\n" + if action.group + case action.group + when Symbol + @code << " encoder.end_group #{action.group.inspect}\n" + else + raise "I don't know how to evaluate this pop group: %p" % [action.group] + end + @code << " states.pop\n" + else + @code << " encoder.end_group states.pop\n" + end else @code << " states.pop\n" end @@ -174,13 +198,13 @@ def kind token_kind = nil, &block Kind.new token_kind || block end - def push state = nil, &block + def push state = nil, group = state, &block raise 'push requires a state or a block; got nothing' unless state || block - Push.new state || block + Push.new state || block, group || block end - def pop - Pop.new + def pop group = nil + Pop.new group end def push_state state = nil, &block From aa93af4cbc465d90db8f1f27477214ec119223a3 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:32:56 +0100 Subject: [PATCH 23/54] quick increment/decrement, yay! 
--- lib/coderay/rule_based_scanner.rb | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 2154fb75..ba49ccf3 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -13,6 +13,7 @@ class RuleBasedScanner < Scanner CheckIf = Class.new Check CheckUnless = Class.new Check ValueSetter = Struct.new :targets, :value + Increment = Struct.new :targets, :operation, :value Continue = Class.new class << self @@ -172,6 +173,16 @@ def on *pattern_and_actions @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" end + when Increment + case action.value + when Proc + @code << " #{action.targets.join(' = ')} #{action.operation}= #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + else + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + end + when Proc @code << " #{make_callback(action)}\n" @@ -240,6 +251,14 @@ def unset *flags ValueSetter.new Array(flags), nil end + def increment *counters + Increment.new Array(counters), :+, 1 + end + + def decrement *counters + Increment.new Array(counters), :-, 1 + end + def continue Continue.new end From 3df8487e1a8f73b4a0723b2e833b74c9d7801bc8 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:33:16 +0100 Subject: [PATCH 24/54] use explicit pattern method --- lib/coderay/scanners/java_script5.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index ae68cef0..6b5a83fb 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -100,7 +100,7 @@ class JavaScript5 < RuleBasedScanner end state :string, :key do - on -> (string_delimiter) { STRING_CONTENT_PATTERN[string_delimiter] }, :content + on pattern { |string_delimiter| 
STRING_CONTENT_PATTERN[string_delimiter] }, :content on %r/["']/, :delimiter, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, kind { |match, string_delimiter| string_delimiter == "'" && !(match == "\\\\" || match == "\\'") ? :content : :char From 7561e8ddfa64cdcefb3a79eaa14085cfb8ef2c0d Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:34:19 +0100 Subject: [PATCH 25/54] warn about error tokens --- lib/coderay/rule_based_scanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index ba49ccf3..24e278d0 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -27,7 +27,7 @@ def state *names, &block @first = true instance_eval(&block) @code << " else\n" - # @code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @code << " puts 'no match for #{names.map(&:inspect).join(', ')}'\n" if $DEBUG @code << " encoder.text_token getch, :error\n" @code << " end\n" @code << " \n" From a1a7b2c871a0b33451292e542073a4aa743c91f2 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:47:21 +0100 Subject: [PATCH 26/54] add json scanner using RuleBasedScanner --- lib/coderay/scanners/json5.rb | 92 +++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 lib/coderay/scanners/json5.rb diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb new file mode 100644 index 00000000..c68ebb7d --- /dev/null +++ b/lib/coderay/scanners/json5.rb @@ -0,0 +1,92 @@ +module CodeRay +module Scanners + + # Scanner for JSON (JavaScript Object Notation). 
+ class JSON5 < RuleBasedScanner + + register_for :json5 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + + state :initial do + on %r/ \s+ /x, :space + + on %r/ [:,\[{\]}] /x, :operator + + on %r/ " (?=#{KEY}) /x, push(:key), :delimiter + on %r/ " /x, push(:string), :delimiter + + on %r/ true | false | null /x, :value + on %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + on %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + on %r/ [^\\"]+ /x, :content + + on %r/ " /x, :delimiter, pop + + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + on %r/ \\. /mx, :content + on %r/ \\ /x, :error, pop + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + + case state + +#{ @code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + RUBY + + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + class_eval scan_tokens_code + + end + +end +end From 4da772ba7dd5026e74c24c1e007784ea4203690c Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:50:36 +0100 Subject: [PATCH 27/54] add generated Lua scanner --- lib/coderay/scanners/lua2.rb | 184 +++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 lib/coderay/scanners/lua2.rb diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb new file mode 100644 index 00000000..b047629c --- /dev/null +++ b/lib/coderay/scanners/lua2.rb @@ -0,0 +1,184 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua2 < RuleBasedScanner + + register_for :lua2 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. + KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. 
+ # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(PREDEFINED_EXPRESSIONS, :predefined) + + protected + + # Scanner initialization. + def setup + @state = :initial + @brace_depth = 0 + @num_equals = nil + end + + state :initial, :map do + on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end + on %r/--.*$/, :comment # --Lua comment + on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label # ::goto_label:: + on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua + on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind, state| # Normal letters (or letters followed by digits) + # Extra highlighting for entities following certain keywords + if kind == :keyword && match == 'function' + state = :function_expected + elsif kind == :keyword && match == 'goto' + state = :goto_label_expected + elsif kind == :keyword && match == 'local' + state = :local_var_expected + end + + state + } + + on %r/\{/, 
push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace { + on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace } + on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/'/, push(:single_quoted_string, :string), :delimiter, set(:start_delim, :match) # String delimiters " and ' + on %r/"/, push(:double_quoted_string, :string), :delimiter, set(:start_delim, :match) + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators + on %r/\s+/, :space # Space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo() + on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state + on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace + end + + state :local_var_expected do + on %r/function/, :keyword, pop_state, push_state(:function_expected) # local function ... 
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop_state + on %r/\n/, :space, pop_state + on %r/\s+/, :space + end + + state :long_comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string do + on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + state :double_quoted_string do + on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + # CodeRay entry hook. Starts parsing. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options#{ def_line = __LINE__; nil } + state = options[:state] || @state + brace_depth = @brace_depth + num_equals = nil + + states = [state] + + until eos? 
+ + case state + +#{ @code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + encoder.end_group :string if [:string, :single_quoted_string, :double_quoted_string].include? state + brace_depth.times { encoder.end_group :map } + + encoder + end + RUBY + + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + class_eval scan_tokens_code, __FILE__, def_line + end + +end +end From 9f4af602c8f2b9892ff7b92071ae25da9d509ea4 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:50:55 +0100 Subject: [PATCH 28/54] highlight generated C scanner (like the others) --- lib/coderay/scanners/c2.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb index a5d4d096..d12f9105 100644 --- a/lib/coderay/scanners/c2.rb +++ b/lib/coderay/scanners/c2.rb @@ -117,7 +117,7 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } RUBY if ENV['PUTS'] - puts scan_tokens_code + puts CodeRay.scan(scan_tokens_code, :ruby).terminal puts "callbacks: #{callbacks.size}" end class_eval scan_tokens_code, __FILE__, def_line From aaa1705c486001cd73b9787b01c2e8a1d70c1009 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 00:51:29 +0100 Subject: [PATCH 29/54] ignore benchmark results --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 888553ca..e97fe08a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .* +bench/example.* coverage pkg spec/reports From 0d1c78656fbcdda8c5f729950d46d8ea855eeca7 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:25:20 +0100 Subject: [PATCH 30/54] move comment to the top --- lib/coderay/scanners/json5.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/coderay/scanners/json5.rb 
b/lib/coderay/scanners/json5.rb index c68ebb7d..f7f6eac2 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -2,6 +2,8 @@ module CodeRay module Scanners # Scanner for JSON (JavaScript Object Notation). + # + # See http://json.org/ for a definition of the JSON lexic/grammar. class JSON5 < RuleBasedScanner register_for :json5 @@ -45,7 +47,6 @@ def setup @state = :initial end - # See http://json.org/ for a definition of the JSON lexic/grammar. scan_tokens_code = <<-"RUBY" def scan_tokens encoder, options state = options[:state] || @state From dd1d7791d818152506f48a6d429d882e4bda2087 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:26:22 +0100 Subject: [PATCH 31/54] move setup to superclass --- lib/coderay/rule_based_scanner.rb | 5 +++++ lib/coderay/scanners/java_script5.rb | 4 ---- lib/coderay/scanners/json5.rb | 4 ---- lib/coderay/scanners/lua2.rb | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 24e278d0..35adad17 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -289,6 +289,11 @@ def make_callback block end end end + + def setup + @state = :initial + end + end end end \ No newline at end of file diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index 6b5a83fb..68576624 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -132,10 +132,6 @@ class JavaScript5 < RuleBasedScanner protected - def setup - @state = :initial - end - scan_tokens_code = <<-"RUBY" def scan_tokens encoder, options#{ def_line = __LINE__; nil } state, string_delimiter = options[:state] || @state diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb index f7f6eac2..29334830 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -43,10 +43,6 @@ class JSON5 < RuleBasedScanner 
protected - def setup - @state = :initial - end - scan_tokens_code = <<-"RUBY" def scan_tokens encoder, options state = options[:state] || @state diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index b047629c..fe638976 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -52,7 +52,7 @@ class Lua2 < RuleBasedScanner # Scanner initialization. def setup - @state = :initial + super @brace_depth = 0 @num_equals = nil end From 42e2ca35e1e76699c5cbe8b35ffe8964a67c40fc Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:31:05 +0100 Subject: [PATCH 32/54] cleanup --- lib/coderay/scanners/json5.rb | 2 +- lib/coderay/scanners/lua2.rb | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb index 29334830..2b2dbddc 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -44,7 +44,7 @@ class JSON5 < RuleBasedScanner protected scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options + def scan_tokens encoder, options#{ def_line = __LINE__; nil } state = options[:state] || @state if [:string, :key].include? state diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index fe638976..866ddedb 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -141,7 +141,6 @@ def setup # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings end - # CodeRay entry hook. Starts parsing. 
scan_tokens_code = <<-"RUBY" def scan_tokens encoder, options#{ def_line = __LINE__; nil } state = options[:state] || @state From f1bd8330bf179791f6257c85b951860458f0826c Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:31:41 +0100 Subject: [PATCH 33/54] use setup --- lib/coderay/scanners/c2.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb index d12f9105..87964f94 100644 --- a/lib/coderay/scanners/c2.rb +++ b/lib/coderay/scanners/c2.rb @@ -89,7 +89,7 @@ class C2 < RuleBasedScanner scan_tokens_code = <<-"RUBY" def scan_tokens encoder, options#{ def_line = __LINE__; nil } - state = :initial + state = @state label_expected = true case_expected = false label_expected_before_preproc_line = nil From ca3f15f8bb1b23e12bbe0baedaf2c381c466d36b Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:35:50 +0100 Subject: [PATCH 34/54] remove whitespace --- lib/coderay/scanners/css2.rb | 4 ---- lib/coderay/scanners/java_script5.rb | 4 ---- lib/coderay/scanners/json5.rb | 4 ---- lib/coderay/scanners/lua2.rb | 4 ---- 4 files changed, 16 deletions(-) diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb index edb9ca3f..4b20b793 100644 --- a/lib/coderay/scanners/css2.rb +++ b/lib/coderay/scanners/css2.rb @@ -91,15 +91,11 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } state = states.last until eos? - case state - #{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder - end - end if options[:keep_state] diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index 68576624..bee1b260 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -146,15 +146,11 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } states = [state] until eos? 
- case state - #{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder - end - end if options[:keep_state] diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb index 2b2dbddc..3b5b159f 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -54,15 +54,11 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } states = [state] until eos? - case state - #{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder - end - end if options[:keep_state] diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index 866ddedb..4b99a442 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -150,15 +150,11 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } states = [state] until eos? - case state - #{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder - end - end if options[:keep_state] From ee9e8407fb25e3383caa6e8954dc49ada031a538 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 13 Feb 2016 10:44:34 +0100 Subject: [PATCH 35/54] cleanup --- lib/coderay/scanners/c2.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb index 87964f94..7ae382af 100644 --- a/lib/coderay/scanners/c2.rb +++ b/lib/coderay/scanners/c2.rb @@ -77,14 +77,14 @@ class C2 < RuleBasedScanner on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char on %r/"/, :delimiter, pop, flag_off(:label_expected) on %r/ \\ /x, pop, :error, flag_off(:label_expected) - on %r/ $ /x, pop, flag_off(:label_expected), continue + on %r/ $ /x, pop, flag_off(:label_expected) end state :include_expected do on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state on %r/ \s*? 
\n \s* /x, :space, pop_state on %r/\s+/, :space - on %r//, pop_state, continue # TODO: add otherwise method for this + on %r//, pop_state # TODO: add otherwise method for this end scan_tokens_code = <<-"RUBY" @@ -98,14 +98,14 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } states = [state] until eos? - last_pos = pos + # last_pos = pos case state #{ @code.chomp.gsub(/^/, ' ') } else raise_inspect 'Unknown state: %p' % [state], encoder end - raise_inspect 'nothing was consumed! states = %p' % [states], encoder if pos == last_pos + # raise_inspect 'nothing was consumed! states = %p' % [states], encoder if pos == last_pos end if state == :string From f8cadd9fce43c48ed4b32685f62e99f8770b8963 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 12:32:06 +0100 Subject: [PATCH 36/54] add line number to eval --- lib/coderay/scanners/json5.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb index 3b5b159f..dcfdd3fe 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -77,7 +77,7 @@ def scan_tokens encoder, options#{ def_line = __LINE__; nil } puts CodeRay.scan(scan_tokens_code, :ruby).terminal puts "callbacks: #{callbacks.size}" end - class_eval scan_tokens_code + class_eval scan_tokens_code, __FILE__, def_line end From 13ac3fdc6fa5330c9eacc6ac9be92a869ab8d3be Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 15:08:30 +0100 Subject: [PATCH 37/54] optional push state (return nil) --- lib/coderay/rule_based_scanner.rb | 8 ++++++-- lib/coderay/scanners/lua2.rb | 10 ++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 35adad17..334c08c4 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -120,15 +120,19 @@ def on *pattern_and_actions raise @code << " p 'push %p' % 
[#{action.state}]\n" if $DEBUG @code << " state = #{action.state}\n" + @code << " states << state\n" when Symbol @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG @code << " state = #{action.state.inspect}\n" + @code << " states << state\n" when Proc - @code << " state = #{make_callback(action.state)}\n" + @code << " if new_state = #{make_callback(action.state)}\n" + @code << " state = new_state\n" + @code << " states << new_state\n" + @code << " end\n" else raise "I don't know how to evaluate this push state: %p" % [action.state] end - @code << " states << state\n" if action.is_a? Push if action.state == action.group @code << " encoder.begin_group state\n" diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index 4b99a442..8426834f 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -66,17 +66,15 @@ def setup on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label # ::goto_label:: on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) - on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind, state| # Normal letters (or letters followed by digits) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits) # Extra highlighting for entities following certain keywords if kind == :keyword && match == 'function' - state = :function_expected + :function_expected elsif kind == :keyword && match == 'goto' - state = :goto_label_expected + :goto_label_expected elsif kind == :keyword && match == 'local' - state = :local_var_expected + :local_var_expected end - - state } on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? 
:inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace { From 6b80e1efb6a44e3eb647eb396499db17cc6def47 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 15:09:15 +0100 Subject: [PATCH 38/54] remove obsolete flag, fix order of rules --- lib/coderay/scanners/lua2.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index 8426834f..1aba7698 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -82,8 +82,8 @@ def setup on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) - on %r/'/, push(:single_quoted_string, :string), :delimiter, set(:start_delim, :match) # String delimiters " and ' - on %r/"/, push(:double_quoted_string, :string), :delimiter, set(:start_delim, :match) + on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and ' + on %r/'/, push(:single_quoted_string, :string), :delimiter # ↓Prefix hex number ←|→ decimal number on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? 
| \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power # ↓Prefix hex number ←|→ decimal number From dcf73a6b5bae19a6592f4be17105005474cec2d3 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 15:09:28 +0100 Subject: [PATCH 39/54] nicer debug output --- lib/coderay/rule_based_scanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 334c08c4..653b43f0 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -27,7 +27,7 @@ def state *names, &block @first = true instance_eval(&block) @code << " else\n" - @code << " puts 'no match for #{names.map(&:inspect).join(', ')}'\n" if $DEBUG + @code << " puts \"no match for \#{state.inspect} => skip char\"\n" if $DEBUG @code << " encoder.text_token getch, :error\n" @code << " end\n" @code << " \n" From 9526bd86b6420e868bee1da167e40afc11ed1c0b Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 15:11:18 +0100 Subject: [PATCH 40/54] generate scanner code automatically --- lib/coderay/rule_based_scanner.rb | 87 ++++++++++++++++++++++++++++--- lib/coderay/scanners/lua2.rb | 36 +++---------- 2 files changed, 87 insertions(+), 36 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 653b43f0..a22bcc3b 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -1,3 +1,5 @@ +require 'set' + module CodeRay module Scanners class RuleBasedScanner < Scanner @@ -240,26 +242,32 @@ def check_unless value = nil, &callback end def flag_on *flags + flags.each { |name| variables << name } ValueSetter.new Array(flags), true end def flag_off *flags + flags.each { |name| variables << name } ValueSetter.new Array(flags), false end def set flag, value = nil, &callback + variables << flag ValueSetter.new [flag], value || callback end def unset *flags + 
flags.each { |name| variables << name } ValueSetter.new Array(flags), nil end def increment *counters + counters.each { |name| variables << name } Increment.new Array(counters), :+, 1 end def decrement *counters + counters.each { |name| variables << name } Increment.new Array(counters), :-, 1 end @@ -267,33 +275,96 @@ def continue Continue.new end + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval scan_tokens_code + end + protected def callbacks @callbacks ||= {} end + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(state match kind) + end + def make_callback block base_name = "__callback_line_#{block.source_location.last}" - name = base_name + callback_name = base_name counter = 'a' - while callbacks.key?(name) - name = "#{base_name}_#{counter}" + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" counter.succ! end - callbacks[name] = define_method(name, &block) + callbacks[callback_name] = define_method(callback_name, &block) - arguments = block.parameters.map(&:last) + parameters = block.parameters - if arguments.empty? - name + if parameters.empty? + callback_name else - "#{name}(#{arguments.join(', ')})" + parameter_names = parameters.map(&:last) + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" end end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + states = [state] + + until eos? 
+ case state +#{ @code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end end + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, states)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + def setup @state = :initial end diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index 1aba7698..f48627dd 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -139,38 +139,18 @@ def setup # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings end - scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options#{ def_line = __LINE__; nil } - state = options[:state] || @state - brace_depth = @brace_depth - num_equals = nil - - states = [state] - - until eos? + def close_groups encoder, states + states.reverse_each do |state| case state -#{ @code.chomp.gsub(/^/, ' ') } - else - raise_inspect 'Unknown state: %p' % [state], encoder + when :long_string, :single_quoted_string, :double_quoted_string + encoder.end_group :string + when :long_comment + encoder.end_group :long_comment + when :map + encoder.end_group :map end end - - if options[:keep_state] - @state = state - end - - encoder.end_group :string if [:string, :single_quoted_string, :double_quoted_string].include? 
state - brace_depth.times { encoder.end_group :map } - - encoder - end - RUBY - - if ENV['PUTS'] - puts CodeRay.scan(scan_tokens_code, :ruby).terminal - puts "callbacks: #{callbacks.size}" end - class_eval scan_tokens_code, __FILE__, def_line end end From 14339bfc8f58ac883bb5f202411804f03966f462 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 19:22:42 +0100 Subject: [PATCH 41/54] some more variables that are set by the scanner --- lib/coderay/rule_based_scanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index a22bcc3b..3022ec9b 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -295,7 +295,7 @@ def variables end def additional_variables - variables - %i(state match kind) + variables - %i(encoder options state states match kind) end def make_callback block From 90c5c9161f53ee6e8400b3af2e967820ba0addd9 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 14 Feb 2016 19:25:35 +0100 Subject: [PATCH 42/54] remove templates, yay! 
--- lib/coderay/rule_based_scanner.rb | 4 +++ lib/coderay/scanners/c2.rb | 42 ++++++++---------------- lib/coderay/scanners/css2.rb | 42 ++++-------------------- lib/coderay/scanners/java_script5.rb | 48 +++++++--------------------- lib/coderay/scanners/json5.rb | 38 ++-------------------- lib/coderay/scanners/lua2.rb | 18 +++++------ 6 files changed, 46 insertions(+), 146 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 3022ec9b..56edcfcf 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -369,6 +369,10 @@ def setup @state = :initial end + def close_groups encoder, states + # TODO + end + end end end \ No newline at end of file diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb index 7ae382af..3103e549 100644 --- a/lib/coderay/scanners/c2.rb +++ b/lib/coderay/scanners/c2.rb @@ -87,40 +87,24 @@ class C2 < RuleBasedScanner on %r//, pop_state # TODO: add otherwise method for this end - scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options#{ def_line = __LINE__; nil } - state = @state - label_expected = true - case_expected = false - label_expected_before_preproc_line = nil - in_preproc_line = false - - states = [state] - - until eos? - # last_pos = pos - case state -#{ @code.chomp.gsub(/^/, ' ') } - else - raise_inspect 'Unknown state: %p' % [state], encoder - end - - # raise_inspect 'nothing was consumed! 
states = %p' % [states], encoder if pos == last_pos - end + protected + + def setup + super - if state == :string + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string encoder.end_group :string end - - encoder end - RUBY - if ENV['PUTS'] - puts CodeRay.scan(scan_tokens_code, :ruby).terminal - puts "callbacks: #{callbacks.size}" - end - class_eval scan_tokens_code, __FILE__, def_line end + end end diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb index 4b20b793..0c0d4a0a 100644 --- a/lib/coderay/scanners/css2.rb +++ b/lib/coderay/scanners/css2.rb @@ -45,14 +45,6 @@ module RE # :nodoc: AttributeSelector = /(\[)([^\]]+)?(\])?/ end - protected - - def setup - @state = :initial - @value_expected = false - @block = false - end - state :initial do on %r/\s+/, :space @@ -83,36 +75,14 @@ def setup on %r/ [+>~,.=()\/] /x, :operator end - scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options#{ def_line = __LINE__; nil } - states = Array(options[:state] || @state).dup - value_expected = @value_expected - block = @block - state = states.last - - until eos? 
- case state -#{ @code.chomp.gsub(/^/, ' ') } - else - raise_inspect 'Unknown state: %p' % [state], encoder - end - end - - if options[:keep_state] - @state = states - @value_expected = value_expected - @block = block - end - - encoder - end - RUBY + protected - if ENV['PUTS'] - puts CodeRay.scan(scan_tokens_code, :ruby).terminal - puts "callbacks: #{callbacks.size}" + def setup + super + + @value_expected = false + @block = false end - class_eval scan_tokens_code, __FILE__, def_line end diff --git a/lib/coderay/scanners/java_script5.rb b/lib/coderay/scanners/java_script5.rb index bee1b260..9839d23e 100644 --- a/lib/coderay/scanners/java_script5.rb +++ b/lib/coderay/scanners/java_script5.rb @@ -130,48 +130,22 @@ class JavaScript5 < RuleBasedScanner # # encoder.text_token match, :comment if match # end - protected + protected - scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options#{ def_line = __LINE__; nil } - state, string_delimiter = options[:state] || @state - if string_delimiter - encoder.begin_group state - end - - value_expected = true - key_expected = false - function_expected = false - - states = [state] - - until eos? - case state -#{ @code.chomp.gsub(/^/, ' ') } - else - raise_inspect 'Unknown state: %p' % [state], encoder - end - end - - if options[:keep_state] - @state = state, string_delimiter - end - - if [:string, :regexp].include? state - encoder.end_group state - end + def setup + super - encoder + @string_delimiter = nil + @value_expected = true + @key_expected = false + @function_expected = false end - RUBY - if ENV['PUTS'] - puts CodeRay.scan(scan_tokens_code, :ruby).terminal - puts "callbacks: #{callbacks.size}" + def close_groups encoder, states + if [:string, :key, :regexp].include? 
states.last + encoder.end_group states.last + end end - class_eval scan_tokens_code, __FILE__, def_line - - protected def reset_instance super diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb index dcfdd3fe..8b0a8bd9 100644 --- a/lib/coderay/scanners/json5.rb +++ b/lib/coderay/scanners/json5.rb @@ -41,43 +41,11 @@ class JSON5 < RuleBasedScanner on %r/ \\ /x, :error, pop end - protected - - scan_tokens_code = <<-"RUBY" - def scan_tokens encoder, options#{ def_line = __LINE__; nil } - state = options[:state] || @state - - if [:string, :key].include? state - encoder.begin_group state - end - - states = [state] - - until eos? - case state -#{ @code.chomp.gsub(/^/, ' ') } - else - raise_inspect 'Unknown state: %p' % [state], encoder - end - end - - if options[:keep_state] - @state = state - end - - if [:string, :key].include? state - encoder.end_group state + def close_groups encoder, states + if [:string, :key].include? states.last + encoder.end_group states.last end - - encoder - end - RUBY - - if ENV['PUTS'] - puts CodeRay.scan(scan_tokens_code, :ruby).terminal - puts "callbacks: #{callbacks.size}" end - class_eval scan_tokens_code, __FILE__, def_line end diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb index f48627dd..fa20e9b3 100644 --- a/lib/coderay/scanners/lua2.rb +++ b/lib/coderay/scanners/lua2.rb @@ -48,15 +48,6 @@ class Lua2 < RuleBasedScanner add(PREDEFINED_CONSTANTS, :predefined_constant). add(PREDEFINED_EXPRESSIONS, :predefined) - protected - - # Scanner initialization. 
- def setup - super - @brace_depth = 0 - @num_equals = nil - end - state :initial, :map do on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]] set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end @@ -139,6 +130,15 @@ def setup # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings end + protected + + def setup + super + + @brace_depth = 0 + @num_equals = nil + end + def close_groups encoder, states states.reverse_each do |state| case state From 545398fac57b5949359298e1f258e1746adfa3e5 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sat, 4 Jun 2016 14:10:45 +0200 Subject: [PATCH 43/54] update version; this will be CodeRay 2 --- lib/coderay/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coderay/version.rb b/lib/coderay/version.rb index 7ea3f70c..ed87d63a 100644 --- a/lib/coderay/version.rb +++ b/lib/coderay/version.rb @@ -1,3 +1,3 @@ module CodeRay - VERSION = '1.1.1' + VERSION = '2.0.0' end From 548e2d0aea6b4c18a2f3e8203241fcaedb10bc8d Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 15 Jan 2017 18:21:06 +1300 Subject: [PATCH 44/54] default set(:flag) to true --- lib/coderay/rule_based_scanner.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 56edcfcf..0eb92226 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -253,7 +253,7 @@ def flag_off *flags def set flag, value = nil, &callback variables << flag - ValueSetter.new [flag], value || callback + ValueSetter.new [flag], value || callback || true end def unset *flags @@ -375,4 +375,4 @@ def close_groups encoder, states end end -end \ No newline at end of file +end From 7a02cdded08dd232319eae17998abc877efb58cb Mon Sep 17 00:00:00 2001 From: 
Kornelius Kalnbach Date: Sun, 9 Apr 2017 18:38:25 +0200 Subject: [PATCH 45/54] working towards DSL scanner --- lib/coderay.rb | 1 + lib/coderay/scanners/c3.rb | 112 ++++++++ lib/coderay/scanners/c4.rb | 126 +++++++++ lib/coderay/scanners/lua2b.rb | 157 ++++++++++++ lib/coderay/scanners/lua3.rb | 142 +++++++++++ lib/coderay/scanners/lua4.rb | 89 +++++++ lib/coderay/state_based_scanner.rb | 394 +++++++++++++++++++++++++++++ 7 files changed, 1021 insertions(+) create mode 100644 lib/coderay/scanners/c3.rb create mode 100644 lib/coderay/scanners/c4.rb create mode 100644 lib/coderay/scanners/lua2b.rb create mode 100644 lib/coderay/scanners/lua3.rb create mode 100644 lib/coderay/scanners/lua4.rb create mode 100644 lib/coderay/state_based_scanner.rb diff --git a/lib/coderay.rb b/lib/coderay.rb index 5c923f5c..c1c9e344 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -155,6 +155,7 @@ def self.coderay_path *path # DSL Scanner autoload :RuleBasedScanner, coderay_path('rule_based_scanner') + autoload :StateBasedScanner, coderay_path('state_based_scanner') # convenience access and reusable Encoder/Scanner pair autoload :Duo, coderay_path('duo') diff --git a/lib/coderay/scanners/c3.rb b/lib/coderay/scanners/c3.rb new file mode 100644 index 00000000..49555cae --- /dev/null +++ b/lib/coderay/scanners/c3.rb @@ -0,0 +1,112 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C3 < RuleBasedScanner + + register_for :c3 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, unset(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line) + on %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected| + match =~ /[;\{\}]/ || case_expected && match =~ /:/ + }, unset(:case_expected) + + on %r/ (?: case | default ) \b /x, :keyword, set(:case_expected), unset(:label_expected) + on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match| + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + }, set(:label_expected) { |kind| kind == :label } + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, unset(:label_expected) + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? 
/x, :char, unset(:label_expected) + on %r/0[xX][0-9A-Fa-f]+/, :hex, unset(:label_expected) + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, unset(:label_expected) + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, unset(:label_expected) + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, unset(:label_expected) + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop, unset(:label_expected) + on %r/ \\ /x, pop, :error, unset(:label_expected) + on %r/ $ /x, pop, unset(:label_expected) + end + + state :include_expected do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state + on %r/ \s*? \n \s* /x, :space, pop_state + on %r/\s+/, :space + on %r//, pop_state # TODO: add otherwise method for this + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/c4.rb b/lib/coderay/scanners/c4.rb new file mode 100644 index 00000000..ff67e495 --- /dev/null +++ b/lib/coderay/scanners/c4.rb @@ -0,0 +1,126 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C4 < StateBasedScanner + + register_for :c4 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + check in_preproc_line? do + skip %r/ \s*? \n \s* /x, :space do + unset :in_preproc_line + expect :label if label_expected_before_preproc_line? + end + end + + skip %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator do |match, case_expected| + expect :label if match =~ /[;\{\}]/ || expected?(:case) && match =~ /:/ + end + + on %r/ (?: case | default ) \b /x, :keyword do + expect :case + end + + check label_expected?, !in_preproc_line? do + on %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, -> match { + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + } do |kind| + expect :label if kind == :label + end + end + + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, IDENT_KIND + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? 
/x, :char + on %r/0[xX][0-9A-Fa-f]+/, :hex + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float + + skip %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push(:include) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + group_state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop + on %r/ \\ /x, pop, :error + on %r/ $ /x, pop + end + + state :include do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop + on %r/ \s*? \n \s* /x, :space, pop + on %r/\s+/, :space + otherwise pop + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/lua2b.rb b/lib/coderay/scanners/lua2b.rb new file mode 100644 index 00000000..9e2b1fe8 --- /dev/null +++ b/lib/coderay/scanners/lua2b.rb @@ -0,0 +1,157 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua2 < RuleBasedScanner + + register_for :lua2 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. 
+ KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). 
+      add(PREDEFINED_EXPRESSIONS, :predefined)
+
+    state :initial, :map => :map do
+      on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]]
+        set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end
+      on %r/--.*$/, :comment # --Lua comment
+      on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]]
+        set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end
+      on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]*\s*::/, :label # ::goto_label:: (a label name may be a single character, e.g. ::a::)
+      on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua
+      on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator)
+      on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits)
+        # Extra highlighting for entities following certain keywords
+        if kind == :keyword && match == 'function'
+          :function_expected
+        elsif kind == :keyword && match == 'goto'
+          :goto_label_expected
+        elsif kind == :keyword && match == 'local'
+          :local_var_expected
+        end
+      }
+
+      on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace {
+      on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace }
+      on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace
+      on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth)
+
+      on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and '
+      on %r/'/, push(:single_quoted_string, :string), :delimiter
+      # ↓Prefix hex number ←|→ decimal number
+      on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power
+      # ↓Prefix hex number ←|→ decimal number
+      on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power
+      on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators
+      on %r/\s+/, :space # Space
+    end
+
+    state :function_expected do
+      on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name
+      on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator
+      on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo()
+      on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace
+    end
+
+    state :goto_label_expected do
+      on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state
+      on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace
+    end
+
+    state :local_var_expected do
+      on %r/function/, :keyword, pop_state, push_state(:function_expected) # local function ...
+      on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable
+      on %r/,/, :operator
+      on %r/\=/, :operator, pop_state
+      on %r/\n/, :space, pop_state
+      on %r/\s+/, :space
+    end
+
+    state :long_comment => :comment do
+      on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment)
+      on %r/.*/m, :error, pop(:comment)
+    end
+
+    state :long_string => :string do
+      on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences
+      on %r/.*/m, :error, pop(:string)
+    end
+
+    state :single_quoted_string => :string do
+      on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceded by \ or \z)
+      on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+      on %r/'/, :delimiter, pop(:string)
+      on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings
+      # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multiline strings
+    end
+
+    state :double_quoted_string => :string do
+      on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceded by \ or \z)
+      on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+      on %r/"/, :delimiter, pop(:string)
+      on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings
+      # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multiline strings
+    end
+
+    protected
+
+    def setup
+      super
+
+      @brace_depth = 0
+      @num_equals = nil
+    end
+
+    def close_groups encoder, states # closes every group still open at end of input, innermost first
+      states.reverse_each do |state|
+        case state
+        when :long_string, :single_quoted_string, :double_quoted_string
+          encoder.end_group :string
+        when :long_comment # this state's group is :comment (state :long_comment => :comment, pop(:comment))
+          encoder.end_group :comment
+        when :map
+          encoder.end_group :map
+        end
+      end
+    end
+  end
+
+end
+end
diff --git a/lib/coderay/scanners/lua3.rb b/lib/coderay/scanners/lua3.rb
new file mode 100644
index 00000000..d2d42804
--- /dev/null
+++ b/lib/coderay/scanners/lua3.rb
@@ -0,0 +1,142 @@
+# encoding: utf-8
+# Pseudocode: states optionally define groups, comments removed, counter definition?
+
+module CodeRay
+module Scanners
+
+  # Scanner for the Lua[http://lua.org] programming language.
+  #
+  # The language’s complete syntax is defined in
+  # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html],
+  # which is what this scanner tries to conform to.
+  class Lua3 < RuleBasedScannerX
+
+    register_for :lua3
+    file_extension 'lua'
+    title 'Lua'
+
+    # Keywords used in Lua.
+    KEYWORDS = %w[and break do else elseif end
+      for function goto if in
+      local not or repeat return
+      then until while
+    ]
+
+    # Constants set by the Lua core.
+    PREDEFINED_CONSTANTS = %w[false true nil]
+
+    # The expressions contained in this array are parts of Lua’s `basic'
+    # library. Although it’s not entirely necessary to load that library,
+    # it is highly recommended and one would have to provide own implementations
+    # of some of these expressions if one does not do so. They however aren’t
+    # keywords, neither are they constants, but nearly predefined, so they
+    # get tagged as `predefined' rather than anything else.
+    #
+    # This list excludes values of form `_UPPERCASE' because the Lua manual
+    # requires such identifiers to be reserved by Lua anyway and they are
+    # highlighted directly accordingly, without the need for specific
+    # identifiers to be listed here.
+    PREDEFINED_EXPRESSIONS = %w[
+      assert collectgarbage dofile error getmetatable
+      ipairs load loadfile next pairs pcall print
+      rawequal rawget rawlen rawset select setmetatable
+      tonumber tostring type xpcall
+    ]
+
+    # Automatic token kind selection for normal words.
+    IDENT_KIND = CodeRay::WordList.new(:ident).
+ add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(PREDEFINED_EXPRESSIONS, :predefined) + + protected + + # Scanner initialization. + def setup + super + @brace_depth = 0 + @num_equals = nil + end + + counter :brace_depth + + state :initial, :map => :map do + on %r/\-\-\[\=*\[/, push(:long_comment), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/--.*$/, :comment + on %r/\[=*\[/, push(:long_string), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label + on %r/_[A-Z]+/, :predefined + on check_if(:brace_depth, :>, 0), %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) + on check_if(:brace_depth, :==, 1), %r/\}/, :delimiter, pop, decrement(:brace_depth) + on check_if(:brace_depth, :==, 0), %r/\}/, :error + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string), :delimiter + on %r/'/, push(:single_quoted_string), :delimiter + + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float + + on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator + on %r/\s+/, :space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] 
(?!\.\d))* [\.\:]/x, :ident + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop + on %r/\s+/, :space + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop + on %r/\s+/, :space + end + + state :local_var_expected do + on %r/function/, :keyword, pop, push(:function_expected) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop + on %r/\n/, :space, pop + on %r/\s+/, :space + end + + state :long_comment => :comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string => :string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string => :string do + on %r/[^\\'\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + + state :double_quoted_string => :string do + on %r/[^\\"\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + end + +end +end diff --git a/lib/coderay/scanners/lua4.rb b/lib/coderay/scanners/lua4.rb new file mode 100644 index 00000000..0315d34e --- /dev/null +++ b/lib/coderay/scanners/lua4.rb @@ -0,0 +1,89 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. 
+ class Lua4 < RuleBasedScanner + + register_for :lua4 + file_extension 'lua' + title 'Lua' + + protected + + state :initial do + on %r'#!(.*?)$', :doctype + on %r//, push_state(:base) + end + + state :base do + on %r'--\[(=*)\[.*?\]\1\]'m, :comment + on %r'--.*$', :comment + + on %r'(\d*\.\d+|\d+\.\d*)(e[+-]?\d+)?'i, :float + on %r'\d+e[+-]?\d+'i, :float + on %r'0x[0-9a-f]*'i, :hex + on %r'\d+', :integer + + on %r'\n', :space + on %r'[^\S\n]', :space + # multiline strings + on %r'\[(=*)\[.*?\]\1\]'m, :string + + on %r'(==|~=|<=|>=|\.\.\.|\.\.|[=+\-*/%^<>#!.\\:])', :operator + on %r'[\[\]{}().,:;]', :operator + on %r'(and|or|not)\b', :operator + + on %r'(break|do|else|elseif|end|for|if|in|repeat|return|then|until|while)\b', :keyword + on %r'(local)\b', :keyword + on %r'(true|false|nil)\b', :predefined_constant + + on %r'(function)\b', :keyword, push_state(:funcname) + + on %r'[A-Za-z_]\w*(\.[A-Za-z_]\w*)?', :ident + + # on %r"'", :string, combined(:stringescape, :sqs) + on %r"'", :string, push_state(:sqs) + # on %r'"', :string, combined(:stringescape, :dqs) + on %r'"', :string, push_state(:dqs) + end + + state :funcname do + on %r'\s+', :space + on %r'(?:([A-Za-z_]\w*)(\.))?([A-Za-z_]\w*)', groups(:class, :operator, :function), pop_state + # inline function + on %r'\(', :operator, pop_state + end + + # if I understand correctly, every character is valid in a lua string, + # so this state is only for later corrections + # state :string do + # on %r'.', :string + # end + + # state :stringescape do + # on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + # end + + state :sqs do + on %r"'", :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + + state :dqs do + on %r'"', :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + end + +end +end diff --git a/lib/coderay/state_based_scanner.rb b/lib/coderay/state_based_scanner.rb new file mode 100644 index 
00000000..b196adc9 --- /dev/null +++ b/lib/coderay/state_based_scanner.rb @@ -0,0 +1,394 @@ +require 'set' + +module CodeRay + module Scanners + class StateBasedScanner < Scanner + class State + attr_reader :names + attr_reader :rules + attr_reader :scanner + + def initialize scanner, names, &block + @scanner = scanner + @names = names + + @rules = [] + @check = nil + + instance_eval(&block) + end + + def rules_code + <<-RUBY +when #{names.map(&:inspect).join(', ')} +#{rules.map.with_index { |rule, index| rule.code(first: index.zero?) }.join} + else + puts "no match for \#{state.inspect} => skip character" if $DEBUG + encoder.text_token getch, :error + end + + RUBY + end + + protected + + # structure + def check *conditions, &block + return @check unless conditions.any? || block + raise "Can't nest check yet" if @check + + @check = Conditions.new(conditions) + instance_eval(&block) + @check = nil + end + + # rules + def on pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, &block) + end + + def skip pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, skip: true, &block) + end + + def otherwise *actions, &block + @rules << Rule.new(self, //, *actions, check: @check, skip: true, &block) + end + + # actions + def push state + Push.new(state) + end + + def pop + Pop.new + end + + def kind token_kind = nil, &block + Kind.new token_kind || scanner.callback(block) + end + + def groups *token_kinds + Groups.new(token_kinds) + end + + def set target, value = nil, &block + Setter.new target, value || block || true + end + + def callback block + scanner.callback(block) + end + + # magic flag getters + def method_missing method, *args, &block + method_name = method.to_s + if method_name.end_with?('?') + Getter.new(scanner.variable(method_name.chomp('?'))) + else + super + end + end + end + + class GroupState < State + end + + class Rule + attr_reader :pattern + attr_reader :actions + attr_reader :check + 
+        attr_reader :state
+
+        def initialize state, pattern, *actions, check:, skip: false, &block
+          @state = state
+          @pattern = (skip ? Skip : Scan).new(pattern)
+          @actions = *build_actions(actions, block)
+          @check = check
+
+          raise [pattern, *actions, check, skip, block].inspect if check == false
+        end
+
+        def code first: # Ruby source for this rule: an if/elsif header followed by the indented action code
+          <<-RUBI
+  #{'els' unless first}if #{condition_expression}
+#{actions_code.gsub(/^/, ' ' * 2)}
+          RUBI
+        end
+
+        def skip?
+          @pattern.is_a?(Skip)
+        end
+
+        protected
+
+        def condition_expression # optional check conditions first, then the scan itself, joined with &&
+          [check, pattern].compact.map(&:code).join(' && ')
+        end
+
+        def actions_code
+          actions.map(&:code).join("\n")
+        end
+
+        def build_actions actions, block # normalizes Symbols, Procs and WordLists into action objects; a trailing block counts as an action
+          actions += [block] if block
+
+          actions.map do |action|
+            case action
+            when Symbol
+              Token.new(action)
+            when Proc
+              state.instance_eval do
+                callback action
+              end
+            when WordList
+              state.instance_eval do
+                kind { |match| action[match] }
+              end
+            when Push, Pop, Groups, Kind, Setter
+              action
+            else
+              raise "Don't know how to build action for %p (%p)" % [action, action.class]
+            end
+          end
+        end
+      end
+
+      # conditions
+      class Conditions < Struct.new(:conditions)
+        def code
+          "#{conditions.map(&:code).join(' && ')}"
+        end
+      end
+
+      class Scan < Struct.new(:pattern)
+        def code
+          "match = scan(#{pattern.inspect})"
+        end
+      end
+
+      class Skip < Scan
+      end
+
+      class Getter < Struct.new(:name, :negative) # a flag read; the :negative member marks a negated read
+        def code
+          "#{'!' if self[:negative]}#{name}" # read the Struct member directly so a plain getter is not emitted negated
+        end
+
+        def !@ # `!some_flag?` in the DSL yields the opposite getter
+          negated
+        end
+
+        protected
+
+        def negated # memoized counterpart with the :negative member flipped
+          @negated ||= Getter.new(name, !self[:negative])
+        end
+      end
+
+      # actions
+      class Push < Struct.new :state
+        def code
+          "push"
+        end
+      end
+
+      class Pop < Class.new
+        def code
+          "pop"
+        end
+      end
+
+      class Groups < Struct.new(:token_kinds)
+        def code
+          "groups"
+        end
+      end
+
+      class Setter < Struct.new(:name, :value)
+        def code
+          "set"
+        end
+      end
+
+
+      class Kind < Struct.new(:token_kind)
+        def code
+          case token_kind
+          when Callback
+            "encoder.text_token match, kind = #{token_kind.code}\n"
+          else
+            raise "I don't know how to evaluate this kind: %p" % [token_kind]
+          end
+        end
+      end
+
+      class Token < Struct.new(:name)
+        def code
+          "encoder.text_token match, #{name.inspect}"
+        end
+      end
+
+      class Callback < Struct.new(:name, :block)
+        def code # call expression: the bare method name, or name(args) using the block's parameter names
+          if parameter_names.empty?
+            name
+          else
+            "#{name}(#{parameter_names.join(', ')})"
+          end
+        end
+
+        protected
+
+        def parameter_names
+          block.parameters.map(&:last)
+        end
+      end
+
+      class << self
+        def states
+          @states ||= {}
+        end
+
+        def scan_tokens tokens, options # FIXME(review): class-level copy of the instance method; self.class is Class here, so this cannot work
+          self.class.define_scan_tokens!
+
+          scan_tokens tokens, options
+        end
+
+        def define_scan_tokens! # compiles the collected states into a scan_tokens instance method
+          if ENV['PUTS']
+            puts CodeRay.scan(scan_tokens_code, :ruby).terminal
+            puts "callbacks: #{callbacks.size}"
+          end
+
+          class_eval scan_tokens_code
+        end
+
+        def variable name # registers a variable name and returns the name unchanged
+          variables << name.to_sym
+
+          name
+        end
+
+        def callback block # defines the block as a uniquely named method and registers its parameter names as variables
+          return unless block
+
+          callback_name = name_for_callback(block)
+          callbacks[callback_name] = define_method(callback_name, &block)
+          block.parameters.map(&:last).each { |name| variable name }
+
+          Callback.new(callback_name, block)
+        end
+
+        protected
+
+        def state *names, state_class: State, &block
+          state_class.new(self, names, &block).tap do |state|
+            for name in names
+              states[name] = state
+            end
+          end
+        end
+
+        def group_state *names, &block
+          state(*names, state_class: GroupState, &block)
+        end
+
+        def callbacks
+          @callbacks ||= {}
+        end
+
+        def variables
+          @variables ||= Set.new
+        end
+
+        def additional_variables
+          variables - %i(encoder options state states match kind)
+        end
+
+        def name_for_callback block # name derived from the block's source line; a letter suffix resolves collisions
+          base_name = "__callback_line_#{block.source_location.last}"
+          callback_name = base_name
+          counter = 'a'
+
+          while callbacks.key?(callback_name)
+            callback_name = "#{base_name}_#{counter}"
+            counter.succ!
+ end + + callback_name + end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + states = [state] + + until eos? + case state +#{ states_code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + end + + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def states_code + states.values.map(&:rules_code).join + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, states)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + reset_expectations + end + + def close_groups encoder, states + # TODO + end + + def expect kind + @expected = kind + end + + def expected? 
kind + @expected == kind + end + + def reset_expectations + @expected = nil + end + end + end +end From 1bdaeef6c43436e4984f5b96cb17618f82832225 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 9 Apr 2017 18:38:53 +0200 Subject: [PATCH 46/54] starting with SimpleScanner --- lib/coderay/simple_scanner.rb | 40 ++++ lib/coderay/simple_scanner_dsl.rb | 381 ++++++++++++++++++++++++++++++ spec/simple_scanner_spec.rb | 28 +++ spec/spec_helper.rb | 96 ++++++++ 4 files changed, 545 insertions(+) create mode 100644 lib/coderay/simple_scanner.rb create mode 100644 lib/coderay/simple_scanner_dsl.rb create mode 100644 spec/simple_scanner_spec.rb create mode 100644 spec/spec_helper.rb diff --git a/lib/coderay/simple_scanner.rb b/lib/coderay/simple_scanner.rb new file mode 100644 index 00000000..6873f884 --- /dev/null +++ b/lib/coderay/simple_scanner.rb @@ -0,0 +1,40 @@ +require 'set' + +module CodeRay + module Scanners + class SimpleScanner < Scanner + extend SimpleScannerDSL + + class << self + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval <<-RUBY +def scan_tokens encoder, options +#{ scan_tokens_code.chomp.gsub(/^/, ' ' * 2) } +end + RUBY + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! 
+ + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + end + + def close_groups encoder, states + # TODO + end + end + end +end \ No newline at end of file diff --git a/lib/coderay/simple_scanner_dsl.rb b/lib/coderay/simple_scanner_dsl.rb new file mode 100644 index 00000000..b3c8c57c --- /dev/null +++ b/lib/coderay/simple_scanner_dsl.rb @@ -0,0 +1,381 @@ +require 'set' + +module CodeRay + module Scanners + module SimpleScannerDSL + Pattern = Struct.new :pattern + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state, :group + Pop = Struct.new :group + PushState = Struct.new :state + PopState = Class.new + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check + ValueSetter = Struct.new :targets, :value + Increment = Struct.new :targets, :operation, :value + Continue = Class.new + + State = Struct.new :names, :block, :dsl do + def initialize(*) + super + eval + end + + def eval + @first = true + + @code = "" + instance_eval(&block) + end + + def code + <<-RUBY +when #{names.map(&:inspect).join(', ')} +#{ rules_code.chomp.gsub(/^/, ' ') } + else +#{ handle_unexpected_char_code.chomp.gsub(/^/, ' ' * 2) } + end + RUBY + end + + protected + + def rules_code + @code + end + + def handle_unexpected_char_code + ''.tap do |code| + code << 'puts "no match for #{state.inspect} => skip char"' << "\n" if $DEBUG + code << 'encoder.text_token getch, :error' + end + end + + public + + def on? pattern + pattern_expression = pattern.inspect + @code << "#{'els' unless @first}if check(#{pattern_expression})\n" + + @first = true + yield + @code << "end\n" + + @first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) } + conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!' 
+ actions = pattern_and_actions[index..-1] or raise 'I need actions!' + else + raise "invalid rule structure: #{pattern_and_actions.map(&:class)}" + end + + condition_expressions = [] + if conditions + for condition in conditions + case condition + when CheckIf + case condition.condition + when Proc + condition_expressions << "#{dsl.add_callback(condition.condition)}" + when Symbol + condition_expressions << "#{condition.condition}" + else + raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition] + end + when CheckUnless + case condition.condition + when Proc + condition_expressions << "!#{dsl.add_callback(condition.condition)}" + when Symbol + condition_expressions << "!#{condition.condition}" + else + raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition] + end + when Pattern + case condition.pattern + when Proc + condition_expressions << "match = scan(#{dsl.add_callback(condition.pattern)})" + else + raise "I don't know how to evaluate this pattern: %p" % [condition.pattern] + end + when Regexp + condition_expressions << "match = scan(#{condition.inspect})" + else + raise "I don't know how to evaluate this pattern/condition: %p" % [condition] + end + end + end + + @code << "#{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + + for action in actions + case action + when String + raise + @code << "p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << "#{action}\n" + + when Symbol + @code << "p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @code << "encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @code << "encoder.text_token match, kind = #{dsl.add_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @code << "p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index 
do |kind, i| + @code << "encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + end + + when Push, PushState + case action.state + when String + raise + @code << "p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << "state = #{action.state}\n" + @code << "states << state\n" + when Symbol + @code << "p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << "state = #{action.state.inspect}\n" + @code << "states << state\n" + when Proc + @code << "if new_state = #{dsl.add_callback(action.state)}\n" + @code << " state = new_state\n" + @code << " states << new_state\n" + @code << "end\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + if action.is_a? Push + if action.state == action.group + @code << "encoder.begin_group state\n" + else + case action.state + when Symbol + @code << "p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + @code << "encoder.begin_group #{action.group.inspect}\n" + when Proc + @code << "encoder.begin_group #{dsl.add_callback(action.group)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + end + end + when Pop, PopState + @code << "p 'pop %p' % [states.last]\n" if $DEBUG + if action.is_a? 
Pop + if action.group + case action.group + when Symbol + @code << "encoder.end_group #{action.group.inspect}\n" + else + raise "I don't know how to evaluate this pop group: %p" % [action.group] + end + @code << "states.pop\n" + else + @code << "encoder.end_group states.pop\n" + end + else + @code << "states.pop\n" + end + @code << "state = states.last\n" + + when ValueSetter + case action.value + when Proc + @code << "#{action.targets.join(' = ')} = #{dsl.add_callback(action.value)}\n" + when Symbol + @code << "#{action.targets.join(' = ')} = #{action.value}\n" + else + @code << "#{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Increment + case action.value + when Proc + @code << "#{action.targets.join(' = ')} #{action.operation}= #{dsl.add_callback(action.value)}\n" + when Symbol + @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + else + @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + end + + when Proc + @code << "#{dsl.add_callback(action)}\n" + + when Continue + @code << "next\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def pattern pattern = nil, &block + Pattern.new pattern || block + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, group = state, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block, group || block + end + + def pop group = nil + Pop.new group + end + + def push_state state = nil, &block + raise 'push_state requires a state or a block; got nothing' unless state || block + PushState.new state || block + end + + def pop_state + PopState.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def check_unless value = nil, &callback + CheckUnless.new value || callback + 
end + + def flag_on *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), true + end + + def flag_off *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + dsl.add_variable flag + ValueSetter.new [flag], value || callback + end + + def unset *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), nil + end + + def increment *counters + counters.each { |name| dsl.add_variable name } + Increment.new Array(counters), :+, 1 + end + + def decrement *counters + counters.each { |name| dsl.add_variable name } + Increment.new Array(counters), :-, 1 + end + + def continue + Continue.new + end + end + + attr_accessor :states + + def state *names, &block + @states ||= [] + @states << State.new(names, block, self) + end + + def add_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! + end + + callbacks[callback_name] = define_method(callback_name, &block) + + parameters = block.parameters + + if parameters.empty? + callback_name + else + parameter_names = parameters.map(&:last) + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" + end + end + + def add_variable name + variables << name + end + + protected + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def scan_tokens_code + <<-"RUBY" +state = options[:state] || @state +states = [state] +#{ restore_local_variables_code.chomp } + +until eos? 
+ case state +#{ states_code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end +end + +@state = state if options[:keep_state] + +#{ close_groups_code.chomp } + +encoder + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def states_code + @states.map(&:code)[0,1].join + end + + def close_groups_code + 'close_groups(encoder, states)' + end + end + end +end \ No newline at end of file diff --git a/spec/simple_scanner_spec.rb b/spec/simple_scanner_spec.rb new file mode 100644 index 00000000..088343cb --- /dev/null +++ b/spec/simple_scanner_spec.rb @@ -0,0 +1,28 @@ +RSpec.describe CodeRay::Scanners::SimpleScanner do + let(:scanner) { Class.new described_class } + + describe '#scan_tokens_code' do + subject { scanner.send :scan_tokens_code } + it 'lets you define states' do + is_expected.to eq <<-RUBY +state = options[:state] || @state +states = [state] + + +until eos? + case state + + else + raise_inspect 'Unknown state: %p' % [state], encoder + end +end + +@state = state if options[:keep_state] + +close_groups(encoder, states) + +encoder + RUBY + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..49b6a0ec --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,96 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. 
Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# The `.rspec` file also contains a few flags that are not defaults but that +# users commonly want. +# +# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # These two settings work together to allow you to limit a spec run + # to individual examples or groups you care about by tagging them with + # `:focus` metadata. When nothing is tagged with `:focus`, all examples + # get run. + config.filter_run :focus + config.run_all_when_everything_filtered = true + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. 
+ config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ + # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ + # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. 
+ Kernel.srand config.seed +end + +$LOAD_PATH << 'lib/coderay' + +require 'coderay' From e101dbe2d2887a7d158672c86171d59a70027db6 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Fri, 3 Nov 2017 00:37:54 +0100 Subject: [PATCH 47/54] normalize class names --- lib/coderay/scanners/java_script4.rb | 4 ++-- lib/coderay/scanners/json2.rb | 4 ++-- lib/coderay/scanners/json3.rb | 4 ++-- lib/coderay/scanners/json4.rb | 4 ++-- lib/coderay/scanners/lua3.rb | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/coderay/scanners/java_script4.rb b/lib/coderay/scanners/java_script4.rb index 4b9601f3..7899a8db 100644 --- a/lib/coderay/scanners/java_script4.rb +++ b/lib/coderay/scanners/java_script4.rb @@ -2,7 +2,7 @@ module CodeRay module Scanners - class RuleBasedScanner5 < Scanner + class JavaScript4RuleBasedScanner < Scanner CheckIf = Struct.new :condition @@ -173,7 +173,7 @@ def make_callback block # Scanner for JavaScript. # # Aliases: +ecmascript+, +ecma_script+, +javascript+ - class JavaScript4 < RuleBasedScanner5 + class JavaScript4 < JavaScript4RuleBasedScanner register_for :java_script4 file_extension 'js' diff --git a/lib/coderay/scanners/json2.rb b/lib/coderay/scanners/json2.rb index 6d7adc82..51df7826 100644 --- a/lib/coderay/scanners/json2.rb +++ b/lib/coderay/scanners/json2.rb @@ -1,7 +1,7 @@ module CodeRay module Scanners - class RuleBasedScanner2 < Scanner + class JSON2RuleBasedScanner < Scanner class << self attr_accessor :states @@ -34,7 +34,7 @@ def pop_group end # Scanner for JSON (JavaScript Object Notation). 
- class JSON2 < RuleBasedScanner2 + class JSON2 < JSON2RuleBasedScanner register_for :json2 file_extension 'json' diff --git a/lib/coderay/scanners/json3.rb b/lib/coderay/scanners/json3.rb index cf0c1f02..e05feb4e 100644 --- a/lib/coderay/scanners/json3.rb +++ b/lib/coderay/scanners/json3.rb @@ -1,7 +1,7 @@ module CodeRay module Scanners - class RuleBasedScanner3 < Scanner + class JSON3RuleBasedScanner < Scanner class << self attr_accessor :states @@ -56,7 +56,7 @@ def pop_group end # Scanner for JSON (JavaScript Object Notation). - class JSON3 < RuleBasedScanner3 + class JSON3 < JSON3RuleBasedScanner register_for :json3 file_extension 'json' diff --git a/lib/coderay/scanners/json4.rb b/lib/coderay/scanners/json4.rb index 5cb3afbd..38d71e35 100644 --- a/lib/coderay/scanners/json4.rb +++ b/lib/coderay/scanners/json4.rb @@ -1,7 +1,7 @@ module CodeRay module Scanners - class RuleBasedScanner4 < Scanner + class JSON4RuleBasedScanner < Scanner class << self attr_accessor :states @@ -56,7 +56,7 @@ def pop end # Scanner for JSON (JavaScript Object Notation). - class JSON4 < RuleBasedScanner4 + class JSON4 < JSON4RuleBasedScanner register_for :json4 file_extension 'json' diff --git a/lib/coderay/scanners/lua3.rb b/lib/coderay/scanners/lua3.rb index d2d42804..f0693bb3 100644 --- a/lib/coderay/scanners/lua3.rb +++ b/lib/coderay/scanners/lua3.rb @@ -9,7 +9,7 @@ module Scanners # The language’s complete syntax is defined in # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], # which is what this scanner tries to conform to. 
- class Lua3 < RuleBasedScannerX + class Lua3 < RuleBasedScanner register_for :lua3 file_extension 'lua' From e69b878287a292b93cc669b9b95fadff68a4fa35 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Fri, 3 Nov 2017 00:41:01 +0100 Subject: [PATCH 48/54] sort .gitignore, add spec/example.txt --- .gitignore | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index e97fe08a..4d962c0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,12 @@ .* bench/example.* coverage -pkg -spec/reports doc Gemfile.lock +old-stuff +pkg +spec/examples.txt +spec/reports test/executable/source.rb.html test/executable/source.rb.json test/scanners -old-stuff From 579c00bab003658e1bffd989ab15719a188cb7b0 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 5 Nov 2017 16:13:51 +0100 Subject: [PATCH 49/54] testing SingleStateRuleBasedScanner; not faster :( --- lib/coderay.rb | 1 + lib/coderay/scanners/_map.rb | 1 + lib/coderay/scanners/java_script6.rb | 162 ++++++++ .../single_state_rule_based_scanner.rb | 370 ++++++++++++++++++ rake_tasks/test.rake | 2 +- 5 files changed, 535 insertions(+), 1 deletion(-) create mode 100644 lib/coderay/scanners/java_script6.rb create mode 100644 lib/coderay/single_state_rule_based_scanner.rb diff --git a/lib/coderay.rb b/lib/coderay.rb index c1c9e344..e43f6bb5 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -155,6 +155,7 @@ def self.coderay_path *path # DSL Scanner autoload :RuleBasedScanner, coderay_path('rule_based_scanner') + autoload :SingleStateRuleBasedScanner, coderay_path('single_state_rule_based_scanner') autoload :StateBasedScanner, coderay_path('state_based_scanner') # convenience access and reusable Encoder/Scanner pair diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index 61079d53..82fb17f5 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -15,6 +15,7 @@ module Scanners :javascript3 => :java_script3, :javascript4 => 
:java_script4, :javascript5 => :java_script5, + :javascript6 => :java_script6, :js => :java_script, :pascal => :delphi, :patch => :diff, diff --git a/lib/coderay/scanners/java_script6.rb b/lib/coderay/scanners/java_script6.rb new file mode 100644 index 00000000..b745bd4b --- /dev/null +++ b/lib/coderay/scanners/java_script6.rb @@ -0,0 +1,162 @@ +# TODO: string_delimiter should be part of the state: push(:regexp, '/'), check_if -> (state, delimiter) { … } +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript6 < SingleStateRuleBasedScanner + + register_for :java_script6 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). 
+ add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + state :initial do + on %r/ \s+ | \\\n /x, :space, set(:value_expected) { |match, value_expected| value_expected || match.index(?\n) } + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, flag_off(:value_expected) + # state = :open_multi_line_comment if self[1] + + on? %r/\.?\d/ do + on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:key_expected, :value_expected) + on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:key_expected, :value_expected) + on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:key_expected, :value_expected) + on %r/\d+/, :integer, flag_off(:key_expected, :value_expected) + end + + on check_if(:value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + end, flag_off(:value_expected) + + on %r/ [-+*=<>?:;,!&^|(\[{~%]++ (??:;,!&^|(\[{~%]*+ (?<=[{,]) /x, :operator, flag_on(:value_expected, :key_expected), flag_off(:function_expected) + on %r/ [)\]}]+ /x, :operator, flag_off(:function_expected, :key_expected, :value_expected) + + on %r/ function (?![A-Za-z_0-9$]) /x, :keyword, flag_on(:function_expected), flag_off(:key_expected, :value_expected) + on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, kind { |match, function_expected, key_expected| + kind = IDENT_KIND[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif function_expected + kind = 
:function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif key_expected && check(/\s*:/) + kind = :key + end + end + + kind + }, flag_off(:function_expected, :key_expected), set(:value_expected) { |match| KEYWORDS_EXPECTING_VALUE[match] } + + on %r/["']/, push { |match, key_expected| key_expected && check(KEY_CHECK_PATTERN[match]) ? :key : :string }, :delimiter, set(:string_delimiter) { |match| match } + on check_if(:value_expected), %r/\//, push(:regexp), :delimiter + + on %r/\//, :operator, flag_on(:value_expected), flag_off(:key_expected) + end + + state :string, :key do + on pattern { |string_delimiter| STRING_CONTENT_PATTERN[string_delimiter] }, :content + on %r/["']/, :delimiter, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, kind { |match, string_delimiter| + string_delimiter == "'" && !(match == "\\\\" || match == "\\'") ? :content : :char + } + on %r/ \\. /mx, :content + on %r/ \\ /x, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop, :error + end + + state :regexp do + on STRING_CONTENT_PATTERN['/'], :content + on %r/(\/)([gim]+)?/, groups(:delimiter, :modifier), flag_off(:key_expected, :value_expected), pop + on %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + on %r/\\./m, :content + on %r/ \\ /x, pop, :error, flag_off(:key_expected, :value_expected) + end + + # state :open_multi_line_comment do + # on %r! .*? \*/ !mx, :initial # don't consume! + # on %r/ .+ /mx, :comment, -> { value_expected = true } + # + # # if match = scan(%r! .*? \*/ !mx) + # # state = :initial + # # else + # # match = scan(%r! 
.+ !mx) + # # end + # # value_expected = true + # # encoder.text_token match, :comment if match + # end + + protected + + def setup + super + + @string_delimiter = nil + @value_expected = true + @key_expected = false + @function_expected = false + end + + def close_groups encoder, state + if [:string, :key, :regexp].include? state + encoder.end_group state + end + end + + def reset_instance + super + @xml_scanner.reset if defined? @xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/single_state_rule_based_scanner.rb b/lib/coderay/single_state_rule_based_scanner.rb new file mode 100644 index 00000000..cd8d4a4e --- /dev/null +++ b/lib/coderay/single_state_rule_based_scanner.rb @@ -0,0 +1,370 @@ +require 'set' + +module CodeRay + module Scanners + class SingleStateRuleBasedScanner < Scanner + + Pattern = Struct.new :pattern + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state, :group + Pop = Struct.new :group + PushState = Struct.new :state + PopState = Class.new + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check + ValueSetter = Struct.new :targets, :value + Increment = Struct.new :targets, :operation, :value + Continue = Class.new + + class << self + attr_accessor :states + + def state *names, &block + @code ||= "" + + @code << "when #{names.map(&:inspect).join(', ')}\n" + + @first = true + instance_eval(&block) + @code << " else\n" + @code << " puts \"no match for \#{state.inspect} => skip char\"\n" if $DEBUG + @code << " encoder.text_token getch, :error\n" + @code << " end\n" + @code << " \n" + end + + def on? 
pattern + pattern_expression = pattern.inspect + @code << " #{'els' unless @first}if check(#{pattern_expression})\n" + + @first = true + yield + @code << " end\n" + + @first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) } + conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!' + actions = pattern_and_actions[index..-1] or raise 'I need actions!' + else + raise "invalid rule structure: #{pattern_and_actions.map(&:class)}" + end + + condition_expressions = [] + if conditions + for condition in conditions + case condition + when CheckIf + case condition.condition + when Proc + condition_expressions << "#{make_callback(condition.condition)}" + when Symbol + condition_expressions << "#{condition.condition}" + else + raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition] + end + when CheckUnless + case condition.condition + when Proc + condition_expressions << "!#{make_callback(condition.condition)}" + when Symbol + condition_expressions << "!#{condition.condition}" + else + raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition] + end + when Pattern + case condition.pattern + when Proc + condition_expressions << "match = scan(#{make_callback(condition.pattern)})" + else + raise "I don't know how to evaluate this pattern: %p" % [condition.pattern] + end + when Regexp + condition_expressions << "match = scan(#{condition.inspect})" + else + raise "I don't know how to evaluate this pattern/condition: %p" % [condition] + end + end + end + + @code << " #{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + + for action in actions + case action + when String + raise + @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << " #{action}\n" + + when Symbol + @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + 
@code << " encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @code << " encoder.text_token match, kind = #{make_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index do |kind, i| + @code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + end + + when Push, PushState + case action.state + when String + raise + @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << " state = #{action.state}\n" + when Symbol + @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << " state = #{action.state.inspect}\n" + when Proc + @code << " if new_state = #{make_callback(action.state)}\n" + @code << " state = new_state\n" + @code << " end\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + if action.is_a? Push + if action.state == action.group + @code << " encoder.begin_group state\n" + else + case action.state + when Symbol + @code << " p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + @code << " encoder.begin_group #{action.group.inspect}\n" + when Proc + @code << " encoder.begin_group #{make_callback(action.group)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + end + end + when Pop, PopState + @code << " p 'pop %p' % [state]\n" if $DEBUG + if action.is_a? 
Pop + if action.group + case action.group + when Symbol + @code << " encoder.end_group #{action.group.inspect}\n" + else + raise "I don't know how to evaluate this pop group: %p" % [action.group] + end + else + @code << " encoder.end_group state\n" + end + end + @code << " state = :initial\n" + + when ValueSetter + case action.value + when Proc + @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} = #{action.value}\n" + else + @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Increment + case action.value + when Proc + @code << " #{action.targets.join(' = ')} #{action.operation}= #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + else + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + end + + when Proc + @code << " #{make_callback(action)}\n" + + when Continue + @code << " next\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def pattern pattern = nil, &block + Pattern.new pattern || block + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, group = state, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block, group || block + end + + def pop group = nil + Pop.new group + end + + def push_state state = nil, &block + raise 'push_state requires a state or a block; got nothing' unless state || block + PushState.new state || block + end + + def pop_state + PopState.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def check_unless value = nil, &callback + CheckUnless.new value || callback + end + + def flag_on *flags + flags.each { |name| variables << name 
} + ValueSetter.new Array(flags), true + end + + def flag_off *flags + flags.each { |name| variables << name } + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + variables << flag + ValueSetter.new [flag], value || callback || true + end + + def unset *flags + flags.each { |name| variables << name } + ValueSetter.new Array(flags), nil + end + + def increment *counters + counters.each { |name| variables << name } + Increment.new Array(counters), :+, 1 + end + + def decrement *counters + counters.each { |name| variables << name } + Increment.new Array(counters), :-, 1 + end + + def continue + Continue.new + end + + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval scan_tokens_code + end + + protected + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state match kind) + end + + def make_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! + end + + callbacks[callback_name] = define_method(callback_name, &block) + + parameters = block.parameters + + if parameters.empty? + callback_name + else + parameter_names = parameters.map(&:last) + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" + end + end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + until eos? 
+ case state +#{ @code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + end + + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, state)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + end + + def close_groups encoder, state + # TODO + end + + end + end +end diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake index 58e6daa2..6468790d 100644 --- a/rake_tasks/test.rake +++ b/rake_tasks/test.rake @@ -48,7 +48,7 @@ Please rename or remove it and run again to use the GitHub repository: task lang => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}" end - (1..5).each do |i| + (1..6).each do |i| task "#{lang}:#{i}" => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}:#{i}" end From 738aae54b580d34966ba72f3465513185d385c1c Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 5 Nov 2017 17:58:59 +0100 Subject: [PATCH 50/54] fix autoloading of DSL scanners --- lib/coderay.rb | 5 ----- lib/coderay/scanners.rb | 6 ++++++ lib/coderay/simple_scanner.rb | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/coderay.rb b/lib/coderay.rb index e43f6bb5..c3de20b5 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -153,11 +153,6 @@ def self.coderay_path *path autoload :Encoders, coderay_path('encoders') autoload :Styles, coderay_path('styles') - # DSL Scanner - autoload :RuleBasedScanner, coderay_path('rule_based_scanner') - autoload :SingleStateRuleBasedScanner, coderay_path('single_state_rule_based_scanner') - autoload :StateBasedScanner, coderay_path('state_based_scanner') - # convenience access and reusable 
Encoder/Scanner pair autoload :Duo, coderay_path('duo') diff --git a/lib/coderay/scanners.rb b/lib/coderay/scanners.rb index 3c7e594d..0935458d 100644 --- a/lib/coderay/scanners.rb +++ b/lib/coderay/scanners.rb @@ -21,6 +21,12 @@ module Scanners plugin_path File.dirname(__FILE__), 'scanners' autoload :Scanner, CodeRay.coderay_path('scanners', 'scanner') + + # DSL Scanners + autoload :RuleBasedScanner, CodeRay.coderay_path('rule_based_scanner') + autoload :SingleStateRuleBasedScanner, CodeRay.coderay_path('single_state_rule_based_scanner') + autoload :StateBasedScanner, CodeRay.coderay_path('state_based_scanner') + autoload :SimpleScanner, CodeRay.coderay_path('simple_scanner') end diff --git a/lib/coderay/simple_scanner.rb b/lib/coderay/simple_scanner.rb index 6873f884..96a0e511 100644 --- a/lib/coderay/simple_scanner.rb +++ b/lib/coderay/simple_scanner.rb @@ -1,4 +1,5 @@ require 'set' +require 'coderay/simple_scanner_dsl' module CodeRay module Scanners From 465e6c3ca2d9c23043c6e5b89f637c284e4ef722 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 5 Nov 2017 17:59:15 +0100 Subject: [PATCH 51/54] remove obsolete "protected" --- lib/coderay/rule_based_scanner.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb index 0eb92226..834e1fba 100644 --- a/lib/coderay/rule_based_scanner.rb +++ b/lib/coderay/rule_based_scanner.rb @@ -363,8 +363,6 @@ def scan_tokens tokens, options scan_tokens tokens, options end - protected - def setup @state = :initial end From a9e04e1f52f6530cbf4b5251abcf6f1f82840ce9 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 5 Nov 2017 17:59:45 +0100 Subject: [PATCH 52/54] fix specs, update SimpleScannerDSL --- lib/coderay/simple_scanner.rb | 2 +- lib/coderay/simple_scanner_dsl.rb | 93 ++++++++++++++++--------------- spec/simple_scanner_spec.rb | 33 +++++++++-- spec/spec_helper.rb | 2 - 4 files changed, 77 insertions(+), 53 deletions(-) diff --git 
a/lib/coderay/simple_scanner.rb b/lib/coderay/simple_scanner.rb index 96a0e511..2ad5fe3a 100644 --- a/lib/coderay/simple_scanner.rb +++ b/lib/coderay/simple_scanner.rb @@ -15,7 +15,7 @@ def define_scan_tokens! class_eval <<-RUBY def scan_tokens encoder, options -#{ scan_tokens_code.chomp.gsub(/^/, ' ' * 2) } +#{ scan_tokens_code.chomp.gsub(/^/, ' ') } end RUBY end diff --git a/lib/coderay/simple_scanner_dsl.rb b/lib/coderay/simple_scanner_dsl.rb index b3c8c57c..e954ccf6 100644 --- a/lib/coderay/simple_scanner_dsl.rb +++ b/lib/coderay/simple_scanner_dsl.rb @@ -3,6 +3,8 @@ module CodeRay module Scanners module SimpleScannerDSL + NoStatesError = Class.new StandardError + Pattern = Struct.new :pattern Groups = Struct.new :token_kinds Kind = Struct.new :token_kind @@ -26,7 +28,7 @@ def initialize(*) def eval @first = true - @code = "" + @code = '' instance_eval(&block) end @@ -111,115 +113,119 @@ def on *pattern_and_actions end end - @code << "#{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + condition_code = "#{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + action_code = '' for action in actions case action when String raise - @code << "p 'evaluate #{action.inspect}'\n" if $DEBUG - @code << "#{action}\n" + action_code << "p 'evaluate #{action.inspect}'\n" if $DEBUG + action_code << "#{action}\n" when Symbol - @code << "p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG - @code << "encoder.text_token match, #{action.inspect}\n" + action_code << "p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + action_code << "encoder.text_token match, #{action.inspect}\n" when Kind case action.token_kind when Proc - @code << "encoder.text_token match, kind = #{dsl.add_callback(action.token_kind)}\n" + action_code << "encoder.text_token match, kind = #{dsl.add_callback(action.token_kind)}\n" else raise "I don't know how to evaluate this kind: %p" % [action.token_kind] end when Groups - @code << "p 'text_tokens %p in groups 
%p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action_code << "p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG action.token_kinds.each_with_index do |kind, i| - @code << "encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" + action_code << "encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n" end when Push, PushState case action.state when String raise - @code << "p 'push %p' % [#{action.state}]\n" if $DEBUG - @code << "state = #{action.state}\n" - @code << "states << state\n" + action_code << "p 'push %p' % [#{action.state}]\n" if $DEBUG + action_code << "state = #{action.state}\n" + action_code << "states << state\n" when Symbol - @code << "p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG - @code << "state = #{action.state.inspect}\n" - @code << "states << state\n" + action_code << "p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + action_code << "state = #{action.state.inspect}\n" + action_code << "states << state\n" when Proc - @code << "if new_state = #{dsl.add_callback(action.state)}\n" - @code << " state = new_state\n" - @code << " states << new_state\n" - @code << "end\n" + action_code << "if new_state = #{dsl.add_callback(action.state)}\n" + action_code << " state = new_state\n" + action_code << " states << new_state\n" + action_code << "end\n" else raise "I don't know how to evaluate this push state: %p" % [action.state] end if action.is_a? 
Push if action.state == action.group - @code << "encoder.begin_group state\n" + action_code << "encoder.begin_group state\n" else case action.state when Symbol - @code << "p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG - @code << "encoder.begin_group #{action.group.inspect}\n" + action_code << "p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + action_code << "encoder.begin_group #{action.group.inspect}\n" when Proc - @code << "encoder.begin_group #{dsl.add_callback(action.group)}\n" + action_code << "encoder.begin_group #{dsl.add_callback(action.group)}\n" else raise "I don't know how to evaluate this push state: %p" % [action.state] end end end when Pop, PopState - @code << "p 'pop %p' % [states.last]\n" if $DEBUG + action_code << "p 'pop %p' % [states.last]\n" if $DEBUG if action.is_a? Pop if action.group case action.group when Symbol - @code << "encoder.end_group #{action.group.inspect}\n" + action_code << "encoder.end_group #{action.group.inspect}\n" else raise "I don't know how to evaluate this pop group: %p" % [action.group] end - @code << "states.pop\n" + action_code << "states.pop\n" else - @code << "encoder.end_group states.pop\n" + action_code << "encoder.end_group states.pop\n" end else - @code << "states.pop\n" + action_code << "states.pop\n" end - @code << "state = states.last\n" + action_code << "state = states.last\n" when ValueSetter case action.value when Proc - @code << "#{action.targets.join(' = ')} = #{dsl.add_callback(action.value)}\n" + action_code << "#{action.targets.join(' = ')} = #{dsl.add_callback(action.value)}\n" when Symbol - @code << "#{action.targets.join(' = ')} = #{action.value}\n" + action_code << "#{action.targets.join(' = ')} = #{action.value}\n" else - @code << "#{action.targets.join(' = ')} = #{action.value.inspect}\n" + action_code << "#{action.targets.join(' = ')} = #{action.value.inspect}\n" end when Increment case action.value when Proc - @code << "#{action.targets.join(' = ')} #{action.operation}= 
#{dsl.add_callback(action.value)}\n" + action_code << "#{action.targets.join(' = ')} #{action.operation}= #{dsl.add_callback(action.value)}\n" when Symbol - @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + action_code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" else - @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + action_code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" end when Proc - @code << "#{dsl.add_callback(action)}\n" + action_code << "#{dsl.add_callback(action)}\n" when Continue - @code << "next\n" + action_code << "next\n" else raise "I don't know how to evaluate this action: %p" % [action] end end + @code << condition_code + @code << action_code.gsub(/^/, ' ') + @first = false end @@ -347,8 +353,7 @@ def scan_tokens_code <<-"RUBY" state = options[:state] || @state states = [state] -#{ restore_local_variables_code.chomp } - +#{ restore_local_variables_code } until eos? case state #{ states_code.chomp.gsub(/^/, ' ') } @@ -359,7 +364,7 @@ def scan_tokens_code @state = state if options[:keep_state] -#{ close_groups_code.chomp } +close_groups(encoder, states) encoder RUBY @@ -370,11 +375,11 @@ def restore_local_variables_code end def states_code - @states.map(&:code)[0,1].join - end + unless defined? 
@states + raise NoStatesError, 'no states defined for %p' % [self.class] + end - def close_groups_code - 'close_groups(encoder, states)' + @states.map(&:code).join end end end diff --git a/spec/simple_scanner_spec.rb b/spec/simple_scanner_spec.rb index 088343cb..bc2aec44 100644 --- a/spec/simple_scanner_spec.rb +++ b/spec/simple_scanner_spec.rb @@ -1,17 +1,37 @@ RSpec.describe CodeRay::Scanners::SimpleScanner do - let(:scanner) { Class.new described_class } + let(:scanner) { described_class } describe '#scan_tokens_code' do subject { scanner.send :scan_tokens_code } - it 'lets you define states' do - is_expected.to eq <<-RUBY + it 'throws an error' do + expect { subject }.to raise_error(CodeRay::Scanners::SimpleScannerDSL::NoStatesError) + end + end + + describe 'with one state' do + let(:scanner) do + Class.new described_class do + state :somepony do + on %r/rainbow/, :dash + end + end + end + + describe '#scan_tokens_code' do + subject { scanner.send :scan_tokens_code } + it 'returns an scanner with one states' do + is_expected.to eq <<-RUBY state = options[:state] || @state states = [state] - until eos? 
case state - + when :somepony + if match = scan(/rainbow/) + encoder.text_token match, :dash + else + encoder.text_token getch, :error + end else raise_inspect 'Unknown state: %p' % [state], encoder end @@ -22,7 +42,8 @@ close_groups(encoder, states) encoder - RUBY + RUBY + end end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 49b6a0ec..fe4e726f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -91,6 +91,4 @@ Kernel.srand config.seed end -$LOAD_PATH << 'lib/coderay' - require 'coderay' From ec891978d3756c186104d8d243283f8d3104b85a Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Sun, 5 Nov 2017 18:10:57 +0100 Subject: [PATCH 53/54] trying to fix tests for Ruby 2.4 --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index b99c95e0..1e020903 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ rvm: - 2.1 - 2.2 - 2.3 - - 2.4 + - 2.4.2 - ruby-head - jruby branches: @@ -21,6 +21,5 @@ matrix: allow_failures: - rvm: ruby-head - rvm: jruby - - rvm: rbx script: "rake test" # test:scanners" sudo: false From 434006c0a68d2884c05fe97dcee8db69e87a9a99 Mon Sep 17 00:00:00 2001 From: Kornelius Kalnbach Date: Mon, 27 Nov 2017 14:07:03 +0100 Subject: [PATCH 54/54] testing rouge scanner --- lib/coderay/rouge_scanner.rb | 49 +++++ lib/coderay/rouge_scanner_dsl.rb | 199 ++++++++++++++++++++ lib/coderay/scanners.rb | 1 + lib/coderay/scanners/_map.rb | 1 + lib/coderay/scanners/java_script7.rb | 268 +++++++++++++++++++++++++++ 5 files changed, 518 insertions(+) create mode 100644 lib/coderay/rouge_scanner.rb create mode 100644 lib/coderay/rouge_scanner_dsl.rb create mode 100644 lib/coderay/scanners/java_script7.rb diff --git a/lib/coderay/rouge_scanner.rb b/lib/coderay/rouge_scanner.rb new file mode 100644 index 00000000..b08d7fab --- /dev/null +++ b/lib/coderay/rouge_scanner.rb @@ -0,0 +1,49 @@ +require 'set' +require 'coderay/rouge_scanner_dsl' + +module CodeRay + module Scanners + 
class RougeScanner < Scanner + require 'rouge' + include Rouge::Token::Tokens + + extend RougeScannerDSL + + class << self + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval <<-RUBY +def scan_tokens encoder, options + @encoder = encoder +#{ scan_tokens_code.chomp.gsub(/^/, ' ') } +end + RUBY + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :root + end + + def close_groups encoder, states + # TODO + end + + def token token + @encoder.text_token @match, token + end + end + end +end \ No newline at end of file diff --git a/lib/coderay/rouge_scanner_dsl.rb b/lib/coderay/rouge_scanner_dsl.rb new file mode 100644 index 00000000..38b06f53 --- /dev/null +++ b/lib/coderay/rouge_scanner_dsl.rb @@ -0,0 +1,199 @@ +require 'set' + +module CodeRay + module Scanners + module RougeScannerDSL + NoStatesError = Class.new StandardError + + State = Struct.new :name, :rules do + def initialize(name, &block) + super name, [] + + instance_eval(&block) + end + + def code scanner + <<-RUBY +when #{name.inspect} +#{ rules_code(scanner).chomp.gsub(/^/, ' ') } + else + encoder.text_token getch, :error + end + RUBY + end + + def rules_code scanner, first: true + raise 'no rules defined for %p' % [self] if rules.empty? 
+ + [ + rules.first.code(scanner, first: first), + *rules.drop(1).map { |rule| rule.code(scanner) } + ].join + end + + protected + + # DSL + + def rule pattern, token = nil, next_state = nil, &block + unless token || block + raise 'please pass `rule` a token to yield or a callback' + end + + case token + when Class + unless token < Rouge::Token + raise "invalid token: #{token.inspect}" + end + + case next_state + when Symbol + rules << Rule.new(pattern, token, next_state) + when nil + rules << Rule.new(pattern, token) + else + raise "invalid next state: #{next_state.inspect}" + end + when nil + rules << CallbackRule.new(pattern, block) + else + raise "invalid token: #{token.inspect}" + end + end + + def mixin state_name + rules << Mixin.new(state_name) + end + end + + Rule = Struct.new :pattern, :token, :action do + def initialize(pattern, token, action = nil) + super + end + + def code scanner, first: false + <<-RUBY + action_code.to_s +#{'els' unless first}if match = scan(#{pattern.inspect}) + encoder.text_token match, #{token.token_chain.map(&:name).join('::')} + RUBY + end + + def action_code + case action + when :pop! 
+ <<-RUBY + states.pop + state = states.last + RUBY + when Symbol + <<-RUBY + state = #{action.inspect} + states << state + RUBY + end + end + end + + CallbackRule = Struct.new :pattern, :callback do + def code scanner, first: false + <<-RUBY +#{'els' unless first}if match = scan(#{pattern.inspect}) + @match = match + #{scanner.add_callback(callback)} + RUBY + end + end + + Mixin = Struct.new(:state_name) do + def code scanner, first: false + scanner.states[state_name].rules_code(scanner, first: first) + end + end + + attr_accessor :states + + def state name, &block + @states ||= {} + @states[name] = State.new(name, &block) + end + + def add_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter = counter.succ + end + + callbacks[callback_name] = define_method(callback_name, &block) + + parameters = block.parameters + + if parameters.empty? + callback_name + else + parameter_names = parameters.map do |type, name| + raise "callbacks don't allow rest parameters: %p" % [parameters] unless type == :req || type == :opt + name = :match if name == :m + name + end + + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" + end + end + + def add_variable name + variables << name + end + + protected + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def scan_tokens_code + <<-"RUBY" +state = options[:state] || @state +states = [state] +#{ restore_local_variables_code } +until eos? 
+ case state +#{ states_code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end +end + +@state = state if options[:keep_state] + +close_groups(encoder, states) + +encoder + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def states_code + unless defined?(@states) && !@states.empty? + raise NoStatesError, 'no states defined for %p' % [self.class] + end + + @states.values.map { |state| state.code(self) }.join + end + end + end +end \ No newline at end of file diff --git a/lib/coderay/scanners.rb b/lib/coderay/scanners.rb index 0935458d..5892f528 100644 --- a/lib/coderay/scanners.rb +++ b/lib/coderay/scanners.rb @@ -26,6 +26,7 @@ module Scanners autoload :RuleBasedScanner, CodeRay.coderay_path('rule_based_scanner') autoload :SingleStateRuleBasedScanner, CodeRay.coderay_path('single_state_rule_based_scanner') autoload :StateBasedScanner, CodeRay.coderay_path('state_based_scanner') + autoload :RougeScanner, CodeRay.coderay_path('rouge_scanner') autoload :SimpleScanner, CodeRay.coderay_path('simple_scanner') end diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index 82fb17f5..4d836e61 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -16,6 +16,7 @@ module Scanners :javascript4 => :java_script4, :javascript5 => :java_script5, :javascript6 => :java_script6, + :javascript7 => :java_script7, :js => :java_script, :pascal => :delphi, :patch => :diff, diff --git a/lib/coderay/scanners/java_script7.rb b/lib/coderay/scanners/java_script7.rb new file mode 100644 index 00000000..082a781e --- /dev/null +++ b/lib/coderay/scanners/java_script7.rb @@ -0,0 +1,268 @@ +# Trying to imitate https://github.com/jneen/rouge/blob/master/lib/rouge/lexers/javascript.rb. +module CodeRay +module Scanners + + # Scanner for JavaScript. 
+ # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript7 < RougeScanner + register_for :java_script7 + file_extension 'js' + + state :multiline_comment do + rule %r([*]/), Comment::Multiline, :pop! + rule %r([^*/]+), Comment::Multiline + rule %r([*/]), Comment::Multiline + end + + state :comments_and_whitespace do + rule /\s+/, Text + rule /