2009-06-30

More Rapidito

As I've been telling you, I'm still writing my wiki. It already parses an interesting subset of the language defined by trac.
Following tradition, here's how the tokenizer is coming along. I simplified it so that it returns the regular expression that matched together with the match itself (instead of the "type"). This made the interface for defining tokenization rules simpler. If no rule matches, it still returns ["string", :text].
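For example, with a single pipe rule, text comes back as the fallback pair and a delimiter comes back as [MatchData, regexp] (a quick sketch of the interface; the /\|/ rule is just for illustration):

tok = Rapidito::Tokenizer.new( /\|/ )
tok.source = "aa|bb"
tok.next_token  # => ["aa", :text]            the text before the delimiter: the fallback pair
tok.next_token  # => [#<MatchData "|">, /\|/] the match plus the rule that produced it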
Without further ado, here's the code:

module Rapidito
  class Tokenizer

    def initialize( *delimiters )
      # /\z/ acts as a sentinel: it always matches (at the very end of
      # the source), so next_match never comes back empty-handed.
      @delimiter_list = delimiters + [/\z/]
      @match_cache = nil
    end

    # Everything the caller hasn't consumed yet: a cached delimiter
    # (if any) followed by the rest of the source.
    def source
      valid_cache? ? @match_cache[0].to_s + @source : @source
    end

    def source=(s)
      @match_cache = nil
      @source = s
    end

    def has_next?
      !@source.empty? || valid_cache?
    end

    def valid_cache?
      !@match_cache.nil? && @match_cache[0].to_s.length > 0
    end

    # Try every rule against the source and keep the [MatchData, regexp]
    # pair that matches earliest; ties go to the longer match.
    def next_match
      @delimiter_list.map { |regex| [regex.match(@source), regex] }.
        reject { |pair| pair[0].nil? }.
        inject do |better, new|
          better_pos = better[0].pre_match.length
          new_pos = new[0].pre_match.length

          if better_pos < new_pos
            better
          elsif new_pos < better_pos
            new
          elsif better[0].to_s.length > new[0].to_s.length
            better
          else
            new
          end
        end
    end

    def next_token
      if @match_cache # cached delimiter
        rv = @match_cache
        @match_cache = nil
        return rv
      end

      match = next_match
      p = match[0].pre_match.length
      @source = @source[p + match[0].to_s.length, @source.length]

      if p == 0 # delimiter
        match
      else # text
        @match_cache = match
        [match[0].pre_match, :text]
      end
    end

    def all_tokens
      tokens = []
      while has_next?
        tokens << next_token
      end
      tokens
    end
  end
end
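
In case you don't stare at regexps every day: next_match leans on two standard MatchData methods, pre_match (everything before the match, so its length gives the match position) and to_s (the matched text). A quick reminder:

m = /;;/.match("aa;;bb")
m.pre_match  # => "aa"  its length, 2, is where the match starts
m.to_s       # => ";;"  the matched text itself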

And if you look at the unit tests, you'll see they turned out nicer too:
require 'test/unit'
require 'rapidito/tokenizer'

include Rapidito

class TokenizerTest < Test::Unit::TestCase

  def test_no_token
    tok = Tokenizer.new
    tok.source = "aaaa"
    assert_equal true, tok.has_next?
    assert_equal ["aaaa", :text], tok.next_token
    assert_equal false, tok.has_next?
  end

  def assert_all_tokens( expected, tokenizer )
    assert_equal expected,
      tokenizer.all_tokens.map { |token, kind| [token.to_s, kind] }
  end

  def test_two_delimiters
    tok = Tokenizer.new(
      /\|/, /;;/
    )

    tok.source = "aa|bbb;;;;cccc"
    assert_all_tokens \
      [ ["aa", :text], ["|", /\|/], ["bbb", :text],
        [";;", /;;/], [";;", /;;/], ["cccc", :text] ],
      tok

    tok.source = "aa;;bbb||cccc"
    assert_all_tokens \
      [ ["aa", :text], [";;", /;;/], ["bbb", :text],
        ["|", /\|/], ["|", /\|/], ["cccc", :text] ],
      tok
  end

  def test_choose_longest_match
    tok = Tokenizer.new(
      /aa/, /aaa/
    )
    tok.source = "aaaa"
    assert_all_tokens [ ["aaa", /aaa/], ["a", :text] ], tok
  end

  def test_reset_precache
    tok = Tokenizer.new(
      /\|/, /,/
    )
    tok.source = "original start|original end"
    tok.next_token
    tok.source = "new start,new end"
    assert_equal ["new start", :text], tok.next_token
  end

  def test_almost_finished
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal false, tok.has_next?
  end

  def test_carriage_return_ending
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!\n"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal true, tok.has_next?
    assert_equal "\n", tok.next_token[0].to_s
    assert_equal false, tok.has_next?
  end

  def test_transparent_caching
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!pum"
    tok.next_token

    assert_equal "!pum", tok.source
  end

  def test_match_klass
    tok = Tokenizer.new( /!/ )
    tok.source = "!bang!pum"

    assert_equal \
      [MatchData, String, MatchData, String],
      tok.all_tokens.map { |token, kind| token.class }
  end
end
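
And to close the loop with the wiki itself, here's the tokenizer chopping up some trac-style bold markup (a sketch: the ''' and '' rules below are made up for the example, they're not Rapidito's real rule set):

require 'rapidito/tokenizer'
include Rapidito

tok = Tokenizer.new( /'''/, /''/ )
tok.source = "plain '''bold''' text"
p tok.all_tokens.map { |token, kind| [token.to_s, kind] }
# => [["plain ", :text], ["'''", /'''/], ["bold", :text], ["'''", /'''/], [" text", :text]]

Note how at each ''' the longer rule beats the '' rule, thanks to the tie-break in next_match.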

Happy hacking,
Aureliano.
