aurelianito: 2009/06

2009-06-30

Más rapidito

Como les estuve contando, sigo escribiendo mi wiki. Ya parsea un subconjunto interesante del lenguaje definido por trac.
Siguiendo la tradición, les cuento como está avanzando el tokenizer. Al tokenizer lo simplifiqué para que devuelva la expresión regular que matcheo junto con el match (en vez del "tipo"). Esto hizo que la interfase para definir las reglas para tokenizar sea más simple. Si no hay ninguna regla que matchee sigue devolviendo ["string", :text].
Sin más, acá el código:

module Rapidito
  class Tokenizer
  
    def initialize( *delimiters )
      @delimiter_list = delimiters +  [/\z/]
      @match_cache = nil
    end
    
    def source
      valid_cache? ? @match_cache[0].to_s + @source : @source
    end
    
    def source=(s)
      @match_cache = nil
      @source = s
    end
    
    def has_next?
      !@source.empty? || valid_cache?
    end
    
    def valid_cache?
      (!@match_cache.nil?) && (@match_cache[0].to_s.length > 0)
    end
    
    def next_match
      @delimiter_list.map {|regex| [regex.match(@source),regex]}.reject {|p| p[0].nil?}.inject do
        |better,new|
        better_pos = better[0].pre_match.length
        new_pos = new[0].pre_match.length
        
        if better_pos < new_pos
          better
        elsif new_pos < better_pos
          new
        elsif better[0].to_s.length > new[0].to_s.length
          better
        else
          new
        end
      end
    end
    
    def next_token
      if @match_cache #cached delimiter
        rv = @match_cache
        @match_cache = nil
        return rv
      end
      
      match = next_match
      p = match[0].pre_match.length
      @source = @source[p + match[0].to_s.length, @source.length]
      
      if p == 0 #delimiter
        match
      else #text
        @match_cache = match
        [match[0].pre_match, :text]
      end
    end
    
    def all_tokens
      tokens = []
      while has_next?
        tokens << next_token
      end
      tokens
    end
  end
end

Y si miran los tests de unidad, van a ver que también quedaron más lindos:

require 'test/unit'
require 'rapidito/tokenizer'

include Rapidito

class TokenizerTest < Test::Unit::TestCase
  
  def test_no_token
    tok = Tokenizer.new
    tok.source = "aaaa"
    assert_equal true, tok.has_next?
    assert_equal ["aaaa", :text], tok.next_token
    assert_equal false, tok.has_next?
  end
  
  def assert_all_tokens( expected, tokenizer )
    assert_equal expected, 
      tokenizer.all_tokens.map { |token, kind| [token.to_s, kind] }
  end
  
  def test_two_delimiters
    tok = Tokenizer.new( 
      /\|/, /;;/ 
    )
    
    tok.source = "aa|bbb;;;;cccc"
    assert_all_tokens \
      [ ["aa", :text], ["|", /\|/], ["bbb", :text], 
        [";;", /;;/], [";;", /;;/], ["cccc", :text] ], 
      tok
    
    tok.source = "aa;;bbb||cccc"
    assert_all_tokens \
      [ ["aa", :text], [";;", /;;/], ["bbb", :text], 
        ["|", /\|/], ["|", /\|/], ["cccc", :text] ], 
      tok
  end
  
  def test_choose_longest_match
    tok = Tokenizer.new( 
      /aa/, /aaa/
    )
    tok.source = "aaaa"
    assert_all_tokens [ ["aaa", /aaa/], ["a", :text ] ], tok
  end
  
  def test_reset_precache
    tok = Tokenizer.new( 
      /\|/, /,/
    )
    tok.source = "original start|original end"
    tok.next_token
    tok.source = "new start,new end"
    assert_equal ["new start", :text], tok.next_token
  end
  
  def test_almost_finished
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal false, tok.has_next?
  end
  
  def test_carriage_return_ending
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!\n"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal true, tok.has_next?
    assert_equal "\n", tok.next_token[0].to_s
    assert_equal false, tok.has_next?
  end
  
  def test_transparent_caching
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!pum"
    tok.next_token
    
    assert_equal "!pum", tok.source
  end
  
  def test_match_klass
    tok = Tokenizer.new( /!/ )
    tok.source = "!bang!pum"
    
    assert_equal \
      [MatchData, String, MatchData, String], 
      tok.all_tokens.map { |tok, kind| tok.class }
  end
end

Happy hacking,
Aureliano.

2009-06-19

Bouncer versión 2

Hice una nueva versión del bouncer que permite bouncear a varios targets desde un mismo host:

#!/usr/bin/env ruby

# == Synopsis
#
# bouncer.rb: Redirects TCP connections to distant machines. Handles simultaneously many connections.
#
# == Usage
#
# ruby redirect.rb [OPTION]
#
# -h, --help:
#   Show help
#
# --ip ip, -i ip:
#   Accept connections from ip (default 127.0.0.1)
#
# --port port, -p port:
#   Listen on port (default 12345)
#
# --target ip:port, -t ip:port
#   Connect to ip:port (default 127.0.0.1:23456)
#

require 'getoptlong'
require 'rdoc/usage'
require 'socket'

class Bouncer
 def self.for_host( source_ip, start_port, targets )
  rules = {}
  targets.each_with_index do
   |target, index|
   rules[ start_port + index ] = target
  end
  self.new( source_ip, rules )
 end
 def initialize( source_ip, port_target_map )
  @port_target_map = port_target_map
  @source_ip = source_ip

  @servers = @port_target_map.keys.map do
    |port|
    ss = TCPServer.new( "", port )
    ss.setsockopt( Socket::SOL_SOCKET, Socket::SO_REUSEADDR, 1 )
    ss
  end
    
  @descriptors = @servers.clone
  @next_step = {}
 end

 def handle_new_connection( ss )
  incoming = ss.accept
  if incoming.peeraddr[3] == @source_ip
   begin
    outgoing = TCPSocket.new( *@port_target_map[incoming.addr[1]] )
    @next_step[ incoming ] = outgoing
    @next_step[ outgoing ] = incoming

    @descriptors += [ incoming, outgoing ]
   rescue
    silent_close( incoming )
   end
  else
   silent_close( incoming )
  end
 end

 def silent_close( sock )
  begin
   sock.close
  rescue
   #do nothing intentionally
  end
 end

  def propagate(sock)
    next_sock = @next_step[sock]
    next_sock.write(sock.read_nonblock(1000 * 1000))
  end

 def finish_connection(sock)
  next_sock = @next_step[sock]
  [ sock, next_sock ].each do
   |s|
   silent_close(s)
   @descriptors.delete(s)
   @next_step.delete(s)
  end
 end

 def run
  loop do
   connections = select( @descriptors )
   connections[0].each do
    |sock|
    if @servers.include? sock then
     handle_new_connection( sock )
    else
     begin
      sock.eof? ? finish_connection(sock) : propagate(sock)
     rescue
      finish_connection(sock)
     end
    end
   end
  end
 end
end

if $0 == __FILE__ then

 opts = GetoptLong.new(
   [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
   [ '--ip', '-i', GetoptLong::REQUIRED_ARGUMENT ],
   [ '--port', '-p', GetoptLong::REQUIRED_ARGUMENT ],
   [ '--target', '-t', GetoptLong::REQUIRED_ARGUMENT ]
 )

 ip = '127.0.0.1'
 port = '12345'
 target = '127.0.0.1:23456'

 opts.each do
  |opt, arg|
  case opt
  when '--help'
   RDoc::usage
   exit
  when '--ip'
   ip = arg
  when '--port'
   port = arg
  when '--target'
   target = arg
  end
 end

 port = port.to_i
 target = target.split(":")

 trap("SIGINT") do
   exit
 end

 Bouncer.new(ip, port => [target[0], target[1].to_i]).run
end

2009-06-18

Mousewheel para arriba en vmware

Encontré como hacer para que ande bien la ruedita del mouse adentro de un Guest de VMWare con Kubuntu 9.04. Hay que instalar el paquete xserver-xorg-input-vmmouse.
Happy hacking,
Aureliano.

2009-06-17

Cambiando de resolución en X

El comando xrandr sirve, entre otras cosas para cambiar la resolución de X desde un shell. Primero listo las resoluciones disponibles:


$ xrandr
Screen 0: minimum 320 x 200, current 1280 x 1024, maximum 1280 x 1280
VGA connected 1280x1024+0+0 (normal left inverted right x axis y axis) 338mm x 270mm
   1280x1024      60.0*+   75.0     60.0*
   1152x864       75.0
   1024x768       75.0     60.0
   800x600        75.0     60.3
   640x480        75.0     59.9
   720x400        70.1

y puedo elegir la que quiera con el parámetro -s. Por ejemplo:


$ xrandr -s 800x600

Happy hacking,
Aureliano

2009-06-13

Pequeñas delicias de las expresiones regulares

Como les conté acá y acá, estoy escribiendo un tokenizador para un wiki que estoy programando. Y hoy me encontré con una cosa muy extraña de las expresiones regulares.
En ruby la función match sirve para buscar el primer match de una regex dentro de un string. Por ejemplo (usando el irb):


irb(main):001:0> m = /a/.match "babab"
=> #<MatchData "a">
irb(main):002:0> m.pre_match
=> "b"
irb(main):003:0> m[0]
=> "a"

En particular, el pre_match es lo que está antes del match en el string. También según había entendido (mal) /\Z/ matchea con el final del string. Por ejemplo:


irb(main):004:0> m = /\Z/.match "hola"
=> #<MatchData "">
irb(main):005:0> m.pre_match
=> "hola"

Pero, /\Z/ tiene un comportamiento muy extraño, aunque documentado, cuando el último caracter antes del final es un \n. Lo que pasa es que el pre_match queda ¡sin el \n del final!. Lo muestro en el irb:


irb(main):006:0> m = /\Z/.match "\n"
=> #<MatchData "">
irb(main):007:0> m.pre_match
=> ""

Para que no se manduque el \n, hay que usar /\z/ (¡en minúscula!):


irb(main):008:0> m = /\z/.match "\n"
=> #<MatchData "">
irb(main):009:0> m.pre_match
=> "\n"

Por lo tanto tuve que tocar el tokenizer, ahora la función de initialize quedó así (miren el cambio de la "Z" a "z"):

    def initialize( delimiters )
      @delimiter_list = [[/\z/, :finish]] + 
        delimiters.to_a.map { |k,arr| arr.map { |re| [re, k] } }.inject([]) { |ac,ps| ac + ps }
      @match_cache = nil
    end

Y el test que captura el problema que genera usar \Z en vez de \z quedó así:

  def test_carriage_return_ending
    tok = Tokenizer.new( :a_kind => [/!/] )
    tok.source = "bang!\n"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal true, tok.has_next?
    assert_equal "\n", tok.next_token[0].to_s
    assert_equal false, tok.has_next?
  end

Happy hacking,
Aureliano.

2009-06-12

Tokenizer de rapidito, segunda versión

Como les conté hace un par de posts, estoy haciendo un wiki y blogueando sobre el tokenizer
Hoy les cuento de la segunda versión del tokenizer. A esta versión (que no es compatible con la anterior) le agregué la posibilidad de asociar un tipo (kind) a cada regexp que define un token. Por lo tanto, el método next_match ahora devuelve un par [match, kind]. Los matches de texto común (no delimitadores) tienen kind :text. Los matches de los delimitadores vienen con la MatchData entera, para poder usarla si interesan partes del match a la hora de procesar el token. También dejé de usar =~ para matchear regexps, ya que el
Me quedaron un par de cosas con dudas después de implementar esto.

¿Se puede hacer en ruby que busque un match de una regexp en un string a partir de una posición? Hacer una regexp del tipo /.{34,}(regex de verdad)/ no vale.
¿Puedo forzar a la unión de expresiones regulares a buscar el match más largo (en vez del primero que matchee)?

A pesar de las dudas, igual tengo una versión que creo que es significativamente mejor que la versión anterior.
Así que como la otra vez dejo el código:

module Rapidito
  class Tokenizer
  
    attr_reader :source
  
    def initialize( delimiters )
      @delimiter_list = [[/\Z/, :finish]] + 
        delimiters.to_a.map { |k,arr| arr.map { |re| [re, k] } }.inject([]) { |ac,ps| ac + ps }
      @match_cache = nil
    end
    
    def source=(s)
      @match_cache = nil
      @source = s
    end
    
    def has_next?
      !@source.empty? || valid_cache?
    end
    
    def valid_cache?
      (!@match_cache.nil?) && (@match_cache[0].to_s.length > 0)
    end
    
    def next_match
      @delimiter_list.map {|p| [p[0].match(@source), p[1]]}.reject {|p| p[0].nil?}.inject do
        |better,new|
        better_pos = better[0].pre_match.length
        new_pos = new[0].pre_match.length
        
        if better_pos < new_pos
          better
        elsif new_pos < better_pos
          new
        elsif better[0].to_s.length > new[0].to_s.length
          better
        else
          new
        end
      end
    end
    
    def next_token
      if @match_cache #cached delimiter
        rv = @match_cache
        @match_cache = nil
        return rv
      end
      
      match = next_match
      p = match[0].pre_match.length
      @source = @source[p + match[0].to_s.length, @source.length]
      
      if p == 0 #delimiter
        [match[0].to_s, match[1]]
      else #text
        @match_cache = match
        [match[0].pre_match, :text]
      end
    end
    
    def all_tokens
      tokens = []
      while has_next?
        tokens << next_token
      end
      tokens
    end
  end
end

Y los tests de unidad:

require 'test/unit'
require 'rapidito/tokenizer'

include Rapidito

class TokenizerTest < Test::Unit::TestCase
  
  def test_no_token
    tok = Tokenizer.new( {} )
    tok.source = "aaaa"
    assert_equal true, tok.has_next?
    assert_equal ["aaaa", :text], tok.next_token
    assert_equal false, tok.has_next?
  end
  
  def assert_all_tokens( expected, tokenizer )
    assert_equal expected, 
      tokenizer.all_tokens.map { |token, kind| [token.to_s, kind] }
  end
  
  def test_two_delimiters
    tok = Tokenizer.new( 
      :a_kind => [/\|/, /;;/] 
    )
    
    tok.source = "aa|bbb;;;;cccc"
    assert_all_tokens \
      [ ["aa", :text], ["|", :a_kind], ["bbb", :text], 
        [";;", :a_kind], [";;", :a_kind], ["cccc", :text] ], 
      tok
    
    tok.source = "aa;;bbb||cccc"
    assert_all_tokens \
      [ ["aa", :text], [";;", :a_kind], ["bbb", :text], 
        ["|", :a_kind], ["|", :a_kind], ["cccc", :text] ], 
      tok
  end
  
  def test_choose_longest_match
    tok = Tokenizer.new( 
      :a_kind => [/aa/, /aaa/] 
    )
    tok.source = "aaaa"
    assert_equal [ ["aaa", :a_kind], ["a", :text ] ], tok.all_tokens
  end
  
  def test_reset_precache
    tok = Tokenizer.new( 
      :a_kind => [/\|/, /,/]
    )
    tok.source = "original start|original end"
    tok.next_token
    tok.source = "new start,new end"
    assert_equal ["new start", :text], tok.next_token
  end
  
  def test_almost_finished
    tok = Tokenizer.new( :a_kind => [/!/] )
    tok.source = "bang!"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal false, tok.has_next?
  end
end

Happy hacking,
Aureliano.

2009-06-11

Syntax highlighting de Ruby en mi blog

Tomando como base lo que dice en este post, hice un script en ruby que genera html con clases que se pueden poner coloritos con css:

#!/usr/bin/env ruby

require 'rubygems'
require 'syntax/convertors/html'

class Syntax::Convertors::HTML
  def convert( text, klass="" )
    html = "<pre class=\"#{klass}\">"
    regions = []
    @tokenizer.tokenize( text ) do |tok|
      value = html_escape(tok)
      case tok.instruction
        when :region_close then
          regions.pop
          html << "</span>"
        when :region_open then
          regions.push tok.group
          html << "<span class=\"#{tok.group}\">#{value}"
        else
          if tok.group == ( regions.last || :normal )
            html << value
          else
            html << "<span class=\"#{tok.group}\">#{value}</span>"
          end
      end
    end
    html << "</span>" while regions.pop
    html << "</pre>" 
    html
  end
end

convertor = Syntax::Convertors::HTML.for_syntax "ruby"
puts convertor.convert( $stdin.read, "ruby"  )

En el script monkeypatchié un toque para que agregue la clase "ruby" al tag pre que engloba todo el código y aparte estoy usando estos estilos:


<style>
pre.ruby {
      background-color: #ffffcc;
      color: #000000;
      padding: 10px;
      font-size: 1.1em;
      overflow: auto;
      margin: 4px 0px;
      width: 95%;
      border: thin dashed;
}
.ruby .normal {}
.ruby .comment { color: #005; font-style: italic; }
.ruby .keyword { color: #A00; font-weight: bold; }
.ruby .method { color: #077; }
.ruby .class { color: #074; }
.ruby .module { color: #050; }
.ruby .punct { color: #447; font-weight: bold; }
.ruby .symbol { color: #099; }
.ruby .string { color: #944; }
.ruby .char { color: #F07; }
.ruby .ident { color: #004; }
.ruby .constant { color: #07F; }
.ruby .regex { color: #B66; }
.ruby .number { color: #D55; }
.ruby .attribute { color: #377; }
.ruby .global { color: #3B7; }
.ruby .expr { color: #227; }
</style>

Una cosa más, si estás viendo este post en otro lugar que no sea aurelianito.blogspot.com no vas a ver el resaltado de sintaxis (ya que no va a tener los estilos).

Hasta la próxima,
Aureliano

2009-06-06

Tokenizer de rapidito

Después de mirar un rato el estado de las bibliotecas para hacer wikis en Ruby, y descubrir que ninguna me servía, Decidí que tenía que poner cartas en el asunto y hacer la mía. Mi idea es implementar el markup de Trac, haciéndolo extensible, y agregarle un par de cositas que por ahora son un secreto :-p.

Lo importante del asunto es que puse manos en el asunto. Al final, después de un intento fallido de hackear mi camino al andar, decidí que lo mejor es armar un tokenizer y un parser que use esos tokens para generar el árbol del que extraeré el HTML.

Así que me puse a programar. Como no encontré ningún tokenizer en ruby, programé uno. El tokenizer se contruye con un montón de expresiones regulares que definen cada delimitador. Después se le setea una fuente de caracteres (un string) y separa el string en los delimitadores de arriba (que se devuelven como símbolos) y cadenas que no matchean con ninguno de los delimitadores (que devuelve como strings).

Bueno, basta de cháchara, acá tá el código:


module Rapidito
  class Tokenizer
  
    def initialize( *delimiters )
      @regexp = Regexp.union( *delimiters  + [/$/] )
    end
    
    attr_accessor :source
    
    def has_next?
      ! @source.empty?
    end
    
    def next_token
      p = (@source =~ @regexp)
      if p == 0 #delimiter
        token = nil
        @source.sub!( @regexp ) { |match| token=match.to_sym; "" }
        token
      else #text
        token = @source[0,p]
        @source = @source[p,@source.length]
        token
      end
    end
    
    def all_tokens
      tokens = []
      while has_next?
        tokens << next_token
      end
      tokens
    end
    
  end
end

Y, acá abajo la única documentación que hice hasta ahora, o sea los tests de unidad:


require 'test/unit'
require 'rapidito/tokenizer'

include Rapidito

class TokenizerTest < Test::Unit::TestCase
  
  def test_no_token
    tok = Tokenizer.new
    tok.source = "aaaa"
    assert_equal true, tok.has_next?
    assert_equal "aaaa", tok.next_token
    assert_equal false, tok.has_next?
  end
  
  def test_two_delimiters
    tok = Tokenizer.new( /\|/, /;;/ )
    tok.source = "aa|bbb;;;;cccc"
    assert_equal [ "aa", :"|", "bbb", :";;", :";;", "cccc" ], tok.all_tokens
    
    tok.source = "aa;;bbb||cccc"
    assert_equal [ "aa", :";;", "bbb", :"|", :"|", "cccc" ], tok.all_tokens
  end
  
  def test_choose_first_match
    tok = Tokenizer.new( /aa/, /aaa/ )
    tok.source = "aaa"
    assert_equal [ :aa, "a" ], tok.all_tokens
  end
  
end

Happy hacking,
Aureliano.

PD: ¿Prefieren que ponga el código con syntax highlighting?

2009-06-01

Quine en ruby

En el ruby quiz pusieron como problema de la semana hacer un quine en ruby. Un quine es un programa que genera como salida el código fuente del mismo (ver wikipedia).

Así que puse manos a la obra. Agarre el último quine en C de la página de wikipedia:


main() { char *s="main() { char *s=%c%s%c; printf(s,34,s,34); }"; printf(s,34,s,34); }

Y lo traduje a ruby:


s="s=%c%s%c;printf(s,34,s,34)";printf(s,34,s,34)

En ruby no hace falta tener main, porque todo lo que agarra el intérprete lo ejecuta y tampoco hay que declarar variables, así que quedó de 48 caracteres.

Y después hice uno que hace que tenga un ENTER al final, quedando más lindo cuando lo corrés de la consola:


s="s=%c%s%c;printf(s,34,s,34,13,10)%c%c";printf(s,34,s,34,13,10)

Fueron 10 minutos divertidos.

Happy hacking,
Aureliano.

aurelianito

2009-06-30

Más rapidito

2009-06-19

Bouncer versión 2

2009-06-18

Mousewheel para arriba en vmware

2009-06-17

Cambiando de resolución en X

2009-06-13

Pequeñas delicias de las expresiones regulares

2009-06-12

Tokenizer de rapidito, segunda versión

2009-06-11

Syntax highlighting de Ruby en mi blog

2009-06-06

Tokenizer de rapidito

2009-06-01

Quine en ruby

Archivo del Blog

Tags