#!/usr/bin/env perl ############################################################################# # Name: regex.pl # Purpose: Generate test code for wxRegEx from 'reg.test' # Author: Mike Wetherell # RCS-ID: $Id$ # Copyright: (c) Mike Wetherell # Licence: wxWidgets licence ############################################################################# # # Notes: # See './regex.pl -h' for usage # # Output at the moment is C++ using the cppunit testing framework. The # language/framework specifics are separated, with the following 5 # subs as an interface: 'begin_output', 'begin_section', 'write_test', # 'end_section' and 'end_output'. So for a different language/framework, # implement 5 new similar subs. # # I've avoided using 'use encoding "UTF-8"', since this wasn't available # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions # earler than perl 5.6.0 aren't going to work. # use strict; use warnings; use File::Basename; #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory # if 0 output is wide characters, if 1 output is utf8 encoded my $utf = 1; # quote a parameter (C++ helper) # sub quotecxx { my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f", "\n" => "n", "\r" => "r", "\t" => "t", "\013" => "v", '"' => '"', "\\" => "\\" ); # working around lack of 'use encoding' $_ = pack "U0C*", unpack "C*", $_; use utf8; s/[\000-\037"\\\177-\x{ffff}]/ if ($esc{$&}) { "\\$esc{$&}"; } elsif (ord($&) > 0x9f) { if ($utf) { $&; } else { sprintf "\\u%04x", ord($&); } } else { sprintf "\\%03o", ord($&); } /ge; # working around lack of 'use encoding' no utf8; $_ = pack "C*", unpack "C*", $_; return ($utf ? '"' : 'L"') . $_ . '"' } # start writing the output code (C++ interface) # sub begin_output { my ($from, $instructions) = @_; # embed it in the comment $from = "\n$from"; $from =~ s/^(?: )?/ * /mg; # $instructions contains information about the flags etc. if ($instructions) { $instructions = "\n$instructions"; $instructions =~ s/^(?: )?/ * /mg; } my $u = $utf ? " (UTF-8 encoded)" : ""; print <add(" . (join ', ', @args) . ", NULL);\n"; } # end a section (C++ interface) # sub end_section { my ($id, $class) = @{$classes[$#classes]}; print <addTest(".$_->[1]."::suite());\n" for @classes; print <(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/; $quote = qr/"(?:\\"|[^"])*"/; my @tokens = shift =~ /($curly|$quote|\S+)/g; # now remove braces/quotes and unescape any escapes for (@tokens) { if (s/^{(.*)}$/$1/) { # for curly quoting, only unescape \{ and \} s/\\([{}])/$1/g; } else { s/^"(.*)"$/$1/; # unescape any escapes my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f", "n" => "\n", "r" => "\r", "t" => "\t", "v" => "\013" ); my $x = qr/[[:xdigit:]]/; s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/ if ($1 =~ m{^([0-7]+)}) { chr(oct($1)); } elsif ($1 =~ m{^x($x+)}) { pack("C0U", hex($1) & 0xff); } elsif ($1 =~ m{^u($x+)}) { pack("C0U", hex($1)); } elsif ($esc{$1}) { $esc{$1}; } else { $1; } /ge; } } return @tokens; } # helpers which keep track of whether begin_section has been called, so that # end_section can be called when appropriate # my @doing = ("0", ""); my $in_section = 0; sub handle_doing { end_section if $in_section; $in_section = 0; @doing = @_; } sub handle_test { begin_section(@doing) if !$in_section; $in_section = 1; write_test @_; } sub handle_end { end_section if $in_section; $in_section = 0; end_output; } # 'main' - start by parsing the command lines options. # my $badoption = !@ARGV; my $utfdefault = $utf; my $outputname; for (my $i = 0; $i < @ARGV; ) { if ($ARGV[$i] !~ m{^-.}) { $i++; next; } if ($ARGV[$i] eq '--') { splice @ARGV, $i, 1; last; } if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file $outputname = $2 || splice @ARGV, $i + 1, 1; } for (split //, substr($ARGV[$i], 1)) { if (/u/i) { # -u : utf-8 output $utf = 1; } elsif (/w/i) { # -w : wide char output $utf = 0; } else { $badoption = 1; } } splice @ARGV, $i, 1; } # Display help # if ($badoption) { my $prog = basename $0; my ($w, $u) = (" (default)", " "); ($w, $u) = ($u, $w) if $utfdefault; print <$outputname" if $outputname; # Read in the files and initially parse just the comments for copyright # information and instructions on the tests # my @input; # slurped input files stripped of comments my $files = ""; # copyright info from the input comments my $instructions = ""; # test instructions from the input comments do { my $inputname = basename $ARGV[0] if @ARGV; # slurp input undef $/; my $in = <>; # remove escaped newlines $in =~ s/(? 1) { $instructions[1] = "Flag characters:\n$instructions[1]"; } $instructions = join "\n", @instructions; $instructions =~ s/^#([^\t]?)/ $1/mg; } } # @input is the input of all files (stipped of comments) $in =~ s/^#.*$//mg; push @input, $in; } while $ARGV[0]; # Make a string naming the generator, the input files and copyright info # my $from = "Generated " . localtime() . " by " . basename $0; $from =~ s/[\s]+/ /g; if ($files) { if ($files =~ /:/) { $from .= " from the following files:"; } else { $from .= " from work with the following copyright:"; } } $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap $from .= "\n$files" if $files; # Now start to print the code # begin_output $from, $instructions; # numbers for 'extra' sections my $extra = 1; for (@input) { # Print the main tests # # Test lines look like this: # m 3 b {\(a\)b} ab ab a # # Also looks for heading lines, e.g.: # doing 4 "parentheses" # for (split "\n") { if (/^doing\s+(\S+)\s+(\S.*)/) { handle_doing parsetcl "$1 $2"; } elsif (/^[efimp]\s/) { handle_test parsetcl $_; } } # Extra tests # # The expression below matches something like this: # test reg-33.8 {Bug 505048} { # regexp -inline {\A\s*[^b]*b} ab # } ab # # The three subexpressions then return these parts: # $extras[$i] = '{Bug 505048}', # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab' # $extras[$i + 2] = 'ab' # my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1 \s*regexp\s+([^\n]+)\n # line 2 \}\s*(\S[^\n]*)/gx; # line 3 handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras; for (my $i = 0; $i < @extras; $i += 3) { my $id = $extras[$i]; # further parse the middle line into options and the rest (i.e. $args) my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/; my @args = parsetcl $args; $#args = 1; # only want the first two # now handle the options my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f'; my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : ''; # get them all in the right order and print unshift @args, $test, parsetcl($id), '-'; push @args, parsetcl(parsetcl($results)) if $results; handle_test @args; } } # finish # handle_end;