package RcParse; require 5.004; use Exporter; use strict; use vars qw(@EXPORT_OK @ISA $VERSION); @ISA = qw(Exporter); @EXPORT_OK = qw(rc_verify); $VERSION = '0.10'; my %esc = ( t => "\t" , n => "\n" , r => "\r" , f => "\f" , b => "\b" , a => "\a" , e => "\e" , 'c@' => "\c@", cA => "\cA", cB => "\cB", cC => "\cC", cD => "\cD", cE => "\cE", cF => "\cF", cG => "\cG", cH => "\cH", cI => "\cI", cJ => "\cJ", cK => "\cK", cL => "\cL", cM => "\cM", cN => "\cN", cO => "\cO", cP => "\cP", cQ => "\cQ", cR => "\cR", cS => "\cS", cT => "\cT", cU => "\cU", cV => "\cV", cW => "\cW", cX => "\cX", cY => "\cY", cZ => "\cZ", ca => "\cA", cb => "\cB", cc => "\cC", cd => "\cD", ce => "\cE", cf => "\cF", cg => "\cG", ch => "\cH", ci => "\cI", cj => "\cJ", ck => "\cK", cl => "\cL", cm => "\cM", cn => "\cN", co => "\cO", cp => "\cP", cq => "\cQ", cr => "\cR", cs => "\cS", ct => "\cT", cu => "\cU", cv => "\cV", cw => "\cW", cx => "\cX", cy => "\cY", cz => "\cZ", 'c[' => "\c[", 'c]' => "\c]", 'c^' => "\c^", 'c_' => "\c_" ); # # it checks for mismatched quotes and removes comments; if called in list # context it also parses the line into separate list elements # # parameters # rc_verify( $line [ , 'ad=?', 'an=?', 'ws=?' | '=' ] ); # # returns # in scalar context it can return three different values # error: undef # empty line or the entire line a comment: '' # line with comments removed # # in list context # error: empty list | undef # empty line or the entire line a comment: ( undef ) : a single element # list, the element being 'undef' # line split into 'words | strings' # sub rc_verify { my $l = shift; my ( $inquote_s , $inquote_d , $was_slash ) = ( 0 , 0 , 0 ); my $ch; $l =~ s/^\s+//; # remove leading $l =~ s/\s+$//; # and trailing whitespace my @ch = split // , $l; # split into individual characters $l = ''; for $ch ( @ch ) { if ( $ch eq '\\' || $was_slash ) { $was_slash = ! $was_slash; } elsif( $ch eq '\'' ) { $inquote_s = ! $inquote_s; } elsif( $ch eq '"' ) { $inquote_d = ! $inquote_d; } elsif( $ch eq '#' && ! ( $inquote_s || $inquote_d ) ) { last; } return # mixed quotes not allowed if $inquote_s && $inquote_d; $l .= $ch; } return # unterminated quotes if $inquote_s || $inquote_d || $was_slash; return wantarray ? ( undef ) : '' unless $l =~ /\S/; if( ! wantarray ) { # let's do the conversion here $l =~ s/\\(c[\@A-Za-z\[\]\^_]|[tnrfbae])/sprintf("%s",$esc{$1})/eg; $l =~ s/\\[xX]([0-9a-fA-F][0-9a-fA-F])/sprintf(chr(hex("0x$1")))/eg; $l =~ s/\\([0-7]{1,3})/sprintf(chr(oct("$1")))/eg; $l =~ s/\\(.)/$1/g; # replace remaining backslashed items } return wantarray ? __split_line( $l , shift ) : $l; } # # parameters # split_line( $line [ , 'an=?', 'ad=?', 'ws=?' ] ); # # returns # error: empty list | undef # empty line or the entire line a comment: (undef) : a single element list, # the element being 'undef' # line split into 'words | strings' # # This is an internal routine # sub __split_line { my $l = shift; my ( @w , $s , $eq , $c , $was_eq); my $regex; if( defined $_[0] && $_[0] =~ /^(?:(?:an|ad|ws)(=c?)?|=c?)$/ ) { if ( $_[0] =~ /^ad$/ ) { $regex = '\\G(?:([a-zA-Z_]+|\d+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])\\s+'; } elsif( $_[0] =~ /^an$/ ) { $regex = '\\G(?:(\\w+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])\\s+'; } elsif( $_[0] =~ /^ws$/ ) { $regex = '\\G(?:([^\\s"\'\\\\]+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])\\s+'; } elsif( $_[0] =~ /^(ad)?=c?/ ) { $regex = '\\G(?:([a-zA-Z_]+|\d+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])'; } elsif( $_[0] =~ /^an=c?/ ) { $regex = '\\G(?:(\\w+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])'; } elsif( $_[0] =~ /^ws=c?/ ) { $regex = '\\G(?:([^\\s"\'\\\\=]+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])'; } $eq = $_[0] =~ /=/; $c = $_[0] =~ /c/; } else { $eq = 0; $c = 0; $regex = '\\G(?:([a-zA-Z_]+|\d+)|[\'"]([^\'"\\\\]*(?:\\\\.[^\'"\\\\]*)*)[\'"])\\s+'; } $l .= ' '; # need as a guard while( $l =~ /$regex/gc ) { $s = defined $1 ? $1 : $2; if( defined $2 ) { $s =~ s/^\s+//; $s =~ s/(\\\s)?\s*?$/$1/; } # replace backslashed escaped chars $s =~ s/\\([tnrfbae])/sprintf "%s",$esc{$1}/eg; $s =~ s/\\(c[\@A-Za-z\[\]\^_])/sprintf("%s",$esc{$1})/eg; $s =~ s/\\[xX]([0-9a-fA-F][0-9a-fA-F])/sprintf(chr(hex("0x$1")))/eg; $s =~ s/\\([0-7]{1,3})/sprintf(chr(oct("$1")))/eg; $s =~ s/\\(.)/$1/g; # replace remaining backslashed items if( $was_eq && $c ) { $w[ $#w ] .= $s; } else { push @w , $s; } if( $eq ) { if( $l =~ /\G\s*=\s*/gc ) { $was_eq = 1; if( ! $c ) { push @w , '='; } else { $w[ $#w ] .= ' = '; } } else { $was_eq = 0; return unless $l =~ /\G\s+/gc; } return if $#w > 0 && $w[ $#w ] eq '=' && $w[ $#w - 1 ] eq '='; } } return # line was not completely parsed if $l =~ /\G./gc || $was_eq; @w; } 1; __END__ =head1 RcParse RcParse - B =head1 rc_verify S B reacts to context. If called in scalar context then it returns the line with comments removed and also checks for matched up quotes. It also does the conversions of backslashed items described below. In scalar context backslashed items can appear anywhere, this is not the case when in list context. In list context it does the above and then it returns a list of items split up based on the second argument to B. If the second argument doesn't match one of the above or isn't there then it is simply ignored and 'B' is used. See below for an explanation of arguments. All leading and trailing whitespace are removed from the line. Comments are delimited by a 'B<#>' sign that isn't within quotes. =head1 SCALAR CONTEXT The second argument is ignored in scalar context. B returns the value 'undef' which can be checked for with S> B returns the line with comments removed. If after removing comments the line consists only of whitespace, it returns the empty line which can be checked with S> If the statement is true then it was either an empty line or a line with only comments on it. Otherwise the line had properly matched quotes, comments have been removed and backslashed items have been converted. Backslashed items can appear anywhere on the line. =head1 LIST CONTEXT The relevance of the second argument is explained below. B returns the empty list which can be checked with either of two ways S> or S> B if the line was empty or had only comments on it then it returns a single element list with the element being 'undef' which can be checked with S> if that's not true then it will contain the split up line. The splitting is based on the second, optional, argument to B. Evrything that is done in scalar context is done before the splitting, except for the conversion of backslashed items. Backslashed items can appear within a quoted string. =head1 ARGUMENT 'an(=c?)?' The splitting is done in an alphanumerical sense unless within quotes. Almost any character is allowed within quotes. See below for more on quotes. The 'equal' sign is optional and if included then it will be allowed outside of a quoted string. Otherwise only alphanumerics and underscore are allowed and they can be mixed. The test is basically the B function B plus underscore. eg. set ab3 5 set ab0 ls7 set er_ 'long line' If the 'equal' sign is included then the following are also OK. The 'equal' sign must be between two non 'equal' signs and cannot appear at the end or beginning of the line. If the 'c' is included after the 'equal' then the two elements, the one before and the one after, as well as the 'equal' are parsed into a single array element. There will be an extra space added before and after the 'equal'. eg. ab3 = 5 er_ = 'long line' =head1 ARGUMENT 'ad(=c?)?' Same as above but digits cannot be mixed with alpha characters. The test is basically the B functions B plus underscore or B. 'B' is the default argument to B. eg. set ab 5 =head1 ARGUMENT '=c?' Same as 'B' =head1 ARGUMENT 'ws(=c?)?' This allows any character other than \, ', or " to appear in a 'word' without needing quoting (whitespace delimited). This means that unless '=' is appended to 'ws', an 'equal' sign can appear anywhere and will not be parsed into a separate element if it's in the middle of a word or attached to a word. eg. set ab* nice% set ab=9 OK (ab=9) will be parsed into a single entity. see below. If the 'equal' sign is included then it is parsed slightly different. An 'equal' sign is parsed into a separate element unless 'c' is present after the 'equal'. In that case the element before, the element after and the 'equal' are parsed into a single array element. There will be an extra space added before and after the 'equal'. The equal sign must be between to non equal signs and cannot appear at the end or beginning of the line. eg. ab='cool dude' ab& = nice =head1 QUOTES The quotes can be either single(') or double("). If called in list context backslashed items can only appear within quoted strings and must be escaped - B<\'>, B<\">, B<\\>. Other backslashed characters listed below will be turned into real characters based on B and B conventions. Any character not listed is left as is, but the backslash is removed from before it. Leading and trailing whitespace within a quoted string is removed. If you wish to keep leading whitspace then backslash the very first one. If you wish to keep trailing whitespace then backslash the very last one. eg. '\o' will turn into 'o' '\n' will be turned into a newline '\12' will be turned into a newline ( octal escape ) '\x0a' will be turned into a newline ( hex escape ) '\x20' will be turned into a space \t tab \n newline \r carriage return \f form feed \b backspace \a alarm \e escape \033 octal escape \x1b hexadecimal escape \X1B hexadecimal escape \c] escape \cJ newline \cM carriage return =head1 EXAMPLES use RcParse qw(rc_verify); @line = rc_verify($line,'an='); if(defined @line && defined $line[0]) { # we got a good parse } @line = rc_verify($line,'an=c'); if(defined @line && defined $line[0]) { # we got a good parse; and now items with an equal between # them are combined into single element } $line = rc_verify($line); if(defined $line && length($line) > 0) { # we got a valid line } =head1 AUTHOR Gabor Egressy B Copyright (c) 1999 Gabor Egressy. All rights reserved. All wrongs reversed. This program is free software; you can redistribute and/or modify it under the same terms as B itself. =cut