Perl/FAQ/Поиск по шаблону
Материал из Wiki.crossplatform.ru
Версия от 10:05, 3 декабря 2008; Root (Обсуждение | вклад)
Perl · |
[править] Introduction
#----------------------------- match( $string, $pattern ); subst( $string, $pattern, $replacement ); #----------------------------- $meadow =~ m/sheep/; # True if $meadow contains "sheep" $meadow !~ m/sheep/; # True if $meadow doesn't contain "sheep" $meadow =~ s/old/new/; # Replace "old" with "new" in $meadow #----------------------------- # Fine bovines demand fine toreadors. # Muskoxen are a polar ovibovine species. # Grooviness went out of fashion decades ago. #----------------------------- # Ovines are found typically in oviaries. #----------------------------- if ($meadow =~ /\bovines?\b/i) { print "Here be sheep!" } #----------------------------- $string = "good food"; $string =~ s/o*/e/; #----------------------------- # good food # # geod food # # geed food # # geed feed # # ged food # # ged fed # # egood food #----------------------------- #% echo ababacaca | perl -ne 'print "$&\n" if /(a|ba|b)+(a|ac)+/' #ababa #----------------------------- #% echo ababacaca | # awk 'match($0,/(a|ba|b)+(a|ac)+/) { print substr($0, RSTART, RLENGTH) }' #ababacaca #----------------------------- while (m/(\d+)/g) { print "Found number $1\n"; } #----------------------------- @numbers = m/(\d+)/g; #----------------------------- $digits = "123456789"; @nonlap = $digits =~ /(\d\d\d)/g; @yeslap = $digits =~ /(?=(\d\d\d))/g; print "Non-overlapping: @nonlap\n"; print "Overlapping: @yeslap\n"; # Non-overlapping: 123 456 789 # Overlapping: 123 234 345 456 567 678 789 #----------------------------- $string = "And little lambs eat ivy"; $string =~ /l[^s]*s/; print "($`) ($&) ($')\n"; # (And ) (little lambs) ( eat ivy) #-----------------------------
[править] Copying and Substituting Simultaneously
#----------------------------- $dst = $src; $dst =~ s/this/that/; #----------------------------- ($dst = $src) =~ s/this/that/; #----------------------------- # strip to basename ($progname = $0) =~ s!^.*/!!; # Make All Words Title-Cased ($capword = $word) =~ s/(\w+)/\u\L$1/g; # /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1 ($catpage = $manpage) =~ s/man(?=\d)/cat/; #----------------------------- @bindirs = qw( /usr/bin /bin /usr/local/bin ); for (@libdirs = @bindirs) { s/bin/lib/ } print "@libdirs\n"; # /usr/lib /lib /usr/local/lib #----------------------------- ($a = $b) =~ s/x/y/g; # copy $b and then change $a $a = ($b =~ s/x/y/g); # change $b, count goes in $a #-----------------------------
[править] Matching Letters
#----------------------------- if ($var =~ /^[A-Za-z]+$/) { # it is purely alphabetic } #----------------------------- use locale; if ($var =~ /^[^\W\d_]+$/) { print "var is purely alphabetic\n"; } #----------------------------- use locale; use POSIX 'locale_h'; # the following locale string might be different on your system unless (setlocale(LC_ALL, "fr_CA.ISO8859-1")) { die "couldn't set locale to French Canadian\n"; } while (<DATA>) { chomp; if (/^[^\W\d_]+$/) { print "$_: alphabetic\n"; } else { print "$_: line noise\n"; } } #__END__ #silly #faзade #coцperate #niсo #Renйe #Moliиre #hжmoglobin #naпve #tschьЯ #random!stuff#here #-----------------------------
[править] Matching Words
#----------------------------- #/\S+/ # as many non-whitespace bytes as possible #/[A-Za-z'-]+/ # as many letters, apostrophes, and hyphens #----------------------------- #/\b([A-Za-z]+)\b/ # usually best #/\s([A-Za-z]+)\s/ # fails at ends or w/ punctuation #-----------------------------
[править] Commenting Regular Expressions
#----------------------------- # download the following standalone program #!/usr/bin/perl -p # resname - change all "foo.bar.com" style names in the input stream # into "foo.bar.com [204.148.40.9]" (or whatever) instead use Socket; # load inet_addr s{ # ( # capture the hostname in $1 (?: # these parens for grouping only (?! [-_] ) # lookahead for neither underscore nor dash [\w-] + # hostname component \. # and the domain dot ) + # now repeat that whole thing a bunch of times [A-Za-z] # next must be a letter [\w-] + # now trailing domain part ) # end of $1 capture }{ # replace with this: "$1 " . # the original bit, plus a space ( ($addr = gethostbyname($1)) # if we get an addr ? "[" . inet_ntoa($addr) . "]" # format it : "[???]" # else mark dubious ) }gex; # /g for global # /e for execute # /x for nice formatting #----------------------------- s/ # replace \# # a pound sign (\w+) # the variable name \# # another pound sign /${$1}/xg; # with the value of the global variable ##----------------------------- s/ # replace \# # a pound sign (\w+) # the variable name \# # another pound sign /'$' . $1/xeeg; # ' with the value of *any* variable #-----------------------------
[править] Finding the Nth Occurrence of a Match
#----------------------------- # One fish two fish red fish blue fish #----------------------------- $WANT = 3; $count = 0; while (/(\w+)\s+fish\b/gi) { if (++$count == $WANT) { print "The third fish is a $1 one.\n"; # Warning: don't `last' out of this loop } } # The third fish is a red one. #----------------------------- /(?:\w+\s+fish\s+){2}(\w+)\s+fish/i; #----------------------------- # simple way with while loop $count = 0; while ($string =~ /PAT/g) { $count++; # or whatever you'd like to do here } # same thing with trailing while $count = 0; $count++ while $string =~ /PAT/g; # or with for loop for ($count = 0; $string =~ /PAT/g; $count++) { } # Similar, but this time count overlapping matches $count++ while $string =~ /(?=PAT)/g; #----------------------------- $pond = 'One fish two fish red fish blue fish'; # using a temporary @colors = ($pond =~ /(\w+)\s+fish\b/gi); # get all matches $color = $colors[2]; # then the one we want # or without a temporary array $color = ( $pond =~ /(\w+)\s+fish\b/gi )[2]; # just grab element 3 print "The third fish in the pond is $color.\n"; # The third fish in the pond is red. #----------------------------- $count = 0; $_ = 'One fish two fish red fish blue fish'; @evens = grep { $count++ % 2 == 1 } /(\w+)\s+fish\b/gi; print "Even numbered fish are @evens.\n"; # Even numbered fish are two blue. #----------------------------- $count = 0; s{ \b # makes next \w more efficient ( \w+ ) # this is what we'll be changing ( \s+ fish \b ) }{ if (++$count == 4) { "sushi" . $2; } else { $1 . $2; } }gex; # One fish two fish red fish sushi fish #----------------------------- $pond = 'One fish two fish red fish blue fish swim here.'; $color = ( $pond =~ /\b(\w+)\s+fish\b/gi )[-1]; print "Last fish is $color.\n"; # Last fish is blue. #----------------------------- m{ A # find some pattern A (?! # mustn't be able to find .* # something A # and A ) $ # through the end of the string }x #----------------------------- $pond = 'One fish two fish red fish blue fish swim here.'; if ($pond =~ m{ \b ( \w+) \s+ fish \b (?! .* \b fish \b ) }six ) { print "Last fish is $1.\n"; } else { print "Failed!\n"; } # Last fish is blue. #-----------------------------
[править] Matching Multiple Lines
#----------------------------- # download the following standalone program #!/usr/bin/perl # killtags - very bad html tag killer undef $/; # each read is whole file while (<>) { # get one whole file at a time s/<.*?>//gs; # strip tags (terribly) print; # print file to STDOUT } #----------------------------- # download the following standalone program #!/usr/bin/perl # headerfy: change certain chapter headers to html $/ = ''; while ( <> ) { # fetch a paragraph s{ \A # start of record ( # capture in $1 Chapter # text string \s+ # mandatory whitespace \d+ # decimal number \s* # optional whitespace : # a real colon . * # anything not a newline till end of line ) }{<H1>$1</H1>}gx; print; } #----------------------------- #% perl -00pe 's{\A(Chapter\s+\d+\s*:.*)}{<H1>$1</H1>}gx' datafile #----------------------------- $/ = ''; # paragraph read mode for readline access while (<ARGV>) { while (m#^START(.*?)^END#sm) { # /s makes . span line boundaries # /m makes ^ match near newlines print "chunk $. in $ARGV has <<$1>>\n"; } } #-----------------------------
[править] Reading Records with a Pattern Separator
#----------------------------- undef $/; @chunks = split(/pattern/, <FILEHANDLE>); #----------------------------- # .Ch, .Se and .Ss divide chunks of STDIN { local $/ = undef; @chunks = split(/^\.(Ch|Se|Ss)$/m, <>); } print "I read ", scalar(@chunks), " chunks.\n"; #-----------------------------
[править] Extracting a Range of Lines
#----------------------------- while (<>) { if (/BEGIN PATTERN/ .. /END PATTERN/) { # line falls between BEGIN and END in the # text, inclusive. } } while (<>) { if ($FIRST_LINE_NUM .. $LAST_LINE_NUM) { # operate only between first and last line, inclusive. } } #----------------------------- while (<>) { if (/BEGIN PATTERN/ ... /END PATTERN/) { # line is between BEGIN and END on different lines } } while (<>) { if ($FIRST_LINE_NUM ... $LAST_LINE_NUM) { # operate only between first and last line, but not same } } #----------------------------- # command-line to print lines 15 through 17 inclusive (see below) perl -ne 'print if 15 .. 17' datafile # print out all <XMP> .. </XMP> displays from HTML doc while (<>) { print if m#<XMP>#i .. m#</XMP>#i; } # same, but as shell command # perl -ne 'print if m#<XMP>#i .. m#</XMP>#i' document.html #----------------------------- # perl -ne 'BEGIN { $top=3; $bottom=5 } print if $top .. $bottom' /etc/passwd # previous command FAILS # perl -ne 'BEGIN { $top=3; $bottom=5 } \ # print if $. == $top .. $. == $bottom' /etc/passwd # works # perl -ne 'print if 3 .. 5' /etc/passwd # also works #----------------------------- print if /begin/ .. /end/; print if /begin/ ... /end/; #----------------------------- while (<>) { $in_header = 1 .. /^$/; $in_body = /^$/ .. eof(); } #----------------------------- %seen = (); while (<>) { next unless /^From:?\s/i .. /^$/; while (/([^<>(),;\s]+\@[^<>(),;\s]+)/g) { print "$1\n" unless $seen{$1}++; } } #-----------------------------
[править] Matching Shell Globs as Regular Expressions
#----------------------------- sub glob2pat { my $globstr = shift; my %patmap = ( '*' => '.*', '?' => '.', '[' => '[', ']' => ']', ); $globstr =~ s{(.)} { $patmap{$1} || "\Q$1" }ge; return '^' . $globstr . '$'; #' } #-----------------------------
[править] Speeding Up Interpolated Matches
#----------------------------- while ($line = <>) { if ($line =~ /$pattern/o) { # do something } } #----------------------------- # download the following standalone program #!/usr/bin/perl # popgrep1 - grep for abbreviations of places that say "pop" # version 1: slow but obvious way @popstates = qw(CO ON MI WI MN); LINE: while (defined($line = <>)) { for $state (@popstates) { if ($line =~ /\b$state\b/) { print; next LINE; } } } #----------------------------- # download the following standalone program #!/usr/bin/perl # popgrep2 - grep for abbreviations of places that say "pop" # version 2: eval strings; fast but hard to quote @popstates = qw(CO ON MI WI MN); $code = 'while (defined($line = <>)) {'; for $state (@popstates) { $code .= "\tif (\$line =~ /\\b$state\\b/) { print \$line; next; }\n"; } $code .= '}'; print "CODE IS\n----\n$code\n----\n" if 0; # turn on to debug eval $code; die if $@; #----------------------------- while (defined($line = <>)) { if ($line =~ /\bCO\b/) { print $line; next; } if ($line =~ /\bON\b/) { print $line; next; } if ($line =~ /\bMI\b/) { print $line; next; } if ($line =~ /\bWI\b/) { print $line; next; } if ($line =~ /\bMN\b/) { print $line; next; } } #----------------------------- # download the following standalone program #!/usr/bin/perl # popgrep3 - grep for abbreviations of places that say "pop" # version 3: use build_match_func algorithm @popstates = qw(CO ON MI WI MN); $expr = join('||', map { "m/\\b\$popstates[$_]\\b/o" } 0..$#popstates); $match_any = eval "sub { $expr }"; die if $@; while (<>) { print if &$match_any; } #----------------------------- sub { m/\b$popstates[0]\b/o || m/\b$popstates[1]\b/o || m/\b$popstates[2]\b/o || m/\b$popstates[3]\b/o || m/\b$popstates[4]\b/o } #----------------------------- # download the following standalone program #!/usr/bin/perl # grepauth - print lines that mention both Tom and Nat $multimatch = build_match_all(q/Tom/, q/Nat/); while (<>) { print if &$multimatch; } exit; sub build_match_any { build_match_func('||', @_) } sub build_match_all { build_match_func('&&', @_) } sub build_match_func { my $condition = shift; my @pattern = @_; # must be lexical variable, not dynamic one my $expr = join $condition => map { "m/\$pattern[$_]/o" } (0..$#pattern); my $match_func = eval "sub { local \$_ = shift if \@_; $expr }"; die if $@; # propagate $@; this shouldn't happen! return $match_func; } #----------------------------- # download the following standalone program #!/usr/bin/perl # popgrep4 - grep for abbreviations of places that say "pop" # version 4: use Regexp module use Regexp; @popstates = qw(CO ON MI WI MN); @poppats = map { Regexp->new( '\b' . $_ . '\b') } @popstates; while (defined($line = <>)) { for $patobj (@poppats) { print $line if $patobj->match($line); } } #-----------------------------
[править] Testing for a Valid Pattern
#----------------------------- do { print "Pattern? "; chomp($pat = <>); eval { "" =~ /$pat/ }; warn "INVALID PATTERN $@" if $@; } while $@; #----------------------------- sub is_valid_pattern { my $pat = shift; return eval { "" =~ /$pat/; 1 } || 0; } #----------------------------- # download the following standalone program #!/usr/bin/perl # paragrep - trivial paragraph grepper die "usage: $0 pat [files]\n" unless @ARGV; $/ = ''; $pat = shift; eval { "" =~ /$pat/; 1 } or die "$0: Bad pattern $pat: $@\n"; while (<>) { print "$ARGV $.: $_" if /$pat/o; } #----------------------------- $pat = "You lose @{[ system('rm -rf *')]} big here"; #----------------------------- $safe_pat = quotemeta($pat); something() if /$safe_pat/; #----------------------------- something() if /\Q$pat/; #-----------------------------
[править] Honoring Locale Settings in Regular Expressions
#----------------------------- use locale; #----------------------------- # download the following standalone program #!/usr/bin/perl -w # localeg - demonstrate locale effects use locale; use POSIX 'locale_h'; $name = "andreas k\xF6nig"; @locale{qw(German English)} = qw(de_DE.ISO_8859-1 us-ascii); setlocale(LC_CTYPE, $locale{English}) or die "Invalid locale $locale{English}"; @english_names = (); while ($name =~ /\b(\w+)\b/g) { push(@english_names, ucfirst($1)); } setlocale(LC_CTYPE, $locale{German}) or die "Invalid locale $locale{German}"; @german_names = (); while ($name =~ /\b(\w+)\b/g) { push(@german_names, ucfirst($1)); } print "English names: @english_names\n"; print "German names: @german_names\n"; English names: Andreas K Nig German names: Andreas Kцnig #-----------------------------
[править] Approximate Matching
#----------------------------- use String::Approx qw(amatch); if (amatch("PATTERN", @list)) { # matched } @matches = amatch("PATTERN", @list); #----------------------------- use String::Approx qw(amatch); open(DICT, "/usr/dict/words") or die "Can't open dict: $!"; while(<DICT>) { print if amatch("balast"); } ballast balustrade blast blastula sandblast #-----------------------------
[править] Matching from Where the Last Pattern Left Off
#----------------------------- while (/(\d+)/g) { print "Found $1\n"; } #----------------------------- $n = " 49 here"; $n =~ s/\G /0/g; print $n; 00049 here #----------------------------- while (/\G,?(\d+)/g) { print "Found number $1\n"; } #----------------------------- $_ = "The year 1752 lost 10 days on the 3rd of September"; while (/(\d+)/gc) { print "Found number $1\n"; } if (/\G(\S+)/g) { print "Found $1 after the last number.\n"; } #Found number 1752 # #Found number 10 # #Found number 3 # #Found rd after the last number. #----------------------------- print "The position in \$a is ", pos($a); pos($a) = 30; print "The position in \$_ is ", pos; pos = 30; #-----------------------------
[править] Greedy and Non-Greedy Matches
#----------------------------- # greedy pattern s/<.*>//gs; # try to remove tags, very badly # non-greedy pattern s/<.*?>//gs; # try to remove tags, still rather badly #----------------------------- #<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b> #----------------------------- m{ <b><i>(.*?)</i></b> }sx #----------------------------- /BEGIN((?:(?!BEGIN).)*)END/ #----------------------------- m{ <b><i>( (?: (?!</b>|</i>). )* ) </i></b> }sx #----------------------------- m{ <b><i>( (?: (?!</[ib]>). )* ) </i></b> }sx #----------------------------- m{ <b><i> [^<]* # stuff not possibly bad, and not possibly the end. (?: # at this point, we can have '<' if not part of something bad (?! </?[ib]> ) # what we can't have < # okay, so match the '<' [^<]* # and continue with more safe stuff ) * </i></b> }sx #-----------------------------
[править] Detecting Duplicate Words
#----------------------------- $/ = ''; # paragrep mode while (<>) { while ( m{ \b # start at a word boundary (begin letters) (\S+) # find chunk of non-whitespace \b # until another word boundary (end letters) ( \s+ # separated by some whitespace \1 # and that very same chunk again \b # until another word boundary ) + # one or more sets of those }xig ) { print "dup word '$1' at paragraph $.\n"; } } #----------------------------- This is a test test of the duplicate word finder. #----------------------------- $a = 'nobody'; $b = 'bodysnatcher'; if ("$a $b" =~ /^(\w+)(\w+) \2(\w+)$/) { print "$2 overlaps in $1-$2-$3\n"; } body overlaps in no-body-snatcher #----------------------------- /^(\w+?)(\w+) \2(\w+)$/, #----------------------------- # download the following standalone program #!/usr/bin/perl # prime_pattern -- find prime factors of argument using pattern matching for ($N = ('o' x shift); $N =~ /^(oo+?)\1+$/; $N =~ s/$1/o/g) { print length($1), " "; } print length ($N), "\n"; #----------------------------- # solve for 12x + 15y + 16z = 281, maximizing x if (($X, $Y, $Z) = (('o' x 281) =~ /^(o*)\1{11}(o*)\2{14}(o*)\3{15}$/)) { ($x, $y, $z) = (length($X), length($Y), length($Z)); print "One solution is: x=$x; y=$y; z=$z.\n"; } else { print "No solution.\n"; } #One solution is: x=17; y=3; z=2. #----------------------------- ('o' x 281) =~ /^(o+)\1{11}(o+)\2{14}(o+)\3{15}$/; #One solution is: x=17; y=3; z=2 ('o' x 281) =~ /^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$/; #One solution is: x=0; y=7; z=11. ('o' x 281) =~ /^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$/; #One solution is: x=1; y=3; z=14. #-----------------------------
[править] Expressing AND, OR, and NOT in a Single Pattern
#----------------------------- chomp($pattern = <CONFIG_FH>); if ( $data =~ /$pattern/ ) { ..... } #----------------------------- /ALPHA|BETA/; #----------------------------- /^(?=.*ALPHA)(?=.*BETA)/s; #----------------------------- /ALPHA.*BETA|BETA.*ALPHA/s; #----------------------------- /^(?:(?!PAT).)*$/s; #----------------------------- /(?=^(?:(?!BAD).)*$)GOOD/s; #----------------------------- if (!($string =~ /pattern/)) { something() } # ugly if ( $string !~ /pattern/) { something() } # preferred #----------------------------- if ($string =~ /pat1/ && $string =~ /pat2/ ) { something () } #----------------------------- if ($string =~ /pat1/ || $string =~ /pat2/ ) { something () } #----------------------------- # download the following standalone program #!/usr/bin/perl # minigrep - trivial grep $pat = shift; while (<>) { print if /$pat/o; } #----------------------------- "labelled" =~ /^(?=.*bell)(?=.*lab)/s #----------------------------- $string =~ /bell/ && $string =~ /lab/ #----------------------------- if ($murray_hill =~ m{ ^ # start of string (?= # zero-width lookahead .* # any amount of intervening stuff bell # the desired bell string ) # rewind, since we were only looking (?= # and do the same thing .* # any amount of intervening stuff lab # and the lab part ) }sx ) # /s means . can match newline { print "Looks like Bell Labs might be in Murray Hill!\n"; } #----------------------------- "labelled" =~ /(?:^.*bell.*lab)|(?:^.*lab.*bell)/ #----------------------------- $brand = "labelled"; if ($brand =~ m{ (?: # non-capturing grouper ^ .*? # any amount of stuff at the front bell # look for a bell .*? # followed by any amount of anything lab # look for a lab ) # end grouper | # otherwise, try the other direction (?: # non-capturing grouper ^ .*? # any amount of stuff at the front lab # look for a lab .*? # followed by any amount of anything bell # followed by a bell ) # end grouper }sx ) # /s means . can match newline { print "Our brand has bell and lab separate.\n"; } #----------------------------- $map =~ /^(?:(?!waldo).)*$/s #----------------------------- if ($map =~ m{ ^ # start of string (?: # non-capturing grouper (?! # look ahead negation waldo # is he ahead of us now? ) # is so, the negation failed . # any character (cuzza /s) ) * # repeat that grouping 0 or more $ # through the end of the string }sx ) # /s means . can match newline { print "There's no waldo here!\n"; } #----------------------------- 7:15am up 206 days, 13:30, 4 users, load average: 1.04, 1.07, 1.04 USER TTY FROM LOGIN@ IDLE JCPU PCPU WHAT tchrist tty1 5:16pm 36days 24:43 0.03s xinit tchrist tty2 5:19pm 6days 0.43s 0.43s -tcsh tchrist ttyp0 chthon 7:58am 3days 23.44s 0.44s -tcsh gnat ttyS4 coprolith 2:01pm 13:36m 0.30s 0.30s -tcsh #----------------------------- #% w | minigrep '^(?!.*ttyp).*tchrist' #----------------------------- m{ ^ # anchored to the start (?! # zero-width look-ahead assertion .* # any amount of anything (faster than .*?) ttyp # the string you don't want to find ) # end look-ahead negation; rewind to start .* # any amount of anything (faster than .*?) tchrist # now try to find Tom }x #----------------------------- #% w | grep tchrist | grep -v ttyp #----------------------------- #% grep -i 'pattern' files #% minigrep '(?i)pattern' files #-----------------------------
[править] Matching Multiple-Byte Characters
#----------------------------- my $eucjp = q{ # EUC-JP encoding subcomponents: [\x00-\x7F] # ASCII/JIS-Roman (one-byte/character) | \x8E[\xA0-\xDF] # half-width katakana (two bytes/char) | \x8F[\xA1-\xFE][\xA1-\xFE] # JIS X 0212-1990 (three bytes/char) | [\xA1-\xFE][\xA1-\xFE] # JIS X 0208:1997 (two bytes/char) }; #----------------------------- /^ (?: $eucjp )*? \xC5\xEC\xB5\xFE/ox # Trying to find Tokyo #----------------------------- /^ ( (?:eucjp)*? ) $Tokyo/$1$Osaka/ox #----------------------------- /\G ( (?:eucjp)*? ) $Tokyo/$1$Osaka/gox #----------------------------- @chars = /$eucjp/gox; # One character per list element #----------------------------- while (<>) { my @chars = /$eucjp/gox; # One character per list element for my $char (@chars) { if (length($char) == 1) { # Do something interesting with this one-byte character } else { # Do something interesting with this multiple-byte character } } my $line = join("",@chars); # Glue list back together print $line; } #----------------------------- $is_eucjp = m/^(?:$eucjp)*$/xo; #----------------------------- $is_eucjp = m/^(?:$eucjp)*$/xo; $is_sjis = m/^(?:$sjis)*$/xo; #----------------------------- while (<>) { my @chars = /$eucjp/gox; # One character per list element for my $euc (@chars) { my $uni = $euc2uni{$char}; if (defined $uni) { $euc = $uni; } else { ## deal with unknown EUC->Unicode mapping here. } } my $line = join("",@chars); print $line; } #-----------------------------
[править] Matching a Valid Mail Address
#----------------------------- 1 while $addr =~ s/\([^()]*\)//g; #----------------------------- Dear someuser@host.com, Please confirm the mail address you gave us Wed May 6 09:38:41 MDT 1998 by replying to this message. Include the string "Rumpelstiltskin" in that reply, but spelled in reverse; that is, start with "Nik...". Once this is done, your confirmed address will be entered into our records. #-----------------------------
[править] Matching Abbreviations
#----------------------------- chomp($answer = <>); if ("SEND" =~ /^\Q$answer/i) { print "Action is send\n" } elsif ("STOP" =~ /^\Q$answer/i) { print "Action is stop\n" } elsif ("ABORT" =~ /^\Q$answer/i) { print "Action is abort\n" } elsif ("LIST" =~ /^\Q$answer/i) { print "Action is list\n" } elsif ("EDIT" =~ /^\Q$answer/i) { print "Action is edit\n" } #----------------------------- use Text::Abbrev; $href = abbrev qw(send abort list edit); for (print "Action: "; <>; print "Action: ") { chomp; my $action = $href->{ lc($_) }; print "Action is $action\n"; } #----------------------------- $name = 'send'; &$name(); #----------------------------- # assumes that &invoke_editor, &deliver_message, # $file and $PAGER are defined somewhere else. use Text::Abbrev; my($href, %actions, $errors); %actions = ( "edit" => \&invoke_editor, "send" => \&deliver_message, "list" => sub { system($PAGER, $file) }, "abort" => sub { print "See ya!\n"; exit; }, "" => sub { print "Unknown command: $cmd\n"; $errors++; }, ); $href = abbrev(keys %actions); local $_; for (print "Action: "; <>; print "Action: ") { s/^\s+//; # trim leading white space s/\s+$//; # trim trailing white space next unless $_; $actions->{ $href->{ lc($_) } }->(); } #----------------------------- $abbreviation = lc($_); $expansion = $href->{$abbreviation}; $coderef = $actions->{$expansion}; &$coderef(); #-----------------------------
[править] Program: urlify
#----------------------------- #% gunzip -c ~/mail/archive.gz | urlify > archive.urlified #----------------------------- #% urlify ~/mail/*.inbox > ~/allmail.urlified #----------------------------- # download the following standalone program #!/usr/bin/perl # urlify - wrap HTML links around URL-like constructs $urls = '(http|telnet|gopher|file|wais|ftp)'; $ltrs = '\w'; $gunk = '/#~:.?+=&%@!\-'; $punc = '.:?\-'; $any = "${ltrs}${gunk}${punc}"; while (<>) { s{ \b # start at word boundary ( # begin $1 { $urls : # need resource and a colon [$any] +? # followed by on or more # of any valid character, but # be conservative and take only # what you need to.... ) # end $1 } (?= # look-ahead non-consumptive assertion [$punc]* # either 0 or more punctuation [^$any] # followed by a non-url char | # or else $ # then end of the string ) }{<A HREF="$1">$1</A>}igox; print; } #-----------------------------
[править] Program: tcgrep
#----------------------------- #% tcgrep -ril '^From: .*kate' ~/mail #----------------------------- # download the following standalone program #!/usr/bin/perl -w # tcgrep: tom christiansen's rewrite of grep # v1.0: Thu Sep 30 16:24:43 MDT 1993 # v1.1: Fri Oct 1 08:33:43 MDT 1993 # v1.2: Fri Jul 26 13:37:02 CDT 1996 # v1.3: Sat Aug 30 14:21:47 CDT 1997 # v1.4: Mon May 18 16:17:48 EDT 1998 use strict; # globals use vars qw($Me $Errors $Grand_Total $Mult %Compress $Matches); my ($matcher, $opt); # matcher - anon. sub to check for matches # opt - ref to hash w/ command line options init(); # initialize globals ($opt, $matcher) = parse_args(); # get command line options and patterns matchfile($opt, $matcher, @ARGV); # process files exit(2) if $Errors; exit(0) if $Grand_Total; exit(1); ################################### sub init { ($Me = $0) =~ s!.*/!!; # get basename of program, "tcgrep" $Errors = $Grand_Total = 0; # initialize global counters $Mult = ""; # flag for multiple files in @ARGV $| = 1; # autoflush output %Compress = ( # file extensions and program names z => 'gzcat', # for uncompressing gz => 'gzcat', Z => 'zcat', ); } ################################### sub usage { die <<EOF usage: $Me [flags] [files] Standard grep options: i case insensitive n number lines c give count of lines matching C ditto, but >1 match per line possible w word boundaries only s silent mode x exact matches only v invert search sense (lines that DON'T match) h hide filenames e expression (for exprs beginning with -) f file with expressions l list filenames matching Specials: 1 1 match per file H highlight matches u underline matches r recursive on directories or dot if none t process directories in 'ls -t' order p paragraph mode (default: line mode) P ditto, but specify separator, e.g. -P '%%\\n' a all files, not just plain text files q quiet about failed file and dir opens T trace files as opened May use a TCGREP environment variable to set default options. EOF } ################################### sub parse_args { use Getopt::Std; my ($optstring, $zeros, $nulls, %opt, $pattern, @patterns, $match_code); my ($SO, $SE); if ($_ = $ENV{TCGREP}) { # get envariable TCGREP s/^([^\-])/-$1/; # add leading - if missing unshift(@ARGV, $_); # add TCGREP opt string to @ARGV } $optstring = "incCwsxvhe:f:l1HurtpP:aqT"; $zeros = 'inCwxvhelut'; # options to init to 0 (prevent warnings) $nulls = 'pP'; # options to init to "" (prevent warnings) @opt{ split //, $zeros } = ( 0 ) x length($zeros); @opt{ split //, $nulls } = ( '' ) x length($nulls); getopts($optstring, \%opt) or usage(); if ($opt{f}) { # -f patfile open(PATFILE, $opt{f}) or die qq($Me: Can't open '$opt{f}': $!); # make sure each pattern in file is valid while ( defined($pattern = <PATFILE>) ) { chomp $pattern; eval { 'foo' =~ /$pattern/, 1 } or die "$Me: $opt{f}:$.: bad pattern: $@"; push @patterns, $pattern; } close PATFILE; } else { # make sure pattern is valid $pattern = $opt{e} || shift(@ARGV) || usage(); eval { 'foo' =~ /$pattern/, 1 } or die "$Me: bad pattern: $@"; @patterns = ($pattern); } if ($opt{H} || $opt{u}) { # highlight or underline my $term = $ENV{TERM} || 'vt100'; my $terminal; eval { # try to look up escapes for stand-out require POSIX; # or underline via Term::Cap use Term::Cap; my $termios = POSIX::Termios->new(); $termios->getattr; my $ospeed = $termios->getospeed; $terminal = Tgetent Term::Cap { TERM=>undef, OSPEED=>$ospeed } }; unless ($@) { # if successful, get escapes for either local $^W = 0; # stand-out (-H) or underlined (-u) ($SO, $SE) = $opt{H} ? ($terminal->Tputs('so'), $terminal->Tputs('se')) : ($terminal->Tputs('us'), $terminal->Tputs('ue')); } else { # if use of Term::Cap fails, ($SO, $SE) = $opt{H} # use tput command to get escapes ? (`tput -T $term smso`, `tput -T $term rmso`) : (`tput -T $term smul`, `tput -T $term rmul`) } } if ($opt{i}) { @patterns = map {"(?i)$_"} @patterns; } if ($opt{p} || $opt{P}) { @patterns = map {"(?m)$_"} @patterns; } $opt{p} && ($/ = ''); $opt{P} && ($/ = eval(qq("$opt{P}"))); # for -P '%%\n' $opt{w} && (@patterns = map {'\b' . $_ . '\b'} @patterns); $opt{'x'} && (@patterns = map {"^$_\$"} @patterns); if (@ARGV) { $Mult = 1 if ($opt{r} || (@ARGV > 1) || -d $ARGV[0]) && !$opt{h}; } $opt{1} += $opt{l}; # that's a one and an ell $opt{H} += $opt{u}; $opt{c} += $opt{C}; $opt{'s'} += $opt{c}; $opt{1} += $opt{'s'} && !$opt{c}; # that's a one @ARGV = ($opt{r} ? '.' : '-') unless @ARGV; $opt{r} = 1 if !$opt{r} && grep(-d, @ARGV) == @ARGV; $match_code = ''; $match_code .= 'study;' if @patterns > 5; # might speed things up a bit foreach (@patterns) { s(/)(\\/)g } if ($opt{H}) { foreach $pattern (@patterns) { $match_code .= "\$Matches += s/($pattern)/${SO}\$1${SE}/g;"; } } elsif ($opt{v}) { foreach $pattern (@patterns) { $match_code .= "\$Matches += !/$pattern/;"; } } elsif ($opt{C}) { foreach $pattern (@patterns) { $match_code .= "\$Matches++ while /$pattern/g;"; } } else { foreach $pattern (@patterns) { $match_code .= "\$Matches++ if /$pattern/;"; } } $matcher = eval "sub { $match_code }"; die if $@; return (\%opt, $matcher); } ################################### sub matchfile { $opt = shift; # reference to option hash $matcher = shift; # reference to matching sub my ($file, @list, $total, $name); local($_); $total = 0; FILE: while (defined ($file = shift(@_))) { if (-d $file) { if (-l $file && @ARGV != 1) { warn "$Me: \"$file\" is a symlink to a directory\n" if $opt->{T}; next FILE; } if (!$opt->{r}) { warn "$Me: \"$file\" is a directory, but no -r given\n" if $opt->{T}; next FILE; } unless (opendir(DIR, $file)) { unless ($opt->{'q'}) { warn "$Me: can't opendir $file: $!\n"; $Errors++; } next FILE; } @list = (); for (readdir(DIR)) { push(@list, "$file/$_") unless /^\.{1,2}$/; } closedir(DIR); if ($opt->{t}) { my (@dates); for (@list) { push(@dates, -M) } @list = @list[sort { $dates[$a] <=> $dates[$b] } 0..$#dates]; } else { @list = sort @list; } matchfile($opt, $matcher, @list); # process files next FILE; } if ($file eq '-') { warn "$Me: reading from stdin\n" if -t STDIN && !$opt->{'q'}; $name = '<STDIN>'; } else { $name = $file; unless (-e $file) { warn qq($Me: file "$file" does not exist\n) unless $opt->{'q'}; $Errors++; next FILE; } unless (-f $file || $opt->{a}) { warn qq($Me: skipping non-plain file "$file"\n) if $opt->{T}; next FILE; } my ($ext) = $file =~ /\.([^.]+)$/; if (defined $ext && exists $Compress{$ext}) { $file = "$Compress{$ext} <$file |"; } elsif (! (-T $file || $opt->{a})) { warn qq($Me: skipping binary file "$file"\n) if $opt->{T}; next FILE; } } warn "$Me: checking $file\n" if $opt->{T}; unless (open(FILE, $file)) { unless ($opt->{'q'}) { warn "$Me: $file: $!\n"; $Errors++; } next FILE; } $total = 0; $Matches = 0; LINE: while (<FILE>) { $Matches = 0; ############## &{$matcher}(); # do it! (check for matches) ############## next LINE unless $Matches; $total += $Matches; if ($opt->{p} || $opt->{P}) { s/\n{2,}$/\n/ if $opt->{p}; chomp if $opt->{P}; } print("$name\n"), next FILE if $opt->{l}; $opt->{'s'} || print $Mult && "$name:", $opt->{n} ? "$.:" : "", $_, ($opt->{p} || $opt->{P}) && ('-' x 20) . "\n"; next FILE if $opt->{1}; # that's a one } } continue { print $Mult && "$name:", $total, "\n" if $opt->{c}; } $Grand_Total += $total; } #-----------------------------
[править] Regular Expression Grabbag
#----------------------------- m/^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$/i #----------------------------- s/(\S+)(\s+)(\S+)/$3$2$1/ #----------------------------- m/(\w+)\s*=\s*(.*)\s*$/ # keyword is $1, value is $2 #----------------------------- m/.{80,}/ #----------------------------- m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)| #----------------------------- s(/usr/bin)(/usr/local/bin)g #----------------------------- s/%([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge #----------------------------- s{ /\* # Match the opening delimiter .*? # Match a minimal number of characters \*/ # Match the closing delimiter } []gsx; #----------------------------- s/^\s+//; s/\s+$//; #----------------------------- s/\\n/\n/g; #----------------------------- s/^.*::// #----------------------------- m/^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\. ([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$/; #----------------------------- s(^.*/)() #----------------------------- $cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80; #----------------------------- ($name = " $0 @ARGV") =~ s, /\S+/, ,g; #----------------------------- die "This isn't Linux" unless $^O =~ m/linux/i; #----------------------------- s/\n\s+/ /g #----------------------------- @nums = m/(\d+\.?\d*|\.\d+)/g; #----------------------------- @capwords = m/(\b[^\Wa-z0-9_]+\b)/g; #----------------------------- @lowords = m/(\b[^\WA-Z0-9_]+\b)/g; #----------------------------- @icwords = m/(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)/; #----------------------------- @links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>/sig; #"' #----------------------------- ($initial) = m/^\S+\s+(\S)\S*\s+\S/ ? $1 : ""; #----------------------------- s/"([^"]*)"/``$1''/g #" #----------------------------- { local $/ = ""; while (<>) { s/\n/ /g; s/ {3,}/ /g; push @sentences, m/(\S.*?[!?.])(?= |\Z)/g; } } #----------------------------- m/(\d{4})-(\d\d)-(\d\d)/ # YYYY in $1, MM in $2, DD in $3 #----------------------------- m/ ^ (?: 1 \s (?: \d\d\d \s)? # 1, or 1 and area code | # ... or ... \(\d\d\d\) \s # area code with parens | # ... or ... (?: \+\d\d?\d? \s)? # optional +country code \d\d\d ([\s\-]) # and area code ) \d\d\d (\s|\1) # prefix (and area code separator) \d\d\d\d # exchange $ /x #----------------------------- m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i #----------------------------- push(@lines, $1) while ($input =~ s/^([^\012\015]*)(\012\015?|\015\012?)//); #-----------------------------