|
Perl/FAQ/Поиск по шаблону
Материал из Wiki.crossplatform.ru
[править] Introduction
#-----------------------------
match( $string, $pattern );
subst( $string, $pattern, $replacement );
#-----------------------------
$meadow =~ m/sheep/; # True if $meadow contains "sheep"
$meadow !~ m/sheep/; # True if $meadow doesn't contain "sheep"
$meadow =~ s/old/new/; # Replace "old" with "new" in $meadow
#-----------------------------
# Fine bovines demand fine toreadors.
# Muskoxen are a polar ovibovine species.
# Grooviness went out of fashion decades ago.
#-----------------------------
# Ovines are found typically in oviaries.
#-----------------------------
if ($meadow =~ /\bovines?\b/i) { print "Here be sheep!" }
#-----------------------------
$string = "good food";
$string =~ s/o*/e/;
#-----------------------------
# good food
#
# geod food
#
# geed food
#
# geed feed
#
# ged food
#
# ged fed
#
# egood food
#-----------------------------
#% echo ababacaca | perl -ne 'print "$&\n" if /(a|ba|b)+(a|ac)+/'
#ababa
#-----------------------------
#% echo ababacaca |
# awk 'match($0,/(a|ba|b)+(a|ac)+/) { print substr($0, RSTART, RLENGTH) }'
#ababacaca
#-----------------------------
while (m/(\d+)/g) {
print "Found number $1\n";
}
#-----------------------------
@numbers = m/(\d+)/g;
#-----------------------------
$digits = "123456789";
@nonlap = $digits =~ /(\d\d\d)/g;
@yeslap = $digits =~ /(?=(\d\d\d))/g;
print "Non-overlapping: @nonlap\n";
print "Overlapping: @yeslap\n";
# Non-overlapping: 123 456 789
# Overlapping: 123 234 345 456 567 678 789
#-----------------------------
$string = "And little lambs eat ivy";
$string =~ /l[^s]*s/;
print "($`) ($&) ($')\n";
# (And ) (little lambs) ( eat ivy)
#-----------------------------
[править] Copying and Substituting Simultaneously
#-----------------------------
$dst = $src;
$dst =~ s/this/that/;
#-----------------------------
($dst = $src) =~ s/this/that/;
#-----------------------------
# strip to basename
($progname = $0) =~ s!^.*/!!;
# Make All Words Title-Cased
($capword = $word) =~ s/(\w+)/\u\L$1/g;
# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
($catpage = $manpage) =~ s/man(?=\d)/cat/;
#-----------------------------
@bindirs = qw( /usr/bin /bin /usr/local/bin );
for (@libdirs = @bindirs) { s/bin/lib/ }
print "@libdirs\n";
# /usr/lib /lib /usr/local/lib
#-----------------------------
($a = $b) =~ s/x/y/g; # copy $b and then change $a
$a = ($b =~ s/x/y/g); # change $b, count goes in $a
#-----------------------------
[править] Matching Letters
#-----------------------------
if ($var =~ /^[A-Za-z]+$/) {
# it is purely alphabetic
}
#-----------------------------
use locale;
if ($var =~ /^[^\W\d_]+$/) {
print "var is purely alphabetic\n";
}
#-----------------------------
use locale;
use POSIX 'locale_h';
# the following locale string might be different on your system
unless (setlocale(LC_ALL, "fr_CA.ISO8859-1")) {
die "couldn't set locale to French Canadian\n";
}
while (<DATA>) {
chomp;
if (/^[^\W\d_]+$/) {
print "$_: alphabetic\n";
} else {
print "$_: line noise\n";
}
}
#__END__
#silly
#faзade
#coцperate
#niсo
#Renйe
#Moliиre
#hжmoglobin
#naпve
#tschьЯ
#random!stuff#here
#-----------------------------
[править] Matching Words
#-----------------------------
#/\S+/ # as many non-whitespace bytes as possible
#/[A-Za-z'-]+/ # as many letters, apostrophes, and hyphens
#-----------------------------
#/\b([A-Za-z]+)\b/ # usually best
#/\s([A-Za-z]+)\s/ # fails at ends or w/ punctuation
#-----------------------------
[править] Commenting Regular Expressions
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -p
# resname - change all "foo.bar.com" style names in the input stream
# into "foo.bar.com [204.148.40.9]" (or whatever) instead
use Socket; # load inet_addr
s{ #
( # capture the hostname in $1
(?: # these parens for grouping only
(?! [-_] ) # lookahead for neither underscore nor dash
[\w-] + # hostname component
\. # and the domain dot
) + # now repeat that whole thing a bunch of times
[A-Za-z] # next must be a letter
[\w-] + # now trailing domain part
) # end of $1 capture
}{ # replace with this:
"$1 " . # the original bit, plus a space
( ($addr = gethostbyname($1)) # if we get an addr
? "[" . inet_ntoa($addr) . "]" # format it
: "[???]" # else mark dubious
)
}gex; # /g for global
# /e for execute
# /x for nice formatting
#-----------------------------
s/ # replace
\# # a pound sign
(\w+) # the variable name
\# # another pound sign
/${$1}/xg; # with the value of the global variable
##-----------------------------
s/ # replace
\# # a pound sign
(\w+) # the variable name
\# # another pound sign
/'$' . $1/xeeg; # ' with the value of *any* variable
#-----------------------------
[править] Finding the Nth Occurrence of a Match
#-----------------------------
# One fish two fish red fish blue fish
#-----------------------------
$WANT = 3;
$count = 0;
while (/(\w+)\s+fish\b/gi) {
if (++$count == $WANT) {
print "The third fish is a $1 one.\n";
# Warning: don't `last' out of this loop
}
}
# The third fish is a red one.
#-----------------------------
/(?:\w+\s+fish\s+){2}(\w+)\s+fish/i;
#-----------------------------
# simple way with while loop
$count = 0;
while ($string =~ /PAT/g) {
$count++; # or whatever you'd like to do here
}
# same thing with trailing while
$count = 0;
$count++ while $string =~ /PAT/g;
# or with for loop
for ($count = 0; $string =~ /PAT/g; $count++) { }
# Similar, but this time count overlapping matches
$count++ while $string =~ /(?=PAT)/g;
#-----------------------------
$pond = 'One fish two fish red fish blue fish';
# using a temporary
@colors = ($pond =~ /(\w+)\s+fish\b/gi); # get all matches
$color = $colors[2]; # then the one we want
# or without a temporary array
$color = ( $pond =~ /(\w+)\s+fish\b/gi )[2]; # just grab element 3
print "The third fish in the pond is $color.\n";
# The third fish in the pond is red.
#-----------------------------
$count = 0;
$_ = 'One fish two fish red fish blue fish';
@evens = grep { $count++ % 2 == 1 } /(\w+)\s+fish\b/gi;
print "Even numbered fish are @evens.\n";
# Even numbered fish are two blue.
#-----------------------------
$count = 0;
s{
\b # makes next \w more efficient
( \w+ ) # this is what we'll be changing
(
\s+ fish \b
)
}{
if (++$count == 4) {
"sushi" . $2;
} else {
$1 . $2;
}
}gex;
# One fish two fish red fish sushi fish
#-----------------------------
$pond = 'One fish two fish red fish blue fish swim here.';
$color = ( $pond =~ /\b(\w+)\s+fish\b/gi )[-1];
print "Last fish is $color.\n";
# Last fish is blue.
#-----------------------------
m{
A # find some pattern A
(?! # mustn't be able to find
.* # something
A # and A
)
$ # through the end of the string
}x
#-----------------------------
$pond = 'One fish two fish red fish blue fish swim here.';
if ($pond =~ m{
\b ( \w+) \s+ fish \b
(?! .* \b fish \b )
}six )
{
print "Last fish is $1.\n";
} else {
print "Failed!\n";
}
# Last fish is blue.
#-----------------------------
[править] Matching Multiple Lines
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# killtags - very bad html tag killer
undef $/; # each read is whole file
while (<>) { # get one whole file at a time
s/<.*?>//gs; # strip tags (terribly)
print; # print file to STDOUT
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# headerfy: change certain chapter headers to html
$/ = '';
while ( <> ) { # fetch a paragraph
s{
\A # start of record
( # capture in $1
Chapter # text string
\s+ # mandatory whitespace
\d+ # decimal number
\s* # optional whitespace
: # a real colon
. * # anything not a newline till end of line
)
}{<H1>$1</H1>}gx;
print;
}
#-----------------------------
#% perl -00pe 's{\A(Chapter\s+\d+\s*:.*)}{<H1>$1</H1>}gx' datafile
#-----------------------------
$/ = ''; # paragraph read mode for readline access
while (<ARGV>) {
while (m#^START(.*?)^END#sm) { # /s makes . span line boundaries
# /m makes ^ match near newlines
print "chunk $. in $ARGV has <<$1>>\n";
}
}
#-----------------------------
[править] Reading Records with a Pattern Separator
#-----------------------------
undef $/;
@chunks = split(/pattern/, <FILEHANDLE>);
#-----------------------------
# .Ch, .Se and .Ss divide chunks of STDIN
{
local $/ = undef;
@chunks = split(/^\.(Ch|Se|Ss)$/m, <>);
}
print "I read ", scalar(@chunks), " chunks.\n";
#-----------------------------
[править] Extracting a Range of Lines
#-----------------------------
while (<>) {
if (/BEGIN PATTERN/ .. /END PATTERN/) {
# line falls between BEGIN and END in the
# text, inclusive.
}
}
while (<>) {
if ($FIRST_LINE_NUM .. $LAST_LINE_NUM) {
# operate only between first and last line, inclusive.
}
}
#-----------------------------
while (<>) {
if (/BEGIN PATTERN/ ... /END PATTERN/) {
# line is between BEGIN and END on different lines
}
}
while (<>) {
if ($FIRST_LINE_NUM ... $LAST_LINE_NUM) {
# operate only between first and last line, but not same
}
}
#-----------------------------
# command-line to print lines 15 through 17 inclusive (see below)
perl -ne 'print if 15 .. 17' datafile
# print out all <XMP> .. </XMP> displays from HTML doc
while (<>) {
print if m#<XMP>#i .. m#</XMP>#i;
}
# same, but as shell command
# perl -ne 'print if m#<XMP>#i .. m#</XMP>#i' document.html
#-----------------------------
# perl -ne 'BEGIN { $top=3; $bottom=5 } print if $top .. $bottom' /etc/passwd # previous command FAILS
# perl -ne 'BEGIN { $top=3; $bottom=5 } \
# print if $. == $top .. $. == $bottom' /etc/passwd # works
# perl -ne 'print if 3 .. 5' /etc/passwd # also works
#-----------------------------
print if /begin/ .. /end/;
print if /begin/ ... /end/;
#-----------------------------
while (<>) {
$in_header = 1 .. /^$/;
$in_body = /^$/ .. eof();
}
#-----------------------------
%seen = ();
while (<>) {
next unless /^From:?\s/i .. /^$/;
while (/([^<>(),;\s]+\@[^<>(),;\s]+)/g) {
print "$1\n" unless $seen{$1}++;
}
}
#-----------------------------
[править] Matching Shell Globs as Regular Expressions
#-----------------------------
sub glob2pat {
my $globstr = shift;
my %patmap = (
'*' => '.*',
'?' => '.',
'[' => '[',
']' => ']',
);
$globstr =~ s{(.)} { $patmap{$1} || "\Q$1" }ge;
return '^' . $globstr . '$'; #'
}
#-----------------------------
[править] Speeding Up Interpolated Matches
#-----------------------------
while ($line = <>) {
if ($line =~ /$pattern/o) {
# do something
}
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep1 - grep for abbreviations of places that say "pop"
# version 1: slow but obvious way
@popstates = qw(CO ON MI WI MN);
LINE: while (defined($line = <>)) {
for $state (@popstates) {
if ($line =~ /\b$state\b/) {
print; next LINE;
}
}
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep2 - grep for abbreviations of places that say "pop"
# version 2: eval strings; fast but hard to quote
@popstates = qw(CO ON MI WI MN);
$code = 'while (defined($line = <>)) {';
for $state (@popstates) {
$code .= "\tif (\$line =~ /\\b$state\\b/) { print \$line; next; }\n";
}
$code .= '}';
print "CODE IS\n----\n$code\n----\n" if 0; # turn on to debug
eval $code;
die if $@;
#-----------------------------
while (defined($line = <>)) {
if ($line =~ /\bCO\b/) { print $line; next; }
if ($line =~ /\bON\b/) { print $line; next; }
if ($line =~ /\bMI\b/) { print $line; next; }
if ($line =~ /\bWI\b/) { print $line; next; }
if ($line =~ /\bMN\b/) { print $line; next; }
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep3 - grep for abbreviations of places that say "pop"
# version 3: use build_match_func algorithm
@popstates = qw(CO ON MI WI MN);
$expr = join('||', map { "m/\\b\$popstates[$_]\\b/o" } 0..$#popstates);
$match_any = eval "sub { $expr }";
die if $@;
while (<>) {
print if &$match_any;
}
#-----------------------------
sub {
m/\b$popstates[0]\b/o || m/\b$popstates[1]\b/o ||
m/\b$popstates[2]\b/o || m/\b$popstates[3]\b/o ||
m/\b$popstates[4]\b/o
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# grepauth - print lines that mention both Tom and Nat
$multimatch = build_match_all(q/Tom/, q/Nat/);
while (<>) {
print if &$multimatch;
}
exit;
sub build_match_any { build_match_func('||', @_) }
sub build_match_all { build_match_func('&&', @_) }
sub build_match_func {
my $condition = shift;
my @pattern = @_; # must be lexical variable, not dynamic one
my $expr = join $condition => map { "m/\$pattern[$_]/o" } (0..$#pattern);
my $match_func = eval "sub { local \$_ = shift if \@_; $expr }";
die if $@; # propagate $@; this shouldn't happen!
return $match_func;
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# popgrep4 - grep for abbreviations of places that say "pop"
# version 4: use Regexp module
use Regexp;
@popstates = qw(CO ON MI WI MN);
@poppats = map { Regexp->new( '\b' . $_ . '\b') } @popstates;
while (defined($line = <>)) {
for $patobj (@poppats) {
print $line if $patobj->match($line);
}
}
#-----------------------------
[править] Testing for a Valid Pattern
#-----------------------------
do {
print "Pattern? ";
chomp($pat = <>);
eval { "" =~ /$pat/ };
warn "INVALID PATTERN $@" if $@;
} while $@;
#-----------------------------
sub is_valid_pattern {
my $pat = shift;
return eval { "" =~ /$pat/; 1 } || 0;
}
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# paragrep - trivial paragraph grepper
die "usage: $0 pat [files]\n" unless @ARGV;
$/ = '';
$pat = shift;
eval { "" =~ /$pat/; 1 } or die "$0: Bad pattern $pat: $@\n";
while (<>) {
print "$ARGV $.: $_" if /$pat/o;
}
#-----------------------------
$pat = "You lose @{[ system('rm -rf *')]} big here";
#-----------------------------
$safe_pat = quotemeta($pat);
something() if /$safe_pat/;
#-----------------------------
something() if /\Q$pat/;
#-----------------------------
[править] Honoring Locale Settings in Regular Expressions
#-----------------------------
use locale;
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# localeg - demonstrate locale effects
use locale;
use POSIX 'locale_h';
$name = "andreas k\xF6nig";
@locale{qw(German English)} = qw(de_DE.ISO_8859-1 us-ascii);
setlocale(LC_CTYPE, $locale{English})
or die "Invalid locale $locale{English}";
@english_names = ();
while ($name =~ /\b(\w+)\b/g) {
push(@english_names, ucfirst($1));
}
setlocale(LC_CTYPE, $locale{German})
or die "Invalid locale $locale{German}";
@german_names = ();
while ($name =~ /\b(\w+)\b/g) {
push(@german_names, ucfirst($1));
}
print "English names: @english_names\n";
print "German names: @german_names\n";
English names: Andreas K Nig
German names: Andreas Kцnig
#-----------------------------
[править] Approximate Matching
#-----------------------------
use String::Approx qw(amatch);
if (amatch("PATTERN", @list)) {
# matched
}
@matches = amatch("PATTERN", @list);
#-----------------------------
use String::Approx qw(amatch);
open(DICT, "/usr/dict/words") or die "Can't open dict: $!";
while(<DICT>) {
print if amatch("balast");
}
ballast
balustrade
blast
blastula
sandblast
#-----------------------------
[править] Matching from Where the Last Pattern Left Off
#-----------------------------
while (/(\d+)/g) {
print "Found $1\n";
}
#-----------------------------
$n = " 49 here";
$n =~ s/\G /0/g;
print $n;
00049 here
#-----------------------------
while (/\G,?(\d+)/g) {
print "Found number $1\n";
}
#-----------------------------
$_ = "The year 1752 lost 10 days on the 3rd of September";
while (/(\d+)/gc) {
print "Found number $1\n";
}
if (/\G(\S+)/g) {
print "Found $1 after the last number.\n";
}
#Found number 1752
#
#Found number 10
#
#Found number 3
#
#Found rd after the last number.
#-----------------------------
print "The position in \$a is ", pos($a);
pos($a) = 30;
print "The position in \$_ is ", pos;
pos = 30;
#-----------------------------
[править] Greedy and Non-Greedy Matches
#-----------------------------
# greedy pattern
s/<.*>//gs; # try to remove tags, very badly
# non-greedy pattern
s/<.*?>//gs; # try to remove tags, still rather badly
#-----------------------------
#<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>
#-----------------------------
m{ <b><i>(.*?)</i></b> }sx
#-----------------------------
/BEGIN((?:(?!BEGIN).)*)END/
#-----------------------------
m{ <b><i>( (?: (?!</b>|</i>). )* ) </i></b> }sx
#-----------------------------
m{ <b><i>( (?: (?!</[ib]>). )* ) </i></b> }sx
#-----------------------------
m{
<b><i>
[^<]* # stuff not possibly bad, and not possibly the end.
(?:
# at this point, we can have '<' if not part of something bad
(?! </?[ib]> ) # what we can't have
< # okay, so match the '<'
[^<]* # and continue with more safe stuff
) *
</i></b>
}sx
#-----------------------------
[править] Detecting Duplicate Words
#-----------------------------
$/ = ''; # paragrep mode
while (<>) {
while ( m{
\b # start at a word boundary (begin letters)
(\S+) # find chunk of non-whitespace
\b # until another word boundary (end letters)
(
\s+ # separated by some whitespace
\1 # and that very same chunk again
\b # until another word boundary
) + # one or more sets of those
}xig
)
{
print "dup word '$1' at paragraph $.\n";
}
}
#-----------------------------
This is a test
test of the duplicate word finder.
#-----------------------------
$a = 'nobody';
$b = 'bodysnatcher';
if ("$a $b" =~ /^(\w+)(\w+) \2(\w+)$/) {
print "$2 overlaps in $1-$2-$3\n";
}
body overlaps in no-body-snatcher
#-----------------------------
/^(\w+?)(\w+) \2(\w+)$/,
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# prime_pattern -- find prime factors of argument using pattern matching
for ($N = ('o' x shift); $N =~ /^(oo+?)\1+$/; $N =~ s/$1/o/g) {
print length($1), " ";
}
print length ($N), "\n";
#-----------------------------
# solve for 12x + 15y + 16z = 281, maximizing x
if (($X, $Y, $Z) =
(('o' x 281) =~ /^(o*)\1{11}(o*)\2{14}(o*)\3{15}$/))
{
($x, $y, $z) = (length($X), length($Y), length($Z));
print "One solution is: x=$x; y=$y; z=$z.\n";
} else {
print "No solution.\n";
}
#One solution is: x=17; y=3; z=2.
#-----------------------------
('o' x 281) =~ /^(o+)\1{11}(o+)\2{14}(o+)\3{15}$/;
#One solution is: x=17; y=3; z=2
('o' x 281) =~ /^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$/;
#One solution is: x=0; y=7; z=11.
('o' x 281) =~ /^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$/;
#One solution is: x=1; y=3; z=14.
#-----------------------------
[править] Expressing AND, OR, and NOT in a Single Pattern
#-----------------------------
chomp($pattern = <CONFIG_FH>);
if ( $data =~ /$pattern/ ) { ..... }
#-----------------------------
/ALPHA|BETA/;
#-----------------------------
/^(?=.*ALPHA)(?=.*BETA)/s;
#-----------------------------
/ALPHA.*BETA|BETA.*ALPHA/s;
#-----------------------------
/^(?:(?!PAT).)*$/s;
#-----------------------------
/(?=^(?:(?!BAD).)*$)GOOD/s;
#-----------------------------
if (!($string =~ /pattern/)) { something() } # ugly
if ( $string !~ /pattern/) { something() } # preferred
#-----------------------------
if ($string =~ /pat1/ && $string =~ /pat2/ ) {
something
() }
#-----------------------------
if ($string =~ /pat1/ || $string =~ /pat2/ ) {
something
() }
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# minigrep - trivial grep
$pat = shift;
while (<>) {
print if /$pat/o;
}
#-----------------------------
"labelled" =~ /^(?=.*bell)(?=.*lab)/s
#-----------------------------
$string =~ /bell/ && $string =~ /lab/
#-----------------------------
if ($murray_hill =~ m{
^ # start of string
(?= # zero-width lookahead
.* # any amount of intervening stuff
bell # the desired bell string
) # rewind, since we were only looking
(?= # and do the same thing
.* # any amount of intervening stuff
lab # and the lab part
)
}sx ) # /s means . can match newline
{
print "Looks like Bell Labs might be in Murray Hill!\n";
}
#-----------------------------
"labelled" =~ /(?:^.*bell.*lab)|(?:^.*lab.*bell)/
#-----------------------------
$brand = "labelled";
if ($brand =~ m{
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
bell # look for a bell
.*? # followed by any amount of anything
lab # look for a lab
) # end grouper
| # otherwise, try the other direction
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
lab # look for a lab
.*? # followed by any amount of anything
bell # followed by a bell
) # end grouper
}sx ) # /s means . can match newline
{
print "Our brand has bell and lab separate.\n";
}
#-----------------------------
$map =~ /^(?:(?!waldo).)*$/s
#-----------------------------
if ($map =~ m{
^ # start of string
(?: # non-capturing grouper
(?! # look ahead negation
waldo # is he ahead of us now?
) # is so, the negation failed
. # any character (cuzza /s)
) * # repeat that grouping 0 or more
$ # through the end of the string
}sx ) # /s means . can match newline
{
print "There's no waldo here!\n";
}
#-----------------------------
7:15am up 206 days, 13:30, 4 users, load average: 1.04, 1.07, 1.04
USER TTY FROM LOGIN@ IDLE JCPU PCPU WHAT
tchrist tty1 5:16pm 36days 24:43 0.03s xinit
tchrist tty2 5:19pm 6days 0.43s 0.43s -tcsh
tchrist ttyp0 chthon 7:58am 3days 23.44s 0.44s -tcsh
gnat ttyS4 coprolith 2:01pm 13:36m 0.30s 0.30s -tcsh
#-----------------------------
#% w | minigrep '^(?!.*ttyp).*tchrist'
#-----------------------------
m{
^ # anchored to the start
(?! # zero-width look-ahead assertion
.* # any amount of anything (faster than .*?)
ttyp # the string you don't want to find
) # end look-ahead negation; rewind to start
.* # any amount of anything (faster than .*?)
tchrist # now try to find Tom
}x
#-----------------------------
#% w | grep tchrist | grep -v ttyp
#-----------------------------
#% grep -i 'pattern' files
#% minigrep '(?i)pattern' files
#-----------------------------
[править] Matching Multiple-Byte Characters
#-----------------------------
my $eucjp = q{ # EUC-JP encoding subcomponents:
[\x00-\x7F] # ASCII/JIS-Roman (one-byte/character)
| \x8E[\xA0-\xDF] # half-width katakana (two bytes/char)
| \x8F[\xA1-\xFE][\xA1-\xFE] # JIS X 0212-1990 (three bytes/char)
| [\xA1-\xFE][\xA1-\xFE] # JIS X 0208:1997 (two bytes/char)
};
#-----------------------------
/^ (?: $eucjp )*? \xC5\xEC\xB5\xFE/ox # Trying to find Tokyo
#-----------------------------
/^ ( (?:eucjp)*? ) $Tokyo/$1$Osaka/ox
#-----------------------------
/\G ( (?:eucjp)*? ) $Tokyo/$1$Osaka/gox
#-----------------------------
@chars = /$eucjp/gox; # One character per list element
#-----------------------------
while (<>) {
my @chars = /$eucjp/gox; # One character per list element
for my $char (@chars) {
if (length($char) == 1) {
# Do something interesting with this one-byte character
} else {
# Do something interesting with this multiple-byte character
}
}
my $line = join("",@chars); # Glue list back together
print $line;
}
#-----------------------------
$is_eucjp = m/^(?:$eucjp)*$/xo;
#-----------------------------
$is_eucjp = m/^(?:$eucjp)*$/xo;
$is_sjis = m/^(?:$sjis)*$/xo;
#-----------------------------
while (<>) {
my @chars = /$eucjp/gox; # One character per list element
for my $euc (@chars) {
my $uni = $euc2uni{$char};
if (defined $uni) {
$euc = $uni;
} else {
## deal with unknown EUC->Unicode mapping here.
}
}
my $line = join("",@chars);
print $line;
}
#-----------------------------
[править] Matching a Valid Mail Address
#-----------------------------
1 while $addr =~ s/\([^()]*\)//g;
#-----------------------------
Dear someuser@host.com,
Please confirm the mail address you gave us Wed May 6 09:38:41
MDT 1998 by replying to this message. Include the string
"Rumpelstiltskin" in that reply, but spelled in reverse; that is,
start with "Nik...". Once this is done, your confirmed address will
be entered into our records.
#-----------------------------
[править] Matching Abbreviations
#-----------------------------
chomp($answer = <>);
if ("SEND" =~ /^\Q$answer/i) { print "Action is send\n" }
elsif ("STOP" =~ /^\Q$answer/i) { print "Action is stop\n" }
elsif ("ABORT" =~ /^\Q$answer/i) { print "Action is abort\n" }
elsif ("LIST" =~ /^\Q$answer/i) { print "Action is list\n" }
elsif ("EDIT" =~ /^\Q$answer/i) { print "Action is edit\n" }
#-----------------------------
use Text::Abbrev;
$href = abbrev qw(send abort list edit);
for (print "Action: "; <>; print "Action: ") {
chomp;
my $action = $href->{ lc($_) };
print "Action is $action\n";
}
#-----------------------------
$name = 'send';
&$name();
#-----------------------------
# assumes that &invoke_editor, &deliver_message,
# $file and $PAGER are defined somewhere else.
use Text::Abbrev;
my($href, %actions, $errors);
%actions = (
"edit" => \&invoke_editor,
"send" => \&deliver_message,
"list" => sub { system($PAGER, $file) },
"abort" => sub {
print "See ya!\n";
exit;
},
"" => sub {
print "Unknown command: $cmd\n";
$errors++;
},
);
$href = abbrev(keys %actions);
local $_;
for (print "Action: "; <>; print "Action: ") {
s/^\s+//; # trim leading white space
s/\s+$//; # trim trailing white space
next unless $_;
$actions->{ $href->{ lc($_) } }->();
}
#-----------------------------
$abbreviation = lc($_);
$expansion = $href->{$abbreviation};
$coderef = $actions->{$expansion};
&$coderef();
#-----------------------------
[править] Program: urlify
#-----------------------------
#% gunzip -c ~/mail/archive.gz | urlify > archive.urlified
#-----------------------------
#% urlify ~/mail/*.inbox > ~/allmail.urlified
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# urlify - wrap HTML links around URL-like constructs
$urls = '(http|telnet|gopher|file|wais|ftp)';
$ltrs = '\w';
$gunk = '/#~:.?+=&%@!\-';
$punc = '.:?\-';
$any = "${ltrs}${gunk}${punc}";
while (<>) {
s{
\b # start at word boundary
( # begin $1 {
$urls : # need resource and a colon
[$any] +? # followed by on or more
# of any valid character, but
# be conservative and take only
# what you need to....
) # end $1 }
(?= # look-ahead non-consumptive assertion
[$punc]* # either 0 or more punctuation
[^$any] # followed by a non-url char
| # or else
$ # then end of the string
)
}{<A HREF="$1">$1</A>}igox;
print;
}
#-----------------------------
[править] Program: tcgrep
#-----------------------------
#% tcgrep -ril '^From: .*kate' ~/mail
#-----------------------------
# download the following standalone program
#!/usr/bin/perl -w
# tcgrep: tom christiansen's rewrite of grep
# v1.0: Thu Sep 30 16:24:43 MDT 1993
# v1.1: Fri Oct 1 08:33:43 MDT 1993
# v1.2: Fri Jul 26 13:37:02 CDT 1996
# v1.3: Sat Aug 30 14:21:47 CDT 1997
# v1.4: Mon May 18 16:17:48 EDT 1998
use strict;
# globals
use vars qw($Me $Errors $Grand_Total $Mult %Compress $Matches);
my ($matcher, $opt); # matcher - anon. sub to check for matches
# opt - ref to hash w/ command line options
init(); # initialize globals
($opt, $matcher) = parse_args(); # get command line options and patterns
matchfile($opt, $matcher, @ARGV); # process files
exit(2) if $Errors;
exit(0) if $Grand_Total;
exit(1);
###################################
sub init {
($Me = $0) =~ s!.*/!!; # get basename of program, "tcgrep"
$Errors = $Grand_Total = 0; # initialize global counters
$Mult = ""; # flag for multiple files in @ARGV
$| = 1; # autoflush output
%Compress = ( # file extensions and program names
z => 'gzcat', # for uncompressing
gz => 'gzcat',
Z => 'zcat',
);
}
###################################
sub usage {
die <<EOF
usage: $Me [flags] [files]
Standard grep options:
i case insensitive
n number lines
c give count of lines matching
C ditto, but >1 match per line possible
w word boundaries only
s silent mode
x exact matches only
v invert search sense (lines that DON'T match)
h hide filenames
e expression (for exprs beginning with -)
f file with expressions
l list filenames matching
Specials:
1 1 match per file
H highlight matches
u underline matches
r recursive on directories or dot if none
t process directories in 'ls -t' order
p paragraph mode (default: line mode)
P ditto, but specify separator, e.g. -P '%%\\n'
a all files, not just plain text files
q quiet about failed file and dir opens
T trace files as opened
May use a TCGREP environment variable to set default options.
EOF
}
###################################
sub parse_args {
use Getopt::Std;
my ($optstring, $zeros, $nulls, %opt, $pattern, @patterns, $match_code);
my ($SO, $SE);
if ($_ = $ENV{TCGREP}) { # get envariable TCGREP
s/^([^\-])/-$1/; # add leading - if missing
unshift(@ARGV, $_); # add TCGREP opt string to @ARGV
}
$optstring = "incCwsxvhe:f:l1HurtpP:aqT";
$zeros = 'inCwxvhelut'; # options to init to 0 (prevent warnings)
$nulls = 'pP'; # options to init to "" (prevent warnings)
@opt{ split //, $zeros } = ( 0 ) x length($zeros);
@opt{ split //, $nulls } = ( '' ) x length($nulls);
getopts($optstring, \%opt) or usage();
if ($opt{f}) { # -f patfile
open(PATFILE, $opt{f}) or die qq($Me: Can't open '$opt{f}': $!);
# make sure each pattern in file is valid
while ( defined($pattern = <PATFILE>) ) {
chomp $pattern;
eval { 'foo' =~ /$pattern/, 1 } or
die "$Me: $opt{f}:$.: bad pattern: $@";
push @patterns, $pattern;
}
close PATFILE;
}
else { # make sure pattern is valid
$pattern = $opt{e} || shift(@ARGV) || usage();
eval { 'foo' =~ /$pattern/, 1 } or
die "$Me: bad pattern: $@";
@patterns = ($pattern);
}
if ($opt{H} || $opt{u}) { # highlight or underline
my $term = $ENV{TERM} || 'vt100';
my $terminal;
eval { # try to look up escapes for stand-out
require POSIX; # or underline via Term::Cap
use Term::Cap;
my $termios = POSIX::Termios->new();
$termios->getattr;
my $ospeed = $termios->getospeed;
$terminal = Tgetent Term::Cap { TERM=>undef, OSPEED=>$ospeed }
};
unless ($@) { # if successful, get escapes for either
local $^W = 0; # stand-out (-H) or underlined (-u)
($SO, $SE) = $opt{H}
? ($terminal->Tputs('so'), $terminal->Tputs('se'))
: ($terminal->Tputs('us'), $terminal->Tputs('ue'));
}
else { # if use of Term::Cap fails,
($SO, $SE) = $opt{H} # use tput command to get escapes
? (`tput -T $term smso`, `tput -T $term rmso`)
: (`tput -T $term smul`, `tput -T $term rmul`)
}
}
if ($opt{i}) {
@patterns = map {"(?i)$_"} @patterns;
}
if ($opt{p} || $opt{P}) {
@patterns = map {"(?m)$_"} @patterns;
}
$opt{p} && ($/ = '');
$opt{P} && ($/ = eval(qq("$opt{P}"))); # for -P '%%\n'
$opt{w} && (@patterns = map {'\b' . $_ . '\b'} @patterns);
$opt{'x'} && (@patterns = map {"^$_\$"} @patterns);
if (@ARGV) {
$Mult = 1 if ($opt{r} || (@ARGV > 1) || -d $ARGV[0]) && !$opt{h};
}
$opt{1} += $opt{l}; # that's a one and an ell
$opt{H} += $opt{u};
$opt{c} += $opt{C};
$opt{'s'} += $opt{c};
$opt{1} += $opt{'s'} && !$opt{c}; # that's a one
@ARGV = ($opt{r} ? '.' : '-') unless @ARGV;
$opt{r} = 1 if !$opt{r} && grep(-d, @ARGV) == @ARGV;
$match_code = '';
$match_code .= 'study;' if @patterns > 5; # might speed things up a bit
foreach (@patterns) { s(/)(\\/)g }
if ($opt{H}) {
foreach $pattern (@patterns) {
$match_code .= "\$Matches += s/($pattern)/${SO}\$1${SE}/g;";
}
}
elsif ($opt{v}) {
foreach $pattern (@patterns) {
$match_code .= "\$Matches += !/$pattern/;";
}
}
elsif ($opt{C}) {
foreach $pattern (@patterns) {
$match_code .= "\$Matches++ while /$pattern/g;";
}
}
else {
foreach $pattern (@patterns) {
$match_code .= "\$Matches++ if /$pattern/;";
}
}
$matcher = eval "sub { $match_code }";
die if $@;
return (\%opt, $matcher);
}
###################################
sub matchfile {
$opt = shift; # reference to option hash
$matcher = shift; # reference to matching sub
my ($file, @list, $total, $name);
local($_);
$total = 0;
FILE: while (defined ($file = shift(@_))) {
if (-d $file) {
if (-l $file && @ARGV != 1) {
warn "$Me: \"$file\" is a symlink to a directory\n"
if $opt->{T};
next FILE;
}
if (!$opt->{r}) {
warn "$Me: \"$file\" is a directory, but no -r given\n"
if $opt->{T};
next FILE;
}
unless (opendir(DIR, $file)) {
unless ($opt->{'q'}) {
warn "$Me: can't opendir $file: $!\n";
$Errors++;
}
next FILE;
}
@list = ();
for (readdir(DIR)) {
push(@list, "$file/$_") unless /^\.{1,2}$/;
}
closedir(DIR);
if ($opt->{t}) {
my (@dates);
for (@list) { push(@dates, -M) }
@list = @list[sort { $dates[$a] <=> $dates[$b] } 0..$#dates];
}
else {
@list = sort @list;
}
matchfile($opt, $matcher, @list); # process files
next FILE;
}
if ($file eq '-') {
warn "$Me: reading from stdin\n" if -t STDIN && !$opt->{'q'};
$name = '<STDIN>';
}
else {
$name = $file;
unless (-e $file) {
warn qq($Me: file "$file" does not exist\n) unless $opt->{'q'};
$Errors++;
next FILE;
}
unless (-f $file || $opt->{a}) {
warn qq($Me: skipping non-plain file "$file"\n) if $opt->{T};
next FILE;
}
my ($ext) = $file =~ /\.([^.]+)$/;
if (defined $ext && exists $Compress{$ext}) {
$file = "$Compress{$ext} <$file |";
}
elsif (! (-T $file || $opt->{a})) {
warn qq($Me: skipping binary file "$file"\n) if $opt->{T};
next FILE;
}
}
warn "$Me: checking $file\n" if $opt->{T};
unless (open(FILE, $file)) {
unless ($opt->{'q'}) {
warn "$Me: $file: $!\n";
$Errors++;
}
next FILE;
}
$total = 0;
$Matches = 0;
LINE: while (<FILE>) {
$Matches = 0;
##############
&{$matcher}(); # do it! (check for matches)
##############
next LINE unless $Matches;
$total += $Matches;
if ($opt->{p} || $opt->{P}) {
s/\n{2,}$/\n/ if $opt->{p};
chomp if $opt->{P};
}
print("$name\n"), next FILE if $opt->{l};
$opt->{'s'} || print $Mult && "$name:",
$opt->{n} ? "$.:" : "",
$_,
($opt->{p} || $opt->{P}) && ('-' x 20) . "\n";
next FILE if $opt->{1}; # that's a one
}
}
continue {
print $Mult && "$name:", $total, "\n" if $opt->{c};
}
$Grand_Total += $total;
}
#-----------------------------
[править] Regular Expression Grabbag
#-----------------------------
m/^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$/i
#-----------------------------
s/(\S+)(\s+)(\S+)/$3$2$1/
#-----------------------------
m/(\w+)\s*=\s*(.*)\s*$/ # keyword is $1, value is $2
#-----------------------------
m/.{80,}/
#-----------------------------
m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)|
#-----------------------------
s(/usr/bin)(/usr/local/bin)g
#-----------------------------
s/%([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge
#-----------------------------
s{
/\* # Match the opening delimiter
.*? # Match a minimal number of characters
\*/ # Match the closing delimiter
} []gsx;
#-----------------------------
s/^\s+//;
s/\s+$//;
#-----------------------------
s/\\n/\n/g;
#-----------------------------
s/^.*:://
#-----------------------------
m/^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$/;
#-----------------------------
s(^.*/)()
#-----------------------------
$cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80;
#-----------------------------
($name = " $0 @ARGV") =~ s, /\S+/, ,g;
#-----------------------------
die "This isn't Linux" unless $^O =~ m/linux/i;
#-----------------------------
s/\n\s+/ /g
#-----------------------------
@nums = m/(\d+\.?\d*|\.\d+)/g;
#-----------------------------
@capwords = m/(\b[^\Wa-z0-9_]+\b)/g;
#-----------------------------
@lowords = m/(\b[^\WA-Z0-9_]+\b)/g;
#-----------------------------
@icwords = m/(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)/;
#-----------------------------
@links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>/sig; #"'
#-----------------------------
($initial) = m/^\S+\s+(\S)\S*\s+\S/ ? $1 : "";
#-----------------------------
s/"([^"]*)"/``$1''/g #"
#-----------------------------
{ local $/ = "";
while (<>) {
s/\n/ /g;
s/ {3,}/ /g;
push @sentences, m/(\S.*?[!?.])(?= |\Z)/g;
}
}
#-----------------------------
m/(\d{4})-(\d\d)-(\d\d)/ # YYYY in $1, MM in $2, DD in $3
#-----------------------------
m/ ^
(?:
1 \s (?: \d\d\d \s)? # 1, or 1 and area code
| # ... or ...
\(\d\d\d\) \s # area code with parens
| # ... or ...
(?: \+\d\d?\d? \s)? # optional +country code
\d\d\d ([\s\-]) # and area code
)
\d\d\d (\s|\1) # prefix (and area code separator)
\d\d\d\d # exchange
$
/x
#-----------------------------
m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i
#-----------------------------
push(@lines, $1)
while ($input =~ s/^([^\012\015]*)(\012\015?|\015\012?)//);
#-----------------------------
|