Python/FAQ/Pattern Matching
Introduction
# Note: regexes are used less often in Python than in Perl, as many tasks are
# covered by string methods or by specialised objects, modules, or packages.
import re # "re" is the regular expression module.
re.search("sheep",meadow) # returns a MatchObject if meadow contains "sheep", else None.
if not re.search("sheep",meadow):
    print "no sheep in this meadow, only a fat python."
# plain string replacement does not need regular expressions:
meadow = meadow.replace("old","new") # replace "old" with "new" and assign the result.
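# A small illustration of that note: the checks below use only string methods,
# no "re" needed (a sketch; the strings are made up for the example).
haystack = "one fat python in the meadow"
print "python" in haystack #=> True
print haystack.startswith("one") #=> True
print haystack.endswith("sheep") #=> False
print haystack.count("o") #=> 3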
#-----------------------------
re.search("ovine",meadow)
meadow = """Fine bovines demand fine toreadors.
Muskoxen are polar ovibovine species.
Grooviness went out of fashion decades ago."""
meadow = "Ovines are found typically in ovaries."
if re.search(r"\bovines\b",meadow,re.I) : print "Here be sheep!"
#-----------------------------
# The tricky bit
mystr = "good food"
re.sub("o*","e",mystr,1) # gives 'egood food'
echo ababacaca | python -c "import sys,re; print re.search('(a|ba|b)+(a|ac)+',sys.stdin.read()).group()"
#-----------------------------
# pattern matching modifiers
# (the Perl original iterates over a file; fileinput does the same here)
import re, fileinput
for ln in fileinput.input():
    fnd = re.findall(r"(\d+)", ln)
    if len(fnd) > 0:
        print "Found number %s" % (fnd[0])
# ----------------------------
digits = "123456789"
nonlap = re.findall(r"(\d\d\d)", digits) # non-overlapping matches
yeslap = re.findall(r"(?=(\d\d\d))", digits) # overlapping matches via a lookahead
print "Non-overlapping:",",".join(nonlap)
print "Overlapping :",",".join(yeslap)
# Non-overlapping: 123,456,789
# Overlapping : 123,234,345,456,567,678,789
# ----------------------------
mystr = "And little lambs eat ivy"
fnd = re.search("(l[^s]*s)", mystr)
print "(%s) (%s) (%s)" % (mystr[:fnd.start()], fnd.group(), mystr[fnd.end():])
# (And ) (little lambs) ( eat ivy)
Copying and Substituting Simultaneously
import re
dst = re.sub("this","that",src)
#-----------------------------
# strip to basename
basename = re.sub(".*/(?=[^/]+)","",progname)
# Make All Words Title-Cased
# DON'T DO THIS - use str.title() instead
def cap(mo): return mo.group().capitalize()
re.sub("(?P<n>\w+)",cap,"make all words title-cased")
# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
manpage = "/usr/man/man3/foo.1"
catpage = re.sub("man(?=\d)","cat",manpage)
#-----------------------------
bindirs = "/usr/bin /bin /usr/local/bin".split()
libdirs = [d.replace("bin", "lib") for d in bindirs]
print " ".join(libdirs)
#=> /usr/lib /lib /usr/local/lib
#-----------------------------
# strings are never modified in place.
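# For instance (a tiny sketch): re.sub and str.replace both return a new
# string, so the result must be rebound to a name.
s = "this and this"
re.sub("this", "that", s) # result is thrown away
print s #=> this and this
s = re.sub("this", "that", s) # rebind the name to keep the change
print s #=> that and that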
#-----------------------------
Matching Letters
##---------------------------
# DON'T DO THIS. Use line[:-1].isalpha() instead (this probably goes for the
# remainder of this section too!)
import re
if re.match("^[A-Za-z]+$",line):
    print "pure alphabetic"
##---------------------------
if re.match(r"^[^\W\d_]+$", line, re.LOCALE):
    print "pure alphabetic"
##---------------------------
import re
import locale
try:
    locale.setlocale(locale.LC_ALL, 'fr_CA.ISO8859-1')
except locale.Error:
    print "couldn't set locale to French Canadian"
    raise SystemExit
DATA="""
silly
façade
coöperate
niño
Renée
Molière
hæmoglobin
naïve
tschüß
random!stuff#here
"""
for ln in DATA.split():
    ln = ln.rstrip()
    if re.match(r"^[^\W\d_]+$",ln,re.LOCALE):
        print "%s: alphabetic" % (ln)
    else:
        print "%s: line noise" % (ln)
# although I don't think "coöperate" belongs in French Canadian
##---------------------------
Matching Words
# Matching Words
"\S+" # as many non-whitespace bytes as possible
"[A-Za-z'-]+" # as many letters, apostrophes, and hyphens
# string split is similar to splitting on "\s+"
"A text with some\tseparator".split()
"\b*([A-Za-z]+)\b*" # word boundaries
"\s*([A-Za-z]+)\s*" # might work too as on letters are allowed.
re.search("\Bis\B","this thistle") # matches on thistle not on this
re.search("\Bis\B","vis-a-vis") # does not match
Commenting Regular Expressions
#-----------------------------
#!/usr/bin/python
# resname - change all "foo.bar.com" style names in the input stream
# into "foo.bar.com [204.148.40.9]" (or whatever) instead
import socket # for gethostbyname
import fileinput
import re
match = re.compile("""(?P<hostname> # capture hostname
(?: # these parens for grouping only
[\w-]+ # hostname component
\. # and the domain dot
) + # now repeat that whole thing a bunch of times
[A-Za-z] # next must be a letter
[\w-] + # now trailing domain part
) # end of hostname capture
""",re.VERBOSE) # for nice formatting
def repl(match_obj):
    orig_hostname = match_obj.group("hostname")
    try:
        addr = socket.gethostbyname(orig_hostname)
    except socket.gaierror:
        addr = "???"
    return "%s [%s]" % (orig_hostname, addr)
for ln in fileinput.input():
    print match.sub(repl, ln), # ln already ends with a newline
#-----------------------------
re.sub("""(?x) # nicer formatting
\# # a pound sign
(\w+) # the variable name
\# # another pound sign
""",
lambda m: eval(m.group(1)), # replace with the value of the global variable
line
)
##-----------------------------
re.sub("""(?x) # nicer formatting
\# # a pound sign
(\w+) # the variable name
\# # another pound sign
""",
lambda m: eval(eval(m.group(1))), # replace with the value of *any* variable
line
)
##-----------------------------
Finding the Nth Occurrence of a Match
import re
pond = "one fish two fish red fish blue fish"
fishes = re.findall(r"(?i)(\w+)\s+fish\b",pond)
if len(fishes)>2:
    print "The third fish is a %s one." % (fishes[2])
##-----------------------------
re.findall(r"(?i)(?:\w+\s+fish\s+){2}(\w+)\s+fish",pond)
##-----------------------------
count = 0
for match_object in re.finditer(r"PAT", mystr):
    count += 1 # or whatever you want to do here
# "progressive" matching might be better if one only wants, say, match 5 of 50.
# to count, use
count = len(re.findall(r"PAT",mystr))
count = len(re.findall(r"aba","abaababa"))
# "count" overlapping matches
count = len(re.findall(r"(?=aba)","abaababa"))
# FASTEST non-overlapping might be str.count
"abaababa".count("aba")
##-----------------------------
pond = "one fish two fish red fish blue fish"
colors = re.findall(r"(?i)(\w+)\s+fish\b",pond) # get all matches
color = colors[2] # then the one we want
# or without a temporary list
color = re.findall(r"(?i)(\w+)\s+fish\b",pond)[2] # just grab element 3
print "The third fish in the pond is %s." % (color)
##-----------------------------
import re
pond = "one fish two fish red fish blue fish"
matches = re.findall(r"(\w+)\s+fish\b",pond)
evens = [fish for (i, fish) in enumerate(matches) if i%2]
print "Even numbered fish are %s." % (" ".join(evens))
##-----------------------------
count = 0
def four_is_sushi(match_obj):
    global count
    count += 1
    if count==4:
        return "sushi%s" % (match_obj.group(2))
    return "".join(match_obj.groups())
print re.sub(r"""(?x) # VERBOSE
\b # makes next \w more efficient
( \w+ ) # this is what we'll be changing
(
\s+ fish \b
)""",
four_is_sushi,
pond)
# one fish two fish red fish sushi fish
##-----------------------------
# greedily
last_fish = re.findall(r"(?i).*\b(\w+)\s+fish\b",pond)
##-----------------------------
pond = "One fish two fish red fish blue fish swim here"
color = re.findall(r"(?i)\b(\w+)\s+fish\b",pond)[-1]
print "Last fish is "+color+"."
# FASTER using string.
lastfish = pond.rfind("fish")
color = pond[:lastfish].split()[-1]
##-----------------------------
r"""(?x)
A # find some pattern A
(?! # mustn't be able to find
.* # something
A # and A
)
$ # through the end of string
"""
pond = "One fish two fish red fish blue fish swim here"
fnd = re.findall(r"""(?xis) # VERBOSE, CASEINSENSITIVE, DOTALL
\b ( \w+ ) \s+ fish \b
(?! .* \b fish \b )""",
pond)
if len(fnd):
    print "Last fish is %s." % (fnd[0])
else:
    print "Failed!"
Matching Multiple Lines
# Matching Multiple Lines
#
#!/usr/bin/python
# killtags - very bad html tag killer
import re
import sys
text = open(sys.argv[1]).read() # read the whole file
text = re.sub("(?ms)<.*?>","",text) # strip tags (terrible
print text
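# A less fragile sketch using the standard HTMLParser module (Python 2 name);
# it copes with tags split across lines, though it is still not a full sanitizer:
import sys
from HTMLParser import HTMLParser
class TagStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.chunks = []
    def handle_data(self, data): # called with the text between tags
        self.chunks.append(data)
stripper = TagStripper()
stripper.feed(open(sys.argv[1]).read())
print "".join(stripper.chunks)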
## ----------------------------
#!/usr/bin/python
# headerfy: change certain chapter headers to html
import sys, re
match = re.compile(r"""(?xms) # re.VERBOSE, re.MULTILINE, and re.DOTALL
\A # start of the string
(?P<chapter> # capture in g<chapter>
Chapter # literal string
\s+ # mandatory whitespace
\d+ # decimal number
\s* # optional whitespace
: # a real colon
. * # anything not a newline till end of line
)
""")
text = open(sys.argv[1]).read() # read the whole file
for paragraph in text.split("\n"): # split on unix end of lines
p = match.sub("<h1>\g<chapter></h1>",paragraph)
print p
## ----------------------------
# A literal one-liner translation does not run: a "for" statement cannot
# follow "import sys,re;" inside a single python -c string.
# python -c 'import sys,re; for p in open(sys.argv[1]).read().split("\n\n"): print re.sub(r"(?ms)\A(Chapter\s+\d+\s*:.*)","<h1>\g<1></h1>",p)'
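# A version that does run, wrapping the loop in a list comprehension
# (a sketch, assuming a bash-like shell; "chapters.txt" is a placeholder filename):
# python -c "import sys, re; print '\n\n'.join([re.sub(r'(?ms)\A(Chapter\s+\d+\s*:.*)', r'<h1>\g<1></h1>', p) for p in open(sys.argv[1]).read().split('\n\n')])" chapters.txt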
## ----------------------------
match = re.compile(r"(?ms)^START(.*?)^END")
# s makes . span line boundaries
# m makes ^ match at the beginning of the string and at the beginning of each line
chunk = 0
for paragraph in open(sys.argv[1]).read().split("\n\n"):
    chunk += 1
    fnd = match.findall(paragraph)
    if fnd:
        print "chunk %d in %s has <<%s>>" % (chunk,sys.argv[1],">>,<<".join(fnd))
## ----------------------------
Reading Records with a Pattern Separator
import sys
# Read the whole file and split
chunks = open(sys.argv[1]).read().split() # on whitespace
chunks = open(sys.argv[1]).read().split("\n") # on line ends
# splitting on pattern
import re
pattern = r"x"
chunks = re.split(pattern, open(sys.argv[1]).read())
##-----------------------------
chunks = re.split(r"(?m)^\.(Ch|Se|Ss)$",open(sys.argv[1]).read())
print "I read %d chunks." % (len(chunks))
# without delimiters
chunks = re.split(r"(?m)^\.(?:Ch|Se|Ss)$",open(sys.argv[1]).read())
# with delimiters
chunks = re.split(r"(?m)^(\.(?:Ch|Se|Ss))$",open(sys.argv[1]).read())
# with delimiters at chunkstart
chunks = re.findall(r"""(?xms) # multiline, dot matches lineend, allow comments
((?:^\.)? # consume the separator if present
.*?) # match everything but not greedy
(?= # end the match on this but dont consume it
(?: # dont put into group [1]
^\.(?:Ch|Se|Ss)$ # either end on one of the roff commands
|\Z # or end of text
)
)""",
open(sys.argv[1]).read())
# [1] if "?:" is removed the result holds tuples: ('.Ch\nchapter x','.Ch')
# which might be more useful.
Extracting a Range of Lines
##-----------------------------
# Python doesn't have perl's range operators
# If you want to only use a selected line range, use enumerate
# (though note that indexing starts at zero):
for i, line in enumerate(myfile):
    if firstlinenum <= i < lastlinenum:
        dosomethingwith(line)
# Using patterned ranges is slightly trickier -
# You need to search for the first pattern then
# search for the next pattern:
import re
for line in myfile:
    if re.match(pat1, line):
        break
dosomethingwith(line) # Only if pat1 can be on same line as pat2
for line in myfile:
    if re.match(pat2, line):
        break
    dosomethingwith(line)
##-----------------------------
# If you need to extract ranges a lot, the following generator funcs
# may be useful:
def extract_range(myfile, start, finish):
    for i, line in enumerate(myfile):
        if start <= i < finish:
            yield line
        elif i == finish:
            break
for line in extract_range(open("/etc/passwd"), 3, 5):
    print line
def patterned_range(myfile, startpat, endpat=None):
    startpat = re.compile(startpat)
    if endpat is not None:
        endpat = re.compile(endpat)
    in_range = False
    for line in myfile:
        if re.match(startpat, line):
            in_range = True
        if in_range:
            yield line
        if endpat is not None and re.match(endpat, line):
            break
# DO NOT DO THIS. Use the email module instead
for line in patterned_range(msg, "^From:?", "^$"):
    pass #...
Matching Shell Globs as Regular Expressions
tests = (("list.?",r"^list\..$"),
("project.*",r"^project\..*$"),
("*old",r"^.*old$"),
("type*.[ch]",r"^type.*\.[ch]$"),
("*.*",r"^.*\..*$"),
("*",r"^.*$"),
)
# The book converts "*", "?", "[", and "]"; all other characters are quoted.
# The book uses "\Q", which escapes any characters that would otherwise be
# treated as regular expression metacharacters.
# Escaping every char fails, as "\s" is not "s" in a regex.
def glob2pat(globstr):
    pat = globstr.replace("\\",r"\\")
    pat = pat.replace(".",r"\.").replace("?",r".").replace("*",r".*")
    return "^"+pat+"$"
for globstr, patstr in tests:
    g2p = glob2pat(globstr)
    if g2p != patstr:
        print globstr, "failed! Should be", patstr, "but was", g2p
Speeding Up Interpolated Matches
# download the following standalone program
#!/usr/bin/python
# popgrep1 - grep for abbreviations of places that say "pop"
# version 1: slow but obvious way
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
for line in fileinput.input():
    for state in popstates:
        if re.search(r"\b"+state+r"\b",line):
            print line
#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep2 - grep for abbreviations of places that say "pop"
# version 2: compile the patterns
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
state_re = []
for state in popstates:
    state_re.append(re.compile(r"\b"+state+r"\b"))
for line in fileinput.input():
    for state in state_re:
        if state.search(line):
            print line
#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep3 - grep for abbreviations of places that say "pop"
# version 3: compile a single pattern
import fileinput
import re
popstates = ["CO","ON","MI","WI","MN"]
state_re = re.compile(r"\b(?:"+"|".join(popstates)+r")\b")
for line in fileinput.input():
    if state_re.search(line):
        print line
#-----------------------------
# download the following standalone program
#!/usr/bin/python
# grepauth - print lines that mention both Tom and Nat
import fileinput
import re
def build_match_any(words):
    return re.compile("|".join(words))
def uniq(arr):
    seen = {}
    for item in arr:
        seen[item] = seen.get(item, 0) + 1
    return seen.keys()
def build_match_all(words):
    r = re.compile("|".join(words))
    c = lambda line: len(uniq(r.findall(line)))>=len(words)
    return c
any = build_match_any(("Tom","Nat"))
all = build_match_all(("Tom","Nat"))
for line in fileinput.input():
    if any.search(line):
        print "any:", line
    if all(line):
        print "all:", line
#-----------------------------
Testing for a Valid Pattern
# Testing for a Valid Pattern
import re
while True:
    pat = raw_input("Pattern? ")
    try:
        re.compile(pat)
    except re.error, err:
        print "INVALID PATTERN", err
        continue
    break
# ----
def is_valid_pattern(pat):
    try:
        re.compile(pat)
    except re.error:
        return False
    return True
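# For example:
print is_valid_pattern(r"ab[cd]+") #=> True
print is_valid_pattern("ab[") #=> False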
# ----
# download the following standalone program
#!/usr/bin/python
# paragrep - trivial paragraph grepper
#
# differs from the Perl version in parano:
# the Python version numbers paragraphs within the current file.
import sys, os.path, re
if len(sys.argv)<=1:
    print "usage: %s pat [files]\n" % sys.argv[0]
    sys.exit(1)
pat = sys.argv[1]
try:
    pat_re = re.compile(pat)
except re.error:
    print "%s: bad pattern %s: %s" % (sys.argv[0], pat, sys.exc_info()[1])
    sys.exit(1)
for filename in filter(os.path.isfile,sys.argv[2:]):
    parano = 0
    for para in open(filename).read().split("\n\n"):
        parano += 1
        if pat_re.search(para):
            print filename, parano, para, "\n"
# ----
# Since we don't eval the pattern, the Perl attack
#
# $pat = "You lose @{[ system('rm -rf *']} big here";
#
# does not work here.
Honoring Locale Settings in Regular Expressions
# download the following standalone program
#!/usr/bin/python
# localeg - demonstrates locale effects
#
# re must be told to respect the locale, either with "(?L)" inside the regexp
# or with the re.LOCALE flag passed to the call (Python 2.4).
import sys
import re
from locale import LC_CTYPE, setlocale, getlocale
name = "andreas k\xF6nig"
locale = {"German" : "de_DE.ISO_8859-1", "English" : "en_US"}
# us-ascii is not supported on Linux Python 2.3;
# neither locale works in ActiveState Python 2.4.
try:
    setlocale(LC_CTYPE, locale["English"])
except:
    print "Invalid locale %s" % locale["English"]
    sys.exit(1)
english_names = []
for n in re.findall(r"(?L)\b(\w+)\b",name):
    english_names.append(n.capitalize())
try:
    setlocale(LC_CTYPE, locale["German"])
except:
    print "Invalid locale %s" % locale["German"]
    sys.exit(1)
german_names = map(str.capitalize, re.findall(r"(?L)\b(\w+)\b",name))
print "English names: %s" % " ".join(english_names)
print "German names: %s" % " ".join(german_names)
Approximate Matching
##-----------------------------
import difflib
matchlist = ["ape", "apple", "lapel", "peach", "puppy"]
print difflib.get_close_matches("appel", matchlist)
#=> ['lapel', 'apple', 'ape']
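# difflib can also score an individual pair of strings (a small sketch):
print difflib.SequenceMatcher(None, "appel", "apple").ratio()
#=> 0.8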
##-----------------------------
# Also see:
# http://www.personal.psu.edu/staff/i/u/iua1/python/apse/
# http://www.bio.cam.ac.uk/~mw263/pyagrep.html
Matching from Where the Last Pattern Left Off
##-----------------------------
# To search (potentially) repeatedly for a pattern, use re.finditer():
# DO NOT DO THIS. Split on commas and convert elems using int()
mystr = "3,4,5,9,120"
for match in re.finditer("(\d+)", mystr):
n = match.group(0)
if n == "9":
break # '120' will never be matched
print "Found number", n
# matches know their end position
mystr = "The year 1752 lost 10 days on the 3rd of September"
x = re.finditer("(\d+)", mystr)
for match in x:
n = match.group(0)
print "Found number", n
tail = re.match("(\S+)", mystr[match.end():])
if tail:
print "Found %s after the last number."%tail.group(0)
Greedy and Non-Greedy Matches
# Python's regexes are based on Perl's, so it has the non-greedy
# '*?', '+?', and '??' versions of '*', '+', and '?'.
# DO NOT DO THIS. import htmllib, formatter, etc, instead
#-----------------------------
# greedy pattern
txt = re.sub("<.*>", "", txt) # try to remove tags, very badly
# non-greedy pattern
txt = re.sub("<.*?>", "", txt) # try to remove tags, still rather badly
#-----------------------------
txt = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>"
print re.findall("<b><i>(.*?)</i></b>", txt
##-----------------------------
print re.findall("/BEGIN((?:(?!BEGIN).)*)END/", txt)
##-----------------------------
print re.findall("<b><i>((?:(?!<b>|<i>).)*)</i></b>", txt)
##-----------------------------
print re.findall("<b><i>((?:(?!<[ib]>).)*)</i></b>", txt)
##-----------------------------
print re.findall("""
<b><i>
[^<]* # stuff not possibly bad, and not possibly the end.
(?: # at this point, we can have '<' if not part of something bad
(?! </?[ib]> ) # what we can't have
< # okay, so match the '<'
[^<]* # and continue with more safe stuff
) *
</i></b>
""", re.VERBOSE, txt)
##-----------------------------
Detecting Duplicate Words
##-----------------------------
text = """
This is a test
test of the duplicate word finder.
"""
words = text.split()
for curr, next in zip(words[:-1], words[1:]):
    if curr.upper() == next.upper():
        print "Duplicate word '%s' found." % curr
# DON'T DO THIS
import re
pat = r"""
\b # start at a word boundary (begin letters)
(\S+) # find chunk of non-whitespace
\b # until another word boundary (end letters)
(
\s+ # separated by some whitespace
\1 # and that very same chunk again
\b # until another word boundary
) + # one or more sets of those
"""
for match in re.finditer(pat, text, flags=re.VERBOSE|re.IGNORECASE):
print "Duplicate word '%s' found." % match.group(1)
##-----------------------------
a = 'nobody'
b = 'bodysnatcher'
text = a+" "+b
pat = r"^(\w+)(\w+) \2(\w+)$"
for match in re.finditer(pat, text):
    m1, m2, m3 = match.groups()
    print m2, "overlaps in %s-%s-%s" % (m1, m2, m3)
##-----------------------------
pat = r"^(\w+?)(\w+) \2(\w+)$"
##-----------------------------
# factor a number written in unary (a string of "o"s) into primes
n = "o" * 15 # the number to factor; 15 is just an example value
try:
    while True:
        factor = re.match(r"^(oo+?)\1+$", n).group(1)
        n = re.sub(factor, "o", n)
        print len(factor)
except AttributeError: # re.match returned None: no further factor
    print len(n)
##-----------------------------
def diophantine(n, x, y, z):
    pat = r"^(o*)\1{%s}(o*)\2{%s}(o*)\3{%s}$" % (x-1, y-1, z-1)
    text = "o"*n
    try:
        vals = [len(v) for v in re.match(pat, text).groups()]
    except AttributeError: # no match means no solution
        print "No solutions."
    else:
        print "One solution is: x=%s, y=%s, z=%s." % tuple(vals)
diophantine(n=281, x=12, y=15, z=16)
Expressing AND, OR, and NOT in a Single Pattern
##-----------------------------
# Pass any of the following patterns to re.match(), etc
pat = "ALPHA|BETA"
pat = "^(?=.*ALPHA)(?=.*BETA)"
pat = "ALPHA.*BETA|BETA.*ALPHA"
pat = "^(?:(?!PAT).)*$"
pat = "(?=^(?:(?!BAD).)*$)GOOD"
##-----------------------------
if not re.match(pattern, text):
    something()
##-----------------------------
if re.match(pat1, text) and re.match(pat2, text):
    something()
##-----------------------------
if re.match(pat1, text) or re.match(pat2, text):
    something()
##-----------------------------
# DON'T DO THIS.
"""minigrep - trivial grep"""
import sys, re
pat = sys.argv[1]
for line in sys.stdin:
    if re.search(pat, line): # search, not match: the pattern may occur anywhere in the line
        print line[:-1]
##-----------------------------
if re.match(r"^(?=.*bell)(?=.*lab)", "labelled"):
    something()
##-----------------------------
if re.search("bell", s) and re.search("lab", s):
    something()
##-----------------------------
if re.match("""
^ # start of string
(?= # zero-width lookahead
.* # any amount of intervening stuff
bell # the desired bell string
) # rewind, since we were only looking
(?= # and do the same thing
.* # any amount of intervening stuff
lab # and the lab part
)
""",
murray_hill,
re.DOTALL | re.VERBOSE):
print "Looks like Bell Labs might be in Murray Hill!"
##-----------------------------
if re.match(r"(?:^.*bell.*lab)|(?:^.*lab.*bell)", "labelled"):
    something()
##-----------------------------
brand = "labelled"
if re.match("""
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
bell # look for a bell
.*? # followed by any amount of anything
lab # look for a lab
) # end grouper
| # otherwise, try the other direction
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
lab # look for a lab
.*? # followed by any amount of anything
bell # followed by a bell
) # end grouper
""",
brand,
re.DOTALL | re.VERBOSE):
print "Our brand has bell and lab separate."
##-----------------------------
x = "odlaw"
if re.match("^(?:(?!waldo).)*$", x):
print "There's no waldo here!"
##-----------------------------
if re.match("""
^ # start of string
(?: # non-capturing grouper
(?! # look ahead negation
waldo # is he ahead of us now?
) # if so, the negation fails
. # any character (because of re.DOTALL)
) * # repeat that grouping 0 or more
$ # through the end of the string
""",
x,
re.VERBOSE | re.DOTALL):
print "There's no waldo here!\n";
##-----------------------------
Matching Multiple-Byte Characters
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Matching a Valid Mail Address
##-----------------------------
# DON'T roll your own address regex; the email package already parses addresses.
from email.utils import parseaddr # email.Utils in older Pythons
print parseaddr("fred&barney@stonehenge.com")
print parseaddr("fred&barney@stonehenge.com (Hanna Barbara)")
name, address = parseaddr("Mr Fooby Blah <me@nowhere.com>")
print "%s's address is '%s'" % (name, address)
#=> Mr Fooby Blah's address is 'me@nowhere.com'
Matching Abbreviations
##-----------------------------
# Assuming the strings all start with different letters, or you don't
# mind there being precedence, use the startswith string method:
def get_action(answer):
    answer = answer.lower()
    actions = ["send", "stop", "abort", "list", "end"]
    for action in actions:
        if action.startswith(answer):
            return action
print "Action is %s."%get_action("L")
#=> Action is list.
##-----------------------------
#DON'T DO THIS:
import re
answer = "ab"
answer = re.escape(answer.strip())
for action in ("SEND", "STOP", "ABORT", "LIST", "EDIT"):
    if re.match(answer, action, flags=re.IGNORECASE):
        print "Action is %s."%action.lower()
##-----------------------------
import re, sys
# invoke_editor, deliver_message, system, pager and myfile are assumed to be
# defined elsewhere; this just shows the dispatch technique.
def handle_cmd(cmd):
    cmd = re.escape(cmd.strip())
    for name, action in {"edit": invoke_editor,
                         "send": deliver_message,
                         "list": lambda: system(pager, myfile),
                         "abort": sys.exit,
                         }.items():
        if re.match(cmd, name, flags=re.IGNORECASE):
            action()
            break
    else:
        print "Unknown command:", cmd
handle_cmd("ab")
Program: urlify
##-----------------------------
# urlify - wrap HTML links around URL-like constructs
import re, sys, fileinput
def urlify_string(s):
    urls = r'(http|telnet|gopher|file|wais|ftp)'
    ltrs = r'\w'
    gunk = r'/#~:.?+=&%@!\-'
    punc = r'.:?\-'
    any = ltrs + gunk + punc
    pat = re.compile(r"""
        \b # start at word boundary
        ( # begin \1 {
        %(urls)s : # need resource and a colon
        [%(any)s] +? # followed by one or more
        # of any valid character, but
        # be conservative and take only
        # what you need to....
        ) # end \1 }
        (?= # look-ahead non-consumptive assertion
        [%(punc)s]* # either 0 or more punctuation
        [^%(any)s] # followed by a non-url char
        | # or else
        $ # then end of the string
        )
        """ % locals(), re.VERBOSE | re.IGNORECASE)
    return re.sub(pat, r"<A HREF=\1>\1</A>", s)
if __name__ == "__main__":
    for line in fileinput.input():
        print urlify_string(line), # line already ends with "\n"
Program: tcgrep
##-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Regular Expression Grabbag
# The majority of regexes in this section are either partially
# or completely The Wrong Thing to Do.
##-----------------------------
# DON'T DO THIS. Use a Roman Numeral module, etc. (since
# you need one anyway to calculate values)
pat = r"^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$"
re.match(pat, "mcmlxcvii")
##-----------------------------
txt = "one two three four five"
# If the words are cleanly delimited just split and rejoin:
word1, word2, rest = txt.split(" ", 2)
print " ".join([word2, word1, rest])
# Otherwise:
frompat = r"(\S+)(\s+)(\S+)"
topat = r"\3\2\1"
print re.sub(frompat, topat, txt)
##-----------------------------
print str.split("=")
# DON'T DO THIS
pat = r"(\w+)\s*=\s*(.*)\s*$"
print re.match(pat, "key=val").groups()
##-----------------------------
line = "such a very very very very very very very very very very very very very long line"
if len(line) >= 80:
    process(line)
# DON'T DO THIS
pat = ".{80,}"
if re.match(pat, line):
    process(line)
##-----------------------------
import time
dt = time.strptime("12/11/05 12:34:56", "%d/%m/%y %H:%M:%S")
# DON'T DO THIS
pat = r"(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)"
dt = re.match(pat, "12/11/05 12:34:56").groups()
##-----------------------------
txt = "/usr/bin/python"
print txt.replace("/usr/bin", "/usr/local/bin")
# Alternatively for file operations use os.path, shutil, etc.
# DON'T DO THIS
print re.sub("/usr/bin", "/usr/local/bin", txt)
##-----------------------------
import re
def unescape_hex(matchobj):
    return chr(int(matchobj.group(1), 16))
txt = re.sub(r"%([0-9A-Fa-f][0-9A-Fa-f])", unescape_hex, txt)
# Assuming that the hex escaping is well-behaved, an alternative is:
def unescape_hex(seg):
    return chr(int(seg[:2], 16)) + seg[2:]
segs = txt.split("%")
txt = segs[0] + "".join(unescape_hex(seg) for seg in segs[1:])
##-----------------------------
txt = re.sub(r"""
/\* # Match the opening delimiter
.*? # Match a minimal number of characters
\*/ # Match the closing delimiter
""", "", txt, re.VERBOSE)
##-----------------------------
txt = txt.strip()
# DON'T DO THIS
txt = re.sub(r"^\s+", "", txt)
txt = re.sub(r"\s+$", "", txt)
##-----------------------------
txt.replace("\\n", "\n")
# DON'T DO THIS
txt = re.sub("\\n", "\n", txt)
##-----------------------------
txt = re.sub("^.*::", "")
##-----------------------------
import socket
socket.inet_aton(txt) # Will raise an error if incorrect
# DON'T DO THIS.
octseg =r"([01]?\d\d|2[0-4]\d|25[0-5])"
dot = r"\."
pat = "^" + octseg + dot + octseg + dot + octseg + dot + octseg + "$"
if not re.match(pat, txt, re.VERBOSE)
raise ValueError
# Defitely DON'T DO THIS.
pat = r"""^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$"""
##-----------------------------
fname = os.path.basename(path)
# DON'T DO THIS.
fname = re.sub("^.*/", "", path)
##-----------------------------
import os
try:
    tc = os.environ["TERMCAP"]
except KeyError:
    cols = 80
else:
    cols = re.search(r":co#(\d+):", tc).group(1)
##-----------------------------
# (not quite equivalent to the Perl version)
name = os.path.basename(sys.argv[0])
# DON'T DO THIS.
name = re.sub("^.*/", "", sys.argv[0])
##-----------------------------
if sys.platform != "linux":
raise SystemExit("This isn't Linux")
##-----------------------------
txt = re.sub(r"\n\s+", " ", txt)
# In many cases you could just use:
txt = txt.replace("\n", " ")
##-----------------------------
nums = re.findall(r"\d+\.?\d*|\.\d+", txt)
##-----------------------------
# If the words are clearly delimited just use:
capwords = [word for word in txt.split() if word.isupper()]
# Otherwise
capwords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.isupper()]
# (probably) DON'T DO THIS.
capwords = re.findall(r"(\b[^\Wa-z0-9_]+\b)", txt)
##-----------------------------
# If the words are clearly delimited just use:
lowords = [word for word in txt.split() if word.islower()]
# Otherwise
lowords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.islower()]
# (probably) DON'T DO THIS.
lowords = re.findall(r"(\b[^\WA-Z0-9_]+\b)", txt)
##-----------------------------
# If the words are clearly delimited just use:
icwords = [word for word in txt.split() if word.istitle()]
# Otherwise
icwords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.istitle()]
# DON'T DO THIS.
icwords = re.findall(r"(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)", txt)
##-----------------------------
# DON'T DO THIS - use HTMLParser, etc.
links = re.findall(r"""<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>""", txt)
##-----------------------------
names = txt.split()
if len(names) == 3:
    initial = names[1][0]
else:
    initial = ""
# DON'T DO THIS.
pat = r"^\S+\s+(\S)\S*\s+\S"
try:
    initial = re.match(pat, txt).group(1)
except AttributeError:
    initial = ""
##-----------------------------
txt = re.sub('"([^"]*)"', "``\1''", txt)
##-----------------------------
sentences = [elem[0] for elem in re.findall(r"(.*?[!?.])( |\Z)", s)]
##-----------------------------
import time
dt = time.strptime(txt, "%Y-%m-%d")
# DON'T DO THIS.
year, month, day = re.match(r"(\d{4})-(\d\d)-(\d\d)", txt).groups()
##-----------------------------
pat = r"""
^
(?:
1 \s (?: \d\d\d \s)? # 1, or 1 and area code
| # ... or ...
\(\d\d\d\) \s # area code with parens
| # ... or ...
(?: \+\d\d?\d? \s)? # optional +country code
\d\d\d ([\s\-]) # and area code
)
\d\d\d (\s|\1) # prefix (and area code separator)
\d\d\d\d # exchange
$
"""
re.match(pat, txt, re.VERBOSE)
##-----------------------------
re.match(r"\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b", txt, re.IGNORECASE)
##-----------------------------
for line in file(fname, "Ur"): #Universal newlines
process(line)
# DON'T DO THIS
lines = [re.sub(r"^([^\012\015]*)(\012\015?|\015\012?)", "", line)
for line in file(fname)]
##-----------------------------