Python/FAQ/Pattern Matching
Introduction
# Note: regexes are used less often in Python than in Perl, as tasks are often
# covered by string methods, or by specialised objects, modules, or packages.
import re                            # "re" is the regular expression module.

re.search("sheep", meadow)           # returns a MatchObject if meadow contains "sheep".
if not re.search("sheep", meadow):
    print "no sheep on this meadow, only a fat python."

# replacing strings is not done by "re"gular expressions.
meadow = meadow.replace("old", "new")   # replace "old" with "new" and assign the result.

#-----------------------------
re.search("ovine", meadow)

meadow = """Fine bovines demand fine toreadors.
Muskoxen are polar ovibovine species.
Grooviness went out of fashion decades ago."""

meadow = "Ovines are found typically in ovaries."
if re.search(r"\bovines\b", meadow, re.I):
    print "Here be sheep!"

#-----------------------------
# The tricky bit
mystr = "good food"
re.sub("o*", "e", mystr, 1)          # gives 'egood food'

# shell one-liner:
# echo ababacaca | python -c "import sys,re; print re.search('(a|ba|b)+(a|ac)+',sys.stdin.read()).group()"

#-----------------------------
# pattern matching modifiers
# assume the perl code iterates over some file
import re, fileinput
for ln in fileinput.input():
    fnd = re.findall("(\d+)", ln)
    if len(fnd) > 0:
        print "Found number %s" % (fnd[0])

# ----------------------------
digits = "123456789"
nonlap = re.findall("(\d\d\d)", digits)
yeslap = ["not yet"]
print "Non-overlapping:", ",".join(nonlap)
print "Overlapping    :", ",".join(yeslap)

# ----------------------------
mystr = "And little lambs eat ivy"
fnd = re.search("(l[^s]*s)", mystr)
print "(%s) (%s) (%s)" % (mystr[:fnd.start()], fnd.group(), mystr[fnd.end():])
# (And ) (little lambs) ( eat ivy)
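# A minimal sketch (added here, not part of the original recipe) of how the
# "not yet" overlapping case above can be handled: a zero-width lookahead
# with a capturing group lets findall() report every overlapping position.
import re
digits = "123456789"
print ",".join(re.findall(r"(?=(\d\d\d))", digits))
#=> 123,234,345,456,567,678,789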
Copying and Substituting Simultaneously
import re
dst = re.sub("this", "that", src)

#-----------------------------
# strip to basename
basename = re.sub(".*/(?=[^/]+)", "", progname)

# Make All Words Title-Cased
# DON'T DO THIS - use str.title() instead
def cap(mo):
    return mo.group().capitalize()
re.sub("(?P<n>\w+)", cap, "make all words title-cased")

# /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1
manpage = "/usr/man/man3/foo.1"
catpage = re.sub("man(?=\d)", "cat", manpage)

#-----------------------------
bindirs = "/usr/bin /bin /usr/local/bin".split()
libdirs = [d.replace("bin", "lib") for d in bindirs]
print " ".join(libdirs)
#=> /usr/lib /lib /usr/local/lib

#-----------------------------
# strings are never modified in place.
#-----------------------------
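# A minimal sketch of the string-method alternative recommended above;
# str.title() needs no regex at all:
print "make all words title-cased".title()
#=> Make All Words Title-Cased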
Matching Letters
##---------------------------
# DON'T DO THIS. Use line[:-1].isalpha() [this probably goes for the
# remainder of this section too!]
import re
if re.match("^[A-Za-z]+$", line):
    print "pure alphabetic"

##---------------------------
if re.match(r"^[^\W\d_]+$", line, re.LOCALE):
    print "pure alphabetic"

##---------------------------
import re
import locale
try:
    locale.setlocale(locale.LC_ALL, 'fr_CA.ISO8859-1')
except:
    print "couldn't set locale to French Canadian"
    raise SystemExit

DATA = """
silly
façade
coöperate
niño
Renée
Molière
hæmoglobin
naïve
tschüß
random!stuff#here
"""

for ln in DATA.split():
    ln = ln.rstrip()
    if re.match(r"^[^\W\d_]+$", ln, re.LOCALE):
        print "%s: alphabetic" % (ln)
    else:
        print "%s: line noise" % (ln)
# although "coöperate" arguably should not count as French Canadian
##---------------------------
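# A minimal sketch of the string-method alternative suggested above:
# unicode strings know which characters are letters regardless of locale.
for word in [u"fa\xe7ade", u"random!stuff#here"]:
    status = word.isalpha() and "alphabetic" or "line noise"
    print "%s: %s" % (word.encode("utf-8"), status)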
Matching Words
# Matching Words
r"\S+"               # as many non-whitespace bytes as possible
r"[A-Za-z'-]+"       # as many letters, apostrophes, and hyphens

# string split is similar to splitting on "\s+"
"A text with some\tseparator".split()

r"\b([A-Za-z]+)\b"   # word boundaries
r"\s*([A-Za-z]+)\s*" # might work too, as only letters are allowed.

re.search(r"\Bis\B", "this thistle")  # matches in "thistle", not in "this"
re.search(r"\Bis\B", "vis-a-vis")     # does not match
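# A small usage sketch (my own example) of the word-boundary pattern above:
import re
print re.findall(r"\b[A-Za-z']+\b", "Don't panic -- it's only a test")
#=> ["Don't", 'panic', "it's", 'only', 'a', 'test']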
Commenting Regular Expressions
#-----------------------------
#!/usr/bin/python
# resname - change all "foo.bar.com" style names in the input stream
# into "foo.bar.com [204.148.40.9]" (or whatever) instead

import socket     # load inet_addr
import fileinput
import re

match = re.compile("""(?P<hostname>   # capture hostname
                  (?:                  # these parens for grouping only
                    [\w-]+             # hostname component
                    \.                 # and the domain dot
                  ) +                  # now repeat that whole thing a bunch of times
                  [A-Za-z]             # next must be a letter
                  [\w-] +              # now trailing domain part
                  )                    # end of hostname capture
                  """, re.VERBOSE)     # for nice formatting

def repl(match_obj):
    orig_hostname = match_obj.group("hostname")
    try:
        addr = socket.gethostbyname(orig_hostname)
    except socket.gaierror:
        addr = "???"
    return "%s [%s]" % (orig_hostname, addr)

for ln in fileinput.input():
    print match.sub(repl, ln)

#-----------------------------
re.sub("""(?x)      # nicer formatting
       \#           # a pound sign
       (\w+)        # the variable name
       \#           # another pound sign
       """,
       lambda m: eval(m.group(1)),        # replace with the value of the global variable
       line)

##-----------------------------
re.sub("""(?x)      # nicer formatting
       \#           # a pound sign
       (\w+)        # the variable name
       \#           # another pound sign
       """,
       lambda m: eval(eval(m.group(1))),  # replace with the value of *any* variable
       line)
##-----------------------------
Finding the Nth Occurrence of a Match
import re
pond = "one fish two fish red fish blue fish"
fishes = re.findall(r"(?i)(\w+)\s+fish\b", pond)
if len(fishes) > 2:
    print "The third fish is a %s one." % (fishes[2])

##-----------------------------
re.findall(r"(?i)(?:\w+\s+fish\s+){2}(\w+)\s+fish", pond)

##-----------------------------
count = 0
for match_object in re.finditer(r"PAT", mystr):
    count += 1   # or whatever you want to do here
# "progressive" matching might be better if one wants match 5 from 50.

# to count, use
count = len(re.findall(r"PAT", mystr))
count = len(re.findall(r"aba", "abaababa"))

# "count" overlapping matches
count = len(re.findall(r"(?=aba)", "abaababa"))

# FASTEST non-overlapping might be str.count
"abaababa".count("aba")

##-----------------------------
pond = "one fish two fish red fish blue fish"
colors = re.findall(r"(?i)(\w+)\s+fish\b", pond)     # get all matches
color = colors[2]                                    # then the one we want

# or without a temporary list
color = re.findall(r"(?i)(\w+)\s+fish\b", pond)[2]   # just grab element 3

print "The third fish in the pond is %s." % (color)

##-----------------------------
import re
pond = "one fish two fish red fish blue fish"
matches = re.findall(r"(\w+)\s+fish\b", pond)
evens = [fish for (i, fish) in enumerate(matches) if i % 2]
print "Even numbered fish are %s." % (" ".join(evens))

##-----------------------------
count = 0
def four_is_sushi(match_obj):
    global count
    count += 1
    if count == 4:
        return "sushi%s" % (match_obj.group(2))
    return "".join(match_obj.groups())

re.sub(r"""(?x)        # VERBOSE
       \b              # makes next \w more efficient
       ( \w+ )         # this is what we'll be changing
       ( \s+ fish \b )""",
       four_is_sushi, pond)
# one fish two fish red fish sushi fish

##-----------------------------
# greedily
last_fish = re.findall(r"(?i).*\b(\w+)\s+fish\b", pond)

##-----------------------------
pond = "One fish two fish red fish blue fish swim here"
color = re.findall(r"(?i)\b(\w+)\s+fish\b", pond)[-1]
print "Last fish is " + color + "."

# FASTER using string methods
lastfish = pond.rfind("fish")
color = pond[:lastfish].split()[-1]

##-----------------------------
r"""(?x)
A          # find some pattern A
(?!        # mustn't be able to find
  .*       # something
  A        # and A
)
$          # through the end of string
"""

pond = "One fish two fish red fish blue fish swim here"
fnd = re.findall(r"""(?xis)   # VERBOSE, CASEINSENSITIVE, DOTALL
                 \b ( \w+ ) \s+ fish \b
                 (?! .* \b fish \b )""", pond)
if len(fnd):
    print "Last fish is %s." % (fnd[0])
else:
    print "Failed!"
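# A minimal "progressive" matching sketch (my addition): itertools.islice
# pulls just the Nth match out of re.finditer() without building a full list.
import re, itertools
pond = "one fish two fish red fish blue fish"
third = itertools.islice(re.finditer(r"(?i)(\w+)\s+fish\b", pond), 2, 3).next()
print "The third fish is a %s one." % third.group(1)
#=> The third fish is a red one.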
Matching Multiple Lines
# Matching Multiple Lines
#
#!/usr/bin/python
# killtags - very bad html tag killer
import re
import sys
text = open(sys.argv[1]).read()         # read the whole file
text = re.sub("(?ms)<.*?>", "", text)   # strip tags (terribly)
print text

## ----------------------------
#!/usr/bin/python
# headerfy: change certain chapter headers to html
import sys, re

match = re.compile(r"""(?xms)    # re.VERBOSE, re.MULTILINE, and re.DOTALL
        \A                       # start of the string
        (?P<chapter>             # capture in g<chapter>
          Chapter                # literal string
          \s+                    # mandatory whitespace
          \d+                    # decimal number
          \s*                    # optional whitespace
          :                      # a real colon
          . *                    # anything not a newline till end of line
        )
        """)

text = open(sys.argv[1]).read()          # read the whole file
for paragraph in text.split("\n"):       # split on unix end of lines
    p = match.sub(r"<h1>\g<chapter></h1>", paragraph)
    print p

## ----------------------------
# the one-liner does not run:
# python -c 'import sys,re; for p in open(sys.argv[1]).read().split("\n\n"): print re.sub(r"(?ms)\A(Chapter\s+\d+\s*:.*)","<h1>\g0</h1>",p)'

## ----------------------------
match = re.compile(r"(?ms)^START(.*?)^END")   # s makes . span line boundaries
# m makes ^ match at the beginning of the string and at the beginning of each line

chunk = 0
for paragraph in open(sys.argv[1]).read().split("\n\n"):
    chunk += 1
    fnd = match.findall(paragraph)
    if fnd:
        print "chunk %d in %s has <<%s>>" % (chunk, sys.argv[1], ">>,<<".join(fnd))
## ----------------------------
Reading Records with a Pattern Separator
import sys
# Read the whole file and split
chunks = open(sys.argv[1]).read().split()       # on whitespace
chunks = open(sys.argv[1]).read().split("\n")   # on line ends

# splitting on a pattern
import re
pattern = r"x"
chunks = re.split(pattern, open(sys.argv[1]).read())

##-----------------------------
chunks = re.split(r"(?m)^\.(Ch|Se|Ss)$", open(sys.argv[1]).read())
print "I read %d chunks." % (len(chunks))

# without delimiters
chunks = re.split(r"(?m)^\.(?:Ch|Se|Ss)$", open(sys.argv[1]).read())
# with delimiters
chunks = re.split(r"(?m)^(\.(?:Ch|Se|Ss))$", open(sys.argv[1]).read())
# with delimiters at chunk start
chunks = re.findall(r"""(?xms)   # multiline, dot matches lineend, allow comments
         ((?:^\.)?               # consume the separator if present
          .*?)                   # match everything, but not greedily
         (?=                     # end the match on this but don't consume it
           (?:                   # don't put into group [1]
             ^\.(?:Ch|Se|Ss)$    # either end on one of the roff commands
             |\Z                 # or end of text
           )
         )""", open(sys.argv[1]).read())
# [1] if "?:" is removed, the result holds tuples: ('.Ch\nchapter x', '.Ch'),
# which might be more useful.
Extracting a Range of Lines
##-----------------------------
# Python doesn't have perl's range operators.
# If you want to use only a selected line range, use enumerate
# (though note that indexing starts at zero):
for i, line in enumerate(myfile):
    if firstlinenum <= i < lastlinenum:
        dosomethingwith(line)

# Using patterned ranges is slightly trickier -
# you need to search for the first pattern, then
# search for the next pattern:
import re
for line in myfile:
    if re.match(pat1, line):
        break
dosomethingwith(line)   # only if pat1 can be on the same line as pat2
for line in myfile:
    if re.match(pat2, line):
        break
    dosomethingwith(line)

##-----------------------------
# If you need to extract ranges a lot, the following generator funcs
# may be useful:
def extract_range(myfile, start, finish):
    for i, line in enumerate(myfile):
        if start <= i < finish:
            yield line
        elif i == finish:
            break

for line in extract_range(open("/etc/passwd"), 3, 5):
    print line

def patterned_range(myfile, startpat, endpat=None):
    startpat = re.compile(startpat)
    if endpat is not None:
        endpat = re.compile(endpat)
    in_range = False
    for line in myfile:
        if re.match(startpat, line):
            in_range = True
        if in_range:
            yield line
        if endpat is not None and re.match(endpat, line):
            break

# DO NOT DO THIS. Use the email module instead
for line in patterned_range(msg, "^From:?", "^$"):
    pass #...
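# A minimal alternative sketch (not in the original): for the fixed
# line-number case, itertools.islice does the same job as extract_range above.
import itertools
for line in itertools.islice(open("/etc/passwd"), 3, 5):
    print line,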
Matching Shell Globs as Regular Expressions
tests = (("list.?",      r"^list\..$"),
         ("project.*",   r"^project\..*$"),
         ("*old",        r"^.*old$"),
         ("type*.[ch]",  r"^type.*\.[ch]$"),
         ("*.*",         r"^.*\..*$"),
         ("*",           r"^.*$"),
         )

# The book says convert "*", "?", "[", "]"; all other characters will be quoted.
# The book uses "\Q", which escapes any characters that would otherwise be
# treated as regular expression.
# Escaping every char fails, as "\s" is not "s" in a regex.

def glob2pat(globstr):
    pat = globstr.replace("\\", r"\\")
    pat = pat.replace(".", r"\.").replace("?", r".").replace("*", r".*")
    return "^" + pat + "$"

for globstr, patstr in tests:
    g2p = glob2pat(globstr)
    if g2p != patstr:
        print globstr, "failed! Should be", patstr, "but was", g2p
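# The standard library already provides this conversion; a minimal sketch
# with fnmatch.translate (note that its output may anchor the end of the
# pattern differently from the hand-rolled version above):
import fnmatch
print fnmatch.translate("type*.[ch]")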
Speeding Up Interpolated Matches
# download the following standalone program
#!/usr/bin/python
# popgrep1 - grep for abbreviations of places that say "pop"
# version 1: slow but obvious way
import fileinput
import re
popstates = ["CO", "ON", "MI", "WI", "MN"]
for line in fileinput.input():
    for state in popstates:
        if re.search(r"\b" + state + r"\b", line):
            print line

#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep2 - grep for abbreviations of places that say "pop"
# version 2: compile the patterns
import fileinput
import re
popstates = ["CO", "ON", "MI", "WI", "MN"]
state_re = []
for state in popstates:
    state_re.append(re.compile(r"\b" + state + r"\b"))
for line in fileinput.input():
    for state in state_re:
        if state.search(line):
            print line

#-----------------------------
# download the following standalone program
#!/usr/bin/python
# popgrep3 - grep for abbreviations of places that say "pop"
# version 3: compile a single pattern
import fileinput
import re
popstates = ["CO", "ON", "MI", "WI", "MN"]
state_re = re.compile(r"\b(?:" + "|".join(popstates) + r")\b")
for line in fileinput.input():
    if state_re.search(line):
        print line

#-----------------------------
# download the following standalone program
#!/usr/bin/python
# grepauth - print lines that mention both Tom and Nat
import fileinput
import re

def build_match_any(words):
    return re.compile("|".join(words))

def uniq(arr):
    seen = {}
    for item in arr:
        seen[item] = seen.get(item, 0) + 1
    return seen.keys()

def build_match_all(words):
    r = re.compile("|".join(words))
    c = lambda line: len(uniq(r.findall(line))) >= len(words)
    return c

any = build_match_any(("Tom", "Nat"))
all = build_match_all(("Tom", "Nat"))
for line in fileinput.input():
    if any.search(line):
        print "any:", line
    if all(line):
        print "all:", line
#-----------------------------
Testing for a Valid Pattern
# Testing for a Valid Pattern
import re
while True:
    pat = raw_input("Pattern? ")
    try:
        re.compile(pat)
    except re.error, err:
        print "INVALID PATTERN", err
        continue
    break

# ----
def is_valid_pattern(pat):
    try:
        re.compile(pat)
    except re.error:
        return False
    return True

# ----
# download the following standalone program
#!/usr/bin/python
# paragrep - trivial paragraph grepper
#
# differs from the perl version in parano: the python version reports
# the paragraph number within the current file.
import sys, os.path, re

if len(sys.argv) <= 1:
    print "usage: %s pat [files]\n" % sys.argv[0]
    sys.exit(1)
pat = sys.argv[1]
try:
    pat_re = re.compile(pat)
except:
    print "%s: bad pattern %s: %s" % (sys.argv[0], pat, sys.exc_info()[1])
    sys.exit(1)

for filename in filter(os.path.isfile, sys.argv[2:]):
    parano = 0
    for para in open(filename).read().split("\n\n"):
        parano += 1
        if pat_re.search(para):
            print filename, parano, para, "\n"

# ----
# as we don't evaluate patterns, the attack ::
#
#   $pat = "You lose @{[ system('rm -rf *') ]} big here";
#
# does not work.
Honoring Locale Settings in Regular Expressions
# download the following standalone program
#!/usr/bin/python
# localeg - demonstrates locale effects
#
# re must be told to respect the locale, either in the regexp with
# "(?L)" or as a flag to the call (python 2.4): "re.LOCALE".
import sys
import re, string
from locale import LC_CTYPE, setlocale, getlocale

name = "andreas k\xF6nig"
locale = {"German": "de_DE.ISO_8859-1", "English": "en_US"}
# us-ascii is not supported on linux py23
# none works in activestate py24

try:
    setlocale(LC_CTYPE, locale["English"])
except:
    print "Invalid locale %s" % locale["English"]
    sys.exit(1)

english_names = []
for n in re.findall(r"(?L)\b(\w+)\b", name):
    english_names.append(n.capitalize())

try:
    setlocale(LC_CTYPE, locale["German"])
except:
    print "Invalid locale %s" % locale["German"]
    sys.exit(1)

german_names = map(string.capitalize, re.findall(r"(?L)\b(\w+)\b", name))

print "English names: %s" % " ".join(english_names)
print "German names:  %s" % " ".join(german_names)
Approximate Matching
##-----------------------------
import difflib
matchlist = ["ape", "apple", "lapel", "peach", "puppy"]
print difflib.get_close_matches("appel", matchlist)
#=> ['lapel', 'apple', 'ape']

##-----------------------------
# Also see:
# http://www.personal.psu.edu/staff/i/u/iua1/python/apse/
# http://www.bio.cam.ac.uk/~mw263/pyagrep.html
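# A minimal sketch (my addition) of scoring one pair of strings directly
# with difflib.SequenceMatcher, which get_close_matches uses internally:
import difflib
print difflib.SequenceMatcher(None, "appel", "apple").ratio()
#=> 0.8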
Matching from Where the Last Pattern Left Off
##-----------------------------
# To search (potentially) repeatedly for a pattern, use re.finditer():

# DO NOT DO THIS. Split on commas and convert the elements with int()
mystr = "3,4,5,9,120"
for match in re.finditer("(\d+)", mystr):
    n = match.group(0)
    if n == "9":
        break          # '120' will never be matched
    print "Found number", n

# matches know their end position
mystr = "The year 1752 lost 10 days on the 3rd of September"
x = re.finditer("(\d+)", mystr)
for match in x:
    n = match.group(0)
    print "Found number", n
tail = re.match("(\S+)", mystr[match.end():])
if tail:
    print "Found %s after the last number." % tail.group(0)
Greedy and Non-Greedy Matches
# Python's regexes are based on Perl's, so it has the non-greedy
# '*?', '+?', and '??' versions of '*', '+', and '?'.

# DO NOT DO THIS. import htmllib, formatter, etc., instead
#-----------------------------
# greedy pattern
txt = re.sub("<.*>", "", txt)    # try to remove tags, very badly

# non-greedy pattern
txt = re.sub("<.*?>", "", txt)   # try to remove tags, still rather badly

#-----------------------------
txt = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>"
print re.findall("<b><i>(.*?)</i></b>", txt)

##-----------------------------
print re.findall("BEGIN((?:(?!BEGIN).)*)END", txt)

##-----------------------------
print re.findall("<b><i>((?:(?!<b>|<i>).)*)</i></b>", txt)

##-----------------------------
print re.findall("<b><i>((?:(?!<[ib]>).)*)</i></b>", txt)

##-----------------------------
print re.findall("""
    <b><i>
    [^<]*            # stuff not possibly bad, and not possibly the end.
    (?:              # at this point, we can have '<' if not part of something bad
      (?! </?[ib]> ) # what we can't have
      <              # okay, so match the '<'
      [^<]*          # and continue with more safe stuff
    ) *
    </i></b>
    """, txt, re.VERBOSE)
##-----------------------------
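# A minimal sketch (my own, not the recipe's) of the parser-based approach
# the comment above recommends instead of regexes, here using the standard
# HTMLParser module (Python 2):
from HTMLParser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.chunks = []
    def handle_data(self, data):      # called for text between tags
        self.chunks.append(data)

p = TextExtractor()
p.feed("<b><i>this</i> and <i>that</i> are important</b>")
p.close()
print "".join(p.chunks)
#=> this and that are important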
Detecting Duplicate Words
##-----------------------------
text = """
This is a test
test of the duplicate word finder.
"""

words = text.split()
for curr, next in zip(words[:-1], words[1:]):
    if curr.upper() == next.upper():
        print "Duplicate word '%s' found." % curr

# DON'T DO THIS
import re
pat = r"""
\b          # start at a word boundary (begin letters)
(\S+)       # find chunk of non-whitespace
\b          # until another word boundary (end letters)
(
  \s+       # separated by some whitespace
  \1        # and that very same chunk again
  \b        # until another word boundary
) +         # one or more sets of those
"""
for match in re.finditer(pat, text, flags=re.VERBOSE | re.IGNORECASE):
    print "Duplicate word '%s' found." % match.group(1)

##-----------------------------
a = 'nobody'; b = 'bodysnatcher'
text = a + " " + b
pat = r"^(\w+)(\w+) \2(\w+)$"
for match in re.finditer(pat, text):
    m1, m2, m3 = match.groups()
    print m2, "overlaps in %s-%s-%s" % (m1, m2, m3)

##-----------------------------
pat = r"^(\w+?)(\w+) \2(\w+)$"

##-----------------------------
try:
    while True:
        factor = re.match(r"^(oo+?)\1+$", n).group(1)
        n = re.sub(factor, "o", n)
        print len(factor)
except AttributeError:
    print len(n)

##-----------------------------
def diaphantine(n, x, y, z):
    pat = r"^(o*)\1{%s}(o*)\2{%s}(o*)\3{%s}$" % (x-1, y-1, z-1)
    text = "o" * n
    try:
        vals = [len(v) for v in re.match(pat, text).groups()]
    except AttributeError:    # re.match returned None: no match
        print "No solutions."
    else:
        print "One solution is: x=%s, y=%s, z=%s." % tuple(vals)

diaphantine(n=281, x=12, y=15, z=16)
Expressing AND, OR, and NOT in a Single Pattern
##-----------------------------
# Pass any of the following patterns to re.match(), etc.
pat = "ALPHA|BETA"
pat = "^(?=.*ALPHA)(?=.*BETA)"
pat = "ALPHA.*BETA|BETA.*ALPHA"
pat = "^(?:(?!PAT).)*$"
pat = "(?=^(?:(?!BAD).)*$)GOOD"

##-----------------------------
if not re.match(pattern, text):
    something()

##-----------------------------
if re.match(pat1, text) and re.match(pat2, text):
    something()

##-----------------------------
if re.match(pat1, text) or re.match(pat2, text):
    something()

##-----------------------------
# DON'T DO THIS.
"""minigrep - trivial grep"""
import sys, re
pat = sys.argv[1]
for line in sys.stdin:
    if re.match(pat, line):
        print line[:-1]

##-----------------------------
if re.match(r"^(?=.*bell)(?=.*lab)", "labelled"):
    something()

##-----------------------------
if re.search("bell", s) and re.search("lab", s):
    something()

##-----------------------------
if re.match("""
    ^           # start of string
    (?=         # zero-width lookahead
      .*        # any amount of intervening stuff
      bell      # the desired bell string
    )           # rewind, since we were only looking
    (?=         # and do the same thing
      .*        # any amount of intervening stuff
      lab       # and the lab part
    )
    """, murray_hill, re.DOTALL | re.VERBOSE):
    print "Looks like Bell Labs might be in Murray Hill!"

##-----------------------------
if re.match(r"(?:^.*bell.*lab)|(?:^.*lab.*bell)", "labelled"):
    something()

##-----------------------------
brand = "labelled"
if re.match("""
    (?:         # non-capturing grouper
      ^ .*?     # any amount of stuff at the front
      bell      # look for a bell
      .*?       # followed by any amount of anything
      lab       # look for a lab
    )           # end grouper
    |           # otherwise, try the other direction
    (?:         # non-capturing grouper
      ^ .*?     # any amount of stuff at the front
      lab       # look for a lab
      .*?       # followed by any amount of anything
      bell      # followed by a bell
    )           # end grouper
    """, brand, re.DOTALL | re.VERBOSE):
    print "Our brand has bell and lab separate."

##-----------------------------
x = "odlaw"
if re.match("^(?:(?!waldo).)*$", x):
    print "There's no waldo here!"

##-----------------------------
if re.match("""
    ^           # start of string
    (?:         # non-capturing grouper
      (?!       # look ahead negation
        waldo   # is he ahead of us now?
      )         # if so, the negation failed
      .         # any character (because of DOTALL)
    ) *         # repeat that grouping 0 or more times
    $           # through the end of the string
    """, x, re.VERBOSE | re.DOTALL):
    print "There's no waldo here!\n"
##-----------------------------
Matching Multiple-Byte Characters
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Matching a Valid Mail Address
##-----------------------------
from email._parseaddr import AddressList

print AddressList("fred&barney@stonehenge.com").addresslist[0]
print AddressList("fred&barney@stonehenge.com (Hanna Barbara)").addresslist[0]

name, address = AddressList("Mr Fooby Blah <me@nowhere.com>").addresslist[0]
print "%s's address is '%s'" % (name, address)
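# A minimal sketch using the public helper email.utils.parseaddr instead of
# the private _parseaddr module; it returns a (realname, address) pair:
from email.utils import parseaddr
name, address = parseaddr("Mr Fooby Blah <me@nowhere.com>")
print "%s's address is '%s'" % (name, address)
#=> Mr Fooby Blah's address is 'me@nowhere.com'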
Matching Abbreviations
##-----------------------------
# Assuming the strings all start with different letters, or you don't
# mind there being precedence, use the startswith string method:
def get_action(answer):
    answer = answer.lower()
    actions = ["send", "stop", "abort", "list", "end"]
    for action in actions:
        if action.startswith(answer):
            return action

print "Action is %s." % get_action("L")
#=> Action is list.

##-----------------------------
# DON'T DO THIS:
import re
answer = "ab"
answer = re.escape(answer.strip())
for action in ("SEND", "STOP", "ABORT", "LIST", "EDIT"):
    if re.match(answer, action, flags=re.IGNORECASE):
        print "Action is %s." % action.lower()

##-----------------------------
import re, sys

def handle_cmd(cmd):
    cmd = re.escape(cmd.strip())
    commands = {"edit":  invoke_editor,
                "send":  deliver_message,
                "list":  lambda: system(pager, myfile),
                "abort": sys.exit,
                }
    for name, action in commands.items():
        if re.match(cmd, name, flags=re.IGNORECASE):
            action()
            break
    else:
        print "Unknown command:", cmd

handle_cmd("ab")
Program: urlify
##-----------------------------
# urlify - wrap HTML links around URL-like constructs

import re, sys, fileinput

def urlify_string(s):
    urls = r'(http|telnet|gopher|file|wais|ftp)'
    ltrs = r'\w'
    gunk = r'/#~:.?+=&%@!\-'
    punc = r'.:?\-'
    any = ltrs + gunk + punc

    pat = re.compile(r"""
      \b                  # start at word boundary
      (                   # begin \1 {
        %(urls)s :        # need resource and a colon
        [%(any)s] +?      # followed by one or more
                          # of any valid character, but
                          # be conservative and take only
                          # what you need to....
      )                   # end \1 }
      (?=                 # look-ahead non-consumptive assertion
        [%(punc)s]*       # either 0 or more punctuation
        [^%(any)s]        # followed by a non-url char
        |                 # or else
        $                 # then end of the string
      )
      """ % locals(), re.VERBOSE | re.IGNORECASE)

    return re.sub(pat, r"<A HREF=\1>\1</A>", s)

if __name__ == "__main__":
    for line in fileinput.input():
        print urlify_string(line)
Program: tcgrep
##-----------------------------
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Regular Expression Grabbag
# The majority of regexes in this section are either partially
# or completely The Wrong Thing to Do.
##-----------------------------
# DON'T DO THIS. Use a Roman Numeral module, etc. (since
# you need one anyway to calculate values)
pat = r"^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$"
re.match(pat, "mcmlxcvii")

##-----------------------------
txt = "one two three four five"
# If the words are cleanly delimited just split and rejoin:
word1, word2, rest = txt.split(" ", 2)
print " ".join([word2, word1, rest])

# Otherwise:
frompat = r"(\S+)(\s+)(\S+)"
topat = r"\3\2\1"
print re.sub(frompat, topat, txt)

##-----------------------------
print "key=val".split("=")
# DON'T DO THIS
pat = r"(\w+)\s*=\s*(.*)\s*$"
print re.match(pat, "key=val").groups()

##-----------------------------
line = "such a very very very very very very very very very very very very very long line"
if len(line) > 80:
    process(line)
# DON'T DO THIS
pat = ".{80,}"
if re.match(pat, line):
    process(line)

##-----------------------------
dt = time.strptime("12/11/05 12:34:56", "%d/%m/%y %H:%M:%S")
# DON'T DO THIS
pat = r"(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)"
dt = re.match(pat, "12/11/05 12:34:56").groups()

##-----------------------------
txt = "/usr/bin/python"
print txt.replace("/usr/bin", "/usr/local/bin")
# Alternatively, for file operations use os.path, shutil, etc.
# DON'T DO THIS
print re.sub("/usr/bin", "/usr/local/bin", txt)

##-----------------------------
import re
def unescape_hex(matchobj):
    return chr(int(matchobj.groups(0)[0], 16))
txt = re.sub(r"%([0-9A-Fa-f][0-9A-Fa-f])", unescape_hex, txt)

# Assuming that the hex escaping is well-behaved, an alternative is:
def unescape_hex(seg):
    return chr(int(seg[:2], 16)) + seg[2:]
segs = txt.split("%")
txt = segs[0] + "".join(unescape_hex(seg) for seg in segs[1:])

##-----------------------------
txt = re.sub(r"""(?x)
      /\*     # Match the opening delimiter
      .*?     # Match a minimal number of characters
      \*/     # Match the closing delimiter
      """, "", txt)

##-----------------------------
txt = txt.strip()
# DON'T DO THIS
txt = re.sub(r"^\s+", "", txt)
txt = re.sub(r"\s+$", "", txt)

##-----------------------------
txt = txt.replace("\\n", "\n")
# DON'T DO THIS
txt = re.sub("\\n", "\n", txt)

##-----------------------------
txt = re.sub("^.*::", "", txt)

##-----------------------------
import socket
socket.inet_aton(txt)    # Will raise an error if incorrect
# DON'T DO THIS.
octseg = r"([01]?\d\d|2[0-4]\d|25[0-5])"
dot = r"\."
pat = "^" + octseg + dot + octseg + dot + octseg + dot + octseg + "$"
if not re.match(pat, txt, re.VERBOSE):
    raise ValueError
# Definitely DON'T DO THIS.
pat = r"""^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\.
([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$"""

##-----------------------------
fname = os.path.basename(path)
# DON'T DO THIS.
fname = re.sub("^.*/", "", path)

##-----------------------------
import os
try:
    tc = os.environ["TERMCAP"]
except KeyError:
    cols = 80
else:
    cols = re.search(r":co#(\d+):", tc).group(1)

##-----------------------------
# (not quite equivalent to the Perl version)
name = os.path.basename(sys.argv[0])
# DON'T DO THIS.
name = re.sub("^.*/", "", sys.argv[0])

##-----------------------------
if sys.platform != "linux":
    raise SystemExit("This isn't Linux")

##-----------------------------
txt = re.sub(r"\n\s+", " ", txt)
# In many cases you could just use:
txt = txt.replace("\n", " ")

##-----------------------------
nums = re.findall(r"\d+\.?\d*|\.\d+", txt)

##-----------------------------
# If the words are clearly delimited just use:
capwords = [word for word in txt.split() if word.isupper()]
# Otherwise
capwords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.isupper()]
# (probably) DON'T DO THIS.
capwords = re.findall(r"(\b[^\Wa-z0-9_]+\b)", txt)

##-----------------------------
# If the words are clearly delimited just use:
lowords = [word for word in txt.split() if word.islower()]
# Otherwise
lowords = [word for word in re.findall(r"\b(\S+)\b", txt) if word.islower()]
# (probably) DON'T DO THIS.
lowords = re.findall(r"(\b[^\WA-Z0-9_]+\b)", txt)

##-----------------------------
# If the words are clearly delimited just use:
icwords = [word for word in txt.split() if word.istitle()]
# Otherwise
icwords = [word for word in re.finditer(r"\b(\S+)\b", txt) if word.istitle()]
# DON'T DO THIS.
icwords = re.findall(r"(\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b)", txt)

##-----------------------------
# DON'T DO THIS - use HTMLParser, etc.
links = re.findall(r"""<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>""", txt)

##-----------------------------
names = txt.split()
if len(names) == 3:
    initial = names[1][0]
else:
    initial = ""
# DON'T DO THIS.
pat = r"^\S+\s+(\S)\S*\s+\S"
try:
    initial = re.match(pat, txt).group(1)
except AttributeError:
    initial = ""

##-----------------------------
txt = re.sub('"([^"]*)"', r"``\1''", txt)

##-----------------------------
sentences = [elem[0] for elem in re.findall(r"(.*?[!?.])( |\Z)", s)]

##-----------------------------
import time
dt = time.strptime(txt, "%Y-%m-%d")
# DON'T DO THIS.
year, month, day = re.match(r"(\d{4})-(\d\d)-(\d\d)", txt).groups()

##-----------------------------
pat = r"""
  ^
  (?:
    1 \s (?: \d\d\d \s)?    # 1, or 1 and area code
    |                       # ... or ...
    \(\d\d\d\) \s           # area code with parens
    |                       # ... or ...
    (?: \+\d\d?\d? \s)?     # optional +country code
    \d\d\d ([\s\-])         # and area code
  )
  \d\d\d (\s|\1)            # prefix (and area code separator)
  \d\d\d\d                  # exchange
  $
"""
re.match(pat, txt, re.VERBOSE)

##-----------------------------
re.match(r"\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b", txt, re.IGNORECASE)

##-----------------------------
for line in file(fname, "Ur"):   # universal newlines
    process(line)
# DON'T DO THIS
lines = [re.sub(r"^([^\012\015]*)(\012\015?|\015\012?)", "", line)
         for line in file(fname)]
##-----------------------------