Groovy/FAQ/Поиск по шаблону
Материал из
Версия от 09:05, 8 декабря 2008; Root (Обсуждение | вклад)
Groovy · |
//----------------------------------------------------------------------------------
// Groovy has built-in language support for Regular Expressions:
// * Strings quoted with '/' characters ("slashy" strings) have special
//   escaping rules for backslashes and the like.
// * ~string           (regex pattern operator)
// * m =~ /pattern/    (regex find operator)
// * m ==~ /pattern/   (regex match operator)
// * patterns can be used in case expressions in a switch statement
// * string.replaceAll can take a closure expression as the second argument
// In addition, Groovy can make use of Java's Pattern, Matcher and Scanner classes
// directly. (The sugar coating mentioned above sits on top of these anyway).
// There are also additional open source Java regex libraries which can be used.

meadow1 = 'cow grass butterflies Ovine'
meadow2 = 'goat sheep flowers dog'
// pattern strings can benefit from 'slashy' quotes
partial = /sheep/
full = /.*sheep.*/

// find operator: yields a Matcher, which is "true" when find() succeeds
assert !(meadow1 =~ partial)
assert meadow2 =~ partial
finder = (meadow2 =~ partial)
// underneath Groovy sugar coating is Java implementation
assert finder instanceof java.util.regex.Matcher

// match operator: the whole string must match
assert !(meadow1 ==~ full)
assert meadow2 ==~ full
matcher = (meadow2 ==~ full)
// under the covers is just a boolean
assert matcher instanceof Boolean

assert meadow1 =~ /(?i)\bovines?\b/ // (?i) == case flag

string = 'good food'
println string.replaceFirst(/o*/, 'e')
// => egood food
println string.replaceAll(/o*/, 'e')
// => egeede efeede (global)
// beware this one is just textual replacement
println string.replace(/o*/, 'e')
// => good food
println 'o*o*'.replace(/o*/, 'e')
// => ee

// groovy -e "m = args[0] =~ /(a|ba|b)+(a|ac)+/; if (m.matches()) println m[0][0]" ababacaca
// => ababa

digits = "123456789"
nonlap = digits =~ /\d\d\d/
assert nonlap.count == 3
print 'Non-overlapping: '
(0..<nonlap.count).each{ print nonlap[it] + ' ' }; print '\n'
print 'Overlapping: '
// a zero-width lookahead consumes nothing, so matches may overlap;
// the digits themselves are captured in group 1
yeslap = (digits =~ /(?=(\d\d\d))/)
assert yeslap.count == 7
(0..<yeslap.count).each{ print yeslap[it][1] + ' ' }; print '\n'
// Non-overlapping: 123 456 789
// Overlapping: 123 234 345 456 567 678 789

string = 'And little lambs eat ivy'
// Greedy version
parts = string =~ /(.*)(l[^s]*s)(.*)/
// print every capture group (1..groupCount) of the first match
(1..parts.groupCount()).each{ print "(${parts[0][it]}) " }; print '\n'
// (And little ) (lambs) ( eat ivy)
// Reluctant version
parts = string =~ /(.*?)(l[^s]*s)(.*)/
(1..parts.groupCount()).each{ print "(${parts[0][it]}) " }; print '\n'
// (And ) (little lambs) ( eat ivy)
//----------------------------------------------------------------------------------
Copying and Substituting Simultaneously
//----------------------------------------------------------------------------------
// Strings are immutable: the substitution result is simply assigned to a
// second variable, so source and destination never interfere.
src = 'Go this way'
dst = src.replaceFirst('this', 'that')
assert dst == 'Go that way'

// extract basename: drop everything up to and including the last '/'
src = 'c:/some/path/file.ext'
dst = src.replaceFirst('^.*/', '')
assert dst == 'file.ext'

// Make All Words Title-Cased (not that you would do it this way)
// The preprocessing operations \X where X is one of l, u, L, and U are not supported
// in the sun regex library but other Java regex libraries may support this. Instead,
// each lower-case letter is upper-cased when it follows a non-letter or starts the string:
src = 'make all words title-cased'
dst = src
for (letter in 'a'..'z') {
    def wordInitial = /([^a-zA-Z])/ + letter + /|\A/ + letter
    // $1 re-inserts the preceding non-letter (it is empty for the \A alternative)
    dst = dst.replaceAll(wordInitial, /$1/ + letter.toUpperCase())
}
assert dst == 'Make All Words Title-Cased'

// rename list of dirs
bindirs = '/usr/bin /bin /usr/local/bin'.split(' ').toList()
expected = '/usr/lib /lib /usr/local/lib'.split(' ').toList()
libdirs = bindirs*.replaceFirst('bin', 'lib')
assert libdirs == expected
//----------------------------------------------------------------------------------
Matching Letters
//----------------------------------------------------------------------------------
// Groovy uses Java regex (other Java regex packages would also be possible)
// It doesn't support Locale-based settings but you can roll your own to some
// extent, you can use any Unicode characters as per below and you can use
// \p{Punct} Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// or the other special character classes
words = '''
silly
façade
coöperate
niño
Renée
Moliçre
hæmoglobin
naïve
tschüß
random!stuff#here\u0948
'''
greekAlpha = '\u0391'
special = 'çéüßöñàæï?' + greekAlpha

// one candidate per non-blank line
def candidates = words.split('\n').findAll { it.trim() }

// flag as either Y (alphabetic) or N (not): \w widened with the extra letters
results = candidates.collect { it ==~ (/^[\w/+special+/]+$/) ? 'Y' : 'N' }.join()
assert results == 'YYYYYYYYYN'

// same classification, this time by excluding punctuation
results = candidates.collect { it ==~ /^[^\p{Punct}]+$/ ? 'Y' : 'N' }.join()
assert results == 'YYYYYYYYYN'
//----------------------------------------------------------------------------------
Matching Words
//----------------------------------------------------------------------------------
// as many non-whitespace characters as possible between an 'a' and a 'z'
finder = ('abczqz z' =~ /a\S+z/)
assert finder[0] == 'abczqz'

// as many letters, apostrophes, and hyphens
finder = ("aAzZ'z-z0z" =~ /a[A-Za-z'-]+z/)
assert finder[0] == "aAzZ'z-z"

// selecting words
// word boundaries: usually best
finder = ('23rd Psalm' =~ /\b([A-Za-z]+)\b/)
println finder[0][0]
// => Psalm (23rd is not matched)

// surrounding whitespace: fails at ends or w/ punctuation
finder = ('23rd Psalm' =~ /\s([A-Za-z]+)\s/)
println finder.matches()
// => false (no whitespaces at ends)
//----------------------------------------------------------------------------------
Commenting Regular Expressions
//----------------------------------------------------------------------------------
// NOTE(review): the sample host names appear to have been stripped from this
// page (the input had degraded to ' and'); reserved example domains are used
// here so that the demonstration matches something again - confirm/replace.
str = 'www.example.com and www.example.org'
re = '''(?x)          # to enable whitespace and comments
      (               # capture the hostname in $1
        (?:           # these parens for grouping only
          (?! [-_] )  # lookahead for neither underscore nor dash
          [\\w-] +    # hostname component
          \\.         # and the domain dot
        ) +           # now repeat that whole thing a bunch of times
        [A-Za-z]      # next must be a letter
        [\\w-] +      # now trailing domain part
      )               # end of $1 capture
'''

finder = str =~ re
out = str
(0..<finder.count).each{
    adr = finder[it][0]
    // literal replace: a host name contains '.', which replaceAll would treat
    // as "any character"
    out = out.replace(adr, "$adr [${InetAddress.getByName(adr).hostAddress}]".toString())
}
println out
// => www.example.com [<address>] and www.example.org [<address>]

// to match whitespace or #-characters in an extended re you need to escape them.
foo = 42
str = 'blah #foo# blah'
re = '''(?x)     # to enable whitespace and comments
        \\#      # a pound sign
        (\\w+)   # the variable name
        \\#      # another pound sign
'''
finder = str =~ re
found = finder[0]
// found[0] is the whole '#name#' token, found[1] the bare variable name.
// NOTE: evaluate() runs the captured text as code - only safe on trusted input.
out = str.replace(found[0], evaluate(found[1]).toString())
assert out == 'blah 42 blah'
//----------------------------------------------------------------------------------
Finding the Nth Occurrence of a Match
//----------------------------------------------------------------------------------
fish = 'One fish two fish red fish blue fish'
expected = 'The third fish is a red one.'

// skip two "<word> fish" pairs, then capture the word in front of the third fish
thirdFish = /(?:\w+\s+fish\s+){2}(\w+)\s+fish.*/
assert expected == (fish.replaceAll(thirdFish, 'The third fish is a $1 one.'))

anyFish = /(\w+)\s+fish\b/
finder = (fish =~ anyFish)
// finder[n] holds the groups of match n (counting from 0);
// so [2] is the third match and [1] its captured word
out = "The third fish is a ${finder[2][1]} one."
assert out == expected

// odd indices are the 2nd, 4th, ... fish
evens = (0..<finder.count).findAll { it % 2 != 0 }.collect { finder[it][1] }
println "Even numbered fish are ${evens.join(' ')}."
// => Even numbered fish are two blue.

// one of several ways to do this:
// four (word)(fish-part) pairs followed by the rest => groups 1..9
pond = fish + ' in the pond'
fishInPond = (/(\w+)(\s+fish\b\s*)/) * 4 + /(.*)/
found = (pond =~ fishInPond)[0]
// swap group 7 (the fourth fish's word) for 'sushi'
println ((found[1..6] + 'sushi' + found[8..9]).join())
// => One fish two fish red fish sushi fish in the pond

// find last fish
expected = 'Last fish is blue'
pond = 'One fish two fish red fish blue fish swim here.'
finder = (pond =~ anyFish)
def lastIndex = finder.count - 1
assert expected == "Last fish is ${finder[lastIndex][1]}"
// => Last fish is blue

// greedy match version of above: a leading .* swallows all earlier fish
finder = (pond =~ /.*\b/ + anyFish)
assert expected == "Last fish is ${finder[0][1]}"

// last fish match version of above: no further 'fish' may follow
finder = (pond =~ /\b(\w+)\s+fish\b(?!.*\bfish\b)/)
assert expected == "Last fish is ${finder[0][1]}"
//----------------------------------------------------------------------------------
Matching Multiple Lines
//----------------------------------------------------------------------------------
// Html Stripper
// get this using: fakedfile = new File('path_to_file.htm').text
fakedFile = '''
<html>
<head><title>Chapter 1 Title</title></head>
<body>
<h1>Chapter 1: Some Heading</h1>
A paragraph.
</body>
</html>
'''

stripExpectations = '''
Chapter 1 Title

Chapter 1: Some Heading
A paragraph.
'''.trim()

// remove every tag (reluctant match up to the nearest '>')
stripped = fakedFile.replaceAll(/(?m)<.*?>/,'').trim()
assert stripExpectations == stripped

pattern = '''(?x)
  (                    # capture in $1
    Chapter            # text string
    \\s+               # mandatory whitespace
    \\d+               # decimal number
    \\s*               # optional whitespace
    :                  # a real colon
    . *                # anything not a newline till end of line
  )
'''

headerfyExpectations = '''
Chapter 1 Title

<H1>Chapter 1: Some Heading</H1>
A paragraph.
'''.trim()

// wrap each "Chapter N: ..." line in a heading element
headerfied = stripped.replaceAll(pattern, '<H1>$1</H1>')
assert headerfyExpectations == headerfied

// one liner equivalent which prints to stdout
//% groovy -p -e "line.replaceAll(/^(Chapter\s+\d+\s*:.*)/,'<H1>$1</H1>')"

// one liner equivalent which modifies file in place and creates *.bak original file
//% groovy -pi .bak -e "line.replaceAll(/^(Chapter\s+\d+\s*:.*)/,'<H1>$1</H1>')"

// use: realFileInput = new File(path_to_file).text
fakeFileInput = '''
0
START
1
2
END
3
4
5
START
6
END
'''

// (?s) lets '.' cross newlines, (?m) anchors ^ at every line start
chunkyPattern = /(?ms)^START(.*?)^END/
finder = fakeFileInput =~ chunkyPattern
(0..<finder.count).each { idx ->
    def lineCount = new StringTokenizer(finder[idx][1], '\n').countTokens()
    println "Chunk #$idx contains $lineCount lines."
}
// =>
// Chunk #0 contains 2 lines.
// Chunk #1 contains 1 lines.
//----------------------------------------------------------------------------------
Reading Records with a Pattern Separator
//----------------------------------------------------------------------------------
// general pattern is:
//file = new File("datafile").text.split(/pattern/)
// .Ch, .Se and .Ss divide chunks of input text
fakedFiletext = '''
.Ch
abc
.Se
def
.Ss
ghi
.Se
jkl
.Se
mno
.Ss
pqr
.Ch
stu
.Ch
vwx
.Se
yz!
'''
// a separator is a whole line consisting of one of the three directives
def separator = /(?m)^\.(Ch|Se|Ss)$/
chunks = fakedFiletext.split(separator)
println "I read ${chunks.size()} chunks."
// => I read 10 chunks.
//----------------------------------------------------------------------------------
Extracting a Range of Lines
//----------------------------------------------------------------------------------
// Groovy doesn't support the ~/BEGIN/ .. ~/END/ notation
// you have to emulate it as shown in the example below
// The from line number to line number processing is supported
// from the command line but not within a script, e.g.
// command-line to print lines 15 through 17 inclusive (see below)
// > groovy -p -e "if (count in 15..17) return line" datafile
// Within a script itself, you emulate the count by keeping state

htmlContent = '''
<h1>A Heading</h1>
Here is <XMP>inline AAA</XMP>.
And the bigger Example 2:
<XMP>
line BBB
line CCC
</XMP>
Done.
'''.trim()

examplePattern = /(?ms)<XMP>(.*?)<\/XMP>/
finder = htmlContent =~ examplePattern
(0..<finder.count).each {
    println "Example ${it+1}:"
    println finder[it][1]
}
// =>
// Example 1:
// inline AAA
// Example 2:
//
// line BBB
// line CCC
//

htmlContent.split('\n').eachWithIndex{ line, count ->
    if (count in 4..5) println line
}
// =>
// line BBB
// line CCC

// You would probably use a mail Api for this in Groovy
// NOTE(review): the e-mail addresses of this sample were stripped from the
// page; placeholder addresses on reserved example domains are used instead.
fakedMailInput = '''
From: A Person <someone@example.com>
To: <list@example.org>
Date: Sun, 31 Dec 2006 02:14:57 +1100

From: noone@example.com
To: <list@example.org>
Date: Sun, 31 Dec 2006 02:14:58 +1100

From: someone@example.com
To: <list@example.org>
Date: Sun, 31 Dec 2006 02:14:59 +1100
'''.trim()+'\n'

// count how often each sender address occurs
seen = [:]
fakedMailInput.split('\n').each{ line ->
    m = (line =~ /^From:?\s(.*)/)
    if (m) {
        addr = m[0][1] =~ /([^<>(),;\s]+\@[^<>(),;\s]+)/
        // a From line without a recognisable address is skipped
        // rather than failing on addr[0]
        if (addr) {
            x = addr[0][1]
            seen[x] = (seen[x] ?: 0) + 1
        }
    }
}
seen.each{ k,v -> println "Address $k seen $v time${v==1?'':'s'}." }
// =>
// Address someone@example.com seen 2 times.
// Address noone@example.com seen 1 time.
//----------------------------------------------------------------------------------
Matching Shell Globs as Regular Expressions
//----------------------------------------------------------------------------------
import java.util.regex.Pattern

names = '''
myFile.txt
oldFile.tex
myPicture.jpg
'''

/**
 * Translates a shell glob into a regex string that matches whole lines.
 * '*' becomes '.*', '?' becomes '.', '[' and ']' are passed through so that
 * character classes keep working; every other character is quoted literally.
 *
 * @param globstr the glob expression, e.g. '*.t*'
 * @return the regex source, wrapped in '(?m)^' ... '$'
 */
def glob2pat(globstr) {
    def patmap = [ '*':'.*', '?':'.', '[':'[', ']':']' ]
    // translate one character at a time (toList() yields one-character strings)
    def translated = globstr.toList().collect { c ->
        patmap.containsKey(c) ? patmap[c] : Pattern.quote(c)
    }
    '(?m)^' + translated.join('') + '$'
}

/** Asserts that exactly count lines of names match the glob pat. */
def checkNumMatches(pat, count) {
    assert (names =~ glob2pat(pat)).count == count
}

checkNumMatches('*.*', 3)
checkNumMatches('my*.*', 2)
checkNumMatches('*.t*', 2)
checkNumMatches('*File.*', 2)
checkNumMatches('*Rabbit*.*', 0)
//----------------------------------------------------------------------------------
Speeding Up Interpolated Matches
//----------------------------------------------------------------------------------
// All popgrepN(file) variants print the lines of file that mention one of the
// state codes as a whole word; they differ only in how the match is built.

// version 1: simple obvious way
states = 'CO ON MI WI MN'.split(' ').toList()

def popgrep1(file) {
    file.eachLine { line ->
        def mentioned = states.any { state -> line =~ /\b$state\b/ }
        if (mentioned) println line
    }
}
// popgrep1(new File('path_to_file'))

// version 2: eval strings; fast but hard to quote (SLOW)
def popgrep2(file) {
    // one generated 'if' per state, evaluated against the script variable 'line'
    def tests = states.collect { "if (!found && line =~ /\\b$it\\b/) found = true\n" }.join('')
    def code = 'def found = false\n' + tests + "if (found) println line\n"
    file.eachLine { line = it; evaluate(code) }
}
// popgrep2(new File('path_to_file'))

// version 2b: eval using switch/case (not in Perl cookbook) (SLOW)
def popgrep2b(file) {
    def cases = states.collect { "case ~/.*\\b$it\\b.*/:\nprintln line;break\n" }.join('')
    def code = 'switch(line) {\n' + cases + "default:break\n}\n"
    file.eachLine { line = it; evaluate(code) }
}
// popgrep2b(new File('path_to_file'))

// version3: build a match_any function as a GString
def popgrep3(file) {
    def alternatives = states.collect { "line =~ /\\b$it\\b/" }
    def code = alternatives.join('||')
    file.eachLine { line = it; if (evaluate(code)) println line }
}
// popgrep3(new File('path_to_file'))

// version4: pretty fast, but simple: compile all re's first:
patterns = states.collect{ ~/\b$it\b/ }
def popgrep4(file) {
    file.eachLine { line ->
        def hit = patterns.any { compiled -> compiled.matcher(line) }
        if (hit) println line
    }
}
// popgrep4(new File('path_to_file'))

// version5: faster (a single alternation)
str = states.collect{ /\b$it\b/ }.join('|')
def popgrep5(file) {
    file.eachLine { line ->
        if (line =~ str) println line
    }
}
// popgrep5(new File('path_to_file'))

// version5b: faster (like 5 but compiled outside loop)
pattern = ~states.collect{ /\b$it\b/ }.join('|')
def popgrep5b(file) {
    file.eachLine { line ->
        if (pattern.matcher(line)) println line
    }
}
// popgrep5b(new File('path_to_file'))

// speeds trials ON the current source file (~1200 lines)
// popgrep1  => 0.39s
// popgrep2  => 25.08s
// popgrep2b => 23.86s
// popgrep3  => 22.42s
// popgrep4  => 0.12s
// popgrep5  => 0.05s
// popgrep5b => 0.05s
// Groovy's built-in support is the way to go in terms of
// both speed and simplicity of understanding. Avoid using
// evaluate() unless you absolutely need it

// generic matching functions
input = '''
both cat and dog
neither
just a cat
just a dog
'''.split('\n').findAll{it.trim()}

/** True when line contains a match for at least one of patterns. */
def matchAny(line, patterns) {
    for (candidate in patterns) {
        if (line =~ candidate) return true
    }
    false
}

/** True when line contains a match for every one of patterns. */
def matchAll(line, patterns) {
    for (candidate in patterns) {
        if (!(line =~ candidate)) return false
    }
    true
}

assert input.findAll{ matchAny(it, ['cat','dog']) }.size() == 3
assert input.findAll{ matchAny(it, ['cat$','^n.*']) }.size() == 2
assert input.findAll{ matchAll(it, ['cat','dog']) }.size() == 1
assert input.findAll{ matchAll(it, ['cat$','^n.*']) }.size() == 0
//----------------------------------------------------------------------------------
Testing for a Valid Pattern
//----------------------------------------------------------------------------------
// patternCheckingScript:
// reads candidate patterns from standard input, one per line, and reports
// whether each one compiles
prompt = '\n> '
print 'Enter patterns to check:' + prompt
new BufferedReader(new InputStreamReader(System.in)).eachLine{ line ->
    try {
        Pattern.compile(line)
        print 'Valid' + prompt
    } catch (java.util.regex.PatternSyntaxException ex) {
        // ex.message already includes the offending pattern and a position marker
        print 'Invalid pattern: ' + ex.message + prompt
    }
}
// =>
// Enter patterns to check:
// > ab*.c
// Valid
// > ^\s+[^a-z]*$
// Valid
// > **
// Invalid pattern: Dangling meta character '*' near index 0
// **
// ^
//----------------------------------------------------------------------------------
Honoring Locale Settings in Regular Expressions
//----------------------------------------------------------------------------------
src = 'dierk könig'

// simplistic with locale issue: "after a non-ASCII-letter" also fires after 'ö'
dst = src
for (ch in 'a'..'z') {
    def initial = /(?<=[^a-zA-Z])/ + ch + /|\A/ + ch
    dst = dst.replaceAll(initial, ch.toUpperCase())
}
println dst
// => Dierk KöNig

// locale avoidance: rely on the regex engine's own notion of a word boundary
dst = src
for (ch in 'a'..'z') {
    def initial = /(?<=\A|\b)/ + ch
    dst = dst.replaceAll(initial, ch.toUpperCase())
}
println dst
// => Dierk König
//----------------------------------------------------------------------------------
Approximate Matching
//----------------------------------------------------------------------------------
// Several libraries exist for approximate (fuzzy) string matching; they
// support numerous algorithms.
// NOTE(review): the library links and the package name of the import below
// were stripped from this page. The metric classes used here (Levenshtein,
// MongeElkan, JaroWinkler, Soundex) match the SimMetrics library, so its
// package is assumed - confirm against the jar on your classpath.
import uk.ac.shef.wit.simmetrics.similaritymetrics.*

target = 'balast'
candidates = '''
quick
brown
fox
jumped
over
the
lazy
dog
ballast
ballasts
balustrade
balustrades
blast
blasted
blaster
blasters
blasting
blasts
'''.split('\n').findAll{it.trim()}
metrics = [new Levenshtein(), new MongeElkan(), new JaroWinkler(), new Soundex()]

/** Prints one table row: a right-aligned label followed by fixed-width cells. */
def out(name, results) {
    print name.padLeft(14) + ' '; results.each{print(it.padRight(16))}; println()
}

/** Prints a row of numeric results truncated to two decimal places. */
def outr(name, results){out(name, results.collect{''+((int)(it*100))/100})}

out ('Word/Metric', metrics.collect{it.shortDescriptionString} )
candidates.each{ w -> outr(w, metrics.collect{ m -> m.getSimilarity(target, w)} )}
// =>
//    Word/Metric Levenshtein     MongeElkan      JaroWinkler     Soundex
//          quick 0               0.11            0               0.66
//          brown 0.16            0.23            0.5             0.73
//            fox 0               0.2             0               0.66
//         jumped 0               0.2             0               0.66
//           over 0               0.44            0               0.55
//            the 0               0.33            0               0.55
//           lazy 0.33            0.5             0.44            0.66
//            dog 0               0.2             0               0.66
//        ballast 0.85            0.83            0.96            1
//       ballasts 0.75            0.83            0.94            0.94
//     balustrade 0.5             0.93            0.3             0.94
//    balustrades 0.45            0.93            0.3             0.94
//          blast 0.83            0.8             0.88            1
//        blasted 0.57            0.66            0.8             0.94
//        blaster 0.57            0.66            0.8             0.94
//       blasters 0.5             0.66            0.77            0.94
//       blasting 0.5             0.66            0.77            0.94
//         blasts 0.66            0.66            0.84            0.94
// to implement the example, iterate through /usr/dict/words selecting words
// where one or a combination of metrics are greater than some threshold
//----------------------------------------------------------------------------------
Matching from Where the Last Pattern Left Off
//----------------------------------------------------------------------------------
// \G anchors at the end of the previous match, so only the run of leading
// blanks is replaced (the three leading spaces give the documented result)
n = "   49 here"
println n.replaceAll(/\G /,'0')
// => 00049 here

str = "3,4,5,9,120"
print 'Found numbers:'
str.eachMatch(/\G,?(\d+)/){ print ' ' + it[1] }
println()
// => Found numbers: 3 4 5 9 120

// Groovy doesn't have the String.pos or a /c re modifier like Perl
// But it does have similar functionality. Matcher has start() and
// end() for find the position and Matcher's usePattern() allows
// you to swap patterns without changing the buffer position
text = 'the year 1752 lost 10 days on the 3rd of September'
p = ~/(?<=\D)(\d+)/
m = p.matcher(text)
while (m.find()) {
    println 'Found ' + m.group() + ' starting at pos ' + m.start() +
            ' and ending at pos ' + m.end()
}
// now reset pos back to between 1st and 2nd numbers
if (m.find(16)) { println 'Found ' + m.group() }
// =>
// Found 1752 starting at pos 9 and ending at pos 13
// Found 10 starting at pos 19 and ending at pos 21
// Found 3 starting at pos 34 and ending at pos 35
// Found 10

// Alternatively you can use Scanner in Java 5-7+:
p1 = ~/(?<=\D)(\d+)/
p2 = ~/\S+/
s = new Scanner(text)
// findInLine returns null once no further match exists, ending the loop
while ((f = s.findInLine(p1))) { println 'Found: ' + f }
if ((f = s.findInLine(p2))) { println "Found $f after the last number." }
// =>
// Found: 1752
// Found: 10
// Found: 3
// Found rd after the last number.
//----------------------------------------------------------------------------------
Greedy and Non-Greedy Matches
//----------------------------------------------------------------------------------
html = '<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>'

// tag strippers
greedyHtmlStripPattern    = ~/(?m)<.*>/     // not good
nonGreedyHtmlStripPattern = ~/(?m)<.*?>/    // not great

// contents of a <b><i>...</i></b> pair
simpleNested = ~/(?mx)<b><i>(.*?)<\/i><\/b>/

// match BEGIN, then not BEGIN, then END
generalPattern = ~/BEGIN((?:(?!BEGIN).)*)END/

// step one character at a time, refusing to walk over a closing tag
betterButInefficient1 = ~/(?mx)<b><i>( (?: (?!<\/b>|<\/i>). )* ) <\/i><\/b>/
betterButInefficient2 = ~/(?mx)<b><i>( (?: (?!<\/[ib]>). )* ) <\/i><\/b>/

// same idea, but consume runs of non-'<' characters in one go
efficientPattern = '''(?mx)
    <b><i>
    [^<]*                # stuff not possibly bad, and not possibly the end.
    (?:                  # at this point, we can have '<' if not part of something bad
      (?! </?[ib]> )     # what we can't have
      <                  # okay, so match the '<'
      [^<]*              # and continue with more safe stuff
    ) *
    </i></b>
'''
//----------------------------------------------------------------------------------
Detecting Duplicate Words
//----------------------------------------------------------------------------------
input = 'This is a test\nTest of the duplicate word finder.\n'
dupWordPattern = '''(?ix)
    \\b            # start at word boundary
    (\\S+)         # find chunk of non-whitespace
    \\b            # until a word boundary
    (
      \\s+         # followed by whitespace
      \\1          # and that same chunk again
      \\b          # and a word boundary
    ) +            # one or more times
'''
finder = (input =~ dupWordPattern)
println 'Found duplicate word: ' + finder[0][1]
// => Found duplicate word: test

// overlap: the tail of the first word is the head of the second
astr = 'nobody'
bstr = 'bodysnatcher'
m = ("$astr $bstr" =~ /^(\w+)(\w+) \2(\w+)$/)
actual = "${m[0][2]} overlaps in ${m[0][1]}-${m[0][2]}-${m[0][3]}"
assert actual == 'body overlaps in no-body-snatcher'

// prime factors in unary: strip the smallest repeating unit until none is left
cap = 'o' * 180
while (m = (cap =~ /^(oo+?)\1+$/)) {
    p1 = m[0][1]
    print p1.size() + ' '
    cap = cap.replaceAll(p1,'o')
}
println cap.size()
// => 2 2 3 3 5

// diophantine
// solve for 12x + 15y + 16z = 281, maximizing x
if ((m = ('o' * 281) =~ /^(o*)\1{11}(o*)\2{14}(o*)\3{15}$/)) {
    x = m[0][1].size()
    y = m[0][2].size()
    z = m[0][3].size()
    println "One solution is: x=$x; y=$y; z=$z"
} else println "No solution."
// => One solution is: x=17; y=3; z=2

// using different quantifiers:
// /^(o+)\1{11}(o+)\2{14}(o+)\3{15}$/
// => One solution is: x=17; y=3; z=2

// /^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$/
// => One solution is: x=0; y=7; z=11

// /^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$/
// => One solution is: x=1; y=3; z=14
//----------------------------------------------------------------------------------
Expressing AND, OR, and NOT in a Single Pattern
//----------------------------------------------------------------------------------
// Groovy doesn't currently support x!~y so you must use the !(x=~y) style

// alpha OR beta
def eitherOne = /alpha|beta/
assert 'alpha' ==~ eitherOne
assert 'beta' ==~ eitherOne
assert 'betalpha' =~ /alpha/ || 'betalpha' =~ /beta/

// alpha AND beta: two lookaheads evaluated from the same position
def bothAnywhere = /(?=.*alpha)(?=.*beta)/
assert !('alpha' =~ bothAnywhere)
assert 'alphabeta' =~ bothAnywhere
assert 'betalpha' =~ bothAnywhere
assert 'betalpha' =~ /alpha/ && 'betalpha' =~ /beta/

// alpha AND beta, no overlap
def bothApart = /alpha.*beta|beta.*alpha/
assert 'alphabeta' =~ bothApart
assert !('betalpha' =~ bothApart)

// NOT beta
def withoutBeta = /^(?:(?!beta).)*$/
assert 'alpha gamma' =~ withoutBeta
assert !('alpha beta gamma' =~ withoutBeta)

// NOT bad BUT good
def goodNotBad = /(?=(?:(?!BAD).)*$)GOOD/
assert !('GOOD and BAD' =~ goodNotBad)
assert !('BAD' =~ goodNotBad)
assert !('WORSE' =~ goodNotBad)
assert 'GOOD' =~ goodNotBad

// minigrep could be done as a one-liner as follows
// groovy -p -e "if (line =~ /pat/) return line" datafile

string = 'labelled'
assert string =~ /^(?=.*bell)(?=.*lab)/
assert string =~ /bell/ && string =~ 'lab'

fakeAddress = "blah bell blah "
murrayHillRegex = '''(?x)
    ^              # start of string
    (?=            # zero-width lookahead
        .*         # any amount of intervening stuff
        bell       # the desired bell string
    )              # rewind, since we were only looking
    (?=            # and do the same thing
        .*         # any amount of intervening stuff
        lab        # and the lab part
    )
'''
assert string =~ murrayHillRegex
assert !(fakeAddress =~ murrayHillRegex)

// eliminate overlapping
assert !(string =~ /(?:^.*bell.*lab)|(?:^.*lab.*bell)/)

brandRegex = '''(?x)
    (?:            # non-capturing grouper
        ^ .*?      # any amount of stuff at the front
        bell       # look for a bell
        .*?        # followed by any amount of anything
        lab        # look for a lab
    )              # end grouper
    |              # otherwise, try the other direction
    (?:            # non-capturing grouper
        ^ .*?      # any amount of stuff at the front
        lab        # look for a lab
        .*?        # followed by any amount of anything
        bell       # followed by a bell
    )              # end grouper
'''
assert !(string =~ brandRegex)

map = 'the great baldo'
assert map =~ /^(?:(?!waldo).)*$/

noWaldoRegex = '''(?x)
    ^              # start of string
    (?:            # non-capturing grouper
        (?!        # look ahead negation
            waldo  # is he ahead of us now?
        )          # is so, the negation failed
        .          # any character (cuzza /s)
    ) *            # repeat that grouping 0 or more
    $              # through the end of the string
'''
assert map =~ noWaldoRegex

// on unix systems use: realFakedInput = 'w'.process().text
fakedInput = '''
  7:15am  up 206 days, 13:30,  4 users,  load average: 1.04, 1.07, 1.04
USER     TTY      FROM              LOGIN@  IDLE   JCPU   PCPU  WHAT
tchrist  tty1                       5:16pm 36days 24:43   0.03s  xinit
tchrist  tty2                       5:19pm  6days  0.43s  0.43s  -tcsh
tchrist  ttyp0    chthon            7:58am  3days 23.44s  0.44s  -tcsh
gnat     ttyS4    coprolith         2:01pm 13:36m  0.30s  0.30s  -tcsh
'''.trim() + '\n'

/** Returns the lines of input that mention tchrist but contain no 'ttyp'. */
def miniGrepMethod(input) {
    def wanted = '^(?!.*ttyp).*tchrist'
    input.split('\n').findAll { row -> row =~ wanted }
}
assert miniGrepMethod(fakedInput).size() == 2

findUserRegex = '''(?xm)
    ^              # anchored to the start
    (?!            # zero-width look-ahead assertion
        .*         # any amount of anything (faster than .*?)
        ttyp       # the string you don't want to find
    )              # end look-ahead negation; rewind to start
    .*             # any amount of anything (faster than .*?)
    tchrist        # now try to find Tom
'''
assert (fakedInput =~ findUserRegex).count == 2
//----------------------------------------------------------------------------------
Matching Multiple-Byte Characters
//----------------------------------------------------------------------------------
// Groovy uses Unicode character encoding
// special care needs to be taken when using unicode because of the different
// representations, e.g. à can be encoded as the two-character sequence
// \u0061\u0300 and is also supported in legacy character sets by the single
// character \u00E0. To match this character in either form you can't rely on
// any one of /./, /../, /a/, /\u00E0/, /\u0061\u0300/ or /\pL/. The correct way
// is to use \X (not currently supported) or /\pL\pM*/ to ensure that it is a
// letter, or /\PM\pM*/ when you just want to combine multicharacter sequences
// and don't care whether it is a letter

/**
 * Prints the size of s followed by one line per probe pattern stating
 * whether s matches that pattern in its entirety.
 */
def checkUnicode(s) {
    println s + ' is of size ' + s.size()
    // [label as printed, pattern actually applied]
    def probes = [
        ['/./',              /./],
        ['/../',             /../],
        ['/a/',              /a/],
        ['/\\u00E0/',        /\u00E0/],
        ['/\\u0061\\u0300/', /\u0061\u0300/],
        ['/\\pL/',           /\pL/],
        ['/\\pL\\pM*/',      /\pL\pM*/],
        ['/\\PM\\pM*/',      /\PM\pM*/]
    ]
    probes.each { label, regex ->
        println 'Exactly matches ' + label + ' ' + (s ==~ regex)
    }
}
checkUnicode('à')
checkUnicode('\u0061\u0300')
checkUnicode('\u00E0')
// =>
// à is of size 1
// Exactly matches /./ true
// Exactly matches /../ false
// Exactly matches /a/ false
// Exactly matches /\u00E0/ true
// Exactly matches /\u0061\u0300/ false
// Exactly matches /\pL/ true
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
// a + combining grave accent is of size 2
// Exactly matches /./ false
// Exactly matches /../ true
// Exactly matches /a/ false
// Exactly matches /\u00E0/ false
// Exactly matches /\u0061\u0300/ true
// Exactly matches /\pL/ false
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
// à is of size 1
// Exactly matches /./ true
// Exactly matches /../ false
// Exactly matches /a/ false
// Exactly matches /\u00E0/ true
// Exactly matches /\u0061\u0300/ false
// Exactly matches /\pL/ true
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
//----------------------------------------------------------------------------------
Matching a Valid Mail Address
//----------------------------------------------------------------------------------
// The Perl Cookbook categorizes this as a hard problem ... mostly for
// reasons not related to the actual regex - but with a 60-line regex
// perhaps there are some issues with that too.
// NOTE(review): the sample addresses (and a "further details" link) were
// stripped from this page; placeholder addresses on the reserved example.com
// domain are used below so that the assertions are meaningful again.

// removes one level of parenthesised comment
simpleCommentStripper = /\([^()]*\)/
println 'Book Publishing <marketing@example.com> (We will spam you)'.replaceAll(simpleCommentStripper, '')
// => Book Publishing <marketing@example.com>

// inspired by the fact that domain names can contain any foreign character these days
modern = /^.+@[^\.].*\.[a-z]{2,}>?$/

// .Net
lenient = /\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/

// a little more checking
strict = /^[_a-zA-Z0-9- <]+(\.[_a-zA-Z0-9- <]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\./ +
         /(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))>?$/

addresses = ['someuser@example.com', 'Book Publishing <marketing@example.com>']
addresses.each{
    assert it =~ lenient
    assert it =~ strict
    assert it =~ modern
}
//----------------------------------------------------------------------------------
Matching Abbreviations
//----------------------------------------------------------------------------------
/**
 * Prints the first action (SEND, STOP, ABORT, EDIT - tried in that order)
 * of which ans is a case-insensitive prefix, or 'No Match'.
 */
def findAction(ans) {
    // quote the answer so that regex metacharacters in it are taken literally
    def re = '(?i)^' + Pattern.quote(ans)
    if ("SEND" =~ re) println "Action is send"
    else if ("STOP" =~ re) println "Action is stop"
    else if ("ABORT" =~ re) println "Action is abort"
    else if ("EDIT" =~ re) println "Action is edit"
    else println 'No Match'
}
findAction('edit something')
// => No Match
findAction('edit')
// => Action is edit
findAction('se')
// => Action is send
findAction('e')
// => Action is edit

/**
 * Builds a sorted table mapping every unambiguous prefix of each word to
 * that word. A prefix shared with another word of the list is left out.
 */
def buildAbbrev(words) {
    def table = new TreeMap()
    words.each{ w ->
        (0..<w.size()).each { n ->
            // keep w[0..n] only if no other word starts with the same prefix
            if (!(words - w).any{
                it.size() >= n+1 && it[0..n] == w[0..n]
            }) table[w[0..n]] = w
        }
    }
    table
}
println buildAbbrev('send stop abort edit'.split(' ').toList())
// => ["a":"abort", "ab":"abort", "abo":"abort", "abor":"abort", "abort":"abort",
//     "e":"edit", "ed":"edit", "edi":"edit", "edit":"edit", "se":"send", "sen":"send",
//     "send":"send", "st":"stop", "sto":"stop", "stop":"stop"]

// miniShellScript:
// dummy methods
def invokeEditor() { println "invoking editor" }
def deliverMessage() { println "delivering message at " + new Date() }
actions = [
    edit: this.&invokeEditor,
    send: this.&deliverMessage,
    list: { println Runtime.runtime.freeMemory() },
    abort: { System.exit(0) },
    unknown: { println "Unknown Command"}
]

table = buildAbbrev(actions.keySet().toList())
prompt = '\n> '
print 'Enter Commands: edit send list abort' + prompt
// read commands from standard input, one per line, until end of input
new BufferedReader(new InputStreamReader(System.in)).eachLine{ line ->
    // anything that is not an unambiguous abbreviation maps to 'unknown'
    def idx = (table.containsKey(line)) ? table[line] : 'unknown'
    actions[idx]()
    print prompt
}
//----------------------------------------------------------------------------------
Program: urlify
//----------------------------------------------------------------------------------
//% gunzip -c ~/mail/archive.gz | urlify > archive.urlified
//% urlify ~/mail/*.inbox > ~/allmail.urlified

// building blocks, interpolated into the extended-mode pattern below
urls = '(https?|telnet|gopher|file|wais|ftp|mail)'
ltrs = /\w/
gunk = /\#\/~:.?+=&%@!\-/
punc = /.:?\-/
doll = /$/
all = /$ltrs$gunk$punc/

findUrls = """(?ix)
      \\b                   # start at word boundary
      (                     # begin group 1  {
       $urls     :          # need resource and a colon
       [$all] +?            # followed by on or more of any valid
                            # character, but be conservative and
                            # take only what you need to...
      )                     # end   group 1  }
      (?=                   # look-ahead non-consumptive assertion
       [$punc]*             # either 0 or more punctuation
       [^$all]              # followed by a non-url character
       |                    # or else
       $doll                # then end of the string
      )
"""

// NOTE(review): the URLs of this sample were stripped from the page;
// placeholder URLs on the reserved example.com domain are used instead.
input = '''
If you find a typo on http://www.example.com please
send an email to mail:spelling.pedant@example.com
'''

println input.replaceAll(findUrls,'<a href="$1">$1</a>')
// =>
// If you find a typo on <a href="http://www.example.com">http://www.example.com</a> please
// send an email to <a href="mail:spelling.pedant@example.com">mail:spelling.pedant@example.com</a>

// urlifyScript:
// (as a stand-alone script the first line would be: #!/usr/bin/groovy)
// urlify - wrap HTML links around URL-like constructs
// definitions from above
args.each{ file ->
    new File(file).eachLine{ line ->
        println line.replaceAll(findUrls,'<a href="$1">$1</a>')
    }
}
//----------------------------------------------------------------------------------
Program: tcgrep
//----------------------------------------------------------------------------------
// @@INCOMPLETE@@
// @@INCOMPLETE@@
//----------------------------------------------------------------------------------
// not an exact equivalent to original cookbook but has
// a reasonable subset of mostly similar functionality
// instead of -r recursion option, use Ant fileset wildcards
// e.g. **/*.c. You can also specify an excludes pattern
// e.g. **/*.* -X **/*.h will process all but header files
// (currently not optimised and with minimal error checking)
// uses the jopt-simple library (joptsimple.OptionParser) for option parsing

op = new joptsimple.OptionParser()
NOCASE  = 'i'; op.accepts( NOCASE,  "case insensitive" )
WITHN   = 'n'; op.accepts( WITHN,   "display line/para with line/para number" )
WITHF   = 'H'; op.accepts( WITHF,   "display line/para with filename" )
NONAME  = 'h'; op.accepts( NONAME,  "hide filenames" )
COUNT   = 'c'; op.accepts( COUNT,   "give count of lines/paras matching" )
TCOUNT  = 'C'; op.accepts( TCOUNT,  "give count of total matches (multiple per line/para)" )
WORD    = 'w'; op.accepts( WORD,    "word boundaries only" )
EXACT   = 'x'; op.accepts( EXACT,   "exact matches only" )
INVERT  = 'v'; op.accepts( INVERT,  "invert search sense (lines that DON'T match)" )
EXCLUDE = 'X'; op.accepts( EXCLUDE, "exclude files matching pattern [default is '**/*.bak']" ).
                  withRequiredArg().describedAs('path_pattern')
MATCH   = 'l'; op.accepts( MATCH,   "list names of files with matches" )
NOMATCH = 'L'; op.accepts( NOMATCH, "list names of files with no match" )
PARA    = 'p'; op.accepts( PARA,    "para mode (.* matches newlines)" ).
                  withOptionalArg().describedAs('para_pattern')
EXPR    = 'e'; op.accepts( EXPR,    "expression (when pattern begins with '-')" ).
                  withRequiredArg().describedAs('pattern')
FILE    = 'f'; op.accepts( FILE,    "file containing pattern" ).
                  withRequiredArg().describedAs('filename')
HELP    = 'help'; op.accepts( HELP, "display this message" )

options = op.parse(args)
params = options.nonOptionArguments()
if (options.wasDetected( HELP )) {
    op.printHelpOn( System.out )
} else if (params.size() == 0) {
    println "Usage: grep [OPTION]... PATTERN [FILE]...\nTry 'grep --$HELP' for more information."
} else {
    modifiers = []
    paraPattern = ''
    o_withn   = options.wasDetected( WITHN )
    o_withf   = options.wasDetected( WITHF )
    o_noname  = options.wasDetected( NONAME )
    o_count   = options.wasDetected( COUNT )
    o_tcount  = options.wasDetected( TCOUNT )
    o_invert  = options.wasDetected( INVERT )
    o_match   = options.wasDetected( MATCH )
    o_nomatch = options.wasDetected( NOMATCH )
    if (options.wasDetected( EXPR )) {
        pattern = options.valueOf( EXPR )
    } else if (options.wasDetected( FILE )) {
        pattern = new File(options.valueOf( FILE )).text.trim()
    } else {
        // First non-option argument is the pattern; the rest (possibly none) are files.
        // A bare params[1..-1] misbehaves on a one-element list, which is exactly the
        // "pattern only, read stdin" case tested below.
        pattern = params[0]
        params = params.size() > 1 ? params[1..-1] : []
    }
    // Ant's fileset 'excludes' attribute is a pattern string, so the default
    // must be a String too (a List would be stringified as "[**/*.bak]").
    if (options.wasDetected( EXCLUDE )) excludes = options.valueOf( EXCLUDE )
    else excludes = '**/*.bak'
    if (options.wasDetected( EXACT )) pattern = '^' + pattern + '$'
    else if (options.wasDetected( WORD )) pattern = /\b$pattern\b/
    if (options.wasDetected( NOCASE )) modifiers += 'i'
    if (options.wasDetected( PARA )) {
        if (options.hasArgument( PARA )) paraPattern = options.valueOf( PARA )
        else paraPattern = '^$'
        paraPattern = '(?sm)' + paraPattern
        modifiers += 'sm'
    }
    if (modifiers) pattern = "(?${modifiers.join()})" + pattern

    // No file arguments: search standard input.
    if (params.size() == 0) grepStream(System.in, '<stdin>')
    else {
        scanner = new AntBuilder().fileScanner {
            fileset(dir:'.', includes:params.join(','), excludes:excludes)
        }
        for (f in scanner) {
            grepStream(new FileInputStream(f), f)
        }
    }
}

/**
 * Searches one input stream and prints results according to the option
 * flags set above (o_withf, o_withn, o_invert, o_match, o_nomatch, o_count,
 * o_tcount, o_noname) and the script-level pattern / paraPattern.
 *
 * @param s    the stream to search; split into paragraphs by paraPattern
 *             when that is set, otherwise into lines
 * @param name label printed for this stream (file or '<stdin>')
 */
def grepStream(s, name) {
    def count = 0
    def tcount = 0
    def pieces
    if (paraPattern) pieces = s.text.split(paraPattern)
    else pieces = s.readLines()
    // In these modes only a per-file summary is printed, not each matching piece.
    def fileMode = o_match || o_nomatch || o_count || o_tcount
    pieces.eachWithIndex{line, index ->
        def m = line =~ pattern
        boolean found = m.count
        // with -v, a piece is selected when it does NOT match
        if (found != o_invert) {
            count++
            tcount += m.count
            if (!fileMode) {
                def linefields = []
                if (o_withf) linefields += name
                if (o_withn) linefields += index + 1
                linefields += line
                println linefields.join(':')
            }
        }
    }
    def display = true
    if ((o_match && count == 0) || (o_nomatch && count != 0)) display = false
    if (fileMode && display) {
        def filefields = []
        if (!o_noname) filefields += name
        if (o_tcount) filefields += tcount
        else if (o_count) filefields += count
        println filefields.join(':')
    }
}
//----------------------------------------------------------------------------------
Regular Expression Grabbag
//----------------------------------------------------------------------------------
// A collection of small, self-checking regular expression examples.

// roman numerals
romans = /(?i)^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$/
assert 'cmxvi' =~ romans
// can't have tens before 1000s (M) or 100s (C) after 5s (V)
assert !('xmvci' =~ romans)

// swap first two words
assert 'the words'.replaceAll(/(\S+)(\s+)(\S+)/, '$3$2$1') == 'words the'

// extract keyword and value
m = 'k=v' =~ /(\w+)\s*=\s*(.*)\s*$/
assert m.matches()
assert m[0][1] == 'k'
assert m[0][2] == 'v'

// closure producing a "at least n characters" pattern
hasAtLeastSize = { n -> /.{$n,}/ }
assert 'abcdefghijklmnopqrstuvwxyz' =~ hasAtLeastSize(20)

// MM/DD/YY HH:MM:SS (lenient - doesn't check HH > 23 etc)
d = /\d+/
datetime = "($d)/($d)/($d) ($d):($d):($d)"
assert '04/05/2006 10:26:59' =~ datetime

// change a path prefix (the comparison is asserted so a wrong result is reported)
orig = '/usr/bin/vi'
expected = '/usr/local/bin/vi'
assert orig.replaceAll('/usr/bin','/usr/local/bin') == expected

// decode %XX escapes; the closure receives the whole match followed by its groups
escapeSequenceRegex = /%([0-9A-Fa-f][0-9A-Fa-f])/
convertEscapeToChar = { Object[] ch -> new Character((char)Integer.parseInt(ch[1],16)) }
assert 'abc%3cdef'.replaceAll(escapeSequenceRegex, convertEscapeToChar) == 'abc<def'

// strip C-style comments; the middle part is reluctant (.*?) so that each
// comment is removed on its own instead of everything up to the last '*/'
commentStripper = '''(?xms)
    /\\*        # Match the opening delimiter
    .*?         # Match a minimal number of characters */
    \\*/        # Match the closing delimiter
'''

input = '''
a line
/*
  some comment
*/
another line
'''
expected = '''
a line

another line
'''
assert input.replaceAll(commentStripper,'') == expected

// emulate s.trim()
assert '  x y  '.replaceAll(/^\s+/, '').replaceAll(/\s+$/, '') == 'x y'

// convert \\n into \n
assert (/a\nb/.replaceAll(/\\n/,"\n") == 'a\nb')

// remove package symbol (Groovy/Java doesn't use this as package symbol)
assert 'A::B'.replaceAll(/^.*::/, '') == 'B'

// match IP Address (requires leading 0's)
ipregex = /^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\./ +
    /([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$/
assert !('123.456.789' =~ ipregex)
assert '192.168.001.001' =~ ipregex

// extract basename
assert 'c:/usr/temp.txt'.replaceAll(/^.*\/{1}/, '') == 'temp.txt'

termcap = ':co#80:li#24:'
m = (termcap =~ /:co\#(\d+):/)
assert m.count == 1
assert m[0][1] == '80'

assert 'cmd c:/tmp/junk.txt'.replaceAll(/ \S+\/{1}/, ' ') == 'cmd junk.txt'

// operating system detection via the standard 'os.name' system property
os = System.getProperty('os.name')
println 'Is Linux? ' + (os ==~ /(?i)linux.*/)
println 'Is Windows? ' + (os ==~ /(?i)windows.*/)
println 'Is Mac? ' + (os ==~ /(?i)mac.*/)

// join multiline string
multi = '''
This is
    a test
'''.trim()
assert multi.replaceAll(/(?m)\n\s+/, ' ') == 'This is a test'

// nums in string
string = 'The 5th test was won today by 10 wickets after 10.5 overs'
nums = string =~ /(\d+\.?\d*|\.\d+)/
assert (0..<nums.count).collect{ nums[it][1] }.join(' ') == '5 10 10.5'

// capitalize words
words = 'the Capital words ARE hiding'
capwords = words =~ /(\b\p{Upper}+\b)/
assert (0..<capwords.count).collect{ capwords[it][1] }.join(' ') == 'ARE'
lowords = words =~ /(\b\p{Lower}+\b)/
assert (0..<lowords.count).collect{ lowords[it][1] }.join(' ') == 'the words hiding'
capWords = words =~ /(\b\p{Upper}\p{Lower}*\b)/
assert (0..<capWords.count).collect{ capWords[it][1] }.join(' ') == 'Capital'

// extract the href of every anchor tag
input = '''
If you find a typo on <a href="http://www.example.com">http://www.example.com</a> please
send an email to <a href="mail:someone@example.com">mail:someone@example.com</a>
'''

linkRegex = /(?im)<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>/ //'
links = input =~ linkRegex
(0..<links.count).each{ println links[it][1] }
// =>
// http://www.example.com
// mail:someone@example.com

// find middle initial if any
m = 'Lee Harvey Oswald' =~ /^\S+\s+(\S)\S*\s+\S/
initial = m.count ? m[0][1] : ""
assert initial == 'H'

// inch marks to quotes
println 'I said "Hello" to you.'.replaceAll(/"([^"]*)"/, /``$1''/) //"
// => I said ``Hello'' to you.

// extract sentences (2 spaces or newline after punctuation)
input = '''
Is this a sentence?
Yes!  And so
is this.  And the fourth.
'''
sentences = []
// normalise: punctuation+newline becomes punctuation+two spaces, remaining
// newlines become a single space, and longer runs of spaces shrink to two
strip = input.replaceAll(/(\p{Punct})\n/, '$1  ').replaceAll(/\n/, ' ').replaceAll(/ {3,}/,'  ')
m = strip =~ /(\S.*?\p{Punct})(?=  |\Z)/
(0..<m.count).each{ sentences += m[it][1] }
assert sentences == ["Is this a sentence?", "Yes!", "And so is this.", "And the fourth."]

// YYYY-MM-DD
m = '2007-2-28' =~ /(\d{4})-(\d\d?)-(\d\d?)/
assert m.matches()
assert ['2007', '2', '28'] == [m[0][1], m[0][2], m[0][3]]

// US phone numbers, one candidate per line
usPhoneRegex = /^[01]?[- .]?(\([2-9]\d{2}\)|[2-9]\d{2})[- .]?\d{3}[- .]?\d{4}$/
numbers = '''
(425) 555-0123
425-555-0123
425 555 0123
1-425-555-0123
'''.trim().split('\n').toList()
assert numbers.every{ it ==~ usPhoneRegex }

exclaimRegex = /(?i)\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/
assert 'Oh my Goodness!' =~ exclaimRegex
assert !('Golly gosh' =~ exclaimRegex)

// lines terminated by CR, LF, CRLF or LFCR; the final unterminated line is not counted
input = 'line 1\rline 2\nline\r\nline 3\n\rline 4'
m = input =~ /(?m)^([^\012\015]*)(\012\015?|\015\012?)/
assert m.count == 4