|
Groovy/FAQ/Поиск по шаблону
Материал из Wiki.crossplatform.ru
[править] Introduction
//----------------------------------------------------------------------------------
// Groovy has built-in language support for Regular Expressions:
// * Strings quoted with '/' characters have special escaping
// rules for backslashes and the like.
// * ~string (regex pattern operator)
// * m =~ /pattern/ (regex find operator)
// * m ==~/pattern/ (regex match operator)
// * patterns can be used in case expressions in a switch statement
// * string.replaceAll can take a closure expression as the second argument
// In addition, Groovy can make use of Java's Pattern, Matcher and Scanner classes
// directly. (The sugar coating mentioned above sits on top of these anyway).
// There are also additional open source Java regex libraries which can be used.
meadow1 = 'cow grass butterflies Ovine'
meadow2 = 'goat sheep flowers dog'
// pattern strings can benefit from 'slashy' quotes
partial = /sheep/
full = /.*sheep.*/
// find operator
assert !(meadow1 =~ partial)
assert meadow2 =~ partial
finder = (meadow2 =~ partial)
// underneath Groovy sugar coating is Java implementation
assert finder instanceof java.util.regex.Matcher
// match operator
assert !(meadow1 ==~ full)
assert meadow2 ==~ full
matcher = (meadow2 ==~ full)
// under the covers is just a boolean
assert matcher instanceof Boolean
assert meadow1 =~ /(?i)\bovines?\b/ // (?i) == case flag
string = 'good food'
println string.replaceFirst(/o*/, 'e')
// => egood food
println string.replaceAll(/o*/, 'e')
// => egeede efeede (global)
// beware this one is just textual replacement
println string.replace(/o*/, 'e')
// => good food
println 'o*o*'.replace(/o*/, 'e')
// => ee
// groovy -e "m = args[0] =~ /(a|ba|b)+(a|ac)+/; if (m.matches()) println m[0][0]" ababacaca
// => ababa
digits = "123456789"
nonlap = digits =~ /\d\d\d/
assert nonlap.count == 3
print 'Non-overlapping: '
(0..<nonlap.count).each{ print nonlap[it] + ' ' }; print '\n'
print 'Overlapping: '
yeslap = (digits =~ /(?=(\d\d\d))/)
assert yeslap.count == 7
(0..<yeslap.count).each{ print yeslap[it][1] + ' ' }; print '\n'
// Non-overlapping: 123 456 789
// Overlapping: 123 234 345 456 567 678 789
string = 'And little lambs eat ivy'
// Greedy version
parts = string =~ /(.*)(l[^s]*s)(.*)/
(1..parts.groupCount()).each{ print "(${parts[0][it]}) " }; print '\n'
// (And little ) (lambs) ( eat ivy)
// Reluctant version
parts = string =~ /(.*?)(l[^s]*s)(.*)/
(1..parts.groupCount()).each{ print "(${parts[0][it]}) " }; print '\n'
// (And ) (little lambs) ( eat ivy)
//----------------------------------------------------------------------------------
[править] Copying and Substituting Simultaneously
//----------------------------------------------------------------------------------
// Groovy splits src and dest to avoid this problem
src = 'Go this way'
dst = src.replaceFirst('this', 'that')
assert dst == 'Go that way'
// extract basename
src = 'c:/some/path/file.ext'
dst = src.replaceFirst('^.*/', '')
assert dst == 'file.ext'
// Make All Words Title-Cased (not that you would do it this way)
// The preprocessing operations \X where X is one of l, u, L, and U are not supported
// in the sun regex library but other Java regex libraries may support this. Instead:
src = 'make all words title-cased'
dst = src
('a'..'z').each{ dst = dst.replaceAll(/([^a-zA-Z])/+it+/|\A/+it, /$1/+it.toUpperCase()) }
assert dst == 'Make All Words Title-Cased'
// rename list of dirs
bindirs = '/usr/bin /bin /usr/local/bin'.split(' ').toList()
expected = '/usr/lib /lib /usr/local/lib'.split(' ').toList()
libdirs = bindirs.collect { dir -> dir.replaceFirst('bin', 'lib') }
assert libdirs == expected
//----------------------------------------------------------------------------------
[править] Matching Letters
//----------------------------------------------------------------------------------
// Groovy uses Java regex (other Java regex packages would also be possible)
// It doesn't support Locale-based settings but you can roll your own to some
// extent, you can use any Unicode characters as per below and you can use
// \p{Punct} Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// or the other special character classes
// Sample words containing accented Latin letters; the last entry mixes in
// punctuation ('!' and '#') and is the only one expected to be rejected.
words = '''
silly
façade
coöperate
niño
Renée
Molière
hæmoglobin
naïve
tschüß
random!stuff#here\u0948
'''
results = ''
greekAlpha = '\u0391'
// \w only covers [a-zA-Z_0-9] in Java regexes, so every accented letter that
// occurs in the sample words has to be listed explicitly (including è).
special = 'çéüßöñàæïè' + greekAlpha
// flag as either Y (alphabetic) or N (not)
words.split('\n').findAll{it.trim()}.each{ results += it ==~ /^[\w/+special+/]+$/ ?'Y':'N' }
assert results == 'YYYYYYYYYN'
results = ''
// alternative: accept anything that contains no punctuation at all
words.split('\n').findAll{it.trim()}.each{ results += it ==~ /^[^\p{Punct}]+$/ ?'Y':'N' }
assert results == 'YYYYYYYYYN'
//----------------------------------------------------------------------------------
[править] Matching Words
//----------------------------------------------------------------------------------
// as many non-whitespace bytes as possible
finder = 'abczqz z' =~ /a\S+z/
assert finder[0] == 'abczqz'
// as many letters, apostrophes, and hyphens
finder = "aAzZ'z-z0z" =~ /a[A-Za-z'-]+z/ //'
assert finder[0] == "aAzZ'z-z"
// selecting words
finder = '23rd Psalm' =~ /\b([A-Za-z]+)\b/ // usually best
println finder[0][0]
// => Psalm (23rd is not matched)
finder = '23rd Psalm' =~ /\s([A-Za-z]+)\s/ // fails at ends or w/ punctuation
println finder.matches()
// => false (no whitespaces at ends)
//----------------------------------------------------------------------------------
[править] Commenting Regular Expressions
//----------------------------------------------------------------------------------
str = 'groovy.codehaus.org and www.aboutgroovy.com'
re = '''(?x) # to enable whitespace and comments
( # capture the hostname in $1
(?: # these parens for grouping only
(?! [-_] ) # lookahead for neither underscore nor dash
[\\w-] + # hostname component
\\. # and the domain dot
) + # now repeat that whole thing a bunch of times
[A-Za-z] # next must be a letter
[\\w-] + # now trailing domain part
) # end of $1 capture
'''
finder = str =~ re
out = str
(0..<finder.count).each{
adr = finder[it][0]
out = out.replaceAll(adr, "$adr [${InetAddress.getByName(adr).hostAddress}]")
}
println out
// => groovy.codehaus.org [63.246.7.187] and www.aboutgroovy.com [63.246.7.76]
// to match whitespace or #-characters in an extended re you need to escape them.
foo = 42
str = 'blah #foo# blah'
re = '''(?x) # to enable whitespace and comments
\\# # a pound sign
(\\w+) # the variable name
\\# # another pound sign
'''
finder = str =~ re
found = finder[0]
out = str.replaceAll(found[0], evaluate(found[1]).toString())
assert out == 'blah 42 blah'
//----------------------------------------------------------------------------------
[править] Finding the Nth Occurrence of a Match
//----------------------------------------------------------------------------------
fish = 'One fish two fish red fish blue fish'
expected = 'The third fish is a red one.'
thirdFish = /(?:\w+\s+fish\s+){2}(\w+)\s+fish.*/
assert expected == (fish.replaceAll(thirdFish, 'The third fish is a $1 one.'))
anyFish = /(\w+)\s+fish\b/
finder = fish =~ anyFish
// finder contains an array of matched groups
// 2 = third one (index start at 0), 1 = matched word in group
out = "The third fish is a ${finder[2][1]} one."
assert out == expected
evens = []
(0..<finder.count).findAll{it%2!=0}.each{ evens += finder[it][1] }
println "Even numbered fish are ${evens.join(' ')}."
// => Even numbered fish are two blue.
// one of several ways to do this
pond = fish + ' in the pond'
fishInPond = (/(\w+)(\s+fish\b\s*)/) * 4 + /(.*)/
found = (pond =~ fishInPond)[0]
println ((found[1..6] + 'sushi' + found[8..9]).join())
// => One fish two fish red fish sushi fish in the pond
// find last fish
expected = 'Last fish is blue'
pond = 'One fish two fish red fish blue fish swim here.'
finder = (pond =~ anyFish)
assert expected == "Last fish is ${finder[finder.count-1][1]}"
// => Last fish is blue
// greedy match version of above
finder = (pond =~ /.*\b/ + anyFish)
assert expected == "Last fish is ${finder[0][1]}"
// last fish match version of above
finder = (pond =~ /\b(\w+)\s+fish\b(?!.*\bfish\b)/)
assert expected == "Last fish is ${finder[0][1]}"
//----------------------------------------------------------------------------------
[править] Matching Multiple Lines
//----------------------------------------------------------------------------------
// Html Stripper
// get this using: fakedfile = new File('path_to_file.htm').text
fakedFile = '''
<html>
<head><title>Chapter 1 Title</title></head>
<body>
<h1>Chapter 1: Some Heading</h1>
A paragraph.
</body>
</html>
'''
stripExpectations = '''
Chapter 1 Title
Chapter 1: Some Heading
A paragraph.
'''.trim()
stripped = fakedFile.replaceAll(/(?m)<.*?>/,'').trim()
assert stripExpectations == stripped
pattern = '''(?x)
( # capture in $1
Chapter # text string
\\s+ # mandatory whitespace
\\d+ # decimal number
\\s* # optional whitespace
: # a real colon
. * # anything not a newline till end of line
)
'''
headerfyExpectations = '''
Chapter 1 Title
<H1>Chapter 1: Some Heading</H1>
A paragraph.
'''.trim()
headerfied = stripped.replaceAll(pattern, '<H1>$1</H1>')
assert headerfyExpectations == headerfied
// one liner equivalent which prints to stdout
//% groovy -p -e "line.replaceAll(/^(Chapter\s+\d+\s*:.*)/,'<H1>$1</H1>')"
// one liner equivalent which modifies file in place and creates *.bak original file
//% groovy -pi .bak -e "line.replaceAll(/^(Chapter\s+\d+\s*:.*)/,'<H1>$1</H1>')"
// use: realFileInput = new File(path_to_file).text
fakeFileInput = '''
0
START
1
2
END
3
4
5
START
6
END
'''
chunkyPattern = /(?ms)^START(.*?)^END/
finder = fakeFileInput =~ chunkyPattern
(0..<finder.count).each {
println "Chunk #$it contains ${new StringTokenizer(finder[it][1],'\n').countTokens()} lines."
}
// =>
// Chunk #0 contains 2 lines.
// Chunk #1 contains 1 lines.
//----------------------------------------------------------------------------------
[править] Reading Records with a Pattern Separator
//----------------------------------------------------------------------------------
// general pattern is:
//file = new File("datafile").text.split(/pattern/)
// .Ch, .Se and .Ss divide chunks of input text
fakedFiletext = '''
.Ch
abc
.Se
def
.Ss
ghi
.Se
jkl
.Se
mno
.Ss
pqr
.Ch
stu
.Ch
vwx
.Se
yz!
'''
chunks = fakedFiletext.split(/(?m)^\.(Ch|Se|Ss)$/)
println "I read ${chunks.size()} chunks."
// => I read 10 chunks.
//----------------------------------------------------------------------------------
[править] Extracting a Range of Lines
//----------------------------------------------------------------------------------
// Groovy doesn't support the ~/BEGIN/ .. ~/END/ notation
// you have to emulate it as shown in the example below
// The from line number to line number processing is supported
// from the command line but not within a script, e.g.
// command-line to print lines 15 through 17 inclusive (see below)
// > groovy -p -e "if (count in 15..17) return line" datafile
// Within a script itself, you emulate the count by keeping state
htmlContent = '''
<h1>A Heading</h1>
Here is <XMP>inline AAA</XMP>.
And the bigger Example 2:
<XMP>
line BBB
line CCC
</XMP>
Done.
'''.trim()
examplePattern = /(?ms)<XMP>(.*?)<\/XMP>/
finder = htmlContent =~ examplePattern
(0..<finder.count).each {
println "Example ${it+1}:"
println finder[it][1]
}
// =>
// Example 1:
// inline AAA
// Example 2:
//
// line BBB
// line CCC
//
htmlContent.split('\n').eachWithIndex{ line, count ->
if (count in 4..5) println line
}
// =>
// line BBB
// line CCC
// You would probably use a mail Api for this in Groovy
fakedMailInput = '''
From: A Person <someone@somewhere.com>
To: <pleac-discuss@lists.sourceforge.net>
Date: Sun, 31 Dec 2006 02:14:57 +1100
From: noone@nowhere.com
To: <pleac-discuss@lists.sourceforge.net>
Date: Sun, 31 Dec 2006 02:14:58 +1100
From: someone@somewhere.com
To: <pleac-discuss@lists.sourceforge.net>
Date: Sun, 31 Dec 2006 02:14:59 +1100
'''.trim()+'\n'
seen = [:]
fakedMailInput.split('\n').each{ line ->
m = (line =~ /^From:?\s(.*)/)
if (m) {
addr = m[0][1] =~ /([^<>(),;\s]+\@[^<>(),;\s]+)/
x = addr[0][1]
if (seen.containsKey(x)) seen[x] += 1 else seen[x] = 1
}
}
seen.each{ k,v -> println "Address $k seen $v time${v==1?'':'s'}." }
// =>
// Address noone@nowhere.com seen 1 time.
// Address someone@somewhere.com seen 2 times.
//----------------------------------------------------------------------------------
[править] Matching Shell Globs as Regular Expressions
//----------------------------------------------------------------------------------
import java.util.regex.Pattern
names = '''
myFile.txt
oldFile.tex
myPicture.jpg
'''
// Translates a shell glob into a regex string that matches whole lines.
//   globstr - the glob, e.g. '*.t?'
// Returns a String starting with '(?m)^' and ending with '$'. '*' becomes '.*',
// '?' becomes '.', '[' and ']' pass through unchanged (so character classes
// keep working) and every other character is quoted with Pattern.quote.
def glob2pat(globstr) {
    def patmap = [ '*':'.*', '?':'.', '[':'[', ']':']' ]
    def result = new StringBuilder('(?m)^')
    // (?s) lets '.' match line terminators too, so no character of the glob
    // is silently dropped; matching with a regex (rather than indexing chars)
    // keeps surrogate pairs together as one unit.
    def chars = (globstr =~ /(?s)./)
    while (chars.find()) {
        def c = chars.group()
        result.append(patmap.containsKey(c) ? patmap[c] : Pattern.quote(c))
    }
    result.append('$')
    result.toString()
}
// Asserts that the glob `pat`, once converted by glob2pat, matches exactly
// `count` lines of the script-level `names` text.
def checkNumMatches(pat, count) {
    def regex = glob2pat(pat)
    def lineMatcher = (names =~ regex)
    assert lineMatcher.count == count
}
checkNumMatches('*.*', 3)
checkNumMatches('my*.*', 2)
checkNumMatches('*.t*', 2)
checkNumMatches('*File.*', 2)
checkNumMatches('*Rabbit*.*', 0)
//----------------------------------------------------------------------------------
[править] Speeding Up Interpolated Matches
//----------------------------------------------------------------------------------
// version 1: simple obvious way
states = 'CO ON MI WI MN'.split(' ').toList()
// Prints every line of `file` that contains one of the script-level `states`
// as a whole word. The regex is rebuilt for every state on every line.
def popgrep1(file) {
    file.eachLine { line ->
        def mentionsState = states.any { state -> (line =~ /\b$state\b/).find() }
        if (mentionsState) {
            println line
        }
    }
}
// popgrep1(new File('path_to_file'))
// version 2: eval strings; fast in Perl but hard to quote (SLOW in Groovy)
// Builds a small script with one `if` test per state and evaluates that script
// for every line of `file`. The current line is handed to the generated code
// through the script-level variable `line`.
def popgrep2(file) {
    def tests = states.collect { "if (!found && line =~ /\\b$it\\b/) found = true\n" }
    def code = 'def found = false\n' + tests.join('') + "if (found) println line\n"
    file.eachLine { current ->
        line = current
        evaluate(code)
    }
}
// popgrep2(new File('path_to_file'))
// version 2b: eval using switch/case (not in Perl cookbook) (SLOW)
// Same idea as popgrep2, but the generated script is a switch statement with
// one regex `case` per state. The current line is handed to the generated
// code through the script-level variable `line`.
def popgrep2b(file) {
    def cases = states.collect { "case ~/.*\\b$it\\b.*/:\nprintln line;break\n" }
    def code = 'switch(line) {\n' + cases.join('') + "default:break\n}\n"
    file.eachLine { current ->
        line = current
        evaluate(code)
    }
}
// popgrep2b(new File('path_to_file'))
// version3: build a match_any function as a GString
// Joins one `line =~ /\bSTATE\b/` test per state with `||` into a single
// expression and evaluates it for every line of `file`, printing the line
// when the expression is true.
def popgrep3(file) {
    def alternatives = states.collect { "line =~ /\\b$it\\b/" }
    def code = alternatives.join('||')
    file.eachLine { current ->
        line = current
        if (evaluate(code)) {
            println line
        }
    }
}
// popgrep3(new File('path_to_file'))
// version4: pretty fast, but simple: compile all re's first:
patterns = states.collect{ ~/\b$it\b/ }
// Prints every line of `file` in which at least one of the precompiled
// script-level `patterns` finds a match.
def popgrep4(file) {
    file.eachLine { line ->
        def hit = patterns.any { compiled -> compiled.matcher(line).find() }
        if (hit) {
            println line
        }
    }
}
// popgrep4(new File('path_to_file'))
// version5: faster
str = states.collect{ /\b$it\b/ }.join('|')
// Prints every line of `file` matched by the single alternation regex held in
// the script-level string `str`.
def popgrep5(file) {
    file.eachLine { line ->
        def finder = (line =~ str)
        if (finder.find()) {
            println line
        }
    }
}
// popgrep5(new File('path_to_file'))
// version5b: faster (like 5 but compiled outside loop)
pattern = ~states.collect{ /\b$it\b/ }.join('|')
// Like popgrep5, but uses the script-level `pattern`, which was compiled once
// outside the per-line loop.
def popgrep5b(file) {
    file.eachLine { line ->
        def finder = pattern.matcher(line)
        if (finder.find()) {
            println line
        }
    }
}
// popgrep5b(new File('path_to_file'))
// speeds trials ON the current source file (~1200 lines)
// popgrep1 => 0.39s
// popgrep2 => 25.08s
// popgrep2b => 23.86s
// popgrep3 => 22.42s
// popgrep4 => 0.12s
// popgrep5 => 0.05s
// popgrep5b => 0.05s
// Groovy's built-in support is the way to go in terms of
// both speed and simplicity of understanding. Avoid using
// evaluate() unless you absolutely need it
// generic matching functions
input = '''
both cat and dog
neither
just a cat
just a dog
'''.split('\n').findAll{it.trim()}
// True when at least one regex in `patterns` is found somewhere in `line`.
def matchAny(line, patterns) {
    for (regex in patterns) {
        if ((line =~ regex).find()) return true
    }
    return false
}
// True when every regex in `patterns` is found somewhere in `line`
// (and therefore also true for an empty pattern list).
def matchAll(line, patterns) {
    for (regex in patterns) {
        if (!(line =~ regex).find()) return false
    }
    return true
}
assert input.findAll{ matchAny(it, ['cat','dog']) }.size() == 3
assert input.findAll{ matchAny(it, ['cat$','^n.*']) }.size() == 2
assert input.findAll{ matchAll(it, ['cat','dog']) }.size() == 1
assert input.findAll{ matchAll(it, ['cat$','^n.*']) }.size() == 0
//----------------------------------------------------------------------------------
[править] Testing for a Valid Pattern
//----------------------------------------------------------------------------------
// patternCheckingScript:
prompt = '\n> '
print 'Enter patterns to check:' + prompt
new BufferedReader(new InputStreamReader(System.in)).eachLine{ line ->
try {
Pattern.compile(line)
print 'Valid' + prompt
} catch (java.util.regex.PatternSyntaxException ex) {
print 'Invalid pattern: ' + ex.message + prompt
}
}
// =>
// Enter patterns to check:
// > ab*.c
// Valid
// > ^\s+[^a-z]*$
// Valid
// > **
// Invalid pattern: Dangling meta character '*' near index 0
// **
// ^
//----------------------------------------------------------------------------------
[править] Honoring Locale Settings in Regular Expressions
//----------------------------------------------------------------------------------
src = 'dierk könig'
// simplistic with locale issue
dst = src
('a'..'z').each{ dst = dst.replaceAll(/(?<=[^a-zA-Z])/+it+/|\A/+it, it.toUpperCase()) }
println dst
// => Dierk KöNig
// locale avoidance
dst = src
('a'..'z').each{ dst = dst.replaceAll(/(?<=\A|\b)/+it, it.toUpperCase()) }
println dst
// => Dierk König
//----------------------------------------------------------------------------------
[править] Approximate Matching
//----------------------------------------------------------------------------------
// Several libraries exist, e.g.
// http://secondstring.sourceforge.net/
// http://sourceforge.net/projects/simmetrics/
// both support numerous algorithms. Using the second as an example:
import uk.ac.shef.wit.simmetrics.similaritymetrics.*
target = 'balast'
candidates = '''
quick
brown
fox
jumped
over
the
lazy
dog
ballast
ballasts
balustrade
balustrades
blast
blasted
blaster
blasters
blasting
blasts
'''.split('\n').findAll{it.trim()}
metrics = [new Levenshtein(), new MongeElkan(), new JaroWinkler(), new Soundex()]
// Prints one table row: `name` right-aligned in 14 columns, a space, then each
// entry of `results` left-aligned in 16 columns, followed by a newline.
def out(name, results) {
    print name.padLeft(14) + ' '
    for (cell in results) {
        print(cell.padRight(16))
    }
    println()
}
// Prints a table row of numeric scores via out(), truncating each score to
// two decimal places and converting it to a string first.
def outr(name, results) {
    def truncated = results.collect { score -> '' + ((int)(score*100))/100 }
    out(name, truncated)
}
out ('Word/Metric', metrics.collect{it.shortDescriptionString} )
candidates.each{ w -> outr(w, metrics.collect{ m -> m.getSimilarity(target, w)} )}
// =>
// Word/Metric Levenshtein MongeElkan JaroWinkler Soundex
// quick 0 0.11 0 0.66
// brown 0.16 0.23 0.5 0.73
// fox 0 0.2 0 0.66
// jumped 0 0.2 0 0.66
// over 0 0.44 0 0.55
// the 0 0.33 0 0.55
// lazy 0.33 0.5 0.44 0.66
// dog 0 0.2 0 0.66
// ballast 0.85 0.83 0.96 1
// ballasts 0.75 0.83 0.94 0.94
// balustrade 0.5 0.93 0.3 0.94
// balustrades 0.45 0.93 0.3 0.94
// blast 0.83 0.8 0.88 1
// blasted 0.57 0.66 0.8 0.94
// blaster 0.57 0.66 0.8 0.94
// blasters 0.5 0.66 0.77 0.94
// blasting 0.5 0.66 0.77 0.94
// blasts 0.66 0.66 0.84 0.94
// to implement the example, iterate through /usr/dict/words selecting words
// where one or a combination of metrics are greater than some threshold
//----------------------------------------------------------------------------------
[править] Matching from Where the Last Pattern Left Off
//----------------------------------------------------------------------------------
// Zero-fill the leading blanks of a string. \G anchors each match to the end
// of the previous one, so only the unbroken run of spaces at the very start is
// replaced; the space before 'here' is left alone.
n = "   49 here"
println n.replaceAll(/\G /,'0')
// => 00049 here
str = "3,4,5,9,120"
print 'Found numbers:'
str.eachMatch(/\G,?(\d+)/){ print ' ' + it[1] }
println()
// => Found numbers: 3 4 5 9 120
// Groovy doesn't have String.pos or a /c regex modifier like Perl,
// but it does have similar functionality. Matcher has start() and
// end() for finding the position, and Matcher's usePattern() allows
// you to swap patterns without changing the buffer position
text = 'the year 1752 lost 10 days on the 3rd of September'
p = ~/(?<=\D)(\d+)/
m = p.matcher(text)
while (m.find()) {
println 'Found ' + m.group() + ' starting at pos ' + m.start() +
' and ending at pos ' + m.end()
}
// now reset pos back to between 1st and 2nd numbers
if (m.find(16)) { println 'Found ' + m.group() }
// =>
// Found 1752 starting at pos 9 and ending at pos 13
// Found 10 starting at pos 19 and ending at pos 21
// Found 3 starting at pos 34 and ending at pos 35
// Found 10
// Alternatively you can use Scanner in Java 5-7+:
p1 = ~/(?<=\D)(\d+)/
p2 = ~/\S+/
s = new Scanner(text)
while ((f = s.findInLine(p1))) { println 'Found: ' + f }
if ((f = s.findInLine(p2))) { println "Found $f after the last number." }
// =>
// Found: 1752
// Found: 10
// Found: 3
// Found rd after the last number.
//----------------------------------------------------------------------------------
[править] Greedy and Non-Greedy Matches
//----------------------------------------------------------------------------------
html = '<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>'
greedyHtmlStripPattern = ~/(?m)<.*>/ // not good
nonGreedyHtmlStripPattern = ~/(?m)<.*?>/ // not great
simpleNested = ~/(?mx)<b><i>(.*?)<\/i><\/b>/
// match BEGIN, then not BEGIN, then END
generalPattern = ~/BEGIN((?:(?!BEGIN).)*)END/
betterButInefficient1 = ~/(?mx)<b><i>( (?: (?!<\/b>|<\/i>). )* ) <\/i><\/b>/
betterButInefficient2 = ~/(?mx)<b><i>( (?: (?!<\/[ib]>). )* ) <\/i><\/b>/
efficientPattern = '''(?mx)
<b><i>
[^<]* # stuff not possibly bad, and not possibly the end.
(?:
# at this point, we can have '<' if not part of something bad
(?! </?[ib]> ) # what we can't have
< # okay, so match the '<'
[^<]* # and continue with more safe stuff
) *
</i></b>
''' //'
//----------------------------------------------------------------------------------
[править] Detecting Duplicate Words
//----------------------------------------------------------------------------------
input = 'This is a test\nTest of the duplicate word finder.\n'
dupWordPattern = '''(?ix)
\\b # start at word boundary
(\\S+) # find chunk of non-whitespace
\\b # until a word boundary
(
\\s+ # followed by whitespace
\\1 # and that same chunk again
\\b # and a word boundary
) + # one or more times
'''
finder = input =~ dupWordPattern
println 'Found duplicate word: ' + finder[0][1]
// => Found duplicate word: test
astr = 'nobody'
bstr = 'bodysnatcher'
m = "$astr $bstr" =~ /^(\w+)(\w+) \2(\w+)$/
actual = "${m[0][2]} overlaps in ${m[0][1]}-${m[0][2]}-${m[0][3]}"
assert actual == 'body overlaps in no-body-snatcher'
cap = 'o' * 180
while (m = (cap =~ /^(oo+?)\1+$/)) {
p1 = m[0][1]
print p1.size() + ' '
cap = cap.replaceAll(p1,'o')
}
println cap.size()
// => 2 2 3 3 5
// diophantine
// solve for 12x + 15y + 16z = 281, maximizing x
if ((m = ('o' * 281) =~ /^(o*)\1{11}(o*)\2{14}(o*)\3{15}$/)) {
x=m[0][1].size(); y=m[0][2].size(); z=m[0][3].size()
println "One solution is: x=$x; y=$y; z=$z"
} else println "No solution."
// => One solution is: x=17; y=3; z=2
// using different quantifiers:
// /^(o+)\1{11}(o+)\2{14}(o+)\3{15}$/
// => One solution is: x=17; y=3; z=2
// /^(o*?)\1{11}(o*)\2{14}(o*)\3{15}$/
// => One solution is: x=0; y=7; z=11
// /^(o+?)\1{11}(o*)\2{14}(o*)\3{15}$/
// => One solution is: x=1; y=3; z=14
//----------------------------------------------------------------------------------
[править] Expressing AND, OR, and NOT in a Single Pattern
//----------------------------------------------------------------------------------
// Groovy doesn't currently support x!~y so you must use the !(x=~y) style
// alpha OR beta
assert 'alpha' ==~ /alpha|beta/
assert 'beta' ==~ /alpha|beta/
assert 'betalpha' =~ /alpha/ || 'betalpha' =~ /beta/
// alpha AND beta
assert !('alpha' =~ /(?=.*alpha)(?=.*beta)/)
assert 'alphabeta' =~ /(?=.*alpha)(?=.*beta)/
assert 'betalpha' =~ /(?=.*alpha)(?=.*beta)/
assert 'betalpha' =~ /alpha/ && 'betalpha' =~ /beta/
// alpha AND beta, no overlap
assert 'alphabeta' =~ /alpha.*beta|beta.*alpha/
assert !('betalpha' =~ /alpha.*beta|beta.*alpha/)
// NOT beta
assert 'alpha gamma' =~ /^(?:(?!beta).)*$/
assert !('alpha beta gamma' =~ /^(?:(?!beta).)*$/)
// NOT bad BUT good
assert !('GOOD and BAD' =~ /(?=(?:(?!BAD).)*$)GOOD/)
assert !('BAD' =~ /(?=(?:(?!BAD).)*$)GOOD/)
assert !('WORSE' =~ /(?=(?:(?!BAD).)*$)GOOD/)
assert 'GOOD' =~ /(?=(?:(?!BAD).)*$)GOOD/
// minigrep could be done as a one-liner as follows
// groovy -p -e "if (line =~ /pat/) return line" datafile
string = 'labelled'
assert string =~ /^(?=.*bell)(?=.*lab)/
assert string =~ /bell/ && string =~ 'lab'
fakeAddress = "blah bell blah "
murrayHillRegex = '''(?x)
^ # start of string
(?= # zero-width lookahead
.* # any amount of intervening stuff
bell # the desired bell string
) # rewind, since we were only looking
(?= # and do the same thing
.* # any amount of intervening stuff
lab # and the lab part
)
'''
assert string =~ murrayHillRegex
assert !(fakeAddress =~ murrayHillRegex)
// eliminate overlapping
assert !(string =~ /(?:^.*bell.*lab)|(?:^.*lab.*bell)/)
brandRegex = '''(?x)
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
bell # look for a bell
.*? # followed by any amount of anything
lab # look for a lab
) # end grouper
| # otherwise, try the other direction
(?: # non-capturing grouper
^ .*? # any amount of stuff at the front
lab # look for a lab
.*? # followed by any amount of anything
bell # followed by a bell
) # end grouper
'''
assert !(string =~ brandRegex)
map = 'the great baldo'
assert map =~ /^(?:(?!waldo).)*$/
noWaldoRegex = '''(?x)
^ # start of string
(?: # non-capturing grouper
(?! # look ahead negation
waldo # is he ahead of us now?
) # is so, the negation failed
. # any character (cuzza /s)
) * # repeat that grouping 0 or more
$ # through the end of the string
'''
assert map =~ noWaldoRegex
// on unix systems use: realFakedInput = 'w'.process().text
fakedInput = '''
7:15am up 206 days, 13:30, 4 users, load average: 1.04, 1.07, 1.04
USER TTY FROM LOGIN@ IDLE JCPU PCPU WHAT
tchrist tty1 5:16pm 36days 24:43 0.03s xinit
tchrist tty2 5:19pm 6days 0.43s 0.43s -tcsh
tchrist ttyp0 chthon 7:58am 3days 23.44s 0.44s -tcsh
gnat ttyS4 coprolith 2:01pm 13:36m 0.30s 0.30s -tcsh
'''.trim() + '\n'
// Returns the lines of `input` that mention tchrist but contain no 'ttyp'
// anywhere on the line.
def miniGrepMethod(input) {
    def wanted = '^(?!.*ttyp).*tchrist'
    def rows = input.split('\n')
    rows.findAll { row -> (row =~ wanted).find() }
}
assert miniGrepMethod(fakedInput).size() == 2
findUserRegex = '''(?xm)
^ # anchored to the start
(?! # zero-width look-ahead assertion
.* # any amount of anything (faster than .*?)
ttyp # the string you don't want to find
) # end look-ahead negation; rewind to start
.* # any amount of anything (faster than .*?)
tchrist # now try to find Tom
'''
assert (fakedInput =~ findUserRegex).count == 2
//----------------------------------------------------------------------------------
[править] Matching Multiple-Byte Characters
//----------------------------------------------------------------------------------
// Groovy uses Unicode character encoding
// special care needs to be taken when using unicode because of the different
// byte lengths, e.g. à can be encoded as two bytes \u0061\u0300 and is also
// supported in legacy character sets by a single character \u00E0. To Match
// this character, you can't use any of /./, /../, /a/, /\u00E0/, /\u0061/\u0300
// or /\pL/. The correct way is to use /X (not currently supported) or one
// of /\pL/\pM*/ to ensure that it is a letter or /\PM\pM*/ when you just want
// to combine multicharacter sequences and don't care whether it is a letter
// Prints the size of `s`, then one line per candidate regex saying whether `s`
// matches it exactly (whole-string match via ==~).
def checkUnicode(s) {
    println s + ' is of size ' + s.size()
    // each entry: the label to print and the pattern to test
    def candidates = [
        ['/./', /./],
        ['/../', /../],
        ['/a/', /a/],
        ['/\\u00E0/', /\u00E0/],
        ['/\\u0061\\u0300/', /\u0061\u0300/],
        ['/\\pL/', /\pL/],
        ['/\\pL\\pM*/', /\pL\pM*/],
        ['/\\PM\\pM*/', /\PM\pM*/]
    ]
    candidates.each { label, regex ->
        println 'Exactly matches ' + label + ' ' + (s ==~ regex)
    }
}
checkUnicode('à')
checkUnicode('\u0061\u0300')
checkUnicode('\u00E0')
// =>
// à is of size 1
// Exactly matches /./ true
// Exactly matches /../ false
// Exactly matches /a/ false
// Exactly matches /\u00E0/ true
// Exactly matches /\u0061\u0300/ false
// Exactly matches /\pL/ true
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
// à (a + combining U+0300) is of size 2
// Exactly matches /./ false
// Exactly matches /../ true
// Exactly matches /a/ false
// Exactly matches /\u00E0/ false
// Exactly matches /\u0061\u0300/ true
// Exactly matches /\pL/ false
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
// à is of size 1
// Exactly matches /./ true
// Exactly matches /../ false
// Exactly matches /a/ false
// Exactly matches /\u00E0/ true
// Exactly matches /\u0061\u0300/ false
// Exactly matches /\pL/ true
// Exactly matches /\pL\pM*/ true
// Exactly matches /\PM\pM*/ true
//----------------------------------------------------------------------------------
[править] Matching a Valid Mail Address
//----------------------------------------------------------------------------------
// The Perl Cookbook categorizes this as a hard problem ... mostly for
// reasons not related to the actual regex - but with a 60-line regex
// perhaps there are some issues with that too. Further details:
// http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/ckaddr.gz
simpleCommentStripper = /\([^()]*\)/
println 'Book Publishing <marketing@books.com> (We will spam you)'.replaceAll(simpleCommentStripper, '')
// => Book Publishing <marketing@books.com>
// inspired by the fact that domain names can contain any foreign character these days
modern = /^.+@[^\.].*\.[a-z]{2,}>?$/
// .Net
lenient = /\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/
// a little more checking
strict = /^[_a-zA-Z0-9- <]+(\.[_a-zA-Z0-9- <]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\./ +
/(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))>?$/
addresses = ['someuser@somehost.com',
'Book Publishing <marketing@books.com>']
addresses.each{
assert it =~ lenient
assert it =~ strict
assert it =~ modern
}
//----------------------------------------------------------------------------------
[править] Matching Abbreviations
//----------------------------------------------------------------------------------
// Resolves a possibly abbreviated command and prints the action it selects.
//   ans - text typed by the user; it is treated as a literal, case-insensitive
//         prefix of the command names.
// Prints "Action is <name>" for the first command (in the order send, stop,
// abort, edit) that starts with `ans`, or 'No Match' otherwise.
def findAction(ans) {
    // An empty answer is a prefix of every command and would otherwise
    // select "send"; null would blow up inside Pattern.quote.
    if (!ans) {
        println 'No Match'
        return
    }
    def re = '(?i)^' + Pattern.quote(ans)
    // order matters: ambiguous prefixes resolve to the earliest entry
    def action = ['send', 'stop', 'abort', 'edit'].find { name -> (name =~ re).find() }
    if (action) println "Action is $action"
    else println 'No Match'
}
findAction('edit something')
// => No Match
findAction('edit')
// => Action is edit
findAction('se')
// => Action is send
findAction('e')
// => Action is edit
// Builds a table of unambiguous abbreviations.
//   words - list of command words
// Returns a TreeMap from each prefix to the word it abbreviates. A prefix is
// included when no other word starts with it; a complete word always maps to
// itself, even when it is also a prefix of a longer word (e.g. 'car' next to
// 'cart'), so every command stays reachable by its full name.
def buildAbbrev(words) {
    def table = new TreeMap()
    words.each { w ->
        // the competing words do not depend on the prefix length, so compute once
        def others = words - w
        (0..<w.size()).each { n ->
            def prefix = w[0..n]
            def isWholeWord = (n == w.size() - 1)
            if (isWholeWord || !others.any { it.startsWith(prefix) }) {
                table[prefix] = w
            }
        }
    }
    table
}
println buildAbbrev('send stop abort edit'.split(' ').toList())
// => ["a":"abort", "ab":"abort", "abo":"abort", "abor":"abort", "abort":"abort",
// "e":"edit", "ed":"edit", "edi":"edit", "edit":"edit", "se":"send", "sen":"send",
// "send":"send", "st":"stop", "sto":"stop", "stop":"stop"]
// miniShellScript:
// dummy methods
// Stand-in for the mini shell's edit action: only prints a fixed message.
def invokeEditor() { println "invoking editor" }
// Stand-in for the mini shell's send action: prints a message with the current date and time.
def deliverMessage() { println "delivering message at " + new Date() }
actions = [
edit: this.&invokeEditor,
send: this.&deliverMessage,
list: { println Runtime.runtime.freeMemory() },
abort: { System.exit(0) },
unknown: { println "Unknown Command"}
]
table = buildAbbrev(actions.keySet().toList())
prompt = '\n> '
print 'Enter Commands: edit send list abort' + prompt
new BufferedReader(new InputStreamReader(System.in)).eachLine{ line ->
def idx = (table.containsKey(line)) ? table[line] : 'unknown'
actions[idx]()
print prompt
}
//----------------------------------------------------------------------------------
[править] Program: urlify
//----------------------------------------------------------------------------------
//% gunzip -c ~/mail/archive.gz | urlify > archive.urlified
//% urlify ~/mail/*.inbox > ~/allmail.urlified
// Building blocks for the URL-matching regex. Each piece is regex source text
// that is interpolated into the pattern below.
// accepted URL schemes (this is a capturing group of its own)
urls = '(https?|telnet|gopher|file|wais|ftp|mail)'
// word characters
ltrs = /\w/
// other characters allowed inside a URL
gunk = /\#\/~:.?+=&%@!\-/
// punctuation that may appear in a URL but may also just follow it
punc = /.:?\-/
// a literal '$' (end-of-input anchor), kept in a variable so it can be
// interpolated into the GString below
doll = /$/
// contents of the character class of every valid URL character
all = /$ltrs$gunk$punc/
// (?i) case-insensitive, (?x) free-spacing mode: whitespace in the pattern is
// ignored and '#' starts a comment. Group 1 captures the whole URL.
findUrls = """(?ix)
\\b # start at word boundary
( # begin group 1 {
$urls : # need resource and a colon
[$all] +? # followed by on or more of any valid
# character, but be conservative and
# take only what you need to...
) # end group 1 }
(?= # look-ahead non-consumptive assertion
[$punc]* # either 0 or more punctuation
[^$all] # followed by a non-url character
| # or else
$doll # then end of the string
)
"""
input = '''
If you find a typo on http://groovy.codehaus.org please
send an email to mail:spelling.pedant@codehaus.org
'''
// wrap every URL found in an HTML anchor; $1 refers to group 1 (the URL)
println input.replaceAll(findUrls,'<a href="$1">$1</a>')
// =>
// If you find a typo on <a href="http://groovy.codehaus.org">http://groovy.codehaus.org</a> please
// send an email to <a href="mail:spelling.pedant@codehaus.org">mail:spelling.pedant@codehaus.org</a>
// urlifyScript:
#!/usr/bin/groovy
// urlify - wrap HTML links around URL-like constructs
// definitions from above
// Treat each command-line argument as a file name and print every line of
// that file with URL-like text wrapped in an HTML anchor.
for (path in args) {
    new File(path).eachLine { text ->
        println text.replaceAll(findUrls,'<a href="$1">$1</a>')
    }
}
//----------------------------------------------------------------------------------
[править] Program: tcgrep
//----------------------------------------------------------------------------------
// @@INCOMPLETE@@
// @@INCOMPLETE@@
//----------------------------------------------------------------------------------
// not an exact equivalent to original cookbook but has
// a reasonable subset of mostly similar functionality
// instead of -r recursion option, use Ant fileset wildcards
// e.g. **/*.c. You can also specify an excludes pattern
// e.g. **/*.* -X **/*.h will process all but header files
// (currently not optimised and with minimal error checking)
// uses jopt-simple (jopt-simple.sf.net)
op = new joptsimple.OptionParser()
NOCASE = 'i'; op.accepts( NOCASE, "case insensitive" )
WITHN = 'n'; op.accepts( WITHN, "display line/para with line/para number" )
WITHF = 'H'; op.accepts( WITHF, "display line/para with filename" )
NONAME = 'h'; op.accepts( NONAME, "hide filenames" )
COUNT = 'c'; op.accepts( COUNT, "give count of lines/paras matching" )
TCOUNT = 'C'; op.accepts( TCOUNT, "give count of total matches (multiple per line/para)" )
WORD = 'w'; op.accepts( WORD, "word boundaries only" )
EXACT = 'x'; op.accepts( EXACT, "exact matches only" )
INVERT = 'v'; op.accepts( INVERT, "invert search sense (lines that DON'T match)" )
EXCLUDE = 'X'; op.accepts( EXCLUDE, "exclude files matching pattern [default is '**/*.bak']" ).
    withRequiredArg().describedAs('path_pattern')
MATCH = 'l'; op.accepts( MATCH, "list names of files with matches" )
NOMATCH = 'L'; op.accepts( NOMATCH, "list names of files with no match" )
PARA = 'p'; op.accepts( PARA, "para mode (.* matches newlines)" ).
    withOptionalArg().describedAs('para_pattern')
EXPR = 'e'; op.accepts( EXPR, "expression (when pattern begins with '-')" ).
    withRequiredArg().describedAs('pattern')
FILE = 'f'; op.accepts( FILE, "file containing pattern" ).
    withRequiredArg().describedAs('filename')
HELP = 'help'; op.accepts( HELP, "display this message" )
options = op.parse(args)
params = options.nonOptionArguments()
// when the pattern is supplied via -e or -f, no positional argument is
// required (input is then read from stdin)
def patternFromOption = options.wasDetected( EXPR ) || options.wasDetected( FILE )
if (options.wasDetected( HELP )) {
    op.printHelpOn( System.out )
} else if (params.size() == 0 && !patternFromOption) {
    println "Usage: grep [OPTION]... PATTERN [FILE]...\nTry 'grep --$HELP' for more information."
} else {
    // script-level settings read by grepStream()
    modifiers = []
    paraPattern = ''
    o_withn = options.wasDetected( WITHN )
    o_withf = options.wasDetected( WITHF )
    o_noname = options.wasDetected( NONAME )
    o_count = options.wasDetected( COUNT )
    o_tcount = options.wasDetected( TCOUNT )
    o_invert = options.wasDetected( INVERT )
    o_match = options.wasDetected( MATCH )
    o_nomatch = options.wasDetected( NOMATCH )
    // pattern source, in order of precedence: -e, -f, first positional argument
    if (options.wasDetected( EXPR )) {
        pattern = options.valueOf( EXPR )
    } else if (options.wasDetected( FILE )) {
        pattern = new File(options.valueOf( FILE )).text.trim()
    } else {
        pattern = params[0]
        // slice only when file arguments follow the pattern: a 1..-1 range on
        // a one-element list does not produce an empty list
        params = params.size() > 1 ? params[1..-1] : []
    }
    // the fileset 'excludes' attribute takes a pattern string, so the default
    // is a plain string just like the value supplied through -X
    if (options.wasDetected( EXCLUDE )) excludes = options.valueOf( EXCLUDE )
    else excludes = '**/*.bak'
    if (options.wasDetected( EXACT )) pattern = '^' + pattern + '$'
    else if (options.wasDetected( WORD )) pattern = /\b$pattern\b/
    if (options.wasDetected( NOCASE )) modifiers += 'i'
    if (options.wasDetected( PARA )) {
        // paragraph separator: the -p argument if given, otherwise an empty line
        if (options.hasArgument( PARA )) paraPattern = options.valueOf( PARA )
        else paraPattern = '^$'
        paraPattern = '(?sm)' + paraPattern
        modifiers += 'sm'
    }
    // prepend the collected flags as an inline (?...) group
    if (modifiers) pattern = "(?${modifiers.join()})" + pattern
    if (params.size() == 0) grepStream(System.in, '<stdin>')
    else {
        // remaining arguments are Ant include patterns resolved against '.'
        scanner = new AntBuilder().fileScanner {
            fileset(dir:'.', includes:params.join(','), excludes:excludes)
        }
        for (f in scanner) {
            grepStream(new FileInputStream(f), f)
        }
    }
}
// Search one input for the script-level 'pattern' and print the results.
//   s    - input stream to read (consumed in full)
//   name - label printed for this input (file or '<stdin>')
// Reads these script-level settings: paraPattern, pattern, o_match,
// o_nomatch, o_count, o_tcount, o_invert, o_withf, o_withn, o_noname.
// In line mode each selected line/paragraph is printed, optionally prefixed
// with the name and its 1-based number. In file mode (-l, -L, -c, -C) a single
// summary line is printed per input instead.
def grepStream(s, name) {
    def count = 0    // number of selected lines/paragraphs
    def tcount = 0   // total number of matches within the selected pieces
    // split into paragraphs when a paragraph pattern is set, otherwise into lines
    def pieces = paraPattern ? s.text.split(paraPattern) : s.readLines()
    def fileMode = o_match || o_nomatch || o_count || o_tcount
    pieces.eachWithIndex { line, index ->
        // count the matches once and reuse the figure below
        def hits = (line =~ pattern).count
        boolean found = hits
        // selected when it matches, or when it does not match and -v is given
        if (found != o_invert) {
            count++
            tcount += hits
            if (!fileMode) {
                // declared locally so nothing leaks into the script binding
                def linefields = []
                if (o_withf) linefields += name
                if (o_withn) linefields += index + 1
                linefields += line
                println linefields.join(':')
            }
        }
    }
    // -l suppresses inputs without a selection, -L suppresses inputs with one
    def display = !((o_match && count == 0) || (o_nomatch && count != 0))
    if (fileMode && display) {
        def filefields = []
        if (!o_noname) filefields += name
        if (o_tcount) filefields += tcount
        else if (o_count) filefields += count
        println filefields.join(':')
    }
}
//----------------------------------------------------------------------------------
[править] Regular Expression Grabbag
//----------------------------------------------------------------------------------
// roman numerals: thousands, then hundreds, tens and units, case-insensitive
romans = /(?i)^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$/
assert 'cmxvi' =~ romans
// can't have tens before 1000s (M) or 100s (C) after 5s (V)
assert !('xmvci' =~ romans)
// swap first two words
assert 'the words'.replaceAll(/(\S+)(\s+)(\S+)/, '$3$2$1') == 'words the'
// extract keyword and value
m = 'k=v' =~ /(\w+)\s*=\s*(.*)\s*$/
assert m.matches()
assert m[0][1] == 'k'
assert m[0][2] == 'v'
// closure that builds a pattern matching at least n characters
hasAtLeastSize = { n -> /.{$n,}/ }
assert 'abcdefghijklmnopqrstuvwxyz' =~ hasAtLeastSize(20)
// MM/DD/YY HH:MM:SS (lenient - doesn't check HH > 23 etc)
d = /\d+/
datetime = "($d)/($d)/($d) ($d):($d):($d)"
assert '04/05/2006 10:26:59' =~ datetime
// change a path prefix (the comparison is now actually asserted; previously
// its result was silently discarded)
orig = '/usr/bin/vi'
expected = '/usr/local/bin/vi'
assert orig.replaceAll('/usr/bin','/usr/local/bin') == expected
// decode %XX escapes: group 1 holds the two hex digits
escapeSequenceRegex = /%([0-9A-Fa-f][0-9A-Fa-f])/
// ch[0] is the whole match, ch[1] the hex digits
convertEscapeToChar = { Object[] ch -> new Character((char)Integer.parseInt(ch[1],16)) }
assert 'abc%3cdef'.replaceAll(escapeSequenceRegex, convertEscapeToChar) == 'abc<def'
// Strip C-style comments. (?x) free-spacing, (?s) dot matches newlines,
// (?m) multiline. The body is matched reluctantly (.*?) so that each comment
// ends at its own closing delimiter instead of at the last one in the text.
commentStripper = '''(?xms)
/\\* # Match the opening delimiter
.*? # Match a minimal number of characters */
\\*/ # Match the closing delimiter
'''
input = '''
a line
/*
some comment
*/
another line
'''
// only the comment itself is removed, so the newlines before and after it
// remain and leave an empty line between the two text lines
expected = '\na line\n\nanother line\n'
assert input.replaceAll(commentStripper,'') == expected
// emulate s.trim(): drop leading, then trailing whitespace
assert ' x y '.replaceAll(/^\s+/, '').replaceAll(/\s+$/, '') == 'x y'
// convert the two characters backslash + n into a real newline
// (the slashy string /a\nb/ keeps the backslash literally)
assert (/a\nb/.replaceAll(/\\n/,"\n") == 'a\nb')
// remove package symbol (Groovy/Java doesn't use this as package symbol)
assert 'A::B'.replaceAll(/^.*::/, '') == 'B'
// match IP Address (requires leading 0's: every octet needs at least two digits)
// the pattern is split over two slashy strings joined with '+'
ipregex = /^([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])\./ +
/([01]?\d\d|2[0-4]\d|25[0-5])\.([01]?\d\d|2[0-4]\d|25[0-5])$/
assert !('123.456.789' =~ ipregex)
assert '192.168.000.001' =~ ipregex
// extract basename: greedily remove everything up to the last '/'
assert 'c:/usr/temp.txt'.replaceAll(/^.*\/{1}/, '') == 'temp.txt'
// pull the number out of the 'co' entry of a termcap-style string
termcap = ':co#80:li#24:'
m = (termcap =~ /:co\#(\d+):/)
assert m.count == 1
assert m[0][1] == '80'
// strip the directory part from a path argument that follows a space
assert 'cmd c:/tmp/junk.txt'.replaceAll(/ \S+\/{1}/, ' ') == 'cmd junk.txt'
// operating system detection from the os.name system property
os = System.getProperty('os.name')
println 'Is Linux? ' + (os ==~ /(?i)linux.*/)
println 'Is Windows? ' + (os ==~ /(?i)windows.*/)
println 'Is Mac? ' + (os ==~ /(?i)mac.*/)
// join multiline string: each newline, together with any indentation that
// follows it, becomes a single space. \s* (rather than \s+) also joins
// continuation lines that are not indented.
multi = '''
This is
a test
'''.trim()
assert multi.replaceAll(/(?m)\n\s*/, ' ') == 'This is a test'
// nums in string: integers, decimals, or a bare fraction like .5
string = 'The 5th test was won today by 10 wickets after 10.5 overs'
nums = string =~ /(\d+\.?\d*|\.\d+)/
// nums[i][1] is group 1 of the i-th match
assert (0..<nums.count).collect{ nums[it][1] }.join(' ') == '5 10 10.5'
// pick out words by their capitalization
words = 'the Capital words ARE hiding'
// words made up of upper-case letters only
capwords = words =~ /(\b\p{Upper}+\b)/
assert (0..<capwords.count).collect{ capwords[it][1] }.join(' ') == 'ARE'
// words made up of lower-case letters only
lowords = words =~ /(\b\p{Lower}+\b)/
assert (0..<lowords.count).collect{ lowords[it][1] }.join(' ') == 'the words hiding'
// words with an upper-case initial followed by lower-case letters
// (note: capWords is a different variable from capwords above)
capWords = words =~ /(\b\p{Upper}\p{Lower}*\b)/
assert (0..<capWords.count).collect{ capWords[it][1] }.join(' ') == 'Capital'
// extract the href targets from anchor tags
input = '''
If you find a typo on <a href="http://groovy.codehaus.org">http://groovy.codehaus.org</a> please
send an email to <a href="mail:spelling.pedant@codehaus.org">mail:spelling.pedant@codehaus.org</a>
'''
// group 1 is the link target, with or without surrounding quotes
linkRegex = /(?im)<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)[ '"]?>/ //'
links = input =~ linkRegex
(0..<links.count).each{ println links[it][1] }
// =>
// http://groovy.codehaus.org
// mail:spelling.pedant@codehaus.org
// find middle initial if any: first character of the second of three words
m = 'Lee Harvey Oswald' =~ /^\S+\s+(\S)\S*\s+\S/
initial = m.count ? m[0][1] : ""
assert initial == 'H'
// inch marks to quotes
println 'I said "Hello" to you.'.replaceAll(/"([^"]*)"/, /``$1''/) //"
// => I said ``Hello'' to you.
// extract sentences (2 spaces or newline after punctuation)
// NOTE(review): the patterns below contain single spaces where the comment
// above speaks of 2 spaces - whitespace may have been collapsed when this
// listing was transcribed; confirm against the original
input = '''
Is this a sentence?
Yes! And so
is this. And the fourth.
'''
sentences = []
// normalise line breaks into spaces before splitting into sentences
strip = input.replaceAll(/(\p{Punct})\n/, '$1 ').replaceAll(/\n/, ' ').replaceAll(/ {3,}/,' ')
// a sentence runs reluctantly up to punctuation followed by a space or the end of input
m = strip =~ /(\S.*?\p{Punct})(?= |\Z)/
(0..<m.count).each{ sentences += m[it][1] }
assert sentences == ["Is this a sentence?", "Yes!", "And so is this.", "And the fourth."]
// YYYY-MM-DD (month and day may have one or two digits)
m = '2007-2-28' =~ /(\d{4})-(\d\d?)-(\d\d?)/
assert m.matches()
assert ['2007', '2', '28'] == [m[0][1], m[0][2], m[0][3]]
// US phone numbers: optional leading 0/1, area code with or without
// parentheses, separators may be '-', space or '.'
usPhoneRegex = /^[01]?[- .]?(\([2-9]\d{2}\)|[2-9]\d{2})[- .]?\d{3}[- .]?\d{4}$/
numbers = '''
(425) 555-0123
425-555-0123
425 555 0123
1-425-555-0123
'''.trim().split('\n').toList()
assert numbers.every{ it ==~ usPhoneRegex }
// "oh my god/gods/goddess(es)/goodness/gosh" and spelling variants
exclaimRegex = /(?i)\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/
assert 'Oh my Goodness!' =~ exclaimRegex
assert !('Golly gosh' =~ exclaimRegex)
// lines ending in any of LF, CR, LF CR or CR LF (\012 = LF, \015 = CR);
// the last line has no terminator and is therefore not counted
input = 'line 1\rline 2\nline\r\nline 3\n\rline 4'
m = input =~ /(?m)^([^\012\015]*)(\012\015?|\015\012?)/
assert m.count == 4
|