|
Groovy/FAQ/Работа с Web
Материал из Wiki.crossplatform.ru
[править] Introduction
//----------------------------------------------------------------------------------
// Many packages are available for simulating a browser. A good starting point:
// http://groovy.codehaus.org/Testing+Web+Applications
//----------------------------------------------------------------------------------
[править] Fetching a URL from a Perl Script
//----------------------------------------------------------------------------------
// for non-binary content
urlStr = 'http://groovy.codehaus.org'
content = new URL(urlStr).text
println content.size() // => 34824
// for binary content
urlStr = 'http://groovy.codehaus.org/download/attachments/1871/gina_3d.gif'
bytes = new ByteArrayOutputStream()
bytes << new URL(urlStr).openStream()
println bytes.size() // => 6066
// various forms of potential error checking
try {
new URL('x:y:z')
} catch (MalformedURLException ex) {
println ex.message // => unknown protocol: x
}
try {
new URL('cnn.com/not.there')
} catch (MalformedURLException ex) {
println ex.message // => no protocol: cnn.com/not.there
}
try {
content = new URL('http://cnn.com/not.there').text
} catch (FileNotFoundException ex) {
println "Couldn't find: " + ex.message
// => Couldn't find: http://www.cnn.com/not.there
}
// titleBytes example
def titleBytes(urlStr) {
def lineCount = 0; def byteCount = 0
new URL(urlStr).eachLine{ line ->
lineCount++; byteCount += line.size()
}
println "$urlStr => ($lineCount lines, $byteCount bytes)"
}
titleBytes('http://www.tpj.com/')
// http://www.tpj.com/ => (677 lines, 25503 bytes)
//----------------------------------------------------------------------------------
[править] Automating Form Submission
//----------------------------------------------------------------------------------
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient
def webClient = new WebClient()
def page = webClient.getPage('http://search.cpan.org/')
// check page title
assert page.titleText.startsWith('The CPAN Search Site')
// fill in form and submit it
def form = page.getFormByName('f')
def field = form.getInputByName('query')
field.setValueAttribute('DB_File')
def button = form.getInputByValue('CPAN Search')
def result = button.click()
// check search result has at least one link ending in DB_File.pm
assert result.anchors.any{ a -> a.hrefAttribute.endsWith('DB_File.pm') }
// fields must be properly escaped
println URLEncoder.encode(/"this isn't <EASY>&<FUN>"/, 'utf-8')
// => %22this+isn%27t+%3CEASY%3E%26%3CFUN%3E%22
// proxies can be taken from environment, or specified
//System.properties.putAll( ["http.proxyHost":"proxy-host", "http.proxyPort":"proxy-port",
// "http.proxyUserName":"user-name", "http.proxyPassword":"proxy-passwd"] )
//----------------------------------------------------------------------------------
[править] Extracting URLs
//----------------------------------------------------------------------------------
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient
client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
println page.anchors.collect{ it.hrefAttribute }.sort().unique().join('\n')
// =>
// disclaimer.html
// http://bookmarks.cpan.org/
// http://faq.perl.org/
// mailto:cpan@perl.org
// ...
//----------------------------------------------------------------------------------
[править] Converting ASCII to HTML
//----------------------------------------------------------------------------------
// split paragraphs
LS = System.properties.'line.separator'
new File(args[0]).text.split("$LS$LS").each{ para ->
if (para.startsWith(" ")) println "<pre>\n$para\n</pre>"
else {
para = para.replaceAll(/(?m)^(>.*?)$/, /$1<br \/>/) // quoted text
para = para.replaceAll(/<URL:(.*)>/, /<a href="$1">$1<\/a>/) // embedded URL
para = para.replaceAll(/(http:\S+)/, /<a href="$1">$1<\/a>/) // guessed URL
para = para.replaceAll('\\*(\\S+)\\*', /<strong>$1<\/strong>/) // this is *bold* here
para = para.replaceAll(/\b_(\S+)_\b/, /<em>$1<\/em>/) // this is _italic_ here
println "<p>\n$para\n</p>" // add paragraph tags
}
}
def encodeEmail(email) {
println "<table>"
email = URLEncoder.encode(email)
email = text.replaceAll(/(\n[ \t]+)/, / . /) // continuation lines
email = text.replaceAll(/(?m)^(\S+?:)\s*(.*?)$/,
/<tr><th align="left">$1<\/th><td>$2<\/td><\/tr>/);
println email
println "</table>"
}
//----------------------------------------------------------------------------------
[править] Converting HTML to ASCII
//----------------------------------------------------------------------------------
// using CyberNeko Parser (people.apache.org/~andyc/neko/doc)
parser = new org.cyberneko.html.parsers.SAXParser()
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
page.depthFirst().each{ println it.text() }
//----------------------------------------------------------------------------------
[править] Extracting or Removing HTML Tags
//----------------------------------------------------------------------------------
// removing tags, see 20.5
// extracting tags: htitle using cyberneko and XmlSlurper
parser = new org.cyberneko.html.parsers.SAXParser()
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
println page.HEAD.TITLE[0].text()
// extracting tags: htitle using HtmlUnit
client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
println html.titleText
//----------------------------------------------------------------------------------
[править] Finding Stale Links
//----------------------------------------------------------------------------------
import com.gargoylesoftware.htmlunit.WebClient
client = new WebClient()
page = client.getPage('http://www.perl.com/CPAN/')
page.anchors.each{
checkUrl(page, it.hrefAttribute)
}
def checkUrl(page, url) {
try {
print "$url "
qurl = page.getFullyQualifiedUrl(url)
client.getPage(qurl)
println 'OK'
} catch (Exception ex) {
println 'BAD'
}
}
// =>
// modules/index.html OK
// RECENT.html OK
// http://search.cpan.org/recent OK
// http://mirrors.cpan.org/ OK
// http://perldoc.perl.org/ OK
// mailto:cpan@perl.org BAD
// http://www.csc.fi/suomi/funet/verkko.html.en/ BAD
// ...
//----------------------------------------------------------------------------------
[править] Finding Fresh Links
//----------------------------------------------------------------------------------
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.HeadMethod
import java.text.DateFormat
urls = [
"http://www.apache.org/",
"http://www.perl.org/",
"http://www.python.org/",
"http://www.ora.com/",
"http://jakarta.apache.org/",
"http://www.w3.org/"
]
df = DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.MEDIUM)
client = new HttpClient()
urlInfo = [:]
urls.each{ url ->
head = new HeadMethod(url)
client.executeMethod(head)
lastModified = head.getResponseHeader("last-modified")?.value
urlInfo[df.parse(lastModified)]=url
}
urlInfo.keySet().sort().each{ key ->
println "$key ${urlInfo[key]}"
}
// =>
// Sun Jan 07 21:48:15 EST 2007 http://www.apache.org/
// Sat Jan 13 12:44:32 EST 2007 http://jakarta.apache.org/
// Fri Jan 19 14:50:13 EST 2007 http://www.w3.org/
// Fri Jan 19 19:28:35 EST 2007 http://www.python.org/
// Sat Jan 20 09:36:08 EST 2007 http://www.ora.com/
// Sat Jan 20 13:25:53 EST 2007 http://www.perl.org/
//----------------------------------------------------------------------------------
[править] Creating HTML Templates
//----------------------------------------------------------------------------------
// GString version (variables must be predefined):
username = 'Tom'
count = 99
total = 999
htmlStr = """
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
"""
println htmlStr
// SimpleTemplateEngine version:
def html = '''
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
'''
def engine = new groovy.text.SimpleTemplateEngine()
def reader = new StringReader(html)
def template = engine.createTemplate(reader)
println template.make(username:"Peter", count:"23", total: "1234")
// SQL version
import groovy.sql.Sql
user = 'Peter'
def sql = Sql.newInstance('jdbc:mysql://localhost:3306/mydb', 'dbuser',
'dbpass', 'com.mysql.jdbc.Driver')
sql.query("SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user'") { answer ->
println (template.make(username:user, count:answer[0], total:answer[1]))
}
//----------------------------------------------------------------------------------
[править] Mirroring Web Pages
//----------------------------------------------------------------------------------
// using built-in connection features
urlStr = 'http://jakarta.apache.org/'
url = new URL(urlStr)
connection = url.openConnection()
connection.ifModifiedSince = new Date(2007,1,18).time
connection.connect()
println connection.responseCode
// manually setting header field
connection = url.openConnection()
df = new java.text.SimpleDateFormat ("EEE, dd MMM yyyy HH:mm:ss 'GMT'")
df.setTimeZone(TimeZone.getTimeZone('GMT'))
connection.setRequestProperty("If-Modified-Since",df.format(new Date(2007,1,18)));
connection.connect()
println connection.responseCode
//----------------------------------------------------------------------------------
[править] Creating a Robot
//----------------------------------------------------------------------------------
// The website http://www.robotstxt.org/wc/active/html/ lists many available robots
// including Java ones which can be used from Groovy. In particular, j-spider
// allows you to:
// + Check your site for errors (internal server errors, ...)
// + Outgoing and/or internal link checking
// + Analyze your site structure (creating a sitemap, ...)
// + Download complete web sites
// most of its functionality is available by tweaking appropriate configuration
// files and then running it as a standalone application but you can also write
// your own java classes.
//----------------------------------------------------------------------------------
[править] Parsing a Web Server Log File
//----------------------------------------------------------------------------------
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''
// similar to perl version:
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/
LOGFILE.trim().split('\n').each{ line ->
m = line =~ regex
if (m.matches()) {
for (idx in 0..<fields.size()) { println "${fields[idx]}=${m[0][idx+1]}" }
println()
}
}
//----------------------------------------------------------------------------------
[править] Processing Server Logs
//----------------------------------------------------------------------------------
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
204.31.113.138 - - [03/Jul/1996:06:56:12 -0800] "POST /forms/login.jsp HTTP/1.0" 200 5593
fcrawler.looksmart.com - - [26/Apr/2000:00:00:12 -0400] "GET /contacts.html HTTP/1.0" 200 4595 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
fcrawler.looksmart.com - - [26/Apr/2000:00:17:19 -0400] "GET /news/news.html HTTP/1.0" 200 16716 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
ppp931.on.bellglobal.com - - [26/Apr/2000:00:16:12 -0400] "GET /download/windows/asctab31.zip HTTP/1.0" 200 1540096 "http://www.htmlgoodies.com/downloads/freeware/webdevelopment/15.html" "Mozilla/4.7 [en]C-SYMPA (Win95; U)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/wpaper.gif HTTP/1.0" 200 6248 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:47 -0400] "GET /asctortf/ HTTP/1.0" 200 8130 "http://search.netscape.com/Computers/Data_Formats/Document/Text/RTF" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/5star2000.gif HTTP/1.0" 200 4005 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:50 -0400] "GET /pics/5star.gif HTTP/1.0" 200 1031 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /pics/a2hlogo.jpg HTTP/1.0" 200 4282 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /cgi-bin/newcount?jafsof3&width=4&font=digital&noshow HTTP/1.0" 200 36 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET / HTTP/1.1" 200 1927
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/
class Summary {
def hosts = [:]
def what = [:]
def accessCount = 0
def postCount = 0
def homeCount = 0
def totalBytes = 0
}
totals = [:]
LOGFILE.trim().split('\n').each{ line ->
m = line =~ regex
if (m.matches()) {
date = m[0][fields.indexOf('date')+1]
s = totals.get(date, new Summary())
s.accessCount++
if (m[0][fields.indexOf('method')+1] == 'POST') s.postCount++
s.totalBytes += (m[0][fields.indexOf('bytes')+1]).toInteger()
def url = m[0][fields.indexOf('url')+1]
if (url == '/') s.homeCount++
s.what[url] = s.what.get(url, 0) + 1
def host = m[0][fields.indexOf('client')+1]
s.hosts[host] = s.hosts.get(host, 0) + 1
}
}
report('Date','Hosts','Accesses','Unidocs','POST','Home','Bytes')
totals.each{ key, s ->
report(key, s.hosts.size(), s.accessCount, s.what.size(), s.postCount, s.homeCount, s.totalBytes)
}
v = totals.values()
report('Grand Total', v.sum{it.hosts.size()}, v.sum{it.accessCount}, v.sum{it.what.size()},
v.sum{it.postCount}, v.sum{it.homeCount}, v.sum{it.totalBytes} )
def report(a, b, c, d, e, f, g) {
printf ("%12s %6s %8s %8s %8s %8s %10s\n", [a,b,c,d,e,f,g])
}
// =>
// Date Hosts Accesses Unidocs POST Home Bytes
// 03/Jul/1996 1 1 1 1 0 5593
// 10/Oct/2000 1 1 1 0 0 2326
// 04/Sep/2005 1 2 2 0 1 2230
// 05/Sep/2005 1 2 1 0 0 12456
// 26/Apr/2000 3 6 6 0 0 1579790
// 27/Apr/2000 1 3 3 0 0 5349
// Grand Total 8 15 14 1 1 1607744
// Some open source log processing packages in Java:
// http://www.generationjava.com/projects/logview/index.shtml
// http://ostermiller.org/webalizer/
// http://jxla.nvdcms.org/en/index.xml
// http://polliwog.sourceforge.net/index.html
// as well as textual reports, most of these can produce graphical reports
// Most have their own configuration information and Java extension points.
//----------------------------------------------------------------------------------
[править] Program: htmlsub
//----------------------------------------------------------------------------------
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource
input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog
himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send
cards</A>.
</BODY></HTML>
'''
class WordReplaceFilter extends DefaultFilter {
private before, after
WordReplaceFilter(b, a) { before = b; after = a }
void characters(XMLString text, Augmentations augs) {
char[] c = text.toString().replaceAll(before, after)
super.characters(new XMLString(c, 0, c.size()), augs)
}
void setProperty(String s, Object o){}
}
XMLDocumentFilter[] filters = [
new WordReplaceFilter(/(?sm)picture/, /photo/),
new Writer()
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", filters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------
[править] Program: hrefsub
//----------------------------------------------------------------------------------
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource
input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog
himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send
cards</A>.
</BODY></HTML>
'''
class HrefReplaceFilter extends DefaultFilter {
private before, after
HrefReplaceFilter(b, a) { before = b; after = a }
void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
def idx = attributes.getIndex('href')
if (idx != -1) {
def newtext = attributes.getValue(idx).replaceAll(before, after)
attributes.setValue(idx, URLEncoder.encode(newtext))
}
super.startElement(element, attributes, augs)
}
void setProperty(String s, Object o){}
}
XMLDocumentFilter[] myfilters = [
new HrefReplaceFilter(/shergold.html/, /cards.html/),
new Writer()
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", myfilters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------
|