Groovy/FAQ/Работа с Web

Материал из Wiki.crossplatform.ru

Содержание

1 Introduction
2 Fetching a URL from a Perl Script
3 Automating Form Submission
4 Extracting URLs
5 Converting ASCII to HTML
6 Converting HTML to ASCII
7 Extracting or Removing HTML Tags
8 Finding Stale Links
9 Finding Fresh Links
10 Creating HTML Templates
11 Mirroring Web Pages
12 Creating a Robot
13 Parsing a Web Server Log File
14 Processing Server Logs
15 Program: htmlsub
16 Program: hrefsub

[править] Introduction

//----------------------------------------------------------------------------------
// Many packages are available for simulating a browser. A good starting point:
// http://groovy.codehaus.org/Testing+Web+Applications
//----------------------------------------------------------------------------------

[править] Fetching a URL from a Perl Script

//----------------------------------------------------------------------------------
// for non-binary content
urlStr = 'http://groovy.codehaus.org'
content = new URL(urlStr).text
println content.size() // => 34824
 
 
// for binary content
urlStr = 'http://groovy.codehaus.org/download/attachments/1871/gina_3d.gif'
bytes = new ByteArrayOutputStream()
bytes << new URL(urlStr).openStream()
 
println bytes.size() // => 6066
 
// various forms of potential error checking
try {
    new URL('x:y:z')
 
} catch (MalformedURLException ex) {
    println ex.message // => unknown protocol: x
 
}
try {
    new URL('cnn.com/not.there')
} catch (MalformedURLException ex) {
 
    println ex.message // => no protocol: cnn.com/not.there
}
try {
    content = new URL('http://cnn.com/not.there').text
 
} catch (FileNotFoundException ex) {
    println "Couldn't find: " + ex.message
    // => Couldn't find: http://www.cnn.com/not.there
 
}
 
// titleBytes example
def titleBytes(urlStr) {
    def lineCount = 0; def byteCount = 0
 
    new URL(urlStr).eachLine{ line ->
        lineCount++; byteCount += line.size()
 
    }
    println "$urlStr => ($lineCount lines, $byteCount bytes)"
}
titleBytes('http://www.tpj.com/')
// http://www.tpj.com/ => (677 lines, 25503 bytes)
//----------------------------------------------------------------------------------

[править] Automating Form Submission

//----------------------------------------------------------------------------------
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient
 
def webClient = new WebClient()
 
def page = webClient.getPage('http://search.cpan.org/')
// check page title
assert page.titleText.startsWith('The CPAN Search Site')
// fill in form and submit it
def form = page.getFormByName('f')
 
def field = form.getInputByName('query')
field.setValueAttribute('DB_File')
def button = form.getInputByValue('CPAN Search')
 
def result = button.click()
// check search result has at least one link ending in DB_File.pm
assert result.anchors.any{ a -> a.hrefAttribute.endsWith('DB_File.pm') }
 
// fields must be properly escaped
println URLEncoder.encode(/"this isn't <EASY>&<FUN>"/, 'utf-8')
// => %22this+isn%27t+%3CEASY%3E%26%3CFUN%3E%22
 
 
// proxies can be taken from environment, or specified
//System.properties.putAll( ["http.proxyHost":"proxy-host", "http.proxyPort":"proxy-port",
//    "http.proxyUserName":"user-name", "http.proxyPassword":"proxy-passwd"] )
//----------------------------------------------------------------------------------

[править] Extracting URLs

//----------------------------------------------------------------------------------
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient
 
client = new WebClient()
 
html = client.getPage('http://www.perl.com/CPAN/')
println page.anchors.collect{ it.hrefAttribute }.sort().unique().join('\n')
 
// =>
// disclaimer.html
// http://bookmarks.cpan.org/
// http://faq.perl.org/
// mailto:cpan@perl.org
// ...
//----------------------------------------------------------------------------------

[править] Converting ASCII to HTML

//----------------------------------------------------------------------------------
// split paragraphs
LS = System.properties.'line.separator'
new File(args[0]).text.split("$LS$LS").each{ para ->
 
    if (para.startsWith(" ")) println "<pre>\n$para\n</pre>"
    else {
 
        para = para.replaceAll(/(?m)^(>.*?)$/, /$1<br \/>/)            // quoted text
 
        para = para.replaceAll(/<URL:(.*)>/, /<a href="$1">$1<\/a>/)   // embedded URL
 
        para = para.replaceAll(/(http:\S+)/, /<a href="$1">$1<\/a>/)   // guessed URL
        para = para.replaceAll('\\*(\\S+)\\*', /<strong>$1<\/strong>/) // this is *bold* here
 
        para = para.replaceAll(/\b_(\S+)_\b/, /<em>$1<\/em>/)          // this is _italic_ here
        println "<p>\n$para\n</p>"                                     // add paragraph tags
 
    }
}
 
def encodeEmail(email) {
    println "<table>"
 
    email = URLEncoder.encode(email)
    email = text.replaceAll(/(\n[ \t]+)/, / . /)   // continuation lines
 
    email = text.replaceAll(/(?m)^(\S+?:)\s*(.*?)$/,
                  /<tr><th align="left">$1<\/th><td>$2<\/td><\/tr>/);
 
    println email
    println "</table>"
}
//----------------------------------------------------------------------------------

[править] Converting HTML to ASCII

//----------------------------------------------------------------------------------
// using CyberNeko Parser (people.apache.org/~andyc/neko/doc)
 
parser = new org.cyberneko.html.parsers.SAXParser()
parser.setFeature('http://xml.org/sax/features/namespaces', false)
 
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
page.depthFirst().each{ println it.text() }
 
//----------------------------------------------------------------------------------

[править] Extracting or Removing HTML Tags

//----------------------------------------------------------------------------------
// removing tags, see 20.5
 
// extracting tags: htitle using cyberneko and XmlSlurper
parser = new org.cyberneko.html.parsers.SAXParser()
 
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
 
println page.HEAD.TITLE[0].text()
 
// extracting tags: htitle using HtmlUnit
client = new WebClient()
 
html = client.getPage('http://www.perl.com/CPAN/')
println html.titleText
//----------------------------------------------------------------------------------

[править] Finding Stale Links

//----------------------------------------------------------------------------------
import com.gargoylesoftware.htmlunit.WebClient
 
client = new WebClient()
page = client.getPage('http://www.perl.com/CPAN/')
page.anchors.each{
    checkUrl(page, it.hrefAttribute)
 
}
 
def checkUrl(page, url) {
    try {
 
        print "$url "
        qurl = page.getFullyQualifiedUrl(url)
        client.getPage(qurl)
        println 'OK'
 
    } catch (Exception ex) {
        println 'BAD'
 
    }
}
// =>
// modules/index.html OK
// RECENT.html OK
// http://search.cpan.org/recent OK
// http://mirrors.cpan.org/ OK
// http://perldoc.perl.org/ OK
// mailto:cpan@perl.org BAD
// http://www.csc.fi/suomi/funet/verkko.html.en/ BAD
// ...
//----------------------------------------------------------------------------------

[править] Finding Fresh Links

//----------------------------------------------------------------------------------
import org.apache.commons.httpclient.HttpClient
 
import org.apache.commons.httpclient.methods.HeadMethod
import java.text.DateFormat
 
urls = [
    "http://www.apache.org/",
    "http://www.perl.org/",
    "http://www.python.org/",
    "http://www.ora.com/",
    "http://jakarta.apache.org/",
    "http://www.w3.org/"
 
]
 
df = DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.MEDIUM)
client = new HttpClient()
 
urlInfo = [:]
urls.each{ url ->
    head = new HeadMethod(url)
 
    client.executeMethod(head)
    lastModified = head.getResponseHeader("last-modified")?.value
    urlInfo[df.parse(lastModified)]=url
 
}
 
urlInfo.keySet().sort().each{ key ->
    println "$key ${urlInfo[key]}"
}
 
// =>
// Sun Jan 07 21:48:15 EST 2007 http://www.apache.org/
// Sat Jan 13 12:44:32 EST 2007 http://jakarta.apache.org/
// Fri Jan 19 14:50:13 EST 2007 http://www.w3.org/
// Fri Jan 19 19:28:35 EST 2007 http://www.python.org/
// Sat Jan 20 09:36:08 EST 2007 http://www.ora.com/
// Sat Jan 20 13:25:53 EST 2007 http://www.perl.org/
//----------------------------------------------------------------------------------

[править] Creating HTML Templates

//----------------------------------------------------------------------------------
// GString version (variables must be predefined):
username = 'Tom'
count = 99
total = 999
htmlStr = """
<!-- simple.template for internal template() function -->
 
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
"""
println htmlStr
 
// SimpleTemplateEngine version:
def html = '''
 
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
'''
 
def engine = new groovy.text.SimpleTemplateEngine()
def reader = new StringReader(html)
 
def template = engine.createTemplate(reader)
println template.make(username:"Peter", count:"23", total: "1234")
 
// SQL version
import groovy.sql.Sql
user = 'Peter'
def sql = Sql.newInstance('jdbc:mysql://localhost:3306/mydb', 'dbuser',
                      'dbpass', 'com.mysql.jdbc.Driver')
 
sql.query("SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user'") { answer ->
    println (template.make(username:user, count:answer[0], total:answer[1]))
 
}
//----------------------------------------------------------------------------------

[править] Mirroring Web Pages

//----------------------------------------------------------------------------------
// using built-in connection features
urlStr = 'http://jakarta.apache.org/'
url = new URL(urlStr)
connection = url.openConnection()
 
connection.ifModifiedSince = new Date(2007,1,18).time
connection.connect()
println connection.responseCode
 
// manually setting header field
connection = url.openConnection()
 
df = new java.text.SimpleDateFormat ("EEE, dd MMM yyyy HH:mm:ss 'GMT'")
df.setTimeZone(TimeZone.getTimeZone('GMT'))
 
connection.setRequestProperty("If-Modified-Since",df.format(new Date(2007,1,18)));
connection.connect()
 
println connection.responseCode
//----------------------------------------------------------------------------------

[править] Creating a Robot

//----------------------------------------------------------------------------------
// The website http://www.robotstxt.org/wc/active/html/ lists many available robots
// including Java ones which can be used from Groovy. In particular, j-spider
// allows you to:
// + Check your site for errors (internal server errors, ...)
// + Outgoing and/or internal link checking
// + Analyze your site structure (creating a sitemap, ...)
// + Download complete web sites
// most of its functionality is available by tweaking appropriate configuration
// files and then running it as a standalone application but you can also write
// your own java classes.
//----------------------------------------------------------------------------------

[править] Parsing a Web Server Log File

//----------------------------------------------------------------------------------
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''
 
// similar to perl version:
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
 
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/
 
LOGFILE.trim().split('\n').each{ line ->
 
    m = line =~ regex
    if (m.matches()) {
        for (idx in 0..<fields.size()) { println "${fields[idx]}=${m[0][idx+1]}" }
 
        println()
    }
}
//----------------------------------------------------------------------------------

[править] Processing Server Logs

//----------------------------------------------------------------------------------
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
204.31.113.138 - - [03/Jul/1996:06:56:12 -0800] "POST /forms/login.jsp HTTP/1.0" 200 5593
fcrawler.looksmart.com - - [26/Apr/2000:00:00:12 -0400] "GET /contacts.html HTTP/1.0" 200 4595 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
fcrawler.looksmart.com - - [26/Apr/2000:00:17:19 -0400] "GET /news/news.html HTTP/1.0" 200 16716 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
ppp931.on.bellglobal.com - - [26/Apr/2000:00:16:12 -0400] "GET /download/windows/asctab31.zip HTTP/1.0" 200 1540096 "http://www.htmlgoodies.com/downloads/freeware/webdevelopment/15.html" "Mozilla/4.7 [en]C-SYMPA  (Win95; U)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/wpaper.gif HTTP/1.0" 200 6248 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:47 -0400] "GET /asctortf/ HTTP/1.0" 200 8130 "http://search.netscape.com/Computers/Data_Formats/Document/Text/RTF" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/5star2000.gif HTTP/1.0" 200 4005 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:50 -0400] "GET /pics/5star.gif HTTP/1.0" 200 1031 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /pics/a2hlogo.jpg HTTP/1.0" 200 4282 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /cgi-bin/newcount?jafsof3&width=4&font=digital&noshow HTTP/1.0" 200 36 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET / HTTP/1.1" 200 1927
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''
 
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
 
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/
 
 
class Summary {
    def hosts = [:]
    def what = [:]
 
    def accessCount = 0
    def postCount = 0
    def homeCount = 0
 
    def totalBytes = 0
}
totals = [:]
LOGFILE.trim().split('\n').each{ line ->
 
    m = line =~ regex
    if (m.matches()) {
        date = m[0][fields.indexOf('date')+1]
 
        s = totals.get(date, new Summary())
        s.accessCount++
        if (m[0][fields.indexOf('method')+1] == 'POST') s.postCount++
        s.totalBytes += (m[0][fields.indexOf('bytes')+1]).toInteger()
 
        def url = m[0][fields.indexOf('url')+1]
        if (url == '/') s.homeCount++
        s.what[url] = s.what.get(url, 0) + 1
 
        def host = m[0][fields.indexOf('client')+1]
        s.hosts[host] = s.hosts.get(host, 0) + 1
 
    }
}
report('Date','Hosts','Accesses','Unidocs','POST','Home','Bytes')
 
totals.each{ key, s ->
    report(key, s.hosts.size(), s.accessCount, s.what.size(), s.postCount, s.homeCount, s.totalBytes)
}
 
v = totals.values()
report('Grand Total', v.sum{it.hosts.size()}, v.sum{it.accessCount}, v.sum{it.what.size()},
        v.sum{it.postCount}, v.sum{it.homeCount}, v.sum{it.totalBytes} )
 
def report(a, b, c, d, e, f, g) {
    printf ("%12s %6s %8s %8s %8s %8s %10s\n", [a,b,c,d,e,f,g])
 
}
// =>
//         Date  Hosts Accesses  Unidocs     POST     Home      Bytes
//  03/Jul/1996      1        1        1        1        0       5593
//  10/Oct/2000      1        1        1        0        0       2326
//  04/Sep/2005      1        2        2        0        1       2230
//  05/Sep/2005      1        2        1        0        0      12456
//  26/Apr/2000      3        6        6        0        0    1579790
//  27/Apr/2000      1        3        3        0        0       5349
//  Grand Total      8       15       14        1        1    1607744
 
 
// Some open source log processing packages in Java:
// http://www.generationjava.com/projects/logview/index.shtml
// http://ostermiller.org/webalizer/
// http://jxla.nvdcms.org/en/index.xml
// http://polliwog.sourceforge.net/index.html
// as well as textual reports, most of these can produce graphical reports
// Most have their own configuration information and Java extension points.
//----------------------------------------------------------------------------------

[править] Program: htmlsub

//----------------------------------------------------------------------------------
 import org.cyberneko.html.filters.Writer
 
 import org.cyberneko.html.filters.DefaultFilter
 import org.apache.xerces.xni.parser.XMLDocumentFilter
 
 import org.apache.xerces.xni.*
 import org.cyberneko.html.parsers.DOMParser
 
 import org.xml.sax.InputSource
 
 input = '''
 <HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
 
 <H1>Welcome to Scooby World!</H1>
 I have <A HREF="pictures.html">pictures</A> of the crazy dog
 himself. Here's one!<P>
 <IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
 
 <BLINK>He's my hero!</BLINK> I would like to meet him some day,
 and get my picture taken with him.<P>
 P.S. I am deathly ill. <A HREF="shergold.html">Please send
 cards</A>.
 </BODY></HTML>
 
 '''
 
 class WordReplaceFilter extends DefaultFilter {
     private before, after
 
     WordReplaceFilter(b, a) { before = b; after = a }
     void characters(XMLString text, Augmentations augs) {
 
         char[] c = text.toString().replaceAll(before, after)
         super.characters(new XMLString(c, 0, c.size()), augs)
 
     }
     void setProperty(String s, Object o){}
 
 }
 XMLDocumentFilter[] filters = [
     new WordReplaceFilter(/(?sm)picture/, /photo/),
     new Writer()
 
 ]
 parser = new DOMParser()
 parser.setProperty("http://cyberneko.org/html/properties/filters", filters)
 
 parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------

[править] Program: hrefsub

//----------------------------------------------------------------------------------
 
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
 
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
 
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource
 
input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
 
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog
himself. Here's one!<P>
 
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send
cards</A>.
 
</BODY></HTML>
'''
 
class HrefReplaceFilter extends DefaultFilter {
    private before, after
 
    HrefReplaceFilter(b, a) { before = b; after = a }
    void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
 
        def idx = attributes.getIndex('href')
        if (idx != -1) {
 
            def newtext = attributes.getValue(idx).replaceAll(before, after)
            attributes.setValue(idx, URLEncoder.encode(newtext))
 
        }
        super.startElement(element, attributes, augs)
    }
    void setProperty(String s, Object o){}
 
}
XMLDocumentFilter[] myfilters = [
    new HrefReplaceFilter(/shergold.html/, /cards.html/),
    new Writer()
 
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", myfilters)
parser.parse(new InputSource(new StringReader(input)))
 
//----------------------------------------------------------------------------------

Личные инструменты

Поиск

Библиотеки

Языки

Другое

Навигация

Инструменты

Просмотры

Groovy/FAQ/Работа с Web

Материал из Wiki.crossplatform.ru

Содержание

[править] Introduction

[править] Fetching a URL from a Perl Script

[править] Automating Form Submission

[править] Extracting URLs

[править] Converting ASCII to HTML

[править] Converting HTML to ASCII

[править] Extracting or Removing HTML Tags

[править] Finding Stale Links

[править] Finding Fresh Links

[править] Creating HTML Templates

[править] Mirroring Web Pages

[править] Creating a Robot

[править] Parsing a Web Server Log File

[править] Processing Server Logs

[править] Program: htmlsub

[править] Program: hrefsub