Python/FAQ/Working with the Web


Introduction

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Fetching a URL from a Python Script

#-----------------------------
import urllib
content = urllib.urlopen(url).read()
 
try:
    import urllib
    content = urllib.urlopen(url).read()
 
except IOError:
    print "could not get %s" % url
 
#-----------------------------
# download the following standalone program
#!/usr/bin/python
# titlebytes - find the title and size of documents
 
#
# differences from Perl:
#
# * no equivalent of URI::Heuristics
# * Perl's LWP supports fetching files from the local system
# * fetching a title from ftp: or file: URLs doesn't work in Perl either.
 
import sys, urllib2, HTMLParser
if len(sys.argv)<=1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)
raw_url = sys.argv[1] 
 
 
# Python has no equivalent of Perl's URI::Heuristics, which
# would do some guessing like:
#
#   perl            -> http://www.perl.com
#   www.oreilly.com -> http://www.oreilly.com
#   ftp.funet.fi    -> ftp://ftp.funet.fi
 
#   /etc/passwd     -> file:/etc/passwd
 
# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags)>0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags)>0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        if attr == "_data":     # avoid recursion if called before __init__ finishes
            raise AttributeError(attr)
        return self._data.get(attr, "")
 
url = raw_url
print "%s =>\n\t" % url,
# TODO fake user agent "Schmozilla/v9.17 Platinum"
# TODO referer "http://wizard.yellowbrick.oz"
# since we only speak HTTP, httplib would do as well
 
try:
    response = urllib2.urlopen(url)
except urllib2.URLError:
    print " %s" % sys.exc_info()[1]
    sys.exit(1)
# the title is not in the response headers, so parse the body
data = response.read()
parser = html()
parser.feed(data)
parser.close()  # force processing all data
 
count = len(data.split("\n"))
bytes = len(data)
print "%s (%d lines, %d bytes)" % (parser.title, 
        count, 
        bytes)
 
# only the byte count is available from response.info() (Content-Length)
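
#-----------------------------
# a minimal sketch of the user-agent / referer TODOs above, using
# urllib2.Request; the header values are the made-up ones from the TODO
import urllib2
req = urllib2.Request(url)
req.add_header("User-Agent", "Schmozilla/v9.17 Platinum")
req.add_header("Referer", "http://wizard.yellowbrick.oz")
response = urllib2.urlopen(req)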

Automating Form Submission

# GET method
 
import httplib
conn = httplib.HTTPConnection('www.perl.com')
conn.request('GET','/cgi-bin/cpan_mod?module=DB_File&readme=1')
r1 = conn.getresponse()
content = r1.read()
 
# POST method
import urllib
params = urllib.urlencode({'module': 'DB_File', 'readme': 1})
content = urllib.urlopen('http://www.perl.com', params).read()
 
 
# fields must be properly escaped, e.g.
# script.cgi?arg=%22this%20isn%27t%20%3CEASY%3E%22
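
# a short sketch of producing that escaping with urllib.quote
# (urllib.quote_plus would use "+" for spaces instead):
import urllib
arg = '"this isn\'t <EASY>"'
print "script.cgi?arg=" + urllib.quote(arg)
# -> script.cgi?arg=%22this%20isn%27t%20%3CEASY%3E%22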
 
# proxies can be taken from the environment (http_proxy), or passed
# as the optional third parameter to urlopen.
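
# a minimal sketch of the proxies parameter; the proxy address below is
# made up for illustration
import urllib
proxies = {'http': 'http://proxy.example.com:3128'}   # hypothetical proxy
content = urllib.urlopen('http://www.perl.com', proxies=proxies).read()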

Extracting URLs

# download the following standalone program
#!/usr/bin/python
# xurl - extract unique, sorted list of links from URL
 
 
from HTMLParser import HTMLParser
import urllib
from sets import Set as set # not needed in 2.4
 
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:   # scan all attributes, not just the first
                if name == 'href':
                    if value.find(':') == -1:
                        # relative link: we need to add the base URL.
                        self.urls.add(self.baseUrl + '/' + value)
                    else:
                        self.urls.add(value)
url = 'http://www.perl.com/CPAN'
p = myParser(url)
s = urllib.urlopen(url)
data = s.read()
p.feed(data)
p.close()   # force processing of all buffered data
urllist = list(p.urls)   # iterate the Set directly instead of poking at ._data
urllist.sort()
print '\n'.join(urllist)
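
# note: the crude baseUrl + '/' + href join above mishandles links such as
# "../foo" or "#anchor"; the stdlib urlparse.urljoin resolves relative URLs
# properly -- a short sketch:
from urlparse import urljoin
print urljoin('http://www.perl.com/CPAN/', '../index.html')
# -> http://www.perl.com/index.html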

Converting ASCII to HTML

# Converting ASCII to HTML
 
 
# download the following standalone program
#!/usr/bin/python
# text2html - trivial html encoding of normal text
 
import sys
import re
 
# precompile regular expressions
 
re_quoted = re.compile(r"(?m)^(>.*?)$")
re_url = re.compile(r"<URL:(.*)>")
re_http = re.compile(r"(http:\S+)")
re_strong = re.compile(r"\*(\S+)\*")
re_em = re.compile(r"\b_(\S+)_\b")
 
# split paragraphs
for para in open(sys.argv[1]).read().split("\n\n"):
    # TODO encode entities: don't encode "<>" but do encode "&"
 
    if para.startswith(" "):
        print "<pre>\n%s\n</pre>" % para
    else:
 
        para = re_quoted.sub(r"\1<br />", para)          # quoted text
        para = re_url.sub(r'<a href="\1">\1</a>', para)  # embedded URL
 
        para = re_http.sub(r'<a href="\1">\1</a>', para) # guessed URL
        para = re_strong.sub(r"<strong>\1</strong>",para)   # this is *bold* here
 
        para = re_em.sub(r"<em>\1</em>",para)            # this is _italic_ here
        print "<p>\n%s\n</p>" % para                     # add paragraph tags
 
 
 
 
#-----------------------------
import sys, re
import htmlentitydefs
 
def encode_entities(s):
    for k,v in htmlentitydefs.codepoint2name.items():
        if k < 256:  # only Latin-1 codepoints; skip the rest of unicode
 
            s = s.replace(chr(k),"&%s;"%v)
    return s
 
print "<table>"
text = sys.stdin.read()
text = encode_entities(text)
text = re.sub(r"(\n[ \t]+)"," . ",text)   # continuation lines
 
text = re.sub(r"(?m)^(\S+?:)\s*(.*?)$",
              r'<tr><th align="left">\1</th><td>\2</td></tr>',
                            text);
print text    
 
print "</table>"

Converting HTML to ASCII

# Converting HTML to ASCII
 
#-----------------------------
import os
ascii = os.popen("lynx -dump " + filename).read()
 
 
#-----------------------------
import formatter
import htmllib
 
w = formatter.DumbWriter()
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed(html)
p.close()
 
# The above is the bare minimum needed to use Python's
# writer/formatter/parser framework.

# Search the Python Cookbook for more details, such as writing
# your own writers or formatters.

# Recipe #52297 has a TtyFormatter that formats underline
# and bold in a terminal. Recipe #135005 has a writer that
# accumulates text instead of printing it.
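
# a self-contained sketch of the same framework, accumulating the text in
# a StringIO instead of printing it (in the spirit of Recipe #135005):
import StringIO, formatter, htmllib

def html_to_ascii(html):
    out = StringIO.StringIO()
    w = formatter.DumbWriter(out)       # plain-text writer, wraps at 72 columns
    f = formatter.AbstractFormatter(w)
    p = htmllib.HTMLParser(f)
    p.feed(html)
    p.close()
    return out.getvalue()

print html_to_ascii("<h1>Heading</h1><p>Some <b>bold</b> text.</p>")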

Extracting or Removing HTML Tags

import re
 
plain_text = re.sub(r"<[^>]*>", "", html_text)  # WRONG: breaks on ">" inside attributes, comments or scripts
 
 
# using HTMLParser
import sys, HTMLParser
 
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._plaintext = ""
 
        self._ignore = False
    def handle_starttag(self, tag, attrs):
        if tag == "script":
            self._ignore = True
 
    def handle_endtag(self, tag):
        if tag == "script":
            self._ignore = False
    def handle_data(self, data):
        if len(data)>0 and not self._ignore:
            self._plaintext += data
    def get_plaintext(self):
        return self._plaintext
    def error(self,msg):
        # ignore all errors
 
        pass
 
html_text = open(sys.argv[1]).read()
 
parser = html()
parser.feed(html_text)
parser.close()  # force processing all data
print parser.get_plaintext()
 
title_s = re.search(r"(?i)<title>\s*(.*?)\s*</title>", html_text)
title = title_s and title_s.group(1) or "NO TITLE"
 
# download the following standalone program
#!/usr/bin/python
# htitlebytes - get html title from URL
#
 
import sys, urllib2, HTMLParser
if len(sys.argv)<=1:
    print "usage: %s url ..." % sys.argv[0]
    sys.exit(1)
 
 
# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags)>0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags)>0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        if attr == "_data":     # avoid recursion if called before __init__ finishes
            raise AttributeError(attr)
        return self._data.get(attr, "")
    def error(self,msg):
        # ignore all errors
        pass
 
for url in sys.argv[1:]:
    print "%s: " % url,
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # since we only speak HTTP, httplib would do as well
 
    try:
        response = urllib2.urlopen(url)
    except urllib2.URLError:
        print " %s" % sys.exc_info()[1]
        continue    # keep going with the remaining URLs
    # the title is not in the response headers, so parse the body
    parser = html()
    parser.feed(response.read())
    parser.close()  # force processing all data
 
    print parser.title

Finding Stale Links

# download the following standalone program
#!/usr/bin/python
# churl - check urls
 
import sys
 
# check whether a URL can be fetched; urllib cannot send a HEAD
# request, so this simply tries to open the URL

import urllib
def valid(url):
    try:
        conn = urllib.urlopen(url)
        return True
    except IOError:
        return False
 
 
# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set # not needed in 2.4
 
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:   # scan all attributes, not just the first
                if name == 'href':
                    if ':' not in value:
                        # relative link: we need to add the base URL.
                        self.urls.add(self.baseUrl + '/' + value)
                    else:
                        self.urls.add(value)
 
if len(sys.argv)<=1:
    print "usage: %s <start_url>" % (sys.argv[0])
    sys.exit(1)
 
base_url = sys.argv[1]
 
print base_url+":"
p = myParser(base_url)
s = urllib.urlopen(base_url)
data = s.read()
p.feed(data)
p.close()   # force processing of all buffered data
for link in p.urls:   # iterate the Set directly
    state = "UNKNOWN URL"
    if link.startswith("http:"):
        state = "BAD"
 
        if valid(link):
            state = "OK"
    print "  %s: %s" % (link, state)

Finding Fresh Links

# download the following standalone program
#!/usr/bin/python
 
# surl - sort URLs by their last modification date
 
import urllib
import time
import sys
from collections import defaultdict
 
Date = defaultdict(list)
for ln in sys.stdin:
    # we only read URLs from stdin, not from argv.
    ln = ln.strip()
    try:
        u = urllib.urlopen(ln)
        date = time.mktime(u.info().getdate("date"))
        Date[date].append(ln)
    except:
 
        sys.stderr.write("%s: %s!\n" % (ln, sys.exc_info()[1]))
 
dates = Date.keys()
dates.sort()    # python 2.4 would have sorted
for d in dates:
    print "%s  %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(d)),
                    ", ".join(Date[d]))

Creating HTML Templates

import re
 
 
def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        return str(fillings.get(matchobj.group(1), ""))
 
    # replace %%-delimited words with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text
 
fields = { "username":"peter", "count":"23", "total": "1234"}
 
print template("/home/httpd/templates/simple.template", fields)
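
# a hypothetical simple.template for the call above might look like:
#
#   <!-- simple.template -->
#   <HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
#   <BODY><H1>Report for %%username%%</H1>
#   %%username%% logged in %%count%% times, for a total of %%total%% minutes.
#   </BODY></HTML>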
 
# download the following standalone program
#!/usr/bin/python
# userrep1 - report duration of user logins using SQL database
 
import MySQLdb
import cgi
 
import re
import sys
 
def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        return str(fillings.get(matchobj.group(1), ""))

    # replace %%-delimited words with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text
 
fields = cgi.FieldStorage()
if "user" not in fields:
    print "Content-Type: text/plain\n"
 
    print "No username"
    sys.exit(1)
 
def get_userdata(username):
    db = MySQLdb.connect(passwd="",db="connections", user="bert")
    db.query("select count(duration) as count,"
             + " sum(duration) as total from logins"
             + " where username='%s'" % db.escape_string(username))
    res = db.store_result().fetch_row(maxrows=1,how=1)
    res[0]["username"] = username
    db.close()
    return res[0]
 
print "Content-Type: text/html\n"
 
print template("report.tpl", get_userdata(fields["user"].value))
 
# @@INCOMPLETE@@

Mirroring Web Pages

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Creating a Robot

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Parsing a Web Server Log File

# sample data, use ``LOGFILE = open(sys.argv[1])`` in real life
LOGFILE = [
        '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303\n',
        '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"\n',
        '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228\n',
        '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"\n',
    ]
 
import re
 
 
# similar to the Perl version.
web_server_log_re = re.compile(r'^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$')
 
# with group naming.
split_re = re.compile(r'''(?x)         # allow nicer formatting (but requires escaping blanks)
                       ^(?P<client>\S+)\s
                       (?P<identuser>\S+)\s
                       (?P<authuser>\S+)\s
                       \[
                         (?P<date>[^:]+):
                         (?P<time>[\d:]+)\s
                         (?P<tz>[^\]]+)
                       \]\s
                       "
                         (?P<method>\S+)\s
                         (?P<url>.*?)\s
                         (?P<protocol>\S+)
                       "\s
                       (?P<status>\S+)\s
                       (?P<bytes>\S+)
                       (?:
                         \s
                         "
                           (?P<referrer>[^"]+)
                         "\s
                         "
                           (?P<agent>[^"]+)
                         "
                       )?''')
 
for line in LOGFILE:
    f = split_re.match(line)
    if f:
        print "agent = %s" % f.group('agent')

[править] Processing Server Logs

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Program: htmlsub

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Program: hrefsub

# @@INCOMPLETE@@
# @@INCOMPLETE@@