Python/FAQ/Working with the Web
Introduction
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Fetching a URL from a Python Script
#-----------------------------
import urllib
content = urllib.urlopen(url).read()

try:
    import urllib
    content = urllib.urlopen(url).read()
except IOError:
    print "could not get %s" % url
#-----------------------------
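When more detail about a failure is needed, urllib2 distinguishes HTTP error statuses from connection problems. A minimal sketch (the URL is a placeholder):
import urllib2
try:
    content = urllib2.urlopen("http://www.perl.com/").read()
except urllib2.HTTPError, e:   # the server answered with an error status
    print "server returned an error: %s" % e.code
except urllib2.URLError, e:    # the server could not be reached at all
    print "could not connect: %s" % e.reason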
# download the following standalone program
#!/usr/bin/python
# titlebytes - find the title and size of documents
#
# differences from the Perl version:
#
# * no URI::Heuristics
# * Perl's LWP supports fetching files from the local system
# * fetching a title from ftp or file doesn't work in Perl either
import sys, urllib2, HTMLParser
if len(sys.argv) <= 1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)
raw_url = sys.argv[1]
# Python has no equivalent to Perl's URI::Heuristics, which
# would do some guessing like:
#
# perl -> http://www.perl.com
# www.oreilly.com -> http://www.oreilly.com
# ftp.funet.fi -> ftp://ftp.funet.fi
# /etc/passwd -> file:/etc/passwd
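# A rough equivalent of that guessing (an illustrative sketch only,
# not a port of URI::Heuristics); one could then write
# url = guess_url(raw_url) instead of the plain assignment below:
def guess_url(s):
    if s.startswith("/"):
        return "file:" + s
    if "://" in s:
        return s
    if s.startswith("ftp."):
        return "ftp://" + s
    return "http://" + s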
# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags) > 0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags) > 0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        return self._data.get(attr, "")
url = raw_url
print "%s =>\n\t" % url,
# TODO fake user agent "Schmozilla/v9.17 Platinum"
# TODO referer "http://wizard.yellowbrick.oz"
# as we only do http, httplib would do as well
try:
    response = urllib2.urlopen(url)
except IOError:
    print " %s" % sys.exc_info()[1]
    sys.exit(1)
# the title is not in the response headers, so parse the body
data = response.read()
parser = html()
parser.feed(data)
parser.close() # force processing all data
count = len(data.split("\n"))
bytes = len(data)
print "%s (%d lines, %d bytes)" % (parser.title,
count,
bytes)
# only the byte count is available from response.info()
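The two TODOs above (a fake user agent and a referer) can be handled with urllib2.Request, which accepts arbitrary request headers. A sketch, reusing the header values from the comments:
import urllib2
req = urllib2.Request(url)
req.add_header("User-Agent", "Schmozilla/v9.17 Platinum")
req.add_header("Referer", "http://wizard.yellowbrick.oz")
response = urllib2.urlopen(req)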
Automating Form Submission
# GET method
import httplib
conn = httplib.HTTPConnection('www.perl.com')
conn.request('GET','/cgi-bin/cpan_mod?module=DB_File&readme=1')
r1 = conn.getresponse()
content = r1.read()
# POST method
import urllib
params = urllib.urlencode({'module': 'DB_File', 'readme': 1})
content = urllib.urlopen('http://www.perl.com', params).read()
# fields must be properly escaped
# script.cgi?arg=%22this%20isn%27t%20%3CEASY%3E%22
# proxies can be taken from the environment, or specified
# as the optional third parameter to urlopen.
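For example, a sketch forcing an explicit proxy (the proxy address is a placeholder):
import urllib
proxies = {'http': 'http://proxy.example.com:8080'}  # placeholder address
content = urllib.urlopen('http://www.perl.com', proxies=proxies).read()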
Extracting URLs
# download the following standalone program
#!/usr/bin/python
# xurl - extract unique, sorted list of links from URL
from HTMLParser import HTMLParser
import urllib
from sets import Set as set # not needed in 2.4
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        # scan all attributes; href need not come first
        if tag == 'a':
            href = dict(attrs).get('href')
            if href is None:
                return
            if ':' not in href:
                # relative link: prepend the base URL
                self.urls.add(self.baseUrl + '/' + href)
            else:
                self.urls.add(href)
url = 'http://www.perl.com/CPAN'
p = myParser(url)
s = urllib.urlopen(url)
data = s.read()
p.feed(data)
urllist = list(p.urls)
urllist.sort()
print '\n'.join(urllist)
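The baseUrl string slicing above only copes with the simplest relative links; the standard urlparse module implements proper relative-reference resolution, e.g.:
import urlparse
# resolves a relative reference against a base URL per the RFC rules
print urlparse.urljoin('http://www.perl.com/CPAN/', 'modules.html')
# prints http://www.perl.com/CPAN/modules.html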
Converting ASCII to HTML
# download the following standalone program
#!/usr/bin/python
# text2html - trivial html encoding of normal text
import sys
import re
# precompile regular expressions
re_quoted = re.compile(r"(?m)^(>.*?)$")
re_url = re.compile(r"<URL:(.*)>")
re_http = re.compile(r"(http:\S+)")
re_strong = re.compile(r"\*(\S+)\*")
re_em = re.compile(r"\b_(\S+)_\b")
# split paragraphs
for para in open(sys.argv[1]).read().split("\n\n"):
    # TODO encode entities: don't encode "<>" but do "&"
    if para.startswith(" "):
        print "<pre>\n%s\n</pre>" % para
    else:
        para = re_quoted.sub(r"\1<br />", para)            # quoted text
        para = re_url.sub(r'<a href="\1">\1</a>', para)    # embedded URL
        para = re_http.sub(r'<a href="\1">\1</a>', para)   # guessed URL
        para = re_strong.sub(r"<strong>\1</strong>", para) # this is *bold* here
        para = re_em.sub(r"<em>\1</em>", para)             # this is _italic_ here
        print "<p>\n%s\n</p>" % para                       # add paragraph tags
#-----------------------------
import sys, re
import htmlentitydefs
def encode_entities(s):
    # '&' must be encoded first, otherwise already-inserted
    # entities would get encoded a second time
    s = s.replace("&", "&amp;")
    for k, v in htmlentitydefs.codepoint2name.items():
        if k < 256 and k != ord("&"):  # stick to Latin-1 codepoints
            s = s.replace(chr(k), "&%s;" % v)
    return s
print "<table>"
text = sys.stdin.read()
text = encode_entities(text)
text = re.sub(r"(\n[ \t]+)"," . ",text) # continuation lines
text = re.sub(r"(?m)^(\S+?:)\s*(.*?)$",
r'<tr><th align="left">\1</th><td>\2</td></tr>',
text);
print text
print "</table>"
Converting HTML to ASCII
#-----------------------------
import os
ascii = os.popen("lynx -dump " + filename).read()
#-----------------------------
import formatter
import htmllib
w = formatter.DumbWriter()
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed(html)
p.close()
# The above is the bare minimum needed to use the
# writer/formatter/parser framework of Python.
# Search the Python Cookbook for more details, such as writing
# your own writers or formatters.
# Recipe #52297 has a TtyFormatter that renders underline
# and bold on a terminal. Recipe #135005 has a writer that
# accumulates text instead of printing it.
Extracting or Removing HTML Tags
import re
plain_text = re.sub(r"<[^>]*>", "", html_text)  # WRONG: breaks on "<" inside comments, scripts, or attribute values
# using HTMLParser
import sys, HTMLParser
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._plaintext = ""
        self._ignore = False
    def handle_starttag(self, tag, attrs):
        if tag == "script":
            self._ignore = True
    def handle_endtag(self, tag):
        if tag == "script":
            self._ignore = False
    def handle_data(self, data):
        if len(data) > 0 and not self._ignore:
            self._plaintext += data
    def get_plaintext(self):
        return self._plaintext
    def error(self, msg):
        # ignore all errors
        pass
html_text = open(sys.argv[1]).read()
parser = html()
parser.feed(html_text)
parser.close() # force processing all data
print parser.get_plaintext()
title_s = re.search(r"(?i)<title>\s*(.*?)\s*</title>", html_text)
title = title_s.group(1) if title_s else "NO TITLE"
# download the following standalone program
#!/usr/bin/python
# htitlebytes - get html title from URL
#
import sys, urllib2, HTMLParser
if len(sys.argv) <= 1:
    print "usage: %s url ..." % sys.argv[0]
    sys.exit(1)
# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags) > 0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags) > 0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        return self._data.get(attr, "")
    def error(self, msg):
        # ignore all errors
        pass
for url in sys.argv[1:]:
    print "%s: " % url,
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # as we only do http, httplib would do as well
    try:
        response = urllib2.urlopen(url)
    except IOError:
        print " %s" % sys.exc_info()[1]
        continue  # keep going with the remaining URLs
    # the title is not in the response headers, so parse the body
    parser = html()
    parser.feed(response.read())
    parser.close()  # force processing of all data
    print parser.title
Finding Stale Links
# download the following standalone program
#!/usr/bin/python
# churl - check urls
import sys
# urllib.urlopen issues a full GET; see the httplib HEAD sketch
# after this program for a cheaper way to probe a link
import urllib

def valid(url):
    try:
        urllib.urlopen(url)
        return 1
    except IOError:
        return 0
# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set # not needed in 2.4
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        # scan all attributes; href need not come first
        if tag == 'a':
            href = dict(attrs).get('href')
            if href is None:
                return
            if ':' not in href:
                # relative link: prepend the base URL
                self.urls.add(self.baseUrl + '/' + href)
            else:
                self.urls.add(href)
if len(sys.argv) <= 1:
    print "usage: %s <start_url>" % sys.argv[0]
    sys.exit(1)
base_url = sys.argv[1]
print base_url+":"
p = myParser(base_url)
s = urllib.urlopen(base_url)
data = s.read()
p.feed(data)
for link in p.urls:
    state = "UNKNOWN URL"
    if link.startswith("http:"):
        state = "BAD"
        if valid(link):
            state = "OK"
    print " %s: %s" % (link, state)
Finding Fresh Links
# download the following standalone program
#!/usr/bin/python
# surl - sort URLs by their last modification date
import urllib
import time
import sys
from collections import defaultdict
Date = defaultdict(list)
while True:
    # we only read from stdin, not from argv
    ln = sys.stdin.readline()
    if not ln:
        break
    ln = ln.strip()
    try:
        u = urllib.urlopen(ln)
        # getdate() returns None when there is no Date header;
        # time.mktime then raises TypeError, caught below
        date = time.mktime(u.info().getdate("date"))
        Date[date].append(ln)
    except (IOError, TypeError, ValueError, OverflowError):
        sys.stderr.write("%s: %s!\n" % (ln, sys.exc_info()[1]))
dates = Date.keys()
dates.sort()  # Python 2.4 could use sorted()
for d in dates:
    print "%s %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(d)),
                     ", ".join(Date[d]))
Creating HTML Templates
import re
def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        return str(fillings.get(matchobj.group(1), ""))
    # replace quoted words with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text
fields = { "username":"peter", "count":"23", "total": "1234"}
print template("/home/httpd/templates/simple.template", fields)
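For completeness, a hypothetical simple.template that this call could fill in (the contents are an illustration, not part of the original):
<!-- simple.template -->
<html><head><title>Report for %%username%%</title></head><body>
<p>%%username%% logged in %%count%% times,
for a total of %%total%% minutes.</p>
</body></html>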
# download the following standalone program
#!/usr/bin/python
# userrep1 - report duration of user logins using SQL database
import MySQLdb
import cgi
import re
import sys
def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        return str(fillings.get(matchobj.group(1), ""))
    # replace quoted words with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text
fields = cgi.FieldStorage()
if "user" not in fields:
print "Content-Type: text/plain\n"
print "No username"
sys.exit(1)
def get_userdata(username):
    db = MySQLdb.connect(passwd="", db="connections", user="bert")
    cursor = db.cursor()
    # a parameterized query avoids SQL injection from the CGI field
    cursor.execute("select count(duration) as count,"
                   " sum(duration) as total from logins"
                   " where username = %s", (username,))
    count, total = cursor.fetchone()
    db.close()
    return {"username": username, "count": count, "total": total}
print "Content-Type: text/html\n"
print template("report.tpl", get_userdata(fields["user"].value))
# @@INCOMPLETE@@
Mirroring Web Pages
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Creating a Robot
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Parsing a Web Server Log File
# sample data, use ``LOGFILE = open(sys.argv[1])`` in real life
LOGFILE = [
'127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303\n',
'127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"\n',
'192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228\n',
'192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"\n',
]
import re
# similar to the Perl version
web_server_log_re = re.compile(r'^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$')
# with group naming.
split_re = re.compile(r'''(?x) # allow nicer formatting (but requires escaping blanks)
^(?P<client>\S+)\s
(?P<identuser>\S+)\s
(?P<authuser>\S+)\s
\[
(?P<date>[^:]+):
(?P<time>[\d:]+)\s
(?P<tz>[^\]]+)
\]\s
"
(?P<method>\S+)\s
(?P<url>.*?)\s
(?P<protocol>\S+)
"\s
(?P<status>\S+)\s
(?P<bytes>\S+)
(?:
\s
"
(?P<referrer>[^"]+)
"\s
"
(?P<agent>[^"]+)
"
)?''')
for line in LOGFILE:
    f = split_re.match(line)
    if f:
        print "agent = %s" % f.group('agent')
Processing Server Logs
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Program: htmlsub
# @@INCOMPLETE@@
# @@INCOMPLETE@@
Program: hrefsub
# @@INCOMPLETE@@
# @@INCOMPLETE@@