20. Web Automation
Introduction
(* Libraries for HTTP clients and servers are listed at The Caml Hump:
   http://caml.inria.fr/cgi-bin/hump.en.cgi?browse=40 *)
Fetching a URL from an OCaml Script
(* If you just want to read a URL as a string, Ocamlnet's "Convenience"
interface to Http_client is as easy as it gets. For the more powerful
general interface to Http_client, see the next example. *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
let content = http_get url
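(* As a concrete sketch (the URL below is only a placeholder), fetch a
   page and report its size: *)
let () =
  let content = http_get "http://www.example.com/" in
  Printf.printf "fetched %d bytes\n" (String.length content)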
(*-----------------------------*)
#!/usr/bin/ocaml
(* titlebytes - find the title and size of documents *)
#use "topfind";;
#require "str";;
#require "netclient";;
let raw_url = Sys.argv.(1)
let url = Neturl.parse_url raw_url
let () =
Printf.printf "%s =>\n\t%!" (Neturl.string_of_url url);
let call = new Http_client.get (Neturl.string_of_url url) in
call#set_req_header "User-Agent" "Schmozilla/v9.14 Platinum";
call#set_req_header "Referer" "http://wizard.yellowbrick.oz";
let pipeline = new Http_client.pipeline in
pipeline#add call;
pipeline#run ();
match call#status with
| `Successful ->
let content = call#get_resp_body () in
let bytes = String.length content in
let count = ref 0 in
String.iter (function '\n' -> incr count | _ -> ()) content;
let regexp =
Str.regexp_case_fold ".*<title>\\([^<]*\\)</title>.*" in
let title =
try (ignore (Str.search_forward regexp content 0);
Str.matched_group 1 content)
with Not_found -> "(untitled)" in
let title =
Str.global_replace
(Str.regexp "\\(^[\n\r\t ]+\\)\\|\\([\n\r\t ]+$\\)") ""
title in
Printf.printf "%s (%d lines, %d bytes)\n" title !count bytes
| `Client_error ->
Printf.eprintf "Client error: %d %s\n"
call#response_status_code
call#response_status_text
| `Http_protocol_error e ->
Printf.eprintf "HTTP protocol error: %s\n"
(Printexc.to_string e)
| `Redirection ->
Printf.eprintf "Redirection\n"
| `Server_error ->
Printf.eprintf "Server error\n"
| `Unserved ->
assert false
Automating Form Submission
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
(* Submit a form using GET. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod?module=DB_File&readme=1"
let content = http_get url
(* Submit a form using POST. Since we need to follow a redirect here,
we can't use the "Convenience" methods. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod"
let params = ["module", "DB_File"; "readme", "1"]
let call = new Http_client.post url params
let pipeline = new Http_client.pipeline
let () =
  call#set_redirect_mode Http_client.Redirect;
  pipeline#add call;
  pipeline#run ()
let content = call#response_body#value
(* GET parameters can be URL encoded with Netencoding.Url.encode. *)
let arg = "\"this isn't <EASY> & <FUN>\""
Netencoding.Url.encode arg
(* - : string = "%22this+isn%27t+%3CEASY%3E+%26+%3CFUN%3E%22" *)
Netencoding.Url.encode ~plus:false arg
(* - : string = "%22this%20isn%27t%20%3CEASY%3E%20%26%20%3CFUN%3E%22" *)
(* To use a proxy, either set the "http_proxy" environment variable and
call "set_proxy_from_environment" on the pipeline (done automatically
for the "Convenience" methods) or set the proxy host and port using
the "set_proxy" method: *)
let () = pipeline#set_proxy "localhost" 3128
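(* Alternatively, a sketch that picks up the proxy settings from the
   "http_proxy" environment variable mentioned above: *)
let () = pipeline#set_proxy_from_environment ()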
Extracting URLs
(* The Nethtml library, part of Ocamlnet, can parse arbitrary HTML from
files and web pages. *)
#use "topfind";;
#require "netstring";;
open Nethtml
(* Define a function to walk through all the elements in a document and
accumulate the results of a user-supplied function for each element.
This is known as a "fold" in functional programming. *)
let rec fold_elements f accu = function
| Element (_, _, children) as element ->
let accu =
List.fold_right
(fun child accu ->
fold_elements f accu child)
children
accu in
f accu element
| other -> accu
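(* For instance, fold_elements can count the element nodes in a parsed
   document (a small sketch; "elements" is a Nethtml.document list as
   produced by Nethtml.parse): *)
let count_elements elements =
  List.fold_left (fold_elements (fun n _ -> n + 1)) 0 elements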
(* Define a type for links so we can tell anchors and images apart. *)
type link = A of string | IMG of string
(* Using fold_elements, define a function that collects the URLs from
all the "a" and "img" tags. *)
let find_links elements =
List.flatten
(List.map
(fold_elements
(fun accu element ->
match element with
| Element ("a", attribs, _) ->
(try A (List.assoc "href" attribs) :: accu
with Not_found -> accu)
| Element ("img", attribs, _) ->
(try IMG (List.assoc "src" attribs) :: accu
with Not_found -> accu)
| _ -> accu)
[])
elements)
(* Parse an HTML file. *)
let elements = parse (new Netchannels.input_channel (open_in filename))
(* Print the links we found. *)
let () =
List.iter
(function
| A href -> Printf.printf "ANCHOR: %s\n" href
| IMG src -> Printf.printf "IMAGE: %s\n" src)
(find_links elements)
(*-----------------------------*)
#!/usr/bin/ocaml
(* xurl - extract unique, sorted list of links from URL *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
open Nethtml
let rec fold_elements f accu = function
| Element (_, _, children) as element ->
let accu =
List.fold_right
(fun child accu ->
fold_elements f accu child)
children
accu in
f accu element
| other -> accu
type link = A of string | IMG of string
let find_links elements =
List.flatten
(List.map
(fold_elements
(fun accu element ->
match element with
| Element ("a", attribs, _) ->
(try A (List.assoc "href" attribs) :: accu
with Not_found -> accu)
| Element ("img", attribs, _) ->
(try IMG (List.assoc "src" attribs) :: accu
with Not_found -> accu)
| _ -> accu)
[])
elements)
let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))
let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
{url_syntax with
Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax
module StringSet = Set.Make(String)
let () =
StringSet.iter print_endline
(List.fold_left
(fun accu s ->
try
StringSet.add
(Neturl.string_of_url
(Neturl.apply_relative_url
(Neturl.url_of_string url_syntax base_url)
(Neturl.url_of_string url_syntax s)))
accu
with Neturl.Malformed_URL ->
Printf.eprintf "Malformed URL: %s\n%!" s;
accu)
StringSet.empty
(List.map
(function
| A href -> href
| IMG src -> src)
(find_links elements)))
Converting ASCII to HTML
#!/usr/bin/ocaml
(* text2html - trivial html encoding of normal text *)
#use "topfind";;
#require "str";;
#require "netstring";;
let line_stream_of_channel channel =
Stream.from
(fun _ -> try Some (input_line channel) with End_of_file -> None)
let paragraph_stream_of_channel channel =
let lines = line_stream_of_channel channel in
let rec next para_lines i =
match Stream.peek lines, para_lines with
| None, [] -> None
| Some "", [] -> Stream.junk lines; next para_lines i
| Some "", _
| None, _ -> Some (String.concat "\n" (List.rev para_lines))
| Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
Stream.from (next [])
let chop s =
if s = "" then s else String.sub s 0 (String.length s - 1);;
let substitutions =
[
(* embedded URL (good) or guessed URL (bad) *)
Str.regexp "\\(<URL:[^>]+>\\)\\|\\(http:[^ \n\r\t]+\\)",
(fun s ->
let s =
if s.[0] = '<'
then String.sub s 5 (String.length s - 6)
else s in
[Nethtml.Element ("a", ["href", s], [Nethtml.Data s])]);
(* this is *bold* here *)
Str.regexp "\\*[^*]+\\*",
(fun s -> [Nethtml.Element ("strong", [], [Nethtml.Data s])]);
(* this is _italics_ here *)
Str.regexp "_[^ _]+_",
(fun s -> [Nethtml.Element ("em", [], [Nethtml.Data s])]);
]
let substitute regexp func data =
List.flatten
(List.map
(function
| Str.Text s -> [Nethtml.Data s]
| Str.Delim s -> func s)
(Str.full_split regexp data))
let rec map_data f list =
List.flatten
(List.map
(function
| Nethtml.Data data -> f data
| Nethtml.Element (name, attrs, children) ->
[Nethtml.Element (name, attrs, map_data f children)])
list)
let text2html text =
(* Create the initial HTML tree. *)
let html = [Nethtml.Data text] in
(* Split text into lines. *)
let html =
List.flatten
(List.map
(function
| Nethtml.Data data ->
List.map
(fun line -> Nethtml.Data (line ^ "\n"))
("" :: Str.split (Str.regexp "\n") data)
| Nethtml.Element _ as e -> [e])
html) in
(* Perform inline substitutions. *)
let html =
List.fold_right
(fun (regexp, func) ->
map_data (substitute regexp func))
substitutions
html in
(* Add line breaks to quoted text. *)
let html =
List.flatten
(List.map
(function
| Nethtml.Data line when line.[0] = '>' ->
[Nethtml.Data (chop line);
Nethtml.Element ("br", [], []);
Nethtml.Data "\n"]
| Nethtml.Data line -> [Nethtml.Data line]
| Nethtml.Element _ as e -> [e])
html) in
(* Return the finished document. *)
html
let buffer = Buffer.create 0
let channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write channel (Nethtml.encode html)
let paragraphs = paragraph_stream_of_channel stdin
(* Main loop *)
let () =
let first = ref true in
Stream.iter
(fun para ->
if !first then first := false
else write [Nethtml.Data "\n\n"];
(* Paragraphs beginning with whitespace are wrapped in <pre> *)
let tag, body =
if String.length para > 0 && String.contains " \t" para.[0]
then "pre", [Nethtml.Data "\n";
Nethtml.Data para; (* indented verbatim *)
Nethtml.Data "\n"]
else "p", text2html para in (* add paragraph tag *)
write [Nethtml.Element (tag, [], body)])
paragraphs;
print_endline (Buffer.contents buffer)
(*-----------------------------*)
(* To format mail headers as a table, add the following just before
the main loop. *)
let () =
let colon_delim = Str.regexp "[ \t]*:[ \t]*" in
let continuation = Str.regexp "\n[ \t]+" in
try
let headers = Stream.next paragraphs in
let headers = Str.global_replace continuation " " headers in
let lines = Str.split (Str.regexp "\n") headers in
let rows =
List.flatten
(List.map
(fun line ->
(* parse heading *)
let key, value =
match Str.bounded_split_delim colon_delim line 2 with
| [key; value] -> key, value
| _ -> "", line in
[Nethtml.Element
("tr", [],
[Nethtml.Element
("th", ["align", "left"], [Nethtml.Data key]);
Nethtml.Element
("td", [], [Nethtml.Data value])]);
Nethtml.Data "\n"])
lines) in
write [Nethtml.Element ("table", [], Nethtml.encode rows);
Nethtml.Element ("hr", [], []);
Nethtml.Data "\n\n"]
with Stream.Failure -> ()
Converting HTML to ASCII
#load "unix.cma";;
let slurp_channel channel =
let buffer_size = 4096 in
let buffer = Buffer.create buffer_size in
let string = String.create buffer_size in
let chars_read = ref 1 in
while !chars_read <> 0 do
chars_read := input channel string 0 buffer_size;
Buffer.add_substring buffer string 0 !chars_read
done;
Buffer.contents buffer
let () =
let process = Unix.open_process_in ("lynx -dump " ^ filename) in
let ascii = slurp_channel process in
ignore (Unix.close_process_in process);
(* ... *)
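(* The same pipeline, wrapped as a small reusable function (a sketch;
   it assumes the "lynx" command is installed and accepts a file name
   or URL as its argument): *)
let html_to_ascii source =
  let process =
    Unix.open_process_in ("lynx -dump " ^ Filename.quote source) in
  let ascii = slurp_channel process in
  ignore (Unix.close_process_in process);
  ascii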
Extracting or Removing HTML Tags
(* Nethtml can be used to safely isolate and extract the text elements
from an HTML document. *)
#use "topfind";;
#require "netstring";;
(* Load the HTML document. *)
let channel = new Netchannels.input_channel (open_in filename)
let html = Nethtml.parse channel
let () = channel#close_in ()
(* Convert the document to plain text. *)
let plain_text =
let text = ref [] in
let rec loop html =
List.iter
(function
| Nethtml.Data s -> text := s :: !text
| Nethtml.Element (_, _, children) -> loop children)
html in
loop (Nethtml.decode html);
String.concat "" (List.rev !text)
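(* The same extraction, wrapped as a reusable function that accumulates
   the text in a Buffer (a sketch): *)
let text_of_html html =
  let buffer = Buffer.create 256 in
  let rec loop = function
    | Nethtml.Data s -> Buffer.add_string buffer s
    | Nethtml.Element (_, _, children) -> List.iter loop children in
  List.iter loop (Nethtml.decode html);
  Buffer.contents buffer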
(*-----------------------------*)
#!/usr/bin/ocaml
(* htitle - get html title from URL *)
#use "topfind";;
#require "str";;
#require "netclient";;
open Http_client.Convenience
let ltrim = Str.global_replace (Str.regexp "^[ \r\n\t\x00\x0B]*") ""
let rtrim = Str.global_replace (Str.regexp "[ \r\n\t\x00\x0B]*$") ""
let trim s = rtrim (ltrim s)
let find_title html =
let title = ref "" in
let rec loop = function
| Nethtml.Element ("title", _, Nethtml.Data data :: _) ->
title := trim data; raise Exit
| Nethtml.Element (_, _, children) -> List.iter loop children
| _ -> () in
(try List.iter loop html with Exit -> ());
!title
let urls =
if Array.length Sys.argv > 1
then List.tl (Array.to_list Sys.argv)
else (Printf.eprintf "usage: %s url ...\n" Sys.argv.(0); exit 1)
let () =
List.iter
(fun url ->
print_string (url ^ ": ");
try
let res = http_get url in
let ch = new Netchannels.input_string res in
let html = Nethtml.parse ch in
print_endline (find_title html)
with
| Http_client.Http_error (status, _) ->
Printf.printf "%d %s\n" status
(Nethttp.string_of_http_status
(Nethttp.http_status_of_int status))
| Failure s -> print_endline s)
urls
Finding Stale Links
#!/usr/bin/ocaml
(* churl - check urls *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
open Nethtml
let rec fold_elements f accu = function
| Element (_, _, children) as element ->
let accu =
List.fold_right
(fun child accu ->
fold_elements f accu child)
children
accu in
f accu element
| other -> accu
type link = A of string | IMG of string
let find_links elements =
List.flatten
(List.map
(fold_elements
(fun accu element ->
match element with
| Element ("a", attribs, _) ->
(try A (List.assoc "href" attribs) :: accu
with Not_found -> accu)
| Element ("img", attribs, _) ->
(try IMG (List.assoc "src" attribs) :: accu
with Not_found -> accu)
| _ -> accu)
[])
elements)
let check_url url =
Printf.printf " %s: %s\n%!" url
(match (http_head_message url)#status with
| `Successful -> "OK"
| _ -> "BAD")
let () =
if Array.length Sys.argv <> 2
then (Printf.eprintf "usage: %s <start_url>\n" Sys.argv.(0); exit 1)
let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))
let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
{url_syntax with
Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax
module StringSet = Set.Make(String)
let () =
print_endline (base_url ^ ":");
StringSet.iter check_url
(List.fold_left
(fun accu s ->
try
StringSet.add
(Neturl.string_of_url
(Neturl.apply_relative_url
(Neturl.url_of_string url_syntax base_url)
(Neturl.url_of_string url_syntax s)))
accu
with Neturl.Malformed_URL ->
Printf.eprintf "Malformed URL: %s\n%!" s;
accu)
StringSet.empty
(List.map
(function
| A href -> href
| IMG src -> src)
(find_links elements)))
Finding Fresh Links
#!/usr/bin/ocaml
(* surl - sort URLs by their last modification date *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
let dates = ref []
let () =
try
while true do
let url = input_line stdin in
let call = http_head_message url in
let date =
try Some (Netdate.parse
(call#response_header#field "Last-Modified"))
with Not_found -> None in
dates := (date, url) :: !dates
done
with End_of_file -> ()
let () =
List.iter
(fun (date, url) ->
Printf.printf "%-25s %s\n"
(match date with
| Some date -> Netdate.format "%a %b %d %H:%M:%S %Y" date
| None -> "<NONE SPECIFIED>")
url)
(List.rev (List.sort compare !dates))
Creating HTML Templates
(* Template replacement using regular expressions from the Str module. *)
#load "str.cma";;
let slurp_channel channel =
let buffer_size = 4096 in
let buffer = Buffer.create buffer_size in
let string = String.create buffer_size in
let chars_read = ref 1 in
while !chars_read <> 0 do
chars_read := input channel string 0 buffer_size;
Buffer.add_substring buffer string 0 !chars_read
done;
Buffer.contents buffer
let slurp_file filename =
let channel = open_in_bin filename in
let result =
try slurp_channel channel
with e -> close_in channel; raise e in
close_in channel;
result
let template_regexp = Str.regexp "%%\\([^%]+\\)%%"
let template filename fillings =
let text = slurp_file filename in
let eval s =
try Hashtbl.find fillings s
with Not_found -> "" in
let replace _ =
eval (Str.matched_group 1 text) in
Str.global_substitute template_regexp replace text
(*-----------------------------*)
(* Alternative implementation: a hand-written stream parser. This version
   reads the template one character at a time instead of slurping it into
   a single string, although the expanded result is still accumulated in a
   Buffer before it is returned. *)
let template filename fillings =
let f = open_in filename in
try
let buffer = Buffer.create (in_channel_length f) in
let text = Stream.of_channel f in
let eval s =
try Hashtbl.find fillings s
with Not_found -> "" in
let rec search () =
match Stream.peek text with
| None -> ()
| Some '%' ->
Stream.junk text;
(match Stream.peek text with
| None ->
Buffer.add_char buffer '%';
search ()
| Some '%' ->
Stream.junk text;
replace ""
| Some c ->
Stream.junk text;
Buffer.add_char buffer '%';
Buffer.add_char buffer c;
search ())
| Some c ->
Stream.junk text;
Buffer.add_char buffer c;
search ()
and replace acc =
match Stream.peek text with
| None ->
Buffer.add_string buffer "%%";
Buffer.add_string buffer acc
| Some '%' ->
Stream.junk text;
(match Stream.peek text with
| None ->
Buffer.add_string buffer "%%";
Buffer.add_string buffer acc;
Buffer.add_char buffer '%'
| Some '%' ->
Stream.junk text;
Buffer.add_string buffer (eval acc);
search ()
| Some c ->
Stream.junk text;
replace (acc ^ "%" ^ (String.make 1 c)))
| Some c ->
Stream.junk text;
replace (acc ^ (String.make 1 c)) in
search ();
close_in f;
Buffer.contents buffer
with e ->
close_in f;
raise e
(*-----------------------------*)
(* simple.template contains the following:
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
<BODY><H1>Report for %%username%%</H1>
%%username%% logged in %%count%% times, for a total of %%total%% minutes.
*)
let () =
let fields = Hashtbl.create 3 in
Hashtbl.replace fields "username" whats_his_name;
Hashtbl.replace fields "count" (string_of_int login_count);
Hashtbl.replace fields "total" (string_of_int minute_used);
print_endline (template "simple.template" fields)
(* Output:
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for ramen</TITLE></HEAD>
<BODY><H1>Report for ramen</H1>
ramen logged in 42 times, for a total of 123 minutes.
*)
(*-----------------------------*)
(* userrep - report duration of user logins using SQL database *)
let process (cgi : Netcgi.cgi) =
cgi#set_header ~content_type:"text/html" ();
begin
match cgi#argument_value "username" with
| "" ->
cgi#out_channel#output_string "No username"
| user ->
let db =
Mysql.quick_connect
~user:"user"
~password:"seekritpassword"
~database:"connections" () in
let sql = Printf.sprintf "
SELECT COUNT(duration),SUM(duration)
FROM logins WHERE username='%s'
" (Mysql.escape user) in
let result = Mysql.exec db sql in
let default d = function Some x -> x | None -> d in
let (count, total) =
match Mysql.fetch result with
| None -> ("0", "0")
| Some row ->
(default "0" row.(0),
default "0" row.(1)) in
(* template defined in the solution above *)
let tpl = template "report.tpl" in
let vars = Hashtbl.create 3 in
Hashtbl.replace vars "username" user;
Hashtbl.replace vars "count" count;
Hashtbl.replace vars "total" total;
cgi#out_channel#output_string (tpl vars)
end;
cgi#out_channel#commit_work ()
let () =
let config = Netcgi.default_config in
let buffered _ ch = new Netchannels.buffered_trans_channel ch in
Netcgi_cgi.run ~config ~output_type:(`Transactional buffered) process
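(* The contents of "report.tpl" are not shown here; a hypothetical
   template along the lines of simple.template above would work:
   <HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
   <BODY><H1>Report for %%username%%</H1>
   %%username%% logged in %%count%% times, for a total of %%total%% minutes.
*)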
Mirroring Web Pages
#use "topfind";;
#require "unix";;
#require "netclient";;
let mirror url file =
let call = new Http_client.get url in
begin
try
let mtime = (Unix.stat file).Unix.st_mtime in
let date = Netdate.mk_mail_date mtime in
call#set_req_header "If-Modified-Since" date
with Unix.Unix_error _ -> ()
end;
call#set_response_body_storage (`File (fun () -> file));
let pipeline = new Http_client.pipeline in
pipeline#add call;
pipeline#run ();
if call#response_status = `Ok
then (let date =
Netdate.parse
(call#response_header#field "Last-Modified") in
Unix.utimes file 0.0 (Netdate.since_epoch date));
call#response_status
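(* A minimal usage sketch (placeholder URL and local file name): *)
let () =
  match mirror "http://www.example.com/index.html" "index.html" with
  | `Ok -> print_endline "index.html updated"
  | `Not_modified -> print_endline "index.html is already up to date"
  | _ -> print_endline "fetch failed"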
Creating a Robot
#load "str.cma";;
(* Parse "robots.txt" content from a stream of lines and return a
list of user agents and a multi-valued hash table containing the
rules for each user agent. *)
let parse_robots =
let module S = Set.Make(struct
type t = string
let compare = compare
end) in
(* Precompile regular expressions. *)
let comments = Str.regexp "#.*" in
let leading_white = Str.regexp "^[ \t]+" in
let trailing_white = Str.regexp "[ \t\r]+$" in
let colon_delim = Str.regexp "[ \t]*:[ \t]*" in
fun stream ->
let user_agent = ref "*" in
let user_agents = ref (S.singleton "*") in
let rules = Hashtbl.create 0 in
Stream.iter
(fun s ->
let s = Str.replace_first comments "" s in
let s = Str.replace_first leading_white "" s in
let s = Str.replace_first trailing_white "" s in
if String.length s > 0 then
match Str.bounded_split_delim colon_delim s 2 with
| ["User-agent"; value] ->
(* Found a new User-agent. *)
user_agent := value;
user_agents := S.add value !user_agents
| ["Sitemap"; value] ->
(* Sitemaps are always global. *)
Hashtbl.add rules "*" ("Sitemap", value)
| [key; value] ->
(* Found a rule for the current User-agent. *)
Hashtbl.add rules !user_agent (key, value)
| _ -> failwith s)
stream;
S.elements !user_agents, rules
(* Produce a stream of lines from an input channel. *)
let line_stream_of_channel channel =
Stream.from
(fun _ -> try Some (input_line channel) with End_of_file -> None)
(* Produce a stream of lines from a string in memory. *)
let line_stream_of_string string =
Stream.of_list (Str.split (Str.regexp "\n") string)
(*-----------------------------*)
(* Use Ocamlnet to retrieve a "robots.txt" file and print its rules. *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
let agents, rules =
parse_robots
(line_stream_of_string
(http_get "http://sourceforge.net/robots.txt"))
let () =
List.iter
(fun agent ->
Printf.printf "User-agent: %s\n" agent;
List.iter
(fun (key, value) ->
Printf.printf "\t%s: %s\n" key value)
(Hashtbl.find_all rules agent))
agents
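(* A simple permission check against the parsed rules (a sketch): a path
   is taken to be disallowed if any "Disallow" value recorded for the
   given agent, or for "*", is a prefix of it. Allow rules and wildcard
   patterns are ignored. *)
let allowed rules agent path =
  let is_prefix p s =
    String.length p <= String.length s
    && String.sub s 0 (String.length p) = p in
  let disallowed_for a =
    List.exists
      (fun (key, value) ->
         key = "Disallow" && value <> "" && is_prefix value path)
      (Hashtbl.find_all rules a) in
  not (disallowed_for agent || disallowed_for "*")
(* For example: allowed rules "*" "/some/path" *)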
Parsing a Web Server Log File
(* Use the Weblogs library by Richard Jones:
http://merjis.com/developers/weblogs
You will also need the HostIP library:
http://merjis.com/developers/hostip *)
let log = Weblogs.import_file "/var/log/apache2/access.log"
let () =
Array.iter
(fun {Weblogs.src_ip=client;
remote_username=identuser;
username=authuser;
t=datetime;
http_method=method';
full_url=url;
http_version=protocol;
rcode=status;
size=bytes;
(* Many more fields are available.
See Weblogs API documentation for details. *)
} ->
(* ... *)
())
log
Processing Server Logs
#!/usr/bin/ocaml
(* sumwww - summarize web server log activity *)
#use "topfind";;
#require "weblogs";;
open Weblogs
let file =
if Array.length Sys.argv = 2
then Sys.argv.(1)
else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0);
exit 1)
let format_date = CalendarLib.Printer.CalendarPrinter.sprint "%d/%b/%Y"
let incr_hash hash key by =
Hashtbl.replace hash key
(try Hashtbl.find hash key + by
with Not_found -> by)
let count_hash hash =
let count = ref 0 in
Hashtbl.iter (fun _ _ -> incr count) hash;
!count
let add_hash dest src =
Hashtbl.iter (incr_hash dest) src
let lastdate = ref ""
let count = ref 0
let posts = ref 0
let homes = ref 0
let bytesum = ref 0l
let hosts = ref (Hashtbl.create 0)
let whats = ref (Hashtbl.create 0)
let sumcount = ref 0
let allposts = ref 0
let allhomes = ref 0
let bytesumsum = ref 0l
let allhosts = ref (Hashtbl.create 0)
let allwhats = ref (Hashtbl.create 0)
(* display the tallies of hosts and URLs *)
let write_report () =
Printf.printf "%s %7d %8d %8d %7d %7d %14ld\n%!"
!lastdate (count_hash !hosts) !count (count_hash !whats)
!posts !homes !bytesum;
(* add to summary data *)
sumcount := !sumcount + !count;
bytesumsum := Int32.add !bytesumsum !bytesum;
allposts := !allposts + !posts;
allhomes := !allhomes + !homes;
(* reset daily data *)
count := 0;
posts := 0;
homes := 0;
bytesum := 0l;
add_hash !allhosts !hosts;
add_hash !allwhats !whats;
Hashtbl.clear !hosts;
Hashtbl.clear !whats
(* read log file and tally hits from the host and to the URL *)
let daily_logs () =
let log = import_file file in
print_endline
" Date Hosts Accesses Unidocs POST Home Bytes";
print_endline
"----------- ------- -------- -------- ------- ------- --------------";
Array.iter
(fun row ->
let date = format_date row.t in
let host = row.src_ip in
let what = row.url in
let post = row.http_method = POST in
let home = what = "/" in
let bytes = match row.size with Some n -> n | None -> 0 in
if !lastdate = "" then lastdate := date;
if !lastdate <> date then write_report ();
lastdate := date;
incr count;
if post then incr posts;
if home then incr homes;
incr_hash !hosts host 1;
incr_hash !whats what 1;
bytesum := Int32.add !bytesum (Int32.of_int bytes))
log;
if !count > 0 then write_report ()
let summary () =
lastdate := "Grand Total";
count := !sumcount;
bytesum := !bytesumsum;
hosts := !allhosts;
posts := !allposts;
whats := !allwhats;
homes := !allhomes;
write_report ()
let () =
daily_logs ();
summary ();
exit 0
(*-----------------------------*)
#!/usr/bin/ocaml
(* aprept - report on Apache logs *)
#use "topfind";;
#require "weblogs";;
open Weblogs
let file =
if Array.length Sys.argv = 2
then Sys.argv.(1)
else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0);
exit 1)
let log = import_file file
let conn = HostIP.connection ()
let incr_hash hash key by =
Hashtbl.replace hash key
(try Hashtbl.find hash key + by
with Not_found -> by)
let report_countries () =
let total = ref 0 in
let countries = Hashtbl.create 0 in
Array.iter
(fun row ->
let country =
match HostIP.get_country_name conn row.src_ip with
| Some country -> country
| None -> "UNKNOWN" in
incr_hash countries country 1;
incr total)
log;
let country_records = ref [] in
Hashtbl.iter
(fun country count ->
country_records := (count, country) :: !country_records)
countries;
print_endline "Domain Records";
print_endline "===============================";
List.iter
(fun (count, country) ->
Printf.printf "%18s %5d %5.2f%%\n%!"
country count (float count *. 100. /. float !total))
(List.rev (List.sort compare !country_records))
let report_files () =
let total = ref 0 in
let totalbytes = ref 0l in
let bytes = Hashtbl.create 0 in
let records = Hashtbl.create 0 in
Array.iter
(fun row ->
let file = row.url in
let size = match row.size with Some n -> n | None -> 0 in
incr_hash bytes file size;
incr_hash records file 1;
totalbytes := Int32.add !totalbytes (Int32.of_int size);
incr total)
log;
let file_records = ref [] in
Hashtbl.iter
(fun file size ->
let count = Hashtbl.find records file in
file_records := (file, size, count) :: !file_records)
bytes;
print_endline
"File Bytes Records";
print_endline
"=========================================================";
List.iter
(fun (file, size, count) ->
Printf.printf "%-22s %10d %5.2f%% %9d %5.2f%%\n%!"
file size
(float size *. 100. /. Int32.to_float !totalbytes)
count
(float count *. 100. /. float !total))
(List.sort compare !file_records)
let () =
report_countries ();
print_newline ();
report_files ()
Program: htmlsub
#!/usr/bin/ocaml
(* htmlsub - make substitutions in normal text of HTML files *)
#use "topfind";;
#require "str";;
#require "netstring";;
let usage () =
Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
exit 1
let from, to', files =
match List.tl (Array.to_list Sys.argv) with
| from :: to' :: files -> from, to', files
| _ -> usage ()
let rec map_data f = function
| Nethtml.Data data -> Nethtml.Data (f data)
| Nethtml.Element (name, attribs, children) ->
Nethtml.Element (name, attribs, List.map (map_data f) children)
let regexp = Str.regexp_string from
let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)
let () =
List.iter
(fun file ->
let in_channel = new Netchannels.input_channel (open_in file) in
let html = Nethtml.decode (Nethtml.parse in_channel) in
in_channel#close_in ();
write (List.map (map_data (Str.global_replace regexp to')) html))
files;
print_endline (Buffer.contents buffer)
Program: hrefsub
#!/usr/bin/ocaml
(* hrefsub - make substitutions in <A HREF="..."> fields of HTML files *)
#use "topfind";;
#require "str";;
#require "netstring";;
let usage () =
Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
exit 1
let from, to', files =
match List.tl (Array.to_list Sys.argv) with
| from :: to' :: files -> from, to', files
| _ -> usage ()
let rec map_attr tag attr f = function
| Nethtml.Data _ as d -> d
| Nethtml.Element (name, attribs, children)
when tag = name && List.mem_assoc attr attribs ->
let value = List.assoc attr attribs in
Nethtml.Element (name,
(attr, f value)
:: List.remove_assoc attr attribs,
List.map (map_attr tag attr f) children)
| Nethtml.Element (name, attribs, children) ->
Nethtml.Element (name,
attribs,
List.map (map_attr tag attr f) children)
let regexp = Str.regexp_string from
let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)
let () =
List.iter
(fun file ->
let in_channel = new Netchannels.input_channel (open_in file) in
let html = Nethtml.decode (Nethtml.parse in_channel) in
in_channel#close_in ();
write (List.map (map_attr "a" "href"
(Str.global_replace regexp to')) html))
files;
print_endline (Buffer.contents buffer)