Ocaml/FAQ/Web Automation
== 20. Web Automation ==

=== Introduction ===

<source lang="ocaml">
(* Libraries for HTTP clients and servers are listed at The Caml Hump:
   http://caml.inria.fr/cgi-bin/hump.en.cgi?browse=40 *)
</source>
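All of the recipes below load Ocamlnet through findlib, either in the toplevel or in a script header. As a minimal sketch (the URL is only a placeholder), fetching a page with the "Convenience" interface looks like this:

<source lang="ocaml">
(* Load findlib's toplevel directives, then the Ocamlnet HTTP client. *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

(* Fetch a page as a single string. *)
let body = http_get "http://www.example.com/"
let () = Printf.printf "fetched %d bytes\n" (String.length body)
</source>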
=== Fetching a URL from a Perl Script ===

<source lang="ocaml">
(* If you just want to read a URL as a string, Ocamlnet's "Convenience"
   interface to Http_client is as easy as it gets. For the more powerful
   general interface to Http_client, see the next example. *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience
let content = http_get url

(*-----------------------------*)

#!/usr/bin/ocaml
(* titlebytes - find the title and size of documents *)
#use "topfind";;
#require "str";;
#require "netclient";;

let raw_url = Sys.argv.(1)
let url = Neturl.parse_url raw_url

let () =
  Printf.printf "%s =>\n\t%!" (Neturl.string_of_url url);
  let call = new Http_client.get (Neturl.string_of_url url) in
  call#set_req_header "User-Agent" "Schmozilla/v9.14 Platinum";
  call#set_req_header "Referer" "http://wizard.yellowbrick.oz";
  let pipeline = new Http_client.pipeline in
  pipeline#add call;
  pipeline#run ();
  match call#status with
    | `Successful ->
        let content = call#get_resp_body () in
        let bytes = String.length content in
        let count = ref 0 in
        String.iter (function '\n' -> incr count | _ -> ()) content;
        let regexp =
          Str.regexp_case_fold ".*<title>\\([^<]*\\)</title>.*" in
        let title =
          try
            ignore (Str.search_forward regexp content 0);
            Str.matched_group 1 content
          with Not_found -> "(untitled)" in
        let title =
          Str.global_replace
            (Str.regexp "\\(^[\n\r\t ]+\\)\\|\\([\n\r\t ]+$\\)") "" title in
        Printf.printf "%s (%d lines, %d bytes)\n" title !count bytes
    | `Client_error ->
        Printf.eprintf "Client error: %d %s\n"
          call#response_status_code call#response_status_text
    | `Http_protocol_error e ->
        Printf.eprintf "HTTP protocol error: %s\n" (Printexc.to_string e)
    | `Redirection -> Printf.eprintf "Redirection\n"
    | `Server_error -> Printf.eprintf "Server error\n"
    | `Unserved -> assert false
</source>
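On failure, the Convenience functions raise Http_client.Http_error with the status code and response body (the htitle script further down catches it the same way). A minimal sketch with a placeholder URL:

<source lang="ocaml">
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

let () =
  try
    let content = http_get "http://www.example.com/" in
    Printf.printf "fetched %d bytes\n" (String.length content)
  with Http_client.Http_error (status, _) ->
    Printf.eprintf "request failed with status %d\n" status
</source>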
=== Automating Form Submission ===

<source lang="ocaml">
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

(* Submit a form using GET. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod?module=DB_File&readme=1"
let content = http_get url

(* Submit a form using POST. Since we need to follow a redirect here,
   we can't use the "Convenience" methods. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod"
let params = ["module", "DB_File"; "readme", "1"]

let call = new Http_client.post url params
let pipeline = new Http_client.pipeline

let () =
  call#set_redirect_mode Http_client.Redirect;
  pipeline#add call;
  pipeline#run ()

let content = call#response_body#value

(* GET parameters can be URL encoded with Netencoding.Url.encode. *)
let arg = "\"this isn't <EASY> & <FUN>\"";;

Netencoding.Url.encode arg;;
(* - : string = "%22this+isn%27t+%3CEASY%3E+%26+%3CFUN%3E%22" *)

Netencoding.Url.encode ~plus:false arg;;
(* - : string = "%22this%20isn%27t%20%3CEASY%3E%20%26%20%3CFUN%3E%22" *)

(* To use a proxy, either set the "http_proxy" environment variable and call
   "set_proxy_from_environment" on the pipeline (done automatically for the
   "Convenience" methods) or set the proxy host and port using the
   "set_proxy" method: *)
let () = pipeline#set_proxy "localhost" 3128
</source>
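When the form data already sits in an association list, the query string for the GET case does not have to be written by hand. A small sketch using Netencoding.Url.mk_url_encoded_parameters from netstring (the URL is the same example as above):

<source lang="ocaml">
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

(* Encode (name, value) pairs into "name=value&..." and append to the URL. *)
let params = ["module", "DB_File"; "readme", "1"]
let query = Netencoding.Url.mk_url_encoded_parameters params
let content = http_get ("http://www.perl.com/cgi-bin/cpan_mod?" ^ query)
</source>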
=== Extracting URLs ===

<source lang="ocaml">
(* The Nethtml library, part of Ocamlnet, can parse arbitrary HTML from
   files and web pages. *)
#use "topfind";;
#require "netstring";;

open Nethtml

(* Define a function to walk through all the elements in a document and
   accumulate the results of a user-supplied function for each element.
   This is known as a "fold" in functional programming. *)
let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu -> fold_elements f accu child)
          children accu in
      f accu element
  | other -> accu

(* Define a type for links so we can tell anchors and images apart. *)
type link = A of string | IMG of string

(* Using fold_elements, define a function that collects the URLs from all
   the "a" and "img" tags. *)
let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

(* Parse an HTML file. *)
let elements = parse (new Netchannels.input_channel (open_in filename))

(* Print the links we found. *)
let () =
  List.iter
    (function
       | A href -> Printf.printf "ANCHOR: %s\n" href
       | IMG src -> Printf.printf "IMAGE: %s\n" src)
    (find_links elements)

(*-----------------------------*)

#!/usr/bin/ocaml
(* xurl - extract unique, sorted list of links from URL *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience
open Nethtml

let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu -> fold_elements f accu child)
          children accu in
      f accu element
  | other -> accu

type link = A of string | IMG of string

let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))

let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
  {url_syntax with Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax

module StringSet = Set.Make(String)

let () =
  StringSet.iter print_endline
    (List.fold_left
       (fun accu s ->
          try
            StringSet.add
              (Neturl.string_of_url
                 (Neturl.apply_relative_url
                    (Neturl.url_of_string url_syntax base_url)
                    (Neturl.url_of_string url_syntax s)))
              accu
          with Neturl.Malformed_URL ->
            Printf.eprintf "Malformed URL: %s\n%!" s;
            accu)
       StringSet.empty
       (List.map
          (function A href -> href | IMG src -> src)
          (find_links elements)))
</source>
=== Converting ASCII to HTML ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* text2html - trivial html encoding of normal text *)
#use "topfind";;
#require "str";;
#require "netstring";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _ | None, _ ->
          Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let chop s =
  if s = "" then s else String.sub s 0 (String.length s - 1);;

let substitutions = [
  (* embedded URL (good) or guessed URL (bad) *)
  Str.regexp "\\(<URL:[^>]+>\\)\\|\\(http:[^ \n\r\t]+\\)",
  (fun s ->
     let s =
       if s.[0] = '<'
       then String.sub s 5 (String.length s - 6)
       else s in
     [Nethtml.Element ("a", ["href", s], [Nethtml.Data s])]);

  (* this is *bold* here *)
  Str.regexp "\\*[^*]+\\*",
  (fun s -> [Nethtml.Element ("strong", [], [Nethtml.Data s])]);

  (* this is _italics_ here *)
  Str.regexp "_[^ _]+_",
  (fun s -> [Nethtml.Element ("em", [], [Nethtml.Data s])]);
]

let substitute regexp func data =
  List.flatten
    (List.map
       (function
          | Str.Text s -> [Nethtml.Data s]
          | Str.Delim s -> func s)
       (Str.full_split regexp data))

let rec map_data f list =
  List.flatten
    (List.map
       (function
          | Nethtml.Data data -> f data
          | Nethtml.Element (name, attrs, children) ->
              [Nethtml.Element (name, attrs, map_data f children)])
       list)

let text2html text =
  (* Create the initial HTML tree. *)
  let html = [Nethtml.Data text] in
  (* Split text into lines. *)
  let html =
    List.flatten
      (List.map
         (function
            | Nethtml.Data data ->
                List.map
                  (fun line -> Nethtml.Data (line ^ "\n"))
                  ("" :: Str.split (Str.regexp "\n") data)
            | Nethtml.Element _ as e -> [e])
         html) in
  (* Perform inline substitutions. *)
  let html =
    List.fold_right
      (fun (regexp, func) -> map_data (substitute regexp func))
      substitutions html in
  (* Add line breaks to quoted text. *)
  let html =
    List.flatten
      (List.map
         (function
            | Nethtml.Data line when line.[0] = '>' ->
                [Nethtml.Data (chop line);
                 Nethtml.Element ("br", [], []);
                 Nethtml.Data "\n"]
            | Nethtml.Data line -> [Nethtml.Data line]
            | Nethtml.Element _ as e -> [e])
         html) in
  (* Return the finished document. *)
  html

let buffer = Buffer.create 0
let channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write channel (Nethtml.encode html)

let paragraphs = paragraph_stream_of_channel stdin

(* Main loop *)
let () =
  let first = ref true in
  Stream.iter
    (fun para ->
       if !first then first := false else write [Nethtml.Data "\n\n"];
       (* Paragraphs beginning with whitespace are wrapped in <pre> *)
       let tag, body =
         if String.length para > 0 && String.contains " \t" para.[0]
         then "pre", [Nethtml.Data "\n";
                      Nethtml.Data para;  (* indented verbatim *)
                      Nethtml.Data "\n"]
         else "p", text2html para in
       (* add paragraph tag *)
       write [Nethtml.Element (tag, [], body)])
    paragraphs;
  print_endline (Buffer.contents buffer)

(*-----------------------------*)

(* To format mail headers as a table, add the following just before the
   main loop. *)
let () =
  let colon_delim = Str.regexp "[ \t]*:[ \t]*" in
  let continuation = Str.regexp "\n[ \t]+" in
  try
    let headers = Stream.next paragraphs in
    let headers = Str.global_replace continuation " " headers in
    let lines = Str.split (Str.regexp "\n") headers in
    let rows =
      List.flatten
        (List.map
           (fun line ->
              (* parse heading *)
              let key, value =
                match Str.bounded_split_delim colon_delim line 2 with
                  | [key; value] -> key, value
                  | _ -> "", line in
              [Nethtml.Element
                 ("tr", [],
                  [Nethtml.Element ("th", ["align", "left"],
                                    [Nethtml.Data key]);
                   Nethtml.Element ("td", [], [Nethtml.Data value])]);
               Nethtml.Data "\n"])
           lines) in
    write [Nethtml.Element ("table", [], Nethtml.encode rows);
           Nethtml.Element ("hr", [], []);
           Nethtml.Data "\n\n"]
  with Stream.Failure -> ()
</source>
=== Converting HTML to ASCII ===

<source lang="ocaml">
#load "unix.cma";;

let slurp_channel channel =
  let buffer_size = 4096 in
  let buffer = Buffer.create buffer_size in
  let string = String.create buffer_size in
  let chars_read = ref 1 in
  while !chars_read <> 0 do
    chars_read := input channel string 0 buffer_size;
    Buffer.add_substring buffer string 0 !chars_read
  done;
  Buffer.contents buffer

let () =
  let process = Unix.open_process_in ("lynx -dump " ^ filename) in
  let ascii = slurp_channel process in
  ignore (Unix.close_process_in process);
  (* ... *)
</source>
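Wrapped up as a reusable function, a sketch might look like the following; it reuses slurp_channel from above and assumes a lynx binary on the PATH (the function name html_to_ascii is only illustrative):

<source lang="ocaml">
(* html_to_ascii: render an HTML file to plain text by shelling out to lynx. *)
let html_to_ascii filename =
  let process =
    Unix.open_process_in ("lynx -dump " ^ Filename.quote filename) in
  let ascii = slurp_channel process in
  ignore (Unix.close_process_in process);
  ascii
</source>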
=== Extracting or Removing HTML Tags ===

<source lang="ocaml">
(* Nethtml can be used to safely isolate and extract the text elements
   from an HTML document. *)
#use "topfind";;
#require "netstring";;

(* Load the HTML document. *)
let channel = new Netchannels.input_channel (open_in filename)
let html = Nethtml.parse channel
let () = channel#close_in ()

(* Convert the document to plain text. *)
let plain_text =
  let text = ref [] in
  let rec loop html =
    List.iter
      (function
         | Nethtml.Data s -> text := s :: !text
         | Nethtml.Element (_, _, children) -> loop children)
      html in
  loop (Nethtml.decode html);
  String.concat "" (List.rev !text)

(*-----------------------------*)

#!/usr/bin/ocaml
(* htitle - get html title from URL *)
#use "topfind";;
#require "str";;
#require "netclient";;

open Http_client.Convenience

let ltrim = Str.global_replace (Str.regexp "^[ \r\n\t\x00\x0B]*") ""
let rtrim = Str.global_replace (Str.regexp "[ \r\n\t\x00\x0B]*$") ""
let trim s = rtrim (ltrim s)

let find_title html =
  let title = ref "" in
  let rec loop = function
    | Nethtml.Element ("title", _, Nethtml.Data data :: _) ->
        title := trim data;
        raise Exit
    | Nethtml.Element (_, _, children) -> List.iter loop children
    | _ -> () in
  (try List.iter loop html with Exit -> ());
  !title

let urls =
  if Array.length Sys.argv > 1
  then List.tl (Array.to_list Sys.argv)
  else (Printf.eprintf "usage: %s url ...\n" Sys.argv.(0); exit 1)

let () =
  List.iter
    (fun url ->
       print_string (url ^ ": ");
       try
         let res = http_get url in
         let ch = new Netchannels.input_string res in
         let html = Nethtml.parse ch in
         print_endline (find_title html)
       with
         | Http_client.Http_error (status, _) ->
             Printf.printf "%d %s\n" status
               (Nethttp.string_of_http_status
                  (Nethttp.http_status_of_int status))
         | Failure s -> print_endline s)
    urls
</source>
=== Finding Stale Links ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* churl - check urls *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience
open Nethtml

let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu -> fold_elements f accu child)
          children accu in
      f accu element
  | other -> accu

type link = A of string | IMG of string

let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

let check_url url =
  Printf.printf " %s: %s\n%!" url
    (match (http_head_message url)#status with
       | `Successful -> "OK"
       | _ -> "BAD")

let () =
  if Array.length Sys.argv <> 2
  then (Printf.eprintf "usage: %s <start_url>\n" Sys.argv.(0); exit 1)

let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))

let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
  {url_syntax with Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax

module StringSet = Set.Make(String)

let () =
  print_endline (base_url ^ ":");
  StringSet.iter check_url
    (List.fold_left
       (fun accu s ->
          try
            StringSet.add
              (Neturl.string_of_url
                 (Neturl.apply_relative_url
                    (Neturl.url_of_string url_syntax base_url)
                    (Neturl.url_of_string url_syntax s)))
              accu
          with Neturl.Malformed_URL ->
            Printf.eprintf "Malformed URL: %s\n%!" s;
            accu)
       StringSet.empty
       (List.map
          (function A href -> href | IMG src -> src)
          (find_links elements)))
</source>
=== Finding Fresh Links ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* surl - sort URLs by their last modification date *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

let dates = ref []

let () =
  try
    while true do
      let url = input_line stdin in
      let call = http_head_message url in
      let date =
        try Some (Netdate.parse (call#response_header#field "Last-Modified"))
        with Not_found -> None in
      dates := (date, url) :: !dates
    done
  with End_of_file -> ()

let () =
  List.iter
    (fun (date, url) ->
       Printf.printf "%-25s %s\n"
         (match date with
            | Some date -> Netdate.format "%a %b %d %H:%M:%S %Y" date
            | None -> "<NONE SPECIFIED>")
         url)
    (List.rev (List.sort compare !dates))
</source>
=== Creating HTML Templates ===

<source lang="ocaml">
(* Template replacement using regular expressions from the Str module. *)
#load "str.cma";;

let slurp_channel channel =
  let buffer_size = 4096 in
  let buffer = Buffer.create buffer_size in
  let string = String.create buffer_size in
  let chars_read = ref 1 in
  while !chars_read <> 0 do
    chars_read := input channel string 0 buffer_size;
    Buffer.add_substring buffer string 0 !chars_read
  done;
  Buffer.contents buffer

let slurp_file filename =
  let channel = open_in_bin filename in
  let result =
    try slurp_channel channel
    with e -> close_in channel; raise e in
  close_in channel;
  result

let template_regexp = Str.regexp "%%\\([^%]+\\)%%"

let template filename fillings =
  let text = slurp_file filename in
  let eval s = try Hashtbl.find fillings s with Not_found -> "" in
  let replace _ = eval (Str.matched_group 1 text) in
  Str.global_substitute template_regexp replace text

(*-----------------------------*)

(* Alternative implementation: a hand-written stream parser. This version
   avoids loading the whole template into memory, so it is efficient for
   large files. *)
let template filename fillings =
  let f = open_in filename in
  try
    let buffer = Buffer.create (in_channel_length f) in
    let text = Stream.of_channel f in
    let eval s = try Hashtbl.find fillings s with Not_found -> "" in
    let rec search () =
      match Stream.peek text with
        | None -> ()
        | Some '%' ->
            Stream.junk text;
            (match Stream.peek text with
               | None -> Buffer.add_char buffer '%'; search ()
               | Some '%' -> Stream.junk text; replace ""
               | Some c ->
                   Stream.junk text;
                   Buffer.add_char buffer '%';
                   Buffer.add_char buffer c;
                   search ())
        | Some c -> Stream.junk text; Buffer.add_char buffer c; search ()
    and replace acc =
      match Stream.peek text with
        | None ->
            Buffer.add_string buffer "%%";
            Buffer.add_string buffer acc
        | Some '%' ->
            Stream.junk text;
            (match Stream.peek text with
               | None ->
                   Buffer.add_string buffer "%%";
                   Buffer.add_string buffer acc;
                   Buffer.add_char buffer '%'
               | Some '%' ->
                   Stream.junk text;
                   Buffer.add_string buffer (eval acc);
                   search ()
               | Some c ->
                   Stream.junk text;
                   replace (acc ^ "%" ^ (String.make 1 c)))
        | Some c -> Stream.junk text; replace (acc ^ (String.make 1 c)) in
    search ();
    close_in f;
    Buffer.contents buffer
  with e -> close_in f; raise e

(*-----------------------------*)

(* simple.template contains the following:

   <!-- simple.template for internal template() function -->
   <HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
   <BODY><H1>Report for %%username%%</H1>
   %%username%% logged in %%count%% times, for a total of %%total%% minutes.
*)
let () =
  let fields = Hashtbl.create 3 in
  Hashtbl.replace fields "username" whats_his_name;
  Hashtbl.replace fields "count" (string_of_int login_count);
  Hashtbl.replace fields "total" (string_of_int minute_used);
  print_endline (template "simple.template" fields)

(* Output:

   <!-- simple.template for internal template() function -->
   <HTML><HEAD><TITLE>Report for ramen</TITLE></HEAD>
   <BODY><H1>Report for ramen</H1>
   ramen logged in 42 times, for a total of 123 minutes.
*)

(*-----------------------------*)

(* userrep - report duration of user logins using SQL database *)
let process (cgi : Netcgi.cgi) =
  cgi#set_header ~content_type:"text/html" ();
  begin match cgi#argument_value "username" with
    | "" -> cgi#out_channel#output_string "No username"
    | user ->
        let db =
          Mysql.quick_connect
            ~user:"user"
            ~password:"seekritpassword"
            ~database:"connections" () in
        let sql =
          Printf.sprintf
            " SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='%s' "
            (Mysql.escape user) in
        let result = Mysql.exec db sql in
        let default d = function Some x -> x | None -> d in
        let (count, total) =
          match Mysql.fetch result with
            | None -> ("0", "0")
            | Some row -> (default "0" row.(0), default "0" row.(1)) in
        (* template defined in the solution above *)
        let tpl = template "report.tpl" in
        let vars = Hashtbl.create 3 in
        Hashtbl.replace vars "username" user;
        Hashtbl.replace vars "count" count;
        Hashtbl.replace vars "total" total;
        cgi#out_channel#output_string (tpl vars)
  end;
  cgi#out_channel#commit_work ()

let () =
  let config = Netcgi.default_config in
  let buffered _ ch = new Netchannels.buffered_trans_channel ch in
  Netcgi_cgi.run ~config ~output_type:(`Transactional buffered) process
</source>
=== Mirroring Web Pages ===

<source lang="ocaml">
#use "topfind";;
#require "unix";;
#require "netclient";;

let mirror url file =
  let call = new Http_client.get url in
  begin
    try
      let mtime = (Unix.stat file).Unix.st_mtime in
      let date = Netdate.mk_mail_date mtime in
      call#set_req_header "If-Modified-Since" date
    with Unix.Unix_error _ -> ()
  end;
  call#set_response_body_storage (`File (fun () -> file));
  let pipeline = new Http_client.pipeline in
  pipeline#add call;
  pipeline#run ();
  if call#response_status = `Ok then
    (let date = Netdate.parse (call#response_header#field "Last-Modified") in
     Unix.utimes file 0.0 (Netdate.since_epoch date));
  call#response_status
</source>
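Since mirror returns the response status, the caller can tell a fresh download from a cached copy. A quick usage sketch (the URL and file name are placeholders, and the match assumes Nethttp's `Ok and `Not_modified status variants):

<source lang="ocaml">
let () =
  match mirror "http://www.example.com/index.html" "index.html" with
    | `Ok -> print_endline "downloaded a fresh copy"
    | `Not_modified -> print_endline "local copy is already up to date"
    | _ -> print_endline "fetch failed or was redirected"
</source>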
=== Creating a Robot ===

<source lang="ocaml">
#load "str.cma";;

(* Parse "robots.txt" content from a stream of lines and return a list of
   user agents and a multi-valued hash table containing the rules for each
   user agent. *)
let parse_robots =
  let module S = Set.Make(struct type t = string let compare = compare end) in
  (* Precompile regular expressions. *)
  let comments = Str.regexp "#.*" in
  let leading_white = Str.regexp "^[ \t]+" in
  let trailing_white = Str.regexp "[ \t\r]+$" in
  let colon_delim = Str.regexp "[ \t]*:[ \t]*" in
  fun stream ->
    let user_agent = ref "*" in
    let user_agents = ref (S.singleton "*") in
    let rules = Hashtbl.create 0 in
    Stream.iter
      (fun s ->
         let s = Str.replace_first comments "" s in
         let s = Str.replace_first leading_white "" s in
         let s = Str.replace_first trailing_white "" s in
         if String.length s > 0 then
           match Str.bounded_split_delim colon_delim s 2 with
             | ["User-agent"; value] ->
                 (* Found a new User-agent. *)
                 user_agent := value;
                 user_agents := S.add value !user_agents
             | ["Sitemap"; value] ->
                 (* Sitemaps are always global. *)
                 Hashtbl.add rules "*" ("Sitemap", value)
             | [key; value] ->
                 (* Found a rule for the current User-agent. *)
                 Hashtbl.add rules !user_agent (key, value)
             | _ -> failwith s)
      stream;
    S.elements !user_agents, rules

(* Produce a stream of lines from an input channel. *)
let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

(* Produce a stream of lines from a string in memory. *)
let line_stream_of_string string =
  Stream.of_list (Str.split (Str.regexp "\n") string)

(*-----------------------------*)

(* Use Ocamlnet to retrieve a "robots.txt" file and print its rules. *)
#use "topfind";;
#require "netclient";;

open Http_client.Convenience

let agents, rules =
  parse_robots
    (line_stream_of_string (http_get "http://sourceforge.net/robots.txt"))

let () =
  List.iter
    (fun agent ->
       Printf.printf "User-agent: %s\n" agent;
       List.iter
         (fun (key, value) -> Printf.printf "\t%s: %s\n" key value)
         (Hashtbl.find_all rules agent))
    agents
</source>
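The rules table can then drive the robot's fetch decisions. Below is a rough prefix-based check against the Disallow entries (a simplification of real robots.txt matching; path_allowed and the example path are illustrative names only, and rules comes from the snippet above):

<source lang="ocaml">
(* A Disallow rule blocks a path when its value is a non-empty prefix of it. *)
let starts_with prefix s =
  String.length s >= String.length prefix
  && String.sub s 0 (String.length prefix) = prefix

(* Return false if some "Disallow" rule for this agent (or for "*")
   matches the path. *)
let path_allowed rules agent path =
  let disallowed (key, value) =
    key = "Disallow" && value <> "" && starts_with value path in
  not (List.exists disallowed
         (Hashtbl.find_all rules agent @ Hashtbl.find_all rules "*"))

(* Example: may a generic robot fetch /tracker/ ? *)
let () =
  Printf.printf "/tracker/ allowed: %b\n"
    (path_allowed rules "*" "/tracker/")
</source>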
=== Parsing a Web Server Log File ===

<source lang="ocaml">
(* Use the Weblogs library by Richard Jones:
     http://merjis.com/developers/weblogs
   You will also need the HostIP library:
     http://merjis.com/developers/hostip *)

let log = Weblogs.import_file "/var/log/apache2/access.log"

let () =
  Array.iter
    (fun {Weblogs.src_ip=client;
          remote_username=identuser;
          username=authuser;
          t=datetime;
          http_method=method';
          full_url=url;
          http_version=protocol;
          rcode=status;
          size=bytes;
          (* Many more fields are available.
             See Weblogs API documentation for details. *)
         } ->
       (* ... *)
       ())
    log
</source>
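The loop body is left open above. As one possible sketch, here is a per-client hit count built from the src_ip field; the hash-table tallying mirrors the sumwww script in the next recipe:

<source lang="ocaml">
let hits = Hashtbl.create 64

let () =
  Array.iter
    (fun row ->
       (* Count one hit for the client address of each log record. *)
       let client = row.Weblogs.src_ip in
       let n = try Hashtbl.find hits client with Not_found -> 0 in
       Hashtbl.replace hits client (n + 1))
    log;
  Hashtbl.iter (fun client n -> Printf.printf "%-20s %d\n" client n) hits
</source>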
=== Processing Server Logs ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* sumwww - summarize web server log activity *)
#use "topfind";;
#require "weblogs";;

open Weblogs

let file =
  if Array.length Sys.argv = 2
  then Sys.argv.(1)
  else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0); exit 1)

let format_date = CalendarLib.Printer.CalendarPrinter.sprint "%d/%b/%Y"

let incr_hash hash key by =
  Hashtbl.replace hash key
    (try Hashtbl.find hash key + by with Not_found -> by)

let count_hash hash =
  let count = ref 0 in
  Hashtbl.iter (fun _ _ -> incr count) hash;
  !count

let add_hash dest src = Hashtbl.iter (incr_hash dest) src

let lastdate = ref ""
let count = ref 0
let posts = ref 0
let homes = ref 0
let bytesum = ref 0l
let hosts = ref (Hashtbl.create 0)
let whats = ref (Hashtbl.create 0)

let sumcount = ref 0
let allposts = ref 0
let allhomes = ref 0
let bytesumsum = ref 0l
let allhosts = ref (Hashtbl.create 0)
let allwhats = ref (Hashtbl.create 0)

(* display the tallies of hosts and URLs *)
let write_report () =
  Printf.printf "%s %7d %8d %8d %7d %7d %14ld\n%!"
    !lastdate (count_hash !hosts) !count (count_hash !whats)
    !posts !homes !bytesum;
  (* add to summary data *)
  sumcount := !sumcount + !count;
  bytesumsum := Int32.add !bytesumsum !bytesum;
  allposts := !allposts + !posts;
  allhomes := !allhomes + !homes;
  (* reset daily data *)
  count := 0;
  posts := 0;
  homes := 0;
  bytesum := 0l;
  add_hash !allhosts !hosts;
  add_hash !allwhats !whats;
  Hashtbl.clear !hosts;
  Hashtbl.clear !whats

(* read log file and tally hits from the host and to the URL *)
let daily_logs () =
  let log = import_file file in
  print_endline
    "   Date       Hosts Accesses  Unidocs    POST    Home          Bytes";
  print_endline
    "----------- ------- -------- -------- ------- ------- --------------";
  Array.iter
    (fun row ->
       let date = format_date row.t in
       let host = row.src_ip in
       let what = row.url in
       let post = row.http_method = POST in
       let home = what = "/" in
       let bytes = match row.size with Some n -> n | None -> 0 in
       if !lastdate = "" then lastdate := date;
       if !lastdate <> date then write_report ();
       lastdate := date;
       incr count;
       if post then incr posts;
       if home then incr homes;
       incr_hash !hosts host 1;
       incr_hash !whats what 1;
       bytesum := Int32.add !bytesum (Int32.of_int bytes))
    log;
  if !count > 0 then write_report ()

let summary () =
  lastdate := "Grand Total";
  count := !sumcount;
  bytesum := !bytesumsum;
  hosts := !allhosts;
  posts := !allposts;
  whats := !allwhats;
  homes := !allhomes;
  write_report ()

let () =
  daily_logs ();
  summary ();
  exit 0

(*-----------------------------*)

#!/usr/bin/ocaml
(* aprept - report on Apache logs *)
#use "topfind";;
#require "weblogs";;

open Weblogs

let file =
  if Array.length Sys.argv = 2
  then Sys.argv.(1)
  else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0); exit 1)

let log = import_file file
let conn = HostIP.connection ()

let incr_hash hash key by =
  Hashtbl.replace hash key
    (try Hashtbl.find hash key + by with Not_found -> by)

let report_countries () =
  let total = ref 0 in
  let countries = Hashtbl.create 0 in
  Array.iter
    (fun row ->
       let country =
         match HostIP.get_country_name conn row.src_ip with
           | Some country -> country
           | None -> "UNKNOWN" in
       incr_hash countries country 1;
       incr total)
    log;
  let country_records = ref [] in
  Hashtbl.iter
    (fun country count ->
       country_records := (count, country) :: !country_records)
    countries;
  print_endline "Domain             Records";
  print_endline "===============================";
  List.iter
    (fun (count, country) ->
       Printf.printf "%18s %5d %5.2f%%\n%!"
         country count (float count *. 100. /. float !total))
    (List.rev (List.sort compare !country_records))

let report_files () =
  let total = ref 0 in
  let totalbytes = ref 0l in
  let bytes = Hashtbl.create 0 in
  let records = Hashtbl.create 0 in
  Array.iter
    (fun row ->
       let file = row.url in
       let size = match row.size with Some n -> n | None -> 0 in
       incr_hash bytes file size;
       incr_hash records file 1;
       totalbytes := Int32.add !totalbytes (Int32.of_int size);
       incr total)
    log;
  let file_records = ref [] in
  Hashtbl.iter
    (fun file size ->
       let count = Hashtbl.find records file in
       file_records := (file, size, count) :: !file_records)
    bytes;
  print_endline "File                        Bytes              Records";
  print_endline "=========================================================";
  List.iter
    (fun (file, size, count) ->
       Printf.printf "%-22s %10d %5.2f%% %9d %5.2f%%\n%!"
         file size (float size *. 100. /. Int32.to_float !totalbytes)
         count (float count *. 100. /. float !total))
    (List.sort compare !file_records)

let () =
  report_countries ();
  print_newline ();
  report_files ()
</source>
=== Program: htmlsub ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* htmlsub - make substitutions in normal text of HTML files *)
#use "topfind";;
#require "str";;
#require "netstring";;

let usage () =
  Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
  exit 1

let from, to', files =
  match List.tl (Array.to_list Sys.argv) with
    | from :: to' :: files -> from, to', files
    | _ -> usage ()

let rec map_data f = function
  | Nethtml.Data data -> Nethtml.Data (f data)
  | Nethtml.Element (name, attribs, children) ->
      Nethtml.Element (name, attribs, List.map (map_data f) children)

let regexp = Str.regexp_string from

let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)

let () =
  List.iter
    (fun file ->
       let in_channel = new Netchannels.input_channel (open_in file) in
       let html = Nethtml.decode (Nethtml.parse in_channel) in
       in_channel#close_in ();
       write (List.map (map_data (Str.global_replace regexp to')) html))
    files;
  print_endline (Buffer.contents buffer)
</source>
=== Program: hrefsub ===

<source lang="ocaml">
#!/usr/bin/ocaml
(* hrefsub - make substitutions in <A HREF="..."> fields of HTML files *)
#use "topfind";;
#require "str";;
#require "netstring";;

let usage () =
  Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
  exit 1

let from, to', files =
  match List.tl (Array.to_list Sys.argv) with
    | from :: to' :: files -> from, to', files
    | _ -> usage ()

let rec map_attr tag attr f = function
  | Nethtml.Data _ as d -> d
  | Nethtml.Element (name, attribs, children)
      when tag = name && List.mem_assoc attr attribs ->
      let value = List.assoc attr attribs in
      Nethtml.Element (name,
                       (attr, f value) :: List.remove_assoc attr attribs,
                       List.map (map_attr tag attr f) children)
  | Nethtml.Element (name, attribs, children) ->
      Nethtml.Element (name, attribs, List.map (map_attr tag attr f) children)

let regexp = Str.regexp_string from

let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)

let () =
  List.iter
    (fun file ->
       let in_channel = new Netchannels.input_channel (open_in file) in
       let html = Nethtml.decode (Nethtml.parse in_channel) in
       in_channel#close_in ();
       write
         (List.map (map_attr "a" "href" (Str.global_replace regexp to')) html))
    files;
  print_endline (Buffer.contents buffer)
</source>