Ruby/Network/HTML Parser

Материал из Wiki.crossplatform.ru

Перейти к: навигация, поиск

Содержание

Check body size

require "net/http"
require "uri"
module Net
  class HTTP
    # Performs a GET request against +uri+, optionally sending custom request
    # headers, and returns the response.
    #
    # uri     - a URI object, or anything that responds to +to_str+ (it is
    #           then parsed with URI.parse).
    # headers - optional Hash of header names to values, handed unchanged to
    #           Net::HTTP#get; nil sends no extra headers.
    #
    # Returns the value of Net::HTTP#get for the request.
    def HTTP.get_with_headers(uri, headers=nil)
      uri = URI.parse(uri) if uri.respond_to? :to_str
      # A URL such as "http://host" has an empty path; the request target
      # must still be "/" or the request line is malformed.
      path = uri.path.to_s.empty? ? "/" : uri.path
      path_query = uri.query ? "#{path}?#{uri.query}" : path
      # Enable TLS for https URLs; plain http keeps the previous behaviour.
      # The block's value is what start returns, so no explicit return is needed.
      start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
        http.get(path_query, headers)
      end
    end
  end
end
# Fetch the page with no extra request headers, then report how long the
# response body is.
page_url = "http://www.cnn.ru/"
uncompressed = Net::HTTP.get_with_headers(page_url)
uncompressed.body.size



Customizing HTTP Request Headers

require "net/http"
require "uri"
module Net
  class HTTP
    # Performs a GET request against +uri+, optionally sending custom request
    # headers, and returns the response.
    #
    # uri     - a URI object, or anything that responds to +to_str+ (it is
    #           then parsed with URI.parse).
    # headers - optional Hash of header names to values, handed unchanged to
    #           Net::HTTP#get; nil sends no extra headers.
    #
    # Returns the value of Net::HTTP#get for the request.
    def HTTP.get_with_headers(uri, headers=nil)
      uri = URI.parse(uri) if uri.respond_to? :to_str
      # A URL such as "http://host" has an empty path; the request target
      # must still be "/" or the request line is malformed.
      path = uri.path.to_s.empty? ? "/" : uri.path
      path_query = uri.query ? "#{path}?#{uri.query}" : path
      # Enable TLS for https URLs; plain http keeps the previous behaviour.
      # The block's value is what start returns, so no explicit return is needed.
      start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
        http.get(path_query, headers)
      end
    end
  end
end
# Ask for the page with a German Accept-Language header, then look at a
# slice of the body running from 200 to 140 characters before its end.
headers = {"Accept-Language" => "de"}
res = Net::HTTP.get_with_headers("http://www.google.ru/", headers)
s = res.body.size
res.body[(s - 200)..(s - 140)]



Extract URL

require "uri"
# Sample text mixing anchor tags, a bare URL followed by sentence
# punctuation, and a mailto link.
text = %{"test
<a href="http://www.a.ru/">http://www.a.ru/</a>, and be sure
to check http://www.a.ru/blog/. Email me at <a href="mailto:b@a.ru">b@a.ru</a>.}

# Sentence punctuation that can end up glued to the end of an extracted URL.
END_CHARS = %{.,"?!:;}.freeze

# Returns +url+ with every trailing END_CHARS character removed, so that
# "http://www.a.ru/blog/?!" is trimmed completely rather than by a single
# character. URLs that do not end in punctuation are returned unchanged.
def strip_trailing_punctuation(url)
  url.sub(/[#{Regexp.escape(END_CHARS)}]+\z/, "")
end

# Only http URLs are requested, so the mailto link is skipped.
# NOTE(review): URI.extract is marked obsolete in Ruby's uri library and
# appears to warn when $VERBOSE is set - confirm before relying on it long term.
urls = URI.extract(text, ["http"]).map { |u| strip_trailing_punctuation(u) }
p urls



Grab links for anchor and image

require "rexml/document"
require "rexml/streamlistener"
require "set"
# Stream listener that gathers link targets while REXML parses a document.
# For each tag named in the configuration, the values of the listed
# attributes are added to a Set, so repeated targets are stored once.
class LinkGrabber
  include REXML::StreamListener

  # The Set of attribute values collected so far.
  attr_reader :links

  # interesting_tags - Hash mapping a tag name to the attribute names whose
  #                    values should be collected for that tag.
  def initialize(interesting_tags = {"a" => %w{href}, "img" => %w{src}}.freeze)
    @tags = interesting_tags
    @links = Set.new
  end

  # Stream callback for an opening tag: records each configured attribute
  # that is present on it. Tags without a configuration entry are ignored.
  def tag_start(name, attrs)
    wanted = @tags[name]
    return unless wanted

    wanted.each do |uri_attr|
      target = attrs[uri_attr]
      @links << target if target
    end
  end

  # Runs REXML's stream parser over +text+ with this object as the listener.
  def parse(text)
    REXML::Document.parse_stream(text, self)
  end
end
 
# Sample fragment: two anchors surrounded by loose text.
text = %{"test
<a href="http://www.example.ru/">http://www.example.ru/</a>, http://www.example.ru/blog/. Email me at <a
href="mailto:bob@example.ru">b@e.ru</a>.}
grabber = LinkGrabber.new
# The fragment has character data outside any element, so on its own it is
# not a well-formed XML document and a strict parser may raise
# REXML::ParseException. Wrapping it in a synthetic root element makes it
# well-formed; "root" is not a tag the grabber collects from by default, so
# the resulting links are unaffected.
grabber.parse("<root>#{text}</root>")
p grabber.links