Watir收集最快的代理服务器ip列表

这个脚本从www.cnproxy.com网站上取得最新的10页代理服务器的ip和端口,将它们写到proxy_ip_lists.txt文本文件中去;然后逐个通过这些代理服务器访问www.google.com,并在屏幕上打印每一个代理服务器的响应时间。

#encoding: UTF-8                       #添加这一行才可以处理中文
require 'rubygems'                     #gem install xxx --no-ri --no-rdoc
require 'net/http'
require 'timeout'
require 'uri'
require 'watir-webdriver'

# Wall-clock start time; the end of the script subtracts it to report the total run time.
time1 = Time.now
# Output file for the scraped proxies, opened in write mode (an existing file is truncated).
proxy_ip_lists = File.open("proxy_ip_lists.txt", "w")
# Translates letter-coded digits into decimal digits.
#
# Ten lowercase letters each stand for one digit:
# z=3, c=1, r=8, d=0, k=2, m=4, l=9, b=5, i=7, w=6.
# Every other character is passed through unchanged.
#
# The argument is not modified (a new String is returned), so frozen
# strings are accepted.
#
# @param str [String] text containing the code letters, e.g. "rd"
# @return [String] a copy of +str+ with each code letter replaced by its digit
def char2int(str)
  str.tr('zcrdkmlbiw', '3180249576')
end

# Download the ten listing pages and write every proxy found to the output
# file, one "ip:port" entry per line.
#
# Capture 1 is the text right after <td>; capture 2 is whatever follows ":"
# inside document.write(...), which the loop below treats as '+'-separated
# code letters for the port. (Regex can be tried out at http://www.rubular.com)
regex = /<td>(.*?)<SCRIPT type=text\/javascript>document.write\(":"(.*?)\)<\/SCRIPT><\/td>/

(1..10).each do |page_no|                # there are 10 pages in total
  url = "http://www.cnproxy.com/proxy#{page_no}.html"
  content = Net::HTTP.get(URI.parse(url))
  content.scan(regex) do |ip, encoded_port|
    proxy_ip_lists.puts "#{ip}:#{char2int(encoded_port.delete('+'))}"
  end
end

# Close (and thereby flush) the list so that every entry is on disk before the
# file is opened again for reading.
proxy_ip_lists.close

 # Probe every proxy listed in proxy_ip_lists.txt: fetch /index.html from the
 # target host through the proxy and print how long the request took.
 # Each proxy gets at most 5 seconds for connect + request; a proxy that fails
 # or times out is reported and skipped, and the loop ends cleanly at end of file.
 url = 'www.google.com'
 File.foreach("proxy_ip_lists.txt") do |line|
   next if line.strip.empty?                        # ignore blank lines

   entry = line.rstrip
   proxy_addr, proxy_port = entry.split(":")
   begin
     Timeout.timeout(5) do
       Net::HTTP::Proxy(proxy_addr, proxy_port.to_i).start(url) do |http|
         # Monotonic clock: the measurement is unaffected by system clock changes.
         start_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
         http.get('/index.html')
         diff = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_at   # response time in seconds
         # Shows the response time of each proxy server.
         puts entry + " accessing " + url + " using " + diff.to_s
       end
     end
   rescue SystemCallError, Timeout::Error, EOFError, SocketError
     # SystemCallError covers every Errno::* failure (refused, reset, unreachable, ...).
     # The proxy is simply skipped; the next iteration moves on to the next entry.
     puts "exception raised !"
   end
 end
# Report the total wall-clock run time of the script, in seconds.
time2 = Time.now
elapsed_seconds = time2 - time1
puts "这个程序运行了一共 #{elapsed_seconds} 秒"

Comments !