这个脚本从 www.cnproxy.com 网站上取得最新的 10 页代理服务器的 IP 和端口,将它们写到 proxy_ip_lists.txt 文本文件中;然后逐个通过这些代理访问 www.google.com,并输出每一个代理服务器的响应时间。
#encoding: UTF-8 #添加这一行才可以处理中文
require 'rubygems' #gem install xxx --no-ri --no-rdoc
require 'net/http'
require 'timeout'
require 'uri'
require 'watir-webdriver'
# Remember when the script started so the total running time can be reported.
time1 = Time.now
# Handle for the output list; mode 'w' truncates any previous file contents.
proxy_ip_lists = File.open('proxy_ip_lists.txt', 'w')
# Decodes letter-obfuscated digits back into their numeric characters.
#
# The substitution is z=3, c=1, r=8, d=0, k=2, m=4, l=9, b=5, i=7, w=6;
# every other character is passed through unchanged.
#
# The argument is not modified, so frozen strings are accepted.
#
# @param str [String] text containing the obfuscated digits
# @return [String] a new string with the letters replaced by digits
def char2int(str)
  str.tr('zcrdkmlbiw', '3180249576')
end
# Download the ten proxy listing pages and write one "ip:port" line per match
# to proxy_ip_lists.
# The first capture is the text that precedes the inline script inside a table
# cell; the second is the expression the script writes after ":". That
# expression has its '+' characters removed and its letters decoded by char2int.
regex=/<td>(.*?)<SCRIPT type=text\/javascript>document.write\(":"(.*?)\)<\/SCRIPT><\/td>/ #http://www.rubular.com
(1..10).each do |page_no| # there are 10 pages in total
  url = "http://www.cnproxy.com/proxy#{page_no}.html"
  content = Net::HTTP.get(URI.parse(url))
  content.scan(regex) do |ip, encoded_port|
    proxy_ip_lists.puts "#{ip}:#{char2int(encoded_port.delete('+'))}"
  end
end
# Close the handle so buffered lines are flushed to disk before the file is
# reopened for reading.
proxy_ip_lists.close
# Time a request through every proxy listed in proxy_ip_lists.txt.
# For each "ip:port" line, fetch /index.html from +url+ through that proxy and
# print how long the request took. A proxy that fails or needs more than
# 5 seconds is reported and skipped.
url = 'www.google.com'
# File.foreach stops cleanly at end of file and closes the handle itself.
File.foreach("proxy_ip_lists.txt") do |line|
  entry = line.strip
  next if entry.empty? # tolerate blank lines in the list

  proxy_addr, proxy_port = entry.split(":")
  begin
    # Only the network round-trip is subject to the 5-second limit.
    Timeout.timeout(5) do
      Net::HTTP::Proxy(proxy_addr, proxy_port.to_i).start(url) do |http|
        start_at = Time.now
        http.get('/index.html')
        diff = Time.now - start_at # response time in seconds
        puts entry + " accessing " + url + " using " + diff.to_s # shows each proxy's response time
      end
    end
  rescue SystemCallError, Timeout::Error, EOFError, SocketError, Net::HTTPBadResponse
    # A dead or misbehaving proxy must not abort the scan; iteration simply
    # continues with the next line, so no retry is needed.
    puts "exception raised !"
  end
end
# Report the total running time of the script in seconds.
time2 = Time.now
elapsed = time2 - time1
puts "这个程序运行了一共 " + elapsed.to_s + " 秒"
Comments !