python爬虫

python 爬虫获取XiciDaili代理IP

默认获取前5页的代理IP，验证其是否有效，然后使用sqlite存储为本地db文件。

class getProxy():
    """Crawler that collects proxy IPs from XiciDaili and stores them locally."""

    def __init__(self):
        """Prepare the settings shared by the other methods.

        Attributes set here:
            dbname: file name of the SQLite database.
            now: today's date formatted as ``YYYY-MM-DD``.
            user_agent: browser identification string sent with requests.
            header: HTTP header dict carrying ``user_agent``.
        """
        self.dbname = "proxy.db"
        self.now = time.strftime("%Y-%m-%d")

        # The header dict is derived from the user agent, so set that first.
        self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.header = {"User-Agent": self.user_agent}



    def getContent(self, num):
        """Fetch listing page ``num`` and store every proxy on it that is alive.

        Args:
            num: page number appended to the ``/nn/`` listing URL.

        Each matching table row is reduced to its first two text cells
        (IP, port).  The pair is printed, checked with ``self.isAlive`` and,
        when that returns a true value, written with ``self.insert_db`` under
        the date ``self.now``.  Rows that do not carry two text cells are
        skipped.  Errors raised by ``urllib2.urlopen`` are not caught here.
        """
        # Domestic high-anonymity proxy listing.
        nn_url = "http://www.xicidaili.com/nn/" + str(num)
        req = urllib2.Request(nn_url, headers=self.header)
        resp = urllib2.urlopen(req, timeout=10)
        try:
            content = resp.read()
        finally:
            # Release the connection even when reading fails.
            resp.close()

        et = etree.HTML(content)
        # The page marks alternating rows with class "" and class "odd", so
        # the two groups are queried separately.  Author's note: a single
        # combined query produced mismatched rows, presumably because the
        # site changes its markup from time to time.
        # Even rows are processed first, then odd rows, as before.
        rows = et.xpath('//tr[@class=""]') + et.xpath('//tr[@class="odd"]')
        for row in rows:
            cells = row.xpath("./td/text()")[:2]
            if len(cells) < 2:
                # Not a proxy row (no IP/port cells); indexing it would raise.
                continue
            ip, port = cells
            print("IP:%s\tPort:%s" % (ip, port))
            if self.isAlive(ip, port):
                self.insert_db(self.now, ip, port)

接着实现写插入数据库函数：

def insert_db(self,date,ip,port):
    """Append one proxy record to the PROXY table of ``self.dbname``.

    Args:
        date: date string stored in the DATE column.
        ip: proxy IP address stored in the IP column.
        port: proxy port stored in the PORT column.

    The table is created on first use.  If the database cannot be opened a
    message is printed and nothing is written.  Values are bound as SQL
    parameters, so quotes inside them cannot break or alter the statement.
    """
    try:
        conn = sqlite3.connect(self.dbname)
    except sqlite3.Error:
        # The original message used "database%" as its format string, which
        # itself raised ValueError, and then went on to use the unbound conn.
        print("Error to open database %s" % self.dbname)
        return

    create_tb = '''
    CREATE TABLE IF NOT EXISTS PROXY
    (DATE TEXT,
    IP TEXT,
    PORT TEXT
    );
    '''
    insert_db_cmd = "INSERT INTO PROXY (DATE,IP,PORT) VALUES (?,?,?);"
    try:
        conn.execute(create_tb)
        # Write the date, IP and port.
        conn.execute(insert_db_cmd, (date, ip, port))
        # Without the commit the row is lost when the connection closes.
        conn.commit()
    finally:
        conn.close()

接着完成判断代理是否有效

 #查看爬到的代理IP是否还能用

    def isAlive(self,ip,port):
        """Return True when an HTTP request through ``ip:port`` succeeds.

        Args:
            ip: proxy IP address as a string.
            port: proxy port as a string.

        Returns:
            True if the test page answers with status 200 through the proxy,
            False on any other status or on any error or timeout.
        """
        proxy_map = {'http': ip + ':' + port}
        print(proxy_map)

        # Use a private opener for this check.  The original installed the
        # opener globally with urllib2.install_opener, so every later
        # urllib2.urlopen call (including the next listing-page download)
        # was routed through the last proxy tested.
        opener = urllib2.build_opener(urllib2.ProxyHandler(proxy_map))

        # Fetch the Tencent home page through the proxy to validate it.
        test_url = "http://www.qq.com"
        req = urllib2.Request(test_url, headers=self.header)
        try:
            # Proxies slower than 10 seconds count as dead; change the
            # timeout if a different latency limit is wanted.
            resp = opener.open(req, timeout=10)
        except Exception:
            # Exception instead of a bare except, so Ctrl-C still stops the run.
            print("Not work")
            return False

        try:
            alive = resp.code == 200
        finally:
            resp.close()

        if alive:
            print("work")
        else:
            print("not work")
        return alive

获取前面多少页的代理IP，用一个循环即可：

    def loop(self,page):
        """Crawl listing pages 1 through ``page`` inclusive.

        Args:
            page: number of listing pages to fetch, counted from page 1.

        The original used ``range(1, page)``, which stopped one page short
        of the requested count.
        """
        for i in range(1, page + 1):
            self.getContent(i)

更新2016-08-13
接着实现对已有的数据库进行清洗，失效的代理要移除。待续。。。

调用类实例：设置爬取前面5页的代理ip

if __name__ == "__main__":
    # Report the start time, then run the crawler with 5 as the page argument.
    started = datetime.datetime.now()
    print("Start at %s" % started)

    crawler = getProxy()
    crawler.loop(5)

获取最新source code，可以到
https://github.com/Rockyzsu/getProxy
sync up

2016-08-11

20 个评论

呵呵啊哈哈

{'http': '113.74.90.1:8888'}
Not work
Traceback (most recent call last):
File "main.py", line 115, in <module>
obj.loop(5)
File "main.py", line 63, in loop
self.getContent(i)
File "main.py", line 19, in getContent
resp = urllib2.urlopen(req, timeout=10)
File "D:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "D:\Python27\lib\urllib2.py", line 404, in open
response = self._open(req, data)
File "D:\Python27\lib\urllib2.py", line 422, in _open
'_open', req)
File "D:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "D:\Python27\lib\urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\lib\urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error timed out>
运行段时间出错是什么原因啊。

呵呵啊哈哈

大侠你有QQ吗

李魔佛回复呵呵啊哈哈

你的网络延时，也就是网速问题吧（暂时看起来是这样）

anbency

Open URL http://www.xicidaili.com/nn/2
Traceback (most recent call last):
File "main.py", line 125, in <module>
obj.loop(4)
File "main.py", line 73, in loop
self.getContent(i)
File "main.py", line 21, in getContent
resp = urllib2.urlopen(req, timeout=10)
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1201, in do_open
r = h.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1136, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''

anbency

很奇怪，用你的源码总是在urlopen http://www.xicidaili.com/nn/2 /3 /4的时候出错，而open第一页就不会出错。我试过如果直接urlopen这5个页面，不判断isAlive直接写进去db就没有问题。

另外不明白你在写入数据库之前已经判断isAlive，为什么后面还要check db，是为之后用的？

李魔佛回复 anbency

第一个问题，如果直接urlopen就出错，是不是你没有添加文件头？或者timeout太短被服务器屏蔽了？

第二个问题，因为代理存活率大概在10%，所以留着一些以后验证一样看还能不能用。

anbency

抱歉又来打搅，如果可以的话，是否可以在您的机器上运行下我这边的脚本，我这里移植只能抓取第一页。
# -*- coding=utf-8 -*-
__author__ = 'Rocky'
import urllib2, time, datetime
from lxml import etree
import sqlite3, time
import proxy
class getProxy():
    """Crawl XiciDaili proxy listings and keep the working proxies in SQLite."""

    # Schema shared by every method that opens the database.
    _CREATE_TB = '''
    CREATE TABLE IF NOT EXISTS PROXY
    (DATE TEXT,
    IP TEXT,
    PORT TEXT
    );
    '''

    def __init__(self):
        """Set the request header, the database file name and today's date."""
        self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.header = {"User-Agent": self.user_agent}
        self.dbname = "proxy.db"
        self.now = time.strftime("%Y-%m-%d")

    def getContent(self, num):
        """Fetch listing page ``num``, test each proxy on it, store live ones.

        An HTTP error status is printed and the page is skipped.  A URLError
        is printed and the request is retried once after 120 seconds; an
        error from the retry is not caught.  The original handler called
        ``resp.close()`` before ``resp`` was bound (UnboundLocalError) and
        never parsed the response obtained by the retry.
        """
        # Domestic high-anonymity proxy listing.
        nn_url = "http://www.xicidaili.com/nn/" + str(num)
        req = urllib2.Request(nn_url, headers=self.header)
        print("Open URL %s" % nn_url)
        try:
            resp = urllib2.urlopen(req, timeout=10)
        except urllib2.HTTPError as e:
            # HTTPError is a URLError subclass, so it must be handled first.
            print(e.code)
            return
        except urllib2.URLError as e:
            print(e.reason)
            time.sleep(120)
            resp = urllib2.urlopen(req, timeout=10)

        try:
            content = resp.read()
        finally:
            resp.close()

        et = etree.HTML(content)
        # The page marks alternating rows with class "" and class "odd", so
        # the two groups are queried separately (even rows first, then odd).
        rows = et.xpath('//tr[@class=""]') + et.xpath('//tr[@class="odd"]')
        for row in rows:
            cells = row.xpath("./td/text()")[:2]
            if len(cells) < 2:
                # Not a proxy row; indexing it would raise IndexError.
                continue
            ip, port = cells
            print("IP:%s\tPort:%s" % (ip, port))
            if self.isAlive(ip, port):
                self.insert_db(self.now, ip, port)

    def insert_db(self,date,ip,port):
        """Append one (date, ip, port) record to the PROXY table.

        If the database cannot be opened a message is printed and nothing is
        written.  Values are bound as SQL parameters rather than being
        interpolated into the statement text.
        """
        try:
            conn = sqlite3.connect(self.dbname)
        except sqlite3.Error:
            # The original "database%" format string raised ValueError here.
            print("Error to open database %s" % self.dbname)
            return
        try:
            conn.execute(self._CREATE_TB)
            conn.execute(
                "INSERT INTO PROXY (DATE,IP,PORT) VALUES (?,?,?);",
                (date, ip, port),
            )
            conn.commit()
        finally:
            conn.close()

    def loop(self,page=5):
        """Crawl listing pages 1 through ``page`` inclusive.

        The original ``range(1, page)`` stopped one page short of ``page``.
        """
        for i in range(1, page + 1):
            self.getContent(i)

    def isAlive(self,ip,port):
        """Return True when the test page answers 200 through ``ip:port``.

        Any error or timeout while connecting counts as "not alive".
        """
        # Named proxy_map so it does not shadow the imported ``proxy`` module.
        proxy_map = {'http': ip + ':' + port}
        print(proxy_map)

        # Use a private opener.  Installing it globally with
        # urllib2.install_opener sent every later urlopen call, including
        # the next listing-page download, through the last proxy tested.
        opener = urllib2.build_opener(urllib2.ProxyHandler(proxy_map))
        test_url = "http://helloyesyes.iteye.com"
        req = urllib2.Request(test_url, headers=self.header)
        try:
            # Proxies that do not answer within 2 seconds count as dead.
            resp = opener.open(req, timeout=2)
        except Exception:
            print("except Not work")
            return False

        try:
            alive = resp.code == 200
        finally:
            resp.close()

        if alive:
            print("work")
        else:
            print("not work")
        return alive

    def check_db_pool(self):
        """Re-test every stored proxy and delete the ones that no longer work.

        Rows are read completely before any delete is issued, and a dead
        proxy is removed by IP and port together, so other ports recorded
        for the same IP are kept.  An empty or missing table is not an error.
        """
        conn = sqlite3.connect(self.dbname)
        try:
            conn.execute(self._CREATE_TB)
            stored = conn.execute("select IP,PORT from PROXY;").fetchall()
            for ip, port in stored:
                if not self.isAlive(ip, port):
                    # The proxy is dead: remove its record from the database.
                    conn.execute(
                        "delete from PROXY where IP=? and PORT=?",
                        (ip, port),
                    )
            conn.commit()
        finally:
            conn.close()

if __name__ == "__main__":
    # Indentation restored: as pasted, the body sat at column 0 and the
    # script could not be parsed.
    now = datetime.datetime.now()
    print("Start at %s" % now)
    obj = getProxy()
    obj.loop(3)
    # Re-test the stored proxies after crawling.
    obj.check_db_pool()

李魔佛回复 anbency

好的，我尽量试试。话说，你有github吗？上面可以协助调试代码

anbency 回复李魔佛

github调试代码，不太会用，他不是只是管理代码的么

anbency 回复李魔佛

https://github.com/anbency/spider.git

李魔佛回复 anbency

你的main.py代码里：

nn_url = "http://www.xicidaili.com/nn/" + str(num)
#proxy.create_proxy()
#国内高匿
req = urllib2.Request(nn_url, headers=self.header)
print "Open URL %s" % nn_url
try:
resp = urllib2.urlopen(req, timeout=10)
except urllib2.HTTPError, e:
print e.code
except urllib2.URLError,e:
print e.reason
resp.close()
time.sleep(120)
resp = urllib2.urlopen(req, timeout=10)
else:
content = resp.read()
resp.close()
et = etree.HTML(content)
result_even = et.xpath('//tr[@class=""]')
result_odd = et.xpath('//tr[@class="odd"]')

这里逻辑有问题，else语句上面没有if，还有有2个except的判断。

anbency

这个写法应该是可以的吧
我的本意是在出现URLError的时候，等待一段时间再try一次urlopen

摘自网络：
python中try/except/else/finally语句的完整格式如下所示：
try:
Normal execution block
except A:
Exception A handle
except B:
Exception B handle
except:
Other exception handle
else:
if no exception,get here
finally:
print("finally")

anbency 回复李魔佛

想到一个地方可能有问题，你在isAlive中安装了代理服务器，这个应该是全局的。那如果在爬取xici第2页之前，你安装的那个代理可能无法访问xici的站点，就会导致后面的页面无法connect

anbency 回复 anbency

想请教下rocky兄，
第1点：设置完代理如何确认我是用代理去做的urlopen呢，也就是代理是否设置成功？
第2点，设置完代理后，如果取消代理，仍然用我原本的ip去访问
不知道rocky兄是否对此有研究？

李魔佛回复 anbency

proxy={'http':ip+':'+port}
print proxy

#使用这个方式是全局方法。
proxy_support=urllib2.ProxyHandler(proxy)
opener=urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
#使用代理访问腾讯官网，进行验证代理是否有效
test_url="http://www.qq.com"
req=urllib2.Request(test_url,headers=self.header)
try:

我上面说的是全局的，不过每次重新设置后，都会使用新的代理。
代码里的逻辑是先把全部代理爬取下来，然后再逐个验证。并不是抓一个验证一个。

李魔佛回复 anbency

好，我在本地验证一下，我以前没这么用过。

李魔佛回复 anbency

你代码里面设置了从第5页开始爬

def loop(self,page=5):
for i in range(5,10):
self.getContent(i)

李魔佛

还有一个错误：
Traceback (most recent call last):
File "main.py", line 128, in <module>
obj.loop(3)
File "main.py", line 74, in loop
self.getContent(i)
File "main.py", line 27, in getContent
resp.close()
UnboundLocalError: local variable 'resp' referenced before assignment

anbency 回复李魔佛

抱歉给你带来困恼，这个我其实是debug的时候改了下，想看看是否抓后面的页面会ok
但这个不影响分析我的问题

UnboundLocalError: local variable 'resp' referenced before assignment
这个错误就是我遇到的
也就是第一个页面是用本地的ip去抓取xici，这个时候是ok的，但当抓取第2个页面时，因为全局已经装上了代理，所以导致抓取失败
我改了下，不用全局的，而是使用request.setproxy()去关联每个request，这样isAlive里面对http://www.qq.com的request不会影响我对xici的request

李魔佛回复 anbency

是的，你代码是抓取一页后就验证是否alive，这时你的代理已经生效了。
所以我项目里是全部爬完后再统一验证，可以避免这个问题。

要回复文章请先登录或注册

python 爬虫获取XiciDaili代理IP

20 个评论

发起人

推荐内容

python 爬虫获取XiciDaili代理IP

20 个评论

发起人

推荐内容

相关问题