python异步aiohttp爬虫 - 异步爬取链家数据

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time

loction_info = ''' 1→杭州
2→武汉
3→北京
按ENTER确认:'''
loction_select = input(loction_info)
loction_dic = {'1': 'hz',
'2': 'wh',
'3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(loction_dic[loction_select])
down = input('请输入价格下限(万):')
up = input('请输入价格上限(万):')

inter_list = [(int(down), int(up))]


def half_inter(inter):
lower = inter[0]
upper = inter[1]
delta = int((upper - lower) / 2)
inter_list.remove(inter)
print('已经缩小价格区间', inter)
inter_list.append((lower, lower + delta))
inter_list.append((lower + delta, upper))


pagenum = {}


def get_num(inter):
url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
r = requests.get(url).text
print(r)
num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
pagenum[(inter[0], inter[1])] = num
return num


totalnum = get_num(inter_list[0])

judge = True
while judge:
a = [get_num(x) > 3000 for x in inter_list]
if True in a:
judge = True
else:
judge = False
for i in inter_list:
if get_num(i) > 3000:
half_inter(i)
print('价格区间缩小完毕!')

url_lst = []
url_lst_failed = []
url_lst_successed = []
url_lst_duplicated = []

for i in inter_list:
totalpage = math.ceil(pagenum[i] / 30)
for j in range(1, totalpage + 1):
url = city_url + 'pg{}bp{}ep{}/'.format(j, i[0], i[1])
url_lst.append(url)
print('url列表获取完毕!')

info_lst = []


async def get_info(url):
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=5) as resp:
if resp.status != 200:
url_lst_failed.append(url)
else:
url_lst_successed.append(url)
r = await resp.text()
nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
# print('-------------------------------------------------------------')
# print('开始抓取第{}个页面的数据,共计{}个页面'.format(url_lst.index(url),len(url_lst)))
# print('开始抓取第{}个页面的数据,共计{}个页面'.format(url_lst.index(url), len(url_lst)))
# print('开始抓取第{}个页面的数据,共计{}个页面'.format(url_lst.index(url), len(url_lst)))
# print('-------------------------------------------------------------')
info_dic = {}
index = 1
print('开始抓取{}'.format(resp.url))
print('开始抓取{}'.format(resp.url))
print('开始抓取{}'.format(resp.url))
for node in nodelist:
try:
info_dic['title'] = node.xpath(".//div[@class='title']/a/text()")[0]
except:
info_dic['title'] = '/'
try:
info_dic['href'] = node.xpath(".//div[@class='title']/a/@href")[0]
except:
info_dic['href'] = '/'
try:
info_dic['xiaoqu'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[0]
except:
info_dic['xiaoqu'] = '/'
try:
info_dic['huxing'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[1]
except:
info_dic['huxing'] = '/'
try:
info_dic['area'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[2]
except:
info_dic['area'] = '/'
try:
info_dic['chaoxiang'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[3]
except:
info_dic['chaoxiang'] = '/'
try:
info_dic['zhuangxiu'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[4]
except:
info_dic['zhuangxiu'] = '/'
try:
info_dic['dianti'] = \
node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[5]
except:
info_dic['dianti'] = '/'
try:
info_dic['louceng'] = re.findall('\((.*)\)', node.xpath(".//div[@class='positionInfo']/text()")[0])
except:
info_dic['louceng'] = '/'
try:
info_dic['nianxian'] = re.findall('\)(.*?)年', node.xpath(".//div[@class='positionInfo']/text()")[0])
except:
info_dic['nianxian'] = '/'
try:
info_dic['guanzhu'] = ''.join(re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[
0].replace(' ', '').split('/')[0]))
except:
info_dic['guanzhu'] = '/'
try:
info_dic['daikan'] = ''.join(re.findall('[0-9]',
node.xpath(".//div[@class='followInfo']/text()")[0].replace(
' ', '').split('/')[1]))
except:
info_dic['daikan'] = '/'
try:
info_dic['fabu'] = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[
2]
except:
info_dic['fabu'] = '/'
try:
info_dic['totalprice'] = node.xpath(".//div[@class='totalPrice']/span/text()")[0]
except:
info_dic['totalprice'] = '/'
try:
info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('单价', '')
except:
info_dic['unitprice'] = '/'
if True in [info_dic['href'] in dic.values() for dic in info_lst]:
url_lst_duplicated.append(info_dic)
else:
info_lst.append(info_dic)
print('第{}条: {}→房屋信息抓取完毕!'.format(index, info_dic['title']))
index += 1
info_dic = {}


start = time.time()

# 首次抓取url_lst中的信息,部分url没有对其发起请求,不知道为什么
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

# 将没有发起请求的url放入一个列表,对其进行循环抓取,直到所有url都被发起请求
url_lst_unrequested = []
for url in url_lst:
if url not in url_lst_successed or url_lst_failed:
url_lst_unrequested.append(url)
while len(url_lst_unrequested) > 0:
tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
loop.run_until_complete(asyncio.wait(tasks_unrequested))
url_lst_unrequested = []
for url in url_lst:
if url not in url_lst_successed:
url_lst_unrequested.append(url)
end = time.time()
print('当前价格区间段内共有{}套二手房源\(包含{}条重复房源\),实际获得{}条房源信息。'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('总共耗时{}秒'.format(end - start))

df = pandas.DataFrame(info_lst)
df.to_csv("ljwh.csv", encoding='gbk')

0 个评论

要回复文章请先登录注册