requests直接post图片文件

李魔佛 发表了文章 • 0 个评论 • 100 次浏览 • 2019-05-17 16:32 • 来自相关话题

代码如下:
file_path = r'9927_15562445086485238.png'
# Read the image as raw bytes. The context manager closes the file handle
# instead of leaking it the way a bare open(...).read() does.
with open(file_path, 'rb') as image_file:
    file = image_file.read()
# Bytes passed as `data` are sent as the request body as-is
# (no multipart/form-data wrapping). `code_url` is the upload endpoint and
# must be defined by the caller.
r = requests.post(url=code_url, data=file)
print(r.text)
代码如下:
    file_path = r'9927_15562445086485238.png'
    # Read the image as raw bytes. The context manager closes the file handle
    # instead of leaking it the way a bare open(...).read() does.
    with open(file_path, 'rb') as image_file:
        file = image_file.read()
    # Bytes passed as `data` are sent as the request body as-is
    # (no multipart/form-data wrapping). `code_url` is the upload endpoint and
    # must be defined by the caller.
    r = requests.post(url=code_url, data=file)
    print(r.text)

正则表达式替换中文换行符【python】

李魔佛 发表了文章 • 0 个评论 • 86 次浏览 • 2019-05-13 11:02 • 来自相关话题

js里面的内容有中文的换行符。
使用正则表达式替换换行符。(也可以替换为任意字符)
js=re.sub('\r\n','',js)
完毕。
js里面的内容有中文的换行符。
使用正则表达式替换换行符。(也可以替换为任意字符)
js=re.sub('\r\n','',js)

完毕。

request header显示Provisional headers are shown

李魔佛 发表了文章 • 0 个评论 • 78 次浏览 • 2019-05-13 10:07 • 来自相关话题

出现这个情况,一般是因为装了一些插件,比如屏蔽广告的插件 ad block导致的。
把插件卸载了问题就解决了。
出现这个情况,一般是因为装了一些插件,比如屏蔽广告的插件 ad block导致的。
把插件卸载了问题就解决了。

异步爬虫aiohttp post提交数据

李魔佛 发表了文章 • 0 个评论 • 140 次浏览 • 2019-05-08 16:40 • 来自相关话题

import aiohttp
import asyncio

page = 30

post_data = {
'page': 1,
'pageSize': 10,
'keyWord': '',
'dpIds': '',
}

# Request headers captured from a browser session against
# credit.chaozhou.gov.cn and replayed on every POST.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    # "Content-Length" is deliberately not set: the captured value (34) only
    # matches the form body for single-digit page numbers, so it is left to
    # the HTTP client to compute.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    # NOTE(review): session cookie copied from one browser session;
    # presumably it has to be refreshed before reuse - confirm.
    "Cookie": "secure; JSESSIONID=8NGWetn7NWF7Hb-SSkrbbzGDbYQzmNM_gjKj8wql4PXn2uc7ruv0!-96282387; __jsluid=72f938f1aa890b0ab98d726eb9d7d36f; Hm_lvt_606ad402d71f074871f1daa788ba943d=1557302782; Hm_lpvt_606ad402d71f074871f1daa788ba943d=1557302788",
    "Host": "credit.chaozhou.gov.cn",
    "Origin": "http://credit.chaozhou.gov.cn",
    # NOTE(review): the original Referer URL was truncated when the post was
    # published (the literal was left unterminated); the site root is used as
    # a stand-in - confirm the real page URL.
    "Referer": "http://credit.chaozhou.gov.cn/",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

result=[]


async def fetch(session, url, data):
    """POST the form ``data`` to ``url`` and return the decoded JSON body.

    The module-level ``headers`` dict is sent with every request.
    """
    pending = session.post(url=url, data=data, headers=headers)
    async with pending as response:
        payload = await response.json()
    return payload

async def parse(html):
    """Append every record under ``html['newtxzcfList']`` to ``result``.

    Does nothing when the key is missing or its value is ``None``.
    """
    records = html.get('newtxzcfList')
    if records is not None:
        result.extend(records)

async def downlod(page):
    """Fetch one result page and pass the decoded JSON on to ``parse``.

    ``page`` replaces the ``'page'`` field in a copy of ``post_data``; a
    separate ``aiohttp.ClientSession`` is opened for this single request.
    """
    url = 'http://credit.chaozhou.gov.cn/tfieldTypeActionJson!initXzcfListnew.do'
    form = dict(post_data, page=page)
    async with aiohttp.ClientSession() as session:
        await parse(await fetch(session, url, form))

loop = asyncio.get_event_loop()
# One task per page; pages 1 .. page-1 are downloaded concurrently.
tasks = [asyncio.ensure_future(downlod(i)) for i in range(1, page)]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)

# Print the 'cfXdrMc' field of every collected record, then the total.
for i in result:
    print(i.get('cfXdrMc'))
count = len(result)
print(f'total {count}')
import aiohttp
import asyncio

page = 30

post_data = {
'page': 1,
'pageSize': 10,
'keyWord': '',
'dpIds': '',
}

# Request headers captured from a browser session against
# credit.chaozhou.gov.cn and replayed on every POST.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    # "Content-Length" is deliberately not set: the captured value (34) only
    # matches the form body for single-digit page numbers, so it is left to
    # the HTTP client to compute.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    # NOTE(review): session cookie copied from one browser session;
    # presumably it has to be refreshed before reuse - confirm.
    "Cookie": "secure; JSESSIONID=8NGWetn7NWF7Hb-SSkrbbzGDbYQzmNM_gjKj8wql4PXn2uc7ruv0!-96282387; __jsluid=72f938f1aa890b0ab98d726eb9d7d36f; Hm_lvt_606ad402d71f074871f1daa788ba943d=1557302782; Hm_lpvt_606ad402d71f074871f1daa788ba943d=1557302788",
    "Host": "credit.chaozhou.gov.cn",
    "Origin": "http://credit.chaozhou.gov.cn",
    # NOTE(review): the original Referer URL was truncated when the post was
    # published (the literal was left unterminated); the site root is used as
    # a stand-in - confirm the real page URL.
    "Referer": "http://credit.chaozhou.gov.cn/",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

result=[]


async def fetch(session, url, data):
    """POST the form ``data`` to ``url`` and return the decoded JSON body.

    The module-level ``headers`` dict is sent with every request.
    """
    pending = session.post(url=url, data=data, headers=headers)
    async with pending as response:
        payload = await response.json()
    return payload

async def parse(html):
    """Append every record under ``html['newtxzcfList']`` to ``result``.

    Does nothing when the key is missing or its value is ``None``.
    """
    records = html.get('newtxzcfList')
    if records is not None:
        result.extend(records)

async def downlod(page):
    """Fetch one result page and pass the decoded JSON on to ``parse``.

    ``page`` replaces the ``'page'`` field in a copy of ``post_data``; a
    separate ``aiohttp.ClientSession`` is opened for this single request.
    """
    url = 'http://credit.chaozhou.gov.cn/tfieldTypeActionJson!initXzcfListnew.do'
    form = dict(post_data, page=page)
    async with aiohttp.ClientSession() as session:
        await parse(await fetch(session, url, form))

loop = asyncio.get_event_loop()
# One task per page; pages 1 .. page-1 are downloaded concurrently.
tasks = [asyncio.ensure_future(downlod(i)) for i in range(1, page)]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)

# Print the 'cfXdrMc' field of every collected record, then the total.
for i in result:
    print(i.get('cfXdrMc'))
count = len(result)
print(f'total {count}')

python异步aiohttp爬虫 - 异步爬取链家数据

李魔佛 发表了文章 • 0 个评论 • 125 次浏览 • 2019-05-08 15:52 • 来自相关话题

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time

loction_info = ''' 1→杭州
2→武汉
3→北京
按ENTER确认:'''
loction_select = input(loction_info)
loction_dic = {'1': 'hz',
'2': 'wh',
'3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(loction_dic[loction_select])
down = input('请输入价格下限(万):')
up = input('请输入价格上限(万):')

inter_list = [(int(down), int(up))]


def half_inter(inter):
    """Replace ``inter`` in ``inter_list`` with its lower and upper halves.

    ``inter`` is a ``(lower, upper)`` pair. The split point is ``lower`` plus
    half the width truncated to an int, and it is shared by both halves.
    """
    lower, upper = inter[0], inter[1]
    midpoint = lower + int((upper - lower) / 2)
    inter_list.remove(inter)
    print('已经缩小价格区间', inter)
    inter_list.extend([(lower, midpoint), (midpoint, upper)])


pagenum = {}


def get_num(inter):
    """Return how many listings the site reports for one price band.

    Args:
        inter: ``(lower, upper)`` price bounds; they are formatted into the
            ``bp{lower}ep{upper}/`` path segment appended to ``city_url``.

    Returns:
        The listing count as an int. The count is also stored in ``pagenum``
        under the key ``(lower, upper)``.

    Raises:
        ValueError: if the page has no result counter, or the counter text
            is not a number.
    """
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    # A timeout keeps one stalled connection from hanging the whole script.
    r = requests.get(url, timeout=10).text
    counters = etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")
    if not counters:
        # Fail with a message that names the URL instead of a bare IndexError.
        raise ValueError('no listing counter found at {}'.format(url))
    num = int(counters[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num


totalnum = get_num(inter_list[0])

# Keep halving every price band that still reports more than 3000 listings.
# NOTE(review): 3000 presumably mirrors the site's cap on browsable results
# (the script later assumes 30 listings per page) - confirm.
judge = True
while judge:
    # get_num() issues one HTTP request per band and records the count in
    # pagenum, so each band is queried exactly once per pass. Bands narrower
    # than 2 cannot be split any further and are left alone, which keeps the
    # loop from spinning forever on them.
    oversized = [band for band in inter_list
                 if get_num(band) > 3000 and band[1] - band[0] > 1]
    judge = bool(oversized)
    # Split from the snapshot: half_inter() mutates inter_list, so iterating
    # inter_list itself here would skip entries.
    for band in oversized:
        half_inter(band)
print('价格区间缩小完毕!')

# Bookkeeping lists shared with get_info().
url_lst = []
url_lst_failed = []
url_lst_successed = []
url_lst_duplicated = []

# One URL per result page of every price band, at 30 listings per page.
for band in inter_list:
    totalpage = math.ceil(pagenum[band] / 30)
    url_lst.extend(
        city_url + 'pg{}bp{}ep{}/'.format(page_no, band[0], band[1])
        for page_no in range(1, totalpage + 1)
    )
print('url列表获取完毕!')

info_lst = []


def _first_or_slash(values):
    """Return the first item of ``values``, or '/' when it is empty."""
    return values[0] if values else '/'


def _item_or_slash(parts, position):
    """Return ``parts[position]``, or '/' when ``parts`` is too short."""
    return parts[position] if position < len(parts) else '/'


def _parse_listing(node):
    """Build the record dict for one listing ``<li>`` node.

    Every field that cannot be located in the node is set to '/'.
    """
    info_dic = {
        'title': _first_or_slash(node.xpath(".//div[@class='title']/a/text()")),
        'href': _first_or_slash(node.xpath(".//div[@class='title']/a/@href")),
    }

    # houseInfo is a single '|'-separated line: evaluate it once and pick
    # the fields by position.
    house_nodes = node.xpath(".//div[@class='houseInfo']")
    if house_nodes:
        house_parts = house_nodes[0].xpath('string(.)').replace(' ', '').split('|')
    else:
        house_parts = []
    house_keys = ('xiaoqu', 'huxing', 'area', 'chaoxiang', 'zhuangxiu', 'dianti')
    for position, key in enumerate(house_keys):
        info_dic[key] = _item_or_slash(house_parts, position)

    # Both values are re.findall() results, i.e. lists (possibly empty).
    position_text = node.xpath(".//div[@class='positionInfo']/text()")
    if position_text:
        info_dic['louceng'] = re.findall(r'\((.*)\)', position_text[0])
        info_dic['nianxian'] = re.findall(r'\)(.*?)年', position_text[0])
    else:
        info_dic['louceng'] = '/'
        info_dic['nianxian'] = '/'

    # followInfo is '/'-separated; only the digits of the first two fields
    # are kept, the third field is stored as-is.
    follow_text = node.xpath(".//div[@class='followInfo']/text()")
    follow_parts = follow_text[0].replace(' ', '').split('/') if follow_text else []
    for position, key in enumerate(('guanzhu', 'daikan')):
        if position < len(follow_parts):
            info_dic[key] = ''.join(re.findall('[0-9]', follow_parts[position]))
        else:
            info_dic[key] = '/'
    info_dic['fabu'] = _item_or_slash(follow_parts, 2)

    info_dic['totalprice'] = _first_or_slash(
        node.xpath(".//div[@class='totalPrice']/span/text()"))
    unit_price = node.xpath(".//div[@class='unitPrice']/span/text()")
    info_dic['unitprice'] = unit_price[0].replace('单价', '') if unit_price else '/'
    return info_dic


async def get_info(url):
    """Download one result page and collect its listings.

    On success ``url`` is appended to ``url_lst_successed`` and every listing
    goes to ``info_lst``, or to ``url_lst_duplicated`` when its href has
    already been collected. A non-200 status, a client error or the
    5-second timeout appends ``url`` to ``url_lst_failed`` instead, and
    nothing is parsed.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=5) as resp:
                if resp.status != 200:
                    url_lst_failed.append(url)
                    return
                r = await resp.text()
                page_url = resp.url
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Record the failure here; otherwise the exception would only be
        # stored on the task object and the page would silently go missing.
        url_lst_failed.append(url)
        return
    # Mark the page as fetched only once its body has actually been read.
    url_lst_successed.append(url)

    nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
    print('开始抓取{}'.format(page_url))

    # There is no await below, so info_lst cannot change underneath this loop
    # and the set of known hrefs only has to be built once per page.
    seen_hrefs = {dic.get('href') for dic in info_lst}
    for index, node in enumerate(nodelist, start=1):
        info_dic = _parse_listing(node)
        if info_dic['href'] in seen_hrefs:
            url_lst_duplicated.append(info_dic)
        else:
            seen_hrefs.add(info_dic['href'])
            info_lst.append(info_dic)
        print('第{}条: {}→房屋信息抓取完毕!'.format(index, info_dic['title']))


start = time.time()

# First pass: request every page concurrently. asyncio.wait() does not
# re-raise exceptions from the tasks (they stay stored on each task), so a
# page whose request blew up simply never shows up in url_lst_successed.
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
if tasks:  # asyncio.wait() raises ValueError on an empty collection
    loop.run_until_complete(asyncio.wait(tasks))

# Retry the pages that have not succeeded yet. The number of rounds is
# capped so that a permanently failing page cannot keep the script (and the
# requests against the site) running forever.
max_retry_rounds = 10
retry_round = 0
fetched = set(url_lst_successed)
url_lst_unrequested = [url for url in url_lst if url not in fetched]
while url_lst_unrequested and retry_round < max_retry_rounds:
    retry_round += 1
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    fetched = set(url_lst_successed)
    url_lst_unrequested = [url for url in url_lst if url not in fetched]
if url_lst_unrequested:
    print('仍有{}个页面抓取失败,已停止重试。'.format(len(url_lst_unrequested)))
end = time.time()
print('当前价格区间段内共有{}套二手房源(包含{}条重复房源),实际获得{}条房源信息。'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('总共耗时{}秒'.format(end - start))

df = pandas.DataFrame(info_lst)
df.to_csv("ljwh.csv", encoding='gbk')
import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time

loction_info = ''' 1→杭州
2→武汉
3→北京
按ENTER确认:'''
loction_select = input(loction_info)
loction_dic = {'1': 'hz',
'2': 'wh',
'3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(loction_dic[loction_select])
down = input('请输入价格下限(万):')
up = input('请输入价格上限(万):')

inter_list = [(int(down), int(up))]


def half_inter(inter):
    """Replace ``inter`` in ``inter_list`` with its lower and upper halves.

    ``inter`` is a ``(lower, upper)`` pair. The split point is ``lower`` plus
    half the width truncated to an int, and it is shared by both halves.
    """
    lower, upper = inter[0], inter[1]
    midpoint = lower + int((upper - lower) / 2)
    inter_list.remove(inter)
    print('已经缩小价格区间', inter)
    inter_list.extend([(lower, midpoint), (midpoint, upper)])


pagenum = {}


def get_num(inter):
    """Return how many listings the site reports for one price band.

    Args:
        inter: ``(lower, upper)`` price bounds; they are formatted into the
            ``bp{lower}ep{upper}/`` path segment appended to ``city_url``.

    Returns:
        The listing count as an int. The count is also stored in ``pagenum``
        under the key ``(lower, upper)``.

    Raises:
        ValueError: if the page has no result counter, or the counter text
            is not a number.
    """
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    # A timeout keeps one stalled connection from hanging the whole script.
    r = requests.get(url, timeout=10).text
    counters = etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")
    if not counters:
        # Fail with a message that names the URL instead of a bare IndexError.
        raise ValueError('no listing counter found at {}'.format(url))
    num = int(counters[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num


totalnum = get_num(inter_list[0])

# Keep halving every price band that still reports more than 3000 listings.
# NOTE(review): 3000 presumably mirrors the site's cap on browsable results
# (the script later assumes 30 listings per page) - confirm.
judge = True
while judge:
    # get_num() issues one HTTP request per band and records the count in
    # pagenum, so each band is queried exactly once per pass. Bands narrower
    # than 2 cannot be split any further and are left alone, which keeps the
    # loop from spinning forever on them.
    oversized = [band for band in inter_list
                 if get_num(band) > 3000 and band[1] - band[0] > 1]
    judge = bool(oversized)
    # Split from the snapshot: half_inter() mutates inter_list, so iterating
    # inter_list itself here would skip entries.
    for band in oversized:
        half_inter(band)
print('价格区间缩小完毕!')

# Bookkeeping lists shared with get_info().
url_lst = []
url_lst_failed = []
url_lst_successed = []
url_lst_duplicated = []

# One URL per result page of every price band, at 30 listings per page.
for band in inter_list:
    totalpage = math.ceil(pagenum[band] / 30)
    url_lst.extend(
        city_url + 'pg{}bp{}ep{}/'.format(page_no, band[0], band[1])
        for page_no in range(1, totalpage + 1)
    )
print('url列表获取完毕!')

info_lst = []


def _first_or_slash(values):
    """Return the first item of ``values``, or '/' when it is empty."""
    return values[0] if values else '/'


def _item_or_slash(parts, position):
    """Return ``parts[position]``, or '/' when ``parts`` is too short."""
    return parts[position] if position < len(parts) else '/'


def _parse_listing(node):
    """Build the record dict for one listing ``<li>`` node.

    Every field that cannot be located in the node is set to '/'.
    """
    info_dic = {
        'title': _first_or_slash(node.xpath(".//div[@class='title']/a/text()")),
        'href': _first_or_slash(node.xpath(".//div[@class='title']/a/@href")),
    }

    # houseInfo is a single '|'-separated line: evaluate it once and pick
    # the fields by position.
    house_nodes = node.xpath(".//div[@class='houseInfo']")
    if house_nodes:
        house_parts = house_nodes[0].xpath('string(.)').replace(' ', '').split('|')
    else:
        house_parts = []
    house_keys = ('xiaoqu', 'huxing', 'area', 'chaoxiang', 'zhuangxiu', 'dianti')
    for position, key in enumerate(house_keys):
        info_dic[key] = _item_or_slash(house_parts, position)

    # Both values are re.findall() results, i.e. lists (possibly empty).
    position_text = node.xpath(".//div[@class='positionInfo']/text()")
    if position_text:
        info_dic['louceng'] = re.findall(r'\((.*)\)', position_text[0])
        info_dic['nianxian'] = re.findall(r'\)(.*?)年', position_text[0])
    else:
        info_dic['louceng'] = '/'
        info_dic['nianxian'] = '/'

    # followInfo is '/'-separated; only the digits of the first two fields
    # are kept, the third field is stored as-is.
    follow_text = node.xpath(".//div[@class='followInfo']/text()")
    follow_parts = follow_text[0].replace(' ', '').split('/') if follow_text else []
    for position, key in enumerate(('guanzhu', 'daikan')):
        if position < len(follow_parts):
            info_dic[key] = ''.join(re.findall('[0-9]', follow_parts[position]))
        else:
            info_dic[key] = '/'
    info_dic['fabu'] = _item_or_slash(follow_parts, 2)

    info_dic['totalprice'] = _first_or_slash(
        node.xpath(".//div[@class='totalPrice']/span/text()"))
    unit_price = node.xpath(".//div[@class='unitPrice']/span/text()")
    info_dic['unitprice'] = unit_price[0].replace('单价', '') if unit_price else '/'
    return info_dic


async def get_info(url):
    """Download one result page and collect its listings.

    On success ``url`` is appended to ``url_lst_successed`` and every listing
    goes to ``info_lst``, or to ``url_lst_duplicated`` when its href has
    already been collected. A non-200 status, a client error or the
    5-second timeout appends ``url`` to ``url_lst_failed`` instead, and
    nothing is parsed.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=5) as resp:
                if resp.status != 200:
                    url_lst_failed.append(url)
                    return
                r = await resp.text()
                page_url = resp.url
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Record the failure here; otherwise the exception would only be
        # stored on the task object and the page would silently go missing.
        url_lst_failed.append(url)
        return
    # Mark the page as fetched only once its body has actually been read.
    url_lst_successed.append(url)

    nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
    print('开始抓取{}'.format(page_url))

    # There is no await below, so info_lst cannot change underneath this loop
    # and the set of known hrefs only has to be built once per page.
    seen_hrefs = {dic.get('href') for dic in info_lst}
    for index, node in enumerate(nodelist, start=1):
        info_dic = _parse_listing(node)
        if info_dic['href'] in seen_hrefs:
            url_lst_duplicated.append(info_dic)
        else:
            seen_hrefs.add(info_dic['href'])
            info_lst.append(info_dic)
        print('第{}条: {}→房屋信息抓取完毕!'.format(index, info_dic['title']))


start = time.time()

# First pass: request every page concurrently. asyncio.wait() does not
# re-raise exceptions from the tasks (they stay stored on each task), so a
# page whose request blew up simply never shows up in url_lst_successed.
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
if tasks:  # asyncio.wait() raises ValueError on an empty collection
    loop.run_until_complete(asyncio.wait(tasks))

# Retry the pages that have not succeeded yet. The number of rounds is
# capped so that a permanently failing page cannot keep the script (and the
# requests against the site) running forever.
max_retry_rounds = 10
retry_round = 0
fetched = set(url_lst_successed)
url_lst_unrequested = [url for url in url_lst if url not in fetched]
while url_lst_unrequested and retry_round < max_retry_rounds:
    retry_round += 1
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    fetched = set(url_lst_successed)
    url_lst_unrequested = [url for url in url_lst if url not in fetched]
if url_lst_unrequested:
    print('仍有{}个页面抓取失败,已停止重试。'.format(len(url_lst_unrequested)))
end = time.time()
print('当前价格区间段内共有{}套二手房源(包含{}条重复房源),实际获得{}条房源信息。'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('总共耗时{}秒'.format(end - start))

df = pandas.DataFrame(info_lst)
df.to_csv("ljwh.csv", encoding='gbk')

pycharm debug scrapy 报错 twisted.internet.error.ReactorNotRestartable

李魔佛 发表了文章 • 0 个评论 • 272 次浏览 • 2019-04-23 11:35 • 来自相关话题

没发现哪里不妥,以前debug调试scrapy一直没问题。 
后来才发现,
scrapy run的启动文件名不能命名为cmd.py !!!!!
我把scrapy的启动写到cmd.py里面
# Launcher script for debugging the spider from the IDE. Save it under any
# name other than cmd.py: that name shadows the standard-library "cmd"
# module, which the debugger imports.
from scrapy import cmdline
cmdline.execute('scrapy crawl xxxx'.split())
 
然后cmd.py和系统某个调试功能的库重名了。 查看全部
没发现哪里不妥,以前debug调试scrapy一直没问题。 
后来才发现,
scrapy run的启动文件名不能命名为cmd.py !!!!!
我把scrapy的启动写到cmd.py里面
# Launcher script for debugging the spider from the IDE. Save it under any
# name other than cmd.py: that name shadows the standard-library "cmd"
# module, which the debugger imports.
from scrapy import cmdline
cmdline.execute('scrapy crawl xxxx'.split())
 
然后cmd.py和系统某个调试功能的库重名了。

CentOS Zookeeper无法启动:Error contacting service,It is probably not running

李魔佛 发表了文章 • 0 个评论 • 156 次浏览 • 2019-04-09 19:20 • 来自相关话题

启动:
./kafka-server-start.sh -daemon ../config/server.properties
报错:
Error contacting service,It is probably not running
 
关闭重启,杀进程,看端口是否被占用。无果。
后来看了下防火墙,OMG,有一台机子的防火墙没有关闭。
 
手工关闭后问题就解决了。
 
关闭防火墙命令:
systemctl stop firewalld.service #关闭防火墙
systemctl disable firewalld.service #禁止启动防火墙 查看全部
启动:
./kafka-server-start.sh -daemon ../config/server.properties
报错:
Error contacting service,It is probably not running
 
关闭重启,杀进程,看端口是否被占用。无果。
后来看了下防火墙,OMG,有一台机子的防火墙没有关闭。
 
手工关闭后问题就解决了。
 
关闭防火墙命令:
systemctl stop firewalld.service #关闭防火墙
systemctl disable firewalld.service #禁止启动防火墙

【python】pymongo find_one_and_update的用法

李魔佛 发表了文章 • 0 个评论 • 511 次浏览 • 2019-04-04 11:31 • 来自相关话题

原生的mongo语句是这样的:
db.collection.findOneAndUpdate(
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)
转换成python pymongo是这样的:
 
>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)
上面的语句的意思是:
找到 _id 的值为 'userid' 的文档,把该文档中 seq 的值加 1,然后只返回更新后的 seq 字段,不返回 _id 字段
最后返回的数据是这样的:

{'seq': 2}
 
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该操作对单个文档是原子的:查找和修改在服务端一步完成,多个线程(或进程)同时调用时,不会出现两个线程取到同一个旧值再各自修改的情况。 查看全部
原生的mongo语句是这样的:
db.collection.findOneAndUpdate(
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)

转换成python pymongo是这样的:
 
>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)

上面的语句的意思是:
找到 _id 的值为 'userid' 的文档,把该文档中 seq 的值加 1,然后只返回更新后的 seq 字段,不返回 _id 字段
最后返回的数据是这样的:

{'seq': 2}
 
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该操作对单个文档是原子的:查找和修改在服务端一步完成,多个线程(或进程)同时调用时,不会出现两个线程取到同一个旧值再各自修改的情况。

scrapy命令行执行传递多个参数给spider 动态传参

李魔佛 发表了文章 • 0 个评论 • 228 次浏览 • 2019-03-28 11:24 • 来自相关话题

有时候在命令行执行scrapy,比如scrapy crawl spiderXXXX,如果我想要传递一个自定义的参数进去给scrapy,比如我想传递一个爬取的页码数目,我要每次爬取10页。
 
那么需要在spider中定义一个构造函数
 
def __init__(self, page=None, *args, **kwargs):
    """Store the optional ``page`` spider argument on the instance.

    ``page`` is supplied on the command line with ``-a page=...``; all
    remaining arguments are forwarded to the base spider constructor.
    """
    super(Gaode, self).__init__(*args, **kwargs)
    # Values given with -a arrive as strings; convert before doing arithmetic.
    self.page = page


def start_requests(self):
XXXXXX 调用self.page 即可
yield Request(XXXX)
 
然后在启动scrapy的时候赋予参数的值:
 
scrapy crawl spider -a page=10
 
就可以动态传入参数 查看全部
有时候在命令行执行scrapy,比如scrapy crawl spiderXXXX,如果我想要传递一个自定义的参数进去给scrapy,比如我想传递一个爬取的页码数目,我要每次爬取10页。
 
那么需要在spider中定义一个构造函数
 
    def __init__(self, page=None, *args, **kwargs):
        """Store the optional ``page`` spider argument on the instance.

        ``page`` is supplied on the command line with ``-a page=...``; all
        remaining arguments are forwarded to the base spider constructor.
        """
        super(Gaode, self).__init__(*args, **kwargs)
        # Values given with -a arrive as strings; convert before doing arithmetic.
        self.page = page


def start_requests(self):
XXXXXX 调用self.page 即可
yield Request(XXXX)

 
然后在启动scrapy的时候赋予参数的值:
 
scrapy crawl spider -a page=10
 
就可以动态传入参数

学习强国Python自动化代码

李魔佛 发表了文章 • 0 个评论 • 4618 次浏览 • 2019-03-27 17:45 • 来自相关话题

话不多说,爱国爱党爱人民!!! 本代码转载自 GitHub 上其他人的项目,与本人无关。
 
# _*_ coding: utf-8 _*_

from selenium import webdriver
import time

__author__ = 'Silent_Coder'
__date__ = '2019/3/12 22:41'

HOME_PAGE = 'https://www.xuexi.cn/'
VIDEO_LINK = 'https://www.xuexi.cn/a191dbc3067d516c3e2e17e2e08953d6/b87d700beee2c44826a9202c75d18c85.html?pageNumber=39'
LONG_VIDEO_LINK = 'https://www.xuexi.cn/f65dae4a57fe21fcc36f3506d660891c/b2e5aa79be613aed1f01d261c4a2ae17.html'
LONG_VIDEO_LINK2 = 'https://www.xuexi.cn/0040db2a403b0b9303a68b9ae5a4cca0/b2e5aa79be613aed1f01d261c4a2ae17.html'
TEST_VIDEO_LINK = 'https://www.xuexi.cn/8e35a343fca20ee32c79d67e35dfca90/7f9f27c65e84e71e1b7189b7132b4710.html'
SCORES_LINK = 'https://pc.xuexi.cn/points/my-points.html'
LOGIN_LINK = 'https://pc.xuexi.cn/points/login.html'
ARTICLES_LINK = 'https://www.xuexi.cn/d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b.html'

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = webdriver.Chrome(executable_path=r'D:\OneDrive\Python\selenium\chromedriver.exe',options=options)


def login_simulation():
    """Log in by letting the user scan the QR code on the login page.

    Opens LOGIN_LINK in a maximised window, scrolls the page down, waits
    10 seconds for the scan and then loads HOME_PAGE.
    """
    # Alternative (unused): log in by hand once, then inject the token
    # cookie with browser.add_cookie({'name': 'token', 'value': ''}).
    scan_wait_seconds = 10
    scroll_script = "var q=document.documentElement.scrollTop=1000"

    browser.get(LOGIN_LINK)
    browser.maximize_window()
    browser.execute_script(scroll_script)
    time.sleep(scan_wait_seconds)
    browser.get(HOME_PAGE)
    print("模拟登录完毕\n")


def _duration_to_seconds(text):
    """Convert a 'mm:ss' or 'hh:mm:ss' label into a number of seconds."""
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def watch_videos():
    """Play up to seven videos from VIDEO_LINK, then pad the watch time.

    Each video is opened in its own window, started, and left running for
    its displayed duration plus 3 seconds. Afterwards TEST_VIDEO_LINK is
    opened for whatever is left of a 3010-second total.
    """
    browser.get(VIDEO_LINK)
    videos = browser.find_elements_by_xpath("//div[@id='Ck3ln2wlyg3k00']")
    spend_time = 0

    for video in videos[:7]:
        video.click()
        all_handles = browser.window_handles
        # switch_to.window replaces the deprecated switch_to_window.
        browser.switch_to.window(all_handles[-1])
        browser.get(browser.current_url)

        # Start playback.
        browser.find_element_by_xpath("//div[@class='outter']").click()
        # Read the duration label shown by the player.
        video_duration_str = browser.find_element_by_xpath("//span[@class='duration']").get_attribute('innerText')
        video_duration = _duration_to_seconds(video_duration_str)
        # Stay on the page until the video has finished.
        time.sleep(video_duration + 3)
        spend_time += video_duration + 3
        browser.close()
        browser.switch_to.window(all_handles[0])

    # Spend the rest of the time budget on the test video. time.sleep()
    # rejects negative values, so clamp at zero when the videos above
    # already took longer than the budget.
    browser.get(TEST_VIDEO_LINK)
    time.sleep(max(0, 3010 - spend_time))
    print("播放视频完毕\n")


def read_articles():
    """Open up to eight articles from ARTICLES_LINK and scroll through each.

    Every article is opened in its own window, scrolled down and back up in
    100-pixel steps with a 5-second pause per step, kept open for another
    80 seconds and then closed.
    """
    browser.get(ARTICLES_LINK)
    articles = browser.find_elements_by_xpath("//div[@id='Ca4gvo4bwg7400']")
    # Down from 0 to 1900, then back up from 2000 to 100.
    scroll_positions = list(range(0, 2000, 100)) + list(range(2000, 0, -100))

    for article in articles[:8]:
        article.click()
        all_handles = browser.window_handles
        # switch_to.window replaces the deprecated switch_to_window.
        browser.switch_to.window(all_handles[-1])
        browser.get(browser.current_url)
        for offset in scroll_positions:
            js_code = "var q=document.documentElement.scrollTop=" + str(offset)
            browser.execute_script(js_code)
            time.sleep(5)
        time.sleep(80)
        browser.close()
        browser.switch_to.window(all_handles[0])
    print("阅读文章完毕\n")


def get_scores():
    """Open SCORES_LINK and print the total score and today's score."""
    browser.get(SCORES_LINK)
    time.sleep(2)
    total_span = browser.find_element_by_xpath("//*[@id='app']/div/div[2]/div/div[2]/div[2]/span[1]")
    gross_score = total_span.get_attribute('innerText')
    today_span = browser.find_element_by_xpath("//span[@class='my-points-points']")
    today_score = today_span.get_attribute('innerText')
    report = (
        "当前总积分:" + str(gross_score),
        "今日积分:" + str(today_score),
        "获取积分完毕,即将退出\n",
    )
    for line in report:
        print(line)


if __name__ == '__main__':
    try:
        login_simulation()  # log in
        read_articles()     # read articles
        watch_videos()      # watch videos
        get_scores()        # print the scores
    finally:
        # Always shut the browser down, even when one of the steps raises.
        browser.quit()
话不多说,爱国爱党爱人民!!! 本代码转载自 GitHub 上其他人的项目,与本人无关。
 
# _*_ coding: utf-8 _*_

from selenium import webdriver
import time

__author__ = 'Silent_Coder'
__date__ = '2019/3/12 22:41'

HOME_PAGE = 'https://www.xuexi.cn/'
VIDEO_LINK = 'https://www.xuexi.cn/a191dbc3067d516c3e2e17e2e08953d6/b87d700beee2c44826a9202c75d18c85.html?pageNumber=39'
LONG_VIDEO_LINK = 'https://www.xuexi.cn/f65dae4a57fe21fcc36f3506d660891c/b2e5aa79be613aed1f01d261c4a2ae17.html'
LONG_VIDEO_LINK2 = 'https://www.xuexi.cn/0040db2a403b0b9303a68b9ae5a4cca0/b2e5aa79be613aed1f01d261c4a2ae17.html'
TEST_VIDEO_LINK = 'https://www.xuexi.cn/8e35a343fca20ee32c79d67e35dfca90/7f9f27c65e84e71e1b7189b7132b4710.html'
SCORES_LINK = 'https://pc.xuexi.cn/points/my-points.html'
LOGIN_LINK = 'https://pc.xuexi.cn/points/login.html'
ARTICLES_LINK = 'https://www.xuexi.cn/d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b.html'

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = webdriver.Chrome(executable_path=r'D:\OneDrive\Python\selenium\chromedriver.exe',options=options)


def login_simulation():
    """Log in by letting the user scan the QR code on the login page.

    Opens LOGIN_LINK in a maximised window, scrolls the page down, waits
    10 seconds for the scan and then loads HOME_PAGE.
    """
    # Alternative (unused): log in by hand once, then inject the token
    # cookie with browser.add_cookie({'name': 'token', 'value': ''}).
    scan_wait_seconds = 10
    scroll_script = "var q=document.documentElement.scrollTop=1000"

    browser.get(LOGIN_LINK)
    browser.maximize_window()
    browser.execute_script(scroll_script)
    time.sleep(scan_wait_seconds)
    browser.get(HOME_PAGE)
    print("模拟登录完毕\n")


def _duration_to_seconds(text):
    """Convert a 'mm:ss' or 'hh:mm:ss' label into a number of seconds."""
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def watch_videos():
    """Play up to seven videos from VIDEO_LINK, then pad the watch time.

    Each video is opened in its own window, started, and left running for
    its displayed duration plus 3 seconds. Afterwards TEST_VIDEO_LINK is
    opened for whatever is left of a 3010-second total.
    """
    browser.get(VIDEO_LINK)
    videos = browser.find_elements_by_xpath("//div[@id='Ck3ln2wlyg3k00']")
    spend_time = 0

    for video in videos[:7]:
        video.click()
        all_handles = browser.window_handles
        # switch_to.window replaces the deprecated switch_to_window.
        browser.switch_to.window(all_handles[-1])
        browser.get(browser.current_url)

        # Start playback.
        browser.find_element_by_xpath("//div[@class='outter']").click()
        # Read the duration label shown by the player.
        video_duration_str = browser.find_element_by_xpath("//span[@class='duration']").get_attribute('innerText')
        video_duration = _duration_to_seconds(video_duration_str)
        # Stay on the page until the video has finished.
        time.sleep(video_duration + 3)
        spend_time += video_duration + 3
        browser.close()
        browser.switch_to.window(all_handles[0])

    # Spend the rest of the time budget on the test video. time.sleep()
    # rejects negative values, so clamp at zero when the videos above
    # already took longer than the budget.
    browser.get(TEST_VIDEO_LINK)
    time.sleep(max(0, 3010 - spend_time))
    print("播放视频完毕\n")


def read_articles():
    """Open up to eight articles from ARTICLES_LINK and scroll through each.

    Every article is opened in its own window, scrolled down and back up in
    100-pixel steps with a 5-second pause per step, kept open for another
    80 seconds and then closed.
    """
    browser.get(ARTICLES_LINK)
    articles = browser.find_elements_by_xpath("//div[@id='Ca4gvo4bwg7400']")
    # Down from 0 to 1900, then back up from 2000 to 100.
    scroll_positions = list(range(0, 2000, 100)) + list(range(2000, 0, -100))

    for article in articles[:8]:
        article.click()
        all_handles = browser.window_handles
        # switch_to.window replaces the deprecated switch_to_window.
        browser.switch_to.window(all_handles[-1])
        browser.get(browser.current_url)
        for offset in scroll_positions:
            js_code = "var q=document.documentElement.scrollTop=" + str(offset)
            browser.execute_script(js_code)
            time.sleep(5)
        time.sleep(80)
        browser.close()
        browser.switch_to.window(all_handles[0])
    print("阅读文章完毕\n")


def get_scores():
    """Open SCORES_LINK and print the total score and today's score."""
    browser.get(SCORES_LINK)
    time.sleep(2)
    total_span = browser.find_element_by_xpath("//*[@id='app']/div/div[2]/div/div[2]/div[2]/span[1]")
    gross_score = total_span.get_attribute('innerText')
    today_span = browser.find_element_by_xpath("//span[@class='my-points-points']")
    today_score = today_span.get_attribute('innerText')
    report = (
        "当前总积分:" + str(gross_score),
        "今日积分:" + str(today_score),
        "获取积分完毕,即将退出\n",
    )
    for line in report:
        print(line)


if __name__ == '__main__':
    try:
        login_simulation()  # log in
        read_articles()     # read articles
        watch_videos()      # watch videos
        get_scores()        # print the scores
    finally:
        # Always shut the browser down, even when one of the steps raises.
        browser.quit()