python

爬虫nike登录流程抓包分析

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3827 次浏览 • 2020-08-15 23:52 • 来自相关话题

<占坑> 敬请期待。

模拟登录网易163失败

贡献

python爬虫 • xiaoai 回复了问题 • 2 人关注 • 2 个回复 • 6214 次浏览 • 2020-06-28 14:25 • 来自相关话题

深圳住房公积金验证码识别破解

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3315 次浏览 • 2020-06-26 14:34 • 来自相关话题

http://gjj.sz.gov.cn/fzgn/zfcq/index.html

比较常规的验证码，使用keras全连接层，cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。查看全部

http://gjj.sz.gov.cn/fzgn/zfcq/index.html

比较常规的验证码，使用keras全连接层，cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。

PyQt5自定义控件

李魔佛发表了文章 • 0 个评论 • 3674 次浏览 • 2020-06-13 23:14 • 来自相关话题

PyQt5包含种类丰富的控件。但能满足所有需求的控件库是不存在的。通常控件库只提供了像按钮、文本控件、滑块等最常用的控件。但如果需要某种特殊的控件，我们只能自己动手来实现。自定义控件需要使用工具库提供的绘图工具，可能有两种方式：在已有的控件上进行拓展或从头开始创建自定义控件。

Burning widget(烧录控件)
这个控件可能会在Nero，K3B或其他CD/DVD烧录软件中见到。

# -*- coding: utf-8 -*-

"""
PyQt5 tutorial

In this example, we create a custom widget.
"""
import sys
from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,
QHBoxLayout, QVBoxLayout)
from PyQt5.QtCore import QObject, Qt, pyqtSignal
from PyQt5.QtGui import QPainter, QFont, QColor, QPen

class Communicate(QObject):
updateBW = pyqtSignal(int)

class BurningWidget(QWidget):
def __init__(self):
super().__init__()

self.initUI()

def initUI(self):

self.setMinimumSize(1, 30)
self.value = 75
self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

def setValue(self, value):

self.value = value

def paintEvent(self, e):

qp = QPainter()
qp.begin(self)
self.drawWidget(qp)
qp.end()

def drawWidget(self, qp):

font = QFont('Serif', 7, QFont.Light)
qp.setFont(font)

size = self.size()
w = size.width()
h = size.height()

step = int(round(w / 10.0))

till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))

if self.value >= 700:

qp.setPen(QColor(255, 255, 255))
qp.setBrush(QColor(255, 255, 184))
qp.drawRect(0, 0, full, h)
qp.setPen(QColor(255, 175, 175))
qp.setBrush(QColor(255, 175, 175))
qp.drawRect(full, 0, till - full, h)

else:

qp.setPen(QColor(255, 255, 255))
qp.setBrush(QColor(255, 255, 184))
qp.drawRect(0, 0, till, h)

pen = QPen(QColor(20, 20, 20), 1,
Qt.SolidLine)

qp.setPen(pen)
qp.setBrush(Qt.NoBrush)
qp.drawRect(0, 0, w - 1, h - 1)

j = 0

for i in range(step, 10 * step, step):
qp.drawLine(i, 0, i, 5)
metrics = qp.fontMetrics()
fw = metrics.width(str(self.num[j]))
qp.drawText(i - fw / 2, h / 2, str(self.num[j]))
j = j + 1

class Example(QWidget):
def __init__(self):
super().__init__()

self.initUI()

def initUI(self):
sld = QSlider(Qt.Horizontal, self)
sld.setFocusPolicy(Qt.NoFocus)
sld.setRange(1, 750)
sld.setValue(75)
sld.setGeometry(30, 40, 150, 30)

self.c = Communicate()
self.wid = BurningWidget()
self.c.updateBW[int].connect(self.wid.setValue)

sld.valueChanged[int].connect(self.changeValue)
hbox = QHBoxLayout()
hbox.addWidget(self.wid)
vbox = QVBoxLayout()
vbox.addStretch(1)
vbox.addLayout(hbox)
self.setLayout(vbox)

self.setGeometry(300, 300, 390, 210)
self.setWindowTitle('Burning widget')
self.show()

def changeValue(self, value):
self.c.updateBW.emit(value)
self.wid.repaint()

if __name__ == '__main__':
app = QApplication(sys.argv)
ex = Example()
sys.exit(app.exec_())
在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。

BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。
class BurningWidget(QWidget):

def __init__(self):
super().__init__()

烧录的控件,它基于QWidget

self.setMinimumSize(1, 30)我们改变了控件的最小大小(高度),默认值为有点小。
font = QFont('Serif', 7, QFont.Light)
qp.setFont(font)我们使用一个比默认要小的字体。
size = self.size()
w = size.width()
h = size.height()

step = int(round(w / 10.0))

till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))
控件采用了动态绘制技术。窗体越大，控件也随之变大；反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸，它根据slider控件计算得出，是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。

实际的绘制分三个步骤。黄色或红黄矩形的绘制，然后是刻度线的绘制，最后是刻度值的绘制。

metrics = qp.fontMetrics()
fw = metrics.width(str(self.num[j]))
qp.drawText(i-fw/2, h/2, str(self.num[j]))我们使用字体度量来绘制文本。我们必须知道文本的宽度,以中心垂直线。
def changeValue(self, value):

self.c.updateBW.emit(value)
self.wid.repaint()当滑块发生移动时，changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号，其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。

查看全部

PyQt5包含种类丰富的控件。但能满足所有需求的控件库是不存在的。通常控件库只提供了像按钮、文本控件、滑块等最常用的控件。但如果需要某种特殊的控件，我们只能自己动手来实现。自定义控件需要使用工具库提供的绘图工具，可能有两种方式：在已有的控件上进行拓展或从头开始创建自定义控件。

Burning widget(烧录控件)
这个控件可能会在Nero，K3B或其他CD/DVD烧录软件中见到。

# -*- coding: utf-8 -*-

 

"""

PyQt5 tutorial

 

In this example, we create a custom widget.

"""

import sys

from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,

                             QHBoxLayout, QVBoxLayout)

from PyQt5.QtCore import QObject, Qt, pyqtSignal

from PyQt5.QtGui import QPainter, QFont, QColor, QPen

 

 

class Communicate(QObject):
    """Holder for the custom signal that carries the slider value."""

    # Emitted with an int; Example connects it to BurningWidget.setValue.
    updateBW = pyqtSignal(int)

 

 

class BurningWidget(QWidget):
    """Capacity gauge similar to the one shown by CD/DVD burning tools.

    The bar is filled in proportion to ``self.value`` on a 0-750 scale.
    Once the value reaches 700 the part of the bar beyond the 700 mark is
    painted in a second colour. Nine tick marks with the labels stored in
    ``self.num`` are drawn on top.
    """

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Set the minimum size, the initial value and the tick labels."""
        self.setMinimumSize(1, 30)
        self.value = 75
        self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

    def setValue(self, value):
        """Store the value to display.

        No repaint is requested here; the caller is expected to trigger it.
        """
        self.value = value

    def paintEvent(self, e):
        """Paint the widget by delegating to :meth:`drawWidget`."""
        qp = QPainter()
        qp.begin(self)
        self.drawWidget(qp)
        qp.end()

    def drawWidget(self, qp):
        """Draw the filled bar, the frame, the tick marks and the labels.

        Args:
            qp: an active ``QPainter`` bound to this widget.
        """
        font = QFont('Serif', 7, QFont.Light)
        qp.setFont(font)

        size = self.size()
        w = size.width()
        h = size.height()

        # Distance between two tick marks (the width is split in ten parts).
        step = int(round(w / 10.0))

        # Pixel position of the current value and of the 700 mark.
        till = int(((w / 750.0) * self.value))
        full = int(((w / 750.0) * 700))

        if self.value >= 700:
            # Normal part up to the 700 mark, then the overflow part.
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, full, h)
            qp.setPen(QColor(255, 175, 175))
            qp.setBrush(QColor(255, 175, 175))
            qp.drawRect(full, 0, till - full, h)
        else:
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, till, h)

        pen = QPen(QColor(20, 20, 20), 1,
                   Qt.SolidLine)

        qp.setPen(pen)
        qp.setBrush(Qt.NoBrush)
        qp.drawRect(0, 0, w - 1, h - 1)

        # The minimum width is 1, so step can round down to 0; range() with
        # a zero step raises ValueError, so there is nothing to tick then.
        if step <= 0:
            return

        metrics = qp.fontMetrics()
        labels = [str(n) for n in self.num]
        for i, label in zip(range(step, 10 * step, step), labels):
            qp.drawLine(i, 0, i, 5)
            fw = metrics.width(label)
            # drawText(x, y, text) needs integer coordinates; recent
            # Python/PyQt5 combinations reject floats with a TypeError.
            # int() keeps the truncation the implicit conversion used to do.
            qp.drawText(int(i - fw / 2), int(h / 2), label)

 

 

class Example(QWidget):
    """Demo window: a slider whose value drives a BurningWidget."""

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the slider and the gauge, wire the signals, show the window."""
        slider = QSlider(Qt.Horizontal, self)
        slider.setFocusPolicy(Qt.NoFocus)
        slider.setRange(1, 750)
        slider.setValue(75)
        slider.setGeometry(30, 40, 150, 30)

        # The slider value reaches the gauge through the custom signal.
        self.c = Communicate()
        self.wid = BurningWidget()
        self.c.updateBW[int].connect(self.wid.setValue)
        slider.valueChanged[int].connect(self.changeValue)

        # The stretch above the row pushes the gauge to the bottom edge.
        gauge_row = QHBoxLayout()
        gauge_row.addWidget(self.wid)
        column = QVBoxLayout()
        column.addStretch(1)
        column.addLayout(gauge_row)
        self.setLayout(column)

        self.setGeometry(300, 300, 390, 210)
        self.setWindowTitle('Burning widget')
        self.show()

    def changeValue(self, value):
        """Forward the new slider value to the gauge and repaint it."""
        self.c.updateBW.emit(value)
        self.wid.repaint()

 

 

if __name__ == '__main__':

    # Create the application object and the demo window (Example shows
    # itself in initUI), then exit with the value returned by app.exec_().
    app = QApplication(sys.argv)

    ex = Example()

    sys.exit(app.exec_())

在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。

BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。

class BurningWidget(QWidget):

  

    def __init__(self):      

        super().__init__()

烧录的控件,它基于QWidget

self.setMinimumSize(1, 30)

我们修改了控件的最小尺寸(高度)，因为默认值有点小。

font = QFont('Serif', 7, QFont.Light)

qp.setFont(font)

我们使用一个比默认要小的字体。

size = self.size()

w = size.width()

h = size.height()

 

step = int(round(w / 10.0))

 

 

till = int(((w / 750.0) * self.value))

full = int(((w / 750.0) * 700))

控件采用了动态绘制技术。窗体越大，控件也随之变大；反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸，它根据slider控件计算得出，是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。

实际的绘制分三个步骤。黄色或红黄矩形的绘制，然后是刻度线的绘制，最后是刻度值的绘制。

metrics = qp.fontMetrics()

fw = metrics.width(str(self.num[j]))

qp.drawText(i-fw/2, h/2, str(self.num[j]))

我们使用字体度量来绘制文本。必须先知道文本的宽度，才能让文本以垂直刻度线为中心水平居中。

def changeValue(self, value):

          

    self.c.updateBW.emit(value)        

    self.wid.repaint()

当滑块发生移动时，changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号，其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。

Windows安装pyminizip

李魔佛发表了文章 • 0 个评论 • 4192 次浏览 • 2020-05-31 19:06 • 来自相关话题

python3直接安装会报错：
pip install pyminizip
电脑需要安装vc的编译库，或者在其他机子上把pyd文件拷贝到程序的当前目录。

为什么我使用splash中间件得到的response.body和splash上访问的html代码不同

贡献

李魔佛回复了问题 • 1 人关注 • 1 个回复 • 4029 次浏览 • 2020-04-29 00:19 • 来自相关话题

pyqt5 QRect在哪个类

李魔佛发表了文章 • 0 个评论 • 2903 次浏览 • 2020-04-24 10:45 • 来自相关话题

最新的版本是在 QtCore里面的
from PyQt5.QtCore import Qt,QRect

最新的版本是在 QtCore里面的

from PyQt5.QtCore import Qt,QRect

请问各位用scrapy和redis方法爬取不到数据的问题（可悬赏），求大佬看下，感激不尽

贡献

python爬虫 • 李魔佛回复了问题 • 2 人关注 • 1 个回复 • 9365 次浏览 • 2020-04-16 22:16 • 来自相关话题

薅“疫情公益”羊毛，黑产恶意爬取各大出版社电子书上万册

python爬虫 • Magiccc 发表了文章 • 0 个评论 • 3509 次浏览 • 2020-02-26 13:17 • 来自相关话题

疫情以来，所有企业都上班延期选择在线复工，在我们居家自我隔离期间，极验观察爬虫却没有消停，反而爬虫行为更加活跃且更胜往常。本周五，我们和无糖信息一起聊聊线上爬虫的“疫情”。

爬虫发送弹幕问题

python爬虫 • naythefirst 发起了问题 • 1 人关注 • 0 个回复 • 4483 次浏览 • 2020-02-26 11:28 • 来自相关话题

requests请求返回的json格式为bytes乱码

贡献

python爬虫 • 李魔佛回复了问题 • 2 人关注 • 1 个回复 • 6030 次浏览 • 2020-02-16 23:35 • 来自相关话题

为什么我这段代码得到的是空列表呢

贡献

python爬虫 • 李魔佛回复了问题 • 2 人关注 • 1 个回复 • 4025 次浏览 • 2020-02-09 12:47 • 来自相关话题

socketio中client的sio wait用法

李魔佛发表了文章 • 1 个评论 • 5323 次浏览 • 2020-01-08 20:30 • 来自相关话题

用于阻塞当前的线程，后面的操作不会进行，直到服务端断开。

import time
import socketio

sio = socketio.Client()
start_timer = None

def send_ping():
global start_timer
start_timer = time.time()
sio.emit('ping_from_client')

@sio.event
def connect():
print('connected to server')
send_ping()

@sio.event
def pong_from_server(data):
global start_timer
latency = time.time() - start_timer
print('latency is {0:.2f} ms'.format(latency * 1000))
sio.sleep(1)
send_ping()

if __name__ == '__main__':
sio.connect('http://localhost:5000')
sio.wait()
print('next')

比如上述代码中，如果调用了sio.wait() , 那么next是不会被打印的。

如果注释掉后，那么next就可以正常被打印。查看全部

用于阻塞当前的线程，后面的操作不会进行，直到服务端断开。

import time

import socketio



sio = socketio.Client()

start_timer = None





def send_ping():
    """Store the current time in ``start_timer`` and emit ``ping_from_client``."""

    global start_timer

    start_timer = time.time()

    sio.emit('ping_from_client')





@sio.event

def connect():
    """Event handler registered through ``sio.event``: log and send the first ping."""

    print('connected to server')

    send_ping()





@sio.event
def pong_from_server(data):
    """Print the time since the last ping, wait one second, then ping again.

    ``data`` is accepted but not used. ``start_timer`` is only read here,
    so no ``global`` declaration is needed.
    """
    elapsed_ms = (time.time() - start_timer) * 1000
    print('latency is {0:.2f} ms'.format(elapsed_ms))
    sio.sleep(1)
    send_ping()





if __name__ == '__main__':

    sio.connect('http://localhost:5000')

    # Per the article, wait() blocks the current thread until the server
    # disconnects, so 'next' is not printed while the connection is alive.
    sio.wait()

    print('next')

比如上述代码中，如果调用了sio.wait() , 那么next是不会被打印的。

如果注释掉后，那么next就可以正常被打印。

jieba.posseg TypeError: cannot unpack non-iterable pair object 词性分析报错

李魔佛发表了文章 • 0 个评论 • 4771 次浏览 • 2019-11-23 10:12 • 来自相关话题

词性标注的例子出现错误 'pair' object is not iterable

例子：import jieba.posseg as pseg
seg_list = pseg.cut("我爱北京天安门")
for word,flag in seg_list:
print(word)
print(flag)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-f105f6980f88> in <module>()
1 import jieba.posseg as pseg
2 seg_list = pseg.cut("我爱北京天安门")
----> 3 for word,flag in seg_list:
4 print(word)
5 print(flag)

TypeError: cannot unpack non-iterable pair object 原因是新版本中seg_list是一个生成器，其中的每个元素是pair对象，不能直接解包，所以只能 for w in seg_list，然后从w中取出word和flag

print(w.word)

print(w.flag)

这样问题就解决了。查看全部

词性标注的例子出现错误 'pair' object is not iterable

例子：

import jieba.posseg as pseg

seg_list = pseg.cut("我爱北京天安门")

for word,flag in seg_list:

    print(word)

    print(flag)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-5-f105f6980f88> in <module>()

      1 import jieba.posseg as pseg

      2 seg_list = pseg.cut("我爱北京天安门")

----> 3 for word,flag in seg_list:

      4     print(word)

      5     print(flag)



TypeError: cannot unpack non-iterable pair object

原因是新版本中seg_list是一个生成器，其中的每个元素是pair对象，不能直接解包成两个变量，所以只能 for w in seg_list

然后从w中取出word和flag属性

print(w.word)

print(w.flag)

这样问题就解决了。

scrapy在settings中定义变量不能包含小写！

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3433 次浏览 • 2019-11-16 16:39 • 来自相关话题

如果变量名包含小写字母，那么你的变量会被过滤掉，在scrapy编码的其他地方都会无法被识别。
比如定义了一个叫 Redis_host = '192.168.1.1'，的值

然后在spider中，如果你调用self.settings.get('Redis_host')
那么返回值是 None。

如果用REDIS_HOST定义，那么就可以正确返回它的值。

如果你一定要用小写，也有其他方法可正常调用。
先导入settings文件
from xxxx import settings # xxxx为项目名

host = settings.Redis_host # 直接导入一个文件的形式来调用是可以的查看全部

如果变量名包含小写字母，那么你的变量会被过滤掉，在scrapy编码的其他地方都会无法被识别。
比如定义了一个叫 Redis_host = '192.168.1.1'，的值

然后在spider中，如果你调用self.settings.get('Redis_host')
那么返回值是 None。

如果用REDIS_HOST定义，那么就可以正确返回它的值。

如果你一定要用小写，也有其他方法可正常调用。
先导入settings文件
from xxxx import settings # xxxx为项目名

host = settings.Redis_host # 直接导入一个文件的形式来调用是可以的

etree.strip_tags的用法

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 4761 次浏览 • 2019-10-24 11:24 • 来自相关话题

直接从官方文档那里拿过来，发现这个函数功能还挺不错的。
它把参数中的标签从源htmlelement中删除，并且把里面的标签文本给合并进来。

举个例子：from lxml.html import etree
from lxml.html import fromstring, HtmlElement

test_html = '''<p><span>hello</span><span>world</span></p>'''
test_element = fromstring(test_html)
etree.strip_tags(test_element,'span') # 清除span标签
etree.tostring(test_element)
因为上述操作直接应用于test_element上的，所以test_element的值已经被修改了。

所以现在test_element 的值是
b'<p>helloworld</p>'

原创文章，转载请注明出处
http://30daydo.com/article/553
查看全部

直接从官方文档那里拿过来，发现这个函数功能还挺不错的。
它把参数中的标签从源htmlelement中删除，并且把里面的标签文本给合并进来。

举个例子：

from lxml.html import etree

from lxml.html import fromstring, HtmlElement



test_html = '''<p><span>hello</span><span>world</span></p>'''

test_element = fromstring(test_html)

etree.strip_tags(test_element,'span') # 清除span标签

etree.tostring(test_element)

因为上述操作直接应用于test_element上的，所以test_element的值已经被修改了。

所以现在test_element 的值是
b'<p>helloworld</p>'

原创文章，转载请注明出处
http://30daydo.com/article/553

mumu模拟器adb无法识别

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 5693 次浏览 • 2019-10-17 08:41 • 来自相关话题

因为端口号被mumu改了。

<Forwarding name="ADB_PORT" proto="1" hostip="127.0.0.1" hostport="7555" guestport="5555"/>

在mumu浏览器里面可以看到这个配置信息。

adb connect 127.0.0.1:7555
然后adb shell 就可以了。

配置文件名是：myandrovm_vbox86.nemu 查看全部

因为端口号被mumu改了。

<Forwarding name="ADB_PORT" proto="1" hostip="127.0.0.1" hostport="7555" guestport="5555"/>

在mumu浏览器里面可以看到这个配置信息。

adb connect 127.0.0.1:7555
然后adb shell 就可以了。

配置文件名是：myandrovm_vbox86.nemu

aiohttp异步下载图片

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 5322 次浏览 • 2019-09-16 17:14 • 来自相关话题

保存图片的时候不能用自带的open函数打开文件，需要用到异步io库 aiofiles来打开url = 'http://xyhz.huizhou.gov.cn/static/js/common/jigsaw/images/{}.jpg'
headers={'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
async def getPage(num):

async with aiohttp.ClientSession() as session:
async with session.get(url.format(num),headers=headers) as resp:
if resp.status==200:
f= await aiofiles.open('{}.jpg'.format(num),mode='wb')
await f.write(await resp.read())
await f.close()

loop = asyncio.get_event_loop()
tasks = [getPage(i) for i in range(5)]
loop.run_until_complete(asyncio.wait(tasks))
原创文章，
转载请注明出处：
http://30daydo.com/article/537
查看全部

保存图片的时候不能用自带的open函数打开文件，需要用到异步io库 aiofiles来打开

url = 'http://xyhz.huizhou.gov.cn/static/js/common/jigsaw/images/{}.jpg'

headers={'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

async def getPage(num):
    """Download image ``num`` and save it as ``<num>.jpg`` in the working directory.

    The request URL is built from the module-level ``url`` template and is
    sent with the module-level ``headers``. Responses whose status is not
    200 are ignored and no file is written for them.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url.format(num), headers=headers) as resp:
            if resp.status != 200:
                return

            # Read the body before opening the file so that a failed
            # download does not leave an empty file behind.
            data = await resp.read()

            # "async with" closes the file even when the write raises; the
            # original only closed it on the success path.
            async with aiofiles.open('{}.jpg'.format(num), mode='wb') as f:
                await f.write(data)



async def main():
    """Download images 0-4 concurrently and wait for all of them."""
    # asyncio.wait() no longer accepts bare coroutine objects (deprecated in
    # Python 3.8, TypeError since 3.11); gather() schedules them itself and
    # also propagates a download error instead of leaving it unretrieved.
    await asyncio.gather(*(getPage(i) for i in range(5)))


# asyncio.run() creates and closes the event loop; calling
# asyncio.get_event_loop() without a running loop is deprecated.
asyncio.run(main())

原创文章，
转载请注明出处：
http://30daydo.com/article/537

基于文本及符号密度的网页正文提取方法 python实现

李魔佛发表了文章 • 0 个评论 • 5460 次浏览 • 2019-09-10 15:19 • 来自相关话题

基于文本及符号密度的网页正文提取方法 python实现
项目路径https://github.com/Rockyzsu/CodePool/tree/master/GeneralNewsExtractor
完成后在本文详细介绍，
请密切关注。查看全部

基于文本及符号密度的网页正文提取方法 python实现
项目路径https://github.com/Rockyzsu/CodePool/tree/master/GeneralNewsExtractor
完成后在本文详细介绍，
请密切关注。

python exchange保存备份邮件

李魔佛发表了文章 • 3 个评论 • 3819 次浏览 • 2019-09-09 10:50 • 来自相关话题

python exchange保存备份邮件
方便自己平时备份邮件。# -*-coding=utf-8-*-

# @Time : 2019/9/9 9:25
# @File : mail_backup.py
# @Author :
import codecs
import re
import config
import os
from exchangelib import DELEGATE, Account, Credentials, Configuration, NTLM, Message, Mailbox, HTMLBody,FileAttachment,ItemAttachment
from exchangelib.protocol import BaseProtocol, NoVerifyHTTPAdapter

#此句用来消除ssl证书错误，exchange使用自签证书需加上
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter

# 输入你的域账号如example\xxx
cred = Credentials(r'example\xxx', 你的邮箱密码)

configx = Configuration(server='mail.credlink.com', credentials=cred, auth_type=NTLM)
a = Account(
primary_smtp_address='你的邮箱地址', config=configx, autodiscover=False, access_type=DELEGATE
)

for item in a.inbox.all().order_by('-datetime_received')[:100]:
print(item.subject, item.sender, item.unique_body,item.datetime_received)

name = item.subject
name = re.sub('[\/:*?"<>|]', '-', name)
local_path = os.path.join('inbox', name+'.html')
with codecs.open(local_path, 'w','utf-8') as f:
f.write(item.unique_body)

for attachment in item.attachments:
if isinstance(attachment, FileAttachment):
name = attachment.name
name = re.sub('[\/:*?"<>|]','-',name)
local_path = os.path.join('inbox', attachment.name)
with codecs.open(local_path, 'wb') as f:
f.write(attachment.content)
print('Saved attachment to', local_path)

elif isinstance(attachment, ItemAttachment):
if isinstance(attachment.item, Message):
name=attachment.item.subject
name = re.sub('[\/:*?"<>|]', '-', name)
local_path = os.path.join('inbox', 'attachment')
with codecs.open(local_path, 'w') as f:
f.write(attachment.item.body)
原创文章，
转载请注明出处
http://30daydo.com/article/534
查看全部

python exchange保存备份邮件
方便自己平时备份邮件。

# -*-coding=utf-8-*-



# @Time : 2019/9/9 9:25

# @File : mail_backup.py

# @Author : 

import codecs

import re

import config

import os

from exchangelib import DELEGATE, Account, Credentials, Configuration, NTLM, Message, Mailbox, HTMLBody,FileAttachment,ItemAttachment

from exchangelib.protocol import BaseProtocol, NoVerifyHTTPAdapter





# Disables TLS certificate verification. Only needed when the Exchange
# server uses a self-signed certificate; remove it otherwise.
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter

# Fill in your domain account (e.g. example\xxx) and your mailbox password.
# The password placeholder is quoted like the address placeholder below: as
# a bare name it raised NameError before the script could be edited.
cred = Credentials(r'example\xxx', '你的邮箱密码')

configx = Configuration(server='mail.credlink.com', credentials=cred, auth_type=NTLM)

# autodiscover=False: the server given in configx is used as-is.
a = Account(
    primary_smtp_address='你的邮箱地址', config=configx, autodiscover=False, access_type=DELEGATE
)





# Characters that must not appear in a file name. The backslash is included
# because os.path.join would treat it as a path separator on Windows; the
# original non-raw pattern '[\/:*?"<>|]' did not cover it.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _safe_filename(text, default='untitled'):
    """Return ``text`` with invalid file-name characters replaced by '-'.

    ``default`` is used when ``text`` is empty or None (a mail without a
    subject), which would otherwise make ``re.sub`` raise TypeError.
    """
    return _INVALID_FILENAME_CHARS.sub('-', text or default)


# The script writes into ./inbox; create it instead of assuming it exists.
os.makedirs('inbox', exist_ok=True)

# Back up the 100 most recently received messages of the inbox.
for item in a.inbox.all().order_by('-datetime_received')[:100]:
    print(item.subject, item.sender, item.unique_body, item.datetime_received)

    # Message body -> inbox/<subject>.html
    name = _safe_filename(item.subject)
    local_path = os.path.join('inbox', name + '.html')
    with codecs.open(local_path, 'w', 'utf-8') as f:
        f.write(item.unique_body or '')

    for attachment in item.attachments:
        if isinstance(attachment, FileAttachment):
            # Use the sanitised name: the original computed it and then
            # built the path from the raw attachment.name, so a name
            # containing separators could escape the inbox directory.
            local_path = os.path.join('inbox', _safe_filename(attachment.name))
            with open(local_path, 'wb') as f:
                f.write(attachment.content)
            print('Saved attachment to', local_path)

        elif isinstance(attachment, ItemAttachment):
            if isinstance(attachment.item, Message):
                # One file per attached message, named after its subject.
                # The original wrote every attached message to the fixed
                # path inbox/attachment, each overwriting the previous one.
                body = attachment.item.body
                suffix = '.html' if isinstance(body, HTMLBody) else '.txt'
                name = _safe_filename(attachment.item.subject)
                local_path = os.path.join('inbox', name + suffix)
                # UTF-8 like the main message body, so non-ASCII text does
                # not depend on the platform default encoding.
                with codecs.open(local_path, 'w', 'utf-8') as f:
                    f.write(body or '')

原创文章，
转载请注明出处
http://30daydo.com/article/534

性能对比 pypy vs python

李魔佛发表了文章 • 0 个评论 • 5253 次浏览 • 2019-09-06 17:04 • 来自相关话题

性能对比 pypy vs python
不试不知道，一试吓一跳。
如果是CPU密集型的程序，pypy3的执行速度比python要快上一百倍。
talk is cheap, show me the code!

代码很简单，运行加法运算：
执行2亿次（LOOP = 2*10**8）
import time

LOOP = 2*10**8

def add(x,y):
return x+y

def cpu_pressure(loop):

for i in range(loop):
result = add(i,i+1)

if __name__ == '__main__':
start = time.time()
cpu_pressure(LOOP)
print(f'time used {time.time()-start}s')
python执行：
python main.py
返回用时：time used 21.422261476516724s

pypy执行：
pypy main.py
返回用时：time used 0.1925642490386963s

差距真的很大。查看全部

性能对比 pypy vs python
不试不知道，一试吓一跳。
如果是CPU密集型的程序，pypy3的执行速度比python要快上一百倍。
talk is cheap, show me the code!

代码很简单，运行加法运算：
执行2亿次（LOOP = 2*10**8）

import time



LOOP = 2*10**8



def add(x,y):
    """Return ``x + y``; the function call itself is the workload of the benchmark."""

    return x+y



def cpu_pressure(loop):
    """Call ``add(i, i + 1)`` for every ``i`` in ``range(loop)``.

    Nothing is returned; the function only generates CPU work to be timed.
    """

    for i in range(loop):

        # result is never read afterwards; only the call is of interest.
        result = add(i,i+1)





if __name__ == '__main__':

    # Time one cpu_pressure(LOOP) run with time.time() and print the
    # elapsed seconds.
    start = time.time()

    cpu_pressure(LOOP)

    print(f'time used {time.time()-start}s')

python执行：
python main.py
返回用时：time used 21.422261476516724s

pypy执行：
pypy main.py
返回用时：time used 0.1925642490386963s

差距真的很大。

scrapy源码分析<一>：入口函数以及是如何运行

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 6679 次浏览 • 2019-08-31 10:47 • 来自相关话题

运行scrapy crawl example 命令的时候，就会执行我们写的爬虫程序。
下面我们从源码分析一下scrapy执行的流程：

执行scrapy crawl 命令时，调用的是Command类class Command(ScrapyCommand):

requires_project = True

def syntax(self):
return '[options]'

def short_desc(self):
return 'Runs all of the spiders - My Defined'

def run(self,args,opts):
print('==================')
print(type(self.crawler_process))
spider_list = self.crawler_process.spiders.list() # 找到爬虫类

for name in spider_list:
print('=================')
print(name)
self.crawler_process.crawl(name,**opts.__dict__)

self.crawler_process.start()
然后我们去看看crawler_process，它是ScrapyCommand的一个属性，保存的是CrawlerProcess的实例，而CrawlerProcess又是CrawlerRunner的子类

在CrawlerRunner构造函数里面主要作用就是这个 def __init__(self, settings=None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
self.spider_loader = _get_spider_loader(settings) # 构造爬虫
self._crawlers = set()
self._active = set()
self.bootstrap_failed = False
1. 加载配置文件def _get_spider_loader(settings):

cls_path = settings.get('SPIDER_LOADER_CLASS')

# settings文件没有定义SPIDER_LOADER_CLASS，所以这里获取到的是系统的默认配置文件，
# 默认配置文件在接下来的代码块A
# SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'

loader_cls = load_object(cls_path)
# 这个函数就是根据路径转为类对象，也就是上面crapy.spiderloader.SpiderLoader 这个
# 字符串变成一个类对象
# 具体的load_object 对象代码见下面代码块B

return loader_cls.from_settings(settings.frozencopy())
默认配置文件default_settings.py # 代码块A
#......省略若干
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'

SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'  # 就是这个值
SPIDER_LOADER_WARN_ONLY = False

SPIDER_MIDDLEWARES = {}

load_object的实现# 代码块B 为了方便，我把异常处理的去除
from importlib import import_module #导入标准库

def load_object(path):
dot = path.rindex('.')
module, name = path[:dot], path[dot+1:]
# 上面把路径分为基本路径+模块名

mod = import_module(module)
obj = getattr(mod, name)
# 获取模块里面那个值

return obj

测试代码：In [33]: mod = import_module(module)

In [34]: mod
Out[34]: <module 'scrapy.spiderloader' from '/home/xda/anaconda3/lib/python3.7/site-packages/scrapy/spiderloader.py'>

In [35]: getattr(mod,name)
Out[35]: scrapy.spiderloader.SpiderLoader

In [36]: obj = getattr(mod,name)

In [37]: obj
Out[37]: scrapy.spiderloader.SpiderLoader

In [38]: type(obj)
Out[38]: type
在代码块A中，loader_cls是SpiderLoader，最后返回的的是SpiderLoader.from_settings(settings.frozencopy())
接下来看看SpiderLoader.from_settings， def from_settings(cls, settings):
return cls(settings)
返回类对象自己，所以直接看__init__函数即可class SpiderLoader(object):
"""
SpiderLoader is a class which locates and loads spiders
in a Scrapy project.
"""
def __init__(self, settings):
self.spider_modules = settings.getlist('SPIDER_MODULES')
# 获得settting中的模块名字，创建scrapy的时候就默认帮你生成了
# 你可以看看你的settings文件里面的内容就可以找到这个值，是一个list

self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
self._spiders = {}
self._found = defaultdict(list)
self._load_all_spiders() # 加载所有爬虫

核心就是这个_load_all_spiders：
走起：def _load_all_spiders(self):
for name in self.spider_modules:

for module in walk_modules(name): # 这个遍历文件夹里面的文件，然后再转化为类对象，
# 保存到字典：self._spiders = {}
self._load_spiders(module) # 模块变成spider

self._check_name_duplicates() # 去重，如果名字一样就异常

接下来看看_load_spiders
核心就是下面的。def iter_spider_classes(module):
from scrapy.spiders import Spider

for obj in six.itervalues(vars(module)): # 找到模块里面的变量，然后迭代出来
if inspect.isclass(obj) and \
issubclass(obj, Spider) and \
obj.__module__ == module.__name__ and \
getattr(obj, 'name', None): # 有name属性，继承于Spider
yield obj
这个obj就是我们平时写的spider类了。
原来分析了这么多，才找到了我们平时写的爬虫类

待续。。。。

原创文章
转载请注明出处
http://30daydo.com/article/530
查看全部

运行scrapy crawl example 命令的时候，就会执行我们写的爬虫程序。
下面我们从源码分析一下scrapy执行的流程：

执行scrapy crawl 命令时，调用的是Command类

class Command(ScrapyCommand):
    """Custom scrapy command that schedules every spider of the project and runs them together."""

    requires_project = True

    def syntax(self):
        """Return the usage string shown for this command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description shown in the command list."""
        return 'Runs all of the spiders - My Defined'

    def run(self, args, opts):
        """Queue a crawl for each spider name, then start the crawler process.

        Args:
            args: positional command-line arguments (unused).
            opts: parsed options; its attributes are forwarded to every
                crawl as keyword arguments.
        """
        print('==================')
        print(type(self.crawler_process))

        # CrawlerRunner.__init__ stores the loader as ``spider_loader``;
        # ``spiders`` is only an old alias for the same object.
        spider_list = self.crawler_process.spider_loader.list()

        for name in spider_list:
            print('=================')
            print(name)
            self.crawler_process.crawl(name, **opts.__dict__)

        self.crawler_process.start()

然后我们去看看crawler_process，它是ScrapyCommand的一个属性，保存的是CrawlerProcess的实例，而CrawlerProcess又是CrawlerRunner的子类

在CrawlerRunner构造函数里面主要作用就是这个

      def __init__(self, settings=None):
        # Excerpt of the CrawlerRunner constructor quoted by the article.

        # A plain dict (or None) is wrapped in a Settings object first.
        if isinstance(settings, dict) or settings is None:

            settings = Settings(settings)

        self.settings = settings

        self.spider_loader = _get_spider_loader(settings) # build the spider loader

        self._crawlers = set()

        self._active = set()

        self.bootstrap_failed = False

1. 加载配置文件

def _get_spider_loader(settings):
    """Build the spider loader named by the ``SPIDER_LOADER_CLASS`` setting.

    Author's note: a project that does not define SPIDER_LOADER_CLASS gets
    the value from the default settings,
    'scrapy.spiderloader.SpiderLoader'.
    """
    # load_object turns the dotted path string into the class object.
    loader_cls = load_object(settings.get('SPIDER_LOADER_CLASS'))
    frozen_settings = settings.frozencopy()
    return loader_cls.from_settings(frozen_settings)

默认配置文件default_settings.py

# 代码块A

#......省略若干

SCHEDULER = 'scrapy.core.scheduler.Scheduler'

SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'

SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'

SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'



SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'  # 就是这个值

SPIDER_LOADER_WARN_ONLY = False



SPIDER_MIDDLEWARES = {}

load_object的实现

# 代码块B 为了方便，我把异常处理的去除

from importlib import import_module #导入标准库



def load_object(path):
    """Import and return the object named by a dotted path.

    ``path`` has the form ``'package.module.name'``: everything before the
    last dot is imported as a module, and the last component is looked up
    on that module with ``getattr``.

    Raises:
        ValueError: if ``path`` contains no dot.
        ImportError: if the module part cannot be imported.
        AttributeError: if the module has no attribute with that name.
    """
    # Split into module path and object name at the last dot.
    module_path, dot, name = path.rpartition('.')
    if not dot:
        # Same exception type the bare rindex() call produced, but with a
        # message that names the offending path.
        raise ValueError("Error loading object '%s': not a full path" % path)

    mod = import_module(module_path)
    return getattr(mod, name)

测试代码：

In [33]: mod = import_module(module)                                                                                                                                             



In [34]: mod                                                                                                                                                                     

Out[34]: <module 'scrapy.spiderloader' from '/home/xda/anaconda3/lib/python3.7/site-packages/scrapy/spiderloader.py'>



In [35]: getattr(mod,name)                                                                                                                                                       

Out[35]: scrapy.spiderloader.SpiderLoader



In [36]: obj = getattr(mod,name)                                                                                                                                                 



In [37]: obj                                                                                                                                                                     

Out[37]: scrapy.spiderloader.SpiderLoader



In [38]: type(obj)                                                                                                                                                               

Out[38]: type

在_get_spider_loader中，loader_cls是SpiderLoader，最后返回的是SpiderLoader.from_settings(settings.frozencopy())
接下来看看SpiderLoader.from_settings，

    def from_settings(cls, settings):

        return cls(settings)

from_settings返回的是用settings构造出来的类实例，所以直接看__init__函数即可

class SpiderLoader(object):

    """
    SpiderLoader is a class which locates and loads spiders
    in a Scrapy project.
    """

    def __init__(self, settings):

        self.spider_modules = settings.getlist('SPIDER_MODULES') 

        # List of module names read from the SPIDER_MODULES setting.

        # Author's note: the value is generated in settings.py when the project is created.

        

        self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')

        self._spiders = {}

        self._found = defaultdict(list)

        self._load_all_spiders() # load all spiders

核心就是这个_load_all_spiders：
走起：

def _load_all_spiders(self):
    """Load the spiders of every module found under each configured package."""
    for package in self.spider_modules:
        # Author's note: walk_modules goes through the files of the
        # package; the spiders found end up in the self._spiders dict.
        for found_module in walk_modules(package):
            self._load_spiders(found_module)

    # Author's note: reports spiders that share a name - TODO confirm
    # whether this raises or only warns.
    self._check_name_duplicates()

接下来看看_load_spiders
核心就是下面的。

def iter_spider_classes(module):
    """Yield the spider classes defined in ``module``.

    A value qualifies when it is a class, subclasses ``Spider``, was
    defined in ``module`` itself (not merely imported into it) and has a
    truthy ``name`` attribute.
    """
    from scrapy.spiders import Spider

    # Iterate over every value bound in the module namespace.
    for candidate in six.itervalues(vars(module)):
        if not inspect.isclass(candidate):
            continue
        if not issubclass(candidate, Spider):
            continue
        if candidate.__module__ != module.__name__:
            continue
        if getattr(candidate, 'name', None):
            yield candidate

这个obj就是我们平时写的spider类了。
原来分析了这么多，才找到了我们平时写的爬虫类

待续。。。。

原创文章
转载请注明出处
http://30daydo.com/article/530

anaconda环境下无法启动jupyter notebook

李魔佛发表了文章 • 0 个评论 • 7675 次浏览 • 2019-08-19 17:16 • 来自相关话题

运行 jupyter notebook
报错： from . import (constants, error, message, context,
ImportError: DLL load failed: 找不到指定的模块。

但是可以直接在Anaconda navigator中直接启动，所以判断是环境问题。
切换到anaconda的虚拟环境，（在菜单中进入anaconda prompt command），在当前命令行下执行 jupyter notebook就能够正常运行。

查看全部

运行 jupyter notebook
报错：

    from . import (constants, error, message, context,

ImportError: DLL load failed: 找不到指定的模块。

但是可以直接在Anaconda navigator中直接启动，所以判断是环境问题。
切换到anaconda的虚拟环境，（在菜单中进入anaconda prompt command），在当前命令行下执行 jupyter notebook就能够正常运行。

random.randint的用法

李魔佛发表了文章 • 0 个评论 • 13493 次浏览 • 2019-08-01 16:31 • 来自相关话题

random.randint的用法：
from random import randint

randint(0,1)
Out[25]: 1

randint(0,1)
Out[26]: 1

randint(0,1)
Out[27]: 1

randint(0,1)
Out[28]: 1

randint(0,1)
Out[29]: 0

randint(0,1)
Out[30]: 1
random.randint（a,b）

输出的整数范围包含a和b，以及它们之间的所有整数
查看全部

random.randint的用法：

from random import randint



randint(0,1)

Out[25]: 1



randint(0,1)

Out[26]: 1



randint(0,1)

Out[27]: 1



randint(0,1)

Out[28]: 1



randint(0,1)

Out[29]: 0



randint(0,1)

Out[30]: 1

random.randint（a,b）

输出的整数范围包含a和b，以及它们之间的所有整数

frontera运行link_follower.py 报错：doesn't define any object named 'FIFO'

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3958 次浏览 • 2019-07-18 11:29 • 来自相关话题

代码如下：
from __future__ import print_function

import re

import requests

from frontera.contrib.requests.manager import RequestsFrontierManager
# from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings

from six.moves.urllib.parse import urljoin

SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
# SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend'

SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.MAX_REQUESTS = 100
SETTINGS.MAX_NEXT_REQUESTS = 10

SEEDS = [
'http://www.imdb.com',
]

LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)

def extract_page_links(response):
return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]

if __name__ == '__main__':

frontier = RequestsFrontierManager(SETTINGS)
frontier.add_seeds([requests.Request(url=url) for url in SEEDS])
while True:
next_requests = frontier.get_next_requests()
if not next_requests:
break
for request in next_requests:
try:
response = requests.get(request.url)
links = [
requests.Request(url=url)
for url in extract_page_links(response)
]
frontier.page_crawled(response)
print('Crawled', response.url, '(found', len(links), 'urls)')

if links:
frontier.links_extracted(request, links)
except requests.RequestException as e:
error_code = type(e).__name__
frontier.request_error(request, error_code)
print('Failed to process request', request.url, 'Error:', e)

无论用的py2或者py3，都会报以下的错误。raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
NameError: Module 'frontera.contrib.backends.memory' doesn't define any object named 'FIFO' 查看全部

代码如下：

from __future__ import print_function



import re



import requests



from frontera.contrib.requests.manager import RequestsFrontierManager

# from frontera.contrib.requests.manager import RequestsFrontierManager

from frontera import Settings



from six.moves.urllib.parse import urljoin





# Frontier configuration and module-level constants for the link-follower demo.
SETTINGS = Settings()

# Dotted path of the backend to load. This is the value the NameError quoted
# after this listing complains about ('FIFO' not found in
# frontera.contrib.backends.memory).
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'

# SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend'



# NOTE(review): presumably these switch on logging for the frontier manager
# and the backend — confirm against the frontera settings reference.
SETTINGS.LOGGING_MANAGER_ENABLED = True

SETTINGS.LOGGING_BACKEND_ENABLED = True

# NOTE(review): presumably the overall request cap and the batch size returned
# by get_next_requests() — confirm against the frontera settings reference.
SETTINGS.MAX_REQUESTS = 100

SETTINGS.MAX_NEXT_REQUESTS = 10



# URLs handed to frontier.add_seeds() when the script starts.
SEEDS = [

    'http://www.imdb.com',

]



# Case-insensitive pattern for <a ...> tags; its single capture group is the
# text between the double quotes of the href attribute.
LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)





def extract_page_links(response):
    """Return the links found in ``response.text`` as absolute URLs.

    Every href captured by ``LINK_RE`` is resolved against ``response.url``
    with ``urljoin``; the result keeps the order in which the links occur.
    """
    hrefs = LINK_RE.findall(response.text)
    absolute_urls = []
    for href in hrefs:
        absolute_urls.append(urljoin(response.url, href))
    return absolute_urls



if __name__ == '__main__':
    # Drive the crawl: seed the frontier, then keep fetching the batches it
    # hands out until it returns an empty one.
    frontier = RequestsFrontierManager(SETTINGS)
    frontier.add_seeds([requests.Request(url=seed) for seed in SEEDS])

    batch = frontier.get_next_requests()
    while batch:
        for request in batch:
            try:
                response = requests.get(request.url)
                # Wrap every extracted URL in a Request before reporting it.
                links = [
                    requests.Request(url=link)
                    for link in extract_page_links(response)
                ]
                frontier.page_crawled(response)
                print('Crawled', response.url, '(found', len(links), 'urls)')

                # Only report extracted links when there are any.
                if links:
                    frontier.links_extracted(request, links)
            except requests.RequestException as exc:
                # The exception class name is passed to the frontier as the
                # error code.
                frontier.request_error(request, type(exc).__name__)
                print('Failed to process request', request.url, 'Error:', exc)
        batch = frontier.get_next_requests()

无论用的py2或者py3，都会报以下的错误。

raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))

NameError: Module 'frontera.contrib.backends.memory' doesn't define any object named 'FIFO'

scrapy-rabbitmq 不支持python3 [修改源码使它支持]

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3698 次浏览 • 2019-07-17 17:24 • 来自相关话题

官方版本在2015年就没有更新了。
在python3上运行的时候会报错。

需要修改以下地方：

待续。。

scrapy rabbitmq 分布式爬虫

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 6583 次浏览 • 2019-07-17 16:59 • 来自相关话题

对于没接触过rabbitmq的同学，可以看这个文章：https://blog.csdn.net/hellozpc/article/details/81436980
rabbitmq是个不错的消息队列服务，可以配合scrapy作为消息队列.

下面是一个简单的demo：import re
import requests
import scrapy
from scrapy import Request
from rabbit_spider import settings
from scrapy.log import logger
import json
from rabbit_spider.items import RabbitSpiderItem
import datetime
from scrapy.selector import Selector
import pika

# from scrapy_rabbitmq.spiders import RabbitMQMixin
# from scrapy.contrib.spiders import CrawlSpider

class Website(scrapy.Spider):
name = "rabbit"

def start_requests(self):
headers = {'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Host': '36kr.com',
'Referer': 'https://36kr.com/information/web_news',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}

url = 'https://36kr.com/information/web_news'

yield Request(url=url,
headers=headers)

def parse(self, response):

credentials = pika.PlainCredentials('admin', 'admin')
connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101', 5672, '/', credentials))

channel = connection.channel()
channel.exchange_declare(exchange='direct_log', exchange_type='direct')

result = channel.queue_declare(exclusive=True, queue='')

queue_name = result.method.queue

# print(queue_name)
# infos = sys.argv[1:] if len(sys.argv)>1 else ['info']
info = 'info'

# 绑定多个值

channel.queue_bind(
exchange='direct_log',
routing_key=info,
queue=queue_name
)
print('start to receive [{}]'.format(info))

channel.basic_consume(
on_message_callback=self.callback_func,
queue=queue_name,
auto_ack=True,
)

channel.start_consuming()

def callback_func(self, ch, method, properties, body):
print(body)
启动spider：from scrapy import cmdline
cmdline.execute('scrapy crawl rabbit'.split())
然后往rabbitmq里面推送数据：import pika
import settings

credentials = pika.PlainCredentials('admin','admin')
connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))

channel = connection.channel()
channel.exchange_declare(exchange='direct_log',exchange_type='direct') # fanout 就是组播

routing_key = 'info'
message='https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
channel.basic_publish(
exchange='direct_log',
routing_key=routing_key,
body=message
)

print('sending message {}'.format(message))
connection.close()

推送数据后，scrapy会马上接收到队列里面的数据。
注意不能在start_requests里面写等待队列的命令，因为start_requests函数需要返回一个生成器，否则程序会报错。

待续。。。
###### 2019-08-29 更新 ###################
发现一个坑，就是rabbitMQ在接收到数据后，无法在回调函数里面使用yield生成器。
查看全部

对于没接触过rabbitmq的同学，可以看这个文章：https://blog.csdn.net/hellozpc/article/details/81436980
rabbitmq是个不错的消息队列服务，可以配合scrapy作为消息队列.

下面是一个简单的demo：

import re

import requests

import scrapy

from scrapy import Request

from rabbit_spider import settings

from scrapy.log import logger

import json

from rabbit_spider.items import RabbitSpiderItem

import datetime

from scrapy.selector import Selector

import pika



# from scrapy_rabbitmq.spiders import RabbitMQMixin

# from scrapy.contrib.spiders import CrawlSpider



class Website(scrapy.Spider):
    """Demo spider that consumes RabbitMQ messages from its ``parse`` callback.

    ``start_requests`` issues a single request to 36kr.com; when the response
    arrives, ``parse`` binds a queue to the ``direct_log`` exchange and hands
    every received message body to ``callback_func``, which prints it.
    """

    name = "rabbit"

    def start_requests(self):
        """Yield the single request that starts the spider."""
        url = 'https://36kr.com/information/web_news'
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Host': '36kr.com',
            'Referer': 'https://36kr.com/information/web_news',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
        }
        yield Request(url=url, headers=headers)

    def parse(self, response):
        """Connect to RabbitMQ and consume messages routed with key ``info``.

        ``response`` is not used; the callback only serves as the place where
        the consumer is started.
        """
        auth = pika.PlainCredentials('admin', 'admin')
        params = pika.ConnectionParameters('192.168.1.101', 5672, '/', auth)
        connection = pika.BlockingConnection(params)

        channel = connection.channel()
        channel.exchange_declare(exchange='direct_log', exchange_type='direct')

        # Declare an exclusive queue with an empty name and use whatever name
        # comes back in the declare result.
        declared = channel.queue_declare(exclusive=True, queue='')
        queue_name = declared.method.queue

        # Original note: "bind multiple values" — only 'info' is bound here.
        info = 'info'
        channel.queue_bind(
            queue=queue_name,
            exchange='direct_log',
            routing_key=info,
        )
        print('start to receive [{}]'.format(info))

        channel.basic_consume(
            queue=queue_name,
            on_message_callback=self.callback_func,
            auto_ack=True,
        )

        # NOTE(review): pika's start_consuming() is expected to block here
        # until consumption stops — confirm against the pika documentation.
        channel.start_consuming()

    def callback_func(self, ch, method, properties, body):
        """Print the body of a received message; the other arguments are unused."""
        print(body)

启动spider：

from scrapy import cmdline

# Run the "rabbit" spider as if `scrapy crawl rabbit` had been typed in a shell.
argv = 'scrapy crawl rabbit'.split()
cmdline.execute(argv)

然后往rabbitmq里面推送数据：

import pika

import settings



# Publisher side of the demo: send one URL to the 'direct_log' exchange with
# routing key 'info', then close the connection.
credentials = pika.PlainCredentials('admin','admin')
params = pika.ConnectionParameters('192.168.1.101',5672,'/',credentials)
connection = pika.BlockingConnection(params)

channel = connection.channel()
# Original note: "fanout" means multicast; this demo declares a 'direct'
# exchange.
channel.exchange_declare(exchange='direct_log', exchange_type='direct')

routing_key = 'info'
payload = 'https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
channel.basic_publish(exchange='direct_log', routing_key=routing_key, body=payload)

print('sending message {}'.format(payload))
connection.close()

推送数据后，scrapy会马上接收到队列里面的数据。
注意不能在start_requests里面写等待队列的命令，因为start_requests函数需要返回一个生成器，否则程序会报错。

待续。。。
###### 2019-08-29 更新 ###################
发现一个坑，就是rabbitMQ在接收到数据后，无法在回调函数里面使用yield生成器。

exchange_declare() got an unexpected keyword argument 'type'

李魔佛发表了文章 • 0 个评论 • 3312 次浏览 • 2019-07-16 14:40 • 来自相关话题

In new version of pika, now it is using
exchange_type instead of type

credentials = pika.PlainCredentials('admin','admin')
connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))

channel = connection.channel()

channel.exchange_declare(exchange='logs',exchange_type='fanout') 查看全部

In new version of pika, now it is using
exchange_type instead of type

	# Open a connection with explicit credentials and get a channel from it.
	credentials = pika.PlainCredentials('admin','admin')

	connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))



	channel = connection.channel()



	# The exchange kind is passed as exchange_type; per the note above this
	# listing, newer pika versions use that keyword instead of type.
	channel.exchange_declare(exchange='logs',exchange_type='fanout')

twisted的getPage已经不建议使用，新接口为twisted.web.client.Agent

python爬虫 • 李魔佛发表了文章 • 2 个评论 • 4079 次浏览 • 2019-07-12 11:31 • 来自相关话题

Twisted-16.7.0 is coming soon, and it deprecates twisted.web.client.getPage (and client.HTTPClientFactory). We use these in some of the unit tests, to fetch one of the HTTP WAPI/WUI pages and make sure the contents look right.

We need to change these tests to use twisted.web.client.Agent instead, or a package named "treq", which is a Twisted flavor of the excellent (but blocking) requests library.

查看全部

Twisted-16.7.0 is coming soon, and it deprecates twisted.web.client.getPage (and client.HTTPClientFactory). We use these in some of the unit tests, to fetch one of the HTTP WAPI/WUI pages and make sure the contents look right.

We need to change these tests to use twisted.web.client.Agent instead, or a package named "treq", which is a Twisted flavor of the excellent (but blocking) requests library.

twisted　reactor运行后，添加了addBoth函数，但是还是无法停止

李魔佛发表了文章 • 0 个评论 • 4422 次浏览 • 2019-07-11 09:43 • 来自相关话题

代码如下：
from scrapy.selector import Selector

def get_response_callback(content):
txt = str(content,encoding='utf-8')
resp = Selector(text=txt)
title = resp.xpath('//title/text()').extract_first()
print(title)

@defer.inlineCallbacks
def task():
url = 'http://www.baidu.com'
d=getPage(url.encode('utf-8'))
d.addCallback(get_response_callback)
yield d

def done():
reactor.stop()

def done1(*args,**kwargs):
reactor.stop()

task_list = []
for i in range(4):
d=task()
task_list.append(d)

dd = defer.DeferredList(task_list)

dd.addBoth(done)

reactor.run()
上面的代码是无法停止的，如果使用的是　
dd.addBoth(done)

done函数的定义是没有参数的。　

而使用另一个done函数带参数的done(*args,**kwargs)
是可以正常退出的，done里面写了reactor.stop() 函数

原创文章
转载请注明出处：
http://30daydo.com/article/509
查看全部