mongodb
mongodb python同步两个数据库数据
python • 李魔佛 发表了文章 • 0 个评论 • 2143 次浏览 • 2022-04-07 02:44
所以自己动手写代码完成:
# -*- coding: utf-8 -*-
# @Time : 2022/4/6 4:41
# @File : database_migrate.py
# @Author : Rocky C@www.30daydo.com
import time
from loguru import logger
import pymongo
ignore_db = ['admin', 'config', 'local',
] # 忽略更新的库
ignore_col = [('db_stock','dfcf_list_full')]
logger.add('mongo.log')
# 数据库同步
def get_client(user, password, host, port):
connect_uri = f'mongodb://{user}:{password}@{host}:{port}'
client = pymongo.MongoClient(connect_uri)
return client
def origin():
return get_client('admin', 'password', '127.0.0.1', '27017')
def target():
return get_client('root', 'password', '127.0.0.1', '27017')
def transfer():
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
if (db,col) in ignore_col:
continue
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
# time.sleep(0.5)
def get_item(client, db_name, col):
return client[db_name][col].find()
def insert_item(client, db_name, col, data):
batch = 1000
count = len(data)//batch + 1
for i in range(count):
item = data[i*batch:(i+1)*batch]
if len(item)==0:
continue
try:
client[db_name][col].insert_many(item)
except Exception as e:
logger.error(e)
logger.error(f'{db_name} {col} 插入出错')
def get_db_name(client):
db_name = client.list_database_names()
dbs = []
for db in db_name:
if db not in ignore_db:
dbs.append(db)
return dbs
def delete_col(client,db,col):
try:
client[db][col].delete_many({})
except Exception as e:
logger.error(e)
logger.error(db)
logger.error(col)
return False
else:
return True
def server_compare():
'''
比较2个数据库是否相同,只是单纯比较条数
'''
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
origin_count = origin_client[db][col].count_documents({})
target_count = target_client[db][col].count_documents({})
if origin_count!=target_count:
logger.info(f'collection {db} {col}有区别')
#
if delete_col(target_client,db,col):
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
time.sleep(1)
def get_collection_name(client, db_name):
collection_names = client[db_name].list_collection_names(session=None)
return collection_names
def main():
server_compare()
if __name__ == '__main__':
main()
原理就是不断迭代,不同的数据库,里面的不同的collection。
对于同名collection,通过条数是否一致,来决定是否要把原数据复制过来。
保存上面文件为main.py
执行 python main.py
就可以进行数据同步工作啦。 查看全部
所以自己动手写代码完成:
# -*- coding: utf-8 -*-
# @Time : 2022/4/6 4:41
# @File : database_migrate.py
# @Author : Rocky C@www.30daydo.com
import time
from loguru import logger
import pymongo
ignore_db = ['admin', 'config', 'local',
] # 忽略更新的库
ignore_col = [('db_stock','dfcf_list_full')]
logger.add('mongo.log')
# 数据库同步
def get_client(user, password, host, port):
connect_uri = f'mongodb://{user}:{password}@{host}:{port}'
client = pymongo.MongoClient(connect_uri)
return client
def origin():
return get_client('admin', 'password', '127.0.0.1', '27017')
def target():
return get_client('root', 'password', '127.0.0.1', '27017')
def transfer():
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
if (db,col) in ignore_col:
continue
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
# time.sleep(0.5)
def get_item(client, db_name, col):
return client[db_name][col].find()
def insert_item(client, db_name, col, data):
batch = 1000
count = len(data)//batch + 1
for i in range(count):
item = data[i*batch:(i+1)*batch]
if len(item)==0:
continue
try:
client[db_name][col].insert_many(item)
except Exception as e:
logger.error(e)
logger.error(f'{db_name} {col} 插入出错')
def get_db_name(client):
db_name = client.list_database_names()
dbs = []
for db in db_name:
if db not in ignore_db:
dbs.append(db)
return dbs
def delete_col(client,db,col):
try:
client[db][col].delete_many({})
except Exception as e:
logger.error(e)
logger.error(db)
logger.error(col)
return False
else:
return True
def server_compare():
'''
比较2个数据库是否相同,只是单纯比较条数
'''
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
origin_count = origin_client[db][col].count_documents({})
target_count = target_client[db][col].count_documents({})
if origin_count!=target_count:
logger.info(f'collection {db} {col}有区别')
#
if delete_col(target_client,db,col):
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
time.sleep(1)
def get_collection_name(client, db_name):
collection_names = client[db_name].list_collection_names(session=None)
return collection_names
def main():
server_compare()
if __name__ == '__main__':
main()
原理就是不断迭代,不同的数据库,里面的不同的collection。
对于同名collection,通过条数是否一致,来决定是否要把原数据复制过来。
保存上面文件为main.py
执行 python main.py
就可以进行数据同步工作啦。
mongodb 判断列表字段不为空
数据库 • 李魔佛 发表了文章 • 0 个评论 • 8968 次浏览 • 2019-08-20 11:08
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[1,2,3,4,5]})
db.test_tab.insert({array:[1,2,3,4,5,6]})
使用以下命令判断列表不为空:
db.getCollection("example").find({array:{$exists:true,$ne:[]}}); # 字段不为0 查看全部
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[1,2,3,4,5]})
db.test_tab.insert({array:[1,2,3,4,5,6]})
使用以下命令判断列表不为空:
db.getCollection("example").find({array:{$exists:true,$ne:[]}}); # 字段不为0
mongodb 修改嵌套字典字典的字段名
数据库 • 李魔佛 发表了文章 • 0 个评论 • 6075 次浏览 • 2019-08-05 13:55
db.test.update({},{$rename:{'旧字段':'新字段'}},true,true)
比如下面的例子:db.getCollection('example').update({},{$rename:{'corp':'企业'}})
上面就是把字段corp改为企业。
如果是嵌套字段呢?
比如 corp字典是一个字典,里面是 { 'address':'USA', 'phone':'12345678' }
那么要修改里面的address为地址:
db.getCollection('example').update({},{$rename:{'corp.address':'corp.地址'}})
原创文章,转载请注明出处
原文连接:http://30daydo.com/article/521
查看全部
db.test.update({},{$rename:{'旧字段':'新字段'}},true,true)
比如下面的例子:
db.getCollection('example').update({},{$rename:{'corp':'企业'}})
上面就是把字段corp改为企业。
如果是嵌套字段呢?
比如 corp字典是一个字典,里面是 { 'address':'USA', 'phone':'12345678' }
那么要修改里面的address为地址:
db.getCollection('example').update({},{$rename:{'corp.address':'corp.地址'}})
原创文章,转载请注明出处
原文连接:http://30daydo.com/article/521
mongodb motor 异步操作比同步操作的时间要慢?
数据库 • 量化投机者 回复了问题 • 2 人关注 • 1 个回复 • 5418 次浏览 • 2019-08-03 09:01
mongodb find得到的数据顺序每次都是一样的
数据库 • 李魔佛 发表了文章 • 0 个评论 • 3396 次浏览 • 2019-07-26 09:00
使用pymongo中的find_one_and_update出错:需要分片键
数据库 • 李魔佛 发表了文章 • 0 个评论 • 5830 次浏览 • 2019-06-10 17:13
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Query for sharded findAndModify must contain the shard key
2019-06-10 16:14:32 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-10 16:14:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
需要在查询语句中把分片键也添加进去。
因为findOneModify只会找一个记录,但是到底在哪个分片的记录呢? 因为不确定,所以才需要把shard加上去。
参考官方:
Targeted Operations vs. Broadcast Operations
Generally, the fastest queries in a sharded environment are those that mongos route to a single shard, using the shard key and the cluster meta data from the config server. These targeted operations use the shard key value to locate the shard or subset of shards that satisfy the query document.
For queries that don’t include the shard key, mongos must query all shards, wait for their responses and then return the result to the application. These “scatter/gather” queries can be long running operations.
Broadcast Operations
mongos instances broadcast queries to all shards for the collection unless the mongos can determine which shard or subset of shards stores this data.
After the mongos receives responses from all shards, it merges the data and returns the result document. The performance of a broadcast operation depends on the overall load of the cluster, as well as variables like network latency, individual shard load, and number of documents returned per shard. Whenever possible, favor operations that result in targeted operation over those that result in a broadcast operation.
Multi-update operations are always broadcast operations.
The updateMany() and deleteMany() methods are broadcast operations, unless the query document specifies the shard key in full.
Targeted Operations
mongos can route queries that include the shard key or the prefix of a compound shard key a specific shard or set of shards. mongos uses the shard key value to locate the chunk whose range includes the shard key value and directs the query at the shard containing that chunk.
For example, if the shard key is:
copy
{ a: 1, b: 1, c: 1 }
The mongos program can route queries that include the full shard key or either of the following shard key prefixes at a specific shard or set of shards:
copy
{ a: 1 }
{ a: 1, b: 1 }
All insertOne() operations target to one shard. Each document in the insertMany() array targets to a single shard, but there is no guarantee all documents in the array insert into a single shard.
All updateOne(), replaceOne() and deleteOne() operations must include the shard key or _id in the query document. MongoDB returns an error if these methods are used without the shard key or _id.
Depending on the distribution of data in the cluster and the selectivity of the query, mongos may still perform a broadcast operation to fulfill these queries.
Index Use
If the query does not include the shard key, the mongos must send the query to all shards as a “scatter/gather” operation. Each shard will, in turn, use either the shard key index or another more efficient index to fulfill the query.
If the query includes multiple sub-expressions that reference the fields indexed by the shard key and the secondary index, the mongos can route the queries to a specific shard and the shard will use the index that will allow it to fulfill most efficiently.
Sharded Cluster Security
Use Internal Authentication to enforce intra-cluster security and prevent unauthorized cluster components from accessing the cluster. You must start each mongod or mongos in the cluster with the appropriate security settings in order to enforce internal authentication.
See Deploy Sharded Cluster with Keyfile Access Control for a tutorial on deploying a secured shardedcluster.
Cluster Users
Sharded clusters support Role-Based Access Control (RBAC) for restricting unauthorized access to cluster data and operations. You must start each mongod in the cluster, including the config servers, with the --auth option in order to enforce RBAC. Alternatively, enforcing Internal Authentication for inter-cluster security also enables user access controls via RBAC.
With RBAC enforced, clients must specify a --username, --password, and --authenticationDatabase when connecting to the mongos in order to access cluster resources.
Each cluster has its own cluster users. These users cannot be used to access individual shards.
See Enable Access Control for a tutorial on enabling adding users to an RBAC-enabled MongoDB deployment. 查看全部
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\helpers.py", line 155, in _check_command_response
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Query for sharded findAndModify must contain the shard key
2019-06-10 16:14:32 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-10 16:14:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
需要在查询语句中把分片键也添加进去。
因为findOneModify只会找一个记录,但是到底在哪个分片的记录呢? 因为不确定,所以才需要把shard加上去。
参考官方:
Targeted Operations vs. Broadcast Operations
Generally, the fastest queries in a sharded environment are those that mongos route to a single shard, using the shard key and the cluster meta data from the config server. These targeted operations use the shard key value to locate the shard or subset of shards that satisfy the query document.
For queries that don’t include the shard key, mongos must query all shards, wait for their responses and then return the result to the application. These “scatter/gather” queries can be long running operations.
Broadcast Operations
mongos instances broadcast queries to all shards for the collection unless the mongos can determine which shard or subset of shards stores this data.
After the mongos receives responses from all shards, it merges the data and returns the result document. The performance of a broadcast operation depends on the overall load of the cluster, as well as variables like network latency, individual shard load, and number of documents returned per shard. Whenever possible, favor operations that result in targeted operation over those that result in a broadcast operation.
Multi-update operations are always broadcast operations.
The updateMany() and deleteMany() methods are broadcast operations, unless the query document specifies the shard key in full.
Targeted Operations
mongos can route queries that include the shard key or the prefix of a compound shard key a specific shard or set of shards. mongos uses the shard key value to locate the chunk whose range includes the shard key value and directs the query at the shard containing that chunk.
For example, if the shard key is:
copy
{ a: 1, b: 1, c: 1 }
The mongos program can route queries that include the full shard key or either of the following shard key prefixes at a specific shard or set of shards:
copy
{ a: 1 }
{ a: 1, b: 1 }
All insertOne() operations target to one shard. Each document in the insertMany() array targets to a single shard, but there is no guarantee all documents in the array insert into a single shard.
All updateOne(), replaceOne() and deleteOne() operations must include the shard key or _id in the query document. MongoDB returns an error if these methods are used without the shard key or _id.
Depending on the distribution of data in the cluster and the selectivity of the query, mongos may still perform a broadcast operation to fulfill these queries.
Index Use
If the query does not include the shard key, the mongos must send the query to all shards as a “scatter/gather” operation. Each shard will, in turn, use either the shard key index or another more efficient index to fulfill the query.
If the query includes multiple sub-expressions that reference the fields indexed by the shard key and the secondary index, the mongos can route the queries to a specific shard and the shard will use the index that will allow it to fulfill most efficiently.
Sharded Cluster Security
Use Internal Authentication to enforce intra-cluster security and prevent unauthorized cluster components from accessing the cluster. You must start each mongod or mongos in the cluster with the appropriate security settings in order to enforce internal authentication.
See Deploy Sharded Cluster with Keyfile Access Control for a tutorial on deploying a secured shardedcluster.
Cluster Users
Sharded clusters support Role-Based Access Control (RBAC) for restricting unauthorized access to cluster data and operations. You must start each mongod in the cluster, including the config servers, with the --auth option in order to enforce RBAC. Alternatively, enforcing Internal Authentication for inter-cluster security also enables user access controls via RBAC.
With RBAC enforced, clients must specify a --username, --password, and --authenticationDatabase when connecting to the mongos in order to access cluster resources.
Each cluster has its own cluster users. These users cannot be used to access individual shards.
See Enable Access Control for a tutorial on enabling adding users to an RBAC-enabled MongoDB deployment.
Warning: unable to run listCollections, attempting to approximate collection
数据库 • 李魔佛 发表了文章 • 0 个评论 • 19717 次浏览 • 2019-06-07 17:35
Warning: unable to run listCollections, attempting to approximate collection names by parsing connectionStatus
那是因为设置了密码,但是没有进行认证导致的错误。这个错误为啥不直接说明原因呢。汗
直接: db.auth('admin','密码')
认证成功返回1, 然后重新执行show tables就可以看到所有的表了。 查看全部
Warning: unable to run listCollections, attempting to approximate collection names by parsing connectionStatus
那是因为设置了密码,但是没有进行认证导致的错误。这个错误为啥不直接说明原因呢。汗
直接: db.auth('admin','密码')
认证成功返回1, 然后重新执行show tables就可以看到所有的表了。
python连接mongodb集群 cluster
数据库 • 李魔佛 发表了文章 • 0 个评论 • 4831 次浏览 • 2019-06-03 15:55
连接方法如下:import pymongo
db = pymongo.MongoClient('mongodb://10.18.6.46,10.18.6.26,10.18.6.102')上面默认的端口do都是27017,如果是其他端口,需要这样修改:db = pymongo.MongoClient('mongodb://10.18.6.46:8888,10.18.6.26:9999,10.18.6.102:7777')
然后就可以正常读写数据库:
读:coll=db['testdb']['testcollection'].find()
for i in coll:
print(i)输出内容:{'_id': ObjectId('5cf4c7981ee9edff72e5c503'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c504'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c505'), 'username': 'hello'}
{'_id': ObjectId('5cf4c79a1ee9edff72e5c506'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7b21ee9edff72e5c507'), 'username': 'hello world'}
写:collection = db['testdb']['testcollection']
for i in range(10):
collection.insert({'username':'huston{}'.format(i)})
原创文章,转载请注明出处:
http://30daydo.com/article/494
查看全部
连接方法如下:
import pymongo上面默认的端口do都是27017,如果是其他端口,需要这样修改:
db = pymongo.MongoClient('mongodb://10.18.6.46,10.18.6.26,10.18.6.102')
db = pymongo.MongoClient('mongodb://10.18.6.46:8888,10.18.6.26:9999,10.18.6.102:7777')
然后就可以正常读写数据库:
读:
coll=db['testdb']['testcollection'].find()输出内容:
for i in coll:
print(i)
{'_id': ObjectId('5cf4c7981ee9edff72e5c503'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c504'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c505'), 'username': 'hello'}
{'_id': ObjectId('5cf4c79a1ee9edff72e5c506'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7b21ee9edff72e5c507'), 'username': 'hello world'}
写:
collection = db['testdb']['testcollection']
for i in range(10):
collection.insert({'username':'huston{}'.format(i)})
原创文章,转载请注明出处:
http://30daydo.com/article/494
【python】pymongo find_one_and_update的用法
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 14528 次浏览 • 2019-04-04 11:31
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)
转换成python pymongo是这样的:>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)
上面的语句的意思是:
找到_id 为userid的值得文档,然后把该文档中的seq的值+1,然后返回seq的数据,不显示_id列
最后返回的数据是这样的:
{'seq': 2}
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该函数是线程安全的。意思就是在多个线程中操作,不会对同一条数据进行获取修改。
也就是该操作是原子操作。
ReturnDocument 引用的库
class pymongo.collection.ReturnDocument
在开头 from pymongo.collection import ReturnDocument
原创文章
转载请注明出处:
http://30daydo.com/article/445 查看全部
db.collection.findOneAndUpdate(
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)
转换成python pymongo是这样的:
>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)
上面的语句的意思是:
找到_id 为userid的值得文档,然后把该文档中的seq的值+1,然后返回seq的数据,不显示_id列
最后返回的数据是这样的:
{'seq': 2}
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该函数是线程安全的。意思就是在多个线程中操作,不会对同一条数据进行获取修改。
也就是该操作是原子操作。
ReturnDocument 引用的库
class pymongo.collection.ReturnDocument
在开头 from pymongo.collection import ReturnDocument
原创文章
转载请注明出处:
http://30daydo.com/article/445
python 代码获取mongodb数据库下所有的collection 文档名字
python • 李魔佛 发表了文章 • 0 个评论 • 5606 次浏览 • 2018-11-27 11:41
db['db_pledge'].list_collection_names()
db['db_pledge'].collection_names()
db['db_pledge'].list_collection_names()
python爬虫集思录所有用户的帖子 scrapy写入mongodb数据库
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 6660 次浏览 • 2018-09-02 21:52
项目采用scrapy的框架,数据写入到mongodb的数据库。 整个站点爬下来大概用了半小时,数据有12w条。
项目中的主要代码如下:
主spider# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request, FormRequest
from jsl.items import JslItem
from jsl import config
import logging
class AllcontentSpider(scrapy.Spider):
name = 'allcontent'
headers = {
'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.jisilu.cn/login/',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
}
def start_requests(self):
login_url = 'https://www.jisilu.cn/login/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
yield Request(url=login_url, headers=headers, callback=self.login,dont_filter=True)
def login(self, response):
url = 'https://www.jisilu.cn/account/ajax/login_process/'
data = {
'return_url': 'https://www.jisilu.cn/',
'user_name': config.username,
'password': config.password,
'net_auto_login': '1',
'_post_type': 'ajax',
}
yield FormRequest(
url=url,
headers=self.headers,
formdata=data,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
for i in range(1,3726):
focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(i)
yield Request(url=focus_url, headers=self.headers, callback=self.parse_page,dont_filter=True)
def parse_page(self, response):
nodes = response.xpath('//div[@class="aw-question-list"]/div')
for node in nodes:
each_url=node.xpath('.//h4/a/@href').extract_first()
yield Request(url=each_url,headers=self.headers,callback=self.parse_item,dont_filter=True)
def parse_item(self,response):
item = JslItem()
title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
try:
content = ret[0].strip()
except:
content = None
createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()
resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
url = response.url
item['title'] = title.strip()
item['content'] = content
try:
item['resp_no']=int(resp_no)
except Exception as e:
logging.warning('e')
item['resp_no']=None
item['createTime'] = createTime
item['url'] = url.strip()
resp =
for index,reply in enumerate(response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
rep_content = reply.xpath(
'.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
# print rep_content
agree=reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
resp.append({replay_user.strip()+'_{}'.format(index): [int(agree),rep_content.strip()]})
item['resp'] = resp
yield item
login函数是模拟登录集思录,通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。
然后pipeline里面写入mongodb。import pymongo
from collections import OrderedDict
class JslPipeline(object):
def __init__(self):
self.db = pymongo.MongoClient(host='10.18.6.1',port=27017)
# self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132
self.collection = self.db['db_parker']['jsl']
def process_item(self, item, spider):
self.collection.insert(OrderedDict(item))
return item
抓取到的数据入库mongodb:
点击查看大图
原创文章
转载请注明出处:http://30daydo.com/publish/article/351
查看全部
项目采用scrapy的框架,数据写入到mongodb的数据库。 整个站点爬下来大概用了半小时,数据有12w条。
项目中的主要代码如下:
主spider
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request, FormRequest
from jsl.items import JslItem
from jsl import config
import logging
class AllcontentSpider(scrapy.Spider):
name = 'allcontent'
headers = {
'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.jisilu.cn/login/',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
}
def start_requests(self):
login_url = 'https://www.jisilu.cn/login/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
yield Request(url=login_url, headers=headers, callback=self.login,dont_filter=True)
def login(self, response):
url = 'https://www.jisilu.cn/account/ajax/login_process/'
data = {
'return_url': 'https://www.jisilu.cn/',
'user_name': config.username,
'password': config.password,
'net_auto_login': '1',
'_post_type': 'ajax',
}
yield FormRequest(
url=url,
headers=self.headers,
formdata=data,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
for i in range(1,3726):
focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(i)
yield Request(url=focus_url, headers=self.headers, callback=self.parse_page,dont_filter=True)
def parse_page(self, response):
nodes = response.xpath('//div[@class="aw-question-list"]/div')
for node in nodes:
each_url=node.xpath('.//h4/a/@href').extract_first()
yield Request(url=each_url,headers=self.headers,callback=self.parse_item,dont_filter=True)
def parse_item(self,response):
item = JslItem()
title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
try:
content = ret[0].strip()
except:
content = None
createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()
resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
url = response.url
item['title'] = title.strip()
item['content'] = content
try:
item['resp_no']=int(resp_no)
except Exception as e:
logging.warning('e')
item['resp_no']=None
item['createTime'] = createTime
item['url'] = url.strip()
resp =
for index,reply in enumerate(response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
rep_content = reply.xpath(
'.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
# print rep_content
agree=reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
resp.append({replay_user.strip()+'_{}'.format(index): [int(agree),rep_content.strip()]})
item['resp'] = resp
yield item
login函数是模拟登录集思录,通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。
然后pipeline里面写入mongodb。
import pymongo
from collections import OrderedDict
class JslPipeline(object):
def __init__(self):
self.db = pymongo.MongoClient(host='10.18.6.1',port=27017)
# self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132
self.collection = self.db['db_parker']['jsl']
def process_item(self, item, spider):
self.collection.insert(OrderedDict(item))
return item
抓取到的数据入库mongodb:
点击查看大图
原创文章
转载请注明出处:http://30daydo.com/publish/article/351
docker里运行mongodb,保存的数据在外部使用mongoexport不能导出:提示错误Unrecognized field 'snapshot'
python • 李魔佛 发表了文章 • 0 个评论 • 10747 次浏览 • 2018-08-31 14:21
很无语。 目前还找不到原因。
docker里面运行的mongodb, mongodb的数据挂载到宿主机。 开放了27017端口。
在windows下使用mongoexport工具导出数据:
错误信息:C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock
/c company /o company.json /type json
2018-08-31T14:13:47.841+0800 connected to: 10.18.6.102
2018-08-31T14:13:47.854+0800 Failed: Failed to parse: { find: "company", filt
er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr
eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.
C:\Program Files\MongoDB\Server\3.4\bin>
目前这个问题已经解决:
需要进去docker容器里面,然后在容器里面操作,把数据导出来到挂载的目录下,然后可以直接获取到数据了。 查看全部
很无语。 目前还找不到原因。
docker里面运行的mongodb, mongodb的数据挂载到宿主机。 开放了27017端口。
在windows下使用mongoexport工具导出数据:
错误信息:
C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock
/c company /o company.json /type json
2018-08-31T14:13:47.841+0800 connected to: 10.18.6.102
2018-08-31T14:13:47.854+0800 Failed: Failed to parse: { find: "company", filt
er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr
eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.
C:\Program Files\MongoDB\Server\3.4\bin>
目前这个问题已经解决:
需要进去docker容器里面,然后在容器里面操作,把数据导出来到挂载的目录下,然后可以直接获取到数据了。
python mongodb大数据(>3GB)转移Mysql数据库
python • 李魔佛 发表了文章 • 0 个评论 • 5332 次浏览 • 2018-08-20 15:44
于是使用了分段遍历的方法.# -*-coding=utf-8-*-
import pandas as pd
import json
import pymongo
from sqlalchemy import create_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('xxx')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')
def classic_method():
temp =
start = 0
# 数据太大还是会爆内存,或者游标丢失
for i in doc.find().batch_size(500):
start += 1
del i['_id']
temp.append(i)
print(start)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')
def chunksize_move():
block = 10000
total = doc.find({}).count()
iter_number = total // block
for i in range(iter_number + 1):
small_part = doc.find({}).limit(block).skip(i * block)
list_data =
for item in small_part:
del item['_id']
del item['crawl_time']
item['poiid'] = int(item['poiid'])
for k, v in item.items():
if isinstance(v, dict) or isinstance(v, list):
item[k] = json.dumps(v, ensure_ascii=False)
list_data.append(item)
df = pd.DataFrame(list_data)
df = df.set_index('poiid', drop=True)
try:
df.to_sql('meituan', con=engine, if_exists='append')
print('to sql {}'.format(i))
except Exception as e:
print(e)
chunksize_move()
速度比一次批量的要快不少. 查看全部
for i in doc.find({})进行逐行遍历的话,游标就会超时,而且越到后面速度越慢.
于是使用了分段遍历的方法.
# -*-coding=utf-8-*-
import pandas as pd
import json
import pymongo
from sqlalchemy import create_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('xxx')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')
def classic_method():
temp =
start = 0
# 数据太大还是会爆内存,或者游标丢失
for i in doc.find().batch_size(500):
start += 1
del i['_id']
temp.append(i)
print(start)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')
def chunksize_move():
block = 10000
total = doc.find({}).count()
iter_number = total // block
for i in range(iter_number + 1):
small_part = doc.find({}).limit(block).skip(i * block)
list_data =
for item in small_part:
del item['_id']
del item['crawl_time']
item['poiid'] = int(item['poiid'])
for k, v in item.items():
if isinstance(v, dict) or isinstance(v, list):
item[k] = json.dumps(v, ensure_ascii=False)
list_data.append(item)
df = pd.DataFrame(list_data)
df = df.set_index('poiid', drop=True)
try:
df.to_sql('meituan', con=engine, if_exists='append')
print('to sql {}'.format(i))
except Exception as e:
print(e)
chunksize_move()
速度比一次批量的要快不少.
python 把mongodb的数据迁移到mysql
python • 李魔佛 发表了文章 • 0 个评论 • 4842 次浏览 • 2018-08-20 11:02
import pymongo
from setting import get_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp=[]
for i in doc.find({}):
del i['_id']
temp.append(i)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid',drop=True)
df.to_sql('meituan',con=engine,if_exists='replace')
print('done')
居然CPU飙到了90%
查看全部
import pymongo
from setting import get_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp=[]
for i in doc.find({}):
del i['_id']
temp.append(i)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid',drop=True)
df.to_sql('meituan',con=engine,if_exists='replace')
print('done')
居然CPU飙到了90%
mongodb sort: Executor error during find command: OperationFailed: Sort operation used more than
网络 • 李魔佛 发表了文章 • 0 个评论 • 7903 次浏览 • 2018-07-09 10:31
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command: OperationFailed: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
}
使用limit函数限制其输出就可以了:
db.getCollection('老布').find({}).sort({'created_at':-1}).limit(1000) 查看全部
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command: OperationFailed: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
}
使用limit函数限制其输出就可以了:
db.getCollection('老布').find({}).sort({'created_at':-1}).limit(1000)
mongo服务器因为mongod.lock 被锁定无法正常运行
网络安全 • 李魔佛 发表了文章 • 0 个评论 • 8494 次浏览 • 2018-05-20 18:33
Sun May 20 18:26:04.630 [initandlisten] MongoDB starting : pid=2343 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 20 18:26:04.631 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 20 18:26:04.631 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] db version v2.4.10
Sun May 20 18:26:04.632 [initandlisten] git version: nogitversion
Sun May 20 18:26:04.632 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 20 18:26:04.632 [initandlisten] allocator: system
Sun May 20 18:26:04.632 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 20 18:26:05.956 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 20 18:26:05.957 [initandlisten] recover : no journal files present, no recovery needed
Sun May 20 18:26:06.023 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 20 18:26:06.023 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x7660e294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x7660e294]
Sun May 20 18:26:06.035 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 20 18:26:06.035 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 20 18:26:06.036 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 20 18:26:06.036 dbexit:
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close listening sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to flush diaglog...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 20 18:26:06.036 [initandlisten] shutdown: lock for final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: closing all files...
Sun May 20 18:26:06.037 [initandlisten] closeAllFiles() finished
Sun May 20 18:26:06.037 [initandlisten] journalCleanup...
Sun May 20 18:26:06.037 [initandlisten] removeJournalFiles
Sun May 20 18:26:06.050 [initandlisten] shutdown: removing fs lock...
Sun May 20 18:26:06.050 dbexit: really exiting now
应该是之前没有正常被关闭,导致文件本锁住了。
解决办法:
1. 先把数据文件备份, --dbpath 的路径整个备份一下, 不然接下来的操作误操作了数据就丢失了
2. 运行修复命令:
mongod --dbpath /data/db --repair
替换上面的db为您自己的本地路径 查看全部
Sun May 20 18:26:04.630 [initandlisten] MongoDB starting : pid=2343 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 20 18:26:04.631 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 20 18:26:04.631 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] db version v2.4.10
Sun May 20 18:26:04.632 [initandlisten] git version: nogitversion
Sun May 20 18:26:04.632 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 20 18:26:04.632 [initandlisten] allocator: system
Sun May 20 18:26:04.632 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 20 18:26:05.956 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 20 18:26:05.957 [initandlisten] recover : no journal files present, no recovery needed
Sun May 20 18:26:06.023 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 20 18:26:06.023 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x7660e294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x7660e294]
Sun May 20 18:26:06.035 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 20 18:26:06.035 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 20 18:26:06.036 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 20 18:26:06.036 dbexit:
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close listening sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to flush diaglog...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 20 18:26:06.036 [initandlisten] shutdown: lock for final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: closing all files...
Sun May 20 18:26:06.037 [initandlisten] closeAllFiles() finished
Sun May 20 18:26:06.037 [initandlisten] journalCleanup...
Sun May 20 18:26:06.037 [initandlisten] removeJournalFiles
Sun May 20 18:26:06.050 [initandlisten] shutdown: removing fs lock...
Sun May 20 18:26:06.050 dbexit: really exiting now
应该是之前没有正常被关闭,导致文件本锁住了。
解决办法:
1. 先把数据文件备份, --dbpath 的路径整个备份一下, 不然接下来的操作误操作了数据就丢失了
2. 运行修复命令:
mongod --dbpath /data/db --repair
替换上面的db为您自己的本地路径
mongo服务启动失败: ERROR: mmap private failed with out of memory
树莓派 • 李魔佛 发表了文章 • 0 个评论 • 4755 次浏览 • 2018-05-13 12:23
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --journal --logpath /home/pi/mongo/log.txt
突然今天发现mongo的服务连不上,看log发现mongo在启动后马上关闭了,提示的错误是在加载一个大的数据文件的时候提示内存不足(坑爹的,树莓派自身内存才1GB,无法扩容)。
错误日志:
Sun May 13 12:08:11.185 [initandlisten] MongoDB starting : pid=1929 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 13 12:08:11.186 [initandlisten]
Sun May 13 12:08:11.186 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 13 12:08:11.186 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 13 12:08:11.186 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 13 12:08:11.187 [initandlisten]
Sun May 13 12:08:11.187 [initandlisten] db version v2.4.10
Sun May 13 12:08:11.187 [initandlisten] git version: nogitversion
Sun May 13 12:08:11.187 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 13 12:08:11.187 [initandlisten] allocator: system
Sun May 13 12:08:11.187 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", fork: true, journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 13 12:08:11.198 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 13 12:08:11.198 [initandlisten] recover : no journal files present, no recovery needed
Sun May 13 12:08:11.238 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 13 12:08:11.239 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x76556294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x76556294]
Sun May 13 12:08:11.250 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 13 12:08:11.251 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 13 12:08:11.251 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 13 12:08:11.251 dbexit:
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close listening sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to flush diaglog...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 13 12:08:11.252 [initandlisten] shutdown: lock for final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: closing all files...
Sun May 13 12:08:11.252 [initandlisten] closeAllFiles() finished
Sun May 13 12:08:11.252 [initandlisten] journalCleanup...
Sun May 13 12:08:11.253 [initandlisten] removeJournalFiles
Sun May 13 12:08:11.263 [initandlisten] shutdown: removing fs lock...
Sun May 13 12:08:11.264 dbexit: really exiting now
看了下mongod的用法,尝试把参数 --journal,去掉,重新运行,然后就可以了。
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --logpath /home/pi/mongo/log.txt 查看全部
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --journal --logpath /home/pi/mongo/log.txt
突然今天发现mongo的服务连不上,看log发现mongo在启动后马上关闭了,提示的错误是在加载一个大的数据文件的时候提示内存不足(坑爹的,树莓派自身内存才1GB,无法扩容)。
错误日志:
Sun May 13 12:08:11.185 [initandlisten] MongoDB starting : pid=1929 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 13 12:08:11.186 [initandlisten]
Sun May 13 12:08:11.186 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 13 12:08:11.186 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 13 12:08:11.186 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 13 12:08:11.187 [initandlisten]
Sun May 13 12:08:11.187 [initandlisten] db version v2.4.10
Sun May 13 12:08:11.187 [initandlisten] git version: nogitversion
Sun May 13 12:08:11.187 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 13 12:08:11.187 [initandlisten] allocator: system
Sun May 13 12:08:11.187 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", fork: true, journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 13 12:08:11.198 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 13 12:08:11.198 [initandlisten] recover : no journal files present, no recovery needed
Sun May 13 12:08:11.238 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 13 12:08:11.239 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x76556294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x76556294]
Sun May 13 12:08:11.250 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 13 12:08:11.251 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 13 12:08:11.251 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 13 12:08:11.251 dbexit:
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close listening sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to flush diaglog...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 13 12:08:11.252 [initandlisten] shutdown: lock for final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: closing all files...
Sun May 13 12:08:11.252 [initandlisten] closeAllFiles() finished
Sun May 13 12:08:11.252 [initandlisten] journalCleanup...
Sun May 13 12:08:11.253 [initandlisten] removeJournalFiles
Sun May 13 12:08:11.263 [initandlisten] shutdown: removing fs lock...
Sun May 13 12:08:11.264 dbexit: really exiting now
看了下mongod的用法,尝试把参数 --journal,去掉,重新运行,然后就可以了。
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --logpath /home/pi/mongo/log.txt
树莓派安装mongodb服务器
网络 • 李魔佛 发表了文章 • 0 个评论 • 7986 次浏览 • 2017-12-18 16:57
不过安装的是32bit 的mongodb,数据库的大小会被限制在2GB。
树莓派启动mongodb
修改/etc/mongodb.config,
把里面的bind=127.0.0.1 注释掉,前面加一个#即可,因为这样其他主机也可以访问这台树莓派的mongodb服务器。
修改dbpath和dblog的路径,因为默认的路径你需要root权限
然后运行 mongod --config /etc/mongodb.config , 然后远程使用mongo ip地址就可以远程连接了。 查看全部
不过安装的是32bit 的mongodb,数据库的大小会被限制在2GB。
树莓派启动mongodb
修改/etc/mongodb.config,
把里面的bind=127.0.0.1 注释掉,前面加一个#即可,因为这样其他主机也可以访问这台树莓派的mongodb服务器。
修改dbpath和dblog的路径,因为默认的路径你需要root权限
然后运行 mongod --config /etc/mongodb.config , 然后远程使用mongo ip地址就可以远程连接了。
使用官网下载的mongodb,如何设置远程连接mongodb服务器
网络安全 • 李魔佛 发表了文章 • 0 个评论 • 5608 次浏览 • 2017-12-17 23:10
解压后在mongo的bin目录下运行 mongod --dbpath ~/mongo/db
可以看到mongodb服务被正常启动了。
在局域网其他电脑上使用mongodb客户端尝试连接这个mongo服务器,发现无法连接上。
因为官网下载的mongo问价解压后并没有mongo.conf配置文件。
在本机运行命令: mongo
可以看到输出:
Server has startup warnings:
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten]
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core ... ystem
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: This server is bound to localhost.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Remote systems will be unable to connect to this server.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Start the server with --bind_ip <address> to specify which IP
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** addresses it should serve responses from, or with --bind_ip_all to
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** bind to all interfaces. If this behavior is desired, start the
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** server with --bind_ip 127.0.0.1 to disable this warning.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/defrag is 'always'.
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** We suggest setting it to 'never'
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
从上面的信息可以看到,如果需要远程的机子连接到本机,需要添加一个选项: --bind_ip_all
运行下面命令后:
mongod --dbpath ~/mongo/db --bind_ip_all
远程的机子就能够连上mongo服务器了。
原文地址:http://30daydo.com/article/247
转载请注明出处 查看全部
解压后在mongo的bin目录下运行 mongod --dbpath ~/mongo/db
可以看到mongodb服务被正常启动了。
在局域网其他电脑上使用mongodb客户端尝试连接这个mongo服务器,发现无法连接上。
因为官网下载的mongo问价解压后并没有mongo.conf配置文件。
在本机运行命令: mongo
可以看到输出:
Server has startup warnings:
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten]
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core ... ystem
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: This server is bound to localhost.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Remote systems will be unable to connect to this server.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Start the server with --bind_ip <address> to specify which IP
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** addresses it should serve responses from, or with --bind_ip_all to
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** bind to all interfaces. If this behavior is desired, start the
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** server with --bind_ip 127.0.0.1 to disable this warning.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/defrag is 'always'.
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** We suggest setting it to 'never'
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
从上面的信息可以看到,如果需要远程的机子连接到本机,需要添加一个选项: --bind_ip_all
运行下面命令后:
mongod --dbpath ~/mongo/db --bind_ip_all
远程的机子就能够连上mongo服务器了。
原文地址:http://30daydo.com/article/247
转载请注明出处
mongodb中$sum:1 后面的1是什么意思
python • 李魔佛 发表了文章 • 0 个评论 • 10685 次浏览 • 2017-09-05 23:32
{
"_id" : "GuqXmAkkARqhBDqhy",
"beatmapset_id" : "342537",
"version" : "MX",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "3.5552737712860107"
}
{
"_id" : "oHLT7KqsB7bztBGvu",
"beatmapset_id" : "342537",
"version" : "HD",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "2.7515676021575928"
}
{
"_id" : "GbotZfrPEwW69FkGD",
"beatmapset_id" : "342537",
"version" : "NM",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "0"
}
然后运行以下的命令:
db.getCollection('dup_case').aggregate(
[
{$group:{
_id:{diff_approach:'$diff_approach'},
count:{$sum:2}
}},
{$match:{count:{$gt:1}}}
]
)
返回的count是6
所以
$sum:1 的含义:
如果前面的情况出现一次,就加1, 如果后面$sum:2 那么每次前面条件满足一次就加2
查看全部
{
"_id" : "GuqXmAkkARqhBDqhy",
"beatmapset_id" : "342537",
"version" : "MX",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "3.5552737712860107"
}
{
"_id" : "oHLT7KqsB7bztBGvu",
"beatmapset_id" : "342537",
"version" : "HD",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "2.7515676021575928"
}
{
"_id" : "GbotZfrPEwW69FkGD",
"beatmapset_id" : "342537",
"version" : "NM",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "0"
}
然后运行以下的命令:
db.getCollection('dup_case').aggregate(
[
{$group:{
_id:{diff_approach:'$diff_approach'},
count:{$sum:2}
}},
{$match:{count:{$gt:1}}}
]
)
返回的count是6
所以
$sum:1 的含义:
如果前面的情况出现一次,就加1, 如果后面$sum:2 那么每次前面条件满足一次就加2
mongodb python同步两个数据库数据
python • 李魔佛 发表了文章 • 0 个评论 • 2143 次浏览 • 2022-04-07 02:44
所以自己动手写代码完成:
# -*- coding: utf-8 -*-
# @Time : 2022/4/6 4:41
# @File : database_migrate.py
# @Author : Rocky C@www.30daydo.com
import time
from loguru import logger
import pymongo
ignore_db = ['admin', 'config', 'local',
] # 忽略更新的库
ignore_col = [('db_stock','dfcf_list_full')]
logger.add('mongo.log')
# 数据库同步
def get_client(user, password, host, port):
connect_uri = f'mongodb://{user}:{password}@{host}:{port}'
client = pymongo.MongoClient(connect_uri)
return client
def origin():
return get_client('admin', 'password', '127.0.0.1', '27017')
def target():
return get_client('root', 'password', '127.0.0.1', '27017')
def transfer():
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
if (db,col) in ignore_col:
continue
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
# time.sleep(0.5)
def get_item(client, db_name, col):
return client[db_name][col].find()
def insert_item(client, db_name, col, data):
batch = 1000
count = len(data)//batch + 1
for i in range(count):
item = data[i*batch:(i+1)*batch]
if len(item)==0:
continue
try:
client[db_name][col].insert_many(item)
except Exception as e:
logger.error(e)
logger.error(f'{db_name} {col} 插入出错')
def get_db_name(client):
db_name = client.list_database_names()
dbs = []
for db in db_name:
if db not in ignore_db:
dbs.append(db)
return dbs
def delete_col(client,db,col):
try:
client[db][col].delete_many({})
except Exception as e:
logger.error(e)
logger.error(db)
logger.error(col)
return False
else:
return True
def server_compare():
'''
比较2个数据库是否相同,只是单纯比较条数
'''
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
origin_count = origin_client[db][col].count_documents({})
target_count = target_client[db][col].count_documents({})
if origin_count!=target_count:
logger.info(f'collection {db} {col}有区别')
#
if delete_col(target_client,db,col):
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
time.sleep(1)
def get_collection_name(client, db_name):
collection_names = client[db_name].list_collection_names(session=None)
return collection_names
def main():
server_compare()
if __name__ == '__main__':
main()
原理就是不断迭代,不同的数据库,里面的不同的collection。
对于同名collection,通过条数是否一致,来决定是否要把原数据复制过来。
保存上面文件为main.py
执行 python main.py
就可以进行数据同步工作啦。 查看全部
所以自己动手写代码完成:
# -*- coding: utf-8 -*-
# @Time : 2022/4/6 4:41
# @File : database_migrate.py
# @Author : Rocky C@www.30daydo.com
import time
from loguru import logger
import pymongo
ignore_db = ['admin', 'config', 'local',
] # 忽略更新的库
ignore_col = [('db_stock','dfcf_list_full')]
logger.add('mongo.log')
# 数据库同步
def get_client(user, password, host, port):
connect_uri = f'mongodb://{user}:{password}@{host}:{port}'
client = pymongo.MongoClient(connect_uri)
return client
def origin():
return get_client('admin', 'password', '127.0.0.1', '27017')
def target():
return get_client('root', 'password', '127.0.0.1', '27017')
def transfer():
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
if (db,col) in ignore_col:
continue
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
# time.sleep(0.5)
def get_item(client, db_name, col):
return client[db_name][col].find()
def insert_item(client, db_name, col, data):
batch = 1000
count = len(data)//batch + 1
for i in range(count):
item = data[i*batch:(i+1)*batch]
if len(item)==0:
continue
try:
client[db_name][col].insert_many(item)
except Exception as e:
logger.error(e)
logger.error(f'{db_name} {col} 插入出错')
def get_db_name(client):
db_name = client.list_database_names()
dbs = []
for db in db_name:
if db not in ignore_db:
dbs.append(db)
return dbs
def delete_col(client,db,col):
try:
client[db][col].delete_many({})
except Exception as e:
logger.error(e)
logger.error(db)
logger.error(col)
return False
else:
return True
def server_compare():
'''
比较2个数据库是否相同,只是单纯比较条数
'''
origin_client = origin()
target_client = target()
dbs = get_db_name(origin_client)
for db in dbs:
for col in get_collection_name(origin_client, db):
origin_count = origin_client[db][col].count_documents({})
target_count = target_client[db][col].count_documents({})
if origin_count!=target_count:
logger.info(f'collection {db} {col}有区别')
#
if delete_col(target_client,db,col):
items = []
logger.info(f'正在更新{db} {col}')
for i in get_item(origin_client, db, col):
items.append(i)
insert_item(target_client, db, col, items)
logger.info(f'更新数据库 {db} {col}')
time.sleep(1)
def get_collection_name(client, db_name):
collection_names = client[db_name].list_collection_names(session=None)
return collection_names
def main():
server_compare()
if __name__ == '__main__':
main()
原理就是不断迭代,不同的数据库,里面的不同的collection。
对于同名collection,通过条数是否一致,来决定是否要把原数据复制过来。
保存上面文件为main.py
执行 python main.py
就可以进行数据同步工作啦。
mongodb 判断列表字段不为空
数据库 • 李魔佛 发表了文章 • 0 个评论 • 8968 次浏览 • 2019-08-20 11:08
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[1,2,3,4,5]})
db.test_tab.insert({array:[1,2,3,4,5,6]})
使用以下命令判断列表不为空:
db.getCollection("example").find({array:{$exists:true,$ne:[]}}); # 字段不为0 查看全部
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[]})
db.test_tab.insert({array:[1,2,3,4,5]})
db.test_tab.insert({array:[1,2,3,4,5,6]})
使用以下命令判断列表不为空:
db.getCollection("example").find({array:{$exists:true,$ne:[]}}); # 字段不为0
mongodb 修改嵌套字典字典的字段名
数据库 • 李魔佛 发表了文章 • 0 个评论 • 6075 次浏览 • 2019-08-05 13:55
db.test.update({},{$rename:{'旧字段':'新字段'}},true,true)
比如下面的例子:db.getCollection('example').update({},{$rename:{'corp':'企业'}})
上面就是把字段corp改为企业。
如果是嵌套字段呢?
比如 corp字典是一个字典,里面是 { 'address':'USA', 'phone':'12345678' }
那么要修改里面的address为地址:
db.getCollection('example').update({},{$rename:{'corp.address':'corp.地址'}})
原创文章,转载请注明出处
原文连接:http://30daydo.com/article/521
查看全部
db.test.update({},{$rename:{'旧字段':'新字段'}},true,true)
比如下面的例子:
db.getCollection('example').update({},{$rename:{'corp':'企业'}})
上面就是把字段corp改为企业。
如果是嵌套字段呢?
比如 corp字典是一个字典,里面是 { 'address':'USA', 'phone':'12345678' }
那么要修改里面的address为地址:
db.getCollection('example').update({},{$rename:{'corp.address':'corp.地址'}})
原创文章,转载请注明出处
原文连接:http://30daydo.com/article/521
mongodb find得到的数据顺序每次都是一样的
数据库 • 李魔佛 发表了文章 • 0 个评论 • 3396 次浏览 • 2019-07-26 09:00
使用pymongo中的find_one_and_update出错:需要分片键
数据库 • 李魔佛 发表了文章 • 0 个评论 • 5830 次浏览 • 2019-06-10 17:13
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Query for sharded findAndModify must contain the shard key
2019-06-10 16:14:32 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-10 16:14:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
需要在查询语句中把分片键也添加进去。
因为findOneModify只会找一个记录,但是到底在哪个分片的记录呢? 因为不确定,所以才需要把shard加上去。
参考官方:
Targeted Operations vs. Broadcast Operations
Generally, the fastest queries in a sharded environment are those that mongos route to a single shard, using the shard key and the cluster meta data from the config server. These targeted operations use the shard key value to locate the shard or subset of shards that satisfy the query document.
For queries that don’t include the shard key, mongos must query all shards, wait for their responses and then return the result to the application. These “scatter/gather” queries can be long running operations.
Broadcast Operations
mongos instances broadcast queries to all shards for the collection unless the mongos can determine which shard or subset of shards stores this data.
After the mongos receives responses from all shards, it merges the data and returns the result document. The performance of a broadcast operation depends on the overall load of the cluster, as well as variables like network latency, individual shard load, and number of documents returned per shard. Whenever possible, favor operations that result in targeted operation over those that result in a broadcast operation.
Multi-update operations are always broadcast operations.
The updateMany() and deleteMany() methods are broadcast operations, unless the query document specifies the shard key in full.
Targeted Operations
mongos can route queries that include the shard key or the prefix of a compound shard key a specific shard or set of shards. mongos uses the shard key value to locate the chunk whose range includes the shard key value and directs the query at the shard containing that chunk.
For example, if the shard key is:
copy
{ a: 1, b: 1, c: 1 }
The mongos program can route queries that include the full shard key or either of the following shard key prefixes at a specific shard or set of shards:
copy
{ a: 1 }
{ a: 1, b: 1 }
All insertOne() operations target to one shard. Each document in the insertMany() array targets to a single shard, but there is no guarantee all documents in the array insert into a single shard.
All updateOne(), replaceOne() and deleteOne() operations must include the shard key or _id in the query document. MongoDB returns an error if these methods are used without the shard key or _id.
Depending on the distribution of data in the cluster and the selectivity of the query, mongos may still perform a broadcast operation to fulfill these queries.
Index Use
If the query does not include the shard key, the mongos must send the query to all shards as a “scatter/gather” operation. Each shard will, in turn, use either the shard key index or another more efficient index to fulfill the query.
If the query includes multiple sub-expressions that reference the fields indexed by the shard key and the secondary index, the mongos can route the queries to a specific shard and the shard will use the index that will allow it to fulfill most efficiently.
Sharded Cluster Security
Use Internal Authentication to enforce intra-cluster security and prevent unauthorized cluster components from accessing the cluster. You must start each mongod or mongos in the cluster with the appropriate security settings in order to enforce internal authentication.
See Deploy Sharded Cluster with Keyfile Access Control for a tutorial on deploying a secured shardedcluster.
Cluster Users
Sharded clusters support Role-Based Access Control (RBAC) for restricting unauthorized access to cluster data and operations. You must start each mongod in the cluster, including the config servers, with the --auth option in order to enforce RBAC. Alternatively, enforcing Internal Authentication for inter-cluster security also enables user access controls via RBAC.
With RBAC enforced, clients must specify a --username, --password, and --authenticationDatabase when connecting to the mongos in order to access cluster resources.
Each cluster has its own cluster users. These users cannot be used to access individual shards.
See Enable Access Control for a tutorial on enabling adding users to an RBAC-enabled MongoDB deployment. 查看全部
File "C:\ProgramData\Anaconda3\lib\site-packages\pymongo\helpers.py", line 155, in _check_command_response
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: Query for sharded findAndModify must contain the shard key
2019-06-10 16:14:32 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-10 16:14:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
需要在查询语句中把分片键也添加进去。
因为findOneModify只会找一个记录,但是到底在哪个分片的记录呢? 因为不确定,所以才需要把shard加上去。
参考官方:
Targeted Operations vs. Broadcast Operations
Generally, the fastest queries in a sharded environment are those that mongos route to a single shard, using the shard key and the cluster meta data from the config server. These targeted operations use the shard key value to locate the shard or subset of shards that satisfy the query document.
For queries that don’t include the shard key, mongos must query all shards, wait for their responses and then return the result to the application. These “scatter/gather” queries can be long running operations.
Broadcast Operations
mongos instances broadcast queries to all shards for the collection unless the mongos can determine which shard or subset of shards stores this data.
After the mongos receives responses from all shards, it merges the data and returns the result document. The performance of a broadcast operation depends on the overall load of the cluster, as well as variables like network latency, individual shard load, and number of documents returned per shard. Whenever possible, favor operations that result in targeted operation over those that result in a broadcast operation.
Multi-update operations are always broadcast operations.
The updateMany() and deleteMany() methods are broadcast operations, unless the query document specifies the shard key in full.
Targeted Operations
mongos can route queries that include the shard key or the prefix of a compound shard key a specific shard or set of shards. mongos uses the shard key value to locate the chunk whose range includes the shard key value and directs the query at the shard containing that chunk.
For example, if the shard key is:
copy
{ a: 1, b: 1, c: 1 }
The mongos program can route queries that include the full shard key or either of the following shard key prefixes at a specific shard or set of shards:
copy
{ a: 1 }
{ a: 1, b: 1 }
All insertOne() operations target to one shard. Each document in the insertMany() array targets to a single shard, but there is no guarantee all documents in the array insert into a single shard.
All updateOne(), replaceOne() and deleteOne() operations must include the shard key or _id in the query document. MongoDB returns an error if these methods are used without the shard key or _id.
Depending on the distribution of data in the cluster and the selectivity of the query, mongos may still perform a broadcast operation to fulfill these queries.
Index Use
If the query does not include the shard key, the mongos must send the query to all shards as a “scatter/gather” operation. Each shard will, in turn, use either the shard key index or another more efficient index to fulfill the query.
If the query includes multiple sub-expressions that reference the fields indexed by the shard key and the secondary index, the mongos can route the queries to a specific shard and the shard will use the index that will allow it to fulfill most efficiently.
Sharded Cluster Security
Use Internal Authentication to enforce intra-cluster security and prevent unauthorized cluster components from accessing the cluster. You must start each mongod or mongos in the cluster with the appropriate security settings in order to enforce internal authentication.
See Deploy Sharded Cluster with Keyfile Access Control for a tutorial on deploying a secured shardedcluster.
Cluster Users
Sharded clusters support Role-Based Access Control (RBAC) for restricting unauthorized access to cluster data and operations. You must start each mongod in the cluster, including the config servers, with the --auth option in order to enforce RBAC. Alternatively, enforcing Internal Authentication for inter-cluster security also enables user access controls via RBAC.
With RBAC enforced, clients must specify a --username, --password, and --authenticationDatabase when connecting to the mongos in order to access cluster resources.
Each cluster has its own cluster users. These users cannot be used to access individual shards.
See Enable Access Control for a tutorial on enabling adding users to an RBAC-enabled MongoDB deployment.
Warning: unable to run listCollections, attempting to approximate collection
数据库 • 李魔佛 发表了文章 • 0 个评论 • 19717 次浏览 • 2019-06-07 17:35
Warning: unable to run listCollections, attempting to approximate collection names by parsing connectionStatus
那是因为设置了密码,但是没有进行认证导致的错误。这个错误为啥不直接说明原因呢。汗
直接: db.auth('admin','密码')
认证成功返回1, 然后重新执行show tables就可以看到所有的表了。 查看全部
Warning: unable to run listCollections, attempting to approximate collection names by parsing connectionStatus
那是因为设置了密码,但是没有进行认证导致的错误。这个错误为啥不直接说明原因呢。汗
直接: db.auth('admin','密码')
认证成功返回1, 然后重新执行show tables就可以看到所有的表了。
python连接mongodb集群 cluster
数据库 • 李魔佛 发表了文章 • 0 个评论 • 4831 次浏览 • 2019-06-03 15:55
连接方法如下:import pymongo
db = pymongo.MongoClient('mongodb://10.18.6.46,10.18.6.26,10.18.6.102')上面默认的端口do都是27017,如果是其他端口,需要这样修改:db = pymongo.MongoClient('mongodb://10.18.6.46:8888,10.18.6.26:9999,10.18.6.102:7777')
然后就可以正常读写数据库:
读:coll=db['testdb']['testcollection'].find()
for i in coll:
print(i)输出内容:{'_id': ObjectId('5cf4c7981ee9edff72e5c503'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c504'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c505'), 'username': 'hello'}
{'_id': ObjectId('5cf4c79a1ee9edff72e5c506'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7b21ee9edff72e5c507'), 'username': 'hello world'}
写:collection = db['testdb']['testcollection']
for i in range(10):
collection.insert({'username':'huston{}'.format(i)})
原创文章,转载请注明出处:
http://30daydo.com/article/494
查看全部
连接方法如下:
import pymongo上面默认的端口do都是27017,如果是其他端口,需要这样修改:
db = pymongo.MongoClient('mongodb://10.18.6.46,10.18.6.26,10.18.6.102')
db = pymongo.MongoClient('mongodb://10.18.6.46:8888,10.18.6.26:9999,10.18.6.102:7777')
然后就可以正常读写数据库:
读:
coll=db['testdb']['testcollection'].find()输出内容:
for i in coll:
print(i)
{'_id': ObjectId('5cf4c7981ee9edff72e5c503'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c504'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7991ee9edff72e5c505'), 'username': 'hello'}
{'_id': ObjectId('5cf4c79a1ee9edff72e5c506'), 'username': 'hello'}
{'_id': ObjectId('5cf4c7b21ee9edff72e5c507'), 'username': 'hello world'}
写:
collection = db['testdb']['testcollection']
for i in range(10):
collection.insert({'username':'huston{}'.format(i)})
原创文章,转载请注明出处:
http://30daydo.com/article/494
【python】pymongo find_one_and_update的用法
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 14528 次浏览 • 2019-04-04 11:31
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)
转换成python pymongo是这样的:>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)
上面的语句的意思是:
找到_id 为userid的值得文档,然后把该文档中的seq的值+1,然后返回seq的数据,不显示_id列
最后返回的数据是这样的:
{'seq': 2}
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该函数是线程安全的。意思就是在多个线程中操作,不会对同一条数据进行获取修改。
也就是该操作是原子操作。
ReturnDocument 引用的库
class pymongo.collection.ReturnDocument
在开头 from pymongo.collection import ReturnDocument
原创文章
转载请注明出处:
http://30daydo.com/article/445 查看全部
db.collection.findOneAndUpdate(
<filter>,
<update>,
{
projection: <document>,
sort: <document>,
maxTimeMS: <number>,
upsert: <boolean>,
returnNewDocument: <boolean>,
collation: <document>,
arrayFilters: [ <filterdocument1>, ... ]
}
)
转换成python pymongo是这样的:
>>> db.example.find_one_and_update(
... {'_id': 'userid'},
... {'$inc': {'seq': 1}},
... projection={'seq': True, '_id': False},
... return_document=ReturnDocument.AFTER)
上面的语句的意思是:
找到_id 为userid的值得文档,然后把该文档中的seq的值+1,然后返回seq的数据,不显示_id列
最后返回的数据是这样的:
{'seq': 2}
注意
findOneAndUpdate
是获取mongo文档中第一条满足条件的数据并做修改。该函数是线程安全的。意思就是在多个线程中操作,不会对同一条数据进行获取修改。
也就是该操作是原子操作。
ReturnDocument 引用的库
class pymongo.collection.ReturnDocument
在开头 from pymongo.collection import ReturnDocument
原创文章
转载请注明出处:
http://30daydo.com/article/445
python 代码获取mongodb数据库下所有的collection 文档名字
python • 李魔佛 发表了文章 • 0 个评论 • 5606 次浏览 • 2018-11-27 11:41
db['db_pledge'].list_collection_names()
db['db_pledge'].collection_names()
db['db_pledge'].list_collection_names()
python爬虫集思录所有用户的帖子 scrapy写入mongodb数据库
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 6660 次浏览 • 2018-09-02 21:52
项目采用scrapy的框架,数据写入到mongodb的数据库。 整个站点爬下来大概用了半小时,数据有12w条。
项目中的主要代码如下:
主spider# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request, FormRequest
from jsl.items import JslItem
from jsl import config
import logging
class AllcontentSpider(scrapy.Spider):
name = 'allcontent'
headers = {
'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.jisilu.cn/login/',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
}
def start_requests(self):
login_url = 'https://www.jisilu.cn/login/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
yield Request(url=login_url, headers=headers, callback=self.login,dont_filter=True)
def login(self, response):
url = 'https://www.jisilu.cn/account/ajax/login_process/'
data = {
'return_url': 'https://www.jisilu.cn/',
'user_name': config.username,
'password': config.password,
'net_auto_login': '1',
'_post_type': 'ajax',
}
yield FormRequest(
url=url,
headers=self.headers,
formdata=data,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
for i in range(1,3726):
focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(i)
yield Request(url=focus_url, headers=self.headers, callback=self.parse_page,dont_filter=True)
def parse_page(self, response):
nodes = response.xpath('//div[@class="aw-question-list"]/div')
for node in nodes:
each_url=node.xpath('.//h4/a/@href').extract_first()
yield Request(url=each_url,headers=self.headers,callback=self.parse_item,dont_filter=True)
def parse_item(self,response):
item = JslItem()
title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
try:
content = ret[0].strip()
except:
content = None
createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()
resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
url = response.url
item['title'] = title.strip()
item['content'] = content
try:
item['resp_no']=int(resp_no)
except Exception as e:
logging.warning('e')
item['resp_no']=None
item['createTime'] = createTime
item['url'] = url.strip()
resp =
for index,reply in enumerate(response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
rep_content = reply.xpath(
'.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
# print rep_content
agree=reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
resp.append({replay_user.strip()+'_{}'.format(index): [int(agree),rep_content.strip()]})
item['resp'] = resp
yield item
login函数是模拟登录集思录,通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。
然后pipeline里面写入mongodb。import pymongo
from collections import OrderedDict
class JslPipeline(object):
def __init__(self):
self.db = pymongo.MongoClient(host='10.18.6.1',port=27017)
# self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132
self.collection = self.db['db_parker']['jsl']
def process_item(self, item, spider):
self.collection.insert(OrderedDict(item))
return item
抓取到的数据入库mongodb:
点击查看大图
原创文章
转载请注明出处:http://30daydo.com/publish/article/351
查看全部
项目采用scrapy的框架,数据写入到mongodb的数据库。 整个站点爬下来大概用了半小时,数据有12w条。
项目中的主要代码如下:
主spider
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request, FormRequest
from jsl.items import JslItem
from jsl import config
import logging
class AllcontentSpider(scrapy.Spider):
name = 'allcontent'
headers = {
'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.jisilu.cn/login/',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
}
def start_requests(self):
login_url = 'https://www.jisilu.cn/login/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}
yield Request(url=login_url, headers=headers, callback=self.login,dont_filter=True)
def login(self, response):
url = 'https://www.jisilu.cn/account/ajax/login_process/'
data = {
'return_url': 'https://www.jisilu.cn/',
'user_name': config.username,
'password': config.password,
'net_auto_login': '1',
'_post_type': 'ajax',
}
yield FormRequest(
url=url,
headers=self.headers,
formdata=data,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
for i in range(1,3726):
focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(i)
yield Request(url=focus_url, headers=self.headers, callback=self.parse_page,dont_filter=True)
def parse_page(self, response):
nodes = response.xpath('//div[@class="aw-question-list"]/div')
for node in nodes:
each_url=node.xpath('.//h4/a/@href').extract_first()
yield Request(url=each_url,headers=self.headers,callback=self.parse_item,dont_filter=True)
def parse_item(self,response):
item = JslItem()
title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)
try:
content = ret[0].strip()
except:
content = None
createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()
resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')
url = response.url
item['title'] = title.strip()
item['content'] = content
try:
item['resp_no']=int(resp_no)
except Exception as e:
logging.warning('e')
item['resp_no']=None
item['createTime'] = createTime
item['url'] = url.strip()
resp =
for index,reply in enumerate(response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
rep_content = reply.xpath(
'.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
# print rep_content
agree=reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
resp.append({replay_user.strip()+'_{}'.format(index): [int(agree),rep_content.strip()]})
item['resp'] = resp
yield item
login函数是模拟登录集思录,通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。
然后pipeline里面写入mongodb。
import pymongo
from collections import OrderedDict
class JslPipeline(object):
def __init__(self):
self.db = pymongo.MongoClient(host='10.18.6.1',port=27017)
# self.user = u'neo牛3' # 修改为指定的用户名 如 毛之川 ,然后找到用户的id,在用户也的源码哪里可以找到 比如持有封基是8132
self.collection = self.db['db_parker']['jsl']
def process_item(self, item, spider):
self.collection.insert(OrderedDict(item))
return item
抓取到的数据入库mongodb:
点击查看大图
原创文章
转载请注明出处:http://30daydo.com/publish/article/351
docker里运行mongodb,保存的数据在外部使用mongoexport不能导出:提示错误Unrecognized field 'snapshot'
python • 李魔佛 发表了文章 • 0 个评论 • 10747 次浏览 • 2018-08-31 14:21
很无语。 目前还找不到原因。
docker里面运行的mongodb, mongodb的数据挂载到宿主机。 开放了27017端口。
在windows下使用mongoexport工具导出数据:
错误信息:C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock
/c company /o company.json /type json
2018-08-31T14:13:47.841+0800 connected to: 10.18.6.102
2018-08-31T14:13:47.854+0800 Failed: Failed to parse: { find: "company", filt
er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr
eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.
C:\Program Files\MongoDB\Server\3.4\bin>
目前这个问题已经解决:
需要进去docker容器里面,然后在容器里面操作,把数据导出来到挂载的目录下,然后可以直接获取到数据了。 查看全部
很无语。 目前还找不到原因。
docker里面运行的mongodb, mongodb的数据挂载到宿主机。 开放了27017端口。
在windows下使用mongoexport工具导出数据:
错误信息:
C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock
/c company /o company.json /type json
2018-08-31T14:13:47.841+0800 connected to: 10.18.6.102
2018-08-31T14:13:47.854+0800 Failed: Failed to parse: { find: "company", filt
er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr
eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.
C:\Program Files\MongoDB\Server\3.4\bin>
目前这个问题已经解决:
需要进去docker容器里面,然后在容器里面操作,把数据导出来到挂载的目录下,然后可以直接获取到数据了。
python mongodb大数据(>3GB)转移Mysql数据库
python • 李魔佛 发表了文章 • 0 个评论 • 5332 次浏览 • 2018-08-20 15:44
于是使用了分段遍历的方法.# -*-coding=utf-8-*-
import pandas as pd
import json
import pymongo
from sqlalchemy import create_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('xxx')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')
def classic_method():
temp =
start = 0
# 数据太大还是会爆内存,或者游标丢失
for i in doc.find().batch_size(500):
start += 1
del i['_id']
temp.append(i)
print(start)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')
def chunksize_move():
block = 10000
total = doc.find({}).count()
iter_number = total // block
for i in range(iter_number + 1):
small_part = doc.find({}).limit(block).skip(i * block)
list_data =
for item in small_part:
del item['_id']
del item['crawl_time']
item['poiid'] = int(item['poiid'])
for k, v in item.items():
if isinstance(v, dict) or isinstance(v, list):
item[k] = json.dumps(v, ensure_ascii=False)
list_data.append(item)
df = pd.DataFrame(list_data)
df = df.set_index('poiid', drop=True)
try:
df.to_sql('meituan', con=engine, if_exists='append')
print('to sql {}'.format(i))
except Exception as e:
print(e)
chunksize_move()
速度比一次批量的要快不少. 查看全部
for i in doc.find({})进行逐行遍历的话,游标就会超时,而且越到后面速度越慢.
于是使用了分段遍历的方法.
# -*-coding=utf-8-*-
import pandas as pd
import json
import pymongo
from sqlalchemy import create_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('xxx')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')
def classic_method():
temp =
start = 0
# 数据太大还是会爆内存,或者游标丢失
for i in doc.find().batch_size(500):
start += 1
del i['_id']
temp.append(i)
print(start)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')
def chunksize_move():
block = 10000
total = doc.find({}).count()
iter_number = total // block
for i in range(iter_number + 1):
small_part = doc.find({}).limit(block).skip(i * block)
list_data =
for item in small_part:
del item['_id']
del item['crawl_time']
item['poiid'] = int(item['poiid'])
for k, v in item.items():
if isinstance(v, dict) or isinstance(v, list):
item[k] = json.dumps(v, ensure_ascii=False)
list_data.append(item)
df = pd.DataFrame(list_data)
df = df.set_index('poiid', drop=True)
try:
df.to_sql('meituan', con=engine, if_exists='append')
print('to sql {}'.format(i))
except Exception as e:
print(e)
chunksize_move()
速度比一次批量的要快不少.
python 把mongodb的数据迁移到mysql
python • 李魔佛 发表了文章 • 0 个评论 • 4842 次浏览 • 2018-08-20 11:02
import pymongo
from setting import get_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp=[]
for i in doc.find({}):
del i['_id']
temp.append(i)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid',drop=True)
df.to_sql('meituan',con=engine,if_exists='replace')
print('done')
居然CPU飙到了90%
查看全部
import pymongo
from setting import get_engine
# 将mongo数据转移到mysql
client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp=[]
for i in doc.find({}):
del i['_id']
temp.append(i)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid',drop=True)
df.to_sql('meituan',con=engine,if_exists='replace')
print('done')
居然CPU飙到了90%
mongodb sort: Executor error during find command: OperationFailed: Sort operation used more than
网络 • 李魔佛 发表了文章 • 0 个评论 • 7903 次浏览 • 2018-07-09 10:31
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command: OperationFailed: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
}
使用limit函数限制其输出就可以了:
db.getCollection('老布').find({}).sort({'created_at':-1}).limit(1000) 查看全部
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command: OperationFailed: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
}
使用limit函数限制其输出就可以了:
db.getCollection('老布').find({}).sort({'created_at':-1}).limit(1000)
mongo服务器因为mongod.lock 被锁定无法正常运行
网络安全 • 李魔佛 发表了文章 • 0 个评论 • 8494 次浏览 • 2018-05-20 18:33
Sun May 20 18:26:04.630 [initandlisten] MongoDB starting : pid=2343 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 20 18:26:04.631 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 20 18:26:04.631 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] db version v2.4.10
Sun May 20 18:26:04.632 [initandlisten] git version: nogitversion
Sun May 20 18:26:04.632 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 20 18:26:04.632 [initandlisten] allocator: system
Sun May 20 18:26:04.632 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 20 18:26:05.956 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 20 18:26:05.957 [initandlisten] recover : no journal files present, no recovery needed
Sun May 20 18:26:06.023 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 20 18:26:06.023 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x7660e294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x7660e294]
Sun May 20 18:26:06.035 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 20 18:26:06.035 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 20 18:26:06.036 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 20 18:26:06.036 dbexit:
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close listening sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to flush diaglog...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 20 18:26:06.036 [initandlisten] shutdown: lock for final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: closing all files...
Sun May 20 18:26:06.037 [initandlisten] closeAllFiles() finished
Sun May 20 18:26:06.037 [initandlisten] journalCleanup...
Sun May 20 18:26:06.037 [initandlisten] removeJournalFiles
Sun May 20 18:26:06.050 [initandlisten] shutdown: removing fs lock...
Sun May 20 18:26:06.050 dbexit: really exiting now
应该是之前没有正常被关闭,导致文件本锁住了。
解决办法:
1. 先把数据文件备份, --dbpath 的路径整个备份一下, 不然接下来的操作误操作了数据就丢失了
2. 运行修复命令:
mongod --dbpath /data/db --repair
替换上面的db为您自己的本地路径 查看全部
Sun May 20 18:26:04.630 [initandlisten] MongoDB starting : pid=2343 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 20 18:26:04.631 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 20 18:26:04.631 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 20 18:26:04.631 [initandlisten]
Sun May 20 18:26:04.631 [initandlisten] db version v2.4.10
Sun May 20 18:26:04.632 [initandlisten] git version: nogitversion
Sun May 20 18:26:04.632 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 20 18:26:04.632 [initandlisten] allocator: system
Sun May 20 18:26:04.632 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 20 18:26:05.956 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 20 18:26:05.957 [initandlisten] recover : no journal files present, no recovery needed
Sun May 20 18:26:06.023 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 20 18:26:06.023 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x7660e294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x7660e294]
Sun May 20 18:26:06.035 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 20 18:26:06.035 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 20 18:26:06.036 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 20 18:26:06.036 dbexit:
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close listening sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to flush diaglog...
Sun May 20 18:26:06.036 [initandlisten] shutdown: going to close sockets...
Sun May 20 18:26:06.036 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 20 18:26:06.036 [initandlisten] shutdown: lock for final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: final commit...
Sun May 20 18:26:06.036 [initandlisten] shutdown: closing all files...
Sun May 20 18:26:06.037 [initandlisten] closeAllFiles() finished
Sun May 20 18:26:06.037 [initandlisten] journalCleanup...
Sun May 20 18:26:06.037 [initandlisten] removeJournalFiles
Sun May 20 18:26:06.050 [initandlisten] shutdown: removing fs lock...
Sun May 20 18:26:06.050 dbexit: really exiting now
应该是之前没有正常被关闭,导致文件本锁住了。
解决办法:
1. 先把数据文件备份, --dbpath 的路径整个备份一下, 不然接下来的操作误操作了数据就丢失了
2. 运行修复命令:
mongod --dbpath /data/db --repair
替换上面的db为您自己的本地路径
mongo服务启动失败: ERROR: mmap private failed with out of memory
树莓派 • 李魔佛 发表了文章 • 0 个评论 • 4755 次浏览 • 2018-05-13 12:23
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --journal --logpath /home/pi/mongo/log.txt
突然今天发现mongo的服务连不上,看log发现mongo在启动后马上关闭了,提示的错误是在加载一个大的数据文件的时候提示内存不足(坑爹的,树莓派自身内存才1GB,无法扩容)。
错误日志:
Sun May 13 12:08:11.185 [initandlisten] MongoDB starting : pid=1929 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 13 12:08:11.186 [initandlisten]
Sun May 13 12:08:11.186 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 13 12:08:11.186 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 13 12:08:11.186 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 13 12:08:11.187 [initandlisten]
Sun May 13 12:08:11.187 [initandlisten] db version v2.4.10
Sun May 13 12:08:11.187 [initandlisten] git version: nogitversion
Sun May 13 12:08:11.187 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 13 12:08:11.187 [initandlisten] allocator: system
Sun May 13 12:08:11.187 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", fork: true, journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 13 12:08:11.198 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 13 12:08:11.198 [initandlisten] recover : no journal files present, no recovery needed
Sun May 13 12:08:11.238 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 13 12:08:11.239 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x76556294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x76556294]
Sun May 13 12:08:11.250 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 13 12:08:11.251 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 13 12:08:11.251 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 13 12:08:11.251 dbexit:
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close listening sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to flush diaglog...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 13 12:08:11.252 [initandlisten] shutdown: lock for final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: closing all files...
Sun May 13 12:08:11.252 [initandlisten] closeAllFiles() finished
Sun May 13 12:08:11.252 [initandlisten] journalCleanup...
Sun May 13 12:08:11.253 [initandlisten] removeJournalFiles
Sun May 13 12:08:11.263 [initandlisten] shutdown: removing fs lock...
Sun May 13 12:08:11.264 dbexit: really exiting now
看了下mongod的用法,尝试把参数 --journal,去掉,重新运行,然后就可以了。
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --logpath /home/pi/mongo/log.txt 查看全部
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --journal --logpath /home/pi/mongo/log.txt
突然今天发现mongo的服务连不上,看log发现mongo在启动后马上关闭了,提示的错误是在加载一个大的数据文件的时候提示内存不足(坑爹的,树莓派自身内存才1GB,无法扩容)。
错误日志:
Sun May 13 12:08:11.185 [initandlisten] MongoDB starting : pid=1929 port=27017 dbpath=/home/pi/mongo/db/ 32-bit host=raspberrypi
Sun May 13 12:08:11.186 [initandlisten]
Sun May 13 12:08:11.186 [initandlisten] ** NOTE: This is a 32 bit MongoDB binary.
Sun May 13 12:08:11.186 [initandlisten] ** 32 bit builds are limited to less than 2GB of data (or less with --journal).
Sun May 13 12:08:11.186 [initandlisten] ** See http://dochub.mongodb.org/core/32bit
Sun May 13 12:08:11.187 [initandlisten]
Sun May 13 12:08:11.187 [initandlisten] db version v2.4.10
Sun May 13 12:08:11.187 [initandlisten] git version: nogitversion
Sun May 13 12:08:11.187 [initandlisten] build info: Linux bm-wb-04 3.19.0-trunk-armmp #1 SMP Debian 3.19.1-1~exp1+plugwash1 (2015-03-28) armv7l BOOST_LIB_VERSION=1_55
Sun May 13 12:08:11.187 [initandlisten] allocator: system
Sun May 13 12:08:11.187 [initandlisten] options: { dbpath: "/home/pi/mongo/db/", fork: true, journal: true, logpath: "/home/pi/mongo/mongod.log" }
Sun May 13 12:08:11.198 [initandlisten] journal dir=/home/pi/mongo/db/journal
Sun May 13 12:08:11.198 [initandlisten] recover : no journal files present, no recovery needed
Sun May 13 12:08:11.238 [initandlisten] ERROR: mmap private failed with out of memory. You are using a 32-bit build and probably need to upgrade to 64
Sun May 13 12:08:11.239 [initandlisten] Assertion: 13636:file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
0x662fa8 0x63527c 0x6196c0 0x6197bc 0x409d5c 0x4414fc 0x2ea634 0x2eaa3c 0x2eb55c 0x2ebd98 0x26f998 0x26fb94 0x175a20 0x177bf4 0x152bf0 0x76556294
mongod(_ZN5mongo15printStackTraceERSo+0x1c) [0x662fa8]
mongod(_ZN5mongo10logContextEPKc+0x110) [0x63527c]
mongod(_ZN5mongo11msgassertedEiPKc+0xc0) [0x6196c0]
mongod(_ZN5mongo18msgassertedNoTraceEiPKc+0) [0x6197bc]
mongod(_ZN5mongo8MongoMMF13finishOpeningEv+0x308) [0x409d5c]
mongod(_ZN5mongo13MongoDataFile12openExistingEPKc+0x9c) [0x4414fc]
mongod(_ZN5mongo8Database16openExistingFileEi+0x23c) [0x2ea634]
mongod(_ZN5mongo8Database12openAllFilesEv+0x24) [0x2eaa3c]
mongod(_ZN5mongo8DatabaseC2EPKcRbRKSs+0x158) [0x2eb55c]
mongod(_ZN5mongo14DatabaseHolder11getOrCreateERKSsS2_Rb+0x500) [0x2ebd98]
mongod(_ZN5mongo6Client7Context11_finishInitEv+0x34) [0x26f998]
mongod(_ZN5mongo6Client7ContextC1ERKSsS3_b+0x78) [0x26fb94]
mongod(_ZN5mongo14_initAndListenEi+0xb00) [0x175a20]
mongod(_ZN5mongo13initAndListenEi+0x14) [0x177bf4]
mongod(main+0x2b8) [0x152bf0]
/lib/arm-linux-gnueabihf/libc.so.6(__libc_start_main+0x114) [0x76556294]
Sun May 13 12:08:11.250 [initandlisten] warning database /home/pi/mongo/db/ xueqiu could not be opened
Sun May 13 12:08:11.251 [initandlisten] DBException 13636: file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information)
Sun May 13 12:08:11.251 [initandlisten] exception in initAndListen: 13636 file /home/pi/mongo/db/xueqiu.5 open/create failed in createPrivateMap (look in log for more information), terminating
Sun May 13 12:08:11.251 dbexit:
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close listening sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to flush diaglog...
Sun May 13 12:08:11.252 [initandlisten] shutdown: going to close sockets...
Sun May 13 12:08:11.252 [initandlisten] shutdown: waiting for fs preallocator...
Sun May 13 12:08:11.252 [initandlisten] shutdown: lock for final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: final commit...
Sun May 13 12:08:11.252 [initandlisten] shutdown: closing all files...
Sun May 13 12:08:11.252 [initandlisten] closeAllFiles() finished
Sun May 13 12:08:11.252 [initandlisten] journalCleanup...
Sun May 13 12:08:11.253 [initandlisten] removeJournalFiles
Sun May 13 12:08:11.263 [initandlisten] shutdown: removing fs lock...
Sun May 13 12:08:11.264 dbexit: really exiting now
看了下mongod的用法,尝试把参数 --journal,去掉,重新运行,然后就可以了。
sudo mongod --fork --dbpath /home/pi/mongo/db/ --smallfiles --logpath /home/pi/mongo/log.txt
树莓派安装mongodb服务器
网络 • 李魔佛 发表了文章 • 0 个评论 • 7986 次浏览 • 2017-12-18 16:57
不过安装的是32bit 的mongodb,数据库的大小会被限制在2GB。
树莓派启动mongodb
修改/etc/mongodb.config,
把里面的bind=127.0.0.1 注释掉,前面加一个#即可,因为这样其他主机也可以访问这台树莓派的mongodb服务器。
修改dbpath和dblog的路径,因为默认的路径你需要root权限
然后运行 mongod --config /etc/mongodb.config , 然后远程使用mongo ip地址就可以远程连接了。 查看全部
不过安装的是32bit 的mongodb,数据库的大小会被限制在2GB。
树莓派启动mongodb
修改/etc/mongodb.config,
把里面的bind=127.0.0.1 注释掉,前面加一个#即可,因为这样其他主机也可以访问这台树莓派的mongodb服务器。
修改dbpath和dblog的路径,因为默认的路径你需要root权限
然后运行 mongod --config /etc/mongodb.config , 然后远程使用mongo ip地址就可以远程连接了。
使用官网下载的mongodb,如何设置远程连接mongodb服务器
网络安全 • 李魔佛 发表了文章 • 0 个评论 • 5608 次浏览 • 2017-12-17 23:10
解压后在mongo的bin目录下运行 mongod --dbpath ~/mongo/db
可以看到mongodb服务被正常启动了。
在局域网其他电脑上使用mongodb客户端尝试连接这个mongo服务器,发现无法连接上。
因为官网下载的mongo问价解压后并没有mongo.conf配置文件。
在本机运行命令: mongo
可以看到输出:
Server has startup warnings:
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten]
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core ... ystem
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: This server is bound to localhost.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Remote systems will be unable to connect to this server.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Start the server with --bind_ip <address> to specify which IP
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** addresses it should serve responses from, or with --bind_ip_all to
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** bind to all interfaces. If this behavior is desired, start the
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** server with --bind_ip 127.0.0.1 to disable this warning.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/defrag is 'always'.
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** We suggest setting it to 'never'
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
从上面的信息可以看到,如果需要远程的机子连接到本机,需要添加一个选项: --bind_ip_all
运行下面命令后:
mongod --dbpath ~/mongo/db --bind_ip_all
远程的机子就能够连上mongo服务器了。
原文地址:http://30daydo.com/article/247
转载请注明出处 查看全部
解压后在mongo的bin目录下运行 mongod --dbpath ~/mongo/db
可以看到mongodb服务被正常启动了。
在局域网其他电脑上使用mongodb客户端尝试连接这个mongo服务器,发现无法连接上。
因为官网下载的mongo问价解压后并没有mongo.conf配置文件。
在本机运行命令: mongo
可以看到输出:
Server has startup warnings:
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten]
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine
2017-12-17T22:56:19.702+0800 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core ... ystem
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** WARNING: This server is bound to localhost.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Remote systems will be unable to connect to this server.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** Start the server with --bind_ip <address> to specify which IP
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** addresses it should serve responses from, or with --bind_ip_all to
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** bind to all interfaces. If this behavior is desired, start the
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten] ** server with --bind_ip 127.0.0.1 to disable this warning.
2017-12-17T22:56:20.600+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/defrag is 'always'.
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten] ** We suggest setting it to 'never'
2017-12-17T22:56:20.601+0800 I CONTROL [initandlisten]
从上面的信息可以看到,如果需要远程的机子连接到本机,需要添加一个选项: --bind_ip_all
运行下面命令后:
mongod --dbpath ~/mongo/db --bind_ip_all
远程的机子就能够连上mongo服务器了。
原文地址:http://30daydo.com/article/247
转载请注明出处
mongodb中$sum:1 后面的1是什么意思
python • 李魔佛 发表了文章 • 0 个评论 • 10685 次浏览 • 2017-09-05 23:32
{
"_id" : "GuqXmAkkARqhBDqhy",
"beatmapset_id" : "342537",
"version" : "MX",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "3.5552737712860107"
}
{
"_id" : "oHLT7KqsB7bztBGvu",
"beatmapset_id" : "342537",
"version" : "HD",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "2.7515676021575928"
}
{
"_id" : "GbotZfrPEwW69FkGD",
"beatmapset_id" : "342537",
"version" : "NM",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "0"
}
然后运行以下的命令:
db.getCollection('dup_case').aggregate(
[
{$group:{
_id:{diff_approach:'$diff_approach'},
count:{$sum:2}
}},
{$match:{count:{$gt:1}}}
]
)
返回的count是6
所以
$sum:1 的含义:
如果前面的情况出现一次,就加1, 如果后面$sum:2 那么每次前面条件满足一次就加2
查看全部
{
"_id" : "GuqXmAkkARqhBDqhy",
"beatmapset_id" : "342537",
"version" : "MX",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "3.5552737712860107"
}
{
"_id" : "oHLT7KqsB7bztBGvu",
"beatmapset_id" : "342537",
"version" : "HD",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "2.7515676021575928"
}
{
"_id" : "GbotZfrPEwW69FkGD",
"beatmapset_id" : "342537",
"version" : "NM",
"diff_approach" : "5",
"artist" : "Yousei Teikoku",
"title" : "Kokou no Sousei",
"difficultyrating" : "0"
}
然后运行以下的命令:
db.getCollection('dup_case').aggregate(
[
{$group:{
_id:{diff_approach:'$diff_approach'},
count:{$sum:2}
}},
{$match:{count:{$gt:1}}}
]
)
返回的count是6
所以
$sum:1 的含义:
如果前面的情况出现一次,就加1, 如果后面$sum:2 那么每次前面条件满足一次就加2