How to import data from mongodb to pandas?

I have a large amount of data in a collection in MongoDB which I need to analyze. How do I import that data into pandas?

I am new to pandas and numpy.

EDIT: The mongodb collection contains sensor values tagged with date and time. The sensor values are of float datatype.

Sample Data:

{
    "_cls" : "SensorReport",
    "_id" : ObjectId("515a963b78f6a035d9fa531b"),
    "_types" : [
        "SensorReport"
    ],
    "Readings" : [
        {
            "a" : 0.958069536790466,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:26:35.297Z"),
            "b" : 6.296118156595,
            "_cls" : "Reading"
        },
        {
            "a" : 0.95574014778624,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:09.963Z"),
            "b" : 6.29651468650064,
            "_cls" : "Reading"
        },
        {
            "a" : 0.953648289182713,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:37.545Z"),
            "b" : 7.29679823731148,
            "_cls" : "Reading"
        },
        {
            "a" : 0.955931884300997,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:28:21.369Z"),
            "b" : 6.29642922525632,
            "_cls" : "Reading"
        },
        {
            "a" : 0.95821381,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:20.801Z"),
            "b" : 7.28956613,
            "_cls" : "Reading"
        },
        {
            "a" : 4.95821335,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:36.931Z"),
            "b" : 6.28956574,
            "_cls" : "Reading"
        },
        {
            "a" : 9.95821341,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:42:09.971Z"),
            "b" : 0.28956488,
            "_cls" : "Reading"
        },
        {
            "a" : 1.95667927,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:43:55.463Z"),
            "b" : 0.29115237,
            "_cls" : "Reading"
        }
    ],
    "latestReportTime" : ISODate("2013-04-02T08:43:55.463Z"),
    "sensorName" : "56847890-0",
    "reportCount" : 8
}

http://docs.mongodb.org/manual/reference/mongoexport

Export to CSV and use read_csv, or export to JSON and use DataFrame.from_records().
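
For example, a minimal sketch of that route (all database, collection, field, and file names here are hypothetical):

# In a shell first (hypothetical names):
#   mongoexport --db sensordb --collection SensorReport --type=csv \
#       --fields sensorName,reportCount --out reports.csv
import pandas as pd

df = pd.read_csv('reports.csv')

# Or export with --type=json --jsonArray and load the records:
import json
with open('reports.json') as f:
    df = pd.DataFrame.from_records(json.load(f))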

pymongo might give you a hand; the following is some code I'm using:

import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df
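
For example, a usage sketch against the sample data above (the database and collection names are assumptions):

df = read_mongo('sensordb', 'SensorReport', query={'reportCount': {'$gte': 8}})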

Monary does exactly that, and it is super fast (another link).

See this cool post, which includes a quick tutorial and some timings.
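
A rough sketch of the Monary query API from those posts (host, database, collection, and field names are all assumptions; Monary wants flat fields with explicit types):

from monary import Monary
import pandas as pd

monary = Monary('127.0.0.1')  # host assumed
columns = ['a', 'b']          # hypothetical flat float fields
# Pull the columns directly into NumPy arrays, then wrap in a DataFrame
arrays = monary.query('sensordb', 'readings', {}, columns, ['float64'] * 2)
df = pd.DataFrame(dict(zip(columns, arrays)))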

You can use this code to load your MongoDB data into a pandas DataFrame. It works for me. Hopefully it works for you too.

import pandas as pd
from pymongo import MongoClient

client = MongoClient()
db = client.database_name
collection = db.collection_name
data = pd.DataFrame(list(collection.find()))
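
If you don't want the _id ObjectId column in the frame, you can exclude it with a projection at query time:

data = pd.DataFrame(list(collection.find({}, {'_id': 0})))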

Using

pandas.DataFrame(list(...))

will consume a lot of memory if the result of the iterator/generator is large.

Better to generate small chunks and concatenate at the end:

import pandas as pd

def iterator2dataframes(iterator, chunk_size: int):
    """Turn an iterator into multiple small pandas.DataFrame

    This is a balance between memory and efficiency
    """
    records = []
    frames = []
    for i, record in enumerate(iterator):
        records.append(record)
        if i % chunk_size == chunk_size - 1:
            frames.append(pd.DataFrame(records))
            records = []
    if records:
        frames.append(pd.DataFrame(records))
    return pd.concat(frames)
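
For example, streaming a large collection in chunks of 10000 documents (collection as in the earlier snippets):

df = iterator2dataframes(collection.find(), chunk_size=10000)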

For efficiently dealing with out-of-core (not fitting into RAM) data (i.e. with parallel execution), you can try the Python Blaze ecosystem: Blaze / Dask / Odo.

Blaze (and Odo) have out-of-the-box functions to deal with MongoDB.

A few useful articles to start off:

And an article which shows what amazing things are possible with the Blaze stack: Analyzing 1.7 Billion Reddit Comments with Blaze and Impala (essentially, querying 975 GB of Reddit comments in seconds).

Also, I'm not affiliated with these technologies in any way.

import pandas as pd
from odo import odo

data = odo('mongodb://localhost/db::collection', pd.DataFrame)

As per PEP 20, simple is better than complex:

import pandas as pd
df = pd.DataFrame.from_records(db.<database_name>.<collection_name>.find())

You can include conditions as you would when working with a regular MongoDB database, and even use find_one() to get only one element from the database, etc.

Voila!
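
For instance, a sketch with a condition and find_one() (the client variable, database, and collection names are assumptions):

import pandas as pd
from pymongo import MongoClient

db = MongoClient()  # here 'db' is the client, matching the snippet above
df = pd.DataFrame.from_records(db.sensordb.SensorReport.find({'reportCount': {'$gt': 5}}))
doc = db.sensordb.SensorReport.find_one({'sensorName': '56847890-0'})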

After this great answer by waitingkuo, I would like to add the possibility of doing that using chunksize, in line with .read_sql() and .read_csv(). I enlarge the answer from Deu Leung by avoiding going one by one over each "record" of the "iterator" / "cursor". I will borrow the previous read_mongo function.

def read_mongo(db,
               collection, query={},
               host='localhost', port=27017,
               username=None, password=None,
               chunksize=100, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    #db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    client = MongoClient(host=host, port=port)
    # Make a query to the specific DB and Collection
    db_aux = client[db]

    # Some variables to create the chunks
    skips_variable = range(0, db_aux[collection].find(query).count(), int(chunksize))
    if len(skips_variable) <= 1:
        skips_variable = [0, len(skips_variable)]

    # Iteration to create the dataframe in chunks.
    for i in range(1, len(skips_variable)):

        # Expand the cursor and construct the DataFrame
        df_aux = pd.DataFrame(list(db_aux[collection].find(query)[skips_variable[i-1]:skips_variable[i]]))

        if no_id:
            del df_aux['_id']

        # Concatenate the chunks into a unique df
        if 'df' not in locals():
            df = df_aux
        else:
            df = pd.concat([df, df_aux], ignore_index=True)

    return df
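
A usage sketch (database and collection names assumed). Note that Cursor.count(), used above, was removed in PyMongo 4.0, so on newer drivers you would swap in collection.count_documents(query) for the count:

df = read_mongo('sensordb', 'SensorReport', chunksize=1000)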

A similar approach like Rafael Valero, waitingkuo and Deu Leung, using pagination:

def read_mongo(
        db,
        collection, query=None,
        host='localhost', port=27017, username=None, password=None,
        chunksize=100, page_num=1, no_id=True):

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Calculate number of documents to skip
    skips = chunksize * (page_num - 1)

    # Default query here rather than in the signature, to avoid the
    # mutable-default-argument gotcha (sorry, this link is in Spanish)
    # https://www.toptal.com/python/c%C3%B3digo-buggy-python-los-10-errores-m%C3%A1s-comunes-que-cometen-los-desarrolladores-python/es
    if not query:
        query = {}

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query).skip(skips).limit(chunksize)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df
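
For example, fetching the second page of 1000 documents (database and collection names assumed):

page_2 = read_mongo('sensordb', 'SensorReport', chunksize=1000, page_num=2)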

Another option that I found really useful is the following:

from pandas.io.json import json_normalize


cursor = my_collection.find()
df = json_normalize(cursor)

(or json_normalize(list(cursor)), depending on your Python/pandas versions).

This way you get the unfolding of nested MongoDB documents for free.
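
For the sample documents in the question, a sketch that flattens the nested Readings array into one row per reading (my_collection as above; on recent pandas, json_normalize is available as pd.json_normalize):

import pandas as pd

cursor = my_collection.find()
df = pd.json_normalize(list(cursor), record_path='Readings',
                       meta=['sensorName', 'reportCount'])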

You can achieve what you want with pdmongo in three lines:

import pdmongo as pdm
import pandas as pd
df = pdm.read_mongo("MyCollection", [], "mongodb://localhost:27017/mydb")

If your data is very large, you can do an aggregate query first, filtering out the data you do not want, then mapping it to your desired columns.

Here is an example of mapping Readings.a to column a and filtering by the reportCount column:

import pdmongo as pdm
import pandas as pd
df = pdm.read_mongo("MyCollection", [{'$match': {'reportCount': {'$gt': 6}}}, {'$unwind': '$Readings'}, {'$project': {'a': '$Readings.a'}}], "mongodb://localhost:27017/mydb")

read_mongo accepts the same arguments as pymongo's aggregate.

You can also use PyMongoArrow, an official library offered by MongoDB for exporting MongoDB data to pandas, NumPy, Parquet files, etc.
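
A minimal PyMongoArrow sketch, assuming a local server and hypothetical database/collection names:

from pymongo import MongoClient
from pymongoarrow.monkey import patch_all

patch_all()  # patches pymongo Collections with find_pandas_all() and friends

client = MongoClient('mongodb://localhost:27017')  # connection string assumed
collection = client.sensordb.SensorReport           # names assumed
df = collection.find_pandas_all({'reportCount': {'$gt': 5}})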

You can use the pandas.json_normalize method:

import pandas as pd
display(pd.json_normalize( x ))
display(pd.json_normalize( x , record_path="Readings" ))

It should display two tables, where x is your cursor, or, for example:

from bson import ObjectId

def ISODate(st):
    return st


x = {
    "_cls" : "SensorReport",
    "_id" : ObjectId("515a963b78f6a035d9fa531b"),
    "_types" : [
        "SensorReport"
    ],
    "Readings" : [
        {
            "a" : 0.958069536790466,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:26:35.297Z"),
            "b" : 6.296118156595,
            "_cls" : "Reading"
        },
        {
            "a" : 0.95574014778624,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:09.963Z"),
            "b" : 6.29651468650064,
            "_cls" : "Reading"
        },
        {
            "a" : 0.953648289182713,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:27:37.545Z"),
            "b" : 7.29679823731148,
            "_cls" : "Reading"
        },
        {
            "a" : 0.955931884300997,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:28:21.369Z"),
            "b" : 6.29642922525632,
            "_cls" : "Reading"
        },
        {
            "a" : 0.95821381,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:20.801Z"),
            "b" : 7.28956613,
            "_cls" : "Reading"
        },
        {
            "a" : 4.95821335,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:41:36.931Z"),
            "b" : 6.28956574,
            "_cls" : "Reading"
        },
        {
            "a" : 9.95821341,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:42:09.971Z"),
            "b" : 0.28956488,
            "_cls" : "Reading"
        },
        {
            "a" : 1.95667927,
            "_types" : [
                "Reading"
            ],
            "ReadingUpdatedDate" : ISODate("2013-04-02T08:43:55.463Z"),
            "b" : 0.29115237,
            "_cls" : "Reading"
        }
    ],
    "latestReportTime" : ISODate("2013-04-02T08:43:55.463Z"),
    "sensorName" : "56847890-0",
    "reportCount" : 8
}

  1. Start the mongo shell with: mongosh

  2. Scroll up in the shell until you see where mongo is connected to. It should look something like this: mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+1.5.4

  3. Copy and paste that into MongoClient

  4. Code as follows:

from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+1.5.4')

mydatabase = client.yourdatabasename
mycollection = mydatabase.yourcollectionname
cursor = mycollection.find()
listofDocuments = list(cursor)
df = pd.DataFrame(listofDocuments)
df