由多个字段构成的 mongodb 组值

例如,我有这些文件:

{
"addr": "address1",
"book": "book1"
},
{
"addr": "address2",
"book": "book1"
},
{
"addr": "address1",
"book": "book5"
},
{
"addr": "address3",
"book": "book9"
},
{
"addr": "address2",
"book": "book5"
},
{
"addr": "address2",
"book": "book1"
},
{
"addr": "address1",
"book": "book1"
},
{
"addr": "address15",
"book": "book1"
},
{
"addr": "address9",
"book": "book99"
},
{
"addr": "address90",
"book": "book33"
},
{
"addr": "address4",
"book": "book3"
},
{
"addr": "address5",
"book": "book1"
},
{
"addr": "address77",
"book": "book11"
},
{
"addr": "address1",
"book": "book1"
}

诸如此类。如何提出请求,描述每个地址的前 N 个地址和前 M 本书?预期结果的例子: < br > < br > address1 | Book _ 1:5 < br > | Book _ 2:10 < br > | Book _ 3:50 < br > | 总计: 65 < br > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ < br > address2 | Book _ 1:10 < br > | Book _ 2:10 < br > | ...翻译: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳校对: 奇芳Book _ M: 10 < br > | 总计: M * 10 < br > 。。 < br > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ < br > addressN| book _ 1:20 < br >| book _ 2:20 < br >|..| book _ M: 20 < br > 总数: M * 20

305800 次浏览

使用如下聚合函数:

[
{$group: {_id : {book : '$book',address:'$addr'}, total:{$sum :1}}},
{$project : {book : '$_id.book', address : '$_id.address', total : '$total', _id : 0}}
]

它会给你的结果如下:

        {
"total" : 1,
"book" : "book33",
"address" : "address90"
},
{
"total" : 1,
"book" : "book5",
"address" : "address1"
},
{
"total" : 1,
"book" : "book99",
"address" : "address9"
},
{
"total" : 1,
"book" : "book1",
"address" : "address5"
},
{
"total" : 1,
"book" : "book5",
"address" : "address2"
},
{
"total" : 1,
"book" : "book3",
"address" : "address4"
},
{
"total" : 1,
"book" : "book11",
"address" : "address77"
},
{
"total" : 1,
"book" : "book9",
"address" : "address3"
},
{
"total" : 1,
"book" : "book1",
"address" : "address15"
},
{
"total" : 2,
"book" : "book1",
"address" : "address2"
},
{
"total" : 3,
"book" : "book1",
"address" : "address1"
}

我没有完全得到您期望的结果格式,所以请随意修改这个到您需要的一个。

TLDR 摘要

在现代 MongoDB 版本中,您可以使用基本聚合结果之外的 $slice强制执行此操作。对于“大”结果,改为对每个分组运行并行查询(答案末尾有一个演示清单) ,或者等待 服务器9377解析,这将允许对数组中的 $push项的数量进行“限制”。

db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$project": {
"books": { "$slice": [ "$books", 2 ] },
"count": 1
}}
])

MongoDB 3.6预览

仍然没有解析 服务器9377,但是在这个版本中,$lookup允许一个新的“非相关”选项,该选项采用 "pipeline"表达式作为参数,而不是 "localFields""foreignFields"选项。然后,这允许与另一个管道表达式进行“自连接”,在这个表达式中,我们可以应用 $limit以返回“ top-n”结果。

db.books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$lookup": {
"from": "books",
"let": {
"addr": "$_id"
},
"pipeline": [
{ "$match": {
"$expr": { "$eq": [ "$addr", "$$addr"] }
}},
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1  } },
{ "$limit": 2 }
],
"as": "books"
}}
])

这里的另一个附加功能当然是使用 $match通过 $expr插入变量来选择“ join”中的匹配项,但是一般前提是“管道中的管道”,其中内部内容可以通过来自父节点的匹配进行过滤。因为它们本身都是“管道”,所以我们可以分别得到 $limit的每个结果。

这将是运行并行查询的下一个最佳选择,如果 $match被允许并且能够在“子管道”处理中使用索引,那么实际上会更好。因此,它没有像引用的问题所要求的那样使用“对 $push的限制”,它实际上提供了一些应该更好地工作的东西。


原创内容

你似乎偶然发现了最重要的“ N”问题。在某种程度上,你的问题是相当容易解决的,尽管不是你所要求的那种严格的限制:

db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
])

现在会得到这样的结果:

{
"result" : [
{
"_id" : "address1",
"books" : [
{
"book" : "book4",
"count" : 1
},
{
"book" : "book5",
"count" : 1
},
{
"book" : "book1",
"count" : 3
}
],
"count" : 5
},
{
"_id" : "address2",
"books" : [
{
"book" : "book5",
"count" : 1
},
{
"book" : "book1",
"count" : 2
}
],
"count" : 3
}
],
"ok" : 1
}

因此,这与您要求的不同,虽然我们确实得到了地址值的最高结果,但是底层的“书籍”选择不仅限于所需的结果数量。

事实证明这是非常困难的,但是这是可以做到的,尽管复杂性只是随着需要匹配的项目数量的增加而增加。为了简单起见,我们最多可以保持两个匹配:

db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$unwind": "$books" },
{ "$sort": { "count": 1, "books.count": -1 } },
{ "$group": {
"_id": "$_id",
"books": { "$push": "$books" },
"count": { "$first": "$count" }
}},
{ "$project": {
"_id": {
"_id": "$_id",
"books": "$books",
"count": "$count"
},
"newBooks": "$books"
}},
{ "$unwind": "$newBooks" },
{ "$group": {
"_id": "$_id",
"num1": { "$first": "$newBooks" }
}},
{ "$project": {
"_id": "$_id",
"newBooks": "$_id.books",
"num1": 1
}},
{ "$unwind": "$newBooks" },
{ "$project": {
"_id": "$_id",
"num1": 1,
"newBooks": 1,
"seen": { "$eq": [
"$num1",
"$newBooks"
]}
}},
{ "$match": { "seen": false } },
{ "$group":{
"_id": "$_id._id",
"num1": { "$first": "$num1" },
"num2": { "$first": "$newBooks" },
"count": { "$first": "$_id.count" }
}},
{ "$project": {
"num1": 1,
"num2": 1,
"count": 1,
"type": { "$cond": [ 1, [true,false],0 ] }
}},
{ "$unwind": "$type" },
{ "$project": {
"books": { "$cond": [
"$type",
"$num1",
"$num2"
]},
"count": 1
}},
{ "$group": {
"_id": "$_id",
"count": { "$first": "$count" },
"books": { "$push": "$books" }
}},
{ "$sort": { "count": -1 } }
])

所以这实际上会给你前两个“地址”条目中的前两个“书”。

但在我看来,还是保留第一种形式,然后简单地对返回的数组元素进行“切片”,以获取第一个“ N”元素。


示范守则

演示代码适用于 v8.x 和 v10.x 版本的 NodeJS 的当前 LTS 版本。这主要是针对 async/await语法的,但是在一般流程中没有任何这样的限制,并且只需要对普通承诺进行很少的修改就可以适应,甚至可以返回到普通回调实现。

Index.js

const { MongoClient } = require('mongodb');
const fs = require('mz/fs');


const uri = 'mongodb://localhost:27017';


const log = data => console.log(JSON.stringify(data, undefined, 2));


(async function() {


try {
const client = await MongoClient.connect(uri);


const db = client.db('bookDemo');
const books = db.collection('books');


let { version } = await db.command({ buildInfo: 1 });
version = parseFloat(version.match(new RegExp(/(?:(?!-).)*/))[0]);


// Clear and load books
await books.deleteMany({});


await books.insertMany(
(await fs.readFile('books.json'))
.toString()
.replace(/\n$/,"")
.split("\n")
.map(JSON.parse)
);


if ( version >= 3.6 ) {


// Non-correlated pipeline with limits
let result = await books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$lookup": {
"from": "books",
"as": "books",
"let": { "addr": "$_id" },
"pipeline": [
{ "$match": {
"$expr": { "$eq": [ "$addr", "$$addr" ] }
}},
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 },
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]
}}
]).toArray();


log({ result });
}


// Serial result procesing with parallel fetch


// First get top addr items
let topaddr = await books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]).toArray();


// Run parallel top books for each addr
let topbooks = await Promise.all(
topaddr.map(({ _id: addr }) =>
books.aggregate([
{ "$match": { addr } },
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]).toArray()
)
);


// Merge output
topaddr = topaddr.map((d,i) => ({ ...d, books: topbooks[i] }));
log({ topaddr });


client.close();


} catch(e) {
console.error(e)
} finally {
process.exit()
}


})()

Books Json

{ "addr": "address1",  "book": "book1"  }
{ "addr": "address2",  "book": "book1"  }
{ "addr": "address1",  "book": "book5"  }
{ "addr": "address3",  "book": "book9"  }
{ "addr": "address2",  "book": "book5"  }
{ "addr": "address2",  "book": "book1"  }
{ "addr": "address1",  "book": "book1"  }
{ "addr": "address15", "book": "book1"  }
{ "addr": "address9",  "book": "book99" }
{ "addr": "address90", "book": "book33" }
{ "addr": "address4",  "book": "book3"  }
{ "addr": "address5",  "book": "book1"  }
{ "addr": "address77", "book": "book11" }
{ "addr": "address1",  "book": "book1"  }

下面的查询将提供与所需响应中给出的完全相同的结果:

db.books.aggregate([
{
$group: {
_id: { addresses: "$addr", books: "$book" },
num: { $sum :1 }
}
},
{
$group: {
_id: "$_id.addresses",
bookCounts: { $push: { bookName: "$_id.books",count: "$num" } }
}
},
{
$project: {
_id: 1,
bookCounts:1,
"totalBookAtAddress": {
"$sum": "$bookCounts.count"
}
}
}


])

答复如下:

/* 1 */
{
"_id" : "address4",
"bookCounts" : [
{
"bookName" : "book3",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 2 */
{
"_id" : "address90",
"bookCounts" : [
{
"bookName" : "book33",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 3 */
{
"_id" : "address15",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 4 */
{
"_id" : "address3",
"bookCounts" : [
{
"bookName" : "book9",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 5 */
{
"_id" : "address5",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 6 */
{
"_id" : "address1",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 3
},
{
"bookName" : "book5",
"count" : 1
}
],
"totalBookAtAddress" : 4
},


/* 7 */
{
"_id" : "address2",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 2
},
{
"bookName" : "book5",
"count" : 1
}
],
"totalBookAtAddress" : 3
},


/* 8 */
{
"_id" : "address77",
"bookCounts" : [
{
"bookName" : "book11",
"count" : 1
}
],
"totalBookAtAddress" : 1
},


/* 9 */
{
"_id" : "address9",
"bookCounts" : [
{
"bookName" : "book99",
"count" : 1
}
],
"totalBookAtAddress" : 1
}

由于 mongoDB 版本3.6,这是很容易做到的,使用 $group$slice$limit$sort:

  1. 把书数一数
  2. $sort,所以他们将稍后推根据计数
  3. $groupaddress$push相关书籍,和 $sum每个地址的总数。
  4. 按地址总数分列的 $sort
  5. 地址结果为 topN
  6. 使用 $slice将数组中的图书限制为 topM
db.collection.aggregate([
{$group: {_id: {book: "$book",  addr: "$addr"}, count: {$sum: 1}}},
{$sort: {"_id.addr": 1, count: -1}},
{$group: {
_id: "$_id.addr", totalCount: {$sum: "$count"},
books: {$push: {book: "$_id.book", count: "$count"}}
}
},
{$sort: {totalCount: -1}},
{$limit: topN},
{$set: {addr: "$_id", _id: "$$REMOVE", books: {$slice: ["$books", 0, topM]}}}
])

看看它在 游乐场示例 -v3.4上是如何工作的

在 mongoDB 5.2版本中,有一个 topN累加器可以更加简化:

db.collection.aggregate([
{$group: {_id: {book: "$book",  addr: "$addr"}, count: {$sum: 1}}},
{$group: {
_id: "$_id.addr",
totalCount: {$sum: "$count"},
books: {$topN: {output: {book: "$_id.book", count: "$count"},
sortBy: {count: -1},
n: topM
}}
}},
{$sort: {totalCount: -1}},
{$limit: topN},
{$project: {addr: "$_id", _id: 0, books: 1, totalCount: 1}}
])

看看它在 游乐场示例 -v5.2上是如何工作的