Find all occurrences of a key in nested dictionaries and lists

我有一本这样的字典:

{
"id": "abcde",
"key1": "blah",
"key2": "blah blah",
"nestedlist": [
{
"id": "qwerty",
"nestednestedlist": [
{
"id": "xyz",
"keyA": "blah blah blah"
},
{
"id": "fghi",
"keyZ": "blah blah blah"
}
],
"anothernestednestedlist": [
{
"id": "asdf",
"keyQ": "blah blah"
},
{
"id": "yuiop",
"keyW": "blah"
}
]
}
]
}

基本上是一个具有嵌套列表、字典和字符串的字典,具有任意深度。

遍历这个过程以提取每个“ id”键的值的最佳方法是什么?我想实现类似于“//id”的 XPath 查询。“ id”的值总是一个字符串。

So from my example, the output I need is basically:

["abcde", "qwerty", "xyz", "fghi", "asdf", "yuiop"]

秩序不重要。

128517 次浏览
d = { "id" : "abcde",
"key1" : "blah",
"key2" : "blah blah",
"nestedlist" : [
{ "id" : "qwerty",
"nestednestedlist" : [
{ "id" : "xyz", "keyA" : "blah blah blah" },
{ "id" : "fghi", "keyZ" : "blah blah blah" }],
"anothernestednestedlist" : [
{ "id" : "asdf", "keyQ" : "blah blah" },
{ "id" : "yuiop", "keyW" : "blah" }] } ] }




def fun(d):
if 'id' in d:
yield d['id']
for k in d:
if isinstance(d[k], list):
for i in d[k]:
for j in fun(i):
yield j

>>> list(fun(d))
['abcde', 'qwerty', 'xyz', 'fghi', 'asdf', 'yuiop']
def find(key, value):
for k, v in value.items():
if k == key:
yield v
elif isinstance(v, dict):
for result in find(key, v):
yield result
elif isinstance(v, list):
for d in v:
for result in find(key, d):
yield result

编辑:@Anthony 注意到这不适用于直接嵌套的列表。如果您的输入中包含这个,那么您可以使用:

def find(key, value):
for k, v in (value.items() if isinstance(value, dict) else
enumerate(value) if isinstance(value, list) else []):
if k == key:
yield v
elif isinstance(v, (dict, list)):
for result in find(key, v):
yield result

但我觉得原版更容易理解,所以我就不写了。

d = { "id" : "abcde",
"key1" : "blah",
"key2" : "blah blah",
"nestedlist" : [
{ "id" : "qwerty",
"nestednestedlist" : [
{ "id" : "xyz", "keyA" : "blah blah blah" },
{ "id" : "fghi", "keyZ" : "blah blah blah" }],
"anothernestednestedlist" : [
{ "id" : "asdf", "keyQ" : "blah blah" },
{ "id" : "yuiop", "keyW" : "blah" }] } ] }




def findkeys(node, kv):
if isinstance(node, list):
for i in node:
for x in findkeys(i, kv):
yield x
elif isinstance(node, dict):
if kv in node:
yield node[kv]
for j in node.values():
for x in findkeys(j, kv):
yield x


print(list(findkeys(d, 'id')))

此函数递归地搜索包含嵌套字典和列表的字典。它生成一个名为 fields _ found 的列表,其中包含每次找到该字段时的值。“ field”是我在字典及其嵌套列表和字典中寻找的关键字。

def get_recursively(search_dict, field):
"""Takes a dict with nested lists and dicts,
and searches all dicts for a key of the field
provided.
"""
fields_found = []


for key, value in search_dict.iteritems():


if key == field:
fields_found.append(value)


elif isinstance(value, dict):
results = get_recursively(value, field)
for result in results:
fields_found.append(result)


elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
more_results = get_recursively(item, field)
for another_result in more_results:
fields_found.append(another_result)


return fields_found

另一个变体,包括找到结果的嵌套路径(note: this version doesn't consider lists) :

def find_all_items(obj, key, keys=None):
"""
Example of use:
d = {'a': 1, 'b': 2, 'c': {'a': 3, 'd': 4, 'e': {'a': 9, 'b': 3}, 'j': {'c': 4}}}
for k, v in find_all_items(d, 'a'):
print "* {} = {} *".format('->'.join(k), v)
"""
ret = []
if not keys:
keys = []
if key in obj:
out_keys = keys + [key]
ret.append((out_keys, obj[key]))
for k, v in obj.items():
if isinstance(v, dict):
found_items = find_all_items(v, key, keys=(keys+[k]))
ret += found_items
return ret

我是这么想的:

def keyHole(k2b,o):
# print "Checking for %s in "%k2b,o
if isinstance(o, dict):
for k, v in o.iteritems():
if k == k2b and not hasattr(v, '__iter__'): yield v
else:
for r in  keyHole(k2b,v): yield r
elif hasattr(o, '__iter__'):
for r in [ keyHole(k2b,i) for i in o ]:
for r2 in r: yield r2
return

例如:

>>> findMe = {'Me':{'a':2,'Me':'bop'},'z':{'Me':4}}
>>> keyHole('Me',findMe)
<generator object keyHole at 0x105eccb90>
>>> [ x for x in keyHole('Me',findMe) ]
['bop', 4]

我发现这个问答非常有趣,因为它为同一个问题提供了几种不同的解决方案。我用一个复杂的 dictionary 对象测试了所有这些函数。我不得不从测试中去掉两个函数,因为它们有很多失败的结果,而且它们不支持返回列表或者字典作为值,我认为这是必要的,因为一个函数应该为将要到来的几乎 任何数据做好准备。

因此,我通过 timeit模块在100.000次迭代中输入其他函数,输出结果如下:

0.11 usec/pass on gen_dict_extract(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
6.03 usec/pass on find_all_items(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.15 usec/pass on findkeys(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1.79 usec/pass on get_recursively(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.14 usec/pass on find(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
0.36 usec/pass on dict_extract(k,o)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

所有函数都有相同的指针来搜索(“日志记录”)和相同的字典对象,其构造如下:

o = { 'temparature': '50',
'logging': {
'handlers': {
'console': {
'formatter': 'simple',
'class': 'logging.StreamHandler',
'stream': 'ext://sys.stdout',
'level': 'DEBUG'
}
},
'loggers': {
'simpleExample': {
'handlers': ['console'],
'propagate': 'no',
'level': 'INFO'
},
'root': {
'handlers': ['console'],
'level': 'DEBUG'
}
},
'version': '1',
'formatters': {
'simple': {
'datefmt': "'%Y-%m-%d %H:%M:%S'",
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
}
}
},
'treatment': {'second': 5, 'last': 4, 'first': 4},
'treatment_plan': [[4, 5, 4], [4, 5, 4], [5, 5, 5]]
}

所有函数都交付了相同的结果,但时间差异是巨大的!函数 gen_dict_extract(k,o)是我从这里的函数改编的函数,实际上它非常像来自 Alfe 的 find函数,主要的区别是,我在检查给定对象是否有 iteritem 函数,以防在递归过程中传递字符串:

# python 2
def gen_dict_extract(key, var):
if hasattr(var,'iteritems'): # hasattr(var,'items') for python 3
for k, v in var.iteritems(): # var.items() for python 3
if k == key:
yield v
if isinstance(v, dict):
for result in gen_dict_extract(key, v):
yield result
elif isinstance(v, list):
for d in v:
for result in gen_dict_extract(key, d):
yield result

所以这个变量是这里最快最安全的函数。而且 find_all_items是难以置信的慢,远远落后于第二慢的 get_recursivley,而其他的,除了 dict_extract,彼此接近。函数 funkeyHole只有在查找字符串时才能工作。

有趣的学习方面:)

我只是想使用 yield from迭代@heerei-software 的优秀答案,并接受顶级列表。

def gen_dict_extract(var, key):
if isinstance(var, dict):
for k, v in var.items():
if k == key:
yield v
if isinstance(v, (dict, list)):
yield from gen_dict_extract(v, key)
elif isinstance(var, list):
for d in var:
yield from gen_dict_extract(d, key)

跟进@heerei 软件的答案 @ Bruno-Bronosky 的评论,如果你想遍历一个列表/一组键:

def gen_dict_extract(var, keys):
for key in keys:
if hasattr(var, 'items'):
for k, v in var.items():
if k == key:
yield v
if isinstance(v, dict):
for result in gen_dict_extract([key], v):
yield result
elif isinstance(v, list):
for d in v:
for result in gen_dict_extract([key], d):
yield result

注意,我传递的列表只有一个元素([ key ]} ,而不是字符串键。

pip install nested-lookup 完全符合你的要求:

document = [ { 'taco' : 42 } , { 'salsa' : [ { 'burrito' : { 'taco' : 69 } } ] } ]


>>> print(nested_lookup('taco', document))
[42, 69]

我不能得到的解决方案张贴在这里工作的盒子,所以我想写一些更灵活一点。

下面的递归函数应该允许您在一组任意深度的嵌套字典和列表中收集满足某个给定键的正则表达式模式的所有值。

import re


def search(dictionary, search_pattern, output=None):
"""
Search nested dictionaries and lists using a regex search
pattern to match a key and return the corresponding value(s).
"""
if output is None:
output = []


pattern = re.compile(search_pattern)
    

for k, v in dictionary.items():
pattern_found = pattern.search(k)
if not pattern_found:
if isinstance(v, list):
for item in v:
if isinstance(item, dict):
search(item, search_pattern, output)
if isinstance(v, dict):
search(v, search_pattern, output)
else:
if pattern_found:
output.append(v)


return output

如果你想搜索一个特定的术语,你总是可以使你的搜索模式像 r'\bsome_term\b'

自从 在 Python 中有一个最大的递归深度以来,我会考虑对任意规模采用迭代方法:

def get_ids(data: Dict, key: str) -> List:
stack = [data]
result = []
while stack:
elem = stack.pop()
if isinstance(elem, dict):
for k, v in elem.items():
if k == key:
result.append(v)
if isinstance(elem, (list, dict)):
stack.append(v)
elif isinstance(elem, list):
for obj in elem:
stack.append(obj)
return result