from itertools import filterfalse

def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in filterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
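The timings below call a `unique` helper that the snippet above does not define; presumably it just materializes the generator into a list. A minimal sketch under that assumption (the wrapper's name and signature are guesses, not from the original answer):

def unique(iterable, key=None):
    # assumed wrapper: collect the generator's output into a list
    return list(unique_everseen(iterable, key))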
In [118]: unique([1,5,1,1,4,3,4])
Out[118]: [1, 5, 4, 3]
I tried it with growing data sizes and saw sub-linear time complexity (not definitive, but it suggests this should be fine for normal data).
In [122]: %timeit unique(np.random.randint(5, size=(1)))
10000 loops, best of 3: 25.3 us per loop

In [123]: %timeit unique(np.random.randint(5, size=(10)))
10000 loops, best of 3: 42.9 us per loop

In [124]: %timeit unique(np.random.randint(5, size=(100)))
10000 loops, best of 3: 132 us per loop

In [125]: %timeit unique(np.random.randint(5, size=(1000)))
1000 loops, best of 3: 1.05 ms per loop

In [126]: %timeit unique(np.random.randint(5, size=(10000)))
100 loops, best of 3: 11 ms per loop
>>> from functools import reduce  # needed on Python 3
>>> l = [5, 6, 6, 1, 1, 2, 2, 3, 4]
>>> reduce(lambda r, v: v in r[1] and r or (r[0].append(v) or r[1].add(v)) or r, l, ([], set()))[0]
[5, 6, 1, 2, 3, 4]
Explanation:
default = (list(), set())
# use list to keep order
# use set to make lookup faster
def reducer(result, item):
    if item not in result[1]:
        result[0].append(item)
        result[1].add(item)
    return result
>>> reduce(reducer, l, default)[0]
[5, 6, 1, 2, 3, 4]
import pandas as pd
import numpy as np
uniquifier = lambda alist: pd.Series(alist).drop_duplicates().tolist()
# from the chosen answer
def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
alist = np.random.randint(low=0, high=1000, size=10000).tolist()
print(uniquifier(alist) == f7(alist))  # True
Timing:
In [104]: %timeit f7(alist)
1000 loops, best of 3: 1.3 ms per loop

In [110]: %timeit uniquifier(alist)
100 loops, best of 3: 4.39 ms per loop
text = "ask not what your country can do for you ask what you can do for your country"
sentence = text.split(" ")
# keep a word only if it has not appeared earlier in the list
# (note: the slice check makes this quadratic in the number of words)
noduplicates = [sentence[i] for i in range(len(sentence)) if sentence[i] not in sentence[:i]]
print(noduplicates)
# ['ask', 'not', 'what', 'your', 'country', 'can', 'do', 'for', 'you']
%matplotlib notebook
from iteration_utilities import unique_everseen
from collections import OrderedDict
from more_itertools import unique_everseen as mi_unique_everseen
def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

def iteration_utilities_unique_everseen(seq):
    return list(unique_everseen(seq))

def more_itertools_unique_everseen(seq):
    return list(mi_unique_everseen(seq))

def odict(seq):
    return list(OrderedDict.fromkeys(seq))
from simple_benchmark import benchmark
b = benchmark([f7, iteration_utilities_unique_everseen, more_itertools_unique_everseen, odict],
              {2**i: list(range(2**i)) for i in range(1, 20)},
              'list size (no duplicates)')
b.plot()
To be thorough, I also ran a test with more duplicates, just to check whether that makes a difference:
import random
b = benchmark([f7, iteration_utilities_unique_everseen, more_itertools_unique_everseen, odict],
              {2**i: [random.randint(0, 2**(i-1)) for _ in range(2**i)] for i in range(1, 20)},
              'list size (lots of duplicates)')
b.plot()
And one containing only a single value:
b = benchmark([f7, iteration_utilities_unique_everseen, more_itertools_unique_everseen, odict],
              {2**i: [1]*(2**i) for i in range(1, 20)},
              'list size (only duplicates)')
b.plot()
x = [1, 2, 1, 3, 1, 4]
# brute force method
arr = []
for i in x:
    if i not in arr:
        arr.append(i)
# recursive method
tmp = []
def remove_duplicates(j=0):
    if j < len(x):
        if x[j] not in tmp:
            tmp.append(x[j])
        remove_duplicates(j + 1)
remove_duplicates()
…but they waste a hash lookup on every iteration. Surprisingly, given the amount of discussion and debate on this topic, there is actually a significant improvement to the code that seems to have been overlooked. As shown, each "test and set" iteration requires two hash lookups: first to test membership, x not in seen, and then again to actually add the value, seen.add(x). Since the first operation guarantees that the second will always succeed, there is wasteful duplicated effort here. And because the overall technique here is so efficient, the excess hash lookups will probably end up being the most expensive proportion of the little work that remains.
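The section cuts off before showing the improved code. A minimal sketch of one way to collapse the two lookups into one, exploiting the fact that set.add is a no-op for an element already present, so the set only grows when the element is new (the function name unique_single_lookup and the length-tracking variable are mine, not from the original answer):

def unique_single_lookup(iterable):
    seen = set()
    seen_add = seen.add
    result = []
    append = result.append
    n = 0  # current size of `seen`
    for x in iterable:
        seen_add(x)           # single hash lookup: inserts x, or does nothing if present
        if len(seen) != n:    # the set grew, so x had not been seen before
            n = len(seen)
            append(x)
    return result

len() on a set is O(1) bookkeeping rather than a hash lookup, so each element now costs one lookup instead of two.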