This does involve copying the data, though, since Python strings need to be immutable. An advantage is that you can also search for e.g. a rising edge by finding \x00\x01
import numpy,time
def find1(arr, value):
    """Return the index of the first element of ``arr`` equal to ``value``.

    The comparison ``arr == value`` yields a boolean array, which is
    serialized to raw bytes (one byte per element, ``\\x01`` for a match)
    and searched with ``bytes.find``.  Serializing copies the data.

    For a multi-dimensional ``arr`` the returned position refers to the
    flattened (C-order) data.

    Returns:
        The zero-based index of the first match, or -1 when nothing matches.
    """
    # tobytes() replaces the removed ndarray.tostring(); the needle must be
    # a bytes literal because bytes.find rejects str on Python 3.
    return (arr == value).tobytes().find(b'\x01')
def find2(arr, value):
    """Find ``value`` along the innermost axis of ``arr``.

    Returns:
        For each 1-D lane along the last axis, the index of the first
        element equal to ``value``, or -1 if that lane has no match.  The
        result has the shape of ``arr`` with the last axis removed (a scalar
        for 1-D input).
    """
    matches = arr == value
    first = matches.argmax(axis=-1)
    # argmax returns 0 for a lane without any True, so each lane needs its
    # own "found" flag; reducing over the whole array would let one matching
    # lane hide the misses in all the others.
    found = matches.any(axis=-1)
    # Subtracting the inverted flag turns the ambiguous 0 into -1 for misses.
    return first - ~found
# Compare find1 (applied per lane via apply_along_axis) with the vectorized
# find2 on four shapes that all hold 100 million elements in total.
for size in [(1, 100000000), (10000, 10000), (1000000, 100), (10000000, 10)]:
    print(size)
    # One in eight entries is a 1, so most lanes contain an early match.
    values = numpy.random.choice([0, 0, 0, 0, 0, 0, 0, 1], size=size)
    v = values > 0

    # perf_counter is the clock intended for measuring short durations;
    # time.time() can jump when the system clock is adjusted.
    t = time.perf_counter()
    numpy.apply_along_axis(find1, -1, v, 1)
    print('find1', time.perf_counter() - t)

    t = time.perf_counter()
    find2(v, 1)
    print('find2', time.perf_counter() - t)
虽然对你来说已经太迟了,但是为了以后的参考:
在 numpy 实现该功能之前,使用 numba (1) 是最简单的方法。如果您使用的是 anaconda python 发行版,那么它应该已经被安装了。
The code will be compiled so it will be fast.
@jit(nopython=True)
def find_first(item, vec):
    """Return the index of the first occurrence of item in vec, or -1 if absent."""
    # range instead of xrange: xrange does not exist on Python 3.
    for i in range(len(vec)):
        if item == vec[i]:
            return i
    return -1
The array had 1 million elements and tests were run 100 times. Results still fluctuate a bit, but the qualitative trend is clear: Python and f2py quit at the first element so they scale differently. Python gets too slow if the needle is not in the first 1%, whereas f2py is fast (but you need to compile it).
subroutine find_first(needle, haystack, haystack_length, index)
    ! Linear search: sets index to the zero-based position of the first
    ! element of haystack equal to needle, or to -1 when no element matches.
    implicit none
    integer, intent(in) :: needle
    integer, intent(in) :: haystack_length
    integer, intent(in), dimension(haystack_length) :: haystack
    !f2py intent(inplace) haystack
    integer, intent(out) :: index
    integer :: pos

    ! Default result, kept if the loop below never finds a match.
    index = -1
    scan: do pos = 1, haystack_length
        if (haystack(pos) /= needle) cycle scan
        ! Fortran positions are one-based; report a zero-based index.
        index = pos - 1
        exit scan
    end do scan
end subroutine find_first
如果您要查找的不是 integer 类型的数据,只需更改类型声明,然后使用以下命令编译:
f2py -c -m search search.f90
然后你可以做(来自 Python) :
# `search` is the extension module built by `f2py -c -m search search.f90`.
import search
# Print the docstring attached to the wrapped routine to check its call signature.
print(search.find_first.__doc__)
# Only the needle and the array are passed; presumably f2py derives
# haystack_length from the array and returns `index` as the result --
# confirm against the docstring printed above.
a = search.find_first(your_int_needle, your_int_array)
# IPython session: times `index` (not defined in this excerpt) against
# `find_first` for a target near the start (5) and at the very end (99999)
# of a 100000-element range; the trailing comments are the recorded results.
arr = np.arange(100000)
%timeit index(arr, 5) # 1000000 loops, best of 3: 1.88 µs per loop
%timeit find_first(5, arr) # 1000000 loops, best of 3: 1.7 µs per loop
%timeit index(arr, 99999) # 10000 loops, best of 3: 118 µs per loop
%timeit find_first(99999, arr) # 10000 loops, best of 3: 96 µs per loop
import numpy as np
from numba import jit
from timeit import timeit
def find_first(x):
    """Return the index of the first non-zero element of ``x``, or -1.

    The array is scanned in windows that start at 32 elements and grow by
    half after every miss, capped at 9600, so an early hit is found without
    touching the rest of the data.
    """
    start = 0
    width = 32
    while start < x.size:
        window = x[start:start + width]
        (hits,) = window.nonzero()
        if hits.size:
            # Window-relative position -> position in the full array.
            return hits[0] + start
        start += width
        width = min(9600, width + width // 2)
    return -1
@jit(nopython=True)
def find_first_numba(vec):
    """Return the index of the first truthy element of vec, or -1 if none."""
    for position, element in enumerate(vec):
        if element:
            return position
    return -1
SIZE = 10_000_000


def _report(label, stmt, number):
    """Run ``stmt`` ``number`` times and print the mean time per call in ms."""
    elapsed = timeit(stmt, number=number)
    # Total seconds for `number` calls -> milliseconds for one call.
    print(label, elapsed * (1000 / number), 'ms')


# One buffer is allocated up front and refilled for each scenario.
x = np.empty(SIZE)
# Call the jitted function once on a tiny slice before any timing starts
# (presumably so that JIT compilation is not included in the measurements).
find_first_numba(x[:10])

print('---- FIRST ----')
x.fill(0)
x[0] = 1
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 1000)
_report('find_first_numba', lambda: find_first_numba(x), 1000)

print('---- LAST ----')
x.fill(0)
x[-1] = 1
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)

print('---- NONE ----')
x.fill(0)
# No element is set, so the index array is empty and is not subscripted.
_report('ndarray.nonzero', lambda: x.nonzero()[0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)

print('---- ALL ----')
x.fill(1)
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)
在我的机器上的结果是:
---- FIRST ----
ndarray.nonzero 54.733994480002366 ms
find_first 0.0013148509997336078 ms
find_first_numba 0.0002839310000126716 ms
---- LAST ----
ndarray.nonzero 54.56336712999928 ms
find_first 25.38929685000312 ms
find_first_numba 8.022820680002951 ms
---- NONE ----
ndarray.nonzero 24.13432420999925 ms
find_first 25.345200140000088 ms
find_first_numba 8.154927100003988 ms
---- ALL ----
ndarray.nonzero 55.753537260002304 ms
find_first 0.0014760300018679118 ms
find_first_numba 0.0004358099977253005 ms
import numpy as np
from numba import jit
from timeit import timeit
def find_first(x):
    """Return the index of the first non-zero element of ``x``, or -1.

    The array's memory is reinterpreted as one boolean per byte so that
    ``argmax`` can locate the first non-zero byte; dividing that byte
    position by the item size gives the element index.  Intended for 1-D
    arrays whose memory can be viewed byte-wise.

    Returns:
        The zero-based index of the first truthy element, or -1 when the
        array is empty or holds no truthy element.
    """
    offset = 0
    size = x.size
    itemsize = x.itemsize
    # The loop never runs for an empty array (argmax would raise on it).
    while offset < size:
        tail = x[offset:]
        raw = tail.view(bool)
        byte_pos = raw.argmax()
        if not raw[byte_pos]:
            # argmax fell back to 0 because every remaining byte is zero.
            return -1
        idx = byte_pos // itemsize
        if tail[idx]:
            return offset + idx
        # Non-zero bytes but a falsy value (e.g. -0.0): skip this element
        # and keep searching instead of reporting "not found".
        offset += idx + 1
    return -1
@jit(nopython=True)
def find_first_numba(vec):
    """Return the index of the first truthy element of vec, or -1 if none."""
    for position, element in enumerate(vec):
        if element:
            return position
    return -1
SIZE = 10_000_000


def _report(label, stmt, number):
    """Run ``stmt`` ``number`` times and print the mean time per call in ms."""
    elapsed = timeit(stmt, number=number)
    # Total seconds for `number` calls -> milliseconds for one call.
    print(label, elapsed * (1000 / number), 'ms')


# One buffer is allocated up front and refilled for each scenario.
x = np.empty(SIZE)
# Call the jitted function once on a tiny slice before any timing starts
# (presumably so that JIT compilation is not included in the measurements).
find_first_numba(x[:10])

print('---- FIRST ----')
x.fill(0)
x[0] = 1
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 1000)
_report('find_first_numba', lambda: find_first_numba(x), 1000)

print('---- LAST ----')
x.fill(0)
x[-1] = 1
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)

print('---- NONE ----')
x.fill(0)
# No element is set, so the index array is empty and is not subscripted.
_report('ndarray.nonzero', lambda: x.nonzero()[0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)

print('---- ALL ----')
x.fill(1)
_report('ndarray.nonzero', lambda: x.nonzero()[0][0], 100)
_report('find_first', lambda: find_first(x), 100)
_report('find_first_numba', lambda: find_first_numba(x), 100)
在我的机器上得到的结果是:
---- FIRST ----
ndarray.nonzero 57.63976670001284 ms
find_first 0.0010841979965334758 ms
find_first_numba 0.0002308919938514009 ms
---- LAST ----
ndarray.nonzero 58.96685277999495 ms
find_first 5.923203580023255 ms
find_first_numba 8.762269750004634 ms
---- NONE ----
ndarray.nonzero 25.13398071998381 ms
find_first 5.924289370013867 ms
find_first_numba 8.810063839919167 ms
---- ALL ----
ndarray.nonzero 55.181210660084616 ms
find_first 0.001246920000994578 ms
find_first_numba 0.00028766007744707167 ms