def find_offsets(haystack, needle):
    """
    Yield the start index of every occurrence of needle in haystack.

    Occurrences may overlap: after each hit the search resumes just one
    position past the previous start, not past the end of the match.

    Args:
        haystack: Object providing ``find(sub, start)`` that returns -1 when
            nothing is found (e.g. ``str`` or ``bytes``).
        needle: The value to search for.

    Yields:
        Start offsets in ascending order. Nothing is yielded when needle
        does not occur.
    """
    offs = haystack.find(needle)
    while offs != -1:
        yield offs
        # Advance by one, not by len(needle), so overlapping matches are kept.
        offs = haystack.find(needle, offs + 1)
# Demo: print each offset at which "o" occurs in the sample string.
for offs in find_offsets("ooottat", "o"):
    # Calling print() with a single argument behaves the same on Python 2 and 3.
    print(offs)
我会选择 Lev 的方案;不过值得指出的是,如果你之后需要更复杂的搜索,可以考虑使用 re.finditer(正则表达式带来的麻烦往往多于好处,但有时知道这个方法会很方便):
# Sample text shared by the two regex searches below.
test = "ooottat"
# 'o' matches each character separately: one (start, end) span per "o".
[match.span() for match in re.finditer('o', test)]
# [(0, 1), (1, 2), (2, 3)]
# 'o+' matches the whole run of consecutive "o" characters as a single span.
[match.span() for match in re.finditer('o+', test)]
# [(0, 3)]
根据经验,在处理 POD(Plain Old Data,普通旧数据)时,NumPy 数组的性能通常优于其他方案。字符串是 POD 的一个例子,单个字符也是。要查找某个字符在字符串中出现的所有索引,NumPy ndarray 可能是最快的方法:
def find1(str, ch):
    """
    Find every index at which the single character ch occurs, using NumPy.

    Args:
        str: The data to search. May be a bytes-like object (searched byte
            by byte) or a text string (searched code point by code point).
            The name shadows the builtin but is kept for compatibility
            with existing callers.
        ch: A one-character string or one-byte bytes object; it is
            converted to its integer value with ``ord``.

    Returns:
        The result of ``np.where`` on the match mask: a 1-tuple whose only
        element is an array of matching indices.
    """
    # 0.100 seconds for 1MB str (timing reported by the original author)
    if isinstance(str, type(u"")):
        # Text strings do not expose the buffer protocol on Python 3, so
        # np.frombuffer would reject them. UTF-32 stores exactly one 4-byte
        # unit per code point, so array indices equal character indices.
        npbuf = np.frombuffer(str.encode("utf-32-le"), dtype=np.dtype("<u4"))
    else:
        npbuf = np.frombuffer(str, dtype=np.uint8)  # Reinterpret str as a char buffer
    return np.where(npbuf == ord(ch))  # Find indices with numpy
def find2(str, ch):
    """
    Find every index at which ch occurs in str, using plain Python.

    Args:
        str: Any iterable to search (typically a text string). The name
            shadows the builtin but is kept for compatibility with
            existing callers.
        ch: The element to look for; compared with ``==`` against each
            item produced by iterating over str.

    Returns:
        A list of the matching indices in ascending order; empty when
        there is no match.
    """
    # 0.920 seconds for 1MB str (timing reported by the original author)
    return [i for i, c in enumerate(str) if c == ch]  # Find indices with python