import itertools
import string
def isplitwords(s):
i = iter(s)
while True:
r = []
for c in itertools.takewhile(lambda x: not x in string.whitespace, i):
r.append(c)
else:
if r:
yield ''.join(r)
continue
else:
raise StopIteration()
import string
def gsplit(s,sep=string.whitespace):
word = []
for c in s:
if c in sep:
if word:
yield "".join(word)
word = []
else:
word.append(c)
if word:
yield "".join(word)
def splitStr(string, sep="\s+"):
# warning: does not yet work if sep is a lookahead like `(?=b)`
if sep=='':
return (c for c in string)
else:
return (_.group(1) for _ in re.finditer(f'(?:^|{sep})((?:(?!{sep}).)*)', string))
# alternatively, more verbosely:
regex = f'(?:^|{sep})((?:(?!{sep}).)*)'
for match in re.finditer(regex, string):
fragment = match.group(1)
yield fragment
其思想是,((?!pat).)*“否定”一个组,通过确保它贪婪地匹配,直到模式开始匹配(Lookahead 不使用正则表达式有限状态机中的字符串)。在伪代码中: 重复使用(begin-of-string xor {sep}) + as much as possible until we would be able to begin again (or hit end of string)
def isplit(source, sep=None, regex=False):
"""
generator version of str.split()
:param source:
source string (unicode or bytes)
:param sep:
separator to split on.
:param regex:
if True, will treat sep as regular expression.
:returns:
generator yielding elements of string.
"""
if sep is None:
# mimic default python behavior
source = source.strip()
sep = "\\s+"
if isinstance(source, bytes):
sep = sep.encode("ascii")
regex = True
if regex:
# version using re.finditer()
if not hasattr(sep, "finditer"):
sep = re.compile(sep)
start = 0
for m in sep.finditer(source):
idx = m.start()
assert idx >= start
yield source[start:idx]
start = m.end()
yield source[start:]
else:
# version using str.find(), less overhead than re.finditer()
sepsize = len(sep)
start = 0
while True:
idx = source.find(sep, start)
if idx == -1:
yield source[start:]
return
yield source[start:idx]
start = idx + sepsize
import string
def _str_split_chars(s, delims):
"Split the string `s` by characters contained in `delims`, including the \
empty parts between two consecutive delimiters"
start = 0
for i, c in enumerate(s):
if c in delims:
yield s[start:i]
start = i+1
yield s[start:]
def _str_split_chars_ne(s, delims):
"Split the string `s` by longest possible sequences of characters \
contained in `delims`"
start = 0
in_s = False
for i, c in enumerate(s):
if c in delims:
if in_s:
yield s[start:i]
in_s = False
else:
if not in_s:
in_s = True
start = i
if in_s:
yield s[start:]
def _str_split_word(s, delim):
"Split the string `s` by the string `delim`"
dlen = len(delim)
start = 0
try:
while True:
i = s.index(delim, start)
yield s[start:i]
start = i+dlen
except ValueError:
pass
yield s[start:]
def _str_split_word_ne(s, delim):
"Split the string `s` by the string `delim`, not including empty parts \
between two consecutive delimiters"
dlen = len(delim)
start = 0
try:
while True:
i = s.index(delim, start)
if start!=i:
yield s[start:i]
start = i+dlen
except ValueError:
pass
if start<len(s):
yield s[start:]
def str_split(s, *delims, empty=None):
"""\
Split the string `s` by the rest of the arguments, possibly omitting
empty parts (`empty` keyword argument is responsible for that).
This is a generator function.
When only one delimiter is supplied, the string is simply split by it.
`empty` is then `True` by default.
str_split('[]aaa[][]bb[c', '[]')
-> '', 'aaa', '', 'bb[c'
str_split('[]aaa[][]bb[c', '[]', empty=False)
-> 'aaa', 'bb[c'
When multiple delimiters are supplied, the string is split by longest
possible sequences of those delimiters by default, or, if `empty` is set to
`True`, empty strings between the delimiters are also included. Note that
the delimiters in this case may only be single characters.
str_split('aaa, bb : c;', ' ', ',', ':', ';')
-> 'aaa', 'bb', 'c'
str_split('aaa, bb : c;', *' ,:;', empty=True)
-> 'aaa', '', 'bb', '', '', 'c', ''
When no delimiters are supplied, `string.whitespace` is used, so the effect
is the same as `str.split()`, except this function is a generator.
str_split('aaa\\t bb c \\n')
-> 'aaa', 'bb', 'c'
"""
if len(delims)==1:
f = _str_split_word if empty is None or empty else _str_split_word_ne
return f(s, delims[0])
if len(delims)==0:
delims = string.whitespace
delims = set(delims) if len(delims)>=4 else ''.join(delims)
if any(len(d)>1 for d in delims):
raise ValueError("Only 1-character multiple delimiters are supported")
f = _str_split_chars if empty else _str_split_chars_ne
return f(s, delims)
def split_generator(f,s):
"""
f is a string, s is the substring we split on.
This produces a generator rather than a possibly
memory intensive list.
"""
i=0
j=0
while j<len(f):
if i>=len(f):
yield f[j:]
j=i
elif f[i] != s:
i=i+1
else:
yield [f[j:i]]
j=i+1
i=i+1
def isplit(string, delimiter = None):
"""Like string.split but returns an iterator (lazy)
Multiple character delimters are not handled.
"""
if delimiter is None:
# Whitespace delimited by default
delim = r"\s"
elif len(delimiter) != 1:
raise ValueError("Can only handle single character delimiters",
delimiter)
else:
# Escape, incase it's "\", "*" etc.
delim = re.escape(delimiter)
return (x.group(0) for x in re.finditer(r"[^{}]+".format(delim), string))
import itertools as it
def iter_split(string, sep=None):
sep = sep or ' '
groups = it.groupby(string, lambda s: s != sep)
return (''.join(g) for k, g in groups if k)
the_text = "100 " * 9999 + "100"
def test_function( method ):
def fn( ):
total = 0
for x in method( the_text ):
total += int( x )
return total
return fn
我想展示如何使用 find _ iter 解决方案为给定的分隔符返回一个生成器,然后使用 itertools 中的成对配方来构建前面的下一个迭代,这个迭代将获得原始分割方法中的实际单词。
from more_itertools import pairwise
import re
string = "dasdha hasud hasuid hsuia dhsuai dhasiu dhaui d"
delimiter = " "
# split according to the given delimiter including segments beginning at the beginning and ending at the end
for prev, curr in pairwise(re.finditer("^|[{0}]+|$".format(delimiter), string)):
print(string[prev.end(): curr.start()])
def str_split(text: str, separator: str) -> Iterable[str]:
i = 0
n = len(text)
while i <= n:
j = text.find(separator, i)
if j == -1:
j = n
yield text[i:j]
i = j + 1
def isplit(text, sep=None, maxsplit=-1):
if not isinstance(text, (str, bytes)):
raise TypeError(f"requires 'str' or 'bytes' but received a '{type(text).__name__}'")
if sep in ('', b''):
raise ValueError('empty separator')
if maxsplit == 0 or not text:
yield text
return
regex = (
re.escape(sep) if sep is not None
else [br'\s+', r'\s+'][isinstance(text, str)]
)
yield from re.split(regex, text, maxsplit=max(0, maxsplit))
def gsplit(todo):
chunk= 100
while todo:
splits = todo.split(maxsplit=chunk)
if len(splits) == chunk:
todo = splits.pop()
else:
todo=None
for item in splits:
yield item