In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop
In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop
from itertools import (takewhile,repeat)
def rawincount(filename):
    """Count the newline bytes in a file by scanning it in raw 1 MiB chunks.

    :param filename: path of the file to scan
    :return: int, number of ``b'\\n'`` bytes in the file (a final line
        without a trailing newline is therefore not counted)
    """
    # The context manager guarantees the handle is closed even if a read fails.
    with open(filename, 'rb') as f:
        # Read straight from the underlying raw stream in 1 MiB pieces;
        # takewhile() stops at the first empty chunk, i.e. end of file.
        bufgen = takewhile(
            lambda x: x,
            (f.raw.read(1024 * 1024) for _ in repeat(None)),
        )
        return sum(buf.count(b'\n') for buf in bufgen)
import subprocess
def count_file_lines(file_path):
    """Counts the number of lines in a file using wc utility.

    The ``wc`` executable must be available on PATH.

    :param file_path: path to file
    :return: int, no of lines
    :raises subprocess.CalledProcessError: if ``wc`` exits with a non-zero status
    """
    output = subprocess.check_output(['wc', '-l', file_path])
    # check_output returns bytes, so splitting on the str ' ' would raise
    # TypeError. A bare split() works on bytes and also skips the leading
    # padding some wc implementations put before the count.
    return int(output.split()[0])
import timeit
from numba import jit, prangeimport numpy as np
from itertools import (takewhile,repeat)
# Benchmark input: 40.6MB, 371755 line file.
FILE = '../data/us_confirmed.csv'

# Integer value of the newline byte that the Numba kernels compare against.
# NOTE: despite the name this is the line feed ('\n'), not a carriage return.
CR = ord('\n')
# Copied from the question above. Used as a benchmark
def file_len(fname):
    """Return the number of lines in *fname* by iterating over it in text mode.

    :param fname: path of the file to count
    :return: int, number of lines; 0 for an empty file
    """
    # Start at 0 so an empty file (loop body never runs) returns 0 instead of
    # raising UnboundLocalError on the loop variable.
    count = 0
    with open(fname) as f:
        # enumerate(..., 1) leaves the 1-based index of the last line in count.
        for count, _ in enumerate(f, 1):
            pass
    return count
# Copied from another answer. Used as a benchmark
def rawincount(filename):
    """Count the newline bytes in a file by reading it in 10 MiB chunks.

    :param filename: path of the file to scan
    :return: int, number of ``b'\\n'`` bytes in the file
    """
    # The context manager closes the handle; the original left it open.
    with open(filename, 'rb') as f:
        # takewhile() ends the stream of chunks at the first empty read (EOF).
        bufgen = takewhile(
            lambda x: x,
            (f.read(1024 * 1024 * 10) for _ in repeat(None)),
        )
        return sum(buf.count(b'\n') for buf in bufgen)
# Single thread
@jit(nopython=True)
def numbacountsingle_chunk(bs):
    """Count the elements of *bs* that equal the newline byte value ``CR``.

    Compiled by Numba in nopython mode; *bs* is indexed position by position.

    :param bs: indexable sequence of byte values (the caller passes ``bytes``)
    :return: number of positions whose value equals ``CR``
    """
    hits = 0
    for pos in range(len(bs)):
        if bs[pos] == CR:
            hits += 1
    return hits
def numbacountsingle(filename):
    """Count newline bytes in *filename* with the single-threaded Numba kernel.

    The file is read in 10 MiB chunks and each chunk is handed to
    ``numbacountsingle_chunk``.

    :param filename: path of the file to scan
    :return: int, total number of newline bytes
    """
    total = 0
    # The context manager closes the handle; the original left it open.
    with open(filename, "rb") as f:
        while True:
            chunk = f.read(1024 * 1024 * 10)
            # Test for EOF before calling the kernel so it is never invoked
            # on the empty chunk that marks the end of the file.
            if not chunk:
                break
            total += numbacountsingle_chunk(chunk)
    return total
# Multi thread
@jit(nopython=True, parallel=True)
def numbacountparallel_chunk(bs):
    """Count the elements of *bs* that equal the newline byte value ``CR``.

    Same loop as the single-threaded kernel, but iterated with ``prange``
    under ``parallel=True`` so Numba may split the range across threads.

    :param bs: indexable array of byte values (the caller passes a uint8 array)
    :return: number of positions whose value equals ``CR``
    """
    hits = 0
    for pos in prange(len(bs)):
        if bs[pos] == CR:
            hits += 1
    return hits
def numbacountparallel(filename):
    """Count newline bytes in *filename* with the parallel Numba kernel.

    The file is read in 10 MiB chunks; each chunk is viewed as a uint8 NumPy
    array (no copy) and handed to ``numbacountparallel_chunk``.

    :param filename: path of the file to scan
    :return: int, total number of newline bytes
    """
    total = 0
    # The context manager closes the handle; the original left it open.
    with open(filename, "rb") as f:
        while True:
            chunk = f.read(1024 * 1024 * 10)
            # Test for EOF before calling the kernel so it is never invoked
            # on the empty chunk that marks the end of the file.
            if not chunk:
                break
            total += numbacountparallel_chunk(np.frombuffer(chunk, dtype=np.uint8))
    return total
# Benchmark driver: for each implementation print its name, its line count
# for FILE, and the total time of 100 timed calls.
print('numbacountparallel')
print(numbacountparallel(FILE))  # This allows Numba to compile and cache the function without adding to the time.
print(timeit.Timer(lambda: numbacountparallel(FILE)).timeit(number=100))

print('\nnumbacountsingle')
print(numbacountsingle(FILE))
print(timeit.Timer(lambda: numbacountsingle(FILE)).timeit(number=100))

print('\nfile_len')
print(file_len(FILE))
# Time file_len itself; the original timed rawincount under this label.
print(timeit.Timer(lambda: file_len(FILE)).timeit(number=100))

print('\nrawincount')
print(rawincount(FILE))
print(timeit.Timer(lambda: rawincount(FILE)).timeit(number=100))
def buf_count_newlines_gen(fname):
    """Count newline bytes in *fname* using unbuffered 64 KiB raw reads.

    :param fname: path of the file to scan
    :return: int, number of ``b"\\n"`` bytes in the file
    """

    def _chunks(read_chunk):
        # Yield successive non-empty chunks; an empty read signals end of file.
        chunk = read_chunk(2 ** 16)
        while chunk:
            yield chunk
            chunk = read_chunk(2 ** 16)

    with open(fname, "rb") as handle:
        total = sum(chunk.count(b"\n") for chunk in _chunks(handle.raw.read))
    return total
它速度快，内存效率高。大多数其他解决方案大约慢 20 倍。
重现该图表的代码：
import mmapimport subprocessfrom functools import partial
import perfplot
def setup(n):fname = "t.txt"with open(fname, "w") as f:for i in range(n):f.write(str(i) + "\n")return fname
def for_enumerate(fname):
    """Count the lines of *fname* by enumerating the file object.

    :param fname: path of the file to count
    :return: int, number of lines; 0 for an empty file
    """
    # Starting the enumeration at 1 makes the last index the line count
    # directly. The original returned ``i + 1`` with ``i`` preset to 0, which
    # reported one line for an empty file.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count
def sum1(fname):
    """Count the lines of *fname* by summing 1 for every line iterated.

    :param fname: path of the file to count
    :return: int, number of lines
    """
    # Open through a context manager so the handle is closed deterministically
    # rather than whenever the anonymous file object is garbage collected.
    with open(fname) as f:
        return sum(1 for _ in f)
def mmap_count(fname):
    """Count the lines of *fname* by calling ``readline`` on a memory map.

    :param fname: path of the file to count
    :return: int, number of lines; 0 for an empty file
    """
    # Read-only access is enough for counting, so open "rb" and map with
    # ACCESS_READ instead of requiring write permission via "r+".
    with open(fname, "rb") as f:
        # mmap refuses to map a zero-length file (ValueError). Seeking to the
        # end returns the file size, so an empty file is reported as 0 lines.
        if f.seek(0, 2) == 0:
            return 0
        # Length 0 maps the whole file; the context manager unmaps it on exit.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            lines = 0
            # readline() returns b"" once the end of the map is reached.
            for _ in iter(buf.readline, b""):
                lines += 1
            return lines
def for_open(fname):
    """Count the lines of *fname* with a plain ``for`` loop over the file.

    :param fname: path of the file to count
    :return: int, number of lines
    """
    lines = 0
    # The context manager closes the handle; the original iterated over an
    # anonymous ``open(fname)`` that was never explicitly closed.
    with open(fname) as f:
        for _ in f:
            lines += 1
    return lines
def buf_count_newlines(fname):
    """Count newline characters in *fname* using 64 Ki-character text reads.

    :param fname: path of the file to scan
    :return: int, number of ``"\\n"`` characters in the file
    """
    chunk_size = 2 ** 16
    total = 0
    with open(fname) as src:
        while True:
            chunk = src.read(chunk_size)
            # An empty string from read() means end of file.
            if not chunk:
                break
            total += chunk.count("\n")
    return total
def buf_count_newlines_gen(fname):
    """Count newline bytes in *fname* using unbuffered 64 KiB raw reads.

    :param fname: path of the file to scan
    :return: int, number of ``b"\\n"`` bytes in the file
    """
    total = 0
    with open(fname, "rb") as stream:
        read_raw = stream.raw.read
        while True:
            block = read_raw(2 ** 16)
            # An empty read marks end of file.
            if not block:
                break
            total += block.count(b"\n")
    return total
def wc_l(fname):
    """Return the line count that the external ``wc -l`` reports for *fname*.

    :param fname: path of the file to count
    :return: int, the first whitespace-separated field of the ``wc`` output
    """
    raw_output = subprocess.check_output(["wc", "-l", fname])
    count_field = raw_output.split()[0]
    return int(count_field)
def sum_partial(fname):
    """Count newline characters in *fname* via ``iter`` over a partial ``read``.

    :param fname: path of the file to scan
    :return: int, number of ``"\\n"`` characters in the file
    """
    total = 0
    with open(fname) as src:
        read_block = partial(src.read, 2 ** 16)
        # iter(callable, sentinel) keeps calling read_block until it returns "".
        for block in iter(read_block, ""):
            total += block.count("\n")
    return total
def read_count(fname):
    """Count newline characters by reading the whole of *fname* into memory.

    :param fname: path of the file to scan
    :return: int, number of ``"\\n"`` characters in the file
    """
    # The context manager closes the handle; the original chained the calls on
    # an anonymous ``open(fname)`` that was never explicitly closed.
    with open(fname) as f:
        return f.read().count("\n")
# Run every line-counting kernel against files of 2**0 .. 2**26 lines
# (produced by setup), then save the resulting plot and display it.
b = perfplot.bench(
    setup=setup,
    kernels=[
        for_enumerate,
        sum1,
        mmap_count,
        for_open,
        wc_l,
        buf_count_newlines,
        buf_count_newlines_gen,
        sum_partial,
        read_count,
    ],
    n_range=[2 ** k for k in range(27)],
    xlabel="num lines",
)
b.save("out.png")
b.show()