lines=100
input_file=/usr/share/dict/words
# This is the basic selection method
<$input_file sort -R | head -n $lines
# If the file has duplicates that must never cause duplicate results
<$input_file sort | uniq | sort -R | head -n $lines
# If the file has blank lines that must be filtered, use sed
<$input_file sed $'/^[ \t]*$/d' | sort -R | head -n $lines
$ time ./powershuf.py -n 10 --file lines_78000000000.txt > /dev/null
./powershuf.py -n 10 --file lines_78000000000.txt > /dev/null 0.02s user 0.01s system 80% cpu 0.047 total
$ time shuf -n 10 lines_78000000000.txt
shuf -n 10 lines_78000000000.txt 2171.20s user 22.17s system 99% cpu 36:35.80 total
瓶颈是CPU和不使用多线程,它固定了1个核在100%,其他15个没有使用。
Python是我经常使用的,所以我将使用它来提高速度:
#!/bin/python3
import random
f = open("lines_78000000000.txt", "rt")
count = 0
while 1:
buffer = f.read(65536)
if not buffer: break
count += buffer.count('\n')
for i in range(10):
f.readline(random.randint(1, count))
这只花了我不到一分钟:
$ time ./shuf.py
./shuf.py 42.57s user 16.19s system 98% cpu 59.752 total
# randomly sample select 5% of lines in file
# including header row, exclude blank lines, new seed
time \
awk 'BEGIN {srand()}
!/^$/ { if (rand() <= .05 || FNR==1) print > "data-sample.txt"}' data.txt
# awk tsv004 3.76s user 1.46s system 91% cpu 5.716 total
# Function to sample N lines randomly from a file
# Parameter $1: Name of the original file
# Parameter $2: N lines to be sampled
rand_line_sampler() {
N_t=$(awk '{print $1}' $1 | wc -l) # Number of total lines
N_t_m_d=$(( $N_t - $2 - 1 )) # Number oftotal lines minus desired number of lines
N_d_m_1=$(( $2 - 1)) # Number of desired lines minus 1
# vector to have the 0 (fail) with size of N_t_m_d
echo '0' > vector_0.temp
for i in $(seq 1 1 $N_t_m_d); do
echo "0" >> vector_0.temp
done
# vector to have the 1 (success) with size of desired number of lines
echo '1' > vector_1.temp
for i in $(seq 1 1 $N_d_m_1); do
echo "1" >> vector_1.temp
done
cat vector_1.temp vector_0.temp | shuf > rand_vector.temp
paste -d" " rand_vector.temp $1 |
awk '$1 != 0 {$1=""; print}' |
sed 's/^ *//' > sampled_file.txt # file with the sampled lines
rm vector_0.temp vector_1.temp rand_vector.temp
}
rand_line_sampler "parameter_1" "parameter_2"