在JavaScript中将大字符串分割为n大小的块

我想把一个非常大的字符串(比如10,000个字符)分割成n大小的块。

就性能而言,最好的方法是什么?

例如:将"1234567890"按大小2分割,将得到["12", "34", "56", "78", "90"]。

这样的事情是否可能使用String.prototype.match,如果是这样,这是否是在性能方面的最佳方式?

235427 次浏览

你可以这样做:

"1234567890".match(/.{1,2}/g);
// Results in:
["12", "34", "56", "78", "90"]

如果字符串的大小不是chunk-size的倍数,该方法仍然有效:

"123456789".match(/.{1,2}/g);
// Results in:
["12", "34", "56", "78", "9"]

一般来说,对于任何字符串,如果你想从中提取长度最多为n的子字符串,你可以这样做:

str.match(/.{1,n}/g); // Replace n with the size of the substring

如果你的字符串可以包含换行符或回车,你会这样做:

str.match(/(.|[\r\n]){1,n}/g); // Replace n with the size of the substring

至于性能,我用了大约10k个字符,在Chrome上花了一秒钟多一点的时间。YMMV。

这也可以用在可重用函数中:

/**
 * Splits a string into consecutive chunks of at most `length` characters
 * using a regular expression.
 *
 * @param {string} str - The string to split.
 * @param {number} length - Maximum chunk size; expected to be a positive integer.
 * @returns {string[]} The chunks in order; an empty array for an empty string.
 */
function chunkString(str, length) {
  // `[\s\S]` is used instead of `.` because `.` does not match line
  // terminators, which would silently drop them from the output.
  // `match` returns null when nothing matches (empty input), so fall back to [].
  return str.match(new RegExp('[\\s\\S]{1,' + length + '}', 'g')) || [];
}
// Chunks a sample string by repeatedly cutting chunkSize characters off its front.
var str = "123456789";
var chunks = [];
var chunkSize = 2;

// Move full-size chunks over while at least chunkSize characters remain.
for (; str.length >= chunkSize; str = str.substr(chunkSize)) {
  chunks.push(str.substr(0, chunkSize));
}

// Whatever is left is shorter than chunkSize and becomes the final chunk.
if (str) {
  chunks.push(str);
}

alert(chunks); // chunks == 12,34,56,78,9
// Slice-based chunking of an already defined `str`: `lc` is the current
// offset into the string and `c` the index of the next chunk to write.
var l = str.length;
var lc = 0;
var chunks = [];
var c = 0;
var chunkSize = 2;

while (lc < l) {
  chunks[c] = str.slice(lc, lc + chunkSize);
  lc += chunkSize;
  c++;
}

我会用正则表达式…

/**
 * Regex-based chunker that also keeps line terminators inside chunks.
 *
 * @param {string} str - The string to split.
 * @param {number|string} chunkLength - Maximum chunk size; coerced to a number.
 * @returns {string[]|null} The chunks in order, or null when nothing matches
 *     (for example an empty string).
 */
var chunkStr = function(str, chunkLength) {
  // The unary plus coerces chunkLength to a number before it is spliced into
  // the pattern; [\s\S] matches any character, newlines included.
  var pattern = '[\\s\\S]{1,' + +chunkLength + '}';
  var chunker = new RegExp(pattern, 'g');

  return str.match(chunker);
};

底线:

  • match非常低效,slice更好,在Firefox上substr/substring更好
  • match对于短字符串甚至更低效(即使是缓存的regex -可能是由于regex解析设置时间)
  • match对于大块更低效(可能是由于无法“跳转”)
  • 对于更长的字符串和非常小的块大小,match在旧的IE上优于slice,但在所有其他系统上仍然失败
  • jsperf 非常棒

这是一个快速而直接的解决方案

/**
 * Splits a string into consecutive chunks of `len` characters. The last
 * chunk holds the remainder and may be shorter.
 *
 * @param {string} str - The string to split.
 * @param {number} len - Chunk size; must be a positive integer.
 * @returns {string[]} The chunks in order; an empty array for an empty string.
 * @throws {RangeError} If `len` is not a positive integer.
 */
function chunkString (str, len) {
  // Reject sizes that used to misbehave silently: a fractional len dropped
  // characters, a numeric string corrupted the running offset, and 0 or a
  // negative value failed with an opaque "Invalid array length".
  if (!Number.isInteger(len) || len < 1) {
    throw new RangeError('len must be a positive integer, got ' + String(len));
  }

  const size = Math.ceil(str.length / len);
  const r = Array(size); // pre-sized, every slot is filled below

  for (let i = 0, offset = 0; i < size; i++, offset += len) {
    // slice replaces the deprecated substr; the end index is exclusive.
    r[i] = str.slice(offset, offset + len);
  }

  return r;
}


console.log(chunkString("helloworld", 3))
// => [ "hel", "low", "orl", "d" ]


// 10,000 char string
const bigString = "helloworld".repeat(1000)
console.time("perf")
const result = chunkString(bigString, 3)
console.timeEnd("perf")
console.log(result)
// => perf: 0.385 ms
// => [ "hel", "low", "orl", "dhe", "llo", "wor", ... ]

以原型函数的形式:

/**
 * Splits the string into consecutive chunks of at most `size` characters.
 *
 * A size is honoured only when exactly one argument is passed (this arity
 * rule is kept from the original). The argument may be a number or a numeric
 * string; anything that is not a positive integer falls back to a size of 1.
 *
 * @param {number|string} [size=1] - Maximum chunk size.
 * @returns {string[]} The chunks in order; an empty array for an empty string.
 */
String.prototype.lsplit = function (...args) {
  // Normalise to a number first. Previously the raw argument was spliced into
  // the pattern, so " 4 ", "2.5" or "x" produced a quantifier that the regex
  // engine treated as literal text.
  const requested = args.length === 1 ? Number(String(args[0]).trim()) : 1;
  const size = Number.isInteger(requested) && requested > 0 ? requested : 1;

  // [\s\S] keeps line terminators (which "." would skip); match() returns
  // null when nothing matches, so fall back to [].
  return this.match(new RegExp('[\\s\\S]{1,' + size + '}', 'g')) || [];
};

我已经写了一个扩展函数,所以块长度也可以是一个数字数组,比如[1,3]

/**
 * Splits the string into chunks.
 *
 * - When `len` is a number greater than 0, the string is cut into chunks of
 *   that size (the last one may be shorter).
 * - When `len` is a non-empty array (or array-like object with `forEach`),
 *   its entries are used as chunk sizes in turn, cycling until the string is
 *   used up, e.g. "1234567890123".chunkString([1,3]) gives
 *   ['1', '234', '5', '678', '9', '012', '3'].
 *
 * @param {number|number[]} len - Chunk size, or list of chunk sizes to cycle.
 * @returns {string[]|undefined} The chunks; [] for an empty string; undefined
 *     when `len` is neither form or when the size list never consumes a
 *     character (e.g. [0]).
 */
String.prototype.chunkString = function(len) {
  var self = this;
  var total = this.length;
  var chunks;

  if (total < 1) {
    return [];
  }

  if (typeof len === 'number' && len > 0) {
    var count = Math.ceil(total / len);
    var offset = 0;
    chunks = new Array(count);
    for (var i = 0; i < count; i++) {
      chunks[i] = this.substring(offset, offset + len);
      offset += len;
    }
    return chunks;
  }

  // typeof null is 'object', so null has to be excluded explicitly before
  // .length is read; otherwise chunkString(null) throws a TypeError.
  if (len !== null && typeof len === 'object' && len.length) {
    var consumed = 0;
    chunks = [];
    do {
      len.forEach(function (size) {
        var chunk = self.substring(consumed, consumed + size);
        if (chunk !== '') {
          chunks.push(chunk);
          consumed += chunk.length;
        }
      });
      if (consumed === 0) {
        // A full pass consumed nothing (e.g. len = [0]); bail out rather
        // than loop forever.
        return undefined;
      }
    } while (consumed < total);
    return chunks;
  }

  return undefined;
};

以下代码:

"1234567890123".chunkString([1,3])

将返回:

[ '1', '234', '5', '678', '9', '012', '3' ]

我创建了几个更快的变种,你可以参见jsPerf。我最喜欢的是这个:

/**
 * Cuts `str` into pieces of `size` characters using a pre-sized result
 * array. The last piece holds whatever remains and may be shorter.
 *
 * @param {string} str - The string to split.
 * @param {number} size - Number of characters per piece.
 * @returns {string[]} The pieces in order.
 */
function chunkSubstr(str, size) {
  const numChunks = Math.ceil(str.length / size);
  const chunks = new Array(numChunks);

  let slot = 0;
  let offset = 0;
  while (slot < numChunks) {
    chunks[slot] = str.substr(offset, size);
    slot += 1;
    offset += size;
  }

  return chunks;
}

这是我正在使用的代码,它使用String.prototype.slice

是的,它是相当长的答案,因为它试图遵循当前的标准尽可能接近,当然包含了合理数量的JSDOC注释。然而,一旦缩小,代码只有828字节,一旦gzip传输,它只有497字节。

添加到String.prototype的1方法(在可用的情况下使用Object.defineProperty)是:

  1. toChunks

包含了许多测试来检查功能。

担心代码的长度会影响性能?没有必要担心,http://jsperf.com/chunk-string/3

许多额外的代码是为了确保代码在多个javascript环境中响应相同。

.
/*jslint maxlen:80, browser:true, devel:true */


/*
* Properties used by toChunks.
*/


/*property
MAX_SAFE_INTEGER, abs, ceil, configurable, defineProperty, enumerable,
floor, length, max, min, pow, prototype, slice, toChunks, value,
writable
*/


/*
* Properties used in the testing of toChunks implementation.
*/


/*property
appendChild, createTextNode, floor, fromCharCode, getElementById, length,
log, pow, push, random, toChunks
*/


(function () {
    'use strict';

    var MAX_SAFE_INTEGER = Number.MAX_SAFE_INTEGER || Math.pow(2, 53) - 1;

    /**
     * Defines a new property directly on an object, or modifies an existing
     * property on an object, and returns the object. Falls back to plain
     * assignment of the descriptor's value when Object.defineProperty is
     * not available.
     *
     * @private
     * @function
     * @param {Object} object
     * @param {string} property
     * @param {Object} descriptor
     * @return {Object}
     * @see https://goo.gl/CZnEqg
     */
    function $defineProperty(object, property, descriptor) {
        if (Object.defineProperty) {
            Object.defineProperty(object, property, descriptor);
        } else {
            object[property] = descriptor.value;
        }

        return object;
    }

    /**
     * Returns true if the operands are strictly equal with no type conversion.
     *
     * @private
     * @function
     * @param {*} a
     * @param {*} b
     * @return {boolean}
     * @see http://www.ecma-international.org/ecma-262/5.1/#sec-11.9.4
     */
    function $strictEqual(a, b) {
        return a === b;
    }

    /**
     * Returns true if the operand inputArg is undefined.
     *
     * @private
     * @function
     * @param {*} inputArg
     * @return {boolean}
     */
    function $isUndefined(inputArg) {
        return $strictEqual(typeof inputArg, 'undefined');
    }

    /**
     * The abstract operation throws an error if its argument is a value that
     * cannot be converted to an Object, otherwise returns the argument.
     *
     * @private
     * @function
     * @param {*} inputArg The object to be tested.
     * @throws {TypeError} If inputArg is null or undefined.
     * @return {*} The inputArg if coercible.
     * @see https://goo.gl/5GcmVq
     */
    function $requireObjectCoercible(inputArg) {
        var errStr;

        if (inputArg === null || $isUndefined(inputArg)) {
            errStr = 'Cannot convert argument to object: ' + inputArg;
            throw new TypeError(errStr);
        }

        return inputArg;
    }

    /**
     * The abstract operation converts its argument to a value of type string
     *
     * @private
     * @function
     * @param {*} inputArg
     * @throws {TypeError} If inputArg is a symbol.
     * @return {string}
     * @see https://people.mozilla.org/~jorendorff/es6-draft.html#sec-tostring
     */
    function $toString(inputArg) {
        var type,
            val;

        if (inputArg === null) {
            val = 'null';
        } else {
            type = typeof inputArg;
            if (type === 'string') {
                val = inputArg;
            } else if (type === 'undefined') {
                val = type;
            } else {
                if (type === 'symbol') {
                    throw new TypeError('Cannot convert symbol to string');
                }

                val = String(inputArg);
            }
        }

        return val;
    }

    /**
     * Returns a string only if the arguments is coercible otherwise throws an
     * error.
     *
     * @private
     * @function
     * @param {*} inputArg
     * @throws {TypeError} If inputArg is null or undefined.
     * @return {string}
     */
    function $onlyCoercibleToString(inputArg) {
        return $toString($requireObjectCoercible(inputArg));
    }

    /**
     * The function evaluates the passed value and converts it to an integer.
     *
     * @private
     * @function
     * @param {*} inputArg The object to be converted to an integer.
     * @return {number} If the target value is NaN, null or undefined, 0 is
     *                   returned. If the target value is false, 0 is returned
     *                   and if true, 1 is returned.
     * @see http://www.ecma-international.org/ecma-262/5.1/#sec-9.4
     */
    function $toInteger(inputArg) {
        var number = +inputArg,
            val = 0;

        // NaN is the only value that is not strictly equal to itself.
        if ($strictEqual(number, number)) {
            if (!number || number === Infinity || number === -Infinity) {
                val = number;
            } else {
                val = (number > 0 || -1) * Math.floor(Math.abs(number));
            }
        }

        return val;
    }

    /**
     * The abstract operation ToLength converts its argument to an integer
     * suitable for use as the length of an array-like object.
     *
     * @private
     * @function
     * @param {*} inputArg The object to be converted to a length.
     * @return {number} If len <= +0 then +0 else if len is +INFINITY then
     *                   2^53-1 else min(len, 2^53-1).
     * @see https://people.mozilla.org/~jorendorff/es6-draft.html#sec-tolength
     */
    function $toLength(inputArg) {
        return Math.min(Math.max($toInteger(inputArg), 0), MAX_SAFE_INTEGER);
    }

    if (!String.prototype.toChunks) {
        /**
         * This method chunks a string into an array of strings of a specified
         * chunk size.
         *
         * @function
         * @this {string} The string to be chunked.
         * @param {Number} chunkSize The size of the chunks that the string will
         *                           be chunked into. Converted to an integer
         *                           and capped at 2^53-1; values below 1
         *                           yield an empty array.
         * @throws {TypeError} If `this` is null or undefined.
         * @returns {Array} Returns an array of the chunked string.
         */
        $defineProperty(String.prototype, 'toChunks', {
            enumerable: false,
            configurable: true,
            writable: true,
            value: function (chunkSize) {
                var str = $onlyCoercibleToString(this),
                    // $toLength caps the size at MAX_SAFE_INTEGER. With an
                    // uncapped Infinity, length / chunkLength is 0, so a
                    // non-empty string produced no chunks at all instead of
                    // a single chunk holding the whole string.
                    chunkLength = $toLength(chunkSize),
                    chunked = [],
                    numChunks,
                    length,
                    index,
                    start,
                    end;

                if (chunkLength < 1) {
                    return chunked;
                }

                length = $toLength(str.length);
                numChunks = Math.ceil(length / chunkLength);
                index = 0;
                start = 0;
                end = chunkLength;
                // Size the result up front; every slot is filled below.
                chunked.length = numChunks;
                while (index < numChunks) {
                    chunked[index] = str.slice(start, end);
                    start = end;
                    end += chunkLength;
                    index += 1;
                }

                return chunked;
            }
        });
    }
}());


/*
* Some tests
*/


(function () {
    'use strict';

    var pre = document.getElementById('out'),
        chunkSizes = [],
        maxChunkSize = 512,
        testString = '',
        maxTestString = 100000,
        chunkSize,
        index = 1;

    // Collect the powers of two 2, 4, 8, ... up to and including maxChunkSize.
    do {
        chunkSize = Math.pow(2, index);
        chunkSizes.push(chunkSize);
        index += 1;
    } while (chunkSize < maxChunkSize);

    // Build a random test string from char codes 32..126.
    for (index = 0; index < maxTestString; index += 1) {
        testString += String.fromCharCode(Math.floor(Math.random() * 95) + 32);
    }

    /**
     * Appends one line of text to the output element.
     *
     * @param {*} result Value to print; concatenated with a newline.
     */
    function log(result) {
        pre.appendChild(document.createTextNode(result + '\n'));
    }

    /**
     * Runs toChunks against the test string for every collected chunk size
     * and logs whether the number of chunks and the size of the first chunk
     * match the expected values.
     */
    function test() {
        var strLength = testString.length,
            czLength = chunkSizes.length,
            czIndex,
            czValue,
            result,
            numChunks,
            pass;

        for (czIndex = 0; czIndex < czLength; czIndex += 1) {
            czValue = chunkSizes[czIndex];
            numChunks = Math.ceil(strLength / czValue);
            result = testString.toChunks(czValue);

            log('chunksize: ' + czValue);
            log(' Number of chunks:');
            log('  Calculated: ' + numChunks);
            log('  Actual:' + result.length);
            log(' First chunk size: ' + result[0].length);

            pass = result.length === numChunks &&
                result[0].length === czValue;
            log(' Passed: ' + pass);
            log('');
        }
    }

    test();
    log('');
    log('Simple test result');
    log('abcdefghijklmnopqrstuvwxyz'.toChunks(3));
}());
<pre id="out"></pre>

    /**
     * Formats a numeric value according to a mask string and returns the
     * resulting text. The function is assigned to `window.format`.
     *
     * NOTE(review): the code is minified and reuses a, b, c, d, e and f for
     * several different things; the inline comments describe each step as
     * read from the code.
     *
     * @param {string} b Mask, e.g. "##,###."; presumably "#" and "0" act as
     *     digit placeholders and the other characters as separators - TODO
     *     confirm the full mask syntax.
     * @param {*} a Value to format; coerced to a number with unary plus.
     * @returns {*} `a` unchanged when the mask is falsy or `a` is not
     *     numeric, otherwise the formatted string.
     */
    window.format = function(b, a) {
// Nothing to format: no mask, or the value does not coerce to a number.
if (!b || isNaN(+a)) return a;
// A mask that starts with "-" flips the sign of the value.
var a = b.charAt(0) == "-" ? -a : +a,
// j is truthy only when the value is negative; a is made non-negative here.
j = a < 0 ? a = -a : 0,
// Every mask character that is not a digit, "-", "+" or "#".
e = b.match(/[^\d\-\+#]/g),
// Decimal separator: the last such character, defaulting to ".".
h = e && e[e.length - 1] || ".",
// Grouping separator: the first such character when at least two were
// found, defaulting to ",".
e = e && e[1] && e[0] || ",",
// Split the mask on the decimal separator: b[0] is the integer part of the
// mask, b[1] the fraction part (if any).
b = b.split(h),
// Round to as many decimals as the fraction mask has characters.
a = a.toFixed(b[1] && b[1].length),
// Round-trip through a number to drop trailing zeros.
a = +a + "",
// Position of the last "0" in the fraction mask, when there is one.
d = b[1] && b[1].lastIndexOf("0"),
c = a.split(".");
// Re-pad the decimals when the value has fewer of them than the position of
// the last "0" in the fraction mask asks for.
if (!c[1] || c[1] && c[1].length <= d) a = (+a).toFixed(d + 1);
// Split the integer mask on the grouping separator, then strip the
// separators from it.
d = b[0].split(e);
b[0] = d.join("");
// If the integer mask contains a "0", left-pad the integer digits with
// zeros so they cover the mask from its first "0" onward; otherwise an
// all-zero integer part is dropped.
var f = b[0] && b[0].indexOf("0");
if (f > -1)
for (; c[0].length < b[0].length - f;) c[0] = "0" + c[0];
else +c[0] == 0 && (c[0] = "");
a = a.split(".");
a[0] = c[0];
// c becomes the length of the last group of the integer mask; it is only
// truthy when the mask contained a grouping separator.
if (c = d[1] && d[d.length -
1].length) {
// Rebuild the integer digits, inserting the grouping separator after every
// c digits counted from the right.
for (var d = a[0], f = "", k = d.length % c, g = 0, i = d.length; g < i; g++) f += d.charAt(g), !((g - k + 1) % c) && g < i - c && (f += e);
a[0] = f
}
// Keep the fraction only when both the mask and the value have one.
a[1] = b[1] && a[1] ? h + a[1] : "";
// Put the sign back in front of a value that was negative.
return (j ? "-" : "") + a[0] + a[1]
};


var str="1234567890";
var formatstr=format( "##,###.", str);
alert(formatstr);




This splits the string from right to left, inserting a comma after every 3 characters. If you want, you can change the position.

它将大字符串拆分为给定单词的小字符串。

/**
 * Groups the space-separated words of `str` into chunks. Words are appended
 * to the current chunk until it is at least `words` characters long; the
 * next word then starts a new chunk. Words are never split.
 *
 * Fixes over the previous version: the final chunk is no longer dropped,
 * chunks no longer start with a stray space, and jQuery is not required.
 *
 * @param {string} str - Text to split on single spaces.
 * @param {number} words - Despite the name, a length threshold in
 *     characters: a chunk is closed once it has reached this length.
 * @returns {string[]} The chunks in order; an empty array when `str`
 *     contains no words.
 */
function chunkSubstr(str, words) {
  const values = [];
  let current = "";

  // Collapse any run of whitespace inside a finished chunk to one space.
  const flush = () => {
    values.push(current.replace(/\s+/g, " "));
  };

  for (const word of str.split(" ")) {
    if (word === "") {
      continue; // consecutive spaces produce empty entries
    }
    if (current === "") {
      current = word;
    } else if (current.length < words) {
      current += " " + word;
    } else {
      flush();
      current = word;
    }
  }

  // The last chunk never reaches the "start a new chunk" branch above, so it
  // has to be emitted explicitly.
  if (current !== "") {
    flush();
  }

  return values;
}

使用slice()方法:

/**
 * Splits `str` into consecutive chunks of `chunkSize` characters using
 * String.prototype.slice. The last chunk may be shorter.
 *
 * @param {string} str - The string to split.
 * @param {number|string} chunkSize - Chunk size; coerced to a number and
 *     truncated to an integer, and must be at least 1.
 * @returns {string[]} The chunks in order; an empty array for an empty string.
 * @throws {RangeError} If `str` is non-empty and `chunkSize` is not at least 1
 *     after conversion (0, negative, NaN or missing). Such values previously
 *     caused an endless loop.
 */
function returnChunksArray(str, chunkSize) {
  var arr = [];
  if (str === '') {
    return arr;
  }

  var size = Math.trunc(Number(chunkSize));
  if (!(size >= 1)) {
    throw new RangeError('chunkSize must be at least 1, got ' + String(chunkSize));
  }

  // Walk the string by offset instead of re-slicing the remainder on every
  // pass, which copied the rest of the string each time.
  for (var start = 0; start < str.length; start += size) {
    arr.push(str.slice(start, start + size));
  }
  return arr;
}

使用substring()方法也可以做到这一点。

/**
 * Splits `str` into chunks of `chunkSize` characters by repeatedly taking
 * the front of the remaining text with String.prototype.substring.
 *
 * @param {string} str - The string to split.
 * @param {number} chunkSize - Number of characters per chunk.
 * @returns {string[]} The chunks in order; an empty array for an empty string.
 */
function returnChunksArray(str, chunkSize) {
  const pieces = [];

  for (let rest = str; rest !== ''; rest = rest.substring(chunkSize)) {
    pieces.push(rest.substring(0, chunkSize));
  }

  return pieces;
}
/**
 * Splits `str` into chunks of `length` characters.
 *
 * @param {string} str - The string to split.
 * @param {number} [length=10] - Number of characters per chunk.
 * @returns {string[]} The chunks in order. A string no longer than `length`
 *     (including the empty string) is returned as a single-element array.
 */
function chunkString(str, length = 10) {
  // Short input: the whole string is the only chunk.
  if (str.length <= length) {
    return [str];
  }

  const result = [];
  for (let offset = 0; offset < str.length; offset += length) {
    result.push(str.substr(offset, length));
  }
  return result;
}

那么下面这一小段代码呢:

/**
 * Splits `str` into chunks of at most `size` characters with a regular
 * expression. Because "." does not match line terminators, those are not
 * part of any chunk.
 *
 * @param {string} str - The string to split.
 * @param {number} size - Maximum chunk size.
 * @returns {string[]|null} The chunks in order, or null when nothing matches
 *     (for example an empty string).
 */
function splitME(str, size) {
  const pattern = '.{1,' + size + '}';
  const chunker = new RegExp(pattern, 'g');

  return str.match(chunker);
}

惊喜!你可以使用 split 来进行分割。

// split() with a capturing group keeps the captured two-character chunks
// and puts empty strings between them; Boolean filters the empty ones out.
var parts = "1234567890 ".split(/(.{2})/).filter(Boolean)

结果为 [ '12', '34', '56', '78', '90', ' ' ]

/**
 * Splits `str` into chunks of at most `chunkSize` characters with a regular
 * expression. "." matches any character except line terminators, so those
 * are not part of any chunk.
 *
 * @param {string} str - The string to split.
 * @param {number} chunkSize - Maximum chunk size.
 * @returns {string[]|null} The chunks in order, or null when nothing matches
 *     (for example an empty string).
 */
const getChunksFromString = (str, chunkSize) =>
  str.match(new RegExp(`.{1,${chunkSize}}`, 'g'));

根据需要调用它

console.log(getChunksFromString("Hello world", 3))   // ["Hel", "lo ", "wor", "ld"]

我对上述解决方案的不满之处在于:不管在句子中处于什么位置,它们都把字符串划分为固定大小的块。

我认为下面的方法比较好;虽然它需要一些性能调整:

 static chunkString(str, length, size,delimiter='\n' ) {
const result = [];
for (let i = 0; i < str.length; i++) {
const lastIndex = _.lastIndexOf(str, delimiter,size + i);
result.push(str.substr(i, lastIndex - i));
i = lastIndex;
}
return result;
}

这是我在对模板字符串做了一点小实验之后想出的解决方案:

用法:

chunkString(5)`testing123`

/**
 * Creates a chunking function for a fixed chunk size. The returned function
 * can be called normally or used as a template tag, as in
 * chunkString(5)`testing123`; its argument is converted with String().
 *
 * @param {number} nSize - Number of characters per chunk.
 * @returns {function(*): string[]} Function that splits its stringified
 *     argument into chunks of `nSize` characters.
 */
function chunkString(nSize) {
  return (strToChunk) => {
    const text = String(strToChunk);
    const limit = text.length / nSize;
    const pieces = [];

    for (let i = 0; i < limit; i += 1) {
      pieces.push(text.slice(i * nSize, (i + 1) * nSize));
    }

    return pieces;
  };
}


document.write(chunkString(5)`testing123`);
// returns: testi,ng123


document.write(chunkString(3)`testing123`);
// returns: tes,tin,g12,3

你可以不带任何正则表达式使用reduce():

/**
 * Splits `str` into chunks of `n` characters with reduce() and no regular
 * expression.
 *
 * Chunks are collected directly in an array. The previous version joined
 * them with "," and split on "," afterwards, which broke any input that
 * itself contained a comma. An empty string now yields [] rather than [''].
 *
 * @param {string} str - The string to split.
 * @param {number} n - Number of characters per chunk.
 * @returns {string[]} The chunks in order.
 */
(str, n) => {
  return str.split('').reduce(
    (chunks, char, index) => {
      if (!(index % n)) {
        // A chunk boundary: every n-th character opens a new chunk.
        chunks.push(char);
      } else {
        chunks[chunks.length - 1] += char;
      }
      return chunks;
    },
    []
  );
}

你绝对可以做一些

// Keep only the captured two-character chunks; the empty strings between
// them and the shorter tail are filtered out.
let pieces = "1234567890 ".split(/(.{2})/).filter((part) => part.length === 2);

要得到这个:

[ '12', '34', '56', '78', '90' ]

如果你想动态输入/调整数据块的大小,使数据块的大小为n,你可以这样做:

// Chunk size. Declared explicitly: assigning to an undeclared `n` creates an
// implicit global and throws a ReferenceError in strict mode and ES modules.
const n = 2;
// Build the pattern from n and keep only the full-size chunks.
let pieces = "1234567890 ".split(new RegExp(`(.{${n}})`)).filter((part) => part.length === n);

要在原始字符串中找到所有大小为n的块,尝试这样做:

// Collects every substring of length n by chunking the string once per
// possible starting offset (0 .. n-1) and gathering the chunks in a Set.
let subs = new Set();
let n = 2;
let str = "1234567890 ";
let regex = new RegExp("(.{" + String(n) + "})");     // pattern built from n

// After each pass the string is shifted left by one character so that the
// next pass chunks it from the following offset.
for (let offset = 0; offset < n; offset += 1, str = str.substr(1)) {
  str
    .split(regex)
    .filter((piece) => piece.length == n)    // keep only full-size chunks
    .forEach((piece) => subs.add(piece));
}

你应该得到:

[ '12', '23', '34', '45', '56', '67', '78', '89', '90', '0 ' ]
包含预分配的左版本和右版本。 这和RegExp impl一样快,但它随着块大小的增长,运行速度也会加快

/**
 * Splits `str` into chunks of `size` characters starting from the left, so
 * any shorter remainder ends up in the last chunk.
 *
 * @param {string} str - The string to split.
 * @param {number} [size=3] - Number of characters per chunk.
 * @returns {string[]|undefined} The chunks in order, or undefined when `str`
 *     is not a string.
 */
function chunkLeft (str, size = 3) {
  if (typeof str !== 'string') {
    return undefined;
  }

  const total = str.length;
  const chunks = Array(Math.ceil(total / size)); // pre-sized result

  let slot = 0;
  let start = 0;
  while (start < total) {
    const end = start + size;
    chunks[slot] = str.slice(start, end);
    start = end;
    slot += 1;
  }

  return chunks;
}


/**
 * Splits `str` into chunks of `size` characters aligned to the right, so any
 * shorter remainder ends up in the first chunk.
 *
 * @param {string} str - The string to split.
 * @param {number} [size=3] - Number of characters per chunk.
 * @returns {string[]|undefined} The chunks in order, or undefined when `str`
 *     is not a string.
 */
function chunkRight (str, size = 3) {
  if (typeof str !== 'string') {
    return undefined;
  }

  const total = str.length;
  const chunks = Array(Math.ceil(total / size)); // pre-sized result
  if (total === 0) {
    return chunks;
  }

  // The first chunk takes the remainder, or a full chunk when the length
  // divides evenly.
  chunks[0] = str.slice(0, total % size || size);

  let slot = 1;
  let start = chunks[0].length;
  while (start < total) {
    chunks[slot] = str.slice(start, start + size);
    start += size;
    slot += 1;
  }

  return chunks;
}


console.log(chunkRight())  // undefined
console.log(chunkRight(''))  // []
console.log(chunkRight('1'))  // ["1"]
console.log(chunkRight('123'))  // ["123"]
console.log(chunkRight('1234'))  // ["1", "234"]
console.log(chunkRight('12345'))  // ["12", "345"]
console.log(chunkRight('123456'))  // ["123", "456"]
console.log(chunkRight('1234567'))  // ["1", "234", "567"]

使用 npm 库 "chkchars"。但请记住,要确保给定字符串的长度能被 "number" 参数整除。

const phrase = "1110010111010011100101110100010000011100101110100111001011101001011101001110010111010001000001110010111010011100101110100"
const number = 7


chkchars.splitToChunks(phrase, number)


// result => ['1110010', '1110100','1110010', '1110100','0100000', '1110010','1110100', '1110010','1110100', '1011101','0011100', '1011101','0001000','0011100','1011101', '0011100','1011101']


// perf => 0.287ms