C 中带分隔符的拆分字符串

如何编写一个函数来拆分并返回一个带分隔符的字符串的数组?

char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
789770 次浏览

可以使用 strtok()函数分割字符串(并指定要使用的分隔符)。请注意,strtok()将修改传递给它的字符串。如果其他地方需要原始字符串,则复制该字符串并将其传递给 strtok()

编辑:

示例(注意它不处理连续的分隔符,例如“ JAN,,,FEB,MAR”) :

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>


/*
 * Copy the first len bytes of src into a freshly allocated, NUL-terminated
 * buffer. Returns NULL when the allocation fails.
 */
static char *str_split_copy(const char *src, size_t len)
{
    char *copy = malloc(len + 1);

    if (copy != NULL)
    {
        memcpy(copy, src, len);
        copy[len] = '\0';
    }
    return copy;
}


/*
 * Split a_str at every occurrence of a_delim.
 *
 * a_str is tokenized with strtok(), so it is modified in place and empty
 * fields (leading, trailing or consecutive delimiters) are skipped.
 *
 * Returns a malloc'ed array of malloc'ed token copies terminated by a NULL
 * pointer, or NULL if a_str is NULL or an allocation fails. The caller frees
 * every token and then the array itself.
 */
char** str_split(char* a_str, const char a_delim)
{
    char** result;
    size_t capacity = 2;    /* one token plus the NULL sentinel */
    size_t idx      = 0;
    const char* scan;
    char* token;
    char delim[2];

    if (a_str == NULL)
    {
        return NULL;
    }

    delim[0] = a_delim;
    delim[1] = '\0';

    /* Every delimiter can add at most one more token, so this is an upper
       bound; skipped empty fields simply leave some slots unused. */
    for (scan = a_str; *scan != '\0'; scan++)
    {
        if (*scan == a_delim)
        {
            capacity++;
        }
    }

    result = malloc(capacity * sizeof *result);
    if (result == NULL)
    {
        return NULL;
    }

    for (token = strtok(a_str, delim); token != NULL; token = strtok(NULL, delim))
    {
        result[idx] = str_split_copy(token, strlen(token));
        if (result[idx] == NULL)
        {
            /* Out of memory: release what was built so far. */
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        idx++;
    }

    /* Terminating NULL so the caller knows where the list ends. */
    result[idx] = NULL;

    return result;
}


int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** tokens;
    char** cursor;

    printf("months=[%s]\n\n", months);

    tokens = str_split(months, ',');
    if (!tokens)
    {
        return 0;
    }

    /* Print and release each token; the list ends at the NULL sentinel. */
    cursor = tokens;
    while (*cursor)
    {
        printf("month=[%s]\n", *cursor);
        free(*cursor);
        cursor++;
    }
    printf("\n");
    free(tokens);

    return 0;
}

产出:

$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]


month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]

字符串分词器(tokenizer):下面这段代码应该能为你指明正确的方向。

/* Print every word of a sentence, splitting on spaces and commas. */
int main(void) {
    /* strtok() writes NUL bytes into its argument, so the text must live in
       a writable array rather than a string literal. */
    char st[] = "Where there is will, there is a way.";
    const char *delims = " ,";  /* same delimiter set for every call */
    char *ch;

    for (ch = strtok(st, delims); ch != NULL; ch = strtok(NULL, delims)) {
        printf("%s\n", ch);
    }

    /* Wait for input before exiting; getchar() is the standard replacement
       for the non-standard conio getch(). */
    getchar();
    return 0;
}

在上面示例的基础上,下面这种方法可以返回一个以 NULL 结尾的字符串数组(如您所希望的那样)。但是不能向它传递字符串字面量,因为该函数必须修改传入的字符串:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/*
 * Split str in place at every occurrence of delim.
 *
 * Each delimiter in str is overwritten with '\0' and the returned array holds
 * pointers into str, so str must be writable and must outlive the array.
 * Empty fields are kept as empty strings.
 *
 * Returns a malloc'ed array of (number of delimiters + 1) pointers followed
 * by a NULL terminator; the caller frees only the array. If numSplits is not
 * NULL it receives the number of tokens. On invalid arguments (str == NULL or
 * delim == '\0') or allocation failure, NULL is returned and *numSplits is
 * set to -1.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;
    char* c;

    if ( ( str != NULL ) && ( delim != '\0' ) )
    {
        int tokens = 1;

        /* Pre-calculate number of elements */
        for ( c = str; *c != '\0'; c++ )
        {
            if ( *c == delim )
            {
                tokens++;
            }
        }

        /* One slot per token plus one for the NULL terminator. */
        ret = malloc( ( (size_t)tokens + 1 ) * sizeof( *ret ) );

        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;

            for ( c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = c + 1;
                }
            }

            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}


/* Demonstrates str_split() on a writable copy of a string literal. */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    char* strCpy;
    char** split;
    int num;
    int i;

    (void)argc;
    (void)argv;

    /* +1 for the terminating NUL that strcpy() also copies. */
    strCpy = malloc( strlen( str ) + 1 );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );

    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );

        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The tokens point into strCpy, so only the array and the copy are freed. */
    free( split );
    free( strCpy );

    return 0;
}

也许有一个更简洁的方法来做到这一点,但你得到的想法。

这个函数接受一个 char * 字符串并用分隔符将其分割。一行中可以有多个连续的分隔符。注意,该函数会修改原始字符串;如果需要原始字符串保持不变,必须先复制一份。这个函数不使用任何 cstring 函数调用,所以它可能比其他实现快一点。如果您不在意内存占用,可以在函数开头就按 strlen(src_str)/2 的大小分配 sub_strings,并且(像代码注释里提到的 C++ “版本”那样)省略函数的下半部分。这样做函数只需扫描一遍字符串,即 O(N);而下面所示的节省内存的写法需要扫描两遍(约 2N 次操作,渐进意义上仍是 O(N))。

功能:

// Split src_str in place on `deliminator`, skipping empty fields.
//
// Every delimiter in src_str is overwritten with '\0'; the returned array
// holds pointers into src_str, so the caller frees only the array and must
// keep src_str alive while using it.
//
// num_sub_str receives the number of non-empty substrings. Returns a
// malloc'ed array of that many pointers followed by a NULL terminator, or 0
// when src_str is null, no substring was found, or the allocation failed
// (num_sub_str is 0 in all of those cases).
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
    num_sub_str = 0;
    if(!src_str)
        return(0);

    // First pass: replace deliminators with zeros and count how many
    // substrings with length >= 1 exist.
    char *src_str_tmp = src_str;
    bool found_delim = true;
    while(*src_str_tmp){
        if(*src_str_tmp == deliminator){
            *src_str_tmp = 0;
            found_delim = true;
        }
        else if(found_delim){ //found first character of a new string
            num_sub_str++;
            found_delim = false;
        }
        src_str_tmp++;
    }
    printf("Start - found %zu sub strings\n", num_sub_str);
    if(num_sub_str == 0){
        printf("str_split() - no substrings were found\n");
        return(0);
    }

    // One pointer per substring plus one for the NULL terminator.
    char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
    if(!sub_strings){
        num_sub_str = 0;
        return(0);
    }

    // Second pass: record the start of every run of non-zero characters.
    const char *src_str_terminator = src_str_tmp;
    bool found_null = true;
    size_t idx = 0;
    for(src_str_tmp = src_str; src_str_tmp < src_str_terminator; src_str_tmp++){
        if(!*src_str_tmp) //found a NUL written by the first pass
            found_null = true;
        else if(found_null){
            sub_strings[idx++] = src_str_tmp;
            found_null = false;
        }
    }
    sub_strings[num_sub_str] = 0;

    return(sub_strings);
}

使用方法:

  // Usage sketch for str_split(): it is called on a heap copy because the
  // function overwrites delimiters in the buffer it is given.
  char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
char **sub_strings = str_split(str, ',', num_sub_str);
// NOTE(review): endptr is declared but never used in this snippet.
char *endptr;
if(sub_strings){
// The array is NULL-terminated, so iterate until the sentinel.
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
// The substrings point into str, so only the array and str are freed.
free(sub_strings);
free(str);

下面的 split() 方法将为您完成所有工作(内存分配、计算长度)。更多的信息和说明可以参见《Split() 方法:拆分 C 字符串的实现》一文。

/*
 * Split str at every occurrence of c into freshly allocated copies.
 *
 * str is not modified. Empty fields are kept as empty strings, so the number
 * of tokens is always (number of delimiters + 1).
 *
 * *arr receives a malloc'ed array of malloc'ed, NUL-terminated tokens; the
 * caller frees each token and then the array. Returns the number of tokens.
 * The process exits with status 1 if an allocation fails.
 */
int split (const char *str, char c, char ***arr)
{
    int count = 1;
    int i = 0;
    const char *p;
    const char *start;

    /* First pass: one token per delimiter, plus the trailing one. */
    for (p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    *arr = malloc(sizeof(char*) * count);
    if (*arr == NULL)
        exit(1);

    /* Second pass: copy out [start, p) whenever p reaches a delimiter or
       the end of the string. */
    start = str;
    for (p = str; ; p++)
    {
        if (*p == c || *p == '\0')
        {
            size_t len = (size_t) (p - start);
            size_t k;
            char *t = malloc(len + 1);

            if (t == NULL)
                exit(1);

            for (k = 0; k < len; k++)
                t[k] = start[k];
            t[len] = '\0';      /* every token, including the last, is terminated */
            (*arr)[i++] = t;

            if (*p == '\0')
                break;
            start = p + 1;
        }
    }

    return count;
}

使用方法:

/* Splits a sentence on spaces, prints each token and releases the result. */
int main (int argc, char ** argv)
{
    int i;
    const char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    (void) argc;
    (void) argv;

    c = split(s, ' ', &arr);

    printf("found %d tokens.\n", c);

    for (i = 0; i < c; i++)
    {
        printf("string #%d: %s\n", i, arr[i]);
        free(arr[i]);   /* each token is a separate allocation */
    }
    free(arr);

    return 0;
}
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>


/**
 *  Splits str on delim in place and dynamically allocates an array of
 *  pointers to the fields.
 *
 *  Returns 0 on success and -1 on error (check errno). On success *array
 *  receives the malloc'ed pointer array (free it with free()) and *length the
 *  number of fields: 0 for an empty string, 1 if no delim was found,
 *  otherwise the number of delimiters plus one.
 *
 *  Empty fields are stored as NULL, so two delims in a row, "foo,,bar",
 *  produce { "foo", NULL, "bar" } and a trailing delim, "foo,", produces the
 *  two-entry array { "foo", NULL }. Callers must check entries for NULL
 *  before using them.
 *
 *  Modifies str (every delim is overwritten with '\0'), so make a copy if
 *  this is a problem. The returned pointers point into str.
 */
int split( char * str, char delim, char ***array, int *length ) {
    char *p;
    char **res;
    int count = 0;
    int k;

    if( str == NULL || array == NULL || length == NULL ) {
        errno = EINVAL;
        return -1;
    }

    if( *str != '\0' ) {
        count = 1;
        /* A '\0' delim would match the terminator itself, so it is treated
           as "no delimiter". */
        if( delim != '\0' ) {
            for( p = strchr( str, delim ); p != NULL; p = strchr( p, delim ) ) {
                *p++ = '\0';    /* terminate this field, step past it */
                count++;
            }
        }
    }

    /* calloc leaves unset (empty) entries as NULL; always request at least
       one slot so a zero-field result is not mistaken for a failure. */
    res = calloc( count > 0 ? (size_t)count : 1, sizeof( *res ) );
    if( !res ) return -1;

    p = str;
    for( k = 0; k < count; k++ ) {
        if( *p ) res[k] = p;        /* non-empty field: record its start */
        p = strchr( p, '\0' ) + 1;  /* next field starts after this NUL */
    }

    *array = res;
    *length = count;

    return 0;
}


/* Input for the demo; it must be a writable array because split() modifies it. */
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";


/* Splits the month list on ',' and prints every field. */
int main() {
    char **res = NULL;
    int k;
    int count = 0;
    int rc;

    rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        return 1;   /* res is not valid after a failure */
    }

    printf("count: %d\n", count );
    for( k=0; k<count; k++ ) {
        /* Empty fields are stored as NULL, which must not be passed to %s. */
        printf("str: %s\n", res[k] ? res[k] : "(null)");
    }

    free(res );
    return 0;
}

我认为 strsep仍然是这方面的最佳工具:

while ((token = strsep(&str, ","))) my_fn(token);

字面意思就是一行分开一个字符串。

额外的括号是一个风格元素,表示我们有意测试赋值的结果,而不是相等运算符 ==

为了使这种模式起作用,token 和 str 都必须是 char * 类型。如果是从字符串字面量开始,那么首先要做一个副本:

// More general pattern: tokenize a heap copy of a string literal.
const char *my_str_literal = "JAN,FEB,MAR";
// tofree keeps the address returned by strdup(), because str is passed to
// strsep() by address and reassigned by the loop below.
char *token, *str, *tofree;


tofree = str = strdup(my_str_literal);  // We own str's memory now.
// The extra parentheses mark the assignment-as-condition as intentional.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);

如果两个分隔符同时出现在 str中,您将得到一个 token值,即空字符串。str的值被修改,因为遇到的每个分隔符都被零字节覆盖——这是首先复制被解析的字符串的另一个好理由。

在一条评论中,有人提出 strtok 比 strsep 更好,因为 strtok 的可移植性更强。Ubuntu 和 Mac OS X 都有 strsep;可以有把握地推测,其他类 Unix 系统也有 strsep。Windows 缺少 strsep,但它有 strpbrk,借助它可以写出下面这个简短而精巧的 strsep 替代实现:

/*
 * Portable stand-in for strsep(), built on strpbrk().
 *
 * Returns the token starting at *stringp, or NULL once *stringp is NULL.
 * The first character found in delim is overwritten with '\0' and *stringp
 * is advanced past it; if no delimiter remains, *stringp becomes NULL.
 */
char *strsep(char **stringp, const char *delim) {
    char *token_start = *stringp;
    char *hit;

    if (token_start == NULL)
        return NULL;

    hit = strpbrk(token_start, delim);
    if (hit != NULL)
        *hit++ = '\0';      /* end this token, step past the delimiter */
    *stringp = hit;

    return token_start;
}

这里有一篇对 strsep 与 strtok 的很好的对比说明。两者的利弊也许见仁见智,但我认为这清楚地表明,strsep 在设计上就是为了取代 strtok。

试试这个。

/*
 * Split str on any character in delim and return copies of the tokens.
 *
 * str is not modified. Like strtok(), runs of delimiters count as one
 * separator, so empty tokens are never produced.
 *
 * Returns a malloc'ed array of malloc'ed strings terminated by a NULL
 * pointer (an array holding only NULL when there are no tokens), or NULL if
 * an argument is NULL or an allocation fails. The caller frees every token
 * and then the array.
 */
char** strsplit(char* str, const char* delim){
    char** res;
    size_t used = 0;
    const char* cursor;

    if (str == NULL || delim == NULL)
        return NULL;

    res = malloc(sizeof *res);
    if (res == NULL)
        return NULL;
    res[0] = NULL;

    cursor = str + strspn(str, delim);          /* skip leading delimiters */
    while (*cursor != '\0'){
        size_t len = strcspn(cursor, delim);    /* length of this token */
        /* Room for the tokens so far, the new one, and the sentinel. */
        char** grown = realloc(res, (used + 2) * sizeof *res);
        char* part;

        if (grown == NULL)
            goto fail;
        res = grown;

        part = malloc(len + 1);
        if (part == NULL)
            goto fail;
        memcpy(part, cursor, len);
        part[len] = '\0';

        res[used++] = part;
        res[used] = NULL;

        cursor += len;
        cursor += strspn(cursor, delim);        /* skip the separator run */
    }

    return res;

fail:
    while (used > 0)
        free(res[--used]);
    free(res);
    return NULL;
}

这是一个字符串分割函数,可以处理多字符分隔符。注意,如果分隔符比被拆分的字符串还长,那么 buffer 和 stringLengths 将被设置为 (void *) 0,而 numStrings 将被设置为 0。

这个算法已经经过测试,并且有效。(免责声明: 它没有测试非 ASCII 字符串,并假定调用者提供了有效的参数)

/*
 * Split `original` at every non-overlapping occurrence of the (possibly
 * multi-character) string `delimiter`.
 *
 * Outputs:
 *   *buffer        malloc'ed array of malloc'ed, NUL-terminated substrings
 *   *numStrings    number of substrings
 *   *stringLengths malloc'ed array with the length of each substring
 *
 * Empty substrings are kept, including one after a delimiter that ends the
 * input. An empty delimiter never matches, so the whole input is returned as
 * a single substring. If the delimiter is longer than the input, or an
 * allocation fails, *buffer and *stringLengths are set to NULL and
 * *numStrings to 0. The caller frees each substring and both arrays.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);
    const char *start;
    const char *hit;
    char **parts;
    int *lengths;
    int count = 1;
    int idx;

    *buffer = NULL;
    *numStrings = 0;
    *stringLengths = NULL;

    if(ld > lo){
        return;
    }

    /* First pass: count the delimiters to size both arrays. */
    if(ld > 0){
        for(hit = strstr(original, delimiter); hit != NULL; hit = strstr(hit + ld, delimiter)){
            count++;
        }
    }

    parts = malloc(sizeof(*parts) * (size_t)count);
    lengths = malloc(sizeof(*lengths) * (size_t)count);
    if(parts == NULL || lengths == NULL){
        free(parts);
        free(lengths);
        return;
    }

    /* Second pass: copy out the text between consecutive delimiters. */
    start = original;
    for(idx = 0; idx < count; idx++){
        size_t len;

        hit = (ld > 0) ? strstr(start, delimiter) : NULL;
        len = (hit != NULL) ? (size_t)(hit - start) : strlen(start);

        parts[idx] = malloc(len + 1);
        if(parts[idx] == NULL){
            while(idx > 0){
                free(parts[--idx]);
            }
            free(parts);
            free(lengths);
            return;
        }
        memcpy(parts[idx], start, len);
        parts[idx][len] = '\0';
        lengths[idx] = (int)len;

        if(hit != NULL){
            start = hit + ld;
        }
    }

    *buffer = parts;
    *numStrings = count;
    *stringLengths = lengths;
}

示例代码:

/* Splits a sample string on " DELIM ", prints the pieces and frees them. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer;
    int numStrings;
    int * stringLengths;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
        free(buffer[i]);    /* each substring is its own allocation */
    }

    /* Both arrays are NULL when nothing was split; free(NULL) is a no-op. */
    free(buffer);
    free(stringLengths);
    return 0;
}

所需的头文件(库):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

我的方法是扫描字符串,让指针指向分隔符(和第一个字符)之后的每个字符,同时将字符串中分隔符的外观指定为“0”。
首先复制一个原始字符串(因为它是常量) ,然后通过扫描得到拆分的数目,并将其传递给指针参数 Len。然后,将第一个结果指针指向复制字符串指针,然后扫描复制字符串: 一旦遇到分隔符,将其赋值为“0”,从而结束前一个结果字符串,并将下一个结果字符串指针指向下一个字符指针。

/*
 * Split a copy of a_str at every occurrence of a_delim.
 *
 * a_str itself is not modified. The returned array points into one private
 * copy of the string in which each delimiter has been replaced by '\0';
 * results[0] is the start of that copy.
 *
 * *len (if len is not NULL) receives the number of tokens, i.e. the number
 * of delimiters plus one. The array also carries a trailing NULL pointer.
 * Returns NULL (and *len == 0) if a_str is NULL or an allocation fails.
 * To release everything: free(results[0]); free(results);
 */
char** split(char* a_str, const char a_delim, int* len){
    size_t size;
    const char* tmp;
    char* s;
    char** results;
    int count = 1;
    int i = 0;

    if (len != NULL) *len = 0;
    if (a_str == NULL) return NULL;

    for (tmp = a_str; *tmp != '\0'; tmp++){
        if (*tmp == a_delim) count += 1;
    }

    size = strlen(a_str) + 1;                       /* +1 for the NUL */
    s = malloc(size);
    results = malloc(((size_t)count + 1) * sizeof(char*));
    if (s == NULL || results == NULL){
        free(s);
        free(results);
        return NULL;
    }
    memcpy(s, a_str, size);

    results[i++] = s;
    for (; *s != '\0'; s++){
        if (*s == a_delim){
            *s = '\0';              /* terminate the previous token */
            results[i++] = s + 1;   /* next token starts after it */
        }
    }
    results[i] = NULL;

    if (len != NULL) *len = count;
    return results;
}

此优化方法在 * result 中创建(或更新现有的)指针数组,并返回 * count 中的元素数。

使用“ max”指示所期望的最大字符串数(当指定现有数组或任何其他原因时) ,否则将其设置为0

要与分隔符列表进行比较,请将 delm 定义为 char * 并替换下面的行:

if (str[i]==delim) {

有以下两句话:

 char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {

好好享受吧

#include <stdlib.h>
#include <string.h>


/*
 * Split str in place on delim, scanning at most len characters.
 *
 * Each delimiter found is overwritten with '\0' and the tokens are reported
 * as pointers into str.
 *
 *   result  in/out. If *result is non-NULL it is used as a caller-provided
 *           array and filled during the scan; the caller must make sure it
 *           is large enough (use max to bound it). If *result is NULL an
 *           array of exactly *count pointers is malloc'ed and stored there.
 *   count   out. Number of tokens; always at least 1.
 *   max     Maximum number of tokens to produce, 0 for no limit. Once max
 *           tokens exist the rest of the string is left untouched as part
 *           of the last token.
 *
 * Returns the token array, or NULL if the array could not be allocated.
 *
 * To split on a list of delimiters, declare delim as a string and replace
 *     if (str[i]==delim) {
 * with
 *     char *c=delim; while(*c && *c!=str[i]) c++;
 *     if (*c) {
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
    size_t i;
    char **out = *result;

    /* there is at least one string returned */
    *count = 1;

    /* when the result array is specified, fill it during the first pass */
    if (out) {
        out[0] = str;
    }

    /* scan the string for delimiters, up to the specified length */
    for (i = 0; i < len; ++i) {

        /* Stop before splitting again once the limit is reached, so that
           max == 1 never writes a second entry. */
        if (max && *count >= max) {
            break;
        }

        if (str[i] == delim) {
            str[i] = 0;                     /* replace delimiter with zero */
            if (out) {
                out[*count] = str + i + 1;
            }
            ++(*count);
        }
    }

    /* when the result array is specified, we are done here */
    if (out) {
        return out;
    }

    /* else allocate memory for the result and fill it in a second pass */
    out = malloc((*count) * sizeof(char*));
    if (!out) {
        return NULL;
    }
    *result = out;

    out[0] = str;
    for (i = 1; i < *count; ++i) {
        /* the next token starts right after the current one's terminator */
        while (*str) ++str;
        ++str;
        out[i] = str;
    }

    return out;
}

用法例子:

#include <stdio.h>


/*
 * Runs split() twice on copies of the month list: once into a caller-provided
 * six-slot array capped at six tokens, and once letting split() allocate the
 * array with no cap. Prints both results separated by a blank line.
 */
int main(int argc, char **argv) {
    char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **result=malloc(6*sizeof(char*));
    char **result2=0;
    unsigned long count;
    unsigned long count2;
    unsigned long i;

    split(strdup(str),strlen(str),',',&result,&count,6);
    split(strdup(str),strlen(str),',',&result2,&count2,0);

    if (result) {
        i=0;
        while (i<count) {
            printf("%s\n",result[i]);
            ++i;
        }
    }

    printf("\n");

    if (result2) {
        i=0;
        while (i<count2) {
            printf("%s\n", result2[i]);
            ++i;
        }
    }

    return 0;
}

我的建议是:

/*
 * Split txt at every occurrence of delim into freshly allocated copies.
 *
 * txt is not modified. Empty fields are kept as empty strings, so the result
 * always has (number of delimiters + 1) tokens.
 *
 * *tokens receives a malloc'ed array of calloc'ed, NUL-terminated strings;
 * the caller frees each token and then the array. Returns the number of
 * tokens, or 0 with *tokens set to NULL if an allocation fails.
 */
int split (const char *txt, char delim, char ***tokens)
{
    const char *p, *start;
    char **arr;
    int count = 1, i = 0;

    *tokens = NULL;

    for (p = txt; *p != '\0'; p++)
        if (*p == delim) count += 1;

    arr = malloc (count * sizeof (char *));
    if (arr == NULL)
        return 0;

    /* Copy out [start, p) whenever p reaches a delimiter or the end. */
    start = txt;
    for (p = txt; ; p++)
    {
        if (*p == delim || *p == '\0')
        {
            size_t len = (size_t) (p - start);
            size_t k;

            /* calloc zero-fills, which also provides the terminator */
            arr[i] = calloc (len + 1, sizeof (char));
            if (arr[i] == NULL)
            {
                while (i > 0) free (arr[--i]);
                free (arr);
                return 0;
            }
            for (k = 0; k < len; k++) arr[i][k] = start[k];
            i++;

            if (*p == '\0') break;
            start = p + 1;
        }
    }

    *tokens = arr;
    return count;
}

用法:

/* Usage sketch: split a constant string, print the tokens, release them. */
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";


/* split() returns the token count and stores the array in tokens. */
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);


/* freeing tokens: each string first, then the array that held them */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);

我的代码(经过测试) :

#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/*
 * Tokenize str in place with strtok() and collect the token pointers.
 *
 * str is modified (delimiters become '\0') and the returned pointers point
 * into it, so only the array itself must be freed by the caller. Any
 * character in delim separates tokens; empty tokens are skipped.
 *
 * On success returns 1, with *array set to a malloc'ed array of *length
 * pointers (*array is NULL when no token was found). On allocation failure
 * returns 0 with *array == NULL and *length == 0.
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    int i = 0;
    char *token;
    char **res = NULL;

    for (token = strtok(str, delim); token != NULL; token = strtok(NULL, delim))
    {
        /* Grow through a temporary so the old block is not lost on failure. */
        char **grown = realloc(res, (size_t)(i + 1) * sizeof(char *));

        if (grown == NULL)
        {
            free(res);
            *array = NULL;
            *length = 0;
            return 0;
        }
        res = grown;
        res[i++] = token;
    }

    *array = res;
    *length = i;
    return 1;
}


/* Splits the month list with dtmsplit() and prints each token with its index. */
int main()
{
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **arr = NULL;
    int count = 0;
    int c = 0;
    int i = 0;

    c = dtmsplit(str, ",", &arr, &count);
    printf("Found %d tokens.\n", count);

    while (i < count)
    {
        printf("string #%d: %s\n", i, arr[i]);
        i++;
    }

    return(0);
}

结果:

Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC

爆炸 & 内爆-初始字符串保持完整,动态内存分配

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* One token of an exploded string: where it starts and how long it is. */
typedef struct
{
    uintptr_t   ptr;    /* address of the token's first character, stored as an integer */
    int         size;   /* token length in bytes (tokens are not NUL-terminated) */
} token_t;


/*
 * Describe the fields of str (slen bytes long) separated by the first
 * character of delimiter, without modifying or copying str.
 *
 * On success *tokens receives a calloc'ed array with one entry per field
 * (number of delimiters + 1) and that count is returned; the entries point
 * into str, so only the array is freed by the caller.
 *
 * Returns -1 with *tokens set to NULL if an argument is invalid, str holds
 * no delimiter at all, or the allocation fails.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i, c1 = 0, c2 = 0;

    *tokens = NULL;

    if(str == NULL || delimiter == NULL || slen < 0)
    {
        return -1;
    }

    /* Count delimiters inside the string only; index slen is the end. */
    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            c1++;
        }
    }

    if(c1 == 0)
    {
        return -1;
    }

    *tokens = (token_t*)calloc((size_t)c1 + 1, sizeof(token_t));
    if(*tokens == NULL)
    {
        return -1;
    }
    ((*tokens)[c2]).ptr = (uintptr_t)str;

    for(i = 0; i <= slen; i++)
    {
        /* A token ends at a delimiter or at the end of the string. */
        if((i == slen) || (str[i] == *delimiter))
        {
            ((*tokens)[c2]).size = (int)((uintptr_t)&(str[i]) - ((*tokens)[c2]).ptr);
            if(i < slen)
            {
                c2++;
                ((*tokens)[c2]).ptr = (uintptr_t)&(str[i + 1]);
            }
        }
    }
    return (c1 + 1);
}


/*
 * Join `size` tokens into one newly allocated string, placing the first
 * character of delimiter between consecutive tokens.
 *
 * Returns a NUL-terminated string the caller must free (an empty string when
 * there are no tokens), or NULL if the allocation fails.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int     i;
    size_t  len = 0;
    char    *str;

    if(tokens == NULL || size <= 0)
    {
        return (char*)calloc(1, sizeof(char));
    }

    /* Each token needs its bytes plus one separator (or the final NUL). */
    for(i = 0; i < size; i++)
    {
        len += (size_t)tokens[i].size + 1;
    }

    str = (char*)calloc(len, sizeof(char));
    if(str == NULL)
    {
        return NULL;
    }

    len = 0;
    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[len], (void*)tokens[i].ptr, (size_t)tokens[i].size);
        len += (size_t)tokens[i].size;
        str[(len++)] = *delimiter;
    }

    /* The separator written after the last token becomes the terminator. */
    str[len - 1] = '\0';

    return str;
}

用法:

/* Explodes the month list, re-joins it with implode() and prints both. */
int main(int argc, char **argv)
{
    int         i, c;
    char        *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* Start as NULL so the free() calls below are safe on every path. */
    token_t     *tokens = NULL;
    char        *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", exp);

    if((c = explode(exp, strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }

        for(i = 0; i < c; i++)
        {
            /* Tokens are not NUL-terminated, hence the %.*s precision. */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);
    return 0;
}

下面是我的 ZString 库中对 strtok() 的实现。zstring_strtok() 处理连续分隔符的方式与标准库的 strtok() 不同。

只需看一下下面的代码,确保您能够了解它是如何工作的(我试图尽可能多地使用注释)

/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * Only the first character of delim is used as the delimiter. Pass the
 * string on the first call and 0 afterwards to continue with the remainder
 * saved in a static variable.
 *
 * Returns:
 *   - 0 if delim is 0, or if str is 0 and nothing is saved;
 *   - the remaining text unchanged when it contains no delimiter (the saved
 *     position is then cleared);
 *   - delim itself when the text starts with the delimiter, so the caller
 *     sees the delimiter string in place of an empty token;
 *   - otherwise the text up to the first delimiter, which is overwritten
 *     with '\0' (so str must be a writable array, not a string literal).
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *resume_at=0;       /* where the next call continues */
    int pos=0;

    if (delim==0)
        return 0;

    if (str == 0) {
        if (resume_at == 0)
            return 0;
        str = resume_at;
    }

    /* advance to the first delimiter or to the end of the string */
    while (str[pos] && str[pos]!=delim[0])
        pos++;

    /* no delimiter left: hand back the rest and forget the position */
    if (!str[pos]) {
        resume_at = 0;
        return str;
    }

    /* text starts with a delimiter: report it and step over it */
    if (pos==0) {
        resume_at = (str + 1);
        return (char *)delim;
    }

    /* cut the token off and remember what follows the delimiter */
    str[pos] = '\0';
    resume_at = (str + pos + 1);

    return str;
}

下面是一个例子用法..。

  Example Usage
/* str must be a writable array: zstring_strtok() writes '\0' into it. */
char str[] = "A,B,,,C";
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
/* The sixth call returns NULL; printing it as "(null)" relies on the C library. */
printf("6 %s\n",zstring_strtok(NULL,","));


Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)

可以从 Github 下载该库 Https://github.com/fnoyanisi/zstring

如果你愿意使用外部库,我强烈推荐 bstrlib。它需要一些额外的设置,但是从长远来看更容易使用。

例如,拆分下面的字符串,首先使用 bfromcstr()调用创建一个 bstring。(bstring是字符缓冲区的包装器)。 接下来,以逗号分割字符串,将结果保存在 struct bstrList中,struct bstrList包含字段 qty和数组 entry,后者是一个 bstring数组。

bstrlib还有许多其他功能可以在 bstring上运行

小菜一碟。

#include "bstrlib.h"
#include <stdio.h>
/* Splits a comma-separated string with bstrlib and prints every entry. */
int main() {
    int i;
    const char *tmp = "Hello,World,sak";
    bstring bstr = bfromcstr(tmp);
    struct bstrList *blist;

    if (bstr == NULL) {
        return 1;
    }

    blist = bsplit(bstr, ',');
    if (blist == NULL) {
        bdestroy(bstr);
        return 1;
    }

    printf("num %d\n", blist->qty);
    for(i=0;i<blist->qty;i++) {
        /* bstr2cstr() allocates a C string that must be released with
           bcstrfree(). */
        char *cstr = bstr2cstr(blist->entry[i], '_');
        if (cstr != NULL) {
            printf("%d: %s\n", i, cstr);
            bcstrfree(cstr);
        }
    }

    bstrListDestroy(blist);
    bdestroy(bstr);
    return 0;
}

我认为下面的解决方案是理想的:

  • 不会破坏源字符串
  • 可重入——也就是说,您可以在一个或多个线程中的任何地方安全地调用它
  • 便携式的
  • 正确处理多个分隔符
  • 快速高效

代码说明:

  1. 定义一个结构 token来存储令牌的地址和长度
  2. 在最坏的情况下为这些分配足够的内存,也就是当 str完全由分离器组成,所以有 strlen(str) + 1 令牌,都是空字符串
  3. 扫描 str记录每个令牌的地址和长度
  4. 使用此选项可以分配正确大小的输出数组,包括用于 NULL前哨值的额外空间
  5. 使用 start 和 length 分配、复制和添加标记 信息-使用 memcpy,因为它比 strcpy快,我们知道 长度
  6. 释放令牌地址和长度数组
  7. 返回令牌数组
/* Start address and length of one token inside the source string. */
typedef struct {
    const char *start;
    size_t len;
} token;


/*
 * Split str at every occurrence of sep without modifying it.
 *
 * Empty fields are kept, so the result always has (number of separators + 1)
 * entries. Returns a malloc'ed array of individually allocated,
 * NUL-terminated copies followed by a NULL sentinel, or NULL if str is NULL
 * or an allocation fails. The caller frees every string and then the array.
 */
char **split(const char *str, char sep)
{
    char **array;
    size_t start = 0, stop, toks = 0, t;
    token *tokens;

    if (str == NULL)
        return NULL;

    /* Worst case: str consists only of separators, giving strlen + 1 tokens. */
    tokens = malloc((strlen(str) + 1) * sizeof *tokens);
    if (tokens == NULL)
        return NULL;

    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    array = malloc((toks + 1) * sizeof *array);
    if (array != NULL) {
        for (t = 0; t < toks; t++) {
            /* calloc makes the copy NUL-terminated */
            char *copy = calloc(tokens[t].len + 1, 1);
            if (copy == NULL) {
                /* Undo the partial result. */
                while (t > 0)
                    free(array[--t]);
                free(array);
                array = NULL;
                break;
            }
            memcpy(copy, tokens[t].start, tokens[t].len);
            array[t] = copy;
        }
        /* Add a sentinel */
        if (array != NULL)
            array[toks] = NULL;
    }

    free(tokens);
    return array;
}

为简洁起见,略去 malloc检查。

一般来说,我不会从这样的拆分函数返回 char *指针数组,因为它会让调用者承担很多正确释放它们的责任。我更喜欢的一个接口是允许调用方传递一个回调函数,并对每个令牌调用这个函数,正如我在这里所描述的: 拆分 C 中的字符串

我的版本是:

/*
 * Split str on delimeter into freshly allocated copies, skipping empty
 * fields (leading, trailing and repeated delimiters produce no token).
 *
 * str is not modified. *args receives a malloc'ed array of malloc'ed,
 * NUL-terminated strings; the caller frees each one and then the array.
 *
 * Returns the number of tokens. Returns 0 with *args == NULL when str is
 * NULL or holds no token, and -1 with *args == NULL if an allocation fails.
 */
int split(char* str, const char delimeter, char*** args) {
    int cnt = 0;
    int i;
    const char* t;

    *args = NULL;
    if (str == NULL) return 0;

    /* First pass: count the runs of non-delimiter characters. */
    t = str;
    while (*t != 0) {
        while (*t != 0 && *t == delimeter) t++;
        if (*t != 0) {
            cnt++;
            while (*t != 0 && *t != delimeter) t++;
        }
    }
    if (cnt == 0) return 0;

    (*args) = malloc(sizeof(char*) * cnt);
    if (*args == NULL) return -1;

    /* Second pass: copy each run out. */
    t = str;
    for (i = 0; i < cnt; i++) {
        const char* ts;
        size_t len;

        while (*t == delimeter) t++;
        ts = t;
        while (*t != delimeter && *t != 0) t++;

        len = (size_t)(t - ts);
        (*args)[i] = malloc(len + 1);
        if ((*args)[i] == NULL) {
            while (i > 0) free((*args)[--i]);
            free(*args);
            *args = NULL;
            return -1;
        }
        memcpy((*args)[i], ts, len);
        (*args)[i][len] = 0;
    }

    return cnt;
}

围绕这个问题的两个问题是内存管理和线程安全, 在 C 语言中,要完美地完成这个任务并不容易。我想要的解决方案是:

  • 线程安全(strtok 不是线程安全的)
  • 不使用 malloc 或它的任何派生程序(以避免内存管理问题)
  • 检查各个字段的数组边界(以避免未知数据上的段错误)
  • 使用多字节字段分隔符(utf-8)
  • 忽略输入中的额外字段
  • 为无效字段长度提供软错误例程

我提出的解决方案满足所有这些条件 但是我认为在实践中,额外的工作是值得的 以避免其他解决方案的共同陷阱。

#include <stdio.h>
#include <string.h>


/* One output field: a caller-owned buffer and its total size in bytes. */
struct splitFieldType {
    char *field;        /* destination buffer */
    int   maxLength;    /* size of the buffer, including room for the NUL */
};


typedef struct splitFieldType splitField;


/*
 * Store the len bytes at src into fields[idx], truncating to the buffer size
 * (and reporting through softError, if set) when they do not fit.
 */
static void splitFieldStore(splitField *fields, int idx, const char *src, size_t len,
                            void (*softError)(int fieldNumber,int expected,int actual))  {
    size_t room;

    if (fields[idx].maxLength <= 0) {
        /* No space even for the terminator: report it and write nothing. */
        if (softError != NULL) softError(idx, 0, (int)len);
        return;
    }

    room = (size_t)fields[idx].maxLength - 1;
    if (len > room) {
        if (softError != NULL) softError(idx, (int)room, (int)len);
        len = room;
    }
    memcpy(fields[idx].field, src, len);
    fields[idx].field[len] = 0;
}


/*
 * Split input on fieldSeparator (which may be several bytes long) into at
 * most `expected` caller-provided buffers, without allocating or modifying
 * input.
 *
 * A field that does not fit its buffer is truncated to maxLength-1 bytes and
 * softError (if not NULL) is called with the zero-based field number, the
 * number of bytes that fit and the actual field length. Input fields beyond
 * `expected` are ignored.
 *
 * Returns the number of fields stored.
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual))  {
    int i;
    size_t fieldSeparatorLen=strlen(fieldSeparator);
    const char *tNext, *tLast=input;

    for (i=0; i<expected; ++i) {
        tNext=strstr(tLast, fieldSeparator);
        if (tNext==NULL) {
            /* Last field: everything up to the end of the input. */
            splitFieldStore(fields, i, tLast, strlen(tLast), softError);
            return i+1;
        }
        splitFieldStore(fields, i, tLast, (size_t)(tNext-tLast), softError);
        tLast=tNext+fieldSeparatorLen;
    }
    return i;
}




/*
 * Soft-error callback for the month example: reports on stderr that an input
 * field had an unexpected length. fieldNumber is zero-based and printed
 * one-based.
 */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int humanFieldNumber = fieldNumber + 1;

    fprintf(stderr,"monthSplit: input field #%d is %d bytes, expected %d bytes\n",humanFieldNumber,actual,expected);
}




/*
 * Demonstrates strsplit() with twelve fixed-size month buffers. The input
 * holds fourteen fields, and "APRIL" is deliberately one byte too long for
 * its buffer so that the soft-error callback is exercised.
 */
int main() {
    const char *fieldSeparator=",";
    const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";

    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields;

    splitField inputFields[12] = {
        {monthFields.field1,  sizeof(monthFields.field1)},
        {monthFields.field2,  sizeof(monthFields.field2)},
        {monthFields.field3,  sizeof(monthFields.field3)},
        {monthFields.field4,  sizeof(monthFields.field4)},
        {monthFields.field5,  sizeof(monthFields.field5)},
        {monthFields.field6,  sizeof(monthFields.field6)},
        {monthFields.field7,  sizeof(monthFields.field7)},
        {monthFields.field8,  sizeof(monthFields.field8)},
        {monthFields.field9,  sizeof(monthFields.field9)},
        {monthFields.field10, sizeof(monthFields.field10)},
        {monthFields.field11, sizeof(monthFields.field11)},
        {monthFields.field12, sizeof(monthFields.field12)}
    };

    int expected=sizeof(inputFields)/sizeof(splitField);

    /* Fields that strsplit() does not fill must still print as empty strings. */
    memset(&monthFields, 0, sizeof(monthFields));

    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);

    int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);

    if (ct!=expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }

    for (int i=0;i<expected;++i) {
        printf("field %d: %s\n",i+1,inputFields[i].field);
    }

    printf("\n");
    printf("Direct structure access, field 10: %s", monthFields.field10);
    return 0;
}

下面是一个编译和输出的示例。请注意,在我的示例中,我特意拼写了“ APRIL”,以便您可以看到软错误是如何工作的。

$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC


Direct structure access, field 10: OCT

好好享受吧!

下面是另一个实现,它可以安全地对字符串进行分词,原型与问题中要求的一致,返回一个指向 char 指针的指针(即 char **)。分隔符字符串可以包含多个字符,输入字符串可以包含任意数量的标记。所有的分配和重新分配都由 malloc 和 realloc 处理,不依赖 POSIX 的 strdup。

初始分配的指针数量由 NPTRS 常量控制,唯一的限制是它必须大于零。返回的 char ** 在最后一个标记之后带有一个 NULL 哨兵,类似于 *argv[],其形式可以直接用于 execv、execvp 和 execve。

与 strtok() 一样,多个连续的分隔符被视为一个分隔符,因此对 "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" 的解析,就好像 "MAY" 和 "JUN" 之间只有一个 ',' 一样。

下面的函数是按行注释的,并且添加了一个分割月份的简短 main()。分配的指针的初始数量设置在 2,以便在对输入字符串进行标记时强制进行三次重新分配:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>


#define NPTRS 2     /* initial number of pointers to allocate (must be > 0) */


/* Tokenize src on any character in delim, copying each token.
 * Runs of delimiters act as a single separator, so no empty tokens result.
 * Returns an allocated pointer array ending in a sentinel NULL, or NULL if
 * the initial block of pointers cannot be allocated. The array capacity is
 * doubled whenever it runs out; if a later allocation fails, the tokens
 * collected so far are returned.
 */
char **strsplit (const char *src, const char *delim)
{
    size_t used = 0, cap = NPTRS;           /* filled slots, allocated slots */
    char **dest = malloc (cap * sizeof *dest);
    const char *tok;                        /* start of the current token */

    if (!dest) {
        perror ("malloc-dest");
        return NULL;
    }
    dest[0] = NULL;     /* empty list: sentinel only */

    tok = src + strspn (src, delim);        /* skip leading delimiters */
    while (*tok) {
        size_t len = strcspn (tok, delim);  /* chars up to next delimiter */

        if (used == cap - 1) {              /* only the sentinel slot left */
            void *tmp = realloc (dest, 2 * cap * sizeof *dest);
            if (!tmp) {
                perror ("realloc-dest");
                break;                      /* dest is still valid */
            }
            dest = tmp;
            cap *= 2;
        }

        if (!(dest[used] = malloc (len + 1))) {
            perror ("malloc-dest[i]");
            break;                          /* NULL left here ends the list */
        }
        memcpy (dest[used], tok, len);
        dest[used][len] = 0;
        dest[++used] = NULL;                /* keep the list terminated */

        tok += len;
        tok += strspn (tok, delim);         /* skip the separator run */
    }

    return dest;
}


/* Splits the month list on ',' and prints each token while freeing it. */
int main (void) {

    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");    /* NULL-terminated token list */
    size_t n;

    if (!tokens)
        return 0;

    for (n = 0; tokens[n]; n++) {           /* stop at the sentinel */
        puts (tokens[n]);
        free (tokens[n]);                   /* release each token string */
    }
    free (tokens);                          /* then the pointer array */
}

示例使用/输出

$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC

如果你还有其他问题,请告诉我。

我知道派对迟到了,但这里还有两个功能可以玩,也许还能进一步适应你的需要(文章的 底部的源代码)

另请参阅下面的 实施说明,以决定哪种功能更适合您的需要。

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>  // C99


// tokenize destructively
// Returns an allocated array of char pointers (per the accompanying notes,
// they point into the modified source string and the array ends with a NULL
// sentinel). Only the prototype is shown here; the definition is not part of
// this excerpt.
char **str_toksarray_alloc(
char **strp,       /* InOut: pointer to the source non-constant c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks,     /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls     /* false ignores empty tokens, true includes them */
);


// tokenize non-destructively
// Same parameters as str_toksarray_alloc() except that the source string is
// passed as a const c-string. Per the accompanying notes the returned
// pointers refer to dynamically allocated copies of the tokens.
char **str_toksarray_alloc2(
const char *str,    /* the source c-string */
const char *delim,
size_t *ntoks,
bool keepnulls
);

使用说明

它们的原型几乎相同,只有源字符串参数不同(分别为 strp 和 str)。

strp(指向字符串的指针)是一个已经分配的非常量 c-string 的地址,它将被就地标记。str是一个不会改变的 c-string (它甚至可以是 string-Literal)。我所说的 C 弦是指以 nul结束的字符缓冲区。对于这两个函数,其余的参数是相同的。

要解析所有可用的标记,请把 ntoks“静音”(意思是在传给任一函数之前将其设置为 0,或者直接传入 NULL 指针)。否则函数最多解析 *ntoks 个标记,或者解析到没有更多标记为止(以先到者为准)。无论哪种情况,只要 ntoks 不是 NULL,它都会被更新为成功解析的标记数。

另请注意,未“静音”的 ntoks 决定了将分配多少个指针。因此,如果源字符串大约包含 10 个标记,而我们将 ntoks 设置为 1000,那么最终会多分配 990 个不必要的指针。反过来,如果源字符串大约包含 1000 个标记,但我们只需要前 10 个,那么将 ntoks 设置为 10 显然是更明智的选择。

这两个函数都是 分配并返回一个字符指针数组,但是 str_toksarray_alloc()使它们指向修改后的源字符串本身中的令牌,而 str_toksarray_alloc2()使它们指向动态分配的令牌副本(名称末尾的2表示分配的2级)。

返回的数组附加了一个 NULL前哨指针,在传递回的 ntoks值中没有考虑到这一点(换句话说,当 non-NULL传递回来时,ntoks将返回数组的长度传递给调用者,而不是它的第一级大小)。

当 keepnulls 设置为 true 时,得到的标记与 strsep() 函数的结果类似。这主要意味着:源字符串中连续的分隔符会产生空标记;如果 delim 是空的 C 字符串,或者源字符串中找不到它包含的任何分隔符,则结果只有 1 个标记,即源字符串本身。与 strsep() 不同的是,把 keepnulls 设置为 false 可以忽略空标记。

失败的 函数调用可以通过根据 NULL检查它们的返回值来识别,或者通过根据0检查传递回的 ntoks值来识别(假设 ntoksnon-NULL)。我建议在尝试访问返回的数组之前总是检查是否失败,因为这些函数包括健全性检查,可以推迟本来会立即发生的崩溃(例如,将 NULL指针作为源字符串传递)。

成功时,调用方应该在用完数组后将其释放。对于 str_toksarray_alloc(),一次简单的 free() 调用就足够了。对于 str_toksarray_alloc2(),由于存在第二级分配,需要用一个循环来释放。NULL 哨兵(或者传回的非 NULL 的 ntoks 值)使这件事变得很简单,不过我也在下面提供了一个 toksarray_free2() 函数,供所有想偷懒的人使用 :)

下面是使用这两个函数的简化示例。

准备:

/* Shared setup for the two usage examples. */
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,";   /* either ';' or ',' separates tokens */
bool keepnulls = true;      /* true includes empty tokens, false ignores them */
size_t ntoks = 0;           /* 0 requests all tokens */

Str _ toksarray _ alloc () :

// destructive (use copy of src)


char *scopy = strdup( src );
if (!scopy) { ... };          // handle strdup failure


printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks );   // ntoks is a size_t, so %zu
if ( arrtoks ) {
    // the array ends with a NULL sentinel
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
free( scopy );      // the tokens live inside scopy
free( arrtoks );    // only the pointer array was allocated


/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/

Str _ toksarray _ alloc2() :

// non-destructive


keepnulls = false;    // reject empty tokens


printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks );   // ntoks is a size_t, so %zu
if ( arrtoks ) {
    // the array ends with a NULL sentinel
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks );                     // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks );    // non-dangling artoks


/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/

实施说明

这两个函数都使用 strsep()(https://linux.die.net/man/3/strsep)进行标记化,这使它们是线程安全的,但 strsep() 不是标准函数。如果你的平台没有提供它,可以使用开源实现(例如 GNU 的或 Apple 的)。str_toksarray_alloc2() 中使用的 strdup()(https://linux.die.net/man/3/strdup)函数也是如此(它的实现很简单,但同样可以参考 GNU 和 Apple 的实现)。

在 str_toksarray_alloc() 中使用 strsep() 的一个副作用是:在解析循环的每一步,源字符串的起始指针都会被移动到下一个标记。这意味着调用者将无法释放解析后的字符串,除非他们事先把起始地址保存到另一个指针中。我们在函数内部用 strpSaved 指针完成了这件事,替调用者省去了麻烦。str_toksarray_alloc2() 不受此影响,因为它不修改源字符串。

这两个函数之间的主要区别在于,str_toksarray_alloc()不为找到的令牌分配内存。它只为数组指针分配空间,并将它们直接指向源字符串。这是因为 一个 href = “ https://linux.die.net/man/3/strsep”rel = “ nofollow norefrer”> strsep () nul-就地终止找到的令牌。这种依赖关系可能会使支持代码复杂化,但是对于大字符串,它也会在性能上产生很大的差异。如果保留源字符串并不重要,那么它也会对内存占用产生很大的影响。

另一方面,str_toksarray_alloc2()分配并返回动态分配的令牌副本的自我维持的数组,没有进一步的依赖关系。它首先从源字符串的本地副本创建数组,然后将实际的令牌内容复制到数组中。这比 str_toksarray_alloc()慢得多,占用的内存也大得多,但是它没有进一步的依赖关系,并且没有对源字符串的性质设置特殊的要求。这使得编写更简单(因此更易于维护)的支持代码变得更加容易。

这两个函数之间的另一个区别,是当 ntoks 被“静音”(为 NULL 或指向 0)时第一级分配(指针数组)的方式。两者都会解析所有可用的标记,但采用的方法完全不同。str_toksarray_alloc() 使用预先分配(alloc-ahead):初始大小为 16 个字符指针,在解析循环中按需翻倍。str_toksarray_alloc2() 则先遍历一次,统计所有可用的标记,然后一次性分配恰好这么多个字符指针。这第一次遍历由辅助函数 str_tokscount() 完成,它使用标准函数 strpbrk() 和 strchr()。下面我会给出该函数的源代码。

哪种方法更好实际上取决于您的决定,这取决于您的项目的需要。您可以随意调整每个函数的代码,以便从这两种方法中选择一种。

我想说,平均而言,对于真正大的字符串,alloc-ahead 要快得多,特别是当初始大小和增长因子根据具体情况进行微调时(例如,使它们成为函数参数)。保存所有这些 strchr()strpbrk()的额外传球可以有所不同。但是,对于相对较小的字符串(这种情况很常见) ,仅仅提前分配一堆字符指针就有些过头了。在这种情况下,虽然不会造成什么损害,但是确实会无缘无故地使代码变得混乱。无论如何,请随意选择最适合你的。

这两个函数也是如此。我想说,在大多数情况下,str_toksarray_alloc2()处理起来要简单得多,因为对于中小型字符串来说,内存和性能很少是问题。如果您必须处理巨大的字符串,那么可以考虑使用 str_toksarray_alloc()(尽管在这些情况下,您应该使用专门的字符串解析函数,以接近您的项目的需要和输入的规格)。

哦,天哪,我觉得那可不仅仅是2美分那么简单(哈哈)。

无论如何,这里是2个函数和 helper 函数的代码(我已经删除了它们的大部分描述注释,因为我已经涵盖了几乎所有内容)。

源代码

str_toksarray_alloc():

// ----------------------------------------
// Split the nul-terminated string *strp in place (strsep() overwrites the
// delimiters it finds) and return a malloc'ed, NULL-terminated array of
// pointers into that string, one per token.
//
//   strp      - address of the string pointer; *strp is restored to its
//               initial value before returning.
//   delim     - set of delimiter characters.
//   ntoks     - optional. A non-zero *ntoks on entry caps the number of
//               tokens; otherwise the array starts at 16 slots and doubles
//               on demand. On return *ntoks is the number of stored tokens,
//               or 0 on failure.
//   keepnulls - when false, empty tokens are skipped.
//
// Returns NULL on invalid arguments, an empty source string, or allocation
// failure. The caller frees the returned array (not the tokens).
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    char **arr = NULL;

    // reject NULL arguments and an empty source string
    if ( strp == NULL || *strp == NULL || **strp == '\0' || delim == NULL ) {
        goto failed;
    }

    char *const origin = *strp;                          // strsep() moves *strp
    const bool capped = (ntoks != NULL && *ntoks != 0);  // caller gave a limit?
    size_t capacity = capped ? *ntoks : 16;              // usable slots

    // one extra slot for the NULL sentinel
    arr = malloc( (capacity + 1) * sizeof(*arr) );
    if ( arr == NULL ) {
        goto failed;
    }

    size_t count = 0;   // tokens stored so far
    for (char *tok = strsep(strp, delim); tok != NULL; tok = strsep(strp, delim)) {
        if ( !keepnulls && *tok == '\0' ) {
            continue;   // empty token not wanted
        }
        if ( count == capacity ) {
            if ( capped ) {
                // caller-supplied limit reached: stop parsing
                *ntoks = count;
                break;
            }
            // no limit: double the array and carry on
            capacity *= 2;
            char **grown = realloc( arr, (capacity + 1) * sizeof(*grown) );
            if ( grown == NULL ) {
                *strp = origin;
                free( arr );
                goto failed;
            }
            arr = grown;
        }
        arr[count++] = tok;
    }
    arr[count] = NULL;      // sentinel

    *strp = origin;         // hand the caller back their original pointer
    if ( ntoks != NULL ) {
        *ntoks = count;
    }
    return arr;

failed:
    if ( ntoks != NULL ) {
        *ntoks = 0;
    }
    return NULL;
}

str_toksarray_alloc2():

// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
//   str       - source string; it is never modified.
//   delim     - set of delimiter characters.
//   ntoks     - optional. A non-zero *ntoks on entry caps the number of
//               tokens; otherwise str_tokscount() decides how many slots to
//               allocate. On return *ntoks is the number of stored tokens,
//               or 0 on failure.
//   keepnulls - when false, empty tokens are skipped.
//
// The caller owns the array and every string in it.
//

// helper used when no token limit is supplied
size_t str_tokscount( const char *str, const char *delim, bool keepnulls );

char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    char **toksarr = NULL;
    size_t i = 0;           // # of actually parsed tokens

    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with; `copy` keeps the address returned by
    // strdup() so that exactly that pointer is passed to free() later
    char *copy = strdup( str );
    if ( !copy ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    size_t want = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( want == 0 ) {      // str_tokscount() found nothing to return
        goto fail;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (want + 1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail;
    }

    // Parse the copy and duplicate its tokens into the array.
    // strsep() advances the pointer it is given (and sets it to NULL after
    // the last token), so it works on a separate cursor, never on `copy`.
    char *cursor = copy;
    char *tok;
    while ( i < want && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;      // NULL sentinel

    free( copy );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing
fail:
    if ( toksarr ) {
        for (size_t idx = 0; idx < i; idx++) {
            free( toksarr[idx] );
        }
        free( toksarr );
    }
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_tokscount():辅助函数,由 str_toksarray_alloc2() 使用:

// ----------------------------------------
// Count the tokens of the nul-terminated string str, where any character of
// the nul-terminated string delim ends a token.
//
// With keepnulls true, empty tokens are counted as well: one for a leading
// delimiter, one for each delimiter directly followed by another delimiter,
// and one for a trailing delimiter. With keepnulls false they are excluded.
//
// Returns 1 when delim is empty or none of its characters occurs in str
// (the whole string is then a single token).
// Returns 0 when str or delim is NULL, or when str is an empty string.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    if ( str == NULL || delim == NULL || *str == '\0' ) {
        return 0;
    }

    size_t total = 1;       // a string without delimiters is still one token
    size_t empties = 0;     // how many of those tokens are empty

    // a delimiter in first position means the first token is empty
    if ( strchr(delim, *str) != NULL ) {
        empties = 1;
    }

    const char *hit = strpbrk(str, delim);
    while ( hit != NULL ) {
        const char *next = hit + 1;     // first char of the following token
        total++;
        // strchr() also matches the terminating '\0', so a trailing
        // delimiter is counted as an empty last token
        if ( strchr(delim, *next) != NULL ) {
            empties++;
        }
        hit = strpbrk(next, delim);
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}

toksarray_free2():用于释放 str_toksarray_alloc2() 返回的数组:

// ----------------------------------------
// Release a NULL-terminated array of char-pointers together with the
// dynamically allocated string each entry points to (two levels of
// deallocation, hence the 2 in the name).
//
// The array must end with a NULL sentinel:
//      toksarr[0] = tok1, ..., toksarr[len] = NULL
//
// Passing NULL is allowed and does nothing.
// Always returns NULL, so the caller can write
//      arr = toksarray_free2( arr );
// to avoid keeping a dangling pointer.
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }

    // free every string up to (not including) the sentinel
    for (size_t k = 0; toksarr[k] != NULL; k++) {
        free( toksarr[k] );
    }
    free( toksarr );

    return NULL;
}
#include <cstring>
#include <cstdio>
// Print each space-separated word of a fixed sentence on its own line.
// strtok() treats a run of spaces as a single separator, so the four
// spaces before "here!" produce no empty words.
int main()
{
    char buf[] = "This is Luke Skywalker    here!";

    char* word = strtok( buf, " ");     // first call: pass the buffer
    while ( word != nullptr) {
        puts( word);
        word = strtok( nullptr, " ");   // later calls: continue in the same buffer
    }
}


输出

This
is
Luke
Skywalker
here!

我试着做了一个非常简单的例子。

#include <stdio.h>
#include <string.h>


/*
 * Split inputArr in place with strtok() and store a pointer to each token
 * in outputArr[0], outputArr[1], ...
 *
 * inputArr  : writable string; strtok() overwrites the delimiters it finds.
 * outputArr : destination array, filled in order. No terminator is written
 *             and no bound is checked, so it must be large enough for every
 *             token of inputArr.
 * delim     : set of delimiter characters.
 */
void split(char* inputArr, char** outputArr, char* delim) {
    int slot = 0;
    char *piece = strtok(inputArr, delim);

    while (piece != NULL) {
        outputArr[slot] = piece;
        slot++;
        piece = strtok(NULL, delim);
    }
}


/*
 * Read the text file named by argv[1] line by line, split every line on
 * spaces (like Java's split()) and print its first four tokens.
 *
 * Returns 0 normally (also when the argument count is wrong, after printing
 * a hint) and 1 when the file cannot be opened.
 */
int main(int argc, char **argv){
    enum {
        BUFFER_LEN = 100,   /* longest chunk fgets() reads, incl. the '\0'      */
        MAX_TOKENS = 100,   /* a 99-char chunk cannot hold this many tokens     */
        SHOWN      = 4      /* number of tokens printed per line                */
    };

    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        printf("\n");
        /*---------main code starts here----------*/
        FILE *myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            /* without this check fgets() would be handed a NULL stream */
            perror(argv[1]);
            return 1;
        }

        /* read txt file and split into array like java split() */
        char buffer[BUFFER_LEN];
        char *splitArr[MAX_TOKENS];

        while(fgets(buffer, BUFFER_LEN, myScriptFile) != NULL){
            /* split() writes no terminator: clear the slots printed below so
             * a short line shows "(null)" instead of stale or uninitialized
             * pointers */
            for (int i = 0; i < SHOWN; i++) {
                splitArr[i] = NULL;
            }

            split(buffer, splitArr, " ");

            for (int i = 0; i < SHOWN; i++) {
                printf("Index %d String: %s\n", i,
                       splitArr[i] != NULL ? splitArr[i] : "(null)");
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}

假设. txt 文件具有

Hello this is test
Hello2 this is test2

使用. txt 文件作为参数运行它会给出

Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test


Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

strtok() 和 strsep() 都会修改输入字符串。我们可以使用 strspn() 和 strpbrk() 编写一个基于分隔符拆分字符串、且不修改输入的函数。

算法:

  1. 如果输入字符串不为空,则转到步骤2 else 返回 null
  2. 跳过分隔符,如果在字符串的开始处有,并记录单词的开始位置(使用 strspn()) ,称之为 start
  3. 从前一步中找到的当前起始位置(使用 strpbrk())查找下一个分隔符位置(如果不存在更多的分隔符,则查找字符串的结束位置) ,将其命名为 end
  4. 在该内存中分配内存并将字符串从 start复制到 end
  5. 返回该标记。

优点:

  1. 线程安全。
  2. 处理多个分隔符。
  3. 便携式的。
  4. 不像 strtok()strsep()那样修改输入字符串。

实施方法:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>


/*
 * alloc_str allocates memory and copies the substring [start, end) into it,
 * followed by a terminating '\0'.
 *
 * Returns NULL when start or end is NULL or when the range is empty or
 * reversed (start >= end). If the allocation fails, a message is printed to
 * stderr and the program exits with EXIT_FAILURE.
 * The caller frees the returned string.
 */
static char * alloc_str (const char * start, const char * end) {
    if (!start || !end || (start >= end)) {
        return NULL;
    }

    char * tmp = malloc (end - start + 1);
    if (tmp) {
        memcpy (tmp, start, end - start);
        tmp[end - start] = '\0';
    } else {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    return tmp;
}


/*
 * str_split function returns the next token which is sequences of contiguous
 * characters separated by any of the characters that are part of delimiters.
 * The input string is not modified; only the pointer *p_str is advanced.
 *
 * Parameters:
 * p_str : Address of pointer to the string that you want to split. On return
 *         *p_str points just past the returned token, so calling the function
 *         again with the same p_str yields the following token.
 * sep : A set of characters that delimit the pieces in the string.
 *
 * Behaviour is undefined if sep is non-NULL but not a pointer to a
 * null-terminated string.
 *
 * Return :
 * Returns the pointer to dynamically allocated memory where the token is copied;
 * the caller frees it.
 * NULL is returned if p_str, *p_str or sep is NULL, if *p_str is an empty
 * string, or if only separators are left.
 */
char * str_split (char ** p_str, const char * sep) {
    /* p_str itself is checked too: the contract above promises NULL for a
     * NULL p_str, so it must not be dereferenced unchecked */
    if (!p_str || !*p_str || !**p_str || !sep) {
        return NULL;
    }

    /* skip leading separators */
    char * begin = *p_str + strspn (*p_str, sep);

    /* the token ends at the next separator, or at the end of the string
     * when strpbrk() finds none */
    char * stop = strpbrk (begin, sep);
    if (!stop) {
        stop = begin + strlen (begin);
    }

    /* alloc_str() yields NULL for an empty range (nothing but separators) */
    char * token = alloc_str (begin, stop);
    *p_str = stop;

    return token;
}


/*==================================================*/
/*==================================================*/


/*
* Just a helper function
*/


void token_helper (char * in_str, const char * delim) {
printf ("\nInput string : ");


if (in_str) printf ("\"%s\"\n", in_str);
else printf ("NULL\n");


if (delim) printf ("Delimiter : \"%s\"\n", delim);


char * ptr = in_str;
char * token = NULL;


printf ("Tokens:\n");
while ((token = str_split(&ptr, delim)) != NULL) {
printf ("-> %s\n", token);
/* You can assign this token to a pointer of an array of pointers
* and return that array of pointers from this function.
* Since, this is for demonstration purpose, I am
* freeing the allocated memory now.
*/
free (token);
}
}


/*
 * Driver function: runs token_helper() over a fixed list of test inputs.
 */
int main (void) {
    /* test cases, in the order they are run; a NULL text means
     * token_helper() is called with a NULL input string */
    static const struct {
        const char * text;
        const char * delim;
    } cases[] = {
        { "hello world!",                                             " "   },
        { " hello world,friend of mine!",                             " ,"  },
        { "Another string",                                           "-!"  },
        { "   one  more   -- string  !",                              "- !" },
        { "",                                                         " "   },
        { NULL,                                                       ""    },
        { "hi",                                                       " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp"  },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC",          ","   },
    };

    /* every input is copied into this writable buffer before use */
    char string[100];

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].delim);
            continue;
        }
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].delim);
    }

    return 0;
}

产出:

# ./a.out


Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!


Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!


Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string


Input string : "   one  more   -- string  !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string


Input string : ""
Delimiter : " "
Tokens:


Input string : NULL
Delimiter : ""
Tokens:


Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi


Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
->  of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .


Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC

为了寻找一个简单的解决方案,我偶然发现了这个。 我着迷于所有的选项,但对我自己的用例/品味不满意(这可能是可怕的)。

我写了一个有点与众不同的解决方案:它力求对使用者行为清晰,不做任何内存重新分配(realloc),并带有便于阅读的注释。

上传到 gist.github: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093

例如:

#include "./strutils.c"


// Fill in the two input fields: the text to split and the (multi-character)
// delimiter string. The source starts with the delimiter, so the first
// piece is an empty string.
struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";


// Allocates info.splitStrings / info.splitStringsLengths and sets
// info.splitStringsCount.
str_split_begin(&info);


char * substr;


// Print every piece.
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}


// Frees everything str_split_begin allocated.
str_split_end(&info);

产出:

$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'

strutils.c 的完整源码:

#ifndef STRUTILS_C
#define STRUTILS_C 1


#ifndef str
#define str char *
#endif


#include <stdlib.h>
#include <stdbool.h>
#include <string.h>


#include <stdio.h>


/* Input and output of one split operation (str_split_begin / str_split_end). */
struct str_split_info {
/* The string to be split.
* Provided by the caller of str_split_begin.
*/
str source;
/* The string that cuts the source string; occurrences of this string
* separate the pieces and are not part of any piece.
* Provided by the caller of str_split_begin.
*/
str delimiter;


/* Array of the pieces of source, split by delimiter.
* Allocated by str_split_begin; must be freed with str_split_end.
*/
str * splitStrings;


/* Array of piece lengths, parallel to splitStrings.
* Allocated by str_split_begin; must be freed with str_split_end.
*/
int * splitStringsLengths;


/* Number of entries in splitStrings.
* Set by str_split_begin, reset to 0 by str_split_end.
*/
int splitStringsCount;
};
#define str_split_infop struct str_split_info *


/* Split a string by a delimiting string
*
* Reads info->source and info->delimiter and fills:
*   info->splitStrings        - newly allocated, NUL-terminated copies of the
*                               pieces of source between occurrences of
*                               delimiter (pieces may be empty)
*   info->splitStringsLengths - length of each piece, without the NUL
*   info->splitStringsCount   - number of pieces; always at least 1 on
*                               success (a source that does not contain the
*                               delimiter, or an empty delimiter, gives the
*                               whole source as the single piece)
*
* Occurrences of the delimiter are searched left to right and do not overlap.
*
* If an allocation fails, everything allocated so far is released and the
* result is left empty: splitStringsCount is 0 and both arrays are NULL.
*
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
    /* start from a well-defined empty result, so str_split_end stays safe
     * on every exit path */
    info->splitStrings = NULL;
    info->splitStringsLengths = NULL;
    info->splitStringsCount = 0;

    const char *source = info->source;
    const char *delimiter = info->delimiter;
    const size_t delimiterLength = strlen(delimiter);

    /* first pass, simply count occurrences so we can allocate only once;
     * an empty delimiter never matches */
    int count = 1;
    if (delimiterLength > 0) {
        const char *hit = strstr(source, delimiter);
        while (hit != NULL) {
            count++;
            hit = strstr(hit + delimiterLength, delimiter);
        }
    }

    /* calloc leaves every slot NULL, which keeps the cleanup below simple */
    char **pieces = calloc((size_t) count, sizeof *pieces);
    int *lengths = malloc((size_t) count * sizeof *lengths);
    if (pieces == NULL || lengths == NULL) {
        free(pieces);
        free(lengths);
        return;
    }

    /* second pass, fill the arrays */
    const char *begin = source;
    for (int i = 0; i < count; i++) {
        const int isLast = (i + 1 == count);

        /* every piece but the last ends at the next delimiter (the first
         * pass guarantees it exists); the last runs to the end of source */
        const char *end = isLast ? begin + strlen(begin)
                                 : strstr(begin, delimiter);
        const size_t length = (size_t) (end - begin);

        /* +1 for trailing null for c-string */
        pieces[i] = malloc(length + 1);
        if (pieces[i] == NULL) {
            for (int k = 0; k < i; k++) {
                free(pieces[k]);
            }
            free(pieces);
            free(lengths);
            return;
        }
        memcpy(pieces[i], begin, length);
        pieces[i][length] = '\0';
        lengths[i] = (int) length;

        /* next piece starts right after the current delimiter */
        if (!isLast) {
            begin = end + delimiterLength;
        }
    }

    info->splitStrings = pieces;
    info->splitStringsLengths = lengths;
    info->splitStringsCount = count;
}
/* Return the number of split strings recorded in info
* (the value of info->splitStringsCount).
*/
int str_split_count (str_split_infop info) {
return info->splitStringsCount;
}


/* Copy every split string of info into the caller's buffers.
*
* out must hold at least info->splitStringsCount pointers, each pointing to
* a writable buffer large enough for the corresponding split string
* including its terminating null; strcpy performs no bounds checking.
*/
void str_split_get (str_split_infop info, str * out) {
    const int total = info->splitStringsCount;
    int index = 0;

    while (index < total) {
        strcpy(out[index], info->splitStrings[index]);
        index++;
    }
}


/* Release everything held in info's result fields: each split string, the
* array of strings and the array of lengths, then reset the count to 0.
*
* Does nothing when the count is not positive or the string array is NULL.
*/
void str_split_end (str_split_infop info) {
    if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
        return;
    }

    //free each string allocated
    const int total = info->splitStringsCount;
    int index = 0;
    while (index < total) {
        free(info->splitStrings[index]);
        index++;
    }

    //free the two arrays themselves
    free(info->splitStrings);
    free(info->splitStringsLengths);

    info->splitStringsCount = 0;
}


/* Smoke test: split a fixed sentence on single spaces, walk over the
* resulting pieces without using them, then release them.
*/
void str_split_test () {
    struct str_split_info info;

    info.source = "hello world this is a test";
    info.delimiter = " ";

    str_split_begin (&info);

    //iterate thru split substrings; info.splitStrings[i] is the i-th piece
    //NOTE: the pieces are freed by str_split_end, do not use them afterwards
    int i = 0;
    while (i < info.splitStringsCount) {
        i++;
    }

    str_split_end(&info);
}


#endif