string StringToUpper(string strToConvert)
{//change each element of the string to upper case
for(unsigned int i=0;i<strToConvert.length();i++)
{
strToConvert[i] = toupper(strToConvert[i]);
}
return strToConvert;//return the converted string
}
string StringToLower(string strToConvert)
{//change each element of the string to lower case
for(unsigned int i=0;i<strToConvert.length();i++)
{
strToConvert[i] = tolower(strToConvert[i]);
}
return strToConvert;//return the converted string
}
// char, not int, is essential: otherwise gcc unpacks to vectors of int! Huge slowdown.
char ascii_toupper_char(char c) {
return ('a' <= c && c <= 'z') ? c^0x20 : c; // ^ autovectorizes to PXOR: runs on more ports than paddb
}
// gcc can only auto-vectorize loops when the number of iterations is known before the first iteration. strlen gives us that
size_t strtoupper_autovec(char *dst, const char *src) {
size_t len = strlen(src);
for (size_t i=0 ; i<len ; ++i) {
dst[i] = ascii_toupper_char(src[i]); // gcc does the vector range check with psubusb / pcmpeqb instead of pcmpgtb
}
return len;
}
#include <immintrin.h>
__m128i upcase_si128(__m128i src) {
// The above 2 paragraphs were comments here
__m128i rangeshift = _mm_sub_epi8(src, _mm_set1_epi8('a'+128));
__m128i nomodify = _mm_cmpgt_epi8(rangeshift, _mm_set1_epi8(-128 + 25)); // 0:lower case -1:anything else (upper case or non-alphabetic). 25 = 'z' - 'a'
__m128i flip = _mm_andnot_si128(nomodify, _mm_set1_epi8(0x20)); // 0x20:lcase 0:non-lcase
// just mask the XOR-mask so elements are XORed with 0 instead of 0x20
return _mm_xor_si128(src, flip);
// it's easier to xor with 0x20 or 0 than to AND with ~0x20 or 0xFF
}
// FIXME: doesn't always copy the terminating 0.
// microbenchmarks are for this version of the code (with _mm_store in the loop, instead of storeu, for Merom).
size_t strtoupper_sse2(char *dst, const char *src_begin) {
const char *src = src_begin;
// scalar until the src pointer is aligned
while ( (0xf & (uintptr_t)src) && *src ) {
*(dst++) = ascii_toupper(*(src++));
}
if (!*src)
return src - src_begin;
// current position (p) is now 16B-aligned, and we're not at the end
int zero_positions;
do {
__m128i sv = _mm_load_si128( (const __m128i*)src );
// TODO: SSE4.2 PCMPISTRI or PCMPISTRM version to combine the lower-case and '\0' detection?
__m128i nullcheck = _mm_cmpeq_epi8(_mm_setzero_si128(), sv);
zero_positions = _mm_movemask_epi8(nullcheck);
// TODO: unroll so the null-byte check takes less overhead
if (zero_positions)
break;
__m128i upcased = upcase_si128(sv); // doing this before the loop break lets gcc realize that the constants are still in registers for the unaligned cleanup version. But it leads to more wasted insns in the early-out case
_mm_storeu_si128((__m128i*)dst, upcased);
//_mm_store_si128((__m128i*)dst, upcased); // for testing on CPUs where storeu is slow
src += 16;
dst += 16;
} while(1);
// handle the last few bytes. Options: scalar loop, masked store, or unaligned 16B.
// rewriting some bytes beyond the end of the string would be easy,
// but doing a non-atomic read-modify-write outside of the string is not safe.
// Upcasing is idempotent, so unaligned potentially-overlapping is a good option.
unsigned int cleanup_bytes = ffs(zero_positions) - 1; // excluding the trailing null
const char* last_byte = src + cleanup_bytes; // points at the terminating '\0'
// FIXME: copy the terminating 0 when we end at an aligned vector boundary
// optionally special-case cleanup_bytes == 15: final aligned vector can be used.
if (cleanup_bytes > 0) {
if (last_byte - src_begin >= 16) {
// if src==dest, this load overlaps with the last store: store-forwarding stall. Hopefully OOO execution hides it
__m128i sv = _mm_loadu_si128( (const __m128i*)(last_byte-15) ); // includes the \0
_mm_storeu_si128((__m128i*)(dst + cleanup_bytes - 15), upcase_si128(sv));
} else {
// whole string less than 16B
// if this is common, try 64b or even 32b cleanup with movq / movd and upcase_si128
#if 1
for (unsigned int i = 0 ; i <= cleanup_bytes ; ++i) {
dst[i] = ascii_toupper(src[i]);
}
#else
// gcc stupidly auto-vectorizes this, resulting in huge code bloat, but no measurable slowdown because it never runs
for (int i = cleanup_bytes - 1 ; i >= 0 ; --i) {
dst[i] = ascii_toupper(src[i]);
}
#endif
}
}
return last_byte - src_begin;
}
#include <ctype.h>
inline void toupper(char* str)
{
while (str[i]) {
if (islower(str[i]))
str[i] &= ~32; // Clear bit 6 as it is what differs (32) between Upper and Lowercases
i++;
}
}
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
Compilador de optimización de C/C++ de Microsoft (R) versión 19.27.29111 para x64
(C) Microsoft Corporation. Todos los derechos reservados.
main.cpp
Microsoft (R) Incremental Linker Version 14.27.29111.0
Copyright (C) Microsoft Corporation. All rights reserved.
/out:main.exe
main.obj
kernel32.lib
user32.lib
gdi32.lib
winspool.lib
comdlg32.lib
advapi32.lib
shell32.lib
ole32.lib
oleaut32.lib
uuid.lib
odbc32.lib
odbccp32.lib
运行main.exe >main.exe < / p >
结果
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
//Since I work on a MAC, and Windows methods mentioned do not work for me, I //just built this quick method.
string str;
str = "This String Will Print Out in all CAPS";
int len = str.size();
char b;
for (int i = 0; i < len; i++){
b = str[i];
b = toupper(b);
// b = to lower(b); //alternately
str[i] = b;
}
cout<<str;