用 Javascript 解码 UTF-8

小开

回答最初的问题: 以下是如何在 javascript 中解码 utf-8:

http://ecmanaut.blogspot.ca/2006/07/encoding-decoding-utf8-in-javascript.html

具体来说,

/**
 * Encodes a string as UTF-8 and returns the bytes packed into a "binary
 * string": one character per byte, each with a char code in 0-255.
 *
 * encodeURIComponent percent-escapes the UTF-8 bytes of `s`; unescape then
 * turns every %XX escape back into the single character with that code.
 *
 * @param {string} s - Text to encode.
 * @returns {string} Binary string holding the UTF-8 bytes of `s`.
 */
function encode_utf8(s) {
  var percentEscaped = encodeURIComponent(s);
  return unescape(percentEscaped);
}


/**
 * Decodes a "binary string" (one character per UTF-8 byte) into a regular
 * JavaScript string.
 *
 * escape turns each byte-valued character into a %XX escape, and
 * decodeURIComponent interprets those escapes as UTF-8.
 *
 * @param {string} s - Binary string holding UTF-8 bytes.
 * @returns {string} The decoded text.
 */
function decode_utf8(s) {
  var percentEscaped = escape(s);
  return decodeURIComponent(percentEscaped);
}

我们已经在我们的生产代码中使用了6年，并且它工作得非常完美。

但是请注意，escape() 和 unescape() 已被弃用，不推荐使用。参见这里。

小开

这应该会奏效:

// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt


/* utf.js - UTF-8 <=> UTF-16 convertion
*
* Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
* Version: 1.0
* LastModified: Dec 25 1999
* This library is free.  You can redistribute it and/or modify it.
*/


/**
 * Converts an array of UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Only 1-, 2- and 3-byte sequences are decoded. Any byte whose high nibble is
 * not a recognised lead (continuation bytes 0x80-0xBF and 0xF0-0xFF) is
 * skipped without producing output, so 4-byte sequences are dropped.
 * Continuation bytes are not validated, and a missing trailing byte
 * contributes zero bits.
 *
 * @param {ArrayLike<number>} array - UTF-8 encoded bytes.
 * @returns {string} The decoded text.
 */
function Utf8ArrayToStr(array) {
  var decoded = "";
  var total = array.length;
  var pos = 0;
  var lead, second, third, highNibble;

  while (pos < total) {
    lead = array[pos++];
    highNibble = lead >> 4;

    if (highNibble >= 0 && highNibble <= 7) {
      // 0xxxxxxx: single byte
      decoded += String.fromCharCode(lead);
    } else if (highNibble === 12 || highNibble === 13) {
      // 110x xxxx   10xx xxxx
      second = array[pos++];
      decoded += String.fromCharCode(((lead & 0x1F) << 6) | (second & 0x3F));
    } else if (highNibble === 14) {
      // 1110 xxxx  10xx xxxx  10xx xxxx
      second = array[pos++];
      third = array[pos++];
      decoded += String.fromCharCode(
        ((lead & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F)
      );
    }
    // Every other high nibble is ignored.
  }

  return decoded;
}

看看 JSFiddle 演示。

另见相关问题：这里和这里。

小开

我寻找了一个简单的解决方案，这对我很有效:

// Input data: wrap the raw buffer in a byte view.
// Declared explicitly so the snippet does not create implicit globals
// (an undeclared assignment throws in strict mode).
var view = new Uint8Array(data);

// Output string: one character per byte of the view.
var serialString = ua2text(view);


/**
 * Converts a byte array to a string by mapping every byte to the character
 * with the same code.
 *
 * This is a byte-per-character mapping, not a UTF-8 decoder: a multi-byte
 * UTF-8 sequence comes out as one character per byte.
 *
 * @param {ArrayLike<number>} ua - Bytes to convert.
 * @returns {string} One character per input element.
 */
function ua2text(ua) {
  // Keep the accumulator local: an undeclared `s` would be an implicit global
  // and throws a ReferenceError in strict mode / ES modules.
  let s = "";
  for (let i = 0; i < ua.length; i++) {
    s += String.fromCharCode(ua[i]);
  }
  return s;
}

我唯一的问题是有时候一次只能收到一个字符。这可能是我的 ArrayBuffer 数据源本身的设计所致。我在用 https://github.com/xseignard/cordovarduino 读取 Android 设备上的串行数据。

小开

@albert 的解决方案是最接近的，但它最多只能解析 3 字节的 UTF-8 字符。

/**
 * Decodes UTF-8 bytes into a string, tolerating invalid bytes and a
 * truncated final sequence (useful when the input arrives in chunks).
 *
 * Invalid bytes are skipped. Lead bytes announcing 1 to 5 continuation bytes
 * are recognised. Values above 0xFFFF are emitted as a surrogate pair.
 *
 * @param {Array<number>|Uint8Array} array - UTF-8 encoded bytes.
 * @returns {{result: string, leftovers: (Array<number>|Uint8Array)}} The
 *   decoded text plus, when the input ends in the middle of an otherwise
 *   well-formed sequence, the bytes of that unfinished sequence (obtained
 *   with `array.slice`); `leftovers` is an empty array otherwise.
 */
function utf8ArrayToStr(array) {
  var text = "";
  var total = array.length;
  var pos = 0;

  // Number of continuation bytes announced by a lead byte, or 0 when the
  // byte cannot start a multi-byte sequence.
  function trailingCount(lead) {
    if (lead >> 5 === 0x06) { return 1; }
    if (lead >> 4 === 0x0e) { return 2; }
    if (lead >> 3 === 0x1e) { return 3; }
    if (lead >> 2 === 0x3e) { return 4; }
    if (lead >> 1 === 0x7e) { return 5; }
    return 0;
  }

  // XXX: Invalid bytes are ignored
  while (pos < total) {
    var lead = array[pos++];

    // 0xxx xxxx: plain single-byte character
    if (lead >> 7 === 0) {
      text += String.fromCharCode(lead);
      continue;
    }

    // Stray continuation bytes and unknown leads both yield 0 and are skipped.
    var extra = trailingCount(lead);
    if (extra === 0) {
      continue;
    }

    // Not enough bytes left to finish this character?
    if (pos + extra > total) {
      var pending = array.slice(pos - 1);

      // Advance over the continuation bytes that are present; if a
      // non-continuation byte shows up, resume decoding from it.
      while (pos < total && array[pos] >> 6 === 0x02) {
        pos++;
      }
      if (pos !== total) {
        continue;
      }

      // Everything that remained was a valid, merely incomplete, sequence.
      return {result: text, leftovers: pending};
    }

    // Strip the length prefix from the lead byte, then append 6 bits per
    // continuation byte.
    var codePoint = lead & ((1 << (8 - extra - 1)) - 1);
    var consumed = 0;
    while (consumed < extra) {
      var next = array[pos++];
      if (next >> 6 !== 0x02) {
        break;
      }
      codePoint = (codePoint << 6) | (next & 0x3f);
      consumed++;
    }

    // Sequence interrupted: step back so the offending byte is re-examined.
    if (consumed !== extra) {
      pos--;
      continue;
    }

    if (codePoint <= 0xffff) {
      text += String.fromCharCode(codePoint);
    } else {
      // Split into a UTF-16 surrogate pair.
      var offset = codePoint - 0x10000;
      text += String.fromCharCode(
        ((offset >> 10) & 0x3ff) + 0xd800,
        (offset & 0x3ff) + 0xdc00
      );
    }
  }

  return {result: text, leftovers: []};
}

该函数返回 {result: "parsed string", leftovers: [list of invalid bytes at the end]}，便于在分块解析字符串时把末尾未完成的字节留给下一块处理。

编辑: 修正了@unHammer 发现的问题。

小开

也许使用 TextDecoder 就足够了。

但 IE 不支持。

// Build a UTF-8 decoder, then decode the bytes carried by `message.data`.
var decoder = new TextDecoder('utf-8');
var decodedMessage = decoder.decode(message.data);

处理非 UTF8文本

在本例中，我们解码俄文文本“Привет, мир!”，意思是“你好，世界！”。在 TextDecoder() 构造函数中，我们指定了 windows-1251 字符编码，它适用于西里尔字母。

    // Bytes of a Cyrillic phrase in the windows-1251 code page.
    let bytes = new Uint8Array([207, 240, 232, 226, 229, 242, 44, 32, 236, 232, 240, 33]);
    // Decoder configured for windows-1251 rather than the default UTF-8.
    let win1251decoder = new TextDecoder('windows-1251');
    console.log(win1251decoder.decode(bytes)); // Привет, мир!

TextDecoder 的接口描述见这里。

从字符串中检索字节数组同样简单:

// Encoder turns a string into bytes; decoder turns bytes back into a string.
const encoder = new TextEncoder();
const decoder = new TextDecoder();

// Convert the text to a byte array.
const byteArray = encoder.encode('Größe');

// Decode the bytes back into a string and print it.
console.log(decoder.decode(byteArray));

如果你使用的是其他编码，则必须在编码时做相应处理。TextEncoder 构造函数中的参数可以是这里列出的任何一种有效编码。

小开

下面是一个解决方案，它可以处理所有 Unicode 代码点，包括高位（4 字节）的值，并且受到所有现代浏览器（IE 以及其他版本高于 5.5 的浏览器）的支持。它使用 decodeURIComponent()，但不使用已弃用的 escape/unescape 函数:

/**
 * Decodes an array of UTF-8 bytes into a string.
 *
 * Every byte is written as a two-digit %XX escape and the resulting text is
 * handed to decodeURIComponent, which performs the UTF-8 decoding.
 *
 * @param {ArrayLike<number>} a - UTF-8 encoded bytes.
 * @returns {string} The decoded text.
 */
function utf8_to_str(a) {
  var escaped = '';
  var index = 0;
  while (index < a.length) {
    // Single-digit hex values are left-padded so each escape is %XX.
    escaped += '%' + a[index].toString(16).padStart(2, '0');
    index += 1;
  }
  return decodeURIComponent(escaped);
}

在 GitHub上测试和使用

从字符串创建 UTF-8:

/**
 * Encodes a string as an array of UTF-8 byte values.
 *
 * encodeURIComponent produces the UTF-8 bytes as %XX escapes and leaves the
 * remaining characters as-is; this walks that output and collects one number
 * per byte.
 *
 * @param {string} s - Text to encode.
 * @returns {number[]} The UTF-8 bytes of `s`.
 */
function utf8_from_str(s) {
  var escaped = encodeURIComponent(s);
  var bytes = [];
  var pos = 0;

  while (pos < escaped.length) {
    if (escaped.charAt(pos) === '%') {
      // %XX escape: the two hex digits are the byte value.
      bytes.push(parseInt(escaped.slice(pos + 1, pos + 3), 16));
      pos += 3;
    } else {
      // Unescaped character: its char code is the byte value.
      bytes.push(escaped.charCodeAt(pos));
      pos += 1;
    }
  }

  return bytes;
}

在 GitHub上测试和使用

小开

更新了 @Albert 的方案，增加了处理表情符号（4 字节序列）的分支。

/**
 * Converts an array of UTF-8 bytes into a JavaScript string, including
 * 4-byte sequences (e.g. emoji).
 *
 * Bytes whose high nibble is 8-11 (continuation bytes without a lead) are
 * skipped. Continuation bytes are not validated, and a missing trailing byte
 * contributes zero bits. A 4-byte sequence that decodes to a value above
 * U+10FFFF yields U+FFFD instead of letting String.fromCodePoint throw.
 *
 * @param {ArrayLike<number>} array - UTF-8 encoded bytes.
 * @returns {string} The decoded text.
 */
function Utf8ArrayToStr(array) {
  var out, i, len, c;
  var char2, char3, char4, codePoint;

  out = "";
  len = array.length;
  i = 0;
  while (i < len) {
    c = array[i++];
    switch (c >> 4) {
      case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
        // 0xxxxxxx
        out += String.fromCharCode(c);
        break;
      case 12: case 13:
        // 110x xxxx   10xx xxxx
        char2 = array[i++];
        out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
        break;
      case 14:
        // 1110 xxxx  10xx xxxx  10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        out += String.fromCharCode(((c & 0x0F) << 12) |
          ((char2 & 0x3F) << 6) |
          ((char3 & 0x3F) << 0));
        break;
      case 15:
        // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        char4 = array[i++];
        codePoint = ((c & 0x07) << 18) | ((char2 & 0x3F) << 12) |
          ((char3 & 0x3F) << 6) | (char4 & 0x3F);
        // String.fromCodePoint throws a RangeError above U+10FFFF.
        out += codePoint > 0x10FFFF ? "\uFFFD" : String.fromCodePoint(codePoint);
        break;
    }
  } // closes the while loop (this brace was missing, which made the snippet a syntax error)

  return out;
}

小开

我认为最简单的方法是使用内置的 JS 函数 decodeURI()/encodeURI()。

// Fragment of a handler that receives a percent-encoded user name and decodes
// it with decodeURI.
// NOTE(review): `usernameReceived` is not defined in this fragment and
// `usernameEncoded` is assigned but never read — presumably the decodeURI
// argument comes from an enclosing scope not shown here; confirm.
function (usernameSent) {
var usernameEncoded = usernameSent; // Current value: utf8
var usernameDecoded = decodeURI(usernameReceived);  // Decoded
// do stuff
}

小开

//String 到 Utf8 ByteBuffer

/**
 * Encodes a string as UTF-8 bytes.
 *
 * encodeURIComponent yields the UTF-8 bytes as %XX escapes; each escape is
 * replaced by the character with that code, so the intermediate string holds
 * exactly one character per byte, which is then copied into a Uint8Array.
 *
 * @param {string} str - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes of `str`.
 */
function strToUTF8(str) {
  const byteChars = encodeURIComponent(str).replace(/%(..)/g, (match, hex) =>
    String.fromCodePoint(parseInt(hex, 16))
  );
  return Uint8Array.from(byteChars, (ch) => ch.codePointAt(0));
}

//Utf8 ByteArray 转换为 string

/**
 * Decodes UTF-8 bytes into a string.
 *
 * Each byte becomes a two-digit %XX escape; decodeURIComponent then decodes
 * the escaped text as UTF-8.
 *
 * @param {number[]|Uint8Array} ba - UTF-8 encoded bytes.
 * @returns {string} The decoded text.
 * @throws {URIError} If the bytes are not valid UTF-8.
 */
function UTF8toStr(ba) {
  // The initial '' must be reduce's second argument (not part of the callback
  // body via the comma operator), and bytes below 0x10 need a leading zero.
  const percentEncoded = ba.reduce(
    (acc, byte) => acc + '%' + byte.toString(16).padStart(2, '0'),
    ''
  );
  return decodeURIComponent(percentEncoded);
}

小开

使用我的 1.6 KB 的库，你可以

// NOTE(review): ToString and FromUTF8 are not defined in this file — presumably
// they come from the answer author's library; confirm before use.
ToString(FromUTF8(Array.from(usernameReceived)))

小开

这是我在进行一次比 UTF-8 编码/解码更具体的谷歌搜索后找到的。因此，如果你在寻找一个用于在不同编码之间转换的库，请看这里。

https://github.com/inexorabletash/text-encoding

// Encode a string into bytes with TextEncoder.
var uint8array = new TextEncoder().encode(str);
// Decode bytes back into a string with a decoder built for `encoding`.
// NOTE(review): `encoding` is not defined here and `str` is read before it is
// assigned — the two lines look like independent usage examples; confirm.
var str = new TextDecoder(encoding).decode(uint8array);

以下内容摘自该仓库的 README：

支持编码规范中的所有编码:

utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr replacement utf-16be utf-16le x-user-defined

(某些编码可能支持其他名称，例如 ascii、 iso-8859-1等，有关每种编码的附加标签，请参阅 Encoding。)

小开

这是一个带有详尽错误报告的解决方案。

它将采用 UTF-8编码的字节数组(其中字节数组表示为数组，每个数字都是0到255之间的整数) 并将生成一个 Unicode 字符的 JavaScript 字符串。

/**
 * Returns the continuation byte at `index` of a multi-byte UTF-8 sequence,
 * after checking that it exists and has the expected format.
 *
 * @param {ArrayLike<number>} value - The UTF-8 bytes being decoded.
 * @param {number} startByteIndex - Index of the sequence's lead byte.
 * @param {string} startBitsStr - Leading bits of the lead byte, for messages.
 * @param {number} additional - Number of bytes the lead byte requires after it.
 * @param {number} index - Index of the byte to fetch.
 * @returns {number} The byte at `index`.
 * @throws {Error} If `index` is past the end of `value`, or (via
 *   checkNextByteFormat) if the byte fails the format check.
 */
function getNextByte(value, startByteIndex, startBitsStr,
  additional, index)
{
  if (index >= value.length) {
    var startByte = value[startByteIndex];
    // Count only the bytes AFTER the lead byte, since the message compares
    // this number with the bytes required "after it".
    var available = value.length - startByteIndex - 1;
    throw new Error("Invalid UTF-8 sequence. Byte " + startByteIndex
      + " with value " + startByte + " (" + String.fromCharCode(startByte)
      + "; binary: " + toBinary(startByte)
      + ") starts with " + startBitsStr + " in binary and thus requires "
      + additional + " bytes after it, but we only have "
      + available + ".");
  }
  var byteValue = value[index];
  checkNextByteFormat(value, startByteIndex, startBitsStr, additional, index);
  return byteValue;
}


/**
 * Verifies that the byte at `index` is a UTF-8 continuation byte, i.e. that
 * its two top bits are `10`.
 *
 * @param {ArrayLike<number>} value - The UTF-8 bytes being decoded.
 * @param {number} startByteIndex - Index of the sequence's lead byte.
 * @param {string} startBitsStr - Leading bits of the lead byte, for messages.
 * @param {number} additional - Number of bytes the lead byte requires after it.
 * @param {number} index - Index of the byte to check.
 * @throws {Error} If the byte at `index` does not start with binary `10`.
 */
function checkNextByteFormat(value, startByteIndex, startBitsStr,
  additional, index)
{
  // 0xC0 keeps the two top bits; a continuation byte has them equal to 10.
  if ((value[index] & 0xC0) != 0x80) {
    var startByte = value[startByteIndex];
    var wrongByte = value[index];
    throw new Error("Invalid UTF-8 byte sequence. Byte " + startByteIndex
      + " with value " + startByte + " (" +String.fromCharCode(startByte)
      + "; binary: " + toBinary(startByte) + ") starts with "
      + startBitsStr + " in binary and thus requires " + additional
      + " additional bytes, each of which should start with 10 in binary."
      + " However byte " + (index - startByteIndex)
      + " after it with value " + wrongByte + " ("
      + String.fromCharCode(wrongByte) + "; binary: " + toBinary(wrongByte)
      +") does not start with 10 in binary.");
  }
}


/**
 * Encodes a JavaScript string as an array of UTF-8 byte values.
 *
 * Despite its name, this function produces UTF-8 from a string. The string is
 * read by code point, so characters outside the Basic Multilingual Plane
 * (stored as surrogate pairs) become a single 4-byte sequence instead of two
 * 3-byte sequences. An unpaired surrogate is still encoded as a 3-byte
 * sequence.
 *
 * @param {string} str - Text to encode.
 * @returns {number[]} The UTF-8 bytes, each an integer in 0-255.
 */
function fromUtf8 (str) {
  var value = [];
  for (var index = 0; index < str.length; index++) {
    var code = str.codePointAt(index);
    if (code <= 0x7F) {
      value.push(code);
    } else if (code <= 0x7FF) {
      value.push(((code >> 6) & 0x1F) | 0xC0);
      value.push((code & 0x3F) | 0x80);
    } else if (code <= 0xFFFF) {
      value.push(((code >> 12) & 0x0F) | 0xE0);
      value.push(((code >> 6) & 0x3F) | 0x80);
      value.push((code & 0x3F) | 0x80);
    } else {
      // codePointAt never returns more than 0x10FFFF, so four bytes suffice.
      value.push(((code >> 18) & 0x07) | 0xF0);
      value.push(((code >> 12) & 0x3F) | 0x80);
      value.push(((code >> 6) & 0x3F) | 0x80);
      value.push((code & 0x3F) | 0x80);
      // The code point used two UTF-16 units; skip the low surrogate.
      index++;
    }
  }
  return value;
}

小开

你应该用 decodeURI。

https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI

就这么简单:

// Decode the percent-encoded escapes in the URL; the expected result is shown
// in the comment below the call.
decodeURI('https://developer.mozilla.org/ru/docs/JavaScript_%D1%88%D0%B5%D0%BB%D0%BB%D1%8B');
// "https://developer.mozilla.org/ru/docs/JavaScript_шеллы"

考虑在 try/catch 块中使用它，以免漏掉 URIError。

此外，所有浏览器都完全支持它。

小开

正如其他人所建议的那样，最好使用 Encoding API。但是如果你需要支持 IE（出于某些奇怪的原因），MDN 推荐这个仓库：FastestSmallestTextEncoderDecoder。

如果你需要使用 polyfill 库:

    import {encode, decode} from "fastestsmallesttextencoderdecoder";

然后（不管是否使用 polyfill）进行编码和解码:

    // TextEncoder.encode: string in, Uint8Array of UTF-8 bytes out.
    const encoded = new TextEncoder().encode('€');
    console.log(encoded);

    // TextDecoder.decode: ArrayBuffer or ArrayBufferView in, string out.
    const decoded = new TextDecoder().decode(encoded);
    console.log(decoded);

小开

// Decode a one-byte buffer (97) with a default TextDecoder and print the result.
const decoder = new TextDecoder();
console.log(decoder.decode(Uint8Array.of(97)));

MDN 资源链接