HTML实体解码

我如何编码和解码HTML实体使用JavaScript或JQuery?

var varTitle = "Chris' corner";

我希望它是:

var varTitle = "Chris' corner";
581043 次浏览

你可以尝试这样做:

var Title = $('<textarea />').html("Chris&apos; corner").text();
console.log(Title);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>

JS Fiddle.

A more interactive version:

$('form').submit(function() {
var theString = $('#string').val();
var varTitle = $('<textarea />').html(theString).text();
$('#output').text(varTitle);
return false;
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<form action="#" method="post">
<fieldset>
<label for="string">Enter a html-encoded string to decode</label>
<input type="text" name="string" id="string" />
</fieldset>
<fieldset>
<input type="submit" value="decode" />
</fieldset>
</form>


<div id="output"></div>

JS小提琴

如何解码HTML实体使用jQuery?所述,在页面中注入不受信任的HTML是危险的。

一个替代方案是使用PHP的html_entity_decode (from http://phpjs.org/functions/html_entity_decode:424)的纯javascript实现。这个例子应该是这样的:

var varTitle = html_entity_decode("Chris&apos; corner");

我不建议使用jQuery代码作为答案。虽然它不会将要解码的字符串插入到页面中,但它确实会创建脚本和HTML元素等内容。这代码比我们需要的多。相反,我建议使用更安全、更优化的函数。

var decodeEntities = (function() {
// this prevents any overhead from creating the object each time
var element = document.createElement('div');


function decodeHTMLEntities (str) {
if(str && typeof str === 'string') {
// strip script/html tags
str = str.replace(/<script[^>]*>([\S\s]*?)<\/script>/gmi, '');
str = str.replace(/<\/?\w(?:[^"'>]|"[^"]*"|'[^']*')*>/gmi, '');
element.innerHTML = str;
str = element.textContent;
element.textContent = '';
}


return str;
}


return decodeHTMLEntities;
})();

http://jsfiddle.net/LYteC/4/

要使用这个函数,只需调用decodeEntities("&amp;"),它将使用与jQuery版本相同的底层技术——但是没有jQuery的开销,并且在清除输入中的HTML标记之后。关于如何过滤HTML标签,请参阅已接受答案的Mike Samuel的评论

这个函数可以很容易地作为jQuery插件使用,只需在您的项目中添加以下行即可。

jQuery.decodeEntities = decodeEntities;

我知道我有点晚了,但我认为我可以提供以下片段作为我如何使用jQuery解码HTML实体的示例:

var varTitleE = "Chris&apos; corner";
var varTitleD = $("<div/>").html(varTitleE).text();


console.log(varTitleE + " vs. " + varTitleD);​​​​​​​​​​​

不要忘记启动检查器/firebug以查看控制台结果——或者简单地将console.log(…)替换为/alert(…)

也就是说,以下是我的控制台通过谷歌Chrome检查器读取的内容:

Chris&apos; corner vs. Chris' corner

就像Robert K说的,不要使用jQuery.html().text()来解码html实体,因为这是不安全的,因为用户输入永远不能访问DOM。阅读XSS,了解为什么这是不安全的。

相反,可以尝试带有逃避unescape方法的Underscore.js实用带库:

< a href = " http://underscorejs.org/逃脱“> < >强_.escape (string) < /强> < / >

转义插入HTML的字符串,替换&<>"`'字符。

_.escape('Curly, Larry & Moe');
=> "Curly, Larry &amp; Moe"

< a href = " http://underscorejs.org/ unescape " > < >强_.unescape (string) < /强> < / >

与escape相反,将&amp;&lt;&gt;&quot;&#96;&#x27;替换为未转义的对应对象。

_.unescape('Curly, Larry &amp; Moe');
=> "Curly, Larry & Moe"

要支持解码更多字符,只需复制下划线unescape方法并向映射中添加更多字符。

受Robert K的解决方案的启发,这个版本不剥离HTML标记,而且同样安全。

var decode_entities = (function() {
// Remove HTML Entities
var element = document.createElement('div');


function decode_HTML_entities (str) {


if(str && typeof str === 'string') {


// Escape HTML before decoding for HTML Entities
str = escape(str).replace(/%26/g,'&').replace(/%23/g,'#').replace(/%3B/g,';');


element.innerHTML = str;
if(element.innerText){
str = element.innerText;
element.innerText = '';
}else{
// Firefox support
str = element.textContent;
element.textContent = '';
}
}
return unescape(str);
}
return decode_HTML_entities;
})();

这里有一个不需要创建div的快速方法,并解码“最常见的”HTML转义字符:

function decodeHTMLEntities(text) {
var entities = [
['amp', '&'],
['apos', '\''],
['#x27', '\''],
['#x2F', '/'],
['#39', '\''],
['#47', '/'],
['lt', '<'],
['gt', '>'],
['nbsp', ' '],
['quot', '"']
];


for (var i = 0, max = entities.length; i < max; ++i)
text = text.replace(new RegExp('&'+entities[i][0]+';', 'g'), entities[i][1]);


return text;
}

我认为这与我们选择的解决方案完全相反。

var decoded = $("<div/>").text(encodedStr).html();

试试吧!

要做到这一点,在纯javascript没有jquery或预定义一切,你可以循环编码的html字符串通过元素innerHTML和innerText(/textContent)属性的每解码步骤,这是必需的:

<html>
<head>
<title>For every decode step, cycle through innerHTML and innerText </title>
<script>
function decode(str) {
var d = document.createElement("div");
d.innerHTML = str;
return typeof d.innerText !== 'undefined' ? d.innerText : d.textContent;
}
</script>
</head>
<body>
<script>
var encodedString = "&lt;p&gt;name&lt;/p&gt;&lt;p&gt;&lt;span style=\"font-size:xx-small;\"&gt;ajde&lt;/span&gt;&lt;/p&gt;&lt;p&gt;&lt;em&gt;da&lt;/em&gt;&lt;/p&gt;";
</script>
<input type=button onclick="document.body.innerHTML=decode(encodedString)"/>
</body>
</html>
受Robert K的解决方案的启发,剥离html标签和阻止执行脚本和事件处理程序,如:<img src=fake onerror="prompt(1)"> 在最新的Chrome, FF, IE上测试(应该可以从IE9工作,但还没有测试)
var decodeEntities = (function () {
//create a new html document (doesn't execute script tags in child elements)
var doc = document.implementation.createHTMLDocument("");
var element = doc.createElement('div');


function getText(str) {
element.innerHTML = str;
str = element.textContent;
element.textContent = '';
return str;
}


function decodeHTMLEntities(str) {
if (str && typeof str === 'string') {
var x = getText(str);
while (str !== x) {
str = x;
x = getText(x);
}
return x;
}
}
return decodeHTMLEntities;
})();

简单地调用:

decodeEntities('<img src=fake onerror="prompt(1)">');
decodeEntities("<script>alert('aaa!')</script>");

jQuery提供了一种编码和解码html实体的方法。

如果你使用“<div/>”标签,它会删除所有的html。

function htmlDecode(value) {
return $("<div/>").html(value).text();
}


function htmlEncode(value) {
return $('<div/>').text(value).html();
}

如果你使用“<textarea/>”标签,它将保留html标签。

function htmlDecode(value) {
return $("<textarea/>").html(value).text();
}


function htmlEncode(value) {
return $('<textarea/>').text(value).html();
}

为了在列表中添加另一个“受Robert K启发”,这里是另一个安全版本,它不剥离HTML标签. c。它不是通过HTML解析器运行整个字符串,而是只提取实体并转换它们。

var decodeEntities = (function() {
// this prevents any overhead from creating the object each time
var element = document.createElement('div');


// regular expression matching HTML entities
var entity = /&(?:#x[a-f0-9]+|#[0-9]+|[a-z0-9]+);?/ig;


return function decodeHTMLEntities(str) {
// find and replace all the html entities
str = str.replace(entity, function(m) {
element.innerHTML = m;
return element.textContent;
});


// reset the value
element.textContent = '';


return str;
}
})();

因为@Robert K和@mattcasey都有很好的代码,我想在这里贡献一个CoffeeScript版本,以防将来有人会使用它:

    String::unescape = (strict = false) ->
###
# Take escaped text, and return the unescaped version
#
# @param string str | String to be used
# @param bool strict | Stict mode will remove all HTML
#
# Test it here:
# https://jsfiddle.net/tigerhawkvok/t9pn1dn5/
#
# Code: https://gist.github.com/tigerhawkvok/285b8631ed6ebef4446d
###
# Create a dummy element
element = document.createElement("div")
decodeHTMLEntities = (str) ->
if str? and typeof str is "string"
unless strict is true
# escape HTML tags
str = escape(str).replace(/%26/g,'&').replace(/%23/g,'#').replace(/%3B/g,';')
else
str = str.replace(/<script[^>]*>([\S\s]*?)<\/script>/gmi, '')
str = str.replace(/<\/?\w(?:[^"'>]|"[^"]*"|'[^']*')*>/gmi, '')
element.innerHTML = str
if element.innerText
# Do we support innerText?
str = element.innerText
element.innerText = ""
else
# Firefox
str = element.textContent
element.textContent = ""
unescape(str)
# Remove encoded or double-encoded tags
fixHtmlEncodings = (string) ->
string = string.replace(/\&amp;#/mg, '&#') # The rest, for double-encodings
string = string.replace(/\&quot;/mg, '"')
string = string.replace(/\&quote;/mg, '"')
string = string.replace(/\&#95;/mg, '_')
string = string.replace(/\&#39;/mg, "'")
string = string.replace(/\&#34;/mg, '"')
string = string.replace(/\&#62;/mg, '>')
string = string.replace(/\&#60;/mg, '<')
string
# Run it
tmp = fixHtmlEncodings(this)
decodeHTMLEntities(tmp)

参见https://jsfiddle.net/tigerhawkvok/t9pn1dn5/7/https://gist.github.com/tigerhawkvok/285b8631ed6ebef4446d(包括已编译的JS,与此答案相比可能已更新)

以下是完整版本

function htmldecode(s){
window.HTML_ESC_MAP = {
"nbsp":" ","iexcl":"¡","cent":"¢","pound":"£","curren":"¤","yen":"¥","brvbar":"¦","sect":"§","uml":"¨","copy":"©","ordf":"ª","laquo":"«","not":"¬","reg":"®","macr":"¯","deg":"°","plusmn":"±","sup2":"²","sup3":"³","acute":"´","micro":"µ","para":"¶","middot":"·","cedil":"¸","sup1":"¹","ordm":"º","raquo":"»","frac14":"¼","frac12":"½","frac34":"¾","iquest":"¿","Agrave":"À","Aacute":"Á","Acirc":"Â","Atilde":"Ã","Auml":"Ä","Aring":"Å","AElig":"Æ","Ccedil":"Ç","Egrave":"È","Eacute":"É","Ecirc":"Ê","Euml":"Ë","Igrave":"Ì","Iacute":"Í","Icirc":"Î","Iuml":"Ï","ETH":"Ð","Ntilde":"Ñ","Ograve":"Ò","Oacute":"Ó","Ocirc":"Ô","Otilde":"Õ","Ouml":"Ö","times":"×","Oslash":"Ø","Ugrave":"Ù","Uacute":"Ú","Ucirc":"Û","Uuml":"Ü","Yacute":"Ý","THORN":"Þ","szlig":"ß","agrave":"à","aacute":"á","acirc":"â","atilde":"ã","auml":"ä","aring":"å","aelig":"æ","ccedil":"ç","egrave":"è","eacute":"é","ecirc":"ê","euml":"ë","igrave":"ì","iacute":"í","icirc":"î","iuml":"ï","eth":"ð","ntilde":"ñ","ograve":"ò","oacute":"ó","ocirc":"ô","otilde":"õ","ouml":"ö","divide":"÷","oslash":"ø","ugrave":"ù","uacute":"ú","ucirc":"û","uuml":"ü","yacute":"ý","thorn":"þ","yuml":"ÿ","fnof":"ƒ","Alpha":"Α","Beta":"Β","Gamma":"Γ","Delta":"Δ","Epsilon":"Ε","Zeta":"Ζ","Eta":"Η","Theta":"Θ","Iota":"Ι","Kappa":"Κ","Lambda":"Λ","Mu":"Μ","Nu":"Ν","Xi":"Ξ","Omicron":"Ο","Pi":"Π","Rho":"Ρ","Sigma":"Σ","Tau":"Τ","Upsilon":"Υ","Phi":"Φ","Chi":"Χ","Psi":"Ψ","Omega":"Ω","alpha":"α","beta":"β","gamma":"γ","delta":"δ","epsilon":"ε","zeta":"ζ","eta":"η","theta":"θ","iota":"ι","kappa":"κ","lambda":"λ","mu":"μ","nu":"ν","xi":"ξ","omicron":"ο","pi":"π","rho":"ρ","sigmaf":"ς","sigma":"σ","tau":"τ","upsilon":"υ","phi":"φ","chi":"χ","psi":"ψ","omega":"ω","thetasym":"ϑ","upsih":"ϒ","piv":"ϖ","bull":"•","hellip":"…","prime":"′","Prime":"″","oline":"‾","frasl":"⁄","weierp":"℘","image":"ℑ","real":"ℜ","trade":"™","alefsym":"ℵ","larr":"←","uarr":"↑","rarr":"→","darr":"↓","harr":"↔","crarr":"↵","lArr":"⇐","uArr":"⇑","rArr":"⇒","dArr":"⇓","hArr":"⇔","forall":"∀","part":"∂","exist":"∃","empty":"∅","nabla":"∇","isin":"∈","notin":"∉","ni":"∋","prod":"∏","sum":"∑","minus":"−","lowast":"∗","radic":"√","prop":"∝","infin":"∞","ang":"∠","and":"∧","or":"∨","cap":"∩","cup":"∪","int":"∫","there4":"∴","sim":"∼","cong":"≅","asymp":"≈","ne":"≠","equiv":"≡","le":"≤","ge":"≥","sub":"⊂","sup":"⊃","nsub":"⊄","sube":"⊆","supe":"⊇","oplus":"⊕","otimes":"⊗","perp":"⊥","sdot":"⋅","lceil":"⌈","rceil":"⌉","lfloor":"⌊","rfloor":"⌋","lang":"〈","rang":"〉","loz":"◊","spades":"♠","clubs":"♣","hearts":"♥","diams":"♦","\"":"quot","amp":"&","lt":"<","gt":">","OElig":"Œ","oelig":"œ","Scaron":"Š","scaron":"š","Yuml":"Ÿ","circ":"ˆ","tilde":"˜","ndash":"–","mdash":"—","lsquo":"‘","rsquo":"’","sbquo":"‚","ldquo":"“","rdquo":"”","bdquo":"„","dagger":"†","Dagger":"‡","permil":"‰","lsaquo":"‹","rsaquo":"›","euro":"€"};
if(!window.HTML_ESC_MAP_EXP)
window.HTML_ESC_MAP_EXP = new RegExp("&("+Object.keys(HTML_ESC_MAP).join("|")+");","g");
return s?s.replace(window.HTML_ESC_MAP_EXP,function(x){
return HTML_ESC_MAP[x.substring(1,x.length-1)]||x;
}):s;
}

使用

htmldecode("&sum;&nbsp;&gt;&euro;");

@William Lahti的回答有一个更实用的方法:

var entities = {
'amp': '&',
'apos': '\'',
'#x27': '\'',
'#x2F': '/',
'#39': '\'',
'#47': '/',
'lt': '<',
'gt': '>',
'nbsp': ' ',
'quot': '"'
}


function decodeHTMLEntities (text) {
return text.replace(/&([^;]+);/gm, function (match, entity) {
return entities[entity] || match
})
}

原文作者的答案在这里

这是我最喜欢的解码HTML字符的方法。使用此代码的优点是还保留了标记。

function decodeHtml(html) {
var txt = document.createElement("textarea");
txt.innerHTML = html;
return txt.value;
}

例如:http://jsfiddle.net/k65s3/

输入:

Entity:&nbsp;Bad attempt at XSS:<script>alert('new\nline?')</script><br>

输出:

Entity: Bad attempt at XSS:<script>alert('new\nline?')</script><br>

这是另一个版本:

function convertHTMLEntity(text){
const span = document.createElement('span');


return text
.replace(/&[#A-Za-z0-9]+;/gi, (entity,position,text)=> {
span.innerHTML = entity;
return span.innerText;
});
}


console.log(convertHTMLEntity('Large &lt; &#163; 500'));