// Entry point: gather the text found beneath the document's <body> element.
var bodyContent = document.getElementsByTagName('body')[0];
var result = appendTextNodes(bodyContent);
/**
 * Concatenates the values of all text nodes found beneath `element`,
 * in document order.
 *
 * @param {Node} element - Node whose `childNodes` are walked recursively.
 * @returns {string} The combined text of every descendant text node
 *   (an empty string when there are none).
 */
function appendTextNodes(element) {
  // DOM nodeType value for text nodes (Node.TEXT_NODE).
  const TEXT_NODE = 3;
  let text = '';

  // Loop through the childNodes of the passed-in element.
  for (let i = 0, len = element.childNodes.length; i < len; i++) {
    const node = element.childNodes[i];

    // Append the node's value if it is a text node.
    if (node.nodeType === TEXT_NODE) {
      text += node.nodeValue;
    }

    // Recurse into the node's children and keep what the recursion found;
    // dropping the return value would lose all nested text.
    if (node.childNodes.length > 0) {
      text += appendTextNodes(node);
    }
  }

  return text;
}
// Sample markup that is flattened to plain text by the replacements below.
let str = 'this string has <i>html</i> code i want to <b>remove</b><br>Link Number 1 -><a href="http://www.bbc.co.uk">BBC</a> Link Number 1<br><p>Now back to normal text and stuff</p>';

// Line breaks: <br>, <br/> and <br /> all become a newline.
str = str.replace(/<br\s*\/?>/gi, '\n');

// Opening <p ...> tags become a newline. `[^>]*` stops at the end of the tag;
// a greedy `.*` would swallow the paragraph text up to the last `>` on the line.
str = str.replace(/<p\b[^>]*>/gi, '\n');

// Links are rewritten as " text (Link->url) ". Every part of the pattern is
// confined to a single tag so several links on one line are handled separately.
str = str.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, ' $2 (Link->$1) ');

// Drop every remaining tag.
str = str.replace(/<[^>]*>/g, '');
str变量是这样开始的:
this string has <i>html</i> code i want to <b>remove</b><br>Link Number 1 -><a href="http://www.bbc.co.uk">BBC</a> Link Number 1<br><p>Now back to normal text and stuff</p>
然后代码运行后,它看起来像这样:-
this string has html code i want to remove
Link Number 1 -> BBC (Link->http://www.bbc.co.uk) Link Number 1
Now back to normal text and stuff
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html><!--comment-->
<head>
<title>This is my title</title><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"><style>
body {margin-top: 15px;}a { color: #D80C1F; font-weight:bold; text-decoration:none; }
</style></head>
<body><center>This string has <i>html</i> code i want to <b>remove</b><br>In this line <a href="http://www.bbc.co.uk">BBC</a> with link is mentioned.<br/>Now back to "normal text" and stuff using &lt;html encoding&gt;</center></body></html>
/**
 * Converts an HTML string to plain text using regular-expression rewrites.
 * This is a best-effort text extraction, not an HTML sanitizer.
 *
 * @param {string} html - Markup to flatten; coerced to a string.
 * @returns {string} The text with tags removed, links rendered as
 *   "text (url)", runs of blank lines and spaces collapsed, and the basic
 *   entities (nbsp, amp, quot, lt, gt) decoded.
 */
function htmlToPlainText(html) {
  const entities = { nbsp: ' ', amp: '&', quot: '"', lt: '<', gt: '>' };

  return String(html)
    // Remove SCRIPT and STYLE elements together with their content. The lazy
    // `[\s\S]*?` stops at the first closing tag, so text between two separate
    // script/style blocks is kept.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
    // Replace BR tags (<br>, <br/>, <br />) with a line break.
    .replace(/<br\s*\/?>/gi, '\n')
    // Replace opening P tags with a line break, keeping the paragraph text.
    .replace(/<p\b[^>]*>/gi, '\n')
    // Rewrite links as " text (url)".
    .replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, ' $2 ($1)')
    // Remove every remaining tag (including comments and the doctype).
    .replace(/<[^>]*>/g, '')
    // Collapse runs of two or more line breaks into exactly one blank line.
    .replace(/(?:(?:\r\n|\r|\n)\s*){2,}/g, '\n\n')
    // Collapse runs of spaces into a single space.
    .replace(/ +(?= )/g, '')
    // Decode the basic entities last and in a single pass, so escaped markup
    // is not stripped as a tag and "&amp;lt;" is not decoded twice.
    .replace(/&(nbsp|amp|quot|lt|gt);/gi, (match, name) => entities[name.toLowerCase()]);
}

/**
 * Reads markup from the element with id "input", converts it to plain text
 * and writes the result to the element with id "output".
 */
function convertHtmlToText() {
  const inputText = document.getElementById('input').value;
  document.getElementById('output').value = htmlToPlainText(inputText);
}
// Markup to flatten into plain text.
var content = "<p>checking the html source </p><p> </p><p>with </p><p>all</p><p>the html </p><p>content</p>";

// jQuery builds elements from the markup string; .text() returns the combined
// text content of the matched elements and their descendants.
var text = $(content).text();

// Check the extracted text in the console. This call has to be its own
// statement; when fused onto the end of a `//` comment it never runs.
console.log(text);

// Write the plain text into the text area with id "text_area_id".
// NOTE(review): `cj` is presumably a jQuery alias different from `$` used
// above — confirm which alias the host page actually defines.
cj("#text_area_id").val(text);
// Extract the text of an HTML fragment with htmlparser2's streaming parser.
var htmlparser = require('htmlparser2');

// Fragment to parse.
var body = '<p><div>This is </div>a <span>simple </span> <img src="test"></img>example.</p>';

// Text chunks, collected in the order the parser reports them.
var result = [];

var parser = new htmlparser.Parser(
  {
    // Called for each run of text the parser encounters.
    ontext: (chunk) => {
      result.push(chunk);
    },
  },
  { decodeEntities: true }
);

parser.write(body);
parser.end();

// The joined chunks form the plain text of the fragment.
result.join('');
const text = `<html lang="en"><head><style type="text/css">*{color:red}</style><script>alert('hello')</script></head><body><b>This is some text</b><br/><body></html>`;

/**
 * Strips markup from an HTML string.
 *
 * @param {string} html - Markup to strip.
 * @returns {string} The text with style/script elements (and their content)
 *   and all tags removed, and with line breaks that are followed by leading
 *   spaces deleted.
 */
function stripHtmlTags(html) {
  return html
    // Remove style elements and their content. `[\s\S]*?` is used because `.`
    // never matches a line break, and the lazy quantifier stops at the first
    // closing tag rather than the last one in the document.
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '')
    // Remove script elements and their content.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '')
    // Remove all opening, closing and orphan HTML tags.
    .replace(/<[^>]+>/g, '')
    // Remove leading spaces and repeated CR/LF.
    .replace(/([\r\n]+ +)+/g, '');
}

// String.prototype.replace returns a new string, so the result must be kept.
const plainText = stripHtmlTags(text);
// Unwrap every <div> in the document: re-insert its inner markup right after
// it, then remove the <div> itself.
var div = document.getElementsByTagName('div');

// getElementsByTagName returns a live collection that shrinks as elements are
// removed, so an index-based loop would skip every other <div>. Always take
// the first remaining element until none are left.
while (div.length > 0) {
  var current = div[0];
  current.insertAdjacentHTML('afterend', current.innerHTML);
  // Remove through the element's actual parent; document.body.removeChild
  // throws for a <div> that is not a direct child of <body>.
  current.parentNode.removeChild(current);
}
/**
 * Escapes tag-like `<...>` spans so they display as text instead of markup.
 * A `<` that is never closed by a `>` is left untouched.
 *
 * @param {string} str - Text that may contain HTML tags.
 * @returns {string} A copy of `str` with the angle brackets of each
 *   `<...>` span replaced by the `&lt;` / `&gt;` entities.
 */
function cleanHTML(str) {
  // String.prototype.replace does not modify `str`; the result is returned.
  return str.replace(/<([^>]*)>/g, '&lt;$1&gt;');
}
/**
 * Reverses tag escaping: turns each `&lt;...&gt;` span back into a real
 * `<...>` tag. An `&lt;` with no following `&gt;` is left untouched.
 *
 * @param {string} str - Text containing escaped tags.
 * @returns {string} A copy of `str` with the escaped spans restored.
 */
function uncleanHTML(str) {
  // String.prototype.replace does not modify `str`; the result is returned.
  return str.replace(/&lt;([\s\S]*?)&gt;/g, '<$1>');
}
方法二:
/**
 * Escapes every angle bracket so markup displays as text.
 *
 * @param {string} str - Text that may contain HTML.
 * @returns {string} A copy of `str` with each `<` replaced by `&lt;` and
 *   each `>` replaced by `&gt;`.
 */
function cleanHTML(str) {
  // String.prototype.replace does not modify `str`; the result is returned.
  return str.replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
/**
 * Reverses angle-bracket escaping.
 *
 * @param {string} str - Text containing `&lt;` / `&gt;` entities.
 * @returns {string} A copy of `str` with each `&lt;` replaced by `<` and
 *   each `&gt;` replaced by `>`.
 */
function uncleanHTML(str) {
  // String.prototype.replace does not modify `str`; the result is returned.
  return str.replace(/&lt;/g, '<').replace(/&gt;/g, '>');
}
/**
 * Pulls the visible text out of an HTML string without using the DOM.
 *
 * The input is cut at every `>`; for each piece, only the part before the
 * next `<` is kept. Pieces that contain `=` or are blank after trimming are
 * discarded, and the remaining pieces are concatenated with no separator.
 *
 * @param {string|null|undefined} t - HTML source.
 * @returns {string|undefined} The concatenated text, or `undefined` when
 *   `t` is null or undefined.
 */
const getTextFromHtml = (t) => {
  if (t == null) {
    return undefined;
  }

  const pieces = [];
  for (const segment of t.split('>')) {
    const [leading] = segment.split('<');
    if (leading.includes('=') || leading.trim() === '') {
      continue;
    }
    pieces.push(leading);
  }
  return pieces.join('');
};
// Sample rich-text markup used to demonstrate getTextFromHtml.
const test =
  '<p>This <strong>one</strong> <em>time</em>,</p><br /><blockquote>I went to</blockquote><ul><li>band <a href="https://workingclasshistory.com" rel="noopener noreferrer" target="_blank">camp</a>…</li></ul><p>I edited this as a reviewer just to double check</p>';

// => 'This onetime,I went toband camp…I edited this as a reviewer just to double check'
getTextFromHtml(test);