从字符串中删除 HTML 标记

如何从字符串中删除 HTML 标记,以便输出干净的文本?

// Remove every "<...>" span from `string` with a regular-expression search.
let str = string.stringByReplacingOccurrencesOfString(
    "<[^>]+>",
    withString: "",
    options: .RegularExpressionSearch,
    range: nil
)
print(str)
61442 次浏览

嗯,我试了你的代码,它在一个小例子上可以正常工作:

// Small sample document used to try out the tag-stripping replacement.
var string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
// Each "<...>" tag is replaced by the empty string; the spaces between tags are kept.
let str = string.stringByReplacingOccurrencesOfString(
    "<[^>]+>",
    withString: "",
    options: .RegularExpressionSearch,
    range: nil
)
print(str)

// Output: "   My First Heading My first paragraph.  "
// (three leading and two trailing spaces are left over from the gaps between tags)

你能举一个出问题的例子吗?

Swift 4和5版本:

// Same sample document, stripped with the Swift 4/5 spelling of the API.
var string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
// Regular-expression replacement: every "<...>" tag becomes the empty string.
let str = string.replacingOccurrences(
    of: "<[^>]+>",
    with: "",
    options: .regularExpression,
    range: nil
)

由于 HTML 不是正则语言(regular language),而是上下文无关语言(context-free language),所以不能用正则表达式可靠地解析它。

我会考虑改用 NSAttributedString。

// Sample markup: running text with several anchor elements and a trailing <br />.
let htmlString = "LCD Soundsystem was the musical project of producer <a href='http://www.last.fm/music/James+Murphy' class='bbcode_artist'>James Murphy</a>, co-founder of <a href='http://www.last.fm/tag/dance-punk' class='bbcode_tag' rel='tag'>dance-punk</a> label <a href='http://www.last.fm/label/DFA' class='bbcode_label'>DFA</a> Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of <a href='http://www.last.fm/tag/alternative%20dance' class='bbcode_tag' rel='tag'>alternative dance</a> and <a href='http://www.last.fm/tag/post%20punk' class='bbcode_tag' rel='tag'>post punk</a>, along with elements of <a href='http://www.last.fm/tag/disco' class='bbcode_tag' rel='tag'>disco</a> and other styles. <br />"
// Encode the markup as UTF-8; the options below declare the same encoding and the HTML document type.
let htmlStringData = htmlString.dataUsingEncoding(NSUTF8StringEncoding)!
let options: [String: AnyObject] = [
    NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
    NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding
]
let attributedHTMLString = try! NSAttributedString(
    data: htmlStringData,
    options: options,
    documentAttributes: nil
)
// The attributed string's plain `string` is the text without markup.
let string = attributedHTMLString.string

或者,就像伊尔沙德 · 穆罕默德(Irshad Mohamed)在评论中说的那样:

// Variant from the comments: encode `htmlString` as `.unicode` and pass only the document type.
let attributed = try NSAttributedString(
    data: htmlString.data(using: .unicode)!,
    options: [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType],
    documentAttributes: nil
)
print(attributed.string)

我使用以下扩展来删除特定的 HTML 元素:

extension String {
    /// Returns a copy of the string with every opening and closing `tag` element marker removed.
    ///
    /// Matching is case-insensitive and anchored on a word boundary, so `"a"` does not match `<abbr>`.
    /// The tag body is matched with `[^>]*` so the match stops at the tag's own closing `>`;
    /// the previous `[^<]*` could run on into the following text when that text contained `>`.
    ///
    /// - Parameter tag: Tag name; it is interpolated into the regular expression unescaped.
    /// - Returns: The string without the matched tags; the text between them is kept.
    func deleteHTMLTag(tag:String) -> String {
        return self.stringByReplacingOccurrencesOfString("(?i)</?\(tag)\\b[^>]*>", withString: "", options: .RegularExpressionSearch, range: nil)
    }

    /// Returns a copy of the string with the markers of every tag in `tags` removed, one tag at a time.
    ///
    /// - Parameter tags: Tag names, each handled as in `deleteHTMLTag`.
    func deleteHTMLTags(tags:[String]) -> String {
        var mutableString = self
        for tag in tags {
            mutableString = mutableString.deleteHTMLTag(tag)
        }
        return mutableString
    }
}

这使得只从字符串中删除 <a>标记成为可能,例如:

// The inner double quotes must be escaped for the literal to compile.
let string = "my html <a href=\"\">link text</a>"
let withoutHTMLString = string.deleteHTMLTag("a") // Will be "my html link text"

Swift 4:

extension String {
    /// Returns a copy of the string with every opening and closing `tag` element marker removed.
    ///
    /// Matching is case-insensitive and anchored on a word boundary, so `"a"` does not match `<abbr>`.
    /// The tag body is matched with `[^>]*` so the match stops at the tag's own closing `>`;
    /// the previous `[^<]*` could run on into the following text when that text contained `>`
    /// (for example `<b>x > y</b>` lost the `x >`).
    ///
    /// - Parameter tag: Tag name; it is interpolated into the regular expression unescaped.
    /// - Returns: The string without the matched tags; the text between them is kept.
    func deleteHTMLTag(tag: String) -> String {
        return replacingOccurrences(of: "(?i)</?\(tag)\\b[^>]*>", with: "", options: .regularExpression, range: nil)
    }

    /// Returns a copy of the string with the markers of every tag in `tags` removed, one tag at a time.
    ///
    /// - Parameter tags: Tag names, each handled as in `deleteHTMLTag(tag:)`.
    func deleteHTMLTags(tags: [String]) -> String {
        return tags.reduce(self) { partial, tag in
            partial.deleteHTMLTag(tag: tag)
        }
    }
}

这是 Mohamed 的解决方案,只是在 Swift 4 中改写成了 String 扩展。

extension String {

    /// Returns the plain-text content of this string interpreted as HTML, or `nil` if the import throws.
    ///
    /// The bytes handed to the importer are UTF-8 and `.characterEncoding` declares UTF-8, so the
    /// declared encoding always matches the data (previously the data was encoded with `.unicode`
    /// while UTF-8 was declared).
    ///
    /// NOTE(review): Apple's documentation says the HTML importer should not be called from a
    /// background thread; confirm that callers invoke this on the main thread.
    ///
    /// - Returns: The text without markup, or `nil` when `NSAttributedString` fails to read the data.
    func stripOutHtml() -> String? {
        // Building Data from the UTF-8 view cannot fail, so no optional handling is needed here.
        let data = Data(self.utf8)
        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]
        do {
            let attributed = try NSAttributedString(data: data, options: options, documentAttributes: nil)
            return attributed.string
        } catch {
            return nil
        }
    }
}

更新至 Swift 4:

// Encode the HTML source; `.unicode` is also the encoding declared in the reading options below.
guard let htmlStringData = htmlString.data(using: .unicode) else { fatalError() }


// The two entries must be separated by a comma for the dictionary literal to compile.
let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
    .documentType: NSAttributedString.DocumentType.html,
    .characterEncoding: String.Encoding.unicode.rawValue
]


// `try!` crashes if the importer throws; the plain `string` of the result is the text without markup.
let attributedHTMLString = try! NSAttributedString(data: htmlStringData, options: options, documentAttributes: nil)
let string = attributedHTMLString.string
extension String {
    /// The string with every `<...>` span removed by a regular-expression replacement.
    var htmlStripped: String {
        replacingOccurrences(
            of: "<[^>]+>",
            with: "",
            options: .regularExpression,
            range: nil
        )
    }
}

编码愉快

与使用 NSAttributedString 的 HTML 转换相比,我更喜欢使用正则表达式。请注意,NSAttributedString 的 HTML 转换非常耗时,而且需要在主线程上运行。更多信息请参见: https://developer.apple.com/documentation/foundation/nsattributedstring/1524613-initwithdata

对我来说,下面的方法很管用:先删除所有内联 CSS 样式,再删除所有 HTML 标签。它可能不如 NSAttributedString 方案可靠,但对我的情况来说要快得多。

extension String {
    /// Returns the string with `<style>` elements (including their CSS) and all remaining tags removed.
    ///
    /// The style pattern accepts attributes on the opening tag, any content between the tags
    /// (including `>` and newlines, matched non-greedily) and any letter case. The previous
    /// pattern `<style>[^>]+</style>` did not match when the CSS contained `>` or the tag had
    /// attributes, which left the CSS text in the output.
    ///
    /// - Returns: The text with style blocks dropped and every other `<...>` span stripped.
    func withoutHtmlTags() -> String {
        // Pass 1: drop whole style elements so their CSS does not survive as text.
        let withoutStyles = replacingOccurrences(
            of: "(?i)<style\\b[^>]*>[\\s\\S]*?</style\\s*>",
            with: "",
            options: .regularExpression,
            range: nil
        )
        // Pass 2: strip every remaining tag.
        return withoutStyles.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression, range: nil)
    }
}

Swift 5

extension String {
    /// Converts the string, read as UTF-8 encoded HTML, to plain text via `NSAttributedString`.
    ///
    /// - Returns: The text of the imported document, or `nil` when the string cannot be encoded
    ///   as UTF-8 or the attributed-string initializer throws.
    public func trimHTMLTags() -> String? {
        guard let encoded = data(using: .utf8) else { return nil }

        let readingOptions: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]

        do {
            let imported = try NSAttributedString(
                data: encoded,
                options: readingOptions,
                documentAttributes: nil
            )
            return imported.string
        } catch {
            return nil
        }
    }
}

用法:

// Example input containing a single anchor element.
let str = "my html <a href='https://www.google.com'>link text</a>"

// Fall back to "--" when the conversion returns nil.
let stripped = str.trimHTMLTags() ?? "--"
print(stripped) //"my html link text"

我使用基于事件的 XML 处理方式(XMLParser)取得了一定的成功,它随 Foundation 在所有平台上都可用。

优点

  • 与使用正则表达式相比,此解决方案的性能更好。
  • 更安全,正如一些人已经提到的,HTML 不是正则语言(regular language)。
  • 线程安全(不需要在主线程上运行)。

缺点

  • HTML不是 XML,尽管它与 XML非常相似,但在尝试将其解析为 XML之前,您可能需要清理 HTML
  • 例如: <br><hr>将使解析失败,但是 <br /><hr />将被解析为 \n
  • 它是一个基于委托的 API,迫使您遵守 NSObject协议和基于事件的处理。
  • XMLParser已经很长时间没有更新了,因此缺乏许多我们希望拥有的新的 Swift 能力。
  • XMLDocument 是 Foundation 提供的一个更新、更灵活的 API,但它只能在 macOS 上使用。

对于我自己的用例,我创建了一个类,使我能够使用 async/await和异步处理。

请随意根据您自己的用例进行调整,或许还可以改进对原始 HTML 字符串的清理过程。

解决方案

import Foundation


/// Extracts the text content of an HTML string by parsing it as XML with `XMLParser`
/// and concatenating the character data reported to the delegate.
final class Parser: NSObject, XMLParserDelegate {
/// Whitespace-trimmed text of the parsed document; assigned when `init` finishes awaiting.
private(set) var result = ""
/// Success callback, invoked from `parserDidEndDocument`.
private var finished: (() -> Void)?
/// Failure callback, invoked with a parse or validation error.
private var fail: ((Error) -> Void)?
/// Accumulates every `foundCharacters` chunk reported by the parser.
private var content = ""


/// Parses `html` and stores its text content in `result`.
///
/// - Parameter html: Markup to strip; it is wrapped in an `<xml>` root element before parsing.
/// - Throws: The error handed to `fail` (a parse or validation error reported for the document).
init(html: String) async throws {
super.init()
        

result = try await withUnsafeThrowingContinuation { [weak self] continuation in
// tweak here as needed
// Drop the doctype and turn the unclosed <br>/<hr> forms into newlines (case-insensitively).
let clean = html
.replacingOccurrences(of: "<!DOCTYPE html>",
with: "",
options: .caseInsensitive)
.replacingOccurrences(of: "<br>",
with: "\n",
options: .caseInsensitive)
.replacingOccurrences(of: "<hr>",
with: "\n",
options: .caseInsensitive)
            

// Wrap the cleaned markup in a single <xml> root element and pass its UTF-8 bytes to the parser.
let xml = XMLParser(data: .init(("<xml>" + clean + "</xml>").utf8))
// Success path: detach the delegate and clear both callbacks so neither can fire again,
// then resume with the whitespace-trimmed text.
self?.finished = { [weak self] in
xml.delegate = nil
self?.fail = nil
self?.finished = nil
                

// NOTE(review): if `self` is nil here, this returns without resuming the continuation.
guard let content = self?.content else { return }


continuation
.resume(returning: content
.trimmingCharacters(in:
.whitespacesAndNewlines))
}
            

// Failure path: same teardown, abort the parse, then resume by throwing the received error.
self?.fail = { [weak self] in
xml.delegate = nil
self?.fail = nil
self?.finished = nil
xml.abortParsing()


continuation
.resume(throwing: $0)
}
            

xml.delegate = self
            

// If parse() reports failure and an error is available, forward it. When a delegate callback
// already handled the error, `fail` has been cleared and the optional call is a no-op.
if !xml.parse(),
let error = xml.parserError {
self?.fail?(error)
}
}
}
    

/// Delegate callback: the document ended, so run the success callback if it is still set.
func parserDidEndDocument(_: XMLParser) {
finished?()
}
    

/// Delegate callback: forwards a parse error to the failure callback if it is still set.
func parser(_: XMLParser, parseErrorOccurred: Error) {
fail?(parseErrorOccurred)
}
    

/// Delegate callback: forwards a validation error to the failure callback if it is still set.
func parser(_: XMLParser, validationErrorOccurred: Error) {
fail?(validationErrorOccurred)
}
    

/// Delegate callback: appends the reported characters to the accumulated text.
func parser(_: XMLParser, foundCharacters: String) {
content += foundCharacters
}
}

用法及例子

使用这篇文章中已经给出的一些例子

// The three examples use distinct constant names so they can live in one scope without
// redeclaring `string` and `result`.

// Example 1: a complete HTML document.
let documentHTML = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"


let documentText = try await Parser(html: documentHTML).result
// My First Heading My first paragraph.

// Example 2: running text with several anchor elements and a trailing <br />.
let biographyHTML = "LCD Soundsystem was the musical project of producer <a href='http://www.last.fm/music/James+Murphy' class='bbcode_artist'>James Murphy</a>, co-founder of <a href='http://www.last.fm/tag/dance-punk' class='bbcode_tag' rel='tag'>dance-punk</a> label <a href='http://www.last.fm/label/DFA' class='bbcode_label'>DFA</a> Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of <a href='http://www.last.fm/tag/alternative%20dance' class='bbcode_tag' rel='tag'>alternative dance</a> and <a href='http://www.last.fm/tag/post%20punk' class='bbcode_tag' rel='tag'>post punk</a>, along with elements of <a href='http://www.last.fm/tag/disco' class='bbcode_tag' rel='tag'>disco</a> and other styles. <br />"


let biographyText = try await Parser(html: biographyHTML).result
// LCD Soundsystem was the musical project of producer James Murphy, co-founder of dance-punk label DFA Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of alternative dance and post punk, along with elements of disco and other styles.

// Example 3: a single anchor with an empty href.
let linkHTML = "my html <a href=\"\">link text</a>"


let linkText = try await Parser(html: linkHTML).result
// my html link text