#example.py using Python 3.7.4
import re
str="""Everything is awesome! <pre>Hello,
# Normally (.*) will not capture newlines, but here re.DOTALL is set
pattern = re.compile(r"<pre>(.*)</pre>",re.DOTALL)
matches = pattern.search(str)
#example2.py using Python 3.7.4
# Print the text enclosed by each <pre>...</pre> tag found in a sample string.
import re

# The sample text contains three <pre>...</pre> tags. It is named `text`
# rather than `str` so that the built-in `str` type is not shadowed.
text = """In two different ex-
periments, the authors had subjects chat and solve the <pre>Desert Survival Problem</pre> with a
humorous or non-humorous computer. In both experiments the computer made pre-
programmed comments, but in study 1 subjects were led to believe they were interact-
ing with another person. In the <pre>humor conditions</pre> subjects received a number of funny
comments, for instance: “The mirror is probably too small to be used as a signaling
device to alert rescue teams to your location. Rank it lower. (On the other hand, it
offers <pre>endless opportunity for self-reflection</pre>)”."""

# Normally (.*) will not capture newlines, but here re.DOTALL is set.
# The question mark in (.*?) makes the match non-greedy, so each match stops
# at the nearest </pre> instead of running on to the last one in the text.
pattern = re.compile(r"<pre>(.*?)</pre>", re.DOTALL)
matches = pattern.finditer(text)
for i, match in enumerate(matches):
    # group(1) is the text between the tags, without the tags themselves.
    print(f"tag {i}: ", match.group(1))
python example2.py
tag 0: Desert Survival Problem
tag 1: humor conditions
tag 2: endless opportunity for self-reflection
(?<=>) # look behind (but don't consume/capture) for a '>'
([\w\s]+) # capture/consume one or more word characters (letters, digits, underscore) or whitespace
(?=<\/) # look ahead (but don't consume/capture) for a '</'
// Builds a regular expression for non-self-closing HTML-like tags and logs,
// for every tag found in a sample string, the full match, the tag name, the
// quoted attribute string and the inner HTML.

// Capture group 1: the tag name, one or more characters excluding <, / and >.
// NOTE(review): in a plain JS string literal the escape \s evaluates to the
// letter s, so this class excludes "s" rather than whitespace; a doubled
// backslash was presumably intended. TODO confirm.
let TAG_NAME = '([^\s</>]+)';
// Capture group 2: a double-quoted string that may contain backslash-escaped
// characters (including escaped quotes).
let STRING = '("(?:[^"\\\\]|\\\\.)*")';
// \1 is a back reference to TAG_NAME
// NOTE(review): the next line is a bare template-literal expression. Neither
// NON_SELF_CLOSING_HTML_TAG (used below) nor NOT_CLOSING_TAG_NOT_QUOTE
// (interpolated here) is defined in this excerpt; presumably this line was the
// initializer of NON_SELF_CLOSING_HTML_TAG. TODO confirm against the original.
// Capture group 3, (.*?), is the inner HTML up to the matching closing tag.
`<${TAG_NAME}(?:${STRING}|${NOT_CLOSING_TAG_NOT_QUOTE})+>(.*?)</\\1 *>`;
// With the g flag, match() below returns every full match but no capture groups.
let tagRegex = new RegExp(NON_SELF_CLOSING_HTML_TAG, 'g');
// Sample input: tags whose quoted attribute values themselves contain <, > and
// closing-tag look-alikes.
let myStr = `Aenean <abc href="/life<><>\\"<?/abc></abc>"><a>life</a></abc> sed consectetur.
<a href="/work">Work Inner HTML</a> quis risus eget <a href="/about">about inner html</a> leo.
interacted with any of the <<<ve text="<></ve>>">abc</ve>`;
// NOTE(review): match() returns null when nothing matches, in which case the
// loop below would throw on matches.length.
let matches = myStr.match(tagRegex);
// Removing 'g' flag to match each tag part in the for loop
tagRegex = new RegExp(NON_SELF_CLOSING_HTML_TAG);
for (let i = 0; i < matches.length; i++) {
// Re-match the single tag without the g flag so the capture groups are available.
let tagParts = matches[i].match(tagRegex);
// Index 0 is the whole tag; 1, 2 and 3 are the capture groups described above.
console.log(`Tag #${i} = [${tagParts[0]}]`);
console.log(`Tag #${i} name: [${tagParts[1]}]`);
console.log(`Tag #${i} string attr: [${tagParts[2]}]`);
console.log(`Tag #${i} inner html: [${tagParts[3]}]`);
// NOTE(review): the closing brace of this for loop is missing from this excerpt.
Tag #0 = [<abc href="/life<><>\"<?/abc></abc>"><a>life</a></abc>]
Tag #0 name: [abc]
Tag #0 string attr: ["/life<><>\"<?/abc></abc>"]
Tag #0 inner html: [<a>life</a>]
Tag #1 = [<a href="/work">Work Inner HTML</a>]
Tag #1 name: [a]
Tag #1 string attr: ["/work"]
Tag #1 inner html: [Work Inner HTML]
Tag #2 = [<a href="/about">about inner html</a>]
Tag #2 name: [a]
Tag #2 string attr: ["/about"]
Tag #2 inner html: [about inner html]
Tag #3 = [<ve text="<></ve>>">abc</ve>]
Tag #3 name: [ve]
Tag #3 string attr: ["<></ve>>"]
Tag #3 inner html: [abc]
Aenean lacinia <abc href="/life<><><?/a></a>">
<a>life</a></abc> sed consectetur.
<a href="/work">Work</a> quis risus eget urna mollis ornare <a href="/about">about</a> leo.
interacted with any of the <<<ve text="<></ve>>">abc</ve>