## invRegex.py## Copyright 2008, Paul McGuire## pyparsing script to expand a regular expression into all possible matching strings# Supports:# - {n} and {m,n} repetition, but not unbounded + or * repetition# - ? optional elements# - [] character ranges# - () grouping# - | alternation#__all__ = ["count","invert"]
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine,SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc,Suppress, ParseResults, srange)
class CharacterRangeEmitter(object):def __init__(self,chars):# remove duplicate chars in character range, but preserve original orderseen = set()self.charset = "".join( seen.add(c) or c for c in chars if c not in seen )def __str__(self):return '['+self.charset+']'def __repr__(self):return '['+self.charset+']'def makeGenerator(self):def genChars():for s in self.charset:yield sreturn genChars
class OptionalEmitter(object):def __init__(self,expr):self.expr = exprdef makeGenerator(self):def optionalGen():yield ""for s in self.expr.makeGenerator()():yield sreturn optionalGen
class DotEmitter(object):def makeGenerator(self):def dotGen():for c in printables:yield creturn dotGen
class GroupEmitter(object):def __init__(self,exprs):self.exprs = ParseResults(exprs)def makeGenerator(self):def groupGen():def recurseList(elist):if len(elist)==1:for s in elist[0].makeGenerator()():yield selse:for s in elist[0].makeGenerator()():for s2 in recurseList(elist[1:]):yield s + s2if self.exprs:for s in recurseList(self.exprs):yield sreturn groupGen
class AlternativeEmitter(object):def __init__(self,exprs):self.exprs = exprsdef makeGenerator(self):def altGen():for e in self.exprs:for s in e.makeGenerator()():yield sreturn altGen
class LiteralEmitter(object):def __init__(self,lit):self.lit = litdef __str__(self):return "Lit:"+self.litdef __repr__(self):return "Lit:"+self.litdef makeGenerator(self):def litGen():yield self.litreturn litGen
def handleRange(toks):return CharacterRangeEmitter(srange(toks[0]))
def handleRepetition(toks):toks=toks[0]if toks[1] in "*+":raise ParseFatalException("",0,"unbounded repetition operators not supported")if toks[1] == "?":return OptionalEmitter(toks[0])if "count" in toks:return GroupEmitter([toks[0]] * int(toks.count))if "minCount" in toks:mincount = int(toks.minCount)maxcount = int(toks.maxCount)optcount = maxcount - mincountif optcount:opt = OptionalEmitter(toks[0])for i in range(1,optcount):opt = OptionalEmitter(GroupEmitter([toks[0],opt]))return GroupEmitter([toks[0]] * mincount + [opt])else:return [toks[0]] * mincount
def handleLiteral(toks):lit = ""for t in toks:if t[0] == "\\":if t[1] == "t":lit += '\t'else:lit += t[1]else:lit += treturn LiteralEmitter(lit)
def handleMacro(toks):macroChar = toks[0][1]if macroChar == "d":return CharacterRangeEmitter("0123456789")elif macroChar == "w":return CharacterRangeEmitter(srange("[A-Za-z0-9_]"))elif macroChar == "s":return LiteralEmitter(" ")else:raise ParseFatalException("",0,"unsupported macro character (" + macroChar + ")")
def handleSequence(toks):return GroupEmitter(toks[0])
def handleDot():return CharacterRangeEmitter(printables)
def handleAlternative(toks):return AlternativeEmitter(toks[0])
_parser = Nonedef parser():global _parserif _parser is None:ParserElement.setDefaultWhitespaceChars("")lbrack,rbrack,lbrace,rbrace,lparen,rparen = map(Literal,"[]{}()")
reMacro = Combine("\\" + oneOf(list("dws")))escapedChar = ~reMacro + Combine("\\" + oneOf(list(printables)))reLiteralChar = "".join(c for c in printables if c not in r"\[]{}().*?+|") + " \t"
reRange = Combine(lbrack + SkipTo(rbrack,ignore=escapedChar) + rbrack)reLiteral = ( escapedChar | oneOf(list(reLiteralChar)) )reDot = Literal(".")repetition = (( lbrace + Word(nums).setResultsName("count") + rbrace ) |( lbrace + Word(nums).setResultsName("minCount")+","+ Word(nums).setResultsName("maxCount") + rbrace ) |oneOf(list("*+?")))
reRange.setParseAction(handleRange)reLiteral.setParseAction(handleLiteral)reMacro.setParseAction(handleMacro)reDot.setParseAction(handleDot)
reTerm = ( reLiteral | reRange | reMacro | reDot )reExpr = operatorPrecedence( reTerm,[(repetition, 1, opAssoc.LEFT, handleRepetition),(None, 2, opAssoc.LEFT, handleSequence),(Suppress('|'), 2, opAssoc.LEFT, handleAlternative),])_parser = reExpr
return _parser
def count(gen):"""Simple function to count the number of elements returned by a generator."""i = 0for s in gen:i += 1return i
def invert(regex):"""Call this routine as a generator to return all the strings thatmatch the input regular expression.for s in invert("[A-Z]{3}\d{3}"):print s"""invReGenerator = GroupEmitter(parser().parseString(regex)).makeGenerator()return invReGenerator()
def main():tests = r"""[A-EA][A-D]*[A-D]{3}X[A-C]{3}YX[A-C]{3}\(X\dfoobar\d\dfoobar{2}foobar{2,9}fooba[rz]{2}(foobar){2}([01]\d)|(2[0-5])([01]\d\d)|(2[0-4]\d)|(25[0-5])[A-C]{1,2}[A-C]{0,3}[A-C]\s[A-C]\s[A-C][A-C]\s?[A-C][A-C][A-C]\s([A-C][A-C])[A-C]\s([A-C][A-C])?[A-C]{2}\d{2}@|TH[12]@(@|TH[12])?@(@|TH[12]|AL[12]|SP[123]|TB(1[0-9]?|20?|[3-9]))?@(@|TH[12]|AL[12]|SP[123]|TB(1[0-9]?|20?|[3-9])|OH(1[0-9]?|2[0-9]?|30?|[4-9]))?(([ECMP]|HA|AK)[SD]|HS)T[A-CV]{2}A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|Uu[bhopqst]|U|V|W|Xe|Yb?|Z[nr](a|b)|(x|y)(a|b) (x|y)""".split('\n')
for t in tests:t = t.strip()if not t: continueprint '-'*50print ttry:print count(invert(t))for s in invert(t):print sexcept ParseFatalException,pfe:print pfe.msgprintcontinueprint
if __name__ == "__main__":main()