#!/usr/bin/env python # coding: utf8 """ Example: ((type : 0 ) and (name : '.iso') ) or size >= 32000000000 (or (and (: type 0) (: name 'test image')) (>= size 32000000000)) = [ { 'type': ('__eq__', 0), 'name' :('__contains__', '.iso') }, { 'size' : ('__gt__', 32000000000) } ] ( test : 'this is a test' and foo = bar) or meat = 10 or sigh : Abc (or (and (: (name test) (literal 'this is a test')) (= (name foo) (name bar))) (or (= (name meat) (literal 10)) (: (name sigh) (name Abc)))) [ { 'test' : ('__contains__', 'this is a test'), 'foo' : ( '__eq__' : 'bar')}, [ { 'meat' : ( '__eq__', 10)}, { 'sigh' : ( '__contains__', 'Abc)}]] test : 'Foo' { 'test' : ( '__contains__', 'Foo' ) } test : 'Foo' and bar : 'Foo' { 'test' : ( '__contains__', 'Foo' ), 'bar' : ( '__contains__', 'Foo' ) } """ # see http://effbot.org/zone/simple-top-down-parsing.htm import sys import re # symbol (token type) registry symbol_table = {} class symbol_base(object): id = None value = None first = second = third = None def nud(self): raise SyntaxError("Syntax error (%r)." % self.id) def led(self, left): raise SyntaxError("Unknown operator (%r)." % self.id) def __repr__(self): if self.id == "(name)" or self.id == "(literal)": return "(%s %s)" % (self.id[1:-1], self.value) out = [self.id, self.first, self.second, self.third] out = map(str, filter(None, out)) return "(" + " ".join(out) + ")" def symbol(id, bp=0): try: s = symbol_table[id] except KeyError: class s(symbol_base): pass s.__name__ = "symbol-" + id # for debugging s.id = id s.value = None s.lbp = bp symbol_table[id] = s else: s.lbp = max(bp, s.lbp) return s # helpers def infix(id, bp): def led(self, left): self.first = left self.second = expression(bp) return self symbol(id, bp).led = led def infix_r(id, bp): def led(self, left): self.first = left self.second = expression(bp-1) return self symbol(id, bp).led = led def prefix(id, bp): def nud(self): self.first = expression(bp) return self symbol(id).nud = nud def advance(id=None): global token if id and token.id != id: raise SyntaxError("Expected %r" % id) token = next() def method(s): # decorator assert issubclass(s, symbol_base) def bind(fn): setattr(s, fn.__name__, fn) return bind # python expression syntax infix_r("or", 30); infix_r("and", 40); prefix("not", 50) infix("is", 60); infix(":", 60); infix("=", 60) infix(">", 60); infix(">=", 60) infix("<", 60); infix("<=", 60) infix("<>", 60) symbol("(", 150) symbol("'", 150) # additional behaviour symbol("*").nud = lambda self: self symbol("(name)").nud = lambda self: self symbol("(literal)").nud = lambda self: self symbol("(end)") symbol(")") @method(symbol("(")) def nud(self): expr = expression() advance(")") return expr @method(symbol("'")) def nud(self): expr = expression() advance("'") return expr # constants def constant(id): @method(symbol(id)) def nud(self): self.id = "(literal)" self.value = id return self constant("None") constant("True") constant("False") # multitoken operators @method(symbol("is")) def led(self, left): if token.id == "not": advance() self.id = "is not" self.first = left self.second = expression(60) return self # python tokenizer def tokenize_python(program): import tokenize from cStringIO import StringIO type_map = { tokenize.NUMBER: "(literal)", tokenize.STRING: "(literal)", tokenize.OP: "(operator)", tokenize.NAME: "(name)", } for t in tokenize.generate_tokens(StringIO(program).next): try: yield type_map[t[0]], t[1] except KeyError: if t[0] == tokenize.NL: continue if t[0] == tokenize.ENDMARKER: break else: raise SyntaxError("Syntax error") yield "(end)", "(end)" def tokenize(program): if isinstance(program, list): source = program else: source = tokenize_python(program) for id, value in source: if id == "(literal)": symbol = symbol_table[id] s = symbol() s.value = value else: # name or operator symbol = symbol_table.get(value) if symbol: s = symbol() elif id == "(name)": symbol = symbol_table[id] s = symbol() s.value = value else: raise SyntaxError("Unknown operator (%r)" % id) yield s # parser engine def expression(rbp=0): global token t = token token = next() left = t.nud() while rbp < token.lbp: t = token token = next() left = t.led(left) return left def parse(program): global token, next next = tokenize(program).next token = next() return expression() cmpcasts = {'__lt__': lambda x, y: x < y, '__le__': lambda x, y: x <= y, '__eq__': lambda x, y: x == y, '__ne__': lambda x, y: x != y, '__gt__': lambda x, y: x > y, '__ge__': lambda x, y: x >= y, '__contains__' : lambda x, y: y in x } def short_or(key, aList, anObject): """A helper function for parse_branch, since reduce(or, [value]) won't short-circuit.""" for eachValue in aList: if parse_branch(key, eachValue, anObject): return True return False def opval(key, value, anObject): """A helper function for parse_branch.""" operation, value = value if key == '*': for objectKey in anObject.keys(): if opval( objectKey, value, anObject): return True return False try: anOp = getattr(anObject[key], operation) return apply(anOp, [value]) except AttributeError: return cmpcasts[operation](anObject[key], value) except KeyError: raise SyntaxError("Unknown key (%r)." % value) return False def parse_branch(key, value, anObject): """Parse a BizFilter branch according to the following grammar: {name: value} implies an atomic operation on a value of name 'name'. Multiple {names} implies an 'and' relationship between them. [value, value, value, ...] implies an 'or' relationship. (operation, value) implies anObject.operation(value). Any other value is taken as a primitive type. Example: ((not isInvalid) and ((delivery is None) or delivery >= 3/13/29)) = {'isInvalid': False, 'delivery': [None, ('__gt__', aDate('3/13/1929')) ] } """ if type(value) == type({}): if len(value) == 0: return True else: return reduce(lambda x,y: x and y, [parse_branch(eachKey, eachValue, anObject) for eachKey, eachValue in value.items()]) elif type(value) == type([]): if len(value) == 0: return False else: return short_or(key, value, anObject) elif type(value) == type(()): return opval(key, value, anObject) else: try: return value == anObject[key] except KeyError: return False def applyFilter( data, filterObject): for value in data: if parse_branch('', filterObject, value): yield value def filterData( data, filterString ): if filterString == '': filterObject = {} else: filterObject = createFilterObject( parse( filterString ) ) return list( applyFilter( data, filterObject ) ) def formatResult( result ): if result.isdigit(): return float( result ) elif ( result.endswith("'") and result.startswith("'")) or ( result.endswith('"') and result.startswith('"')): return result[1:-1] else: return result def createFilterObject( parsedData ): result = None if parsedData.id == 'or': result = [] childData = parsedData while childData.second: if childData.second.first is None: result.append( createFilterObject( childData ) ) else: result.append( createFilterObject( childData.first ) ) childData = childData.second elif parsedData.id == 'and': result = {} childData = parsedData while childData.second: if childData.second.first is None: value = createFilterObject( childData ) else: value = createFilterObject( childData.first ) result[ value.keys()[0] ] = value[value.keys()[0]] childData = childData.second elif parsedData.id == ':': result = { parsedData.first.value : ( '__contains__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '=': result = { parsedData.first.value : ( '__eq__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '<': result = { parsedData.first.value : ( '__lt__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '<=': result = { parsedData.first.value : ( '__le__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '<>': result = { parsedData.first.value : ( '__ne__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '>': result = { parsedData.first.value : ( '__gt__', formatResult( parsedData.second.value ) ) } elif parsedData.id == '>=': result = { parsedData.first.value : ( '__ge__', formatResult( parsedData.second.value ) ) } else: result = { '*' : ( '__contains__', formatResult( parsedData.value ) ) } return result """ Tests: import applications.vmanager.modules.filter_parser as myFilter filterString = '(foo : Bar and bar = bazz1 ) or foo = Blah' data = [ { 'foo' : 'Bar', 'bar' : 'bazz0'},{ 'foo' : 'Bar', 'bar' : 'bazz1'},{ 'foo' : 'Blah', 'bar' : 'bazz2'} ] filterObject = { 'foo' : ( '__contains__', 'Bar' ), 'bar' : ( '__eq__', 'bazz1') } parsedData = '(and (: (name foo) (name Bar)) (= (name bar) (name bazz1)))' list( myFilter.applyFilter( data, filterObject ) ) myFilter.createFilterObject( myFilter.parse( filterString ) ) myFilter.filterData( data, filterString ) """