Jay Taylor's notes

back to listing index

Python PyParsing

[web search]
Original source (pyparsing.wikispaces.com)
Tags: python compilers parsers parse-trees pyparsing pyparsing.wikispaces.com
Clipped on: 2015-01-05

"""Search query parser

version 2006-03-09

This search query parser uses the excellent Pyparsing module 
(http://pyparsing.sourceforge.net/) to parse search queries by users.
It handles:

* 'and', 'or' and implicit 'and' operators;
* parentheses;
* quoted strings;
* wildcards at the end of a search term (help*);

Requirements:
* Python
* Pyparsing

If you run this script, it will perform a number of tests. To use is as a
module, you should use inheritance on the SearchQueryParser class and overwrite
the Get... methods. The ParserTest class gives a very simple example of how this
could work.

-------------------------------------------------------------------------------
Copyright (c) 2006, Estrate, the Netherlands
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation 
  and/or other materials provided with the distribution.
* Neither the name of Estrate nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTORS:
- Steven Mooij
- Rudolph Froger
- Paul McGuire

TODO:
- add more docs
- ask someone to check my English texts
- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
"""
from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward, Suppress, Optional, OneOrMore, oneOf
from sets import Set

class SearchQueryParser:

    def __init__(self):
        self._methods = {
            'and': self.evaluateAnd,
            'or': self.evaluateOr,
            'not': self.evaluateNot,
            'parenthesis': self.evaluateParenthesis,
            'quotes': self.evaluateQuotes,
            'word': self.evaluateWord,
            'wordwildcard': self.evaluateWordWildcard,
        }
        self._parser = self.parser()
    
    def parser(self):
        """
        This function returns a parser.
        The grammar should be like most full text search engines (Google, Tsearch, Lucene).
        
        Grammar:
        - a query consists of alphanumeric words, with an optional '*' wildcard
          at the end of a word
        - a sequence of words between quotes is a literal string
        - words can be used together by using operators ('and' or 'or')
        - words with operators can be grouped with parenthesis
        - a word or group of words can be preceded by a 'not' operator
        - the 'and' operator precedes an 'or' operator
        - if an operator is missing, use an 'and' operator
        """
        operatorOr = Forward()
        
        operatorWord = Group(Combine(Word(alphanums) + Suppress('*'))).setResultsName('wordwildcard') | \
                            Group(Word(alphanums)).setResultsName('word')
        
        operatorQuotesContent = Forward()
        operatorQuotesContent << (
            (operatorWord + operatorQuotesContent) | operatorWord
        )
        
        operatorQuotes = Group(
            Suppress('"') + operatorQuotesContent + Suppress('"')
        ).setResultsName("quotes") | operatorWord
        
        operatorParenthesis = Group(
            (Suppress("(") + operatorOr + Suppress(")"))
        ).setResultsName("parenthesis") | operatorQuotes

        operatorNot = Forward()
        operatorNot << (Group(
            Suppress(Keyword("not", caseless=True)) + operatorNot
        ).setResultsName("not") | operatorParenthesis)

        operatorAnd = Forward()
        operatorAnd << (Group(
            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
        ).setResultsName("and") | Group(
            operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
        ).setResultsName("and") | operatorNot)
        
        operatorOr << (Group(
            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
        ).setResultsName("or") | operatorAnd)

        return operatorOr.parseString

    def evaluateAnd(self, argument):
        return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))

    def evaluateOr(self, argument):
        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))

    def evaluateNot(self, argument):
        return self.GetNot(self.evaluate(argument[0]))

    def evaluateParenthesis(self, argument):
        return self.evaluate(argument[0])

    def evaluateQuotes(self, argument):
        """Evaluate quoted strings

        First is does an 'and' on the indidual search terms, then it asks the
        function GetQuoted to only return the subset of ID's that contain the
        literal string.
        """
        r = Set()
        search_terms = []
        for item in argument:
            search_terms.append(item[0])
            if len(r) == 0:
                r = self.evaluate(item)
            else:
                r = r.intersection(self.evaluate(item))
        return self.GetQuotes(' '.join(search_terms), r)

    def evaluateWord(self, argument):
        return self.GetWord(argument[0])

    def evaluateWordWildcard(self, argument):
        return self.GetWordWildcard(argument[0])
        
    def evaluate(self, argument):
        return self._methods[argument.getName()](argument)

    def Parse(self, query):
        #print self._parser(query)[0]
        return self.evaluate(self._parser(query)[0])

    def GetWord(self, word):
        return Set()

    def GetWordWildcard(self, word):
        return Set()

    def GetQuotes(self, search_string, tmp_result):
        return Set()

    def GetNot(self, not_set):
        return Set().difference(not_set)


class ParserTest(SearchQueryParser):
    """Tests the parser with some search queries
    tests containts a dictionary with tests and expected results.
    """
    tests = {
        'help': Set([1, 2, 4, 5]),
        'help or hulp': Set([1, 2, 3, 4, 5]),
        'help and hulp': Set([2]),
        'help hulp': Set([2]),
        'help and hulp or hilp': Set([2, 3, 4]),
        'help or hulp and hilp': Set([1, 2, 3, 4, 5]),
        'help or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6]),
        '(help or hulp) and (hilp or halp)': Set([3, 4, 5]),
        'help and (hilp or halp)': Set([4, 5]),
        '(help and (hilp or halp)) or hulp': Set([2, 3, 4, 5]),
        'not help': Set([3, 6, 7, 8]),
        'not hulp and halp': Set([5, 6]),
        'not (help and halp)': Set([1, 2, 3, 4, 6, 7, 8]),
        '"help me please"': Set([2]),
        '"help me please" or hulp': Set([2, 3]),
        '"help me please" or (hulp and halp)': Set([2]),
        'help*': Set([1, 2, 4, 5, 8]),
        'help or hulp*': Set([1, 2, 3, 4, 5]),
        'help* and hulp': Set([2]),
        'help and hulp* or hilp': Set([2, 3, 4]),
        'help* or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6, 8]),
        '(help or hulp*) and (hilp* or halp)': Set([3, 4, 5]),
        'help* and (hilp* or halp*)': Set([4, 5]),
        '(help and (hilp* or halp)) or hulp*': Set([2, 3, 4, 5]),
        'not help* and halp': Set([6]),
        'not (help* and helpe*)': Set([1, 2, 3, 4, 5, 6, 7]),
        '"help* me please"': Set([2]),
        '"help* me* please" or hulp*': Set([2, 3]),
        '"help me please*" or (hulp and halp)': Set([2]),
        '"help me please" not (hulp and halp)': Set([2]),
        '"help me please" hulp': Set([2]),
        'help and hilp and not holp': Set([4]),
        'help hilp not holp': Set([4]),
        'help hilp and not holp': Set([4]),
    }

    docs = {
        1: 'help',
        2: 'help me please hulp',
        3: 'hulp hilp',
        4: 'help hilp',
        5: 'halp thinks he needs help',
        6: 'he needs halp',
        7: 'nothing',
        8: 'helper',
    }
        
    index = {
        'help': Set((1, 2, 4, 5)),
        'me': Set((2,)),
        'please': Set((2,)),
        'hulp': Set((2, 3,)),
        'hilp': Set((3, 4,)),
        'halp': Set((5, 6,)),
        'thinks': Set((5,)),
        'he': Set((5, 6,)),
        'needs': Set((5, 6,)),
        'nothing': Set((7,)),
        'helper': Set((8,)),
    }

    def GetWord(self, word):
        if (self.index.has_key(word)):
            return self.index[word]
        else:
            return Set()

    def GetWordWildcard(self, word):
        result = Set()
        for item in self.index.keys():
            if word == item[0:len(word)]:
                result = result.union(self.index[item])
        return result

    def GetQuotes(self, search_string, tmp_result):
        result = Set()
        for item in tmp_result:
            if self.docs[item].count(search_string):
                result.add(item)
        return result
    
    def GetNot(self, not_set):
        all = Set(self.docs.keys())
        return all.difference(not_set)

    def Test(self):
        all_ok = True
        for item in self.tests.keys():
            print item
            r = self.Parse(item)
            e = self.tests[item]
            print 'Result: %s' % r
            print 'Expect: %s' % e
            if e == r:
                print 'Test OK'
            else:
                all_ok = False
                print '>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<'
            print ''
        return all_ok
            
if __name__=='__main__':
    if ParserTest().Test():
        print 'All tests OK'
    else:
        print 'One or more tests FAILED'