diff options
author | yvesf <yvesf@d0e8fea9-7529-0410-93fb-d39fd5b9c1dd> | 2009-12-04 23:40:36 +0000 |
---|---|---|
committer | yvesf <yvesf@d0e8fea9-7529-0410-93fb-d39fd5b9c1dd> | 2009-12-04 23:40:36 +0000 |
commit | 400958e540150b65cbf59467ea61aa4e654f4542 (patch) | |
tree | 3cc5ed7ce8d183534dfa58cf39caa4dd420d4d12 /aiml/AimlParser.py | |
parent | 6153ba1277632d74105170c182594d3833636fa7 (diff) | |
download | omegle-400958e540150b65cbf59467ea61aa4e654f4542.tar.gz omegle-400958e540150b65cbf59467ea61aa4e654f4542.zip |
aiml bot
git-svn-id: http://xapek.org/svn/common/omegle@1473 d0e8fea9-7529-0410-93fb-d39fd5b9c1dd
Diffstat (limited to 'aiml/AimlParser.py')
-rw-r--r-- | aiml/AimlParser.py | 545 |
1 files changed, 545 insertions, 0 deletions
diff --git a/aiml/AimlParser.py b/aiml/AimlParser.py new file mode 100644 index 0000000..75c2cf1 --- /dev/null +++ b/aiml/AimlParser.py @@ -0,0 +1,545 @@ +from xml.sax.handler import ContentHandler +from xml.sax.xmlreader import Locator +import sys +import xml.sax +import xml.sax.handler + +class AimlParserError(Exception): pass + +class AimlHandler(ContentHandler): + # The legal states of the AIML parser + _STATE_OutsideAiml = 0 + _STATE_InsideAiml = 1 + _STATE_InsideCategory = 2 + _STATE_InsidePattern = 3 + _STATE_AfterPattern = 4 + _STATE_InsideThat = 5 + _STATE_AfterThat = 6 + _STATE_InsideTemplate = 7 + _STATE_AfterTemplate = 8 + + def __init__(self, encoding = "UTF-8"): + self.categories = {} + self._encoding = encoding + self._state = self._STATE_OutsideAiml + self._version = "" + self._namespace = "" + self._forwardCompatibleMode = False + self._currentPattern = "" + self._currentThat = "" + self._currentTopic = "" + self._insideTopic = False + self._currentUnknown = "" # the name of the current unknown element + + # This is set to true when a parse error occurs in a category. + self._skipCurrentCategory = False + + # Counts the number of parse errors in a particular AIML document. + # query with getNumErrors(). If 0, the document is AIML-compliant. + self._numParseErrors = 0 + + # TODO: select the proper validInfo table based on the version number. + self._validInfo = self._validationInfo101 + + # This stack of bools is used when parsing <li> elements inside + # <condition> elements, to keep track of whether or not an + # attribute-less "default" <li> element has been found yet. Only + # one default <li> is allowed in each <condition> element. We need + # a stack in order to correctly handle nested <condition> tags. + self._foundDefaultLiStack = [] + + # This stack of strings indicates what the current whitespace-handling + # behavior should be. Each string in the stack is either "default" or + # "preserve". When a new AIML element is encountered, a new string is + # pushed onto the stack, based on the value of the element's "xml:space" + # attribute (if absent, the top of the stack is pushed again). When + # ending an element, pop an object off the stack. + self._whitespaceBehaviorStack = ["default"] + + self._elemStack = [] + self._locator = Locator() + self.setDocumentLocator(self._locator) + + def getNumErrors(self): + "Return the number of errors found while parsing the current document." + return self._numParseErrors + + def setEncoding(self, encoding): + """Set the text encoding to use when encoding strings read from XML. + + Defaults to 'UTF-8'. + + """ + self._encoding = encoding + + def _location(self): + "Return a string describing the current location in the source file." + line = self._locator.getLineNumber() + column = self._locator.getColumnNumber() + return "(line %d, column %d)" % (line, column) + + def _pushWhitespaceBehavior(self, attr): + """Push a new string onto the whitespaceBehaviorStack. + + The string's value is taken from the "xml:space" attribute, if it exists + and has a legal value ("default" or "preserve"). Otherwise, the previous + stack element is duplicated. + + """ + assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!" + try: + if attr["xml:space"] == "default" or attr["xml:space"] == "preserve": + self._whitespaceBehaviorStack.append(attr["xml:space"]) + else: + raise AimlParserError, "Invalid value for xml:space attribute "+self._location() + except KeyError: + self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1]) + + def startElementNS(self, name, qname, attr): + print "QNAME:", qname + print "NAME:", name + uri,elem = name + if (elem == "bot"): print "name:", attr.getValueByQName("name"), "a'ite?" + self.startElement(elem, attr) + pass + + def startElement(self, name, attr): + # Wrapper around _startElement, which catches errors in _startElement() + # and keeps going. + + # If we're inside an unknown element, ignore everything until we're + # out again. + if self._currentUnknown != "": + return + # If we're skipping the current category, ignore everything until + # it's finished. + if self._skipCurrentCategory: + return + + # process this start-element. + try: self._startElement(name, attr) + except AimlParserError, msg: + # Print the error message + sys.stderr.write("PARSE ERROR: %s\n" % msg) + + self._numParseErrors += 1 # increment error count + # In case of a parse error, if we're inside a category, skip it. + if self._state >= self._STATE_InsideCategory: + self._skipCurrentCategory = True + + def _startElement(self, name, attr): + if name == "aiml": + # <aiml> tags are only legal in the OutsideAiml state + if self._state != self._STATE_OutsideAiml: + raise AimlParserError, "Unexpected <aiml> tag "+self._location() + self._state = self._STATE_InsideAiml + self._insideTopic = False + self._currentTopic = u"" + try: self._version = attr["version"] + except KeyError: + # This SHOULD be a syntax error, but so many AIML sets out there are missing + # "version" attributes that it just seems nicer to let it slide. + #raise AimlParserError, "Missing 'version' attribute in <aiml> tag "+self._location() + #print "WARNING: Missing 'version' attribute in <aiml> tag "+self._location() + #print " Defaulting to version 1.0" + self._version = "1.0" + self._forwardCompatibleMode = (self._version != "1.0.1") + self._pushWhitespaceBehavior(attr) + # Not sure about this namespace business yet... + #try: + # self._namespace = attr["xmlns"] + # if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1": + # raise AimlParserError, "Incorrect namespace for AIML v1.0.1 "+self._location() + #except KeyError: + # if self._version != "1.0": + # raise AimlParserError, "Missing 'version' attribute(s) in <aiml> tag "+self._location() + elif self._state == self._STATE_OutsideAiml: + # If we're outside of an AIML element, we ignore all tags. + return + elif name == "topic": + # <topic> tags are only legal in the InsideAiml state, and only + # if we're not already inside a topic. + if (self._state != self._STATE_InsideAiml) or self._insideTopic: + raise AimlParserError, "Unexpected <topic> tag", self._location() + try: self._currentTopic = unicode(attr['name']) + except KeyError: + raise AimlParserError, "Required \"name\" attribute missing in <topic> element "+self._location() + self._insideTopic = True + elif name == "category": + # <category> tags are only legal in the InsideAiml state + if self._state != self._STATE_InsideAiml: + raise AimlParserError, "Unexpected <category> tag "+self._location() + self._state = self._STATE_InsideCategory + self._currentPattern = u"" + self._currentThat = u"" + # If we're not inside a topic, the topic is implicitly set to * + if not self._insideTopic: self._currentTopic = u"*" + self._elemStack = [] + self._pushWhitespaceBehavior(attr) + elif name == "pattern": + # <pattern> tags are only legal in the InsideCategory state + if self._state != self._STATE_InsideCategory: + raise AimlParserError, "Unexpected <pattern> tag "+self._location() + self._state = self._STATE_InsidePattern + elif name == "that" and self._state == self._STATE_AfterPattern: + # <that> are legal either inside a <template> element, or + # inside a <category> element, between the <pattern> and the + # <template> elements. This clause handles the latter case. + self._state = self._STATE_InsideThat + elif name == "template": + # <template> tags are only legal in the AfterPattern and AfterThat + # states + if self._state not in [self._STATE_AfterPattern, self._STATE_AfterThat]: + raise AimlParserError, "Unexpected <template> tag "+self._location() + # if no <that> element was specified, it is implicitly set to * + if self._state == self._STATE_AfterPattern: + self._currentThat = u"*" + self._state = self._STATE_InsideTemplate + self._elemStack.append(['template',{}]) + self._pushWhitespaceBehavior(attr) + elif self._state == self._STATE_InsidePattern: + # Certain tags are allowed inside <pattern> elements. + if name == "bot" and attr.has_key("name") and attr["name"] == u"name": + # Insert a special character string that the PatternMgr will + # replace with the bot's name. + self._currentPattern += u" BOT_NAME " + else: + raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location() + elif self._state == self._STATE_InsideThat: + # Certain tags are allowed inside <that> elements. + if name == "bot" and attr.has_key("name") and attr["name"] == u"name": + # Insert a special character string that the PatternMgr will + # replace with the bot's name. + self._currentThat += u" BOT_NAME " + else: + raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location() + elif self._state == self._STATE_InsideTemplate and self._validInfo.has_key(name): + # Starting a new element inside the current pattern. First + # we need to convert 'attr' into a native Python dictionary, + # so it can later be marshaled. + attrDict = {} + for k,v in attr.items(): + #attrDict[k[1].encode(self._encoding)] = v.encode(self._encoding) + attrDict[k.encode(self._encoding)] = unicode(v) + self._validateElemStart(name, attrDict, self._version) + # Push the current element onto the element stack. + self._elemStack.append([name.encode(self._encoding),attrDict]) + self._pushWhitespaceBehavior(attr) + # If this is a condition element, push a new entry onto the + # foundDefaultLiStack + if name == "condition": + self._foundDefaultLiStack.append(False) + else: + # we're now inside an unknown element. + if self._forwardCompatibleMode: + # In Forward Compatibility Mode, we ignore the element and its + # contents. + self._currentUnknown = name + else: + # Otherwise, unknown elements are grounds for error! + raise AimlParserError, ("Unexpected <%s> tag " % name)+self._location() + + def characters(self, ch): + # Wrapper around _characters which catches errors in _characters() + # and keeps going. + if self._state == self._STATE_OutsideAiml: + # If we're outside of an AIML element, we ignore all text + return + if self._currentUnknown != "": + # If we're inside an unknown element, ignore all text + return + if self._skipCurrentCategory: + # If we're skipping the current category, ignore all text. + return + try: self._characters(ch) + except AimlParserError, msg: + # Print the message + sys.stderr.write("PARSE ERROR: %s\n" % msg) + self._numParseErrors += 1 # increment error count + # In case of a parse error, if we're inside a category, skip it. + if self._state >= self._STATE_InsideCategory: + self._skipCurrentCategory = True + + def _characters(self, ch): + text = unicode(ch) + if self._state == self._STATE_InsidePattern: + self._currentPattern += text + elif self._state == self._STATE_InsideThat: + self._currentThat += text + elif self._state == self._STATE_InsideTemplate: + # First, see whether the element at the top of the element stack + # is permitted to contain text. + try: + parent = self._elemStack[-1][0] + parentAttr = self._elemStack[-1][1] + required, optional, canBeParent = self._validInfo[parent] + nonBlockStyleCondition = (parent == "condition" and not (parentAttr.has_key("name") and parentAttr.has_key("value"))) + if not canBeParent: + raise AimlParserError, ("Unexpected text inside <%s> element "%parent)+self._location() + elif parent == "random" or nonBlockStyleCondition: + # <random> elements can only contain <li> subelements. However, + # there's invariably some whitespace around the <li> that we need + # to ignore. Same for non-block-style <condition> elements (i.e. + # those which don't have both a "name" and a "value" attribute). + if len(text.strip()) == 0: + # ignore whitespace inside these elements. + return + else: + # non-whitespace text inside these elements is a syntax error. + raise AimlParserError, ("Unexpected text inside <%s> element "%parent)+self._location() + except IndexError: + # the element stack is empty. This should never happen. + raise AimlParserError, "Element stack is empty while validating text "+self._location() + + # Add a new text element to the element at the top of the element + # stack. If there's already a text element there, simply append the + # new characters to its contents. + try: textElemOnStack = (self._elemStack[-1][-1][0] == "text") + except IndexError: textElemOnStack = False + except KeyError: textElemOnStack = False + if textElemOnStack: + self._elemStack[-1][-1][2] += text + else: + self._elemStack[-1].append(["text", {"xml:space": self._whitespaceBehaviorStack[-1]}, text]) + else: + # all other text is ignored + pass + + def endElementNS(self, name, qname): + uri, elem = name + self.endElement(elem) + + def endElement(self, name): + """Wrapper around _endElement which catches errors in _characters() + and keeps going. + + """ + if self._state == self._STATE_OutsideAiml: + # If we're outside of an AIML element, ignore all tags + return + if self._currentUnknown != "": + # see if we're at the end of an unknown element. If so, we can + # stop ignoring everything. + if name == self._currentUnknown: + self._currentUnknown = "" + return + if self._skipCurrentCategory: + # If we're skipping the current category, see if it's ending. We + # stop on ANY </category> tag, since we're not keeping track of + # state in ignore-mode. + if name == "category": + self._skipCurrentCategory = False + self._state = self._STATE_InsideAiml + return + try: self._endElement(name) + except AimlParserError, msg: + # Print the message + sys.stderr.write("PARSE ERROR: %s\n" % msg) + self._numParseErrors += 1 # increment error count + # In case of a parse error, if we're inside a category, skip it. + if self._state >= self._STATE_InsideCategory: + self._skipCurrentCategory = True + + def _endElement(self, name): + """Verify that an AIML end element is valid in the current + context. + + Raises an AimlParserError if an illegal end element is encountered. + + """ + if name == "aiml": + # </aiml> tags are only legal in the InsideAiml state + if self._state != self._STATE_InsideAiml: + raise AimlParserError, "Unexpected </aiml> tag "+self._location() + self._state = self._STATE_OutsideAiml + self._whitespaceBehaviorStack.pop() + elif name == "topic": + # </topic> tags are only legal in the InsideAiml state, and + # only if _insideTopic is true. + if self._state != self._STATE_InsideAiml or not self._insideTopic: + raise AimlParserError, "Unexpected </topic> tag "+self._location() + self._insideTopic = False + self._currentTopic = u"" + elif name == "category": + # </category> tags are only legal in the AfterTemplate state + if self._state != self._STATE_AfterTemplate: + raise AimlParserError, "Unexpected </category> tag "+self._location() + self._state = self._STATE_InsideAiml + # End the current category. Store the current pattern/that/topic and + # element in the categories dictionary. + key = (self._currentPattern.strip(), self._currentThat.strip(),self._currentTopic.strip()) + self.categories[key] = self._elemStack[-1] + self._whitespaceBehaviorStack.pop() + elif name == "pattern": + # </pattern> tags are only legal in the InsidePattern state + if self._state != self._STATE_InsidePattern: + raise AimlParserError, "Unexpected </pattern> tag "+self._location() + self._state = self._STATE_AfterPattern + elif name == "that" and self._state == self._STATE_InsideThat: + # </that> tags are only allowed inside <template> elements or in + # the InsideThat state. This clause handles the latter case. + self._state = self._STATE_AfterThat + elif name == "template": + # </template> tags are only allowed in the InsideTemplate state. + if self._state != self._STATE_InsideTemplate: + raise AimlParserError, "Unexpected </template> tag "+self._location() + self._state = self._STATE_AfterTemplate + self._whitespaceBehaviorStack.pop() + elif self._state == self._STATE_InsidePattern: + # Certain tags are allowed inside <pattern> elements. + if name not in ["bot"]: + raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location() + elif self._state == self._STATE_InsideThat: + # Certain tags are allowed inside <that> elements. + if name not in ["bot"]: + raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location() + elif self._state == self._STATE_InsideTemplate: + # End of an element inside the current template. Append the + # element at the top of the stack onto the one beneath it. + elem = self._elemStack.pop() + self._elemStack[-1].append(elem) + self._whitespaceBehaviorStack.pop() + # If the element was a condition, pop an item off the + # foundDefaultLiStack as well. + if elem[0] == "condition": self._foundDefaultLiStack.pop() + else: + # Unexpected closing tag + raise AimlParserError, ("Unexpected </%s> tag " % name)+self._location() + + # A dictionary containing a validation information for each AIML + # element. The keys are the names of the elements. The values are a + # tuple of three items. The first is a list containing the names of + # REQUIRED attributes, the second is a list of OPTIONAL attributes, + # and the third is a boolean value indicating whether or not the + # element can contain other elements and/or text (if False, the + # element can only appear in an atomic context, such as <date/>). + _validationInfo101 = { + "bot": ( ["name"], [], False ), + "condition": ( [], ["name", "value"], True ), # can only contain <li> elements + "date": ( [], [], False ), + "formal": ( [], [], True ), + "gender": ( [], [], True ), + "get": ( ["name"], [], False ), + "gossip": ( [], [], True ), + "id": ( [], [], False ), + "input": ( [], ["index"], False ), + "javascript": ( [], [], True ), + "learn": ( [], [], True ), + "li": ( [], ["name", "value"], True ), + "lowercase": ( [], [], True ), + "person": ( [], [], True ), + "person2": ( [], [], True ), + "random": ( [], [], True ), # can only contain <li> elements + "sentence": ( [], [], True ), + "set": ( ["name"], [], True), + "size": ( [], [], False ), + "sr": ( [], [], False ), + "srai": ( [], [], True ), + "star": ( [], ["index"], False ), + "system": ( [], [], True ), + "template": ( [], [], True ), # needs to be in the list because it can be a parent. + "that": ( [], ["index"], False ), + "thatstar": ( [], ["index"], False ), + "think": ( [], [], True ), + "topicstar": ( [], ["index"], False ), + "uppercase": ( [], [], True ), + "version": ( [], [], False ), + } + + def _validateElemStart(self, name, attr, version): + """Test the validity of an element starting inside a <template> + element. + + This function raises an AimlParserError exception if it the tag is + invalid. Otherwise, no news is good news. + + """ + # Check the element's attributes. Make sure that all required + # attributes are present, and that any remaining attributes are + # valid options. + required, optional, canBeParent = self._validInfo[name] + for a in required: + if a not in attr and not self._forwardCompatibleMode: + raise AimlParserError, ("Required \"%s\" attribute missing in <%s> element " % (a,name))+self._location() + for a in attr: + if a in required: continue + if a[0:4] == "xml:": continue # attributes in the "xml" namespace can appear anywhere + if a not in optional and not self._forwardCompatibleMode: + raise AimlParserError, ("Unexpected \"%s\" attribute in <%s> element " % (a,name))+self._location() + + # special-case: several tags contain an optional "index" attribute. + # This attribute's value must be a positive integer. + if name in ["star", "thatstar", "topicstar"]: + for k,v in attr.items(): + if k == "index": + temp = 0 + try: temp = int(v) + except: + raise AimlParserError, ("Bad type for \"%s\" attribute (expected integer, found \"%s\") " % (k,v))+self._location() + if temp < 1: + raise AimlParserError, ("\"%s\" attribute must have non-negative value " % (k))+self._location() + + # See whether the containing element is permitted to contain + # subelements. If not, this element is invalid no matter what it is. + try: + parent = self._elemStack[-1][0] + parentAttr = self._elemStack[-1][1] + except IndexError: + # If the stack is empty, no parent is present. This should never + # happen. + raise AimlParserError, ("Element stack is empty while validating <%s> " % name)+self._location() + required, optional, canBeParent = self._validInfo[parent] + nonBlockStyleCondition = (parent == "condition" and not (parentAttr.has_key("name") and parentAttr.has_key("value"))) + if not canBeParent: + raise AimlParserError, ("<%s> elements cannot have any contents "%parent)+self._location() + # Special-case test if the parent element is <condition> (the + # non-block-style variant) or <random>: these elements can only + # contain <li> subelements. + elif (parent == "random" or nonBlockStyleCondition) and name!="li": + raise AimlParserError, ("<%s> elements can only contain <li> subelements "%parent)+self._location() + # Special-case test for <li> elements, which can only be contained + # by non-block-style <condition> and <random> elements, and whose + # required attributes are dependent upon which attributes are + # present in the <condition> parent. + elif name=="li": + if not (parent=="random" or nonBlockStyleCondition): + raise AimlParserError, ("Unexpected <li> element contained by <%s> element "%parent)+self._location() + if nonBlockStyleCondition: + if parentAttr.has_key("name"): + # Single-predicate condition. Each <li> element except the + # last must have a "value" attribute. + if len(attr) == 0: + # This could be the default <li> element for this <condition>, + # unless we've already found one. + if self._foundDefaultLiStack[-1]: + raise AimlParserError, "Unexpected default <li> element inside <condition> "+self._location() + else: + self._foundDefaultLiStack[-1] = True + elif len(attr) == 1 and attr.has_key("value"): + pass # this is the valid case + else: + raise AimlParserError, "Invalid <li> inside single-predicate <condition> "+self._location() + elif len(parentAttr) == 0: + # Multi-predicate condition. Each <li> element except the + # last must have a "name" and a "value" attribute. + if len(attr) == 0: + # This could be the default <li> element for this <condition>, + # unless we've already found one. + if self._foundDefaultLiStack[-1]: + raise AimlParserError, "Unexpected default <li> element inside <condition> "+self._location() + else: + self._foundDefaultLiStack[-1] = True + elif len(attr) == 2 and attr.has_key("value") and attr.has_key("name"): + pass # this is the valid case + else: + raise AimlParserError, "Invalid <li> inside multi-predicate <condition> "+self._location() + # All is well! + return True + +def create_parser(): + """Create and return an AIML parser object.""" + parser = xml.sax.make_parser() + handler = AimlHandler("UTF-8") + parser.setContentHandler(handler) + #parser.setFeature(xml.sax.handler.feature_namespaces, True) + return parser
\ No newline at end of file |