from xml.sax.handler import ContentHandler from xml.sax.xmlreader import Locator import sys import xml.sax import xml.sax.handler class AimlParserError(Exception): pass class AimlHandler(ContentHandler): # The legal states of the AIML parser _STATE_OutsideAiml = 0 _STATE_InsideAiml = 1 _STATE_InsideCategory = 2 _STATE_InsidePattern = 3 _STATE_AfterPattern = 4 _STATE_InsideThat = 5 _STATE_AfterThat = 6 _STATE_InsideTemplate = 7 _STATE_AfterTemplate = 8 def __init__(self, encoding = "UTF-8"): self.categories = {} self._encoding = encoding self._state = self._STATE_OutsideAiml self._version = "" self._namespace = "" self._forwardCompatibleMode = False self._currentPattern = "" self._currentThat = "" self._currentTopic = "" self._insideTopic = False self._currentUnknown = "" # the name of the current unknown element # This is set to true when a parse error occurs in a category. self._skipCurrentCategory = False # Counts the number of parse errors in a particular AIML document. # query with getNumErrors(). If 0, the document is AIML-compliant. self._numParseErrors = 0 # TODO: select the proper validInfo table based on the version number. self._validInfo = self._validationInfo101 # This stack of bools is used when parsing
  • elements inside # elements, to keep track of whether or not an # attribute-less "default"
  • element has been found yet. Only # one default
  • is allowed in each element. We need # a stack in order to correctly handle nested tags. self._foundDefaultLiStack = [] # This stack of strings indicates what the current whitespace-handling # behavior should be. Each string in the stack is either "default" or # "preserve". When a new AIML element is encountered, a new string is # pushed onto the stack, based on the value of the element's "xml:space" # attribute (if absent, the top of the stack is pushed again). When # ending an element, pop an object off the stack. self._whitespaceBehaviorStack = ["default"] self._elemStack = [] self._locator = Locator() self.setDocumentLocator(self._locator) def getNumErrors(self): "Return the number of errors found while parsing the current document." return self._numParseErrors def setEncoding(self, encoding): """Set the text encoding to use when encoding strings read from XML. Defaults to 'UTF-8'. """ self._encoding = encoding def _location(self): "Return a string describing the current location in the source file." line = self._locator.getLineNumber() column = self._locator.getColumnNumber() return "(line %d, column %d)" % (line, column) def _pushWhitespaceBehavior(self, attr): """Push a new string onto the whitespaceBehaviorStack. The string's value is taken from the "xml:space" attribute, if it exists and has a legal value ("default" or "preserve"). Otherwise, the previous stack element is duplicated. """ assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!" try: if attr["xml:space"] == "default" or attr["xml:space"] == "preserve": self._whitespaceBehaviorStack.append(attr["xml:space"]) else: raise AimlParserError, "Invalid value for xml:space attribute "+self._location() except KeyError: self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1]) def startElementNS(self, name, qname, attr): print "QNAME:", qname print "NAME:", name uri,elem = name if (elem == "bot"): print "name:", attr.getValueByQName("name"), "a'ite?" self.startElement(elem, attr) pass def startElement(self, name, attr): # Wrapper around _startElement, which catches errors in _startElement() # and keeps going. # If we're inside an unknown element, ignore everything until we're # out again. if self._currentUnknown != "": return # If we're skipping the current category, ignore everything until # it's finished. if self._skipCurrentCategory: return # process this start-element. try: self._startElement(name, attr) except AimlParserError, msg: # Print the error message sys.stderr.write("PARSE ERROR: %s\n" % msg) self._numParseErrors += 1 # increment error count # In case of a parse error, if we're inside a category, skip it. if self._state >= self._STATE_InsideCategory: self._skipCurrentCategory = True def _startElement(self, name, attr): if name == "aiml": # tags are only legal in the OutsideAiml state if self._state != self._STATE_OutsideAiml: raise AimlParserError, "Unexpected tag "+self._location() self._state = self._STATE_InsideAiml self._insideTopic = False self._currentTopic = u"" try: self._version = attr["version"] except KeyError: # This SHOULD be a syntax error, but so many AIML sets out there are missing # "version" attributes that it just seems nicer to let it slide. #raise AimlParserError, "Missing 'version' attribute in tag "+self._location() #print "WARNING: Missing 'version' attribute in tag "+self._location() #print " Defaulting to version 1.0" self._version = "1.0" self._forwardCompatibleMode = (self._version != "1.0.1") self._pushWhitespaceBehavior(attr) # Not sure about this namespace business yet... #try: # self._namespace = attr["xmlns"] # if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1": # raise AimlParserError, "Incorrect namespace for AIML v1.0.1 "+self._location() #except KeyError: # if self._version != "1.0": # raise AimlParserError, "Missing 'version' attribute(s) in tag "+self._location() elif self._state == self._STATE_OutsideAiml: # If we're outside of an AIML element, we ignore all tags. return elif name == "topic": # tags are only legal in the InsideAiml state, and only # if we're not already inside a topic. if (self._state != self._STATE_InsideAiml) or self._insideTopic: raise AimlParserError, "Unexpected tag", self._location() try: self._currentTopic = unicode(attr['name']) except KeyError: raise AimlParserError, "Required \"name\" attribute missing in element "+self._location() self._insideTopic = True elif name == "category": # tags are only legal in the InsideAiml state if self._state != self._STATE_InsideAiml: raise AimlParserError, "Unexpected tag "+self._location() self._state = self._STATE_InsideCategory self._currentPattern = u"" self._currentThat = u"" # If we're not inside a topic, the topic is implicitly set to * if not self._insideTopic: self._currentTopic = u"*" self._elemStack = [] self._pushWhitespaceBehavior(attr) elif name == "pattern": # tags are only legal in the InsideCategory state if self._state != self._STATE_InsideCategory: raise AimlParserError, "Unexpected tag "+self._location() self._state = self._STATE_InsidePattern elif name == "that" and self._state == self._STATE_AfterPattern: # are legal either inside a