diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 4748a197..d114ddee 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -9,7 +9,6 @@ from .constants import entities from .constants import asciiLetters, asciiUpper2Lower from .constants import digits, hexDigits, EOF -from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters from ._inputstream import HTMLInputStream @@ -24,6 +23,71 @@ attributeMap = OrderedDict +class Token(object): + def __init__(self, data=None): + self.data = data + + +class Doctype(Token): + def __init__(self, name, publicId, systemId, correct): + self.name = name.translate(asciiUpper2Lower) + self.namespace = None + self.publicId = publicId + self.systemId = systemId + self.correct = correct + + +class Characters(Token): + pass + + +class SpaceCharacters(Token): + pass + + +class Tag(Token): + def __init__(self, name, attributes): + self.name = name.translate(asciiUpper2Lower) + self.namespace = None + self.attributes = attributeMap(attributes or {}) + self.self_closing = False + self.attribute_name = "" + self.attribute_value = "" + + def flushAttribute(self): + if self.attribute_name and self.attribute_name not in self.attributes: + self.attributes[self.attribute_name] = self.attribute_value + self.attribute_name = "" + self.attribute_value = "" + + def accumulateAttributeName(self, text): + self.attribute_name += text.translate(asciiUpper2Lower) + + def accumulateAttributeValue(self, text): + self.attribute_value += text + + +class StartTag(Tag): + def __init__(self, name, data=None): + super(StartTag, self).__init__(name, data) + self.self_closing_acknowledged = False + + +class EndTag(Tag): + def __init__(self, name, data=None): + super(EndTag, self).__init__(name, data) + + +class Comment(Token): + pass + + +class ParseError(Token): + def __init__(self, data, datavars=None): + self.data = data + self.datavars = datavars or {} + + class HTMLTokenizer(object): """ This 
class takes care of tokenizing HTML. @@ -64,14 +128,14 @@ def __iter__(self): # instead of True and the loop will terminate. while self.state(): while self.stream.errors: - yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} + yield ParseError(data=self.stream.errors.pop(0)) while self.tokenQueue: yield self.tokenQueue.popleft() def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. - If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. + If not present self.tokenQueue.append(ParseError) is invoked. """ allowed = digits @@ -95,15 +159,11 @@ def consumeNumberEntity(self, isHex): # Certain characters get replaced with others if charAsInt in replacementCharacters: char = replacementCharacters[charAsInt] - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "illegal-codepoint-for-numeric-entity", - "datavars": {"charAsInt": charAsInt}}) + self.tokenQueue.append(ParseError("illegal-codepoint-for-numeric-entity", {"charAsInt": charAsInt})) elif ((0xD800 <= charAsInt <= 0xDFFF) or (charAsInt > 0x10FFFF)): char = "\uFFFD" - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "illegal-codepoint-for-numeric-entity", - "datavars": {"charAsInt": charAsInt}}) + self.tokenQueue.append(ParseError("illegal-codepoint-for-numeric-entity", {"charAsInt": charAsInt})) else: # Should speed up this check somehow (e.g. 
move the set to a constant) if ((0x0001 <= charAsInt <= 0x0008) or @@ -119,10 +179,7 @@ def consumeNumberEntity(self, isHex): 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF])): - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": - "illegal-codepoint-for-numeric-entity", - "datavars": {"charAsInt": charAsInt}}) + self.tokenQueue.append(ParseError("illegal-codepoint-for-numeric-entity", {"charAsInt": charAsInt})) try: # Try/except needed as UCS-2 Python builds' unichar only works # within the BMP. @@ -134,8 +191,7 @@ def consumeNumberEntity(self, isHex): # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser. if c != ";": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "numeric-entity-without-semicolon"}) + self.tokenQueue.append(ParseError("numeric-entity-without-semicolon")) self.stream.unget(c) return char @@ -165,8 +221,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): output = self.consumeNumberEntity(hex) else: # No digits found - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "expected-numeric-entity"}) + self.tokenQueue.append(ParseError("expected-numeric-entity")) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) @@ -193,8 +248,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): if entityName is not None: if entityName[-1] != ";": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "named-entity-without-semicolon"}) + self.tokenQueue.append(ParseError("named-entity-without-semicolon")) if (entityName[-1] != ";" and fromAttribute and (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits or @@ -206,19 +260,18 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): self.stream.unget(charStack.pop()) output += "".join(charStack[entityLength:]) else: - self.tokenQueue.append({"type": 
tokenTypes["ParseError"], "data": - "expected-named-entity"}) + self.tokenQueue.append(ParseError("expected-named-entity")) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) if fromAttribute: - self.currentToken["data"][-1][1] += output + self.currentToken.accumulateAttributeValue(output) else: if output in spaceCharacters: - tokenType = "SpaceCharacters" + token = SpaceCharacters(output) else: - tokenType = "Characters" - self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) + token = Characters(output) + self.tokenQueue.append(token) def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". @@ -232,23 +285,15 @@ def emitCurrentToken(self): """ token = self.currentToken # Add token to the queue to be yielded - if (token["type"] in tagTokenTypes): - token["name"] = token["name"].translate(asciiUpper2Lower) - if token["type"] == tokenTypes["StartTag"]: - raw = token["data"] - data = attributeMap(raw) - if len(raw) > len(data): - # we had some duplicated attribute, fix so first wins - data.update(raw[::-1]) - token["data"] = data - - if token["type"] == tokenTypes["EndTag"]: - if token["data"]: - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "attributes-in-end-tag"}) - if token["selfClosing"]: - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "self-closing-flag-on-end-tag"}) + if isinstance(token, Tag): + if self.currentToken.attribute_name in self.currentToken.attributes: + self.tokenQueue.append(ParseError("duplicate-attribute")) + token.flushAttribute() + if isinstance(token, EndTag): + if token.attributes: + self.tokenQueue.append(ParseError("attributes-in-end-tag")) + if token.self_closing: + self.tokenQueue.append(ParseError("self-closing-flag-on-end-tag")) self.tokenQueue.append(token) self.state = self.dataState @@ -260,10 +305,8 @@ def dataState(self): elif data == "<": self.state = self.tagOpenState elif data == "\u0000": 
- self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\u0000"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\u0000")) elif data is EOF: # Tokenization ends. return False @@ -271,15 +314,13 @@ def dataState(self): # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. - self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": - data + self.stream.charsUntil(spaceCharacters, True)}) + self.tokenQueue.append(SpaceCharacters(data + self.stream.charsUntil(spaceCharacters, True))) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + chars}) + self.tokenQueue.append(Characters(data + chars)) return True def entityDataState(self): @@ -297,23 +338,19 @@ def rcdataState(self): # Tokenization ends. return False elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. 
- self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": - data + self.stream.charsUntil(spaceCharacters, True)}) + self.tokenQueue.append(SpaceCharacters(data + self.stream.charsUntil(spaceCharacters, True))) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + chars}) + self.tokenQueue.append(Characters(data + chars)) return True def characterReferenceInRcdata(self): @@ -326,17 +363,14 @@ def rawtextState(self): if data == "<": self.state = self.rawtextLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) elif data == EOF: # Tokenization ends. return False else: chars = self.stream.charsUntil(("<", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + chars}) + self.tokenQueue.append(Characters(data + chars)) return True def scriptDataState(self): @@ -344,17 +378,14 @@ def scriptDataState(self): if data == "<": self.state = self.scriptDataLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) elif data == EOF: # Tokenization ends. 
return False else: chars = self.stream.charsUntil(("<", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + chars}) + self.tokenQueue.append(Characters(data + chars)) return True def plaintextState(self): @@ -363,13 +394,10 @@ def plaintextState(self): # Tokenization ends. return False elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + self.stream.charsUntil("\u0000")}) + self.tokenQueue.append(Characters(data + self.stream.charsUntil("\u0000"))) return True def tagOpenState(self): @@ -379,30 +407,24 @@ def tagOpenState(self): elif data == "/": self.state = self.closeTagOpenState elif data in asciiLetters: - self.currentToken = {"type": tokenTypes["StartTag"], - "name": data, "data": [], - "selfClosing": False, - "selfClosingAcknowledged": False} + self.currentToken = StartTag(name=data) self.state = self.tagNameState elif data == ">": # XXX In theory it could be something besides a tag name. But # do we really care? - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-tag-name-but-got-right-bracket"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) + self.tokenQueue.append(ParseError("expected-tag-name-but-got-right-bracket")) + self.tokenQueue.append(Characters("<>")) self.state = self.dataState elif data == "?": # XXX In theory it could be something besides a tag name. But # do we really care? 
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-tag-name-but-got-question-mark"}) + self.tokenQueue.append(ParseError("expected-tag-name-but-got-question-mark")) self.stream.unget(data) self.state = self.bogusCommentState else: # XXX - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-tag-name"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(ParseError("expected-tag-name")) + self.tokenQueue.append(Characters("<")) self.stream.unget(data) self.state = self.dataState return True @@ -410,23 +432,18 @@ def tagOpenState(self): def closeTagOpenState(self): data = self.stream.char() if data in asciiLetters: - self.currentToken = {"type": tokenTypes["EndTag"], "name": data, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=data) self.state = self.tagNameState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-closing-tag-but-got-right-bracket"}) + self.tokenQueue.append(ParseError("expected-closing-tag-but-got-right-bracket")) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-closing-tag-but-got-eof"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""}) + self.tokenQueue.append(ParseError("expected-closing-tag-but-got-eof")) + self.tokenQueue.append(Characters("")) self.state = self.dataState else: # XXX data can be _'_... 
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-closing-tag-but-got-char", - "datavars": {"data": data}}) + self.tokenQueue.append(ParseError("expected-closing-tag-but-got-char", datavars={"data": data})) self.stream.unget(data) self.state = self.bogusCommentState return True @@ -438,17 +455,15 @@ def tagNameState(self): elif data == ">": self.emitCurrentToken() elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-tag-name"}) + self.tokenQueue.append(ParseError("eof-in-tag-name")) self.state = self.dataState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["name"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.name += "\uFFFD" else: - self.currentToken["name"] += data + self.currentToken.name += data.translate(asciiUpper2Lower) # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True @@ -459,7 +474,7 @@ def rcdataLessThanSignState(self): self.temporaryBuffer = "" self.state = self.rcdataEndTagOpenState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.stream.unget(data) self.state = self.rcdataState return True @@ -470,35 +485,29 @@ def rcdataEndTagOpenState(self): self.temporaryBuffer += data self.state = self.rcdataEndTagNameState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""}) + self.tokenQueue.append(Characters("")) self.stream.unget(data) self.state = self.rcdataState return True def rcdataEndTagNameState(self): - appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + name = self.temporaryBuffer.translate(asciiUpper2Lower) + appropriate = self.currentToken and self.currentToken.name == 
name data = self.stream.char() if data in spaceCharacters and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.beforeAttributeNameState elif data == "/" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.selfClosingStartTagState elif data == ">" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "" + self.temporaryBuffer}) + self.tokenQueue.append(Characters("" + self.temporaryBuffer)) self.stream.unget(data) self.state = self.rcdataState return True @@ -509,7 +518,7 @@ def rawtextLessThanSignState(self): self.temporaryBuffer = "" self.state = self.rawtextEndTagOpenState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.stream.unget(data) self.state = self.rawtextState return True @@ -520,35 +529,29 @@ def rawtextEndTagOpenState(self): self.temporaryBuffer += data self.state = self.rawtextEndTagNameState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""}) + self.tokenQueue.append(Characters("")) self.stream.unget(data) self.state = self.rawtextState return True def rawtextEndTagNameState(self): - appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + name = self.temporaryBuffer.translate(asciiUpper2Lower) + appropriate = self.currentToken and self.currentToken.name == name data = self.stream.char() if data in spaceCharacters 
and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.beforeAttributeNameState elif data == "/" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.selfClosingStartTagState elif data == ">" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "" + self.temporaryBuffer}) + self.tokenQueue.append(Characters("" + self.temporaryBuffer)) self.stream.unget(data) self.state = self.rawtextState return True @@ -559,10 +562,10 @@ def scriptDataLessThanSignState(self): self.temporaryBuffer = "" self.state = self.scriptDataEndTagOpenState elif data == "!": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "" + self.temporaryBuffer}) + self.tokenQueue.append(Characters("" + self.temporaryBuffer)) self.stream.unget(data) self.state = self.scriptDataState return True @@ -609,7 +606,7 @@ def scriptDataEndTagNameState(self): def scriptDataEscapeStartState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) 
self.state = self.scriptDataEscapeStartDashState else: self.stream.unget(data) @@ -619,7 +616,7 @@ def scriptDataEscapeStartState(self): def scriptDataEscapeStartDashState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) self.state = self.scriptDataEscapedDashDashState else: self.stream.unget(data) @@ -629,62 +626,55 @@ def scriptDataEscapeStartDashState(self): def scriptDataEscapedState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) self.state = self.scriptDataEscapedDashState elif data == "<": self.state = self.scriptDataEscapedLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) elif data == EOF: self.state = self.dataState else: chars = self.stream.charsUntil(("<", "-", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": - data + chars}) + self.tokenQueue.append(Characters(data + chars)) return True def scriptDataEscapedDashState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) self.state = self.scriptDataEscapedDashDashState elif data == "<": self.state = self.scriptDataEscapedLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) self.state = self.scriptDataEscapedState 
elif data == EOF: self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.state = self.scriptDataEscapedState return True def scriptDataEscapedDashDashState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) elif data == "<": self.state = self.scriptDataEscapedLessThanSignState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) + self.tokenQueue.append(Characters(">")) self.state = self.scriptDataState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) self.state = self.scriptDataEscapedState elif data == EOF: self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.state = self.scriptDataEscapedState return True @@ -694,11 +684,11 @@ def scriptDataEscapedLessThanSignState(self): self.temporaryBuffer = "" self.state = self.scriptDataEscapedEndTagOpenState elif data in asciiLetters: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) + self.tokenQueue.append(Characters("<" + data)) self.temporaryBuffer = data self.state = self.scriptDataDoubleEscapeStartState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.stream.unget(data) self.state = self.scriptDataEscapedState return True @@ -709,35 +699,29 @@ def scriptDataEscapedEndTagOpenState(self): self.temporaryBuffer = data self.state = self.scriptDataEscapedEndTagNameState else: - self.tokenQueue.append({"type": 
tokenTypes["Characters"], "data": ""}) + self.tokenQueue.append(Characters("")) self.stream.unget(data) self.state = self.scriptDataEscapedState return True def scriptDataEscapedEndTagNameState(self): - appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + name = self.temporaryBuffer.translate(asciiUpper2Lower) + appropriate = self.currentToken and self.currentToken.name == name data = self.stream.char() if data in spaceCharacters and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.beforeAttributeNameState elif data == "/" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.state = self.selfClosingStartTagState elif data == ">" and appropriate: - self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, - "data": [], "selfClosing": False} + self.currentToken = EndTag(name=name) self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "" + self.temporaryBuffer}) + self.tokenQueue.append(Characters("" + self.temporaryBuffer)) self.stream.unget(data) self.state = self.scriptDataEscapedState return True @@ -745,13 +729,13 @@ def scriptDataEscapedEndTagNameState(self): def scriptDataDoubleEscapeStartState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataDoubleEscapedState else: self.state = self.scriptDataEscapedState elif data in asciiLetters: - self.tokenQueue.append({"type": 
tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.temporaryBuffer += data else: self.stream.unget(data) @@ -761,76 +745,67 @@ def scriptDataDoubleEscapeStartState(self): def scriptDataDoubleEscapedState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) self.state = self.scriptDataDoubleEscapedDashState elif data == "<": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) elif data == EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-script-in-script"}) + self.tokenQueue.append(ParseError("eof-in-script-in-script")) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) return True def scriptDataDoubleEscapedDashState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) self.state = self.scriptDataDoubleEscapedDashDashState elif data == "<": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) 
+ self.tokenQueue.append(Characters("\uFFFD")) self.state = self.scriptDataDoubleEscapedState elif data == EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-script-in-script"}) + self.tokenQueue.append(ParseError("eof-in-script-in-script")) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedDashDashState(self): data = self.stream.char() if data == "-": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.tokenQueue.append(Characters("-")) elif data == "<": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.tokenQueue.append(Characters("<")) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) + self.tokenQueue.append(Characters(">")) self.state = self.scriptDataState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": "\uFFFD"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.tokenQueue.append(Characters("\uFFFD")) self.state = self.scriptDataDoubleEscapedState elif data == EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-script-in-script"}) + self.tokenQueue.append(ParseError("eof-in-script-in-script")) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedLessThanSignState(self): data = self.stream.char() if data == "/": - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) + 
self.tokenQueue.append(Characters("/")) self.temporaryBuffer = "" self.state = self.scriptDataDoubleEscapeEndState else: @@ -841,13 +816,13 @@ def scriptDataDoubleEscapedLessThanSignState(self): def scriptDataDoubleEscapeEndState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataEscapedState else: self.state = self.scriptDataDoubleEscapedState elif data in asciiLetters: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.tokenQueue.append(Characters(data)) self.temporaryBuffer += data else: self.stream.unget(data) @@ -859,83 +834,55 @@ def beforeAttributeNameState(self): if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: - self.currentToken["data"].append([data, ""]) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState elif data == ">": self.emitCurrentToken() elif data == "/": self.state = self.selfClosingStartTagState elif data in ("'", '"', "=", "<"): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "invalid-character-in-attribute-name"}) - self.currentToken["data"].append([data, ""]) + self.tokenQueue.append(ParseError("invalid-character-in-attribute-name")) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"].append(["\uFFFD", ""]) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName("\uFFFD") self.state = self.attributeNameState elif data is EOF: - 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-attribute-name-but-got-eof"}) + self.tokenQueue.append(ParseError("expected-attribute-name-but-got-eof")) self.state = self.dataState else: - self.currentToken["data"].append([data, ""]) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState return True def attributeNameState(self): data = self.stream.char() - leavingThisState = True - emitToken = False if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: - self.currentToken["data"][-1][0] += data +\ - self.stream.charsUntil(asciiLetters, True) - leavingThisState = False + self.currentToken.accumulateAttributeName(data) elif data == ">": - # XXX If we emit here the attributes are converted to a dict - # without being checked and when the code below runs we error - # because data is a dict not a list - emitToken = True + self.emitCurrentToken() elif data in spaceCharacters: self.state = self.afterAttributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"][-1][0] += "\uFFFD" - leavingThisState = False + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.accumulateAttributeName("\uFFFD") elif data in ("'", '"', "<"): - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": - "invalid-character-in-attribute-name"}) - self.currentToken["data"][-1][0] += data - leavingThisState = False + self.tokenQueue.append(ParseError("invalid-character-in-attribute-name")) + self.currentToken.accumulateAttributeName(data) elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "eof-in-attribute-name"}) + self.tokenQueue.append(ParseError("eof-in-attribute-name")) self.state = self.dataState else: - 
self.currentToken["data"][-1][0] += data - leavingThisState = False - - if leavingThisState: - # Attributes are not dropped at this stage. That happens when the - # start tag token is emitted so values can still be safely appended - # to attributes, but we do want to report the parse error in time. - self.currentToken["data"][-1][0] = ( - self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) - for name, _ in self.currentToken["data"][:-1]: - if self.currentToken["data"][-1][0] == name: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "duplicate-attribute"}) - break - # XXX Fix for above XXX - if emitToken: - self.emitCurrentToken() + self.currentToken.accumulateAttributeName(data) return True def afterAttributeNameState(self): @@ -947,26 +894,27 @@ def afterAttributeNameState(self): elif data == ">": self.emitCurrentToken() elif data in asciiLetters: - self.currentToken["data"].append([data, ""]) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"].append(["\uFFFD", ""]) + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName("\uFFFD") self.state = self.attributeNameState elif data in ("'", '"', "<"): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "invalid-character-after-attribute-name"}) - self.currentToken["data"].append([data, ""]) + self.tokenQueue.append(ParseError("invalid-character-after-attribute-name")) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-end-of-tag-but-got-eof"}) + 
self.tokenQueue.append(ParseError("expected-end-of-tag-but-got-eof")) self.state = self.dataState else: - self.currentToken["data"].append([data, ""]) + self.currentToken.flushAttribute() + self.currentToken.accumulateAttributeName(data) self.state = self.attributeNameState return True @@ -982,25 +930,21 @@ def beforeAttributeValueState(self): elif data == "'": self.state = self.attributeValueSingleQuotedState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-attribute-value-but-got-right-bracket"}) + self.tokenQueue.append(ParseError("expected-attribute-value-but-got-right-bracket")) self.emitCurrentToken() elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"][-1][1] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.accumulateAttributeValue("\uFFFD") self.state = self.attributeValueUnQuotedState elif data in ("=", "<", "`"): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "equals-in-unquoted-attribute-value"}) - self.currentToken["data"][-1][1] += data + self.tokenQueue.append(ParseError("equals-in-unquoted-attribute-value")) + self.currentToken.accumulateAttributeValue(data) self.state = self.attributeValueUnQuotedState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-attribute-value-but-got-eof"}) + self.tokenQueue.append(ParseError("expected-attribute-value-but-got-eof")) self.state = self.dataState else: - self.currentToken["data"][-1][1] += data + self.currentToken.accumulateAttributeValue(data) self.state = self.attributeValueUnQuotedState return True @@ -1011,16 +955,13 @@ def attributeValueDoubleQuotedState(self): elif data == "&": self.processEntityInAttribute('"') elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"][-1][1] 
+= "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.accumulateAttributeValue("\uFFFD") elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-attribute-value-double-quote"}) + self.tokenQueue.append(ParseError("eof-in-attribute-value-double-quote")) self.state = self.dataState else: - self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("\"", "&", "\u0000")) + self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("\"", "&", "\u0000"))) return True def attributeValueSingleQuotedState(self): @@ -1030,16 +971,13 @@ def attributeValueSingleQuotedState(self): elif data == "&": self.processEntityInAttribute("'") elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"][-1][1] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.accumulateAttributeValue("\uFFFD") elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-attribute-value-single-quote"}) + self.tokenQueue.append(ParseError("eof-in-attribute-value-single-quote")) self.state = self.dataState else: - self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("'", "&", "\u0000")) + self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("'", "&", "\u0000"))) return True def attributeValueUnQuotedState(self): @@ -1051,20 +989,17 @@ def attributeValueUnQuotedState(self): elif data == ">": self.emitCurrentToken() elif data in ('"', "'", "=", "<", "`"): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-character-in-unquoted-attribute-value"}) - self.currentToken["data"][-1][1] += data + self.tokenQueue.append(ParseError("unexpected-character-in-unquoted-attribute-value")) + self.currentToken.accumulateAttributeValue(data) elif data == "\u0000": - self.tokenQueue.append({"type": 
tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"][-1][1] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.accumulateAttributeValue("\uFFFD") elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-attribute-value-no-quotes"}) + self.tokenQueue.append(ParseError("eof-in-attribute-value-no-quotes")) self.state = self.dataState else: - self.currentToken["data"][-1][1] += data + self.stream.charsUntil( - frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) + self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil( + frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)) return True def afterAttributeValueState(self): @@ -1076,13 +1011,11 @@ def afterAttributeValueState(self): elif data == "/": self.state = self.selfClosingStartTagState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-EOF-after-attribute-value"}) + self.tokenQueue.append(ParseError("unexpected-EOF-after-attribute-value")) self.stream.unget(data) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-character-after-attribute-value"}) + self.tokenQueue.append(ParseError("unexpected-character-after-attribute-value")) self.stream.unget(data) self.state = self.beforeAttributeNameState return True @@ -1090,17 +1023,14 @@ def afterAttributeValueState(self): def selfClosingStartTagState(self): data = self.stream.char() if data == ">": - self.currentToken["selfClosing"] = True + self.currentToken.self_closing = True self.emitCurrentToken() elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": - "unexpected-EOF-after-solidus-in-tag"}) + self.tokenQueue.append(ParseError("unexpected-EOF-after-solidus-in-tag")) self.stream.unget(data) self.state = self.dataState else: - 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-character-after-solidus-in-tag"}) + self.tokenQueue.append(ParseError("unexpected-character-after-solidus-in-tag")) self.stream.unget(data) self.state = self.beforeAttributeNameState return True @@ -1111,8 +1041,7 @@ def bogusCommentState(self): # and emit it. data = self.stream.charsUntil(">") data = data.replace("\u0000", "\uFFFD") - self.tokenQueue.append( - {"type": tokenTypes["Comment"], "data": data}) + self.tokenQueue.append(Comment(data)) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. @@ -1125,7 +1054,7 @@ def markupDeclarationOpenState(self): if charStack[-1] == "-": charStack.append(self.stream.char()) if charStack[-1] == "-": - self.currentToken = {"type": tokenTypes["Comment"], "data": ""} + self.currentToken = Comment("") self.state = self.commentStartState return True elif charStack[-1] in ('d', 'D'): @@ -1137,10 +1066,7 @@ def markupDeclarationOpenState(self): matched = False break if matched: - self.currentToken = {"type": tokenTypes["Doctype"], - "name": "", - "publicId": None, "systemId": None, - "correct": True} + self.currentToken = Doctype(name="", publicId=None, systemId=None, correct=True) self.state = self.doctypeState return True elif (charStack[-1] == "[" and @@ -1157,8 +1083,7 @@ def markupDeclarationOpenState(self): self.state = self.cdataSectionState return True - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-dashes-or-doctype"}) + self.tokenQueue.append(ParseError("expected-dashes-or-doctype")) while charStack: self.stream.unget(charStack.pop()) @@ -1170,21 +1095,18 @@ def commentStartState(self): if data == "-": self.state = self.commentStartDashState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data 
+= "\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "incorrect-comment"}) + self.tokenQueue.append(ParseError("incorrect-comment")) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-comment"}) + self.tokenQueue.append(ParseError("eof-in-comment")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["data"] += data + self.currentToken.data += data self.state = self.commentState return True @@ -1193,21 +1115,18 @@ def commentStartDashState(self): if data == "-": self.state = self.commentEndState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "-\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data += "-\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "incorrect-comment"}) + self.tokenQueue.append(ParseError("incorrect-comment")) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-comment"}) + self.tokenQueue.append(ParseError("eof-in-comment")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["data"] += "-" + data + self.currentToken.data += "-" + data self.state = self.commentState return True @@ -1216,16 +1135,14 @@ def commentState(self): if data == "-": self.state = self.commentEndDashState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data += "\uFFFD" elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": 
"eof-in-comment"}) + self.tokenQueue.append(ParseError("eof-in-comment")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["data"] += data + \ + self.currentToken.data += data + \ self.stream.charsUntil(("-", "\u0000")) return True @@ -1234,17 +1151,15 @@ def commentEndDashState(self): if data == "-": self.state = self.commentEndState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "-\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data += "-\uFFFD" self.state = self.commentState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-comment-end-dash"}) + self.tokenQueue.append(ParseError("eof-in-comment-end-dash")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["data"] += "-" + data + self.currentToken.data += "-" + data self.state = self.commentState return True @@ -1254,28 +1169,23 @@ def commentEndState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "--\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data += "--\uFFFD" self.state = self.commentState elif data == "!": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-bang-after-double-dash-in-comment"}) + self.tokenQueue.append(ParseError("unexpected-bang-after-double-dash-in-comment")) self.state = self.commentEndBangState elif data == "-": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-dash-after-double-dash-in-comment"}) - self.currentToken["data"] += data + self.tokenQueue.append(ParseError("unexpected-dash-after-double-dash-in-comment")) + self.currentToken.data += 
data elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-comment-double-dash"}) + self.tokenQueue.append(ParseError("eof-in-comment-double-dash")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: # XXX - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-comment"}) - self.currentToken["data"] += "--" + data + self.tokenQueue.append(ParseError("unexpected-char-in-comment")) + self.currentToken.data += "--" + data self.state = self.commentState return True @@ -1285,20 +1195,18 @@ def commentEndBangState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "-": - self.currentToken["data"] += "--!" + self.currentToken.data += "--!" self.state = self.commentEndDashState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["data"] += "--!\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.data += "--!\uFFFD" self.state = self.commentState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(ParseError("eof-in-comment-end-bang-state")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["data"] += "--!" + data + self.currentToken.data += "--!" 
+ data self.state = self.commentState return True @@ -1307,14 +1215,12 @@ def doctypeState(self): if data in spaceCharacters: self.state = self.beforeDoctypeNameState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-doctype-name-but-got-eof"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("expected-doctype-name-but-got-eof")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "need-space-after-doctype"}) + self.tokenQueue.append(ParseError("need-space-after-doctype")) self.stream.unget(data) self.state = self.beforeDoctypeNameState return True @@ -1324,50 +1230,42 @@ def beforeDoctypeNameState(self): if data in spaceCharacters: pass elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-doctype-name-but-got-right-bracket"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("expected-doctype-name-but-got-right-bracket")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["name"] = "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.name = "\uFFFD" self.state = self.doctypeNameState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-doctype-name-but-got-eof"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("expected-doctype-name-but-got-eof")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] = data + self.currentToken.name = data.translate(asciiUpper2Lower) self.state = self.doctypeNameState return True def 
doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif data == ">": - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["name"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.name += "\uFFFD" self.state = self.doctypeNameState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype-name"}) - self.currentToken["correct"] = False - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(ParseError("eof-in-doctype-name")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] += data + self.currentToken.name += data.translate(asciiUpper2Lower) return True def afterDoctypeNameState(self): @@ -1378,10 +1276,9 @@ def afterDoctypeNameState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.currentToken["correct"] = False + self.currentToken.correct = False self.stream.unget(data) - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) + self.tokenQueue.append(ParseError("eof-in-doctype")) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: @@ -1413,10 +1310,8 @@ def afterDoctypeNameState(self): # discarded; only the latest character might be '>' or EOF # and needs to be ungetted self.stream.unget(data) - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-space-or-right-bracket-in-doctype", "datavars": - {"data": data}}) - 
self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("expected-space-or-right-bracket-in-doctype", datavars={"data": data})) + self.currentToken.correct = False self.state = self.bogusDoctypeState return True @@ -1426,14 +1321,12 @@ def afterDoctypePublicKeywordState(self): if data in spaceCharacters: self.state = self.beforeDoctypePublicIdentifierState elif data in ("'", '"'): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) self.stream.unget(data) self.state = self.beforeDoctypePublicIdentifierState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: @@ -1446,27 +1339,24 @@ def beforeDoctypePublicIdentifierState(self): if data in spaceCharacters: pass elif data == "\"": - self.currentToken["publicId"] = "" + self.currentToken.publicId = "" self.state = self.doctypePublicIdentifierDoubleQuotedState elif data == "'": - self.currentToken["publicId"] = "" + self.currentToken.publicId = "" self.state = self.doctypePublicIdentifierSingleQuotedState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-end-of-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-end-of-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState 
else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.correct = False self.state = self.bogusDoctypeState return True @@ -1475,23 +1365,20 @@ def doctypePublicIdentifierDoubleQuotedState(self): if data == "\"": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["publicId"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.publicId += "\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-end-of-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-end-of-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["publicId"] += data + self.currentToken.publicId += data return True def doctypePublicIdentifierSingleQuotedState(self): @@ -1499,23 +1386,20 @@ def doctypePublicIdentifierSingleQuotedState(self): if data == "'": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["publicId"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.publicId += "\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - 
"unexpected-end-of-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-end-of-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["publicId"] += data + self.currentToken.publicId += data return True def afterDoctypePublicIdentifierState(self): @@ -1526,25 +1410,21 @@ def afterDoctypePublicIdentifierState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["systemId"] = "" + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["systemId"] = "" + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + 
self.currentToken.correct = False self.state = self.bogusDoctypeState return True @@ -1556,21 +1436,19 @@ def betweenDoctypePublicAndSystemIdentifiersState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': - self.currentToken["systemId"] = "" + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": - self.currentToken["systemId"] = "" + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.correct = False self.state = self.bogusDoctypeState return True @@ -1579,14 +1457,12 @@ def afterDoctypeSystemKeywordState(self): if data in spaceCharacters: self.state = self.beforeDoctypeSystemIdentifierState elif data in ("'", '"'): - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) self.stream.unget(data) self.state = self.beforeDoctypeSystemIdentifierState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: @@ -1599,27 +1475,24 @@ def beforeDoctypeSystemIdentifierState(self): if data in spaceCharacters: pass elif data == "\"": - 
self.currentToken["systemId"] = "" + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": - self.currentToken["systemId"] = "" + self.currentToken.systemId = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) + self.currentToken.correct = False self.state = self.bogusDoctypeState return True @@ -1628,23 +1501,20 @@ def doctypeSystemIdentifierDoubleQuotedState(self): if data == "\"": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["systemId"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.systemId += "\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-end-of-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-end-of-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": 
tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["systemId"] += data + self.currentToken.systemId += data return True def doctypeSystemIdentifierSingleQuotedState(self): @@ -1652,23 +1522,20 @@ def doctypeSystemIdentifierSingleQuotedState(self): if data == "'": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) - self.currentToken["systemId"] += "\uFFFD" + self.tokenQueue.append(ParseError("invalid-codepoint")) + self.currentToken.systemId += "\uFFFD" elif data == ">": - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-end-of-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("unexpected-end-of-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["systemId"] += data + self.currentToken.systemId += data return True def afterDoctypeSystemIdentifierState(self): @@ -1679,14 +1546,12 @@ def afterDoctypeSystemIdentifierState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-doctype"}) - self.currentToken["correct"] = False + self.tokenQueue.append(ParseError("eof-in-doctype")) + self.currentToken.correct = False self.tokenQueue.append(self.currentToken) 
self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "unexpected-char-in-doctype"}) + self.tokenQueue.append(ParseError("unexpected-char-in-doctype")) self.state = self.bogusDoctypeState return True @@ -1725,11 +1590,9 @@ def cdataSectionState(self): nullCount = data.count("\u0000") if nullCount > 0: for _ in range(nullCount): - self.tokenQueue.append({"type": tokenTypes["ParseError"], - "data": "invalid-codepoint"}) + self.tokenQueue.append(ParseError("invalid-codepoint")) data = data.replace("\u0000", "\uFFFD") if data: - self.tokenQueue.append({"type": tokenTypes["Characters"], - "data": data}) + self.tokenQueue.append(Characters(data)) self.state = self.dataState return True diff --git a/html5lib/constants.py b/html5lib/constants.py index fe3e237c..9f89d616 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -2918,20 +2918,6 @@ 0x9F: "\u0178", } -tokenTypes = { - "Doctype": 0, - "Characters": 1, - "SpaceCharacters": 2, - "StartTag": 3, - "EndTag": 4, - "EmptyTag": 5, - "Comment": 6, - "ParseError": 7 -} - -tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"], - tokenTypes["EmptyTag"]]) - prefixes = {v: k for k, v in namespaces.items()} prefixes["http://www.w3.org/1998/Math/MathML"] = "math" diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 74d829d9..159b04cf 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -4,7 +4,18 @@ import types from . import _inputstream -from . import _tokenizer +from ._tokenizer import ( + attributeMap, + HTMLTokenizer, + Characters, + SpaceCharacters, + StartTag, + EndTag, + Comment, + Doctype, + ParseError as TokenizerParseError, + Tag, +) from . 
import treebuilders from .treebuilders.base import Marker @@ -13,7 +24,6 @@ from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, - tokenTypes, tagTokenTypes, namespaces, htmlIntegrationPointElements, mathmlTextIntegrationPointElements, adjustForeignAttributes as adjustForeignAttributesMap, @@ -126,7 +136,7 @@ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kw self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting - self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) + self.tokenizer = HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: @@ -193,13 +203,6 @@ def isMathMLTextIntegrationPoint(self, element): return (element.namespace, element.name) in mathmlTextIntegrationPointElements def mainLoop(self): - CharactersToken = tokenTypes["Characters"] - SpaceCharactersToken = tokenTypes["SpaceCharacters"] - StartTagToken = tokenTypes["StartTag"] - EndTagToken = tokenTypes["EndTag"] - CommentToken = tokenTypes["Comment"] - DoctypeToken = tokenTypes["Doctype"] - ParseErrorToken = tokenTypes["ParseError"] for token in self.tokenizer: prev_token = None @@ -210,45 +213,43 @@ def mainLoop(self): currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None - type = new_token["type"] - - if type == ParseErrorToken: - self.parseError(new_token["data"], new_token.get("datavars", {})) + if isinstance(new_token, TokenizerParseError): + self.parseError(new_token.data, new_token.datavars) new_token = None else: if (len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and - ((type == StartTagToken and - token["name"] not in frozenset(["mglyph", "malignmark"])) or - type in (CharactersToken, SpaceCharactersToken))) or + ((token.__class__ == StartTag and + token.name not in 
frozenset(["mglyph", "malignmark"])) or + token.__class__ in (Characters, SpaceCharacters))) or (currentNodeNamespace == namespaces["mathml"] and currentNodeName == "annotation-xml" and - type == StartTagToken and - token["name"] == "svg") or + token.__class__ == StartTag and + token.name == "svg") or (self.isHTMLIntegrationPoint(currentNode) and - type in (StartTagToken, CharactersToken, SpaceCharactersToken))): + token.__class__ in (StartTag, Characters, SpaceCharacters))): phase = self.phase else: phase = self.phases["inForeignContent"] - if type == CharactersToken: + if isinstance(new_token, Characters): new_token = phase.processCharacters(new_token) - elif type == SpaceCharactersToken: + elif isinstance(new_token, SpaceCharacters): new_token = phase.processSpaceCharacters(new_token) - elif type == StartTagToken: + elif isinstance(new_token, StartTag): new_token = phase.processStartTag(new_token) - elif type == EndTagToken: + elif isinstance(new_token, EndTag): new_token = phase.processEndTag(new_token) - elif type == CommentToken: + elif isinstance(new_token, Comment): new_token = phase.processComment(new_token) - elif type == DoctypeToken: + elif isinstance(new_token, Doctype): new_token = phase.processDoctype(new_token) - if (type == StartTagToken and prev_token["selfClosing"] and - not prev_token["selfClosingAcknowledged"]): + if (isinstance(new_token, StartTag) and prev_token.self_closing and + not prev_token.self_closing_acknowledged): self.parseError("non-void-element-with-trailing-solidus", - {"name": prev_token["name"]}) + {"name": prev_token.name}) # When the loop finishes it's EOF reprocess = True @@ -397,14 +398,12 @@ def parseRCDataRawtext(self, token, contentType): def getPhases(debug): def log(function): """Logger that records which phase processes each token""" - type_names = {value: key for key, value in tokenTypes.items()} - def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] - info 
= {"type": type_names[token['type']]} - if token['type'] in tagTokenTypes: - info["name"] = token['name'] + info = {"type": token.__class__.__name__} + if isinstance(token, Tag): + info["name"] = token.name self.parser.log.append((self.parser.tokenizer.state.__name__, self.parser.phase.__class__.__name__, @@ -446,16 +445,16 @@ def processDoctype(self, token): self.parser.parseError("unexpected-doctype") def processCharacters(self, token): - self.tree.insertText(token["data"]) + self.tree.insertText(token.data) def processSpaceCharacters(self, token): - self.tree.insertText(token["data"]) + self.tree.insertText(token.data) def processStartTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] + name = token.name # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) if name in self.__startTagCache: @@ -469,11 +468,11 @@ def processStartTag(self, token): return func(token) def startTagHtml(self, token): - if not self.parser.firstStartTag and token["name"] == "html": + if not self.parser.firstStartTag and token.name == "html": self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). 
- for attr, value in token["data"].items(): + for attr, value in token.attributes.items(): if attr not in self.tree.openElements[0].attributes: self.tree.openElements[0].attributes[attr] = value self.parser.firstStartTag = False @@ -482,7 +481,7 @@ def processEndTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] + name = token.name # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) if name in self.__endTagCache: @@ -505,10 +504,10 @@ def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - correct = token["correct"] + name = token.name + publicId = token.publicId + systemId = token.systemId + correct = token.correct if (name != "html" or publicId is not None or systemId is not None and systemId != "about:legacy-compat"): @@ -522,7 +521,7 @@ def processDoctype(self, token): if publicId != "": publicId = publicId.translate(asciiUpper2Lower) - if (not correct or token["name"] != "html" or + if (not correct or token.name != "html" or publicId.startswith( ("+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", @@ -610,13 +609,13 @@ def processCharacters(self, token): def processStartTag(self, token): self.parser.parseError("expected-doctype-but-got-start-tag", - {"name": token["name"]}) + {"name": token.name}) self.anythingElse() return token def processEndTag(self, token): self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) + {"name": token.name}) self.anythingElse() return token @@ -630,7 +629,7 @@ class BeforeHtmlPhase(Phase): # helper 
methods def insertHtmlElement(self): - self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.tree.insertRoot(impliedTagToken("html", StartTag)) self.parser.phase = self.parser.phases["beforeHead"] # other @@ -649,15 +648,15 @@ def processCharacters(self, token): return token def processStartTag(self, token): - if token["name"] == "html": + if token.name == "html": self.parser.firstStartTag = True self.insertHtmlElement() return token def processEndTag(self, token): - if token["name"] not in ("head", "body", "html", "br"): + if token.name not in ("head", "body", "html", "br"): self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) + {"name": token.name}) else: self.insertHtmlElement() return token @@ -666,14 +665,14 @@ class BeforeHeadPhase(Phase): __slots__ = tuple() def processEOF(self): - self.startTagHead(impliedTagToken("head", "StartTag")) + self.startTagHead(impliedTagToken("head", StartTag)) return True def processSpaceCharacters(self, token): pass def processCharacters(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) + self.startTagHead(impliedTagToken("head", StartTag)) return token def startTagHtml(self, token): @@ -685,16 +684,16 @@ def startTagHead(self, token): self.parser.phase = self.parser.phases["inHead"] def startTagOther(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) + self.startTagHead(impliedTagToken("head", StartTag)) return token def endTagImplyHead(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) + self.startTagHead(impliedTagToken("head", StartTag)) return token def endTagOther(self, token): self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + {"name": token.name}) startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), @@ -728,14 +727,14 @@ def startTagHead(self, token): def startTagBaseLinkCommand(self, token): self.tree.insertElement(token) self.tree.openElements.pop() - 
token["selfClosingAcknowledged"] = True + token.self_closing_acknowledged = True def startTagMeta(self, token): self.tree.insertElement(token) self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True + token.self_closing_acknowledged = True - attributes = token["data"] + attributes = token.attributes if self.parser.tokenizer.stream.charEncoding[1] == "tentative": if "charset" in attributes: self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) @@ -785,7 +784,7 @@ def endTagHtmlBodyBr(self, token): return token def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-end-tag", {"name": token.name}) def anythingElse(self): self.endTagHead(impliedTagToken("head")) @@ -835,10 +834,10 @@ def startTagBaseLinkCommand(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagHeadNoscript(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-start-tag", {"name": token.name}) def startTagOther(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token.name}) self.anythingElse() return token @@ -848,12 +847,12 @@ def endTagNoscript(self, token): self.parser.phase = self.parser.phases["inHead"] def endTagBr(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token.name}) self.anythingElse() return token def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-end-tag", {"name": token.name}) def anythingElse(self): # Caller must raise parse error first! 
@@ -897,7 +896,7 @@ def startTagFrameset(self, token): def startTagFromHead(self, token): self.parser.parseError("unexpected-start-tag-out-of-my-head", - {"name": token["name"]}) + {"name": token.name}) self.tree.openElements.append(self.tree.headPointer) self.parser.phases["inHead"].processStartTag(token) for node in self.tree.openElements[::-1]: @@ -906,7 +905,7 @@ def startTagFromHead(self, token): break def startTagHead(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-start-tag", {"name": token.name}) def startTagOther(self, token): self.anythingElse() @@ -917,10 +916,10 @@ def endTagHtmlBodyBr(self, token): return token def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + self.parser.parseError("unexpected-end-tag", {"name": token.name}) def anythingElse(self): - self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.tree.insertElement(impliedTagToken("body", StartTag)) self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True @@ -984,7 +983,7 @@ def processEOF(self): def processSpaceCharactersDropNewline(self, token): # Sometimes (start of
<pre>, <listing>, and <textarea>