diff --git a/jvm/src/test/resources/scala/xml/utf16.xml b/jvm/src/test/resources/scala/xml/utf16.xml new file mode 100644 index 00000000..52c4fbaa Binary files /dev/null and b/jvm/src/test/resources/scala/xml/utf16.xml differ diff --git a/jvm/src/test/resources/scala/xml/utf8.xml b/jvm/src/test/resources/scala/xml/utf8.xml new file mode 100644 index 00000000..bb59f58c --- /dev/null +++ b/jvm/src/test/resources/scala/xml/utf8.xml @@ -0,0 +1,2 @@ + + diff --git a/jvm/src/test/scala/scala/xml/XMLTest.scala b/jvm/src/test/scala/scala/xml/XMLTest.scala index 49ca1597..9d14219b 100644 --- a/jvm/src/test/scala/scala/xml/XMLTest.scala +++ b/jvm/src/test/scala/scala/xml/XMLTest.scala @@ -1,9 +1,10 @@ package scala.xml import org.junit.{Test => UnitTest} -import org.junit.Assert.{assertEquals, assertFalse, assertTrue} +import org.junit.Assert.{assertEquals, assertFalse, assertNull, assertThrows, assertTrue} import java.io.StringWriter import java.io.ByteArrayOutputStream +import java.net.URL import scala.xml.dtd.{DocType, PublicID} import scala.xml.parsing.ConstructingParser import scala.xml.Utility.sort @@ -681,6 +682,8 @@ class XMLTestJVM { assertTrue(gotAnError) } + def resourceUrl(resourceName: String): URL = getClass.getResource(s"$resourceName.xml") + // Here we see that opening InputStream prematurely, as was done previously, breaks XInclude. 
@UnitTest(expected = classOf[org.xml.sax.SAXParseException]) def xIncludeNeedsSystemId(): Unit = { val parserFactory = xercesInternal @@ -688,7 +691,7 @@ class XMLTestJVM { parserFactory.setXIncludeAware(true) XML .withSAXParser(parserFactory.newSAXParser) - .load(getClass.getResource("site.xml").openStream()) + .load(resourceUrl("site").openStream()) .toString } @@ -703,7 +706,7 @@ class XMLTestJVM { parserFactory.setXIncludeAware(true) val actual: String = XML .withSAXParser(parserFactory.newSAXParser) - .load(getClass.getResource(resourceName)) + .load(resourceUrl(resourceName)) .toString assertEquals(expected, actual) @@ -718,8 +721,8 @@ class XMLTestJVM { | |""".stripMargin - @UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer.xml", includerExpected) - @UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer.xml", includerExpected) + @UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer", includerExpected) + @UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer", includerExpected) // And here we demonstrate that both external and built-in Xerces report incorrect `xml:base` // when the XML file included contains its own include, and included files are not in the same directory: @@ -750,8 +753,170 @@ class XMLTestJVM { // // I find it utterly incomprehensible that foundational library shipped with JDK and used everywhere // has a bug in its core functionality for years and it never gets fixed, but sadly, it is the state of affairs: - @UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site.xml", siteUnfortunatelyExpected) - @UnitTest def xIncludeFailWithInternalXerces(): Unit = check(xercesInternal, "site.xml", siteUnfortunatelyExpected) + @UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site", siteUnfortunatelyExpected) + @UnitTest def xIncludeFailWithInternalXerces(): Unit = 
check(xercesInternal, "site", siteUnfortunatelyExpected) + + @UnitTest + def documentBaseURI(): Unit = { + val url: URL = resourceUrl("site") + // XMLLoader returns the document's baseURI: + assert(XML.withSAXParser(xercesInternal.newSAXParser).loadDocument(url).baseURI.endsWith("/test-classes/scala/xml/site.xml")) + assert(XML.withSAXParser(xercesExternal.newSAXParser).loadDocument(url).baseURI.endsWith("/test-classes/scala/xml/site.xml")) + // ConstructingParser does not return it of course: since it uses scala.io.Source it has no idea where is the XML coming from: + assertNull(ConstructingParser.fromSource(scala.io.Source.fromURI(url.toURI), preserveWS = false).document().baseURI) + } + + @UnitTest + def xmlStandAlone(): Unit = { + val standAlone: String = s"""""" + val nonStandAlone: String = s"""""" + val default: String = s"""""" + val noXmlDeclaration: String = s"""""" + + // ConstructingParser returns standAlone status of the document straight from the `xml` declaration: + assertEquals(Some(true ), ConstructingParser.fromSource(scala.io.Source.fromString(standAlone), preserveWS = false).document().standAlone) + assertEquals(Some(false), ConstructingParser.fromSource(scala.io.Source.fromString(nonStandAlone), preserveWS = false).document().standAlone) + assertTrue(ConstructingParser.fromSource(scala.io.Source.fromString(default), preserveWS = false).document().standAlone.isEmpty) + // ConstructingParser incorrectly returns null standAlone value when the document does not have the xml declaration: + assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().standAlone) + + // XMLLoader returns standAlone status of the document straight from the `xml` declaration: + assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(standAlone).standAlone.contains(true)) + assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(nonStandAlone).standAlone.contains(false)) + 
assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(default).standAlone.contains(false)) + assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(noXmlDeclaration).standAlone.contains(false)) + } + + @UnitTest + def xmlVersion(): Unit = { + val xml10 = s"""""" + val xml11 = s"""""" + val noXmlDeclaration: String = s"""""" + + // ConstructingParser returns XML version of the document straight from the `xml` declaration for version="1.0": + assertEquals(Some("1.0"), ConstructingParser.fromSource(scala.io.Source.fromString(xml10), preserveWS = false).document().version) + // ConstructingParser returns incorrect version value when the the version is "1.1" (and prints "cannot deal with versions != 1.0a"): + assertTrue(ConstructingParser.fromSource(scala.io.Source.fromString(xml11), preserveWS = false).document().version.isEmpty) + // ConstructingParser incorrectly returns null version value when the document does not have the xml declaration: + assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().version) + + // XMLLoader returns XML version of the document straight from the `xml` declaration + assertTrue(xercesInternal.getFeature("http://xml.org/sax/features/xml-1.1")) + assertEquals(Some("1.0"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml10).version) + assertEquals(Some("1.1"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml11).version) + assertEquals(Some("1.0"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(noXmlDeclaration).version) + } + + @UnitTest + def xmlEncoding(): Unit = { + val utf8: String = s"""""" + val utf16: String = s"""""" + val default: String = s"""""" + val noXmlDeclaration: String = s"""""" + + // ConstructingParser returns XML encoding name canonicalized from the `xml` declaration: + assertEquals(Some("UTF-8" ), 
ConstructingParser.fromSource(scala.io.Source.fromString(utf8 ), preserveWS = false).document().encoding) + assertEquals(Some("UTF-16"), ConstructingParser.fromSource(scala.io.Source.fromString(utf16 ), preserveWS = false).document().encoding) + assertEquals(None , ConstructingParser.fromSource(scala.io.Source.fromString(default), preserveWS = false).document().encoding) + // ConstructingParser incorrectly returns null encoding value when the document does not have the xml declaration: + assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().encoding) + + // XMLLoader does not return the encoding specified in the `xml` declaration: + assertEquals(None, XML.loadStringDocument(utf8).encoding) + assertEquals(None, XML.loadStringDocument(utf16).encoding) + assertEquals(None, XML.loadStringDocument(default).encoding) + assertEquals(None, XML.loadStringDocument(noXmlDeclaration).encoding) + + // XMLLoader returns the encoding determined from the Byte Order Mark in the document itself: + assertEquals(Some("UTF-8"), XML.loadDocument(resourceUrl("utf8")).encoding) + assertEquals(Some("UTF-16BE"), XML.loadDocument(resourceUrl("utf16")).encoding) + + // ConstructingParser doesn't seem to be able to parse XML with Byte Order Mark: + assertThrows( + classOf[java.nio.charset.MalformedInputException], + () => ConstructingParser.fromSource(scala.io.Source.fromURI(resourceUrl("utf16").toURI), preserveWS = false).document().encoding + ) + } + + @UnitTest + def loadDtd(): Unit = { + val parserFactory: javax.xml.parsers.SAXParserFactory = xercesExternal + parserFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false) + + val xml: String = + s""" + | + | + | + | + | + | + | + | + | + |]> + |&AUTHOR; + |""".stripMargin + + val document: Document = XML.withSAXParser(parserFactory.newSAXParser).loadStringDocument(xml) + + // XMLLoader parses and returns DTD. 
+ // Note: dtd.ContentModel that DTD uses to represent the element content model lacks fidelity: + // occurrence indicators "?" and "+" can not be expressed. + // Note: spurious parentheses come from the dtd.ContentModel's toString() methods... + assertEquals( + """DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [ + | + | + | + | + | + | + | + | + | + | + |]""".stripMargin, + document.dtd.toString) + + // XMLLoader resolves entities defined in the DTD - + // XML parser parses and uses the DTD internally, so there is no need to install any additional entity resolvers: + assertEquals("""John Doe""", document.docElem.toString) + + val document2: Document = ConstructingParser.fromSource(scala.io.Source.fromString(xml), preserveWS = false).document() + + // ConstructingParser + // ignores + // element declarations + // attribute list declarations + // some entity declarations + // notations + // captures + // decls: List[Decl] - for EntityDecl and PEReference + // ent: Map[String, EntityDecl] + // returns only + // decls + assertEquals( + s"""DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [ + | + |]""".stripMargin, + document2.dtd.toString) + + // ConstructingParser resolves entities defined in the DTD + assertEquals("""John Doe""", document2.docElem.toString) + } @UnitTest def nodeSeqNs(): Unit = { diff --git a/shared/src/main/scala/scala/xml/dtd/DTD.scala b/shared/src/main/scala/scala/xml/dtd/DTD.scala index 6c577221..a3f588ec 100644 --- a/shared/src/main/scala/scala/xml/dtd/DTD.scala +++ b/shared/src/main/scala/scala/xml/dtd/DTD.scala @@ -33,7 +33,7 @@ abstract class DTD { var ent: mutable.Map[String, EntityDecl] = new mutable.HashMap[String, EntityDecl]() override def toString: String = - "DTD [\n%s%s]".format( + "DTD %s [\n%s]".format( Option(externalID).getOrElse(""), decls.mkString("", "\n", "\n") ) diff --git a/shared/src/main/scala/scala/xml/dtd/Decl.scala 
b/shared/src/main/scala/scala/xml/dtd/Decl.scala index bbd7f022..bdde0337 100644 --- a/shared/src/main/scala/scala/xml/dtd/Decl.scala +++ b/shared/src/main/scala/scala/xml/dtd/Decl.scala @@ -95,11 +95,12 @@ case class UnparsedEntityDecl(name: String, extID: ExternalID, notation: String) extID.buildString(sb).append(" NDATA ").append(notation).append('>') } } + /** a notation declaration */ case class NotationDecl(name: String, extID: ExternalID) extends MarkupDecl { override def buildString(sb: StringBuilder): StringBuilder = { sb.append("') } } diff --git a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala index 0aa36e69..afe54330 100644 --- a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala +++ b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala @@ -55,17 +55,14 @@ trait XMLLoader[T <: Node] { * The methods available in scala.xml.XML use the XML parser in the JDK * (unless another parser is present on the classpath). */ - private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T] - - def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(loadDocument(inputSource, parser)) - def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = loadDocument(inputSource, parser).children - private def loadDocument(inputSource: InputSource, parser: SAXParser): Document = adapter.loadDocument(inputSource, parser) - private def loadDocument(inputSource: InputSource, reader: XMLReader): Document = adapter.loadDocument(inputSource, reader) + // TODO remove + def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(adapter.loadDocument(inputSource, parser.getXMLReader)) + def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = adapter.loadDocument(inputSource, parser.getXMLReader).children def adapter: parsing.FactoryAdapter = new parsing.NoBindingFactoryAdapter() /** Loads XML Document. 
*/ - def loadDocument(source: InputSource): Document = loadDocument(source, reader) + def loadDocument(inputSource: InputSource): Document = adapter.loadDocument(inputSource, reader) def loadFileDocument(fileName: String): Document = loadDocument(Source.fromFile(fileName)) def loadFileDocument(file: File): Document = loadDocument(Source.fromFile(file)) def loadDocument(url: URL): Document = loadDocument(Source.fromUrl(url)) @@ -76,6 +73,7 @@ trait XMLLoader[T <: Node] { def loadStringDocument(string: String): Document = loadDocument(Source.fromString(string)) /** Loads XML element. */ + private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T] def load(inputSource: InputSource): T = getDocElem(loadDocument(inputSource)) def loadFile(fileName: String): T = getDocElem(loadFileDocument(fileName)) def loadFile(file: File): T = getDocElem(loadFileDocument(file)) diff --git a/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala b/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala new file mode 100644 index 00000000..df8e3a2f --- /dev/null +++ b/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala @@ -0,0 +1,190 @@ +/* + * Scala (https://www.scala-lang.org) + * + * Copyright EPFL and Lightbend, Inc. + * + * Licensed under Apache License 2.0 + * (http://www.apache.org/licenses/LICENSE-2.0). + * + * See the NOTICE file distributed with this work for + * additional information regarding copyright ownership. 
+ */ + +package scala +package xml +package parsing + +import scala.xml.dtd._ + +// Note: this is private to avoid it becoming a part of binary compatibility checks +final private[parsing] class DtdBuilder( + name: String, + externalID: ExternalID +) { + private var elements: List[ElemDecl] = List.empty + private var attributeLists: List[AttListDecl] = List.empty + private var entities: List[EntityDecl] = List.empty + private var notations: List[NotationDecl] = List.empty + private var unparsedEntities: List[UnparsedEntityDecl] = List.empty + private var parameterReferences: List[PEReference] = List.empty + + // AttListDecl under construction + private var elementName: Option[String] = None + private var attributes: List[AttrDecl] = List.empty + + private def flushAttributes(): Unit = if (elementName.isDefined) { + attributeLists ::= AttListDecl(elementName.get, attributes.reverse) + attributes = List.empty + elementName = None + } + + private var done: Boolean = false + def isDone: Boolean = done + + def endDTD(): Unit = { + flushAttributes() + done = true + } + + def dtd: DTD = new DTD { + // Note: weirdly, unlike DocType, DTD does not have a 'name'... 
+ this.externalID = DtdBuilder.this.externalID + this.elem ++= elements.map(d => d.name -> d).toMap + this.attr ++= attributeLists.map(d => d.name -> d).toMap + this.ent ++= entities.map { d => + val name: String = d match { + case ParsedEntityDecl(name, _) => name + case ParameterEntityDecl(name, _) => name + case UnparsedEntityDecl(name, _, _) => name + } + name -> d + }.toMap + this.decls = + elements.reverse ++ + attributeLists.reverse ++ + entities.reverse ++ + DtdBuilder.this.notations.reverse ++ + parameterReferences.reverse + + override val notations: Seq[NotationDecl] = DtdBuilder.this.notations.reverse + override val unparsedEntities: Seq[EntityDecl] = DtdBuilder.this.unparsedEntities.reverse + } + + + def elementDecl(name: String, model: String): Unit = { + flushAttributes() + elements ::= ElemDecl(name, ElementContentModel.parseContentModel(model)) + } + + // The type will be one of the strings "CDATA", "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES", + // a parenthesized token group with the separator "|" and all whitespace removed, + // or the word "NOTATION" followed by a space followed by a parenthesized token group with all whitespace removed. + def attributeDecl( + eName: String, + aName: String, + `type`: String, + mode: String, + value: String + ): Unit = { + if (!elementName.contains(eName)) { + flushAttributes() + elementName = Some(eName) + } + + val attribute: AttrDecl = AttrDecl( + aName, + `type`, + mode match { + case "#REQUIRED" => REQUIRED + case "#IMPLIED" => IMPLIED + case "#FIXED" => DEFAULT(fixed = true, value) + case _ => DEFAULT(fixed = false, value) + } + ) + + attributes ::= attribute + } + + // General entities are reported with their regular names, + // parameter entities have '%' prepended to their names, + // and the external DTD subset has the pseudo-entity name "[dtd]". 
+ def startEntity(name: String): Unit = { + flushAttributes() + if (name.startsWith("%")) parameterReferences ::= PEReference(name.tail.trim) + } + + def endEntity(name: String): Unit = { + } + + def notationDecl( + name: String, + publicId: String, + systemId: String + ): Unit = { + flushAttributes() + notations ::= NotationDecl(name, DtdBuilder.mkExternalID(publicId, systemId)) + } + + def unparsedEntityDecl( + name: String, + publicId: String, + systemId: String, + notationName: String + ): Unit = { + flushAttributes() + val unparsedEntity: UnparsedEntityDecl = + UnparsedEntityDecl(name, DtdBuilder.mkExternalID(publicId, systemId), notationName) + entities ::= unparsedEntity + unparsedEntities ::= unparsedEntity + } + + def internalEntityDecl( + name: String, + value: String + ): Unit = { + flushAttributes() + entityDecl(name, IntDef(value)) + } + + def externalEntityDecl( + name: String, + publicId: String, + systemId: String + ): Unit = { + flushAttributes() + entityDecl(name, ExtDef(DtdBuilder.mkExternalID(publicId, systemId))) + } + + private def entityDecl( + name: String, + entityDef: EntityDef + ): Unit = { + val entity: EntityDecl = + if (name.startsWith("%")) ParameterEntityDecl(name.tail.trim, entityDef) + else ParsedEntityDecl(name, entityDef) + entities ::= entity + } + + // DTD class currently does not provide for capturing processing instructions + def processingInstruction(target: String, data: String): Unit = () + + // DTD class currently does not provide for capturing comments + def comment(commentText: String): Unit = () +} + +// Note: this is private to avoid it becoming a part of binary compatibility checks +private[parsing] object DtdBuilder { + def apply( + name: String, + publicId: String, + systemId: String + ): DtdBuilder = new DtdBuilder( + name, + mkExternalID(publicId, systemId) + ) + + private def mkExternalID(publicId: String, systemId: String): ExternalID = + if (publicId != null) PublicID(publicId, systemId) + else if (systemId != 
null) SystemID(systemId) + else NoExternalID +} diff --git a/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala b/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala new file mode 100644 index 00000000..832297c5 --- /dev/null +++ b/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala @@ -0,0 +1,229 @@ +/* + * Scala (https://www.scala-lang.org) + * + * Copyright EPFL and Lightbend, Inc. + * + * Licensed under Apache License 2.0 + * (http://www.apache.org/licenses/LICENSE-2.0). + * + * See the NOTICE file distributed with this work for + * additional information regarding copyright ownership. + */ + +package scala.xml.parsing + +import scala.annotation.tailrec +import scala.xml.dtd + +// Note: this is private to avoid it becoming a part of binary compatibility checks. + +// The content model will consist of the string "EMPTY", the string "ANY", or a parenthesised group, +// optionally followed by an occurrence indicator. +// The model will be normalized so that all parameter entities are fully resolved and all whitespace is removed, +// and will include the enclosing parentheses. +// Other normalization (such as removing redundant parentheses or simplifying occurrence indicators) +// is at the discretion of the parser. + +// elementdecl ::= '' +// contentspec ::= 'EMPTY' | 'ANY' | Mixed | children +// children ::= (choice | seq) ('?' | '*' | '+')? +// cp ::= (Name | choice | seq) ('?' | '*' | '+')? +// choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' +// seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' +// Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' +// | '(' S? '#PCDATA' S? 
')' +private[parsing] object ElementContentModel { + def parseContentModel(model: String): dtd.ContentModel = ContentSpec.parse(model) match { + case ContentSpec.Empty => dtd.EMPTY + case ContentSpec.Any => dtd.ANY + case ContentSpec.PCData => dtd.PCDATA + case ContentSpec.Children(elements, occurrence) => dtd.ELEMENTS(convertOccurrence(elements, occurrence)) + case ContentSpec.Mixed(elements) => + val result: List[dtd.ContentModel.RegExp] = + dtd.ContentModel.Letter(dtd.ContentModel.ElemName(ContentSpec.PCData.value)) +: + elements.map(convertElements) + // TODO scala.xml.dtd.impl.Alt.apply() insists on there being al least two alternatives, + // which causes an exception in MIXED.toString() when there is only one alternative besides #PCDATA. + // I think this is a bug. + dtd.MIXED(dtd.ContentModel.Alt(result: _*)) + } + + private def convertElements(elements: Elements): dtd.ContentModel.RegExp = { + def convertCp(cp: Cp): dtd.ContentModel.RegExp = convertOccurrence(cp.elements, cp.occurrence) + elements match { + case Elements.Element(name) => dtd.ContentModel.Letter(dtd.ContentModel.ElemName(name)) + case Elements.Choice(children) => dtd.ContentModel.Alt(children.map(convertCp): _*) + case Elements.Sequence(children) => dtd.ContentModel.Sequ(children.map(convertCp): _*) + } + } + + private def convertOccurrence(elements: Elements, occurrence: Occurrence): dtd.ContentModel.RegExp = { + val result: dtd.ContentModel.RegExp = convertElements(elements) + occurrence match { + case Occurrence.Once => result + case Occurrence.RepeatOptional => dtd.ContentModel.Star(result) + case Occurrence.OnceOptional => dtd.ContentModel.Star(result) // TODO fidelity lost! + case Occurrence.Repeat => dtd.ContentModel.Star(result) // TODO fidelity lost! 
+ } + } + + sealed trait ContentSpec + object ContentSpec { + sealed trait Simple extends ContentSpec { + final override def toString: String = value + val value: String + } + case object Empty extends Simple { + override val value: String = "EMPTY" + } + case object Any extends Simple { + override val value: String = "ANY" + } + case object PCData extends ContentSpec { + override def toString: String = s"($value)" + val value: String = "#PCDATA" + } + final case class Mixed(elements: List[Elements.Element]) extends ContentSpec { + override def toString: String = { + val names: String = elements.mkString("|") + s"(${PCData.value}|$names)*" + } + } + final case class Children(elements: Elements.Many, occurrence: Occurrence) extends ContentSpec { + override def toString: String = s"$elements$occurrence" + } + object Children { + def parse(string: String, occurrence: Occurrence): Children = + Children(Elements.Many.parse(string), occurrence) + } + def parse(model: String): ContentSpec = model match { + case Empty.value => Empty + case Any.value => Any + case model => + val (parenthesized: String, occurrence: Occurrence) = Occurrence.parse(model) + require(isParenthesized(parenthesized)) + val string: String = removeParentheses(parenthesized) + if (occurrence == Occurrence.Once && string == PCData.value) PCData else if (occurrence == Occurrence.RepeatOptional) { + val choice: List[String] = Elements.Choice.split(string) + if (choice.length > 1 && choice.head == PCData.value) Mixed(choice.tail.map(Elements.Element)) + else Children.parse(string, occurrence) + } else Children.parse(string, occurrence) + } + } + + sealed trait Elements + object Elements { + final case class Element(name: String) extends Elements { + override def toString: String = name + } + sealed abstract class ManyCompanion(val separator: Char) { + final def split(string: String): List[String] = ElementContentModel.split(string, separator) + } + sealed abstract class Many(children: List[Cp]) extends 
Elements { + final override def toString: String = children.map(_.toString).mkString("(", companion.separator.toString, ")") + def companion: ManyCompanion + } + object Choice extends ManyCompanion(separator = '|') + final case class Choice(children: List[Cp]) extends Many(children) { + override def companion: ManyCompanion = Choice + } + object Sequence extends ManyCompanion(separator = ',') + final case class Sequence(children: List[Cp]) extends Many(children) { + override def companion: ManyCompanion = Sequence + } + object Many { + def parse(string: String): Many = { + val choice: List[String] = Choice.split(string) + if (choice.length > 1) Choice(choice.map(Cp.parse)) + else Sequence(Sequence.split(string).map(Cp.parse)) + } + } + def parse(string: String): Elements = + if (!isParenthesized(string)) Element(string) + else Many.parse(removeParentheses(string)) + } + + final case class Cp(elements: Elements, occurrence: Occurrence) { + override def toString: String = s"$elements$occurrence" + } + object Cp { + def parse(string: String): Cp = { + val (maybeParenthesized: String, occurrence: Occurrence) = Occurrence.parse(string) + Cp(Elements.parse(maybeParenthesized), occurrence) + } + } + + sealed class Occurrence + object Occurrence { + case object Once extends Occurrence { + override def toString: String = "" + } + sealed trait Signed extends Occurrence { + final override def toString: String = sign + def sign: String + } + case object OnceOptional extends Signed { + override def sign: String = "?" 
+ } + case object Repeat extends Signed { + override def sign: String = "+" + } + case object RepeatOptional extends Signed { + override def sign: String = "*" + } + def parse(string: String): (String, Occurrence) = + if (string.endsWith(OnceOptional.sign)) (string.init, OnceOptional) else + if (string.endsWith(RepeatOptional.sign)) (string.init, RepeatOptional) else + if (string.endsWith(Repeat.sign)) (string.init, Repeat) else + (string, Once) + } + + private def isParenthesized(string: String): Boolean = { + @tailrec + def isParenthesized(level: Int, tail: String): Boolean = { + val current: Char = tail.head + val nextTail: String = tail.tail + val nextLevel: Int = if (current == '(') level + 1 else if (current == ')') level - 1 else level + if (nextTail.isEmpty) nextLevel == 0 else if (nextLevel == 0) false else isParenthesized(nextLevel, nextTail) + } + + string.startsWith("(") && isParenthesized(0, string) + } + + @tailrec + private def removeParentheses(string: String): String = + if (!isParenthesized(string)) string + else removeParentheses(string.tail.init) + + // split at the top level of parentheses + private def split(string: String, separator: Char): List[String] = { + @tailrec + def split( + result: List[String], + level: Int, + init: String, + tail: String + ): List[String] = if (tail.isEmpty) if (init.isEmpty) result else result :+ init else { + val current: Char = tail.head + val nextTail: String = tail.tail + if (level == 0 && current == separator) split( + result :+ init, + level, + "", + nextTail + ) else split( + result, + if (current == '(') level + 1 else if (current == ')') level - 1 else level, + init :+ current, + nextTail + ) + } + + split( + List.empty, + 0, + "", + string + ) + } +} diff --git a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala index 9c170b6e..362c1516 100644 --- a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala +++ 
b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala @@ -15,8 +15,8 @@ package xml package parsing import scala.collection.Seq -import org.xml.sax.{Attributes, SAXNotRecognizedException, SAXNotSupportedException} -import org.xml.sax.ext.DefaultHandler2 +import org.xml.sax.{Attributes, Locator, SAXNotRecognizedException, SAXNotSupportedException} +import org.xml.sax.ext.{DefaultHandler2, Locator2} // can be mixed into FactoryAdapter if desired trait ConsoleErrorHandler extends DefaultHandler2 { @@ -42,10 +42,21 @@ trait ConsoleErrorHandler extends DefaultHandler2 { abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Node] { val normalizeWhitespace: Boolean = false + // reference to the XMLReader that parses the document; this is used to query + // features (e.g., 'is-standalone') and properties (e.g., document-xml-version) - + // see http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html + private var xmlReader: Option[XMLReader] = None + + private var dtdBuilder: Option[DtdBuilder] = None + private def inDtd: Boolean = dtdBuilder.isDefined && !dtdBuilder.get.isDone + private var document: Option[Document] = None + private var baseURI: Option[String] = None + private var xmlEncoding: Option[String] = None private var prefixMappings: List[(String, String)] = List.empty + // TODO all the variables should be private, but - binary compatibility... var prolog: List[Node] = List.empty var rootElem: Node = _ var epilogue: List[Node] = List.empty @@ -100,16 +111,10 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod } /** - * Load XML document from the source using the parser. - */ - def loadDocument(source: InputSource, parser: SAXParser): Document = - loadDocument(source, parser.getXMLReader) - - /** - * Load XML document from the source using the reader. + * Load XML document from the inputSource using the xmlReader. 
*/ - def loadDocument(source: InputSource, xmlReader: XMLReader): Document = { - if (source == null) throw new IllegalArgumentException("InputSource cannot be null") + def loadDocument(inputSource: InputSource, xmlReader: XMLReader): Document = { + if (inputSource == null) throw new IllegalArgumentException("InputSource cannot be null") xmlReader.setContentHandler(this) xmlReader.setDTDHandler(this) @@ -126,7 +131,16 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod case _: SAXNotSupportedException => } - xmlReader.parse(source) + /* Use DeclHandler if it is supported by the xmlReader. */ + try { + xmlReader.setProperty("http://xml.org/sax/properties/declaration-handler", this) + } catch { + case _: SAXNotRecognizedException => + case _: SAXNotSupportedException => + } + + this.xmlReader = Some(xmlReader) + xmlReader.parse(inputSource) document.get } @@ -175,8 +189,25 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod /* ContentHandler methods */ + // Since Java 14, ContentHandler has a method that delivers the values from the XML declaration: + // def declaration(version: String, encoding: String, standalone: String): Unit = () + // but it'll be years until we are all on Java 14 *and* Xerces starts calling this method... + + override def setDocumentLocator(locator: Locator): Unit = { + baseURI = Option(locator.getSystemId) + locator match { + case locator2: Locator2 => + // Note: Xerces calls setDocumentLocator() (and startDocument()) *before* it even reads the XML declaration; + // the version delivered here - locator2.getXMLVersion - is always "1.0"; + // the real version is retrieved as a property of the XML reader in endDocument(). 
+ + xmlEncoding = Option(locator2.getEncoding) + case _ => + } + } + override def startDocument(): Unit = { - scopeStack ::= TopScope // TODO remove + scopeStack ::= TopScope // TODO turn into a parameter } override def endDocument(): Unit = { @@ -187,13 +218,33 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod this.document = Some(document) document.children = prolog ++ rootElem ++ epilogue document.docElem = rootElem - document.dtd = null - document.baseURI = null - document.encoding = None - document.standAlone = None - document.version = None + document.dtd = dtdBuilder.map(_.dtd).orNull + document.baseURI = baseURI.orNull + document.encoding = xmlEncoding + + document.version = + try { + xmlReader.map(_.getProperty("http://xml.org/sax/properties/document-xml-version")).collect { case version: String => version } // xmlReader is None when the handler is driven without loadDocument() + } catch { + case _: SAXNotRecognizedException => None + case _: SAXNotSupportedException => None + } + + document.standAlone = + try { + xmlReader.map(_.getFeature("http://xml.org/sax/features/is-standalone")) + } catch { + case _: SAXNotRecognizedException => None + case _: SAXNotSupportedException => None + } // Note: resetting to the freshly-created state; needed only if this instance is reused, which we do not do... + dtdBuilder = None + xmlReader = None + + baseURI = None + xmlEncoding = None + hStack = hStack.last :: Nil // TODO List.empty scopeStack = scopeStack.tail // TODO List.empty @@ -322,13 +373,36 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod } } } + + override def ignorableWhitespace(ch: Array[Char], offset: Int, length: Int): Unit = () + /** * Processing instruction.
*/ - override def processingInstruction(target: String, data: String): Unit = { - captureText() - hStack = hStack.reverse_:::(createProcInstr(target, data).toList) - } + override def processingInstruction(target: String, data: String): Unit = + if (inDtd) dtdBuilder.foreach(_.processingInstruction(target, data)) else { + captureText() + hStack = hStack.reverse_:::(createProcInstr(target, data).toList) + } + + override def skippedEntity(name: String): Unit = () + + /* LexicalHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/ext/LexicalHandler.html) */ + + override def startDTD( + name: String, + publicId: String, + systemId: String + ): Unit = dtdBuilder = Some(DtdBuilder( + name, + publicId, + systemId + )) + + override def endDTD(): Unit = dtdBuilder.foreach(_.endDTD()) + + override def startEntity(name: String): Unit = dtdBuilder.foreach(_.startEntity(name)) + override def endEntity(name: String): Unit = dtdBuilder.foreach(_.endEntity(name)) /** * Start of a CDATA section. @@ -347,8 +421,32 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod * Comment. 
*/ override def comment(ch: Array[Char], start: Int, length: Int): Unit = { - captureText() val commentText: String = String.valueOf(ch.slice(start, start + length)) - hStack = hStack.reverse_:::(createComment(commentText).toList) + if (inDtd) dtdBuilder.foreach(_.comment(commentText)) else { + captureText() + hStack = hStack.reverse_:::(createComment(commentText).toList) + } } + + /* DTDHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/DTDHandler.html) */ + + override def notationDecl(name: String, publicId: String, systemId: String): Unit = + dtdBuilder.foreach(_.notationDecl(name, publicId, systemId)) + + override def unparsedEntityDecl(name: String, publicId: String, systemId: String, notationName: String): Unit = + dtdBuilder.foreach(_.unparsedEntityDecl(name, publicId, systemId, notationName)) + + /* DeclHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/ext/DeclHandler.html) */ + + override def elementDecl(name: String, model: String): Unit = + dtdBuilder.foreach(_.elementDecl(name, model)) + + override def attributeDecl(eName: String, aName: String, `type`: String, mode: String, value: String): Unit = + dtdBuilder.foreach(_.attributeDecl(eName, aName, `type`, mode, value)) + + override def internalEntityDecl(name: String, value: String): Unit = + dtdBuilder.foreach(_.internalEntityDecl(name, value)) + + override def externalEntityDecl(name: String, publicId: String, systemId: String): Unit = + dtdBuilder.foreach(_.externalEntityDecl(name, publicId, systemId)) }