diff --git a/jvm/src/test/resources/scala/xml/utf16.xml b/jvm/src/test/resources/scala/xml/utf16.xml
new file mode 100644
index 00000000..52c4fbaa
Binary files /dev/null and b/jvm/src/test/resources/scala/xml/utf16.xml differ
diff --git a/jvm/src/test/resources/scala/xml/utf8.xml b/jvm/src/test/resources/scala/xml/utf8.xml
new file mode 100644
index 00000000..bb59f58c
--- /dev/null
+++ b/jvm/src/test/resources/scala/xml/utf8.xml
@@ -0,0 +1,2 @@
+
+
diff --git a/jvm/src/test/scala/scala/xml/XMLTest.scala b/jvm/src/test/scala/scala/xml/XMLTest.scala
index 49ca1597..9d14219b 100644
--- a/jvm/src/test/scala/scala/xml/XMLTest.scala
+++ b/jvm/src/test/scala/scala/xml/XMLTest.scala
@@ -1,9 +1,10 @@
package scala.xml
import org.junit.{Test => UnitTest}
-import org.junit.Assert.{assertEquals, assertFalse, assertTrue}
+import org.junit.Assert.{assertEquals, assertFalse, assertNull, assertThrows, assertTrue}
import java.io.StringWriter
import java.io.ByteArrayOutputStream
+import java.net.URL
import scala.xml.dtd.{DocType, PublicID}
import scala.xml.parsing.ConstructingParser
import scala.xml.Utility.sort
@@ -681,6 +682,8 @@ class XMLTestJVM {
assertTrue(gotAnError)
}
+ /**
+  * Resolves the test resource `resourceName`.xml relative to this test class.
+  *
+  * `getResource` returns `null` for a missing resource; that is turned into an immediate,
+  * descriptive failure here instead of an opaque NullPointerException somewhere in the parser.
+  *
+  * @param resourceName resource name without the ".xml" extension
+  * @return URL of the resource
+  * @throws IllegalArgumentException if the resource does not exist
+  */
+ def resourceUrl(resourceName: String): URL = {
+   val fileName: String = s"$resourceName.xml"
+   Option(getClass.getResource(fileName))
+     .getOrElse(throw new IllegalArgumentException(s"test resource not found: $fileName"))
+ }
+
// Here we see that opening InputStream prematurely, as was done previously, breaks XInclude.
@UnitTest(expected = classOf[org.xml.sax.SAXParseException]) def xIncludeNeedsSystemId(): Unit = {
val parserFactory = xercesInternal
@@ -688,7 +691,7 @@ class XMLTestJVM {
parserFactory.setXIncludeAware(true)
XML
.withSAXParser(parserFactory.newSAXParser)
- .load(getClass.getResource("site.xml").openStream())
+ .load(resourceUrl("site").openStream())
.toString
}
@@ -703,7 +706,7 @@ class XMLTestJVM {
parserFactory.setXIncludeAware(true)
val actual: String = XML
.withSAXParser(parserFactory.newSAXParser)
- .load(getClass.getResource(resourceName))
+ .load(resourceUrl(resourceName))
.toString
assertEquals(expected, actual)
@@ -718,8 +721,8 @@ class XMLTestJVM {
|
|""".stripMargin
- @UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer.xml", includerExpected)
- @UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer.xml", includerExpected)
+ @UnitTest def xIncludeWithExternalXerces(): Unit = check(xercesExternal, "includer", includerExpected)
+ @UnitTest def xIncludeWithInternalXerces(): Unit = check(xercesInternal, "includer", includerExpected)
// And here we demonstrate that both external and built-in Xerces report incorrect `xml:base`
// when the XML file included contains its own include, and included files are not in the same directory:
@@ -750,8 +753,170 @@ class XMLTestJVM {
//
// I find it utterly incomprehensible that foundational library shipped with JDK and used everywhere
// has a bug in its core functionality for years and it never gets fixed, but sadly, it is the state of affairs:
- @UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site.xml", siteUnfortunatelyExpected)
- @UnitTest def xIncludeFailWithInternalXerces(): Unit = check(xercesInternal, "site.xml", siteUnfortunatelyExpected)
+ @UnitTest def xIncludeFailWithExternalXerces(): Unit = check(xercesExternal, "site", siteUnfortunatelyExpected)
+ @UnitTest def xIncludeFailWithInternalXerces(): Unit = check(xercesInternal, "site", siteUnfortunatelyExpected)
+
+ /** Checks which parsers expose the location a document was loaded from as `Document.baseURI`. */
+ @UnitTest
+ def documentBaseURI(): Unit = {
+   val url: URL = resourceUrl("site")
+   val expectedSuffix: String = "/test-classes/scala/xml/site.xml"
+
+   // JUnit's assertTrue is used rather than Predef.assert: it matches the other tests in this file,
+   // cannot be elided by compiler flags, and reports the offending value when the check fails.
+   def assertBaseURI(baseURI: String): Unit =
+     assertTrue(s"unexpected baseURI: $baseURI", baseURI != null && baseURI.endsWith(expectedSuffix))
+
+   // XMLLoader returns the document's baseURI:
+   assertBaseURI(XML.withSAXParser(xercesInternal.newSAXParser).loadDocument(url).baseURI)
+   assertBaseURI(XML.withSAXParser(xercesExternal.newSAXParser).loadDocument(url).baseURI)
+   // ConstructingParser does not return it, of course: since it uses scala.io.Source, it has no idea where the XML is coming from:
+   assertNull(ConstructingParser.fromSource(scala.io.Source.fromURI(url.toURI), preserveWS = false).document().baseURI)
+ }
+
+ // Documents how ConstructingParser and XMLLoader (here with the internal Xerces SAX parser) each populate
+ // Document.standAlone for four input documents; judging by their names, the inputs differ in the `xml`
+ // declaration: standalone set, standalone unset, standalone omitted, and no declaration at all.
+ @UnitTest
+ def xmlStandAlone(): Unit = {
+ val standAlone: String = s""""""
+ val nonStandAlone: String = s""""""
+ val default: String = s""""""
+ val noXmlDeclaration: String = s""""""
+
+ // ConstructingParser returns standAlone status of the document straight from the `xml` declaration
+ // (an empty Option when the declaration does not mention it):
+ assertEquals(Some(true ), ConstructingParser.fromSource(scala.io.Source.fromString(standAlone), preserveWS = false).document().standAlone)
+ assertEquals(Some(false), ConstructingParser.fromSource(scala.io.Source.fromString(nonStandAlone), preserveWS = false).document().standAlone)
+ assertTrue(ConstructingParser.fromSource(scala.io.Source.fromString(default), preserveWS = false).document().standAlone.isEmpty)
+ // ConstructingParser incorrectly returns null standAlone value when the document does not have the xml declaration:
+ assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().standAlone)
+
+ // XMLLoader returns standAlone status of the document straight from the `xml` declaration,
+ // and reports `false` both when the declaration omits it and when there is no declaration:
+ assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(standAlone).standAlone.contains(true))
+ assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(nonStandAlone).standAlone.contains(false))
+ assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(default).standAlone.contains(false))
+ assertTrue(XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(noXmlDeclaration).standAlone.contains(false))
+ }
+
+ // Documents how ConstructingParser and XMLLoader (here with the internal Xerces SAX parser) each populate
+ // Document.version for a version 1.0 document, a version 1.1 document and a document without an `xml` declaration.
+ @UnitTest
+ def xmlVersion(): Unit = {
+ val xml10 = s""""""
+ val xml11 = s""""""
+ val noXmlDeclaration: String = s""""""
+
+ // ConstructingParser returns XML version of the document straight from the `xml` declaration for version="1.0":
+ assertEquals(Some("1.0"), ConstructingParser.fromSource(scala.io.Source.fromString(xml10), preserveWS = false).document().version)
+ // ConstructingParser returns an incorrect (empty) version value when the version is "1.1" (and prints "cannot deal with versions != 1.0a"):
+ assertTrue(ConstructingParser.fromSource(scala.io.Source.fromString(xml11), preserveWS = false).document().version.isEmpty)
+ // ConstructingParser incorrectly returns null version value when the document does not have the xml declaration:
+ assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().version)
+
+ // XMLLoader returns XML version of the document straight from the `xml` declaration,
+ // and reports "1.0" when there is no declaration.
+ // The parser factory must report the XML 1.1 feature as enabled (presumably a precondition of the "1.1" check below):
+ assertTrue(xercesInternal.getFeature("http://xml.org/sax/features/xml-1.1"))
+ assertEquals(Some("1.0"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml10).version)
+ assertEquals(Some("1.1"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(xml11).version)
+ assertEquals(Some("1.0"), XML.withSAXParser(xercesInternal.newSAXParser).loadStringDocument(noXmlDeclaration).version)
+ }
+
+ // Documents how ConstructingParser and XMLLoader each populate Document.encoding, both for documents held
+ // in strings and for the `utf8` / `utf16` resource files loaded by URL.
+ @UnitTest
+ def xmlEncoding(): Unit = {
+ val utf8: String = s""""""
+ val utf16: String = s""""""
+ val default: String = s""""""
+ val noXmlDeclaration: String = s""""""
+
+ // ConstructingParser returns XML encoding name canonicalized from the `xml` declaration
+ // (None when the declaration does not name an encoding):
+ assertEquals(Some("UTF-8" ), ConstructingParser.fromSource(scala.io.Source.fromString(utf8 ), preserveWS = false).document().encoding)
+ assertEquals(Some("UTF-16"), ConstructingParser.fromSource(scala.io.Source.fromString(utf16 ), preserveWS = false).document().encoding)
+ assertEquals(None , ConstructingParser.fromSource(scala.io.Source.fromString(default), preserveWS = false).document().encoding)
+ // ConstructingParser incorrectly returns null encoding value when the document does not have the xml declaration:
+ assertNull(ConstructingParser.fromSource(scala.io.Source.fromString(noXmlDeclaration), preserveWS = false).document().encoding)
+
+ // XMLLoader does not return the encoding specified in the `xml` declaration when the document is loaded from a string:
+ assertEquals(None, XML.loadStringDocument(utf8).encoding)
+ assertEquals(None, XML.loadStringDocument(utf16).encoding)
+ assertEquals(None, XML.loadStringDocument(default).encoding)
+ assertEquals(None, XML.loadStringDocument(noXmlDeclaration).encoding)
+
+ // XMLLoader returns the encoding determined from the Byte Order Mark in the document itself:
+ assertEquals(Some("UTF-8"), XML.loadDocument(resourceUrl("utf8")).encoding)
+ assertEquals(Some("UTF-16BE"), XML.loadDocument(resourceUrl("utf16")).encoding)
+
+ // ConstructingParser doesn't seem to be able to parse XML with Byte Order Mark
+ // (shown here for the UTF-16 resource, which fails with MalformedInputException):
+ assertThrows(
+ classOf[java.nio.charset.MalformedInputException],
+ () => ConstructingParser.fromSource(scala.io.Source.fromURI(resourceUrl("utf16").toURI), preserveWS = false).document().encoding
+ )
+ }
+
+ // Parses one document that carries a DOCTYPE with an internal DTD subset twice - with XMLLoader (external Xerces,
+ // loading of the external DTD switched off) and with ConstructingParser - and compares, for each parser,
+ // the string form of the resulting Document.dtd and the document element with its entity reference resolved.
+ @UnitTest
+ def loadDtd(): Unit = {
+ val parserFactory: javax.xml.parsers.SAXParserFactory = xercesExternal
+ // do not fetch the external DTD named by the DOCTYPE; only the internal subset is processed
+ parserFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false)
+
+ val xml: String =
+ s"""
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |]>
+ |&AUTHOR;
+ |""".stripMargin
+
+ val document: Document = XML.withSAXParser(parserFactory.newSAXParser).loadStringDocument(xml)
+
+ // XMLLoader parses and returns DTD.
+ // Note: dtd.ContentModel that DTD uses to represent the element content model lacks fidelity:
+ // occurrence indicators "?" and "+" can not be expressed.
+ // Note: spurious parentheses come from the dtd.ContentModel's toString() methods...
+ assertEquals(
+ """DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |]""".stripMargin,
+ document.dtd.toString)
+
+ // XMLLoader resolves entities defined in the DTD -
+ // XML parser parses and uses the DTD internally, so there is no need to install any additional entity resolvers:
+ assertEquals("""John Doe""", document.docElem.toString)
+
+ val document2: Document = ConstructingParser.fromSource(scala.io.Source.fromString(xml), preserveWS = false).document()
+
+ // ConstructingParser
+ // ignores
+ // element declarations
+ // attribute list declarations
+ // some entity declarations
+ // notations
+ // captures
+ // decls: List[Decl] - for EntityDecl and PEReference
+ // ent: Map[String, EntityDecl]
+ // returns only
+ // decls
+ // so the string form of its DTD is much shorter than the one produced by XMLLoader above:
+ assertEquals(
+ s"""DTD PUBLIC "-//OASIS//DTD DocBook V5.0//EN" "http://www.oasis-open.org/docbook/xml/5.0/docbook.dtd" [
+ |
+ |]""".stripMargin,
+ document2.dtd.toString)
+
+ // ConstructingParser resolves entities defined in the DTD
+ assertEquals("""John Doe""", document2.docElem.toString)
+ }
@UnitTest
def nodeSeqNs(): Unit = {
diff --git a/shared/src/main/scala/scala/xml/dtd/DTD.scala b/shared/src/main/scala/scala/xml/dtd/DTD.scala
index 6c577221..a3f588ec 100644
--- a/shared/src/main/scala/scala/xml/dtd/DTD.scala
+++ b/shared/src/main/scala/scala/xml/dtd/DTD.scala
@@ -33,7 +33,7 @@ abstract class DTD {
var ent: mutable.Map[String, EntityDecl] = new mutable.HashMap[String, EntityDecl]()
override def toString: String =
- "DTD [\n%s%s]".format(
+ "DTD %s [\n%s]".format(
Option(externalID).getOrElse(""),
decls.mkString("", "\n", "\n")
)
diff --git a/shared/src/main/scala/scala/xml/dtd/Decl.scala b/shared/src/main/scala/scala/xml/dtd/Decl.scala
index bbd7f022..bdde0337 100644
--- a/shared/src/main/scala/scala/xml/dtd/Decl.scala
+++ b/shared/src/main/scala/scala/xml/dtd/Decl.scala
@@ -95,11 +95,12 @@ case class UnparsedEntityDecl(name: String, extID: ExternalID, notation: String)
extID.buildString(sb).append(" NDATA ").append(notation).append('>')
}
}
+
/** a notation declaration */
case class NotationDecl(name: String, extID: ExternalID) extends MarkupDecl {
override def buildString(sb: StringBuilder): StringBuilder = {
sb.append("')
}
}
diff --git a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala
index 0aa36e69..afe54330 100644
--- a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala
+++ b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala
@@ -55,17 +55,14 @@ trait XMLLoader[T <: Node] {
* The methods available in scala.xml.XML use the XML parser in the JDK
* (unless another parser is present on the classpath).
*/
- private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T]
-
- def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(loadDocument(inputSource, parser))
- def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = loadDocument(inputSource, parser).children
- private def loadDocument(inputSource: InputSource, parser: SAXParser): Document = adapter.loadDocument(inputSource, parser)
- private def loadDocument(inputSource: InputSource, reader: XMLReader): Document = adapter.loadDocument(inputSource, reader)
+ // TODO remove
+ def loadXML(inputSource: InputSource, parser: SAXParser): T = getDocElem(adapter.loadDocument(inputSource, parser.getXMLReader))
+ def loadXMLNodes(inputSource: InputSource, parser: SAXParser): Seq[Node] = adapter.loadDocument(inputSource, parser.getXMLReader).children
def adapter: parsing.FactoryAdapter = new parsing.NoBindingFactoryAdapter()
/** Loads XML Document. */
- def loadDocument(source: InputSource): Document = loadDocument(source, reader)
+ def loadDocument(inputSource: InputSource): Document = adapter.loadDocument(inputSource, reader)
def loadFileDocument(fileName: String): Document = loadDocument(Source.fromFile(fileName))
def loadFileDocument(file: File): Document = loadDocument(Source.fromFile(file))
def loadDocument(url: URL): Document = loadDocument(Source.fromUrl(url))
@@ -76,6 +73,7 @@ trait XMLLoader[T <: Node] {
def loadStringDocument(string: String): Document = loadDocument(Source.fromString(string))
/** Loads XML element. */
+ private def getDocElem(document: Document): T = document.docElem.asInstanceOf[T]
def load(inputSource: InputSource): T = getDocElem(loadDocument(inputSource))
def loadFile(fileName: String): T = getDocElem(loadFileDocument(fileName))
def loadFile(file: File): T = getDocElem(loadFileDocument(file))
diff --git a/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala b/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala
new file mode 100644
index 00000000..df8e3a2f
--- /dev/null
+++ b/shared/src/main/scala/scala/xml/parsing/DtdBuilder.scala
@@ -0,0 +1,190 @@
+/*
+ * Scala (https://www.scala-lang.org)
+ *
+ * Copyright EPFL and Lightbend, Inc.
+ *
+ * Licensed under Apache License 2.0
+ * (http://www.apache.org/licenses/LICENSE-2.0).
+ *
+ * See the NOTICE file distributed with this work for
+ * additional information regarding copyright ownership.
+ */
+
+package scala
+package xml
+package parsing
+
+import scala.xml.dtd._
+
+// Note: this is private to avoid it becoming a part of binary compatibility checks
+/**
+ * Collects DTD declarations one callback at a time and assembles them into a [[scala.xml.dtd.DTD]].
+ *
+ * The method names and signatures mirror the SAX `DeclHandler`, `DTDHandler` and `LexicalHandler` callbacks
+ * (presumably they are forwarded here by `FactoryAdapter` while it is inside the DTD - confirm against the caller).
+ * Declarations are accumulated by prepending, so every list is reversed when it is read back in `dtd`.
+ *
+ * @param name       name of the DTD; not used by this class, since `DTD` has nowhere to store it
+ * @param externalID external identifier of the DTD; copied into the resulting `DTD`
+ */
+final private[parsing] class DtdBuilder(
+  name: String,
+  externalID: ExternalID
+) {
+  // Declarations seen so far, most recent first
+  private var elements: List[ElemDecl] = List.empty
+  private var attributeLists: List[AttListDecl] = List.empty
+  private var entities: List[EntityDecl] = List.empty
+  private var notations: List[NotationDecl] = List.empty
+  private var unparsedEntities: List[UnparsedEntityDecl] = List.empty
+  private var parameterReferences: List[PEReference] = List.empty
+
+  // AttListDecl under construction
+  private var elementName: Option[String] = None
+  private var attributes: List[AttrDecl] = List.empty
+
+  // Closes the attribute list under construction (if there is one) and records it;
+  // attributes arrive one at a time, so any callback other than another attribute of the same element ends the list.
+  private def flushAttributes(): Unit = elementName.foreach { pendingElementName =>
+    attributeLists ::= AttListDecl(pendingElementName, attributes.reverse)
+    attributes = List.empty
+    elementName = None
+  }
+
+  private var done: Boolean = false
+
+  /** `true` once `endDTD()` has been called. */
+  def isDone: Boolean = done
+
+  /** Flushes the pending attribute list and marks this builder as done. */
+  def endDTD(): Unit = {
+    flushAttributes()
+    done = true
+  }
+
+  /** Builds a new `DTD` from everything recorded so far, with the declarations in the order they were reported. */
+  def dtd: DTD = new DTD {
+    // Note: weirdly, unlike DocType, DTD does not have a 'name'...
+    this.externalID = DtdBuilder.this.externalID
+    this.elem ++= elements.map(d => d.name -> d).toMap
+    this.attr ++= attributeLists.map(d => d.name -> d).toMap
+    this.ent ++= entities.map { d =>
+      val name: String = d match {
+        case ParsedEntityDecl(name, _) => name
+        case ParameterEntityDecl(name, _) => name
+        case UnparsedEntityDecl(name, _, _) => name
+      }
+      name -> d
+    }.toMap
+    this.decls =
+      elements.reverse ++
+      attributeLists.reverse ++
+      entities.reverse ++
+      DtdBuilder.this.notations.reverse ++
+      parameterReferences.reverse
+
+    override val notations: Seq[NotationDecl] = DtdBuilder.this.notations.reverse
+    override val unparsedEntities: Seq[EntityDecl] = DtdBuilder.this.unparsedEntities.reverse
+  }
+
+
+  /** Records an element declaration; `model` is parsed by `ElementContentModel.parseContentModel`. */
+  def elementDecl(name: String, model: String): Unit = {
+    flushAttributes()
+    elements ::= ElemDecl(name, ElementContentModel.parseContentModel(model))
+  }
+
+  // The type will be one of the strings "CDATA", "ID", "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES",
+  // a parenthesized token group with the separator "|" and all whitespace removed,
+  // or the word "NOTATION" followed by a space followed by a parenthesized token group with all whitespace removed.
+  /**
+   * Records one attribute declaration. Consecutive declarations for the same element `eName`
+   * are gathered into a single `AttListDecl`; a different element name starts a new list.
+   */
+  def attributeDecl(
+    eName: String,
+    aName: String,
+    `type`: String,
+    mode: String,
+    value: String
+  ): Unit = {
+    if (!elementName.contains(eName)) {
+      flushAttributes()
+      elementName = Some(eName)
+    }
+
+    val attribute: AttrDecl = AttrDecl(
+      aName,
+      `type`,
+      mode match {
+        case "#REQUIRED" => REQUIRED
+        case "#IMPLIED" => IMPLIED
+        case "#FIXED" => DEFAULT(fixed = true, value)
+        // any other mode (including none) is treated as a plain default value
+        case _ => DEFAULT(fixed = false, value)
+      }
+    )
+
+    attributes ::= attribute
+  }
+
+  // General entities are reported with their regular names,
+  // parameter entities have '%' prepended to their names,
+  // and the external DTD subset has the pseudo-entity name "[dtd]".
+  /** Records a reference to a parameter entity; entities whose name does not start with '%' are ignored. */
+  def startEntity(name: String): Unit = {
+    flushAttributes()
+    if (name.startsWith("%")) parameterReferences ::= PEReference(name.tail.trim)
+  }
+
+  /** Nothing is recorded at the end of an entity. */
+  def endEntity(name: String): Unit = ()
+
+  /** Records a notation declaration; `publicId` and `systemId` may each be `null`. */
+  def notationDecl(
+    name: String,
+    publicId: String,
+    systemId: String
+  ): Unit = {
+    flushAttributes()
+    notations ::= NotationDecl(name, DtdBuilder.mkExternalID(publicId, systemId))
+  }
+
+  /** Records an unparsed entity declaration, both among the entities and in the list of unparsed entities. */
+  def unparsedEntityDecl(
+    name: String,
+    publicId: String,
+    systemId: String,
+    notationName: String
+  ): Unit = {
+    flushAttributes()
+    val unparsedEntity: UnparsedEntityDecl =
+      UnparsedEntityDecl(name, DtdBuilder.mkExternalID(publicId, systemId), notationName)
+    entities ::= unparsedEntity
+    unparsedEntities ::= unparsedEntity
+  }
+
+  /** Records an entity whose replacement text `value` is given inline. */
+  def internalEntityDecl(
+    name: String,
+    value: String
+  ): Unit = {
+    flushAttributes()
+    entityDecl(name, IntDef(value))
+  }
+
+  /** Records an entity defined by an external identifier. */
+  def externalEntityDecl(
+    name: String,
+    publicId: String,
+    systemId: String
+  ): Unit = {
+    flushAttributes()
+    entityDecl(name, ExtDef(DtdBuilder.mkExternalID(publicId, systemId)))
+  }
+
+  // A name starting with '%' denotes a parameter entity (stored without the '%'); any other name is a general parsed entity.
+  private def entityDecl(
+    name: String,
+    entityDef: EntityDef
+  ): Unit = {
+    val entity: EntityDecl =
+      if (name.startsWith("%")) ParameterEntityDecl(name.tail.trim, entityDef)
+      else ParsedEntityDecl(name, entityDef)
+    entities ::= entity
+  }
+
+  // DTD class currently does not provide for capturing processing instructions
+  def processingInstruction(target: String, data: String): Unit = ()
+
+  // DTD class currently does not provide for capturing comments
+  def comment(commentText: String): Unit = ()
+}
+
+// Note: this is private to avoid it becoming a part of binary compatibility checks
+private[parsing] object DtdBuilder {
+  /** Creates a builder for the DTD `name`, deriving its external ID from the (possibly `null`) identifiers. */
+  def apply(
+    name: String,
+    publicId: String,
+    systemId: String
+  ): DtdBuilder = {
+    val externalID: ExternalID = mkExternalID(publicId, systemId)
+    new DtdBuilder(name, externalID)
+  }
+
+  // A public identifier wins over a system identifier; with neither present there is no external ID.
+  private def mkExternalID(publicId: String, systemId: String): ExternalID =
+    (Option(publicId), Option(systemId)) match {
+      case (Some(_), _) => PublicID(publicId, systemId)
+      case (None, Some(_)) => SystemID(systemId)
+      case (None, None) => NoExternalID
+    }
+}
diff --git a/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala b/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala
new file mode 100644
index 00000000..832297c5
--- /dev/null
+++ b/shared/src/main/scala/scala/xml/parsing/ElementContentModel.scala
@@ -0,0 +1,229 @@
+/*
+ * Scala (https://www.scala-lang.org)
+ *
+ * Copyright EPFL and Lightbend, Inc.
+ *
+ * Licensed under Apache License 2.0
+ * (http://www.apache.org/licenses/LICENSE-2.0).
+ *
+ * See the NOTICE file distributed with this work for
+ * additional information regarding copyright ownership.
+ */
+
+package scala.xml.parsing
+
+import scala.annotation.tailrec
+import scala.xml.dtd
+
+// Note: this is private to avoid it becoming a part of binary compatibility checks.
+
+// The content model will consist of the string "EMPTY", the string "ANY", or a parenthesised group,
+// optionally followed by an occurrence indicator.
+// The model will be normalized so that all parameter entities are fully resolved and all whitespace is removed,
+// and will include the enclosing parentheses.
+// Other normalization (such as removing redundant parentheses or simplifying occurrence indicators)
+// is at the discretion of the parser.
+
+// elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
+// contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
+// children ::= (choice | seq) ('?' | '*' | '+')?
+// cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+// choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
+// seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
+// Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
+// | '(' S? '#PCDATA' S? ')'
+private[parsing] object ElementContentModel {
+ /**
+  * Parses the textual content model of an element declaration and converts it
+  * to the `scala.xml.dtd.ContentModel` representation.
+  */
+ def parseContentModel(model: String): dtd.ContentModel = {
+   def letter(name: String): dtd.ContentModel.RegExp =
+     dtd.ContentModel.Letter(dtd.ContentModel.ElemName(name))
+
+   ContentSpec.parse(model) match {
+     case ContentSpec.Empty => dtd.EMPTY
+     case ContentSpec.Any => dtd.ANY
+     case ContentSpec.PCData => dtd.PCDATA
+     case ContentSpec.Children(elements, occurrence) =>
+       dtd.ELEMENTS(convertOccurrence(elements, occurrence))
+     case ContentSpec.Mixed(names) =>
+       // #PCDATA comes first, followed by the element names
+       val alternatives: List[dtd.ContentModel.RegExp] =
+         letter(ContentSpec.PCData.value) :: names.map(convertElements)
+       // TODO scala.xml.dtd.impl.Alt.apply() insists on there being at least two alternatives,
+       // which causes an exception in MIXED.toString() when there is only one alternative besides #PCDATA.
+       // I think this is a bug.
+       dtd.MIXED(dtd.ContentModel.Alt(alternatives: _*))
+   }
+ }
+
+ // Converts a name, a choice or a sequence to the corresponding dtd regular expression;
+ // the members of a choice or a sequence are converted together with their occurrence indicators.
+ private def convertElements(elements: Elements): dtd.ContentModel.RegExp = elements match {
+   case Elements.Element(name) =>
+     dtd.ContentModel.Letter(dtd.ContentModel.ElemName(name))
+   case Elements.Choice(children) =>
+     dtd.ContentModel.Alt(children.map(cp => convertOccurrence(cp.elements, cp.occurrence)): _*)
+   case Elements.Sequence(children) =>
+     dtd.ContentModel.Sequ(children.map(cp => convertOccurrence(cp.elements, cp.occurrence)): _*)
+ }
+
+ // Converts `elements` and applies the occurrence indicator to the result.
+ private def convertOccurrence(elements: Elements, occurrence: Occurrence): dtd.ContentModel.RegExp = {
+   val converted: dtd.ContentModel.RegExp = convertElements(elements)
+   occurrence match {
+     case Occurrence.Once => converted
+     // TODO fidelity lost! Only "*" is represented faithfully: "?" and "+" are also turned into Star.
+     case Occurrence.RepeatOptional | Occurrence.OnceOptional | Occurrence.Repeat =>
+       dtd.ContentModel.Star(converted)
+   }
+ }
+
+ /** Parsed form of the `contentspec` production: EMPTY, ANY, text only, mixed content or element children. */
+ sealed trait ContentSpec
+ object ContentSpec {
+   /** Content specification that is written as a bare keyword. */
+   sealed trait Simple extends ContentSpec {
+     final override def toString: String = value
+     val value: String
+   }
+   case object Empty extends Simple {
+     override val value: String = "EMPTY"
+   }
+   case object Any extends Simple {
+     override val value: String = "ANY"
+   }
+   /** Text-only content. */
+   case object PCData extends ContentSpec {
+     override def toString: String = s"($value)"
+     val value: String = "#PCDATA"
+   }
+   /** Text mixed with the listed elements. */
+   final case class Mixed(elements: List[Elements.Element]) extends ContentSpec {
+     override def toString: String = {
+       val names: String = elements.mkString("|")
+       s"(${PCData.value}|$names)*"
+     }
+   }
+   /** A choice or a sequence of child elements, with the occurrence indicator of the whole group. */
+   final case class Children(elements: Elements.Many, occurrence: Occurrence) extends ContentSpec {
+     override def toString: String = s"$elements$occurrence"
+   }
+   object Children {
+     def parse(string: String, occurrence: Occurrence): Children =
+       Children(Elements.Many.parse(string), occurrence)
+   }
+
+   /**
+    * Parses a content model string.
+    *
+    * @param model "EMPTY", "ANY", or a parenthesized group optionally followed by an occurrence indicator
+    * @throws IllegalArgumentException if `model` is neither a keyword nor a parenthesized group
+    */
+   def parse(model: String): ContentSpec = model match {
+     case Empty.value => Empty
+     case Any.value => Any
+     case _ =>
+       val (parenthesized: String, occurrence: Occurrence) = Occurrence.parse(model)
+       require(isParenthesized(parenthesized), s"content model is not a parenthesized group: $model")
+       val string: String = removeParentheses(parenthesized)
+       occurrence match {
+         // Both '(#PCDATA)' and '(#PCDATA)*' - the Mixed production with no element names - mean "text only";
+         // without this case the latter was parsed as a sequence containing an element named "#PCDATA".
+         case Occurrence.Once | Occurrence.RepeatOptional if string == PCData.value => PCData
+         case Occurrence.RepeatOptional =>
+           Elements.Choice.split(string) match {
+             case PCData.value :: names if names.nonEmpty => Mixed(names.map(Elements.Element))
+             case _ => Children.parse(string, occurrence)
+           }
+         case _ => Children.parse(string, occurrence)
+       }
+   }
+ }
+
+ /** What a content particle is made of: a single element name, a choice, or a sequence. */
+ sealed trait Elements
+ object Elements {
+   final case class Element(name: String) extends Elements {
+     override def toString: String = name
+   }
+
+   /** Companion of a group kind; knows the separator that the group's members are joined with. */
+   sealed abstract class ManyCompanion(val separator: Char) {
+     final def split(string: String): List[String] = ElementContentModel.split(string, separator)
+   }
+
+   /** A parenthesized group of content particles. */
+   sealed abstract class Many(children: List[Cp]) extends Elements {
+     def companion: ManyCompanion
+     final override def toString: String =
+       children.map(_.toString).mkString("(", companion.separator.toString, ")")
+   }
+
+   object Choice extends ManyCompanion(separator = '|')
+   final case class Choice(children: List[Cp]) extends Many(children) {
+     override def companion: ManyCompanion = Choice
+   }
+
+   object Sequence extends ManyCompanion(separator = ',')
+   final case class Sequence(children: List[Cp]) extends Many(children) {
+     override def companion: ManyCompanion = Sequence
+   }
+
+   object Many {
+     // A group with more than one '|'-separated member is a choice; anything else is a sequence (possibly of one).
+     def parse(string: String): Many = Choice.split(string) match {
+       case alternatives if alternatives.length > 1 => Choice(alternatives.map(Cp.parse))
+       case _ => Sequence(Sequence.split(string).map(Cp.parse))
+     }
+   }
+
+   // A parenthesized string is a group; anything else is taken to be an element name.
+   def parse(string: String): Elements =
+     if (isParenthesized(string)) Many.parse(removeParentheses(string))
+     else Element(string)
+ }
+
+ /** Content particle: a name or a group together with its occurrence indicator. */
+ final case class Cp(elements: Elements, occurrence: Occurrence) {
+   override def toString: String = elements.toString + occurrence.toString
+ }
+ object Cp {
+   // The trailing occurrence indicator (if any) is split off first; the rest is a name or a group.
+   def parse(string: String): Cp = Occurrence.parse(string) match {
+     case (maybeParenthesized, occurrence) => Cp(Elements.parse(maybeParenthesized), occurrence)
+   }
+ }
+
+ /**
+  * Occurrence indicator of a content particle or of a group.
+  * Declared as a sealed trait (not a concrete class) so that the case objects below are its only values.
+  */
+ sealed trait Occurrence
+ object Occurrence {
+   /** No indicator: exactly once. */
+   case object Once extends Occurrence {
+     override def toString: String = ""
+   }
+   /** Occurrence that is written as a one-character suffix. */
+   sealed trait Signed extends Occurrence {
+     final override def toString: String = sign
+     def sign: String
+   }
+   case object OnceOptional extends Signed {
+     override def sign: String = "?"
+   }
+   case object Repeat extends Signed {
+     override def sign: String = "+"
+   }
+   case object RepeatOptional extends Signed {
+     override def sign: String = "*"
+   }
+
+   /**
+    * Splits a trailing occurrence indicator off `string`.
+    *
+    * @return the string without the indicator and the indicator found; `Once` (and the string unchanged) if there is none
+    */
+   def parse(string: String): (String, Occurrence) = {
+     val signed: List[Signed] = List(OnceOptional, RepeatOptional, Repeat)
+     signed.find(occurrence => string.endsWith(occurrence.sign)) match {
+       case Some(occurrence) => (string.init, occurrence)
+       case None => (string, Once)
+     }
+   }
+ }
+
+ // True when the string starts with '(' and the parenthesis that closes it is the very last character,
+ // i.e. one pair of parentheses encloses the whole string.
+ private def isParenthesized(string: String): Boolean = {
+   val lastIndex: Int = string.length - 1
+
+   // `depth` is the nesting depth before the character at `index` is looked at
+   @tailrec
+   def closesAtEnd(index: Int, depth: Int): Boolean = {
+     val newDepth: Int = string.charAt(index) match {
+       case '(' => depth + 1
+       case ')' => depth - 1
+       case _ => depth
+     }
+     if (index == lastIndex) newDepth == 0
+     else if (newDepth == 0) false // the opening parenthesis got closed before the end
+     else closesAtEnd(index + 1, newDepth)
+   }
+
+   string.startsWith("(") && closesAtEnd(0, 0)
+ }
+
+ // Strips enclosing pairs of parentheses for as long as one pair encloses the whole string.
+ @tailrec
+ private def removeParentheses(string: String): String =
+   if (isParenthesized(string)) removeParentheses(string.substring(1, string.length - 1))
+   else string
+
+ // Splits the string at every separator that is at the top level of parentheses.
+ // A piece that is empty is kept, except for a trailing one (nothing after the last separator), which is dropped.
+ private def split(string: String, separator: Char): List[String] = {
+   val pieces = List.newBuilder[String]
+   val piece = new StringBuilder
+   var depth: Int = 0
+
+   string.foreach { current =>
+     if (depth == 0 && current == separator) {
+       // a top-level separator ends the current piece
+       pieces += piece.toString
+       piece.clear()
+     } else {
+       if (current == '(') depth += 1 else if (current == ')') depth -= 1
+       piece.append(current)
+     }
+   }
+
+   if (piece.length > 0) pieces += piece.toString
+   pieces.result()
+ }
+}
diff --git a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala
index 9c170b6e..362c1516 100644
--- a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala
+++ b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala
@@ -15,8 +15,8 @@ package xml
package parsing
import scala.collection.Seq
-import org.xml.sax.{Attributes, SAXNotRecognizedException, SAXNotSupportedException}
-import org.xml.sax.ext.DefaultHandler2
+import org.xml.sax.{Attributes, Locator, SAXNotRecognizedException, SAXNotSupportedException}
+import org.xml.sax.ext.{DefaultHandler2, Locator2}
// can be mixed into FactoryAdapter if desired
trait ConsoleErrorHandler extends DefaultHandler2 {
@@ -42,10 +42,21 @@ trait ConsoleErrorHandler extends DefaultHandler2 {
abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Node] {
val normalizeWhitespace: Boolean = false
+ // reference to the XMLReader that parses the document; this is used to query
+ // features (e.g., 'is-standalone') and properties (e.g., document-xml-version) -
+ // see http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
+ private var xmlReader: Option[XMLReader] = None
+
+ private var dtdBuilder: Option[DtdBuilder] = None
+ // true while a DtdBuilder exists and has not yet been marked as done
+ private def inDtd: Boolean = dtdBuilder.exists(builder => !builder.isDone)
+
private var document: Option[Document] = None
+ private var baseURI: Option[String] = None
+ private var xmlEncoding: Option[String] = None
private var prefixMappings: List[(String, String)] = List.empty
+ // TODO all the variables should be private, but - binary compatibility...
var prolog: List[Node] = List.empty
var rootElem: Node = _
var epilogue: List[Node] = List.empty
@@ -100,16 +111,10 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
}
/**
- * Load XML document from the source using the parser.
- */
- def loadDocument(source: InputSource, parser: SAXParser): Document =
- loadDocument(source, parser.getXMLReader)
-
- /**
- * Load XML document from the source using the reader.
+ * Load XML document from the inputSource using the xmlReader.
*/
- def loadDocument(source: InputSource, xmlReader: XMLReader): Document = {
- if (source == null) throw new IllegalArgumentException("InputSource cannot be null")
+ def loadDocument(inputSource: InputSource, xmlReader: XMLReader): Document = {
+ if (inputSource == null) throw new IllegalArgumentException("InputSource cannot be null")
xmlReader.setContentHandler(this)
xmlReader.setDTDHandler(this)
@@ -126,7 +131,16 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
case _: SAXNotSupportedException =>
}
- xmlReader.parse(source)
+ /* Use DeclHandler if it is supported by the xmlReader. */
+ try {
+ xmlReader.setProperty("http://xml.org/sax/properties/declaration-handler", this)
+ } catch {
+ case _: SAXNotRecognizedException =>
+ case _: SAXNotSupportedException =>
+ }
+
+ this.xmlReader = Some(xmlReader)
+ xmlReader.parse(inputSource)
document.get
}
@@ -175,8 +189,25 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
/* ContentHandler methods */
+ // Since Java 14, ContentHandler has a method that delivers the values from the XML declaration:
+ // def declaration(version: String, encoding: String, standalone: String): Unit = ()
+ // but it'll be years until we are all on Java 14 *and* Xerces starts calling this method...
+
+ /**
+  * Records what the parser's locator reveals about the document: its system id
+  * (kept as the base URI) and, when the locator is a Locator2, its encoding.
+  */
+ override def setDocumentLocator(locator: Locator): Unit = {
+   baseURI = Option(locator.getSystemId)
+
+   // Note: Xerces calls setDocumentLocator() (and startDocument()) *before* it even reads the XML declaration;
+   // the version delivered here - Locator2.getXMLVersion - is always "1.0";
+   // the real version is retrieved as a property of the XML reader in endDocument().
+   locator match {
+     case extended: Locator2 => xmlEncoding = Option(extended.getEncoding)
+     case _ => () // a plain Locator exposes no encoding; xmlEncoding keeps its current value
+   }
+ }
+
override def startDocument(): Unit = {
- scopeStack ::= TopScope // TODO remove
+ scopeStack ::= TopScope // prepends TopScope to scopeStack; TODO turn into a parameter
}
override def endDocument(): Unit = {
@@ -187,13 +218,33 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
this.document = Some(document)
document.children = prolog ++ rootElem ++ epilogue
document.docElem = rootElem
- document.dtd = null
- document.baseURI = null
- document.encoding = None
- document.standAlone = None
- document.version = None
+ document.dtd = dtdBuilder.map(_.dtd).orNull
+ document.baseURI = baseURI.orNull
+ document.encoding = xmlEncoding
+
+ document.version =
+ try {
+ Option(xmlReader.get.getProperty("http://xml.org/sax/properties/document-xml-version").asInstanceOf[String])
+ } catch {
+ case _: SAXNotRecognizedException => None
+ case _: SAXNotSupportedException => None
+ }
+
+ document.standAlone =
+ try {
+ Some(xmlReader.get.getFeature("http://xml.org/sax/features/is-standalone"))
+ } catch {
+ case _: SAXNotRecognizedException => None
+ case _: SAXNotSupportedException => None
+ }
// Note: resetting to the freshly-created state; needed only if this instance is reused, which we do not do...
+ dtdBuilder = None
+ xmlReader = None
+
+ baseURI = None
+ xmlEncoding = None
+
hStack = hStack.last :: Nil // TODO List.empty
scopeStack = scopeStack.tail // TODO List.empty
@@ -322,13 +373,36 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
}
}
}
+
+ /** Ignorable whitespace reported by the parser is deliberately dropped (no-op). */
+ override def ignorableWhitespace(ch: Array[Char], offset: Int, length: Int): Unit = {}
+
/**
* Processing instruction.
*/
- override def processingInstruction(target: String, data: String): Unit = {
- captureText()
- hStack = hStack.reverse_:::(createProcInstr(target, data).toList)
- }
+ override def processingInstruction(target: String, data: String): Unit =
+   if (!inDtd) {
+     // Outside the DTD: call captureText(), then prepend the created node(s) to hStack.
+     captureText()
+     hStack = hStack.reverse_:::(createProcInstr(target, data).toList)
+   } else {
+     // Inside the DTD: the instruction is handed to the DTD builder instead.
+     dtdBuilder.foreach(_.processingInstruction(target, data))
+   }
+
+ /** Skipped entities are deliberately ignored (no-op). */
+ override def skippedEntity(name: String): Unit = {}
+
+ /* LexicalHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/ext/LexicalHandler.html) */
+
+ /** Start of the DTD: creates a DtdBuilder from the given name, public id and system id and makes it current. */
+ override def startDTD(name: String, publicId: String, systemId: String): Unit = {
+   val builder = DtdBuilder(name, publicId, systemId)
+   dtdBuilder = Some(builder)
+ }
+
+ /** End of the DTD: forwards endDTD to the current DtdBuilder, if there is one. */
+ override def endDTD(): Unit = dtdBuilder.foreach { builder => builder.endDTD() }
+
+ /** Forwards the start of the named entity to the current DtdBuilder; does nothing when none is set. */
+ override def startEntity(name: String): Unit = dtdBuilder.foreach { builder => builder.startEntity(name) }
+ /** Forwards the end of the named entity to the current DtdBuilder; does nothing when none is set. */
+ override def endEntity(name: String): Unit = dtdBuilder.foreach { builder => builder.endEntity(name) }
/**
* Start of a CDATA section.
@@ -347,8 +421,32 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
* Comment.
*/
override def comment(ch: Array[Char], start: Int, length: Int): Unit = {
- captureText()
val commentText: String = String.valueOf(ch.slice(start, start + length))
- hStack = hStack.reverse_:::(createComment(commentText).toList)
+ if (inDtd) dtdBuilder.foreach(_.comment(commentText)) else { // while inDtd, the comment text goes to the DTD builder
+ captureText()
+ hStack = hStack.reverse_:::(createComment(commentText).toList) // otherwise the created comment node(s) are prepended to hStack
+ }
}
+
+ /* DTDHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/DTDHandler.html) */
+
+ /** Forwards a notation declaration to the current DtdBuilder; does nothing when none is set. */
+ override def notationDecl(name: String, publicId: String, systemId: String): Unit =
+   dtdBuilder.foreach { builder => builder.notationDecl(name, publicId, systemId) }
+
+ /** Forwards an unparsed entity declaration to the current DtdBuilder; does nothing when none is set. */
+ override def unparsedEntityDecl(name: String, publicId: String, systemId: String, notationName: String): Unit =
+   dtdBuilder.foreach { builder => builder.unparsedEntityDecl(name, publicId, systemId, notationName) }
+
+ /* DeclHandler methods (see https://docs.oracle.com/javase/8/docs/api/org/xml/sax/ext/DeclHandler.html) */
+
+ /** Forwards an element type declaration to the current DtdBuilder; does nothing when none is set. */
+ override def elementDecl(name: String, model: String): Unit =
+   dtdBuilder.foreach { builder => builder.elementDecl(name, model) }
+
+ /** Forwards an attribute declaration to the current DtdBuilder; does nothing when none is set. */
+ override def attributeDecl(eName: String, aName: String, `type`: String, mode: String, value: String): Unit =
+   dtdBuilder.foreach { builder => builder.attributeDecl(eName, aName, `type`, mode, value) }
+
+ /** Forwards an internal entity declaration to the current DtdBuilder; does nothing when none is set. */
+ override def internalEntityDecl(name: String, value: String): Unit =
+   dtdBuilder.foreach { builder => builder.internalEntityDecl(name, value) }
+
+ /** Forwards an external entity declaration to the current DtdBuilder; does nothing when none is set. */
+ override def externalEntityDecl(name: String, publicId: String, systemId: String): Unit =
+   dtdBuilder.foreach { builder => builder.externalEntityDecl(name, publicId, systemId) }
}