// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
/**
 * Static XML utility methods: escaping of text for inclusion in XML output,
 * thin convenience wrappers around the JAXP/DOM API, and validation of XML
 * names.
 *
 * @author Bjorn Borud
 * @author Vegard Havdal
 * @author bratseth
 * @author Steinar Knutsen
 */
public class XML {

    /**
     * The point of this weird class and the jumble of abstract methods is
     * linking the scan for characters that must be quoted into the quoting
     * table, and making it actual work to make them go out of sync again.
     *
     * <p>{@code isLegal} classifies one code point. When the code point can
     * not be emitted as is, exactly one of the abstract hooks is invoked
     * before {@code false} is returned, so a subclass can record what to emit
     * instead.</p>
     */
    private static abstract class LegalCharacters {

        // To quote http://www.w3.org/TR/REC-xml/ :
        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
        // [#x10000-#x10FFFF]

        /**
         * Decides whether a code point may be copied verbatim to the output.
         *
         * @param codepoint      the code point to classify
         * @param escapeLow      whether control characters below 0x20 (other
         *                       than tab, LF and CR) should be treated as illegal
         * @param stripCodePoint a code point which is always reported as
         *                       illegal and handed to {@link #remove()}
         * @param isAttribute    whether double quotes need quoting
         * @return true if the code point can be emitted unchanged, false if a
         *         hook has been invoked to handle it
         */
        final boolean isLegal(final int codepoint, final boolean escapeLow,
                              final int stripCodePoint, final boolean isAttribute) {
            // Stripping takes precedence over every other rule.
            if (codepoint == stripCodePoint) {
                remove();
                return false;
            }
            if (codepoint < ' ') {
                if (!escapeLow) {
                    return true;
                }
                switch (codepoint) {
                    case 0x09:
                    case 0x0a:
                    case 0x0d:
                        return true;
                    default:
                        ctrlEscape(codepoint);
                        return false;
                }
            }
            if (codepoint <= 0xd7ff) {
                switch (codepoint) {
                    case '&':
                        quoteAmp();
                        return false;
                    case '<':
                        quoteLt();
                        return false;
                    case '>':
                        quoteGt();
                        return false;
                    case '"':
                        // Double quotes only need quoting inside attribute values.
                        if (isAttribute) {
                            quoteQuot();
                            return false;
                        }
                        return true;
                    default:
                        return true;
                }
            }
            if ((codepoint >= 0xe000 && codepoint <= 0xfffd)
                    || (codepoint >= 0x10000 && codepoint <= 0x10ffff)) {
                return true;
            }
            // Surrogates, 0xFFFE, 0xFFFF and anything above 0x10FFFF.
            replace(codepoint);
            return false;
        }

        /** Invoked for a double quote when quoting for an attribute value. */
        protected abstract void quoteQuot();

        /** Invoked for {@code >}. */
        protected abstract void quoteGt();

        /** Invoked for {@code <}. */
        protected abstract void quoteLt();

        /** Invoked for {@code &}. */
        protected abstract void quoteAmp();

        /** Invoked for the code point which is to be stripped. */
        protected abstract void remove();

        /** Invoked for a control character below 0x20 which must not be emitted. */
        protected abstract void ctrlEscape(int codepoint);

        /** Invoked for a code point outside the XML {@code Char} production. */
        protected abstract void replace(int codepoint);

    }

    /**
     * Records, in {@link #lastQuoted}, the characters to emit in place of the
     * last code point {@code isLegal} rejected. Holds state, so one instance is
     * created per escaping operation.
     */
    private static final class Quote extends LegalCharacters {

        /** Replacement for the most recently rejected code point. */
        char[] lastQuoted;

        private static final char[] EMPTY = new char[0];
        private static final char[] REPLACEMENT_CHARACTER = "\ufffd".toCharArray();
        private static final char[] AMP = "&amp;".toCharArray();
        private static final char[] LT = "&lt;".toCharArray();
        private static final char[] GT = "&gt;".toCharArray();
        private static final char[] QUOT = "&quot;".toCharArray();

        @Override
        protected void remove() {
            lastQuoted = EMPTY;
        }

        @Override
        protected void replace(final int codepoint) {
            lastQuoted = REPLACEMENT_CHARACTER;
        }

        @Override
        protected void quoteQuot() {
            lastQuoted = QUOT;
        }

        @Override
        protected void quoteGt() {
            lastQuoted = GT;
        }

        @Override
        protected void quoteLt() {
            lastQuoted = LT;
        }

        @Override
        protected void quoteAmp() {
            lastQuoted = AMP;
        }

        @Override
        protected void ctrlEscape(final int codepoint) {
            lastQuoted = REPLACEMENT_CHARACTER;
        }

    }

    /**
     * A classifier which only answers whether a code point is legal; every
     * hook is a no-op. It has no fields, so a single instance is shared.
     */
    private static final class Scan extends LegalCharacters {

        @Override
        protected void quoteQuot() {
        }

        @Override
        protected void quoteGt() {
        }

        @Override
        protected void quoteLt() {
        }

        @Override
        protected void quoteAmp() {
        }

        @Override
        protected void remove() {
        }

        @Override
        protected void ctrlEscape(final int codepoint) {
        }

        @Override
        protected void replace(final int codepoint) {
        }

    }

    private static final Scan scanner = new Scan();

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities. The string is escaped as an attribute value (double
     * quotes are quoted) and low control characters are replaced.
     *
     * @param s1 String possibly containing characters that need to be escaped
     *           in XML
     * @return the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1) {
        return xmlEscape(s1, true, true, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1          String possibly containing characters that need to be
     *                    escaped in XML
     * @param isAttribute Is the input string to be used as an attribute?
     * @return the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute) {
        return xmlEscape(s1, isAttribute, true, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1             String possibly containing characters that need to
     *                       be escaped in XML
     * @param isAttribute    Is the input string to be used as an attribute?
     * @param stripCharacter any occurrence of this character is removed from
     *                       the string
     * @return the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, char stripCharacter) {
        return xmlEscape(s1, isAttribute, true, null, (int) stripCharacter);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1             String possibly containing characters that need to
     *                       be escaped in XML
     * @param isAttribute    Is the input string to be used as an attribute?
     * @param escapeLowAscii Should ascii characters below 32 be escaped as well
     * @return the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, boolean escapeLowAscii) {
        return xmlEscape(s1, isAttribute, escapeLowAscii, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1             String possibly containing characters that need to
     *                       be escaped in XML
     * @param isAttribute    Is the input string to be used as an attribute?
     * @param escapeLowAscii Should ascii characters below 32 be escaped as well
     * @param stripCharacter any occurrence of this character is removed from
     *                       the string
     * @return the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, boolean escapeLowAscii, char stripCharacter) {
        return xmlEscape(s1, isAttribute, escapeLowAscii, null, (int) stripCharacter);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>all ascii codes less than 32 except 9 (tab), 10 (nl) and 13 (cr)</li>
     * <li>ampersand ({@code &})</li>
     * <li>less than ({@code <})</li>
     * <li>larger than ({@code >})</li>
     * <li>double quotes ({@code "}) if isAttribute is {@code true}</li>
     * </ul>
     * with character entities.
     *
     * @param string      the text to escape
     * @param isAttribute whether double quotes should be quoted
     * @param buffer      buffer the escaped text is appended to when escaping is
     *                    needed, or null to allocate one
     * @return the input string itself if nothing needed escaping, otherwise the
     *         full content of the buffer after appending the escaped text
     */
    public static String xmlEscape(String string, boolean isAttribute, StringBuilder buffer) {
        return xmlEscape(string, isAttribute, true, buffer, -1);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>all ascii codes less than 32 except 9 (tab), 10 (nl) and 13 (cr) if
     * escapeLowAscii is {@code true}</li>
     * <li>ampersand ({@code &})</li>
     * <li>less than ({@code <})</li>
     * <li>larger than ({@code >})</li>
     * <li>double quotes ({@code "}) if isAttribute is {@code true}</li>
     * </ul>
     * with character entities.
     *
     * @param string         the text to escape
     * @param isAttribute    whether double quotes should be quoted
     * @param escapeLowAscii whether low control characters should be replaced
     * @param buffer         buffer the escaped text is appended to when
     *                       escaping is needed, or null to allocate one
     * @return the input string itself if nothing needed escaping, otherwise the
     *         full content of the buffer after appending the escaped text
     */
    public static String xmlEscape(String string, boolean isAttribute, boolean escapeLowAscii, StringBuilder buffer) {
        return xmlEscape(string, isAttribute, escapeLowAscii, buffer, -1);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>all ascii codes less than 32 except 9 (tab), 10 (nl) and 13 (cr) if
     * escapeLowAscii is {@code true}; these are replaced by U+FFFD</li>
     * <li>ampersand ({@code &})</li>
     * <li>less than ({@code <})</li>
     * <li>larger than ({@code >})</li>
     * <li>double quotes ({@code "}) if isAttribute is {@code true}</li>
     * </ul>
     * with character entities. Code points outside the XML {@code Char}
     * production (e.g. unpaired surrogates) are replaced by U+FFFD.
     *
     * @param string         the text to escape, must not be null
     * @param isAttribute    whether double quotes should be quoted
     * @param escapeLowAscii whether low control characters should be replaced
     * @param buffer         buffer the escaped text is appended to when
     *                       escaping is needed, or null to allocate one. It is
     *                       left untouched when nothing needs escaping.
     * @param stripCodePoint any occurrence of this character is removed from
     *                       the string; use -1 to strip nothing
     * @return the input string itself if nothing needed escaping, otherwise the
     *         full content of the buffer after appending the escaped text
     */
    public static String xmlEscape(String string, boolean isAttribute, boolean escapeLowAscii,
                                   StringBuilder buffer, int stripCodePoint) {
        // buffer and stripCodePoint changed order in the signature compared to
        // the char based API to avoid wrong method being called

        // This is inner loop stuff, so we sacrifice a little for speed -
        // no copying will occur until a character needing escaping is found
        final int length = string.length();
        int i = 0;
        while (i < length) {
            final int codepoint = string.codePointAt(i);
            if (!scanner.isLegal(codepoint, escapeLowAscii, stripCodePoint, isAttribute)) {
                break;
            }
            i += Character.charCount(codepoint);
        }
        if (i >= length) {
            return string;
        }

        // i is at the first codepoint which needs replacing
        if (buffer == null) {
            buffer = new StringBuilder((int) (length * 1.2));
        }
        buffer.append(string, 0, i);

        // Don't guard against double-escaping, as:
        // don't try to be clever (LCJ).
        final Quote escaper = new Quote();
        while (i < length) {
            final int codepoint = string.codePointAt(i);
            if (escaper.isLegal(codepoint, escapeLowAscii, stripCodePoint, isAttribute)) {
                buffer.appendCodePoint(codepoint);
            } else {
                buffer.append(escaper.lastQuoted);
            }
            i += Character.charCount(codepoint);
        }
        return buffer.toString();
    }

    /**
     * Returns the Document of an XML file reader
     *
     * @param reader the source of the XML text
     * @return the parsed document
     * @throws IllegalArgumentException if the content cannot be read or parsed
     * @throws RuntimeException         if a document builder cannot be created
     */
    public static Document getDocument(Reader reader) {
        try {
            return getDocumentBuilder().parse(new InputSource(reader));
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not read '" + reader + "'", e);
        } catch (SAXParseException e) {
            throw new IllegalArgumentException("Could not parse '" + reader + "', error at line " + e.getLineNumber() + ", column " + e.getColumnNumber(), e);
        } catch (SAXException e) {
            throw new IllegalArgumentException("Could not parse '" + reader + "'", e);
        }
    }

    /**
     * Returns the Document of the string XML payload
     *
     * @param string the XML text to parse
     * @return the parsed document
     * @throws IllegalArgumentException if the text cannot be parsed
     */
    public static Document getDocument(String string) {
        return getDocument(new StringReader(string));
    }

    /**
     * Creates a new XML DocumentBuilder using the JDK internal Xerces
     * implementation and the default class loader lookup.
     *
     * @return a DocumentBuilder
     * @throws RuntimeException if we fail to create one
     */
    public static DocumentBuilder getDocumentBuilder() {
        return getDocumentBuilder("com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl", null);
    }

    /**
     * Creates a new XML DocumentBuilder which is namespace aware and XInclude
     * aware.
     *
     * <p>NOTE(review): no restrictions on DTDs or external entities are
     * configured here; confirm that is acceptable before parsing untrusted
     * input with the returned builder.</p>
     *
     * @param implementation which jaxp implementation should be used
     * @param classLoader    which class loader should be used when getting a
     *                       new DocumentBuilder
     * @return a DocumentBuilder
     * @throws RuntimeException if we fail to create one
     */
    public static DocumentBuilder getDocumentBuilder(String implementation, ClassLoader classLoader) {
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(implementation, classLoader);
            factory.setNamespaceAware(true);
            factory.setXIncludeAware(true);
            return factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // Keep the cause so the actual configuration problem can be diagnosed.
            throw new RuntimeException("Could not create an XML builder", e);
        }
    }

    /**
     * Returns the child Element objects from a w3c dom spec
     *
     * @param spec the parent element, may be null
     * @return List of elements. Empty list (never null) if none found or if the
     *         given element is null
     */
    public static List<Element> getChildren(Element spec) {
        List<Element> children = new ArrayList<>();
        if (spec == null) {
            return children;
        }
        NodeList childNodes = spec.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node child = childNodes.item(i);
            if (child instanceof Element) {
                children.add((Element) child);
            }
        }
        return children;
    }

    /**
     * Returns the child Element objects with given name from a w3c dom spec
     *
     * @param spec the parent element, may be null
     * @param name the node name children must have to be included
     * @return List of elements. Empty list (never null) if none found or the
     *         given element is null
     */
    public static List<Element> getChildren(Element spec, String name) {
        List<Element> ret = new ArrayList<>();
        if (spec == null) {
            return ret;
        }
        NodeList children = spec.getChildNodes();
        if (children == null) {
            return ret;
        }
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child instanceof Element && child.getNodeName().equals(name)) {
                ret.add((Element) child);
            }
        }
        return ret;
    }

    /**
     * Gets the string contents of the given Element: the node value of its
     * first child. Returns "" if the element is null or has no children.
     * Note that the node value of a first child which is not a text node is
     * returned as is, which may be null.
     *
     * @param e the element to read, may be null
     * @return the node value of the first child, or "" if there is none
     */
    public static String getValue(Element e) {
        if (e == null) {
            return "";
        }
        Node child = e.getFirstChild();
        if (child == null) {
            return "";
        }
        return child.getNodeValue();
    }

    /**
     * Returns the first child with the given name, or null if none
     *
     * @param e    the parent element, may be null
     * @param name the node name to look for
     * @return the first matching child element, or null
     */
    public static Element getChild(Element e, String name) {
        List<Element> matches = getChildren(e, name);
        return matches.isEmpty() ? null : matches.get(0);
    }

    /**
     * Returns the path to the given xml node, where each node name is separated
     * by the given separator string. The path starts at the outermost ancestor
     * which is not a Document.
     *
     * @param n   The xml node to find path to
     * @param sep The separator string
     * @return The path to the xml node as a String, "" if the node is null
     */
    public static String getNodePath(Node n, String sep) {
        if (n == null) {
            return "";
        }
        StringBuilder path = new StringBuilder(n.getNodeName());
        for (Node parent = n.getParentNode();
             parent != null && !(parent instanceof Document);
             parent = parent.getParentNode()) {
            path.insert(0, sep).insert(0, parent.getNodeName());
        }
        return path.toString();
    }

    /** Returns whether low &lt;= x &lt;= high. */
    private static boolean inclusiveWithin(int x, int low, int high) {
        return low <= x && x <= high;
    }

    /** Returns whether the code point is in the NameStartChar production. */
    private static boolean nameStartSet(int codepoint) {
        // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] |
        // [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
        // [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
        // | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
        if (codepoint < 0xC0) {
            return inclusiveWithin(codepoint, 'a', 'z')
                    || inclusiveWithin(codepoint, 'A', 'Z')
                    || codepoint == '_'
                    || codepoint == ':';
        }
        return inclusiveWithin(codepoint, 0xC0, 0xD6)
                || inclusiveWithin(codepoint, 0xD8, 0xF6)
                || inclusiveWithin(codepoint, 0xF8, 0x2FF)
                || inclusiveWithin(codepoint, 0x370, 0x37D)
                || inclusiveWithin(codepoint, 0x37F, 0x1FFF)
                || inclusiveWithin(codepoint, 0x200C, 0x200D)
                || inclusiveWithin(codepoint, 0x2070, 0x218F)
                || inclusiveWithin(codepoint, 0x2C00, 0x2FEF)
                || inclusiveWithin(codepoint, 0x3001, 0xD7FF)
                || inclusiveWithin(codepoint, 0xF900, 0xFDCF)
                || inclusiveWithin(codepoint, 0xFDF0, 0xFFFD)
                || inclusiveWithin(codepoint, 0x10000, 0xEFFFF);
    }

    /**
     * Returns whether the code point is one of those allowed in a name, but
     * not as its first character.
     */
    private static boolean nameSetExceptStart(int codepoint) {
        // "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        if (codepoint < 0xB7) {
            return inclusiveWithin(codepoint, '0', '9')
                    || codepoint == '-'
                    || codepoint == '.';
        }
        return codepoint == 0xB7
                || inclusiveWithin(codepoint, 0x300, 0x36F)
                || inclusiveWithin(codepoint, 0x203F, 0x2040);
    }

    /**
     * Returns whether the code point is valid in a name, at the first
     * position if {@code first} is true and at any later position otherwise.
     */
    private static boolean nameChar(int codepoint, boolean first) {
        // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        boolean valid = nameStartSet(codepoint);
        return first ? valid : valid || nameSetExceptStart(codepoint);
    }

    /**
     * Check whether the name of a tag or attribute conforms to XML
     * 1.1 (Second Edition). This does not check against reserved names, it
     * only checks the set of characters used. The empty string and any name
     * containing an unpaired surrogate are not valid.
     *
     * @param possibleName a possibly valid XML name, must not be null
     * @return true if the name may be used as an XML tag or attribute name
     */
    public static boolean isName(CharSequence possibleName) {
        final int barrier = possibleName.length();
        if (barrier < 1) {
            return false;
        }
        boolean first = true;
        int i = 0;
        while (i < barrier) {
            // An unpaired surrogate is returned as itself here, and is then
            // rejected by nameChar since surrogates are in no name production.
            final int codepoint = Character.codePointAt(possibleName, i);
            if (!nameChar(codepoint, first)) {
                return false;
            }
            i += Character.charCount(codepoint);
            first = false;
        }
        return true;
    }

}