// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * Static XML utility methods
 *
 * @author Bjorn Borud
 * @author Vegard Havdal
 * @author bratseth
 * @author Steinar Knutsen
 */
public class XML {

    /**
     * The point of this weird class and the jumble of abstract methods is
     * linking the scan for characters that must be quoted into the quoting
     * table, and making it actual work to make them go out of sync again.
     *
     * {@link #isLegal} classifies one code point; whenever the code point may
     * not be copied verbatim, exactly one of the abstract callbacks is invoked
     * before {@code false} is returned.
     */
    private static abstract class LegalCharacters {

        // To quote http://www.w3.org/TR/REC-xml/ :
        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
        // [#x10000-#x10FFFF]

        /**
         * Decides whether a code point can be emitted unchanged.
         *
         * @param codepoint the code point to classify
         * @param escapeLow whether control characters below 0x20 (other than
         *                  tab, LF and CR) should be treated as illegal
         * @param stripCodePoint a code point which should be removed, checked
         *                       before any other rule
         * @param isAttribute whether a double quote must be quoted
         * @return true if the code point may be copied verbatim, false if a
         *         callback has been invoked to handle it
         */
        final boolean isLegal(final int codepoint, final boolean escapeLow,
                              final int stripCodePoint, final boolean isAttribute) {
            if (codepoint == stripCodePoint) {
                return removeCodePoint();
            } else if (codepoint < ' ') {
                if (!escapeLow) {
                    return true;
                }
                switch (codepoint) {
                case 0x09:
                case 0x0a:
                case 0x0d:
                    return true;
                default:
                    return ctrlEscapeCodePoint(codepoint);
                }
            } else if (codepoint >= 0x20 && codepoint <= 0xd7ff) {
                switch (codepoint) {
                case '&':
                    return ampCodePoint();
                case '<':
                    return ltCodePoint();
                case '>':
                    return gtCodePoint();
                case '"':
                    return quotCodePoint(isAttribute);
                default:
                    return true;
                }
            } else if ((codepoint >= 0xe000 && codepoint <= 0xfffd)
                    || (codepoint >= 0x10000 && codepoint <= 0x10ffff)) {
                return true;
            } else {
                // Surrogates, U+FFFE, U+FFFF and anything above U+10FFFF.
                return filterCodePoint(codepoint);
            }
        }

        private boolean quotCodePoint(final boolean isAttribute) {
            if (isAttribute) {
                quoteQuot();
                return false;
            } else {
                return true;
            }
        }

        private boolean filterCodePoint(final int codepoint) {
            replace(codepoint);
            return false;
        }

        private boolean gtCodePoint() {
            quoteGt();
            return false;
        }

        private boolean ltCodePoint() {
            quoteLt();
            return false;
        }

        private boolean ampCodePoint() {
            quoteAmp();
            return false;
        }

        private boolean ctrlEscapeCodePoint(final int codepoint) {
            ctrlEscape(codepoint);
            return false;
        }

        private boolean removeCodePoint() {
            remove();
            return false;
        }

        protected abstract void quoteQuot();

        protected abstract void quoteGt();

        protected abstract void quoteLt();

        protected abstract void quoteAmp();

        protected abstract void remove();

        protected abstract void ctrlEscape(int codepoint);

        protected abstract void replace(int codepoint);
    }

    /**
     * Records, in {@link #lastQuoted}, the replacement text for the most
     * recent code point which {@link #isLegal} rejected.
     */
    private static final class Quote extends LegalCharacters {

        char[] lastQuoted;

        private static final char[] EMPTY = new char[0];
        private static final char[] REPLACEMENT_CHARACTER = "\ufffd".toCharArray();
        private static final char[] AMP = "&amp;".toCharArray();
        private static final char[] LT = "&lt;".toCharArray();
        private static final char[] GT = "&gt;".toCharArray();
        private static final char[] QUOT = "&quot;".toCharArray();

        @Override
        protected void remove() {
            lastQuoted = EMPTY;
        }

        @Override
        protected void replace(final int codepoint) {
            lastQuoted = REPLACEMENT_CHARACTER;
        }

        @Override
        protected void quoteQuot() {
            lastQuoted = QUOT;
        }

        @Override
        protected void quoteGt() {
            lastQuoted = GT;
        }

        @Override
        protected void quoteLt() {
            lastQuoted = LT;
        }

        @Override
        protected void quoteAmp() {
            lastQuoted = AMP;
        }

        @Override
        protected void ctrlEscape(final int codepoint) {
            lastQuoted = REPLACEMENT_CHARACTER;
        }
    }

    /** Stateless classifier used only to find the first code point needing treatment. */
    private static final class Scan extends LegalCharacters {

        @Override
        protected void quoteQuot() {
        }

        @Override
        protected void quoteGt() {
        }

        @Override
        protected void quoteLt() {
        }

        @Override
        protected void quoteAmp() {
        }

        @Override
        protected void remove() {
        }

        @Override
        protected void ctrlEscape(final int codepoint) {
        }

        @Override
        protected void replace(final int codepoint) {
        }
    }

    private static final Scan scanner = new Scan();

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities. The string is treated as an attribute value, and
     * low control characters are replaced.
     *
     * @param s1
     *            String possibly containing characters that need to be escaped
     *            in XML
     *
     * @return Returns the input string with special characters that need to be
     *         escaped replaced by character entities.
     */
    public static String xmlEscape(String s1) {
        return xmlEscape(s1, true, true, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1
     *            String possibly containing characters that need to be escaped
     *            in XML
     * @param isAttribute
     *            Is the input string to be used as an attribute?
     *
     * @return Returns the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute) {
        return xmlEscape(s1, isAttribute, true, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1
     *            String possibly containing characters that need to be escaped
     *            in XML
     * @param isAttribute
     *            Is the input string to be used as an attribute?
     * @param stripCharacter
     *            any occurrence of this character is removed from the string
     *
     * @return Returns the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, char stripCharacter) {
        return xmlEscape(s1, isAttribute, true, null, (int) stripCharacter);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1
     *            String possibly containing characters that need to be escaped
     *            in XML
     * @param isAttribute
     *            Is the input string to be used as an attribute?
     * @param escapeLowAscii
     *            Should ascii characters below 32 (except tab, line feed and
     *            carriage return) be replaced as well
     *
     * @return Returns the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, boolean escapeLowAscii) {
        return xmlEscape(s1, isAttribute, escapeLowAscii, null, -1);
    }

    /**
     * Replaces the characters that need to be escaped with their corresponding
     * character entities.
     *
     * @param s1
     *            String possibly containing characters that need to be escaped
     *            in XML
     * @param isAttribute
     *            Is the input string to be used as an attribute?
     * @param escapeLowAscii
     *            Should ascii characters below 32 (except tab, line feed and
     *            carriage return) be replaced as well
     * @param stripCharacter
     *            any occurrence of this character is removed from the string
     *
     * @return Returns the input string with special characters that need to be
     *         escaped replaced by character entities
     */
    public static String xmlEscape(String s1, boolean isAttribute, boolean escapeLowAscii, char stripCharacter) {
        return xmlEscape(s1, isAttribute, escapeLowAscii, null, (int) stripCharacter);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>{@code &} with {@code &amp;}</li>
     * <li>{@code <} with {@code &lt;}</li>
     * <li>{@code >} with {@code &gt;}</li>
     * <li>{@code "} with {@code &quot;} (only if isAttribute is true)</li>
     * </ul>
     * with character entities. Low control characters are replaced as well.
     *
     * @param string the string to escape
     * @param isAttribute whether the string is to be used as an attribute value
     * @param buffer a buffer to append the escaped output to, or null to have
     *               one allocated when needed
     * @return the input string itself if nothing needed escaping, otherwise
     *         the full contents of the buffer after appending
     */
    public static String xmlEscape(String string, boolean isAttribute, StringBuilder buffer) {
        return xmlEscape(string, isAttribute, true, buffer, -1);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>{@code &} with {@code &amp;}</li>
     * <li>{@code <} with {@code &lt;}</li>
     * <li>{@code >} with {@code &gt;}</li>
     * <li>{@code "} with {@code &quot;} (only if isAttribute is true)</li>
     * </ul>
     * with character entities.
     *
     * @param string the string to escape
     * @param isAttribute whether the string is to be used as an attribute value
     * @param escapeLowAscii whether ascii characters below 32 (except tab,
     *                       line feed and carriage return) should be replaced
     * @param buffer a buffer to append the escaped output to, or null to have
     *               one allocated when needed
     * @return the input string itself if nothing needed escaping, otherwise
     *         the full contents of the buffer after appending
     */
    public static String xmlEscape(String string, boolean isAttribute, boolean escapeLowAscii, StringBuilder buffer) {
        return xmlEscape(string, isAttribute, escapeLowAscii, buffer, -1);
    }

    /**
     * Replaces the following:
     * <ul>
     * <li>{@code &} with {@code &amp;}</li>
     * <li>{@code <} with {@code &lt;}</li>
     * <li>{@code >} with {@code &gt;}</li>
     * <li>{@code "} with {@code &quot;} (only if isAttribute is true)</li>
     * </ul>
     * with character entities. Code points which are not legal XML characters,
     * and (if escapeLowAscii is set) control characters below 0x20 other than
     * tab, line feed and carriage return, are replaced by U+FFFD.
     *
     * @param string the string to escape
     * @param isAttribute whether the string is to be used as an attribute value
     * @param escapeLowAscii whether low control characters should be replaced
     * @param buffer a buffer to append the escaped output to, or null to have
     *               one allocated when needed; it is left untouched if nothing
     *               needs escaping
     * @param stripCodePoint
     *            any occurrence of this character is removed from the string
     * @return the input string itself if nothing needed escaping, otherwise
     *         the full contents of the buffer after appending
     */
    public static String xmlEscape(String string, boolean isAttribute, boolean escapeLowAscii,
                                   StringBuilder buffer, int stripCodePoint) {
        // buffer and stripCodePoint changed order in the signature compared to
        // the char based API to avoid wrong method being called

        // This is inner loop stuff, so we sacrifice a little for speed -
        // no copying will occur until a character needing escaping is found
        boolean legalCharacter = true;
        int i = 0;

        for (i = 0; i < string.length() && legalCharacter; i = string.offsetByCodePoints(i, 1)) {
            legalCharacter = scanner.isLegal(string.codePointAt(i), escapeLowAscii, stripCodePoint, isAttribute);
        }
        if (legalCharacter) {
            return string;
        }

        i = string.offsetByCodePoints(i, -1); // Back to the char needing escaping
        final Quote escaper = new Quote();

        if (buffer == null) {
            buffer = new StringBuilder((int) (string.length() * 1.2));
        }

        // Copy the untouched prefix without creating an intermediate substring
        if (i > 0) {
            buffer.append(string, 0, i);
        }

        // i is at the first codepoint which needs replacing
        // Don't guard against double-escaping, as:
        // don't try to be clever (LCJ).
        for (; i < string.length(); i = string.offsetByCodePoints(i, 1)) {
            int codepoint = string.codePointAt(i);
            if (escaper.isLegal(codepoint, escapeLowAscii, stripCodePoint, isAttribute)) {
                buffer.appendCodePoint(codepoint);
            } else {
                buffer.append(escaper.lastQuoted);
            }
        }
        return buffer.toString();
    }

    /**
     * Returns the Document of an XML file reader
     *
     * @throws IllegalArgumentException
     *             if the content cannot be read or parsed
     * @throws RuntimeException
     *             if the document builder cannot be created
     */
    public static Document getDocument(Reader reader) {
        try {
            return getDocumentBuilder().parse(new InputSource(reader));
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not read '" + reader + "'", e);
        } catch (SAXParseException e) {
            throw new IllegalArgumentException("Could not parse '" + reader + "', error at line "
                    + e.getLineNumber() + ", column " + e.getColumnNumber(), e);
        } catch (SAXException e) {
            throw new IllegalArgumentException("Could not parse '" + reader + "'", e);
        }
    }

    /**
     * Returns the Document of the string XML payload
     */
    public static Document getDocument(String string) {
        return getDocument(new StringReader(string));
    }

    /**
     * Creates a new XML DocumentBuilder
     *
     * @return a DocumentBuilder
     * @throws RuntimeException
     *             if we fail to create one
     */
    public static DocumentBuilder getDocumentBuilder() {
        return getDocumentBuilder("com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl", null);
    }

    /**
     * Creates a new XML DocumentBuilder which is namespace aware and XInclude
     * aware.
     *
     * @param implementation
     *            which jaxp implementation should be used
     * @param classLoader
     *            which class loader should be used when getting a new
     *            DocumentBuilder
     * @throws RuntimeException
     *             if we fail to create one
     * @return a DocumentBuilder
     */
    public static DocumentBuilder getDocumentBuilder(String implementation, ClassLoader classLoader) {
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(implementation, classLoader);
            factory.setNamespaceAware(true);
            factory.setXIncludeAware(true);
            // NOTE(review): DTD processing and external entities are not
            // disabled here, and XInclude is enabled. Confirm that builders
            // from this method are never used on untrusted input.
            return factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // Keep the cause so that the configuration problem can be diagnosed
            throw new RuntimeException("Could not create an XML builder", e);
        }
    }

    /**
     * Returns the child Element objects from a w3c dom spec
     *
     * @return List of elements. Empty list (never null) if none found or if the
     *         given element is null
     */
    public static List<Element> getChildren(Element spec) {
        List<Element> children = new ArrayList<>();
        if (spec == null) {
            return children;
        }

        NodeList childNodes = spec.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node child = childNodes.item(i);
            if (child instanceof Element) {
                children.add((Element) child);
            }
        }
        return children;
    }

    /**
     * Returns the child Element objects with given name from a w3c dom spec
     *
     * @return List of elements. Empty list (never null) if none found or the
     *         given element is null
     */
    public static List<Element> getChildren(Element spec, String name) {
        List<Element> ret = new ArrayList<>();
        if (spec == null) {
            return ret;
        }

        NodeList children = spec.getChildNodes();
        if (children == null) {
            return ret;
        }
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            // instanceof is false for null, so no separate null check is needed
            if (child instanceof Element && child.getNodeName().equals(name)) {
                ret.add((Element) child);
            }
        }
        return ret;
    }

    /**
     * Gets the string contents of the given Element. Returns "" if the element
     * is null or has no children; otherwise the node value of the first child
     * is returned exactly as the DOM implementation reports it.
     */
    public static String getValue(Element e) {
        if (e == null) {
            return "";
        }
        Node child = e.getFirstChild();
        if (child == null) {
            return "";
        }
        return child.getNodeValue();
    }

    /** Returns the first child with the given name, or null if none */
    public static Element getChild(Element e, String name) {
        // Collect the matching children once instead of scanning the node twice
        List<Element> matching = getChildren(e, name);
        return matching.isEmpty() ? null : matching.get(0);
    }

    /**
     * Returns the path to the given xml node, where each node name is separated
     * by the given separator string.
     *
     * @param n
     *            The xml node to find path to
     * @param sep
     *            The separator string
     * @return The path to the xml node as a String, or "" if the node is null
     */
    public static String getNodePath(Node n, String sep) {
        if (n == null) {
            return "";
        }
        StringBuilder ret = new StringBuilder(n.getNodeName());
        // Walk towards the root, stopping before the owning Document node
        while ((n.getParentNode() != null) && !(n.getParentNode() instanceof Document)) {
            n = n.getParentNode();
            ret.insert(0, sep).insert(0, n.getNodeName());
        }
        return ret.toString();
    }

    private static boolean inclusiveWithin(int x, int low, int high) {
        return low <= x && x <= high;
    }

    private static boolean nameStartSet(int codepoint) {
        // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] |
        // [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
        // [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
        // | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
        boolean valid;
        if (codepoint < 0xC0) {
            valid = inclusiveWithin(codepoint, 'a', 'z')
                    || inclusiveWithin(codepoint, 'A', 'Z')
                    || codepoint == '_'
                    || codepoint == ':';
        } else {
            valid = inclusiveWithin(codepoint, 0xC0, 0xD6)
                    || inclusiveWithin(codepoint, 0xD8, 0xF6)
                    || inclusiveWithin(codepoint, 0xF8, 0x2FF)
                    || inclusiveWithin(codepoint, 0x370, 0x37D)
                    || inclusiveWithin(codepoint, 0x37F, 0x1FFF)
                    || inclusiveWithin(codepoint, 0x200C, 0x200D)
                    || inclusiveWithin(codepoint, 0x2070, 0x218F)
                    || inclusiveWithin(codepoint, 0x2C00, 0x2FEF)
                    || inclusiveWithin(codepoint, 0x3001, 0xD7FF)
                    || inclusiveWithin(codepoint, 0xF900, 0xFDCF)
                    || inclusiveWithin(codepoint, 0xFDF0, 0xFFFD)
                    || inclusiveWithin(codepoint, 0x10000, 0xEFFFF);
        }
        return valid;
    }

    private static boolean nameSetExceptStart(int codepoint) {
        // "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        boolean valid;
        if (codepoint < 0xB7) {
            valid = inclusiveWithin(codepoint, '0', '9')
                    || codepoint == '-'
                    || codepoint == '.';
        } else {
            valid = codepoint == '\u00B7'
                    || inclusiveWithin(codepoint, 0x300, 0x36F)
                    || inclusiveWithin(codepoint, 0x203F, 0x2040);
        }
        return valid;
    }

    private static boolean nameChar(int codepoint, boolean first) {
        // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        boolean valid = nameStartSet(codepoint);
        return first ? valid : valid || nameSetExceptStart(codepoint);
    }

    /**
     * Check whether the name of a tag or attribute conforms to XML
     * 1.1 (Second Edition). This does not check against reserved names, it
     * only checks the set of characters used.
     *
     * @param possibleName
     *            a possibly valid XML name
     * @return true if the name may be used as an XML tag or attribute name;
     *         false for the empty string and for names containing unpaired
     *         surrogates
     */
    public static boolean isName(CharSequence possibleName) {
        final int barrier = possibleName.length();
        if (barrier < 1) {
            return false;
        }

        boolean first = true;
        int i = 0;
        while (i < barrier) {
            // codePointAt only combines well-formed surrogate pairs and never
            // reads past the end; an unpaired surrogate is returned as itself
            // and is then rejected by nameChar.
            final int codepoint = Character.codePointAt(possibleName, i);
            if (!nameChar(codepoint, first)) {
                return false;
            }
            i += Character.charCount(codepoint);
            first = false;
        }
        return true;
    }

}