Skip to content

Commit 61559c4

Browse files
committed
[bugfix] Improve handling of XML BOM
Closes eXist-db/exist#5610
1 parent f67e1ab commit 61559c4

File tree

6 files changed

+316
-2
lines changed

6 files changed

+316
-2
lines changed

exist-core/pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,10 +723,13 @@
723723
<header>${project.parent.relativePath}/../elemental-parent/elemental-LGPL-21-ONLY-license.template.txt</header>
724724
<includes>
725725
<include>project-suppression.xml</include>
726+
<include>src/main/java/org/exist/util/ByteOrderMark.java</include>
726727
<include>src/test/java/org/exist/xquery/ImportFromPkgTest.java</include>
727728
<include>src/test/java/org/exist/xquery/WatchdogTest.java</include>
728729
<include>src/test/java/org/exist/xquery/value/DateTimeTypesTest.java</include>
729730
<include>src/test/java/org/exist/xquery/functions/fn/FunXmlToJsonTest.java</include>
731+
<include>src/test/java/org/exist/xquery/functions/fn/ParsingFunctionsTest.java</include>
732+
<include>src/test/java/org/exist/xquery/functions/xmldb/XMLDBStoreTest.java</include>
730733
<include>src/test/resources-filtered/org/exist/xquery/import-from-pkg-test.conf.xml</include>
731734
</includes>
732735
</licenseSet>
@@ -814,8 +817,10 @@
814817
<include>src/main/java/org/exist/xmlrpc/ExistRpcTypeFactory.java</include>
815818
<include>src/main/java/org/exist/xquery/XQueryContext.java</include>
816819
<include>src/main/java/org/exist/xquery/functions/fn/FunUriCollection.java</include>
820+
<include>src/main/java/org/exist/xquery/functions/fn/ParsingFunctions.java</include>
817821
<include>src/main/java/org/exist/xquery/functions/system/GetUptime.java</include>
818822
<include>src/main/java/org/exist/xquery/functions/system/Shutdown.java</include>
823+
<include>src/main/java/org/exist/xquery/functions/xmldb/XMLDBStore.java</include>
819824
<include>src/main/java/org/exist/xquery/value/AbstractDateTimeValue.java</include>
820825
<include>src/main/java/org/exist/xquery/value/Type.java</include>
821826
<include>src/main/java/org/exist/xslt/EXistURIResolver.java</include>
@@ -934,6 +939,7 @@
934939
<exclude>src/main/java/org/exist/test/runner/XMLTestRunner.java</exclude>
935940
<exclude>src/main/java/org/exist/test/runner/XQueryTestRunner.java</exclude>
936941
<exclude>src/main/java/org/exist/test/runner/XSuite.java</exclude>
942+
<exclude>src/main/java/org/exist/util/ByteOrderMark.java</exclude>
937943
<exclude>src/main/java/org/exist/util/Collations.java</exclude>
938944
<exclude>src/main/java/org/exist/util/crypto/digest/DigestType.java</exclude>
939945
<exclude>src/main/java/org/exist/webstart/JnlpJarFiles.java</exclude>
@@ -946,8 +952,12 @@
946952
<exclude>src/main/java/org/exist/xquery/functions/fn/FunXmlToJson.java</exclude>
947953
<exclude>src/test/java/org/exist/xquery/functions/fn/FunXmlToJsonTest.java</exclude>
948954
<exclude>src/main/java/org/exist/xquery/functions/fn/LoadXQueryModule.java</exclude>
955+
<exclude>src/main/java/org/exist/xquery/functions/fn/ParsingFunctions.java</exclude>
956+
<exclude>src/test/java/org/exist/xquery/functions/fn/ParsingFunctionsTest.java</exclude>
949957
<exclude>src/main/java/org/exist/xquery/functions/system/GetUptime.java</exclude>
950958
<exclude>src/main/java/org/exist/xquery/functions/system/Shutdown.java</exclude>
959+
<exclude>src/main/java/org/exist/xquery/functions/xmldb/XMLDBStore.java</exclude>
960+
<exclude>src/test/java/org/exist/xquery/functions/xmldb/XMLDBStoreTest.java</exclude>
951961
<exclude>src/main/java/org/exist/xquery/value/AbstractDateTimeValue.java</exclude>
952962
<exclude>src/test/java/org/exist/xquery/value/DateTimeTypesTest.java</exclude>
953963
<exclude>src/main/java/org/exist/xquery/value/Type.java</exclude>
exist-core/src/main/java/org/exist/util/ByteOrderMark.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Elemental
3+
* Copyright (C) 2024, Evolved Binary Ltd
4+
*
5+
6+
* https://www.evolvedbinary.com | https://www.elemental.xyz
7+
*
8+
* This library is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License as published by the Free Software Foundation; version 2.1.
11+
*
12+
* This library is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* Lesser General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU Lesser General Public
18+
* License along with this library; if not, write to the Free Software
19+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20+
*/
21+
package org.exist.util;
22+
23+
import java.nio.charset.StandardCharsets;
24+
25+
/**
 * Byte Order Mark utilities.
 *
 * <p>A Byte Order Mark (BOM) is the Unicode character U+FEFF placed at the very
 * start of a text stream. Its byte representation differs per encoding, but once
 * those bytes have been decoded into a Java {@link String} with the matching
 * charset, the BOM is always the single character {@code '\uFEFF'}.</p>
 *
 * @author <a href="mailto:[email protected]">Adam Retter</a>
 */
public final class ByteOrderMark {

    /**
     * The BOM as it appears in a decoded Java String, regardless of the
     * encoding that the original bytes were in.
     */
    private static final char BOM_CHAR = '\uFEFF';

    /**
     * The UTF-8 encoded form of the BOM.
     * NOTE: Java arrays are mutable; callers must not modify the content of this array.
     */
    public static final byte[] UTF8_BOM_BYTES = new byte[] {(byte)0xEF, (byte)0xBB, (byte)0xBF};

    /** The UTF-8 BOM bytes decoded as UTF-8; this is the one-character string U+FEFF. */
    public static final String UTF8_BOM = new String(UTF8_BOM_BYTES, StandardCharsets.UTF_8);

    /**
     * The UTF-16 Big Endian encoded form of the BOM.
     * NOTE: Java arrays are mutable; callers must not modify the content of this array.
     */
    public static final byte[] UTF16_BE_BOM_BYTES = new byte[] {(byte)0xFE, (byte)0xFF};

    /** The UTF-16BE BOM bytes decoded as UTF-16BE; this is the one-character string U+FEFF. */
    public static final String UTF16_BE_BOM = new String(UTF16_BE_BOM_BYTES, StandardCharsets.UTF_16BE);

    /**
     * The UTF-16 Little Endian encoded form of the BOM.
     * NOTE: Java arrays are mutable; callers must not modify the content of this array.
     */
    public static final byte[] UTF16_LE_BOM_BYTES = new byte[] {(byte)0xFF, (byte)0xFE};

    /** The UTF-16LE BOM bytes decoded as UTF-16LE; this is the one-character string U+FEFF. */
    public static final String UTF16_LE_BOM = new String(UTF16_LE_BOM_BYTES, StandardCharsets.UTF_16LE);

    /** Static utility class; not meant to be instantiated. */
    private ByteOrderMark() {
    }

    /**
     * Strip BOM from the start of an XML string.
     *
     * <p>At most one leading BOM character is removed. A {@code null} or empty
     * argument is returned unchanged.</p>
     *
     * @param xml the XML as a string, may be {@code null}
     *
     * @return the XML without a BOM.
     */
    public static String stripXmlBom(final String xml) {
        if (xml == null || xml.isEmpty()) {
            return xml;
        }

        // UTF8_BOM, UTF16_BE_BOM and UTF16_LE_BOM are all equal to the single
        // character U+FEFF, so one comparison covers all three encodings
        if (xml.charAt(0) == BOM_CHAR) {
            return xml.substring(1);
        }
        return xml;
    }
}

exist-core/src/main/java/org/exist/xquery/functions/fn/ParsingFunctions.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,28 @@
11
/*
2+
* Elemental
3+
* Copyright (C) 2024, Evolved Binary Ltd
4+
*
5+
6+
* https://www.evolvedbinary.com | https://www.elemental.xyz
7+
*
8+
* This library is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License as published by the Free Software Foundation; version 2.1.
11+
*
12+
* This library is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* Lesser General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU Lesser General Public
18+
* License along with this library; if not, write to the Free Software
19+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20+
*
21+
* NOTE: Parts of this file contain code from 'The eXist-db Authors'.
22+
* The original license header is included below.
23+
*
24+
* =====================================================================
25+
*
226
* eXist-db Open Source Native XML Database
327
* Copyright (C) 2001 The eXist-db Authors
428
*
@@ -40,6 +64,9 @@
4064

4165
import java.io.IOException;
4266
import java.io.StringReader;
67+
import java.nio.charset.StandardCharsets;
68+
69+
import static org.exist.util.ByteOrderMark.stripXmlBom;
4370

4471
public class ParsingFunctions extends BasicFunction {
4572

@@ -103,7 +130,8 @@ private Sequence parse(final String xmlContent, final Sequence[] args) throws XP
103130
}
104131
}
105132

106-
private ValidationReport validate(final String xmlContent, final SAXAdapter saxAdapter) throws XPathException {
133+
private ValidationReport validate(String xmlContent, final SAXAdapter saxAdapter) throws XPathException {
134+
xmlContent = stripXmlBom(xmlContent);
107135
final String xml;
108136
if (isCalledAs("parse-xml-fragment")) {
109137
xml = "<" + FRAGMENT_WRAPPER_NAME + ">" + xmlContent + "</" + FRAGMENT_WRAPPER_NAME + ">";

exist-core/src/main/java/org/exist/xquery/functions/xmldb/XMLDBStore.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,28 @@
11
/*
2+
* Elemental
3+
* Copyright (C) 2024, Evolved Binary Ltd
4+
*
5+
6+
* https://www.evolvedbinary.com | https://www.elemental.xyz
7+
*
8+
* This library is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License as published by the Free Software Foundation; version 2.1.
11+
*
12+
* This library is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* Lesser General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU Lesser General Public
18+
* License along with this library; if not, write to the Free Software
19+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20+
*
21+
* NOTE: Parts of this file contain code from 'The eXist-db Authors'.
22+
* The original license header is included below.
23+
*
24+
* =====================================================================
25+
*
226
* eXist-db Open Source Native XML Database
327
* Copyright (C) 2001 The eXist-db Authors
428
*
@@ -64,12 +88,14 @@
6488
import org.xmldb.api.modules.BinaryResource;
6589
import org.xmldb.api.modules.XMLResource;
6690

91+
import static org.exist.util.ByteOrderMark.stripXmlBom;
6792
import static org.exist.xquery.FunctionDSL.*;
6893
import static org.exist.xquery.XPathException.execAndAddErrorIfMissing;
6994
import static org.exist.xquery.functions.xmldb.XMLDBModule.functionSignature;
7095
import static org.exist.xquery.functions.xmldb.XMLDBModule.functionSignatures;
7196

7297
/**
98+
* @author <a href="mailto:[email protected]">Adam Retter</a>
7399
* @author wolf
74100
*/
75101
public class XMLDBStore extends XMLDBAbstractCollectionManipulator {
@@ -184,7 +210,7 @@ public Sequence evalWithCollection(Collection collection, Sequence[] args, Seque
184210
} else {
185211
try (Resource resource = getResource(mimeType, collection, docName)) {
186212
if (Type.subTypeOf(item.getType(), Type.STRING)) {
187-
resource.setContent(item.getStringValue());
213+
resource.setContent(stripXmlBom(item.getStringValue()));
188214
} else if (item.getType() == Type.BASE64_BINARY) {
189215
resource.setContent(((BinaryValue) item).toJavaObject());
190216
} else if (Type.subTypeOf(item.getType(), Type.NODE)) {
exist-core/src/test/java/org/exist/xquery/functions/fn/ParsingFunctionsTest.java

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* Elemental
3+
* Copyright (C) 2024, Evolved Binary Ltd
4+
*
5+
6+
* https://www.evolvedbinary.com | https://www.elemental.xyz
7+
*
8+
* This library is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License as published by the Free Software Foundation; version 2.1.
11+
*
12+
* This library is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* Lesser General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU Lesser General Public
18+
* License along with this library; if not, write to the Free Software
19+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20+
*/
21+
package org.exist.xquery.functions.fn;
22+
23+
import org.exist.test.ExistXmldbEmbeddedServer;
24+
import org.junit.ClassRule;
25+
import org.junit.Test;
26+
import org.xmldb.api.base.XMLDBException;
27+
28+
import java.nio.charset.StandardCharsets;
29+
30+
import static org.exist.util.ByteOrderMark.*;
31+
import static org.junit.Assert.assertEquals;
32+
33+
/**
34+
* @author <a href="mailto:[email protected]">Adam Retter</a>
35+
*/
36+
public class ParsingFunctionsTest {
37+
38+
private static final String UTF8_DECL = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
39+
40+
@ClassRule
41+
public static final ExistXmldbEmbeddedServer existEmbeddedServer = new ExistXmldbEmbeddedServer(false, true, true);
42+
43+
@Test
44+
public void parseWithoutBomWithoutDecl() throws XMLDBException {
45+
final String query = "parse-xml('<elem1/>')";
46+
final String result = existEmbeddedServer.executeOneValue(query);
47+
assertEquals("<elem1/>", result);
48+
}
49+
50+
@Test
51+
public void parseWithoutBomWithDecl() throws XMLDBException {
52+
final String query = "parse-xml('" + UTF8_DECL + "<elem2/>')";
53+
final String result = existEmbeddedServer.executeOneValue(query);
54+
assertEquals("<elem2/>", result);
55+
}
56+
57+
@Test
58+
public void parseWithUtf8BomWithoutDecl() throws XMLDBException {
59+
final String query = "parse-xml('" + UTF8_BOM + "<elem3/>')";
60+
final String result = existEmbeddedServer.executeOneValue(query);
61+
assertEquals("<elem3/>", result);
62+
}
63+
64+
@Test
65+
public void parseWithUtf8BomWithDecl() throws XMLDBException {
66+
final String query = "parse-xml('" + UTF8_BOM + UTF8_DECL + "<elem4/>')";
67+
final String result = existEmbeddedServer.executeOneValue(query);
68+
assertEquals("<elem4/>", result);
69+
}
70+
71+
@Test
72+
public void parseWithUtf16BEBomWithoutDecl() throws XMLDBException {
73+
final String query = "parse-xml('" + UTF16_BE_BOM + new String(new byte[]{0x00, 0x3c, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x6d, 0x00, 0x35, 0x00, 0x2f, 0x00, 0x3e}, StandardCharsets.UTF_16BE) + "')";
74+
final String result = existEmbeddedServer.executeOneValue(query);
75+
assertEquals("<elem5/>", result);
76+
}
77+
78+
@Test
79+
public void parseWithUtf16LEBomWithoutDecl() throws XMLDBException {
80+
final String query = "parse-xml('" + UTF16_LE_BOM + new String(new byte[]{0x3c, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x6d, 0x00, 0x36, 0x00, 0x2f, 0x00, 0x3e, 0x00}, StandardCharsets.UTF_16LE) + "')";
81+
final String result = existEmbeddedServer.executeOneValue(query);
82+
assertEquals("<elem6/>", result);
83+
}
84+
85+
@Test
86+
public void parseFragmentWithoutBomWithoutDecl() throws XMLDBException {
87+
final String query = "parse-xml-fragment('<elem1/>')";
88+
final String result = existEmbeddedServer.executeOneValue(query);
89+
assertEquals("<elem1/>", result);
90+
}
91+
92+
@Test
93+
public void parseFragmentWithUtf8BomWithoutDecl() throws XMLDBException {
94+
final String query = "parse-xml-fragment('" + UTF8_BOM + "<elem3/>')";
95+
final String result = existEmbeddedServer.executeOneValue(query);
96+
assertEquals("<elem3/>", result);
97+
}
98+
99+
@Test
100+
public void parseFragmentWithUtf16BEBomWithoutDecl() throws XMLDBException {
101+
final String query = "parse-xml-fragment('" + UTF16_BE_BOM + new String(new byte[]{0x00, 0x3c, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x6d, 0x00, 0x35, 0x00, 0x2f, 0x00, 0x3e}, StandardCharsets.UTF_16BE) + "')";
102+
final String result = existEmbeddedServer.executeOneValue(query);
103+
assertEquals("<elem5/>", result);
104+
}
105+
106+
@Test
107+
public void parseFragmentWithUtf16LEBomWithoutDecl() throws XMLDBException {
108+
final String query = "parse-xml-fragment('" + UTF16_LE_BOM + new String(new byte[]{0x3c, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x6d, 0x00, 0x36, 0x00, 0x2f, 0x00, 0x3e, 0x00}, StandardCharsets.UTF_16LE) + "')";
109+
final String result = existEmbeddedServer.executeOneValue(query);
110+
assertEquals("<elem6/>", result);
111+
}
112+
}

0 commit comments

Comments
 (0)