Skip to content

Commit 3f5a315

Browse files
committed
TriGParser, NQuadsParser and JsonLdParser skip UTF-8 BOM in input streams
(closes #10)
1 parent f8d3214 commit 3f5a315

6 files changed

+81
-3
lines changed

src/quickRdfIo/JsonLdParser.php

+3
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ public function next(): void {
114114
}
115115

116116
/**
 * Parses a JSON-LD document passed as a string.
 *
 * A leading UTF-8 byte order mark (bytes EF BB BF), if present, is cut off
 * before the document is handed over to JsonLD::toRdf().
 *
 * @param string $input JSON-LD document to be parsed
 * @return iQuadIterator this parser object
 */
public function parse(string $input): iQuadIterator {
    $bom    = "\xEF\xBB\xBF";
    $bomLen = strlen($bom);
    // byte-wise prefix comparison - the BOM is a fixed three-byte sequence
    if (strncmp($input, $bom, $bomLen) === 0) {
        $input = substr($input, $bomLen);
    }
    $options     = ['base' => $this->baseUri];
    $this->quads = JsonLD::toRdf($input, $options);
    return $this;
}

src/quickRdfIo/NQuadsParser.php

+4-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@
4343
*/
4444
class NQuadsParser implements iParser, iQuadIterator {
4545

46+
use TmpStreamParserTrait;
47+
use StreamSkipBomTrait;
48+
4649
const MODE_TRIPLES = 1;
4750
const MODE_QUADS = 2;
4851
const MODE_TRIPLES_STAR = 3;
@@ -66,7 +69,6 @@ class NQuadsParser implements iParser, iQuadIterator {
6669
const STAR_START = '%\\G\s*<<%';
6770
const STAR_END = '%\\G\s*>>%';
6871
const READ_BUF_SIZE = 8096;
69-
use TmpStreamParserTrait;
7072

7173
/**
7274
* See https://www.w3.org/TR/n-quads/#grammar-production-ECHAR
@@ -233,6 +235,7 @@ public function rewind(): void {
233235
if ($this->input->tell() !== 0) {
234236
$this->input->rewind();
235237
}
238+
$this->skipBom($this->input);
236239
if ($this->mode === self::MODE_TRIPLES || $this->mode === self::MODE_QUADS) {
237240
$this->quads = $this->quadGenerator();
238241
} else {

src/quickRdfIo/TriGParser.php

+2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
class TriGParser implements iParser, iQuadIterator {
4444

4545
use TmpStreamParserTrait;
46+
use StreamSkipBomTrait;
4647

4748
const CHUNK_SIZE = 8192;
4849

@@ -181,6 +182,7 @@ public function rewind(): void {
181182
if ($this->input->tell() !== 0) {
182183
$this->input->rewind();
183184
}
185+
$this->skipBom($this->input);
184186
$this->next();
185187
}
186188

tests/JsonLdTest.php

+20-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
namespace quickRdfIo;
2828

29-
use quickRdf\DataFactory;
29+
use quickRdf\DataFactory as DF;
3030
use quickRdf\Dataset;
3131

3232
/**
@@ -42,7 +42,7 @@ class JsonLdTest extends \PHPUnit\Framework\TestCase {
4242
private JsonLdSerializer $serializer;
4343

4444
public function setUp(): void {
45-
$this->df = new DataFactory();
45+
$this->df = new DF();
4646
$this->refParser = new NQuadsParser($this->df, false, NQuadsParser::MODE_QUADS);
4747
$this->parser = new JsonLdParser($this->df);
4848
$this->serializer = new JsonLdSerializer(null);
@@ -71,4 +71,22 @@ public function testBig(): void {
7171
$this->assertEquals($ref->count(), $dataset->count());
7272
$this->assertTrue($ref->equals($dataset));
7373
}
74+
75+
/**
 * https://github.com/sweetrdf/quickRdfIo/issues/10
 *
 * Round trip: a single quad is serialized into a temporary stream which
 * starts with a UTF-8 byte order mark and is then parsed back.
 */
public function testBom(): void {
    $expected = DF::quad(
        DF::namedNode('http://foo'),
        DF::namedNode('http://bar'),
        DF::namedNode('http://baz')
    );
    $source = new Dataset();
    $source->add($expected);

    // the BOM goes first, the serialized data follows right after it
    $stream = tmpfile();
    fwrite($stream, "\xEF\xBB\xBF");
    $this->serializer->serializeStream($stream, $source);
    rewind($stream);

    $parsed = new Dataset();
    $parsed->add($this->parser->parseStream($stream));
    $this->assertCount(1, $parsed);
    $this->assertTrue($expected->equals($parsed[0]));
}
7492
}

tests/NQuadsParserTest.php

+26
Original file line numberDiff line numberDiff line change
@@ -264,4 +264,30 @@ public function testIssue7(): void {
264264
$this->assertCount(2, $dataset);
265265
}
266266
}
267+
268+
/**
 * https://github.com/sweetrdf/quickRdfIo/issues/10
 *
 * Inputs starting with a non-UTF-8 byte order mark are expected to be
 * rejected with an RdfIoException naming the encoding, while an input
 * starting with the UTF-8 BOM is expected to parse into a single quad.
 */
public function testBom(): void {
    $df     = new DF();
    $parser = new NQuadsParser($df);
    $inputs = [
        'issue10_utf16be.nq' => "UTF-16 BE",
        'issue10_utf32le.nq' => "UTF-32 LE",
        'issue10_utf7.nq'    => "UTF-7",
    ];
    foreach ($inputs as $file => $enc) {
        try {
            // The BOM check is performed in rewind(), so the iterator has to be
            // consumed - calling parseStream() alone may not trigger it.
            iterator_to_array($parser->parseStream(fopen(__DIR__ . '/files/' . $file, 'r')), false);
            // Without this the test would pass silently when no exception is thrown.
            $this->fail("No exception thrown for the $enc input $file");
        } catch (RdfIoException $ex) {
            $this->assertEquals("Input stream has wrong encoding $enc", $ex->getMessage());
        }
    }

    $dataset = new \quickRdf\Dataset();
    $dataset->add($parser->parseStream(fopen(__DIR__ . '/files/issue10_utf8.nq', 'r')));
    $this->assertCount(1, $dataset);
    $q = $df->quad(DF::namedNode('http://foo'), DF::namedNode('http://bar'), DF::namedNode('http://baz'));
    $this->assertTrue($q->equals($dataset[0]));
}
267293
}

tests/TriGParserTest.php

+26
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,30 @@ public function testUtfChunk(): void {
110110
$triples = iterator_to_array($iter);
111111
$this->assertCount(148, $triples);
112112
}
113+
114+
/**
 * https://github.com/sweetrdf/quickRdfIo/issues/10
 *
 * Inputs starting with a non-UTF-8 byte order mark are expected to be
 * rejected with an RdfIoException naming the encoding, while an input
 * starting with the UTF-8 BOM is expected to parse into a single quad.
 */
public function testBom(): void {
    $df     = new DF();
    $parser = new TriGParser($df);
    $inputs = [
        'issue10_utf16be.nq' => "UTF-16 BE",
        'issue10_utf32le.nq' => "UTF-32 LE",
        'issue10_utf7.nq'    => "UTF-7",
    ];
    foreach ($inputs as $file => $enc) {
        try {
            // The BOM check is performed in rewind(), so the iterator has to be
            // consumed - calling parseStream() alone may not trigger it.
            iterator_to_array($parser->parseStream(fopen(__DIR__ . '/files/' . $file, 'r')), false);
            // Without this the test would pass silently when no exception is thrown.
            $this->fail("No exception thrown for the $enc input $file");
        } catch (RdfIoException $ex) {
            $this->assertEquals("Input stream has wrong encoding $enc", $ex->getMessage());
        }
    }

    $dataset = new \quickRdf\Dataset();
    $dataset->add($parser->parseStream(fopen(__DIR__ . '/files/issue10_utf8.nq', 'r')));
    $this->assertCount(1, $dataset);
    $q = $df->quad(DF::namedNode('http://foo'), DF::namedNode('http://bar'), DF::namedNode('http://baz'));
    $this->assertTrue($q->equals($dataset[0]));
}
113139
}

0 commit comments

Comments
 (0)