Skip to content

Commit b61eb96

Browse files
authored
json: refine constraint for whitespace to avoid runaways yet allow pretty print (ggml-org#7866)
1 parent 396b18d commit b61eb96

6 files changed

+44
-45
lines changed

common/json-schema-to-grammar.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
4040
return result;
4141
}
4242

43-
const std::string SPACE_RULE = "\" \"?";
43+
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
4444

4545
struct BuiltinRule {
4646
std::string content;

examples/json_schema_to_grammar.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,8 @@ def __init__(self, content: str, deps: list = None):
2929
self.content = content
3030
self.deps = deps or []
3131

32-
# whitespace is constrained to a single space char to prevent model "running away" in
33-
# whitespace. Also maybe improves generation quality?
34-
SPACE_RULE = '" "?'
32+
# Constrain whitespace so the model cannot "run away" emitting it: allow nothing, a single space, or a newline followed by up to 20 spaces/tabs.
33+
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
3534

3635
PRIMITIVE_RULES = {
3736
'boolean' : BuiltinRule('("true" | "false") space', []),

examples/server/public/json-schema-to-grammar.mjs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// WARNING: This file was ported from json_schema_to_grammar.py; please fix bugs / add features there first.
2-
const SPACE_RULE = '" "?';
2+
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
33

44
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
55
if (minItems === 0 && maxItems === 1) {

grammars/json.gbnf

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ string ::=
2222
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
2323

2424
# Optional whitespace (nothing, a single space, or a newline followed by up to 20 spaces/tabs): by convention, applied in this grammar after literal chars when allowed
25-
ws ::= [ \t\n]{0,20}
25+
ws ::= | " " | "\n" [ \t]{0,20}

grammars/json_arr.gbnf

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ string ::=
3131
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
3232

3333
# Optional whitespace (nothing, a single space, or a newline followed by up to 20 spaces/tabs): by convention, applied in this grammar after literal chars when allowed
34-
ws ::= [ \t\n]{0,20}
34+
ws ::= | " " | "\n" [ \t]{0,20}

tests/test-json-schema-to-grammar.cpp

+38-38
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
112112
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
113113
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
114114
root ::= object
115-
space ::= " "?
115+
space ::= | " " | "\n" [ \t]{0,20}
116116
string ::= "\"" char* "\"" space
117117
value ::= object | array | string | number | boolean | null
118118
)"""
@@ -135,7 +135,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
135135
date-time ::= date "T" time
136136
date-time-string ::= "\"" date-time "\"" space
137137
root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
138-
space ::= " "?
138+
space ::= | " " | "\n" [ \t]{0,20}
139139
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
140140
time-string ::= "\"" time "\"" space
141141
tuple-0 ::= date-string
@@ -154,7 +154,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
154154
R"""(
155155
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
156156
root ::= "\"" char* "\"" space
157-
space ::= " "?
157+
space ::= | " " | "\n" [ \t]{0,20}
158158
)"""
159159
});
160160

@@ -168,7 +168,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
168168
R"""(
169169
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
170170
root ::= "\"" char+ "\"" space
171-
space ::= " "?
171+
space ::= | " " | "\n" [ \t]{0,20}
172172
)"""
173173
});
174174

@@ -182,7 +182,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
182182
R"""(
183183
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
184184
root ::= "\"" char{3,} "\"" space
185-
space ::= " "?
185+
space ::= | " " | "\n" [ \t]{0,20}
186186
)"""
187187
});
188188

@@ -196,7 +196,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
196196
R"""(
197197
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
198198
root ::= "\"" char{0,3} "\"" space
199-
space ::= " "?
199+
space ::= | " " | "\n" [ \t]{0,20}
200200
)"""
201201
});
202202

@@ -211,7 +211,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
211211
R"""(
212212
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
213213
root ::= "\"" char{1,4} "\"" space
214-
space ::= " "?
214+
space ::= | " " | "\n" [ \t]{0,20}
215215
)"""
216216
});
217217

@@ -223,7 +223,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
223223
})""",
224224
R"""(
225225
root ::= ("true" | "false") space
226-
space ::= " "?
226+
space ::= | " " | "\n" [ \t]{0,20}
227227
)"""
228228
});
229229

@@ -236,7 +236,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
236236
R"""(
237237
integral-part ::= [0] | [1-9] [0-9]{0,15}
238238
root ::= ("-"? integral-part) space
239-
space ::= " "?
239+
space ::= | " " | "\n" [ \t]{0,20}
240240
)"""
241241
});
242242

@@ -248,7 +248,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
248248
})""",
249249
R"""(
250250
root ::= "\"foo\""
251-
space ::= " "?
251+
space ::= | " " | "\n" [ \t]{0,20}
252252
)"""
253253
});
254254

@@ -260,7 +260,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
260260
})""",
261261
R"""(
262262
root ::= "123"
263-
space ::= " "?
263+
space ::= | " " | "\n" [ \t]{0,20}
264264
)"""
265265
});
266266

@@ -272,7 +272,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
272272
})""",
273273
R"""(
274274
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
275-
space ::= " "?
275+
space ::= | " " | "\n" [ \t]{0,20}
276276
)"""
277277
});
278278

@@ -285,7 +285,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
285285
R"""(
286286
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
287287
root ::= "[" space string "]" space
288-
space ::= " "?
288+
space ::= | " " | "\n" [ \t]{0,20}
289289
string ::= "\"" char* "\"" space
290290
)"""
291291
});
@@ -302,7 +302,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
302302
integral-part ::= [0] | [1-9] [0-9]{0,15}
303303
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
304304
root ::= "[" space string "," space number "]" space
305-
space ::= " "?
305+
space ::= | " " | "\n" [ \t]{0,20}
306306
string ::= "\"" char* "\"" space
307307
)"""
308308
});
@@ -317,7 +317,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
317317
decimal-part ::= [0-9]{1,16}
318318
integral-part ::= [0] | [1-9] [0-9]{0,15}
319319
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
320-
space ::= " "?
320+
space ::= | " " | "\n" [ \t]{0,20}
321321
)"""
322322
});
323323

@@ -333,7 +333,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
333333
R"""(
334334
boolean ::= ("true" | "false") space
335335
root ::= "[" space boolean ("," space boolean)+ "]" space
336-
space ::= " "?
336+
space ::= | " " | "\n" [ \t]{0,20}
337337
)"""
338338
});
339339

@@ -349,7 +349,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
349349
R"""(
350350
boolean ::= ("true" | "false") space
351351
root ::= "[" space boolean? "]" space
352-
space ::= " "?
352+
space ::= | " " | "\n" [ \t]{0,20}
353353
)"""
354354
});
355355

@@ -365,7 +365,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
365365
R"""(
366366
boolean ::= ("true" | "false") space
367367
root ::= "[" space (boolean ("," space boolean)?)? "]" space
368-
space ::= " "?
368+
space ::= | " " | "\n" [ \t]{0,20}
369369
)"""
370370
});
371371

@@ -386,7 +386,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
386386
item ::= number | integer
387387
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
388388
root ::= "[" space item ("," space item){2,4} "]" space
389-
space ::= " "?
389+
space ::= | " " | "\n" [ \t]{0,20}
390390
)"""
391391
});
392392

@@ -399,7 +399,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
399399
})""",
400400
R"""(
401401
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
402-
space ::= " "?
402+
space ::= | " " | "\n" [ \t]{0,20}
403403
)"""
404404
});
405405

@@ -412,7 +412,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
412412
})""",
413413
R"""(
414414
root ::= "\"" "[]{}()|+*?" "\"" space
415-
space ::= " "?
415+
space ::= | " " | "\n" [ \t]{0,20}
416416
)"""
417417
});
418418

@@ -425,7 +425,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
425425
})""",
426426
R"""(
427427
root ::= "\"" "\"" "\"" space
428-
space ::= " "?
428+
space ::= | " " | "\n" [ \t]{0,20}
429429
)"""
430430
});
431431

@@ -440,7 +440,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
440440
dot ::= [^\x0A\x0D]
441441
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
442442
root-1 ::= [0-9]
443-
space ::= " "?
443+
space ::= | " " | "\n" [ \t]{0,20}
444444
)"""
445445
});
446446

@@ -468,7 +468,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
468468
c-kv ::= "\"c\"" space ":" space string
469469
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
470470
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
471-
space ::= " "?
471+
space ::= | " " | "\n" [ \t]{0,20}
472472
string ::= "\"" char* "\"" space
473473
)"""
474474
});
@@ -488,7 +488,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
488488
a-kv ::= "\"a\"" space ":" space string
489489
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
490490
root ::= "{" space (a-kv )? "}" space
491-
space ::= " "?
491+
space ::= | " " | "\n" [ \t]{0,20}
492492
string ::= "\"" char* "\"" space
493493
)"""
494494
});
@@ -512,7 +512,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
512512
c-kv ::= "\"c\"" space ":" space string
513513
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
514514
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
515-
space ::= " "?
515+
space ::= | " " | "\n" [ \t]{0,20}
516516
string ::= "\"" char* "\"" space
517517
)"""
518518
});
@@ -538,7 +538,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
538538
d-kv ::= "\"d\"" space ":" space string
539539
d-rest ::= ( "," space c-kv )?
540540
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
541-
space ::= " "?
541+
space ::= | " " | "\n" [ \t]{0,20}
542542
string ::= "\"" char* "\"" space
543543
)"""
544544
});
@@ -559,7 +559,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
559559
integral-part ::= [0] | [1-9] [0-9]{0,15}
560560
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
561561
root ::= "{" space (additional-kvs )? "}" space
562-
space ::= " "?
562+
space ::= | " " | "\n" [ \t]{0,20}
563563
string ::= "\"" char* "\"" space
564564
)"""
565565
});
@@ -581,7 +581,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
581581
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
582582
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
583583
root ::= object
584-
space ::= " "?
584+
space ::= | " " | "\n" [ \t]{0,20}
585585
string ::= "\"" char* "\"" space
586586
value ::= object | array | string | number | boolean | null
587587
)"""
@@ -603,7 +603,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
603603
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
604604
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
605605
root ::= object
606-
space ::= " "?
606+
space ::= | " " | "\n" [ \t]{0,20}
607607
string ::= "\"" char* "\"" space
608608
value ::= object | array | string | number | boolean | null
609609
)"""
@@ -618,7 +618,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
618618
})""",
619619
R"""(
620620
root ::= "{" space "}" space
621-
space ::= " "?
621+
space ::= | " " | "\n" [ \t]{0,20}
622622
)"""
623623
});
624624

@@ -642,7 +642,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
642642
integral-part ::= [0] | [1-9] [0-9]{0,15}
643643
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
644644
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
645-
space ::= " "?
645+
space ::= | " " | "\n" [ \t]{0,20}
646646
string ::= "\"" char* "\"" space
647647
)"""
648648
});
@@ -667,7 +667,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
667667
integral-part ::= [0] | [1-9] [0-9]{0,15}
668668
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
669669
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
670-
space ::= " "?
670+
space ::= | " " | "\n" [ \t]{0,20}
671671
string ::= "\"" char* "\"" space
672672
)"""
673673
});
@@ -695,7 +695,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
695695
integral-part ::= [0] | [1-9] [0-9]{0,15}
696696
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
697697
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
698-
space ::= " "?
698+
space ::= | " " | "\n" [ \t]{0,20}
699699
string ::= "\"" char* "\"" space
700700
)"""
701701
});
@@ -725,7 +725,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
725725
foo ::= "{" space foo-a-kv "}" space
726726
foo-a-kv ::= "\"a\"" space ":" space string
727727
root ::= foo
728-
space ::= " "?
728+
space ::= | " " | "\n" [ \t]{0,20}
729729
string ::= "\"" char* "\"" space
730730
)"""
731731
});
@@ -759,7 +759,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
759759
integral-part ::= [0] | [1-9] [0-9]{0,15}
760760
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
761761
root ::= alternative-0 | alternative-1
762-
space ::= " "?
762+
space ::= | " " | "\n" [ \t]{0,20}
763763
)"""
764764
});
765765

@@ -803,7 +803,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
803803
integral-part ::= [0] | [1-9] [0-9]{0,15}
804804
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
805805
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
806-
space ::= " "?
806+
space ::= | " " | "\n" [ \t]{0,20}
807807
)"""
808808
});
809809

@@ -851,7 +851,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
851851
number-number-kv ::= "\"number\"" space ":" space number-number
852852
number-number-root-kv ::= "\"root\"" space ":" space number
853853
root ::= "{" space number-kv "}" space
854-
space ::= " "?
854+
space ::= | " " | "\n" [ \t]{0,20}
855855
)"""
856856
});
857857
}

0 commit comments

Comments
 (0)