Skip to content

Commit de93795

Browse files
feat: convert scanner to use array.h header
* Fix creation of uninitialized custom_tag_name in deserialize
  Co-authored-by: Marshall <[email protected]>
* Use array header for strings and arrays
  Co-authored-by: Marshall <[email protected]>
* Use memcmp instead of strncmp because the length is handled explicitly
  Co-authored-by: Marshall <[email protected]>
---------
Co-authored-by: Marshall <[email protected]>
1 parent b285e25 commit de93795

File tree

4 files changed

+447
-185
lines changed

4 files changed

+447
-185
lines changed

src/scanner.c

Lines changed: 63 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#include "tag.h"
2-
31
#include <wctype.h>
2+
#include "tree_sitter/array.h"
3+
#include "tag.h"
44

55
enum TokenType {
66
START_TAG_NAME,
@@ -18,115 +18,27 @@ enum TokenType {
1818
};
1919

2020
typedef struct {
21-
uint32_t len;
22-
uint32_t cap;
23-
Tag *data;
24-
} tags_vec;
25-
26-
typedef struct {
27-
tags_vec tags;
21+
Array(Tag) tags;
2822
} Scanner;
2923

3024
#define MAX(a, b) ((a) > (b) ? (a) : (b))
3125

32-
#define VEC_RESIZE(vec, _cap) \
33-
if ((_cap) > (vec).cap && (_cap) > 0) { \
34-
void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \
35-
assert(tmp != NULL); \
36-
(vec).data = tmp; \
37-
(vec).cap = (_cap); \
38-
}
39-
40-
#define VEC_GROW(vec, _cap) \
41-
if ((vec).cap < (_cap)) { \
42-
VEC_RESIZE((vec), (_cap)); \
43-
}
44-
45-
#define VEC_PUSH(vec, el) \
46-
if ((vec).cap == (vec).len) { \
47-
VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \
48-
} \
49-
(vec).data[(vec).len++] = (el);
50-
51-
#define VEC_POP(vec) \
52-
{ \
53-
if (VEC_BACK(vec).type == CUSTOM) { \
54-
tag_free(&VEC_BACK(vec)); \
55-
} \
56-
(vec).len--; \
57-
}
58-
59-
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])
60-
61-
#define VEC_FREE(vec) \
62-
{ \
63-
if ((vec).data != NULL) \
64-
free((vec).data); \
65-
(vec).data = NULL; \
66-
}
67-
68-
#define VEC_CLEAR(vec) \
69-
{ \
70-
for (int i = 0; i < (vec).len; i++) { \
71-
tag_free(&(vec).data[i]); \
72-
} \
73-
(vec).len = 0; \
74-
}
75-
76-
#define STRING_RESIZE(vec, _cap) \
77-
void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \
78-
assert(tmp != NULL); \
79-
(vec).data = tmp; \
80-
memset((vec).data + (vec).len, 0, (((_cap) + 1) - (vec).len) * sizeof((vec).data[0])); \
81-
(vec).cap = (_cap);
82-
83-
#define STRING_GROW(vec, _cap) \
84-
if ((vec).cap < (_cap)) { \
85-
STRING_RESIZE((vec), (_cap)); \
86-
}
87-
88-
#define STRING_PUSH(vec, el) \
89-
if ((vec).cap == (vec).len) { \
90-
STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \
91-
} \
92-
(vec).data[(vec).len++] = (el);
93-
94-
#define STRING_INIT(vec) \
95-
{ \
96-
(vec).data = calloc(1, sizeof(char) * 17); \
97-
(vec).len = 0; \
98-
(vec).cap = 16; \
99-
}
100-
101-
#define STRING_FREE(vec) \
102-
{ \
103-
if ((vec).data != NULL) \
104-
free((vec).data); \
105-
(vec).data = NULL; \
106-
}
107-
108-
#define STRING_CLEAR(vec) \
109-
{ \
110-
(vec).len = 0; \
111-
memset((vec).data, 0, (vec).cap * sizeof(char)); \
112-
}
113-
11426
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
11527

11628
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
11729

11830
static unsigned serialize(Scanner *scanner, char *buffer) {
119-
uint16_t tag_count = scanner->tags.len > UINT16_MAX ? UINT16_MAX : scanner->tags.len;
31+
uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
12032
uint16_t serialized_tag_count = 0;
12133

12234
unsigned size = sizeof(tag_count);
12335
memcpy(&buffer[size], &tag_count, sizeof(tag_count));
12436
size += sizeof(tag_count);
12537

12638
for (; serialized_tag_count < tag_count; serialized_tag_count++) {
127-
Tag tag = scanner->tags.data[serialized_tag_count];
39+
Tag tag = scanner->tags.contents[serialized_tag_count];
12840
if (tag.type == CUSTOM) {
129-
unsigned name_length = tag.custom_tag_name.len;
41+
unsigned name_length = tag.custom_tag_name.size;
13042
if (name_length > UINT8_MAX) {
13143
name_length = UINT8_MAX;
13244
}
@@ -135,7 +47,7 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
13547
}
13648
buffer[size++] = (char)tag.type;
13749
buffer[size++] = (char)name_length;
138-
strncpy(&buffer[size], tag.custom_tag_name.data, name_length);
50+
strncpy(&buffer[size], tag.custom_tag_name.contents, name_length);
13951
size += name_length;
14052
} else {
14153
if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
@@ -150,7 +62,11 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
15062
}
15163

15264
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
153-
VEC_CLEAR(scanner->tags);
65+
for (unsigned i = 0; i < scanner->tags.size; i++) {
66+
tag_free(&scanner->tags.contents[i]);
67+
}
68+
array_clear(&scanner->tags);
69+
15470
if (length > 0) {
15571
unsigned size = 0;
15672
uint16_t tag_count = 0;
@@ -162,37 +78,34 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
16278
memcpy(&tag_count, &buffer[size], sizeof(tag_count));
16379
size += sizeof(tag_count);
16480

165-
VEC_RESIZE(scanner->tags, tag_count);
81+
array_reserve(&scanner->tags, tag_count);
16682
if (tag_count > 0) {
16783
unsigned iter = 0;
16884
for (iter = 0; iter < serialized_tag_count; iter++) {
169-
Tag tag = scanner->tags.data[iter];
85+
Tag tag = tag_new();
17086
tag.type = (TagType)buffer[size++];
17187
if (tag.type == CUSTOM) {
17288
uint16_t name_length = (uint8_t)buffer[size++];
173-
tag.custom_tag_name.len = name_length;
174-
tag.custom_tag_name.cap = name_length;
175-
tag.custom_tag_name.data = (char *)calloc(1, sizeof(char) * (name_length + 1));
176-
strncpy(tag.custom_tag_name.data, &buffer[size], name_length);
89+
array_reserve(&tag.custom_tag_name, name_length);
90+
tag.custom_tag_name.size = name_length;
91+
memcpy(tag.custom_tag_name.contents, &buffer[size], name_length);
17792
size += name_length;
17893
}
179-
VEC_PUSH(scanner->tags, tag);
94+
array_push(&scanner->tags, tag);
18095
}
18196
// add zero tags if we didn't read enough, this is because the
18297
// buffer had no more room but we held more tags.
18398
for (; iter < tag_count; iter++) {
184-
Tag tag = new_tag();
185-
VEC_PUSH(scanner->tags, tag);
99+
array_push(&scanner->tags, tag_new());
186100
}
187101
}
188102
}
189103
}
190104

191105
static String scan_tag_name(TSLexer *lexer) {
192-
String tag_name;
193-
STRING_INIT(tag_name);
106+
String tag_name = array_new();
194107
while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
195-
STRING_PUSH(tag_name, towupper(lexer->lookahead));
108+
array_push(&tag_name, towupper(lexer->lookahead));
196109
advance(lexer);
197110
}
198111
return tag_name;
@@ -230,13 +143,13 @@ static bool scan_comment(TSLexer *lexer) {
230143
}
231144

232145
static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
233-
if (scanner->tags.len == 0) {
146+
if (scanner->tags.size == 0) {
234147
return false;
235148
}
236149

237150
lexer->mark_end(lexer);
238151

239-
const char *end_delimiter = VEC_BACK(scanner->tags).type == SCRIPT ? "</SCRIPT" : "</STYLE";
152+
const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";
240153

241154
unsigned delimiter_index = 0;
242155
while (lexer->lookahead) {
@@ -258,70 +171,73 @@ static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
258171
}
259172

260173
static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
261-
Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags);
174+
Tag *parent = scanner->tags.size == 0 ? NULL : array_back(&scanner->tags);
262175

263176
bool is_closing_tag = false;
264177
if (lexer->lookahead == '/') {
265178
is_closing_tag = true;
266179
advance(lexer);
267180
} else {
268-
if (parent && is_void(parent)) {
269-
VEC_POP(scanner->tags);
181+
if (parent && tag_is_void(parent)) {
182+
array_pop(&scanner->tags);
270183
lexer->result_symbol = IMPLICIT_END_TAG;
271184
return true;
272185
}
273186
}
274187

275188
String tag_name = scan_tag_name(lexer);
276-
if (tag_name.len == 0 && !lexer->eof(lexer)) {
277-
STRING_FREE(tag_name);
189+
if (tag_name.size == 0 && !lexer->eof(lexer)) {
190+
array_delete(&tag_name);
278191
return false;
279192
}
280193

281-
Tag next_tag = for_name(tag_name.data);
194+
Tag next_tag = tag_for_name(tag_name);
282195

283196
if (is_closing_tag) {
284197
// The tag correctly closes the topmost element on the stack
285-
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &next_tag)) {
286-
STRING_FREE(tag_name);
198+
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &next_tag)) {
287199
tag_free(&next_tag);
288200
return false;
289201
}
290202

291203
// Otherwise, dig deeper and queue implicit end tags (to be nice in
292204
// the case of malformed HTML)
293-
for (unsigned i = scanner->tags.len; i > 0; i--) {
294-
if (scanner->tags.data[i - 1].type == next_tag.type) {
295-
VEC_POP(scanner->tags);
205+
for (unsigned i = scanner->tags.size; i > 0; i--) {
206+
if (scanner->tags.contents[i - 1].type == next_tag.type) {
207+
Tag popped_tag = array_pop(&scanner->tags);
208+
tag_free(&popped_tag);
296209
lexer->result_symbol = IMPLICIT_END_TAG;
297-
STRING_FREE(tag_name);
298210
tag_free(&next_tag);
299211
return true;
300212
}
301213
}
302-
} else if (parent &&
303-
(!can_contain(parent, &next_tag) ||
304-
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer))) {
305-
VEC_POP(scanner->tags);
214+
} else if (
215+
parent &&
216+
(
217+
!tag_can_contain(parent, &next_tag) ||
218+
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer)
219+
)
220+
) {
221+
Tag popped_tag = array_pop(&scanner->tags);
222+
tag_free(&popped_tag);
306223
lexer->result_symbol = IMPLICIT_END_TAG;
307-
STRING_FREE(tag_name);
308224
tag_free(&next_tag);
309225
return true;
310226
}
311227

312-
STRING_FREE(tag_name);
313228
tag_free(&next_tag);
314229
return false;
315230
}
316231

317232
static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
318233
String tag_name = scan_tag_name(lexer);
319-
if (tag_name.len == 0) {
320-
STRING_FREE(tag_name);
234+
if (tag_name.size == 0) {
235+
array_delete(&tag_name);
321236
return false;
322237
}
323-
Tag tag = for_name(tag_name.data);
324-
VEC_PUSH(scanner->tags, tag);
238+
239+
Tag tag = tag_for_name(tag_name);
240+
array_push(&scanner->tags, tag);
325241
switch (tag.type) {
326242
case SCRIPT:
327243
lexer->result_symbol = SCRIPT_START_TAG_NAME;
@@ -333,34 +249,37 @@ static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
333249
lexer->result_symbol = START_TAG_NAME;
334250
break;
335251
}
336-
STRING_FREE(tag_name);
337252
return true;
338253
}
339254

340255
static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
341256
String tag_name = scan_tag_name(lexer);
342-
if (tag_name.len == 0) {
343-
STRING_FREE(tag_name);
257+
258+
if (tag_name.size == 0) {
259+
array_delete(&tag_name);
344260
return false;
345261
}
346-
Tag tag = for_name(tag_name.data);
347-
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &tag)) {
348-
VEC_POP(scanner->tags);
262+
263+
Tag tag = tag_for_name(tag_name);
264+
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &tag)) {
265+
Tag popped_tag = array_pop(&scanner->tags);
266+
tag_free(&popped_tag);
349267
lexer->result_symbol = END_TAG_NAME;
350268
} else {
351269
lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
352270
}
271+
353272
tag_free(&tag);
354-
STRING_FREE(tag_name);
355273
return true;
356274
}
357275

358276
static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
359277
advance(lexer);
360278
if (lexer->lookahead == '>') {
361279
advance(lexer);
362-
if (scanner->tags.len > 0) {
363-
VEC_POP(scanner->tags);
280+
if (scanner->tags.size > 0) {
281+
Tag popped_tag = array_pop(&scanner->tags);
282+
tag_free(&popped_tag);
364283
lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
365284
}
366285
return true;
@@ -369,9 +288,6 @@ static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
369288
}
370289

371290
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
372-
if (scanner->tags.len > 0) {
373-
Tag *parent = &VEC_BACK(scanner->tags);
374-
}
375291
if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
376292
return scan_raw_text(scanner, lexer);
377293
}
@@ -439,9 +355,9 @@ void tree_sitter_html_external_scanner_deserialize(void *payload, const char *bu
439355

440356
void tree_sitter_html_external_scanner_destroy(void *payload) {
441357
Scanner *scanner = (Scanner *)payload;
442-
for (unsigned i = 0; i < scanner->tags.len; i++) {
443-
STRING_FREE(scanner->tags.data[i].custom_tag_name);
358+
for (unsigned i = 0; i < scanner->tags.size; i++) {
359+
tag_free(&scanner->tags.contents[i]);
444360
}
445-
VEC_FREE(scanner->tags);
361+
array_delete(&scanner->tags);
446362
free(scanner);
447363
}

0 commit comments

Comments
 (0)