Skip to content

Commit

Permalink
feat: convert scanner to use array.h header
Browse files Browse the repository at this point in the history
* Fix creation of uninitialized custom_tag_name in deserialize

Co-authored-by: Marshall <[email protected]>

* Use array header for strings and arrays

Co-authored-by: Marshall <[email protected]>

* Use memcmp instead of strncmp because the length is handled explicitly

Co-authored-by: Marshall <[email protected]>

---------

Co-authored-by: Marshall <[email protected]>
  • Loading branch information
maxbrunsfeld and maxdeviant authored Mar 18, 2024
1 parent b285e25 commit de93795
Show file tree
Hide file tree
Showing 4 changed files with 447 additions and 185 deletions.
210 changes: 63 additions & 147 deletions src/scanner.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "tag.h"

#include <wctype.h>
#include "tree_sitter/array.h"
#include "tag.h"

enum TokenType {
START_TAG_NAME,
Expand All @@ -18,115 +18,27 @@ enum TokenType {
};

/*
 * Hand-rolled growable array of Tag values, manipulated through the VEC_*
 * macros defined in this file.
 */
typedef struct {
/* Number of elements currently stored (index of the next free slot). */
uint32_t len;
/* Number of elements the `data` allocation can hold. */
uint32_t cap;
/* Heap storage managed with realloc()/free() by VEC_RESIZE and VEC_FREE. */
Tag *data;
} tags_vec;

/*
 * State of the external scanner: the stack of tags that scan_start_tag_name
 * pushes onto and the end-tag scanners pop from.
 *
 * NOTE(review): this listing shows two members named `tags` -- the
 * tags_vec-based declaration and the Array(Tag)-based one from
 * "tree_sitter/array.h". They look like the before/after lines of the same
 * change; a compilable struct can keep only one of them.
 */
typedef struct {
tags_vec tags;
Array(Tag) tags;
} Scanner;

/*
 * Larger of two values. Each argument is evaluated more than once, so never
 * pass an expression with side effects.
 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * Growable-vector helpers.
 *
 * They work on any struct with `len`, `cap` and `data` members. Every
 * statement-like macro is wrapped in do { ... } while (0) so that it behaves
 * as a single statement (safe inside an unbraced if/else) and so that its
 * temporaries cannot collide with, or leak into, the caller's scope. Call
 * sites must end with a semicolon. The `vec` and `_cap` arguments are
 * evaluated more than once.
 */

/* Grow the allocation to `_cap` elements; never shrinks, ignores `_cap` == 0. */
#define VEC_RESIZE(vec, _cap)                                                  \
    do {                                                                       \
        if ((_cap) > (vec).cap && (_cap) > 0) {                                \
            void *vec_resize_tmp_ =                                            \
                realloc((vec).data, (_cap) * sizeof((vec).data[0]));           \
            assert(vec_resize_tmp_ != NULL);                                   \
            (vec).data = vec_resize_tmp_;                                      \
            (vec).cap = (_cap);                                                \
        }                                                                      \
    } while (0)

/* Ensure capacity for at least `_cap` elements. */
#define VEC_GROW(vec, _cap)                                                    \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            VEC_RESIZE((vec), (_cap));                                         \
        }                                                                      \
    } while (0)

/* Append `el`, doubling the capacity (minimum 16) when the vector is full. */
#define VEC_PUSH(vec, el)                                                      \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

/*
 * Drop the last element, releasing it with tag_free() when it is a CUSTOM
 * tag. The vector must not be empty.
 */
#define VEC_POP(vec)                                                           \
    do {                                                                       \
        if (VEC_BACK(vec).type == CUSTOM) {                                    \
            tag_free(&VEC_BACK(vec));                                          \
        }                                                                      \
        (vec).len--;                                                           \
    } while (0)

/* Lvalue for the last element. The vector must not be empty. */
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

/* Release the storage. free(NULL) is a no-op, so no guard is needed. */
#define VEC_FREE(vec)                                                          \
    do {                                                                       \
        free((vec).data);                                                      \
        (vec).data = NULL;                                                     \
    } while (0)

/* tag_free() every element and reset the length; the allocation is kept. */
#define VEC_CLEAR(vec)                                                         \
    do {                                                                       \
        for (uint32_t vec_clear_i_ = 0; vec_clear_i_ < (vec).len;              \
             vec_clear_i_++) {                                                 \
            tag_free(&(vec).data[vec_clear_i_]);                               \
        }                                                                      \
        (vec).len = 0;                                                         \
    } while (0)

/*
 * String helpers.
 *
 * Same layout as the vectors above, but the allocation always holds one
 * element more than `cap` and the unused tail is kept zeroed, so `data` can
 * be read as a NUL-terminated C string.
 */

/* Reallocate to `_cap` characters (+1 terminator) and zero the unused tail. */
#define STRING_RESIZE(vec, _cap)                                               \
    do {                                                                       \
        void *string_resize_tmp_ =                                             \
            realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0]));         \
        assert(string_resize_tmp_ != NULL);                                    \
        (vec).data = string_resize_tmp_;                                       \
        memset((vec).data + (vec).len, 0,                                      \
               (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));            \
        (vec).cap = (_cap);                                                    \
    } while (0)

/* Ensure capacity for at least `_cap` characters. */
#define STRING_GROW(vec, _cap)                                                 \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            STRING_RESIZE((vec), (_cap));                                      \
        }                                                                      \
    } while (0)

/* Append `el`, doubling the capacity (minimum 16) when the string is full. */
#define STRING_PUSH(vec, el)                                                   \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            STRING_RESIZE((vec), MAX(16, (vec).len * 2));                      \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

/* Start an empty string with room for 16 characters plus the terminator. */
#define STRING_INIT(vec)                                                       \
    do {                                                                       \
        (vec).data = calloc(1, sizeof(char) * 17);                             \
        assert((vec).data != NULL);                                            \
        (vec).len = 0;                                                         \
        (vec).cap = 16;                                                        \
    } while (0)

/* Release the storage. free(NULL) is a no-op, so no guard is needed. */
#define STRING_FREE(vec)                                                       \
    do {                                                                       \
        free((vec).data);                                                      \
        (vec).data = NULL;                                                     \
    } while (0)

/*
 * Reset to the empty string while keeping the allocation. Tolerates a string
 * whose storage was already released with STRING_FREE (memset on NULL would
 * be undefined behaviour).
 */
#define STRING_CLEAR(vec)                                                      \
    do {                                                                       \
        (vec).len = 0;                                                         \
        if ((vec).data != NULL) {                                              \
            memset((vec).data, 0, (vec).cap * sizeof(char));                   \
        }                                                                      \
    } while (0)

/*
 * Move the lexer forward by one character, passing `false` as the second
 * argument of the lexer's advance callback (the tree-sitter `skip` flag).
 */
static inline void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
}

/*
 * Move the lexer forward by one character, passing `true` as the second
 * argument of the lexer's advance callback (the tree-sitter `skip` flag).
 */
static inline void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
}

static unsigned serialize(Scanner *scanner, char *buffer) {
uint16_t tag_count = scanner->tags.len > UINT16_MAX ? UINT16_MAX : scanner->tags.len;
uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
uint16_t serialized_tag_count = 0;

unsigned size = sizeof(tag_count);
memcpy(&buffer[size], &tag_count, sizeof(tag_count));
size += sizeof(tag_count);

for (; serialized_tag_count < tag_count; serialized_tag_count++) {
Tag tag = scanner->tags.data[serialized_tag_count];
Tag tag = scanner->tags.contents[serialized_tag_count];
if (tag.type == CUSTOM) {
unsigned name_length = tag.custom_tag_name.len;
unsigned name_length = tag.custom_tag_name.size;
if (name_length > UINT8_MAX) {
name_length = UINT8_MAX;
}
Expand All @@ -135,7 +47,7 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
}
buffer[size++] = (char)tag.type;
buffer[size++] = (char)name_length;
strncpy(&buffer[size], tag.custom_tag_name.data, name_length);
strncpy(&buffer[size], tag.custom_tag_name.contents, name_length);
size += name_length;
} else {
if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
Expand All @@ -150,7 +62,11 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
}

static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
VEC_CLEAR(scanner->tags);
for (unsigned i = 0; i < scanner->tags.size; i++) {
tag_free(&scanner->tags.contents[i]);
}
array_clear(&scanner->tags);

if (length > 0) {
unsigned size = 0;
uint16_t tag_count = 0;
Expand All @@ -162,37 +78,34 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
memcpy(&tag_count, &buffer[size], sizeof(tag_count));
size += sizeof(tag_count);

VEC_RESIZE(scanner->tags, tag_count);
array_reserve(&scanner->tags, tag_count);
if (tag_count > 0) {
unsigned iter = 0;
for (iter = 0; iter < serialized_tag_count; iter++) {
Tag tag = scanner->tags.data[iter];
Tag tag = tag_new();
tag.type = (TagType)buffer[size++];
if (tag.type == CUSTOM) {
uint16_t name_length = (uint8_t)buffer[size++];
tag.custom_tag_name.len = name_length;
tag.custom_tag_name.cap = name_length;
tag.custom_tag_name.data = (char *)calloc(1, sizeof(char) * (name_length + 1));
strncpy(tag.custom_tag_name.data, &buffer[size], name_length);
array_reserve(&tag.custom_tag_name, name_length);
tag.custom_tag_name.size = name_length;
memcpy(tag.custom_tag_name.contents, &buffer[size], name_length);
size += name_length;
}
VEC_PUSH(scanner->tags, tag);
array_push(&scanner->tags, tag);
}
// add zero tags if we didn't read enough, this is because the
// buffer had no more room but we held more tags.
for (; iter < tag_count; iter++) {
Tag tag = new_tag();
VEC_PUSH(scanner->tags, tag);
array_push(&scanner->tags, tag_new());
}
}
}
}

static String scan_tag_name(TSLexer *lexer) {
String tag_name;
STRING_INIT(tag_name);
String tag_name = array_new();
while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
STRING_PUSH(tag_name, towupper(lexer->lookahead));
array_push(&tag_name, towupper(lexer->lookahead));
advance(lexer);
}
return tag_name;
Expand Down Expand Up @@ -230,13 +143,13 @@ static bool scan_comment(TSLexer *lexer) {
}

static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
if (scanner->tags.len == 0) {
if (scanner->tags.size == 0) {
return false;
}

lexer->mark_end(lexer);

const char *end_delimiter = VEC_BACK(scanner->tags).type == SCRIPT ? "</SCRIPT" : "</STYLE";
const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";

unsigned delimiter_index = 0;
while (lexer->lookahead) {
Expand All @@ -258,70 +171,73 @@ static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
}

/*
 * Decide whether an IMPLICIT_END_TAG token should be produced for the tag on
 * top of scanner->tags.
 *
 * Returns true, after popping the top tag and setting lexer->result_symbol to
 * IMPLICIT_END_TAG, when one of the following holds:
 *   - the lookahead is not '/' and the top tag is a void tag;
 *   - a closing tag was scanned that does not match the top tag but has the
 *     same type as some tag deeper in the stack (one tag is popped per call);
 *   - an opening tag was scanned that the top tag cannot contain, or the top
 *     tag is HTML, HEAD or BODY and the lexer is at end of input.
 * Returns false otherwise, including when no tag name could be scanned and
 * the lexer is not at end of input.
 *
 * NOTE(review): this listing interleaves two versions of most statements --
 * one using the VEC_* / STRING_* macros with `.len` / `.data`, and one using
 * the array_* helpers with `.size` / `.contents`. They appear to be the
 * before/after lines of the same change; only one of each pair can be kept
 * for the function to compile.
 */
static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
/* Top of the tag stack, or NULL when the stack is empty. */
Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags);
Tag *parent = scanner->tags.size == 0 ? NULL : array_back(&scanner->tags);

bool is_closing_tag = false;
if (lexer->lookahead == '/') {
is_closing_tag = true;
advance(lexer);
} else {
/* A void parent is closed implicitly before anything else is scanned. */
/* NOTE(review): the array_pop variant does not tag_free the popped tag; */
/* presumably void tags never own a custom name -- confirm against tag.h. */
if (parent && is_void(parent)) {
VEC_POP(scanner->tags);
if (parent && tag_is_void(parent)) {
array_pop(&scanner->tags);
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
}

String tag_name = scan_tag_name(lexer);
/* An empty name is only accepted at end of input. */
if (tag_name.len == 0 && !lexer->eof(lexer)) {
STRING_FREE(tag_name);
if (tag_name.size == 0 && !lexer->eof(lexer)) {
array_delete(&tag_name);
return false;
}

/* NOTE(review): tag_for_name receives the String by value and the */
/* array_* variant never frees tag_name afterwards, so it presumably */
/* takes ownership of the name -- confirm against tag.h. */
Tag next_tag = for_name(tag_name.data);
Tag next_tag = tag_for_name(tag_name);

if (is_closing_tag) {
// The tag correctly closes the topmost element on the stack
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &next_tag)) {
STRING_FREE(tag_name);
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &next_tag)) {
tag_free(&next_tag);
return false;
}

// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
for (unsigned i = scanner->tags.len; i > 0; i--) {
if (scanner->tags.data[i - 1].type == next_tag.type) {
VEC_POP(scanner->tags);
for (unsigned i = scanner->tags.size; i > 0; i--) {
if (scanner->tags.contents[i - 1].type == next_tag.type) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = IMPLICIT_END_TAG;
STRING_FREE(tag_name);
tag_free(&next_tag);
return true;
}
}
} else if (parent &&
(!can_contain(parent, &next_tag) ||
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer))) {
VEC_POP(scanner->tags);
} else if (
parent &&
(
!tag_can_contain(parent, &next_tag) ||
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer)
)
) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = IMPLICIT_END_TAG;
STRING_FREE(tag_name);
tag_free(&next_tag);
return true;
}

/* No implicit end tag applies; release the temporary tag and report no token. */
STRING_FREE(tag_name);
tag_free(&next_tag);
return false;
}

static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
String tag_name = scan_tag_name(lexer);
if (tag_name.len == 0) {
STRING_FREE(tag_name);
if (tag_name.size == 0) {
array_delete(&tag_name);
return false;
}
Tag tag = for_name(tag_name.data);
VEC_PUSH(scanner->tags, tag);

Tag tag = tag_for_name(tag_name);
array_push(&scanner->tags, tag);
switch (tag.type) {
case SCRIPT:
lexer->result_symbol = SCRIPT_START_TAG_NAME;
Expand All @@ -333,34 +249,37 @@ static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
lexer->result_symbol = START_TAG_NAME;
break;
}
STRING_FREE(tag_name);
return true;
}

/*
 * Scan the name of an end tag.
 *
 * Returns false when no tag name characters are present. Otherwise returns
 * true with lexer->result_symbol set to END_TAG_NAME when the scanned tag
 * matches the tag on top of scanner->tags (which is then popped and freed),
 * or to ERRONEOUS_END_TAG_NAME when it does not match or the stack is empty.
 *
 * NOTE(review): this listing interleaves two versions of most statements
 * (VEC_* / STRING_* macros versus array_* helpers); only one of each pair can
 * be kept for the function to compile.
 */
static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
String tag_name = scan_tag_name(lexer);
if (tag_name.len == 0) {
STRING_FREE(tag_name);

if (tag_name.size == 0) {
array_delete(&tag_name);
return false;
}
Tag tag = for_name(tag_name.data);
/* presumably tagcmp / tag_eq return non-zero when the tags are equal -- confirm in tag.h */
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &tag)) {
VEC_POP(scanner->tags);

Tag tag = tag_for_name(tag_name);
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &tag)) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = END_TAG_NAME;
} else {
lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
}

/* The scanned tag was only needed for the comparison above. */
tag_free(&tag);
STRING_FREE(tag_name);
return true;
}

static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
if (scanner->tags.len > 0) {
VEC_POP(scanner->tags);
if (scanner->tags.size > 0) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
}
return true;
Expand All @@ -369,9 +288,6 @@ static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
}

static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
if (scanner->tags.len > 0) {
Tag *parent = &VEC_BACK(scanner->tags);
}
if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
return scan_raw_text(scanner, lexer);
}
Expand Down Expand Up @@ -439,9 +355,9 @@ void tree_sitter_html_external_scanner_deserialize(void *payload, const char *bu

/*
 * Destroy a scanner: release every tag still held in scanner->tags, then the
 * tag array's storage, then the Scanner object itself.
 *
 * `payload` is cast to Scanner * and must not be used after this call.
 *
 * NOTE(review): this listing interleaves two versions of the cleanup loop and
 * of the array release (STRING_FREE / VEC_FREE versus tag_free /
 * array_delete); only one of each pair can be kept for the function to
 * compile.
 */
void tree_sitter_html_external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
for (unsigned i = 0; i < scanner->tags.len; i++) {
STRING_FREE(scanner->tags.data[i].custom_tag_name);
for (unsigned i = 0; i < scanner->tags.size; i++) {
tag_free(&scanner->tags.contents[i]);
}
VEC_FREE(scanner->tags);
array_delete(&scanner->tags);
free(scanner);
}
Loading

0 comments on commit de93795

Please sign in to comment.