From d53f2e7107cf63669b705c3abf08c129eeb0315e Mon Sep 17 00:00:00 2001 From: mattkae Date: Thu, 22 Jun 2023 13:29:49 -0400 Subject: Parsing most html entities except for the last category --- src/tokenizer.cpp | 90 ++++++++++++++++++++++++++----------------------------- 1 file changed, 43 insertions(+), 47 deletions(-) (limited to 'src/tokenizer.cpp') diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 8cf9b31..9931d59 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -15,49 +15,15 @@ Tokenizer create(code_point_t* value) { return t; } -namespace CodePoints { - const code_point_t TAB = 0x0009; - const code_point_t LF = 0x000A; - const code_point_t FF = 0x00C; - const code_point_t SPACE = 0x020; - const code_point_t SOLIDUS = 0x02F; - const code_point_t LOWERCASE_A = 0x0061; - const code_point_t LOWERCASE_Z = 0x007A; - const code_point_t LOWERCASE_F = 0x0066; - const code_point_t UPPERCASE_A = 0x041; - const code_point_t UPPERCASE_Z = 0x05A; - const code_point_t UPPERCASE_F = 0x0046; - const code_point_t NULL_CHAR = 0x0000; - const code_point_t REPLACEMENT_CHAR = 0xFFFD; - const code_point_t GREATER_THAN_SIGN = 0x003E; - const code_point_t LESS_THAN_SIGN = 0x003C; - const code_point_t AMPERSAND = 0x0026; - const code_point_t EXCLAMATION_MARK = 0x0021; - const code_point_t NUMBER_SIGN = 0x0023; - const code_point_t LOWERCASE_X = 0x0078; - const code_point_t UPPERCASE_X = 0x0058; - const code_point_t DIGIT_ZERO = 0x0030; - const code_point_t DIGIT_NINE = 0x0039; - - inline bool is_decimal(code_point_t c) { - return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE; - } - - inline bool is_hex(code_point_t c) { - return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE) - || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F) - || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F); - } -}; - // Helpers - +/// Consumes the next token by incrementing the ptr. inline void consume_next(Tokenizer* tokenizer) { tokenizer->ptr++; } +// Unconsumed the next tokekn by decrementing the ptr. inline void unconsume_previous(Tokenizer* tokenizer) { tokenizer->ptr--; } @@ -148,7 +114,7 @@ inline void tag_name_state(Tokenizer* tokenizer) { else if (c == EOF) { // TODO: @Error tokenizer->state = TokenizerState_Data; - tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; + tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr; } else { tokenizer->last.append_to_tag_name(c); @@ -185,7 +151,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) { tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::SOLIDUS; - tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; + tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr; } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error @@ -198,7 +164,9 @@ inline void end_tag_open_state(Tokenizer* tokenizer) { } /// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference -inline void try_consume_character_reference(Tokenizer* tokenizer) { +/// Attempts to consume a character reference from the current tokenizer. If one cannot +/// be consumed, false is returned, otherwise true. +inline bool try_consume_character_reference(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { @@ -212,16 +180,16 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { // TODO: The additional allowed character? // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) tokenizer->state = TokenizerState_Data; - tokenizer->flag &= TokenizerFlag_IncrementPtr; - break; + return true; } case CodePoints::NUMBER_SIGN: { consume_next(tokenizer); c = *tokenizer->ptr; - bool none_match_range = false; + bool is_hex_value = false; code_point_t value = 0x0000; if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { + is_hex_value = true; consume_next(tokenizer); c = *tokenizer->ptr; @@ -229,10 +197,12 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { unconsume_previous(tokenizer); // X unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error - return; + return false; } + while (CodePoints::is_hex(c)) { + tokenizer->last.append_to_code_entity(c); consume_next(tokenizer); c = *tokenizer->ptr; } @@ -241,16 +211,41 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { if (!CodePoints::is_decimal(c)) { unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error - return; + return false; } while (CodePoints::is_decimal(c)) { + tokenizer->last.append_to_code_entity(c); consume_next(tokenizer); c = *tokenizer->ptr; } } - break; + // We should have the hex value now. + if (c != CodePoints::SEMICOLON) { + // TODO: @Error parse error + return false; + } + + consume_next(tokenizer); + c = *tokenizer->ptr; + auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value); + printf("%d\n", code_entity); + + auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity); + if (is_parse_erorr) { + // TODO: @Error + return false; + } + + return true; + } + default: { + // TODO: Tedious work lies ahead. + // Otherwise try and find the string by name in this table + // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references + logger_error("Unsupported character reference"); + return false; } } } @@ -259,6 +254,7 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand code_point_t c = *tokenizer->ptr; + try_consume_character_reference(tokenizer); } HtmlToken read_next(Tokenizer* tokenizer) { @@ -267,7 +263,7 @@ HtmlToken read_next(Tokenizer* tokenizer) { tokenizer->state = TokenizerState_Data; do { // Reset all flags, except for IncrementPtr - tokenizer->flag = 0 | TokenizerFlag_IncrementPtr; + tokenizer->flag = 0; switch (tokenizer->state) { case TokenizerState_Data: @@ -290,7 +286,7 @@ HtmlToken read_next(Tokenizer* tokenizer) { exit(1); } - if (tokenizer->flag & TokenizerFlag_IncrementPtr) { + if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) { tokenizer->ptr++; } -- cgit v1.2.1