From 4058f9b1704322f8185136c2558c2ab96a4d835c Mon Sep 17 00:00:00 2001 From: mattkae Date: Sun, 23 Apr 2023 20:23:54 -0400 Subject: Initial commit with a working parser --- src/tokenizer.cpp | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 src/tokenizer.cpp (limited to 'src/tokenizer.cpp') diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp new file mode 100644 index 0000000..30ca6bd --- /dev/null +++ b/src/tokenizer.cpp @@ -0,0 +1,268 @@ +#include "tokenizer.hpp" +#include "code_point.h" +#include "html_token.hpp" +#include +#include +#include +#include + +Tokenizer create(code_point_t* value) { + Tokenizer t; + t.ptr = value; + t.length = strlen((char*)value); + t.state = TokenizerState_Data; + return t; +} + +namespace CodePoints { + const code_point_t TAB = 0x0009; + const code_point_t LF = 0x000A; + const code_point_t FF = 0x00C; + const code_point_t SPACE = 0x020; + const code_point_t SOLIDUS = 0x02F; + const code_point_t LOWERCASE_A = 0x0061; + const code_point_t LOWERCASE_Z = 0x007A; + const code_point_t LOWERCASE_F = 0x0066; + const code_point_t UPPERCASE_A = 0x041; + const code_point_t UPPERCASE_Z = 0x05A; + const code_point_t UPPERCASE_F = 0x0046; + const code_point_t NULL_CHAR = 0x0000; + const code_point_t REPLACEMENT_CHAR = 0xFFFD; + const code_point_t GREATER_THAN_SIGN = 0x003E; + const code_point_t LESS_THAN_SIGN = 0x003C; + const code_point_t AMPERSAND = 0x0026; + const code_point_t EXCLAMATION_MARK = 0x0021; + const code_point_t NUMBER_SIGN = 0x0023; + const code_point_t LOWERCASE_X = 0x0078; + const code_point_t UPPERCASE_X = 0x0058; + const code_point_t DIGIT_ZERO = 0x0030; + const code_point_t DIGIT_NINE = 0x0039; +}; + +// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization + +inline void consume_next(Tokenizer* tokenizer) { + tokenizer->ptr++; +} + + +/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state +inline void data_state(Tokenizer* tokenizer) { + auto c = *tokenizer->ptr; + switch (c) { + case CodePoints::AMPERSAND: + tokenizer->state = TokenizerState_CharacterReferenceInData; + break; + case CodePoints::LESS_THAN_SIGN: + tokenizer->state = TokenizerState_TagOpen; + break; + case CodePoints::NULL_CHAR: + tokenizer->last.type = HtmlTokenType_EOF; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + default: + // TODO: @Error If null, throw an error + tokenizer->last.type = HtmlTokenType_Character; + tokenizer->last.character_token = *tokenizer->ptr; + tokenizer->flag |= TokenizerFlag_Emit; + break; + } +} + +/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state +inline void tag_open_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + switch (c) { + case CodePoints::EXCLAMATION_MARK: + tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO + break; + case CodePoints::SOLIDUS: // U+002F + tokenizer->state = TokenizerState_EndTagOpen; + break; + default: + // TODO: In these two case, we do NOT want to emit the token just yet. + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = c + 0x0020; // To lower + tokenizer->state = TokenizerState_TagName; + tokenizer->last.append_to_tag_name(c); + tokenizer->last.type = HtmlTokenType_StartTag; + } + else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { + tokenizer->state = TokenizerState_TagName; + tokenizer->last.append_to_tag_name(c); + tokenizer->last.type = HtmlTokenType_StartTag; + } + else if (c == '?') { // U+003F + // TODO: Parse error + tokenizer->state = TokenizerState_BogusComment; // TODO: + } + else { + tokenizer->state = TokenizerState_Data; + } + } +} + +inline void tag_name_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + switch (c) { + case CodePoints::TAB: + case CodePoints::FF: + case CodePoints::LF: + case CodePoints::SPACE: + tokenizer->state = TokenizerState_BeforeAttribute; + tokenizer->flag |= TokenizerFlag_Emit; + break; + case CodePoints::SOLIDUS: + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + case CodePoints::GREATER_THAN_SIGN: + tokenizer->state = TokenizerState_Data; + tokenizer->flag |= TokenizerFlag_Emit; + break; + default: + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = c + 0x0020; // To lower + tokenizer->last.append_to_tag_name(c); + } + else if (c == CodePoints::NULL_CHAR) { + // TODO: @Error + tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); + } + else if (c == EOF) { + // TODO: @Error + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; + } + else { + tokenizer->last.append_to_tag_name(c); + } + break; + } +} + +/// Process the end tag open state +/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state +inline void end_tag_open_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = c + 0x0020; // To lower + tokenizer->last.type = HtmlTokenType_EndTag; + tokenizer->last.append_to_tag_name(c); + tokenizer->state = TokenizerState_TagName; + } + else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { + tokenizer->last.type = HtmlTokenType_EndTag; + tokenizer->last.append_to_tag_name(c); + tokenizer->state = TokenizerState_TagName; + } + else if (c == CodePoints::GREATER_THAN_SIGN) { + // TODO: @Error We got something like + tokenizer->state = TokenizerState_Data; + } + else if (c == EOF) { + // TODO: @Error + // TODO: @Question Emit two tokens? + tokenizer->last.type = HtmlTokenType_Character; + tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN; + + tokenizer->last.type = HtmlTokenType_Character; + tokenizer->last.character_token = CodePoints::SOLIDUS; + + tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; + } + else if (c == CodePoints::NULL_CHAR) { + // TODO: @Error + tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); + } + else { + // TODO: @Error + tokenizer->state = TokenizerState_BogusComment; + } +} + +/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference +inline void try_consume_character_reference(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + case CodePoints::LESS_THAN_SIGN: + case CodePoints::AMPERSAND: + case EOF: { + // TODO: The additional allowed character? + // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) + tokenizer->state = TokenizerState_Data; + tokenizer->flag &= TokenizerFlag_IncrementPtr; + break; + } + case CodePoints::NUMBER_SIGN: { + consume_next(tokenizer); + c = *tokenizer->ptr; + + bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal + switch (c) { + case CodePoints::UPPERCASE_X: + case CodePoints::LOWERCASE_X: { + consume_next(tokenizer); + is_hex = true; + break; + } + default: { + is_hex = false; + break; + } + } + break; + } + } +} + +/// TODO: +inline void character_reference_in_data_state(Tokenizer* tokenizer) { + // A character reference begins with an ampersand + code_point_t c = *tokenizer->ptr; +} + +HtmlToken read_next(Tokenizer* tokenizer) { + tokenizer->flag = TokenizerFlag_None; + tokenizer->last.reset(); + tokenizer->state = TokenizerState_Data; + do { + // Reset all flags, except for IncrementPtr + tokenizer->flag = 0 | TokenizerFlag_IncrementPtr; + + switch (tokenizer->state) { + case TokenizerState_Data: + data_state(tokenizer); + break; + case TokenizerState_TagOpen: + tag_open_state(tokenizer); + break; + case TokenizerState_TagName: + tag_name_state(tokenizer); + break; + case TokenizerState_EndTagOpen: + end_tag_open_state(tokenizer); + break; + case TokenizerState_CharacterReferenceInData: + character_reference_in_data_state(tokenizer); + break; + default: + logger_error("Unsupported state, exploding: %d\n", tokenizer->state); + exit(1); + } + + if (tokenizer->flag & TokenizerFlag_IncrementPtr) { + tokenizer->ptr++; + } + + if (tokenizer->flag & TokenizerFlag_Emit) { + break; + } + } while (true); + + return tokenizer->last; +} -- cgit v1.2.1