#include "tokenizer.hpp"
#include "code_point.h"
#include "html_token.hpp"

// NOTE(review): the original listed four system includes whose header names
// were lost. The three below are the ones this file demonstrably needs
// (EOF, exit, strlen); confirm the fourth against version control.
#include <stdio.h>   // EOF
#include <stdlib.h>  // exit
#include <string.h>  // strlen

/// Builds a tokenizer that starts reading at `value` in the data state.
///
/// @param value Pointer to the first code point of the input. The buffer must
///              end with a zero code point: `data_state` reports end of input
///              when it reads U+0000.
/// @return A tokenizer with `ptr`, `length` and `state` set. No other member
///         is assigned here.
Tokenizer create(code_point_t* value) {
    Tokenizer t;
    t.ptr = value;
    // NOTE(review): strlen counts bytes up to the first zero *byte*, so this
    // is only the number of code points if code_point_t is char-sized -
    // confirm against code_point.h.
    t.length = strlen(reinterpret_cast<const char*>(value));
    t.state = TokenizerState_Data;
    return t;
}

/// Named code points used by the tokenizer states.
namespace CodePoints {
    constexpr code_point_t TAB = 0x0009;
    constexpr code_point_t LF = 0x000A;
    constexpr code_point_t FF = 0x000C;
    constexpr code_point_t SPACE = 0x0020;
    constexpr code_point_t SOLIDUS = 0x002F;

    constexpr code_point_t LOWERCASE_A = 0x0061;
    constexpr code_point_t LOWERCASE_Z = 0x007A;
    constexpr code_point_t LOWERCASE_F = 0x0066;

    constexpr code_point_t UPPERCASE_A = 0x0041;
    constexpr code_point_t UPPERCASE_Z = 0x005A;
    constexpr code_point_t UPPERCASE_F = 0x0046;

    constexpr code_point_t NULL_CHAR = 0x0000;
    constexpr code_point_t REPLACEMENT_CHAR = 0xFFFD;

    constexpr code_point_t GREATER_THAN_SIGN = 0x003E;
    constexpr code_point_t LESS_THAN_SIGN = 0x003C;
    constexpr code_point_t AMPERSAND = 0x0026;
    constexpr code_point_t EXCLAMATION_MARK = 0x0021;
    constexpr code_point_t NUMBER_SIGN = 0x0023;

    constexpr code_point_t LOWERCASE_X = 0x0078;
    constexpr code_point_t UPPERCASE_X = 0x0058;

    constexpr code_point_t DIGIT_ZERO = 0x0030;
    constexpr code_point_t DIGIT_NINE = 0x0039;
}

// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization

/// Advances the read position by one code point.
inline void consume_next(Tokenizer* tokenizer) {
    tokenizer->ptr++;
}

/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
///
/// Handles the current code point in the data state:
///  - `&` switches to the character-reference-in-data state,
///  - `<` switches to the tag-open state,
///  - U+0000 fills `last` with an EOF token and requests emission,
///  - anything else fills `last` with a character token and requests emission.
inline void data_state(Tokenizer* tokenizer) {
    const auto current = *tokenizer->ptr;

    switch (current) {
        case CodePoints::AMPERSAND:
            tokenizer->state = TokenizerState_CharacterReferenceInData;
            break;

        case CodePoints::LESS_THAN_SIGN:
            tokenizer->state = TokenizerState_TagOpen;
            break;

        case CodePoints::NULL_CHAR:
            // A zero code point is reported as end of input rather than as a
            // parse error.
            tokenizer->last.type = HtmlTokenType_EOF;
            tokenizer->flag |= TokenizerFlag_Emit;
            break;

        default:
            tokenizer->last.type = HtmlTokenType_Character;
            tokenizer->last.character_token = current;
            tokenizer->flag |= TokenizerFlag_Emit;
            break;
    }
}

///
https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state inline void tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::EXCLAMATION_MARK: tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO break; case CodePoints::SOLIDUS: // U+002F tokenizer->state = TokenizerState_EndTagOpen; break; default: // TODO: In these two case, we do NOT want to emit the token just yet. if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c == '?') { // U+003F // TODO: Parse error tokenizer->state = TokenizerState_BogusComment; // TODO: } else { tokenizer->state = TokenizerState_Data; } } } inline void tag_name_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::FF: case CodePoints::LF: case CodePoints::SPACE: tokenizer->state = TokenizerState_BeforeAttribute; tokenizer->flag |= TokenizerFlag_Emit; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; break; case CodePoints::GREATER_THAN_SIGN: tokenizer->state = TokenizerState_Data; tokenizer->flag |= TokenizerFlag_Emit; break; default: if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.append_to_tag_name(c); } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else if (c == EOF) { // TODO: @Error tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; } else { tokenizer->last.append_to_tag_name(c); } break; } } 
/// Process the end tag open state /// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state inline void end_tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c == CodePoints::GREATER_THAN_SIGN) { // TODO: @Error We got something like tokenizer->state = TokenizerState_Data; } else if (c == EOF) { // TODO: @Error // TODO: @Question Emit two tokens? tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN; tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::SOLIDUS; tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr; } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else { // TODO: @Error tokenizer->state = TokenizerState_BogusComment; } } /// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference inline void try_consume_character_reference(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: case CodePoints::LESS_THAN_SIGN: case CodePoints::AMPERSAND: case EOF: { // TODO: The additional allowed character? // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) 
tokenizer->state = TokenizerState_Data; tokenizer->flag &= TokenizerFlag_IncrementPtr; break; } case CodePoints::NUMBER_SIGN: { consume_next(tokenizer); c = *tokenizer->ptr; bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal switch (c) { case CodePoints::UPPERCASE_X: case CodePoints::LOWERCASE_X: { consume_next(tokenizer); is_hex = true; break; } default: { is_hex = false; break; } } break; } } } /// TODO: inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand code_point_t c = *tokenizer->ptr; } HtmlToken read_next(Tokenizer* tokenizer) { tokenizer->flag = TokenizerFlag_None; tokenizer->last.reset(); tokenizer->state = TokenizerState_Data; do { // Reset all flags, except for IncrementPtr tokenizer->flag = 0 | TokenizerFlag_IncrementPtr; switch (tokenizer->state) { case TokenizerState_Data: data_state(tokenizer); break; case TokenizerState_TagOpen: tag_open_state(tokenizer); break; case TokenizerState_TagName: tag_name_state(tokenizer); break; case TokenizerState_EndTagOpen: end_tag_open_state(tokenizer); break; case TokenizerState_CharacterReferenceInData: character_reference_in_data_state(tokenizer); break; default: logger_error("Unsupported state, exploding: %d\n", tokenizer->state); exit(1); } if (tokenizer->flag & TokenizerFlag_IncrementPtr) { tokenizer->ptr++; } if (tokenizer->flag & TokenizerFlag_Emit) { break; } } while (true); return tokenizer->last; }