// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization #include "tokenizer.hpp" #include "code_point.h" #include "html_token.hpp" #include #include #include #include Tokenizer create(code_point_t* value) { Tokenizer t; t.ptr = value; t.length = strlen((char*)value); t.state = TokenizerState_Data; return t; } // Helpers /// Consumes the next token by incrementing the ptr. inline void consume_next(Tokenizer* tokenizer) { tokenizer->ptr++; } // Unconsumed the next tokekn by decrementing the ptr. inline void unconsume_previous(Tokenizer* tokenizer) { tokenizer->ptr--; } inline void emit_character(Tokenizer* tokenizer, code_point_t c) { tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = c; tokenizer->flag |= TokenizerFlag_Emit; } /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { auto c = *tokenizer->ptr; switch (c) { case CodePoints::AMPERSAND: tokenizer->state = TokenizerState_CharacterReferenceInData; break; case CodePoints::LESS_THAN_SIGN: tokenizer->state = TokenizerState_TagOpen; break; case CodePoints::NULL_CHAR: tokenizer->last.type = HtmlTokenType_EOF; tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; break; default: // TODO: @Error If null, throw an error emit_character(tokenizer, *tokenizer->ptr); break; } } /// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state inline void tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::EXCLAMATION_MARK: tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO break; case CodePoints::SOLIDUS: // U+002F tokenizer->state = TokenizerState_EndTagOpen; break; default: // TODO: In these two case, we do NOT want to emit the token just yet. if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c == '?') { // U+003F // TODO: Parse error tokenizer->state = TokenizerState_BogusComment; // TODO: } else { emit_character(tokenizer, CodePoints::LESS_THAN_SIGN); tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } break; } } inline void tag_name_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::FF: case CodePoints::LF: case CodePoints::SPACE: tokenizer->state = TokenizerState_BeforeAttribute; tokenizer->flag |= TokenizerFlag_Emit; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; break; case CodePoints::GREATER_THAN_SIGN: tokenizer->state = TokenizerState_Data; tokenizer->flag |= TokenizerFlag_Emit; break; default: if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.append_to_tag_name(c); } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else if (c == EOF) { // TODO: @Error tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else { tokenizer->last.append_to_tag_name(c); } break; } } /// Process the end tag open state /// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state inline void end_tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c == CodePoints::GREATER_THAN_SIGN) { // TODO: @Error We got something like tokenizer->state = TokenizerState_Data; } else if (c == EOF) { // TODO: @Error // TODO: @Question Emit two tokens? tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN; tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::SOLIDUS; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else { // TODO: @Error tokenizer->state = TokenizerState_BogusComment; } } /// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference /// Attempts to consume a character reference from the current tokenizer. If one cannot /// be consumed, false is returned, otherwise true. inline bool try_consume_character_reference(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: case CodePoints::LESS_THAN_SIGN: case CodePoints::AMPERSAND: case EOF: { // TODO: The additional allowed character? // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) tokenizer->state = TokenizerState_Data; return true; } case CodePoints::NUMBER_SIGN: { consume_next(tokenizer); c = *tokenizer->ptr; bool is_hex_value = false; code_point_t value = 0x0000; if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { is_hex_value = true; consume_next(tokenizer); c = *tokenizer->ptr; if (!CodePoints::is_hex(c)) { unconsume_previous(tokenizer); // X unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error return false; } while (CodePoints::is_hex(c)) { tokenizer->last.append_to_code_entity(c); consume_next(tokenizer); c = *tokenizer->ptr; } } else { if (!CodePoints::is_decimal(c)) { unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error return false; } while (CodePoints::is_decimal(c)) { tokenizer->last.append_to_code_entity(c); consume_next(tokenizer); c = *tokenizer->ptr; } } // We should have the hex value now. if (c != CodePoints::SEMICOLON) { // TODO: @Error parse error return false; } consume_next(tokenizer); c = *tokenizer->ptr; auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value); printf("%d\n", code_entity); auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity); if (is_parse_erorr) { // TODO: @Error return false; } return true; } default: { // TODO: Tedious work lies ahead. // Otherwise try and find the string by name in this table // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references logger_error("Unsupported character reference"); return false; } } } /// TODO: inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand code_point_t c = *tokenizer->ptr; try_consume_character_reference(tokenizer); } HtmlToken read_next(Tokenizer* tokenizer) { tokenizer->flag = TokenizerFlag_None; tokenizer->last.reset(); tokenizer->state = TokenizerState_Data; do { // Reset all flags, except for IncrementPtr tokenizer->flag = 0; switch (tokenizer->state) { case TokenizerState_Data: data_state(tokenizer); break; case TokenizerState_TagOpen: tag_open_state(tokenizer); break; case TokenizerState_TagName: tag_name_state(tokenizer); break; case TokenizerState_EndTagOpen: end_tag_open_state(tokenizer); break; case TokenizerState_CharacterReferenceInData: character_reference_in_data_state(tokenizer); break; default: logger_error("Unsupported state, exploding: %d\n", tokenizer->state); exit(1); } if ((tokenizer->flag & TokenizerFlag_NoIncrement) == 0) { tokenizer->ptr++; } if (tokenizer->flag & TokenizerFlag_Emit) { break; } } while (true); return tokenizer->last; }