// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization #include "tokenizer.hpp" #include "code_point.h" #include "html_token.hpp" #include #include #include #include Tokenizer create(code_point_t* value) { Tokenizer t; t.ptr = value; t.length = strlen((char*)value); t.state = TokenizerState_Data; return t; } // Helpers /// Consumes the next token by incrementing the ptr. inline void consume_next(Tokenizer* tokenizer) { tokenizer->ptr++; } // Unconsumed the next tokekn by decrementing the ptr. inline void unconsume_previous(Tokenizer* tokenizer) { tokenizer->ptr--; } inline void emit_character(Tokenizer* tokenizer, code_point_t c) { tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = c; tokenizer->flag |= TokenizerFlag_Emit; } /// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference /// Attempts to consume a character reference from the current tokenizer. If one cannot /// be consumed, false is returned, otherwise true. inline bool try_consume_character_reference(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: case CodePoints::LESS_THAN_SIGN: case CodePoints::AMPERSAND: case EOF: { // TODO: The additional allowed character? // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) tokenizer->state = TokenizerState_Data; return true; } case CodePoints::NUMBER_SIGN: { consume_next(tokenizer); c = *tokenizer->ptr; bool is_hex_value = false; std::wstring code_entity; if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { is_hex_value = true; consume_next(tokenizer); c = *tokenizer->ptr; if (!CodePoints::is_hex(c)) { unconsume_previous(tokenizer); // X unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error return false; } while (CodePoints::is_hex(c)) { code_entity += c; consume_next(tokenizer); c = *tokenizer->ptr; } } else { if (!CodePoints::is_decimal(c)) { unconsume_previous(tokenizer); // Number sign // TODO: @Error parse error return false; } while (CodePoints::is_decimal(c)) { code_entity += c; consume_next(tokenizer); c = *tokenizer->ptr; } } // We should have the hex value now. if (c != CodePoints::SEMICOLON) { // TODO: @Error parse error return false; } consume_next(tokenizer); c = *tokenizer->ptr; tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value); auto is_parse_erorr = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity); if (is_parse_erorr) { // TODO: @Error return false; } return true; } default: { // TODO: Tedious work lies ahead. // Otherwise try and find the string by name in this table // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references logger_error("Unsupported character reference"); return false; } } } /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { auto c = *tokenizer->ptr; switch (c) { case CodePoints::AMPERSAND: tokenizer->state = TokenizerState_CharacterReferenceInData; break; case CodePoints::LESS_THAN_SIGN: tokenizer->state = TokenizerState_TagOpen; break; case CodePoints::NULL_CHAR: tokenizer->last.type = HtmlTokenType_EOF; tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; break; default: // TODO: @Error If null, throw an error emit_character(tokenizer, *tokenizer->ptr); break; } } /// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state inline void tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::EXCLAMATION_MARK: tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO break; case CodePoints::SOLIDUS: // U+002F tokenizer->state = TokenizerState_EndTagOpen; break; default: // TODO: In these two case, we do NOT want to emit the token just yet. if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->state = TokenizerState_TagName; tokenizer->last.append_to_tag_name(c); tokenizer->last.type = HtmlTokenType_StartTag; } else if (c == '?') { // U+003F // TODO: Parse error tokenizer->state = TokenizerState_BogusComment; // TODO: } else { emit_character(tokenizer, CodePoints::LESS_THAN_SIGN); tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } break; } } inline void tag_name_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::FF: case CodePoints::LF: case CodePoints::SPACE: tokenizer->state = TokenizerState_BeforeAttributeName; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; break; case CodePoints::GREATER_THAN_SIGN: tokenizer->state = TokenizerState_Data; tokenizer->flag |= TokenizerFlag_Emit; break; default: if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.append_to_tag_name(c); } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else if (c == EOF) { // TODO: @Error tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else { tokenizer->last.append_to_tag_name(c); } break; } } inline void before_attribute_name_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: { // Ignore the character break; } case CodePoints::SOLIDUS: { tokenizer->state = TokenizerState_SelfClosingStartTag; break; } case CodePoints::GREATER_THAN_SIGN: { tokenizer->state = TokenizerState_Data; tokenizer->flag |= TokenizerFlag_Emit; break; } case CodePoints::NULL_CHAR: { // TODO: @Error Parse error. break; } case CodePoints::QUOTATION_MARK: case CodePoints::APOSTROPHE: case CodePoints::LESS_THAN_SIGN: case CodePoints::EQUALS_SIGN: { // TODO: @Error Parse error // Treat this the same as the "default" case, which is funny break; } case EOF: { // TODO: @Error Parse error tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag & TokenizerFlag_NoIncrement; break; } default: { if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = CodePoints::to_lower_case(c); } tokenizer->last.start_attribute(); tokenizer->last.add_to_attribute_name(c); tokenizer->state = TokenizerState_AttributeNameState; break; } } } inline void attribute_name_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: tokenizer->state = TokenizerState_AfterAttributeNameState; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; break; case CodePoints::EQUALS_SIGN: tokenizer->state = TokenizerState_BeforeAttributeValueState; break; case CodePoints::GREATER_THAN_SIGN: tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; break; case CodePoints::NULL_CHAR: // TODO: @ParseError tokenizer->last.add_to_attribute_name(CodePoints::REPLACEMENT_CHAR); break; case CodePoints::QUOTATION_MARK: case CodePoints::APOSTROPHE: case CodePoints::LESS_THAN_SIGN: // TODO: @ParseError tokenizer->last.add_to_attribute_name(c); break; case CodePoints::MY_EOF: // TODO: @ParseError tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; tokenizer->state = TokenizerState_Data; break; default: if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = CodePoints::to_lower_case(c); } tokenizer->last.add_to_attribute_name(c); break; } } inline void before_attribute_value_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: break; case CodePoints::QUOTATION_MARK: tokenizer->state = TokenizerState_AttributeValueDoubleQuoted; break; case CodePoints::AMPERSAND: tokenizer->state = TokenizerState_AttributeValueUnquoted; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; break; case CodePoints::APOSTROPHE: tokenizer->state = TokenizerState_AttributeValueSingleQuoted; break; case CodePoints::NULL_CHAR: // TODO: @ParseError tokenizer->state = TokenizerState_AttributeValueUnquoted; tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); break; case CodePoints::GREATER_THAN_SIGN: // TODO: @ParseError tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; break; case CodePoints::LESS_THAN_SIGN: case CodePoints::EQUALS_SIGN: case CodePoints::MY_EOF: tokenizer->state = TokenizerState_AttributeValueUnquoted; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; break; default: if (c == CodePoints::GRAVE_ACCENT) { // TODO: @ParseError } tokenizer->state = TokenizerState_AttributeValueUnquoted; tokenizer->last.add_to_attribute_value(c); break; } } inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::QUOTATION_MARK: tokenizer->state = TokenizerState_AfterAttributeValueQuoted; break; case CodePoints::AMPERSAND: // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state consume_next(tokenizer); if (!try_consume_character_reference(tokenizer)) { tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND); break; } tokenizer->last.add_to_attribute_value(tokenizer->last.entity); break; case CodePoints::NULL_CHAR: // TODO: @ParseError tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); break; case CodePoints::MY_EOF: // TODO: @ParseError tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; break; default: tokenizer->last.add_to_attribute_value(c); break; } } inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; switch (c) { case CodePoints::TAB: case CodePoints::LF: case CodePoints::FF: case CodePoints::SPACE: tokenizer->state = TokenizerState_BeforeAttributeName; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; break; case CodePoints::GREATER_THAN_SIGN: tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; break; case CodePoints::MY_EOF: // TODO: @ParseError tokenizer->state = TokenizerState_Data; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; break; default: // TODO: @ParseError tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; tokenizer->state = TokenizerState_BeforeAttributeName; break; } } /// Process the end tag open state /// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state inline void end_tag_open_state(Tokenizer* tokenizer) { code_point_t c = *tokenizer->ptr; if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { c = c + 0x0020; // To lower tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) { tokenizer->last.type = HtmlTokenType_EndTag; tokenizer->last.append_to_tag_name(c); tokenizer->state = TokenizerState_TagName; } else if (c == CodePoints::GREATER_THAN_SIGN) { // TODO: @Error We got something like tokenizer->state = TokenizerState_Data; } else if (c == EOF) { // TODO: @Error // TODO: @Question Emit two tokens? tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN; tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::SOLIDUS; tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR); } else { // TODO: @Error tokenizer->state = TokenizerState_BogusComment; } } /// TODO: inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand code_point_t c = *tokenizer->ptr; try_consume_character_reference(tokenizer); } HtmlToken read_next(Tokenizer* tokenizer) { tokenizer->flag = TokenizerFlag_None; tokenizer->last.reset(); tokenizer->state = TokenizerState_Data; do { // Reset all flags, except for IncrementPtr tokenizer->flag = 0; switch (tokenizer->state) { case TokenizerState_Data: data_state(tokenizer); break; case TokenizerState_TagOpen: tag_open_state(tokenizer); break; case TokenizerState_TagName: tag_name_state(tokenizer); break; case TokenizerState_BeforeAttributeName: before_attribute_name_state(tokenizer); break; case TokenizerState_AttributeNameState: attribute_name_state(tokenizer); break; case TokenizerState_BeforeAttributeValueState: before_attribute_value_state(tokenizer); break; case TokenizerState_AttributeValueDoubleQuoted: attribute_value_double_quoted_state(tokenizer); break; case TokenizerState_AfterAttributeValueQuoted: after_attribute_value_quoted_state(tokenizer); break; case TokenizerState_EndTagOpen: end_tag_open_state(tokenizer); break; case TokenizerState_CharacterReferenceInData: character_reference_in_data_state(tokenizer); break; default: logger_error("Unsupported state, exploding: %d\n", tokenizer->state); exit(1); } if ((tokenizer->flag & TokenizerFlag_NoIncrement) == 0) { tokenizer->ptr++; } if (tokenizer->flag & TokenizerFlag_Emit) { break; } } while (true); return tokenizer->last; }