From 4feb59d831d395369aa21d77e9b9d293125421d1 Mon Sep 17 00:00:00 2001 From: mattkae Date: Fri, 23 Jun 2023 10:25:52 -0400 Subject: Able to parse double quoted HTML attributes --- src/tokenizer.cpp | 392 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 303 insertions(+), 89 deletions(-) (limited to 'src/tokenizer.cpp') diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index dc0b8d7..2360c3c 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -34,6 +34,91 @@ inline void emit_character(Tokenizer* tokenizer, code_point_t c) { tokenizer->flag |= TokenizerFlag_Emit; } +/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference +/// Attempts to consume a character reference from the current tokenizer. If one cannot +/// be consumed, false is returned, otherwise true. +inline bool try_consume_character_reference(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + case CodePoints::LESS_THAN_SIGN: + case CodePoints::AMPERSAND: + case EOF: { + // TODO: The additional allowed character? + // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) + tokenizer->state = TokenizerState_Data; + return true; + } + case CodePoints::NUMBER_SIGN: { + consume_next(tokenizer); + c = *tokenizer->ptr; + + bool is_hex_value = false; + std::wstring code_entity; + if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { + is_hex_value = true; + consume_next(tokenizer); + c = *tokenizer->ptr; + + if (!CodePoints::is_hex(c)) { + unconsume_previous(tokenizer); // X + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return false; + } + + + while (CodePoints::is_hex(c)) { + code_entity += c; + consume_next(tokenizer); + c = *tokenizer->ptr; + } + } + else { + if (!CodePoints::is_decimal(c)) { + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return false; + } + + while (CodePoints::is_decimal(c)) { + code_entity += c; + consume_next(tokenizer); + c = *tokenizer->ptr; + } + } + + // We should have the hex value now. + if (c != CodePoints::SEMICOLON) { + // TODO: @Error parse error + return false; + } + + consume_next(tokenizer); + c = *tokenizer->ptr; + tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value); + auto is_parse_erorr = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity); + if (is_parse_erorr) { + // TODO: @Error + return false; + } + + return true; + } + default: { + // TODO: Tedious work lies ahead. + // Otherwise try and find the string by name in this table + // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references + logger_error("Unsupported character reference"); + return false; + } + } +} + /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { @@ -99,8 +184,7 @@ inline void tag_name_state(Tokenizer* tokenizer) { case CodePoints::FF: case CodePoints::LF: case CodePoints::SPACE: - tokenizer->state = TokenizerState_BeforeAttribute; - tokenizer->flag |= TokenizerFlag_Emit; + tokenizer->state = TokenizerState_BeforeAttributeName; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; @@ -130,6 +214,208 @@ inline void tag_name_state(Tokenizer* tokenizer) { } } +inline void before_attribute_name_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: { + // Ignore the character + break; + } + case CodePoints::SOLIDUS: { + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + } + case CodePoints::GREATER_THAN_SIGN: { + tokenizer->state = TokenizerState_Data; + tokenizer->flag |= TokenizerFlag_Emit; + break; + } + case CodePoints::NULL_CHAR: { + // TODO: @Error Parse error. + break; + } + case CodePoints::QUOTATION_MARK: + case CodePoints::APOSTROPHE: + case CodePoints::LESS_THAN_SIGN: + case CodePoints::EQUALS_SIGN: { + // TODO: @Error Parse error + // Treat this the same as the "default" case, which is funny + break; + } + case EOF: { + // TODO: @Error Parse error + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag & TokenizerFlag_NoIncrement; + break; + } + default: { + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = CodePoints::to_lower_case(c); + } + + tokenizer->last.start_attribute(); + tokenizer->last.add_to_attribute_name(c); + tokenizer->state = TokenizerState_AttributeNameState; + break; + } + } +} + +inline void attribute_name_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + tokenizer->state = TokenizerState_AfterAttributeNameState; + break; + case CodePoints::SOLIDUS: + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + case CodePoints::EQUALS_SIGN: + tokenizer->state = TokenizerState_BeforeAttributeValueState; + break; + case CodePoints::GREATER_THAN_SIGN: + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->last.add_to_attribute_name(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::QUOTATION_MARK: + case CodePoints::APOSTROPHE: + case CodePoints::LESS_THAN_SIGN: + // TODO: @ParseError + tokenizer->last.add_to_attribute_name(c); + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + tokenizer->state = TokenizerState_Data; + break; + default: + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = CodePoints::to_lower_case(c); + } + + tokenizer->last.add_to_attribute_name(c); + break; + } +} + +inline void before_attribute_value_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + break; + case CodePoints::QUOTATION_MARK: + tokenizer->state = TokenizerState_AttributeValueDoubleQuoted; + break; + case CodePoints::AMPERSAND: + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + case CodePoints::APOSTROPHE: + tokenizer->state = TokenizerState_AttributeValueSingleQuoted; + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::GREATER_THAN_SIGN: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::LESS_THAN_SIGN: + case CodePoints::EQUALS_SIGN: + case CodePoints::MY_EOF: + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + if (c == CodePoints::GRAVE_ACCENT) { + // TODO: @ParseError + } + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->last.add_to_attribute_value(c); + break; + } +} + +inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::QUOTATION_MARK: + tokenizer->state = TokenizerState_AfterAttributeValueQuoted; + break; + case CodePoints::AMPERSAND: + // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state + consume_next(tokenizer); + if (!try_consume_character_reference(tokenizer)) { + tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND); + break; + } + + tokenizer->last.add_to_attribute_value(tokenizer->last.entity); + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + tokenizer->last.add_to_attribute_value(c); + break; + } +} + +inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + tokenizer->state = TokenizerState_BeforeAttributeName; + break; + case CodePoints::SOLIDUS: + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + case CodePoints::GREATER_THAN_SIGN: + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + // TODO: @ParseError + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + tokenizer->state = TokenizerState_BeforeAttributeName; + break; + } +} + /// Process the end tag open state /// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state inline void end_tag_open_state(Tokenizer* tokenizer) { @@ -170,93 +456,6 @@ inline void end_tag_open_state(Tokenizer* tokenizer) { } } -/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference -/// Attempts to consume a character reference from the current tokenizer. If one cannot -/// be consumed, false is returned, otherwise true. -inline bool try_consume_character_reference(Tokenizer* tokenizer) { - code_point_t c = *tokenizer->ptr; - - switch (c) { - case CodePoints::TAB: - case CodePoints::LF: - case CodePoints::FF: - case CodePoints::SPACE: - case CodePoints::LESS_THAN_SIGN: - case CodePoints::AMPERSAND: - case EOF: { - // TODO: The additional allowed character? - // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) - tokenizer->state = TokenizerState_Data; - return true; - } - case CodePoints::NUMBER_SIGN: { - consume_next(tokenizer); - c = *tokenizer->ptr; - - bool is_hex_value = false; - code_point_t value = 0x0000; - if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { - is_hex_value = true; - consume_next(tokenizer); - c = *tokenizer->ptr; - - if (!CodePoints::is_hex(c)) { - unconsume_previous(tokenizer); // X - unconsume_previous(tokenizer); // Number sign - // TODO: @Error parse error - return false; - } - - - while (CodePoints::is_hex(c)) { - tokenizer->last.append_to_code_entity(c); - consume_next(tokenizer); - c = *tokenizer->ptr; - } - } - else { - if (!CodePoints::is_decimal(c)) { - unconsume_previous(tokenizer); // Number sign - // TODO: @Error parse error - return false; - } - - while (CodePoints::is_decimal(c)) { - tokenizer->last.append_to_code_entity(c); - consume_next(tokenizer); - c = *tokenizer->ptr; - } - } - - // We should have the hex value now. - if (c != CodePoints::SEMICOLON) { - // TODO: @Error parse error - return false; - } - - consume_next(tokenizer); - c = *tokenizer->ptr; - auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value); - printf("%d\n", code_entity); - - auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity); - if (is_parse_erorr) { - // TODO: @Error - return false; - } - - return true; - } - default: { - // TODO: Tedious work lies ahead. - // Otherwise try and find the string by name in this table - // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references - logger_error("Unsupported character reference"); - return false; - } - } -} - /// TODO: inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand @@ -282,6 +481,21 @@ HtmlToken read_next(Tokenizer* tokenizer) { case TokenizerState_TagName: tag_name_state(tokenizer); break; + case TokenizerState_BeforeAttributeName: + before_attribute_name_state(tokenizer); + break; + case TokenizerState_AttributeNameState: + attribute_name_state(tokenizer); + break; + case TokenizerState_BeforeAttributeValueState: + before_attribute_value_state(tokenizer); + break; + case TokenizerState_AttributeValueDoubleQuoted: + attribute_value_double_quoted_state(tokenizer); + break; + case TokenizerState_AfterAttributeValueQuoted: + after_attribute_value_quoted_state(tokenizer); + break; case TokenizerState_EndTagOpen: end_tag_open_state(tokenizer); break; -- cgit v1.2.1