From 4feb59d831d395369aa21d77e9b9d293125421d1 Mon Sep 17 00:00:00 2001 From: mattkae Date: Fri, 23 Jun 2023 10:25:52 -0400 Subject: Able to parse double quoted HTML attributes --- examples/1.html | 2 +- src/code_point.h | 10 ++ src/html_token.cpp | 9 +- src/html_token.hpp | 31 ++++- src/tokenizer.cpp | 392 +++++++++++++++++++++++++++++++++++++++++------------ src/tokenizer.hpp | 13 +- 6 files changed, 359 insertions(+), 98 deletions(-) diff --git a/examples/1.html b/examples/1.html index f83a60a..f146e41 100644 --- a/examples/1.html +++ b/examples/1.html @@ -1,7 +1,7 @@ Meow is < bark
-

Hello World

+

Hello World

I am in a paragraph ©

diff --git a/src/code_point.h b/src/code_point.h index c039907..59afd75 100644 --- a/src/code_point.h +++ b/src/code_point.h @@ -2,6 +2,7 @@ #define CODE_POINT_H #include +#include typedef wchar_t code_point_t; @@ -26,6 +27,8 @@ namespace CodePoints { const code_point_t REPLACEMENT_CHAR = 0xFFFD; const code_point_t GREATER_THAN_SIGN = 0x003E; const code_point_t LESS_THAN_SIGN = 0x003C; + const code_point_t EQUALS_SIGN = 0x003D; + const code_point_t GRAVE_ACCENT = 0x0060; const code_point_t AMPERSAND = 0x0026; const code_point_t EXCLAMATION_MARK = 0x0021; const code_point_t NUMBER_SIGN = 0x0023; @@ -34,6 +37,9 @@ namespace CodePoints { const code_point_t DIGIT_ZERO = 0x0030; const code_point_t DIGIT_NINE = 0x0039; const code_point_t SEMICOLON = 0x003B; + const code_point_t QUOTATION_MARK = 0x0022; + const code_point_t APOSTROPHE = 0x0027; + const code_point_t MY_EOF = EOF; inline bool is_decimal(code_point_t c) { return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE; @@ -153,6 +159,10 @@ namespace CodePoints { out = c; return true; } + + inline code_point_t to_lower_case(code_point_t c) { + return c + 0x0020; + } }; #endif diff --git a/src/html_token.cpp b/src/html_token.cpp index 1d0952d..1eabaa8 100644 --- a/src/html_token.cpp +++ b/src/html_token.cpp @@ -5,7 +5,6 @@ const char* TOKEN_TO_NAME_MAP[HtmlTokenType_Length] = { "Text", "Start Tag", "End Tag", - "Attribute", "EOF", "Character" }; @@ -18,7 +17,12 @@ void HtmlToken::print() { break; case HtmlTokenType_StartTag: case HtmlTokenType_EndTag: - logger_info("%s, %S", name, tag_name.c_str()); + logger_info("%s, %S, attributes: %lu", name, tag_name.c_str(), attributes.size()); + + for (auto i = 0; i < attributes.size(); i++) { + HtmlAttribute& attribute = attributes[i]; + printf("\tattribute: %S=%S\n", attribute.name.c_str(), attribute.value.c_str()); + } break; default: logger_info("%s", name); @@ -29,4 +33,5 @@ void HtmlToken::reset() { type = HtmlTokenType_None; tag_name.clear(); code_entity.clear(); + attributes.clear(); } diff --git a/src/html_token.hpp b/src/html_token.hpp index e691d21..09a5c98 100644 --- a/src/html_token.hpp +++ b/src/html_token.hpp @@ -3,17 +3,22 @@ #include #include "code_point.h" +#include enum HtmlTokenType { HtmlTokenType_None = 0, HtmlTokenType_StartTag, HtmlTokenType_EndTag, - HtmlTokenType_Attribute, HtmlTokenType_EOF, HtmlTokenType_Character, HtmlTokenType_Length }; +struct HtmlAttribute { + std::wstring name; + std::wstring value; +}; + struct HtmlToken { HtmlTokenType type; @@ -21,19 +26,34 @@ struct HtmlToken { char character_token; std::wstring tag_name; std::wstring code_entity; + std::vector attributes; + + HtmlAttribute* active_attribute; + + code_point_t entity; void append_to_tag_name(code_point_t c) { tag_name += c; } - void append_to_code_entity(code_point_t c) { - code_entity += c; + void add_to_attribute_name(code_point_t c) { + active_attribute->name += c; + } + + void add_to_attribute_value(code_point_t c) { + active_attribute->value += c; + } + + void start_attribute() { + auto length = attributes.size(); + attributes.push_back(HtmlAttribute()); + active_attribute = &attributes[length]; } /// Transforms the code_entity into a usable value. /// Note that we are assuming that the code_entity is /// valid at this point in time. - code_point_t code_entity_to_value(bool is_hex) { + void set_code_entity_to_value(const std::wstring& code_entity, bool is_hex) { code_point_t value = 0x0000; if (is_hex) { int multiplier = 1; @@ -64,7 +84,8 @@ struct HtmlToken { multiplier *= 10; } } - return value; + + entity = value; } void print(); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index dc0b8d7..2360c3c 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -34,6 +34,91 @@ inline void emit_character(Tokenizer* tokenizer, code_point_t c) { tokenizer->flag |= TokenizerFlag_Emit; } +/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference +/// Attempts to consume a character reference from the current tokenizer. If one cannot +/// be consumed, false is returned, otherwise true. +inline bool try_consume_character_reference(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + case CodePoints::LESS_THAN_SIGN: + case CodePoints::AMPERSAND: + case EOF: { + // TODO: The additional allowed character? + // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) + tokenizer->state = TokenizerState_Data; + return true; + } + case CodePoints::NUMBER_SIGN: { + consume_next(tokenizer); + c = *tokenizer->ptr; + + bool is_hex_value = false; + std::wstring code_entity; + if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { + is_hex_value = true; + consume_next(tokenizer); + c = *tokenizer->ptr; + + if (!CodePoints::is_hex(c)) { + unconsume_previous(tokenizer); // X + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return false; + } + + + while (CodePoints::is_hex(c)) { + code_entity += c; + consume_next(tokenizer); + c = *tokenizer->ptr; + } + } + else { + if (!CodePoints::is_decimal(c)) { + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return false; + } + + while (CodePoints::is_decimal(c)) { + code_entity += c; + consume_next(tokenizer); + c = *tokenizer->ptr; + } + } + + // We should have the hex value now. + if (c != CodePoints::SEMICOLON) { + // TODO: @Error parse error + return false; + } + + consume_next(tokenizer); + c = *tokenizer->ptr; + tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value); + auto is_parse_erorr = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity); + if (is_parse_erorr) { + // TODO: @Error + return false; + } + + return true; + } + default: { + // TODO: Tedious work lies ahead. + // Otherwise try and find the string by name in this table + // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references + logger_error("Unsupported character reference"); + return false; + } + } +} + /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { @@ -99,8 +184,7 @@ inline void tag_name_state(Tokenizer* tokenizer) { case CodePoints::FF: case CodePoints::LF: case CodePoints::SPACE: - tokenizer->state = TokenizerState_BeforeAttribute; - tokenizer->flag |= TokenizerFlag_Emit; + tokenizer->state = TokenizerState_BeforeAttributeName; break; case CodePoints::SOLIDUS: tokenizer->state = TokenizerState_SelfClosingStartTag; @@ -130,6 +214,208 @@ inline void tag_name_state(Tokenizer* tokenizer) { } } +inline void before_attribute_name_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: { + // Ignore the character + break; + } + case CodePoints::SOLIDUS: { + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + } + case CodePoints::GREATER_THAN_SIGN: { + tokenizer->state = TokenizerState_Data; + tokenizer->flag |= TokenizerFlag_Emit; + break; + } + case CodePoints::NULL_CHAR: { + // TODO: @Error Parse error. + break; + } + case CodePoints::QUOTATION_MARK: + case CodePoints::APOSTROPHE: + case CodePoints::LESS_THAN_SIGN: + case CodePoints::EQUALS_SIGN: { + // TODO: @Error Parse error + // Treat this the same as the "default" case, which is funny + break; + } + case EOF: { + // TODO: @Error Parse error + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag & TokenizerFlag_NoIncrement; + break; + } + default: { + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = CodePoints::to_lower_case(c); + } + + tokenizer->last.start_attribute(); + tokenizer->last.add_to_attribute_name(c); + tokenizer->state = TokenizerState_AttributeNameState; + break; + } + } +} + +inline void attribute_name_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + tokenizer->state = TokenizerState_AfterAttributeNameState; + break; + case CodePoints::SOLIDUS: + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + case CodePoints::EQUALS_SIGN: + tokenizer->state = TokenizerState_BeforeAttributeValueState; + break; + case CodePoints::GREATER_THAN_SIGN: + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->last.add_to_attribute_name(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::QUOTATION_MARK: + case CodePoints::APOSTROPHE: + case CodePoints::LESS_THAN_SIGN: + // TODO: @ParseError + tokenizer->last.add_to_attribute_name(c); + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + tokenizer->state = TokenizerState_Data; + break; + default: + if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) { + c = CodePoints::to_lower_case(c); + } + + tokenizer->last.add_to_attribute_name(c); + break; + } +} + +inline void before_attribute_value_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + break; + case CodePoints::QUOTATION_MARK: + tokenizer->state = TokenizerState_AttributeValueDoubleQuoted; + break; + case CodePoints::AMPERSAND: + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + case CodePoints::APOSTROPHE: + tokenizer->state = TokenizerState_AttributeValueSingleQuoted; + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::GREATER_THAN_SIGN: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::LESS_THAN_SIGN: + case CodePoints::EQUALS_SIGN: + case CodePoints::MY_EOF: + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + if (c == CodePoints::GRAVE_ACCENT) { + // TODO: @ParseError + } + tokenizer->state = TokenizerState_AttributeValueUnquoted; + tokenizer->last.add_to_attribute_value(c); + break; + } +} + +inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::QUOTATION_MARK: + tokenizer->state = TokenizerState_AfterAttributeValueQuoted; + break; + case CodePoints::AMPERSAND: + // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state + consume_next(tokenizer); + if (!try_consume_character_reference(tokenizer)) { + tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND); + break; + } + + tokenizer->last.add_to_attribute_value(tokenizer->last.entity); + break; + case CodePoints::NULL_CHAR: + // TODO: @ParseError + tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR); + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + tokenizer->last.add_to_attribute_value(c); + break; + } +} + +inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) { + code_point_t c = *tokenizer->ptr; + + switch (c) { + case CodePoints::TAB: + case CodePoints::LF: + case CodePoints::FF: + case CodePoints::SPACE: + tokenizer->state = TokenizerState_BeforeAttributeName; + break; + case CodePoints::SOLIDUS: + tokenizer->state = TokenizerState_SelfClosingStartTag; + break; + case CodePoints::GREATER_THAN_SIGN: + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit; + break; + case CodePoints::MY_EOF: + // TODO: @ParseError + tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + break; + default: + // TODO: @ParseError + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; + tokenizer->state = TokenizerState_BeforeAttributeName; + break; + } +} + /// Process the end tag open state /// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state inline void end_tag_open_state(Tokenizer* tokenizer) { @@ -170,93 +456,6 @@ inline void end_tag_open_state(Tokenizer* tokenizer) { } } -/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference -/// Attempts to consume a character reference from the current tokenizer. If one cannot -/// be consumed, false is returned, otherwise true. -inline bool try_consume_character_reference(Tokenizer* tokenizer) { - code_point_t c = *tokenizer->ptr; - - switch (c) { - case CodePoints::TAB: - case CodePoints::LF: - case CodePoints::FF: - case CodePoints::SPACE: - case CodePoints::LESS_THAN_SIGN: - case CodePoints::AMPERSAND: - case EOF: { - // TODO: The additional allowed character? - // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.) - tokenizer->state = TokenizerState_Data; - return true; - } - case CodePoints::NUMBER_SIGN: { - consume_next(tokenizer); - c = *tokenizer->ptr; - - bool is_hex_value = false; - code_point_t value = 0x0000; - if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { - is_hex_value = true; - consume_next(tokenizer); - c = *tokenizer->ptr; - - if (!CodePoints::is_hex(c)) { - unconsume_previous(tokenizer); // X - unconsume_previous(tokenizer); // Number sign - // TODO: @Error parse error - return false; - } - - - while (CodePoints::is_hex(c)) { - tokenizer->last.append_to_code_entity(c); - consume_next(tokenizer); - c = *tokenizer->ptr; - } - } - else { - if (!CodePoints::is_decimal(c)) { - unconsume_previous(tokenizer); // Number sign - // TODO: @Error parse error - return false; - } - - while (CodePoints::is_decimal(c)) { - tokenizer->last.append_to_code_entity(c); - consume_next(tokenizer); - c = *tokenizer->ptr; - } - } - - // We should have the hex value now. - if (c != CodePoints::SEMICOLON) { - // TODO: @Error parse error - return false; - } - - consume_next(tokenizer); - c = *tokenizer->ptr; - auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value); - printf("%d\n", code_entity); - - auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity); - if (is_parse_erorr) { - // TODO: @Error - return false; - } - - return true; - } - default: { - // TODO: Tedious work lies ahead. - // Otherwise try and find the string by name in this table - // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references - logger_error("Unsupported character reference"); - return false; - } - } -} - /// TODO: inline void character_reference_in_data_state(Tokenizer* tokenizer) { // A character reference begins with an ampersand @@ -282,6 +481,21 @@ HtmlToken read_next(Tokenizer* tokenizer) { case TokenizerState_TagName: tag_name_state(tokenizer); break; + case TokenizerState_BeforeAttributeName: + before_attribute_name_state(tokenizer); + break; + case TokenizerState_AttributeNameState: + attribute_name_state(tokenizer); + break; + case TokenizerState_BeforeAttributeValueState: + before_attribute_value_state(tokenizer); + break; + case TokenizerState_AttributeValueDoubleQuoted: + attribute_value_double_quoted_state(tokenizer); + break; + case TokenizerState_AfterAttributeValueQuoted: + after_attribute_value_quoted_state(tokenizer); + break; case TokenizerState_EndTagOpen: end_tag_open_state(tokenizer); break; diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 8b844cd..e2c17f9 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -6,7 +6,11 @@ enum TokenizerFlag { TokenizerFlag_None = 0, + + /// When set, the tokenizer will emit the HtmlToken stored in "last". TokenizerFlag_Emit = 1, + + /// When set, the tokenizer will not increment the pointer when it attempts its next read. TokenizerFlag_NoIncrement = 2 }; @@ -20,7 +24,14 @@ enum TokenizerState { TokenizerState_TagName, TokenizerState_BogusComment, TokenizerState_CommentState, - TokenizerState_BeforeAttribute, + TokenizerState_BeforeAttributeName, + TokenizerState_AttributeNameState, + TokenizerState_AfterAttributeNameState, + TokenizerState_BeforeAttributeValueState, + TokenizerState_AttributeValueUnquoted, + TokenizerState_AttributeValueDoubleQuoted, + TokenizerState_AttributeValueSingleQuoted, + TokenizerState_AfterAttributeValueQuoted, TokenizerState_SelfClosingStartTag }; -- cgit v1.2.1