summary | refs | log | tree | commit | diff
path: root/src/tokenizer.cpp
diff options
context:
space:
mode:
author mattkae <mattkae@protonmail.com> 2023-06-23 10:25:52 -0400
committer mattkae <mattkae@protonmail.com> 2023-06-23 10:25:52 -0400
commit 4feb59d831d395369aa21d77e9b9d293125421d1 (patch)
tree 7657a6ea15fc6a873c89cb2d03b75f56767bae71 /src/tokenizer.cpp
parent 29e03ef74a814cb31a0ae53192e25cc75b638256 (diff)
Able to parse double quoted HTML attributes (HEAD, master)
Diffstat (limited to 'src/tokenizer.cpp')
-rw-r--r-- src/tokenizer.cpp 392
1 files changed, 303 insertions, 89 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index dc0b8d7..2360c3c 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -34,6 +34,91 @@ inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
tokenizer->flag |= TokenizerFlag_Emit;
}
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+/// Attempts to consume a character reference beginning at the tokenizer's current
+/// character.
+///
+/// Returns true when a numeric reference was consumed and resolved; the caller then
+/// reads the result from tokenizer->last.entity. Returns false when nothing usable
+/// was found, in which case the caller is expected to treat the '&' literally.
+/// This function never changes tokenizer->state: choosing the next state is left to
+/// the calling state handler, so the helper is usable from attribute-value states.
+/// Named references are not supported yet and always yield false.
+inline bool try_consume_character_reference(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::AMPERSAND:
+    case EOF: {
+        // TODO: The additional allowed character?
+        // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
+        // Report "nothing consumed" so the caller emits the '&' itself, and leave the
+        // tokenizer state untouched so an attribute value is not abandoned mid-tag.
+        // NOTE(review): sibling states test CodePoints::MY_EOF rather than EOF;
+        // confirm both name the same end-of-input sentinel.
+        return false;
+    }
+    case CodePoints::NUMBER_SIGN: {
+        consume_next(tokenizer);
+        c = *tokenizer->ptr;
+
+        bool is_hex_value = false;
+        std::wstring code_entity;
+        if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
+            is_hex_value = true;
+            consume_next(tokenizer);
+            c = *tokenizer->ptr;
+
+            if (!CodePoints::is_hex(c)) {
+                unconsume_previous(tokenizer); // X
+                unconsume_previous(tokenizer); // Number sign
+                // TODO: @Error parse error
+                return false;
+            }
+
+            while (CodePoints::is_hex(c)) {
+                code_entity += c;
+                consume_next(tokenizer);
+                c = *tokenizer->ptr;
+            }
+        }
+        else {
+            if (!CodePoints::is_decimal(c)) {
+                unconsume_previous(tokenizer); // Number sign
+                // TODO: @Error parse error
+                return false;
+            }
+
+            while (CodePoints::is_decimal(c)) {
+                code_entity += c;
+                consume_next(tokenizer);
+                c = *tokenizer->ptr;
+            }
+        }
+
+        // All digits are collected. A terminating ';' is consumed when present; a
+        // missing ';' is only a parse error and the reference is still resolved, so
+        // the digits that were already consumed are not silently dropped.
+        if (c == CodePoints::SEMICOLON) {
+            consume_next(tokenizer);
+        }
+        else {
+            // TODO: @Error parse error
+        }
+
+        tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value);
+        const bool is_parse_error = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity);
+        if (is_parse_error) {
+            // TODO: @Error
+            // NOTE(review): the digits stay consumed on this path; confirm that is
+            // what callers expect when they fall back to a literal '&'.
+            return false;
+        }
+
+        return true;
+    }
+    default: {
+        // TODO: Tedious work lies ahead.
+        // Otherwise try and find the string by name in this table
+        // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
+        logger_error("Unsupported character reference");
+        return false;
+    }
+    }
+}
+
+
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
inline void data_state(Tokenizer* tokenizer) {
@@ -99,8 +184,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
case CodePoints::FF:
case CodePoints::LF:
case CodePoints::SPACE:
- tokenizer->state = TokenizerState_BeforeAttribute;
- tokenizer->flag |= TokenizerFlag_Emit;
+ tokenizer->state = TokenizerState_BeforeAttributeName;
break;
case CodePoints::SOLIDUS:
tokenizer->state = TokenizerState_SelfClosingStartTag;
@@ -130,6 +214,208 @@ inline void tag_name_state(Tokenizer* tokenizer) {
}
}
+/// Process the before attribute name state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#before-attribute-name-state
+/// Skips whitespace between attributes, closes or self-closes the tag, or starts a
+/// new attribute on tokenizer->last using the current character as the first
+/// character of its name.
+inline void before_attribute_name_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE: {
+        // Ignore the character
+        break;
+    }
+    case CodePoints::SOLIDUS: {
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        break;
+    }
+    case CodePoints::GREATER_THAN_SIGN: {
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    }
+    case CodePoints::NULL_CHAR: {
+        // TODO: @Error Parse error.
+        // The spec still starts an attribute here, named with U+FFFD.
+        tokenizer->last.start_attribute();
+        tokenizer->last.add_to_attribute_name(CodePoints::REPLACEMENT_CHAR);
+        tokenizer->state = TokenizerState_AttributeNameState;
+        break;
+    }
+    case CodePoints::MY_EOF: {
+        // TODO: @Error Parse error
+        // Reconsume the EOF in the data state: the flag must be OR-ed in so the
+        // other flags survive and the no-increment bit is actually set.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        break;
+    }
+    case CodePoints::QUOTATION_MARK:
+    case CodePoints::APOSTROPHE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::EQUALS_SIGN:
+        // TODO: @Error Parse error
+        // These are handled exactly like the "default" case, so the labels sit
+        // directly on top of it and the character becomes part of the name.
+    default: {
+        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+            c = CodePoints::to_lower_case(c);
+        }
+
+        tokenizer->last.start_attribute();
+        tokenizer->last.add_to_attribute_name(c);
+        tokenizer->state = TokenizerState_AttributeNameState;
+        break;
+    }
+    }
+}
+
+
+/// Process the attribute name state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#attribute-name-state
+/// Appends the current character to the name of the attribute being built on
+/// tokenizer->last, or leaves the state when the name is terminated.
+inline void attribute_name_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+        // Any whitespace ends the name; SPACE belongs here too, otherwise it
+        // would fall into the default branch and become part of the name.
+        tokenizer->state = TokenizerState_AfterAttributeNameState;
+        break;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        break;
+    case CodePoints::EQUALS_SIGN:
+        tokenizer->state = TokenizerState_BeforeAttributeValueState;
+        break;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @ParseError
+        tokenizer->last.add_to_attribute_name(CodePoints::REPLACEMENT_CHAR);
+        break;
+    case CodePoints::QUOTATION_MARK:
+    case CodePoints::APOSTROPHE:
+    case CodePoints::LESS_THAN_SIGN:
+        // TODO: @ParseError
+        // Still appended verbatim, just flagged as an error.
+        tokenizer->last.add_to_attribute_name(c);
+        break;
+    case CodePoints::MY_EOF:
+        // TODO: @ParseError
+        // Hand the EOF back to the data state without advancing.
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
+        tokenizer->state = TokenizerState_Data;
+        break;
+    default:
+        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+            c = CodePoints::to_lower_case(c);
+        }
+
+        tokenizer->last.add_to_attribute_name(c);
+        break;
+    }
+}
+
+
+/// Process the before attribute value state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#before-attribute-value-state
+/// Runs after the '=' of an attribute: skips whitespace, then picks the quoted or
+/// unquoted value state based on the current character.
+inline void before_attribute_value_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+        // Ignore the character
+        break;
+    case CodePoints::QUOTATION_MARK:
+        tokenizer->state = TokenizerState_AttributeValueDoubleQuoted;
+        break;
+    case CodePoints::AMPERSAND:
+        // The '&' is reconsumed by the unquoted state, which owns reference handling.
+        tokenizer->state = TokenizerState_AttributeValueUnquoted;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
+        break;
+    case CodePoints::APOSTROPHE:
+        tokenizer->state = TokenizerState_AttributeValueSingleQuoted;
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @ParseError
+        tokenizer->state = TokenizerState_AttributeValueUnquoted;
+        tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR);
+        break;
+    case CodePoints::GREATER_THAN_SIGN:
+        // TODO: @ParseError
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
+        break;
+    case CodePoints::MY_EOF:
+        // TODO: @ParseError
+        // EOF goes back to the data state and is reconsumed there; it is never
+        // the start of an unquoted value.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
+        break;
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::EQUALS_SIGN:
+        // TODO: @ParseError
+        // Parse errors, but otherwise handled like any other character: the
+        // labels sit directly on top of the default branch.
+    default:
+        if (c == CodePoints::GRAVE_ACCENT) {
+            // TODO: @ParseError
+        }
+        tokenizer->state = TokenizerState_AttributeValueUnquoted;
+        tokenizer->last.add_to_attribute_value(c);
+        break;
+    }
+}
+
+
+/// Process the attribute value (double-quoted) state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#attribute-value-double-quoted-state
+/// Accumulates characters into the current attribute's value until the closing
+/// quotation mark is reached.
+inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) {
+    code_point_t current = *tokenizer->ptr;
+
+    if (current == CodePoints::QUOTATION_MARK) {
+        // Closing quote: the value is complete.
+        tokenizer->state = TokenizerState_AfterAttributeValueQuoted;
+        return;
+    }
+
+    if (current == CodePoints::AMPERSAND) {
+        // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state
+        // Step past the '&' and let the shared helper try to resolve a reference.
+        // NOTE(review): how the tokenizer pointer is advanced after this handler
+        // returns is decided outside this function; confirm nothing is skipped.
+        consume_next(tokenizer);
+        const bool resolved = try_consume_character_reference(tokenizer);
+        if (resolved) {
+            tokenizer->last.add_to_attribute_value(tokenizer->last.entity);
+        }
+        else {
+            // No reference: the ampersand is kept as a literal character.
+            tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND);
+        }
+        return;
+    }
+
+    if (current == CodePoints::NULL_CHAR) {
+        // TODO: @ParseError
+        tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR);
+        return;
+    }
+
+    if (current == CodePoints::MY_EOF) {
+        // TODO: @ParseError
+        // Fall back to the data state and let it see the EOF again.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        return;
+    }
+
+    // Any other character is part of the value.
+    tokenizer->last.add_to_attribute_value(current);
+}
+
+
+/// Process the after attribute value (quoted) state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#after-attribute-value-quoted-state
+/// Decides what follows a closing quote: another attribute, a self-closing
+/// solidus, or the end of the tag.
+inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) {
+    code_point_t current = *tokenizer->ptr;
+
+    const bool is_whitespace = current == CodePoints::TAB
+        || current == CodePoints::LF
+        || current == CodePoints::FF
+        || current == CodePoints::SPACE;
+
+    if (is_whitespace) {
+        tokenizer->state = TokenizerState_BeforeAttributeName;
+    }
+    else if (current == CodePoints::SOLIDUS) {
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+    }
+    else if (current == CodePoints::GREATER_THAN_SIGN) {
+        // End of the tag: go back to data and flag the token for emission.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+    }
+    else if (current == CodePoints::MY_EOF) {
+        // TODO: @ParseError
+        // The EOF is looked at again by the data state.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+    }
+    else {
+        // TODO: @ParseError
+        // Missing whitespace between attributes: look at the same character again
+        // as the potential start of the next attribute name.
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        tokenizer->state = TokenizerState_BeforeAttributeName;
+    }
+}
+
+
/// Process the end tag open state
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
inline void end_tag_open_state(Tokenizer* tokenizer) {
@@ -170,93 +456,6 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
}
}
-/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
-/// Attempts to consume a character reference from the current tokenizer. If one cannot
-/// be consumed, false is returned, otherwise true.
-inline bool try_consume_character_reference(Tokenizer* tokenizer) {
- code_point_t c = *tokenizer->ptr;
-
- switch (c) {
- case CodePoints::TAB:
- case CodePoints::LF:
- case CodePoints::FF:
- case CodePoints::SPACE:
- case CodePoints::LESS_THAN_SIGN:
- case CodePoints::AMPERSAND:
- case EOF: {
- // TODO: The additional allowed character?
- // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
- tokenizer->state = TokenizerState_Data;
- return true;
- }
- case CodePoints::NUMBER_SIGN: {
- consume_next(tokenizer);
- c = *tokenizer->ptr;
-
- bool is_hex_value = false;
- code_point_t value = 0x0000;
- if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
- is_hex_value = true;
- consume_next(tokenizer);
- c = *tokenizer->ptr;
-
- if (!CodePoints::is_hex(c)) {
- unconsume_previous(tokenizer); // X
- unconsume_previous(tokenizer); // Number sign
- // TODO: @Error parse error
- return false;
- }
-
-
- while (CodePoints::is_hex(c)) {
- tokenizer->last.append_to_code_entity(c);
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- }
- }
- else {
- if (!CodePoints::is_decimal(c)) {
- unconsume_previous(tokenizer); // Number sign
- // TODO: @Error parse error
- return false;
- }
-
- while (CodePoints::is_decimal(c)) {
- tokenizer->last.append_to_code_entity(c);
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- }
- }
-
- // We should have the hex value now.
- if (c != CodePoints::SEMICOLON) {
- // TODO: @Error parse error
- return false;
- }
-
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
- printf("%d\n", code_entity);
-
- auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity);
- if (is_parse_erorr) {
- // TODO: @Error
- return false;
- }
-
- return true;
- }
- default: {
- // TODO: Tedious work lies ahead.
- // Otherwise try and find the string by name in this table
- // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
- logger_error("Unsupported character reference");
- return false;
- }
- }
-}
-
/// TODO:
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
// A character reference begins with an ampersand
@@ -282,6 +481,21 @@ HtmlToken read_next(Tokenizer* tokenizer) {
case TokenizerState_TagName:
tag_name_state(tokenizer);
break;
+ case TokenizerState_BeforeAttributeName:
+ before_attribute_name_state(tokenizer);
+ break;
+ case TokenizerState_AttributeNameState:
+ attribute_name_state(tokenizer);
+ break;
+ case TokenizerState_BeforeAttributeValueState:
+ before_attribute_value_state(tokenizer);
+ break;
+ case TokenizerState_AttributeValueDoubleQuoted:
+ attribute_value_double_quoted_state(tokenizer);
+ break;
+ case TokenizerState_AfterAttributeValueQuoted:
+ after_attribute_value_quoted_state(tokenizer);
+ break;
case TokenizerState_EndTagOpen:
end_tag_open_state(tokenizer);
break;