summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mattkae <mattkae@protonmail.com> 2023-06-23 10:25:52 -0400
committer mattkae <mattkae@protonmail.com> 2023-06-23 10:25:52 -0400
commit 4feb59d831d395369aa21d77e9b9d293125421d1 (patch)
tree 7657a6ea15fc6a873c89cb2d03b75f56767bae71
parent 29e03ef74a814cb31a0ae53192e25cc75b638256 (diff)
Able to parse double quoted HTML attributes (HEAD, master)
-rw-r--r-- examples/1.html 2
-rw-r--r-- src/code_point.h 10
-rw-r--r-- src/html_token.cpp 9
-rw-r--r-- src/html_token.hpp 31
-rw-r--r-- src/tokenizer.cpp 392
-rw-r--r-- src/tokenizer.hpp 13
6 files changed, 359 insertions, 98 deletions
diff --git a/examples/1.html b/examples/1.html
index f83a60a..f146e41 100644
--- a/examples/1.html
+++ b/examples/1.html
@@ -1,7 +1,7 @@
Meow is < bark
<div>
- <h1>Hello World</h1>
+ <h1 id="10" class="hello-tag">Hello World</h1>
<p>
I am in a paragraph &#169;
</p>
diff --git a/src/code_point.h b/src/code_point.h
index c039907..59afd75 100644
--- a/src/code_point.h
+++ b/src/code_point.h
@@ -2,6 +2,7 @@
#define CODE_POINT_H
#include <cstdint>
+#include <cstdio>
typedef wchar_t code_point_t;
@@ -26,6 +27,8 @@ namespace CodePoints {
const code_point_t REPLACEMENT_CHAR = 0xFFFD;
const code_point_t GREATER_THAN_SIGN = 0x003E;
const code_point_t LESS_THAN_SIGN = 0x003C;
+ const code_point_t EQUALS_SIGN = 0x003D;
+ const code_point_t GRAVE_ACCENT = 0x0060;
const code_point_t AMPERSAND = 0x0026;
const code_point_t EXCLAMATION_MARK = 0x0021;
const code_point_t NUMBER_SIGN = 0x0023;
@@ -34,6 +37,9 @@ namespace CodePoints {
const code_point_t DIGIT_ZERO = 0x0030;
const code_point_t DIGIT_NINE = 0x0039;
const code_point_t SEMICOLON = 0x003B;
+ const code_point_t QUOTATION_MARK = 0x0022;
+ const code_point_t APOSTROPHE = 0x0027;
+ const code_point_t MY_EOF = EOF;
inline bool is_decimal(code_point_t c) {
return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
@@ -153,6 +159,10 @@ namespace CodePoints {
out = c;
return true;
}
+
+ /// Maps an ASCII uppercase letter (U+0041..U+005A) to its lowercase
+ /// counterpart by adding 0x0020. Any other code point is returned
+ /// unchanged, so the function is safe to call without a prior range check
+ /// (previously a non-uppercase input was silently turned into a different
+ /// character).
+ inline code_point_t to_lower_case(code_point_t c) {
+     const bool is_ascii_upper = c >= 0x0041 && c <= 0x005A;
+     return is_ascii_upper ? static_cast<code_point_t>(c + 0x0020) : c;
+ }
};
#endif
diff --git a/src/html_token.cpp b/src/html_token.cpp
index 1d0952d..1eabaa8 100644
--- a/src/html_token.cpp
+++ b/src/html_token.cpp
@@ -5,7 +5,6 @@ const char* TOKEN_TO_NAME_MAP[HtmlTokenType_Length] = {
"Text",
"Start Tag",
"End Tag",
- "Attribute",
"EOF",
"Character"
};
@@ -18,7 +17,12 @@ void HtmlToken::print() {
break;
case HtmlTokenType_StartTag:
case HtmlTokenType_EndTag:
- logger_info("%s, %S", name, tag_name.c_str());
+ logger_info("%s, %S, attributes: %lu", name, tag_name.c_str(), attributes.size());
+
+ for (auto i = 0; i < attributes.size(); i++) {
+ HtmlAttribute& attribute = attributes[i];
+ printf("\tattribute: %S=%S\n", attribute.name.c_str(), attribute.value.c_str());
+ }
break;
default:
logger_info("%s", name);
@@ -29,4 +33,5 @@ void HtmlToken::reset() {
type = HtmlTokenType_None;
tag_name.clear();
code_entity.clear();
+ attributes.clear();
}
diff --git a/src/html_token.hpp b/src/html_token.hpp
index e691d21..09a5c98 100644
--- a/src/html_token.hpp
+++ b/src/html_token.hpp
@@ -3,17 +3,22 @@
#include <string>
#include "code_point.h"
+#include <vector>
enum HtmlTokenType {
HtmlTokenType_None = 0,
HtmlTokenType_StartTag,
HtmlTokenType_EndTag,
- HtmlTokenType_Attribute,
HtmlTokenType_EOF,
HtmlTokenType_Character,
HtmlTokenType_Length
};
+struct HtmlAttribute { // One name/value pair; HtmlToken keeps these in its `attributes` vector.
+ std::wstring name; // Attribute name, built up one code point at a time.
+ std::wstring value; // Attribute value; empty until value characters are appended.
+};
+
struct HtmlToken {
HtmlTokenType type;
@@ -21,19 +26,34 @@ struct HtmlToken {
char character_token;
std::wstring tag_name;
std::wstring code_entity;
+ std::vector<HtmlAttribute> attributes;
+
+ HtmlAttribute* active_attribute;
+
+ code_point_t entity;
void append_to_tag_name(code_point_t c) {
tag_name += c;
}
- void append_to_code_entity(code_point_t c) {
- code_entity += c;
+ /// Appends `c` to the name of the attribute `active_attribute` points at.
+ /// The pointer is dereferenced unchecked, so `start_attribute()` must have
+ /// been called first.
+ void add_to_attribute_name(code_point_t c) {
+     active_attribute->name.push_back(c);
+ }
+
+ /// Appends `c` to the value of the attribute `active_attribute` points at.
+ /// The pointer is dereferenced unchecked, so `start_attribute()` must have
+ /// been called first.
+ void add_to_attribute_value(code_point_t c) {
+     active_attribute->value.push_back(c);
+ }
+
+ /// Appends a new, empty attribute to `attributes` and makes it the target
+ /// of subsequent add_to_attribute_name / add_to_attribute_value calls.
+ void start_attribute() {
+     attributes.emplace_back();
+     // Take the address after the insertion: growing the vector may move its elements.
+     active_attribute = &attributes.back();
}
/// Transforms the code_entity into a usable value.
/// Note that we are assuming that the code_entity is
/// valid at this point in time.
- code_point_t code_entity_to_value(bool is_hex) {
+ void set_code_entity_to_value(const std::wstring& code_entity, bool is_hex) {
code_point_t value = 0x0000;
if (is_hex) {
int multiplier = 1;
@@ -64,7 +84,8 @@ struct HtmlToken {
multiplier *= 10;
}
}
- return value;
+
+ entity = value;
}
void print();
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index dc0b8d7..2360c3c 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -34,6 +34,91 @@ inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
tokenizer->flag |= TokenizerFlag_Emit;
}
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+/// Attempts to consume a character reference; the tokenizer's current character
+/// is expected to be the one that follows the '&'.
+///
+/// Returns true when
+///  - a numeric reference ("&#NNN;" or "&#xHHH;") was consumed and accepted by
+///    CodePoints::try_get_character_ref; the result is in tokenizer->last.entity; or
+///  - the current character cannot start a reference (tab, LF, FF, space, '<',
+///    '&', EOF). In that case nothing is consumed, tokenizer->last.entity is NOT
+///    written, and the tokenizer is switched to the data state. Callers that are
+///    not in the data state must filter these characters out before calling.
+///
+/// Returns false when
+///  - '#' (or "#x") is not followed by a digit; the consumed characters are
+///    unconsumed again;
+///  - the digits are not terminated by ';' (the digits stay consumed);
+///  - CodePoints::try_get_character_ref rejects the value;
+///  - the reference is a named one, which is not supported yet.
+inline bool try_consume_character_reference(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::AMPERSAND:
+    case CodePoints::MY_EOF: {
+        // TODO: The additional allowed character?
+        // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
+        tokenizer->state = TokenizerState_Data;
+        return true;
+    }
+    case CodePoints::NUMBER_SIGN: {
+        consume_next(tokenizer);
+        c = *tokenizer->ptr;
+
+        // An 'x' or 'X' right after the number sign selects hexadecimal digits.
+        const bool is_hex_value = c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X;
+        if (is_hex_value) {
+            consume_next(tokenizer);
+            c = *tokenizer->ptr;
+        }
+
+        const bool starts_with_digit = is_hex_value ? CodePoints::is_hex(c) : CodePoints::is_decimal(c);
+        if (!starts_with_digit) {
+            if (is_hex_value) {
+                unconsume_previous(tokenizer); // X
+            }
+            unconsume_previous(tokenizer); // Number sign
+            // TODO: @Error parse error
+            return false;
+        }
+
+        // Collect the run of digits; the loop stops on the first non-digit.
+        std::wstring code_entity;
+        while (is_hex_value ? CodePoints::is_hex(c) : CodePoints::is_decimal(c)) {
+            code_entity += c;
+            consume_next(tokenizer);
+            c = *tokenizer->ptr;
+        }
+
+        // We should have the numeric value now; it must be closed by a semicolon.
+        if (c != CodePoints::SEMICOLON) {
+            // TODO: @Error parse error
+            return false;
+        }
+
+        consume_next(tokenizer);
+        tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value);
+
+        const bool is_parse_error = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity);
+        if (is_parse_error) {
+            // TODO: @Error
+            return false;
+        }
+
+        return true;
+    }
+    default: {
+        // TODO: Tedious work lies ahead.
+        // Otherwise try and find the string by name in this table
+        // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
+        logger_error("Unsupported character reference");
+        return false;
+    }
+    }
+}
+
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
inline void data_state(Tokenizer* tokenizer) {
@@ -99,8 +184,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
case CodePoints::FF:
case CodePoints::LF:
case CodePoints::SPACE:
- tokenizer->state = TokenizerState_BeforeAttribute;
- tokenizer->flag |= TokenizerFlag_Emit;
+ tokenizer->state = TokenizerState_BeforeAttributeName;
break;
case CodePoints::SOLIDUS:
tokenizer->state = TokenizerState_SelfClosingStartTag;
@@ -130,6 +214,208 @@ inline void tag_name_state(Tokenizer* tokenizer) {
}
}
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#before-attribute-name-state
+/// Handles the current character while the tokenizer is between a tag name (or
+/// a finished attribute) and the next attribute name.
+///
+/// Whitespace is skipped, '/' and '>' end the attribute list, EOF is handed to
+/// the data state, and every other character starts a new attribute on
+/// tokenizer->last and becomes the first character of its name.
+inline void before_attribute_name_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+        // Ignore the character
+        return;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        return;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        return;
+    case CodePoints::MY_EOF:
+        // TODO: @Error Parse error
+        // EOF must be looked at again by the data state, so the flag is OR-ed in.
+        // (Masking with '&' here would clear every flag and never set NoIncrement.)
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        return;
+    case CodePoints::NULL_CHAR:
+        // TODO: @Error Parse error.
+        // The spec starts an attribute whose name begins with U+FFFD.
+        c = CodePoints::REPLACEMENT_CHAR;
+        break;
+    case CodePoints::QUOTATION_MARK:
+    case CodePoints::APOSTROPHE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::EQUALS_SIGN:
+        // TODO: @Error Parse error
+        // Treated like any other character: it becomes the first name character.
+        break;
+    default:
+        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+            c = CodePoints::to_lower_case(c);
+        }
+        break;
+    }
+
+    // Everything that did not return above opens a new attribute.
+    tokenizer->last.start_attribute();
+    tokenizer->last.add_to_attribute_name(c);
+    tokenizer->state = TokenizerState_AttributeNameState;
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#attribute-name-state
+/// Handles the current character while an attribute name is being read.
+///
+/// Whitespace (tab, LF, FF and space) ends the name, '=' moves on to the
+/// value, '/' and '>' end the attribute list, EOF is handed to the data
+/// state, and every other character is appended to the active attribute's name.
+inline void attribute_name_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+        // SPACE belongs to this group; without it a space would be appended to the name.
+        // NOTE(review): no handler for AfterAttributeNameState is visible in
+        // read_next's dispatch in this change - confirm one exists.
+        tokenizer->state = TokenizerState_AfterAttributeNameState;
+        return;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        return;
+    case CodePoints::EQUALS_SIGN:
+        tokenizer->state = TokenizerState_BeforeAttributeValueState;
+        return;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        return;
+    case CodePoints::MY_EOF:
+        // TODO: @ParseError
+        // EOF is left in place so the data state sees it on the next read.
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        tokenizer->state = TokenizerState_Data;
+        return;
+    case CodePoints::NULL_CHAR:
+        // TODO: @ParseError
+        c = CodePoints::REPLACEMENT_CHAR;
+        break;
+    case CodePoints::QUOTATION_MARK:
+    case CodePoints::APOSTROPHE:
+    case CodePoints::LESS_THAN_SIGN:
+        // TODO: @ParseError
+        // Appended to the name unchanged.
+        break;
+    default:
+        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+            c = CodePoints::to_lower_case(c);
+        }
+        break;
+    }
+
+    tokenizer->last.add_to_attribute_name(c);
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#before-attribute-value-state
+/// Handles the current character right after the '=' of an attribute.
+///
+/// Whitespace is skipped, a quote selects the matching quoted-value state,
+/// '&' is handed to the unquoted state untouched, '>' emits the tag, EOF is
+/// handed to the data state, and every other character becomes the first
+/// character of an unquoted value.
+inline void before_attribute_value_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+        // Ignore the character
+        return;
+    case CodePoints::QUOTATION_MARK:
+        tokenizer->state = TokenizerState_AttributeValueDoubleQuoted;
+        return;
+    case CodePoints::APOSTROPHE:
+        tokenizer->state = TokenizerState_AttributeValueSingleQuoted;
+        return;
+    case CodePoints::AMPERSAND:
+        // The unquoted state has to see the '&' itself, so it is not consumed here.
+        tokenizer->state = TokenizerState_AttributeValueUnquoted;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        return;
+    case CodePoints::GREATER_THAN_SIGN:
+        // TODO: @ParseError
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        return;
+    case CodePoints::MY_EOF:
+        // TODO: @ParseError
+        // Per the spec EOF goes to the data state (not the unquoted state) and
+        // is looked at again there, like in the sibling attribute states.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        return;
+    case CodePoints::NULL_CHAR:
+        // TODO: @ParseError
+        c = CodePoints::REPLACEMENT_CHAR;
+        break;
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::EQUALS_SIGN:
+    case CodePoints::GRAVE_ACCENT:
+        // TODO: @ParseError
+        // Treated like any other character: it starts the unquoted value.
+        break;
+    default:
+        break;
+    }
+
+    tokenizer->state = TokenizerState_AttributeValueUnquoted;
+    tokenizer->last.add_to_attribute_value(c);
+}
+
+/// Handles the current character inside a double-quoted attribute value
+/// ("attribute value (double-quoted) state" of the HTML5 tokenization spec).
+///
+/// '"' closes the value, '&' may start a character reference, NUL is replaced
+/// by U+FFFD, EOF is handed to the data state, and every other character is
+/// appended to the active attribute's value.
+inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::QUOTATION_MARK:
+        tokenizer->state = TokenizerState_AfterAttributeValueQuoted;
+        break;
+    case CodePoints::AMPERSAND:
+        // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state
+        consume_next(tokenizer);
+
+        // After this case the current character is the first one that has not
+        // been consumed yet (the character after '&', or whatever follows the
+        // reference), so it must be read again instead of being skipped.
+        // NOTE(review): assumes consume_next() advances ptr by one and that
+        // read_next honours TokenizerFlag_NoIncrement as its comment states.
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+
+        switch (*tokenizer->ptr) {
+        case CodePoints::TAB:
+        case CodePoints::LF:
+        case CodePoints::FF:
+        case CodePoints::SPACE:
+        case CodePoints::LESS_THAN_SIGN:
+        case CodePoints::AMPERSAND:
+        case CodePoints::QUOTATION_MARK: // the "additional allowed character" of this state
+        case CodePoints::MY_EOF:
+            // Not a character reference, so the '&' is literal text. The helper
+            // must not be called for these: it would report success without
+            // writing last.entity and would switch the tokenizer to the data state.
+            tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND);
+            break;
+        default:
+            if (try_consume_character_reference(tokenizer)) {
+                tokenizer->last.add_to_attribute_value(tokenizer->last.entity);
+            } else {
+                tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND);
+            }
+            break;
+        }
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @ParseError
+        tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR);
+        break;
+    case CodePoints::MY_EOF:
+        // TODO: @ParseError
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_NoIncrement;
+        break;
+    default:
+        tokenizer->last.add_to_attribute_value(c);
+        break;
+    }
+}
+
+/// Handles the current character right after the closing quote of an
+/// attribute value ("after attribute value (quoted) state").
+///
+/// Whitespace leads to the next attribute name, '/' to the self-closing
+/// state, '>' emits the tag. EOF and any other character are left in place
+/// and handed to the data state or the before-attribute-name state.
+inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) {
+    const code_point_t current = *tokenizer->ptr;
+
+    const bool is_whitespace = current == CodePoints::TAB
+        || current == CodePoints::LF
+        || current == CodePoints::FF
+        || current == CodePoints::SPACE;
+    if (is_whitespace) {
+        tokenizer->state = TokenizerState_BeforeAttributeName;
+        return;
+    }
+
+    if (current == CodePoints::SOLIDUS) {
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        return;
+    }
+
+    if (current == CodePoints::GREATER_THAN_SIGN) {
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        return;
+    }
+
+    // TODO: @ParseError
+    // Both remaining cases keep the pointer where it is; only the state that
+    // gets to look at the character differs.
+    tokenizer->flag |= TokenizerFlag_NoIncrement;
+    tokenizer->state = (current == CodePoints::MY_EOF)
+        ? TokenizerState_Data
+        : TokenizerState_BeforeAttributeName;
+}
+
/// Process the end tag open state
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
inline void end_tag_open_state(Tokenizer* tokenizer) {
@@ -170,93 +456,6 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
}
}
-/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
-/// Attempts to consume a character reference from the current tokenizer. If one cannot
-/// be consumed, false is returned, otherwise true.
-inline bool try_consume_character_reference(Tokenizer* tokenizer) {
- code_point_t c = *tokenizer->ptr;
-
- switch (c) {
- case CodePoints::TAB:
- case CodePoints::LF:
- case CodePoints::FF:
- case CodePoints::SPACE:
- case CodePoints::LESS_THAN_SIGN:
- case CodePoints::AMPERSAND:
- case EOF: {
- // TODO: The additional allowed character?
- // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
- tokenizer->state = TokenizerState_Data;
- return true;
- }
- case CodePoints::NUMBER_SIGN: {
- consume_next(tokenizer);
- c = *tokenizer->ptr;
-
- bool is_hex_value = false;
- code_point_t value = 0x0000;
- if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
- is_hex_value = true;
- consume_next(tokenizer);
- c = *tokenizer->ptr;
-
- if (!CodePoints::is_hex(c)) {
- unconsume_previous(tokenizer); // X
- unconsume_previous(tokenizer); // Number sign
- // TODO: @Error parse error
- return false;
- }
-
-
- while (CodePoints::is_hex(c)) {
- tokenizer->last.append_to_code_entity(c);
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- }
- }
- else {
- if (!CodePoints::is_decimal(c)) {
- unconsume_previous(tokenizer); // Number sign
- // TODO: @Error parse error
- return false;
- }
-
- while (CodePoints::is_decimal(c)) {
- tokenizer->last.append_to_code_entity(c);
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- }
- }
-
- // We should have the hex value now.
- if (c != CodePoints::SEMICOLON) {
- // TODO: @Error parse error
- return false;
- }
-
- consume_next(tokenizer);
- c = *tokenizer->ptr;
- auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
- printf("%d\n", code_entity);
-
- auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity);
- if (is_parse_erorr) {
- // TODO: @Error
- return false;
- }
-
- return true;
- }
- default: {
- // TODO: Tedious work lies ahead.
- // Otherwise try and find the string by name in this table
- // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
- logger_error("Unsupported character reference");
- return false;
- }
- }
-}
-
/// TODO:
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
// A character reference begins with an ampersand
@@ -282,6 +481,21 @@ HtmlToken read_next(Tokenizer* tokenizer) {
case TokenizerState_TagName:
tag_name_state(tokenizer);
break;
+ case TokenizerState_BeforeAttributeName:
+ before_attribute_name_state(tokenizer);
+ break;
+ case TokenizerState_AttributeNameState:
+ attribute_name_state(tokenizer);
+ break;
+ case TokenizerState_BeforeAttributeValueState:
+ before_attribute_value_state(tokenizer);
+ break;
+ case TokenizerState_AttributeValueDoubleQuoted:
+ attribute_value_double_quoted_state(tokenizer);
+ break;
+ case TokenizerState_AfterAttributeValueQuoted:
+ after_attribute_value_quoted_state(tokenizer);
+ break;
case TokenizerState_EndTagOpen:
end_tag_open_state(tokenizer);
break;
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
index 8b844cd..e2c17f9 100644
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@@ -6,7 +6,11 @@
enum TokenizerFlag {
TokenizerFlag_None = 0,
+
+ /// When set, the tokenizer will emit the HtmlToken stored in "last".
TokenizerFlag_Emit = 1,
+
+ /// When set, the tokenizer will not increment the pointer when it attempts its next read.
TokenizerFlag_NoIncrement = 2
};
@@ -20,7 +24,14 @@ enum TokenizerState {
TokenizerState_TagName,
TokenizerState_BogusComment,
TokenizerState_CommentState,
- TokenizerState_BeforeAttribute,
+ TokenizerState_BeforeAttributeName,
+ TokenizerState_AttributeNameState,
+ TokenizerState_AfterAttributeNameState,
+ TokenizerState_BeforeAttributeValueState,
+ TokenizerState_AttributeValueUnquoted,
+ TokenizerState_AttributeValueDoubleQuoted,
+ TokenizerState_AttributeValueSingleQuoted,
+ TokenizerState_AfterAttributeValueQuoted,
TokenizerState_SelfClosingStartTag
};