summaryrefslogtreecommitdiff
path: root/src/tokenizer.cpp
diff options
context:
space:
mode:
authormattkae <mattkae@protonmail.com>2023-06-22 13:29:49 -0400
committermattkae <mattkae@protonmail.com>2023-06-22 13:29:49 -0400
commitd53f2e7107cf63669b705c3abf08c129eeb0315e (patch)
tree5dadce890576c23e462b5e3dde69542868efe912 /src/tokenizer.cpp
parent04206efb102990c2ce0fa26be30db803cdc816be (diff)
Parsing most html entities except for the last category
Diffstat (limited to 'src/tokenizer.cpp')
-rw-r--r--src/tokenizer.cpp90
1 files changed, 43 insertions, 47 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 8cf9b31..9931d59 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -15,49 +15,15 @@ Tokenizer create(code_point_t* value) {
return t;
}
-namespace CodePoints {
- const code_point_t TAB = 0x0009;
- const code_point_t LF = 0x000A;
- const code_point_t FF = 0x00C;
- const code_point_t SPACE = 0x020;
- const code_point_t SOLIDUS = 0x02F;
- const code_point_t LOWERCASE_A = 0x0061;
- const code_point_t LOWERCASE_Z = 0x007A;
- const code_point_t LOWERCASE_F = 0x0066;
- const code_point_t UPPERCASE_A = 0x041;
- const code_point_t UPPERCASE_Z = 0x05A;
- const code_point_t UPPERCASE_F = 0x0046;
- const code_point_t NULL_CHAR = 0x0000;
- const code_point_t REPLACEMENT_CHAR = 0xFFFD;
- const code_point_t GREATER_THAN_SIGN = 0x003E;
- const code_point_t LESS_THAN_SIGN = 0x003C;
- const code_point_t AMPERSAND = 0x0026;
- const code_point_t EXCLAMATION_MARK = 0x0021;
- const code_point_t NUMBER_SIGN = 0x0023;
- const code_point_t LOWERCASE_X = 0x0078;
- const code_point_t UPPERCASE_X = 0x0058;
- const code_point_t DIGIT_ZERO = 0x0030;
- const code_point_t DIGIT_NINE = 0x0039;
-
- inline bool is_decimal(code_point_t c) {
- return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
- }
-
- inline bool is_hex(code_point_t c) {
- return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
- || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
- || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
- }
-};
-
// Helpers
-
+/// Consumes the next token by incrementing the ptr.
inline void consume_next(Tokenizer* tokenizer) {
// Step ptr forward by exactly one element. No end-of-input check is made
// here, so the caller is responsible for not running past the buffer.
tokenizer->ptr++;
}
+// Unconsumes the previous token by decrementing the ptr.
inline void unconsume_previous(Tokenizer* tokenizer) {
// Step ptr back by exactly one element, undoing a single consume_next.
// No start-of-input check is made here.
tokenizer->ptr--;
}
@@ -148,7 +114,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
else if (c == EOF) {
// TODO: @Error
tokenizer->state = TokenizerState_Data;
- tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr;
}
else {
tokenizer->last.append_to_tag_name(c);
@@ -185,7 +151,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
tokenizer->last.type = HtmlTokenType_Character;
tokenizer->last.character_token = CodePoints::SOLIDUS;
- tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr;
}
else if (c == CodePoints::NULL_CHAR) {
// TODO: @Error
@@ -198,7 +164,9 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
-inline void try_consume_character_reference(Tokenizer* tokenizer) {
+/// Attempts to consume a character reference from the current tokenizer. If one cannot
+/// be consumed, false is returned, otherwise true.
+inline bool try_consume_character_reference(Tokenizer* tokenizer) {
code_point_t c = *tokenizer->ptr;
switch (c) {
@@ -212,16 +180,16 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
// TODO: The additional allowed character?
// Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
tokenizer->state = TokenizerState_Data;
- tokenizer->flag &= TokenizerFlag_IncrementPtr;
- break;
+ return true;
}
case CodePoints::NUMBER_SIGN: {
consume_next(tokenizer);
c = *tokenizer->ptr;
- bool none_match_range = false;
+ bool is_hex_value = false;
code_point_t value = 0x0000;
if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
+ is_hex_value = true;
consume_next(tokenizer);
c = *tokenizer->ptr;
@@ -229,10 +197,12 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
unconsume_previous(tokenizer); // X
unconsume_previous(tokenizer); // Number sign
// TODO: @Error parse error
- return;
+ return false;
}
+
while (CodePoints::is_hex(c)) {
+ tokenizer->last.append_to_code_entity(c);
consume_next(tokenizer);
c = *tokenizer->ptr;
}
@@ -241,16 +211,41 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
if (!CodePoints::is_decimal(c)) {
unconsume_previous(tokenizer); // Number sign
// TODO: @Error parse error
- return;
+ return false;
}
while (CodePoints::is_decimal(c)) {
+ tokenizer->last.append_to_code_entity(c);
consume_next(tokenizer);
c = *tokenizer->ptr;
}
}
- break;
+ // We should have the numeric (hex or decimal) value now.
+ if (c != CodePoints::SEMICOLON) {
+ // TODO: @Error parse error
+ return false;
+ }
+
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
+ printf("%d\n", code_entity);
+
+ auto is_parse_erorr = !CodePoints::try_get_character_ref(code_entity, code_entity);
+ if (is_parse_erorr) {
+ // TODO: @Error
+ return false;
+ }
+
+ return true;
+ }
+ default: {
+ // TODO: Tedious work lies ahead.
+ // Otherwise try and find the string by name in this table
+ // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
+ logger_error("Unsupported character reference");
+ return false;
}
}
}
@@ -259,6 +254,7 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
// A character reference begins with an ampersand
code_point_t c = *tokenizer->ptr;
+ try_consume_character_reference(tokenizer);
}
HtmlToken read_next(Tokenizer* tokenizer) {
@@ -267,7 +263,7 @@ HtmlToken read_next(Tokenizer* tokenizer) {
tokenizer->state = TokenizerState_Data;
do {
// Reset all flags; the ptr advances by default unless a state sets DecrementPtr
- tokenizer->flag = 0 | TokenizerFlag_IncrementPtr;
+ tokenizer->flag = 0;
switch (tokenizer->state) {
case TokenizerState_Data:
@@ -290,7 +286,7 @@ HtmlToken read_next(Tokenizer* tokenizer) {
exit(1);
}
- if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
+ if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) {
tokenizer->ptr++;
}