Diffstat (limited to 'src/tokenizer.cpp')
-rw-r--r--  src/tokenizer.cpp  268
1 file changed, 268 insertions, 0 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..30ca6bd
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,268 @@
+#include "tokenizer.hpp"
+#include "code_point.h"
+#include "html_token.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <matte/logger.h>
+
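+/// Build a Tokenizer over a NUL-terminated input buffer, starting in the data state.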
+Tokenizer create(code_point_t* value) {
+ Tokenizer t;
+ t.ptr = value;
+ t.length = strlen((char*)value);
+ t.state = TokenizerState_Data;
+ return t;
+}
+
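+// Named code points referenced by the tokenizer states below.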
+namespace CodePoints {
+ const code_point_t TAB = 0x0009;
+ const code_point_t LF = 0x000A;
+ const code_point_t FF = 0x000C;
+ const code_point_t SPACE = 0x0020;
+ const code_point_t SOLIDUS = 0x002F;
+ const code_point_t LOWERCASE_A = 0x0061;
+ const code_point_t LOWERCASE_Z = 0x007A;
+ const code_point_t LOWERCASE_F = 0x0066;
+ const code_point_t UPPERCASE_A = 0x0041;
+ const code_point_t UPPERCASE_Z = 0x005A;
+ const code_point_t UPPERCASE_F = 0x0046;
+ const code_point_t NULL_CHAR = 0x0000;
+ const code_point_t REPLACEMENT_CHAR = 0xFFFD;
+ const code_point_t GREATER_THAN_SIGN = 0x003E;
+ const code_point_t LESS_THAN_SIGN = 0x003C;
+ const code_point_t AMPERSAND = 0x0026;
+ const code_point_t EXCLAMATION_MARK = 0x0021;
+ const code_point_t NUMBER_SIGN = 0x0023;
+ const code_point_t LOWERCASE_X = 0x0078;
+ const code_point_t UPPERCASE_X = 0x0058;
+ const code_point_t DIGIT_ZERO = 0x0030;
+ const code_point_t DIGIT_NINE = 0x0039;
+}
+
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+
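+/// Advance the read pointer to the next input code point.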
+inline void consume_next(Tokenizer* tokenizer) {
+ tokenizer->ptr++;
+}
+
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
+inline void data_state(Tokenizer* tokenizer) {
+ auto c = *tokenizer->ptr;
+ switch (c) {
+ case CodePoints::AMPERSAND:
+ tokenizer->state = TokenizerState_CharacterReferenceInData;
+ break;
+ case CodePoints::LESS_THAN_SIGN:
+ tokenizer->state = TokenizerState_TagOpen;
+ break;
+ case CodePoints::NULL_CHAR:
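+ // NUL terminates the input buffer (create() measures it with strlen), so treat it as end of input.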
+ tokenizer->last.type = HtmlTokenType_EOF;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
+ break;
+ default:
+ // TODO: @Error Per the spec, an actual U+0000 here is a parse error; NUL is currently treated as end of input above
+ tokenizer->last.type = HtmlTokenType_Character;
+ tokenizer->last.character_token = *tokenizer->ptr;
+ tokenizer->flag |= TokenizerFlag_Emit;
+ break;
+ }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
+inline void tag_open_state(Tokenizer* tokenizer) {
+ code_point_t c = *tokenizer->ptr;
+ switch (c) {
+ case CodePoints::EXCLAMATION_MARK:
+ tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
+ break;
+ case CodePoints::SOLIDUS: // U+002F
+ tokenizer->state = TokenizerState_EndTagOpen;
+ break;
+ default:
+ // TODO: In these two cases, we do NOT want to emit the token just yet.
+ if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+ c = c + 0x0020; // To lower
+ tokenizer->state = TokenizerState_TagName;
+ tokenizer->last.append_to_tag_name(c);
+ tokenizer->last.type = HtmlTokenType_StartTag;
+ }
+ else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) {
+ tokenizer->state = TokenizerState_TagName;
+ tokenizer->last.append_to_tag_name(c);
+ tokenizer->last.type = HtmlTokenType_StartTag;
+ }
+ else if (c == '?') { // U+003F
+ // TODO: Parse error
+ tokenizer->state = TokenizerState_BogusComment; // TODO:
+ }
+ else {
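+ // TODO: Per the spec, "anything else" here is a parse error: emit a U+003C character token and reconsume in the data state.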
+ tokenizer->state = TokenizerState_Data;
+ }
+ }
+}
+
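+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state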
+inline void tag_name_state(Tokenizer* tokenizer) {
+ code_point_t c = *tokenizer->ptr;
+ switch (c) {
+ case CodePoints::TAB:
+ case CodePoints::FF:
+ case CodePoints::LF:
+ case CodePoints::SPACE:
+ tokenizer->state = TokenizerState_BeforeAttribute;
+ tokenizer->flag |= TokenizerFlag_Emit;
+ break;
+ case CodePoints::SOLIDUS:
+ tokenizer->state = TokenizerState_SelfClosingStartTag;
+ break;
+ case CodePoints::GREATER_THAN_SIGN:
+ tokenizer->state = TokenizerState_Data;
+ tokenizer->flag |= TokenizerFlag_Emit;
+ break;
+ default:
+ if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+ c = c + 0x0020; // To lower
+ tokenizer->last.append_to_tag_name(c);
+ }
+ else if (c == CodePoints::NULL_CHAR) {
+ // TODO: @Error
+ tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR);
+ }
+ else if (c == EOF) {
+ // TODO: @Error
+ tokenizer->state = TokenizerState_Data;
+ tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ }
+ else {
+ tokenizer->last.append_to_tag_name(c);
+ }
+ break;
+ }
+}
+
+/// Process the end tag open state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
+inline void end_tag_open_state(Tokenizer* tokenizer) {
+ code_point_t c = *tokenizer->ptr;
+ if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+ c = c + 0x0020; // To lower
+ tokenizer->last.type = HtmlTokenType_EndTag;
+ tokenizer->last.append_to_tag_name(c);
+ tokenizer->state = TokenizerState_TagName;
+ }
+ else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) {
+ tokenizer->last.type = HtmlTokenType_EndTag;
+ tokenizer->last.append_to_tag_name(c);
+ tokenizer->state = TokenizerState_TagName;
+ }
+ else if (c == CodePoints::GREATER_THAN_SIGN) {
+ // TODO: @Error We got something like </>
+ tokenizer->state = TokenizerState_Data;
+ }
+ else if (c == EOF) {
+ // TODO: @Error
+ // TODO: @Question Emit two tokens?
+ tokenizer->last.type = HtmlTokenType_Character;
+ tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN;
+
+ tokenizer->last.type = HtmlTokenType_Character;
+ tokenizer->last.character_token = CodePoints::SOLIDUS;
+
+ tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ }
+ else if (c == CodePoints::NULL_CHAR) {
+ // TODO: @Error
+ tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR);
+ }
+ else {
+ // TODO: @Error
+ tokenizer->state = TokenizerState_BogusComment;
+ }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+inline void try_consume_character_reference(Tokenizer* tokenizer) {
+ code_point_t c = *tokenizer->ptr;
+
+ switch (c) {
+ case CodePoints::TAB:
+ case CodePoints::LF:
+ case CodePoints::FF:
+ case CodePoints::SPACE:
+ case CodePoints::LESS_THAN_SIGN:
+ case CodePoints::AMPERSAND:
+ case EOF: {
+ // TODO: The additional allowed character?
+ // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
+ tokenizer->state = TokenizerState_Data;
+ tokenizer->flag &= ~TokenizerFlag_IncrementPtr; // do not consume the current code point
+ break;
+ }
+ case CodePoints::NUMBER_SIGN: {
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+
+ bool is_hex = false; // When true, the following digits are read as hexadecimal (0-9, A-F); otherwise as decimal (0-9)
+ switch (c) {
+ case CodePoints::UPPERCASE_X:
+ case CodePoints::LOWERCASE_X: {
+ consume_next(tokenizer);
+ is_hex = true;
+ break;
+ }
+ default: {
+ is_hex = false;
+ break;
+ }
+ }
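+ // TODO: Consume the digit run that follows (hex when is_hex is set, decimal otherwise) and resolve it to a code point.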
+ break;
+ }
+ }
+}
+
+/// TODO:
+inline void character_reference_in_data_state(Tokenizer* tokenizer) {
+ // A character reference begins with an ampersand
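+ // TODO: Per the spec, attempt to consume a character reference here; if nothing
+ // is returned, emit a U+0026 AMPERSAND character token, then switch to the data state.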
+ code_point_t c = *tokenizer->ptr;
+}
+
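+/// Run the state machine until a state sets TokenizerFlag_Emit, advancing the input
+/// pointer after each step unless the state clears TokenizerFlag_IncrementPtr.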
+HtmlToken read_next(Tokenizer* tokenizer) {
+ tokenizer->flag = TokenizerFlag_None;
+ tokenizer->last.reset();
+ tokenizer->state = TokenizerState_Data;
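+ // TODO: @Question Resetting to the data state on every call will drop tag context (e.g. before-attribute) once those states are implemented.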
+ do {
+ // Reset all flags; the pointer advances after each state unless the state clears IncrementPtr
+ tokenizer->flag = TokenizerFlag_IncrementPtr;
+
+ switch (tokenizer->state) {
+ case TokenizerState_Data:
+ data_state(tokenizer);
+ break;
+ case TokenizerState_TagOpen:
+ tag_open_state(tokenizer);
+ break;
+ case TokenizerState_TagName:
+ tag_name_state(tokenizer);
+ break;
+ case TokenizerState_EndTagOpen:
+ end_tag_open_state(tokenizer);
+ break;
+ case TokenizerState_CharacterReferenceInData:
+ character_reference_in_data_state(tokenizer);
+ break;
+ default:
+ logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
+ exit(1);
+ }
+
+ if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
+ tokenizer->ptr++;
+ }
+
+ if (tokenizer->flag & TokenizerFlag_Emit) {
+ break;
+ }
+ } while (true);
+
+ return tokenizer->last;
+}
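+
+// Example usage (a sketch; assumes code_point_t is byte-sized so the strlen()
+// cast in create() is valid):
+//   char input[] = "<div>hi</div>";
+//   Tokenizer t = create((code_point_t*)input);
+//   HtmlToken tok = read_next(&t); // first call yields the "div" start tag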