From 4058f9b1704322f8185136c2558c2ab96a4d835c Mon Sep 17 00:00:00 2001
From: mattkae <mattkae@protonmail.com>
Date: Sun, 23 Apr 2023 20:23:54 -0400
Subject: Initial commit with a working parser

---
 .gitignore         |   4 +
 .projectile        |   0
 Makefile           |  45 +++++++++
 examples/1.html    |   6 ++
 html-standard.pdf  | Bin 0 -> 14599212 bytes
 src/code_point.h   |   8 ++
 src/html_token.cpp |  31 +++++++
 src/html_token.hpp |  32 +++++++
 src/main.cpp       |  80 ++++++++++++++++
 src/tokenizer.cpp  | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/tokenizer.hpp  |  41 ++++++++
 11 files changed, 515 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .projectile
 create mode 100644 Makefile
 create mode 100644 examples/1.html
 create mode 100644 html-standard.pdf
 create mode 100644 src/code_point.h
 create mode 100644 src/html_token.cpp
 create mode 100644 src/html_token.hpp
 create mode 100644 src/main.cpp
 create mode 100644 src/tokenizer.cpp
 create mode 100644 src/tokenizer.hpp
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..704b56a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.cproject
+.project
+all
+build
\ No newline at end of file
diff --git a/.projectile b/.projectile
new file mode 100644
index 0000000..e69de29
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e453aa6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,51 @@
+# Build script for the html_parser executable.
+# Objects and dependency files are written under $(BUILD_DIR), mirroring $(SRC_DIRS).
+TARGET_EXEC ?= html_parser
+
+BUILD_DIR ?= ./build
+SRC_DIRS ?= ./src
+MKDIR_P ?= mkdir -p
+
+# CC links the program and compiles plain C sources; C++ sources use $(CXX).
+CC   := g++ -g
+# Give C++ objects debug info too, unless the caller supplies CXXFLAGS.
+CXXFLAGS ?= -g
+# The patterns are quoted so the shell passes them to find unexpanded.
+SRCS := $(shell find $(SRC_DIRS) -name '*.cpp' -or -name '*.c' -or -name '*.s')
+OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
+DEPS := $(OBJS:.o=.d)
+
+INC_DIRS := $(shell find $(SRC_DIRS) -type d)
+INC_FLAGS := $(addprefix -I,$(INC_DIRS))
+
+# -MMD -MP write a .d file beside each object so header edits trigger rebuilds.
+CPPFLAGS ?= $(INC_FLAGS) -MMD -MP
+
+.PHONY: all clean
+
+# Link step. Kept as the first real target so it stays the default goal.
+$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS)
+	$(CC) $(OBJS) -o $@ -lmatte
+
+# Convenience alias that builds the executable.
+all: $(BUILD_DIR)/$(TARGET_EXEC)
+
+# assembly
+$(BUILD_DIR)/%.s.o: %.s
+	$(MKDIR_P) $(dir $@)
+	$(AS) $(ASFLAGS) -c $< -o $@
+
+# c source
+$(BUILD_DIR)/%.c.o: %.c
+	$(MKDIR_P) $(dir $@)
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+# c++ source
+$(BUILD_DIR)/%.cpp.o: %.cpp
+	$(MKDIR_P) $(dir $@)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
+
+clean:
+	$(RM) -r $(BUILD_DIR)
+
+-include $(DEPS)
diff --git a/examples/1.html b/examples/1.html
new file mode 100644
index 0000000..06f1bd6
--- /dev/null
+++ b/examples/1.html
@@ -0,0 +1,6 @@
+<div>
+  <h1>Hello World</h1>
+  <p>
+    I am in a paragraph
+  </p>
+</div>
diff --git a/html-standard.pdf b/html-standard.pdf
new file mode 100644
index 0000000..d1a8614
Binary files /dev/null and b/html-standard.pdf differ
diff --git a/src/code_point.h b/src/code_point.h
new file mode 100644
index 0000000..dcfcd32
--- /dev/null
+++ b/src/code_point.h
@@ -0,0 +1,10 @@
+#ifndef CODE_POINT_H
+#define CODE_POINT_H
+
+#include <cstdint>
+
+/// Character unit consumed by the tokenizer.
+/// It aliases wchar_t, so its width is whatever the platform gives wchar_t.
+using code_point_t = wchar_t;
+
+#endif // CODE_POINT_H
diff --git a/src/html_token.cpp b/src/html_token.cpp
new file mode 100644
index 0000000..904e79a
--- /dev/null
+++ b/src/html_token.cpp
@@ -0,0 +1,41 @@
+#include "html_token.hpp"
+#include <matte/logger.h>
+
+/// Display label for each HtmlTokenType, indexed by the enum's value.
+/// Entries must stay in the same order as the enum declaration.
+const char* TOKEN_TO_NAME_MAP[HtmlTokenType_Length] = {
+    "None",
+    "Start Tag",
+    "End Tag",
+    "Attribute",
+    "EOF",
+    "Character"
+};
+
+/// Logs the token through logger_info: the type label, followed by the
+/// character for character tokens or the tag name for start and end tags.
+void HtmlToken::print() {
+    // Check the index first so an out-of-range type cannot read past the table.
+    const int index = static_cast<int>(type);
+    const bool known = index >= 0 && index < HtmlTokenType_Length;
+    const char* name = known ? TOKEN_TO_NAME_MAP[index] : "Unknown";
+    switch (type) {
+    case HtmlTokenType_Character:
+        logger_info("%s, %c", name, character_token);
+        break;
+    case HtmlTokenType_StartTag:
+    case HtmlTokenType_EndTag:
+        logger_info("%s, %s", name, tag_name.c_str());
+        break;
+    default:
+        logger_info("%s", name);
+        break;
+    }
+}
+
+/// Clears every field so the token can be reused for the next one.
+void HtmlToken::reset() {
+    type = HtmlTokenType_None;
+    character_token = 0;
+    tag_name.clear();
+}
diff --git a/src/html_token.hpp b/src/html_token.hpp
new file mode 100644
index 0000000..ee385ce
--- /dev/null
+++ b/src/html_token.hpp
@@ -0,0 +1,64 @@
+#ifndef HTML_TOKEN_HPP
+#define HTML_TOKEN_HPP
+
+#include <string>
+#include "code_point.h"
+
+/// Kinds of token the tokenizer can hand out.
+/// HtmlTokenType_Length is not a kind of token; it counts the entries above it.
+enum HtmlTokenType {
+    HtmlTokenType_None = 0,
+    HtmlTokenType_StartTag,
+    HtmlTokenType_EndTag,
+    HtmlTokenType_Attribute,
+    HtmlTokenType_EOF,
+    HtmlTokenType_Character,
+    HtmlTokenType_Length
+};
+
+/// One token: its kind plus the payload that kind uses.
+struct HtmlToken {
+    HtmlTokenType type = HtmlTokenType_None;
+
+    // TODO: Performance
+    /// Payload of a character token.
+    char character_token = 0;
+    /// Name of a start or end tag, held as UTF-8 bytes.
+    std::string tag_name;
+
+    /// Appends code point c to tag_name as UTF-8.
+    /// A std::string holds bytes, so adding c directly would keep only its low byte.
+    /// Surrogates and values above U+10FFFF are stored as U+FFFD.
+    void append_to_tag_name(code_point_t c) {
+        unsigned long cp = static_cast<unsigned long>(c);
+        if (cp > 0x10FFFFul || (cp >= 0xD800ul && cp <= 0xDFFFul)) {
+            cp = 0xFFFDul;
+        }
+
+        if (cp < 0x80ul) {
+            tag_name += static_cast<char>(cp);
+        }
+        else if (cp < 0x800ul) {
+            tag_name += static_cast<char>(0xC0ul | (cp >> 6));
+            tag_name += static_cast<char>(0x80ul | (cp & 0x3Ful));
+        }
+        else if (cp < 0x10000ul) {
+            tag_name += static_cast<char>(0xE0ul | (cp >> 12));
+            tag_name += static_cast<char>(0x80ul | ((cp >> 6) & 0x3Ful));
+            tag_name += static_cast<char>(0x80ul | (cp & 0x3Ful));
+        }
+        else {
+            tag_name += static_cast<char>(0xF0ul | (cp >> 18));
+            tag_name += static_cast<char>(0x80ul | ((cp >> 12) & 0x3Ful));
+            tag_name += static_cast<char>(0x80ul | ((cp >> 6) & 0x3Ful));
+            tag_name += static_cast<char>(0x80ul | (cp & 0x3Ful));
+        }
+    }
+
+    /// Logs the token's kind and payload.
+    void print();
+    /// Clears the token so it can be reused.
+    void reset();
+};
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..c92321c
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,98 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctype.h>
+#include <cwchar>
+#include <iostream>
+#include <vector>
+#include <matte/logger.h>
+#include <matte/types.h>
+#include <matte/list.h>
+#include "code_point.h"
+#include "html_token.hpp"
+#include "tokenizer.hpp"
+#include <argp.h>
+
+using namespace matte;
+
+const char *argp_program_version = "html_parser 0.1";
+const char *argp_program_bug_address = "<matthew@matthewkosarek.xyz>";
+static char doc[] = "A description of your program.";
+static char args_doc[] = "-f [FILENAME]...";
+static struct argp_option options[] = {
+    { "file", 'f', "FILE", 0, "File to parse."},
+    { 0 }
+};
+
+/// Values collected from the command line.
+struct Arguments {
+    /// Path given with -f/--file; null when the option is absent.
+    char* filename;
+};
+
+/// argp callback: stores the -f argument and ignores positional arguments.
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+    Arguments* a = static_cast<Arguments*>(state->input);
+    switch (key) {
+    case 'f':
+        a->filename = arg;
+        return 0;
+    case ARGP_KEY_ARG:
+        return 0;
+    default:
+        return ARGP_ERR_UNKNOWN;
+    }
+}
+
+static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 };
+
+/// Reads the file named by -f, tokenizes it and prints every token up to and
+/// including EOF. Returns EXIT_FAILURE when the arguments cannot be parsed,
+/// no file is given, or the file cannot be opened.
+int main(int argc, char *argv[]) {
+    Arguments arguments;
+    arguments.filename = nullptr;
+    const error_t error = argp_parse(&argp, argc, argv, 0, 0, &arguments);
+    if (error != 0) {
+        fprintf(stderr, "html_parser: %s\n", strerror(error));
+        return EXIT_FAILURE;
+    }
+
+    if (arguments.filename == nullptr) {
+        fprintf(stderr, "html_parser: no input file, pass one with -f FILE\n");
+        return EXIT_FAILURE;
+    }
+
+    FILE* file = fopen(arguments.filename, "rb");
+    if (file == nullptr) {
+        perror(arguments.filename);
+        return EXIT_FAILURE;
+    }
+
+    // The buffer grows with the input, so a file of any size fits.
+    std::vector<code_point_t> buffer;
+    // fgetwc reports end of input as WEOF, which is only guaranteed to fit in wint_t.
+    std::wint_t wc;
+    while ((wc = std::fgetwc(file)) != WEOF) {
+        buffer.push_back(static_cast<code_point_t>(wc));
+    }
+    if (ferror(file)) {
+        // Report the failure but still tokenize what was read.
+        perror(arguments.filename);
+    }
+    fclose(file);
+
+    // The tokenizer treats a zero code point as the end of its input.
+    buffer.push_back(0);
+
+    Tokenizer tokenizer = create(buffer.data());
+    while (true) {
+        auto token = read_next(&tokenizer);
+        token.print();
+        if (token.type == HtmlTokenType_EOF) {
+            break;
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..30ca6bd
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,286 @@
+#include "tokenizer.hpp"
+#include "code_point.h"
+#include "html_token.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cwchar>
+#include <matte/logger.h>
+
+/// Builds a tokenizer over a zero-terminated buffer of code points.
+/// The buffer is borrowed rather than copied, so it must outlive the tokenizer.
+/// A null buffer gives a tokenizer whose only token is EOF.
+Tokenizer create(code_point_t* value) {
+    Tokenizer t;
+    t.ptr = value;
+    // The buffer holds wide characters; a byte-wise strlen stops at the first zero byte.
+    t.length = (value != nullptr) ? std::wcslen(value) : 0;
+    t.state = TokenizerState_Data;
+    return t;
+}
+
+namespace CodePoints {
+    const code_point_t TAB = 0x0009;
+    const code_point_t LF = 0x000A;
+    const code_point_t FF = 0x00C;
+    const code_point_t SPACE = 0x020;
+    const code_point_t SOLIDUS = 0x02F;
+    const code_point_t QUESTION_MARK = 0x003F;
+    const code_point_t LOWERCASE_A = 0x0061;
+    const code_point_t LOWERCASE_Z = 0x007A;
+    const code_point_t LOWERCASE_F = 0x0066;
+    const code_point_t UPPERCASE_A = 0x041;
+    const code_point_t UPPERCASE_Z = 0x05A;
+    const code_point_t UPPERCASE_F = 0x0046;
+    const code_point_t NULL_CHAR = 0x0000;
+    const code_point_t REPLACEMENT_CHAR = 0xFFFD;
+    const code_point_t GREATER_THAN_SIGN = 0x003E;
+    const code_point_t LESS_THAN_SIGN = 0x003C;
+    const code_point_t AMPERSAND = 0x0026;
+    const code_point_t EXCLAMATION_MARK = 0x0021;
+    const code_point_t NUMBER_SIGN = 0x0023;
+    const code_point_t LOWERCASE_X = 0x0078;
+    const code_point_t UPPERCASE_X = 0x0058;
+    const code_point_t DIGIT_ZERO = 0x0030;
+    const code_point_t DIGIT_NINE = 0x0039;
+}
+
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+
+/// Moves the cursor to the next code point.
+inline void consume_next(Tokenizer* tokenizer) {
+    tokenizer->ptr++;
+}
+
+/// Keeps the cursor on the current code point so the next state sees it again.
+static inline void reconsume(Tokenizer* tokenizer) {
+    tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+}
+
+/// Makes the pending token a character token for c and marks it ready to return.
+static inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
+    tokenizer->last.type = HtmlTokenType_Character;
+    tokenizer->last.character_token = c;
+    tokenizer->flag |= TokenizerFlag_Emit;
+}
+
+/// True for the ASCII letters A-Z and a-z.
+static inline bool is_ascii_letter(code_point_t c) {
+    return (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z)
+        || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z);
+}
+
+/// Maps A-Z to a-z and returns every other code point unchanged.
+static inline code_point_t to_ascii_lower(code_point_t c) {
+    if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+        return c + 0x0020;
+    }
+    return c;
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
+inline void data_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::AMPERSAND:
+        tokenizer->state = TokenizerState_CharacterReferenceInData;
+        break;
+    case CodePoints::LESS_THAN_SIGN:
+        tokenizer->state = TokenizerState_TagOpen;
+        break;
+    case CodePoints::NULL_CHAR:
+        // The zero terminator marks the end of the input.
+        tokenizer->last.type = HtmlTokenType_EOF;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        // Stay on the terminator: further calls keep returning EOF instead of reading past the buffer.
+        reconsume(tokenizer);
+        break;
+    default:
+        emit_character(tokenizer, c);
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
+inline void tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::EXCLAMATION_MARK:
+        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
+        break;
+    case CodePoints::SOLIDUS: // U+002F
+        tokenizer->state = TokenizerState_EndTagOpen;
+        break;
+    case CodePoints::QUESTION_MARK: // U+003F
+        // TODO: Parse error
+        tokenizer->state = TokenizerState_BogusComment; // TODO:
+        break;
+    default:
+        if (is_ascii_letter(c)) {
+            // Start of a tag name; the token is emitted once the name is complete.
+            tokenizer->state = TokenizerState_TagName;
+            tokenizer->last.append_to_tag_name(to_ascii_lower(c));
+            tokenizer->last.type = HtmlTokenType_StartTag;
+        }
+        else {
+            // Not a tag after all, so the '<' was plain text: emit it and let
+            // the data state look at this code point again. This also covers
+            // input that ends right after the '<'.
+            tokenizer->state = TokenizerState_Data;
+            emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
+            reconsume(tokenizer);
+        }
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state
+inline void tag_name_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::FF:
+    case CodePoints::LF:
+    case CodePoints::SPACE:
+        tokenizer->state = TokenizerState_BeforeAttribute;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        break;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @Error
+        // The input ended inside a tag name. Drop the unfinished tag and let
+        // the data state report EOF from the terminator.
+        tokenizer->last.reset();
+        tokenizer->state = TokenizerState_Data;
+        reconsume(tokenizer);
+        break;
+    default:
+        tokenizer->last.append_to_tag_name(to_ascii_lower(c));
+        break;
+    }
+}
+
+/// Process the end tag open state
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
+inline void end_tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    if (is_ascii_letter(c)) {
+        tokenizer->last.type = HtmlTokenType_EndTag;
+        tokenizer->last.append_to_tag_name(to_ascii_lower(c));
+        tokenizer->state = TokenizerState_TagName;
+    }
+    else if (c == CodePoints::GREATER_THAN_SIGN) {
+        // TODO: @Error We got something like </>
+        tokenizer->state = TokenizerState_Data;
+    }
+    else if (c == CodePoints::NULL_CHAR) {
+        // TODO: @Error
+        // The input ended right after "</", so both characters are plain text.
+        // Only one token fits in a call: emit '<' now and step back onto the
+        // '/' so the data state emits it on the next call.
+        tokenizer->state = TokenizerState_Data;
+        emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
+        tokenizer->ptr--;
+        reconsume(tokenizer);
+    }
+    else {
+        // TODO: @Error
+        tokenizer->state = TokenizerState_BogusComment;
+    }
+}
+
+/// Unfinished helper that read_next does not call yet.
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+inline void try_consume_character_reference(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::AMPERSAND:
+    case CodePoints::NULL_CHAR: {
+        // TODO: The additional allowed character?
+        // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
+        tokenizer->state = TokenizerState_Data;
+        reconsume(tokenizer);
+        break;
+    }
+    case CodePoints::NUMBER_SIGN: {
+        consume_next(tokenizer);
+        c = *tokenizer->ptr;
+
+        // TODO: read the digits; hexadecimal after x or X, decimal otherwise.
+        if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
+            consume_next(tokenizer);
+        }
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+/// Character references are not decoded yet: the '&' that led here is emitted
+/// as a plain character and the code point after it goes back to the data state.
+inline void character_reference_in_data_state(Tokenizer* tokenizer) {
+    tokenizer->state = TokenizerState_Data;
+    emit_character(tokenizer, CodePoints::AMPERSAND);
+    reconsume(tokenizer);
+}
+
+/// Runs the state machine from the data state until one token is complete and
+/// returns a copy of it. Once the end of the input is reached every call
+/// returns an EOF token. States without a handler log an error and end the
+/// program.
+HtmlToken read_next(Tokenizer* tokenizer) {
+    tokenizer->flag = TokenizerFlag_None;
+    tokenizer->last.reset();
+    tokenizer->state = TokenizerState_Data;
+
+    // Without a buffer there is nothing to read.
+    if (tokenizer->ptr == nullptr) {
+        tokenizer->last.type = HtmlTokenType_EOF;
+        return tokenizer->last;
+    }
+
+    do {
+        // Every step starts with only IncrementPtr set; handlers clear it to reconsume.
+        tokenizer->flag = TokenizerFlag_IncrementPtr;
+
+        switch (tokenizer->state) {
+        case TokenizerState_Data:
+            data_state(tokenizer);
+            break;
+        case TokenizerState_TagOpen:
+            tag_open_state(tokenizer);
+            break;
+        case TokenizerState_TagName:
+            tag_name_state(tokenizer);
+            break;
+        case TokenizerState_EndTagOpen:
+            end_tag_open_state(tokenizer);
+            break;
+        case TokenizerState_CharacterReferenceInData:
+            character_reference_in_data_state(tokenizer);
+            break;
+        default:
+            logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
+            exit(1);
+        }
+
+        if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
+            tokenizer->ptr++;
+        }
+    } while (!(tokenizer->flag & TokenizerFlag_Emit));
+
+    return tokenizer->last;
+}
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
new file mode 100644
index 0000000..4978bfb
--- /dev/null
+++ b/src/tokenizer.hpp
@@ -0,0 +1,55 @@
+#ifndef TOKENIZER_HPP
+#define TOKENIZER_HPP
+
+#include <cstddef>
+#include "code_point.h"
+#include "html_token.hpp"
+
+/// Bit flags a state handler sets in Tokenizer::flag for the current step.
+enum TokenizerFlag {
+    TokenizerFlag_None = 0,
+    /// The token in Tokenizer::last is complete; read_next returns it.
+    TokenizerFlag_Emit = 1,
+    /// read_next advances Tokenizer::ptr by one after the handler runs.
+    TokenizerFlag_IncrementPtr = 2
+};
+
+/// States of the tokenizer's state machine.
+/// read_next only has handlers for Data, CharacterReferenceInData, TagOpen,
+/// EndTagOpen and TagName; reaching any other state ends the program.
+enum TokenizerState {
+    TokenizerState_None,
+    TokenizerState_Data,
+    TokenizerState_CharacterReferenceInData,
+    TokenizerState_TagOpen,
+    TokenizerState_MarkupDeclarationOpen,
+    TokenizerState_EndTagOpen,
+    TokenizerState_TagName,
+    TokenizerState_BogusComment,
+    TokenizerState_CommentState,
+    TokenizerState_BeforeAttribute,
+    TokenizerState_SelfClosingStartTag
+};
+
+/// Cursor over a zero-terminated buffer of code points plus the state machine's working data.
+struct Tokenizer {
+    /// Next code point to look at; the buffer is not owned.
+    code_point_t* ptr = nullptr;
+    /// Length recorded by create() for the buffer it was given.
+    std::size_t length = 0;
+
+    TokenizerState state = TokenizerState_Data;
+    /// Token being built; read_next returns a copy of it.
+    HtmlToken last;
+    /// Combination of TokenizerFlag values for the current step.
+    int flag = TokenizerFlag_None;
+};
+
+
+/// Creates a tokenizer positioned at the start of the given buffer.
+Tokenizer create(code_point_t*);
+/// Produces the next token; the last one has type HtmlTokenType_EOF.
+HtmlToken read_next(Tokenizer*);
+
+
+#endif
-- 
cgit v1.2.1