summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author mattkae <mattkae@protonmail.com> 2023-04-23 20:23:54 -0400
committer mattkae <mattkae@protonmail.com> 2023-04-23 20:23:54 -0400
commit 4058f9b1704322f8185136c2558c2ab96a4d835c (patch)
tree f764007c2cdd0f41372d66dcf02ccde26509b839
Initial commit with a working parser
-rw-r--r--.gitignore4
-rw-r--r--.projectile0
-rw-r--r--Makefile45
-rw-r--r--examples/1.html6
-rw-r--r--html-standard.pdfbin0 -> 14599212 bytes
-rw-r--r--src/code_point.h8
-rw-r--r--src/html_token.cpp31
-rw-r--r--src/html_token.hpp32
-rw-r--r--src/main.cpp80
-rw-r--r--src/tokenizer.cpp268
-rw-r--r--src/tokenizer.hpp41
11 files changed, 515 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..704b56a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.cproject
+.project
+all
+build
\ No newline at end of file
diff --git a/.projectile b/.projectile
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/.projectile
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e453aa6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,45 @@
+# Builds $(BUILD_DIR)/$(TARGET_EXEC) from every .cpp/.c/.s file under $(SRC_DIRS).
+# Object files mirror the source tree inside $(BUILD_DIR).
+TARGET_EXEC ?= html_parser
+
+BUILD_DIR ?= ./build
+SRC_DIRS ?= ./src
+
+MKDIR_P ?= mkdir -p
+
+CC := g++ -g
+# The patterns are quoted so the shell passes them to find verbatim instead of
+# expanding them against files in the current directory.
+SRCS := $(shell find $(SRC_DIRS) -name '*.cpp' -or -name '*.c' -or -name '*.s')
+OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
+DEPS := $(OBJS:.o=.d)
+
+# Every directory below the source root is added to the include path.
+INC_DIRS := $(shell find $(SRC_DIRS) -type d)
+INC_FLAGS := $(addprefix -I,$(INC_DIRS))
+
+# -MMD -MP make the compiler emit the .d dependency files included below.
+CPPFLAGS ?= $(INC_FLAGS) -MMD -MP
+
+# First rule in the file, so it stays the default goal.
+$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS)
+	$(CC) $(OBJS) -o $@ -lmatte
+
+# assembly
+$(BUILD_DIR)/%.s.o: %.s
+	$(MKDIR_P) $(dir $@)
+	$(AS) $(ASFLAGS) -c $< -o $@
+
+# c source
+$(BUILD_DIR)/%.c.o: %.c
+	$(MKDIR_P) $(dir $@)
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+# c++ source
+$(BUILD_DIR)/%.cpp.o: %.cpp
+	$(MKDIR_P) $(dir $@)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
+
+
+.PHONY: all clean
+
+clean:
+	$(RM) -r $(BUILD_DIR)
+
+# `all` builds the executable. The executable is a prerequisite, not a recipe
+# line, so `make all` no longer tries to run a possibly missing binary.
+all: $(BUILD_DIR)/$(TARGET_EXEC)
+
+-include $(DEPS)
diff --git a/examples/1.html b/examples/1.html
new file mode 100644
index 0000000..06f1bd6
--- /dev/null
+++ b/examples/1.html
@@ -0,0 +1,6 @@
+<div>
+ <h1>Hello World</h1>
+ <p>
+ I am in a paragraph
+ </p>
+</div>
diff --git a/html-standard.pdf b/html-standard.pdf
new file mode 100644
index 0000000..d1a8614
--- /dev/null
+++ b/html-standard.pdf
Binary files differ
diff --git a/src/code_point.h b/src/code_point.h
new file mode 100644
index 0000000..dcfcd32
--- /dev/null
+++ b/src/code_point.h
@@ -0,0 +1,8 @@
+#ifndef CODE_POINT_H
+#define CODE_POINT_H
+
+#include <cstdint>
+
+// Character type used for the tokenizer's input stream and code point constants.
+// NOTE(review): wchar_t's width is platform-dependent; confirm it can hold every
+// code point the tokenizer uses (e.g. 0xFFFD) on all target platforms.
+typedef wchar_t code_point_t;
+
+#endif
diff --git a/src/html_token.cpp b/src/html_token.cpp
new file mode 100644
index 0000000..904e79a
--- /dev/null
+++ b/src/html_token.cpp
@@ -0,0 +1,31 @@
+#include "html_token.hpp"
+#include <matte/logger.h>
+
+// Printable label for each HtmlTokenType, indexed by the enum value.
+// The entries must stay in the same order as the enum declaration.
+const char* TOKEN_TO_NAME_MAP[HtmlTokenType_Length] = {
+    "None",       // HtmlTokenType_None (was mislabelled "Text"; no such type exists)
+    "Start Tag",  // HtmlTokenType_StartTag
+    "End Tag",    // HtmlTokenType_EndTag
+    "Attribute",  // HtmlTokenType_Attribute
+    "EOF",        // HtmlTokenType_EOF
+    "Character"   // HtmlTokenType_Character
+};
+
+/// Logs a one-line description of this token through logger_info.
+/// Character tokens include their character; start and end tags include
+/// the tag name; every other type logs only its label.
+void HtmlToken::print() {
+    // Validate before indexing the label table: `type` is a plain enum and can
+    // hold an out-of-range value if the token was never initialised or reset.
+    const int index = static_cast<int>(type);
+    if (index < 0 || index >= HtmlTokenType_Length) {
+        logger_info("Unknown token type: %d", index);
+        return;
+    }
+
+    const char* name = TOKEN_TO_NAME_MAP[index];
+    switch (type) {
+    case HtmlTokenType_Character:
+        logger_info("%s, %c", name, character_token);
+        break;
+    case HtmlTokenType_StartTag:
+    case HtmlTokenType_EndTag:
+        logger_info("%s, %s", name, tag_name.c_str());
+        break;
+    default:
+        logger_info("%s", name);
+        break;
+    }
+}
+
+/// Returns the token to its empty state: no tag name and type None.
+/// The character payload is left untouched.
+void HtmlToken::reset() {
+    this->tag_name.clear();
+    this->type = HtmlTokenType_None;
+}
diff --git a/src/html_token.hpp b/src/html_token.hpp
new file mode 100644
index 0000000..ee385ce
--- /dev/null
+++ b/src/html_token.hpp
@@ -0,0 +1,32 @@
+#ifndef HTML_TOKEN_HPP
+#define HTML_TOKEN_HPP
+
+#include <string>
+#include "code_point.h"
+
+// Kinds of token the tokenizer can produce. The numeric values index
+// TOKEN_TO_NAME_MAP in html_token.cpp, so the order here must match that table.
+enum HtmlTokenType {
+ HtmlTokenType_None = 0, // state of a token after HtmlToken::reset()
+ HtmlTokenType_StartTag,
+ HtmlTokenType_EndTag,
+ HtmlTokenType_Attribute,
+ HtmlTokenType_EOF, // end of input; main() stops reading on this
+ HtmlTokenType_Character,
+ HtmlTokenType_Length // number of token types, not a real token type
+};
+
+/// A single token produced by the tokenizer.
+/// Only the payload matching `type` is meaningful: `character_token` for
+/// Character tokens, `tag_name` for StartTag and EndTag tokens.
+struct HtmlToken {
+    // Default-initialised so a freshly constructed token is well defined
+    // before the first call to reset().
+    HtmlTokenType type = HtmlTokenType_None;
+
+    // TODO: Performance
+    char character_token = '\0';
+    std::string tag_name;
+
+    /// Appends one code point to the tag name.
+    /// NOTE(review): the code point is narrowed to a single char, so values
+    /// above the char range (e.g. 0xFFFD) are truncated. The cast only makes
+    /// the existing narrowing explicit.
+    void append_to_tag_name(code_point_t c) {
+        tag_name += static_cast<char>(c);
+    }
+
+    void print();
+    void reset();
+};
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..c92321c
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,80 @@
+#include <argp.h>
+#include <ctype.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cwchar>
+#include <iostream>
+#include <vector>
+#include <matte/logger.h>
+#include <matte/types.h>
+#include <matte/list.h>
+#include "code_point.h"
+#include "html_token.hpp"
+#include "tokenizer.hpp"
+
+using namespace matte;
+
+// Command-line metadata and option table passed to argp through the `argp`
+// descriptor defined further down in this file.
+// NOTE(review): argp_program_version / argp_program_bug_address are presumably
+// the well-known globals glibc argp reads for --version and --help; confirm.
+const char *argp_program_version = "html_parser 0.1";
+const char *argp_program_bug_address = "<matthew@matthewkosarek.xyz>";
+// NOTE(review): placeholder description text, still to be written.
+static char doc[] = "A description of your program.";
+static char args_doc[] = "-f [FILENAME]...";
+// Supported options: -f/--file FILE. The all-zero entry terminates the table.
+static struct argp_option options[] = {
+ { "file", 'f', "FILE", 0, "File to parse."},
+ { 0 }
+};
+
+// Parsed command-line arguments; filled in by parse_opt.
+struct Arguments {
+ // Path given with -f/--file. Not initialised here; main() sets it to
+ // nullptr before parsing.
+ char* filename;
+};
+
+/// argp callback invoked once per parsed key.
+/// Stores the value of -f in the Arguments object passed as state->input,
+/// accepts positional arguments without recording them, and reports every
+/// other key as unknown.
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+    if (key == 'f') {
+        Arguments* parsed = static_cast<Arguments*>(state->input);
+        parsed->filename = arg;
+        return 0;
+    }
+
+    if (key == ARGP_KEY_ARG) {
+        return 0;
+    }
+
+    return ARGP_ERR_UNKNOWN;
+}
+
+// Parser descriptor handed to argp_parse() in main(): option table, callback,
+// argument usage string and program description. The remaining fields are 0.
+static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 };
+
+/// Entry point: reads the file given with -f, tokenizes it and prints every
+/// token up to and including the EOF token.
+/// Returns EXIT_FAILURE when argument parsing fails, no file was given, or the
+/// file cannot be opened; EXIT_SUCCESS otherwise.
+int main(int argc, char *argv[]) {
+    Arguments arguments;
+    arguments.filename = nullptr;
+    const error_t parse_error = argp_parse(&argp, argc, argv, 0, 0, &arguments);
+    if (parse_error != 0) {
+        fprintf(stderr, "Failed to parse arguments (error %d).\n", static_cast<int>(parse_error));
+        return EXIT_FAILURE;
+    }
+
+    if (arguments.filename == nullptr) {
+        fprintf(stderr, "No input file given; use -f FILE.\n");
+        return EXIT_FAILURE;
+    }
+
+    FILE* file = fopen(arguments.filename, "rb");
+    if (file == nullptr) {
+        perror(arguments.filename);
+        return EXIT_FAILURE;
+    }
+
+    // Growable buffer: the previous fixed 1024-element array was written
+    // without a bounds check and overflowed on any larger input.
+    std::vector<code_point_t> buffer;
+
+    // fgetwc returns wint_t; keep that type for the WEOF comparison and narrow
+    // to code_point_t only once the value is known to be a real character.
+    // NOTE(review): no locale is set, so fgetwc may stop at the first
+    // non-ASCII byte; confirm whether setlocale() should be called.
+    wint_t wc;
+    while ((wc = fgetwc(file)) != WEOF) {
+        buffer.push_back(static_cast<code_point_t>(wc));
+    }
+
+    // The tokenizer treats a NUL code point as the end of input.
+    buffer.push_back(L'\0');
+    fclose(file);
+
+    Tokenizer tokenizer = create(buffer.data());
+    while (true) {
+        HtmlToken token = read_next(&tokenizer);
+        token.print();
+        if (token.type == HtmlTokenType_EOF) {
+            break;
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..30ca6bd
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,268 @@
+#include "tokenizer.hpp"
+#include "code_point.h"
+#include "html_token.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <matte/logger.h>
+
+/// Creates a tokenizer positioned at the start of `value`.
+/// @param value NUL-terminated buffer of code points; the tokenizer stores the
+///              pointer and does not copy the data.
+/// @return a tokenizer in the Data state whose `length` is the number of code
+///         points before the terminator (0 when `value` is null).
+Tokenizer create(code_point_t* value) {
+    Tokenizer t;
+    t.ptr = value;
+    // Measure in code_point_t units. The previous strlen((char*)value) counted
+    // bytes and stopped at the first zero byte inside a wide character.
+    t.length = (value != nullptr) ? std::char_traits<code_point_t>::length(value) : 0;
+    t.state = TokenizerState_Data;
+    return t;
+}
+
+/// Named code points referenced by the tokenizer state handlers.
+namespace CodePoints {
+    // Control and whitespace
+    constexpr code_point_t NULL_CHAR = 0x0000;
+    constexpr code_point_t TAB = 0x0009;
+    constexpr code_point_t LF = 0x000A;
+    constexpr code_point_t FF = 0x000C;
+    constexpr code_point_t SPACE = 0x0020;
+
+    // Punctuation
+    constexpr code_point_t EXCLAMATION_MARK = 0x0021;
+    constexpr code_point_t NUMBER_SIGN = 0x0023;
+    constexpr code_point_t AMPERSAND = 0x0026;
+    constexpr code_point_t SOLIDUS = 0x002F;
+    constexpr code_point_t LESS_THAN_SIGN = 0x003C;
+    constexpr code_point_t GREATER_THAN_SIGN = 0x003E;
+
+    // Digits
+    constexpr code_point_t DIGIT_ZERO = 0x0030;
+    constexpr code_point_t DIGIT_NINE = 0x0039;
+
+    // Upper-case letters
+    constexpr code_point_t UPPERCASE_A = 0x0041;
+    constexpr code_point_t UPPERCASE_F = 0x0046;
+    constexpr code_point_t UPPERCASE_X = 0x0058;
+    constexpr code_point_t UPPERCASE_Z = 0x005A;
+
+    // Lower-case letters
+    constexpr code_point_t LOWERCASE_A = 0x0061;
+    constexpr code_point_t LOWERCASE_F = 0x0066;
+    constexpr code_point_t LOWERCASE_X = 0x0078;
+    constexpr code_point_t LOWERCASE_Z = 0x007A;
+
+    // Substituted for invalid input
+    constexpr code_point_t REPLACEMENT_CHAR = 0xFFFD;
+}
+
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+
+/// Moves the tokenizer's read position forward by one code point.
+inline void consume_next(Tokenizer* tokenizer) {
+    tokenizer->ptr += 1;
+}
+
+
+/// Data state handler.
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
+///
+/// '&' and '<' switch to the character-reference and tag-open states. A NUL
+/// code point marks the end of the input buffer and emits an EOF token. Any
+/// other code point is emitted as a Character token.
+inline void data_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::AMPERSAND:
+        tokenizer->state = TokenizerState_CharacterReferenceInData;
+        break;
+    case CodePoints::LESS_THAN_SIGN:
+        tokenizer->state = TokenizerState_TagOpen;
+        break;
+    case CodePoints::NULL_CHAR:
+        tokenizer->last.type = HtmlTokenType_EOF;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        // Stay on the terminator: advancing here would make any later
+        // read_next() call read past the end of the buffer.
+        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+        break;
+    default:
+        tokenizer->last.type = HtmlTokenType_Character;
+        // NOTE(review): character_token is a char, so code points outside the
+        // char range are truncated here.
+        tokenizer->last.character_token = c;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    }
+}
+
+/// Tag open state handler, entered after '<' was consumed in the data state.
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
+///
+/// A letter starts a StartTag token (upper case is folded to lower case) and
+/// switches to the tag name state. '!' and '/' switch to the markup
+/// declaration and end-tag-open states, '?' to the bogus comment state.
+/// Anything else means the '<' was literal text.
+inline void tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    const bool is_upper = c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
+    const bool is_lower = c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;
+
+    if (c == CodePoints::EXCLAMATION_MARK) {
+        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
+    }
+    else if (c == CodePoints::SOLIDUS) { // U+002F
+        tokenizer->state = TokenizerState_EndTagOpen;
+    }
+    else if (is_upper || is_lower) {
+        // The token is not emitted yet; the tag name state completes it.
+        const code_point_t lowered = is_upper ? static_cast<code_point_t>(c + 0x0020) : c;
+        tokenizer->state = TokenizerState_TagName;
+        tokenizer->last.append_to_tag_name(lowered);
+        tokenizer->last.type = HtmlTokenType_StartTag;
+    }
+    else if (c == '?') { // U+003F
+        // TODO: Parse error
+        tokenizer->state = TokenizerState_BogusComment; // TODO:
+    }
+    else {
+        // TODO: Parse error
+        // Per the spec: emit U+003C as a character token and reconsume the
+        // current code point in the data state. Previously both the '<' and
+        // the current code point were silently dropped, and the input's NUL
+        // terminator could be skipped.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->last.type = HtmlTokenType_Character;
+        tokenizer->last.character_token = static_cast<char>(CodePoints::LESS_THAN_SIGN);
+        tokenizer->flag |= TokenizerFlag_Emit;
+        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+    }
+}
+
+/// Tag name state handler: accumulates the name of the current start/end tag.
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state
+///
+/// Whitespace and '>' finish the tag and emit it, '/' switches to the
+/// self-closing state, and every other code point is appended to the tag name
+/// (upper case folded to lower case).
+inline void tag_name_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::FF:
+    case CodePoints::LF:
+    case CodePoints::SPACE:
+        tokenizer->state = TokenizerState_BeforeAttribute;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        break;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @Error
+        // The input buffer is NUL-terminated and the data state reports NUL as
+        // EOF, so NUL is treated as end of input here as well: reconsume it in
+        // the data state. Appending a replacement character and advancing, as
+        // before, walked past the end of the buffer on an unterminated tag.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+        break;
+    default:
+        if (c == EOF) {
+            // TODO: @Error
+            tokenizer->state = TokenizerState_Data;
+            tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+        }
+        else if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
+            tokenizer->last.append_to_tag_name(static_cast<code_point_t>(c + 0x0020)); // To lower
+        }
+        else {
+            tokenizer->last.append_to_tag_name(c);
+        }
+        break;
+    }
+}
+
+/// End tag open state handler, entered after "</" was consumed.
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
+///
+/// A letter starts an EndTag token (upper case folded to lower case) and
+/// switches to the tag name state. "</>" is dropped and tokenizing returns to
+/// the data state. End of input hands control back to the data state, and
+/// anything else switches to the bogus comment state.
+inline void end_tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    const bool is_upper = c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
+    const bool is_lower = c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;
+
+    if (is_upper || is_lower) {
+        const code_point_t lowered = is_upper ? static_cast<code_point_t>(c + 0x0020) : c;
+        tokenizer->last.type = HtmlTokenType_EndTag;
+        tokenizer->last.append_to_tag_name(lowered);
+        tokenizer->state = TokenizerState_TagName;
+    }
+    else if (c == CodePoints::GREATER_THAN_SIGN) {
+        // TODO: @Error We got something like </>
+        tokenizer->state = TokenizerState_Data;
+    }
+    else if (c == EOF || c == CodePoints::NULL_CHAR) {
+        // TODO: @Error
+        // TODO: The spec emits '<' and '/' character tokens here, but
+        // read_next() can emit only one token per call, so "</" is dropped.
+        //
+        // End of input (the buffer is NUL-terminated): reconsume in the data
+        // state so it can emit EOF. The old EOF branch never changed state and
+        // spun forever; the old NUL branch advanced past the terminator.
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+    }
+    else {
+        // TODO: @Error
+        tokenizer->state = TokenizerState_BogusComment;
+    }
+}
+
+/// Work-in-progress implementation of "consume a character reference".
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+///
+/// Currently this only recognises the cases that are not a character
+/// reference, and skips the "#" / "#x" prefix of a numeric reference. It does
+/// not decode anything yet.
+inline void try_consume_character_reference(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::AMPERSAND:
+    case EOF: {
+        // TODO: The additional allowed character?
+        // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
+        tokenizer->state = TokenizerState_Data;
+        // Clear the increment flag so the current code point is not consumed.
+        // The previous `&= TokenizerFlag_IncrementPtr` kept that flag and
+        // cleared every other one instead.
+        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+        break;
+    }
+    case CodePoints::NUMBER_SIGN: {
+        consume_next(tokenizer);
+        c = *tokenizer->ptr;
+
+        // True for "&#x" / "&#X": digits 0-F are hexadecimal, otherwise 0-9 decimal.
+        const bool is_hex = (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X);
+        if (is_hex) {
+            consume_next(tokenizer);
+        }
+        // TODO: parse the digits according to is_hex.
+        (void)is_hex;
+        break;
+    }
+    }
+}
+
+/// Character reference in data state handler, entered after '&' was consumed
+/// in the data state.
+/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-data-state
+///
+/// TODO: Attempt to consume a character reference. Until that exists, every
+/// '&' is handled as the spec's "nothing is returned" case: emit U+0026 as a
+/// character token and continue in the data state with the current code point.
+/// The previous empty body never changed state or emitted, so read_next()
+/// looped past the end of the buffer on any '&' in the input.
+inline void character_reference_in_data_state(Tokenizer* tokenizer) {
+    tokenizer->state = TokenizerState_Data;
+    tokenizer->last.type = HtmlTokenType_Character;
+    tokenizer->last.character_token = static_cast<char>(CodePoints::AMPERSAND);
+    tokenizer->flag |= TokenizerFlag_Emit;
+    // The code point after '&' has not been examined yet; leave it in place.
+    tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+}
+
+/// Runs the state machine from the Data state until a handler requests an
+/// emit, then returns a copy of the token that was built.
+/// A state without a handler is reported through logger_error and terminates
+/// the process with exit code 1.
+HtmlToken read_next(Tokenizer* tokenizer) {
+    tokenizer->last.reset();
+    tokenizer->state = TokenizerState_Data;
+    tokenizer->flag = TokenizerFlag_None;
+
+    for (;;) {
+        // Each step starts with only IncrementPtr set; a handler may clear it
+        // to look at the same code point again.
+        tokenizer->flag = TokenizerFlag_IncrementPtr;
+
+        const TokenizerState current = tokenizer->state;
+        if (current == TokenizerState_Data) {
+            data_state(tokenizer);
+        }
+        else if (current == TokenizerState_TagOpen) {
+            tag_open_state(tokenizer);
+        }
+        else if (current == TokenizerState_TagName) {
+            tag_name_state(tokenizer);
+        }
+        else if (current == TokenizerState_EndTagOpen) {
+            end_tag_open_state(tokenizer);
+        }
+        else if (current == TokenizerState_CharacterReferenceInData) {
+            character_reference_in_data_state(tokenizer);
+        }
+        else {
+            logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
+            exit(1);
+        }
+
+        const bool should_advance = (tokenizer->flag & TokenizerFlag_IncrementPtr) != 0;
+        if (should_advance) {
+            tokenizer->ptr++;
+        }
+
+        const bool should_emit = (tokenizer->flag & TokenizerFlag_Emit) != 0;
+        if (should_emit) {
+            return tokenizer->last;
+        }
+    }
+}
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
new file mode 100644
index 0000000..4978bfb
--- /dev/null
+++ b/src/tokenizer.hpp
@@ -0,0 +1,41 @@
+#ifndef TOKENIZER_HPP
+#define TOKENIZER_HPP
+
+#include "code_point.h"
+#include "html_token.hpp"
+
+// Bit flags stored in Tokenizer::flag and evaluated by read_next() after each
+// state handler runs.
+enum TokenizerFlag {
+ TokenizerFlag_None = 0,
+ TokenizerFlag_Emit = 1, // read_next() stops looping and returns the token
+ TokenizerFlag_IncrementPtr = 2 // read_next() advances ptr by one code point
+};
+
+// States of the tokenizer state machine. read_next() has handlers only for
+// Data, CharacterReferenceInData, TagOpen, EndTagOpen and TagName; reaching any
+// other state logs an error and exits the process.
+enum TokenizerState {
+ TokenizerState_None,
+ TokenizerState_Data,
+ TokenizerState_CharacterReferenceInData,
+ TokenizerState_TagOpen,
+ TokenizerState_MarkupDeclarationOpen,
+ TokenizerState_EndTagOpen,
+ TokenizerState_TagName,
+ TokenizerState_BogusComment,
+ TokenizerState_CommentState,
+ TokenizerState_BeforeAttribute,
+ TokenizerState_SelfClosingStartTag
+};
+
+// Tokenizer state: the read position in the input plus the token being built.
+struct Tokenizer {
+ // Current read position in the input buffer; the buffer is not owned.
+ code_point_t* ptr = nullptr;
+ // Input length as computed by create(); not read by the visible state handlers.
+ size_t length = 0;
+
+ // State the next handler runs in; read_next() resets it to Data on entry.
+ TokenizerState state = TokenizerState_Data;
+ // Token under construction; read_next() returns a copy of it.
+ HtmlToken last;
+ // Bitwise OR of TokenizerFlag values for the current step.
+ int flag = TokenizerFlag_None;
+};
+
+
+// Creates a tokenizer that reads from the given code point buffer.
+Tokenizer create(code_point_t*);
+// Reads and returns the next token. main() calls this repeatedly until the
+// returned token has type HtmlTokenType_EOF.
+HtmlToken read_next(Tokenizer*);
+
+
+#endif