From 4058f9b1704322f8185136c2558c2ab96a4d835c Mon Sep 17 00:00:00 2001
From: mattkae
Date: Sun, 23 Apr 2023 20:23:54 -0400
Subject: Initial commit with a working parser

---
 .gitignore         |   4 +
 .projectile        |   0
 Makefile           |  45 +++++++++
 examples/1.html    |   6 ++
 html-standard.pdf  | Bin 0 -> 14599212 bytes
 src/code_point.h   |   8 ++
 src/html_token.cpp |  31 +++++++
 src/html_token.hpp |  32 +++++++
 src/main.cpp       |  80 ++++++++++++++++
 src/tokenizer.cpp  | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/tokenizer.hpp  |  41 ++++++++
 11 files changed, 515 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .projectile
 create mode 100644 Makefile
 create mode 100644 examples/1.html
 create mode 100644 html-standard.pdf
 create mode 100644 src/code_point.h
 create mode 100644 src/html_token.cpp
 create mode 100644 src/html_token.hpp
 create mode 100644 src/main.cpp
 create mode 100644 src/tokenizer.cpp
 create mode 100644 src/tokenizer.hpp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..704b56a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.cproject
+.project
+all
+build
\ No newline at end of file
diff --git a/.projectile b/.projectile
new file mode 100644
index 0000000..e69de29
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e453aa6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,45 @@
+TARGET_EXEC ?= html_parser
+
+BUILD_DIR ?= ./build
+SRC_DIRS ?= ./src
+
+CC := g++ -g
+SRCS := $(shell find $(SRC_DIRS) -name '*.cpp' -or -name '*.c' -or -name '*.s')
+OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
+DEPS := $(OBJS:.o=.d)
+
+INC_DIRS := $(shell find $(SRC_DIRS) -type d)
+INC_FLAGS := $(addprefix -I,$(INC_DIRS))
+
+CPPFLAGS ?= $(INC_FLAGS) -MMD -MP
+
+$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS)
+	$(CC) $(OBJS) -o $@ -lmatte
+
+# assembly
+$(BUILD_DIR)/%.s.o: %.s
+	$(MKDIR_P) $(dir $@)
+	$(AS) $(ASFLAGS) -c $< -o $@
+
+# c source
+$(BUILD_DIR)/%.c.o: %.c
+	$(MKDIR_P) $(dir $@)
+	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+# c++ source
+$(BUILD_DIR)/%.cpp.o: %.cpp
+	$(MKDIR_P) $(dir $@)
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
+
+
+.PHONY: clean all
+
+clean:
+	$(RM) -r $(BUILD_DIR)
+
+all:
+	$(BUILD_DIR)/$(TARGET_EXEC)
+
+-include $(DEPS)
+
+MKDIR_P ?= mkdir -p
diff --git a/examples/1.html b/examples/1.html
new file mode 100644
index 0000000..06f1bd6
--- /dev/null
+++ b/examples/1.html
@@ -0,0 +1,6 @@
+
+

Hello World

+

+ I am in a paragraph +

+
diff --git a/html-standard.pdf b/html-standard.pdf
new file mode 100644
index 0000000..d1a8614
Binary files /dev/null and b/html-standard.pdf differ
diff --git a/src/code_point.h b/src/code_point.h
new file mode 100644
index 0000000..dcfcd32
--- /dev/null
+++ b/src/code_point.h
@@ -0,0 +1,10 @@
+#ifndef CODE_POINT_H
+#define CODE_POINT_H
+
+#include <wchar.h>
+
+/* One code point of the input. wchar_t is the type that fgetwc() results are
+ * stored in by main.cpp; its width depends on the platform. */
+typedef wchar_t code_point_t;
+
+#endif
diff --git a/src/html_token.cpp b/src/html_token.cpp
new file mode 100644
index 0000000..904e79a
--- /dev/null
+++ b/src/html_token.cpp
@@ -0,0 +1,48 @@
+#include "html_token.hpp"
+#include <matte/logger.h> // NOTE(review): header name was lost in extraction; confirm against libmatte
+#include <cstdio>
+
+/// Display name of every token kind, indexed by HtmlTokenType.
+static const char* const TOKEN_TO_NAME_MAP[HtmlTokenType_Length] = {
+    "None",
+    "Start Tag",
+    "End Tag",
+    "Attribute",
+    "EOF",
+    "Character"
+};
+
+/// Logs the token kind, followed by the payload for Character and tag tokens.
+void HtmlToken::print() const {
+    // `type` is a plain enum, so guard the table lookup against out-of-range values.
+    const bool known = type >= HtmlTokenType_None && type < HtmlTokenType_Length;
+    const char* name = known ? TOKEN_TO_NAME_MAP[type] : "Unknown";
+    switch (type) {
+    case HtmlTokenType_Character: {
+        const unsigned long cp = static_cast<unsigned long>(character_token) & 0x1FFFFFul;
+        if (cp < 0x80) {
+            logger_info("%s, %c", name, static_cast<char>(cp));
+        } else {
+            // "%c" cannot show a code point outside ASCII; log it as U+XXXX instead.
+            char code[16];
+            std::snprintf(code, sizeof(code), "U+%04lX", cp);
+            logger_info("%s, %s", name, code);
+        }
+        break;
+    }
+    case HtmlTokenType_StartTag:
+    case HtmlTokenType_EndTag:
+        logger_info("%s, %s", name, tag_name.c_str());
+        break;
+    default:
+        logger_info("%s", name);
+        break;
+    }
+}
+
+/// Returns the token to its empty HtmlTokenType_None state.
+void HtmlToken::reset() {
+    type = HtmlTokenType_None;
+    character_token = 0;
+    tag_name.clear();
+}
diff --git a/src/html_token.hpp b/src/html_token.hpp
new file mode 100644
index 0000000..ee385ce
--- /dev/null
+++ b/src/html_token.hpp
@@ -0,0 +1,56 @@
+#ifndef HTML_TOKEN_HPP
+#define HTML_TOKEN_HPP
+
+#include <string>
+#include "code_point.h"
+
+/// Kinds of token produced by the tokenizer. HtmlTokenType_Length is not a token kind:
+/// it is the number of kinds and sizes the name table in html_token.cpp.
+enum HtmlTokenType {
+    HtmlTokenType_None = 0,
+    HtmlTokenType_StartTag,
+    HtmlTokenType_EndTag,
+    HtmlTokenType_Attribute,
+    HtmlTokenType_EOF,
+    HtmlTokenType_Character,
+    HtmlTokenType_Length
+};
+
+/// A single token. Which payload field is meaningful depends on `type`.
+struct HtmlToken {
+    HtmlTokenType type = HtmlTokenType_None;
+
+    // TODO: Performance
+    /// Code point of a Character token, kept at full width so non-ASCII input is not truncated.
+    code_point_t character_token = 0;
+    /// Name of a StartTag/EndTag token, encoded as UTF-8.
+    std::string tag_name;
+
+    /// Appends `c` to tag_name as UTF-8. A plain `tag_name += c` would keep only the low byte
+    /// of the code point (U+FFFD would become the single byte 0xFD).
+    void append_to_tag_name(code_point_t c) {
+        const unsigned long cp = static_cast<unsigned long>(c) & 0x1FFFFFul;
+        if (cp < 0x80) {
+            tag_name += static_cast<char>(cp);
+        } else if (cp < 0x800) {
+            tag_name += static_cast<char>(0xC0 | (cp >> 6));
+            tag_name += static_cast<char>(0x80 | (cp & 0x3F));
+        } else if (cp < 0x10000) {
+            tag_name += static_cast<char>(0xE0 | (cp >> 12));
+            tag_name += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
+            tag_name += static_cast<char>(0x80 | (cp & 0x3F));
+        } else {
+            tag_name += static_cast<char>(0xF0 | (cp >> 18));
+            tag_name += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
+            tag_name += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
+            tag_name += static_cast<char>(0x80 | (cp & 0x3F));
+        }
+    }
+
+    /// Logs the token through logger_info().
+    void print() const;
+    /// Returns the token to its empty HtmlTokenType_None state.
+    void reset();
+};
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..c92321c
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,80 @@
+#include <argp.h> // NOTE(review): the include targets were lost in extraction; restored from what main() uses
+#include <cstdio>
+#include <cstdlib>
+#include <cwchar>
+#include <argp.h>
+#include <clocale>
+#include <cstdio>
+#include <cstdlib>
+#include <cwchar>
+#include <vector>
+#include "code_point.h"
+#include "html_token.hpp"
+#include "tokenizer.hpp"
+#include <matte/logger.h> // NOTE(review): header name was lost in extraction; confirm against libmatte
+
+using namespace matte;
+
+const char *argp_program_version = "html_parser 0.1";
+const char *argp_program_bug_address = "";
+static char doc[] = "A description of your program.";
+static char args_doc[] = "-f [FILENAME]...";
+static struct argp_option options[] = {
+    { "file", 'f', "FILE", 0, "File to parse."},
+    { 0 }
+};
+
+/// Command-line arguments collected by parse_opt().
+struct Arguments {
+    char* filename = nullptr;
+};
+
+/// argp callback: stores the value of -f/--file and accepts (ignores) positional arguments.
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+    Arguments* a = static_cast<Arguments*>(state->input);
+    switch (key) {
+    case 'f':
+        a->filename = arg;
+        return 0;
+    case ARGP_KEY_ARG:
+        return 0;
+    default:
+        return ARGP_ERR_UNKNOWN;
+    }
+}
+
+static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 };
+
+/// Reads the file given with -f as wide characters and prints every token up to and including EOF.
+int main(int argc, char *argv[]) {
+    std::setlocale(LC_ALL, ""); // fgetwc() decodes with the current locale, so adopt the user's
+
+    Arguments arguments;
+    if (argp_parse(&argp, argc, argv, 0, 0, &arguments) != 0 || arguments.filename == nullptr) {
+        fprintf(stderr, "usage: html_parser -f FILE\n");
+        return EXIT_FAILURE;
+    }
+
+    FILE* file = fopen(arguments.filename, "rb");
+    if (file == nullptr) {
+        perror(arguments.filename);
+        return EXIT_FAILURE;
+    }
+
+    std::vector<code_point_t> buffer; // grows with the input, so a large file cannot overflow it
+    wint_t wc; // wint_t rather than wchar_t, so that WEOF is representable
+    while ((wc = fgetwc(file)) != WEOF) {
+        buffer.push_back(static_cast<code_point_t>(wc));
+    }
+    buffer.push_back(L'\0'); // the tokenizer expects a NUL-terminated buffer
+    fclose(file);
+
+    Tokenizer tokenizer = create(buffer.data());
+    HtmlToken token;
+    do {
+        token = read_next(&tokenizer);
+        token.print();
+    } while (token.type != HtmlTokenType_EOF);
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..30ca6bd
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,288 @@
+#include "tokenizer.hpp"
+#include "code_point.h"
+#include "html_token.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cwchar>
+#include <matte/logger.h> // NOTE(review): header name was lost in extraction; confirm against libmatte
+
+/// Creates a tokenizer over the NUL-terminated buffer `value`.
+/// The buffer is not copied, so it must outlive the returned tokenizer.
+/// A null `value` is treated as empty input.
+Tokenizer create(code_point_t* value) {
+    static code_point_t empty_input[] = { 0 };
+    Tokenizer t;
+    t.ptr = (value != nullptr) ? value : empty_input;
+    // Count code points, not bytes: strlen() on a wide buffer stops at the first zero byte.
+    t.length = std::wcslen(t.ptr);
+    t.state = TokenizerState_Data;
+    return t;
+}
+
+namespace CodePoints {
+    const code_point_t TAB = 0x0009;
+    const code_point_t LF = 0x000A;
+    const code_point_t FF = 0x000C;
+    const code_point_t SPACE = 0x0020;
+    const code_point_t SOLIDUS = 0x002F;
+    const code_point_t LOWERCASE_A = 0x0061;
+    const code_point_t LOWERCASE_Z = 0x007A;
+    const code_point_t LOWERCASE_F = 0x0066;
+    const code_point_t UPPERCASE_A = 0x0041;
+    const code_point_t UPPERCASE_Z = 0x005A;
+    const code_point_t UPPERCASE_F = 0x0046;
+    const code_point_t NULL_CHAR = 0x0000;
+    const code_point_t REPLACEMENT_CHAR = 0xFFFD;
+    const code_point_t GREATER_THAN_SIGN = 0x003E;
+    const code_point_t LESS_THAN_SIGN = 0x003C;
+    const code_point_t AMPERSAND = 0x0026;
+    const code_point_t EXCLAMATION_MARK = 0x0021;
+    const code_point_t NUMBER_SIGN = 0x0023;
+    const code_point_t QUESTION_MARK = 0x003F;
+    const code_point_t LOWERCASE_X = 0x0078;
+    const code_point_t UPPERCASE_X = 0x0058;
+    const code_point_t DIGIT_ZERO = 0x0030;
+    const code_point_t DIGIT_NINE = 0x0039;
+}
+
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+//
+// The input is NUL-terminated, so U+0000 doubles as end-of-input in every state below.
+// No state steps past it: it is reconsumed until the data state turns it into an EOF token.
+
+namespace {
+
+/// Advances to the next code point.
+inline void consume_next(Tokenizer* tokenizer) {
+    tokenizer->ptr++;
+}
+
+/// Keeps the current code point in place so that the next state processes it again.
+inline void reconsume(Tokenizer* tokenizer) {
+    tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
+}
+
+/// Makes `c` the character token returned by the current read_next() call.
+inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
+    tokenizer->last.type = HtmlTokenType_Character;
+    tokenizer->last.character_token = c;
+    tokenizer->flag |= TokenizerFlag_Emit;
+}
+
+inline bool is_ascii_upper(code_point_t c) {
+    return c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
+}
+
+inline bool is_ascii_lower(code_point_t c) {
+    return c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;
+}
+
+/// Maps A-Z onto a-z (an offset of 0x20) and leaves every other code point alone.
+inline code_point_t to_ascii_lower(code_point_t c) {
+    return is_ascii_upper(c) ? static_cast<code_point_t>(c + 0x0020) : c;
+}
+
+/// Starts a tag token of kind `type` whose name begins with the letter `c`.
+inline void begin_tag(Tokenizer* tokenizer, HtmlTokenType type, code_point_t c) {
+    tokenizer->last.type = type;
+    tokenizer->last.append_to_tag_name(to_ascii_lower(c));
+    tokenizer->state = TokenizerState_TagName;
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
+inline void data_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::AMPERSAND:
+        tokenizer->state = TokenizerState_CharacterReferenceInData;
+        break;
+    case CodePoints::LESS_THAN_SIGN:
+        tokenizer->state = TokenizerState_TagOpen;
+        break;
+    case CodePoints::NULL_CHAR:
+        // End of input. Stay on the terminator so that further calls keep returning EOF.
+        tokenizer->last.type = HtmlTokenType_EOF;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        reconsume(tokenizer);
+        break;
+    default:
+        emit_character(tokenizer, c);
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
+inline void tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::EXCLAMATION_MARK:
+        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
+        break;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_EndTagOpen;
+        break;
+    case CodePoints::QUESTION_MARK:
+        // TODO: Parse error
+        tokenizer->state = TokenizerState_BogusComment; // TODO
+        break;
+    default:
+        if (is_ascii_upper(c) || is_ascii_lower(c)) {
+            // The token is not emitted yet; the tag name state finishes it.
+            begin_tag(tokenizer, HtmlTokenType_StartTag, c);
+        }
+        else {
+            // TODO: Parse error
+            // A '<' that does not open a tag is plain text: emit it and let the data
+            // state look at the current code point again.
+            tokenizer->state = TokenizerState_Data;
+            emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
+            reconsume(tokenizer);
+        }
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state
+inline void tag_name_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::FF:
+    case CodePoints::LF:
+    case CodePoints::SPACE:
+        tokenizer->state = TokenizerState_BeforeAttribute;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::SOLIDUS:
+        tokenizer->state = TokenizerState_SelfClosingStartTag;
+        break;
+    case CodePoints::GREATER_THAN_SIGN:
+        tokenizer->state = TokenizerState_Data;
+        tokenizer->flag |= TokenizerFlag_Emit;
+        break;
+    case CodePoints::NULL_CHAR:
+        // TODO: @Error
+        // Input ended inside a tag: drop the unfinished tag and let the data state emit EOF.
+        tokenizer->last.reset();
+        tokenizer->state = TokenizerState_Data;
+        reconsume(tokenizer);
+        break;
+    default:
+        tokenizer->last.append_to_tag_name(to_ascii_lower(c));
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
+inline void end_tag_open_state(Tokenizer* tokenizer) {
+    const code_point_t c = *tokenizer->ptr;
+    if (is_ascii_upper(c) || is_ascii_lower(c)) {
+        begin_tag(tokenizer, HtmlTokenType_EndTag, c);
+    }
+    else if (c == CodePoints::GREATER_THAN_SIGN) {
+        // TODO: @Error We got something like "</>"
+        tokenizer->state = TokenizerState_Data;
+    }
+    else if (c == CodePoints::NULL_CHAR) {
+        // TODO: @Error
+        // Input ended right after "</". Both characters are text, but only one token can be
+        // returned per call: emit '<' now and step back onto the '/' so that the data state
+        // emits it on the next call. Stepping back is safe because this state is only
+        // entered after '<' and '/' have been consumed.
+        tokenizer->state = TokenizerState_Data;
+        emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
+        tokenizer->ptr--;
+        reconsume(tokenizer);
+    }
+    else {
+        // TODO: @Error
+        tokenizer->state = TokenizerState_BogusComment;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+/// TODO: Unfinished and not called yet; only the prefix of a reference is recognised.
+inline void try_consume_character_reference(Tokenizer* tokenizer) {
+    code_point_t c = *tokenizer->ptr;
+    switch (c) {
+    case CodePoints::TAB:
+    case CodePoints::LF:
+    case CodePoints::FF:
+    case CodePoints::SPACE:
+    case CodePoints::LESS_THAN_SIGN:
+    case CodePoints::AMPERSAND:
+    case CodePoints::NULL_CHAR:
+        // TODO: The additional allowed character?
+        // Not a character reference. No characters are consumed, and nothing is returned.
+        // (This is not an error, either.)
+        tokenizer->state = TokenizerState_Data;
+        reconsume(tokenizer);
+        break;
+    case CodePoints::NUMBER_SIGN: {
+        consume_next(tokenizer);
+        c = *tokenizer->ptr;
+        // Hex digits (0-F) follow an 'x' or 'X'; decimal digits (0-9) follow otherwise.
+        const bool is_hex = (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X);
+        if (is_hex) {
+            consume_next(tokenizer);
+        }
+        // TODO: Parse the digits.
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+/// https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-data-state
+inline void character_reference_in_data_state(Tokenizer* tokenizer) {
+    // TODO: Consume a real character reference once try_consume_character_reference is done.
+    // Until then this is the spec's "nothing is returned" case: the '&' is emitted as text
+    // and whatever follows it is tokenized by the data state.
+    tokenizer->state = TokenizerState_Data;
+    emit_character(tokenizer, CodePoints::AMPERSAND);
+    reconsume(tokenizer);
+}
+
+} // namespace
+
+/// Returns the next token, or an HtmlTokenType_EOF token once the input is exhausted.
+/// Every call starts over in the data state. States without a handler are reported through
+/// logger_error() and terminate the process.
+HtmlToken read_next(Tokenizer* tokenizer) {
+    tokenizer->last.reset();
+    tokenizer->state = TokenizerState_Data;
+    do {
+        // Each step consumes the current code point unless the state handler opts out.
+        tokenizer->flag = TokenizerFlag_IncrementPtr;
+
+        switch (tokenizer->state) {
+        case TokenizerState_Data:
+            data_state(tokenizer);
+            break;
+        case TokenizerState_TagOpen:
+            tag_open_state(tokenizer);
+            break;
+        case TokenizerState_TagName:
+            tag_name_state(tokenizer);
+            break;
+        case TokenizerState_EndTagOpen:
+            end_tag_open_state(tokenizer);
+            break;
+        case TokenizerState_CharacterReferenceInData:
+            character_reference_in_data_state(tokenizer);
+            break;
+        default:
+            logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
+            exit(1);
+        }
+
+        // Never walk off the end of the buffer, whatever the state handler asked for.
+        const bool at_end = (*tokenizer->ptr == CodePoints::NULL_CHAR);
+        if ((tokenizer->flag & TokenizerFlag_IncrementPtr) && !at_end) {
+            tokenizer->ptr++;
+        }
+    } while (!(tokenizer->flag & TokenizerFlag_Emit));
+
+    return tokenizer->last;
+}
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
new file mode 100644
index 0000000..4978bfb
--- /dev/null
+++ b/src/tokenizer.hpp
@@ -0,0 +1,47 @@
+#ifndef TOKENIZER_HPP
+#define TOKENIZER_HPP
+
+#include <cstddef>
+#include "code_point.h"
+#include "html_token.hpp"
+
+/// Bit flags that a state handler sets in Tokenizer::flag for the current step of read_next().
+enum TokenizerFlag {
+    TokenizerFlag_None = 0,
+    TokenizerFlag_Emit = 1,         ///< Tokenizer::last is complete; return it to the caller.
+    TokenizerFlag_IncrementPtr = 2  ///< Advance Tokenizer::ptr once the step is done.
+};
+
+/// States of the tokenizer state machine. read_next() currently has handlers for Data,
+/// CharacterReferenceInData, TagOpen, EndTagOpen and TagName only.
+enum TokenizerState {
+    TokenizerState_None,
+    TokenizerState_Data,
+    TokenizerState_CharacterReferenceInData,
+    TokenizerState_TagOpen,
+    TokenizerState_MarkupDeclarationOpen,
+    TokenizerState_EndTagOpen,
+    TokenizerState_TagName,
+    TokenizerState_BogusComment,
+    TokenizerState_CommentState,
+    TokenizerState_BeforeAttribute,
+    TokenizerState_SelfClosingStartTag
+};
+
+/// Cursor over a NUL-terminated buffer of code points plus the token under construction.
+struct Tokenizer {
+    code_point_t* ptr = nullptr;   ///< Current position in the input; the buffer is not owned.
+    size_t length = 0;             ///< Length of the input as measured by create().
+
+    TokenizerState state = TokenizerState_Data;
+    HtmlToken last;                ///< Token being built; returned by read_next() when emitted.
+    int flag = TokenizerFlag_None; ///< Combination of TokenizerFlag values for the current step.
+};
+
+/// Creates a tokenizer positioned at the start of the NUL-terminated buffer.
+Tokenizer create(code_point_t*);
+
+/// Returns the next token; the token type is HtmlTokenType_EOF at the end of the input.
+HtmlToken read_next(Tokenizer*);
+
+#endif
-- 
cgit v1.2.1