From 04206efb102990c2ce0fa26be30db803cdc816be Mon Sep 17 00:00:00 2001 From: mattkae Date: Sun, 30 Apr 2023 11:15:18 -0400 Subject: Moving over to CMakeList for qcreator support --- .gitignore | 3 ++- CMakeLists.txt | 13 +++++++++++++ Makefile | 45 ------------------------------------------- src/tokenizer.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 61 insertions(+), 57 deletions(-) create mode 100644 CMakeLists.txt delete mode 100644 Makefile diff --git a/.gitignore b/.gitignore index 704b56a..e926f51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .cproject .project all -build \ No newline at end of file +build +CMakeLists.txt.user \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a65ed8b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.5) + +project(html_parser LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_executable(html_parser src/main.cpp src/html_token.cpp src/tokenizer.cpp) + +target_link_libraries(html_parser -lmatte) + +install(TARGETS html_parser + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/Makefile b/Makefile deleted file mode 100644 index e453aa6..0000000 --- a/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -TARGET_EXEC ?= html_parser - -BUILD_DIR ?= ./build -SRC_DIRS ?= ./src - -CC := g++ -g -SRCS := $(shell find $(SRC_DIRS) -name *.cpp -or -name *.c -or -name *.s) -OBJS := $(SRCS:%=$(BUILD_DIR)/%.o) -DEPS := $(OBJS:.o=.d) - -INC_DIRS := $(shell find $(SRC_DIRS) -type d) -INC_FLAGS := $(addprefix -I,$(INC_DIRS)) - -CPPFLAGS ?= $(INC_FLAGS) -MMD -MP - -$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) - $(CC) $(OBJS) -o $@ -lmatte - -# assembly -$(BUILD_DIR)/%.s.o: %.s - $(MKDIR_P) $(dir $@) - $(AS) $(ASFLAGS) -c $< -o $@ - -# c source -$(BUILD_DIR)/%.c.o: %.c - $(MKDIR_P) $(dir $@) - $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@ - -# c++ source -$(BUILD_DIR)/%.cpp.o: %.cpp - $(MKDIR_P) $(dir $@) - $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ - - -.PHONY: clean - -clean: - $(RM) -r $(BUILD_DIR) - -all: - $(BUILD_DIR)/$(TARGET_EXEC) - --include $(DEPS) - -MKDIR_P ?= mkdir -p diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 30ca6bd..8cf9b31 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1,3 +1,4 @@ +// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization #include "tokenizer.hpp" #include "code_point.h" #include "html_token.hpp" @@ -37,14 +38,30 @@ namespace CodePoints { const code_point_t UPPERCASE_X = 0x0058; const code_point_t DIGIT_ZERO = 0x0030; const code_point_t DIGIT_NINE = 0x0039; + + inline bool is_decimal(code_point_t c) { + return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE; + } + + inline bool is_hex(code_point_t c) { + return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE) + || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F) + || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F); + } }; -// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization + +// Helpers + inline void consume_next(Tokenizer* tokenizer) { tokenizer->ptr++; } +inline void unconsume_previous(Tokenizer* tokenizer) { + tokenizer->ptr--; +} + /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { @@ -202,19 +219,37 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { consume_next(tokenizer); c = *tokenizer->ptr; - bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal - switch (c) { - case CodePoints::UPPERCASE_X: - case CodePoints::LOWERCASE_X: { + bool none_match_range = false; + code_point_t value = 0x0000; + if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { consume_next(tokenizer); - is_hex = true; - break; - } - default: { - is_hex = false; - break; + c = *tokenizer->ptr; + + if (!CodePoints::is_hex(c)) { + unconsume_previous(tokenizer); // X + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return; + } + + while (CodePoints::is_hex(c)) { + consume_next(tokenizer); + c = *tokenizer->ptr; + } } + else { + if (!CodePoints::is_decimal(c)) { + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return; + } + + while (CodePoints::is_decimal(c)) { + consume_next(tokenizer); + c = *tokenizer->ptr; + } } + break; } } -- cgit v1.2.1