summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mattkae <mattkae@protonmail.com> 2023-04-30 11:15:18 -0400
committer mattkae <mattkae@protonmail.com> 2023-04-30 11:15:18 -0400
commit 04206efb102990c2ce0fa26be30db803cdc816be (patch)
tree 7423964d7afeb6d2d25eeb544a0d8396df546e06
parent 4058f9b1704322f8185136c2558c2ab96a4d835c (diff)
Moving over to CMakeList for qcreator support
-rw-r--r-- .gitignore 3
-rw-r--r-- CMakeLists.txt 13
-rw-r--r-- Makefile 45
-rw-r--r-- src/tokenizer.cpp 57
4 files changed, 61 insertions, 57 deletions
diff --git a/.gitignore b/.gitignore
index 704b56a..e926f51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
.cproject
.project
all
-build
\ No newline at end of file
+build
+CMakeLists.txt.user
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a65ed8b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.5)
+
+project(html_parser LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_executable(html_parser src/main.cpp src/html_token.cpp src/tokenizer.cpp)
+
+target_link_libraries(html_parser -lmatte)
+
+install(TARGETS html_parser
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/Makefile b/Makefile
deleted file mode 100644
index e453aa6..0000000
--- a/Makefile
+++ /dev/null
@@ -1,45 +0,0 @@
-TARGET_EXEC ?= html_parser
-
-BUILD_DIR ?= ./build
-SRC_DIRS ?= ./src
-
-CC := g++ -g
-SRCS := $(shell find $(SRC_DIRS) -name *.cpp -or -name *.c -or -name *.s)
-OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
-DEPS := $(OBJS:.o=.d)
-
-INC_DIRS := $(shell find $(SRC_DIRS) -type d)
-INC_FLAGS := $(addprefix -I,$(INC_DIRS))
-
-CPPFLAGS ?= $(INC_FLAGS) -MMD -MP
-
-$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS)
- $(CC) $(OBJS) -o $@ -lmatte
-
-# assembly
-$(BUILD_DIR)/%.s.o: %.s
- $(MKDIR_P) $(dir $@)
- $(AS) $(ASFLAGS) -c $< -o $@
-
-# c source
-$(BUILD_DIR)/%.c.o: %.c
- $(MKDIR_P) $(dir $@)
- $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
-
-# c++ source
-$(BUILD_DIR)/%.cpp.o: %.cpp
- $(MKDIR_P) $(dir $@)
- $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
-
-
-.PHONY: clean
-
-clean:
- $(RM) -r $(BUILD_DIR)
-
-all:
- $(BUILD_DIR)/$(TARGET_EXEC)
-
--include $(DEPS)
-
-MKDIR_P ?= mkdir -p
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 30ca6bd..8cf9b31 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -1,3 +1,4 @@
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
#include "tokenizer.hpp"
#include "code_point.h"
#include "html_token.hpp"
@@ -37,14 +38,30 @@ namespace CodePoints {
const code_point_t UPPERCASE_X = 0x0058;
const code_point_t DIGIT_ZERO = 0x0030;
const code_point_t DIGIT_NINE = 0x0039;
+
+ inline bool is_decimal(code_point_t c) {
+ return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
+ }
+
+ inline bool is_hex(code_point_t c) {
+ return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
+ || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
+ || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
+ }
};
-// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+
+// Helpers
+
inline void consume_next(Tokenizer* tokenizer) {
tokenizer->ptr++;
}
+inline void unconsume_previous(Tokenizer* tokenizer) {
+ tokenizer->ptr--;
+}
+
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
inline void data_state(Tokenizer* tokenizer) {
@@ -202,19 +219,37 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
consume_next(tokenizer);
c = *tokenizer->ptr;
- bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal
- switch (c) {
- case CodePoints::UPPERCASE_X:
- case CodePoints::LOWERCASE_X: {
+ bool none_match_range = false;
+ code_point_t value = 0x0000;
+ if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
consume_next(tokenizer);
- is_hex = true;
- break;
- }
- default: {
- is_hex = false;
- break;
+ c = *tokenizer->ptr;
+
+ if (!CodePoints::is_hex(c)) {
+ unconsume_previous(tokenizer); // X
+ unconsume_previous(tokenizer); // Number sign
+ // TODO: @Error parse error
+ return;
+ }
+
+ while (CodePoints::is_hex(c)) {
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ }
}
+ else {
+ if (!CodePoints::is_decimal(c)) {
+ unconsume_previous(tokenizer); // Number sign
+ // TODO: @Error parse error
+ return;
+ }
+
+ while (CodePoints::is_decimal(c)) {
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ }
}
+
break;
}
}