From 29e03ef74a814cb31a0ae53192e25cc75b638256 Mon Sep 17 00:00:00 2001 From: mattkae Date: Thu, 22 Jun 2023 13:48:11 -0400 Subject: Handling less than signs in plain text --- CMakeLists.txt | 6 ++++++ examples/1.html | 2 ++ src/html_token.cpp | 2 +- src/html_token.hpp | 2 +- src/tokenizer.cpp | 19 +++++++++++++------ src/tokenizer.hpp | 2 +- 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a65ed8b..56e8ff8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) add_executable(html_parser src/main.cpp src/html_token.cpp src/tokenizer.cpp) +if(MSVC) + target_compile_options(html_parser PRIVATE /W4 /WX) +else() + target_compile_options(html_parser PRIVATE -Wall -Wextra -Wpedantic -Wswitch-enum) +endif() + target_link_libraries(html_parser -lmatte) install(TARGETS html_parser diff --git a/examples/1.html b/examples/1.html index 8193bf0..f83a60a 100644 --- a/examples/1.html +++ b/examples/1.html @@ -1,3 +1,5 @@ +Meow is < bark +

Hello World

diff --git a/src/html_token.cpp b/src/html_token.cpp index 8589ba8..1d0952d 100644 --- a/src/html_token.cpp +++ b/src/html_token.cpp @@ -18,7 +18,7 @@ void HtmlToken::print() { break; case HtmlTokenType_StartTag: case HtmlTokenType_EndTag: - logger_info("%s, %s", name, tag_name.c_str()); + logger_info("%s, %S", name, tag_name.c_str()); break; default: logger_info("%s", name); diff --git a/src/html_token.hpp b/src/html_token.hpp index 3d848d9..e691d21 100644 --- a/src/html_token.hpp +++ b/src/html_token.hpp @@ -37,7 +37,7 @@ struct HtmlToken { code_point_t value = 0x0000; if (is_hex) { int multiplier = 1; - for (size_t i = code_entity.size() - 1; i >= 0; i--) { + for (size_t i = code_entity.size() - 1; i > 0; i--) { auto c = code_entity[i]; if (c >= CodePoints::LOWERCASE_A) { // [a, z] c = 10 + c - CodePoints::LOWERCASE_A; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 9931d59..dc0b8d7 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -28,6 +28,12 @@ inline void unconsume_previous(Tokenizer* tokenizer) { tokenizer->ptr--; } +inline void emit_character(Tokenizer* tokenizer, code_point_t c) { + tokenizer->last.type = HtmlTokenType_Character; + tokenizer->last.character_token = c; + tokenizer->flag |= TokenizerFlag_Emit; +} + /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { @@ -45,9 +51,7 @@ inline void data_state(Tokenizer* tokenizer) { break; default: // TODO: @Error If null, throw an error - tokenizer->last.type = HtmlTokenType_Character; - tokenizer->last.character_token = *tokenizer->ptr; - tokenizer->flag |= TokenizerFlag_Emit; + emit_character(tokenizer, *tokenizer->ptr); break; } } @@ -80,8 +84,11 @@ inline void tag_open_state(Tokenizer* tokenizer) { tokenizer->state = TokenizerState_BogusComment; // TODO: } else { + emit_character(tokenizer, CodePoints::LESS_THAN_SIGN); tokenizer->state = TokenizerState_Data; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } + break; } } @@ -114,7 +121,7 @@ inline void tag_name_state(Tokenizer* tokenizer) { else if (c == EOF) { // TODO: @Error tokenizer->state = TokenizerState_Data; - tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else { tokenizer->last.append_to_tag_name(c); @@ -151,7 +158,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) { tokenizer->last.type = HtmlTokenType_Character; tokenizer->last.character_token = CodePoints::SOLIDUS; - tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr; + tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement; } else if (c == CodePoints::NULL_CHAR) { // TODO: @Error @@ -286,7 +293,7 @@ HtmlToken read_next(Tokenizer* tokenizer) { exit(1); } - if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) { + if ((tokenizer->flag & TokenizerFlag_NoIncrement) == 0) { tokenizer->ptr++; } diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 4cd9245..8b844cd 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -7,7 +7,7 @@ enum TokenizerFlag { TokenizerFlag_None = 0, TokenizerFlag_Emit = 1, - TokenizerFlag_DecrementPtr = 2 + TokenizerFlag_NoIncrement = 2 }; enum TokenizerState { -- cgit v1.2.1