summary refs log tree commit diff
diff options
context:
space:
mode:
author mattkae <mattkae@protonmail.com> 2023-06-22 13:48:11 -0400
committer mattkae <mattkae@protonmail.com> 2023-06-22 13:48:11 -0400
commit 29e03ef74a814cb31a0ae53192e25cc75b638256 (patch)
tree e992398b4fba704859653f615ba40aab6e4eee94
parent d53f2e7107cf63669b705c3abf08c129eeb0315e (diff)
Handling less than signs in plain text
-rw-r--r-- CMakeLists.txt 6
-rw-r--r-- examples/1.html 2
-rw-r--r-- src/html_token.cpp 2
-rw-r--r-- src/html_token.hpp 2
-rw-r--r-- src/tokenizer.cpp 19
-rw-r--r-- src/tokenizer.hpp 2
6 files changed, 24 insertions, 9 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a65ed8b..56e8ff8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(html_parser src/main.cpp src/html_token.cpp src/tokenizer.cpp)
+if(MSVC)
+ target_compile_options(html_parser PRIVATE /W4 /WX)
+else()
+ target_compile_options(html_parser PRIVATE -Wall -Wextra -Wpedantic -Wswitch-enum)
+endif()
+
target_link_libraries(html_parser -lmatte)
install(TARGETS html_parser
diff --git a/examples/1.html b/examples/1.html
index 8193bf0..f83a60a 100644
--- a/examples/1.html
+++ b/examples/1.html
@@ -1,3 +1,5 @@
+Meow is < bark
+
<div>
<h1>Hello World</h1>
<p>
diff --git a/src/html_token.cpp b/src/html_token.cpp
index 8589ba8..1d0952d 100644
--- a/src/html_token.cpp
+++ b/src/html_token.cpp
@@ -18,7 +18,7 @@ void HtmlToken::print() {
break;
case HtmlTokenType_StartTag:
case HtmlTokenType_EndTag:
- logger_info("%s, %s", name, tag_name.c_str());
+ logger_info("%s, %S", name, tag_name.c_str());
break;
default:
logger_info("%s", name);
diff --git a/src/html_token.hpp b/src/html_token.hpp
index 3d848d9..e691d21 100644
--- a/src/html_token.hpp
+++ b/src/html_token.hpp
@@ -37,7 +37,7 @@ struct HtmlToken {
code_point_t value = 0x0000;
if (is_hex) {
int multiplier = 1;
- for (size_t i = code_entity.size() - 1; i >= 0; i--) {
+ for (size_t i = code_entity.size() - 1; i > 0; i--) {
auto c = code_entity[i];
if (c >= CodePoints::LOWERCASE_A) { // [a, z]
c = 10 + c - CodePoints::LOWERCASE_A;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 9931d59..dc0b8d7 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -28,6 +28,12 @@ inline void unconsume_previous(Tokenizer* tokenizer) {
tokenizer->ptr--;
}
+inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
+ tokenizer->last.type = HtmlTokenType_Character;
+ tokenizer->last.character_token = c;
+ tokenizer->flag |= TokenizerFlag_Emit;
+}
+
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
inline void data_state(Tokenizer* tokenizer) {
@@ -45,9 +51,7 @@ inline void data_state(Tokenizer* tokenizer) {
break;
default:
// TODO: @Error If null, throw an error
- tokenizer->last.type = HtmlTokenType_Character;
- tokenizer->last.character_token = *tokenizer->ptr;
- tokenizer->flag |= TokenizerFlag_Emit;
+ emit_character(tokenizer, *tokenizer->ptr);
break;
}
}
@@ -80,8 +84,11 @@ inline void tag_open_state(Tokenizer* tokenizer) {
tokenizer->state = TokenizerState_BogusComment; // TODO:
}
else {
+ emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
tokenizer->state = TokenizerState_Data;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
}
+ break;
}
}
@@ -114,7 +121,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
else if (c == EOF) {
// TODO: @Error
tokenizer->state = TokenizerState_Data;
- tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
}
else {
tokenizer->last.append_to_tag_name(c);
@@ -151,7 +158,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
tokenizer->last.type = HtmlTokenType_Character;
tokenizer->last.character_token = CodePoints::SOLIDUS;
- tokenizer->flag = tokenizer->flag & TokenizerFlag_DecrementPtr;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
}
else if (c == CodePoints::NULL_CHAR) {
// TODO: @Error
@@ -286,7 +293,7 @@ HtmlToken read_next(Tokenizer* tokenizer) {
exit(1);
}
- if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) {
+ if ((tokenizer->flag & TokenizerFlag_NoIncrement) == 0) {
tokenizer->ptr++;
}
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
index 4cd9245..8b844cd 100644
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@@ -7,7 +7,7 @@
enum TokenizerFlag {
TokenizerFlag_None = 0,
TokenizerFlag_Emit = 1,
- TokenizerFlag_DecrementPtr = 2
+ TokenizerFlag_NoIncrement = 2
};
enum TokenizerState {