From 04206efb102990c2ce0fa26be30db803cdc816be Mon Sep 17 00:00:00 2001 From: mattkae Date: Sun, 30 Apr 2023 11:15:18 -0400 Subject: Moving over to CMakeList for qcreator support --- src/tokenizer.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 11 deletions(-) (limited to 'src/tokenizer.cpp') diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 30ca6bd..8cf9b31 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1,3 +1,4 @@ +// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization #include "tokenizer.hpp" #include "code_point.h" #include "html_token.hpp" @@ -37,14 +38,30 @@ namespace CodePoints { const code_point_t UPPERCASE_X = 0x0058; const code_point_t DIGIT_ZERO = 0x0030; const code_point_t DIGIT_NINE = 0x0039; + + inline bool is_decimal(code_point_t c) { + return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE; + } + + inline bool is_hex(code_point_t c) { + return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE) + || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F) + || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F); + } }; -// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization + +// Helpers + inline void consume_next(Tokenizer* tokenizer) { tokenizer->ptr++; } +inline void unconsume_previous(Tokenizer* tokenizer) { + tokenizer->ptr--; +} + /// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state inline void data_state(Tokenizer* tokenizer) { @@ -202,19 +219,37 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) { consume_next(tokenizer); c = *tokenizer->ptr; - bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal - switch (c) { - case CodePoints::UPPERCASE_X: - case CodePoints::LOWERCASE_X: { + bool none_match_range = false; + code_point_t value = 0x0000; + if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) { consume_next(tokenizer); - is_hex = true; - break; - } - default: { - is_hex = false; - break; + c = *tokenizer->ptr; + + if (!CodePoints::is_hex(c)) { + unconsume_previous(tokenizer); // X + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return; + } + + while (CodePoints::is_hex(c)) { + consume_next(tokenizer); + c = *tokenizer->ptr; + } } + else { + if (!CodePoints::is_decimal(c)) { + unconsume_previous(tokenizer); // Number sign + // TODO: @Error parse error + return; + } + + while (CodePoints::is_decimal(c)) { + consume_next(tokenizer); + c = *tokenizer->ptr; + } } + break; } } -- cgit v1.2.1