From d53f2e7107cf63669b705c3abf08c129eeb0315e Mon Sep 17 00:00:00 2001 From: mattkae Date: Thu, 22 Jun 2023 13:29:49 -0400 Subject: Parsing most html entities except for the last category --- src/html_token.hpp | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'src/html_token.hpp') diff --git a/src/html_token.hpp b/src/html_token.hpp index ee385ce..3d848d9 100644 --- a/src/html_token.hpp +++ b/src/html_token.hpp @@ -17,14 +17,56 @@ enum HtmlTokenType { struct HtmlToken { HtmlTokenType type; - // TODO: Performance + // TODO: @Performance char character_token; - std::string tag_name; + std::wstring tag_name; + std::wstring code_entity; void append_to_tag_name(code_point_t c) { tag_name += c; } + void append_to_code_entity(code_point_t c) { + code_entity += c; + } + + /// Transforms the code_entity into a usable value. + /// Note that we are assuming that the code_entity is + /// valid at this point in time. + code_point_t code_entity_to_value(bool is_hex) { + code_point_t value = 0x0000; + if (is_hex) { + int multiplier = 1; + for (size_t i = code_entity.size() - 1; i >= 0; i--) { + auto c = code_entity[i]; + if (c >= CodePoints::LOWERCASE_A) { // [a, z] + c = 10 + c - CodePoints::LOWERCASE_A; + } + else if (c >= CodePoints::UPPERCASE_A) { // [A, Z] + c = 10 + c - CodePoints::UPPERCASE_A; + } + else { // [0, 9] + c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9 + } + + // Now we have c in decimal, let's convert it to the final value. + c = c * multiplier; + value += c; + multiplier *= 16; + } + } + else { + int multiplier = 1; + for (int i = code_entity.size() - 1; i >= 0; i--) { + auto c = code_entity[i]; + c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9 + value += c * multiplier; + multiplier *= 10; + } + } + return value; + } + void print(); void reset(); }; -- cgit v1.2.1