From d53f2e7107cf63669b705c3abf08c129eeb0315e Mon Sep 17 00:00:00 2001
From: mattkae
Date: Thu, 22 Jun 2023 13:29:49 -0400
Subject: Parsing most html entities except for the last category

---
 .gitignore         |   4 +-
 README.org         |   6 +++
 examples/1.html    |   2 +-
 src/code_point.h   | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/html_token.cpp |   1 +
 src/html_token.hpp |  46 +++++++++++++++-
 src/main.cpp       |   2 +-
 src/tokenizer.cpp  |  90 +++++++++++++++-----------------
 src/tokenizer.hpp  |   2 +-
 9 files changed, 250 insertions(+), 53 deletions(-)
 create mode 100644 README.org

diff --git a/.gitignore b/.gitignore
index e926f51..b2586d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,6 @@
 .project
 all
 build
-CMakeLists.txt.user
\ No newline at end of file
+CMakeLists.txt.user
+.idea
+cmake-build-debug
\ No newline at end of file
diff --git a/README.org b/README.org
new file mode 100644
index 0000000..aa49b9b
--- /dev/null
+++ b/README.org
@@ -0,0 +1,6 @@
+* HTML Parser
+
+The goal of this project is to write an HTML parser in C++.
+
+** Tokenization
+
diff --git a/examples/1.html b/examples/1.html
index 06f1bd6..8193bf0 100644
--- a/examples/1.html
+++ b/examples/1.html
@@ -1,6 +1,6 @@
 <div>
   <p>Hello World</p>
   <p>
-    I am in a paragraph
+    I am in a paragraph &copy;
   </p>
 </div>
diff --git a/src/code_point.h b/src/code_point.h
index dcfcd32..c039907 100644
--- a/src/code_point.h
+++ b/src/code_point.h
@@ -5,4 +5,154 @@
 typedef wchar_t code_point_t;
 
+namespace CodePoints {
+    struct CodePointMap {
+        code_point_t key;
+        code_point_t value;
+    };
+
+    const code_point_t TAB = 0x0009;
+    const code_point_t LF = 0x000A;
+    const code_point_t FF = 0x00C;
+    const code_point_t SPACE = 0x020;
+    const code_point_t SOLIDUS = 0x02F;
+    const code_point_t LOWERCASE_A = 0x0061;
+    const code_point_t LOWERCASE_Z = 0x007A;
+    const code_point_t LOWERCASE_F = 0x0066;
+    const code_point_t UPPERCASE_A = 0x041;
+    const code_point_t UPPERCASE_Z = 0x05A;
+    const code_point_t UPPERCASE_F = 0x0046;
+    const code_point_t NULL_CHAR = 0x0000;
+    const code_point_t REPLACEMENT_CHAR = 0xFFFD;
+    const code_point_t GREATER_THAN_SIGN = 0x003E;
+    const code_point_t LESS_THAN_SIGN = 0x003C;
+    const code_point_t AMPERSAND = 0x0026;
+    const code_point_t EXCLAMATION_MARK = 0x0021;
+    const code_point_t NUMBER_SIGN = 0x0023;
+    const code_point_t LOWERCASE_X = 0x0078;
+    const code_point_t UPPERCASE_X = 0x0058;
+    const code_point_t DIGIT_ZERO = 0x0030;
+    const code_point_t DIGIT_NINE = 0x0039;
+    const code_point_t SEMICOLON = 0x003B;
+
+    inline bool is_decimal(code_point_t c) {
+        return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
+    }
+
+    inline bool is_hex(code_point_t c) {
+        return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
+            || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
+            || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
+    }
+
+    inline bool in_range(code_point_t c, code_point_t lo, code_point_t hi) {
+        return c >= lo && c <= hi;
+    }
+
+    constexpr CodePointMap CHARACTER_REF_PARSE_ERROR_REPLACEMENT[] = {
+        { 0x00, 0xFFFD },
+        { 0x0D, 0x000D },
+        { 0x80, 0x20AC },
+        { 0x81, 0x0081 },
+        { 0x82, 0x201A },
+        { 0x83, 0x0192 },
+        { 0x84, 0x201E },
+        { 0x85, 0x2026 },
+        { 0x86, 0x2020 },
+        { 0x87, 0x2021 },
+        { 0x88, 0x02C6 },
+        { 0x89, 0x2030 },
+        { 0x8A, 0x0160 },
+        { 0x8B, 0x2039 },
+        { 0x8C, 0x0152 },
+        { 0x8D, 0x008D },
+        { 0x8E, 0x017D },
+        { 0x8F, 0x008F },
+        { 0x90, 0x0090 },
+        { 0x91, 0x2018 },
+        { 0x92, 0x2019 },
+        { 0x93, 0x201C },
+        { 0x94, 0x201D },
+        { 0x95, 0x2022 },
+        { 0x96, 0x2013 },
+        { 0x97, 0x2014 },
+        { 0x98, 0x02DC },
+        { 0x99, 0x2122 },
+        { 0x9A, 0x0161 },
+        { 0x9B, 0x203A },
+        { 0x9C, 0x0153 },
+        { 0x9D, 0x009D },
+        { 0x9E, 0x017E },
+        { 0x9F, 0x0178 }
+    };
+    constexpr int CHARACTER_REF_PARSE_ERROR_REPLACEMENT_LEN = sizeof(CHARACTER_REF_PARSE_ERROR_REPLACEMENT) / sizeof(CodePointMap);
+
+    /**
+     * Attempts to get a character reference from the provided parsed code point value.
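+     * Values in the Windows-1252 remapping table above, surrogates, control characters,
+     * and Unicode noncharacters are reported as parse errors.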
+     * See the following link for validation: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+     * @param c Input character value
+     * @param out Output character reference
+     * @returns true if there wasn't a parse error, otherwise false
+     */
+    inline bool try_get_character_ref(code_point_t c, code_point_t& out) {
+        for (int i = 0; i < CHARACTER_REF_PARSE_ERROR_REPLACEMENT_LEN; i++) {
+            if (CHARACTER_REF_PARSE_ERROR_REPLACEMENT[i].key == c) {
+                out = CHARACTER_REF_PARSE_ERROR_REPLACEMENT[i].value;
+                return false;
+            }
+        }
+
+        if ((c >= 0xD800 && c <= 0xDFFF) || c >= 0x10FFFF) {
+            out = REPLACEMENT_CHAR;
+            return false;
+        }
+
+        if (in_range(c, 0x0001, 0x0008)
+            || in_range(c, 0x000E, 0x001F)
+            || in_range(c, 0x007F, 0x009F)
+            || in_range(c, 0xFDD0, 0xFDEF)
+            || c == 0x000B
+            || c == 0xFFFE
+            || c == 0xFFFF
+            || c == 0x1FFFE
+            || c == 0x1FFFF
+            || c == 0x2FFFE
+            || c == 0x2FFFF
+            || c == 0x3FFFE
+            || c == 0x3FFFF
+            || c == 0x4FFFE
+            || c == 0x4FFFF
+            || c == 0x5FFFE
+            || c == 0x5FFFF
+            || c == 0x6FFFE
+            || c == 0x6FFFF
+            || c == 0x7FFFE
+            || c == 0x7FFFF
+            || c == 0x8FFFE
+            || c == 0x8FFFF
+            || c == 0x9FFFE
+            || c == 0x9FFFF
+            || c == 0xAFFFE
+            || c == 0xAFFFF
+            || c == 0xBFFFE
+            || c == 0xBFFFF
+            || c == 0xCFFFE
+            || c == 0xCFFFF
+            || c == 0xDFFFE
+            || c == 0xDFFFF
+            || c == 0xEFFFE
+            || c == 0xEFFFF
+            || c == 0xFFFFE
+            || c == 0xFFFFF
+            || c == 0x10FFFE
+            || c == 0x10FFFF) {
+            out = c;
+            return false;
+        }
+
+        out = c;
+        return true;
+    }
+};
+
 #endif
diff --git a/src/html_token.cpp b/src/html_token.cpp
index 904e79a..8589ba8 100644
--- a/src/html_token.cpp
+++ b/src/html_token.cpp
@@ -28,4 +28,5 @@ void HtmlToken::print() {
 void HtmlToken::reset() {
     type = HtmlTokenType_None;
     tag_name.clear();
+    code_entity.clear();
 }
diff --git a/src/html_token.hpp b/src/html_token.hpp
index ee385ce..3d848d9 100644
--- a/src/html_token.hpp
+++ b/src/html_token.hpp
@@ -17,14 +17,56 @@ enum HtmlTokenType {
 struct HtmlToken {
     HtmlTokenType type;
 
-    // TODO: Performance
+    // TODO: @Performance
     char character_token;
-    std::string tag_name;
+    std::wstring tag_name;
+    std::wstring code_entity;
 
     void append_to_tag_name(code_point_t c) {
         tag_name += c;
     }
 
+    void append_to_code_entity(code_point_t c) {
+        code_entity += c;
+    }
+
+    /// Transforms the code_entity into a usable value.
+    /// Note that we are assuming that the code_entity is
+    /// valid at this point in time.
+    code_point_t code_entity_to_value(bool is_hex) {
+        code_point_t value = 0x0000;
+        if (is_hex) {
+            int multiplier = 1;
+            for (int i = code_entity.size() - 1; i >= 0; i--) {
+                auto c = code_entity[i];
+                if (c >= CodePoints::LOWERCASE_A) { // [a, z]
+                    c = 10 + c - CodePoints::LOWERCASE_A;
+                }
+                else if (c >= CodePoints::UPPERCASE_A) { // [A, Z]
+                    c = 10 + c - CodePoints::UPPERCASE_A;
+                }
+                else { // [0, 9]
+                    c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9
+                }
+
+                // Now we have c in decimal, let's convert it to the final value.
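+                // e.g. for "A9": value = 9 * 1 + 10 * 16 = 169, the code point for U+00A9 (©).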
+                c = c * multiplier;
+                value += c;
+                multiplier *= 16;
+            }
+        }
+        else {
+            int multiplier = 1;
+            for (int i = code_entity.size() - 1; i >= 0; i--) {
+                auto c = code_entity[i];
+                c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9
+                value += c * multiplier;
+                multiplier *= 10;
+            }
+        }
+        return value;
+    }
+
     void print();
     void reset();
 };
diff --git a/src/main.cpp b/src/main.cpp
index c92321c..b914682 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -68,7 +68,7 @@ int main(int argc, char *argv[]) {
     fclose(file);
 
     Tokenizer tokenizer = create(buffer);
-    while (true ) {
+    while (true) {
         auto token = read_next(&tokenizer);
         token.print();
         if (token.type == HtmlTokenType_EOF) {
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 8cf9b31..9931d59 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -15,49 +15,15 @@ Tokenizer create(code_point_t* value) {
     return t;
 }
 
-namespace CodePoints {
-    const code_point_t TAB = 0x0009;
-    const code_point_t LF = 0x000A;
-    const code_point_t FF = 0x00C;
-    const code_point_t SPACE = 0x020;
-    const code_point_t SOLIDUS = 0x02F;
-    const code_point_t LOWERCASE_A = 0x0061;
-    const code_point_t LOWERCASE_Z = 0x007A;
-    const code_point_t LOWERCASE_F = 0x0066;
-    const code_point_t UPPERCASE_A = 0x041;
-    const code_point_t UPPERCASE_Z = 0x05A;
-    const code_point_t UPPERCASE_F = 0x0046;
-    const code_point_t NULL_CHAR = 0x0000;
-    const code_point_t REPLACEMENT_CHAR = 0xFFFD;
-    const code_point_t GREATER_THAN_SIGN = 0x003E;
-    const code_point_t LESS_THAN_SIGN = 0x003C;
-    const code_point_t AMPERSAND = 0x0026;
-    const code_point_t EXCLAMATION_MARK = 0x0021;
-    const code_point_t NUMBER_SIGN = 0x0023;
-    const code_point_t LOWERCASE_X = 0x0078;
-    const code_point_t UPPERCASE_X = 0x0058;
-    const code_point_t DIGIT_ZERO = 0x0030;
-    const code_point_t DIGIT_NINE = 0x0039;
-
-    inline bool is_decimal(code_point_t c) {
-        return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
-    }
-
-    inline bool is_hex(code_point_t c) {
-        return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
-            || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
-            || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
-    }
-};
-
 // Helpers
-
+/// Consumes the next token by incrementing the ptr.
 inline void consume_next(Tokenizer* tokenizer) {
     tokenizer->ptr++;
 }
 
+/// Unconsumes the previous token by decrementing the ptr.
 inline void unconsume_previous(Tokenizer* tokenizer) {
     tokenizer->ptr--;
 }
@@ -148,7 +114,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
     else if (c == EOF) {
         // TODO: @Error
         tokenizer->state = TokenizerState_Data;
-        tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_DecrementPtr;
     }
     else {
         tokenizer->last.append_to_tag_name(c);
@@ -185,7 +151,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
 
         tokenizer->last.type = HtmlTokenType_Character;
         tokenizer->last.character_token = CodePoints::SOLIDUS;
-        tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+        tokenizer->flag = tokenizer->flag | TokenizerFlag_DecrementPtr;
     }
     else if (c == CodePoints::NULL_CHAR) {
         // TODO: @Error
@@ -198,7 +164,7 @@
 }
 
 /// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
-inline void try_consume_character_reference(Tokenizer* tokenizer) {
+/// Attempts to consume a character reference from the current tokenizer. If one cannot
+/// be consumed, false is returned, otherwise true.
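+/// Only numeric references (&#nnn; and &#xhhh;) are handled so far; named references
+/// fall through to the default case below.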
+inline bool try_consume_character_reference(Tokenizer* tokenizer) {
     code_point_t c = *tokenizer->ptr;
 
     switch (c) {
@@ -212,16 +180,16 @@
             // TODO: The additional allowed character?
             // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
             tokenizer->state = TokenizerState_Data;
-            tokenizer->flag &= TokenizerFlag_IncrementPtr;
-            break;
+            return true;
         }
         case CodePoints::NUMBER_SIGN: {
             consume_next(tokenizer);
             c = *tokenizer->ptr;
 
-            bool none_match_range = false;
+            bool is_hex_value = false;
             code_point_t value = 0x0000;
             if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
+                is_hex_value = true;
                 consume_next(tokenizer);
                 c = *tokenizer->ptr;
 
@@ -229,10 +197,12 @@
                     unconsume_previous(tokenizer); // X
                     unconsume_previous(tokenizer); // Number sign
                     // TODO: @Error parse error
-                    return;
+                    return false;
                 }
+
                 while (CodePoints::is_hex(c)) {
+                    tokenizer->last.append_to_code_entity(c);
                     consume_next(tokenizer);
                     c = *tokenizer->ptr;
                 }
             }
@@ -241,16 +211,41 @@
                 if (!CodePoints::is_decimal(c)) {
                     unconsume_previous(tokenizer); // Number sign
                     // TODO: @Error parse error
-                    return;
+                    return false;
                 }
 
                 while (CodePoints::is_decimal(c)) {
+                    tokenizer->last.append_to_code_entity(c);
                     consume_next(tokenizer);
                     c = *tokenizer->ptr;
                 }
             }
 
-            break;
+            // We should have the hex value now.
+            if (c != CodePoints::SEMICOLON) {
+                // TODO: @Error parse error
+                return false;
+            }
+
+            consume_next(tokenizer);
+            c = *tokenizer->ptr;
+            auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
+            printf("%d\n", code_entity);
+
+            auto is_parse_error = !CodePoints::try_get_character_ref(code_entity, code_entity);
+            if (is_parse_error) {
+                // TODO: @Error
+                return false;
+            }
+
+            return true;
         }
+        default: {
+            // TODO: Tedious work lies ahead.
+            // Otherwise try and find the string by name in this table
+            // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
+            logger_error("Unsupported character reference");
+            return false;
+        }
     }
 }
@@ -259,6 +254,7 @@
 inline void character_reference_in_data_state(Tokenizer* tokenizer) {
     // A character reference begins with an ampersand
     code_point_t c = *tokenizer->ptr;
+    try_consume_character_reference(tokenizer);
 }
 
 HtmlToken read_next(Tokenizer* tokenizer) {
@@ -267,7 +263,7 @@
     tokenizer->state = TokenizerState_Data;
     do {
         // Reset all flags, except for IncrementPtr
-        tokenizer->flag = 0 | TokenizerFlag_IncrementPtr;
+        tokenizer->flag = 0;
 
         switch (tokenizer->state) {
             case TokenizerState_Data:
@@ -290,7 +286,7 @@
                 exit(1);
         }
 
-        if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
+        if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) {
             tokenizer->ptr++;
         }
 
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
index 4978bfb..4cd9245 100644
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@@ -7,7 +7,7 @@
 enum TokenizerFlag {
     TokenizerFlag_None = 0,
     TokenizerFlag_Emit = 1,
-    TokenizerFlag_IncrementPtr = 2
+    TokenizerFlag_DecrementPtr = 2
 };
 
 enum TokenizerState {
-- 
cgit v1.2.1
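For context (not part of the patch): a minimal sketch of how the pieces above fit together for a numeric character reference such as &#xA9;. The tokenizer buffers the hex digits into code_entity, code_entity_to_value reduces them to a code point, and CodePoints::try_get_character_ref validates the result. The standalone driver below is illustrative only; the main function, the hard-coded digits, and the include path are assumptions, not part of the project.

#include <cstdio>
#include "code_point.h" // assumes src/ is on the include path

int main() {
    // The digits the tokenizer would have buffered between "&#x" and ";".
    const wchar_t digits[] = { L'A', L'9' };

    // Equivalent to the accumulation code_entity_to_value performs for hex references.
    code_point_t value = 0;
    for (wchar_t d : digits) {
        int nibble = (d >= L'a') ? (d - L'a' + 10)
                   : (d >= L'A') ? (d - L'A' + 10)
                   : (d - L'0');
        value = value * 16 + nibble;
    }

    code_point_t out;
    bool ok = CodePoints::try_get_character_ref(value, out);
    printf("0x%X %s\n", (unsigned) out, ok ? "ok" : "parse error"); // prints: 0xA9 ok
    return 0;
}

The named references that the commit subject leaves for later (for example the &copy; added to examples/1.html) are the remaining category: they need the lookup table linked from the default case of try_consume_character_reference.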