author     mattkae <mattkae@protonmail.com>  2023-06-22 13:29:49 -0400
committer  mattkae <mattkae@protonmail.com>  2023-06-22 13:29:49 -0400
commit     d53f2e7107cf63669b705c3abf08c129eeb0315e (patch)
tree       5dadce890576c23e462b5e3dde69542868efe912
parent     04206efb102990c2ce0fa26be30db803cdc816be (diff)
Parsing most HTML entities except for the last category (named character references)
-rw-r--r--  .gitignore          4
-rw-r--r--  README.org          6
-rw-r--r--  examples/1.html     2
-rw-r--r--  src/code_point.h    150
-rw-r--r--  src/html_token.cpp  1
-rw-r--r--  src/html_token.hpp  46
-rw-r--r--  src/main.cpp        2
-rw-r--r--  src/tokenizer.cpp   90
-rw-r--r--  src/tokenizer.hpp   2
9 files changed, 250 insertions, 53 deletions
diff --git a/.gitignore b/.gitignore
index e926f51..b2586d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,6 @@
.project
all
build
-CMakeLists.txt.user
\ No newline at end of file
+CMakeLists.txt.user
+.idea
+cmake-build-debug
\ No newline at end of file
diff --git a/README.org b/README.org
new file mode 100644
index 0000000..aa49b9b
--- /dev/null
+++ b/README.org
@@ -0,0 +1,6 @@
+* HTML Parser
+
+The goal of this project is to write an HTML parser in C++.
+
+** Tokenization
+
diff --git a/examples/1.html b/examples/1.html
index 06f1bd6..8193bf0 100644
--- a/examples/1.html
+++ b/examples/1.html
@@ -1,6 +1,6 @@
<div>
<h1>Hello World</h1>
<p>
- I am in a paragraph
+ I am in a paragraph &#169;
</p>
</div>
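
The example file now exercises a decimal numeric character reference: per the tokenization spec linked in this patch, &#169; names code point U+00A9 (the copyright sign), the same value as the hex form &#xA9;. A minimal standalone check of that equivalence (illustration only, not part of the patch):

#include <cassert>
#include <string>

int main() {
    // "169" is the digit run between "&#" and ";" in examples/1.html.
    std::string digits = "169";
    int decoded = std::stoi(digits, nullptr, 10); // decimal numeric reference
    assert(decoded == 0x00A9);                    // same code point as &#xA9;
    return 0;
}
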
diff --git a/src/code_point.h b/src/code_point.h
index dcfcd32..c039907 100644
--- a/src/code_point.h
+++ b/src/code_point.h
@@ -5,4 +5,154 @@
typedef wchar_t code_point_t;
+namespace CodePoints {
+ struct CodePointMap {
+ code_point_t key;
+ code_point_t value;
+ };
+
+ const code_point_t TAB = 0x0009;
+ const code_point_t LF = 0x000A;
+ const code_point_t FF = 0x00C;
+ const code_point_t SPACE = 0x020;
+ const code_point_t SOLIDUS = 0x02F;
+ const code_point_t LOWERCASE_A = 0x0061;
+ const code_point_t LOWERCASE_Z = 0x007A;
+ const code_point_t LOWERCASE_F = 0x0066;
+ const code_point_t UPPERCASE_A = 0x041;
+ const code_point_t UPPERCASE_Z = 0x05A;
+ const code_point_t UPPERCASE_F = 0x0046;
+ const code_point_t NULL_CHAR = 0x0000;
+ const code_point_t REPLACEMENT_CHAR = 0xFFFD;
+ const code_point_t GREATER_THAN_SIGN = 0x003E;
+ const code_point_t LESS_THAN_SIGN = 0x003C;
+ const code_point_t AMPERSAND = 0x0026;
+ const code_point_t EXCLAMATION_MARK = 0x0021;
+ const code_point_t NUMBER_SIGN = 0x0023;
+ const code_point_t LOWERCASE_X = 0x0078;
+ const code_point_t UPPERCASE_X = 0x0058;
+ const code_point_t DIGIT_ZERO = 0x0030;
+ const code_point_t DIGIT_NINE = 0x0039;
+ const code_point_t SEMICOLON = 0x003B;
+
+ inline bool is_decimal(code_point_t c) {
+ return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
+ }
+
+ inline bool is_hex(code_point_t c) {
+ return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
+ || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
+ || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
+ }
+
+ inline bool in_range(code_point_t c, code_point_t lo, code_point_t hi) {
+ return c >= lo && c <= hi;
+ }
+
+ constexpr CodePointMap CHARACTER_REF_PARSE_ERROR_REPLACEMENT[] = {
+ { 0x00, 0xFFFD },
+ { 0x0D, 0x000D },
+ { 0x80, 0x20AC },
+ { 0x81, 0x0081 },
+ { 0x82, 0x201A },
+ { 0x83, 0x0192 },
+ { 0x84, 0x201E },
+ { 0x85, 0x2026 },
+ { 0x86, 0x2020 },
+ { 0x87, 0x2021 },
+ { 0x88, 0x02C6 },
+ { 0x89, 0x2030 },
+ { 0x8A, 0x0160 },
+ { 0x8B, 0x2039 },
+ { 0x8C, 0x0152 },
+ { 0x8D, 0x008D },
+ { 0x8E, 0x017D },
+ { 0x8F, 0x008F },
+ { 0x90, 0x0090 },
+ { 0x91, 0x2018 },
+ { 0x92, 0x2019 },
+ { 0x93, 0x201C },
+ { 0x94, 0x201D },
+ { 0x95, 0x2022 },
+ { 0x96, 0x2013 },
+ { 0x97, 0x2014 },
+ { 0x98, 0x02DC },
+ { 0x99, 0x2122 },
+ { 0x9A, 0x0161 },
+ { 0x9B, 0x203A },
+ { 0x9C, 0x0153 },
+ { 0x9D, 0x009D },
+ { 0x9E, 0x017E },
+ { 0x9F, 0x0178 }
+ };
+ constexpr int CHARACTER_REF_PARSE_ERROR_REPLACEMENT_LEN = sizeof(CHARACTER_REF_PARSE_ERROR_REPLACEMENT) / sizeof(CodePointMap);
+
+ /**
+ * Attempts to get a character reference from the provided parsed code point value.
+ * See the following link for validation: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+ * @param c Input character value
+ * @param out Output character reference
+ * @returns true if there wasn't a parse error, otherwise false
+ */
+ inline bool try_get_character_ref(code_point_t c, code_point_t& out) {
+ for (int i = 0; i < CHARACTER_REF_PARSE_ERROR_REPLACEMENT_LEN; i++) {
+ if (CHARACTER_REF_PARSE_ERROR_REPLACEMENT[i].key == c) {
+ out = CHARACTER_REF_PARSE_ERROR_REPLACEMENT[i].value;
+ return false;
+ }
+ }
+
+ if ((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) { // surrogate or beyond U+10FFFF
+ out = REPLACEMENT_CHAR;
+ return false;
+ }
+
+ if (in_range(c, 0x0001, 0x0008)
+ || in_range(c, 0x000E, 0x001F)
+ || in_range(c, 0x007F, 0x009F)
+ || in_range(c, 0xFDD0, 0xFDEF)
+ || c == 0x000B
+ || c == 0xFFFE
+ || c == 0xFFFF
+ || c == 0x1FFFE
+ || c == 0x1FFFF
+ || c == 0x2FFFE
+ || c == 0x2FFFF
+ || c == 0x3FFFE
+ || c == 0x3FFFF
+ || c == 0x4FFFE
+ || c == 0x4FFFF
+ || c == 0x5FFFE
+ || c == 0x5FFFF
+ || c == 0x6FFFE
+ || c == 0x6FFFF
+ || c == 0x7FFFE
+ || c == 0x7FFFF
+ || c == 0x8FFFE
+ || c == 0x8FFFF
+ || c == 0x9FFFE
+ || c == 0x9FFFF
+ || c == 0xAFFFE
+ || c == 0xAFFFF
+ || c == 0xBFFFE
+ || c == 0xBFFFF
+ || c == 0xCFFFE
+ || c == 0xCFFFF
+ || c == 0xDFFFE
+ || c == 0xDFFFF
+ || c == 0xEFFFE
+ || c == 0xEFFFF
+ || c == 0xFFFFE
+ || c == 0xFFFFF
+ || c == 0x10FFFE
+ || c == 0x10FFFF) {
+ out = c;
+ return false;
+ }
+
+ out = c;
+ return true;
+ }
+};
+
#endif
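
try_get_character_ref implements the validation table and ranges from the linked spec section: code points in the 0x80 to 0x9F block are remapped (for example 0x80 becomes U+20AC), surrogates and out-of-range values collapse to U+FFFD, and everything else passes through, with the return value signalling whether a parse error occurred. A minimal usage sketch against this header (illustration only; the relative include path is an assumption):

#include <cassert>
#include "code_point.h"

int main() {
    code_point_t out;

    // 0x80 is in the remapping table: parse error, replacement is the Euro sign.
    assert(!CodePoints::try_get_character_ref(0x0080, out));
    assert(out == 0x20AC);

    // A plain valid code point passes through unchanged, with no parse error.
    assert(CodePoints::try_get_character_ref(0x00A9, out));
    assert(out == 0x00A9);
    return 0;
}
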
diff --git a/src/html_token.cpp b/src/html_token.cpp
index 904e79a..8589ba8 100644
--- a/src/html_token.cpp
+++ b/src/html_token.cpp
@@ -28,4 +28,5 @@ void HtmlToken::print() {
void HtmlToken::reset() {
type = HtmlTokenType_None;
tag_name.clear();
+ code_entity.clear();
}
diff --git a/src/html_token.hpp b/src/html_token.hpp
index ee385ce..3d848d9 100644
--- a/src/html_token.hpp
+++ b/src/html_token.hpp
@@ -17,14 +17,56 @@ enum HtmlTokenType {
struct HtmlToken {
HtmlTokenType type;
- // TODO: Performance
+ // TODO: @Performance
char character_token;
- std::string tag_name;
+ std::wstring tag_name;
+ std::wstring code_entity;
void append_to_tag_name(code_point_t c) {
tag_name += c;
}
+ void append_to_code_entity(code_point_t c) {
+ code_entity += c;
+ }
+
+ /// Transforms the code_entity digit string into its numeric code point value.
+ /// Note: we assume the code_entity is valid at this point.
+ code_point_t code_entity_to_value(bool is_hex) {
+ code_point_t value = 0x0000;
+ if (is_hex) {
+ int multiplier = 1;
+ for (int i = (int) code_entity.size() - 1; i >= 0; i--) {
+ auto c = code_entity[i];
+ if (c >= CodePoints::LOWERCASE_A) { // [a, f]
+ c = 10 + c - CodePoints::LOWERCASE_A;
+ }
+ else if (c >= CodePoints::UPPERCASE_A) { // [A, F]
+ c = 10 + c - CodePoints::UPPERCASE_A;
+ }
+ else { // [0, 9]
+ c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9
+ }
+
+ // c now holds the digit's numeric value; weight it by its position.
+ c = c * multiplier;
+ value += c;
+ multiplier *= 16;
+ }
+ }
+ else {
+ int multiplier = 1;
+ for (int i = (int) code_entity.size() - 1; i >= 0; i--) {
+ auto c = code_entity[i];
+ c = c - CodePoints::DIGIT_ZERO; // Now it is between 0 and 9
+ value += c * multiplier;
+ multiplier *= 10;
+ }
+ }
+ return value;
+ }
+
void print();
void reset();
};
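
code_entity_to_value walks the stored digits right to left, weighting each by an increasing power of the base, so the decimal entity 169 accumulates as 9*1 + 6*10 + 1*100 = 169 and the hex entity A9 as 9*1 + 10*16 = 169. A small sketch of that arithmetic against the struct above (illustration only; it relies only on the inline members, not on html_token.cpp):

#include <cassert>
#include "html_token.hpp"

int main() {
    HtmlToken token;

    // Decimal form "&#169;": append the digits as they would be consumed.
    token.code_entity.clear();
    token.append_to_code_entity(L'1');
    token.append_to_code_entity(L'6');
    token.append_to_code_entity(L'9');
    assert(token.code_entity_to_value(false) == 169);

    // Hex form "&#xA9;": same code point, base 16.
    token.code_entity.clear();
    token.append_to_code_entity(L'A');
    token.append_to_code_entity(L'9');
    assert(token.code_entity_to_value(true) == 169);
    return 0;
}
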
diff --git a/src/main.cpp b/src/main.cpp
index c92321c..b914682 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -68,7 +68,7 @@ int main(int argc, char *argv[]) {
fclose(file);
Tokenizer tokenizer = create(buffer);
- while (true ) {
+ while (true) {
auto token = read_next(&tokenizer);
token.print();
if (token.type == HtmlTokenType_EOF) {
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 8cf9b31..9931d59 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -15,49 +15,15 @@ Tokenizer create(code_point_t* value) {
return t;
}
-namespace CodePoints {
- const code_point_t TAB = 0x0009;
- const code_point_t LF = 0x000A;
- const code_point_t FF = 0x00C;
- const code_point_t SPACE = 0x020;
- const code_point_t SOLIDUS = 0x02F;
- const code_point_t LOWERCASE_A = 0x0061;
- const code_point_t LOWERCASE_Z = 0x007A;
- const code_point_t LOWERCASE_F = 0x0066;
- const code_point_t UPPERCASE_A = 0x041;
- const code_point_t UPPERCASE_Z = 0x05A;
- const code_point_t UPPERCASE_F = 0x0046;
- const code_point_t NULL_CHAR = 0x0000;
- const code_point_t REPLACEMENT_CHAR = 0xFFFD;
- const code_point_t GREATER_THAN_SIGN = 0x003E;
- const code_point_t LESS_THAN_SIGN = 0x003C;
- const code_point_t AMPERSAND = 0x0026;
- const code_point_t EXCLAMATION_MARK = 0x0021;
- const code_point_t NUMBER_SIGN = 0x0023;
- const code_point_t LOWERCASE_X = 0x0078;
- const code_point_t UPPERCASE_X = 0x0058;
- const code_point_t DIGIT_ZERO = 0x0030;
- const code_point_t DIGIT_NINE = 0x0039;
-
- inline bool is_decimal(code_point_t c) {
- return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
- }
-
- inline bool is_hex(code_point_t c) {
- return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE)
- || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F)
- || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F);
- }
-};
-
// Helpers
-
+/// Consumes the next code point by incrementing the ptr.
inline void consume_next(Tokenizer* tokenizer) {
tokenizer->ptr++;
}
+/// Unconsumes the previously consumed code point by decrementing the ptr.
inline void unconsume_previous(Tokenizer* tokenizer) {
tokenizer->ptr--;
}
@@ -148,7 +114,7 @@ inline void tag_name_state(Tokenizer* tokenizer) {
else if (c == EOF) {
// TODO: @Error
tokenizer->state = TokenizerState_Data;
- tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_DecrementPtr;
}
else {
tokenizer->last.append_to_tag_name(c);
@@ -185,7 +151,7 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
tokenizer->last.type = HtmlTokenType_Character;
tokenizer->last.character_token = CodePoints::SOLIDUS;
- tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
+ tokenizer->flag = tokenizer->flag | TokenizerFlag_DecrementPtr;
}
else if (c == CodePoints::NULL_CHAR) {
// TODO: @Error
@@ -198,7 +164,9 @@ inline void end_tag_open_state(Tokenizer* tokenizer) {
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
-inline void try_consume_character_reference(Tokenizer* tokenizer) {
+/// Attempts to consume a character reference at the current tokenizer position.
+/// Returns false if a parse error was encountered while consuming, otherwise true.
+inline bool try_consume_character_reference(Tokenizer* tokenizer) {
code_point_t c = *tokenizer->ptr;
switch (c) {
@@ -212,16 +180,16 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
// TODO: The additional allowed character?
// Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
tokenizer->state = TokenizerState_Data;
- tokenizer->flag &= TokenizerFlag_IncrementPtr;
- break;
+ return true;
}
case CodePoints::NUMBER_SIGN: {
consume_next(tokenizer);
c = *tokenizer->ptr;
- bool none_match_range = false;
+ bool is_hex_value = false;
code_point_t value = 0x0000;
if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
+ is_hex_value = true;
consume_next(tokenizer);
c = *tokenizer->ptr;
@@ -229,10 +197,12 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
unconsume_previous(tokenizer); // X
unconsume_previous(tokenizer); // Number sign
// TODO: @Error parse error
- return;
+ return false;
}
+
while (CodePoints::is_hex(c)) {
+ tokenizer->last.append_to_code_entity(c);
consume_next(tokenizer);
c = *tokenizer->ptr;
}
@@ -241,16 +211,41 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
if (!CodePoints::is_decimal(c)) {
unconsume_previous(tokenizer); // Number sign
// TODO: @Error parse error
- return;
+ return false;
}
while (CodePoints::is_decimal(c)) {
+ tokenizer->last.append_to_code_entity(c);
consume_next(tokenizer);
c = *tokenizer->ptr;
}
}
- break;
+ // All digits have been consumed; a semicolon must terminate the reference.
+ if (c != CodePoints::SEMICOLON) {
+ // TODO: @Error parse error
+ return false;
+ }
+
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
+ printf("%d\n", code_entity);
+
+ auto is_parse_error = !CodePoints::try_get_character_ref(code_entity, code_entity);
+ if (is_parse_error) {
+ // TODO: @Error
+ return false;
+ }
+
+ return true;
+ }
+ default: {
+ // TODO: Tedious work lies ahead.
+ // Otherwise try and find the string by name in this table
+ // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
+ logger_error("Unsupported character reference");
+ return false;
}
}
}
@@ -259,6 +254,7 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
// A character reference begins with an ampersand
code_point_t c = *tokenizer->ptr;
+ try_consume_character_reference(tokenizer);
}
HtmlToken read_next(Tokenizer* tokenizer) {
@@ -267,7 +263,7 @@ HtmlToken read_next(Tokenizer* tokenizer) {
tokenizer->state = TokenizerState_Data;
do {
- // Reset all flags, except for IncrementPtr
- tokenizer->flag = 0 | TokenizerFlag_IncrementPtr;
+ // Reset all flags; a state sets TokenizerFlag_DecrementPtr when the ptr should not advance.
+ tokenizer->flag = 0;
switch (tokenizer->state) {
case TokenizerState_Data:
@@ -290,7 +286,7 @@ HtmlToken read_next(Tokenizer* tokenizer) {
exit(1);
}
- if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
+ if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) {
tokenizer->ptr++;
}
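
This patch inverts the pointer-advance flag: read_next used to advance only when TokenizerFlag_IncrementPtr was set, and now it advances unless a state sets TokenizerFlag_DecrementPtr (see the tokenizer.hpp change below). A self-contained sketch of that convention (illustration only; the enum mirrors the header, the rest is a stand-in for the read loop):

#include <cstdio>

enum TokenizerFlag {
    TokenizerFlag_None = 0,
    TokenizerFlag_Emit = 1,
    TokenizerFlag_DecrementPtr = 2
};

int main() {
    int flag = 0;      // reset at the top of every read_next iteration
    int position = 0;  // stand-in for tokenizer->ptr

    // Default: no state asked to hold, so the code point is consumed.
    if ((flag & TokenizerFlag_DecrementPtr) == 0) position++;

    // A state that must re-read the current code point (e.g. on EOF) sets the flag.
    flag |= TokenizerFlag_DecrementPtr;
    if ((flag & TokenizerFlag_DecrementPtr) == 0) position++;

    printf("position = %d\n", position); // prints 1: advanced once, then held
    return 0;
}
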
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
index 4978bfb..4cd9245 100644
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@@ -7,7 +7,7 @@
enum TokenizerFlag {
TokenizerFlag_None = 0,
TokenizerFlag_Emit = 1,
- TokenizerFlag_IncrementPtr = 2
+ TokenizerFlag_DecrementPtr = 2
};
enum TokenizerState {