// Reconstructed from a whitespace-collapsed patch:
//   commit:  d53f2e7107cf63669b705c3abf08c129eeb0315e
//   author:  mattkae
//   date:    Thu, 22 Jun 2023 13:29:49 -0400
//   subject: Parsing most html entities except for the last category
//   file:    src/code_point.h (index dcfcd32..c039907, hunk @@ -5,4 +5,154 @@)
//
// The include guard that opens above this hunk is not visible here, so its
// closing #endif (the hunk's trailing context line) is not repeated.

typedef wchar_t code_point_t;

namespace CodePoints {
	/** One (key -> value) entry of a code point translation table. */
	struct CodePointMap {
		code_point_t key;
		code_point_t value;
	};

	constexpr code_point_t TAB = 0x0009;
	constexpr code_point_t LF = 0x000A;
	constexpr code_point_t FF = 0x000C;
	constexpr code_point_t SPACE = 0x0020;
	constexpr code_point_t SOLIDUS = 0x002F;
	constexpr code_point_t LOWERCASE_A = 0x0061;
	constexpr code_point_t LOWERCASE_Z = 0x007A;
	constexpr code_point_t LOWERCASE_F = 0x0066;
	constexpr code_point_t UPPERCASE_A = 0x0041;
	constexpr code_point_t UPPERCASE_Z = 0x005A;
	constexpr code_point_t UPPERCASE_F = 0x0046;
	constexpr code_point_t NULL_CHAR = 0x0000;
	constexpr code_point_t REPLACEMENT_CHAR = 0xFFFD;
	constexpr code_point_t GREATER_THAN_SIGN = 0x003E;
	constexpr code_point_t LESS_THAN_SIGN = 0x003C;
	constexpr code_point_t AMPERSAND = 0x0026;
	constexpr code_point_t EXCLAMATION_MARK = 0x0021;
	constexpr code_point_t NUMBER_SIGN = 0x0023;
	constexpr code_point_t LOWERCASE_X = 0x0078;
	constexpr code_point_t UPPERCASE_X = 0x0058;
	constexpr code_point_t DIGIT_ZERO = 0x0030;
	constexpr code_point_t DIGIT_NINE = 0x0039;
	constexpr code_point_t SEMICOLON = 0x003B;

	/**
	 * @param c  Code point to test
	 * @param lo Inclusive lower bound
	 * @param hi Inclusive upper bound
	 * @returns true if lo <= c <= hi
	 */
	inline bool in_range(code_point_t c, code_point_t lo, code_point_t hi) {
		return c >= lo && c <= hi;
	}

	/** @returns true if c is one of the ASCII digits '0'..'9'. */
	inline bool is_decimal(code_point_t c) {
		return in_range(c, DIGIT_ZERO, DIGIT_NINE);
	}

	/** @returns true if c is an ASCII hex digit: '0'..'9', 'a'..'f' or 'A'..'F'. */
	inline bool is_hex(code_point_t c) {
		return in_range(c, DIGIT_ZERO, DIGIT_NINE)
			|| in_range(c, LOWERCASE_A, LOWERCASE_F)
			|| in_range(c, UPPERCASE_A, UPPERCASE_F);
	}

	/**
	 * Numeric character references that are parse errors and are replaced by
	 * a fixed code point (key = parsed number, value = replacement).
	 * The identifier's spelling ("REPALCEMENT") is kept as-is so that existing
	 * users of the table keep compiling.
	 */
	constexpr CodePointMap CHARACTER_REF_PARSE_ERROR_REPALCEMENT[] = {
		{ 0x00, 0xFFFD },
		{ 0x0D, 0x000D },
		{ 0x80, 0x20AC },
		{ 0x81, 0x0081 },
		{ 0x82, 0x201A },
		{ 0x83, 0x0192 },
		{ 0x84, 0x201E },
		{ 0x85, 0x2026 },
		{ 0x86, 0x2020 },
		{ 0x87, 0x2021 },
		{ 0x88, 0x02C6 },
		{ 0x89, 0x2030 },
		{ 0x8A, 0x0160 },
		{ 0x8B, 0x2039 },
		{ 0x8C, 0x0152 },
		{ 0x8D, 0x008D },
		{ 0x8E, 0x017D },
		{ 0x8F, 0x008F },
		{ 0x90, 0x0090 },
		{ 0x91, 0x2018 },
		{ 0x92, 0x2019 },
		{ 0x93, 0x201C },
		{ 0x94, 0x201D },
		{ 0x95, 0x2022 },
		{ 0x96, 0x2013 },
		{ 0x97, 0x2014 },
		{ 0x98, 0x02DC },
		{ 0x99, 0x2122 },
		{ 0x9A, 0x0161 },
		{ 0x9B, 0x203A },
		{ 0x9C, 0x0153 },
		{ 0x9D, 0x009D },
		{ 0x9E, 0x017E },
		{ 0x9F, 0x0178 }
	};

	/** Number of entries in CHARACTER_REF_PARSE_ERROR_REPALCEMENT. */
	constexpr int CHARACTER_REF_PARSE_ERROR_REPALCEMENT_LEN = sizeof(CHARACTER_REF_PARSE_ERROR_REPALCEMENT) / sizeof(CodePointMap);

	/**
	 * Attempts to get a character reference from the provided parsed code point value.
	 * See the following link for validation: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
	 *
	 * Outcomes, checked in this order:
	 *  - c is a key of the replacement table: out = mapped value, returns false
	 *  - c is a surrogate (0xD800..0xDFFF), greater than 0x10FFFF, or negative:
	 *    out = REPLACEMENT_CHAR, returns false
	 *  - c is a control character or a noncharacter: out = c, returns false
	 *  - otherwise: out = c, returns true
	 *
	 * @param c Input character value
	 * @param out Output character reference (always written)
	 * @returns true if there wasn't a parse error, otherwise false
	 */
	inline bool try_get_character_ref(code_point_t c, code_point_t& out) {
		for (const CodePointMap& entry : CHARACTER_REF_PARSE_ERROR_REPALCEMENT) {
			if (entry.key == c) {
				out = entry.value;
				return false;
			}
		}

		// Compare as unsigned so that a negative value (possible where wchar_t
		// is signed) is rejected as out of range instead of passing as valid.
		const unsigned long value = static_cast<unsigned long>(c);

		// Strictly greater than: U+10FFFF itself is the last code point and is
		// handled by the noncharacter check below.
		if ((value >= 0xD800UL && value <= 0xDFFFUL) || value > 0x10FFFFUL) {
			out = REPLACEMENT_CHAR;
			return false;
		}

		const bool is_control = in_range(c, 0x0001, 0x0008)
			|| in_range(c, 0x000E, 0x001F)
			|| in_range(c, 0x007F, 0x009F)
			|| c == 0x000B;

		// Noncharacters are U+FDD0..U+FDEF plus the last two code points of
		// each plane (U+FFFE, U+FFFF, U+1FFFE, ..., U+10FFFF), i.e. every
		// value whose low 16 bits are 0xFFFE or 0xFFFF.
		const bool is_noncharacter = in_range(c, 0xFDD0, 0xFDEF)
			|| (value & 0xFFFEUL) == 0xFFFEUL;

		out = c;
		return !(is_control || is_noncharacter);
	}
}