summary refs log tree commit diff
path: root/src/code_point.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/code_point.h')
-rw-r--r-- src/code_point.h 150
1 files changed, 150 insertions, 0 deletions
diff --git a/src/code_point.h b/src/code_point.h
index dcfcd32..c039907 100644
--- a/src/code_point.h
+++ b/src/code_point.h
@@ -5,4 +5,154 @@
// Scalar type used to hold a single code point value.
// NOTE(review): wchar_t is only 16 bits wide on some platforms, where values
// above 0xFFFF cannot be represented — confirm the supported targets.
using code_point_t = wchar_t;
+namespace CodePoints {
+ /**
+  * One (key -> value) association between two code points.
+  * Serves as the element type of the lookup table defined in this namespace.
+  */
+ struct CodePointMap {
+     code_point_t key;   ///< code point that is looked up
+     code_point_t value; ///< code point associated with `key`
+ };
+
+ // Named code points used by the helpers in this namespace.
+ // All values are written with four hex digits for easy comparison.
+
+ // Control characters and whitespace
+ constexpr code_point_t NULL_CHAR = 0x0000;
+ constexpr code_point_t TAB = 0x0009;
+ constexpr code_point_t LF = 0x000A;
+ constexpr code_point_t FF = 0x000C;
+ constexpr code_point_t SPACE = 0x0020;
+
+ // Punctuation
+ constexpr code_point_t EXCLAMATION_MARK = 0x0021;
+ constexpr code_point_t NUMBER_SIGN = 0x0023;
+ constexpr code_point_t AMPERSAND = 0x0026;
+ constexpr code_point_t SOLIDUS = 0x002F;
+ constexpr code_point_t SEMICOLON = 0x003B;
+ constexpr code_point_t LESS_THAN_SIGN = 0x003C;
+ constexpr code_point_t GREATER_THAN_SIGN = 0x003E;
+
+ // Digits
+ constexpr code_point_t DIGIT_ZERO = 0x0030;
+ constexpr code_point_t DIGIT_NINE = 0x0039;
+
+ // Uppercase ASCII letters
+ constexpr code_point_t UPPERCASE_A = 0x0041;
+ constexpr code_point_t UPPERCASE_F = 0x0046;
+ constexpr code_point_t UPPERCASE_X = 0x0058;
+ constexpr code_point_t UPPERCASE_Z = 0x005A;
+
+ // Lowercase ASCII letters
+ constexpr code_point_t LOWERCASE_A = 0x0061;
+ constexpr code_point_t LOWERCASE_F = 0x0066;
+ constexpr code_point_t LOWERCASE_X = 0x0078;
+ constexpr code_point_t LOWERCASE_Z = 0x007A;
+
+ // Substitute emitted for values that cannot be represented as-is
+ constexpr code_point_t REPLACEMENT_CHAR = 0xFFFD;
+
+ /**
+  * Tests whether a code point is an ASCII decimal digit.
+  * @param c Code point to classify
+  * @returns true when c lies in the inclusive range DIGIT_ZERO..DIGIT_NINE
+  */
+ inline bool is_decimal(code_point_t c) {
+     if (c < DIGIT_ZERO) {
+         return false;
+     }
+     return c <= DIGIT_NINE;
+ }
+
+ /**
+  * Tests whether a code point is an ASCII hexadecimal digit.
+  * @param c Code point to classify
+  * @returns true for '0'-'9', 'a'-'f' and 'A'-'F'; false for anything else
+  */
+ inline bool is_hex(code_point_t c) {
+     const bool digit = DIGIT_ZERO <= c && c <= DIGIT_NINE;
+     const bool lower = LOWERCASE_A <= c && c <= LOWERCASE_F;
+     const bool upper = UPPERCASE_A <= c && c <= UPPERCASE_F;
+     return digit || lower || upper;
+ }
+
+ /**
+  * Inclusive range test.
+  * @param c  Code point to test
+  * @param lo Lower bound (inclusive)
+  * @param hi Upper bound (inclusive)
+  * @returns true when lo <= c <= hi; always false when lo > hi
+  */
+ inline bool in_range(code_point_t c, code_point_t lo, code_point_t hi) {
+     return !(c < lo || c > hi);
+ }
+
+ // Replacement table for numeric character references: `key` is the parsed
+ // number, `value` is the code point to emit in its place. Entries whose value
+ // equals the key are kept so that a lookup still reports a match for them.
+ constexpr CodePointMap CHARACTER_REF_PARSE_ERROR_REPALCEMENT[] = {
+     { 0x00, 0xFFFD }, // REPLACEMENT CHARACTER
+     { 0x0D, 0x000D }, // CARRIAGE RETURN (unchanged)
+     { 0x80, 0x20AC }, // EURO SIGN
+     { 0x81, 0x0081 }, // (unchanged)
+     { 0x82, 0x201A }, // SINGLE LOW-9 QUOTATION MARK
+     { 0x83, 0x0192 }, // LATIN SMALL LETTER F WITH HOOK
+     { 0x84, 0x201E }, // DOUBLE LOW-9 QUOTATION MARK
+     { 0x85, 0x2026 }, // HORIZONTAL ELLIPSIS
+     { 0x86, 0x2020 }, // DAGGER
+     { 0x87, 0x2021 }, // DOUBLE DAGGER
+     { 0x88, 0x02C6 }, // MODIFIER LETTER CIRCUMFLEX ACCENT
+     { 0x89, 0x2030 }, // PER MILLE SIGN
+     { 0x8A, 0x0160 }, // LATIN CAPITAL LETTER S WITH CARON
+     { 0x8B, 0x2039 }, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+     { 0x8C, 0x0152 }, // LATIN CAPITAL LIGATURE OE
+     { 0x8D, 0x008D }, // (unchanged)
+     { 0x8E, 0x017D }, // LATIN CAPITAL LETTER Z WITH CARON
+     { 0x8F, 0x008F }, // (unchanged)
+     { 0x90, 0x0090 }, // (unchanged)
+     { 0x91, 0x2018 }, // LEFT SINGLE QUOTATION MARK
+     { 0x92, 0x2019 }, // RIGHT SINGLE QUOTATION MARK
+     { 0x93, 0x201C }, // LEFT DOUBLE QUOTATION MARK
+     { 0x94, 0x201D }, // RIGHT DOUBLE QUOTATION MARK
+     { 0x95, 0x2022 }, // BULLET
+     { 0x96, 0x2013 }, // EN DASH
+     { 0x97, 0x2014 }, // EM DASH
+     { 0x98, 0x02DC }, // SMALL TILDE
+     { 0x99, 0x2122 }, // TRADE MARK SIGN
+     { 0x9A, 0x0161 }, // LATIN SMALL LETTER S WITH CARON
+     { 0x9B, 0x203A }, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+     { 0x9C, 0x0153 }, // LATIN SMALL LIGATURE OE
+     { 0x9D, 0x009D }, // (unchanged)
+     { 0x9E, 0x017E }, // LATIN SMALL LETTER Z WITH CARON
+     { 0x9F, 0x0178 }  // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ };
+
+ // Number of entries in CHARACTER_REF_PARSE_ERROR_REPALCEMENT.
+ constexpr int CHARACTER_REF_PARSE_ERROR_REPALCEMENT_LEN =
+     sizeof(CHARACTER_REF_PARSE_ERROR_REPALCEMENT) / sizeof(CHARACTER_REF_PARSE_ERROR_REPALCEMENT[0]);
+
+ /**
+  * Attempts to get a character reference from the provided parsed code point value.
+  * See the following link for validation: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
+  *
+  * `out` is always assigned:
+  *  - a value listed in CHARACTER_REF_PARSE_ERROR_REPALCEMENT yields its table replacement;
+  *  - a surrogate (0xD800..0xDFFF) or a value above 0x10FFFF yields REPLACEMENT_CHAR;
+  *  - any other value is passed through unchanged.
+  *
+  * @param c Input character value
+  * @param out Output character reference
+  * @returns true if there wasn't a parse error, otherwise false
+  */
+ inline bool try_get_character_ref(code_point_t c, code_point_t& out) {
+     // A table hit is always a parse error, even when the replacement equals the input.
+     for (const CodePointMap& entry : CHARACTER_REF_PARSE_ERROR_REPALCEMENT) {
+         if (entry.key == c) {
+             out = entry.value;
+             return false;
+         }
+     }
+
+     // Strictly greater than: 0x10FFFF itself is still inside the code space and
+     // is reported (unchanged) by the pass-through check below.
+     if (in_range(c, 0xD800, 0xDFFF) || c > 0x10FFFF) {
+         out = REPLACEMENT_CHAR;
+         return false;
+     }
+
+     // The last two code points of every plane (0xFFFE, 0xFFFF, 0x1FFFE, ...,
+     // 0x10FFFF) share low 16 bits of 0xFFFE once bit 0 is masked off. The lower
+     // bound keeps negative values (signed wchar_t) out of the mask test.
+     const bool plane_end = c >= 0xFFFE && (c & 0xFFFE) == 0xFFFE;
+
+     // Values that are emitted as-is but still count as a parse error.
+     const bool parse_error = in_range(c, 0x0001, 0x0008)
+         || c == 0x000B
+         || in_range(c, 0x000E, 0x001F)
+         || in_range(c, 0x007F, 0x009F)
+         || in_range(c, 0xFDD0, 0xFDEF)
+         || plane_end;
+
+     out = c;
+     return !parse_error;
+ }
+};
+
#endif