summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/tokenizer.cpp 57
1 files changed, 46 insertions, 11 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 30ca6bd..8cf9b31 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -1,3 +1,4 @@
+// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
#include "tokenizer.hpp"
#include "code_point.h"
#include "html_token.hpp"
@@ -37,14 +38,30 @@ namespace CodePoints {
const code_point_t UPPERCASE_X = 0x0058;
const code_point_t DIGIT_ZERO = 0x0030;
const code_point_t DIGIT_NINE = 0x0039;
+
+ inline bool is_decimal(code_point_t c) { // True when c lies in DIGIT_ZERO..DIGIT_NINE (0x0030..0x0039), both ends included.
+ return c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE;
+ }
+
+ inline bool is_hex(code_point_t c) { // True when c falls inside any one of the three inclusive ranges tested below.
+ return (c >= CodePoints::DIGIT_ZERO && c <= CodePoints::DIGIT_NINE) // same range is_decimal accepts (0x0030..0x0039)
+ || (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_F) // presumably 'a'..'f'; constants are defined outside this hunk - confirm
+ || (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_F); // presumably 'A'..'F'; constants are defined outside this hunk - confirm
+ }
};
-// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
+
+// Helpers
+
inline void consume_next(Tokenizer* tokenizer) { // Steps the tokenizer forward by one position.
tokenizer->ptr++; // Increments tokenizer->ptr; no upper-bound check is made here.
}
+inline void unconsume_previous(Tokenizer* tokenizer) { // Inverse of consume_next: steps the tokenizer back by one position.
+ tokenizer->ptr--; // Decrements tokenizer->ptr; no lower-bound check is made here.
+}
+
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
inline void data_state(Tokenizer* tokenizer) {
@@ -202,19 +219,37 @@ inline void try_consume_character_reference(Tokenizer* tokenizer) {
consume_next(tokenizer);
c = *tokenizer->ptr;
- bool is_hex = false; // If set to true, we should interpret within the range 0 to F as hex, otherwise 0 to 9 as decimal
- switch (c) {
- case CodePoints::UPPERCASE_X:
- case CodePoints::LOWERCASE_X: {
+ bool none_match_range = false;
+ code_point_t value = 0x0000;
+ if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
consume_next(tokenizer);
- is_hex = true;
- break;
- }
- default: {
- is_hex = false;
- break;
+ c = *tokenizer->ptr;
+
+ if (!CodePoints::is_hex(c)) {
+ unconsume_previous(tokenizer); // X
+ unconsume_previous(tokenizer); // Number sign
+ // TODO: @Error parse error
+ return;
+ }
+
+ while (CodePoints::is_hex(c)) {
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ }
}
+ else {
+ if (!CodePoints::is_decimal(c)) {
+ unconsume_previous(tokenizer); // Number sign
+ // TODO: @Error parse error
+ return;
+ }
+
+ while (CodePoints::is_decimal(c)) {
+ consume_next(tokenizer);
+ c = *tokenizer->ptr;
+ }
}
+
break;
}
}