// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
#include "tokenizer.hpp"
#include "code_point.h"
#include "html_token.hpp"
#include
#include
#include
#include
/// Create a tokenizer positioned at the start of a zero-terminated code point buffer.
///
/// @param value First code point of the input. Must be non-null and terminated by a
///              zero code point. The pointer is stored as-is; nothing is copied.
/// @return A tokenizer in the data state whose `length` is the number of code
///         points that precede the terminator.
Tokenizer create(code_point_t* value) {
    Tokenizer t;
    t.ptr = value;
    // Count whole code points instead of bytes: strlen() on a cast pointer stops at
    // the first zero *byte*, which under-counts whenever code_point_t is wider than
    // char. For a char-sized code_point_t this loop gives the same result as strlen().
    t.length = 0;
    while (value[t.length] != 0) {
        ++t.length;
    }
    t.state = TokenizerState_Data;
    return t;
}
/// Named code points referenced by the tokenizer states, plus small
/// classification helpers for numeric character references.
namespace CodePoints {
    // Sentinels / substitutes
    const code_point_t NULL_CHAR         = 0x0000;
    const code_point_t REPLACEMENT_CHAR  = 0xFFFD;

    // Whitespace
    const code_point_t TAB               = 0x0009;
    const code_point_t LF                = 0x000A;
    const code_point_t FF                = 0x000C;
    const code_point_t SPACE             = 0x0020;

    // Punctuation
    const code_point_t EXCLAMATION_MARK  = 0x0021;
    const code_point_t NUMBER_SIGN       = 0x0023;
    const code_point_t AMPERSAND         = 0x0026;
    const code_point_t SOLIDUS           = 0x002F;
    const code_point_t LESS_THAN_SIGN    = 0x003C;
    const code_point_t GREATER_THAN_SIGN = 0x003E;

    // Digits
    const code_point_t DIGIT_ZERO        = 0x0030;
    const code_point_t DIGIT_NINE        = 0x0039;

    // Upper-case letters
    const code_point_t UPPERCASE_A       = 0x0041;
    const code_point_t UPPERCASE_F       = 0x0046;
    const code_point_t UPPERCASE_X       = 0x0058;
    const code_point_t UPPERCASE_Z       = 0x005A;

    // Lower-case letters
    const code_point_t LOWERCASE_A       = 0x0061;
    const code_point_t LOWERCASE_F       = 0x0066;
    const code_point_t LOWERCASE_X       = 0x0078;
    const code_point_t LOWERCASE_Z       = 0x007A;

    /// True when `c` lies in the range '0'..'9'.
    inline bool is_decimal(code_point_t c) {
        return DIGIT_ZERO <= c && c <= DIGIT_NINE;
    }

    /// True when `c` lies in '0'..'9', 'a'..'f' or 'A'..'F'.
    inline bool is_hex(code_point_t c) {
        if (DIGIT_ZERO <= c && c <= DIGIT_NINE) {
            return true;
        }
        if (LOWERCASE_A <= c && c <= LOWERCASE_F) {
            return true;
        }
        return UPPERCASE_A <= c && c <= UPPERCASE_F;
    }
} // namespace CodePoints
// Helpers
/// Move the tokenizer's read pointer forward by one element.
inline void consume_next(Tokenizer* tokenizer) {
    ++tokenizer->ptr;
}
/// Move the tokenizer's read pointer back by one element.
inline void unconsume_previous(Tokenizer* tokenizer) {
    --tokenizer->ptr;
}
/// Data state: dispatch on the current input character.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
///
/// - `&` switches to the character-reference-in-data state.
/// - `<` switches to the tag-open state.
/// - U+0000 is treated as the end of input: an EOF token is emitted and the read
///   pointer is left on the terminator.
/// - Any other character is emitted as a character token.
///
/// @param tokenizer Tokenizer whose `state`, `last` and `flag` are updated in place.
inline void data_state(Tokenizer* tokenizer) {
    const auto c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::AMPERSAND:
        tokenizer->state = TokenizerState_CharacterReferenceInData;
        break;
    case CodePoints::LESS_THAN_SIGN:
        tokenizer->state = TokenizerState_TagOpen;
        break;
    case CodePoints::NULL_CHAR:
        // TODO: @Error The spec treats U+0000 as a parse error; here it doubles as
        // the end-of-input sentinel.
        tokenizer->last.type = HtmlTokenType_EOF;
        tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
        // Stay on the terminator. Advancing would leave the pointer outside the
        // buffer, and a further read_next() call would then read out of bounds
        // instead of reporting EOF again.
        tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
        break;
    default:
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = c;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    }
}
/// Tag open state: entered after a `<` has been consumed in the data state.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
///
/// - `!` switches to the markup-declaration-open state.
/// - `/` switches to the end-tag-open state.
/// - An ASCII letter starts a start-tag token (lower-cased) and switches to the
///   tag-name state.
/// - `?` switches to the bogus-comment state.
/// - Anything else means the `<` was literal text: a `<` character token is
///   emitted and the current character is reconsumed in the data state.
///
/// @param tokenizer Tokenizer whose `state`, `last` and `flag` are updated in place.
inline void tag_open_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::EXCLAMATION_MARK:
        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
        break;
    case CodePoints::SOLIDUS: // U+002F
        tokenizer->state = TokenizerState_EndTagOpen;
        break;
    default: {
        const bool is_upper = c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
        const bool is_lower = c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;
        if (is_upper || is_lower) {
            // TODO: In this case, we do NOT want to emit the token just yet.
            if (is_upper) {
                c = c + 0x0020; // To lower
            }
            tokenizer->state = TokenizerState_TagName;
            tokenizer->last.append_to_tag_name(c);
            tokenizer->last.type = HtmlTokenType_StartTag;
        }
        else if (c == '?') { // U+003F
            // TODO: Parse error
            tokenizer->state = TokenizerState_BogusComment; // TODO:
        }
        else {
            // TODO: Parse error
            // The '<' did not start a tag, so hand it back as text. The current
            // character is left unconsumed so the data state sees it next; this
            // also keeps the pointer on the terminator when the input ends in '<'.
            tokenizer->last.type = HtmlTokenType_Character;
            tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN;
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag |= TokenizerFlag_Emit;
            tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
        }
        break;
    }
    }
}
/// Tag name state: accumulate the name of the tag token being built.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state
///
/// - Tab, LF, FF or space emits the token and switches to the before-attribute state.
/// - `/` switches to the self-closing-start-tag state.
/// - `>` emits the token and switches back to the data state.
/// - Upper-case ASCII letters are appended lower-cased.
/// - End of input (the U+0000 terminator, or EOF) switches to the data state
///   without consuming, so the data state reports EOF.
/// - Anything else is appended to the tag name unchanged.
///
/// @param tokenizer Tokenizer whose `state`, `last` and `flag` are updated in place.
inline void tag_name_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::TAB:
    case CodePoints::FF:
    case CodePoints::LF:
    case CodePoints::SPACE:
        tokenizer->state = TokenizerState_BeforeAttribute;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    case CodePoints::SOLIDUS:
        tokenizer->state = TokenizerState_SelfClosingStartTag;
        break;
    case CodePoints::GREATER_THAN_SIGN:
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    default:
        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
            c = c + 0x0020; // To lower
            tokenizer->last.append_to_tag_name(c);
        }
        else if (c == CodePoints::NULL_CHAR || c == EOF) {
            // TODO: @Error
            // U+0000 is the end-of-input sentinel used by the data state. Appending
            // a replacement character and advancing here would walk past the end
            // of the buffer on an unterminated tag such as "<div".
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
        }
        else {
            tokenizer->last.append_to_tag_name(c);
        }
        break;
    }
}
/// Process the end tag open state, entered after `</` has been consumed.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
///
/// - An ASCII letter starts an end-tag token (lower-cased) and switches to the
///   tag-name state.
/// - `>` (i.e. `</>`) switches back to the data state without producing a token.
/// - End of input (the U+0000 terminator, or EOF) emits a `<` character token and
///   rewinds onto the `/`, so the following calls yield `/` and then EOF.
/// - Anything else switches to the bogus-comment state.
///
/// @param tokenizer Tokenizer whose `state`, `last`, `flag` and `ptr` are updated in place.
inline void end_tag_open_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    const bool is_upper = c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
    const bool is_lower = c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;

    if (is_upper || is_lower) {
        if (is_upper) {
            c = c + 0x0020; // To lower
        }
        tokenizer->last.type = HtmlTokenType_EndTag;
        tokenizer->last.append_to_tag_name(c);
        tokenizer->state = TokenizerState_TagName;
    }
    else if (c == CodePoints::GREATER_THAN_SIGN) {
        // TODO: @Error We got something like </>
        tokenizer->state = TokenizerState_Data;
    }
    else if (c == CodePoints::NULL_CHAR || c == EOF) {
        // TODO: @Error
        // The spec emits '<' and '/' as two character tokens and then reconsumes
        // EOF in the data state. Only one token can be returned per call, so emit
        // '<' now and step back onto the '/' that led into this state; the data
        // state then emits '/' on the next call and EOF on the one after. Staying
        // in this state without emitting would never terminate, and advancing
        // would run past the end of the buffer.
        tokenizer->ptr--;
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN;
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
        tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
    }
    else {
        // TODO: @Error
        tokenizer->state = TokenizerState_BogusComment;
    }
}
/// Attempt to consume a character reference starting at the current character.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
///
/// - Tab, LF, FF, space, `<`, `&` or EOF: not a reference. Switches to the data
///   state and clears the increment flag so nothing is consumed.
/// - `#`: skips over a decimal (`#123`) or hexadecimal (`#x1F` / `#X1F`) digit run.
///   If no digit follows, the pointer is rewound to the `#` and the function returns.
/// - Any other character is left untouched.
///
/// TODO: The numeric value is not computed or returned yet, and named references
/// are not handled.
///
/// @param tokenizer Tokenizer whose `ptr`, `state` and `flag` may be updated in place.
inline void try_consume_character_reference(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::TAB:
    case CodePoints::LF:
    case CodePoints::FF:
    case CodePoints::SPACE:
    case CodePoints::LESS_THAN_SIGN:
    case CodePoints::AMPERSAND:
    case EOF: {
        // TODO: The additional allowed character?
        // Not a character reference. No characters are consumed, and nothing is returned. (This is not an error, either.)
        tokenizer->state = TokenizerState_Data;
        // Clear only the increment bit. Masking with the bit itself (without '~')
        // would wipe every other flag and keep the one that must be dropped.
        tokenizer->flag &= ~TokenizerFlag_IncrementPtr;
        break;
    }
    case CodePoints::NUMBER_SIGN: {
        consume_next(tokenizer);
        c = *tokenizer->ptr;
        if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
            consume_next(tokenizer);
            c = *tokenizer->ptr;
            if (!CodePoints::is_hex(c)) {
                unconsume_previous(tokenizer); // X
                unconsume_previous(tokenizer); // Number sign
                // TODO: @Error parse error
                return;
            }
            while (CodePoints::is_hex(c)) {
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }
        }
        else {
            if (!CodePoints::is_decimal(c)) {
                unconsume_previous(tokenizer); // Number sign
                // TODO: @Error parse error
                return;
            }
            while (CodePoints::is_decimal(c)) {
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }
        }
        break;
    }
    default:
        // TODO: Named character references.
        break;
    }
}
/// Character reference in data state: entered after an `&` has been consumed in
/// the data state.
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-data-state
///
/// TODO: Character references are not decoded yet. Until they are, this takes the
/// spec's "nothing is returned" path: emit the `&` as a literal character token and
/// switch back to the data state, leaving the current character to be reconsumed
/// there. Without this, the state neither emits nor changes, so the read loop
/// would keep advancing past the end of the input.
///
/// @param tokenizer Tokenizer whose `state`, `last` and `flag` are updated in place.
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
    tokenizer->last.type = HtmlTokenType_Character;
    tokenizer->last.character_token = CodePoints::AMPERSAND;
    tokenizer->state = TokenizerState_Data;
    tokenizer->flag |= TokenizerFlag_Emit;
    // The character after '&' has not been examined; let the data state see it.
    tokenizer->flag = tokenizer->flag & ~TokenizerFlag_IncrementPtr;
}
/// Run the state machine until one token is ready and return it.
///
/// Each call clears the previous token and starts from the data state. On every
/// step the handler for the current state runs; afterwards the read pointer is
/// advanced if the handler left TokenizerFlag_IncrementPtr set, and the loop ends
/// once the handler has set TokenizerFlag_Emit. A state without a handler is
/// logged and terminates the process with exit(1).
///
/// @param tokenizer Tokenizer to advance; its `flag`, `last`, `state` and `ptr`
///                  are modified.
/// @return A copy of `tokenizer->last` once a handler has requested emission.
HtmlToken read_next(Tokenizer* tokenizer) {
    tokenizer->flag = TokenizerFlag_None;
    tokenizer->last.reset();
    tokenizer->state = TokenizerState_Data;

    for (;;) {
        // Every step starts with only IncrementPtr set; handlers clear it to
        // reconsume the current character and set Emit to finish the token.
        tokenizer->flag = 0 | TokenizerFlag_IncrementPtr;

        switch (tokenizer->state) {
        case TokenizerState_Data:
            data_state(tokenizer);
            break;
        case TokenizerState_TagOpen:
            tag_open_state(tokenizer);
            break;
        case TokenizerState_TagName:
            tag_name_state(tokenizer);
            break;
        case TokenizerState_EndTagOpen:
            end_tag_open_state(tokenizer);
            break;
        case TokenizerState_CharacterReferenceInData:
            character_reference_in_data_state(tokenizer);
            break;
        default:
            logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
            exit(1);
        }

        if (tokenizer->flag & TokenizerFlag_IncrementPtr) {
            ++tokenizer->ptr;
        }
        if (tokenizer->flag & TokenizerFlag_Emit) {
            return tokenizer->last;
        }
    }
}