// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
#include "tokenizer.hpp"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "code_point.h"
#include "html_token.hpp"
/// Creates a tokenizer over a NUL-terminated buffer of code points.
/// NOTE(review): strlen() counts bytes, not code points — this is only
/// correct if code_point_t is one byte wide; confirm against code_point.h.
Tokenizer create(code_point_t* value) {
    Tokenizer t{}; // Value-initialize so flag/last/length don't start as garbage.
    t.ptr = value;
    t.length = strlen((char*)value);
    t.state = TokenizerState_Data;
    return t;
}
// Helpers
/// Advances the tokenizer past the current input code point.
inline void consume_next(Tokenizer* tokenizer) {
    ++tokenizer->ptr;
}
/// Steps the tokenizer back one code point (un-consumes the last read).
inline void unconsume_previous(Tokenizer* tokenizer) {
    --tokenizer->ptr;
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
///
/// Dispatches on the current code point: '&' and '<' switch states,
/// NUL (the buffer terminator in this tokenizer) emits an EOF token,
/// and everything else is emitted as a character token.
inline void data_state(Tokenizer* tokenizer) {
    const code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::AMPERSAND:
        tokenizer->state = TokenizerState_CharacterReferenceInData;
        break;
    case CodePoints::LESS_THAN_SIGN:
        tokenizer->state = TokenizerState_TagOpen;
        break;
    case CodePoints::NULL_CHAR:
        // The spec treats U+0000 as a parse error, but this tokenizer scans a
        // NUL-terminated buffer, so NUL marks end-of-input here.
        tokenizer->last.type = HtmlTokenType_EOF;
        // Was `flag = flag | ...`; use |= to match the rest of the file.
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    default:
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = c;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
///
/// We arrive here after consuming a '<'. ASCII letters start a new start-tag
/// token (name accumulates in the tag-name state; nothing is emitted yet).
inline void tag_open_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::EXCLAMATION_MARK:
        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
        break;
    case CodePoints::SOLIDUS: // U+002F
        tokenizer->state = TokenizerState_EndTagOpen;
        break;
    default:
        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
            c += 0x0020; // Fold ASCII uppercase to lowercase, then fall into the letter path.
        }
        if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) {
            tokenizer->state = TokenizerState_TagName;
            tokenizer->last.type = HtmlTokenType_StartTag;
            tokenizer->last.append_to_tag_name(c);
        }
        else if (c == '?') { // U+003F
            // TODO: Parse error
            tokenizer->state = TokenizerState_BogusComment; // TODO:
        }
        else {
            // BUG FIX: the old code just returned to the data state, silently
            // dropping the '<' AND consuming the current character. Spec:
            // parse error; emit a U+003C LESS-THAN-SIGN character token and
            // reconsume the current input character in the data state.
            tokenizer->last.type = HtmlTokenType_Character;
            tokenizer->last.character_token = CodePoints::LESS_THAN_SIGN;
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag |= TokenizerFlag_Emit | TokenizerFlag_DecrementPtr;
        }
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-name-state
///
/// Accumulates code points into the current tag token's name until
/// whitespace, '/', or '>' terminates it.
inline void tag_name_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::TAB:
    case CodePoints::FF:
    case CodePoints::LF:
    case CodePoints::SPACE:
        tokenizer->state = TokenizerState_BeforeAttribute;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    case CodePoints::SOLIDUS:
        tokenizer->state = TokenizerState_SelfClosingStartTag;
        break;
    case CodePoints::GREATER_THAN_SIGN:
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
        break;
    default:
        if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
            c += 0x0020; // Fold ASCII uppercase to lowercase.
            tokenizer->last.append_to_tag_name(c);
        }
        else if (c == CodePoints::NULL_CHAR) {
            // TODO: @Error parse error per spec; substitute U+FFFD.
            tokenizer->last.append_to_tag_name(CodePoints::REPLACEMENT_CHAR);
        }
        else if (c == EOF) {
            // TODO: @Error parse error; reconsume in the data state.
            // NOTE(review): EOF is -1; if code_point_t is unsigned this branch
            // can never be taken — confirm against code_point.h.
            tokenizer->state = TokenizerState_Data;
            // BUG FIX: was `flag = flag & TokenizerFlag_DecrementPtr`, which
            // cleared the flags instead of setting the reconsume bit.
            tokenizer->flag |= TokenizerFlag_DecrementPtr;
        }
        else {
            tokenizer->last.append_to_tag_name(c);
        }
        break;
    }
}
/// Process the end tag open state (we arrive here after consuming "</").
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
inline void end_tag_open_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
        c += 0x0020; // Fold ASCII uppercase to lowercase.
        tokenizer->last.type = HtmlTokenType_EndTag;
        tokenizer->last.append_to_tag_name(c);
        tokenizer->state = TokenizerState_TagName;
    }
    else if (c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z) {
        tokenizer->last.type = HtmlTokenType_EndTag;
        tokenizer->last.append_to_tag_name(c);
        tokenizer->state = TokenizerState_TagName;
    }
    else if (c == CodePoints::GREATER_THAN_SIGN) {
        // TODO: @Error We got something like "</>"
        tokenizer->state = TokenizerState_Data;
    }
    else if (c == EOF) {
        // TODO: @Error Spec: parse error; emit '<' and '/' character tokens
        // and reconsume in the data state. This tokenizer holds only one
        // pending token, so only the solidus survives (the old code assigned
        // both into `last`, silently overwriting the '<').
        // TODO: queue both characters.
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = CodePoints::SOLIDUS;
        // BUG FIX: was `flag = flag & TokenizerFlag_DecrementPtr` (cleared the
        // flags). We must emit AND reconsume, plus leave the data state set,
        // or read_next would re-enter this state forever.
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit | TokenizerFlag_DecrementPtr;
    }
    else if (c == CodePoints::NULL_CHAR) {
        // BUG FIX: NUL terminates the input buffer in this tokenizer; emit an
        // EOF token instead of appending U+FFFD and walking past the buffer
        // end (the old code stayed in this state and kept advancing ptr).
        tokenizer->last.type = HtmlTokenType_EOF;
        tokenizer->flag |= TokenizerFlag_Emit;
    }
    else {
        // TODO: @Error parse error; switch to the bogus comment state.
        tokenizer->state = TokenizerState_BogusComment;
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
/// Attempts to consume a character reference from the current tokenizer. If one
/// cannot be consumed, false is returned, otherwise true. On success the
/// resolved character is staged in `last` for emission and the state returns
/// to data.
inline bool try_consume_character_reference(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
    case CodePoints::TAB:
    case CodePoints::LF:
    case CodePoints::FF:
    case CodePoints::SPACE:
    case CodePoints::LESS_THAN_SIGN:
    case CodePoints::AMPERSAND:
    case EOF: {
        // TODO: The additional allowed character?
        // Not a character reference. No characters are consumed (not an error).
        // BUG FIX: the old code dropped both the '&' that got us here and the
        // current character. Spec: the ampersand is emitted as a plain
        // character token and the current code point is reprocessed in the
        // data state.
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = CodePoints::AMPERSAND;
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit | TokenizerFlag_DecrementPtr;
        return true;
    }
    case CodePoints::NUMBER_SIGN: {
        // Numeric reference: "&#ddd;" or "&#x/Xhhh;".
        consume_next(tokenizer);
        c = *tokenizer->ptr;
        bool is_hex_value = false;
        if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
            is_hex_value = true;
            consume_next(tokenizer);
            c = *tokenizer->ptr;
            if (!CodePoints::is_hex(c)) {
                // TODO: @Error parse error
                unconsume_previous(tokenizer); // 'x' / 'X'
                unconsume_previous(tokenizer); // Number sign
                return false;
            }
            while (CodePoints::is_hex(c)) {
                tokenizer->last.append_to_code_entity(c);
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }
        }
        else {
            if (!CodePoints::is_decimal(c)) {
                // TODO: @Error parse error
                unconsume_previous(tokenizer); // Number sign
                return false;
            }
            while (CodePoints::is_decimal(c)) {
                tokenizer->last.append_to_code_entity(c);
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }
        }
        // The digit run must be terminated by a semicolon.
        if (c != CodePoints::SEMICOLON) {
            // TODO: @Error parse error (spec: unconsume and resolve anyway).
            return false;
        }
        // BUG FIX: leave ptr on the ';' — read_next advances past it after
        // this state returns. The old code consumed it here as well, which
        // skipped the character following the reference. (Also removed a
        // leftover debug printf and an unused local `value`.)
        auto code_entity = tokenizer->last.code_entity_to_value(is_hex_value);
        auto is_parse_error = !CodePoints::try_get_character_ref(code_entity, code_entity);
        if (is_parse_error) {
            // TODO: @Error
            return false;
        }
        // BUG FIX: the old code resolved the reference but never emitted it,
        // never reset the state, so read_next kept spinning in this state.
        tokenizer->last.type = HtmlTokenType_Character;
        tokenizer->last.character_token = code_entity;
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
        return true;
    }
    default: {
        // TODO: Tedious work lies ahead.
        // Otherwise try and find the string by name in this table
        // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
        logger_error("Unsupported character reference");
        return false;
    }
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-data-state
/// A character reference begins with an ampersand; consumption/resolution is
/// delegated to try_consume_character_reference.
/// TODO: handle the failure (false) case explicitly.
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
    // Removed an unused local that re-read *tokenizer->ptr.
    try_consume_character_reference(tokenizer);
}
/// Reads and returns the next HTML token from the input stream.
/// Runs the state machine one code point at a time until some state sets
/// TokenizerFlag_Emit, then returns the pending token.
/// NOTE(review): resetting to the data state on every call discards whatever
/// state the previous emit left us in (e.g. BeforeAttribute) — confirm this
/// is intentional.
HtmlToken read_next(Tokenizer* tokenizer) {
    // (Removed a redundant flag assignment here — it was overwritten at the
    // top of the loop before ever being read.)
    tokenizer->last.reset();
    tokenizer->state = TokenizerState_Data;
    do {
        // Clear all per-iteration flags before dispatching the current state.
        tokenizer->flag = TokenizerFlag_None;
        switch (tokenizer->state) {
        case TokenizerState_Data:
            data_state(tokenizer);
            break;
        case TokenizerState_TagOpen:
            tag_open_state(tokenizer);
            break;
        case TokenizerState_TagName:
            tag_name_state(tokenizer);
            break;
        case TokenizerState_EndTagOpen:
            end_tag_open_state(tokenizer);
            break;
        case TokenizerState_CharacterReferenceInData:
            character_reference_in_data_state(tokenizer);
            break;
        default:
            logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
            exit(1);
        }
        // States set TokenizerFlag_DecrementPtr to "reconsume" the current
        // code point; otherwise advance to the next one.
        if ((tokenizer->flag & TokenizerFlag_DecrementPtr) == 0) {
            tokenizer->ptr++;
        }
        if (tokenizer->flag & TokenizerFlag_Emit) {
            break;
        }
    } while (true);
    return tokenizer->last;
}