// https://dev.w3.org/html5/spec-LC/tokenization.html#tokenization
#include "tokenizer.hpp"
#include "code_point.h"
#include "html_token.hpp"
#include
#include
#include
#include
/// Builds a tokenizer positioned at the start of `value`, in the data state.
///
/// `value` is read as a sequence of code points terminated by a zero code point;
/// `length` is the number of code points before that terminator. A null `value`
/// yields a length of 0.
Tokenizer create(code_point_t* value) {
    Tokenizer t;
    t.ptr = value;
    // Count code points instead of calling strlen() on a casted pointer: strlen
    // stops at the first zero *byte*, which under-counts whenever code_point_t
    // is wider than char (e.g. 'a' stored as 0x61 0x00 ...).
    t.length = 0;
    if (value != nullptr) {
        while (value[t.length] != 0) {
            ++t.length;
        }
    }
    t.state = TokenizerState_Data;
    return t;
}
// Helpers
/// Consumes the current input position by moving the read pointer one step forward.
inline void consume_next(Tokenizer* tokenizer) {
    ++tokenizer->ptr;
}
/// Un-consumes the previous input position by moving the read pointer one step back.
inline void unconsume_previous(Tokenizer* tokenizer) {
    --tokenizer->ptr;
}
/// Turns the pending token into a character token holding `c` and raises the
/// Emit flag so the main loop hands it back to the caller.
inline void emit_character(Tokenizer* tokenizer, code_point_t c) {
    auto& pending = tokenizer->last;
    pending.type = HtmlTokenType_Character;
    pending.character_token = c;
    tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference
/// Attempts to consume a character reference starting at the tokenizer's current
/// position, i.e. the code point right after the '&'.
///
/// On success the reference is consumed, its value is left in
/// `tokenizer->last.entity`, and true is returned. On failure false is returned
/// and the read pointer is put back where it was on entry, so no input is lost.
/// Only numeric references (decimal `#123;` and hexadecimal `#x1F;`) are
/// supported so far; named references are reported as unsupported.
inline bool try_consume_character_reference(Tokenizer* tokenizer) {
    // Remembered so that every failure path can leave the input untouched.
    const auto start = tokenizer->ptr;
    code_point_t c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::TAB:
        case CodePoints::LF:
        case CodePoints::FF:
        case CodePoints::SPACE:
        case CodePoints::LESS_THAN_SIGN:
        case CodePoints::AMPERSAND:
        case EOF: {
            // TODO: The additional allowed character?
            // Not a character reference. No characters are consumed, and nothing is
            // returned. (This is not an error, either.)
            // Only the data-state caller falls back to the data state here; a caller
            // that is in the middle of an attribute value must keep its own state.
            if (tokenizer->state == TokenizerState_CharacterReferenceInData) {
                tokenizer->state = TokenizerState_Data;
            }
            return false;
        }
        case CodePoints::NUMBER_SIGN: {
            consume_next(tokenizer);
            c = *tokenizer->ptr;

            // An 'x' / 'X' after the number sign selects hexadecimal digits.
            bool is_hex_value = false;
            if (c == CodePoints::UPPERCASE_X || c == CodePoints::LOWERCASE_X) {
                is_hex_value = true;
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }
            const auto is_digit = [is_hex_value](code_point_t d) {
                return is_hex_value ? CodePoints::is_hex(d) : CodePoints::is_decimal(d);
            };

            if (!is_digit(c)) {
                // TODO: @Error parse error
                // Un-consume the number sign (and the X, if any).
                tokenizer->ptr = start;
                return false;
            }

            std::wstring code_entity;
            while (is_digit(c)) {
                code_entity += c;
                consume_next(tokenizer);
                c = *tokenizer->ptr;
            }

            // The terminating semicolon is consumed when present. When it is missing
            // the spec treats that as a parse error but still processes the reference,
            // so the digits collected above are not thrown away.
            if (c == CodePoints::SEMICOLON) {
                consume_next(tokenizer);
            }
            // else: TODO: @Error parse error

            tokenizer->last.set_code_entity_to_value(code_entity, is_hex_value);
            const bool is_parse_error = !CodePoints::try_get_character_ref(tokenizer->last.entity, tokenizer->last.entity);
            if (is_parse_error) {
                // TODO: @Error
                tokenizer->ptr = start;
                return false;
            }
            return true;
        }
        default: {
            // TODO: Tedious work lies ahead.
            // Otherwise try and find the string by name in this table
            // https://dev.w3.org/html5/spec-LC/named-character-references.html#named-character-references
            logger_error("Unsupported character reference");
            return false;
        }
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#data-state
/// Handles one code point in the data state: '&' and '<' switch state, the
/// terminating null code point produces the EOF token, and anything else is
/// emitted as a character token.
inline void data_state(Tokenizer* tokenizer) {
    const auto c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::AMPERSAND:
            tokenizer->state = TokenizerState_CharacterReferenceInData;
            break;
        case CodePoints::LESS_THAN_SIGN:
            tokenizer->state = TokenizerState_TagOpen;
            break;
        case CodePoints::NULL_CHAR:
            // The null code point terminates the input. Stay on it (NoIncrement) so
            // that further reads keep returning EOF instead of walking off the end
            // of the buffer.
            tokenizer->last.type = HtmlTokenType_EOF;
            tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
            tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            break;
        default:
            emit_character(tokenizer, c);
            break;
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#tag-open-state
/// Handles the code point that follows a '<'.
inline void tag_open_state(Tokenizer* tokenizer) {
    const code_point_t current = *tokenizer->ptr;

    if (current == CodePoints::EXCLAMATION_MARK) {
        tokenizer->state = TokenizerState_MarkupDeclarationOpen; // TODO
        return;
    }
    if (current == CodePoints::SOLIDUS) { // U+002F
        tokenizer->state = TokenizerState_EndTagOpen;
        return;
    }

    const bool is_upper = current >= CodePoints::UPPERCASE_A && current <= CodePoints::UPPERCASE_Z;
    const bool is_lower = current >= CodePoints::LOWERCASE_A && current <= CodePoints::LOWERCASE_Z;
    if (is_upper || is_lower) {
        // TODO: In these two cases, we do NOT want to emit the token just yet.
        // A letter starts a new start tag; upper-case letters are folded to lower case.
        const code_point_t lowered = is_upper ? static_cast<code_point_t>(current + 0x0020) : current;
        tokenizer->state = TokenizerState_TagName;
        tokenizer->last.append_to_tag_name(lowered);
        tokenizer->last.type = HtmlTokenType_StartTag;
        return;
    }

    if (current == '?') { // U+003F
        // TODO: Parse error
        tokenizer->state = TokenizerState_BogusComment; // TODO:
        return;
    }

    // Anything else: the '<' was plain text. Emit it and let the data state look
    // at the current code point again.
    emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
    tokenizer->state = TokenizerState_Data;
    tokenizer->flag |= TokenizerFlag_NoIncrement;
}
/// Handles one code point while reading a tag name (spec: "tag name state").
/// Whitespace and '/' end the name, '>' emits the tag, and any other code point
/// is appended to the name (upper-case ASCII letters are lower-cased first).
inline void tag_name_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::TAB:
        case CodePoints::FF:
        case CodePoints::LF:
        case CodePoints::SPACE:
            tokenizer->state = TokenizerState_BeforeAttributeName;
            break;
        case CodePoints::SOLIDUS:
            tokenizer->state = TokenizerState_SelfClosingStartTag;
            break;
        case CodePoints::GREATER_THAN_SIGN:
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag |= TokenizerFlag_Emit;
            break;
        default:
            if (c == CodePoints::NULL_CHAR || c == EOF) {
                // TODO: @Error
                // The input is null-terminated, so a null code point means the input
                // ended inside the tag. Appending a replacement character and moving
                // on would read past the end of the buffer; hand the terminator back
                // to the data state instead.
                tokenizer->state = TokenizerState_Data;
                tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            }
            else {
                if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
                    c = c + 0x0020; // To lower
                }
                tokenizer->last.append_to_tag_name(c);
            }
            break;
    }
}
/// Handles one code point between a tag name (or a previous attribute) and the
/// next attribute name (spec: "before attribute name state").
inline void before_attribute_name_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::TAB:
        case CodePoints::LF:
        case CodePoints::FF:
        case CodePoints::SPACE: {
            // Ignore the character
            break;
        }
        case CodePoints::SOLIDUS: {
            tokenizer->state = TokenizerState_SelfClosingStartTag;
            break;
        }
        case CodePoints::GREATER_THAN_SIGN: {
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag |= TokenizerFlag_Emit;
            break;
        }
        case CodePoints::NULL_CHAR:
        case EOF: {
            // TODO: @Error Parse error
            // The input is null-terminated, so a null code point is the end of the
            // input just like EOF: let the data state look at it again. The flag is
            // OR-ed in; AND-ing it would clear every flag and advance past the end.
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            break;
        }
        case CodePoints::QUOTATION_MARK:
        case CodePoints::APOSTROPHE:
        case CodePoints::LESS_THAN_SIGN:
        case CodePoints::EQUALS_SIGN:
            // TODO: @Error Parse error
            // These are handled exactly like any other character: they become the
            // first character of a new attribute name, so they share the default body.
        default: {
            if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
                c = CodePoints::to_lower_case(c);
            }
            tokenizer->last.start_attribute();
            tokenizer->last.add_to_attribute_name(c);
            tokenizer->state = TokenizerState_AttributeNameState;
            break;
        }
    }
}
/// Handles one code point while reading an attribute name (spec: "attribute
/// name state"). Whitespace, '/', '=' and '>' end the name; everything else is
/// appended (upper-case ASCII letters are lower-cased first).
inline void attribute_name_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::TAB:
        case CodePoints::LF:
        case CodePoints::FF:
        case CodePoints::SPACE:
            // SPACE ends the name like the other whitespace characters; without it
            // "a b" would be read as a single attribute named "a b".
            tokenizer->state = TokenizerState_AfterAttributeNameState;
            break;
        case CodePoints::SOLIDUS:
            tokenizer->state = TokenizerState_SelfClosingStartTag;
            break;
        case CodePoints::EQUALS_SIGN:
            tokenizer->state = TokenizerState_BeforeAttributeValueState;
            break;
        case CodePoints::GREATER_THAN_SIGN:
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag = tokenizer->flag | TokenizerFlag_Emit;
            break;
        case CodePoints::NULL_CHAR:
        case CodePoints::MY_EOF:
            // TODO: @ParseError
            // The input is null-terminated, so a null code point means the input
            // ended inside the attribute name. Appending a replacement character and
            // moving on would read past the end of the buffer; hand the terminator
            // back to the data state instead.
            tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            tokenizer->state = TokenizerState_Data;
            break;
        case CodePoints::QUOTATION_MARK:
        case CodePoints::APOSTROPHE:
        case CodePoints::LESS_THAN_SIGN:
            // TODO: @ParseError
            tokenizer->last.add_to_attribute_name(c);
            break;
        default:
            if (c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z) {
                c = CodePoints::to_lower_case(c);
            }
            tokenizer->last.add_to_attribute_name(c);
            break;
    }
}
/// Handles one code point between an attribute's '=' and its value (spec:
/// "before attribute value state"): skips whitespace, picks the quoted or
/// unquoted value state, or closes the tag on '>'.
inline void before_attribute_value_state(Tokenizer* tokenizer) {
    const code_point_t current = *tokenizer->ptr;

    const bool is_whitespace = current == CodePoints::TAB || current == CodePoints::LF ||
                               current == CodePoints::FF || current == CodePoints::SPACE;
    if (is_whitespace) {
        // Whitespace before the value is skipped.
        return;
    }

    if (current == CodePoints::QUOTATION_MARK) {
        tokenizer->state = TokenizerState_AttributeValueDoubleQuoted;
        return;
    }
    if (current == CodePoints::APOSTROPHE) {
        tokenizer->state = TokenizerState_AttributeValueSingleQuoted;
        return;
    }

    if (current == CodePoints::GREATER_THAN_SIGN) {
        // TODO: @ParseError
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
        return;
    }

    // Every remaining code point starts an unquoted value; the branches below only
    // differ in what, if anything, is appended and whether the code point is
    // looked at again by the unquoted state.
    tokenizer->state = TokenizerState_AttributeValueUnquoted;

    const bool reprocess = current == CodePoints::AMPERSAND || current == CodePoints::LESS_THAN_SIGN ||
                           current == CodePoints::EQUALS_SIGN || current == CodePoints::MY_EOF;
    if (reprocess) {
        tokenizer->flag |= TokenizerFlag_NoIncrement;
    }
    else if (current == CodePoints::NULL_CHAR) {
        // TODO: @ParseError
        tokenizer->last.add_to_attribute_value(CodePoints::REPLACEMENT_CHAR);
    }
    else {
        // TODO: @ParseError when current is CodePoints::GRAVE_ACCENT
        tokenizer->last.add_to_attribute_value(current);
    }
}
/// Handles one code point inside a double-quoted attribute value (spec:
/// "attribute value (double-quoted) state"). The closing quote ends the value,
/// '&' may start a character reference, and anything else is appended.
inline void attribute_value_double_quoted_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    switch (c) {
        case CodePoints::QUOTATION_MARK:
            tokenizer->state = TokenizerState_AfterAttributeValueQuoted;
            break;
        case CodePoints::AMPERSAND: {
            // https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-attribute-value-state
            consume_next(tokenizer); // Step past the '&'
            const auto after_ampersand = tokenizer->ptr;
            const auto value_state = tokenizer->state;
            const bool parsed = try_consume_character_reference(tokenizer);
            // Whatever the helper did to the state, we are still inside the value.
            tokenizer->state = value_state;
            if (parsed && tokenizer->ptr != after_ampersand) {
                tokenizer->last.add_to_attribute_value(tokenizer->last.entity);
            }
            else {
                // Nothing was recognised: the '&' is plain text and everything after
                // it is read again as ordinary value characters.
                tokenizer->ptr = after_ampersand;
                tokenizer->last.add_to_attribute_value(CodePoints::AMPERSAND);
            }
            // The pointer already sits on the next unread code point. Letting the
            // main loop advance again would skip it (often the closing quote).
            tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            break;
        }
        case CodePoints::NULL_CHAR:
        case CodePoints::MY_EOF:
            // TODO: @ParseError
            // The input is null-terminated, so a null code point means the value was
            // never closed. Appending a replacement character and moving on would
            // read past the end of the buffer; hand the terminator back to the data
            // state instead.
            tokenizer->state = TokenizerState_Data;
            tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
            break;
        default:
            tokenizer->last.add_to_attribute_value(c);
            break;
    }
}
/// Handles the code point that follows a closing attribute-value quote (spec:
/// "after attribute value (quoted) state").
inline void after_attribute_value_quoted_state(Tokenizer* tokenizer) {
    const code_point_t current = *tokenizer->ptr;

    const bool is_whitespace = current == CodePoints::TAB || current == CodePoints::LF ||
                               current == CodePoints::FF || current == CodePoints::SPACE;
    if (is_whitespace) {
        tokenizer->state = TokenizerState_BeforeAttributeName;
    }
    else if (current == CodePoints::SOLIDUS) {
        tokenizer->state = TokenizerState_SelfClosingStartTag;
    }
    else if (current == CodePoints::GREATER_THAN_SIGN) {
        // End of the tag: emit it and go back to reading data.
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_Emit;
    }
    else if (current == CodePoints::MY_EOF) {
        // TODO: @ParseError
        // Let the data state look at the end-of-file marker again.
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag |= TokenizerFlag_NoIncrement;
    }
    else {
        // TODO: @ParseError
        // Missing whitespace between attributes: look at this code point again as
        // the possible start of the next attribute name.
        tokenizer->flag |= TokenizerFlag_NoIncrement;
        tokenizer->state = TokenizerState_BeforeAttributeName;
    }
}
/// Processes the code point that follows "</".
/// Spec: https://dev.w3.org/html5/spec-LC/tokenization.html#end-tag-open-state
///
/// A letter starts an end tag, '>' abandons the tag, the end of the input turns
/// the "</" back into text, and anything else begins a bogus comment.
inline void end_tag_open_state(Tokenizer* tokenizer) {
    code_point_t c = *tokenizer->ptr;
    const bool is_upper = c >= CodePoints::UPPERCASE_A && c <= CodePoints::UPPERCASE_Z;
    const bool is_lower = c >= CodePoints::LOWERCASE_A && c <= CodePoints::LOWERCASE_Z;
    if (is_upper || is_lower) {
        if (is_upper) {
            c = c + 0x0020; // To lower
        }
        tokenizer->last.type = HtmlTokenType_EndTag;
        tokenizer->last.append_to_tag_name(c);
        tokenizer->state = TokenizerState_TagName;
    }
    else if (c == CodePoints::GREATER_THAN_SIGN) {
        // TODO: @Error We got something like </>
        tokenizer->state = TokenizerState_Data;
    }
    else if (c == EOF || c == CodePoints::NULL_CHAR) {
        // TODO: @Error
        // The input ended right after "</" (the input is null-terminated, so a null
        // code point is the end of the input as well). Both characters are text,
        // but only one token can be returned per read: emit the '<' now and step
        // back onto the '/', which the data state then emits on the next read.
        // Without a state change and the Emit flag this branch would spin forever.
        emit_character(tokenizer, CodePoints::LESS_THAN_SIGN);
        unconsume_previous(tokenizer);
        tokenizer->state = TokenizerState_Data;
        tokenizer->flag = tokenizer->flag | TokenizerFlag_NoIncrement;
    }
    else {
        // TODO: @Error
        tokenizer->state = TokenizerState_BogusComment;
    }
}
/// https://dev.w3.org/html5/spec-LC/tokenization.html#character-reference-in-data-state
/// Entered right after a '&' seen in the data state. Emits the referenced
/// character when a character reference can be consumed, otherwise emits the
/// '&' itself, and in both cases returns to the data state.
inline void character_reference_in_data_state(Tokenizer* tokenizer) {
    const auto after_ampersand = tokenizer->ptr;
    const bool parsed = try_consume_character_reference(tokenizer);
    if (parsed && tokenizer->ptr != after_ampersand) {
        // NOTE(review): assumes last.entity converts to code_point_t, as it is
        // already passed to add_to_attribute_value() elsewhere in this file - confirm.
        emit_character(tokenizer, tokenizer->last.entity);
    }
    else {
        // Nothing was recognised: the '&' is plain text and whatever follows it is
        // tokenized again as ordinary data.
        tokenizer->ptr = after_ampersand;
        emit_character(tokenizer, CodePoints::AMPERSAND);
    }
    tokenizer->state = TokenizerState_Data;
    // The pointer already sits on the next unread code point; do not skip it.
    tokenizer->flag |= TokenizerFlag_NoIncrement;
}
/// Runs the state machine from the data state until one of the state handlers
/// raises the Emit flag, then returns the token that was built.
///
/// Each step dispatches on the current state, advances the read pointer unless
/// the handler asked for the current code point to be looked at again
/// (NoIncrement), and stops once a token is ready. A state without a handler is
/// logged and terminates the process.
HtmlToken read_next(Tokenizer* tokenizer) {
    tokenizer->flag = TokenizerFlag_None;
    tokenizer->last.reset();
    tokenizer->state = TokenizerState_Data;

    for (;;) {
        // Every step starts with a clean set of flags.
        tokenizer->flag = 0;

        switch (tokenizer->state) {
            case TokenizerState_Data:
                data_state(tokenizer);
                break;
            case TokenizerState_TagOpen:
                tag_open_state(tokenizer);
                break;
            case TokenizerState_TagName:
                tag_name_state(tokenizer);
                break;
            case TokenizerState_BeforeAttributeName:
                before_attribute_name_state(tokenizer);
                break;
            case TokenizerState_AttributeNameState:
                attribute_name_state(tokenizer);
                break;
            case TokenizerState_BeforeAttributeValueState:
                before_attribute_value_state(tokenizer);
                break;
            case TokenizerState_AttributeValueDoubleQuoted:
                attribute_value_double_quoted_state(tokenizer);
                break;
            case TokenizerState_AfterAttributeValueQuoted:
                after_attribute_value_quoted_state(tokenizer);
                break;
            case TokenizerState_EndTagOpen:
                end_tag_open_state(tokenizer);
                break;
            case TokenizerState_CharacterReferenceInData:
                character_reference_in_data_state(tokenizer);
                break;
            default:
                logger_error("Unsupported state, exploding: %d\n", tokenizer->state);
                exit(1);
        }

        const bool keep_position = (tokenizer->flag & TokenizerFlag_NoIncrement) != 0;
        if (!keep_position) {
            ++tokenizer->ptr;
        }

        if (tokenizer->flag & TokenizerFlag_Emit) {
            return tokenizer->last;
        }
    }
}