=encoding utf8
=head1 NAME
html/tokenizer - HTML input stream and tokenizer.
=head1 SYNOPSIS
from html/tokenizer import HTMLTokenizer;
let tokenizer := new HTMLTokenizer( _input: "<p title='x'>&</p>" );
let tokens := tokenizer.tokenize();
=head1 NOTE
This module is not normally useful to end users. Instead use C<html/parser>.
=head1 DESCRIPTION
This module implements the tokenizer layer for C<html/parser>. It
accepts already-decoded ZuzuScript strings, normalizes line endings,
tracks source position, emits HTML tokenizer tokens, and records
non-fatal parse errors with line, column, offset, and tokenizer state.
It intentionally does not build a DOM tree. C<html/parser> re-exports
these classes for focused tokenizer tests and for the tree builder. The
tokenizer exposes C<setAllowCDATA> and C<allowCDATA> so the tree builder
can recognise CDATA sections only while processing SVG or MathML foreign
content.
=head1 EXPORTS
=head2 Classes
=over
=item C<HTMLTokenizer>
Tokenizer for HTML strings. Construct it with C<_input> or call
C<reset(String input, String state?)> to reuse it. C<tokenize()> returns
all tokens. C<nextToken()> returns one token at a time and eventually an
EOF token.
Public state methods are C<state>, C<setState>,
C<setLastStartTagName>, C<lastStartTagName>, C<setAllowCDATA>,
C<allowCDATA>, and C<errors>. C<setState> accepts tokenizer state names
such as C<data>, C<rcdata>, C<rawtext>, C<script_data>, and
C<plaintext>. C<errors()> returns a copy of the parse errors emitted
during the last tokenization run.
=item C<HTMLInputStream>
Input stream used by C<HTMLTokenizer>. It normalizes CRLF and CR line
endings to LF, tracks source position, and exposes C<source>, C<offset>,
C<line>, C<column>, C<lastOffset>, C<lastLine>, C<lastColumn>, C<eof>,
C<consume>, C<reconsume>, C<peek>, and C<match>. Most users should use
C<HTMLTokenizer> instead of consuming the stream directly.
=item C<HTMLToken>
Token object emitted by the tokenizer. C<type()> returns one of
C<StartTag>, C<EndTag>, C<Character>, C<Comment>, C<DOCTYPE>, or
C<EOF>. Other accessors are C<data>, C<tagName>, C<attributes>,
C<getAttribute>, C<hasAttribute>, C<selfClosing>, C<publicId>,
C<systemId>, C<forceQuirks>, and C<toDebugString>.
=item C<HTMLParseError>
Non-fatal tokenizer or tree-construction error. Accessors are C<code>,
C<message>, C<line>, C<column>, C<offset>, C<state>, and C<to_String>.
=item C<HTMLNamedCharacterReferences>
Small named-reference table wrapper. C<table()> returns the current
mapping, C<get(String name)> returns a named reference or C<null>,
C<isComplete()> returns false, and C<coverage()> returns the string
C<partial>.
=back
=head1 CHARACTER REFERENCES
Numeric decimal and hexadecimal references are implemented, including
HTML replacement handling for null, surrogate, out-of-range, and C1
Windows-1252 values. The named-reference table is deliberately partial
and covers the common entities needed by the focused tokenizer suite:
C<amp>, C<lt>, C<gt>, C<quot>, C<apos>, C<nbsp>, C<copy>, C<reg>, and
C<not>, with their semicolon forms and the legacy no-semicolon forms
used by HTML tokenization.
=head1 LIMITATIONS
There is no DOM tree construction inside this module, no html5lib
C<.dat> harness, no full CSS selector support, and no full WHATWG
named-character-reference table.
=head1 COPYRIGHT AND LICENCE
B<< html/tokenizer >> is copyright Toby Inkster.
It is free software; you may redistribute it and/or modify it under
the terms of either the Artistic License 1.0 or the GNU General Public
License version 2.
=cut
from std/string import chr, index, join, ord, replace, substr, trim;
// Coerce any value to a string; null becomes the empty string.
function _html_tok_string ( value ) {
    if ( value ≡ null ) {
        return "";
    }
    return "" _ value;
}
// Collapse any value to a strict true/false according to its truthiness.
function _html_tok_bool ( value ) {
    return true if value;
    return false;
}
// Lower-case a value after string coercion; null is treated as "".
function _html_tok_lc ( value ) {
    let text := value ≡ null ? "" : "" _ value;
    return lc(text);
}
// Canonicalise a tokenizer state name: default to "data" for null,
// lower-case it, and turn spaces and hyphens into underscores
// (so "Script Data" and "script-data" both become "script_data").
function _html_tok_state_name ( value ) {
    let raw := value ≡ null ? "data" : value;
    let lowered := lc("" _ raw);
    let no_spaces := replace( lowered, " ", "_", "g" );
    return replace( no_spaces, "-", "_", "g" );
}
// Return a new array holding a fresh { name, value } record for each
// attribute, so callers cannot mutate the token's own attribute list.
function _html_tok_copy_attrs ( Array attrs ) {
    let copies := [];
    for ( let original in attrs ) {
        let copy := {
            name: original{name},
            value: original{value},
        };
        copies.push(copy);
    }
    return copies;
}
// True for the whitespace characters the tokenizer recognises: space,
// line feed, tab and form feed (chr(12)). Carriage return is not listed;
// HTMLInputStream normalizes CR and CRLF to LF before tokenizing.
function _html_tok_is_space ( String ch ) {
    return true if ch eq " ";
    return true if ch eq "\n";
    return true if ch eq "\t";
    return ch eq chr(12);
}
// Tests ch against an anchored pattern for a single ASCII letter
// (A-Z or a-z). An empty string does not match.
function _html_tok_is_alpha ( String ch ) {
return ch ~ /^[A-Za-z]$/;
}
// Tests ch against an anchored pattern for a single ASCII decimal digit.
function _html_tok_is_digit ( String ch ) {
return ch ~ /^[0-9]$/;
}
// Tests ch against an anchored pattern for a single ASCII hexadecimal
// digit (0-9, A-F or a-f).
function _html_tok_is_hex_digit ( String ch ) {
return ch ~ /^[0-9A-Fa-f]$/;
}
// Tests ch against an anchored pattern for a single ASCII letter or digit.
function _html_tok_is_alnum ( String ch ) {
return ch ~ /^[0-9A-Za-z]$/;
}
// Lower-case ch only when it is an ASCII letter; every other character
// (including non-ASCII letters) is returned untouched.
function _html_tok_ascii_lower_char ( String ch ) {
    if ( _html_tok_is_alpha(ch) ) {
        return lc(ch);
    }
    return ch;
}
// Numeric value of a hexadecimal digit, found as its position in the
// lower-case hex alphabet. The result for a non-hex character is whatever
// index() reports for "not found".
function _html_tok_hex_value ( String ch ) {
    let alphabet := "0123456789abcdef";
    let needle := lc(ch);
    return index( alphabet, needle );
}
// Parse a string of digits in the given base into a number.
//   text - digits only (no sign, no prefix)
//   base - 16 selects case-insensitive hexadecimal; any other value is
//          read against the decimal digit alphabet
// Returns the accumulated value (0 for an empty string), or null as soon
// as a character is not a valid digit below `base`.
function _html_tok_parse_int ( String text, Number base ) {
    let is_hex := base == 16;
    let alphabet := is_hex ? "0123456789abcdef" : "0123456789";
    let total := 0;
    let pos := 0;
    while ( pos < length text ) {
        let ch := substr( text, pos, 1 );
        let digit := index( alphabet, is_hex ? lc(ch) : ch );
        if ( digit < 0 or digit >= base ) {
            return null;
        }
        total := total * base + digit;
        pos++;
    }
    return total;
}
// Map a C1 control code point (128-159) used in a numeric character
// reference to the code point of the corresponding Windows-1252
// character. Returns null for every code without an entry, including
// 129, 141, 143, 144 and 157, which are absent from the table below.
function _html_tok_control_replacement ( Number code ) {
switch ( code: == ) {
case 128: return 8364; // 0x80 -> U+20AC EURO SIGN
case 130: return 8218; // 0x82 -> U+201A SINGLE LOW-9 QUOTATION MARK
case 131: return 402; // 0x83 -> U+0192 LATIN SMALL LETTER F WITH HOOK
case 132: return 8222; // 0x84 -> U+201E DOUBLE LOW-9 QUOTATION MARK
case 133: return 8230; // 0x85 -> U+2026 HORIZONTAL ELLIPSIS
case 134: return 8224; // 0x86 -> U+2020 DAGGER
case 135: return 8225; // 0x87 -> U+2021 DOUBLE DAGGER
case 136: return 710; // 0x88 -> U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
case 137: return 8240; // 0x89 -> U+2030 PER MILLE SIGN
case 138: return 352; // 0x8A -> U+0160 LATIN CAPITAL LETTER S WITH CARON
case 139: return 8249; // 0x8B -> U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
case 140: return 338; // 0x8C -> U+0152 LATIN CAPITAL LIGATURE OE
case 142: return 381; // 0x8E -> U+017D LATIN CAPITAL LETTER Z WITH CARON
case 145: return 8216; // 0x91 -> U+2018 LEFT SINGLE QUOTATION MARK
case 146: return 8217; // 0x92 -> U+2019 RIGHT SINGLE QUOTATION MARK
case 147: return 8220; // 0x93 -> U+201C LEFT DOUBLE QUOTATION MARK
case 148: return 8221; // 0x94 -> U+201D RIGHT DOUBLE QUOTATION MARK
case 149: return 8226; // 0x95 -> U+2022 BULLET
case 150: return 8211; // 0x96 -> U+2013 EN DASH
case 151: return 8212; // 0x97 -> U+2014 EM DASH
case 152: return 732; // 0x98 -> U+02DC SMALL TILDE
case 153: return 8482; // 0x99 -> U+2122 TRADE MARK SIGN
case 154: return 353; // 0x9A -> U+0161 LATIN SMALL LETTER S WITH CARON
case 155: return 8250; // 0x9B -> U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
case 156: return 339; // 0x9C -> U+0153 LATIN SMALL LIGATURE OE
case 158: return 382; // 0x9E -> U+017E LATIN SMALL LETTER Z WITH CARON
case 159: return 376; // 0x9F -> U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
}
return null;
}
// Build the (deliberately partial) named character reference table.
// Keys are the text that follows "&"; each name appears twice, with and
// without the trailing semicolon, and both forms map to the same
// replacement string. A fresh dictionary is returned on every call.
function _html_tok_named_references () {
return {
"amp": "&",
"amp;": "&",
"apos": "'",
"apos;": "'",
"copy": chr(169), // U+00A9 COPYRIGHT SIGN
"copy;": chr(169),
"gt": ">",
"gt;": ">",
"lt": "<",
"lt;": "<",
"nbsp": chr(160), // U+00A0 NO-BREAK SPACE
"nbsp;": chr(160),
"not": chr(172), // U+00AC NOT SIGN
"not;": chr(172),
"quot": "\"",
"quot;": "\"",
"reg": chr(174), // U+00AE REGISTERED SIGN
"reg;": chr(174),
};
}
// Keys of the named-reference table in matching priority order.
// _consume_named_character_reference walks this list front to back and
// takes the first key that matches the input, so the order matters:
// every semicolon form comes before every bare form, and within each
// group longer keys come before shorter ones.
function _html_tok_named_reference_keys () {
return [
// Semicolon-terminated forms, longest first.
"apos;",
"copy;",
"nbsp;",
"quot;",
"amp;",
"not;",
"reg;",
"gt;",
"lt;",
// Legacy forms without the semicolon, longest first.
"apos",
"copy",
"nbsp",
"quot",
"amp",
"not",
"reg",
"gt",
"lt",
];
}
// Static facade over the module's partial named character reference table.
class HTMLNamedCharacterReferences {
    // Return a freshly built name -> replacement dictionary.
    static method table () {
        let refs := _html_tok_named_references();
        return refs;
    }

    // Look up one reference name (for example "amp" or "amp;").
    // Returns the replacement string, or null when the name is not known.
    static method get ( String name ) {
        let refs := _html_tok_named_references();
        if ( refs.exists(name) ) {
            return refs.get(name);
        }
        return null;
    }

    // Always false: the table is not the full WHATWG entity list.
    static method isComplete () {
        return false;
    }

    // Coverage label for the table; always the string "partial".
    static method coverage () {
        return "partial";
    }
}
// Immutable record describing one non-fatal parse error: an error code,
// a human-readable message, the source position (line, column, offset)
// and the tokenizer state name recorded with it.
class HTMLParseError {
    let String _code := "";
    let String _message := "";
    let Number _line := 1;
    let Number _column := 1;
    let Number _offset := 0;
    let String _state := "data";

    // Short machine-readable error code.
    method code () {
        return _code;
    }

    // Human-readable description of the error.
    method message () {
        return _message;
    }

    // Line number recorded for the error.
    method line () {
        return _line;
    }

    // Column number recorded for the error.
    method column () {
        return _column;
    }

    // Character offset recorded for the error.
    method offset () {
        return _offset;
    }

    // Tokenizer state name recorded for the error.
    method state () {
        return _state;
    }

    // One-line summary: "<code> at line <L>, column <C> in <state>".
    method to_String () {
        let position := " at line " _ _line _ ", column " _ _column;
        return _code _ position _ " in " _ _state;
    }
}
// One token produced by the tokenizer. `_type` is one of the strings the
// tokenizer emits ("StartTag", "EndTag", "Character", "Comment",
// "DOCTYPE", "EOF"); the remaining fields are meaningful only for the
// token types that use them.
class HTMLToken {
    let String _type := "";
    let String _data := "";
    let String _tag_name := "";
    let Array _attributes := [];
    let Boolean _self_closing := false;
    let _public_id := null;
    let _system_id := null;
    let Boolean _force_quirks := false;

    // Normalise constructor input: guarantee an attribute array,
    // lower-case the tag name and coerce the data to a string.
    method __build__ () {
        if ( _attributes ≡ null ) {
            _attributes := [];
        }
        if ( _tag_name ≢ null ) {
            _tag_name := _html_tok_lc(_tag_name);
        }
        if ( _data ≢ null ) {
            _data := _html_tok_string(_data);
        }
    }

    // Token type string.
    method type () {
        return _type;
    }

    // Character or comment text.
    method data () {
        return _data;
    }

    // Lower-cased tag name (tags) or doctype name (DOCTYPE tokens).
    method tagName () {
        return _tag_name;
    }

    // Copy of the attribute list as an array of { name, value } records.
    method attributes () {
        let copy := _html_tok_copy_attrs(_attributes);
        return copy;
    }

    // Value of the first attribute whose name matches `name`
    // (compared after lower-casing `name`), or null when absent.
    method getAttribute ( String name ) {
        let key := _html_tok_lc(name);
        for ( let entry in _attributes ) {
            if ( entry{name} eq key ) {
                return entry{value};
            }
        }
        return null;
    }

    // True when an attribute called `name` (lower-cased) is present.
    method hasAttribute ( String name ) {
        let key := _html_tok_lc(name);
        for ( let entry in _attributes ) {
            if ( entry{name} eq key ) {
                return true;
            }
        }
        return false;
    }

    // Whether the tag was written with a trailing "/>".
    method selfClosing () {
        return _self_closing;
    }

    // DOCTYPE public identifier, or null when none was given.
    method publicId () {
        return _public_id;
    }

    // DOCTYPE system identifier, or null when none was given.
    method systemId () {
        return _system_id;
    }

    // DOCTYPE force-quirks flag.
    method forceQuirks () {
        return _force_quirks;
    }

    // Append an attribute; the name is lower-cased and the value coerced
    // to a string. Returns the token for chaining.
    method _add_attribute ( String name, String value ) {
        let entry := {
            name: _html_tok_lc(name),
            value: _html_tok_string(value),
        };
        _attributes.push(entry);
        return self;
    }

    // Set the self-closing flag. Returns the token for chaining.
    method _set_self_closing ( Boolean value ) {
        _self_closing := value;
        return self;
    }

    // Compact textual form for tests and debugging, e.g.
    // "StartTag(p title=x /)", "Character(text)", "DOCTYPE(html)".
    // Token types without a payload (such as EOF) print as the bare type.
    method toDebugString () {
        if ( _type eq "Character" or _type eq "Comment" ) {
            return _type _ "(" _ _data _ ")";
        }
        if ( _type eq "DOCTYPE" ) {
            return "DOCTYPE(" _ _tag_name _ ")";
        }
        if ( _type eq "StartTag" or _type eq "EndTag" ) {
            let pairs := [];
            for ( let entry in _attributes ) {
                pairs.push( entry{name} _ "=" _ entry{value} );
            }
            let text := _type _ "(" _ _tag_name;
            if ( pairs.length() ) {
                text := text _ " " _ join( " ", pairs );
            }
            if ( _self_closing ) {
                text := text _ " /";
            }
            return text _ ")";
        }
        return _type;
    }
}
// Character stream over an already-decoded string. CRLF and lone CR are
// normalized to LF on reset, and the stream tracks the current position
// (offset/line/column) as well as the position of the most recently
// consumed character (lastOffset/lastLine/lastColumn).
class HTMLInputStream {
    let String _input := "";
    let String _source := "";
    let Number _offset := 0;
    let Number _line := 1;
    let Number _column := 1;
    let Number _last_offset := 0;
    let Number _last_line := 1;
    let Number _last_column := 1;
    let String _last_char := "";
    let Boolean _reconsume := false;

    method __build__ () {
        self.reset(_input);
    }

    // Replace CRLF pairs first, then any remaining lone CR, with LF.
    method _normalize ( String input ) {
        let out := replace( input, "\r\n", "\n", "g" );
        out := replace( out, "\r", "\n", "g" );
        return out;
    }

    // Load new input and rewind every position counter.
    // Returns the stream for chaining.
    method reset ( String input ) {
        _input := input;
        // Coerce defensively: null becomes "".
        _source := self._normalize( input ≡ null ? "" : "" _ input );
        _offset := 0;
        _line := 1;
        _column := 1;
        _last_offset := 0;
        _last_line := 1;
        _last_column := 1;
        _last_char := "";
        _reconsume := false;
        return self;
    }

    // The normalized text being read.
    method source () {
        return _source;
    }

    // Offset of the next unread character (a pending reconsume is not
    // reflected here).
    method offset () {
        return _offset;
    }

    // Line number (starting at 1) of the next unread character.
    method line () {
        return _line;
    }

    // Column number (starting at 1) of the next unread character.
    method column () {
        return _column;
    }

    // Offset of the most recently consumed character.
    method lastOffset () {
        return _last_offset;
    }

    // Line of the most recently consumed character.
    method lastLine () {
        return _last_line;
    }

    // Column of the most recently consumed character.
    method lastColumn () {
        return _last_column;
    }

    // True when nothing is left to consume: the offset is at or past the
    // end of the source and no reconsume is pending.
    method eof () {
        return _offset >= length _source and not _reconsume;
    }

    // Return the next character and advance, or null at end of input.
    // When a reconsume is pending, the previous character is returned
    // again and the position counters are left untouched.
    method consume () {
        if ( _reconsume ) {
            _reconsume := false;
            return _last_char;
        }
        return null if _offset >= length _source;
        let ch := substr( _source, _offset, 1 );
        _last_offset := _offset;
        _last_line := _line;
        _last_column := _column;
        _last_char := ch;
        _offset++;
        if ( ch eq "\n" ) {
            _line++;
            _column := 1;
        }
        else {
            _column++;
        }
        return ch;
    }

    // Arrange for the next consume() to return the last character again.
    // Does nothing before the first character has been consumed.
    method reconsume () {
        _reconsume := true if _last_char ne "";
        return self;
    }

    // Look ahead up to n characters without consuming; "" at end of input.
    // While a reconsume is pending the lookahead starts at the character
    // that consume() will hand back, so peek(), match(), eof() and
    // consume() all describe the same upcoming input.
    method peek ( Number n := 1 ) {
        let start := _reconsume ? _last_offset : _offset;
        return "" if start >= length _source;
        return substr( _source, start, n );
    }

    // True when the upcoming input starts with `text`; the comparison
    // lower-cases both sides when case_insensitive is set.
    method match ( String text, Boolean case_insensitive := false ) {
        let got := self.peek(length text);
        return case_insensitive
            ? lc(got) eq lc(text)
            : got eq text;
    }
}
class HTMLTokenizer {
let String _input := "";
let String _state := "data";
let _last_start_tag_name := null;
let _stream := null;
let Array _tokens := [];
let Array _errors := [];
let Number _read_index := 0;
let Boolean _tokenized := false;
let Boolean _eof_emitted := false;
let Boolean _allow_cdata := false;
// Constructor hook: initialise the tokenizer from the constructor fields.
// reset() clears _last_start_tag_name and _allow_cdata, so the
// constructor-supplied values are captured first and restored afterwards
// (the tag name lower-cased, matching setLastStartTagName).
method __build__ () {
    let initial_tag := _last_start_tag_name;
    let initial_cdata := _allow_cdata;
    self.reset( _input, _state );
    _last_start_tag_name := _html_tok_lc(initial_tag)
        unless initial_tag ≡ null;
    _allow_cdata := initial_cdata ? true : false;
}
// Re-initialise the tokenizer for new input.
//   input - text to tokenize (null is treated as "")
//   state - initial tokenizer state name; defaults to "data"
// Discards buffered tokens and errors, forgets the last start tag name
// and disables CDATA recognition. Returns the tokenizer for chaining.
method reset ( String input, String state := "data" ) {
    // Input and the stream that reads it.
    _input := _html_tok_string(input);
    _stream := new HTMLInputStream( _input: _input );

    // Tokenizer mode.
    _state := _html_tok_state_name(state);
    _last_start_tag_name := null;
    _allow_cdata := false;

    // Output buffers and progress flags.
    _tokens := [];
    _errors := [];
    _read_index := 0;
    _tokenized := false;
    _eof_emitted := false;
    return self;
}
// Current tokenizer state name (for example "data" or "rcdata").
method state () {
return _state;
}
// Switch the tokenizer state. The name is normalized by
// _html_tok_state_name (lower-cased, spaces and hyphens turned into
// underscores). Returns the tokenizer for chaining.
method setState ( String state ) {
_state := _html_tok_state_name(state);
return self;
}
// Record the name of the most recent start tag, which decides what
// counts as the matching end tag in rcdata/rawtext/script_data states.
// The name is stored lower-cased; null clears it.
// Returns the tokenizer for chaining.
method setLastStartTagName ( name ) {
    if ( name ≡ null ) {
        _last_start_tag_name := null;
    }
    else {
        _last_start_tag_name := _html_tok_lc(name);
    }
    return self;
}
// Name of the most recent start tag, or null when none is recorded.
method lastStartTagName () {
return _last_start_tag_name;
}
// Enable or disable recognition of "<![CDATA[" sections.
// The flag is stored as a strict true/false.
// Returns the tokenizer for chaining.
method setAllowCDATA ( Boolean allow ) {
    if ( allow ) {
        _allow_cdata := true;
    }
    else {
        _allow_cdata := false;
    }
    return self;
}
// Whether "<![CDATA[" sections are currently recognised.
method allowCDATA () {
return _allow_cdata;
}
// Return a new array containing the parse errors recorded so far, so
// callers cannot modify the tokenizer's own list.
method errors () {
    let copy := [];
    for ( let recorded in _errors ) {
        copy.push(recorded);
    }
    return copy;
}
// Tokenize the whole input (once) and return a new array with every
// token, ending with the EOF token.
method tokenize () {
    if ( not _tokenized ) {
        self._run();
    }
    let copy := [];
    for ( let produced in _tokens ) {
        copy.push(produced);
    }
    return copy;
}
// Return the next unread token, tokenizing lazily as required.
// After the EOF token has been handed out, returns null.
method nextToken () {
    // Keep cycling until an unread token is buffered or EOF was emitted.
    while ( not _eof_emitted and _read_index >= _tokens.length() ) {
        self._run_one_cycle();
    }
    if ( _read_index >= _tokens.length() ) {
        return null;
    }
    let upcoming := _tokens[_read_index];
    _read_index := _read_index + 1;
    return upcoming;
}
// Drive the tokenizer to completion: repeat single cycles until the EOF
// token has been emitted, then mark the input as fully tokenized so
// tokenize() does not run again. Returns the tokenizer.
method _run () {
while ( not _eof_emitted ) {
self._run_one_cycle();
}
_tokenized := true;
return self;
}
// Perform one unit of tokenizing work for the current state.
// At end of input this emits the EOF token (preceded by an
// "eof-in-<state>" error unless the state is data or plaintext);
// otherwise it dispatches to the handler for the current state.
// An unknown state records an error and falls back to "data".
// Returns the tokenizer.
method _run_one_cycle () {
    return self if _eof_emitted;

    if ( _stream.eof() ) {
        if ( _state ne "data" and _state ne "plaintext" ) {
            self._parse_error(
                "eof-in-" _ _state,
                "End of file in " _ _state _ " state",
            );
        }
        self._emit(new HTMLToken( _type: "EOF" ));
        _eof_emitted := true;
        return self;
    }

    if ( _state eq "data" ) {
        self._tokenize_data();
        return self;
    }
    if ( _state eq "rcdata" ) {
        // Only rcdata expands character references.
        self._tokenize_text_mode( "rcdata", true );
        return self;
    }
    if ( _state eq "rawtext" ) {
        self._tokenize_text_mode( "rawtext", false );
        return self;
    }
    if ( _state eq "script_data" ) {
        self._tokenize_text_mode( "script_data", false );
        return self;
    }
    if ( _state eq "plaintext" ) {
        self._tokenize_plaintext();
        return self;
    }

    self._parse_error(
        "unsupported-tokenizer-state",
        "Unsupported tokenizer state " _ _state,
    );
    self.setState("data");
    return self;
}
// Append a token to the output buffer and return that same token.
method _emit ( HTMLToken token ) {
_tokens.push(token);
return token;
}
// Emit a Character token carrying `data`. Empty text produces no token
// and returns null; otherwise the emitted token is returned.
method _emit_character ( String data ) {
    if ( data eq "" ) {
        return null;
    }
    let token := new HTMLToken( _type: "Character", _data: data );
    return self._emit(token);
}
// Record a parse error at the stream's current position (the next
// unread character), tagged with the current tokenizer state.
// Returns the tokenizer.
method _parse_error ( String code, String message ) {
    let err := new HTMLParseError(
        _code: code,
        _message: message,
        _line: _stream.line(),
        _column: _stream.column(),
        _offset: _stream.offset(),
        _state: _state,
    );
    _errors.push(err);
    return self;
}
// Record a parse error at the position of the most recently consumed
// character, tagged with the current tokenizer state.
// Returns the tokenizer.
method _parse_error_at_last ( String code, String message ) {
    let err := new HTMLParseError(
        _code: code,
        _message: message,
        _line: _stream.lastLine(),
        _column: _stream.lastColumn(),
        _offset: _stream.lastOffset(),
        _state: _state,
    );
    _errors.push(err);
    return self;
}
// Consume as many characters from the stream as `text` is long.
// The stream content is not compared with `text`; callers check with
// _starts_with first. Returns `text`.
method _consume_string ( String text ) {
    let remaining := length text;
    while ( remaining > 0 ) {
        _stream.consume();
        remaining := remaining - 1;
    }
    return text;
}
// True when the unread input begins with `text`; `ci` requests a
// case-insensitive comparison. Thin wrapper around the stream's match().
method _starts_with ( String text, Boolean ci := false ) {
return _stream.match( text, ci );
}
// Consume whitespace (as defined by _html_tok_is_space) up to the next
// non-space character or end of input. Returns the tokenizer.
method _skip_spaces () {
    while ( not _stream.eof() ) {
        last unless _html_tok_is_space(_stream.peek());
        _stream.consume();
    }
    return self;
}
// One step in the data state. Collects plain text up to the next "<" or
// "&" and emits it as a single Character token. When no text precedes
// the delimiter, handles the delimiter itself instead: "&" starts a
// character reference, "<" starts markup. NUL characters are reported
// and replaced with U+FFFD.
method _tokenize_data () {
    let buffer := "";
    while ( not _stream.eof() ) {
        let ahead := _stream.peek();
        last if ahead eq "<" or ahead eq "&";
        let ch := _stream.consume();
        if ( ch ne chr(0) ) {
            buffer _= ch;
            next;
        }
        self._parse_error_at_last(
            "unexpected-null-character",
            "Unexpected null character",
        );
        buffer _= chr(65533);
    }

    // Pending text goes out first; the delimiter waits for the next cycle.
    if ( buffer ne "" ) {
        return self._emit_character(buffer);
    }
    return null if _stream.eof();

    if ( _stream.peek() eq "&" ) {
        _stream.consume();
        let decoded := self._consume_character_reference(false);
        return self._emit_character(decoded);
    }
    return self._consume_markup();
}
// Plaintext state: everything up to end of input is literal text, emitted
// as one Character token. NUL characters are reported and replaced with
// U+FFFD. Returns the tokenizer.
method _tokenize_plaintext () {
    let buffer := "";
    while ( not _stream.eof() ) {
        let ch := _stream.consume();
        if ( ch ne chr(0) ) {
            buffer _= ch;
            next;
        }
        self._parse_error_at_last(
            "unexpected-null-character",
            "Unexpected null character",
        );
        buffer _= chr(65533);
    }
    self._emit_character(buffer);
    return self;
}
// Name of the end tag that terminates the current text mode: the last
// start tag name when one is recorded, otherwise a per-state default
// ("textarea" for rcdata, "style" for rawtext, "script" for anything else).
method _appropriate_end_tag_name () {
    return _last_start_tag_name if _last_start_tag_name ≢ null;
    return "textarea" if _state eq "rcdata";
    return "style" if _state eq "rawtext";
    return "script";
}
// One step in a text-only state (rcdata, rawtext or script_data).
//   mode        - state name supplied by the dispatcher (informational;
//                 not read by this method)
//   expand_refs - true when "&" character references are decoded (rcdata)
// Collects text up to the matching end tag (see
// _appropriate_end_tag_name) or end of input and emits it as one
// Character token. When the end tag was found it is tokenized too, and
// the tokenizer then returns to the data state as the HTML Standard
// requires after an appropriate end tag; without that switch all
// following markup would still be read as text. NUL characters are
// reported and replaced with U+FFFD. Returns the tokenizer.
method _tokenize_text_mode ( String mode, Boolean expand_refs ) {
    let text := "";
    let tag := self._appropriate_end_tag_name();
    while ( not _stream.eof() ) {
        if (
            tag ne "" and
            self._starts_with("</" _ tag, true) and
            self._text_end_tag_follows(tag)
        ) {
            last;
        }
        if ( expand_refs and self._starts_with("&") ) {
            _stream.consume();
            text _= self._consume_character_reference(false);
        }
        else {
            let ch := _stream.consume();
            if ( ch eq chr(0) ) {
                self._parse_error_at_last(
                    "unexpected-null-character",
                    "Unexpected null character",
                );
                text _= chr(65533);
            }
            else {
                text _= ch;
            }
        }
    }
    self._emit_character(text);
    if ( not _stream.eof() ) {
        // The loop only stops early on the matching end tag.
        self._consume_markup();
        // Switch after tokenizing the tag so its parse errors are still
        // attributed to the text state it was found in.
        _state := "data";
    }
    return self;
}
// Given that the input starts with "</" followed by `tag`, check that the
// tag name really ends there: the next character must be absent (end of
// input), ">", "/" or whitespace. This stops "</scriptx" from ending a
// script block.
method _text_end_tag_follows ( String tag ) {
    let name_end := 2 + length tag;
    let window := _stream.peek(name_end + 1);
    let after := substr( window, name_end, 1 );
    return true if after eq "" or after eq ">";
    return true if _html_tok_is_space(after);
    return after eq "/";
}
// Tokenize whatever follows a "<" (the caller guarantees the next
// character is "<"). Dispatches to the comment, CDATA, doctype, bogus
// comment, end tag or start tag parser. When the "<" does not introduce
// markup it is emitted as a literal Character token. Returns whatever
// the chosen parser returns.
method _consume_markup () {
    _stream.consume(); // <
    if ( _stream.eof() ) {
        self._parse_error_at_last(
            "eof-before-tag-name",
            "EOF before tag name",
        );
        return self._emit_character("<");
    }
    if ( self._starts_with("!--") ) {
        self._consume_string("!--");
        return self._parse_comment();
    }
    // CDATA is only recognised while the tree builder allows it.
    if ( self._starts_with("![CDATA[") and _allow_cdata ) {
        self._consume_string("![CDATA[");
        return self._parse_cdata_section();
    }
    if ( self._starts_with("!DOCTYPE", true) ) {
        self._consume_string(substr( _stream.peek(8), 0, 8 ));
        return self._parse_doctype();
    }
    if ( self._starts_with("!") ) {
        _stream.consume();
        self._parse_error(
            "incorrectly-opened-comment",
            "Markup declaration is not comment or doctype",
        );
        return self._parse_bogus_comment();
    }
    if ( self._starts_with("?") ) {
        // The "?" is deliberately NOT consumed: the HTML Standard
        // reconsumes it in the bogus comment state, so "<?xml?>" yields
        // a comment whose data is "?xml?".
        self._parse_error(
            "unexpected-question-mark-instead-of-tag-name",
            "Unexpected question mark instead of tag name",
        );
        return self._parse_bogus_comment();
    }
    if ( self._starts_with("/") ) {
        _stream.consume();
        return self._parse_end_tag();
    }
    if ( _html_tok_is_alpha(_stream.peek()) ) {
        return self._parse_start_tag();
    }
    self._parse_error(
        "invalid-first-character-of-tag-name",
        "Invalid first character of tag name",
    );
    return self._emit_character("<");
}
// Read a CDATA section body (the opening "<![CDATA[" is already
// consumed) up to "]]>" and emit it as a Character token. Reaching end
// of input first records "eof-in-cdata" and emits what was collected.
method _parse_cdata_section () {
    let body := "";
    while ( not _stream.eof() ) {
        if ( _stream.peek(3) eq "]]>" ) {
            self._consume_string("]]>");
            return self._emit_character(body);
        }
        body _= _stream.consume();
    }
    self._parse_error(
        "eof-in-cdata",
        "EOF in CDATA section",
    );
    return self._emit_character(body);
}
// Read a tag name up to (not including) whitespace, "/" or ">", or to
// end of input. ASCII letters are lower-cased; NUL is reported and
// replaced with U+FFFD. Returns the collected name.
method _parse_tag_name () {
    let collected := "";
    while ( not _stream.eof() ) {
        let ch := _stream.peek();
        if ( _html_tok_is_space(ch) or ch eq "/" or ch eq ">" ) {
            last;
        }
        if ( ch ne chr(0) ) {
            collected _= _html_tok_ascii_lower_char(_stream.consume());
            next;
        }
        // Report before consuming so the error points at the NUL itself.
        self._parse_error(
            "unexpected-null-character",
            "Unexpected null in tag name",
        );
        collected _= chr(65533);
        _stream.consume();
    }
    return collected;
}
// Parse a start tag whose "<" has been consumed and whose first
// character is an ASCII letter. Reads the tag name, remembers it as the
// last start tag name (used to find the matching end tag in text modes),
// parses the attributes and emits the StartTag token, which is returned.
method _parse_start_tag () {
let name := self._parse_tag_name();
let token := new HTMLToken(
_type: "StartTag",
_tag_name: name,
_attributes: [],
);
_last_start_tag_name := name;
// false: this is not an end tag, so "/>" marks the token self-closing.
self._parse_attributes( token, false );
return self._emit(token);
}
// Parse an end tag whose "</" has already been consumed.
//   - end of input: records "eof-before-tag-name", emits "</" as text
//   - "</>": records "missing-end-tag-name", consumes the ">" and emits
//     no token at all, as the HTML Standard specifies
//   - any other non-letter: records
//     "invalid-first-character-of-tag-name" and treats the rest as a
//     bogus comment
//   - otherwise: reads the name and attributes, emits an EndTag token
// Returns the emitted token, or null when nothing was emitted.
method _parse_end_tag () {
    if ( _stream.eof() ) {
        self._parse_error(
            "eof-before-tag-name",
            "EOF before end tag name",
        );
        return self._emit_character("</");
    }
    if ( _stream.peek() eq ">" ) {
        self._parse_error(
            "missing-end-tag-name",
            "Missing end tag name",
        );
        _stream.consume();
        return null;
    }
    if ( not _html_tok_is_alpha(_stream.peek()) ) {
        self._parse_error(
            "invalid-first-character-of-tag-name",
            "Invalid first character of end tag name",
        );
        return self._parse_bogus_comment();
    }
    let name := self._parse_tag_name();
    let token := new HTMLToken(
        _type: "EndTag",
        _tag_name: name,
        _attributes: [],
    );
    // true: attributes and a trailing solidus are errors on end tags.
    self._parse_attributes( token, true );
    return self._emit(token);
}
// Parse the remainder of a tag after its name: attributes, an optional
// trailing solidus, and the closing ">".
//   token   - tag token that receives the attributes
//   end_tag - true for end tags, where attributes and "/>" are reported
//             as parse errors
// Returns the tokenizer once ">" has been consumed. Reaching end of
// input first records "eof-in-tag".
//
// A stray "/" (one not followed by ">") is reported and skipped, and an
// "=" where a name is expected is reported and becomes the first
// character of the name, as in the HTML Standard. Consequently an
// attribute name is never empty, and input such as "<a / b>" or
// "<a =b>" no longer aborts the tag with a spurious "eof-in-tag".
method _parse_attributes ( HTMLToken token, Boolean end_tag ) {
    while ( not _stream.eof() ) {
        self._skip_spaces();
        last if _stream.eof();
        let ch := _stream.peek();
        if ( ch eq ">" ) {
            _stream.consume();
            return self;
        }
        if ( ch eq "/" ) {
            _stream.consume();
            if ( _stream.peek() eq ">" ) {
                _stream.consume();
                if ( end_tag ) {
                    self._parse_error(
                        "end-tag-with-trailing-solidus",
                        "End tag has trailing solidus",
                    );
                }
                else {
                    token._set_self_closing(true);
                }
                return self;
            }
            self._parse_error(
                "unexpected-solidus-in-tag",
                "Unexpected solidus in tag",
            );
            // Start over: whitespace, ">" or another "/" may follow.
            next;
        }
        let name := "";
        if ( ch eq "=" ) {
            self._parse_error(
                "unexpected-equals-sign-before-attribute-name",
                "Unexpected equals sign before attribute name",
            );
            name := _stream.consume();
        }
        name _= self._parse_attribute_name();
        let value := "";
        self._skip_spaces();
        if ( _stream.peek() eq "=" ) {
            _stream.consume();
            self._skip_spaces();
            value := self._parse_attribute_value();
        }
        if ( token.hasAttribute(name) ) {
            // The first occurrence wins; later duplicates are dropped.
            self._parse_error(
                "duplicate-attribute",
                "Duplicate attribute " _ name,
            );
        }
        else if ( end_tag ) {
            self._parse_error(
                "end-tag-with-attributes",
                "End tag has attributes",
            );
            token._add_attribute( name, value );
        }
        else {
            token._add_attribute( name, value );
        }
    }
    self._parse_error(
        "eof-in-tag",
        "EOF in tag",
    );
    return self;
}
// Read an attribute name up to (not including) whitespace, "/", ">" or
// "=", or to end of input. ASCII letters are lower-cased. Returns the
// collected name, which is empty when the very next character is one of
// those delimiters.
method _parse_attribute_name () {
    let collected := "";
    while ( not _stream.eof() ) {
        let ch := _stream.peek();
        if ( _html_tok_is_space(ch) ) {
            last;
        }
        if ( ch eq "/" or ch eq ">" or ch eq "=" ) {
            last;
        }
        collected _= _html_tok_ascii_lower_char(_stream.consume());
    }
    return collected;
}
// Read an attribute value (the "=" and following spaces are already
// consumed) and return it with character references decoded.
//   - quoted ("..." or '...'): reads to the matching quote; reaching end
//     of input first records "eof-in-attribute-value"
//   - unquoted: reads up to whitespace, ">" or a "/>" pair
// In both forms a NUL character is reported and replaced with U+FFFD.
// Returns "" at end of input.
method _parse_attribute_value () {
    return "" if _stream.eof();
    let quote := _stream.peek();
    if ( quote eq "\"" or quote eq "'" ) {
        _stream.consume();
        let value := "";
        while ( not _stream.eof() ) {
            let ch := _stream.consume();
            return value if ch eq quote;
            if ( ch eq "&" ) {
                value _= self._consume_character_reference(true);
            }
            else if ( ch eq chr(0) ) {
                self._parse_error_at_last(
                    "unexpected-null-character",
                    "Unexpected null character",
                );
                value _= chr(65533);
            }
            else {
                value _= ch;
            }
        }
        self._parse_error(
            "eof-in-attribute-value",
            "EOF in quoted attribute value",
        );
        return value;
    }
    let value := "";
    while ( not _stream.eof() ) {
        let ch := _stream.peek();
        last if _html_tok_is_space(ch) or ch eq ">";
        // A "/" only ends the value when it closes the tag ("/>"), so
        // unquoted URLs such as href=/a/b keep their slashes.
        last if ch eq "/" and _stream.peek(2) eq "/>";
        if ( ch eq "&" ) {
            _stream.consume();
            value _= self._consume_character_reference(true);
        }
        else if ( ch eq chr(0) ) {
            // Same NUL handling as the quoted branch.
            _stream.consume();
            self._parse_error_at_last(
                "unexpected-null-character",
                "Unexpected null character",
            );
            value _= chr(65533);
        }
        else {
            value _= _stream.consume();
        }
    }
    return value;
}
// Parse a comment whose opening "<!--" has been consumed and emit a
// Comment token (returned).
//   - "<!-->" and "<!--->" are abruptly closed empty comments: records
//     "abrupt-closing-of-empty-comment" and emits an empty comment
//   - "-->" ends the comment normally
//   - "--!>" also ends it, recording "incorrectly-closed-comment" (the
//     code the HTML Standard assigns to this case)
//   - "<!--" inside the comment records "nested-comment" and is kept as
//     comment text
//   - end of input records "eof-in-comment" and emits what was collected
method _parse_comment () {
    let data := "";
    // ">" or "->" straight after the opener closes an empty comment.
    if ( self._starts_with(">") or self._starts_with("->") ) {
        self._parse_error(
            "abrupt-closing-of-empty-comment",
            "Abrupt closing of empty comment",
        );
        self._consume_string( self._starts_with(">") ? ">" : "->" );
        return self._emit(new HTMLToken(
            _type: "Comment",
            _data: data,
        ));
    }
    while ( not _stream.eof() ) {
        if ( self._starts_with("-->") ) {
            self._consume_string("-->");
            return self._emit(new HTMLToken(
                _type: "Comment",
                _data: data,
            ));
        }
        if ( self._starts_with("--!>") ) {
            self._parse_error(
                "incorrectly-closed-comment",
                "Incorrectly closed comment",
            );
            self._consume_string("--!>");
            return self._emit(new HTMLToken(
                _type: "Comment",
                _data: data,
            ));
        }
        if ( self._starts_with("<!--") ) {
            self._parse_error(
                "nested-comment",
                "Nested comment opening",
            );
        }
        data _= _stream.consume();
    }
    self._parse_error(
        "eof-in-comment",
        "EOF in comment",
    );
    return self._emit(new HTMLToken(
        _type: "Comment",
        _data: data,
    ));
}
// Read a bogus comment: everything up to the next ">" (which is consumed
// but not kept) or end of input becomes the data of a Comment token.
// Returns the emitted token.
method _parse_bogus_comment () {
    let body := "";
    while ( not _stream.eof() ) {
        let ch := _stream.consume();
        if ( ch eq ">" ) {
            last;
        }
        body _= ch;
    }
    let token := new HTMLToken(
        _type: "Comment",
        _data: body,
    );
    return self._emit(token);
}
// Parse a doctype whose "<!DOCTYPE" keyword has been consumed and emit a
// DOCTYPE token (returned). The token carries the lower-cased name, the
// optional public and system identifiers (null when absent) and the
// force-quirks flag, which is set whenever the doctype is malformed.
method _parse_doctype () {
self._skip_spaces();
// "<!DOCTYPE>" or end of input straight after the keyword: no name.
if ( _stream.eof() or _stream.peek() eq ">" ) {
self._parse_error(
"missing-doctype-name",
"Missing doctype name",
);
_stream.consume() if _stream.peek() eq ">";
return self._emit(new HTMLToken(
_type: "DOCTYPE",
_tag_name: "",
_force_quirks: true,
));
}
// The name runs up to whitespace or ">".
let name := "";
while ( not _stream.eof() ) {
let ch := _stream.peek();
last if _html_tok_is_space(ch) or ch eq ">";
name _= _html_tok_ascii_lower_char(_stream.consume());
}
self._skip_spaces();
let public_id := null;
let system_id := null;
let force := false;
if ( self._starts_with("PUBLIC", true) ) {
// Consume the keyword as written (any letter case).
self._consume_string(substr( _stream.peek(6), 0, 6 ));
self._skip_spaces();
public_id := self._parse_doctype_quoted();
if ( public_id ≡ null ) {
self._parse_error(
"missing-doctype-public-identifier",
"Missing doctype public identifier",
);
force := true;
}
// A system identifier may follow the public one; it is optional,
// so a null result here is not reported.
self._skip_spaces();
system_id := self._parse_doctype_quoted();
}
else if ( self._starts_with("SYSTEM", true) ) {
self._consume_string(substr( _stream.peek(6), 0, 6 ));
self._skip_spaces();
system_id := self._parse_doctype_quoted();
if ( system_id ≡ null ) {
self._parse_error(
"missing-doctype-system-identifier",
"Missing doctype system identifier",
);
force := true;
}
}
else if ( _stream.peek() ne ">" and not _stream.eof() ) {
self._parse_error(
"invalid-character-sequence-after-doctype-name",
"Invalid text after doctype name",
);
force := true;
}
// Skip anything left over up to the closing ">".
while ( not _stream.eof() and _stream.peek() ne ">" ) {
_stream.consume();
}
if ( _stream.peek() eq ">" ) {
_stream.consume();
}
else {
self._parse_error(
"eof-in-doctype",
"EOF in doctype",
);
force := true;
}
return self._emit(new HTMLToken(
_type: "DOCTYPE",
_tag_name: name,
_public_id: public_id,
_system_id: system_id,
_force_quirks: force,
));
}
// Read a quoted doctype identifier. Returns null (consuming nothing)
// when the next character is not a single or double quote or the input
// has ended; otherwise returns the text between the quotes. When the
// closing quote is missing, records "eof-in-doctype" and returns the
// text collected up to end of input.
method _parse_doctype_quoted () {
    if ( _stream.eof() ) {
        return null;
    }
    let delimiter := _stream.peek();
    if ( not ( delimiter eq "\"" or delimiter eq "'" ) ) {
        return null;
    }
    _stream.consume();
    let identifier := "";
    while ( not _stream.eof() ) {
        let ch := _stream.consume();
        if ( ch eq delimiter ) {
            return identifier;
        }
        identifier _= ch;
    }
    self._parse_error(
        "eof-in-doctype",
        "EOF in doctype identifier",
    );
    return identifier;
}
// Decode the character reference that follows an already consumed "&".
//   in_attribute - true inside an attribute value, where legacy
//                  references without a semicolon are treated more
//                  conservatively
// Returns the replacement text, or a literal "&" when no reference
// follows (including at end of input).
method _consume_character_reference ( Boolean in_attribute ) {
    return "&" if _stream.eof();
    if ( _stream.peek() ne "#" ) {
        return self._consume_named_character_reference(in_attribute);
    }
    _stream.consume(); // #
    return self._consume_numeric_character_reference();
}
// Decode a named character reference; the "&" has been consumed.
// Tries each known key in priority order (semicolon forms first, longer
// names first) and consumes the first one that matches the input
// case-sensitively. Returns the replacement text, or a literal "&"
// (consuming nothing) when no key applies.
method _consume_named_character_reference ( Boolean in_attribute ) {
let table := _html_tok_named_references();
for ( let key in _html_tok_named_reference_keys() ) {
if ( _stream.match( key, false ) ) {
let has_semicolon := substr( key, length key - 1, 1 ) eq ";";
// Character immediately after the candidate key ("" at end of input).
let after := _stream.peek(length key + 1);
after := substr( after, length key, 1 );
// Inside an attribute value, a semicolon-less reference followed by
// "=" or an alphanumeric is left alone (so "?a=1&copy=2" keeps its
// "&copy"); move on to the remaining keys.
if (
in_attribute and
not has_semicolon and
(_html_tok_is_alnum(after) or after eq "=")
) {
next;
}
self._consume_string(key);
if ( not has_semicolon ) {
self._parse_error(
"missing-semicolon-after-character-reference",
"Missing semicolon after named character reference",
);
}
return table.get(key);
}
}
// Nothing matched: report only when the "&" is followed by something
// that looks like the start of a name.
if ( _html_tok_is_alnum(_stream.peek()) ) {
self._parse_error(
"unknown-named-character-reference",
"Unknown named character reference",
);
}
return "&";
}
// Decode a numeric character reference; "&#" has been consumed.
// Accepts decimal digits, or hexadecimal digits after "x"/"X", followed
// by an optional ";" (its absence is a parse error). Returns the decoded
// character, with these replacements, each reported as a parse error:
//   - 0, values above 0x10FFFF and surrogates (0xD800-0xDFFF) -> U+FFFD
//   - C1 controls with a Windows-1252 meaning -> that character
// When no digits follow, records an error and returns the consumed
// prefix exactly as it appeared in the source ("&#", "&#x" or "&#X"), so
// the original text is preserved.
method _consume_numeric_character_reference () {
    let base := 10;
    let prefix := "&#";
    let marker := _stream.peek();
    if ( marker eq "x" or marker eq "X" ) {
        base := 16;
        // Keep the marker's own case for the no-digits fallback.
        prefix _= _stream.consume();
    }
    let digits := "";
    while ( not _stream.eof() ) {
        let ch := _stream.peek();
        last unless base == 16
            ? _html_tok_is_hex_digit(ch)
            : _html_tok_is_digit(ch);
        digits _= _stream.consume();
    }
    if ( digits eq "" ) {
        self._parse_error(
            "absence-of-digits-in-numeric-character-reference",
            "No digits in numeric character reference",
        );
        return prefix;
    }
    if ( _stream.peek() eq ";" ) {
        _stream.consume();
    }
    else {
        self._parse_error(
            "missing-semicolon-after-character-reference",
            "Missing semicolon after numeric character reference",
        );
    }
    let code := _html_tok_parse_int( digits, base );
    if ( code == 0 ) {
        self._parse_error(
            "null-character-reference",
            "Null character reference",
        );
        return chr(65533);
    }
    if ( code > 1114111 ) { // above U+10FFFF
        self._parse_error(
            "character-reference-outside-unicode-range",
            "Character reference outside Unicode range",
        );
        return chr(65533);
    }
    if ( code >= 55296 and code <= 57343 ) { // U+D800..U+DFFF
        self._parse_error(
            "surrogate-character-reference",
            "Surrogate character reference",
        );
        return chr(65533);
    }
    let replacement := _html_tok_control_replacement(code);
    if ( replacement ≢ null ) {
        self._parse_error(
            "control-character-reference",
            "Control character reference",
        );
        return chr(replacement);
    }
    return chr(code);
}
}
modules/html/tokenizer.zzm
html-0.0.1 source code
Package
- Name
- html
- Version
- 0.0.1
- Uploaded
- 2026-06-10 01:22:42
- Repository
- https://github.com/tobyink/zuzu-html
- Dependencies
-
-
std/io>= 0 -
std/string>= 0
-
- Metadata
- zuzu-distribution.json
- Archive
- Download .tar.gz