texlive[70106] branches/branch2023.final/Master/texmf-dist: luaxml

commits+karl at tug.org commits+karl at tug.org
Fri Feb 23 23:09:19 CET 2024


Revision: 70106
          https://tug.org/svn/texlive?view=revision&revision=70106
Author:   karl
Date:     2024-02-23 23:09:19 +0100 (Fri, 23 Feb 2024)
Log Message:
-----------
luaxml (branch) (23feb24)

Modified Paths:
--------------
    branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/README
    branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.pdf
    branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.tex
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-cssquery.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-domobject.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-entities.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-handler.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-namedentities.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-testxml.lua
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-transform.lua

Added Paths:
-----------
    branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-html.lua

Modified: branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/README
===================================================================
--- branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/README	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/README	2024-02-23 22:09:19 UTC (rev 70106)
@@ -15,8 +15,12 @@
     make install
 
 Please note that you will need [LDoc](http://stevedonovan.github.io/ldoc/manual/doc.md.html#Processing_Single_Modules) and 
-[dkjson](http://dkolf.de/src/dkjson-lua.fsl/home) Lua modules installed on your system.
+[dkjson](http://dkolf.de/src/dkjson-lua.fsl/home) Lua modules installed on your system. You can install them using:
 
+    $ luarocks install --local ldoc
+    $ luarocks install --local dkjson
+
+
 License:
 ========
 
@@ -28,7 +32,7 @@
 ------
 Michal Hoftich
 Email: michal.h21 at gmail.com
-Version: v0.1q, 2021-10-06
+Version: v0.1r, 2024-02-23
 
 Original authors: Paul Chakravarti and Manoel Campos (http://manoelcampos.com)
 
@@ -35,3 +39,4 @@
 If you are interested in the process of development you may observe
 
     https://github.com/michal-h21/LuaXML 
+

Modified: branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.pdf
===================================================================
(Binary files differ)

Modified: branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.tex
===================================================================
--- branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.tex	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/doc/luatex/luaxml/luaxml.tex	2024-02-23 22:09:19 UTC (rev 70106)
@@ -7,7 +7,7 @@
 \usepackage{framed}
 % Version is defined in the makefile, use default values when compiled directly
 \ifdefined\version\else
-\def\version{v0.1q}
+\def\version{v0.1r}
 \let\gitdate\date
 \fi
 \newcommand\modulename[1]{\subsection{#1}\label{sec:#1}}
@@ -466,8 +466,8 @@
 The parameters table can hold following values:
 
 \begin{description}
-  \item[verbatim] -- by default, spaces are collapsed. This is useful in general, but you may want to 
-    keep spaces, for example in program listings. Set \texttt{verbatim=true} in this case.
+  \item[verbatim] -- used for source code listings and similar texts that should keep their original formatting. 
+    Special characters are not escaped, so you will want to transform the elements into a verbatim or listings environment.
   \item[separator] -- when you select element by names (\verb|@<element name>|), you can use this parameter
     set the separator between possible multiple instances of the child element.
 \end{description}

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-cssquery.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-cssquery.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-cssquery.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -1,7 +1,12 @@
 --- CSS query module for LuaXML
 -- @module luaxml-cssquery
 -- @author Michal Hoftich <michal.h21 at gmail.com
-local parse_query = require("luaxml-parse-query")
+local parse_query
+if kpse then
+  parse_query = require("luaxml-parse-query")
+else
+  parse_query = require("luaxml.parse-query")
+end
 
 -- the string.explode function is provided by LuaTeX
 -- this is alternative for stock Lua

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-domobject.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-domobject.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-domobject.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -2,13 +2,26 @@
 -- @module luaxml-domobject
 -- @author Michal Hoftich <michal.h21 at gmail.com
 local dom = {}
-local xml = require("luaxml-mod-xml")
-local handler = require("luaxml-mod-handler")
-local css_query = require("luaxml-cssquery")
 
+local xml
+local handler
+local css_query
+if kpse then
+  xml = require("luaxml-mod-xml")
+  handler = require("luaxml-mod-handler")
+  css_query = require("luaxml-cssquery")
+else
+  xml = require("luaxml.mod-xml")
+  handler = require("luaxml.mod-handler")
+  css_query = require("luaxml.cssquery")
+end
 
+
 local void = {area = true, base = true, br = true, col = true, hr = true, img = true, input = true, link = true, meta = true, param = true}
 
+-- support also upper case names (keys are copied first: adding keys while pairs() runs is undefined)
+local void_names = {}; for name in pairs(void) do void_names[#void_names + 1] = name end
+for _, name in ipairs(void_names) do void[string.upper(name)] = true end
 local escapes = {
   [">"] = "&gt;",
   ["<"] = "&lt;",
@@ -266,6 +279,8 @@
     for _, el in ipairs(current:get_children()) do
       if el:is_text() then
         text[#text+1] = el._text or ""
+      elseif  el._type == "CDATA" then
+        text[#text+1] = el._text or ""
       elseif el:is_element() then
         text[#text+1] = el:get_text()
       end
@@ -339,11 +354,12 @@
     return el._parent
   end
 
-  --- Execute function on the current element and all it's children elements.
+  --- Execute function on the current element and all its child nodes.
+  -- The difference from DOM_Object:traverse_elements() is that it executes the function 
+  -- also on text nodes and all other kinds of XML nodes.
   -- The traversing of child elements of a given node can be disabled when the executed
   -- function returns false.
-  -- @return nothing
-  function DOM_Object:traverse_elements(
+  function DOM_Object:traverse(
     fn, --- function which will be executed on the current element and all it's children
     current --- [optional] element to be selected
     )
@@ -353,17 +369,68 @@
       current = self:root_node() 
     end
     local status = true
-    if self:is_element(current) or self:get_node_type(current) == "ROOT"then
-      local status = fn(current)
+    local status = fn(current)
+    if current:is_element() or current:get_node_type() == "ROOT" then
       -- don't traverse child nodes when the user function return false
       if status ~= false then
-        for _, child in ipairs(self:get_children(current)) do
-          self:traverse_elements(fn, child)
+        for _, child in ipairs(current:get_children()) do
+          self:traverse(fn, child)
         end
       end
     end
   end
 
+  --- Execute function on the current element and all its child elements.
+  -- Nodes that are neither elements nor the ROOT node are skipped. The traversing of child
+  -- elements of a given node can be disabled when the executed function returns false.
+  -- @return nothing
+  function DOM_Object:traverse_elements(
+    fn, --- function which will be executed on the current element and all its child elements
+    current --- [optional] element to be selected, defaults to the object itself
+    )
+    local current = current or self
+    current:traverse(function(node)
+      if node:is_element() or node:get_node_type() == "ROOT" then
+        return fn(node) -- forward the result, so that false stops descending into this node
+      end
+    end)
+  end
+
+  --- Get table with the inner text of an element, every text node is a separate table item. 
+  --- @return table
+  function DOM_Object:strings(
+    current --- [optional] element to be selected
+    )
+    local strings = {}
+    local current = current or self
+    current:traverse(function(node)
+      if node:get_node_type() == "TEXT" then
+        table.insert(strings, node._text or "")
+      end
+    end)
+    return strings
+  end
+
+  --- Get table with the inner text of an element -- leading and trailing spaces are removed and elements that contain only white space are ignored.
+  --- @return table 
+  function DOM_Object:stripped_strings(
+    current --- [optional] element to be selected
+    )
+    local current = current or self
+    local strings = current:strings()
+    local cleaned = {}
+    for k,v in ipairs(strings) do
+      v = v:gsub("^%s*", ""):gsub("%s*$", "")
+      if v ~= "" then
+        table.insert(cleaned, v)
+      end
+    end
+    return cleaned
+  end
+
+
+
+
   --- Execute function on list of elements returned by DOM_Object:get_path()
   function DOM_Object:traverse_node_list( 
     nodelist --- table with nodes selected by DOM_Object:get_path()

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-entities.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-entities.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-entities.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -1,6 +1,11 @@
 local M = {}
 local char = unicode and unicode.utf8.char or utf8.char
-local named_entities = require "luaxml-namedentities"
+local named_entities
+if kpse then
+  named_entities = require "luaxml-namedentities"
+else
+  named_entities = require "luaxml.namedentities"
+end
 local hexchartable = {}
 local decchartable = {}
 

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-handler.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-handler.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-handler.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -112,9 +112,17 @@
 --@returns Returns a string representation of table
 
 local M = {}
-local stack = require("luaxml-stack")
-local entities = require("luaxml-entities")
 
+local stack
+local entities
+if kpse then
+    stack = require("luaxml-stack")
+    entities = require("luaxml-entities")
+else
+    stack = require("luaxml.stack")
+    entities = require("luaxml.entities")
+end
+
 local function showTable(t)
     local sep = ''
     local res = ''

Added: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-html.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-html.lua	                        (rev 0)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-html.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -0,0 +1,2047 @@
+-- Copyright Michal Hoftich, 2022
+-- HTML parser inspired by https://browser.engineering/html.html
+-- but then redone using https://html.spec.whatwg.org/multipage/parsing.html
+--
+-- The main purpose of this module is to create a useful DOM for later processing
+-- using LuaXML functions. Either for cleanup, or for translation to output formats, 
+-- for example LaTeX. 
+--
+-- It should be possible to serialize DOM back to the original HTML code. 
+--
+-- We attempt to do some basic fixes, like to close paragraphs or list items that 
+-- aren't closed correctly in the original code. We don't fix tables or 
+-- formatting elements (see https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements)
+-- as these features don't seem necessary for the purpose of this module. We may change
+-- this policy in the future, if it turns out that they are necessary. 
+--
+--
+local M = {}
+
+-- use local copies of utf8 functions
+local ucodepoint = utf8.codepoint
+local utfchar      = utf8.char
+local function uchar(codepoint)
+  if codepoint and codepoint > -1 then
+    return utfchar(codepoint)
+  end
+  return ""
+end
+
+-- declare namespaces
+local xmlns = {
+  HTML = "http://www.w3.org/1999/xhtml",
+  MathML = "http://www.w3.org/1998/Math/MathML",
+  SVG = "http://www.w3.org/2000/svg",
+  XLink = "http://www.w3.org/1999/xlink",
+  XML = "http://www.w3.org/XML/1998/namespace",
+  XMLNS = "http://www.w3.org/2000/xmlns/", 
+}
+
+-- we must make search tree for named entities, as their support 
+-- is quite messy
+local named_entities
+if kpse then
+  named_entities = require "luaxml-namedentities"
+else
+  named_entities = require "luaxml.namedentities"
+end
+
+local entity_tree = {children = {}}
+
+local function update_tree(tree, char)
+  local children = tree.children or {}
+  local current = children[char] or {}
+  children[char] = current
+  tree.children = children
+  return current
+end
+
+-- loop over named entities and update tree
+for entity, char in pairs(named_entities) do
+  local tree = entity_tree
+  for char in entity:gmatch(".") do
+    tree = update_tree(tree,char)
+  end
+  tree.entity = entity
+  tree.char   = char
+end
+
+local function search_entity_tree(tbl) 
+  -- get named entity for the list of characters
+  local tree = entity_tree
+  for _,char in ipairs(tbl) do 
+    if tree.children then
+      tree = tree.children[char]
+      if not tree then return nil end
+    else
+      return nil
+    end
+  end
+  -- print("tree", tree.char)
+  return tree
+end
+
+
+-- declare  basic node types
+
+local Root = {
+  _type = "root",
+  xmlns = xmlns.HTML
+}
+
+function Root:init()
+  local o = {}
+  setmetatable(o, self)
+  self.__index = self
+  self.__tostring = function (x) return "_ROOT" end
+  o.children = {}
+  return o
+end
+
+function Root:add_child(node)
+  table.insert(self.children, node)
+end
+
+local Doctype = {
+  _type = "doctype"
+}
+function Doctype:init(name, parent)
+  local o = {}
+  setmetatable(o, self)
+  self.__index = self
+  self.__tostring = function (x) 
+    if x.data then
+      return "<!DOCTYPE " .. x.name .. " " .. x.data ..  ">" 
+    else
+      return "<!DOCTYPE " .. x.name .. ">" 
+    end
+  end
+  self.add_child = Root.add_child
+  o.parent = parent
+  o.name = name
+  o.children = {}
+  return o
+end
+
+function Doctype:add_data(data)
+  self.data = data
+end
+
+
+local Text = {
+  _type = "text"
+}
+
+function Text:init(text, parent)
+  local o = {}
+  setmetatable(o, self)
+  self.__index = self
+  o.text = text
+  self.__tostring = function (x) return "'" ..  x.text .. "'" end
+  self.add_child = Root.add_child
+  o.parent = parent
+  o.children = {}
+  return o
+end
+
+local Comment = {
+  _type = "comment"
+}
+
+function Comment:init(text, parent)
+  local o = {}
+  setmetatable(o, self)
+  self.__index = self
+  o.text = text
+  self.__tostring = function (x) return "<!--" ..  x.text .. "-->" end
+  self.add_child = Root.add_child
+  o.parent = parent
+  o.children = {}
+  return o
+end
+
+
+
+local Element = {
+  _type = "element"
+}
+
+function Element:init(tag, parent)
+  local o = {}
+  setmetatable(o, self)
+  self.__index = self
+  -- tag can be table with unicode characters
+  if type(tag) == "table" then
+    o.tag = table.concat(tag)
+  else
+    o.tag = tag
+  end
+  self.__tostring = function(x) 
+    local attr = {}
+    for _, el in ipairs(x.attr) do 
+      -- handle attributes
+      local value
+      if el.value:match('"') then
+        value = "'" .. el.value .. "'"
+      else
+        value = '"' .. el.value .. '"'
+      end
+      attr[#attr+1] =  el.name .. "=" .. value
+    end
+    local closing = ">"
+    if x.self_closing then
+      closing = " />"
+    end
+    if #attr > 0 then
+      return "<" .. x.tag .. " " .. table.concat(attr, " ") .. closing 
+    else
+      return "<" .. x.tag .. closing
+    end
+  end
+  self.add_child = Root.add_child
+  o.children = {}
+  o.attr     = {}
+  o.parent = parent
+  -- default xmlns
+  o.xmlns  = xmlns.HTML
+  return o
+end
+
+-- state machine functions
+
+-- each function takes HtmlParser as an argument
+local HtmlStates = {}
+
+-- declare codepoints for more efficient processing
+local less_than      = ucodepoint("<")
+local greater_than   = ucodepoint(">")
+local amperesand     = ucodepoint("&")
+local exclam         = ucodepoint("!")
+local question       = ucodepoint("?")
+local solidus        = ucodepoint("/")
+local equals         = ucodepoint("=")
+local quoting        = ucodepoint('"')
+local apostrophe     = ucodepoint("'")
+local semicolon      = ucodepoint(";")
+local hyphen         = ucodepoint("-")
+local dash           = ucodepoint("-")
+local numbersign     = ucodepoint("#")
+local smallx         = ucodepoint("x")
+local bigx           = ucodepoint("X")
+local right_square   = ucodepoint("]")
+local EOF            = -1 -- special character, meaning end of stream
+local null           = 0
+
+local function is_upper_alpha(codepoint)
+  if (64 < codepoint and codepoint < 91) then
+    return true
+  end
+end
+local function is_lower_alpha(codepoint)
+  if (96 < codepoint and codepoint < 123) then 
+    return true
+  end
+end
+
+local function is_alpha(codepoint)
+  -- detect if codepoint is an ASCII letter (A-Z or a-z); digits are not accepted
+  if is_upper_alpha(codepoint) or
+     is_lower_alpha(codepoint) then
+       return true
+  end
+  return false
+end
+
+
+local function is_numeric(codepoint)
+  if 47 < codepoint and codepoint < 58 then
+    return true
+  end
+end
+
+local function is_upper_hex(codepoint)
+  if 64 < codepoint and codepoint < 71 then
+    return true
+  end
+end
+
+local function is_lower_hex(codepoint)
+  if 96 < codepoint and codepoint < 103 then
+    return true
+  end
+end
+
+local function is_hexadecimal(codepoint) 
+  if is_numeric(codepoint) or
+     is_lower_hex(codepoint) or
+     is_upper_hex(codepoint)
+  then 
+    return true
+  end
+end
+
+
+local function is_alphanumeric(codepoint)
+  return is_alpha(codepoint) or is_numeric(codepoint)
+end
+
+local function is_space(codepoint) 
+  -- detect space characters
+  if codepoint==0x0009 or codepoint==0x000A or codepoint==0x000C or codepoint==0x0020 then
+    return true
+  end
+  return false
+end
+
+local function is_surrogate(codepoint)
+  return  0xD800 <= codepoint and codepoint <= 0xDFFF
+end
+
+
+local character_entity_replace_table = { -- numeric reference code -> replacement codepoint; local, so no global leaks
+[0x80] =  0x20AC,  
+[0x82] =  0x201A,  
+[0x83] =  0x0192,  
+[0x84] =  0x201E,  
+[0x85] =  0x2026,  
+[0x86] =  0x2020,  
+[0x87] =  0x2021,  
+[0x88] =  0x02C6,  
+[0x89] =  0x2030,  
+[0x8A] =  0x0160,  
+[0x8B] =  0x2039,  
+[0x8C] =  0x0152,  
+[0x8E] =  0x017D,  
+[0x91] =  0x2018,  
+[0x92] =  0x2019,  
+[0x93] =  0x201C,  
+[0x94] =  0x201D,  
+[0x95] =  0x2022,  
+[0x96] =  0x2013,  
+[0x97] =  0x2014,  
+[0x98] =  0x02DC,  
+[0x99] =  0x2122,  
+[0x9A] =  0x0161,  
+[0x9B] =  0x203A,  
+[0x9C] =  0x0153,  
+[0x9E] =  0x017E,  
+[0x9F] =  0x0178  
+}
+
+local function fix_null(codepoint)
+  if codepoint == null then
+    return 0xFFFD
+  else
+    return codepoint
+  end
+end
+
+HtmlStates.data = function(parser) 
+  -- this is the default state
+  local codepoint = parser.codepoint
+  -- print("codepoint", parser.codepoint)
+  if codepoint == less_than then
+    -- start of tag
+    return "tag_open"
+  elseif codepoint  == amperesand then
+    -- we must save the current state 
+    -- what we will return to after entity
+    parser.return_state = "data"
+    return "character_reference" 
+  elseif codepoint == EOF then
+    parser:emit_eof()
+  else
+    parser:emit_character(uchar(codepoint))
+  end
+  return "data"
+end
+
+HtmlStates.tag_open = function(parser)
+  -- tag open state: entered from the data state after "<", decides what kind of markup follows
+  local codepoint = parser.codepoint
+  if codepoint == exclam then
+    return "markup_declaration_open"
+  elseif codepoint == solidus then
+    return "end_tag_open"
+  elseif codepoint == question then
+    parser:start_token("comment",{data={}})
+    return "bogus_comment"
+  elseif is_alpha(codepoint) then
+    local data = {
+      name = {},
+      attr = {},
+      current_attr_name = {},
+      current_attr_value = {},
+      self_closing = false
+    }
+    parser:start_token("start_tag", data)
+    return parser:tokenize("tag_name")
+  elseif codepoint == EOF then
+    parser:emit_character("<") -- input ended right after "<": the pending "<" is plain text
+    parser:emit_eof()
+  else
+    -- invalid tag
+    -- emit "<" and reconsume current character as data
+    parser:emit_character("<")
+    return parser:tokenize("data")
+  end
+end
+
+HtmlStates.character_reference = function(parser)
+  -- parse HTML entities
+  -- initialize temp buffer
+  parser.temp_buffer = {"&"}
+  local codepoint = parser.codepoint
+  if is_alphanumeric(codepoint) then
+    return parser:tokenize("named_character_reference")
+  elseif codepoint == numbersign then
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "numeric_character_reference"
+  else
+    parser:flush_temp_buffer()
+    return parser:tokenize(parser.return_state)
+  end
+
+end
+
+HtmlStates.named_character_reference = function(parser)
+  -- named entity parsing is pretty complicated 
+  -- https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+  local codepoint = parser.codepoint
+  -- test if the current entity name is included in the named entity list
+  local search_table = {}
+  -- first char in temp buffer is &, which we don't want to lookup in the search tree
+  for i=2, #parser.temp_buffer do search_table[#search_table+1] = parser.temp_buffer[i] end
+  if codepoint == semicolon then
+    -- close named entity
+    local entity = search_entity_tree(search_table) 
+    if entity and entity.char then
+      parser:add_entity(entity.char)
+    else
+      -- if the current name doesn't correspond to any named entity, flush everything into text
+      parser:flush_temp_buffer()
+      return parser:tokenize(parser.return_state)
+    end
+    return parser.return_state
+  else
+    local char = uchar(codepoint)
+    -- try if the current entity name is in the named entity search tree
+    table.insert(search_table, char)
+    local entity = search_entity_tree(search_table)
+    if entity then
+      -- keep parsing name entity while we match a name
+      table.insert(parser.temp_buffer, char)
+      return "named_character_reference"
+    else
+      -- here this will be more complicated
+      if #search_table > 1 then
+        local token = parser.current_token
+        if token.type == "start_tag" and (codepoint == equals or is_alphanumeric(codepoint)) then
+          -- in attribute value, flush characters and retokenize  
+          parser:flush_temp_buffer()
+          return parser:tokenize(parser.return_state)
+        else
+          -- try to get entity for characters preceding the current character
+          table.remove(search_table)
+          local newentity = search_entity_tree(search_table)
+          if newentity and newentity.char then
+            parser:add_entity(newentity.char)
+          else
+            -- we need to find if parts of the current substring match a named entity
+            -- for example &notit; -> ¬it; but &notin; -> ∉
+            local rest = {}
+            -- loop over the table with characters, and try to find if it matches entity
+            for i = #search_table, 1,-1 do
+              local removed_char = table.remove(search_table)
+              -- 
+              table.insert(rest, 1, removed_char)
+              newentity = search_entity_tree(search_table)
+              if newentity and newentity.char then
+                parser:add_entity(newentity.char)
+                parser.temp_buffer = rest
+                break
+              end
+            end
+            -- replace temporary buffer with characters that followed the matched entity
+            parser:flush_temp_buffer()
+          end
+          return parser:tokenize(parser.return_state)
+        end
+      else
+        -- search table contains only the current character
+        parser:flush_temp_buffer()
+        return parser:tokenize(parser.return_state)
+      end
+    end
+  end
+
+end
+
+HtmlStates.numeric_character_reference = function(parser)
+  -- this variable will hold the number
+  local codepoint = parser.codepoint
+  parser.character_reference_code = 0
+  if codepoint == smallx or codepoint == bigx then
+    -- hexadecimal entity
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "hexadecimal_character_reference_start"
+  else
+    -- try decimal entity
+    return parser:tokenize("decimal_character_reference_start")
+  end
+
+end
+
+HtmlStates.hexadecimal_character_reference_start = function(parser)
+  local codepoint = parser.codepoint
+  if is_hexadecimal(codepoint) then
+    return parser:tokenize("hexadecimal_character_reference")
+  else
+    parser:flush_temp_buffer()
+    return parser:tokenize(parser.return_state)
+  end
+end
+
+HtmlStates.decimal_character_reference_start = function(parser)
+  local codepoint = parser.codepoint
+  if is_numeric(codepoint) then
+    return parser:tokenize("decimal_character_reference")
+  else
+    parser:flush_temp_buffer()
+    return parser:tokenize(parser.return_state)
+  end
+end
+
+
+HtmlStates.decimal_character_reference = function(parser)
+  local codepoint = parser.codepoint
+  -- helpers updating character_reference_code; add() clamps it just above 0x10FFFF, so overlong references cannot overflow
+  local function multiply(number)
+    parser.character_reference_code = parser.character_reference_code * number
+  end
+  local function add(number)
+    parser.character_reference_code = math.min(parser.character_reference_code + number, 0x110000)
+  end
+  if is_numeric(codepoint) then
+    multiply(10)
+    add(codepoint - 0x30)
+  elseif codepoint == semicolon then
+    return "numeric_reference_end_state"
+  else
+    -- this adds current entity
+    parser:tokenize("numeric_reference_end_state")
+    -- now tokenize the current character
+    return parser:tokenize(parser.return_state)
+  end
+  return "decimal_character_reference"
+end
+
+HtmlStates.hexadecimal_character_reference = function(parser)
+  local codepoint = parser.codepoint
+  -- helpers updating character_reference_code; add() clamps it just above 0x10FFFF, so overlong references cannot overflow
+  local function multiply(number)
+    parser.character_reference_code = parser.character_reference_code * number
+  end
+  local function add(number)
+    parser.character_reference_code = math.min(parser.character_reference_code + number, 0x110000)
+  end
+  if is_numeric(codepoint) then
+    multiply(16)
+    add(codepoint - 0x30)
+  elseif is_upper_hex(codepoint) then
+    multiply(16)
+    add(codepoint - 0x37)
+  elseif is_lower_hex(codepoint) then
+    multiply(16)
+    add(codepoint - 0x57)
+  elseif codepoint == semicolon then
+    return "numeric_reference_end_state"
+  else
+    -- this adds current entity
+    parser:tokenize("numeric_reference_end_state")
+    -- now tokenize the current character
+    return parser:tokenize(parser.return_state)
+  end
+  return "hexadecimal_character_reference"
+end
+
+HtmlStates.numeric_reference_end_state = function(parser)
+  -- this state does not read the current codepoint, it only validates the collected code and adds the character
+  local character = parser.character_reference_code
+  -- we need to clean invalid character codes
+  if character == 0x00 or 
+     character >  0x10FFFF or
+     is_surrogate(character) 
+  then
+    character = 0xFFFD
+  -- should we add special support for "noncharacter"? I think we can pass them to the output anyway
+  elseif character_entity_replace_table[character] then 
+    character = character_entity_replace_table[character]
+  end
+  parser:add_entity(uchar(character))
+  return parser.return_state
+end
+
+
+HtmlStates.markup_declaration_open = function(parser)
+  -- started by <!
+  -- we now need to find the following text, to find if we started comment, doctype, or cdata
+  local comment_pattern = "^%-%-"
+  local doctype_pattern = "^[Dd][Oo][Cc][Tt][Yy][Pp][Ee]"
+  local cdata_pattern   = "^%[CDATA%["
+  local start_pos = parser.position
+  local text = parser.body
+  if text:match(comment_pattern, start_pos) then
+    -- local _, newpos = text:find(comment_pattern, start_pos)
+    -- we need to ignore next few characters
+    parser.ignored_pos = start_pos + 1
+    parser:start_token("comment", {data = {}})
+    return "comment_start"
+  elseif text:match(doctype_pattern, start_pos) then
+    parser.ignored_pos = start_pos + 6
+    parser:start_token("doctype", {name = {}, data = {}, force_quirks = false})
+    return "doctype"
+  elseif text:match(cdata_pattern, start_pos) then
+    parser.ignored_pos = start_pos + 6
+    local current_element = parser:current_node()
+    if current_element.xmlns == xmlns.HTML or not current_element.xmlns then
+      -- we change CDATA simply to comments
+      parser:start_token("comment", {data = {"[CDATA["}})
+      return "bogus_comment"
+    else
+      -- we are in XML mode, this happens for included SVG or MathML
+      return "cdata_section"
+    end
+  else
+    parser:start_token("comment", {data = {}})
+    return "bogus_comment"
+  end
+  -- local start, stop = string.find(parser.body, comment_pattern, parser.position)
+end
+
+
+HtmlStates.cdata_section = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == right_square then
+    return "cdata_section_bracket"
+  elseif codepoint == EOF then
+    parser:emit_eof()
+  else
+    parser:emit_character(uchar(codepoint))
+    return "cdata_section"
+  end
+end
+
+HtmlStates.cdata_section_bracket = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == right_square then
+    return "cdata_section_end"
+  else
+    parser:emit_character("]")
+    return parser:tokenize("cdata_section")
+  end
+end
+
+HtmlStates.cdata_section_end = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == right_square then
+    parser:emit_character("]")
+    return "cdata_section_end"
+  elseif codepoint == greater_than then
+    return "data"
+  else
+    parser:emit_character("]")
+    return parser:tokenize("cdata_section")
+  end
+end
+
+
+HtmlStates.comment_start = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == hyphen then
+    return "comment_start_dash"
+  elseif codepoint == greater_than then
+    parser:emit()
+    return "data"
+  else
+    return parser:tokenize("comment")
+  end
+end
+
+HtmlStates.comment_start_dash = function(parser)
+  local codepoint = parser.codepoint -- one "-" was already seen at the start of the comment
+  if codepoint == hyphen then
+    return "comment_end"
+  elseif codepoint == greater_than then
+    parser:emit()
+    return "data" -- state name string; the bare identifier `data` is an undefined variable (nil)
+  elseif codepoint == EOF then
+    parser:emit()
+    parser:emit_eof()
+  else
+    parser:append_token_data("data", "-")
+    return parser:tokenize("comment")
+  end
+end
+
+HtmlStates.comment = function(parser)
+  local codepoint = parser.codepoint
+  codepoint = fix_null(codepoint)
+  if codepoint == less_than then
+    parser:append_token_data("data", uchar(codepoint))
+    return "comment_less_than"
+  elseif codepoint == hyphen then
+    return "comment_end_dash"
+  elseif codepoint == EOF then
+    parser:emit()
+    parser:emit_eof()
+  else
+    parser:append_token_data("data", uchar(codepoint))
+  end
+  return "comment"
+end
+
+-- Comment less-than sign state: a "<" was just stored in the comment data.
+HtmlStates.comment_less_than = function(parser)
+  local cp = parser.codepoint
+  local is_bang = cp == exclam
+  if is_bang or cp == less_than then
+    -- both characters are kept as comment data
+    parser:append_token_data("data", uchar(cp))
+    if is_bang then
+      return "comment_less_than_bang"
+    end
+    return "comment_less_than"
+  end
+  return parser:tokenize("comment")
+end
+
+-- Comment less-than sign bang state: "<!" was seen inside a comment.
+HtmlStates.comment_less_than_bang = function(parser)
+  if parser.codepoint == hyphen then
+    return "comment_less_than_bang_dash"
+  end
+  -- not a nested comment opener; reprocess as regular comment text
+  return parser:tokenize("comment")
+end
+
+-- Comment less-than sign bang dash state: "<!-" was seen inside a comment.
+HtmlStates.comment_less_than_bang_dash = function(parser)
+  if parser.codepoint == hyphen then
+    return "comment_less_than_bang_dash_dash"
+  end
+  -- reprocess the current character in the comment_end_dash state
+  return parser:tokenize("comment_end_dash")
+end
+
+-- Comment less-than sign bang dash dash state: "<!--" was seen inside a
+-- comment. Every input is reprocessed in the comment_end state, so no
+-- distinction between ">", EOF and other characters is made here.
+HtmlStates.comment_less_than_bang_dash_dash = function(parser)
+  -- these comment states start to be ridiculous
+  return parser:tokenize("comment_end")
+end
+
+-- Comment end dash state: a single "-" was seen in the comment body and has
+-- not been stored in the token data yet (the comment state returns here
+-- without appending it).
+HtmlStates.comment_end_dash = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == hyphen then
+    return "comment_end"
+  elseif codepoint == EOF then
+    parser:emit()
+    parser:emit_eof()
+  else
+    -- the pending dash was plain text. Store the dash itself; the current
+    -- character is appended when it is reprocessed in the comment state,
+    -- so appending it here as well would duplicate it.
+    parser:append_token_data("data", "-")
+    return parser:tokenize("comment")
+  end
+end
+
+-- Comment end state: "--" was seen inside a comment.
+HtmlStates.comment_end = function(parser)
+  local cp = parser.codepoint
+  if cp == greater_than then
+    -- "-->" closes the comment
+    parser:emit()
+    return "data"
+  end
+  if cp == exclam then
+    return "comment_end_bang"
+  end
+  if cp == hyphen then
+    -- one more dash: store one, two remain pending
+    parser:append_token_data("data", "-")
+    return "comment_end"
+  end
+  if cp == EOF then
+    parser:emit()
+    parser:emit_eof()
+    return
+  end
+  -- the two dashes were ordinary comment text
+  parser:append_token_data("data", "--")
+  return parser:tokenize("comment")
+end
+
+-- Comment end bang state: "--!" was seen inside a comment.
+HtmlStates.comment_end_bang = function(parser)
+  local cp = parser.codepoint
+  if cp == hyphen then
+    parser:append_token_data("data", "--!")
+    return "comment_end_dash"
+  end
+  if cp == greater_than then
+    -- "--!>" closes the comment as well
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    parser:emit()
+    parser:emit_eof()
+    return
+  end
+  -- "--!" was ordinary text; store it and reprocess the current character
+  parser:append_token_data("data", "--!")
+  return parser:tokenize("comment")
+end
+
+-- End tag open state: "</" has been consumed.
+-- Returns the name of the next state, or nothing at EOF.
+HtmlStates.end_tag_open = function(parser)
+  local codepoint = parser.codepoint
+  if is_alpha(codepoint) then
+    -- a letter starts the name of an end tag
+    local data = {
+      name = {}
+    }
+    parser:start_token("end_tag", data)
+    return parser:tokenize("tag_name")
+  elseif codepoint == greater_than then
+    -- "</>" carries no tag name and is dropped
+    return "data"
+  elseif codepoint == EOF then
+    parser:discard_token()
+    parser:emit_character("</")
+    parser:emit_eof()
+  else
+    -- anything else is collected as a bogus comment.
+    -- `local` keeps this table from leaking into the global environment.
+    local data = {
+      data = {}
+    }
+    parser:start_token("comment", data)
+    return parser:tokenize("bogus_comment")
+  end
+end
+
+-- Bogus comment state: collects everything up to the next ">" (or EOF) as
+-- data of the current comment token.
+HtmlStates.bogus_comment = function(parser)
+  -- started by <?
+  local cp = fix_null(parser.codepoint)
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    parser:emit()
+    parser:emit_eof()
+    return
+  end
+  parser:append_token_data("data", uchar(cp))
+  return "bogus_comment"
+end
+
+-- Shared EOF handling of the doctype states: mark the current token with
+-- force_quirks, emit it and then emit EOF. Returns nothing.
+local function doctype_eof(parser)
+  parser:set_token_data("force_quirks", true)
+  parser:emit()
+  parser:emit_eof()
+end
+
+-- Doctype state: first character after the doctype keyword.
+HtmlStates.doctype = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    return "before_doctype_name"
+  end
+  if cp == EOF then
+    doctype_eof(parser)
+    return
+  end
+  -- ">" and any other character are reprocessed in before_doctype_name
+  return parser:tokenize("before_doctype_name")
+end
+
+-- Before doctype name state: skips whitespace, then starts the doctype name.
+HtmlStates.before_doctype_name = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if is_space(cp) then
+    return "before_doctype_name"
+  end
+  if cp == greater_than then
+    -- doctype without a name
+    parser:set_token_data("force_quirks", true)
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    doctype_eof(parser)
+    return
+  end
+  if is_upper_alpha(cp) then
+    -- the name is stored in lowercase
+    cp = cp + 0x20
+  end
+  parser:append_token_data("name", uchar(cp))
+  return "doctype_name"
+end
+
+-- Doctype name state: collects the (lowercased) doctype name.
+HtmlStates.doctype_name = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if is_space(cp) then
+    return "after_doctype_name"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    doctype_eof(parser)
+    return
+  end
+  if is_upper_alpha(cp) then
+    -- the name is stored in lowercase
+    cp = cp + 0x20
+  end
+  parser:append_token_data("name", uchar(cp))
+  return "doctype_name"
+end
+
+-- After doctype name state: whitespace after the name is skipped; any other
+-- content switches to the simplified consume_doctype_data state.
+HtmlStates.after_doctype_name = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    return "after_doctype_name"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    doctype_eof(parser)
+    return
+  end
+  -- The specification has many detailed rules for the rest of the doctype,
+  -- but they are not interesting for our purpose: everything up to ">" or
+  -- EOF is simply collected as token.data.
+  parser:append_token_data("data", uchar(cp))
+  return "consume_doctype_data"
+end
+
+-- Reads the rest of a doctype declaration as plain data.
+HtmlStates.consume_doctype_data = function(parser)
+  local cp = parser.codepoint
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  if cp == EOF then
+    doctype_eof(parser)
+    return
+  end
+  parser:append_token_data("data", uchar(cp))
+  return "consume_doctype_data"
+end
+
+-- Tag name state: collects the lowercased name of a start or end tag.
+HtmlStates.tag_name = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if is_space(cp) then
+    return "before_attribute_name"
+  end
+  if cp == solidus then
+    return "self_closing_tag"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  if is_upper_alpha(cp) then
+    -- tag names are stored in lowercase
+    parser:append_token_data("name", string.lower(uchar(cp)))
+  elseif cp == EOF then
+    parser:emit()
+    parser:emit_eof()
+  else
+    parser:append_token_data("name", uchar(cp))
+  end
+  return "tag_name"
+end
+
+-- Self-closing start tag state: "/" was seen inside a tag.
+HtmlStates.self_closing_tag = function(parser)
+  if parser.codepoint ~= greater_than then
+    -- the slash was not followed by ">": go on with attribute parsing
+    return parser:tokenize("before_attribute_name")
+  end
+  parser.current_token.self_closing = true
+  parser:emit()
+  return "data"
+end
+
+
+-- Before attribute name state: decides whether a new attribute starts.
+-- Returns the name of the next state; for "=" nothing is returned, so the
+-- caller keeps the current state and the character is dropped.
+HtmlStates.before_attribute_name = function(parser)
+  local codepoint = parser.codepoint
+  if is_space(codepoint) then
+    -- ignore spacing
+    return "before_attribute_name"
+  elseif codepoint == solidus or codepoint == greater_than then
+    -- reconsume in after_attribute_name
+    return parser:tokenize("after_attribute_name")
+  elseif codepoint == equals then
+    -- ToDo: handle https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name
+    -- NOTE(review): the linked rule seems to expect "=" to become the first
+    -- character of a new attribute name - confirm before implementing.
+  else
+    -- start new attribute
+    parser:start_attribute()
+    return parser:tokenize("attribute_name")
+  end
+end
+
+-- Attribute name state: collects the lowercased attribute name.
+HtmlStates.attribute_name = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) or cp == solidus or cp == greater_than then
+    -- the name is finished; reprocess the terminator
+    return parser:tokenize("after_attribute_name")
+  end
+  if cp == equals then
+    return "before_attribute_value"
+  end
+  local upper = is_upper_alpha(cp)
+  local char = uchar(cp)
+  if upper then
+    -- lowercase attribute names
+    char = string.lower(char)
+  end
+  parser:append_token_data("current_attr_name", char)
+  return "attribute_name"
+end
+
+-- After attribute name state: the attribute name is complete.
+HtmlStates.after_attribute_name = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    return "after_attribute_name"
+  end
+  if cp == equals then
+    return "before_attribute_value"
+  end
+  if cp == solidus then
+    return "self_closing_tag"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  -- the previous attribute had no value; a new attribute begins here
+  parser:start_attribute()
+  return parser:tokenize("attribute_name")
+end
+
+-- Before attribute value state: selects the quoting style of the value.
+HtmlStates.before_attribute_value = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    return "before_attribute_value"
+  end
+  if cp == quoting then
+    return "attribute_value_quoting"
+  end
+  if cp == apostrophe then
+    return "attribute_value_apostrophe"
+  end
+  if cp == greater_than then
+    -- the tag ends before any value was given
+    parser:emit()
+    return "data"
+  end
+  return parser:tokenize("attribute_value_unquoted")
+end
+
+-- Attribute value state for values delimited by the `quoting` character.
+HtmlStates.attribute_value_quoting = function(parser)
+  local cp = parser.codepoint
+  if cp == quoting then
+    return "after_attribute_value_quoting"
+  end
+  if cp == amperesand then
+    -- remember where to continue after the character reference
+    parser.return_state = "attribute_value_quoting"
+    return "character_reference"
+  end
+  parser:append_token_data("current_attr_value", uchar(cp))
+  return "attribute_value_quoting"
+end
+
+-- Attribute value state for values delimited by apostrophes.
+HtmlStates.attribute_value_apostrophe = function(parser)
+  local cp = parser.codepoint
+  if cp == apostrophe then
+    -- both quoting styles share the same "after value" state
+    return "after_attribute_value_quoting"
+  end
+  if cp == amperesand then
+    -- remember where to continue after the character reference
+    parser.return_state = "attribute_value_apostrophe"
+    return "character_reference"
+  end
+  parser:append_token_data("current_attr_value", uchar(cp))
+  return "attribute_value_apostrophe"
+end
+
+-- Attribute value state for values written without quotes.
+HtmlStates.attribute_value_unquoted = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    -- whitespace ends an unquoted value
+    return "before_attribute_name"
+  end
+  if cp == amperesand then
+    -- remember where to continue after the character reference
+    parser.return_state = "attribute_value_unquoted"
+    return "character_reference"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  parser:append_token_data("current_attr_value", uchar(cp))
+  return "attribute_value_unquoted"
+end
+
+-- After attribute value state: a quoted value was just closed.
+HtmlStates.after_attribute_value_quoting = function(parser)
+  local cp = parser.codepoint
+  if is_space(cp) then
+    return "before_attribute_name"
+  end
+  if cp == solidus then
+    return "self_closing_tag"
+  end
+  if cp == greater_than then
+    parser:emit()
+    return "data"
+  end
+  -- no whitespace between attributes: reprocess as start of the next one
+  return parser:tokenize("before_attribute_name")
+end
+
+-- RCDATA state: text content in which character references are handled
+-- and "<" may start an end tag.
+HtmlStates.rcdata = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == less_than then
+    -- possible start of an end tag
+    return "rcdata_less_than"
+  end
+  if cp == amperesand then
+    -- save the state we must return to after the entity
+    parser.return_state = "rcdata"
+    return "character_reference"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+  else
+    parser:emit_character(uchar(cp))
+  end
+  return "rcdata"
+end
+
+-- Drop the token under construction and emit `text` as character data.
+local function discard_rcdata_end_tag(parser, text)
+  parser:discard_token()
+  parser:emit_character(text)
+end
+
+-- RCDATA less-than sign state: "<" was seen in RCDATA content.
+HtmlStates.rcdata_less_than = function(parser)
+  if parser.codepoint == solidus then
+    return "rcdata_end_tag_open"
+  end
+  -- not an end tag: the "<" is plain text
+  discard_rcdata_end_tag(parser, "<")
+  return parser:tokenize("rcdata")
+end
+
+-- RCDATA end tag open state: "</" was seen in RCDATA content.
+HtmlStates.rcdata_end_tag_open = function(parser)
+  if not is_alpha(parser.codepoint) then
+    -- no tag name follows: "</" is plain text
+    discard_rcdata_end_tag(parser, "</")
+    return parser:tokenize("rcdata")
+  end
+  parser:start_token("end_tag", {name={}})
+  parser.temp_buffer = {}
+  return parser:tokenize("rcdata_end_tag_name")
+end
+
+
+
+-- RCDATA end tag name state: collects the name of a possible end tag.
+-- The end tag is accepted only when its name equals the `tag` field of the
+-- node returned by parser:get_parent() and it is followed by whitespace,
+-- "/" or ">". In every other case the consumed "</name" is emitted as text
+-- and the current character is reprocessed in the rcdata state.
+HtmlStates.rcdata_end_tag_name = function(parser)
+  -- we need to find name of the currently opened tag
+  local parent = parser:get_parent() or {}
+  local opened_tag = parent.tag
+  local current_tag = table.concat(parser.current_token.name or {})
+  local codepoint = parser.codepoint
+  if is_upper_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint + 0x20))
+    -- the temp buffer keeps the original spelling for a possible fallback
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "rcdata_end_tag_name"
+  elseif is_lower_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "rcdata_end_tag_name"
+  end
+  if opened_tag == current_tag then
+    if is_space(codepoint) then
+      return "before_attribute_name"
+    elseif codepoint == solidus then
+      return "self_closing_tag"
+    elseif codepoint == greater_than then
+      parser:emit()
+      return "data"
+    end
+    -- a matching name followed by an unexpected character falls through,
+    -- otherwise the character would be swallowed and the state would stick
+  end
+  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
+  parser.temp_buffer = {}
+  return parser:tokenize("rcdata")
+end
+
+-- RAWTEXT state: text content where only "<" gets special handling.
+HtmlStates.rawtext = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == less_than then
+    return "rawtext_less_than"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "rawtext"
+end
+
+-- RAWTEXT less-than sign state: "<" was seen in RAWTEXT content.
+HtmlStates.rawtext_less_than = function(parser)
+  if parser.codepoint == solidus then
+    return "rawtext_end_tag_open"
+  end
+  -- not an end tag: the "<" is plain text
+  parser:emit_character("<")
+  return parser:tokenize("rawtext")
+end
+
+-- RAWTEXT end tag open state: "</" was seen in RAWTEXT content.
+HtmlStates.rawtext_end_tag_open = function(parser)
+  if not is_alpha(parser.codepoint) then
+    -- no tag name follows: "</" is plain text
+    parser:emit_character("</")
+    return parser:tokenize("rawtext")
+  end
+  parser:start_token("end_tag", {name={}})
+  parser.temp_buffer = {}
+  return parser:tokenize("rawtext_end_tag_name")
+end
+
+-- RAWTEXT end tag name state: collects the name of a possible end tag.
+-- The end tag is accepted only when its name equals the `tag` field of the
+-- node returned by parser:get_parent() and it is followed by whitespace,
+-- "/" or ">". In every other case the consumed "</name" is emitted as text
+-- and the current character is reprocessed in the rawtext state.
+HtmlStates.rawtext_end_tag_name = function(parser)
+  -- we need to find name of the currently opened tag
+  local parent = parser:get_parent() or {}
+  local opened_tag = parent.tag
+  local current_tag = table.concat(parser.current_token.name or {})
+  local codepoint = parser.codepoint
+  if is_upper_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint + 0x20))
+    -- the temp buffer keeps the original spelling for a possible fallback
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "rawtext_end_tag_name"
+  elseif is_lower_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "rawtext_end_tag_name"
+  end
+  if opened_tag == current_tag then
+    if is_space(codepoint) then
+      return "before_attribute_name"
+    elseif codepoint == solidus then
+      return "self_closing_tag"
+    elseif codepoint == greater_than then
+      parser:emit()
+      return "data"
+    end
+    -- a matching name followed by an unexpected character falls through,
+    -- otherwise the character would be swallowed and the state would stick
+  end
+  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
+  parser.temp_buffer = {}
+  return parser:tokenize("rawtext")
+end
+
+-- Script data state: content in which only "<" gets special handling.
+HtmlStates.script_data = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == less_than then
+    return "script_data_less_than"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "script_data"
+end
+
+-- Script data less-than sign state: "<" was seen in script data.
+HtmlStates.script_data_less_than = function(parser)
+  local cp = parser.codepoint
+  if cp == solidus then
+    -- possible end tag; start with an empty temp buffer
+    parser.temp_buffer = {}
+    return "script_data_end_tag_open"
+  end
+  if cp == exclam then
+    -- "<!" may open an escaped section
+    parser:emit_character("<!")
+    return "script_data_escape_start"
+  end
+  parser:emit_character("<")
+  return parser:tokenize("script_data")
+end
+
+-- Script data end tag open state: "</" was seen in script data.
+HtmlStates.script_data_end_tag_open = function(parser)
+  if not is_alpha(parser.codepoint) then
+    -- no tag name follows: "</" is plain text
+    parser:emit_character("</")
+    return parser:tokenize("script_data")
+  end
+  parser:start_token("end_tag", {name={}})
+  return parser:tokenize("script_data_end_tag_name")
+end
+
+-- Script data end tag name state: collects the name of a possible end tag.
+-- The end tag is accepted only when its name equals the `tag` field of the
+-- node returned by parser:get_parent() and it is followed by whitespace,
+-- "/" or ">". In every other case the consumed "</name" is emitted as text
+-- and the current character is reprocessed in the script_data state.
+HtmlStates.script_data_end_tag_name = function(parser)
+  -- we need to find name of the currently opened tag
+  local parent = parser:get_parent() or {}
+  local opened_tag = parent.tag
+  local current_tag = table.concat(parser.current_token.name or {})
+  local codepoint = parser.codepoint
+  if is_upper_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint + 0x20))
+    -- the temp buffer keeps the original spelling for a possible fallback
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_end_tag_name"
+  elseif is_lower_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_end_tag_name"
+  end
+  if opened_tag == current_tag then
+    if is_space(codepoint) then
+      return "before_attribute_name"
+    elseif codepoint == solidus then
+      return "self_closing_tag"
+    elseif codepoint == greater_than then
+      parser:emit()
+      return "data"
+    end
+    -- a matching name followed by an unexpected character falls through,
+    -- otherwise the character would be swallowed and the state would stick
+  end
+  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
+  parser.temp_buffer = {}
+  return parser:tokenize("script_data")
+end
+
+-- Script data escape start state: "<!" was seen in script data.
+HtmlStates.script_data_escape_start = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == hyphen then
+    parser:emit_character("-")
+    return "script_data_escape_start_dash"
+  else
+    -- the result must be returned: without it the caller gets nil and keeps
+    -- this state, although the character was handled by script_data
+    return parser:tokenize("script_data")
+  end
+end
+
+-- Script data escape start dash state: "<!-" was seen in script data.
+HtmlStates.script_data_escape_start_dash = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == hyphen then
+    parser:emit_character("-")
+    return "script_data_escaped_dash_dash"
+  else
+    -- the result must be returned: without it the caller gets nil and keeps
+    -- this state, although the character was handled by script_data
+    return parser:tokenize("script_data")
+  end
+end
+
+
+-- Script data escaped state.
+HtmlStates.script_data_escaped = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == hyphen then
+    parser:emit_character("-")
+    return "script_data_escaped_dash"
+  end
+  if cp == less_than then
+    return "script_data_escaped_less_than_sign"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "script_data_escaped"
+end
+
+-- Script data escaped dash state: one "-" was seen in escaped script data.
+HtmlStates.script_data_escaped_dash = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == hyphen then
+    parser:emit_character("-")
+    return "script_data_escaped_dash_dash"
+  end
+  if cp == less_than then
+    return "script_data_escaped_less_than_sign"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "script_data_escaped"
+end
+
+-- Script data escaped dash dash state: "--" was seen in escaped script data.
+HtmlStates.script_data_escaped_dash_dash = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == hyphen then
+    parser:emit_character("-")
+    return "script_data_escaped_dash_dash"
+  end
+  if cp == less_than then
+    return "script_data_escaped_less_than_sign"
+  end
+  if cp == greater_than then
+    -- "-->" ends the escaped section
+    parser:emit_character(">")
+    return "script_data"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "script_data_escaped"
+end
+
+-- Script data escaped less-than sign state: "<" in escaped script data.
+HtmlStates.script_data_escaped_less_than_sign = function(parser)
+  local cp = parser.codepoint
+  if cp == solidus then
+    parser.temp_buffer = {}
+    return "script_data_escaped_end_tag_open"
+  end
+  local target = "script_data_escaped"
+  if is_alpha(cp) then
+    -- a letter may start a nested tag name, collected in the temp buffer
+    parser.temp_buffer = {}
+    target = "script_data_double_escape_start"
+  end
+  parser:emit_character("<")
+  return parser:tokenize(target)
+end
+
+-- Script data escaped end tag open state: "</" in escaped script data.
+HtmlStates.script_data_escaped_end_tag_open = function(parser)
+  if not is_alpha(parser.codepoint) then
+    -- no tag name follows: "</" is plain text
+    parser:emit_character("</")
+    return parser:tokenize("script_data_escaped")
+  end
+  parser:start_token("end_tag", {name={}})
+  return parser:tokenize("script_data_escaped_end_tag_name")
+end
+
+-- Script data escaped end tag name state: collects the name of a possible
+-- end tag. The end tag is accepted only when its name equals the `tag`
+-- field of the node returned by parser:get_parent() and it is followed by
+-- whitespace, "/" or ">". In every other case the consumed "</name" is
+-- emitted as text and the current character is reprocessed in the
+-- script_data_escaped state.
+HtmlStates.script_data_escaped_end_tag_name = function(parser)
+  -- we need to find name of the currently opened tag
+  local parent = parser:get_parent() or {}
+  local opened_tag = parent.tag
+  local current_tag = table.concat(parser.current_token.name or {})
+  local codepoint = parser.codepoint
+  if is_upper_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint + 0x20))
+    -- the temp buffer keeps the original spelling for a possible fallback
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_escaped_end_tag_name"
+  elseif is_lower_alpha(codepoint) then
+    parser:append_token_data("name", uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_escaped_end_tag_name"
+  end
+  if opened_tag == current_tag then
+    if is_space(codepoint) then
+      return "before_attribute_name"
+    elseif codepoint == solidus then
+      return "self_closing_tag"
+    elseif codepoint == greater_than then
+      parser:emit()
+      return "data"
+    end
+    -- a matching name followed by an unexpected character falls through,
+    -- otherwise the character would be swallowed and the state would stick
+  end
+  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
+  parser.temp_buffer = {}
+  return parser:tokenize("script_data_escaped")
+end
+
+-- Script data double escape start state: collects a tag name that follows
+-- "<" inside escaped script data. The name is gathered (lowercased) in
+-- parser.temp_buffer, which the preceding less-than sign state resets.
+-- When the name is terminated by whitespace, "/" or ">", the double escaped
+-- state is entered only if the collected name is "script".
+HtmlStates.script_data_double_escape_start = function(parser)
+  local codepoint = parser.codepoint
+  if is_space(codepoint) or
+     codepoint == solidus or
+     codepoint == greater_than
+  then
+    -- terminator of the name; letters must not be tested here, otherwise
+    -- the collecting branches below can never run
+    local collected = table.concat(parser.temp_buffer or {})
+    parser:emit_character(uchar(codepoint))
+    if collected == "script" then
+      return "script_data_double_escaped"
+    else
+      return "script_data_escaped"
+    end
+  elseif is_upper_alpha(codepoint) then
+    parser:emit_character(uchar(codepoint))
+    -- lowercase the code point before converting it to a string
+    table.insert(parser.temp_buffer, uchar(codepoint + 0x20))
+    return "script_data_double_escape_start"
+  elseif is_lower_alpha(codepoint) then
+    parser:emit_character(uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_double_escape_start"
+  else
+    return parser:tokenize("script_data_escaped")
+  end
+end
+
+-- Script data double escaped state.
+HtmlStates.script_data_double_escaped = function(parser)
+  local cp = fix_null(parser.codepoint)
+  if cp == hyphen then
+    parser:emit_character("-")
+    return "script_data_double_escaped_dash"
+  end
+  if cp == less_than then
+    parser:emit_character("<")
+    return "script_data_double_escaped_less_than_sign"
+  end
+  if cp == EOF then
+    parser:emit_eof()
+    return
+  end
+  parser:emit_character(uchar(cp))
+  return "script_data_double_escaped"
+end
+
+-- Script data double escaped dash state: at least one "-" was seen in
+-- double escaped script data. Returns the next state name, nothing at EOF.
+-- NOTE(review): this single state handles both one and several dashes, so
+-- "->" already switches back to script_data; the WHATWG tokenizer appears
+-- to use a separate "dash dash" state for that - confirm whether the
+-- simplification is intended.
+HtmlStates.script_data_double_escaped_dash = function(parser)
+  local codepoint = parser.codepoint
+  codepoint = fix_null(codepoint)
+  if codepoint == hyphen then
+    parser:emit_character("-")
+    return "script_data_double_escaped_dash"
+  elseif codepoint == less_than then
+    parser:emit_character("<")
+    return "script_data_double_escaped_less_than_sign"
+  elseif codepoint == greater_than then
+    -- end of the escaped section
+    parser:emit_character(">")
+    return "script_data"
+  elseif codepoint == EOF then
+    parser:emit_eof()
+  else
+    parser:emit_character(uchar(codepoint))
+    return "script_data_double_escaped"
+  end
+end
+
+-- Script data double escaped less-than sign state: "<" was emitted in
+-- double escaped script data. A following "/" may start the end tag that
+-- leaves the double escaped mode.
+HtmlStates.script_data_double_escaped_less_than_sign = function(parser)
+  local codepoint = parser.codepoint
+  if codepoint == solidus then
+    -- the tag name is collected from scratch in the temp buffer
+    parser.temp_buffer = {}
+    -- "/" is character data; parser:emit() expects a token, not a string
+    parser:emit_character("/")
+    return "script_data_double_escape_end"
+  else
+    return parser:tokenize("script_data_double_escaped")
+  end
+end
+
+-- Script data double escape end state: collects the tag name after "</" in
+-- double escaped script data (lowercased, in parser.temp_buffer). When the
+-- name is terminated by whitespace, "/" or ">", the parser goes back to the
+-- escaped state only if the collected name is "script".
+HtmlStates.script_data_double_escape_end = function(parser)
+  local codepoint = parser.codepoint
+  if is_space(codepoint) or
+     codepoint == solidus or
+     codepoint == greater_than
+  then
+    -- terminator of the name; letters must not be tested here, otherwise
+    -- the collecting branches below can never run
+    local collected = table.concat(parser.temp_buffer or {})
+    parser:emit_character(uchar(codepoint))
+    if collected == "script" then
+      return "script_data_escaped"
+    else
+      return "script_data_double_escaped"
+    end
+  elseif is_upper_alpha(codepoint) then
+    parser:emit_character(uchar(codepoint))
+    -- lowercase the code point before converting it to a string
+    table.insert(parser.temp_buffer, uchar(codepoint + 0x20))
+    -- keep collecting in this state, not in the "start" twin
+    return "script_data_double_escape_end"
+  elseif is_lower_alpha(codepoint) then
+    parser:emit_character(uchar(codepoint))
+    table.insert(parser.temp_buffer, uchar(codepoint))
+    return "script_data_double_escape_end"
+  else
+    return parser:tokenize("script_data_double_escaped")
+  end
+end
+
+-- formatting elements need special treatment
+local formatting_element_names = {
+  a = true, b = true, big = true, code = true, em = true, font = true,
+  i = true, nobr = true, s = true, small = true, strike = true,
+  strong = true, tt = true, u = true,
+}
+
+-- Returns true for a formatting element name, nil otherwise.
+local function is_formatting_element(name)
+  return formatting_element_names[name]
+end
+
+-- set of "special" element names, built from the list below
+local special_elements = {}
+
+local special_elements_list = {
+  "address", "applet", "area", "article", "aside", "base", "basefont",
+  "bgsound", "blockquote", "body", "br", "button", "caption", "center",
+  "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed",
+  "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
+  "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr",
+  "html", "iframe", "img", "input", "keygen", "li", "link", "listing",
+  "main", "marquee", "menu", "meta", "nav", "noembed", "noframes",
+  "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
+  "section", "select", "source", "style", "summary", "table", "tbody", "td",
+  "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track",
+  "ul", "wbr", "xmp",
+  "mi", "mo", "mn", "ms", "mtext", "annotation-xml", "foreignObject",
+  "desc", "title",
+}
+
+for _, element_name in ipairs(special_elements_list) do
+  special_elements[element_name] = true
+end
+
+
+-- Returns true when `name` is in the special_elements set, nil otherwise.
+local function is_special(name)
+  local found = special_elements[name]
+  return found
+end
+
+-- these lists are used in HtmlParser:generate_implied_endtags()
+-- The ruby entries are "rt" and "rtc": "rd" and "trc" are not HTML
+-- elements, so the previous spelling never matched the real tags.
+local implied_endtags = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true, rb = true, rp = true, rt = true, rtc = true}
+-- the "thorough" variant additionally covers the table related elements
+local implied_endtags_thoroughly = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true,
+      rb = true, rp = true, rt = true, rtc = true, caption = true, colgroup = true, tbody = true, td = true,
+      tfoot = true, th = true, thead = true, tr = true
+}
+
+-- Search parser.unfinished from the last entry backwards for `target`.
+-- Returns true when an entry with that tag is found, and false when a tag
+-- listed in `element_list` is met first or when the list is exhausted.
+local function is_in_scope(parser, target, element_list)
+  local open_elements = parser.unfinished
+  local idx = #open_elements
+  while idx >= 1 do
+    local tag = open_elements[idx].tag
+    if tag == target then
+      return true
+    end
+    if element_list[tag] then
+      -- a scope boundary hides everything below it
+      return false
+    end
+    idx = idx - 1
+  end
+  return false
+end
+
+-- tags that terminate the search in is_in_particular_scope
+local particular_scope_elements = {
+  applet = true, caption = true, html = true, table = true, td = true,
+  th = true, marquee = true, object = true, template = true,
+  mi = true, mo = true, mn = true, ms = true, mtext = true,
+  ["annotation-xml"] = true,
+  foreignObject = true, desc = true, title = true,
+}
+
+-- Scope test bounded by particular_scope_elements.
+local function is_in_particular_scope(parser, target)
+  return is_in_scope(parser, target, particular_scope_elements)
+end
+
+-- derived scope lists
+--
+-- list_item scope: the particular scope boundaries plus ol and ul
+local list_item_scope_elements = {ol = true, ul = true}
+for name, flag in pairs(particular_scope_elements) do
+  list_item_scope_elements[name] = flag
+end
+
+-- Scope test bounded by list_item_scope_elements.
+local function is_in_list_item_scope(parser, target)
+  return is_in_scope(parser, target, list_item_scope_elements)
+end
+
+-- button scope: the particular scope boundaries plus button
+local button_scope_elements = {button = true}
+for name, flag in pairs(particular_scope_elements) do
+  button_scope_elements[name] = flag
+end
+
+-- Scope test bounded by button_scope_elements.
+local function is_in_button_scope(parser, target)
+  return is_in_scope(parser, target, button_scope_elements)
+end
+
+-- table scope: only these three tags act as boundaries
+local table_scope_elements = {
+  html = true,
+  table = true,
+  template = true,
+}
+
+-- Scope test bounded by table_scope_elements.
+local function is_in_table_scope(parser, target)
+  return is_in_scope(parser, target, table_scope_elements)
+end
+
+-- select scope
+-- Unlike the other scopes, every tag is a boundary here except optgroup
+-- and option, which are the only ones the search may pass over.
+local function is_in_select_scope(parser, target)
+  local open_elements = parser.unfinished
+  for idx = #open_elements, 1, -1 do
+    local tag = open_elements[idx].tag
+    if tag == target then
+      return true
+    end
+    if tag ~= "optgroup" and tag ~= "option" then
+      return false
+    end
+  end
+  return false
+end
+
+-- List of active formatting elements
+-- https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
+-- we don't implement it yet, but maybe in the future. 
+
+
+-- table of tree construction handlers; nothing is added to it in this
+-- part of the file (presumably filled in later - verify)
+local HtmlTreeStates = {}
+
+
+
+
+
+
+-- parser class; HtmlParser:init() creates instances that use this table
+-- as their metatable and __index
+local HtmlParser = {}
+
+--- Create a new parser object for the HTML string `body`.
+-- The object uses `self` as its metatable and method table.
+function HtmlParser:init(body)
+  self.__index = self
+  local parser = {
+    body              = self:normalize_newlines(body), -- HTML string
+    position          = 0,          -- position in the parsed string
+    unfinished        = {},         -- list of opened elements
+    Document          = Root:init(),
+    default_state     = "data",     -- default state machine state
+    state             = "data",     -- working state of the machine
+    return_state      = "data",     -- special state set by entities parsing
+    temp_buffer       = {},         -- keep temporary data
+    current_token     = {type="start"}, -- currently processed token
+    insertion_mode    = "initial",  -- tree construction state
+    -- head_pointer and form_pointer start out unset (nil)
+    active_formatting = {},         -- list of active formatting elements
+    scripting_flag    = false,      -- we will not support scripting
+  }
+  return setmetatable(parser, self)
+end
+
+--- Convert all line endings in `body` to "\n".
+-- @param body string with arbitrary line endings
+-- @return the normalized string (a single value)
+function HtmlParser:normalize_newlines(body)
+  -- we must normalize newlines; "\r\n" pairs go first so that they do not
+  -- end up as two newlines
+  local normalized = body:gsub("\r\n", "\n"):gsub("\r", "\n")
+  -- gsub also returns the number of substitutions; keep it out of the result
+  return normalized
+end
+
+-- declare void elements
+local self_closing_tags_list = {
+  "area", "base", "br", "col", "embed", "hr", "img", "input",
+  "link", "meta", "param", "source", "track", "wbr",
+}
+
+-- set form of the list above, for lookups by tag name
+local self_closing_tags = {}
+for _, tag_name in ipairs(self_closing_tags_list) do
+  self_closing_tags[tag_name] = true
+end
+
+
+
+
+--- Run the tokenizer over the whole body and return the result of finish().
+-- The input is assumed to be UTF-8; convert it beforehand if the source
+-- uses a different encoding.
+function HtmlParser:parse()
+  self.text = {}
+  self.state = self.default_state
+  -- characters at positions up to ignored_pos are skipped; this makes it
+  -- possible to pass over parts we want to ignore (scripts, css, etc.)
+  self.ignored_pos = -1
+  for offset, code in utf8.codes(self.body) do
+    if offset > self.ignored_pos then
+      -- publish the current character for the state functions
+      self.position  = offset
+      self.codepoint = code
+      self.character = uchar(code)
+      -- a state function that returns nothing keeps the current state
+      local next_state = self:tokenize(self.state)
+      self.state = next_state or self.state
+    end
+  end
+  return self:finish()
+end
+
+--- Execute one state function on the current character.
+-- @param state name of the state to run (defaults to self.state)
+-- @return the new state: self.element_state when it was set during the
+-- call, otherwise whatever the state function returned
+function HtmlParser:tokenize(state)
+  state = state or self.state
+
+  self.last_position = self.position
+  self.element_state = false
+  local handler = HtmlStates[state]
+  local newstate
+  if handler then
+    newstate = handler(self)
+  else
+    -- unknown state name: fall back to the default state
+    newstate = self.default_state
+  end
+  -- elements that need special treatment, like <script> or <style>, can
+  -- force a state through element_state
+  return self.element_state or newstate
+end
+
+function HtmlParser:start_token(typ, data)
+  -- the passed table itself becomes the current token, marked with its type
+  -- the previous token is not emitted here
+  local token = data
+  token.type = typ
+  self.current_token = token
+end
+
+function HtmlParser:discard_token()
+  -- replace the current token with a placeholder that emit() ignores
+  local placeholder = {type="empty"}
+  self.current_token = placeholder
+end
+
+
+
+function HtmlParser:append_token_data(name, data)
+  -- add data to the list saved in the given field of the current token
+  -- nothing happens when the field is missing, or when it isn't a table
+  local token = self.current_token or {}
+  local list = token[name]
+  if type(list) == "table" then
+    table.insert(list, data)
+  end
+end
+
+function HtmlParser:set_token_data(name, data)
+  -- save data in the given field of the current token
+  local token = self.current_token
+  if not token then
+    -- no current token, the value goes to a table that is thrown away
+    token = {}
+  end
+  token[name] = data
+end
+
+function HtmlParser:flush_temp_buffer()
+  -- return characters collected in the temp buffer back to the document
+  -- NOTE(review): the first branch depends on the type of the last started
+  -- token, not on return_state - confirm that this is correct for text
+  -- that follows directly after a start tag
+  local token  = self.current_token
+  local buffer = self.temp_buffer
+  if token.type == "start_tag" then
+    -- in start tag, entities can be only in attribute value
+    local value = token.current_attr_value
+    for _, char in ipairs(buffer) do
+      table.insert(value, char)
+    end
+  elseif self.return_state == "data" then
+    -- in text, each character is emitted as a separate character token
+    for _, char in ipairs(buffer) do
+      self:emit_character(char)
+    end
+  end
+  self.temp_buffer = {}
+end
+
+function HtmlParser:add_entity(char)
+  -- write a decoded entity to the document and clear the temp buffer
+  -- NOTE(review): the target depends on the type of the last started token
+  -- - confirm that this is correct for entities directly after a start tag
+  local token = self.current_token
+  if token.type ~= "start_tag" then
+    -- outside of a start tag, the entity is emitted as a character token
+    self:emit_character(char)
+  else
+    -- in a start tag, the entity belongs to the current attribute value
+    table.insert(token.current_attr_value, char)
+  end
+  self.temp_buffer = {}
+end
+
+function HtmlParser:emit(token)
+  -- state machine functions should use this function to emit tokens
+  -- the token type is taken from the argument, or from the current token,
+  -- but the add_*/start_tag/end_tag methods called below work with
+  -- self.current_token
+  local emitted = token or self.current_token
+  local kind = emitted.type
+  if kind == "character" then
+    -- characters are only collected here, add_text() makes a text node from them
+    table.insert(self.text, emitted.char)
+    return
+  end
+  if kind == "doctype" or kind == "start_tag"
+    or kind == "end_tag" or kind == "comment"
+  then
+    -- save the text collected before this token first
+    self:add_text()
+    if kind == "doctype" then
+      self:add_doctype()
+    elseif kind == "start_tag" then
+      self:start_tag()
+    elseif kind == "end_tag" then
+      self:end_tag()
+    else
+      self:add_comment()
+    end
+  end
+  -- other token types, including "empty", are ignored
+end
+
+function HtmlParser:emit_character(text)
+  -- make a character token from the text and emit it immediately
+  local token = {char=text}
+  self:start_token("character", token)
+  self:emit()
+end
+
+function HtmlParser:emit_eof()
+  -- make the end of file token and emit it immediately
+  local token = {}
+  self:start_token("end_of_file", token)
+  self:emit()
+end
+
+function HtmlParser:get_parent()
+  -- the innermost unfinished element, or the document when nothing is opened
+  local stack = self.unfinished
+  local top = stack[#stack]
+  if top then
+    return top
+  end
+  return self.Document
+end
+
+function HtmlParser:close_element()
+  -- take the last element from the unfinished list and return it
+  local stack = self.unfinished
+  return table.remove(stack)
+end
+
+function HtmlParser:add_text(text)
+  -- make a text node from the passed text, or from the parser's
+  -- character buffer when no text is passed
+  local content = text or self.text
+  if type(content) == "table" and #content > 0 then
+    content = table.concat(content)
+  end
+  -- an empty table is still a table here, so the string test skips it
+  if type(content) == "string" and content ~= "" then
+    local parent = self:get_parent()
+    parent:add_child(Text:init(content, parent))
+  end
+  -- the character buffer is always cleared, also when the text was passed in
+  self.text = {}
+end
+
+
+function HtmlParser:start_attribute()
+  -- save the attribute that is collected in the current start tag
+  -- and prepare empty buffers for the next one
+  local token = self.current_token or {}
+  if token.type ~= "start_tag" then return end
+  local name  = table.concat(token.current_attr_name)
+  local value = table.concat(token.current_attr_value)
+  if name ~= "" then
+    -- attributes are kept as a list of name/value records
+    table.insert(token.attr, {name = name, value = value})
+  end
+  self:set_token_data("current_attr_name", {})
+  self:set_token_data("current_attr_value", {})
+end
+
+function HtmlParser:set_xmlns(node, parent)
+  -- set the xmlns field of the node
+  -- node:   element, its attr list is searched for the xmlns attribute
+  -- parent: optional element that the namespace is inherited from,
+  --         the current parent element is used when it is not passed
+  -- try to find xmlns in node's attributes first
+  for _, attr in ipairs(node.attr or {}) do
+    if attr.name == "xmlns" then
+      node.xmlns = attr.value
+      return
+    end
+  end
+  -- if we cannot find xmlns attribute, then use
+  -- xmlns from the parent element, or the default xmlns
+  parent = parent or self:get_parent()
+  node.xmlns = parent.xmlns or xmlns.HTML
+end
+
+function HtmlParser:start_tag()
+  -- make an element from the current start tag token
+  local token = self.current_token
+  if token.type ~= "start_tag" then return end
+  -- save the attribute that can be still opened
+  self:start_attribute()
+  -- make the Element object, pass attributes and info about self_closing
+  local tag_name = table.concat(token.name)
+  local parent = self:get_parent()
+  local element = Element:init(tag_name, parent)
+  element.attr = token.attr
+  element.self_closing = token.self_closing
+  self:set_xmlns(element)
+  -- <img /> syntax and void elements are added to the parent right away
+  local without_content = token.self_closing or self_closing_tags[tag_name]
+  if without_content then
+    parent:add_child(element)
+  else
+    -- other elements wait in the unfinished list until they are closed
+    table.insert(self.unfinished, element)
+  end
+  -- content of these elements needs a special tokenizer state,
+  -- tokenize() returns element_state instead of the regular new state
+  local special_states = {
+    title  = "rcdata",
+    style  = "rawtext",
+    script = "script_data",
+  }
+  local forced = special_states[tag_name]
+  if forced then
+    self.element_state = forced
+  end
+end
+
+function HtmlParser:end_tag()
+  -- close the innermost opened element and add it to its parent
+  -- the name in the end tag is not compared with the closed element
+  local token = self.current_token
+  if token.type ~= "end_tag" then return end
+  -- end tag without any opened element is ignored
+  if #self.unfinished == 0 then return nil end
+  local closed = self:close_element()
+  self:get_parent():add_child(closed)
+end
+
+function HtmlParser:add_comment()
+  -- make a comment node from the current comment token
+  local token = self.current_token
+  if token.type ~= "comment" then return end
+  self:start_attribute()
+  local parent = self:get_parent()
+  local comment = Comment:init(table.concat(token.data), parent)
+  parent:add_child(comment)
+end
+
+function HtmlParser:add_doctype()
+  -- make a doctype node from the current doctype token
+  local token = self.current_token
+  if token.type ~= "doctype" then return end
+  self:start_attribute()
+  local parent = self:get_parent()
+  local doctype = Doctype:init(table.concat(token.name), parent)
+  -- additional data are passed to the node only when the token has some
+  local data = token.data
+  if #data > 0 then
+    doctype:add_data(table.concat(data))
+  end
+  parent:add_child(doctype)
+end
+
+function HtmlParser:switch_insertion(name)
+  -- remember the name of the new insertion mode
+  local mode = name
+  self.insertion_mode = mode
+end
+
+function HtmlParser:current_node()
+  -- current node is the same element that get_parent() returns
+  local node = self:get_parent()
+  return node
+end
+
+function HtmlParser:adjusted_current_node()
+  -- https://html.spec.whatwg.org/multipage/parsing.html#adjusted-current-node
+  -- this feature is not supported yet, so we just return the current node
+  local node = self:current_node()
+  return node
+end
+
+
+function HtmlParser:reset_insertion_mode()
+  -- https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
+  -- go through the unfinished elements from the innermost one and switch
+  -- the insertion mode according to the first element that we recognize
+  -- there is lot of other cases, but we support only basic ones
+  -- we can support other insertion modes in the future
+  for position = #self.unfinished, 1, -1 do
+    local name = self.unfinished[position].tag
+    -- "last" marks the element at the bottom of the stack
+    local last = position == 1
+    if name == "head" and not last then
+      -- the spec requires that last is false for the head element,
+      -- head at the bottom of the stack is handled by the "last" branch
+      self:switch_insertion("in_head")
+      return
+    elseif name == "body" then
+      self:switch_insertion("in_body")
+      return
+    elseif name == "html" then
+      -- before_head is used only when there is no head element yet
+      if self.head_pointer then
+        self:switch_insertion("after_head")
+      else
+        self:switch_insertion("before_head")
+      end
+      return
+    elseif last then
+      self:switch_insertion("in_body")
+      return
+    end
+  end
+  -- by default use in_body
+  self:switch_insertion("in_body")
+end
+
+
+-- https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
+function HtmlParser:generate_implied_endtags(included, ignored)
+  -- included: table with element names as keys, elements with these names
+  --           are closed implicitly (implied_endtags is used by default)
+  -- ignored:  optional list of names that must stay opened even if they
+  --           are in the "included" table
+  local included = included or implied_endtags
+  -- make a local set from the ignored names, the "included" table can be
+  -- shared between calls, so we must not remove anything from it
+  local skipped = {}
+  for _, name in ipairs(ignored or {}) do skipped[name] = true end
+  local unfinished = self.unfinished
+  local current = unfinished[#unfinished]
+  -- keep removing elements while they are in the "included" list
+  -- the elements are only removed from the unfinished list here
+  while current and included[current.tag] and not skipped[current.tag] do
+    table.remove(unfinished)
+    current = unfinished[#unfinished]
+  end
+end
+
+function HtmlParser:finish()
+  -- run the tokenizer once more, with EOF instead of a real character
+  self.codepoint = EOF
+  self:tokenize(self.state)
+  -- save characters that are still waiting in the buffer
+  self:add_text()
+  local nothing_opened = #self.unfinished == 0
+  if nothing_opened then
+    -- add implicit html tag
+    -- NOTE(review): start_tag() has no parameters and it works with
+    -- self.current_token, so "html" is not used by it - confirm the intent
+    self:start_tag("html")
+  end
+  -- close all unfinished elements, the innermost one first
+  while #self.unfinished > 0 do
+    local closed = self:close_element()
+    self:get_parent():add_child(closed)
+  end
+  -- the document object is the root of the parsed tree
+  return self.Document
+end
+
+-- public interface of the module
+-- node and parser objects
+M.HtmlParser = HtmlParser
+M.Element    = Element
+M.Text       = Text
+-- data used by the tokenizer
+M.HtmlStates         = HtmlStates         -- table with functions for particular parser states
+M.self_closing_tags  = self_closing_tags  -- list of void elements
+M.search_entity_tree = search_entity_tree
+-- scope tests
+M.is_in_particular_scope = is_in_particular_scope
+M.is_in_list_item_scope  = is_in_list_item_scope
+M.is_in_button_scope     = is_in_button_scope
+M.is_in_table_scope      = is_in_table_scope
+M.is_in_select_scope     = is_in_select_scope
+
+return M


Property changes on: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-mod-html.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-namedentities.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-namedentities.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-namedentities.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -40,7 +40,7 @@
 ["nsqsube"]="⋢",
 ["Nacute"]="Ń",
 ["mcomma"]="⨩",
-["ApplyFunction"]=utf8.char(8289),
+["ApplyFunction"]=utf8.char(0x2061),
 ["rfisht"]="⥽",
 ["phmmat"]="ℳ",
 ["rarrw"]="↝",

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-testxml.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-testxml.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-testxml.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -9,9 +9,18 @@
 -- Initial Import
 --
 
-modxml = require('luaxml-mod-xml')
-handler = require('luaxml-mod-handler')
-pretty = require('luaxml-pretty')
+local modxml
+local handler
+local pretty
+if kpse then
+    modxml = require('luaxml-mod-xml')
+    handler = require('luaxml-mod-handler')
+    pretty = require('luaxml-pretty')
+else
+    modxml = require('luaxml.mod-xml')
+    handler = require('luaxml.mod-handler')
+    pretty = require('luaxml.pretty')
+end
 
 
 -- Defaults

Modified: branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-transform.lua
===================================================================
--- branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-transform.lua	2024-02-23 22:09:07 UTC (rev 70105)
+++ branches/branch2023.final/Master/texmf-dist/tex/luatex/luaxml/luaxml-transform.lua	2024-02-23 22:09:19 UTC (rev 70106)
@@ -4,8 +4,15 @@
 
 -- code originaly comes from from https://github.com/michal-h21/luaxml-mathml
 --
-local domobject = require "luaxml-domobject"
-local cssquery = require "luaxml-cssquery"
+local domobject
+local cssquery
+if kpse then
+  domobject = require "luaxml-domobject"
+  cssquery = require "luaxml-cssquery"
+else
+  domobject = require "luaxml.domobject"
+  cssquery = require "luaxml.cssquery"
+end
 -- initialize CSS selector object
 local css = cssquery()
 
@@ -49,7 +56,11 @@
   -- process all Unicode characters and find if they should be replaced
   for _, char in utf8.codes(text) do
     -- construct new string with replacements or original char
-    t[#t+1] = unicodes[char] or utf8.char(char)
+    if verbatim then
+      t[#t+1] = utf8.char(char)
+    else
+      t[#t+1] = unicodes[char] or utf8.char(char)
+    end
   end
   local text = table.concat(t)
   if parameters.collapse_newlines==true then
@@ -56,7 +67,7 @@
     text = text:gsub("\n", " ")
   end
   -- verbatim can be set in parameters table. it prevent collapsing of spaces. 
-  if not parameters.verbatim then
+  if not verbatim then
     text = text:gsub("(%s%s+)", function(a) return a:sub(1,1) end)
   end
   return text



More information about the tex-live-commits mailing list.