[latex3-commits] [git/LaTeX3-latex3-luaotfload] bidi-dev: Initial bidi (7f10ba3)

Marcel Fabian Krüger tex at 2krueger.de
Tue Aug 13 02:42:26 CEST 2019


Repository : https://github.com/latex3/luaotfload
On branch  : bidi-dev
Link       : https://github.com/latex3/luaotfload/commit/7f10ba38f3986e72dc069837e1c990f60a2ac0da

>---------------------------------------------------------------

commit 7f10ba38f3986e72dc069837e1c990f60a2ac0da
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date:   Sat Aug 10 14:41:34 2019 +0200

    Initial bidi


>---------------------------------------------------------------

7f10ba38f3986e72dc069837e1c990f60a2ac0da
 src/luaotfload-bidi.lua                         | 587 ++++++++++++++++++++++++
 src/luaotfload-init.lua                         |   6 +-
 src/luaotfload-main.lua                         |   1 +
 texmf/tex/luatex/luaotfload/luaotfload-init.lua |   6 +-
 texmf/tex/luatex/luaotfload/luaotfload-main.lua |   1 +
 5 files changed, 599 insertions(+), 2 deletions(-)

diff --git a/src/luaotfload-bidi.lua b/src/luaotfload-bidi.lua
new file mode 100644
index 0000000..387accb
--- /dev/null
+++ b/src/luaotfload-bidi.lua
@@ -0,0 +1,587 @@
+-----------------------------------------------------------------------
+--         FILE:  luaotfload-bidi.lua
+--  DESCRIPTION:  part of luaotfload / bidi
+-----------------------------------------------------------------------
+
+-- Identification record of this submodule. It is handed to
+-- luatexbase.provides_module below when that function exists.
+local ProvidesLuaModule = { 
+    name          = "luaotfload-bidi",
+    version       = "2.9904",     --TAGVERSION
+    date          = "2019-08-02", --TAGDATE
+    description   = "luaotfload submodule / bidi",
+    license       = "GPL v2.0",
+    author        = "Marcel Krüger"
+}
+
+-- Only announce the module if luatexbase offers provides_module.
+if luatexbase and luatexbase.provides_module then
+  luatexbase.provides_module (ProvidesLuaModule)
+end  
+
+local nodenew            = node.direct.new
+local nodecopy           = node.direct.copy
+local setsubtype         = node.direct.setsubtype
+local setchar            = node.direct.setchar
+local getchar            = node.direct.getchar
+local getfont            = node.direct.getfont
+local getid              = node.direct.getid
+local setnext            = node.direct.setnext
+local getnext            = node.direct.getnext
+local setprev            = node.direct.setprev
+local getprev            = node.direct.getprev
+local traverse           = node.direct.traverse
+local getwhd             = node.direct.getwhd
+local tail               = node.direct.tail
+local remove             = node.direct.remove
+local insert_after       = node.direct.insert_after
+local insert_before      = node.direct.insert_before
+local traverse_char      = node.direct.traverse_char
+local protect_glyph      = node.direct.protect_glyph
+local getdirection       = node.direct.getdirection
+local setdirection       = node.direct.setdirection
+local otffeatures        = fonts.constructors.newfeatures "otf"
+
+local codepoint = lpeg.S'0123456789ABCDEF'^4/function(c)return tonumber(c, 16)end
+-- bidi_classes maps a codepoint to its bidi class, taken from the
+-- fifth ';'-separated field of every UnicodeData.txt line. Codepoints
+-- without a line of their own are resolved (and cached) by the __index
+-- fallback: members of the big ranges listed there are "L", everything
+-- else is "ON".
+local bidi_classes do
+  local entry = lpeg.Cg(codepoint * ';' * (1-lpeg.P';')^0 * ';' * (1-lpeg.P';')^0 * ';' * (1-lpeg.P';')^0 * ';' * lpeg.C((1-lpeg.P';')^0) * ';')^-1 * (1-lpeg.P'\n')^0 * '\n'
+  local file = lpeg.Cf(
+      lpeg.Ct''
+    * entry^0
+  , rawset)
+
+  -- Fail with a readable message instead of an obscure "bad argument"
+  -- or "attempt to index nil" if the data file is missing/unreadable.
+  local path = assert(kpse.find_file"UnicodeData.txt",
+                      "luaotfload-bidi: UnicodeData.txt not found")
+  local f = assert(io.open(path))
+  local data = f:read'*a'
+  f:close() -- close before parsing so the handle is never leaked
+  bidi_classes = setmetatable(file:match(data), {
+      __index = function(t, cp)
+        -- UnicodeData.txt only lists the first and last codepoint of
+        -- these ranges, so their interior has to be handled here.
+        if     (cp >= 0x3400 and cp <= 0x4DBF) -- CJK Extension A block
+            or (cp >= 0x4E00 and cp <= 0x9FFF) -- CJK Unified Ideographs block
+            or (cp >= 0xAC00 and cp <= 0xDCA3)
+            or (cp >= 0xD800 and cp <= 0xF8FF)
+            or (cp >= 0x17000 and cp <= 0x187F7)
+            or (cp >= 0x20000 and cp <= 0x2EBE0) -- Technically there are some
+                                                 -- small gaps in there, but
+                                                 -- why would anyone store
+                                                 -- other kinds of characters
+                                                 -- there? Also it would add
+                                                 -- three additional ranges...
+            or (cp >= 0xF0000 and cp <= 0x10FFFD) then
+          t[cp] = "L"
+          return "L"
+        else
+          t[cp] = "ON"
+          return "ON"
+        end
+      end
+    })
+end
+
+-- bidi_brackets maps a bracket codepoint to a table with the fields
+-- `other` (the codepoint of the paired bracket) and `type` ('o' for an
+-- opening, 'c' for a closing bracket), parsed from BidiBrackets.txt.
+-- Lines which do not start with a codepoint entry are skipped.
+local bidi_brackets do
+  local entry = lpeg.Cg(codepoint * '; ' * lpeg.Ct(lpeg.Cg(codepoint, 'other') * '; ' * lpeg.Cg(lpeg.S'oc', 'type')) * ' ')^-1 * (1-lpeg.P'\n')^0 * '\n'
+  local file = lpeg.Cf(
+      lpeg.Ct''
+    * entry^0
+  , rawset)
+
+  -- Report a missing or unreadable data file with a clear message.
+  local path = assert(kpse.find_file"BidiBrackets.txt",
+                      "luaotfload-bidi: BidiBrackets.txt not found")
+  local f = assert(io.open(path))
+  local data = f:read'*a'
+  f:close()
+  bidi_brackets = file:match(data)
+end
+
+-- At the time of writing (Unicode 12.1.0), this is a complete list of
+-- characters whose canonical decomposition contains a bidi_brackets
+-- entry. Given that this isn't stored directly AFAICT, it is easier to
+-- list them here than parse some file.
+-- Maps a bracket codepoint to the codepoint used for comparing bracket
+-- pairs. Every entry of bidi_brackets starts out as its own canonical
+-- form; the explicit exceptions are applied afterwards so that the
+-- identity loop cannot overwrite them (both 0x2329 and 0x232A are
+-- themselves keys of bidi_brackets).
+local bidi_brackets_canonical = {}
+for cp in pairs(bidi_brackets) do
+  bidi_brackets_canonical[cp] = cp
+end
+bidi_brackets_canonical[0x2329] = 0x3008
+bidi_brackets_canonical[0x232A] = 0x3009
+
+-- opentype_mirroring maps a codepoint to the codepoint of its mirrored
+-- counterpart, parsed from the "XXXX; YYYY " entries of
+-- OpentypeMirroring.txt. Lines without such an entry are skipped.
+local opentype_mirroring do
+  local entry = lpeg.Cg(codepoint * '; ' * codepoint * ' ')^-1 * (1-lpeg.P'\n')^0 * '\n'
+  local file = lpeg.Cf(
+      lpeg.Ct''
+    * entry^0
+  , rawset)
+
+  -- Report a missing or unreadable data file with a clear message.
+  local path = assert(kpse.find_file"OpentypeMirroring.txt",
+                      "luaotfload-bidi: OpentypeMirroring.txt not found")
+  local f = assert(io.open(path))
+  local data = f:read'*a'
+  f:close()
+  opentype_mirroring = file:match(data)
+end
+
+-- Lazily filled cache: font id -> value of the font's `bidi` field, or
+-- false if the font is unknown or has no truthy `bidi` field. The
+-- result of the first lookup is stored so font.getfont is only called
+-- once per font id.
+local bidi_fonts = setmetatable({}, {
+  __index = function(cache, font_id)
+    local fontdata = font.getfont(font_id)
+    local enabled = false
+    if fontdata and fontdata.bidi then
+      enabled = fontdata.bidi
+    end
+    cache[font_id] = enabled
+    return enabled
+  end,
+})
+
+-- Font manipulator registered for the "bidi" feature: marks the font
+-- data by setting its `bidi` field, which bidi_fonts reads later.
+local makebidifont = function(tfmdata)
+  tfmdata["bidi"] = true
+end
+
+local glyph_id = node.id'glyph'
+local dir_id = node.id'dir'
+local glue_id = node.id'glue'
+local kern_id = node.id'kern'
+
+-- Classes which count as strong when resolving neutrals and brackets,
+-- mapped to the direction they count as: numbers (AN/EN) behave like
+-- R there.
+local Strong = {
+  L = 'L',
+  R = 'R',
+  AN = 'R',
+  EN = 'R',
+}
+
+-- Set of "neutral or isolate formatting" classes (NI in UAX #9). RLI
+-- belongs to this set as well and was missing next to LRI/FSI/PDI.
+local NI = {
+  B = true,
+  S = true,
+  WS = true,
+  ON = true,
+  FSI = true,
+  LRI = true,
+  RLI = true,
+  PDI = true,
+}
+
+-- Give the nonspacing marks directly following a resolved bracket the
+-- direction of that bracket.
+--
+-- pre:       the bracket node whose class has been resolved
+-- dir:       the class ('L' or 'R') the bracket was resolved to
+-- node_type: table node -> {class, level, original class}
+--
+-- Walks forward from `pre` over nodes whose original class (slot 3) is
+-- "NSM" and over nodes without an original class, and stops at the
+-- first other node or at the end of the list. Only the marks get their
+-- current class (slot 1) replaced; slot 2 holds the numeric level and
+-- must not receive the direction string. The loop is bounded by the
+-- list itself instead of relying on an undeclared global `stop`.
+local function adjust_nsm(pre, dir, node_type)
+  local follow = getnext(pre)
+  local follow_type = follow and node_type[follow]
+  while follow_type and (not follow_type[3] or follow_type[3] == "NSM") do
+    if follow_type[3] then
+      follow_type[1] = dir
+    end
+    follow = getnext(follow)
+    follow_type = follow and node_type[follow]
+  end
+end
+-- Class of something that resolves to a European number, depending on
+-- the last strong class seen. Returns two values: the class to store
+-- in node_type and the class to remember as "previous class".
+-- After AL the number is Arabic (W2). After L it is stored as L (W7)
+-- but still tracked as EN, so that adjacent ES/CS/ET see a number.
+local function resolved_number_class(prevstrong)
+  if prevstrong == "AL" then
+    return "AN", "AN"
+  elseif prevstrong == "L" then
+    return "L", "EN"
+  end
+  return "EN", "EN"
+end
+
+-- Class of the first node after `cur` that lies inside the run (before
+-- `stop`), skipping nodes without a class and nodes of class `skip`.
+-- Returns nil if the run or the list ends first.
+local function next_class_in_run(cur, stop, node_type, skip)
+  local follow = getnext(cur)
+  while follow and follow ~= stop do
+    local follow_type = node_type[follow]
+    local class = follow_type and follow_type[1]
+    if class and class ~= skip then
+      return class
+    end
+    follow = getnext(follow)
+  end
+  return nil
+end
+
+-- Resolve weak types, bracket pairs and neutrals for one run of nodes
+-- and store the resulting levels.
+--
+-- head:      first node of the run
+-- level:     embedding level of the run
+-- stop:      last node of the run (inclusive)
+-- sos, eos:  direction ('L'/'R') assumed before/after the run
+-- node_type: table node -> {class, level, original class}; slots 1 and
+--            2 are rewritten in place. Nodes without a class end up
+--            with both slots set to nil.
+--
+-- The function is local: it is only called from dobidi in this file.
+local function do_wni(head, level, stop, sos, eos, node_type)
+  local opposite, direction
+  if level % 2 == 0 then
+    direction, opposite = 'L', 'R'
+  else
+    direction, opposite = 'R', 'L'
+  end
+  -- From here on `stop` is the first node *after* the run.
+  stop = getnext(stop)
+  local prevclass, prevstrong = sos, sos
+  -- We combine W1--W7, that shouldn't make a difference and is
+  -- faster.
+  local cur = head
+  while cur ~= stop do
+    local curtype = node_type[cur]
+    local curclass = curtype[1]
+    if curclass == "NSM" then
+      curclass = prevclass == "PDI" and "ON" or prevclass
+      curtype[1] = curclass
+    elseif curclass == "EN" then
+      -- W2/W7. After L the stored class becomes L while curclass stays
+      -- EN, such that the W7 change does not affect the ES/ET changes
+      -- in W4-W5.
+      curtype[1], curclass = resolved_number_class(prevstrong)
+    elseif curclass == "ES" then
+      -- W4: a single separator between two European numbers. This
+      -- also has to apply when the previous strong type is R.
+      if prevclass == "EN"
+          and next_class_in_run(cur, stop, node_type) == "EN" then
+        curtype[1], curclass = resolved_number_class(prevstrong)
+      end
+    elseif curclass == "CS" then
+      local followclass
+      if prevclass == "EN" or prevclass == "AN" then
+        followclass = next_class_in_run(cur, stop, node_type)
+        if followclass == "EN" and prevstrong == "AL" then
+          -- The following EN has not been visited yet, but W2 is going
+          -- to turn it into AN.
+          followclass = "AN"
+        end
+      end
+      if followclass and followclass == prevclass then
+        if followclass == "EN" then
+          curtype[1], curclass = resolved_number_class(prevstrong)
+        else
+          curclass = "AN"
+          curtype[1] = curclass
+        end
+      else
+        curclass = "ON"
+        curtype[1] = curclass
+      end
+    elseif curclass == "ET" then
+      -- W5: terminators adjacent to a European number, on either side.
+      -- A preceding terminator of the same sequence has already been
+      -- turned into EN, so checking prevclass covers whole sequences.
+      -- After AL the following EN becomes AN, so there is no European
+      -- number ahead in that case.
+      if prevclass == "EN"
+          or (prevstrong ~= "AL"
+              and next_class_in_run(cur, stop, node_type, "ET") == "EN") then
+        curtype[1], curclass = resolved_number_class(prevstrong)
+      else
+        curclass = "ON"
+        curtype[1] = curclass
+      end
+    elseif curclass == "AL" then
+      prevstrong = "AL"
+      curclass = "R"
+      curtype[1] = curclass
+    elseif curclass == "L" or curclass == "R" then
+      prevstrong = curclass
+    elseif not curclass then
+      curclass = prevclass -- Do not change prevclass for the next run
+    end
+    prevclass = curclass
+    cur = getnext(cur)
+  end
+  -- Bracket pairs. `stack` holds the open brackets as
+  -- {node, codepoint of the closing bracket, context was opposite}.
+  -- last_e/last_s are the stack heights at which a strong character in
+  -- the embedding direction / any strong character was seen last.
+  cur = head
+  local last_e, last_s = 0, 0
+  prevstrong = sos
+  local stack = {}
+  while cur ~= stop do
+    if getid(cur) == glyph_id and bidi_fonts[getfont(cur)] then
+      local cp = getchar(cur) -- FIXME: canonical equivalents
+      local bracket = bidi_brackets[cp]
+      if bracket then
+        if bracket.type == 'o' then
+          local info = {cur, bracket.other, prevstrong == opposite}
+          stack[#stack + 1] = info
+        else -- if cp.type == 'c'
+          for i = #stack,1,-1 do
+            local entry = stack[i]
+            if entry[2] == cp then
+              for j = i,#stack do
+                stack[j] = nil
+              end
+              if last_e >= i then
+                local btype, etype = node_type[entry[1]], node_type[cur]
+                btype[1], etype[1] = direction, direction
+                adjust_nsm(entry[1], direction, node_type)
+                adjust_nsm(cur, direction, node_type)
+                last_s, last_e = i-1, i-1
+              elseif last_s >= i then
+                if entry[3] then
+                  local btype, etype = node_type[entry[1]], node_type[cur]
+                  adjust_nsm(entry[1], opposite, node_type)
+                  adjust_nsm(cur, opposite, node_type)
+                  btype[1], etype[1] = opposite, opposite
+                end
+                last_s = i-1
+              end
+              break
+            end
+          end
+        end
+      else
+        local curclass = node_type[cur][1]
+        if Strong[curclass] == direction then
+          last_e, last_s, prevstrong = #stack, #stack, direction
+        elseif Strong[curclass] == opposite then
+          last_s, prevstrong = #stack, opposite
+        end
+      end
+    end
+    cur = getnext(cur)
+  end
+  -- Neutrals and levels.
+  cur = head
+  prevstrong = sos
+  local newlevels = direction == 'L' and {
+    L = level,
+    R = level+1,
+    AN = level+2,
+    EN = level+2,
+  } or {
+    L = level+1,
+    R = level,
+    AN = level+1,
+    EN = level+1,
+  }
+  while cur ~= stop do
+    local curtype = node_type[cur]
+    local curclass = curtype[1]
+    local strong = Strong[curclass]
+    if strong then
+      prevstrong = strong
+      curtype[2] = newlevels[curclass]
+      cur = getnext(cur)
+    else
+      -- Find the direction of the next strong node. It is always taken
+      -- through Strong, so that a directly following number counts as
+      -- R just like one found later in the scan.
+      local follow = getnext(cur)
+      local followstrong = follow and Strong[node_type[follow][1]]
+      while follow ~= stop and not followstrong do
+        follow = getnext(follow)
+        followstrong = follow and Strong[node_type[follow][1]]
+      end
+      if follow == stop then
+        followstrong = eos
+      end
+      local outerdir = followstrong == prevstrong and followstrong or direction
+      -- Assign the whole sequence of non-strong nodes. Nodes without a
+      -- class get nil for both class and level.
+      follow = cur
+      local followclass = curclass
+      while follow ~= stop and not Strong[followclass] do
+        local follow_type = node_type[follow]
+        follow_type[1], follow_type[2] = followclass and outerdir, followclass and newlevels[outerdir]
+        follow = getnext(follow)
+        followclass = follow and node_type[follow][1]
+      end
+      cur = follow
+    end
+  end
+end
+-- Apply the bidi algorithm to a node list and insert dir nodes for the
+-- resolved levels.
+--
+-- head:          first node of the list (converted with
+--                node.direct.todirect)
+-- a, b, c:       unused
+-- par_direction: "TRT" selects a right-to-left paragraph, every other
+--                value a left-to-right one
+--
+-- Returns the (possibly new) head, converted back with
+-- node.direct.tonode.
+--
+-- This function is intentionally global.
+-- NOTE(review): the only caller visible here is the
+-- pre_linebreak_filter callback in luaotfload-init.lua.
+function dobidi(head, a, b, c, par_direction)
+  head = node.direct.todirect(head)
+  local node_type = {} -- We do not need to preserve the direction types, so this is faster than using properties
+  -- Filled for dir nodes: during the first pass an opening dir node
+  -- maps to false and its closing dir node maps to the opening one.
+  local dir_matches = {}
+  par_direction = par_direction == "TRT" and "R" or "L" -- We hope to only encounter TRT/TLT
+  local level, overwrite, isolate = par_direction == "R" and 1 or 0
+  local stack = {}
+  -- Enter a nested level: dir == 1 selects the next odd (right-to-left)
+  -- level, dir == 0 the next even (left-to-right) one.
+  local function push(dir, new_overwrite, new_isolate)
+    stack[#stack+1] = {level, overwrite, isolate}
+    level, overwrite, isolate = level + (level + dir + 1)%2 + 1, new_overwrite, new_isolate
+  end
+  local function pop()
+    local last = stack[#stack]
+    stack[#stack] = nil
+    level, overwrite, isolate = last[1], last[2], last[3]
+  end
+  -- Pass 1: record {class, explicit level, original class} per node.
+  for cur, tcur, scur in traverse(head) do
+    local class, curlevel
+    if tcur == glyph_id and bidi_fonts[getfont(cur)] then
+      class = bidi_classes[getchar(cur)]
+      if class == "RLE" then
+        -- Right-to-left embedding: odd level, same parity argument as
+        -- RLO below and as TRT dir nodes.
+        class, curlevel = nil, level
+        push(1)
+      elseif class == "LRE" then
+        class, curlevel = nil, level
+        push(0)
+      elseif class == "RLO" then
+        class, curlevel = nil, level
+        push(1, "R")
+      elseif class == "LRO" then
+        class, curlevel = nil, level
+        push(0, "L")
+      elseif class == "PDF" then
+        class = nil
+        if not isolate and #stack >= 1 then
+          pop()
+        end
+      -- elseif class == "RLI" then -- Not supported yet, use textdir
+      --   -- TODO
+      -- elseif class == "LRI" then -- Not supported yet, use textdir
+      --   -- TODO
+      -- elseif class == "FSI" then -- Not supported yet, use textdir
+      --   -- TODO
+      -- elseif class == "PDI" then -- Not supported yet, use textdir
+      --   -- TODO
+      elseif class == "BN" then
+        class = overwrite or nil
+      elseif class == "B" then
+        assert(false) -- FIXME: Can this happen in TeX?
+      else
+        class = overwrite or class
+      end
+    elseif tcur == dir_id then
+      -- dir nodes are treated like isolates.
+      local dir, reset = getdirection(cur)
+      if reset then
+        while not isolate and #stack >= 1 do
+          pop()
+        end
+        if isolate then
+          dir_matches[isolate] = false
+          dir_matches[cur] = isolate
+          pop()
+        else
+          -- Unmatched reset. LuaTeX inserts them sometimes, just
+          -- dropping them normally works fine. But deleting is
+          -- difficult here because the loop needs the next pointer.
+          -- FIXME: We will leak them for now
+          head = remove(head, cur)
+        end
+        class = overwrite or "PDI"
+      else
+        curlevel = level
+        dir = dir == 1 and 1 or 0
+        -- dir == 1 is pushed as an odd level, i.e. right-to-left.
+        class = overwrite or (dir == 1 and "RLI" or "LRI")
+        push(dir, nil, cur)
+      end
+    elseif tcur == glue_id or tcur == kern_id then -- Not sure about kerns
+      class = "WS"
+    else
+      class = "ON"
+    end
+    node_type[cur] = {class, curlevel or level, class}
+  end
+  for i = 1,#stack do pop() end
+  -- Pass 2: split the list into runs {first, level, last, sos, eos}.
+  -- The content between a matched pair of dir nodes is temporarily
+  -- unlinked from the list and remembered in dir_matches, so that the
+  -- surrounding run continues directly after the closing dir node.
+  local parlevel = level
+  local isolating_level_runs = {}
+  level = -parlevel - 2
+  local current_run
+  for cur, tcur, scur in traverse(head) do
+    -- NOTE(review): this mirroring check uses the level of the
+    -- previous node from pass 1, not the level resolved by do_wni --
+    -- confirm that this is intended.
+    if level % 2 == 1 and tcur == glyph_id and scur == 0 and bidi_fonts[getfont(cur)] then
+      local char = opentype_mirroring[getchar(cur)]
+      if char then
+        setchar(cur, char)
+      end
+    end
+    local curtype = node_type[cur]
+    local curclass, curlevel = curtype[1], curtype[2]
+    if curlevel ~= level and curclass then
+      -- Direction at the boundary: parity of the higher of both levels.
+      local boundary_dir = (level > curlevel and level or curlevel) % 2 == 1 and 'R' or 'L'
+      level = curlevel
+      if current_run then
+        current_run[3] = getprev(cur)
+        current_run[5] = boundary_dir
+        current_run = nil
+      end
+      if dir_matches[cur] then
+        local beg = dir_matches[cur]
+        current_run = dir_matches[beg]
+        local remember = {getnext(beg), getprev(cur)}
+        dir_matches[cur] = nil
+        if getnext(beg) == cur then -- Handle stupid input
+          dir_matches[beg] = nil
+        else
+          dir_matches[beg] = remember
+          setprev(getnext(beg), nil)
+          setnext(getprev(cur), nil)
+          setnext(beg, cur)
+          setprev(cur, beg)
+        end
+      else
+        current_run = {cur, level, nil, boundary_dir}
+        isolating_level_runs[#isolating_level_runs+1] = current_run
+      end
+    end
+    if dir_matches[cur] == false then
+      dir_matches[cur] = current_run
+      current_run = nil
+    end
+  end
+  if current_run then
+    current_run[3] = tail(head)
+    current_run[5] = (level > parlevel and level or parlevel) % 2 == 1 and 'R' or 'L'
+    -- Should always be level IINM, but let's use the official check
+    current_run = nil
+  end
+  -- Pass 3: resolve weak types, brackets and neutrals per run.
+  for i = 1, #isolating_level_runs do
+    local run = isolating_level_runs[i]
+    do_wni(run[1], run[2], run[3], run[4], run[5], node_type)
+  end
+  -- Pass 4: link the content unlinked in pass 2 back into the list.
+  for cur in traverse(head) do
+    local remembered = dir_matches[cur]
+    if remembered then
+      local newnext, newprev = remembered[1], remembered[2]
+      setnext(newprev, getnext(cur))
+      setprev(getnext(cur), newprev)
+      setprev(newnext, cur)
+      setnext(cur, newnext)
+    end
+  end
+  -- Pass 5: insert dir nodes wherever the resolved level changes.
+  -- TODO: Actually insert directional markers
+  level = parlevel
+  -- push and pop are the locals declared above; they are re-bound to
+  -- helpers which work on dir nodes. `stack` now holds
+  -- {previous level, opening dir node} entries.
+  -- Insert an opening dir node for newlevel before n; returns the head.
+  function push(n, newlevel)
+    local dirnode = nodenew(dir_id)
+    setdirection(dirnode, newlevel % 2, 0)
+    stack[#stack + 1] = {level, dirnode}
+    level = newlevel
+    return insert_before(head, n, dirnode)
+  end
+  -- Insert the closing counterpart of the innermost open dir node
+  -- before n; returns the head and that opening dir node.
+  function pop(head, n)
+    local entry = stack[#stack]
+    stack[#stack] = nil
+    level = entry[1]
+    local dirnode = nodecopy(entry[2])
+    setsubtype(dirnode, 1)
+    return insert_before(head, n, dirnode), entry[2]
+  end
+  for cur, tcur, scur in traverse(head) do
+    local curtype = node_type[cur]
+    local curlevel = curtype[2]
+    if tcur == dir_id and scur == 1 then
+      local newlevel = curlevel + (curlevel + getdirection(cur) + 1)%2 + 1
+      while level > newlevel do
+        head = pop(head, cur)
+      end
+      stack[#stack] = nil
+      level = curlevel
+    end
+    if curlevel and level ~= curlevel then
+      local push_pos = cur
+      while level > curlevel do
+        head, push_pos = pop(head, cur)
+      end
+      if level < curlevel then
+        -- Keep the returned head: the dir node may have been inserted
+        -- before the first node of the list.
+        head = push(push_pos, curlevel)
+      end
+    end
+    if tcur == dir_id and scur == 0 then
+      local newlevel = curlevel + (curlevel + getdirection(cur) + 1)%2 + 1
+      stack[#stack + 1] = {level, cur}
+      level = newlevel
+    end
+  end
+  return node.direct.tonode(head)
+end
+
+-- Register the font feature "bidi". Its only effect on a font is the
+-- node-mode manipulator makebidifont.
+local bidi_feature = {
+  name        = "bidi",
+  description = "Apply Unicode bidi algorithm",
+  manipulators = {
+    node = makebidifont,
+  },
+  -- No processors are registered.
+  --                -- We have to see how processors interact with
+  --                -- multiscript fonts
+  -- processors = {
+  --   node = donotdef,
+  -- }
+}
+otffeatures.register(bidi_feature)
+
+--- vim:sw=2:ts=2:expandtab:tw=71
diff --git a/src/luaotfload-init.lua b/src/luaotfload-init.lua
index b388b74..66f83e0 100644
--- a/src/luaotfload-init.lua
+++ b/src/luaotfload-init.lua
@@ -648,7 +648,11 @@ local init_post_install_callbacks = function ()
   -- MK Pass current text direction to simple_font_handler
   local handler = nodes.simple_font_handler
   local callback = function(head, groupcode, _, _, direction)
-    return handler(head, groupcode, nil, nil, direction or tex.get'textdir')
+    if not direction then
+      direction = tex.get'textdir'
+    end
+    head = dobidi(head, nil, nil, nil, direction)
+    return handler(head, groupcode, nil, nil, direction)
   end
   luatexbase.add_to_callback("pre_linebreak_filter",
                              callback,
diff --git a/src/luaotfload-main.lua b/src/luaotfload-main.lua
index 107a3d9..fd759ba 100644
--- a/src/luaotfload-main.lua
+++ b/src/luaotfload-main.lua
@@ -301,6 +301,7 @@ luaotfload.main = function ()
     loadmodule "embolden"     --- fake bold
     loadmodule "notdef"       --- missing glyph handling
     initialize "auxiliary"    --- additional high-level functionality
+    loadmodule "bidi"         --- ...
 
     luaotfload.aux.start_rewrite_fontname () --- to be migrated to fontspec
 
diff --git a/texmf/tex/luatex/luaotfload/luaotfload-init.lua b/texmf/tex/luatex/luaotfload/luaotfload-init.lua
index b388b74..66f83e0 100644
--- a/texmf/tex/luatex/luaotfload/luaotfload-init.lua
+++ b/texmf/tex/luatex/luaotfload/luaotfload-init.lua
@@ -648,7 +648,11 @@ local init_post_install_callbacks = function ()
   -- MK Pass current text direction to simple_font_handler
   local handler = nodes.simple_font_handler
   local callback = function(head, groupcode, _, _, direction)
-    return handler(head, groupcode, nil, nil, direction or tex.get'textdir')
+    if not direction then
+      direction = tex.get'textdir'
+    end
+    head = dobidi(head, nil, nil, nil, direction)
+    return handler(head, groupcode, nil, nil, direction)
   end
   luatexbase.add_to_callback("pre_linebreak_filter",
                              callback,
diff --git a/texmf/tex/luatex/luaotfload/luaotfload-main.lua b/texmf/tex/luatex/luaotfload/luaotfload-main.lua
index 107a3d9..fd759ba 100644
--- a/texmf/tex/luatex/luaotfload/luaotfload-main.lua
+++ b/texmf/tex/luatex/luaotfload/luaotfload-main.lua
@@ -301,6 +301,7 @@ luaotfload.main = function ()
     loadmodule "embolden"     --- fake bold
     loadmodule "notdef"       --- missing glyph handling
     initialize "auxiliary"    --- additional high-level functionality
+    loadmodule "bidi"         --- ...
 
     luaotfload.aux.start_rewrite_fontname () --- to be migrated to fontspec
 





More information about the latex3-commits mailing list