[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: upper/lower -- Default Unicode algorithms finished (4ff2f4f)

Marcel Fabian Krüger tex at 2krueger.de
Sat Oct 24 09:48:01 CEST 2020


Repository : https://github.com/latex3/luaotfload
On branch  : dev
Link       : https://github.com/latex3/luaotfload/commit/4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb

>---------------------------------------------------------------

commit 4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date:   Sat Oct 24 09:48:01 2020 +0200

    upper/lower -- Default Unicode algorithms finished


>---------------------------------------------------------------

4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb
 src/luaotfload-case.lua    | 186 +++++++++++++++++++++++++++++++++++++++++----
 src/luaotfload-unicode.lua | 104 +++++++++++++++++++------
 2 files changed, 253 insertions(+), 37 deletions(-)

diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index a005187..33bb1bc 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -1,7 +1,13 @@
-local mapping_tables = require'luaotfload-unicode'.casemapping
+local unicode_data = require'luaotfload-unicode'
+
+local mapping_tables = unicode_data.casemapping
+local soft_dotted = unicode_data.soft_dotted
+local ccc = unicode_data.ccc
 
 local uppercase = mapping_tables.uppercase
 local lowercase = mapping_tables.lowercase
+local cased = mapping_tables.cased
+local case_ignorable = mapping_tables.case_ignorable
 
 local otfregister  = fonts.constructors.features.otf.register
 
@@ -13,17 +19,149 @@ local getnext = direct.getnext
 local setchar = direct.setchar
 local setdisc = direct.setdisc
 local getdisc = direct.getdisc
+local getfield = direct.getfield
 local remove = direct.remove
 local copy = direct.copy
 local insert_after = direct.insert_after
+local traverse = direct.traverse
 
 local disc = node.id'disc'
-local glyph = node.id'disc'
+
+--[[ We make some implicit assumptions about contexts in SpecialCasing.txt here which happened to be true when I wrote the code:
+--
+-- * Before_Dot only appears as Not_Before_Dot
+-- * No other context appears with Not_
+-- * Final_Sigma is never language dependent
+-- * Other contexts are always language dependent
+-- * The only languages with special mappings are Lithuanian (lt/"LTH "/lit), Turkish (tr/"TRK "/tur), and Azeri/Azerbaijani (az/"AZE "/aze)
+]]
+
+local font_lang = setmetatable({}, {__index = function(t, fid)
+  local f = font.getfont(fid)
+  local lang = f.specification.features.normal.language
+  lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or false
+  t[fid] = lang
+  return lang
+end})
+
+local function is_Final_Sigma(font, mapping, n, after)
+  mapping = mapping.Final_Sigma
+  if not mapping then return false end
+  mapping = mapping._
+  if not mapping then return false end
+  n = getnext(n)
+  repeat
+    while n do
+      local char, id = is_char(n, font)
+      if id == disc then
+        after = getnext(n)
+        n = getfield(n, 'replace')
+        char, id = is_char(n, font)
+      elseif char then
+        if not case_ignorable[char] then
+          return not cased[char] and mapping
+        end
+        n = getnext(n)
+      else
+        return mapping
+      end
+    end
+    n, after = after
+  until not n
+  return mapping
+end
+
+local function is_More_Above(font, mapping, n, after)
+  mapping = mapping.More_Above
+  if not mapping then return false end
+  mapping = mapping._
+  if not mapping then return false end
+  n = getnext(n)
+  repeat
+    while n do
+      local char, id = is_char(n, font)
+      if id == disc then
+        after = getnext(n)
+        n = getfield(n, 'replace')
+        char, id = is_char(n, font)
+      elseif char then
+        local char_ccc = ccc[char]
+        if not char_ccc then
+          return false
+        elseif char_ccc == 230 then
+          return mapping
+        end
+        n = getnext(n)
+      else
+        return false
+      end
+    end
+    n, after = after
+  until not n
+  return false
+end
+
+local function is_Not_Before_Dot(font, mapping, n, after)
+  mapping = mapping.Not_Before_Dot
+  if not mapping then return false end
+  mapping = mapping._
+  if not mapping then return false end
+  n = getnext(n)
+  repeat
+    while n do
+      local char, id = is_char(n, font)
+      if id == disc then
+        after = getnext(n)
+        n = getfield(n, 'replace')
+        char, id = is_char(n, font)
+      elseif char then
+        local char_ccc = ccc[char]
+        if not char_ccc then
+          return mapping
+        elseif char_ccc == 230 then
+          return char ~= 0x0307 and mapping
+        end
+        n = getnext(n)
+      else
+        return mapping
+      end
+    end
+    n, after = after
+  until not n
+  return mapping
+end
+
+local function is_Language_Mapping(font, mapping, n, after, seen_soft_dotted, seen_I)
+  if not mapping then return false end
+  if seen_soft_dotted then
+    local mapping = mapping.After_Soft_Dotted
+    mapping = mapping and mapping._
+    if mapping then
+      return mapping
+    end
+  end
+  if seen_I then
+    local mapping = mapping.After_I
+    mapping = mapping and mapping._
+    if mapping then
+      return mapping
+    end
+  end
+  return is_More_Above(font, mapping, n, after) or is_Not_Before_Dot(font, mapping, n, after) or mapping._ -- Might be nil
+end
+
 local function process(table)
-  local function processor(head, font)
+  local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I)
+    local lang = font_lang[font]
     local n = head
     while n do
-      n = has_glyph(n)
+      do
+        local new = has_glyph(n)
+        if n ~= new then
+          seen_cased, seen_soft_dotted, seen_I = nil
+        end
+        n = new
+      end
       if not n then break end
       local char, id = is_char(n, font)
       if char then
@@ -31,27 +169,47 @@ local function process(table)
         if mapping then
           if tonumber(mapping) then
             setchar(n, mapping)
-          elseif #mapping == 0 then
-            head, n = remove(head, n)
-            goto continue
           else
-            setchar(n, mapping[1])
-            for i=2, #mapping do
-              head, n = insert_after(head, n, copy(n))
-              setchar(n, mapping[i])
+            mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
+                   or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
+                   or mapping._
+            if #mapping == 0 then
+              head, n = remove(head, n)
+              goto continue
+            else
+              setchar(n, mapping[1])
+              for i=2, #mapping do
+                head, n = insert_after(head, n, copy(n))
+                setchar(n, mapping[i])
+              end
             end
           end
         end
+        if not case_ignorable[char] then
+          seen_cased = cased[char] or nil
+        end
+        local char_ccc = ccc[char]
+        if not char_ccc or char_ccc == 230 then
+          seen_I = char == 0x49 or nil
+          seen_soft_dotted = soft_dotted[char]
+        end
       elseif id == disc and uses_font(n, font) then
         local pre, post, rep = getdisc(n)
-        setdisc(n, processor(pre, font), processor(post, font), processor(rep, font))
+        local after = getnext(n)
+        pre, post, rep, seen_cased, seen_soft_dotted, seen_I =
+            processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I),
+            processor(post, font, after),
+            processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I)
+        setdisc(n, pre, post, rep)
+      else
+        seen_cased, seen_soft_dotted, seen_I = nil
       end
       n = getnext(n)
       ::continue::
     end
-    return head
+    return head, seen_cased, seen_soft_dotted, seen_I
   end
-  return processor
+  return function(head, font) return (processor(head, font)) end
 end
 
 local upper_process = process(uppercase)
diff --git a/src/luaotfload-unicode.lua b/src/luaotfload-unicode.lua
index 0cd196a..06ebd49 100644
--- a/src/luaotfload-unicode.lua
+++ b/src/luaotfload-unicode.lua
@@ -99,14 +99,17 @@ local alphnum_only do
   end
 end
 
-local uppercase, lowercase, titlecase = {}, {}, nil do
+local uppercase, lowercase, ccc, cased, case_ignorable, titlecase = {}, {}, {}, {}, {}, nil do
   titlecase = nil -- Not implemented yet(?)
   local ignored_field = (1-lpeg.P';')^0 * ';'
+  local cased_category = lpeg.P'Ll;' + 'Lu;' + 'Lt;'
+  local case_ignore_category = lpeg.P'Mn;' + 'Me;' + 'Cf;' + 'Lm;' + 'Sk;'
+
   local simple_entry =
       codepoint/0 * ';'
     * ignored_field -- Name
-    * ignored_field -- General_Category
-    * ignored_field -- ccc
+    * (ignored_field - cased_category - case_ignore_category) -- General_Category
+    * '0;' -- ccc
     * ignored_field -- Bidi
     * ignored_field -- Decomp
     * ignored_field -- Numeric
@@ -119,8 +122,8 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
   local entry = simple_entry
     + codepoint * ';'
     * ignored_field -- Name
-    * ignored_field -- General_Category
-    * ignored_field -- ccc
+    * (cased_category * lpeg.Cc(cased) + case_ignore_category * lpeg.Cc(case_ignorable) + ignored_field * lpeg.Cc(nil)) -- General_Category
+    * ('0;' * lpeg.Cc(nil) + lpeg.R'09'^1/tonumber * ';') -- ccc
     * ignored_field -- Bidi
     * ignored_field -- Decomp
     * ignored_field -- Numeric
@@ -132,7 +135,9 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
     * (codepoint + lpeg.Cc(nil)) * ';' -- uppercase
     * (codepoint + lpeg.Cc(nil)) * ';' -- lowercase
     * (codepoint + lpeg.Cc(nil)) * '\n' -- titlecase
-    / function(codepoint, upper, lower, title)
+    / function(codepoint, cased_flag, ccc_val, upper, lower, title)
+      if cased_flag then cased_flag[codepoint] = true end
+      ccc[codepoint] = ccc_val
       uppercase[codepoint] = upper
       lowercase[codepoint] = lower
       -- if title then titlecase[codepoint] = title end -- Not implemented yet(?)
@@ -144,32 +149,81 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
   f:close()
 end
 
+local props do
+  local ws = lpeg.P' '^0
+  local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n'
+  local entry = codepoint * (".." * codepoint + lpeg.Cc(false)) * ws * ";" * ws * lpeg.C(lpeg.R("AZ", "az", "__")^1) * nl
+  local file = lpeg.Cf(
+      lpeg.Ct(
+          lpeg.Cg(lpeg.Ct"", "Soft_Dotted")
+        * lpeg.Cg(lpeg.Cc(cased), "Other_Lowercase")
+        * lpeg.Cg(lpeg.Cc(cased), "Other_Uppercase"))
+    * (lpeg.Cg(entry) + nl)^0
+  , function(t, cp_start, cp_end, prop)
+    local prop_table = t[prop]
+    if prop_table then
+      for cp = cp_start, cp_end or cp_start do
+        prop_table[cp] = true
+      end
+    end
+    return t
+  end) * -1
+
+  local f = io.open(kpse.find_file"PropList.txt")
+  props = file:match(f:read'*a')
+  f:close()
+end
+
 do
   local ws = lpeg.P' '^0
   local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n'
+  local file = (codepoint * (".." * codepoint + lpeg.Cc(false)) * ws * ";" * ws * (lpeg.P'Single_Quote' + 'MidLetter' + 'MidNumLet') * nl / function(cp_start, cp_end)
+    for cp = cp_start, cp_end or cp_start do
+      case_ignorable[cp] = true
+    end
+  end + (1-lpeg.P'\n')^0 * '\n')^0 * -1
+
+  local f = io.open(kpse.find_file"WordBreakProperty.txt")
+  assert(file:match(f:read'*a'))
+  f:close()
+end
+
+do
+  local ws = lpeg.P' '^0
+  local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n'
+  local empty = {}
+  local function set(t, cp, condition, value)
+    local old = t[cp] or cp
+    if not condition then
+      if #value == 1 and tonumber(old) then
+        t[cp] = value[1]
+        return
+      end
+      condition = empty
+    end
+    if tonumber(old or cp) then
+      old = {_ = {old}}
+      t[cp] = old
+    end
+    for i=1, #condition do
+      local cond = condition[i]
+      local step = old[cond]
+      if not step then
+        step = {}
+        old[cond] = step
+      end
+      old = step
+    end
+    old._ = value
+  end
   local entry = codepoint * ";"
               * lpeg.Ct((ws * codepoint)^1 + ws) * ";"
               * lpeg.Ct((ws * codepoint)^1 + ws) * ";"
               * lpeg.Ct((ws * codepoint)^1 + ws) * ";"
               * (lpeg.Ct((ws * lpeg.C(lpeg.R('AZ', 'az', '__')^1))^1) * ";")^-1
               * ws * nl / function(cp, lower, title, upper, condition)
-                if condition then return end
-                if #lower == 1 then
-                  lower = lower[1]
-                  if lower ~= lowercase[cp] then
-                    lowercase[cp] = lower
-                  end
-                else
-                  lowercase[cp] = lower
-                end
-                if #upper == 1 then
-                  upper = upper[1]
-                  if upper ~= uppercase[cp] then
-                    uppercase[cp] = upper
-                  end
-                else
-                  uppercase[cp] = upper
-                end
+                set(lowercase, cp, condition, lower)
+                set(uppercase, cp, condition, upper)
               end
   local file = (entry + nl)^0 * -1
 
@@ -184,6 +238,10 @@ return {
   casemapping = {
     uppercase = uppercase,
     lowercase = lowercase,
+    cased = cased,
+    case_ignorable = case_ignorable,
     -- titlecase = titlecase,
   },
+  ccc = ccc,
+  soft_dotted = props.Soft_Dotted,
 }





More information about the latex3-commits mailing list.