[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: Support uppercasing for greek (05ae3b7)

Marcel Fabian Krüger tex at 2krueger.de
Wed Apr 27 20:22:20 CEST 2022


Repository : https://github.com/latex3/luaotfload
On branch  : dev
Link       : https://github.com/latex3/luaotfload/commit/05ae3b76a3b8e0016114d864c1e85f9a80686145

>---------------------------------------------------------------

commit 05ae3b76a3b8e0016114d864c1e85f9a80686145
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date:   Wed Apr 27 20:22:20 2022 +0200

    Support uppercasing for greek


>---------------------------------------------------------------

05ae3b76a3b8e0016114d864c1e85f9a80686145
 src/luaotfload-case.lua | 276 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 229 insertions(+), 47 deletions(-)

diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index 33bb1bc..4913010 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -21,6 +21,7 @@ local setdisc = direct.setdisc
 local getdisc = direct.getdisc
 local getfield = direct.getfield
 local remove = direct.remove
+local free = direct.free
 local copy = direct.copy
 local insert_after = direct.insert_after
 local traverse = direct.traverse
@@ -36,39 +37,118 @@ local disc = node.id'disc'
 -- * The only languages with special mappings are Lithuanian (lt/"LTH "/lit), Turkish (tr/"TRK "/tur), and Azeri/Azerbaijani (az/"AZE "/aze)
 ]]
 
-local font_lang = setmetatable({}, {__index = function(t, fid)
-  local f = font.getfont(fid)
-  local lang = f.specification.features.normal.language
-  lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or false
-  t[fid] = lang
-  return lang
-end})
+local UPPER_MASK = 0x3FF
+local HAS_VOWEL = 0x200000
+local HAS_YPOGEGRAMMENI = 0x400000
+local HAS_ACCENT = 0x800000
+local HAS_DIALYTIKA = 0x1000000
+local HAS_OTHER_GREEK_DIACRITIC = 0x2000000
 
-local function is_Final_Sigma(font, mapping, n, after)
-  mapping = mapping.Final_Sigma
-  if not mapping then return false end
-  mapping = mapping._
-  if not mapping then return false end
+local greek_data
+local greek_diacritic = {
+  [0x0300] = HAS_ACCENT,
+  [0x0301] = HAS_ACCENT,
+  [0x0342] = HAS_ACCENT,
+  [0x0302] = HAS_ACCENT,
+  [0x0303] = HAS_ACCENT,
+  [0x0311] = HAS_ACCENT,
+  [0x0308] = HAS_DIALYTIKA,
+  [0x0344] = HAS_DIALYTIKA | HAS_ACCENT,
+  [0x0345] = HAS_YPOGEGRAMMENI,
+  [0x0304] = HAS_OTHER_GREEK_DIACRITIC,
+  [0x0306] = HAS_OTHER_GREEK_DIACRITIC,
+  [0x0313] = HAS_OTHER_GREEK_DIACRITIC,
+  [0x0314] = HAS_OTHER_GREEK_DIACRITIC,
+  [0x0343] = HAS_OTHER_GREEK_DIACRITIC,
+}
+
+local function init_greek_data()
+  local NFD = require'lua-uni-normalize'.NFD
+  local data = {}
+  greek_data = data
+
+  local vowels = {
+    [utf8.codepoint'Α'] = true, [utf8.codepoint'Ε'] = true,
+    [utf8.codepoint'Η'] = true, [utf8.codepoint'Ι'] = true,
+    [utf8.codepoint'Ο'] = true, [utf8.codepoint'Ω'] = true,
+    [utf8.codepoint'Υ'] = true,
+  }
+  local function handle_char(c)
+    local decomp = NFD(utf8.char(c))
+    local first = utf8.codepoint(decomp)
+    local upper = uppercase[first]
+    if upper then
+      if not tonumber(upper) then
+        upper = upper._
+        assert(#upper == 1)
+        upper = upper[1]
+      end
+    else
+      upper = first
+    end
+    if upper > UPPER_MASK then return end -- Only happens for unassigned codepoints
+    local datum = upper
+    if vowels[upper] then
+      datum = datum | HAS_VOWEL
+    end
+    if utf8.len(decomp) > 1 then
+      for _, c in utf8.codes(decomp) do
+        local dia = greek_diacritic[c]
+        if dia and dia ~= HAS_OTHER_GREEK_DIACRITIC then datum = datum | dia end
+      end
+    end
+    data[c] = datum
+  end
+  for c = 0x0370, 0x03ff do handle_char(c) end
+  for c = 0x1f00, 0x1fff do handle_char(c) end
+  for c = 0x2126, 0x2126 do handle_char(c) end
+end
+
+local function font_lang(feature)
+  return setmetatable({}, {__index = function(t, fid)
+    local f = font.getfont(fid)
+    local features = f.specification.features.normal
+    print(require'inspect'(features))
+    local lang = features[feature]
+    if type(lang) ~= 'string' or lang == 'auto' then
+      lang = features.language
+      lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or (lang == 'ell' or lang == 'pgr') and 'el' or false
+    end
+    t[fid] = lang
+    return lang
+  end})
+end
+
+local function is_followed_by_cased(font, n, after)
   n = getnext(n)
   repeat
     while n do
       local char, id = is_char(n, font)
-      if id == disc then
+      if not char and id == disc then
         after = getnext(n)
         n = getfield(n, 'replace')
         char, id = is_char(n, font)
-      elseif char then
+      end
+      if char then
         if not case_ignorable[char] then
-          return not cased[char] and mapping
+          return cased[char]
         end
         n = getnext(n)
       else
-        return mapping
+        return false
       end
     end
     n, after = after
   until not n
-  return mapping
+  return false
+end
+
+local function is_Final_Sigma(font, mapping, n, after)
+  mapping = mapping.Final_Sigma
+  if not mapping then return false end
+  mapping = mapping._
+  if not mapping then return false end
+  return not is_followed_by_cased(font, n, after) and mapping
 end
 
 local function is_More_Above(font, mapping, n, after)
@@ -150,56 +230,158 @@ local function is_Language_Mapping(font, mapping, n, after, seen_soft_dotted, se
   return is_More_Above(font, mapping, n, after) or is_Not_Before_Dot(font, mapping, n, after) or mapping._ -- Might be nil
 end
 
-local function process(table)
-  local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I)
+local function process(table, feature)
+  local font_lang = font_lang(feature)
+  -- The other seen_... are booleans, while seen_greek has more states:
+  --   - nil: Not greek
+  --   - true: Greek. Last was not a vowel with accent and without dialytika
+  --   - node: Greek. Last vowel with accent and without dialytika
+  local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I, seen_greek)
     local lang = font_lang[font]
+    local greek
+    if lang == 'el' then
+      if table == uppercase then
+        if not greek_data then
+          init_greek_data()
+        end
+        greek = greek_data
+      end
+      lang = false
+    end
     local n = head
     while n do
       do
         local new = has_glyph(n)
         if n ~= new then
-          seen_cased, seen_soft_dotted, seen_I = nil
+          seen_cased, seen_soft_dotted, seen_I, seen_greek = nil
         end
         n = new
       end
       if not n then break end
       local char, id = is_char(n, font)
       if char then
-        local mapping = table[char]
-        if mapping then
-          if tonumber(mapping) then
-            setchar(n, mapping)
+        -- if (GREEK_LETTERS.contains(c)) {
+        if greek and (char >= 0x0370 and char <= 0x03ff or char >= 0x1f00 and char <= 0x1fff or char == 0x1234) then --FIXME
+          -- local c = char
+          local first_datum = greek[char] or 0
+          local datum = first_datum
+          local upper = datum & UPPER_MASK
+          -- Add a dialytika to this iota or ypsilon vowel
+          -- if we removed a tonos from the previous vowel,
+          -- and that previous vowel did not also have (or gain) a dialytika.
+          -- Adding one only to the final vowel in a longer sequence
+          -- (which does not occur in normal writing) would require lookahead.
+          -- Set the same flag as for preserving an existing dialytika.
+          if datum & HAS_VOWEL ~= 0 and seen_greek and seen_greek ~= true and (upper == 0x0399 or upper == 0x03a5) then
+            first_datum = first_datum | HAS_DIALYTIKA;
+          end
+          local has_ypogegrammeni = datum & HAS_YPOGEGRAMMENI ~= 0
+          local add_ypogegrammeni = has_ypogegrammeni
+          local post = getnext(n)
+          local last
+          local saved_tonos
+          while post do
+            local char = is_char(post, font)
+            if not char then break end
+            local diacritic_data = greek_diacritic[char]
+            if not diacritic_data then break end
+            datum = datum | diacritic_data
+            if diacritic_data & HAS_YPOGEGRAMMENI ~= 0 then
+              has_ypogegrammeni = false
+              setchar(post, 0x0399) -- FIXME: 0x0399 Fits with ICU, but maybe consider 
+              last = post
+              post = getnext(post)
+            else
+              local old = post
+              head, post = remove(head, post)
+              if char == 0x0301 and not saved_tonos then
+                saved_tonos = old
+              else
+                free(old)
+              end
+            end
+          end
+          if upper == 0x0397
+              and not has_ypogegrammeni
+              and not seen_cased
+              and not is_followed_by_cased(font, n, after)
+              then
+            if first_datum & HAS_ACCENT ~= 0 then
+              upper = 0x0389
+              if saved_tonos then
+                free(saved_tonos)
+                saved_tonos = nil
+              end
+            end
           else
-            mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
-                   or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
-                   or mapping._
-            if #mapping == 0 then
-              head, n = remove(head, n)
-              goto continue
+            if saved_tonos then
+              free(saved_tonos)
+              saved_tonos = nil
+            end
+            if first_datum & HAS_DIALYTIKA ~= 0 then
+              if upper == 0x0399 then -- upper == 'Ι'
+                upper = 0x03AA
+                datum = datum & ~HAS_DIALYTIKA
+              elseif upper == 0x03a5 then -- upper == 'Υ'
+                upper = 0x03ab
+                datum = datum & ~HAS_DIALYTIKA
+              end
+            end
+          end
+          setchar(n, upper)
+          if datum & HAS_DIALYTIKA ~= 0 then
+            head, n = insert_after(head, n, copy(n))
+            setchar(n, 0x0308)
+          end
+          if saved_tonos then
+            head, n = insert_after(head, n, saved_tonos)
+          end
+          if add_ypogegrammeni then
+            head, n = insert_after(head, n, copy(n))
+            setchar(n, 0x0399)
+          end
+          n = last or n
+          seen_greek = datum & (HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA) == HAS_VOWEL | HAS_ACCENT and n or true
+        else
+          local mapping = table[char]
+          if mapping then
+            if tonumber(mapping) then
+              setchar(n, mapping)
             else
-              setchar(n, mapping[1])
-              for i=2, #mapping do
-                head, n = insert_after(head, n, copy(n))
-                setchar(n, mapping[i])
+              mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
+                     or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
+                     or mapping._
+              if #mapping == 0 then
+                local old = n
+                head, n = remove(head, n)
+                free(old)
+                goto continue
+              else
+                setchar(n, mapping[1])
+                for i=2, #mapping do
+                  head, n = insert_after(head, n, copy(n))
+                  setchar(n, mapping[i])
+                end
               end
             end
           end
+          local char_ccc = ccc[char]
+          if not char_ccc or char_ccc == 230 then
+            seen_I = char == 0x49 or nil
+            seen_soft_dotted = soft_dotted[char]
+          end
+          seen_greek = nil
         end
         if not case_ignorable[char] then
           seen_cased = cased[char] or nil
         end
-        local char_ccc = ccc[char]
-        if not char_ccc or char_ccc == 230 then
-          seen_I = char == 0x49 or nil
-          seen_soft_dotted = soft_dotted[char]
-        end
       elseif id == disc and uses_font(n, font) then
         local pre, post, rep = getdisc(n)
         local after = getnext(n)
-        pre, post, rep, seen_cased, seen_soft_dotted, seen_I =
-            processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I),
-            processor(post, font, after),
-            processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I)
+        pre, post, rep, seen_cased, seen_soft_dotted, seen_I, seen_greek =
+            processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I, seen_greek),
+            processor(post, font, after, seen_greek),
+            processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I, seen_greek)
         setdisc(n, pre, post, rep)
       else
         seen_cased, seen_soft_dotted, seen_I = nil
@@ -207,12 +389,12 @@ local function process(table)
       n = getnext(n)
       ::continue::
     end
-    return head, seen_cased, seen_soft_dotted, seen_I
+    return head, seen_cased, seen_soft_dotted, seen_I, seen_greek
   end
-  return function(head, font) return (processor(head, font)) end
+  return function(head, font, ...) return (processor(head, font)) end
 end
 
-local upper_process = process(uppercase)
+local upper_process = process(uppercase, 'upper')
 otfregister {
   name = 'upper',
   description = 'Map to uppercase',
@@ -225,7 +407,7 @@ otfregister {
   },
 }
 
-local lower_process = process(lowercase)
+local lower_process = process(lowercase, 'lower')
 otfregister {
   name = 'lower',
   description = 'Map to lowercase',





More information about the latex3-commits mailing list.