[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: Support uppercasing for greek (68258c8)
Marcel Fabian Krüger
tex at 2krueger.de
Wed Apr 27 20:45:15 CEST 2022
Repository : https://github.com/latex3/luaotfload
On branch : dev
Link : https://github.com/latex3/luaotfload/commit/68258c8e8bde38813d78be2cc965a32f5def4c98
>---------------------------------------------------------------
commit 68258c8e8bde38813d78be2cc965a32f5def4c98
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date: Wed Apr 27 20:22:20 2022 +0200
Support uppercasing for greek
>---------------------------------------------------------------
68258c8e8bde38813d78be2cc965a32f5def4c98
src/luaotfload-case.lua | 267 +++++++++++++++++++++++++++++++++++++++---------
1 file changed, 220 insertions(+), 47 deletions(-)
diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index 33bb1bc..838489b 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -21,6 +21,7 @@ local setdisc = direct.setdisc
local getdisc = direct.getdisc
local getfield = direct.getfield
local remove = direct.remove
+local free = direct.free
local copy = direct.copy
local insert_after = direct.insert_after
local traverse = direct.traverse
@@ -36,39 +37,117 @@ local disc = node.id'disc'
-- * The only languages with special mappings are Lithuanian (lt/"LTH "/lit), Turkish (tr/"TRK "/tur), and Azeri/Azerbaijani (az/"AZE "/aze)
]]
-local font_lang = setmetatable({}, {__index = function(t, fid)
- local f = font.getfont(fid)
- local lang = f.specification.features.normal.language
- lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or false
- t[fid] = lang
- return lang
-end})
+local UPPER_MASK = 0x3FF -- low 10 bits of a greek_data entry hold the uppercase base code point
+local HAS_VOWEL = 0x200000 -- flag bit: the uppercase base is one of the vowels listed in init_greek_data
+local HAS_YPOGEGRAMMENI = 0x400000 -- flag bit contributed by U+0345 (see greek_diacritic)
+local HAS_ACCENT = 0x800000 -- flag bit contributed by the accent marks listed in greek_diacritic
+local HAS_DIALYTIKA = 0x1000000 -- flag bit contributed by U+0308 and U+0344
+local HAS_OTHER_GREEK_DIACRITIC = 0x2000000 -- recognised as a diacritic, but not recorded by init_greek_data
-local function is_Final_Sigma(font, mapping, n, after)
- mapping = mapping.Final_Sigma
- if not mapping then return false end
- mapping = mapping._
- if not mapping then return false end
+local greek_data -- filled on first use by init_greek_data()
+local greek_diacritic = { -- combining mark code point -> flag bit(s) it adds to a letter's datum
+ [0x0300] = HAS_ACCENT,
+ [0x0301] = HAS_ACCENT,
+ [0x0342] = HAS_ACCENT,
+ [0x0302] = HAS_ACCENT,
+ [0x0303] = HAS_ACCENT,
+ [0x0311] = HAS_ACCENT,
+ [0x0308] = HAS_DIALYTIKA,
+ [0x0344] = HAS_DIALYTIKA | HAS_ACCENT,
+ [0x0345] = HAS_YPOGEGRAMMENI,
+ [0x0304] = HAS_OTHER_GREEK_DIACRITIC,
+ [0x0306] = HAS_OTHER_GREEK_DIACRITIC,
+ [0x0313] = HAS_OTHER_GREEK_DIACRITIC,
+ [0x0314] = HAS_OTHER_GREEK_DIACRITIC,
+ [0x0343] = HAS_OTHER_GREEK_DIACRITIC,
+}
+-- Populate greek_data: for every code point in the ranges below, store its uppercase base code point OR-ed with vowel/diacritic flag bits.
+local function init_greek_data()
+ local NFD = require'lua-uni-normalize'.NFD
+ local data = {}
+ greek_data = data
+
+ local vowels = { -- keyed by uppercase code point
+ [utf8.codepoint'Α'] = true, [utf8.codepoint'Ε'] = true,
+ [utf8.codepoint'Η'] = true, [utf8.codepoint'Ι'] = true,
+ [utf8.codepoint'Ο'] = true, [utf8.codepoint'Ω'] = true,
+ [utf8.codepoint'Υ'] = true,
+ }
+ local function handle_char(c) -- compute and store data[c]; stores nothing when the base does not fit in UPPER_MASK
+ local decomp = NFD(utf8.char(c))
+ local first = utf8.codepoint(decomp) -- first code point of the NFD result
+ local upper = uppercase[first]
+ if upper then
+ if not tonumber(upper) then -- non-numeric entry: use the single code point in its `_` field
+ upper = upper._
+ assert(#upper == 1)
+ upper = upper[1]
+ end
+ else
+ upper = first -- no uppercase mapping: keep the base unchanged
+ end
+ if upper > UPPER_MASK then return end -- Only happens for unassigned codepoints
+ local datum = upper
+ if vowels[upper] then
+ datum = datum | HAS_VOWEL
+ end
+ if utf8.len(decomp) > 1 then
+ for _, c in utf8.codes(decomp) do -- this c shadows the parameter inside the loop only
+ local dia = greek_diacritic[c]
+ if dia and dia ~= HAS_OTHER_GREEK_DIACRITIC then datum = datum | dia end -- "other" diacritics are not recorded
+ end
+ end
+ data[c] = datum
+ end
+ for c = 0x0370, 0x03ff do handle_char(c) end
+ for c = 0x1f00, 0x1fff do handle_char(c) end
+ for c = 0x2126, 0x2126 do handle_char(c) end -- the single code point U+2126
+end
+-- Return a cache table: indexing it with a font id computes, stores and returns that font's case-mapping language for `feature` (or false).
+local function font_lang(feature)
+ return setmetatable({}, {__index = function(t, fid)
+ local f = font.getfont(fid)
+ local features = f.specification.features.normal
+ local lang = features[feature] -- a string value of the feature itself is used as the language
+ if type(lang) ~= 'string' or lang == 'auto' then -- otherwise derive it from the font's `language` feature
+ lang = features.language
+ lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or (lang == 'ell' or lang == 'pgr') and 'el' or false
+ end
+ t[fid] = lang -- store the result so __index is not invoked again for this font id
+ return lang
+ end})
+end
+-- Scan forward from the node after n, skipping case-ignorable characters; return cased[char] of the first other character, or false if none is found.
+local function is_followed_by_cased(font, n, after)
n = getnext(n)
repeat
while n do
local char, id = is_char(n, font)
- if id == disc then
+ if not char and id == disc then -- descend into the disc's replace list; the node after the disc is remembered in `after`
after = getnext(n)
n = getfield(n, 'replace')
char, id = is_char(n, font)
- elseif char then
+ end
+ if char then
if not case_ignorable[char] then
- return not cased[char] and mapping
+ return cased[char] -- first character that is not case-ignorable decides the result
end
n = getnext(n)
else
- return mapping
+ return false -- a node that is not a character of this font ends the scan
end
end
n, after = after
until not n
- return mapping
+ return false -- ran out of nodes without finding a non-ignorable character
+end
+-- Return mapping.Final_Sigma._ when it exists and is_followed_by_cased() reports no cased character after n; otherwise return false.
+local function is_Final_Sigma(font, mapping, n, after)
+ mapping = mapping.Final_Sigma
+ if not mapping then return false end
+ mapping = mapping._
+ if not mapping then return false end
+ return not is_followed_by_cased(font, n, after) and mapping
end
local function is_More_Above(font, mapping, n, after)
@@ -150,56 +229,150 @@ local function is_Language_Mapping(font, mapping, n, after, seen_soft_dotted, se
return is_More_Above(font, mapping, n, after) or is_Not_Before_Dot(font, mapping, n, after) or mapping._ -- Might be nil
end
-local function process(table)
- local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I)
+local function process(table, feature) -- Build the list processor for one case table; `feature` is the font feature whose string value may select the language.
+ local font_lang = font_lang(feature) -- per-font language cache for this feature
+ -- The other seen_... are booleans, while seen_greek has more states:
+ -- - nil: Not greek
+ -- - true: Greek. Last was not a vowel with accent and without dialytika
+ -- - node: Greek. Last vowel with accent and without dialytika
+ local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I, seen_greek)
local lang = font_lang[font]
+ local greek
+ if lang == 'el' then -- Greek is handled by the dedicated branch below (uppercase only), never by the generic language mappings
+ if table == uppercase then
+ if not greek_data then
+ init_greek_data()
+ end
+ greek = greek_data
+ end
+ lang = false
+ end
local n = head
while n do
do
local new = has_glyph(n)
if n ~= new then
- seen_cased, seen_soft_dotted, seen_I = nil
+ seen_cased, seen_soft_dotted, seen_I, seen_greek = nil
end
n = new
end
if not n then break end
local char, id = is_char(n, font)
if char then
+ local first_datum = greek and greek[char] -- nil unless Greek uppercasing is active and init_greek_data() stored an entry for this character (covers U+2126)
+ if first_datum then -- characters without an entry use the generic mapping below instead of being set to code point 0
+ local datum = first_datum
+ local upper = datum & UPPER_MASK
+ if datum & HAS_VOWEL ~= 0 and seen_greek and seen_greek ~= true and (upper == 0x0399 or upper == 0x03a5) then -- Ι/Υ right after an accented vowel without dialytika
+ first_datum = first_datum | HAS_DIALYTIKA;
+ end
+ local has_ypogegrammeni = datum & HAS_YPOGEGRAMMENI ~= 0
+ local add_ypogegrammeni = has_ypogegrammeni
+ local post = getnext(n)
+ local last
+ local saved_tonos
+ while post do -- consume the combining diacritics that directly follow the letter
+ local char = is_char(post, font)
+ if not char then break end
+ local diacritic_data = greek_diacritic[char]
+ if not diacritic_data then break end
+ datum = datum | diacritic_data
+ if diacritic_data & HAS_YPOGEGRAMMENI ~= 0 then -- a combining ypogegrammeni stays in the list, rewritten to 0x0399
+ has_ypogegrammeni = false
+ setchar(post, 0x0399) -- FIXME: 0x0399 Fits with ICU, but maybe consider
+ last = post
+ post = getnext(post)
+ else -- every other diacritic is unlinked; only the first U+0301 is kept aside for possible re-insertion
+ local old = post
+ head, post = remove(head, post)
+ if char == 0x0301 and not saved_tonos then
+ saved_tonos = old
+ else
+ free(old)
+ end
+ end
+ end
+ if upper == 0x0397
+ and not has_ypogegrammeni
+ and not seen_cased
+ and not is_followed_by_cased(font, n, after)
+ then -- Η with no cased character before or after it keeps its accent
+ if first_datum & HAS_ACCENT ~= 0 then
+ upper = 0x0389
+ if saved_tonos then
+ free(saved_tonos)
+ saved_tonos = nil
+ end
+ end
+ else
+ if saved_tonos then -- accents are dropped in all other cases
+ free(saved_tonos)
+ saved_tonos = nil
+ end
+ if first_datum & HAS_DIALYTIKA ~= 0 then
+ if upper == 0x0399 then -- upper == 'Ι'
+ upper = 0x03AA
+ datum = datum & ~HAS_DIALYTIKA
+ elseif upper == 0x03a5 then -- upper == 'Υ'
+ upper = 0x03ab
+ datum = datum & ~HAS_DIALYTIKA
+ end
+ end
+ end
+ setchar(n, upper)
+ if datum & HAS_DIALYTIKA ~= 0 then -- dialytika not absorbed into a precomposed letter is re-added as U+0308
+ head, n = insert_after(head, n, copy(n))
+ setchar(n, 0x0308)
+ end
+ if saved_tonos then
+ head, n = insert_after(head, n, saved_tonos)
+ end
+ if add_ypogegrammeni then -- ypogegrammeni that was part of the original character becomes a separate 0x0399
+ head, n = insert_after(head, n, copy(n))
+ setchar(n, 0x0399)
+ end
+ n = last or n
+ seen_greek = datum & (HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA) == HAS_VOWEL | HAS_ACCENT and n or true
+ else
+ local mapping = table[char]
+ if mapping then
+ if tonumber(mapping) then
+ setchar(n, mapping)
else
- mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
- or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
- or mapping._
- if #mapping == 0 then
- head, n = remove(head, n)
- goto continue
+ mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
+ or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
+ or mapping._
+ if #mapping == 0 then -- empty mapping: the character is deleted and its node freed
+ local old = n
+ head, n = remove(head, n)
+ free(old)
+ goto continue
else
- setchar(n, mapping[1])
- for i=2, #mapping do
- head, n = insert_after(head, n, copy(n))
- setchar(n, mapping[i])
+ setchar(n, mapping[1])
+ for i=2, #mapping do
+ head, n = insert_after(head, n, copy(n))
+ setchar(n, mapping[i])
+ end
end
end
end
+ local char_ccc = ccc[char]
+ if not char_ccc or char_ccc == 230 then
+ seen_I = char == 0x49 or nil
+ seen_soft_dotted = soft_dotted[char]
+ end
+ seen_greek = nil
end
if not case_ignorable[char] then
seen_cased = cased[char] or nil
end
- local char_ccc = ccc[char]
- if not char_ccc or char_ccc == 230 then
- seen_I = char == 0x49 or nil
- seen_soft_dotted = soft_dotted[char]
- end
elseif id == disc and uses_font(n, font) then
local pre, post, rep = getdisc(n)
local after = getnext(n)
- pre, post, rep, seen_cased, seen_soft_dotted, seen_I =
- processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I),
- processor(post, font, after),
- processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I)
+ pre, post, rep, seen_cased, seen_soft_dotted, seen_I, seen_greek =
+ processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I, seen_greek),
+ processor(post, font, after, seen_greek), -- NOTE(review): seen_greek is passed in the seen_cased parameter position here - confirm this is intended
+ processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I, seen_greek)
setdisc(n, pre, post, rep)
else
seen_cased, seen_soft_dotted, seen_I = nil
@@ -207,12 +380,12 @@ local function process(table)
n = getnext(n)
::continue::
end
- return head, seen_cased, seen_soft_dotted, seen_I
+ return head, seen_cased, seen_soft_dotted, seen_I, seen_greek
end
- return function(head, font) return (processor(head, font)) end
+ return function(head, font, ...) return (processor(head, font)) end
end
-local upper_process = process(uppercase)
+local upper_process = process(uppercase, 'upper')
otfregister {
name = 'upper',
description = 'Map to uppercase',
@@ -225,7 +398,7 @@ otfregister {
},
}
-local lower_process = process(lowercase)
+local lower_process = process(lowercase, 'lower')
otfregister {
name = 'lower',
description = 'Map to lowercase',
More information about the latex3-commits
mailing list.