[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: upper/lower -- Default Unicode algorithms finished (4ff2f4f)
Marcel Fabian Krüger
tex at 2krueger.de
Sat Oct 24 09:48:01 CEST 2020
Repository : https://github.com/latex3/luaotfload
On branch : dev
Link : https://github.com/latex3/luaotfload/commit/4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb
>---------------------------------------------------------------
commit 4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date: Sat Oct 24 09:48:01 2020 +0200
upper/lower -- Default Unicode algorithms finished
>---------------------------------------------------------------
4ff2f4f73ee1e8ba8ab8a1dafe03dcff80332cbb
src/luaotfload-case.lua | 186 +++++++++++++++++++++++++++++++++++++++++----
src/luaotfload-unicode.lua | 104 +++++++++++++++++++------
2 files changed, 253 insertions(+), 37 deletions(-)
diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index a005187..33bb1bc 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -1,7 +1,13 @@
-local mapping_tables = require'luaotfload-unicode'.casemapping
+local unicode_data = require'luaotfload-unicode'
+
+local mapping_tables = unicode_data.casemapping
+local soft_dotted = unicode_data.soft_dotted
+local ccc = unicode_data.ccc
local uppercase = mapping_tables.uppercase
local lowercase = mapping_tables.lowercase
+local cased = mapping_tables.cased
+local case_ignorable = mapping_tables.case_ignorable
local otfregister = fonts.constructors.features.otf.register
@@ -13,17 +19,149 @@ local getnext = direct.getnext
local setchar = direct.setchar
local setdisc = direct.setdisc
local getdisc = direct.getdisc
+local getfield = direct.getfield
local remove = direct.remove
local copy = direct.copy
local insert_after = direct.insert_after
+local traverse = direct.traverse
local disc = node.id'disc'
-local glyph = node.id'disc'
+
+--[[ We make some implicit assumptions about contexts in SpecialCasing.txt here which happened to be true when I wrote the code:
+--
+-- * Before_Dot only appears as Not_Before_Dot
+-- * No other context appears with Not_
+-- * Final_Sigma is never language dependent
+-- * Other contexts are always language dependent
+-- * The only languages with special mappings are Lithuanian (lt/"LTH "/lit), Turkish (tr/"TRK "/tur), and Azeri/Azerbaijani (az/"AZE "/aze)
+]]
+
+local font_lang = setmetatable({}, {__index = function(t, fid) -- Lazy per-font cache: font id -> 'lt', 'tr', 'az', or false for any other language value
+ local f = font.getfont(fid)
+ local lang = f.specification.features.normal.language -- Language value as recorded in the font's specification
+ lang = lang == 'lth' and 'lt' or lang == 'trk' and 'tr' or lang == 'aze' and 'az' or false -- Translate to the key used to index the case mapping tables
+ t[fid] = lang -- Stored even when false, so this __index function runs only once per font id
+ return lang
+end})
+
+local function is_Final_Sigma(font, mapping, n, after) -- Returns the Final_Sigma replacement for glyph n if the nodes following n allow it, false otherwise; `after` is the node to continue with once the current list ends
+ mapping = mapping.Final_Sigma
+ if not mapping then return false end -- No Final_Sigma rule for this character
+ mapping = mapping._ -- The replacement stored for exactly this condition
+ if not mapping then return false end
+ n = getnext(n) -- Only the nodes after n are inspected here
+ repeat
+ while n do
+ local char, id = is_char(n, font)
+ if id == disc then
+ after = getnext(n) -- Remember where to resume after the discretionary
+ n = getfield(n, 'replace') -- Continue the scan inside the replace list
+ char, id = is_char(n, font)
+ elseif char then
+ if not case_ignorable[char] then
+ return not cased[char] and mapping -- First non-ignorable character decides: a cased one yields false, anything else the mapping
+ end
+ n = getnext(n) -- Case-ignorable characters are skipped
+ else
+ return mapping -- is_char yields no character for this node: stop scanning, the mapping applies
+ end
+ end
+ n, after = after -- Resume at the saved node (if any); `after` becomes nil
+ until not n
+ return mapping -- Ran out of nodes without meeting a non-ignorable character
+end
+
+local function is_More_Above(font, mapping, n, after) -- Returns the More_Above replacement for glyph n if a character with ccc 230 follows before any character without a ccc entry, false otherwise
+ mapping = mapping.More_Above
+ if not mapping then return false end -- No More_Above rule in this table
+ mapping = mapping._ -- The replacement stored for exactly this condition
+ if not mapping then return false end
+ n = getnext(n) -- Only the nodes after n are inspected
+ repeat
+ while n do
+ local char, id = is_char(n, font)
+ if id == disc then
+ after = getnext(n) -- Remember where to resume after the discretionary
+ n = getfield(n, 'replace') -- Continue the scan inside the replace list
+ char, id = is_char(n, font)
+ elseif char then
+ local char_ccc = ccc[char]
+ if not char_ccc then
+ return false -- Character without a ccc entry ends the search unsuccessfully
+ elseif char_ccc == 230 then
+ return mapping -- Found a following character with ccc 230
+ end
+ n = getnext(n) -- Characters with any other ccc value are skipped
+ else
+ return false -- is_char yields no character for this node: condition not met
+ end
+ end
+ n, after = after -- Resume at the saved node (if any); `after` becomes nil
+ until not n
+ return false -- Ran out of nodes without finding a ccc 230 character
+end
+
+local function is_Not_Before_Dot(font, mapping, n, after) -- Returns the Not_Before_Dot replacement for glyph n unless the first following ccc 230 character is U+0307; false when the rule is absent or U+0307 is found
+ mapping = mapping.Not_Before_Dot
+ if not mapping then return false end -- No Not_Before_Dot rule in this table
+ mapping = mapping._ -- The replacement stored for exactly this condition
+ if not mapping then return false end
+ n = getnext(n) -- Only the nodes after n are inspected
+ repeat
+ while n do
+ local char, id = is_char(n, font)
+ if id == disc then
+ after = getnext(n) -- Remember where to resume after the discretionary
+ n = getfield(n, 'replace') -- Continue the scan inside the replace list
+ char, id = is_char(n, font)
+ elseif char then
+ local char_ccc = ccc[char]
+ if not char_ccc then
+ return mapping -- Character without a ccc entry: no dot can follow, the mapping applies
+ elseif char_ccc == 230 then
+ return char ~= 0x0307 and mapping -- The first ccc 230 character decides: false if it is U+0307, else the mapping
+ end
+ n = getnext(n) -- Characters with any other ccc value are skipped
+ else
+ return mapping -- is_char yields no character for this node: the mapping applies
+ end
+ end
+ n, after = after -- Resume at the saved node (if any); `after` becomes nil
+ until not n
+ return mapping -- Ran out of nodes without meeting U+0307
+end
+
+local function is_Language_Mapping(font, mapping, n, after, seen_soft_dotted, seen_I) -- Picks the replacement from `mapping`, the sub-table for the font's language (may be nil); returns false or nil when nothing applies
+ if not mapping then return false end -- No entry for this language
+ if seen_soft_dotted then
+ local mapping = mapping.After_Soft_Dotted -- Shadows the parameter inside this branch only
+ mapping = mapping and mapping._
+ if mapping then
+ return mapping
+ end
+ end
+ if seen_I then
+ local mapping = mapping.After_I -- Shadows the parameter inside this branch only
+ mapping = mapping and mapping._
+ if mapping then
+ return mapping
+ end
+ end
+ return is_More_Above(font, mapping, n, after) or is_Not_Before_Dot(font, mapping, n, after) or mapping._ -- Last resort is the unconditional mapping for this language, which might be nil
+end
+
local function process(table)
- local function processor(head, font)
+ local function processor(head, font, after, seen_cased, seen_soft_dotted, seen_I)
+ local lang = font_lang[font]
local n = head
while n do
- n = has_glyph(n)
+ do
+ local new = has_glyph(n)
+ if n ~= new then
+ seen_cased, seen_soft_dotted, seen_I = nil
+ end
+ n = new
+ end
if not n then break end
local char, id = is_char(n, font)
if char then
@@ -31,27 +169,47 @@ local function process(table)
if mapping then
if tonumber(mapping) then
setchar(n, mapping)
- elseif #mapping == 0 then
- head, n = remove(head, n)
- goto continue
else
- setchar(n, mapping[1])
- for i=2, #mapping do
- head, n = insert_after(head, n, copy(n))
- setchar(n, mapping[i])
+ mapping = seen_cased and is_Final_Sigma(font, mapping, n, after)
+ or lang and is_Language_Mapping(font, mapping[lang], n, after, seen_soft_dotted, seen_I)
+ or mapping._
+ if #mapping == 0 then
+ head, n = remove(head, n)
+ goto continue
+ else
+ setchar(n, mapping[1])
+ for i=2, #mapping do
+ head, n = insert_after(head, n, copy(n))
+ setchar(n, mapping[i])
+ end
end
end
end
+ if not case_ignorable[char] then
+ seen_cased = cased[char] or nil
+ end
+ local char_ccc = ccc[char]
+ if not char_ccc or char_ccc == 230 then
+ seen_I = char == 0x49 or nil
+ seen_soft_dotted = soft_dotted[char]
+ end
elseif id == disc and uses_font(n, font) then
local pre, post, rep = getdisc(n)
- setdisc(n, processor(pre, font), processor(post, font), processor(rep, font))
+ local after = getnext(n)
+ pre, post, rep, seen_cased, seen_soft_dotted, seen_I =
+ processor(pre, font, nil, seen_cased, seen_soft_dotted, seen_I),
+ processor(post, font, after),
+ processor(rep, font, after, seen_cased, seen_soft_dotted, seen_I)
+ setdisc(n, pre, post, rep)
+ else
+ seen_cased, seen_soft_dotted, seen_I = nil
end
n = getnext(n)
::continue::
end
- return head
+ return head, seen_cased, seen_soft_dotted, seen_I
end
- return processor
+ return function(head, font) return (processor(head, font)) end
end
local upper_process = process(uppercase)
diff --git a/src/luaotfload-unicode.lua b/src/luaotfload-unicode.lua
index 0cd196a..06ebd49 100644
--- a/src/luaotfload-unicode.lua
+++ b/src/luaotfload-unicode.lua
@@ -99,14 +99,17 @@ local alphnum_only do
end
end
-local uppercase, lowercase, titlecase = {}, {}, nil do
+local uppercase, lowercase, ccc, cased, case_ignorable, titlecase = {}, {}, {}, {}, {}, nil do
titlecase = nil -- Not implemented yet(?)
local ignored_field = (1-lpeg.P';')^0 * ';'
+ local cased_category = lpeg.P'Ll;' + 'Lu;' + 'Lt;'
+ local case_ignore_category = lpeg.P'Mn;' + 'Me;' + 'Cf;' + 'Lm;' + 'Sk;'
+
local simple_entry =
codepoint/0 * ';'
* ignored_field -- Name
- * ignored_field -- General_Category
- * ignored_field -- ccc
+ * (ignored_field - cased_category - case_ignore_category) -- General_Category
+ * '0;' -- ccc
* ignored_field -- Bidi
* ignored_field -- Decomp
* ignored_field -- Numeric
@@ -119,8 +122,8 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
local entry = simple_entry
+ codepoint * ';'
* ignored_field -- Name
- * ignored_field -- General_Category
- * ignored_field -- ccc
+ * (cased_category * lpeg.Cc(cased) + case_ignore_category * lpeg.Cc(case_ignorable) + ignored_field * lpeg.Cc(nil)) -- General_Category
+ * ('0;' * lpeg.Cc(nil) + lpeg.R'09'^1/tonumber * ';') -- ccc
* ignored_field -- Bidi
* ignored_field -- Decomp
* ignored_field -- Numeric
@@ -132,7 +135,9 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
* (codepoint + lpeg.Cc(nil)) * ';' -- uppercase
* (codepoint + lpeg.Cc(nil)) * ';' -- lowercase
* (codepoint + lpeg.Cc(nil)) * '\n' -- titlecase
- / function(codepoint, upper, lower, title)
+ / function(codepoint, cased_flag, ccc_val, upper, lower, title)
+ if cased_flag then cased_flag[codepoint] = true end
+ ccc[codepoint] = ccc_val
uppercase[codepoint] = upper
lowercase[codepoint] = lower
-- if title then titlecase[codepoint] = title end -- Not implemented yet(?)
@@ -144,32 +149,81 @@ local uppercase, lowercase, titlecase = {}, {}, nil do
f:close()
end
+local props do -- Parses PropList.txt: props.Soft_Dotted is a fresh set, while Other_Lowercase and Other_Uppercase entries are added to `cased`
+ local ws = lpeg.P' '^0
+ local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n' -- Optional spaces, optional '#' comment, newline
+ local entry = codepoint * (".." * codepoint + lpeg.Cc(false)) * ws * ";" * ws * lpeg.C(lpeg.R("AZ", "az", "__")^1) * nl -- Captures: range start, range end (false for a single code point), property name; `codepoint` is defined outside this block
+ local file = lpeg.Cf(
+ lpeg.Ct(
+ lpeg.Cg(lpeg.Ct"", "Soft_Dotted") -- Initial accumulator: maps each property of interest to the table that receives its code points
+ * lpeg.Cg(lpeg.Cc(cased), "Other_Lowercase") -- Both Other_* properties share the existing `cased` table
+ * lpeg.Cg(lpeg.Cc(cased), "Other_Uppercase"))
+ * (lpeg.Cg(entry) + nl)^0
+ , function(t, cp_start, cp_end, prop) -- Fold step, called once per matched entry
+ local prop_table = t[prop]
+ if prop_table then -- Properties without a table in t are ignored
+ for cp = cp_start, cp_end or cp_start do
+ prop_table[cp] = true
+ end
+ end
+ return t
+ end) * -1 -- The whole input has to be consumed for the match to succeed
+
+ local f = io.open(kpse.find_file"PropList.txt") -- NOTE(review): neither the kpse lookup nor io.open is checked for failure
+ props = file:match(f:read'*a') -- nil if the file content does not fit the grammar
+ f:close()
+end
+
do
local ws = lpeg.P' '^0
local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n'
+ local file = (codepoint * (".." * codepoint + lpeg.Cc(false)) * ws * ";" * ws * (lpeg.P'Single_Quote' + 'MidLetter' + 'MidNumLet') * nl / function(cp_start, cp_end) -- Handles lines whose property is Single_Quote, MidLetter or MidNumLet; cp_end is false for a single code point
+ for cp = cp_start, cp_end or cp_start do
+ case_ignorable[cp] = true -- Every code point of the range joins the case_ignorable set
+ end
+ end + (1-lpeg.P'\n')^0 * '\n')^0 * -1 -- Any other line is skipped; the whole input must be consumed
+
+ local f = io.open(kpse.find_file"WordBreakProperty.txt") -- NOTE(review): neither the kpse lookup nor io.open is checked for failure
+ assert(file:match(f:read'*a')) -- Raises an error if the file does not fit the grammar
+ f:close()
+end
+
+do
+ local ws = lpeg.P' '^0
+ local nl = ws * ('#' * (1-lpeg.P'\n')^0)^-1 * '\n'
+ local empty = {}
+ local function set(t, cp, condition, value)
+ local old = t[cp] or cp
+ if not condition then
+ if #value == 1 and tonumber(old) then
+ t[cp] = value[1]
+ return
+ end
+ condition = empty
+ end
+ if tonumber(old or cp) then
+ old = {_ = {old}}
+ t[cp] = old
+ end
+ for i=1, #condition do
+ local cond = condition[i]
+ local step = old[cond]
+ if not step then
+ step = {}
+ old[cond] = step
+ end
+ old = step
+ end
+ old._ = value
+ end
local entry = codepoint * ";"
* lpeg.Ct((ws * codepoint)^1 + ws) * ";"
* lpeg.Ct((ws * codepoint)^1 + ws) * ";"
* lpeg.Ct((ws * codepoint)^1 + ws) * ";"
* (lpeg.Ct((ws * lpeg.C(lpeg.R('AZ', 'az', '__')^1))^1) * ";")^-1
* ws * nl / function(cp, lower, title, upper, condition)
- if condition then return end
- if #lower == 1 then
- lower = lower[1]
- if lower ~= lowercase[cp] then
- lowercase[cp] = lower
- end
- else
- lowercase[cp] = lower
- end
- if #upper == 1 then
- upper = upper[1]
- if upper ~= uppercase[cp] then
- uppercase[cp] = upper
- end
- else
- uppercase[cp] = upper
- end
+ set(lowercase, cp, condition, lower)
+ set(uppercase, cp, condition, upper)
end
local file = (entry + nl)^0 * -1
@@ -184,6 +238,10 @@ return {
casemapping = {
uppercase = uppercase,
lowercase = lowercase,
+ cased = cased,
+ case_ignorable = case_ignorable,
-- titlecase = titlecase,
},
+ ccc = ccc,
+ soft_dotted = props.Soft_Dotted,
}
More information about the latex3-commits
mailing list.