[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: Add full BCP47 parsing for case features (5daac93)
Marcel Fabian Krüger
tex at 2krueger.de
Sun Jun 26 11:35:20 CEST 2022
Repository : https://github.com/latex3/luaotfload
On branch : dev
Link : https://github.com/latex3/luaotfload/commit/5daac93851e20a62c74cf20fb95947f871aa5774
>---------------------------------------------------------------
commit 5daac93851e20a62c74cf20fb95947f871aa5774
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date: Sun Jun 26 11:35:20 2022 +0200
Add full BCP47 parsing for case features
>---------------------------------------------------------------
5daac93851e20a62c74cf20fb95947f871aa5774
src/luaotfload-bcp47.lua | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
src/luaotfload-case.lua | 41 +++++++++++++++++++++++
2 files changed, 127 insertions(+)
diff --git a/src/luaotfload-bcp47.lua b/src/luaotfload-bcp47.lua
new file mode 100644
index 0000000..fc90c14
--- /dev/null
+++ b/src/luaotfload-bcp47.lua
@@ -0,0 +1,86 @@
+-- Support for parsing BCP47 based language tags into components
+local normalize_case, language_tag do -- LPeg patterns: case normalizer and tag parser
+  local l = lpeg or require'lpeg'
+  local function rep(base, num, max) -- match `base` between num and max times (max defaults to num, num >= 1)
+    max = max or num
+    if num == 1 then
+      if max == 1 then
+        return base
+      else
+        return base * base^-(max-1) -- one mandatory occurrence, up to max-1 optional ones
+      end
+    end
+    return base * rep(base, num - 1, max - 1)
+  end
+  local eor = #(l.P'-' + -1) -- End of record
+  local alpha = l.R('az', 'AZ')
+  local alphanum = l.R('az', 'AZ', '09')
+  local digit = l.R'09'
+  normalize_case = l.Cs( -- rewrites a tag into canonical case, fails on anything else
+      (alphanum^2/string.lower) -- primary language subtag: lowercase
+    * (
+        ('-' * (alphanum/string.upper) * (rep(alphanum, 3)/string.lower) * eor) -- exactly 4 chars: Title case; eor keeps longer subtags from being cut after 4 chars
+      + ('-' * alphanum^3/string.lower) -- 3 or more chars: lowercase
+      + ('-' * rep(alphanum, 2)/string.upper) -- 2 chars: uppercase
+    )^0
+    * ('-' * alphanum * '-' * l.P(1)^1 / string.lower)^-1 -- from the first singleton on: lowercase everything
+    + alphanum * '-' * l.P(1)^1 / string.lower -- tag starting with a singleton (x-..., i-...): all lowercase
+  ) * -1
+  local extlang = l.Cg(rep(alpha, 3), 'extlang')
+  local language = l.Cg(rep(alpha, 2, 3), 'language') * ('-' * extlang * eor)^-1
+  local script = l.Cg(rep(alpha, 4), 'script')
+  local region = l.Cg(rep(alpha, 2) + rep(digit, 3), 'region')
+  local variant = l.Cg(rep(alphanum, 5, 8) + digit * rep(alphanum, 3), 'variant')
+  local singleton = l.R('09', 'aw', 'yz') -- any alphanumeric except 'x'; input is already case-normalized
+  local extension = l.Cg(l.C(singleton) * l.Ct(('-' * l.C(rep(alphanum, 2, 8)))^1)) -- yields singleton, {subtags}
+  local privateuse = l.P'x' * l.Cg(l.Ct(('-' * l.C(rep(alphanum, 2, 8)))^1), 'private')
+  local irregular = -- grandfathered tags which do not fit the langtag grammar
+      l.P'en-GB-oed' * l.Cc{language = 'en', region = 'GB', variant = 'oxendict'}
+    + l.P'i-ami' * l.Cc{language = 'ami'}
+    + l.P'i-bnn' * l.Cc{language = 'bnn'}
+    + l.P'i-default' * l.Cc{language = 'i-default'} -- Not deprecated
+    + l.P'i-enochian' * l.Cc{language = 'i-enochian'} -- The non-existence of a non-deprecated language code for Enochian
+                                                      -- demonstrates a shocking disregard for the language of my ancestors
+    + l.P'i-hak' * l.Cc{language = 'hak'}
+    + l.P'i-klingon' * l.Cc{language = 'tlh'}
+    + l.P'i-lux' * l.Cc{language = 'lb'}
+    + l.P'i-mingo' * l.Cc{language = 'i-mingo'} -- Not deprecated
+    + l.P'i-navajo' * l.Cc{language = 'nv'}
+    + l.P'i-pwn' * l.Cc{language = 'pwn'}
+    + l.P'i-tao' * l.Cc{language = 'tao'}
+    + l.P'i-tay' * l.Cc{language = 'tay'}
+    + l.P'i-tsu' * l.Cc{language = 'tsu'}
+    + l.P'sgn-BE-FR' * l.Cc{language = 'sfb'}
+    + l.P'sgn-BE-NL' * l.Cc{language = 'vgt'}
+    + l.P'sgn-CH-DE' * l.Cc{language = 'sgg'}
+  local regular = -- grandfathered tags which fit the langtag grammar but need remapping
+      l.P'art-lojban' * l.Cc{language = 'jbo'}
+    + l.P'cel-gaulish' * l.Cc{language = 'cel-gaulish'} -- Might be xcg, xga or xtg
+    + l.P'no-bok' * l.Cc{language = 'nb'}
+    + l.P'no-nyn' * l.Cc{language = 'nn'}
+    + l.P'zh-guoyu' * l.Cc{language = 'cmn'}
+    + l.P'zh-hakka' * l.Cc{language = 'hak'}
+    + l.P'zh-min-nan' * l.Cc{language = 'nan'} -- Must be tried before its prefix zh-min: ordered choice never backtracks
+    + l.P'zh-min' * l.Cc{language = 'zh-min'} -- No preferred value; might be cdo, cpx, czo, mnp or nan
+    + l.P'zh-xiang' * l.Cc{language = 'hsn'}
+  local grandfathered = irregular + regular
+  local langtag = language * ('-' * script * eor)^-1
+                           * ('-' * region * eor)^-1
+                           * ('-' * variant)^0
+                           * (#('-' * singleton * '-') * l.Cg(l.Cf(l.Ct'' * ('-' * extension)^0, rawset), 'extension'))^-1 -- folds all extensions into one table keyed by singleton
+                           * ('-' * privateuse)^-1
+  language_tag = (grandfathered + l.Ct(langtag + privateuse)) * -1 -- produces a table of components; must consume the whole tag
+end
+local function parse(tag) -- returns the component table (nil if the grammar rejects the tag) and the case-normalized tag
+  local normalized = normalize_case:match(tag)
+  if normalized then -- otherwise fall through and return nothing: the tag could not be normalized
+    return language_tag:match(normalized), normalized
+  end
+end
+-- for l in io.lines() do
+-- print(parse(l))
+-- -- io.stdout:write(parse(l), '\n')
+-- end
+return { -- public interface of the module
+  normalize_case = function(n) return normalize_case:match(n) end, -- case-normalized copy of tag n, nil if it does not match
+  parse = parse,
+}
diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index f5a1473..79fb370 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -1,4 +1,5 @@
local unicode_data = require'luaotfload-unicode'
+local bcp47 = require'luaotfload-bcp47'
local mapping_tables = unicode_data.casemapping
local soft_dotted = unicode_data.soft_dotted
@@ -26,6 +27,8 @@ local copy = direct.copy
local insert_after = direct.insert_after
local traverse = direct.traverse
+local report = luaotfload.log.report
+
local disc = node.id'disc'
--[[ We make some implicit assumptions about contexts in SpecialCasing.txt here which happened to be true when I wrote the code:
@@ -113,6 +116,15 @@ local function init_greek_data()
for c = 0x2126, 0x2126 do handle_char(c) end
end
+local relevant_languages = { -- keyed by the parsed language subtag; every other language is mapped to false by font_lang
+ lt = true, -- true: the language is kept as is, private-use subtags are not inspected
+ tr = true,
+ az = true,
+ hy = {_ = true, yiwn = true}, -- table: set of recognized private-use subtags, appended as '-x-<subtag>'
+ el = {_ = true, iota = true}, -- '_' is consulted when private-use subtags are present but none is recognized:
+ de = {_ = false, eszett = true}, -- true keeps the bare language, false maps it to false
+}
+
local function font_lang(feature)
return setmetatable({}, {__index = function(t, fid)
local f = font.getfont(fid)
@@ -127,6 +139,35 @@ local function font_lang(feature)
or (lang == 'ell' or lang == 'pgr') and 'el'
or false
end
+ if lang == 'de-alt' then
+ lang = 'de-x-eszett'
+ end
+ local parsed = lang and bcp47.parse(lang)
+ if lang and not parsed then
+ report('luaotfload-case', 0, 'Unable to parse passed language tag')
+ end
+ lang = parsed and parsed.language
+ local subtags = lang and relevant_languages[lang]
+ if subtags then
+ local private = parsed.private
+ if subtags ~= true and private then
+ local first = true
+ for _, ext in ipairs(private) do
+ if subtags[ext] then
+ if first then
+ lang = lang .. '-x'
+ first = nil
+ end
+ lang = lang .. '-' .. ext
+ end
+ end
+ if first then
+ lang = subtags._ and lang
+ end
+ end
+ else
+ lang = false
+ end
t[fid] = lang
return lang
end})
More information about the latex3-commits
mailing list.