[latex3-commits] [git/LaTeX3-latex3-luaotfload] dev: Add full BCP47 parsing for case features (5daac93)

Marcel Fabian Krüger tex at 2krueger.de
Sun Jun 26 11:35:20 CEST 2022


Repository : https://github.com/latex3/luaotfload
On branch  : dev
Link       : https://github.com/latex3/luaotfload/commit/5daac93851e20a62c74cf20fb95947f871aa5774

>---------------------------------------------------------------

commit 5daac93851e20a62c74cf20fb95947f871aa5774
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date:   Sun Jun 26 11:35:20 2022 +0200

    Add full BCP47 parsing for case features


>---------------------------------------------------------------

5daac93851e20a62c74cf20fb95947f871aa5774
 src/luaotfload-bcp47.lua | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/luaotfload-case.lua  | 41 +++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/src/luaotfload-bcp47.lua b/src/luaotfload-bcp47.lua
new file mode 100644
index 0000000..fc90c14
--- /dev/null
+++ b/src/luaotfload-bcp47.lua
@@ -0,0 +1,86 @@
+-- Support for parsing BCP47 based language tags into components
+local normalize_case, language_tag do
+  local l = lpeg or require'lpeg'
+  local function rep(base, num, max)
+    max = max or num
+    if num == 1 then
+      if max == 1 then
+        return base
+      else
+        return base * base^-(max-1)
+      end
+    end
+    return base * rep(base, num - 1, max - 1)
+  end
+  local eor = #(l.P'-' + -1) -- End of record
+  local alpha = l.R('az', 'AZ')
+  local alphanum = l.R('az', 'AZ', '09')
+  local digit = l.R'09'
+  normalize_case = l.Cs( -- Case-normalize a tag: first subtag lower, 4-char subtags Title, 2-char subtags UPPER, other subtags lower
+      (alphanum^2/string.lower)
+      * (
+          ('-' * (alphanum/string.upper) * (rep(alphanum, 3)/string.lower) * eor) -- eor: exactly 4 chars only, longer subtags must fall through
+        + ('-' * alphanum^3/string.lower)
+        + ('-' * rep(alphanum, 2)/string.upper)
+        )^0
+      * ('-' * alphanum * '-' * l.P(1)^1 / string.lower)^-1 -- After a single-character subtag everything that follows is lowercased
+    + alphanum * '-' * l.P(1)^1 / string.lower -- Tags which start with a single-character subtag
+  ) * -1
+  local extlang = l.Cg(rep(alpha, 3), 'extlang')
+  local language = l.Cg(rep(alpha, 2, 3), 'language') * ('-' * extlang * eor)^-1
+  local script = l.Cg(rep(alpha, 4), 'script')
+  local region = l.Cg(rep(alpha, 2) + rep(digit, 3), 'region')
+  local variant = l.Cg(rep(alphanum, 5, 8) + digit * rep(alphanum, 3), 'variant')
+  local singleton = l.R('09', 'aw', 'yz')
+  local extension = l.Cg(l.C(singleton) * l.Ct(('-' * l.C(rep(alphanum, 2, 8)))^1))
+  local privateuse = l.P'x' * l.Cg(l.Ct(('-' * l.C(rep(alphanum, 2, 8)))^1), 'private')
+  local irregular = -- Each alternative matches one literal tag and yields a constant table of components
+      l.P'en-GB-oed' * l.Cc{language = 'en', region = 'GB', variant = 'oxendict'}
+    + l.P'i-ami' * l.Cc{language = 'ami'}
+    + l.P'i-bnn' * l.Cc{language = 'bnn'}
+    + l.P'i-default' * l.Cc{language = 'i-default'} -- Not deprecated
+    + l.P'i-enochian' * l.Cc{language = 'i-enochian'} -- The non-existence of a non-deprecated language code for Enochian
+                                                      -- demonstrates a shocking disregard for the language of my ancestors
+    + l.P'i-hak' * l.Cc{language = 'hak'}
+    + l.P'i-klingon' * l.Cc{language = 'tlh'}
+    + l.P'i-lux' * l.Cc{language = 'lb'}
+    + l.P'i-mingo' * l.Cc{language = 'i-mingo'} -- Not deprecated
+    + l.P'i-navajo' * l.Cc{language = 'nv'}
+    + l.P'i-pwn' * l.Cc{language = 'pwn'}
+    + l.P'i-tao' * l.Cc{language = 'tao'}
+    + l.P'i-tay' * l.Cc{language = 'tay'}
+    + l.P'i-tsu' * l.Cc{language = 'tsu'}
+    + l.P'sgn-BE-FR' * l.Cc{language = 'sfb'}
+    + l.P'sgn-BE-NL' * l.Cc{language = 'vgt'}
+    + l.P'sgn-CH-DE' * l.Cc{language = 'sgg'}
+  local regular = -- Each alternative matches one literal tag and yields a constant table of components
+      l.P'art-lojban' * l.Cc{language = 'jbo'}
+    + l.P'cel-gaulish' * l.Cc{language = 'cel-gaulish'} -- Might be xcg, xga or xtg
+    + l.P'no-bok' * l.Cc{language = 'nb'}
+    + l.P'no-nyn' * l.Cc{language = 'nn'}
+    + l.P'zh-guoyu' * l.Cc{language = 'cmn'}
+    + l.P'zh-hakka' * l.Cc{language = 'hak'}
+    + l.P'zh-min-nan' * l.Cc{language = 'nan'} -- Must precede 'zh-min': ordered choice commits to the first alternative that matches
+    + l.P'zh-min' * l.Cc{language = '...'}
+    + l.P'zh-xiang' * l.Cc{language = 'hsn'}
+  local grandfathered = irregular + regular
+  local langtag = language * ('-' * script * eor)^-1
+                           * ('-' * region * eor)^-1
+                           * ('-' * variant)^0
+                           * (#('-' * singleton * '-') * l.Cg(l.Cf(l.Ct'' * ('-' * extension)^0, rawset), 'extension'))^-1
+                           * ('-' * privateuse)^-1
+  language_tag = (grandfathered + l.Ct(langtag + privateuse)) * -1
+end
+local function parse(tag) -- Returns the first capture of language_tag (nil if it does not match) and the case-normalized tag
+  tag = normalize_case:match(tag)
+  if not tag then return end -- Case normalization did not match: return nothing
+  return language_tag:match(tag), tag
+end
+-- for l in io.lines() do
+--   print(parse(l))
+--   -- io.stdout:write(parse(l), '\n')
+-- end
+return {
+  normalize_case = function(n) return normalize_case:match(n) end, -- Case-normalized copy of the tag string n, or nil if it does not match
+  parse = parse,
+}
diff --git a/src/luaotfload-case.lua b/src/luaotfload-case.lua
index f5a1473..79fb370 100644
--- a/src/luaotfload-case.lua
+++ b/src/luaotfload-case.lua
@@ -1,4 +1,5 @@
 local unicode_data = require'luaotfload-unicode'
+local bcp47 = require'luaotfload-bcp47'
 
 local mapping_tables = unicode_data.casemapping
 local soft_dotted = unicode_data.soft_dotted
@@ -26,6 +27,8 @@ local copy = direct.copy
 local insert_after = direct.insert_after
 local traverse = direct.traverse
 
+local report = luaotfload.log.report
+
 local disc = node.id'disc'
 
 --[[ We make some implicit assumptions about contexts in SpecialCasing.txt here which happened to be true when I wrote the code:
@@ -113,6 +116,15 @@ local function init_greek_data()
   for c = 0x2126, 0x2126 do handle_char(c) end
 end
 
+local relevant_languages = { -- Languages kept by font_lang; every other language is mapped to false
+  lt = true, -- true: private-use subtags are not inspected
+  tr = true,
+  az = true,
+  hy = {_ = true, yiwn = true}, -- table: listed private-use subtags are appended to the language as -x-<subtag>
+  el = {_ = true, iota = true},
+  de = {_ = false, eszett = true}, -- _ = false: a private-use part containing no listed subtag maps to false
+}
+
 local function font_lang(feature)
   return setmetatable({}, {__index = function(t, fid)
     local f = font.getfont(fid)
@@ -127,6 +139,35 @@ local function font_lang(feature)
           or (lang == 'ell' or lang == 'pgr') and 'el'
           or false
     end
+    if lang == 'de-alt' then
+      lang = 'de-x-eszett'
+    end
+    local parsed = lang and bcp47.parse(lang)
+    if lang and not parsed then
+      report('luaotfload-case', 0, 'Unable to parse passed language tag')
+    end
+    lang = parsed and parsed.language
+    local subtags = lang and relevant_languages[lang]
+    if subtags then
+      local private = parsed.private
+      if subtags ~= true and private then
+        local first = true
+        for _, ext in ipairs(private) do
+          if subtags[ext] then
+            if first then
+              lang = lang .. '-x'
+              first = nil
+            end
+            lang = lang .. '-' .. ext
+          end
+        end
+        if first then
+          lang = subtags._ and lang
+        end
+      end
+    else
+      lang = false
+    end
     t[fid] = lang
     return lang
   end})





More information about the latex3-commits mailing list.