texlive[76195] Master/texmf-dist: lua-uni-algos (31aug25)

commits+karl at tug.org commits+karl at tug.org
Sun Aug 31 21:20:36 CEST 2025


Revision: 76195
          https://tug.org/svn/texlive?view=revision&revision=76195
Author:   karl
Date:     2025-08-31 21:20:36 +0200 (Sun, 31 Aug 2025)
Log Message:
-----------
lua-uni-algos (31aug25)

Modified Paths:
--------------
    trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md
    trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.pdf
    trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua

Added Paths:
-----------
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
    trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua

Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md
===================================================================
--- trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,12 +1,12 @@
 # The lua-uni-algos Package
 
-Version: v0.4.1
+Version: v0.5
 
-Date: 2022-02-26
+Date: 2025-08-31
 
 Author: Marcel Krüger
 
-License: LPPL v1.3
+License: LPPL v1.3c
 
 A collection of small Lua modules implementing some of the most generic Unicode algorithms for use with LuaTeX.
 This package tries to reduce duplicated work by collecting a set of small utilities which can be useful for many LuaTeX packages dealing with Unicode strings.

Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.pdf
===================================================================
(Binary files differ)

Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex
===================================================================
--- trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex	2025-08-31 19:20:36 UTC (rev 76195)
@@ -2,7 +2,7 @@
 \usepackage{doc, shortvrb, metalogo, hyperref, fontspec}
 % \setmainfont{Noto Serif}
 % \setmonofont{FreeMono}
-\title{Unicode algorithms for Lua\TeX\thanks{This document corresponds to \pkg{lua-uni-algos} v0.4.1.}}
+\title{Unicode algorithms for Lua\TeX\thanks{This document corresponds to \pkg{lua-uni-algos} v0.5.}}
 \author{Marcel Krüger\thanks{E-Mail: \href{mailto:tex at 2krueger.de}{\nolinkurl{tex at 2krueger.de}}}}
 \MakeShortVerb\|
 \newcommand\pkg{\texttt}
@@ -17,6 +17,7 @@
   \item[Unicode normalization] Normalize a given Lua string into any of the normalization forms NFC, NFD, NFKC, or NFKD as specified in the Unicode standard, section 2.12.
   \item[Case folding] Fold Unicode codepoints into a form which eliminates all case distinctions. This can be used for case-independent matching of strings. Not to be confused with case mapping which maps all characters to lower/upper/titlecase: In contrast to case mapping, case folding is mostly locale independent but does not give results which should be shown to users.
   \item[Grapheme cluster segmentation] Identify a grapheme cluster, a unit of text which is perceived as a single character by typical users, according to the rules in UAX \#29, section 3.
+  \item[Word boundary segmentation] Identify word boundaries according to the rules in UAX \#29, section 4.
 \end{description}
 \section{Normalization}
 Unicode normalization is handled by the Lua module |lua-uni-normalize|.
@@ -180,4 +181,43 @@
 
 Do not try to interpret the |state|, it has no defined values and might change at any point.
 
+\section{Word boundaries}
+Word segmentation is handled by the Lua module |lua-uni-words|.
+You can either load it directly with
+\begin{verbatim}
+local words = require'lua-uni-words'
+\end{verbatim}
+or if you need access to all implemented algorithms you can use
+\begin{verbatim}
+local uni_algos = require'lua-uni-algos'
+local words = uni_algos.words
+\end{verbatim}
+
+This is used to identify word boundaries. Unicode describes these as the boundaries users would expect when searching in a text for whole words.
+
+The UAX also suggests that for some use-cases useful words can be determined from these by taking any segment between word boundaries and filtering out all segments ``containing only whitespace, punctuation and similar characters''.
+
+Currently only the string interface is recommended:
+
+\begin{verbatim}
+for final, first, word_segment in words.word_boundaries'This text will be split into words segments!' do
+  print('"' .. word_segment .. '"')
+end
+\end{verbatim}
+% \begin{verbatim}
+% for final, first, word_segment in words.word_boundaries'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞' do
+%   print(grapheme)
+% end
+% \end{verbatim}
+
+\noindent\begingroup
+  \ttfamily
+  \directlua{
+    local words = require'./lua-uni-words'
+    for final, first, word_segment in words.word_boundaries'This text will be split into words segments!' do
+      tex.sprint('"' .. word_segment .. '"\string\\\string\\')
+    end
+  }\par
+\endgroup
+
 \end{document}

Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
 -- lua-uni-algos.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
 --
 -- This work may be distributed and/or modified under the
 -- conditions of the LaTeX Project Public License, either version 1.3
@@ -17,4 +17,5 @@
   case = require'lua-uni-case',
   graphemes = require'lua-uni-graphemes',
   normalize = require'lua-uni-normalize',
+  words = require'lua-uni-words',
 }

Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
--- lua-uni-graphemes.lua
--- Copyright 2020--2022 Marcel Krüger
+-- lua-uni-case.lua
+-- Copyright 2020--2025 Marcel Krüger
 --
 -- This work may be distributed and/or modified under the
 -- conditions of the LaTeX Project Public License, either version 1.3

Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua	                        (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,129 @@
+-- lua-uni-data-parser.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+--   http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+-- 
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+--
+-- This is an internal module which should only be loaded by other parts of lua-uni-algos.
+-- If you want to access the data parsed here, please use lua-uni-data.
+
+local bits = 6
+
+local parse, lpeg = require'lua-uni-parse', lpeg or require'lpeg'
+local lookup_table = require'lua-uni-stage-tables'
+local to_three_stage_lookup = lookup_table.to_three_stage_lookup
+
+local function dynamic_invertible_table(default)
+  local forward, count = {}, 0
+  local backward = setmetatable({}, {__index = function(t, key)
+    local index = count + 1
+    count = index
+    forward[index], t[key] = key, index
+    return index
+  end})
+  if default ~= nil then
+    forward[0], backward[default] = default, 0
+  end
+  return forward, backward
+end
+
+local read_properties do
+  local l, p = lpeg, parse
+  local any_property = l.R('az', 'AZ', '__')^1
+
+  function read_properties(filename, default, properties_pattern)
+    local forward_mapping, inverted_mapping = dynamic_invertible_table(default)
+    properties_pattern = l.P(properties_pattern or any_property)
+    local result = p.parse_file(filename,
+      l.Cg(p.fields(p.codepoint_range, properties_pattern / inverted_mapping)) + p.ignore_line,
+      p.multiset
+    )
+    if not result then
+      return nil, string.format("Failed to parse %s", filename)
+    end
+    return result, forward_mapping
+  end
+end
+
+local category_mapping
+local general_category, ccc, decomposition_mapping, compatibility_mapping, uppercase, lowercase, titlecase do
+  local reverse_category_mapping
+  category_mapping, reverse_category_mapping = dynamic_invertible_table'Cn'
+  local function multiset(ts, key, key_range, general_category, ccc, decomp_kind, decomp_mapping, upper, lower, title)
+    key_range = key_range or key
+    for codepoint=key, key_range do
+      ts[1][codepoint], ts[2][codepoint], ts[4][codepoint], ts[5][codepoint], ts[6][codepoint], ts[7][codepoint] = general_category, ccc, decomp_mapping, upper and upper - codepoint, lower and lower - codepoint, title and title - codepoint or upper and upper - codepoint
+      if not decomp_kind then
+        ts[3][codepoint] = decomp_mapping
+      end
+    end
+    return ts
+  end
+  local l, p = lpeg, parse
+  local Cnil = l.Cc(nil)
+  local letter = l.R('AZ', 'az')
+  local codepoint_or_ud_range = p.codepoint * ( -- When scanning a range in UnicodeData.txt, we use the data from the entry of the end of the range.
+    ';<' * p.ignore_field * lpeg.B', First>' * p.ignore_line * p.codepoint
+    + l.Cc(nil)
+  )
+  local parsed = assert(p.parse_file('UnicodeData', l.Cf(
+    l.Ct(l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'') * (
+      l.Cg(p.fields(codepoint_or_ud_range,
+                    p.ignore_field, -- Name (ignored)
+                    l.R'AZ' * l.R'az' / reverse_category_mapping, -- General_Category
+                    '0' * Cnil + p.number, -- Canonical_Combining_Class
+                    p.ignore_field, -- Bidi_Class (ignored)
+                    ('<' * l.C(letter^1) * '> ' + Cnil) -- Decomposition_Type
+                  * (l.Ct(p.codepoint * (' ' * p.codepoint)^0) + Cnil), -- Decomposition_Mapping
+                    p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+                    p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+                    p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+                    p.ignore_field, -- Bidi_Mirrored (ignored)
+                    p.ignore_field, -- obsolete
+                    p.ignore_field, -- obsolete
+                    (p.codepoint + Cnil), -- Simple_Uppercase_Mapping
+                    (p.codepoint + Cnil), -- Simple_Lowercase_Mapping
+                    (p.codepoint + Cnil)) -- Simple_Titlecase_Mapping
+      ) + p.eol
+    )^0, multiset) * -1))
+  general_category, ccc, decomposition_mapping, compatibility_mapping, uppercase, lowercase, titlecase = unpack(parsed)
+end
+
+local grapheme_break_property, grapheme_break_mapping = assert(read_properties('GraphemeBreakProperty', 'Other'))
+local word_break_property, word_break_mapping = assert(read_properties('WordBreakProperty', 'Other'))
+
+general_category = to_three_stage_lookup(general_category, category_mapping, bits, bits, 1)
+ccc = to_three_stage_lookup(ccc, nil, bits, bits, 1)
+grapheme_break_property = to_three_stage_lookup(grapheme_break_property, grapheme_break_mapping, bits, bits, 1)
+uppercase = to_three_stage_lookup(uppercase, 'offset', bits, bits, 3)
+lowercase = to_three_stage_lookup(lowercase, 'offset', bits, bits, 3)
+titlecase = to_three_stage_lookup(titlecase, 'offset', bits, bits, 3)
+word_break_property = to_three_stage_lookup(word_break_property, word_break_mapping, bits, bits, 1)
+
+return {
+  tables = {
+    category = general_category,
+    ccc = ccc,
+    grapheme = grapheme_break_property,
+    uppercase = uppercase,
+    lowercase = lowercase,
+    titlecase = titlecase,
+    wordbreak = word_break_property,
+  },
+  misc = {
+    decomposition_mapping = decomposition_mapping,
+    compatibility_mapping = compatibility_mapping,
+  },
+}


Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua	                        (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,82 @@
+-- lua-uni-data-preload.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+--   http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+-- 
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+--
+--
+local lookup_table = require'lua-uni-stage-tables'
+local serialize = lookup_table.serialize
+local deserialize = lookup_table.deserialize
+
+-- Serialize a Lua value into a string.
+-- Not designed to be human readable but only to be reloaded into Lua with load().
+local function serialize_lua(value)
+  local t = type(value)
+  local fmt
+  if t == 'number' then
+    fmt = math.type(value) == 'integer' and '%#X' or '%A'
+  elseif t == 'string' then
+    fmt = '%q'
+  elseif t == 'boolean' or t == 'nil' then
+    fmt = '%s'
+  elseif t == 'table' then
+    local k, v
+    local entries, length = {}, 0
+    while true do
+      local last_key = k or 0
+      k, v = next(value, k)
+      if math.type(k) == 'integer' and k > last_key and k - last_key < 5 then
+        for i = last_key+1, k-1 do
+          length = length + 1
+          entries[length] = 'nil'
+        end
+        length = length + 1
+        entries[length] = serialize_lua(v)
+      else
+        break
+      end
+    end
+    while k ~= nil do
+      length = length + 1
+      entries[length] = string.format('[%s] = %s', serialize_lua(k), serialize_lua(v))
+      k, v = next(value, k)
+    end
+    fmt, value = '{%s}', table.concat(entries, ', ', 1, length)
+  elseif t == 'function' or t == 'thread' or t == 'userdata' then
+    error"Unsupported type in deserialize"
+  end
+  return string.format(fmt, value)
+end
+
+return {
+  generate_bytecode = function()
+    local data = require'lua-uni-data'
+
+    local tables = {}
+    for k, v in next, data.tables do
+      tables[#tables + 1] = string.format("[%q] = deserialize %q,", k, serialize(v))
+    end
+    return assert(load(string.format("\z
+      package.preload['lua-uni-data'] = function() \z
+        local deserialize = require'lua-uni-stage-tables'.deserialize \z
+        return { \z
+          tables = { %s }, \z
+          misc = %s, \z
+        }\z
+      end\z
+    ", table.concat(tables), serialize_lua(data.misc, 'misc')), 'preloaded_unicode_data', 't'))
+  end,
+}


Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua	                        (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,20 @@
+-- lua-uni-data.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+--   http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+-- 
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- Make data from lua-uni-data-parser externally reachable.
+--
+-- Preloaded data is provided instead if it's present.
+--
+return require'lua-uni-data-parser'


Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
 -- lua-uni-graphemes.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
 --
 -- This work may be distributed and/or modified under the
 -- conditions of the LaTeX Project Public License, either version 1.3
@@ -123,7 +123,7 @@
 
 -- The value of "state" is considered internal and should not be relied upon.
 -- Just pass it to the function as is or pass nil. `nil` should only be passed when the passed codepoint starts a new cluster
-function read_codepoint(cp, state)
+local function read_codepoint(cp, state)
   local new_cluster
   state, new_cluster = state_map[state or 'START'](property[cp])
   return new_cluster, state

Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
 -- lua-uni-normalize.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
 --
 -- This work may be distributed and/or modified under the
 -- conditions of the LaTeX Project Public License, either version 1.3
@@ -224,7 +224,7 @@
   end
   return s
 end
-function to_nfd_table(s, decomposition_mapping)
+local function to_nfd_table(s, decomposition_mapping)
   local new_codepoints = result_table
   local j = 1
   for _, c in codes(s) do

Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua	2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
 -- lua-uni-parse.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
 --
 -- This work may be distributed and/or modified under the
 -- conditions of the LaTeX Project Public License, either version 1.3
@@ -15,6 +15,9 @@
 
 -- Just a simple helper module to make UCD parsing more readable
 
+-- The rawget is needed here because of Context idiosyncrasies.
+local find_file = assert(kpse and rawget(kpse, 'find_file') or resolvers and resolvers.find_file, 'No file searching library found')
+
 local lpeg = lpeg or require'lpeg'
 local R = lpeg.R
 local tonumber = tonumber
@@ -47,7 +50,7 @@
   if func then
     return parse_uni_file(filename, lpeg.Cf(lpeg.Ct'' * patt^0 * -1, func), nil, ...)
   end
-  local resolved = kpse.find_file(filename .. '.txt')
+  local resolved = find_file(filename .. '.txt')
   if not resolved then
     error(string.format("Unable to find Unicode datafile %q", filename))
   end

Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua	                        (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,268 @@
+-- lua-uni-stage-tables.lua
+-- Copyright 2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+--   http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+-- 
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+
+
+-- The source is a 0-based table with integer values fitting into data_bytes bytes.
+-- The first return value is a 0-based table with values being 1-based indexes into the second table or 0.
+local function compress(compress_bits, data_bytes, signed, source, upper_limit)
+  local compress_mask = (1 << compress_bits) - 1
+  local default_lower_level = lua.newtable(compress_mask + 1, 0)
+  local pattern = (signed and '>i' or '>I') .. data_bytes
+  local zero_value = string.pack(pattern, 0)
+  for i=1, compress_mask + 1 do
+    default_lower_level[i] = zero_value
+  end
+  local first_stage = upper_limit and lua.newtable((upper_limit >> compress_bits), 1) or {}
+  for k, v in next, source do
+    if v and v ~= 0 and type(k) == 'number' then
+      local high, low = k >> compress_bits, (k & compress_mask) + 1
+      local subtable = first_stage[high]
+      if not subtable then
+        subtable = table.move(default_lower_level, 1, compress_mask + 1, 1, {})
+        first_stage[high] = subtable
+      end
+      subtable[low] = string.pack(pattern, v)
+    end
+  end
+  local default_key = table.concat(default_lower_level)
+  local second_stage_lookup = {[default_key] = 0}
+  local second_stage = {[0] = default_key}
+  for k, v in next, first_stage do
+    local key = table.concat(v)
+    local index = second_stage_lookup[key]
+    if not index then
+      index = #second_stage + 1
+      second_stage_lookup[key] = index
+      second_stage[index] = key
+    end
+    if index ~= 0 then
+      first_stage[k] = index
+    end
+  end
+  if upper_limit then
+    for i = 0, upper_limit >> compress_bits do
+      first_stage[i] = first_stage[i] or 0
+    end
+  end
+  return first_stage, second_stage
+end
+
+local function lookup_identity(value) return value end
+local function lookup_table(t) return function(value) return t[value] end end
+local function lookup_offset(value, codepoint) return codepoint + value end
+
+local readers = {
+  [true] = {
+    [1] = sio.readinteger1,
+    [2] = sio.readinteger2,
+    [3] = sio.readinteger3,
+    [4] = sio.readinteger4,
+  },
+  [false] = {
+    [1] = sio.readcardinal1,
+    [2] = sio.readcardinal2,
+    [3] = sio.readcardinal3,
+    [4] = sio.readcardinal4,
+  },
+}
+
+local function two_stage_metatable(first_stage, second_stage, lookup, bits, bytes)
+  local lookup_function, signed
+  if lookup == nil then
+    signed = false
+    lookup_function = lookup_identity
+  elseif lookup == 'offset' then
+    signed = true
+    lookup_function = lookup_offset
+  else
+    signed = false
+    lookup_function = lookup_table(lookup)
+  end
+  local reader = assert(readers[signed][bytes])
+  local block_size = 1 << bits
+  local mask = block_size - 1
+  local stride = bytes * block_size
+  return setmetatable({
+    __first_stage = first_stage,
+    __second_stage = second_stage,
+    __lookup = lookup,
+    __bytes = bytes,
+    __bits = bits,
+  }, {
+    __index = function(_, key)
+      if type(key) ~= 'number' then return nil end
+      local high, low = key >> bits, key & mask
+      local second_index = first_stage[high] or 0
+      local value
+      if second_index ~= 0 then
+        value = reader(second_stage, stride * (second_index - 1) + bytes * low + 1)
+      else
+        value = 0
+      end
+      return lookup_function(value, key)
+    end
+  })
+end
+
+local function one_stage_metatable(buffer, lookup, bytes)
+  local lookup_function, signed
+  if lookup == nil then
+    signed = false
+    lookup_function = lookup_identity
+  elseif lookup == 'offset' then
+    signed = true
+    lookup_function = lookup_offset
+  else
+    signed = false
+    lookup_function = lookup_table(lookup)
+  end
+  local reader = assert(readers[signed][bytes])
+  return setmetatable({
+    __buffer = buffer,
+    __lookup = lookup,
+    __bytes = bytes,
+  }, {
+    __index = function(_, key)
+      local value = reader(buffer, bytes * key + 1)
+      return lookup_function(value, key)
+    end
+  })
+end
+
+local function to_two_stage_lookup(source, lookup, bits, bytes)
+  local first_stage, second_stage = compress(bits, bytes, source)
+  return two_stage_metatable(first_stage, table.concat(second_stage), lookup, codepoint_block_bits, bytes)
+end
+
+local function to_three_stage_lookup(source, lookup, bits1, bits2, bytes)
+  local intermediate_stage, third_stage = compress(bits2, bytes, lookup == 'offset', source)
+  local needed_bytes = math.floor(math.log(#third_stage, 256)) + 1
+  local first_stage, second_stage = compress(bits1, needed_bytes, false, intermediate_stage, (0x10FFFF >> bits2) + 1)
+  return two_stage_metatable(two_stage_metatable(first_stage, table.concat(second_stage), nil, bits1, needed_bytes), table.concat(third_stage), lookup, bits2, bytes)
+end
+
+-- Cache file format (everything in little endian):
+--   Header:
+--     u32: Version (0x00010000)
+--     u8: Stages (supported: 2 or 3)
+--     u8: Kind (enum: 0x00 Identity table, 0x01 Delta encoding (signed), 0x02 Mapping table)
+--   Mapping table: (only present if kind == 0x02
+--     <Data size in bytes>: Entry count without default
+--     <u8 size prefixed string>: Default
+--     <Entry count> * <u8 size prefixed string>: Entries
+--   Outer table:
+--     <u32> Number of entries
+--     <u8> Size of one entry in bytes
+--     <data>
+--   Inner tables: (repeated `stages - 1` times)
+--     <previous entry size bytes> Number of entries
+--     <u8> Size of underlying value in bytes
+--     <u8> Bits representing page size (each entry is 2^bits * bytes fields big)
+--     <data>
+--     
+local function serialize(nested_table)
+  local buffer = {}
+  local nesting_depth do
+    local current_table = nested_table
+    nesting_depth = 0
+    repeat
+      nesting_depth = nesting_depth + 1
+      current_table = current_table.__first_stage
+    until current_table == nil
+  end
+  local lookup = nested_table.__lookup
+  local kind = lookup == nil and 0 or lookup == 'offset' and 1 or 2
+  buffer[1] = string.pack('>I4BB', 0x00010000, nesting_depth, kind)
+  if kind == 2 then
+    buffer[2] = string.pack('>I4', #lookup)
+    for i=0, #lookup do
+      buffer[3 + i] = string.pack('s1', lookup[i])
+    end
+  end
+  local function serialize_stage(stage, prev)
+    local inner_stage = stage.__first_stage
+    if inner_stage then
+      local nested_bytes = serialize_stage(inner_stage, stage)
+      buffer[#buffer + 1] = string.pack(string.format('>I%sBB', nested_bytes), #stage.__second_stage // (stage.__bytes * (1 << stage.__bits)), stage.__bytes, stage.__bits)
+      buffer[#buffer + 1] = stage.__second_stage
+      return stage.__bytes
+    else
+      local index_bytes = math.floor(math.log(#prev.__second_stage // ((1 << prev.__bits) * prev.__bytes), 256)) + 1
+      buffer[#buffer + 1] = string.pack('>I4B', #stage + 1, index_bytes)
+      buffer[#buffer + 1] = string.pack('>' .. string.rep('I' .. index_bytes, #stage + 1), table.unpack(stage, 0))
+      return index_bytes
+    end
+  end
+  serialize_stage(nested_table)
+  return table.concat(buffer)
+end
+
+local function deserialize(data)
+  local offset = 1
+  local version, nesting_depth, kind
+  version, nesting_depth, kind, offset = string.unpack('>I4BB', data, offset)
+  if version ~= 0x00010000 then error'Invalid version' end
+  local lookup
+  if kind == 0 then
+    lookup = nil
+  elseif kind == 1 then
+    lookup = 'offset'
+  elseif kind == 2 then
+    -- TODO
+    local lookup_count
+    lookup_count, offset = string.unpack('>I4', data, offset)
+    lookup = lua.newtable(lookup_count, 1)
+    for i=0, lookup_count do
+      lookup[i], offset = string.unpack('s1', data, offset)
+    end
+  else
+    error'Unsupported type'
+  end
+
+  local nested_table, previous_bytes
+  do
+    local entries, bytes
+    entries, bytes, offset = string.unpack('>I4B', data, offset)
+    local stage_size = entries * bytes
+    local stage = data:sub(offset, offset + stage_size - 1)
+    offset = offset + stage_size
+    nested_table = one_stage_metatable(stage, level == nesting_depth and lookup or nil, bytes)
+    previous_bytes = bytes
+  end
+
+  for level = 2, nesting_depth do
+    local entries, bytes, bits
+    entries, bytes, bits, offset = string.unpack(string.format('>I%sBB', previous_bytes), data, offset) --, #stage.__second_stage // (stage.__bytes * (1 << stage.__bits)), stage.__bytes, stage.__bits)
+    local stage_size = entries * bytes * (1 << bits)
+    local stage = data:sub(offset, offset + stage_size - 1)
+    offset = offset + stage_size
+    nested_table = two_stage_metatable(nested_table, stage, level == nesting_depth and lookup or nil, bits, bytes)
+    previous_bytes = bytes
+  end
+
+  assert(offset == #data + 1)
+  return nested_table
+end
+
+return {
+  compress = compress,
+  to_two_stage_lookup = to_two_stage_lookup,
+  to_three_stage_lookup = to_three_stage_lookup,
+  serialize = serialize,
+  deserialize = deserialize,
+}


Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua	                        (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua	2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,315 @@
+-- lua-uni-words.lua
+-- Copyright 2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+--   http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+-- 
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- Load the two Unicode data tables needed for word segmentation:
+--   extended_pictographic: set of codepoints with Extended_Pictographic=Yes
+--                          (from emoji-data.txt), used for rule WB3c.
+--   property:              Word_Break property per codepoint
+--                          (from WordBreakProperty.txt).
+-- Parsing is done with lpeg grammars built from lua-uni-parse helpers;
+-- p.multiset presumably accumulates the captured ranges into one
+-- codepoint-keyed table -- TODO confirm against lua-uni-parse.
+local extended_pictographic, property do
+  local p = require'lua-uni-parse'
+  local l = lpeg or require'lpeg'
+
+  extended_pictographic = p.parse_file('emoji-data',
+    l.Cg(p.fields(p.codepoint_range, 'Extended_Pictographic' * l.Cc(true))) + p.ignore_line,
+    p.multiset)
+  if not extended_pictographic then
+    error[[Break Property matching failed]]
+  end
+
+  property = p.parse_file('WordBreakProperty',
+    l.Cg(p.fields(p.codepoint_range, l.C(l.R('az', 'AZ', '__')^1))) + p.ignore_line,
+    p.multiset)
+  if not property then
+    error[[Break Property matching failed]]
+  end
+end
+
+-- Word_Break properties that are transparent to most rules (UAX #29 WB4):
+-- Extend, Format and ZWJ are skipped when matching the other rules.
+local ignorable = { Extend = true, Format = true, ZWJ = true, }
+-- Properties that always break before and after (UAX #29 WB3a/WB3b).
+local controls = { CR = true, LF = true, Newline = true, }
+
+-- Lookahead for WB6/WB7: a MidLetter/MidNumLet/Single_Quote only joins when
+-- a (Hebrew) letter follows.  Receives the codepoint after the mid character;
+-- while ignorable codepoints arrive the decision is deferred by handing this
+-- function back as the continuation state.
+local function context_AHLetter_Mid(cp)
+  local class = property[cp]
+  if ignorable[class] then
+    return nil, context_AHLetter_Mid
+  elseif class == 'ALetter' then
+    return false, 'ASTARTED'
+  elseif class == 'Hebrew_Letter' then
+    return false, 'HSTARTED'
+  else
+    return true, 'PRE'
+  end
+end
+
+-- Lookahead for WB7b/WB7c: a Double_Quote between two Hebrew_Letters does
+-- not break.  Receives the codepoint after the quote; ignorable codepoints
+-- defer the decision by returning this function as the continuation state.
+local function context_HLetter_Double(cp)
+  local class = property[cp]
+  if ignorable[class] then
+    return nil, context_HLetter_Double
+  elseif class == 'Hebrew_Letter' then
+    return false, 'HSTARTED'
+  else
+    return true, 'PRE'
+  end
+end
+
+-- Lookahead for WB11/WB12: a MidNum/MidNumLet/Single_Quote only joins when
+-- another Numeric follows.  Receives the codepoint after the mid character;
+-- ignorable codepoints defer the decision via this continuation function.
+local function context_Numeric_Mid(cp)
+  local class = property[cp]
+  if ignorable[class] then
+    return nil, context_Numeric_Mid
+  elseif class == 'Numeric' then
+    return false, 'NSTARTED'
+  else
+    return true, 'PRE'
+  end
+end
+
+-- Word_Break state machine (UAX #29 rules WB1-WB13b; the ZWJ rule WB3c and
+-- the Regional_Indicator pairing are finished in read_codepoint below).
+-- Each entry maps a state name to a function that consumes the Word_Break
+-- property of the next codepoint and returns
+--   new_state, boundary
+-- where boundary == true means a word boundary occurs before that codepoint.
+-- A context function (see above) may be returned instead of a state name
+-- when one codepoint of lookahead is needed; it then yields only one value,
+-- so the caller sees boundary == nil ("undecided").
+-- The `local state_map state_map = ...` form forward-declares the local so
+-- the entries can refer to each other through the table.
+local state_map state_map = {
+  -- Start of text or right after a mandatory break (WB1, WB3a).
+  START = function(prop)
+    if prop == 'CR' then
+      return 'CR', true
+    end
+    if prop == 'LF' or prop == 'Newline' then
+      return 'START', true
+    end
+    -- Delegate classification to PRE; the extra `true` truncates PRE's
+    -- boundary flag and forces a break after the control/sot.
+    return state_map.PRE(prop), true
+  end,
+  -- Neutral state: previous codepoint belongs to no joinable run (WB14).
+  PRE = function(prop)
+    if controls[prop] then
+      return state_map.START(prop)
+    end
+    if ignorable[prop] then
+      return 'PRE', false  -- WB4: don't break before Extend/Format/ZWJ
+    end
+    if prop == 'WSegSpace' then
+      return 'WHITE', true
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', true
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', true
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', true
+    end
+    if prop == 'Katakana' then
+      return 'KSTARTED', true
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', true
+    end
+    if prop == 'Regional_Indicator' then
+      return 'RI', true
+    end
+    return 'PRE', true
+  end,
+  -- After a CR: WB3 keeps CR x LF together.
+  CR = function(prop)
+    if prop == 'LF' then
+      return 'START', false
+    else
+      return state_map.START(prop)
+    end
+  end,
+  -- After a WSegSpace: WB3d keeps horizontal whitespace together.
+  WHITE = function(prop)
+    if prop == 'WSegSpace' then
+      return 'WHITE', false
+    else
+      return state_map.PRE(prop)
+    end
+  end,
+  -- After an ExtendNumLet: WB13a/WB13b.
+  EXTEND = function(prop)
+    if ignorable[prop] then
+      return 'EXTEND', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Katakana' then
+      return 'KSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After a Katakana character.
+  KSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'KSTARTED', false
+    end
+    if prop == 'Katakana' then
+      -- WB13: Katakana x Katakana.  Fixed: the original returned the state
+      -- name 'Katakana', which is not a key of state_map, so the third
+      -- consecutive Katakana codepoint crashed with "attempt to call a nil
+      -- value".  Stay in KSTARTED instead.
+      return 'KSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After a single Regional_Indicator: WB15/WB16 join exactly one more.
+  RI = function(prop)
+    if ignorable[prop] then
+      return 'RI', false
+    end
+    if prop == 'Regional_Indicator' then
+      -- Pair complete; a further RI must not join, hence PRE.
+      return 'PRE', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After an ALetter (WB5-WB9, WB13a).
+  ASTARTED = function(prop)
+    if ignorable[prop] then
+      return 'ASTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'MidLetter' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+      -- WB6/WB7: decision needs one codepoint of lookahead.
+      return context_AHLetter_Mid
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After a Hebrew_Letter (WB5-WB9, WB7a-WB7c, WB13a).
+  HSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'HSTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'Single_Quote' then
+      -- WB7a: Hebrew_Letter x Single_Quote joins unconditionally.
+      return 'HSINGLE_QUOTE', false
+    end
+    if prop == 'MidLetter' or prop == 'MidNumLet' then
+      return context_AHLetter_Mid
+    end
+    if prop == 'Double_Quote' then
+      -- WB7b/WB7c: lookahead for another Hebrew_Letter.
+      return context_HLetter_Double
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After Hebrew_Letter + Single_Quote (resolution of WB7a).
+  HSINGLE_QUOTE = function(prop)
+    if ignorable[prop] then
+      return 'HSINGLE_QUOTE', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After a Numeric (WB8-WB12, WB13a).
+  NSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'NSTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'MidNum' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+      -- WB11/WB12: decision needs one codepoint of lookahead.
+      return context_Numeric_Mid
+    end
+    return state_map.PRE(prop)
+  end,
+}
+
+-- For every base state S create a shadow state "ZWJ_S" that additionally
+-- remembers "the previous significant codepoint was a ZWJ" (needed for
+-- WB3c in read_codepoint).  to_ZWJ enters the shadow, from_ZWJ leaves it.
+local from_ZWJ, to_ZWJ = {}, {}
+for name in pairs(state_map) do
+  local shadow = 'ZWJ_' .. name
+  to_ZWJ[name] = shadow
+  from_ZWJ[shadow] = name
+end
+
+
+-- Feed one codepoint into the word-break state machine.
+-- Returns `new_word, state`: new_word is true if a word boundary occurs
+-- before cp, false if not, and nil when the decision needs the following
+-- codepoint -- state is then a context function which the caller must
+-- invoke with the next codepoint (as word_boundaries_start below does).
+-- The value of "state" is considered internal and should not be relied upon.
+-- Just pass it to the function as is or pass nil. `nil` should only be passed when the passed codepoint starts a new cluster
+local function read_codepoint(cp, state)
+  -- Non-nil iff the previous significant codepoint was a ZWJ (shadow state).
+  local mapped_state = from_ZWJ[state]
+  local new_word
+  local prop = property[cp]
+  state, new_word = state_map[mapped_state or state or 'START'](prop)
+  -- WB3c: ZWJ x Extended_Pictographic -- never break after the ZWJ.
+  if mapped_state and extended_pictographic[cp] then
+    new_word = false
+  end
+  if prop == 'ZWJ' then
+    -- Remember the ZWJ by moving into the corresponding shadow state.
+    state = to_ZWJ[state]
+  end
+  return new_word, state
+end
+
+-- A Lua iterator for strings -- Only reporting the beginning of every word segment
+-- Each call yields (byte_index, codepoint) for the first codepoint of the
+-- next word, or nothing at end of string.
+local function word_boundaries_start(str)
+  -- Note: this `str` intentionally shadows the parameter with the iterator
+  -- state returned by utf8.codes.
+  local nextcode, str, i = utf8.codes(str)
+  local state = "START"
+  -- While the machine asks for lookahead (state is a context function) the
+  -- position/codepoint of the pending character is parked here.
+  local saved_i, saved_code
+  return function()
+    local new_word, code
+    repeat
+      i, code = nextcode(str, i)
+      if saved_i then
+        -- Resolve the deferred decision with the freshly read codepoint.
+        new_word, state = state(code)
+        if new_word ~= nil then
+          -- Decision made: rewind to the parked position so the lookahead
+          -- codepoint is read and processed again on the next iteration.
+          i, code, saved_i, saved_code = saved_i, saved_code, nil, nil
+        end
+      else
+        if not i then return end
+        new_word, state = read_codepoint(code, state)
+        if new_word == nil then
+          -- Undecided: park this codepoint and fetch one more for context.
+          saved_i, saved_code = i, code
+        end
+      end
+    until new_word
+    return i, code
+  end
+end
+-- A more useful iterator: returns the byterange of the segment in reverse order followed by a string with the word
+-- i.e. for each word it yields (last_byte, first_byte, word); yielding the
+-- end position first lets it double as the generic-for control variable.
+local function word_boundaries(str)
+  local iter = word_boundaries_start(str)
+  local first = iter()
+  if not first then
+    -- Fixed: for an empty string the original evaluated `iter() - 1`
+    -- unconditionally and crashed with an arithmetic-on-nil error.
+    -- Yield nothing instead.
+    return function() end
+  end
+  return function(_, cur)
+    if cur == #str then return end  -- previous segment ended at the last byte
+    local new = iter()
+    -- No further boundary: the last segment runs to the end of the string.
+    if not new then return #str, cur + 1, str:sub(cur + 1) end
+    return new - 1, cur + 1, str:sub(cur + 1, new - 1)
+  end, nil, first - 1
+end
+-- Module interface.
+return {
+  read_codepoint = read_codepoint,
+  -- Fixed: the original read the misspelled global `word_bounaries_start`,
+  -- which is nil, so this export was silently missing from the module.
+  word_boundaries_start = word_boundaries_start,
+  word_boundaries = word_boundaries,
+}


Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property


More information about the tex-live-commits mailing list.