texlive[76195] Master/texmf-dist: lua-uni-algos (31aug25)
commits+karl at tug.org
commits+karl at tug.org
Sun Aug 31 21:20:36 CEST 2025
Revision: 76195
https://tug.org/svn/texlive?view=revision&revision=76195
Author: karl
Date: 2025-08-31 21:20:36 +0200 (Sun, 31 Aug 2025)
Log Message:
-----------
lua-uni-algos (31aug25)
Modified Paths:
--------------
trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md
trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.pdf
trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua
Added Paths:
-----------
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua
Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md
===================================================================
--- trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/README.md 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,12 +1,12 @@
# The lua-uni-algos Package
-Version: v0.4.1
+Version: v0.5
-Date: 2022-02-26
+Date: 2025-08-31
Author: Marcel Krüger
-License: LPPL v1.3
+License: LPPL v1.3c
A collection of small Lua modules implementing some of the most generic Unicode algorithms for use with LuaTeX.
This package tries to reduce duplicated work by collecting a set of small utilities which can be useful for many LuaTeX packages dealing with Unicode strings.
Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.pdf
===================================================================
(Binary files differ)
Modified: trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex
===================================================================
--- trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/doc/luatex/lua-uni-algos/lua-uni-algos.tex 2025-08-31 19:20:36 UTC (rev 76195)
@@ -2,7 +2,7 @@
\usepackage{doc, shortvrb, metalogo, hyperref, fontspec}
% \setmainfont{Noto Serif}
% \setmonofont{FreeMono}
-\title{Unicode algorithms for Lua\TeX\thanks{This document corresponds to \pkg{lua-uni-algos} v0.4.1.}}
+\title{Unicode algorithms for Lua\TeX\thanks{This document corresponds to \pkg{lua-uni-algos} v0.5.}}
\author{Marcel Krüger\thanks{E-Mail: \href{mailto:tex at 2krueger.de}{\nolinkurl{tex at 2krueger.de}}}}
\MakeShortVerb\|
\newcommand\pkg{\texttt}
@@ -17,6 +17,7 @@
\item[Unicode normalization] Normalize a given Lua string into any of the normalization forms NFC, NFD, NFKC, or NFKD as specified in the Unicode standard, section 2.12.
\item[Case folding] Fold Unicode codepoints into a form which eliminates all case distinctions. This can be used for case-independent matching of strings. Not to be confused with case mapping which maps all characters to lower/upper/titlecase: In contrast to case mapping, case folding is mostly locale independent but does not give results which should be shown to users.
\item[Grapheme cluster segmentation] Identify a grapheme cluster, a unit of text which is perceived as a single character by typical users, according to the rules in UAX \#29, section 3.
+ \item[Word boundary segmentation] Identify word boundaries according to the rules in UAX \#29, section 4.
\end{description}
\section{Normalization}
Unicode normalization is handled by the Lua module |lua-uni-normalize|.
@@ -180,4 +181,43 @@
Do not try to interpret the |state|, it has no defined values and might change at any point.
+\section{Word boundaries}
+Word segmentation is handled by the Lua module |lua-uni-words|.
+You can either load it directly with
+\begin{verbatim}
+local words = require'lua-uni-words'
+\end{verbatim}
+or if you need access to all implemented algorithms you can use
+\begin{verbatim}
+local uni_algos = require'lua-uni-algos'
+local words = uni_algos.words
+\end{verbatim}
+
+This is used to identify word boundaries. Unicode describes these as the boundaries users would expect when searching in a text for whole words.
+
+The UAX also suggests that for some use-cases useful words can be determined from these by taking any segment between word boundaries and filtering out all segments ``containing only whitespace, punctuation and similar characters''.
+
+Currently only the string interface is recommended:
+
+\begin{verbatim}
+for final, first, word_segment in words.word_boundaries'This text will be split into words segments!' do
+ print('"' .. word_segment .. '"')
+end
+\end{verbatim}
+% \begin{verbatim}
+% for final, first, word_segment in words.word_boundaries'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞' do
+% print(grapheme)
+% end
+% \end{verbatim}
+
+\noindent\begingroup
+ \ttfamily
+ \directlua{
+ local words = require'./lua-uni-words'
+ for final, first, word_segment in words.word_boundaries'This text will be split into words segments!' do
+ tex.sprint('"' .. word_segment .. '"\string\\\string\\')
+ end
+ }\par
+\endgroup
+
\end{document}
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-algos.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
-- lua-uni-algos.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
@@ -17,4 +17,5 @@
case = require'lua-uni-case',
graphemes = require'lua-uni-graphemes',
normalize = require'lua-uni-normalize',
+ words = require'lua-uni-words',
}
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-case.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
--- lua-uni-graphemes.lua
--- Copyright 2020--2022 Marcel Krüger
+-- lua-uni-case.lua
+-- Copyright 2020--2025 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,129 @@
+-- lua-uni-data-parser.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+--
+-- This is an internal module which should only be loaded by other parts of lua-uni-algos.
+-- If you want to access the data parsed here, please use lua-uni-data.
+
+local bits = 6
+
+local parse, lpeg = require'lua-uni-parse', lpeg or require'lpeg'
+local lookup_table = require'lua-uni-stage-tables'
+local to_three_stage_lookup = lookup_table.to_three_stage_lookup
+
+local function dynamic_invertible_table(default)
+ local forward, count = {}, 0
+ local backward = setmetatable({}, {__index = function(t, key)
+ local index = count + 1
+ count = index
+ forward[index], t[key] = key, index
+ return index
+ end})
+ if default ~= nil then
+ forward[0], backward[default] = default, 0
+ end
+ return forward, backward
+end
+
+local read_properties do
+ local l, p = lpeg, parse
+ local any_property = l.R('az', 'AZ', '__')^1
+
+ function read_properties(filename, default, properties_pattern)
+ local forward_mapping, inverted_mapping = dynamic_invertible_table(default)
+ properties_pattern = l.P(properties_pattern or any_property)
+ local result = p.parse_file(filename,
+ l.Cg(p.fields(p.codepoint_range, properties_pattern / inverted_mapping)) + p.ignore_line,
+ p.multiset
+ )
+ if not result then
+ return nil, string.format("Failed to parse %s", filename)
+ end
+ return result, forward_mapping
+ end
+end
+
+local category_mapping
+local general_category, ccc, decomposition_mapping, compatibility_mapping, uppercase, lowercase, titlecase do
+ local reverse_category_mapping
+ category_mapping, reverse_category_mapping = dynamic_invertible_table'Cn'
+ local function multiset(ts, key, key_range, general_category, ccc, decomp_kind, decomp_mapping, upper, lower, title)
+ key_range = key_range or key
+ for codepoint=key, key_range do
+ ts[1][codepoint], ts[2][codepoint], ts[4][codepoint], ts[5][codepoint], ts[6][codepoint], ts[7][codepoint] = general_category, ccc, decomp_mapping, upper and upper - codepoint, lower and lower - codepoint, title and title - codepoint or upper and upper - codepoint
+ if not decomp_kind then
+ ts[3][codepoint] = decomp_mapping
+ end
+ end
+ return ts
+ end
+ local l, p = lpeg, parse
+ local Cnil = l.Cc(nil)
+ local letter = l.R('AZ', 'az')
+ local codepoint_or_ud_range = p.codepoint * ( -- When scanning a range in UnicodeData.txt, we use the data from the entry of the end of the range.
+ ';<' * p.ignore_field * lpeg.B', First>' * p.ignore_line * p.codepoint
+ + l.Cc(nil)
+ )
+ local parsed = assert(p.parse_file('UnicodeData', l.Cf(
+ l.Ct(l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'' * l.Ct'') * (
+ l.Cg(p.fields(codepoint_or_ud_range,
+ p.ignore_field, -- Name (ignored)
+ l.R'AZ' * l.R'az' / reverse_category_mapping, -- General_Category
+ '0' * Cnil + p.number, -- Canonical_Combining_Class
+ p.ignore_field, -- Bidi_Class (ignored)
+ ('<' * l.C(letter^1) * '> ' + Cnil) -- Decomposition_Type
+ * (l.Ct(p.codepoint * (' ' * p.codepoint)^0) + Cnil), -- Decomposition_Mapping
+ p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+ p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+ p.ignore_field, -- Numeric_Type / Numeric_Value (ignored)
+ p.ignore_field, -- Bidi_Mirrored (ignored)
+ p.ignore_field, -- obsolete
+ p.ignore_field, -- obsolete
+ (p.codepoint + Cnil), -- Simple_Uppercase_Mapping
+ (p.codepoint + Cnil), -- Simple_Lowercase_Mapping
+ (p.codepoint + Cnil)) -- Simple_Titlecase_Mapping
+ ) + p.eol
+ )^0, multiset) * -1))
+ general_category, ccc, decomposition_mapping, compatibility_mapping, uppercase, lowercase, titlecase = unpack(parsed)
+end
+
+local grapheme_break_property, grapheme_break_mapping = assert(read_properties('GraphemeBreakProperty', 'Other'))
+local word_break_property, word_break_mapping = assert(read_properties('WordBreakProperty', 'Other'))
+
+general_category = to_three_stage_lookup(general_category, category_mapping, bits, bits, 1)
+ccc = to_three_stage_lookup(ccc, nil, bits, bits, 1)
+grapheme_break_property = to_three_stage_lookup(grapheme_break_property, grapheme_break_mapping, bits, bits, 1)
+uppercase = to_three_stage_lookup(uppercase, 'offset', bits, bits, 3)
+lowercase = to_three_stage_lookup(lowercase, 'offset', bits, bits, 3)
+titlecase = to_three_stage_lookup(titlecase, 'offset', bits, bits, 3)
+word_break_property = to_three_stage_lookup(word_break_property, word_break_mapping, bits, bits, 1)
+
+return {
+ tables = {
+ category = general_category,
+ ccc = ccc,
+ grapheme = grapheme_break_property,
+ uppercase = uppercase,
+ lowercase = lowercase,
+ titlecase = titlecase,
+ wordbreak = word_break_property,
+ },
+ misc = {
+ decomposition_mapping = decomposition_mapping,
+ compatibility_mapping = compatibility_mapping,
+ },
+}
Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-parser.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,82 @@
+-- lua-uni-data-preload.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+--
+--
+local lookup_table = require'lua-uni-stage-tables'
+local serialize = lookup_table.serialize
+local deserialize = lookup_table.deserialize
+
+-- Serialize a Lua value into a string.
+-- Not designed to be human readable but only to be reloaded into Lua with load().
+local function serialize_lua(value)
+ local t = type(value)
+ local fmt
+ if t == 'number' then
+ fmt = math.type(value) == 'integer' and '%#X' or '%A'
+ elseif t == 'string' then
+ fmt = '%q'
+ elseif t == 'boolean' or t == 'nil' then
+ fmt = '%s'
+ elseif t == 'table' then
+ local k, v
+ local entries, length = {}, 0
+ while true do
+ local last_key = k or 0
+ k, v = next(value, k)
+ if math.type(k) == 'integer' and k > last_key and k - last_key < 5 then
+ for i = last_key+1, k-1 do
+ length = length + 1
+ entries[length] = 'nil'
+ end
+ length = length + 1
+ entries[length] = serialize_lua(v)
+ else
+ break
+ end
+ end
+ while k ~= nil do
+ length = length + 1
+ entries[length] = string.format('[%s] = %s', serialize_lua(k), serialize_lua(v))
+ k, v = next(value, k)
+ end
+ fmt, value = '{%s}', table.concat(entries, ', ', 1, length)
+ elseif t == 'function' or t == 'thread' or t == 'userdata' then
+ error"Unsupported type in deserialize"
+ end
+ return string.format(fmt, value)
+end
+
+return {
+ generate_bytecode = function()
+ local data = require'lua-uni-data'
+
+ local tables = {}
+ for k, v in next, data.tables do
+ tables[#tables + 1] = string.format("[%q] = deserialize %q,", k, serialize(v))
+ end
+ return assert(load(string.format("\z
+ package.preload['lua-uni-data'] = function() \z
+ local deserialize = require'lua-uni-stage-tables'.deserialize \z
+ return { \z
+ tables = { %s }, \z
+ misc = %s, \z
+ }\z
+ end\z
+ ", table.concat(tables), serialize_lua(data.misc, 'misc')), 'preloaded_unicode_data', 't'))
+ end,
+}
Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data-preload.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,20 @@
+-- lua-uni-data.lua
+-- Copyright 2020--2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- Make data from lua-uni-data-parser externally reachable.
+--
+-- Preloaded data is provided instead if it's present.
+--
+return require'lua-uni-data-parser'
Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-data.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-graphemes.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
-- lua-uni-graphemes.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
@@ -123,7 +123,7 @@
-- The value of "state" is considered internal and should not be relied upon.
-- Just pass it to the function as is or pass nil. `nil` should only be passed when the passed codepoint starts a new cluster
-function read_codepoint(cp, state)
+local function read_codepoint(cp, state)
local new_cluster
state, new_cluster = state_map[state or 'START'](property[cp])
return new_cluster, state
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-normalize.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
-- lua-uni-normalize.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
@@ -224,7 +224,7 @@
end
return s
end
-function to_nfd_table(s, decomposition_mapping)
+local function to_nfd_table(s, decomposition_mapping)
local new_codepoints = result_table
local j = 1
for _, c in codes(s) do
Modified: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua 2025-08-31 19:20:16 UTC (rev 76194)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-parse.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -1,5 +1,5 @@
-- lua-uni-parse.lua
--- Copyright 2020--2022 Marcel Krüger
+-- Copyright 2020--2025 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
@@ -15,6 +15,9 @@
-- Just a simple helper module to make UCD parsing more readable
+-- The rawget is needed here because of Context idiosyncrasies.
+local find_file = assert(kpse and rawget(kpse, 'find_file') or resolvers and resolvers.find_file, 'No file searching library found')
+
local lpeg = lpeg or require'lpeg'
local R = lpeg.R
local tonumber = tonumber
@@ -47,7 +50,7 @@
if func then
return parse_uni_file(filename, lpeg.Cf(lpeg.Ct'' * patt^0 * -1, func), nil, ...)
end
- local resolved = kpse.find_file(filename .. '.txt')
+ local resolved = find_file(filename .. '.txt')
if not resolved then
error(string.format("Unable to find Unicode datafile %q", filename))
end
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,268 @@
+-- lua-uni-stage-tables.lua
+-- Copyright 2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- This is mainly an internal module used in lua-uni-data to efficiently store
+-- the data entries.
+
+
+-- The source is a 0-based table with integer values fitting into data_bytes bytes.
+-- The first return value is a 0-based table with values being 1-based indexes into the second table or 0.
+local function compress(compress_bits, data_bytes, signed, source, upper_limit)
+ local compress_mask = (1 << compress_bits) - 1
+ local default_lower_level = lua.newtable(compress_mask + 1, 0)
+ local pattern = (signed and '>i' or '>I') .. data_bytes
+ local zero_value = string.pack(pattern, 0)
+ for i=1, compress_mask + 1 do
+ default_lower_level[i] = zero_value
+ end
+ local first_stage = upper_limit and lua.newtable((upper_limit >> compress_bits), 1) or {}
+ for k, v in next, source do
+ if v and v ~= 0 and type(k) == 'number' then
+ local high, low = k >> compress_bits, (k & compress_mask) + 1
+ local subtable = first_stage[high]
+ if not subtable then
+ subtable = table.move(default_lower_level, 1, compress_mask + 1, 1, {})
+ first_stage[high] = subtable
+ end
+ subtable[low] = string.pack(pattern, v)
+ end
+ end
+ local default_key = table.concat(default_lower_level)
+ local second_stage_lookup = {[default_key] = 0}
+ local second_stage = {[0] = default_key}
+ for k, v in next, first_stage do
+ local key = table.concat(v)
+ local index = second_stage_lookup[key]
+ if not index then
+ index = #second_stage + 1
+ second_stage_lookup[key] = index
+ second_stage[index] = key
+ end
+ if index ~= 0 then
+ first_stage[k] = index
+ end
+ end
+ if upper_limit then
+ for i = 0, upper_limit >> compress_bits do
+ first_stage[i] = first_stage[i] or 0
+ end
+ end
+ return first_stage, second_stage
+end
+
+local function lookup_identity(value) return value end
+local function lookup_table(t) return function(value) return t[value] end end
+local function lookup_offset(value, codepoint) return codepoint + value end
+
+local readers = {
+ [true] = {
+ [1] = sio.readinteger1,
+ [2] = sio.readinteger2,
+ [3] = sio.readinteger3,
+ [4] = sio.readinteger4,
+ },
+ [false] = {
+ [1] = sio.readcardinal1,
+ [2] = sio.readcardinal2,
+ [3] = sio.readcardinal3,
+ [4] = sio.readcardinal4,
+ },
+}
+
+local function two_stage_metatable(first_stage, second_stage, lookup, bits, bytes)
+ local lookup_function, signed
+ if lookup == nil then
+ signed = false
+ lookup_function = lookup_identity
+ elseif lookup == 'offset' then
+ signed = true
+ lookup_function = lookup_offset
+ else
+ signed = false
+ lookup_function = lookup_table(lookup)
+ end
+ local reader = assert(readers[signed][bytes])
+ local block_size = 1 << bits
+ local mask = block_size - 1
+ local stride = bytes * block_size
+ return setmetatable({
+ __first_stage = first_stage,
+ __second_stage = second_stage,
+ __lookup = lookup,
+ __bytes = bytes,
+ __bits = bits,
+ }, {
+ __index = function(_, key)
+ if type(key) ~= 'number' then return nil end
+ local high, low = key >> bits, key & mask
+ local second_index = first_stage[high] or 0
+ local value
+ if second_index ~= 0 then
+ value = reader(second_stage, stride * (second_index - 1) + bytes * low + 1)
+ else
+ value = 0
+ end
+ return lookup_function(value, key)
+ end
+ })
+end
+
+local function one_stage_metatable(buffer, lookup, bytes)
+ local lookup_function, signed
+ if lookup == nil then
+ signed = false
+ lookup_function = lookup_identity
+ elseif lookup == 'offset' then
+ signed = true
+ lookup_function = lookup_offset
+ else
+ signed = false
+ lookup_function = lookup_table(lookup)
+ end
+ local reader = assert(readers[signed][bytes])
+ return setmetatable({
+ __buffer = buffer,
+ __lookup = lookup,
+ __bytes = bytes,
+ }, {
+ __index = function(_, key)
+ local value = reader(buffer, bytes * key + 1)
+ return lookup_function(value, key)
+ end
+ })
+end
+
+local function to_two_stage_lookup(source, lookup, bits, bytes)
+ local first_stage, second_stage = compress(bits, bytes, source)
+ return two_stage_metatable(first_stage, table.concat(second_stage), lookup, codepoint_block_bits, bytes)
+end
+
+local function to_three_stage_lookup(source, lookup, bits1, bits2, bytes)
+ local intermediate_stage, third_stage = compress(bits2, bytes, lookup == 'offset', source)
+ local needed_bytes = math.floor(math.log(#third_stage, 256)) + 1
+ local first_stage, second_stage = compress(bits1, needed_bytes, false, intermediate_stage, (0x10FFFF >> bits2) + 1)
+ return two_stage_metatable(two_stage_metatable(first_stage, table.concat(second_stage), nil, bits1, needed_bytes), table.concat(third_stage), lookup, bits2, bytes)
+end
+
+-- Cache file format (everything in little endian):
+-- Header:
+-- u32: Version (0x00010000)
+-- u8: Stages (supported: 2 or 3)
+-- u8: Kind (enum: 0x00 Identity table, 0x01 Delta encoding (signed), 0x02 Mapping table)
+-- Mapping table: (only present if kind == 0x02
+-- <Data size in bytes>: Entry count without default
+-- <u8 size prefixed string>: Default
+-- <Entry count> * <u8 size prefixed string>: Entries
+-- Outer table:
+-- <u32> Number of entries
+-- <u8> Size of one entry in bytes
+-- <data>
+-- Inner tables: (repeated `stages - 1` times)
+-- <previous entry size bytes> Number of entries
+-- <u8> Size of underlying value in bytes
+-- <u8> Bits representing page size (each entry is 2^bits * bytes fields big)
+-- <data>
+--
+local function serialize(nested_table)
+ local buffer = {}
+ local nesting_depth do
+ local current_table = nested_table
+ nesting_depth = 0
+ repeat
+ nesting_depth = nesting_depth + 1
+ current_table = current_table.__first_stage
+ until current_table == nil
+ end
+ local lookup = nested_table.__lookup
+ local kind = lookup == nil and 0 or lookup == 'offset' and 1 or 2
+ buffer[1] = string.pack('>I4BB', 0x00010000, nesting_depth, kind)
+ if kind == 2 then
+ buffer[2] = string.pack('>I4', #lookup)
+ for i=0, #lookup do
+ buffer[3 + i] = string.pack('s1', lookup[i])
+ end
+ end
+ local function serialize_stage(stage, prev)
+ local inner_stage = stage.__first_stage
+ if inner_stage then
+ local nested_bytes = serialize_stage(inner_stage, stage)
+ buffer[#buffer + 1] = string.pack(string.format('>I%sBB', nested_bytes), #stage.__second_stage // (stage.__bytes * (1 << stage.__bits)), stage.__bytes, stage.__bits)
+ buffer[#buffer + 1] = stage.__second_stage
+ return stage.__bytes
+ else
+ local index_bytes = math.floor(math.log(#prev.__second_stage // ((1 << prev.__bits) * prev.__bytes), 256)) + 1
+ buffer[#buffer + 1] = string.pack('>I4B', #stage + 1, index_bytes)
+ buffer[#buffer + 1] = string.pack('>' .. string.rep('I' .. index_bytes, #stage + 1), table.unpack(stage, 0))
+ return index_bytes
+ end
+ end
+ serialize_stage(nested_table)
+ return table.concat(buffer)
+end
+
+local function deserialize(data)
+ local offset = 1
+ local version, nesting_depth, kind
+ version, nesting_depth, kind, offset = string.unpack('>I4BB', data, offset)
+ if version ~= 0x00010000 then error'Invalid version' end
+ local lookup
+ if kind == 0 then
+ lookup = nil
+ elseif kind == 1 then
+ lookup = 'offset'
+ elseif kind == 2 then
+ -- TODO
+ local lookup_count
+ lookup_count, offset = string.unpack('>I4', data, offset)
+ lookup = lua.newtable(lookup_count, 1)
+ for i=0, lookup_count do
+ lookup[i], offset = string.unpack('s1', data, offset)
+ end
+ else
+ error'Unsupported type'
+ end
+
+ local nested_table, previous_bytes
+ do
+ local entries, bytes
+ entries, bytes, offset = string.unpack('>I4B', data, offset)
+ local stage_size = entries * bytes
+ local stage = data:sub(offset, offset + stage_size - 1)
+ offset = offset + stage_size
+ nested_table = one_stage_metatable(stage, level == nesting_depth and lookup or nil, bytes)
+ previous_bytes = bytes
+ end
+
+ for level = 2, nesting_depth do
+ local entries, bytes, bits
+ entries, bytes, bits, offset = string.unpack(string.format('>I%sBB', previous_bytes), data, offset) --, #stage.__second_stage // (stage.__bytes * (1 << stage.__bits)), stage.__bytes, stage.__bits)
+ local stage_size = entries * bytes * (1 << bits)
+ local stage = data:sub(offset, offset + stage_size - 1)
+ offset = offset + stage_size
+ nested_table = two_stage_metatable(nested_table, stage, level == nesting_depth and lookup or nil, bits, bytes)
+ previous_bytes = bytes
+ end
+
+ assert(offset == #data + 1)
+ return nested_table
+end
+
+return {
+ compress = compress,
+ to_two_stage_lookup = to_two_stage_lookup,
+ to_three_stage_lookup = to_three_stage_lookup,
+ serialize = serialize,
+ deserialize = deserialize,
+}
Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-stage-tables.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua
===================================================================
--- trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua (rev 0)
+++ trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua 2025-08-31 19:20:36 UTC (rev 76195)
@@ -0,0 +1,315 @@
+-- lua-uni-words.lua
+-- Copyright 2025 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+-- Load the Unicode data needed for word segmentation via lua-uni-parse:
+--  * extended_pictographic: lookup of codepoints carrying the
+--    Extended_Pictographic flag (parsed from emoji-data); used by
+--    read_codepoint below for the ZWJ rule (WB3c).
+--  * property: maps a codepoint to its Word_Break property name
+--    (parsed from WordBreakProperty).
+-- NOTE(review): the exact table shape is whatever p.multiset builds —
+-- confirm against lua-uni-parse before relying on it.
+local extended_pictographic, property do
+  local p = require'lua-uni-parse'
+  local l = lpeg or require'lpeg'
+
+  extended_pictographic = p.parse_file('emoji-data',
+    l.Cg(p.fields(p.codepoint_range, 'Extended_Pictographic' * l.Cc(true))) + p.ignore_line,
+    p.multiset)
+  if not extended_pictographic then
+    error[[Break Property matching failed]]
+  end
+
+  property = p.parse_file('WordBreakProperty',
+    l.Cg(p.fields(p.codepoint_range, l.C(l.R('az', 'AZ', '__')^1))) + p.ignore_line,
+    p.multiset)
+  if not property then
+    error[[Break Property matching failed]]
+  end
+end
+
+-- Properties that are transparent for word breaking (UAX #29 rule WB4).
+local ignorable = { Extend = true, Format = true, ZWJ = true, }
+-- Newline-like properties that always force a break (WB3a/WB3b).
+local controls = { CR = true, LF = true, Newline = true, }
+
+-- Lookahead resolver for WB6/WB7: after AHLetter + MidLetter/MidNumLetQ the
+-- break decision depends on the next significant codepoint.  Returns the
+-- usual (new_word, state) pair; stays pending across ignorable codepoints.
+local function context_AHLetter_Mid(cp)
+  local prop = property[cp]
+  if ignorable[prop] then
+    -- Still undecided: keep this resolver as the state.
+    return nil, context_AHLetter_Mid
+  elseif prop == 'ALetter' then
+    return false, 'ASTARTED'
+  elseif prop == 'Hebrew_Letter' then
+    return false, 'HSTARTED'
+  else
+    -- No letter follows: the pending mid character starts a new segment.
+    return true, 'PRE'
+  end
+end
+
+-- Lookahead resolver for WB7b/WB7c: Hebrew_Letter + Double_Quote only binds
+-- when another Hebrew_Letter follows.
+local function context_HLetter_Double(cp)
+  local prop = property[cp]
+  if ignorable[prop] then
+    -- Still undecided: keep this resolver as the state.
+    return nil, context_HLetter_Double
+  elseif prop == 'Hebrew_Letter' then
+    return false, 'HSTARTED'
+  else
+    return true, 'PRE'
+  end
+end
+
+-- Lookahead resolver for WB11/WB12: Numeric + MidNum/MidNumLetQ only binds
+-- when another Numeric follows.
+local function context_Numeric_Mid(cp)
+  local prop = property[cp]
+  if ignorable[prop] then
+    -- Still undecided: keep this resolver as the state.
+    return nil, context_Numeric_Mid
+  elseif prop == 'Numeric' then
+    return false, 'NSTARTED'
+  else
+    return true, 'PRE'
+  end
+end
+
+-- The word-break state machine (UAX #29 word boundary rules).  Every state is
+-- a function taking the Word_Break property of the next codepoint and
+-- returning the successor state plus a boolean: true if a word boundary
+-- precedes that codepoint, false otherwise.  A context_* function (above) is
+-- returned instead of a state name when the decision needs lookahead.
+-- Declared first so the entries can reference the table recursively.
+local state_map state_map = {
+  -- Initial state / state after a mandatory break.
+  START = function(prop)
+    if prop == 'CR' then
+      return 'CR', true
+    end
+    if prop == 'LF' or prop == 'Newline' then
+      return 'START', true
+    end
+    -- WB3b: always break after a newline; reuse PRE for the state but force
+    -- the boundary (PRE's own boolean is truncated away here).
+    return state_map.PRE(prop), true
+  end,
+  -- Generic "previous codepoint committed" state.
+  PRE = function(prop)
+    if controls[prop] then
+      return state_map.START(prop)
+    end
+    if ignorable[prop] then
+      -- WB4: Extend/Format/ZWJ never open a new segment.
+      return 'PRE', false
+    end
+    if prop == 'WSegSpace' then
+      return 'WHITE', true
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', true
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', true
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', true
+    end
+    if prop == 'Katakana' then
+      return 'KSTARTED', true
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', true
+    end
+    if prop == 'Regional_Indicator' then
+      return 'RI', true
+    end
+    -- WB999: everything else breaks on both sides.
+    return 'PRE', true
+  end,
+  -- WB3: CR x LF stays together.
+  CR = function(prop)
+    if prop == 'LF' then
+      return 'START', false
+    else
+      return state_map.START(prop)
+    end
+  end,
+  -- WB3d: keep horizontal whitespace together.
+  WHITE = function(prop)
+    if prop == 'WSegSpace' then
+      return 'WHITE', false
+    else
+      return state_map.PRE(prop)
+    end
+  end,
+  -- After ExtendNumLet (WB13a/WB13b).
+  EXTEND = function(prop)
+    if ignorable[prop] then
+      return 'EXTEND', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Katakana' then
+      return 'KSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After Katakana.
+  KSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'KSTARTED', false
+    end
+    if prop == 'Katakana' then
+      -- WB13: Katakana x Katakana.  BUGFIX: this used to return the state
+      -- name 'Katakana', which is not a key of state_map, so the next
+      -- codepoint crashed with an attempt to call nil.
+      return 'KSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After an odd Regional_Indicator (WB15/WB16: RI pair up, then break).
+  RI = function(prop)
+    if ignorable[prop] then
+      return 'RI', false
+    end
+    if prop == 'Regional_Indicator' then
+      -- Pair completed; a third RI must start a new segment, hence 'PRE'.
+      return 'PRE', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After ALetter (WB5, WB9, WB13a, WB6/WB7 via lookahead).
+  ASTARTED = function(prop)
+    if ignorable[prop] then
+      return 'ASTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'MidLetter' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+      return context_AHLetter_Mid
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After Hebrew_Letter (WB5, WB7a, WB7b/WB7c and WB6/WB7 via lookahead).
+  HSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'HSTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'Single_Quote' then
+      -- WB7a: Hebrew_Letter x Single_Quote, unconditionally.
+      return 'HSINGLE_QUOTE', false
+    end
+    if prop == 'MidLetter' or prop == 'MidNumLet' then
+      return context_AHLetter_Mid
+    end
+    if prop == 'Double_Quote' then
+      return context_HLetter_Double
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After Hebrew_Letter + Single_Quote (WB7 continuation).
+  HSINGLE_QUOTE = function(prop)
+    if ignorable[prop] then
+      return 'HSINGLE_QUOTE', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    return state_map.PRE(prop)
+  end,
+  -- After Numeric (WB8, WB10, WB13a, WB11/WB12 via lookahead).
+  NSTARTED = function(prop)
+    if ignorable[prop] then
+      return 'NSTARTED', false
+    end
+    if prop == 'ALetter' then
+      return 'ASTARTED', false
+    end
+    if prop == 'Hebrew_Letter' then
+      return 'HSTARTED', false
+    end
+    if prop == 'Numeric' then
+      return 'NSTARTED', false
+    end
+    if prop == 'ExtendNumLet' then
+      return 'EXTEND', false
+    end
+    if prop == 'MidNum' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+      return context_Numeric_Mid
+    end
+    return state_map.PRE(prop)
+  end,
+}
+
+-- For every state S build a shadow state ZWJ_S recording that the previous
+-- codepoint was a ZWJ; from_ZWJ maps back, to_ZWJ maps forward.
+local from_ZWJ, to_ZWJ = {}, {}
+for state_name in pairs(state_map) do
+  local zwj_state = 'ZWJ_' .. state_name
+  from_ZWJ[zwj_state] = state_name
+  to_ZWJ[state_name] = zwj_state
+end
+
+
+-- The value of "state" is considered internal and should not be relied upon.
+-- Pass it back unchanged on the next call, or pass nil; `nil` should only be
+-- passed when the passed codepoint starts a new cluster.
+-- Returns new_word, state where new_word is true (a boundary precedes cp),
+-- false (no boundary), or nil (undecided -- the state is then a lookahead
+-- function; see word_boundaries_start for how it gets resolved).
+-- NOTE(review): a function-valued state must not be fed back into
+-- read_codepoint itself (state_map[fn] is nil) -- confirm intended API.
+local function read_codepoint(cp, state)
+  -- ZWJ_* states mean the previous codepoint was a ZWJ (see WB3c).
+  local mapped_state = from_ZWJ[state]
+  local new_word
+  local prop = property[cp]
+  state, new_word = state_map[mapped_state or state or 'START'](prop)
+  -- WB3c: ZWJ x Extended_Pictographic never breaks, overriding whatever the
+  -- state machine decided.
+  if mapped_state and extended_pictographic[cp] then
+    new_word = false
+  end
+  if prop == 'ZWJ' then
+    state = to_ZWJ[state]
+  end
+  return new_word, state
+end
+
+-- A Lua iterator for strings -- only reporting the byte index (and codepoint)
+-- at the beginning of every word segment.
+local function word_boundaries_start(str)
+  local nextcode, str, i = utf8.codes(str)
+  local state = "START"
+  -- While a break decision needs lookahead (read_codepoint returned nil),
+  -- the pending character's position/codepoint are parked here and `state`
+  -- holds a context_* resolver function instead of a state name.
+  local saved_i, saved_code
+  return function()
+    local new_word, code
+    repeat
+      i, code = nextcode(str, i)
+      if saved_i then
+        -- Feed the lookahead codepoint to the resolver.  A non-nil result
+        -- decides the pending character: rewind to it so it is reported now
+        -- and the lookahead codepoint is re-read on the next round.
+        -- NOTE(review): this path never re-applies the WB3c ZWJ/pictographic
+        -- override from read_codepoint -- confirm against UAX #29.
+        new_word, state = state(code)
+        if new_word ~= nil then
+          i, code, saved_i, saved_code = saved_i, saved_code, nil, nil
+        end
+      else
+        if not i then return end
+        new_word, state = read_codepoint(code, state)
+        if new_word == nil then
+          -- Undecided: park this codepoint and keep scanning ahead.
+          saved_i, saved_code = i, code
+        end
+      end
+    until new_word
+    return i, code
+  end
+end
+-- A more useful iterator: for every segment it yields the byte index of the
+-- segment's last byte, the byte index of its first byte, and the segment
+-- itself as a string (end index first so it can act as the control variable).
+local function word_boundaries(str)
+  local iter = word_boundaries_start(str)
+  -- BUGFIX: an empty string yields no segment at all, so `iter()` returns
+  -- nil and the former unconditional `iter() - 1` raised an arithmetic
+  -- error; return an iterator that immediately terminates instead.
+  local first = iter()
+  if not first then
+    return function() end
+  end
+  return function(_, cur)
+    if cur == #str then return end
+    local new = iter()
+    -- Last segment: runs to the end of the string.
+    if not new then return #str, cur + 1, str:sub(cur + 1) end
+    return new - 1, cur + 1, str:sub(cur + 1, new - 1)
+  end, nil, first - 1
+end
+-- Public interface of lua-uni-words.
+return {
+  read_codepoint = read_codepoint,
+  -- BUGFIX: the original read the undefined global `word_bounaries_start`
+  -- (typo), silently exporting nil for this entry.
+  word_boundaries_start = word_boundaries_start,
+  word_boundaries = word_boundaries,
+}
Property changes on: trunk/Master/texmf-dist/tex/luatex/lua-uni-algos/lua-uni-words.lua
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
More information about the tex-live-commits
mailing list.