[latex3-commits] [git/LaTeX3-latex3-lua-uni-algos] main: Add and document string interface for word segmentation (e3ab98b)
Marcel Fabian Krüger
tex at 2krueger.de
Sat Jan 28 13:10:31 CET 2023
Repository : https://github.com/latex3/lua-uni-algos
On branch : main
Link : https://github.com/latex3/lua-uni-algos/commit/e3ab98bea5d7eb112bf9393df7d2fe9287a21b1e
>---------------------------------------------------------------
commit e3ab98bea5d7eb112bf9393df7d2fe9287a21b1e
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date: Sat Jan 28 13:10:02 2023 +0100
Add and document string interface for word segmentation
>---------------------------------------------------------------
e3ab98bea5d7eb112bf9393df7d2fe9287a21b1e
lua-uni-algos.lua | 1 +
lua-uni-algos.tex | 40 ++++++++++++++++++++++++++++++++++++++++
lua-uni-words.lua | 37 +++++++++++++++++++++++++++++++++++++
3 files changed, 78 insertions(+)
diff --git a/lua-uni-algos.lua b/lua-uni-algos.lua
index d965fc2..89c2a7d 100644
--- a/lua-uni-algos.lua
+++ b/lua-uni-algos.lua
@@ -17,4 +17,5 @@ return {
case = require'lua-uni-case',
graphemes = require'lua-uni-graphemes',
normalize = require'lua-uni-normalize',
+ words = require'lua-uni-words',
}
diff --git a/lua-uni-algos.tex b/lua-uni-algos.tex
index 2b290c7..de943ad 100644
--- a/lua-uni-algos.tex
+++ b/lua-uni-algos.tex
@@ -17,6 +17,7 @@ Currently this package implements:
\item[Unicode normalization] Normalize a given Lua string into any of the normalization forms NFC, NFD, NFKC, or NFKD as specified in the Unicode standard, section 2.12.
\item[Case folding] Fold Unicode codepoints into a form which eliminates all case distinctions. This can be used for case-independent matching of strings. Not to be confused with case mapping which maps all characters to lower/upper/titlecase: In contrast to case mapping, case folding is mostly locale independent but does not give results which should be shown to users.
\item[Grapheme cluster segmentation] Identify a grapheme cluster, a unit of text which is perceived as a single character by typical users, according to the rules in UAX \#29, section 3.
+ \item[Word boundary segmentation] Identify word boundaries according to the rules in UAX \#29, section 4.
\end{description}
\section{Normalization}
Unicode normalization is handled by the Lua module |lua-uni-normalize|.
@@ -180,4 +181,43 @@ print(new_cluster)
Do not try to interpret the |state|; it has no defined values and might change at any point.
+\section{Word boundaries}
+Word segmentation is handled by the Lua module |lua-uni-words|.
+You can either load it directly with
+\begin{verbatim}
+local words = require'lua-uni-words'
+\end{verbatim}
+or if you need access to all implemented algorithms you can use
+\begin{verbatim}
+local uni_algos = require'lua-uni-algos'
+local words = uni_algos.words
+\end{verbatim}
+
+This is used to identify word boundaries. Unicode describes these as the boundaries users would expect when searching a text for whole words.
+
+The UAX also suggests that for some use cases, useful words can be derived from these boundaries by taking every segment between word boundaries and filtering out all segments ``containing only whitespace, punctuation and similar characters''.
+
+Currently only the string interface is recommended:
+
+\begin{verbatim}
+for final, first, word_segment in words.word_boundaries'This text will be split into word segments!' do
+ print('"' .. word_segment .. '"')
+end
+\end{verbatim}
+% \begin{verbatim}
+% for final, first, word_segment in words.word_boundaries'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞' do
+% print(word_segment)
+% end
+% \end{verbatim}
+
+\noindent\begingroup
+ \ttfamily
+ \directlua{
+ local words = require'./lua-uni-words'
+ for final, first, word_segment in words.word_boundaries'This text will be split into word segments!' do
+ tex.sprint('"' .. word_segment .. '"\string\\\string\\')
+ end
+ }\par
+\endgroup
+
\end{document}
diff --git a/lua-uni-words.lua b/lua-uni-words.lua
index cc9dbaf..0d86f03 100644
--- a/lua-uni-words.lua
+++ b/lua-uni-words.lua
@@ -273,6 +273,43 @@ local function read_codepoint(cp, state)
return new_word, state
end
+-- A Lua iterator over strings, reporting only the beginning of every word segment
+local function word_boundaries_start(str)
+ local nextcode, str, i = utf8.codes(str)
+ local state = "START"
+ local saved_i, saved_code
+ return function()
+ local new_word, code
+ repeat
+ i, code = nextcode(str, i)
+ if saved_i then
+ new_word, state = state(code)
+ if new_word ~= nil then
+ i, code, saved_i, saved_code = saved_i, saved_code, nil, nil
+ end
+ else
+ if not i then return end
+ new_word, state = read_codepoint(code, state)
+ if new_word == nil then
+ saved_i, saved_code = i, code
+ end
+ end
+ until new_word
+ return i, code
+ end
+end
+-- A more useful iterator: returns the byte range of the segment in reverse order (end index, then start index), followed by the segment as a string
+local function word_boundaries(str)
+ local iter = word_boundaries_start(str)
+ return function(_, cur)
+ if cur == #str then return end
+ local new = iter()
+ if not new then return #str, cur + 1, str:sub(cur + 1) end
+ return new - 1, cur + 1, str:sub(cur + 1, new - 1)
+ end, nil, iter() - 1
+end
return {
read_codepoint = read_codepoint,
+ word_boundaries_start = word_boundaries_start,
+ word_boundaries = word_boundaries,
}
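The word_boundaries wrapper in the hunk above converts a start-index iterator into one yielding (final, first, segment) triples. The following self-contained Lua sketch is an editorial illustration of that same index bookkeeping, not part of the commit: it substitutes a hypothetical toy boundary rule (start a new segment after every ASCII space) for the real UAX #29 state machine driven by read_codepoint.

```lua
-- Toy "segment start" iterator: first call yields byte index 1, then
-- the byte index after each ASCII space. Purely illustrative; the real
-- module derives these positions from the UAX #29 word-boundary rules.
local function toy_segment_starts(str)
  local first, pos = true, 1
  return function()
    if first then first = false; return 1 end
    local sp = str:find(' ', pos, true)   -- next space, plain search
    if not sp then return nil end
    pos = sp + 1
    return pos                            -- start index of next segment
  end
end

-- Same wrapping pattern as word_boundaries: track the previous segment's
-- end index as the for-loop control variable and slice the string.
local function toy_word_boundaries(str)
  local iter = toy_segment_starts(str)
  return function(_, cur)
    if cur == #str then return end        -- past the end of the string
    local new = iter()
    if not new then                       -- last segment: rest of string
      return #str, cur + 1, str:sub(cur + 1)
    end
    return new - 1, cur + 1, str:sub(cur + 1, new - 1)
  end, nil, iter() - 1                    -- initial "previous end" index
end

for final, first, segment in toy_word_boundaries('a b c') do
  print(first, final, segment)           -- segments: "a ", "b ", "c"
end
```

Note that the first call to iter() happens while building the initial control value, which is why the start-only iterator must yield index 1 on its first invocation.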