[latex3-commits] [git/LaTeX3-latex3-lua-uni-algos] main: Word segmentation (fab9d72)
Marcel Fabian Krüger
tex at 2krueger.de
Sat Jan 28 00:50:17 CET 2023
Repository : https://github.com/latex3/lua-uni-algos
On branch : main
Link : https://github.com/latex3/lua-uni-algos/commit/fab9d727fe5a8ab3f0d79f79065b635f0172836b
>---------------------------------------------------------------
commit fab9d727fe5a8ab3f0d79f79065b635f0172836b
Author: Marcel Fabian Krüger <tex at 2krueger.de>
Date: Fri Jan 27 23:38:02 2023 +0100
Word segmentation
>---------------------------------------------------------------
fab9d727fe5a8ab3f0d79f79065b635f0172836b
lua-uni-graphemes.lua | 2 +-
lua-uni-words.lua | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 279 insertions(+), 1 deletion(-)
diff --git a/lua-uni-graphemes.lua b/lua-uni-graphemes.lua
index 73ea800..e268118 100644
--- a/lua-uni-graphemes.lua
+++ b/lua-uni-graphemes.lua
@@ -123,7 +123,7 @@ local state_map state_map = {
-- The value of "state" is considered internal and should not be relied upon.
-- Just pass it to the function as is or pass nil. `nil` should only be passed when the passed codepoint starts a new cluster
-function read_codepoint(cp, state)
+local function read_codepoint(cp, state)
local new_cluster
state, new_cluster = state_map[state or 'START'](property[cp])
return new_cluster, state
diff --git a/lua-uni-words.lua b/lua-uni-words.lua
new file mode 100644
index 0000000..cc9dbaf
--- /dev/null
+++ b/lua-uni-words.lua
@@ -0,0 +1,278 @@
+-- lua-uni-words.lua
+-- Copyright 2020--2023 Marcel Krüger
+--
+-- This work may be distributed and/or modified under the
+-- conditions of the LaTeX Project Public License, either version 1.3
+-- of this license or (at your option) any later version.
+-- The latest version of this license is in
+-- http://www.latex-project.org/lppl.txt
+-- and version 1.3 or later is part of all distributions of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This work has the LPPL maintenance status `maintained'.
+--
+-- The Current Maintainer of this work is Marcel Krüger
+
+local extended_pictographic, property do
+ local p = require'lua-uni-parse'
+ local l = lpeg or require'lpeg'
+
+ extended_pictographic = p.parse_file('emoji-data',
+ l.Cg(p.fields(p.codepoint_range, 'Extended_Pictographic' * l.Cc(true))) + p.ignore_line,
+ p.multiset)
+ if not extended_pictographic then
+ error[[Break Property matching failed]]
+ end
+
+ property = p.parse_file('WordBreakProperty',
+ l.Cg(p.fields(p.codepoint_range, l.C(l.R('az', 'AZ', '__')^1))) + p.ignore_line,
+ p.multiset)
+ if not property then
+ error[[Break Property matching failed]]
+ end
+end
+
+local ignorable = { Extend = true, Format = true, ZWJ = true, }
+local controls = { CR = true, LF = true, Newline = true, }
+
+local function context_AHLetter_Mid(cp)
+ local prop = property[cp]
+ if ignorable[prop] then
+ return nil, context_AHLetter_Mid
+ end
+ if prop == 'ALetter' then
+ return false, 'ASTARTED'
+ end
+ if prop == 'Hebrew_Letter' then
+ return false, 'HSTARTED'
+ end
+ return true, 'PRE'
+end
+
+local function context_HLetter_Double(cp)
+ local prop = property[cp]
+ if ignorable[prop] then
+ return nil, context_HLetter_Double
+ end
+ if prop == 'Hebrew_Letter' then
+ return false, 'HSTARTED'
+ end
+ return true, 'PRE'
+end
+
+local function context_Numeric_Mid(cp)
+ local prop = property[cp]
+ if ignorable[prop] then
+ return nil, context_Numeric_Mid
+ end
+ if prop == 'Numeric' then
+ return false, 'NSTARTED'
+ end
+ return true, 'PRE'
+end
+
+local state_map state_map = {
+ START = function(prop)
+ if prop == 'CR' then
+ return 'CR', true
+ end
+ if prop == 'LF' or prop == 'Newline' then
+ return 'START', true
+ end
+ return state_map.PRE(prop), true
+ end,
+ PRE = function(prop)
+ if controls[prop] then
+ return state_map.START(prop)
+ end
+ if ignorable[prop] then
+ return 'PRE', false
+ end
+ if prop == 'WSegSpace' then
+ return 'WHITE', true
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', true
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', true
+ end
+ if prop == 'Numeric' then
+ return 'NSTARTED', true
+ end
+ if prop == 'Katakana' then
+ return 'KSTARTED', true
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', true
+ end
+ if prop == 'Regional_Indicator' then
+ return 'RI', true
+ end
+ return 'PRE', true
+ end,
+ CR = function(prop)
+ if prop == 'LF' then
+ return 'START', false
+ else
+ return state_map.START(prop)
+ end
+ end,
+ WHITE = function(prop)
+ if prop == 'WSegSpace' then
+ return 'WHITE', false
+ else
+ return state_map.PRE(prop)
+ end
+ end,
+ EXTEND = function(prop)
+ if ignorable[prop] then
+ return 'EXTEND', false
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', false
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', false
+ end
+ if prop == 'Katakana' then
+ return 'KSTARTED', false
+ end
+ if prop == 'Numeric' then
+ return 'NSTARTED', false
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', false
+ end
+ return state_map.PRE(prop)
+ end,
+ KSTARTED = function(prop)
+ if ignorable[prop] then
+ return 'KSTARTED', false
+ end
+ if prop == 'Katakana' then
+ return 'Katakana', false
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', false
+ end
+ return state_map.PRE(prop)
+ end,
+ RI = function(prop)
+ if ignorable[prop] then
+ return 'RI', false
+ end
+ if prop == 'Regional_Indicator' then
+ return 'PRE', false
+ end
+ return state_map.PRE(prop)
+ end,
+ ASTARTED = function(prop)
+ if ignorable[prop] then
+ return 'ASTARTED', false
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', false
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', false
+ end
+ if prop == 'Numeric' then
+ return 'NSTARTED', false
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', false
+ end
+ if prop == 'MidLetter' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+ return context_AHLetter_Mid
+ end
+ return state_map.PRE(prop)
+ end,
+ HSTARTED = function(prop)
+ if ignorable[prop] then
+ return 'HSTARTED', false
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', false
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', false
+ end
+ if prop == 'Numeric' then
+ return 'NSTARTED', false
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', false
+ end
+ if prop == 'Single_Quote' then
+ return 'HSINGLE_QUOTE', false
+ end
+ if prop == 'MidLetter' or prop == 'MidNumLet' then
+ return context_AHLetter_Mid
+ end
+ if prop == 'Double_Quote' then
+ return context_HLetter_Double
+ end
+ return state_map.PRE(prop)
+ end,
+ HSINGLE_QUOTE = function(prop)
+ if ignorable[prop] then
+ return 'HSINGLE_QUOTE', false
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', false
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', false
+ end
+ return state_map.PRE(prop)
+ end,
+ NSTARTED = function(prop)
+ if ignorable[prop] then
+ return 'NSTARTED', false
+ end
+ if prop == 'ALetter' then
+ return 'ASTARTED', false
+ end
+ if prop == 'Hebrew_Letter' then
+ return 'HSTARTED', false
+ end
+ if prop == 'Numeric' then
+ return 'NSTARTED', false
+ end
+ if prop == 'ExtendNumLet' then
+ return 'EXTEND', false
+ end
+ if prop == 'MidNum' or prop == 'MidNumLet' or prop == 'Single_Quote' then
+ return context_Numeric_Mid
+ end
+ return state_map.PRE(prop)
+ end,
+}
+
+local from_ZWJ, to_ZWJ = {}, {}
+for k in next, state_map do
+ local zwj_state = 'ZWJ_' .. k
+ from_ZWJ[zwj_state], to_ZWJ[k] = k, zwj_state
+end
+
+
+-- The value of "state" is considered internal and should not be relied upon.
+-- Just pass it to the function as is or pass nil. `nil` should only be passed when the passed codepoint starts a new cluster
+local function read_codepoint(cp, state)
+ local mapped_state = from_ZWJ[state]
+ local new_word
+ local prop = property[cp]
+ state, new_word = state_map[mapped_state or state or 'START'](prop)
+ if mapped_state and extended_pictographic[cp] then
+ new_word = false
+ end
+ if prop == 'ZWJ' then
+ state = to_ZWJ[state]
+ end
+ return new_word, state
+end
+
+return {
+ read_codepoint = read_codepoint,
+}
More information about the latex3-commits
mailing list.