[latex3-commits] [git/LaTeX3-latex3-latex3] text-map: Add \text_map_... functions (c7fdb58d2)
Joseph Wright
joseph.wright at morningstar2.co.uk
Fri Aug 5 16:21:32 CEST 2022
Repository : https://github.com/latex3/latex3
On branch : text-map
Link : https://github.com/latex3/latex3/commit/c7fdb58d225536dbe06ad885ca8a5499bc5784b7
>---------------------------------------------------------------
commit c7fdb58d225536dbe06ad885ca8a5499bc5784b7
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Fri Aug 5 15:21:03 2022 +0100
Add \text_map_... functions
Currently the regional indicator rule is not implemented:
I am not clear on the exact requirement.
>---------------------------------------------------------------
c7fdb58d225536dbe06ad885ca8a5499bc5784b7
l3kernel/CHANGELOG.md | 2 +
l3kernel/doc/source3body.tex | 1 +
l3kernel/l3.ins | 1 +
l3kernel/l3text-map.dtx | 623 +++++++++++++++++++++
l3kernel/l3text.dtx | 49 ++
l3kernel/l3unicode.dtx | 48 ++
.../{m3token001.tlg => m3text006.luatex.tlg} | 6 +-
.../m3color005.lvt => testfiles/m3text006.lvt} | 22 +-
.../{m3text005.ptex.tlg => m3text006.tlg} | 13 +-
.../{m3token001.tlg => m3text006.xetex.tlg} | 6 +-
10 files changed, 743 insertions(+), 28 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 9fcaa43d5..50cc62d5f 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,8 @@ this project uses date-based 'snapshot' version identifiers.
- Support for case changing Croatian digraph with 8-bit engines
- Support accent removal when uppercasing Greek with 8-bit engines
- Function `\sys_ensure_backend:`
+- `\text_map_function:nN` and `\text_map_inline:nn` for mapping to
+ graphemes in textual input
### Fixed
- Behavior of `\color_math:nn` in alignments
diff --git a/l3kernel/doc/source3body.tex b/l3kernel/doc/source3body.tex
index 20e187536..c9b0f3448 100644
--- a/l3kernel/doc/source3body.tex
+++ b/l3kernel/doc/source3body.tex
@@ -589,6 +589,7 @@ used on top of \LaTeXe{} if \cs{outer} tokens are used in the arguments.
\clist_gput_right:Nn \g_docinput_clist
{
l3text-case.dtx ,
+ l3text-map.dtx ,
l3text-purify.dtx
}
\ExplSyntaxOff
diff --git a/l3kernel/l3.ins b/l3kernel/l3.ins
index fbcc815d9..08949c097 100644
--- a/l3kernel/l3.ins
+++ b/l3kernel/l3.ins
@@ -106,6 +106,7 @@ and all files in that bundle must be distributed together.
\from{l3unicode.dtx} {package}
\from{l3text.dtx} {package}
\from{l3text-case.dtx} {package}
+ \from{l3text-map.dtx} {package}
\from{l3text-purify.dtx}{package}
\from{l3candidates.dtx} {package}
\from{l3legacy.dtx} {package}
diff --git a/l3kernel/l3text-map.dtx b/l3kernel/l3text-map.dtx
new file mode 100644
index 000000000..22ed5aae6
--- /dev/null
+++ b/l3kernel/l3text-map.dtx
@@ -0,0 +1,623 @@
+% \iffalse meta-comment
+%
+%% File: l3text-map.dtx
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+% It may be distributed and/or modified under the conditions of the
+% LaTeX Project Public License (LPPL), either version 1.3c of this
+% license or (at your option) any later version. The latest version
+% of this license is in the file
+%
+% https://www.latex-project.org/lppl.txt
+%
+% This file is part of the "l3kernel bundle" (The Work in LPPL)
+% and all files in that bundle must be distributed together.
+%
+% -----------------------------------------------------------------------
+%
+% The development version of the bundle can be found at
+%
+% https://github.com/latex3/latex3
+%
+% for those people who are interested.
+%
+%<*driver>
+\documentclass[full,kernel]{l3doc}
+\begin{document}
+ \DocInput{\jobname.dtx}
+\end{document}
+%</driver>
+% \fi
+%
+% \title{^^A
+% The \textsf{l3text-map} package: text processing (mapping)^^A
+% }
+%
+% \author{^^A
+% The \LaTeX{} Project\thanks
+% {^^A
+% E-mail:
+% \href{mailto:latex-team at latex-project.org}
+% {latex-team at latex-project.org}^^A
+% }^^A
+% }
+%
+% \date{Released 2022-07-15}
+%
+% \maketitle
+%
+% \begin{documentation}
+%
+% \end{documentation}
+%
+% \begin{implementation}
+%
+% \section{\pkg{l3text-map} implementation}
+%
+% \begin{macrocode}
+%<*package>
+% \end{macrocode}
+%
+% \begin{macrocode}
+%<@@=text>
+% \end{macrocode}
+%
+% \subsection{Mapping to text}
+%
+% \begin{macro}[EXP]{\text_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_loop:Nnw}
+% \begin{macro}[EXP]{\@@_map_group:Nnn}
+% \begin{macro}[EXP]{\@@_map_space:Nnw}
+% \begin{macro}[EXP]{\@@_map_N_type:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNNN}
+% \begin{macro}[EXP]{\@@_map_codepoint:Nnn}
+% \begin{macro}[EXP]{\@@_map_CR:Nnw}
+% \begin{macro}[EXP]{\@@_map_CR:NnN}
+% \begin{macro}[EXP]{\@@_map_class:Nnnn}
+% \begin{macro}[EXP]{\@@_map_class:nNnnn}
+% \begin{macro}[EXP]{\@@_map_class_loop:Nnnnw}
+% \begin{macro}[EXP]{\@@_map_class_end:nw}
+% \begin{macro}[EXP]
+% {
+% \@@_map_Control:Nnn ,
+% \@@_map_Extend:Nnn ,
+% \@@_map_SpacingMark:Nnn ,
+% \@@_map_Prepend:Nnn
+% }
+% \begin{macro}[EXP]
+% {
+% \@@_map_not_Control:Nnn ,
+% \@@_map_not_Extend:Nnn ,
+% \@@_map_not_SpacingMark:Nnn ,
+% \@@_map_not_Prepend:Nnn ,
+% \@@_map_not_L:Nnn ,
+% \@@_map_not_LV:Nnn ,
+% \@@_map_not_LVT:Nnn
+% }
+% \begin{macro}[EXP]
+% {
+% \@@_map_L:Nnn ,
+% \@@_map_LV:Nnn ,
+% \@@_map_LVT:Nnn
+% }
+% \begin{macro}[EXP]{\@@_map_hangul:Nnnw}
+% \begin{macro}[EXP]{\@@_map_hangul:NnnN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNNN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNNNN}
+% \begin{macro}[EXP]{\@@_map_hangul:Nnnnw}
+% \begin{macro}[EXP]{\@@_map_hangul:nNnnnn}
+% \begin{macro}[EXP]{\@@_map_hangul_loop:Nnnnnw}
+% \begin{macro}[EXP]{\@@_map_hangul_next:Nnnn}
+% \begin{macro}[EXP]{\@@_map_hangul_end:nw}
+% \begin{macro}[EXP]
+% {
+% \@@_map_hangul_L:Nnn ,
+% \@@_map_hangul_LV:Nnn ,
+% \@@_map_hangul_V:Nnn ,
+% \@@_map_hangul_LVT:Nnn ,
+% \@@_map_hangul_T:Nnn
+% }
+% \begin{macro}[EXP]{\@@_map_output:Nn}
+% \begin{macro}[EXP]{\text_map_break:}
+% \begin{macro}[EXP]{\text_map_break:n}
+% The standard lead-off for an action loop: the text is expanded by \cs{text_expand:n}, then mapped starting from an empty accumulator.
+% \begin{macrocode}
+\cs_new:Npn \text_map_function:nN #1#2 % #1 = text, #2 = function applied to each grapheme
+ { \exp_args:Ne \@@_map_function:nN { \text_expand:n {#1} } #2 }
+\cs_new:Npn \@@_map_function:nN #1#2 % #1 = expanded text, #2 = function
+ {
+ \@@_map_loop:Nnw #2 { } #1 % the braced argument holds the grapheme collected so far
+ \q_@@_recursion_tail \q_@@_recursion_stop
+ \prg_break_point:Nn \text_map_break: { } % target used by \text_map_break: and \text_map_break:n
+ }
+% \end{macrocode}
+% The standard set up for an \enquote{action} loop, dispatching on the type
+% of the first item of the remaining input. Groups are handled by recursion,
+% spaces are treated similarly: both count as grapheme boundaries. For
+% \texttt{N}-type tokens, we filter out control sequences (again a boundary), then move on to further analysis.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_loop:Nnw #1#2#3 \q_@@_recursion_stop % #1 = function, #2 = grapheme so far, #3 = remaining input
+ {
+ \tl_if_head_is_N_type:nTF {#3}
+ { \@@_map_N_type:NnN }
+ {
+ \tl_if_head_is_group:nTF {#3}
+ { \@@_map_group:Nnn }
+ { \@@_map_space:Nnw } % neither N-type nor a group: treated as a space
+ }
+ #1 {#2} #3 \q_@@_recursion_stop % the selected handler receives the same arguments
+ }
+\cs_new:Npn \@@_map_group:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = content of the brace group
+ {
+ \@@_map_output:Nn #1 {#2} % a group is a boundary: flush what was collected before it
+ {
+ \@@_map_loop:Nnw #1 { } #3 % recurse over the group content (#3), not the text just flushed (#2)
+ \q_@@_recursion_tail \q_@@_recursion_stop
+ \prg_break_point:Nn \text_map_break: { } % the nested loop needs its own break point
+ }
+ \@@_map_loop:Nnw #1 { } % carry on after the group with an empty accumulator
+ }
+\use:x % expand \c_space_tl first, so the parameter text is delimited by its content rather than by the control sequence
+ { \cs_new:Npn \exp_not:N \@@_map_space:Nnw ##1##2 \c_space_tl } % #1 = function, #2 = grapheme so far
+ {
+ \@@_map_output:Nn #1 {#2} % a space is a boundary: flush what came before it
+ #1 { ~ } % the space is passed to the function as an item of its own
+ \@@_map_loop:Nnw #1 { }
+ }
+\cs_new:Npn \@@_map_N_type:NnN #1#2#3 % #1 = function, #2 = grapheme so far, #3 = next token
+ {
+ \@@_if_q_recursion_tail_stop_do:Nn #3 % end of input: flush and leave the mapping
+ {
+ \@@_map_output:Nn #1 {#2}
+ \text_map_break:
+ }
+ \token_if_cs:NTF #3
+ {
+ \@@_map_output:Nn #1 {#2} % a control sequence is a boundary: flush, then pass it on alone
+ #1 {#3}
+ \@@_map_loop:Nnw #1 { }
+ }
+ { \@@_map_char:NnN #1 {#2} #3 } % a character token: collect the full codepoint
+ }
+% \end{macrocode}
+% We want to keep common code paths, so collect up one Unicode codepoint
+% as a single argument in an engine-independent way.
+% \begin{macrocode}
+\bool_lazy_or:nnTF
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
+ {
+ \cs_new:Npn \@@_map_char:NnN #1#2#3 % LuaTeX/XeTeX: the token #3 is passed straight on
+ { \@@_map_codepoint:Nnn #1 {#2} #3 }
+ }
+ {
+ \cs_new:Npn \@@_map_char:NnN #1#2#3 % other engines: #3 is a byte, its value decides how many more to grab
+ {
+ \int_compare:nNnTF { `#3 } > { "80 }
+ {
+ \int_compare:nNnTF { `#3 } < { "E0 }
+ { \@@_map_char:NnNN } % below "E0: two bytes in total
+ {
+ \int_compare:nNnTF { `#3 } < { "F0 }
+ { \@@_map_char:NnNNN } % below "F0: three bytes in total
+ { \@@_map_char:NnNNNN } % otherwise: four bytes in total
+ }
+ }
+ { \@@_map_codepoint:Nnn } % single byte
+ #1 {#2} #3
+ }
+ \cs_new:Npn \@@_map_char:NnNN #1#2#3#4 % the bytes are braced together as one argument
+ { \@@_map_codepoint:Nnn #1 {#2} {#3#4} }
+ \cs_new:Npn \@@_map_char:NnNNN #1#2#3#4#5
+ { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5} }
+ \cs_new:Npn \@@_map_char:NnNNNN #1#2#3#4#5#6
+ { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5#6} }
+ }
+% \end{macrocode}
+% With one complete codepoint available as a single argument, we can pull
+% out the special cases: these are hard-coded for speed, so do not actually
+% use the grapheme data. The carriage return case needs a bit of context
+% handling so has an auxiliary. Codepoint U+200D is the zero-width joiner,
+% which has no context to concern us: just don't break. Anything else is
+% checked against the grapheme classes, starting with \texttt{Control}.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_codepoint:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = current codepoint
+ {
+ \@@_map_codepoint_compare:nNnTF {#3} = { "0D }
+ {
+ \@@_map_output:Nn #1 {#2} % always a boundary before a carriage return
+ \@@_map_CR:Nnw #1 {#3}
+ }
+ {
+ \@@_map_codepoint_compare:nNnTF {#3} = { "200D }
+ { \@@_map_loop:Nnw #1 {#2#3} } % zero-width joiner: append and keep collecting
+ { \@@_map_class:Nnnn #1 {#2} {#3} { Control } }
+ }
+ }
+% \end{macrocode}
+% A carriage return is a boundary unless it is immediately followed by
+% a line feed, in which case the pair is passed on together as one item.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_CR:Nnw #1#2#3 \q_@@_recursion_stop % #1 = function, #2 = the carriage return, #3 = remaining input
+ {
+ \tl_if_head_is_N_type:nTF {#3}
+ { \@@_map_CR:NnN #1 {#2} } % need to look at the next token
+ {
+ #1 {#2} % next item is a group or space: the carriage return stands alone
+ \@@_map_loop:Nnw #1 { }
+ }
+ #3 \q_@@_recursion_stop
+ }
+\cs_new:Npn \@@_map_CR:NnN #1#2#3 % #1 = function, #2 = the carriage return, #3 = next token
+ {
+ \@@_if_q_recursion_tail_stop_do:Nn #3 % carriage return at the very end of the input
+ {
+ #1 {#2}
+ \text_map_break:
+ }
+ \bool_lazy_and:nnTF
+ { ! \token_if_cs_p:N #3 } % lazy evaluation: the charcode test is only applied to characters
+ { \int_compare_p:nNn { `#3 } = { "0A } }
+ {
+ \@@_map_output:Nn #1 {#2#3} % carriage return plus line feed form one item
+ \@@_map_loop:Nnw #1 { }
+ }
+ {
+ #1 {#2} % a lone carriage return must still be passed on, not dropped
+ \@@_map_loop:Nnw #1 { } #3 % then process the following token as normal
+ }
+ }
+% \end{macrocode}
+% There are various classes of character, and we deal with them all in
+% the same general way. We need to examine the relevant list of codepoints:
+% if we get a hit, then we do whatever the relevant action is. Otherwise
+% we loop, but only if the current codepoint could still match: the
+% loop stops early otherwise and we move forward.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_class:Nnnn #1#2#3#4 % #1 = function, #2 = grapheme so far, #3 = codepoint, #4 = class name
+ {
+ \exp_args:Nv \@@_map_class:nNnnn { c_@@_grapheme_ #4 _clist } % fetch the list of ranges stored for this class
+ #1 {#2} {#3} {#4}
+ }
+\cs_new:Npn \@@_map_class:nNnnn #1#2#3#4#5 % #1 = list of ranges, then as above
+ {
+ \@@_map_class_loop:Nnnnw #2 {#3} {#4} {#5}
+ #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop % end marker shaped like a range entry
+ }
+\cs_new:Npn \@@_map_class_loop:Nnnnw #1#2#3#4 #5 .. #6 , % #3 = codepoint, #4 = class, #5 .. #6 = current range (hex)
+ {
+ \@@_if_q_recursion_tail_stop_do:nn {#5} % list exhausted: not in this class
+ { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+ \@@_map_codepoint_compare:nNnTF {#3} < { "#5 }
+ {
+ \@@_map_class_end:nw % below the start of this range: stop early (relies on ascending order of the ranges)
+ { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+ }
+ {
+ \@@_map_codepoint_compare:nNnTF {#3} > { "#6 }
+ { \@@_map_class_loop:Nnnnw #1 {#2} {#3} {#4} } % above this range: try the next one
+ {
+ \@@_map_class_end:nw % inside the range: run the action for the class
+ { \use:c { @@_map_ #4 :Nnn } #1 {#2} {#3} }
+ }
+ }
+ }
+\cs_new:Npn \@@_map_class_end:nw #1#2 \q_@@_recursion_stop {#1} % discard the unread ranges, then run #1
+% \end{macrocode}
+% Break before \emph{and} after: a control codepoint is always passed on as an item of its own.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_Control:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = control codepoint
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_output:Nn #1 {#3}
+ \@@_map_loop:Nnw #1 { }
+ }
+% \end{macrocode}
+% Keep collecting: the codepoint is appended to the current grapheme.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_Extend:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = codepoint to append
+ { \@@_map_loop:Nnw #1 {#2#3} }
+\cs_new_eq:NN \@@_map_SpacingMark:Nnn \@@_map_Extend:Nnn % same treatment as Extend
+% \end{macrocode}
+% Retain and loop, outputting anything earlier: the codepoint starts a new grapheme.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_Prepend:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = prepending codepoint
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_loop:Nnw #1 {#3}
+ }
+% \end{macrocode}
+% Dealing with end-of-class is done such that we can be flexible: each function names the class to try next, so the order is set only here.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_not_Control:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = codepoint (all below)
+ { \@@_map_class:Nnnn #1 {#2} {#3} { Extend } }
+\cs_new:Npn \@@_map_not_Extend:Nnn #1#2#3
+ { \@@_map_class:Nnnn #1 {#2} {#3} { SpacingMark } }
+\cs_new:Npn \@@_map_not_SpacingMark:Nnn #1#2#3
+ { \@@_map_class:Nnnn #1 {#2} {#3} { Prepend } }
+\cs_new:Npn \@@_map_not_Prepend:Nnn #1#2#3
+ { \@@_map_class:Nnnn #1 {#2} {#3} { L } }
+\cs_new:Npn \@@_map_not_L:Nnn #1#2#3
+ { \@@_map_class:Nnnn #1 {#2} {#3} { LV } }
+\cs_new:Npn \@@_map_not_LV:Nnn #1#2#3
+ { \@@_map_class:Nnnn #1 {#2} {#3} { LVT } }
+\cs_new:Npn \@@_map_not_LVT:Nnn #1#2#3 % no class matched: the codepoint starts a new grapheme
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_loop:Nnw #1 {#3}
+ }
+% \end{macrocode}
+% Hangul needs additional treatment. First we have to deal with
+% the start-of-Hangul position: output what we had up to now, then
+% move to the specialist handler. The idea here is to pick off the
+% different codepoint types one at a time, tracking what else can be
+% considered at each stage until we hit the end of the viable types.
+% Other than that, we just keep building up the Hangul codepoints
+% using a dedicated version of the loop from above.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_L:Nnn #1#2#3 % #1 = function, #2 = grapheme so far, #3 = first Hangul codepoint
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_hangul:Nnnw
+ #1 {#3} { L V { LV } { LVT } } % last argument: classes which may continue the grapheme
+ }
+\cs_new:Npn \@@_map_LV:Nnn #1#2#3
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_hangul:Nnnw
+ #1 {#3} { V T }
+ }
+\cs_new:Npn \@@_map_LVT:Nnn #1#2#3
+ {
+ \@@_map_output:Nn #1 {#2}
+ \@@_map_hangul:Nnnw
+ #1 {#3} { T }
+ }
+\cs_new:Npn \@@_map_hangul:Nnnw #1#2#3#4 \q_@@_recursion_stop % #1 = function, #2 = Hangul so far, #3 = allowed classes, #4 = remaining input
+ {
+ \tl_if_head_is_N_type:nTF {#4}
+ { \@@_map_hangul:NnnN #1 {#2} {#3} }
+ {
+ #1 {#2} % a group or space ends the Hangul grapheme
+ \@@_map_loop:Nnw #1 { }
+ }
+ #4 \q_@@_recursion_stop
+ }
+\cs_new:Npn \@@_map_hangul:NnnN #1#2#3#4 % #1 = function, #2 = Hangul so far, #3 = allowed classes, #4 = next token
+ {
+ \@@_if_q_recursion_tail_stop_do:Nn #4 % end of input: pass on the Hangul and leave the mapping
+ {
+ #1 {#2}
+ \text_map_break:
+ }
+ \token_if_cs:NTF #4
+ {
+ #1 {#2} % a control sequence ends the Hangul grapheme
+ \@@_map_loop:Nnw #1 { } % NOTE(review): #4 is not passed on here, unlike \@@_map_N_type:NnN -- confirm this is intended
+ }
+ { \@@_map_hangul_char:NnnN #1 {#2} {#3} #4 }
+ }
+\bool_lazy_or:nnTF % as for \@@_map_char:NnN: collect one codepoint as a single argument
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
+ {
+ \cs_new:Npn \@@_map_hangul_char:NnnN #1#2#3#4 % #3 = allowed classes, #4 = next token
+ { \@@_map_hangul:Nnnnw #1 {#2} {#4} #3 ; } % the class list is left unbraced, terminated by the semicolon
+ }
+ {
+ \cs_new:Npn \@@_map_hangul_char:NnnN #1#2#3#4 % #4 is a byte, its value decides how many more to grab
+ {
+ \int_compare:nNnTF { `#4 } > { "80 }
+ {
+ \int_compare:nNnTF { `#4 } < { "E0 }
+ { \@@_map_hangul_char:NnnNN }
+ {
+ \int_compare:nNnTF { `#4 } < { "F0 }
+ { \@@_map_hangul_char:NnnNNN }
+ { \@@_map_hangul_char:NnnNNNN }
+ }
+ #1 {#2} {#3} #4
+ }
+ { \@@_map_hangul:Nnnnw #1 {#2} #4 #3 ; } % single byte
+ }
+ \cs_new:Npn \@@_map_hangul_char:NnnNN #1#2#3#4#5 % the bytes are braced together as one argument
+ { \@@_map_hangul:Nnnnw #1 {#2} {#4#5} #3 ; }
+ \cs_new:Npn \@@_map_hangul_char:NnnNNN #1#2#3#4#5#6
+ { \@@_map_hangul:Nnnnw #1 {#2} {#4#5#6} #3 ; }
+ \cs_new:Npn \@@_map_hangul_char:NnnNNNN #1#2#3#4#5#6#7
+ { \@@_map_hangul:Nnnnw #1 {#2} {#4#5#6#7} #3 ; }
+ }
+\cs_new:Npn \@@_map_hangul:Nnnnw #1#2#3#4#5 ; % #2 = Hangul so far, #3 = codepoint, #4 = class to try now, #5 = classes to try later
+ {
+ \exp_args:Nv \@@_map_hangul:nNnnnn { c_@@_grapheme_ #4 _clist } % fetch the list of ranges stored for class #4
+ #1 {#2} {#3} {#4} {#5}
+ }
+\cs_new:Npn \@@_map_hangul:nNnnnn #1#2#3#4#5#6 % #1 = list of ranges, then as above
+ {
+ \@@_map_hangul_loop:Nnnnnw #2 {#3} {#4} {#5} {#6}
+ #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop % end marker shaped like a range entry
+ }
+\cs_new:Npn \@@_map_hangul_loop:Nnnnnw #1#2#3#4#5 #6 .. #7 , % #2 = Hangul so far, #3 = codepoint, #4 = class, #5 = classes left, #6 .. #7 = range (hex)
+ {
+ \@@_if_q_recursion_tail_stop_do:nn {#6} % list exhausted: move on to the next candidate class
+ { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} }
+ \@@_map_codepoint_compare:nNnTF {#3} < { "#6 }
+ {
+ \@@_map_hangul_end:nw % stop early, but first discard the unread ranges up to \q_@@_recursion_stop
+ { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} }
+ }
+ {
+ \@@_map_codepoint_compare:nNnTF {#3} > { "#7 }
+ { \@@_map_hangul_loop:Nnnnnw #1 {#2} {#3} {#4} {#5} } % above this range: try the next one
+ {
+ \@@_map_hangul_end:nw % inside the range: run the action for the class
+ { \use:c { @@_map_hangul_ #4 :Nnn } #1 {#2} {#3} }
+ }
+ }
+ }
+\cs_new:Npn \@@_map_hangul_next:Nnnn #1#2#3#4 % #1 = function, #2 = Hangul so far, #3 = codepoint, #4 = classes still to try
+ {
+ \tl_if_blank:nTF {#4}
+ {
+ #1 {#2} % no class left: the Hangul grapheme is complete
+ \@@_map_loop:Nnw #1 { } #3 % put the codepoint back so that it is processed normally rather than lost
+ }
+ { \@@_map_hangul:Nnnnw #1 {#2} {#3} #4 ; } % try the next class in the list
+ }
+\cs_new:Npn \@@_map_hangul_end:nw #1#2 \q_@@_recursion_stop {#1} % discard the unread ranges, then run #1
+\cs_new:Npn \@@_map_hangul_L:Nnn #1#2#3 % #1 = function, #2 = Hangul so far, #3 = matching codepoint (all below)
+ {
+ \@@_map_hangul:Nnnw
+ #1 {#2#3} { L V { LV } { LVT } } % append, then list the classes which may follow
+ }
+\cs_new:Npn \@@_map_hangul_LV:Nnn #1#2#3
+ {
+ \@@_map_hangul:Nnnw
+ #1 {#2#3} { V T }
+ }
+\cs_new_eq:NN \@@_map_hangul_V:Nnn \@@_map_hangul_LV:Nnn % V allows the same followers as LV
+\cs_new:Npn \@@_map_hangul_LVT:Nnn #1#2#3
+ {
+ \@@_map_hangul:Nnnw
+ #1 {#2#3} { T }
+ }
+\cs_new_eq:NN \@@_map_hangul_T:Nnn \@@_map_hangul_LVT:Nnn % T allows the same followers as LVT
+% \end{macrocode}
+% For the end of the process: output of a collected grapheme, and the functions which leave the mapping early.
+% \begin{macrocode}
+\cs_new:Npn \@@_map_output:Nn #1#2 % #1 = function, #2 = collected grapheme
+ { \tl_if_blank:nF {#2} { #1 {#2} } } % nothing is passed on when nothing was collected
+\cs_new:Npn \text_map_break:
+ { \prg_map_break:Nn \text_map_break: { } }
+\cs_new:Npn \text_map_break:n % the code to insert is picked up as the second argument of \prg_map_break:Nn
+ { \prg_map_break:Nn \text_map_break: }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}[EXP, TF]{\@@_map_codepoint_compare:nNn}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:N, \@@_map_codepoint_compare_aux:N}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNNN}
+% Allows comparison on all engines of a \enquote{character} (first argument: one token, or the
+% UTF-8 bytes of one codepoint with $8$-bit engines) with a codepoint given as a number.
+% \begin{macrocode}
+\bool_lazy_or:nnTF
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
+ {
+ \prg_new_conditional:Npnn
+ \@@_map_codepoint_compare:nNn #1#2#3 { TF } % #1 = character, #2 = relation, #3 = codepoint
+ {
+ \int_compare:nNnTF { `#1 } #2 {#3} % the character code is used directly
+ \prg_return_true: \prg_return_false:
+ }
+ }
+ {
+ \prg_new_conditional:Npnn
+ \@@_map_codepoint_compare:nNn #1#2#3 { TF } % #1 = bytes, #2 = relation, #3 = codepoint
+ {
+ \int_compare:nNnTF { \@@_map_codepoint_compare:N #1 } % #1 is unbraced so that the bytes can be picked up one by one
+ #2 {#3}
+ \prg_return_true: \prg_return_false:
+ }
+ \cs_new:Npn \@@_map_codepoint_compare:N #1 % select the decoder from the value of the first byte
+ {
+ \if_int_compare:w `#1 > "80 \exp_stop_f:
+ \if_int_compare:w `#1 < "E0 \exp_stop_f:
+ \exp_after:wN \exp_after:wN \exp_after:wN % close the conditionals before the decoder reads the bytes
+ \@@_map_codepoint_compare:NN
+ \else:
+ \if_int_compare:w `#1 < "F0 \exp_stop_f:
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_after:wN \@@_map_codepoint_compare:NNN
+ \else:
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_after:wN \@@_map_codepoint_compare:NNNN
+ \fi:
+ \fi:
+ \else:
+ \exp_after:wN \@@_map_codepoint_compare_aux:N
+ \fi:
+ #1 % the first byte is put back in front of the others
+ }
+ \cs_new:Npn \@@_map_codepoint_compare_aux:N #1 { `#1 } % single byte: its value is used as is
+ \cs_new:Npn \@@_map_codepoint_compare:NN #1#2 % two bytes, combined into one number
+ { (`#1 - "C0) * "40 + `#2 - "80 }
+ \cs_new:Npn \@@_map_codepoint_compare:NNN #1#2#3 % three bytes
+ { (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
+ \cs_new:Npn \@@_map_codepoint_compare:NNNN #1#2#3#4 % four bytes
+ {
+ (`#1 - "F0) * "40000
+ + (`#2 - "80) * "1000
+ + (`#3 - "80) * "40
+ + `#4 - "80
+ }
+ }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}{\text_map_inline:nn}
+% The standard non-expandable inline version: the inline code is stored in a numbered function, which is then mapped over the text.
+% \begin{macrocode}
+\cs_new_protected:Npn \text_map_inline:nn #1#2 % #1 = text, #2 = inline code using ##1 for the grapheme
+ {
+ \int_gincr:N \g__kernel_prg_map_int % the counter value gives each nested mapping its own function name
+ \cs_gset_protected:cpn
+ { @@_map_ \int_use:N \g__kernel_prg_map_int :w } ##1 {#2}
+ \exp_args:Nnc \text_map_function:nN {#1}
+ { @@_map_ \int_use:N \g__kernel_prg_map_int :w }
+ \prg_break_point:Nn \text_map_break:
+ { \int_gdecr:N \g__kernel_prg_map_int } % restore the counter once the mapping is over
+ }
+% \end{macrocode}
+% \end{macro}
+%
+% \begin{macrocode}
+%</package>
+% \end{macrocode}
+%
+% \end{implementation}
+%
+% \PrintIndex
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 7bba36eff..99f4a5364 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -299,6 +299,52 @@
% \texttt{true}.
% \end{variable}
%
+% \section{Mapping to graphemes}
+%
+% \begin{function}[rEXP, added = 2022-08-04]{\text_map_function:nN}
+% \begin{syntax}
+% \cs{text_map_function:nN} \Arg{text} \meta{function}
+% \end{syntax}
+% Takes user input \meta{text} and expands as described for
+% \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+% result, passing each grapheme to the \meta{function}.
+% Broadly a grapheme is a \enquote{user perceived character}:
+% the Unicode Consortium describe the decomposition of input to
+% graphemes in depth, and the approach used here implements that
+% algorithm. The \meta{function} should accept one argument as \meta{balanced
+% text}: this may be a single codepoint, multiple codepoints (or, with an
+% $8$-bit engine, the bytes which represent them) or may be a control sequence.
+% See also \cs{text_map_inline:nn}.
+% \end{function}
+%
+% \begin{function}[added = 2022-08-04]{\text_map_inline:nn}
+% \begin{syntax}
+% \cs{text_map_inline:nn} \Arg{text} \Arg{inline function}
+% \end{syntax}
+% Takes user input \meta{text} and expands as described for
+% \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+% result, passing each grapheme to the \meta{inline function}.
+% Broadly a grapheme is a \enquote{user perceived character}:
+% the Unicode Consortium describe the decomposition of input to
+% graphemes in depth, and the approach used here implements that
+% algorithm. The \meta{inline function} should consist of code which
+% receives the grapheme as \meta{balanced
+% text}: this may be a single codepoint, multiple codepoints (or, with an
+% $8$-bit engine, the bytes which represent them) or may be a control sequence.
+% See also \cs{text_map_function:nN}.
+% \end{function}
+%
+% \begin{function}[rEXP, added = 2022-08-04]
+% {\text_map_break:, \text_map_break:n}
+% \begin{syntax}
+% \cs{text_map_break:}
+% \cs{text_map_break:n} \Arg{code}
+% \end{syntax}
+% Used to terminate a \cs[no-index]{text_map_\ldots} function before all
+% entries in the \meta{text} have been processed. This
+% normally takes place within a conditional statement.
+% \end{function}
+%
% \end{documentation}
%
% \begin{implementation}
@@ -357,11 +403,14 @@
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_q_recursion_tail_stop_do:Nn}
+% \begin{macro}[EXP]{\@@_if_q_recursion_tail_stop_do:nn}
% Functions to query recursion quarks: the \texttt{nn} version is used where the item tested is a braced argument, as in the grapheme class loops.
% \begin{macrocode}
\__kernel_quark_new_test:N \@@_if_q_recursion_tail_stop_do:Nn
+\__kernel_quark_new_test:N \@@_if_q_recursion_tail_stop_do:nn % presumably the test is generated from the function name -- confirm against the kernel definition
% \end{macrocode}
% \end{macro}
+% \end{macro}
%
% \begin{variable}{\s_@@_recursion_tail,\s_@@_recursion_stop}
% Internal scan marks quarks.
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index fbf1d0bd6..86bd39817 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -252,6 +252,54 @@
% \end{macrocode}
%
% \begin{macrocode}
+%<@@=text>
+% \end{macrocode}
+%
+% Read the Unicode grapheme data. This is quite easy to handle and we only need
+% codepoints, not characters, so there is no need to worry about the engine in use.
+% As reading as a string is most convenient, we have to do some work to remove
+% spaces: the hardest part of the entire process! Each class ends up as a constant comma list of \texttt{start..end} ranges.
+% \begin{macrocode}
+\ior_new:N \g_@@_data_ior
+\group_begin:
+ \ior_open:Nn \g_@@_data_ior { GraphemeBreakProperty.txt }
+ \cs_set_nopar:Npn \l_@@_tmpa_str { } % ranges collected for the current class, each preceded by a comma
+ \cs_set_nopar:Npn \l_@@_tmpb_str { } % name of the current class
+ \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ~ #3 \q_stop % #1 = codepoint or range, #2 = class name, #3 = rest of line (unused)
+ {
+ \str_if_eq:VnF \l_@@_tmpb_str {#2} % change of class: store what was collected for the previous one
+ {
+ \str_if_empty:NF \l_@@_tmpb_str
+ {
+ \clist_const:cx { c_@@_grapheme_ \l_@@_tmpb_str _clist }
+ { \exp_after:wN \use_none:n \l_@@_tmpa_str } % drop the leading comma
+ \cs_set_nopar:Npn \l_@@_tmpa_str { }
+ }
+ \cs_set_nopar:Npn \l_@@_tmpb_str {#2}
+ } % NOTE(review): the class which comes last in the file is collected but never stored -- confirm it is not needed
+ \@@_data_auxii:w #1 .. #1 .. #1 \q_stop % repeating #1 turns a single codepoint into a range with equal ends
+ }
+ \cs_set_protected:Npn \@@_data_auxii:w #1 .. #2 .. #3 \q_stop % #1 = start, #2 = end, #3 = discarded
+ {
+ \cs_set_nopar:Npx \l_@@_tmpa_str
+ {
+ \l_@@_tmpa_str ,
+ \tl_trim_spaces:n {#1} .. \tl_trim_spaces:n {#2}
+ }
+ }
+ \ior_str_map_inline:Nn \g_@@_data_ior
+ {
+ \str_if_eq:eeF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str } % skip lines starting with a hash (an empty line also tests equal here)
+ {
+ \tl_if_blank:nF {#1} % skip blank lines
+ { \@@_data_auxi:w #1 \q_stop }
+ }
+ }
+ \ior_close:N \g_@@_data_ior
+\group_end:
+% \end{macrocode}
+%
+% \begin{macrocode}
%</package>
% \end{macrocode}
%
diff --git a/l3kernel/testfiles/m3token001.tlg b/l3kernel/testfiles/m3text006.luatex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3token001.tlg
copy to l3kernel/testfiles/m3text006.luatex.tlg
index cd844648d..9b3a47f3f 100644
--- a/l3kernel/testfiles/m3token001.tlg
+++ b/l3kernel/testfiles/m3text006.luatex.tlg
@@ -2,7 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
============================================================
-A
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
+(가)
============================================================
diff --git a/l3kernel/testfiles-backend/m3color005.lvt b/l3kernel/testfiles/m3text006.lvt
similarity index 55%
copy from l3kernel/testfiles-backend/m3color005.lvt
copy to l3kernel/testfiles/m3text006.lvt
index e43d9a6aa..7030d17aa 100644
--- a/l3kernel/testfiles-backend/m3color005.lvt
+++ b/l3kernel/testfiles/m3text006.lvt
@@ -1,9 +1,6 @@
%
% Copyright (C) 2022 The LaTeX Project
%
-
-\documentclass{minimal}
-
\input{regression-test}
\RequirePackage[enable-debug]{expl3}
@@ -11,27 +8,22 @@
\debug_on:n { check-declarations , deprecation , log-functions }
\ExplSyntaxOff
-\START
+\documentclass{minimal}
+\START
\AUTHOR{Joseph Wright}
\ExplSyntaxOn
\OMIT
- \cs_set_protected:Npn \test:n #1
- {
- \hbox_set:Nn \l_tmpa_box
- {
- #1
- Hello
- }
- \box_show:N \l_tmpa_box
- }
+ \cs_set:Npn \test:n #1 { (#1) }
\TIMO
-\TEST { Current~color }
+\TESTEXP { Grapheme~mapping }
{
- \test:n { \color_ensure_current: }
+ \text_map_function:nN { Hello } \test:n \NEWLINE
+ \text_map_function:nN { Spın̈al~Tap } \test:n \NEWLINE
+ \text_map_function:nN { 각 } \test:n \NEWLINE
}
\END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3text005.ptex.tlg b/l3kernel/testfiles/m3text006.tlg
similarity index 73%
copy from l3kernel/testfiles/m3text005.ptex.tlg
copy to l3kernel/testfiles/m3text006.tlg
index 5735cebe5..b397eac19 100644
--- a/l3kernel/testfiles/m3text005.ptex.tlg
+++ b/l3kernel/testfiles/m3text006.tlg
@@ -2,14 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: \@uclclist\ entries
+TEST 1: Grapheme mapping
============================================================
-Ё\cyryo
-Ё\CYRYO
-Ё\CYRYO
-Ё\CYRYO
-ё\cyryo
-ё\CYRYO
-ё\CYRYO
-ё\CYRYO
+(H)(e)(l)(l)(o)
+(S)(p)(^^c4^^b1)(n^^cc^^88)(a)(l)( )(T)(a)(p)
+(^^e1^^84^^80^^e1^^85^^a1)
============================================================
diff --git a/l3kernel/testfiles/m3token001.tlg b/l3kernel/testfiles/m3text006.xetex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3token001.tlg
copy to l3kernel/testfiles/m3text006.xetex.tlg
index cd844648d..9b3a47f3f 100644
--- a/l3kernel/testfiles/m3token001.tlg
+++ b/l3kernel/testfiles/m3text006.xetex.tlg
@@ -2,7 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
============================================================
-A
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
+(가)
============================================================
More information about the latex3-commits
mailing list.