[latex3-commits] [git/LaTeX3-latex3-latex3] text-map: Add \text_map_... functions (c7fdb58d2)

Joseph Wright joseph.wright at morningstar2.co.uk
Fri Aug 5 16:21:32 CEST 2022


Repository : https://github.com/latex3/latex3
On branch  : text-map
Link       : https://github.com/latex3/latex3/commit/c7fdb58d225536dbe06ad885ca8a5499bc5784b7

>---------------------------------------------------------------

commit c7fdb58d225536dbe06ad885ca8a5499bc5784b7
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Fri Aug 5 15:21:03 2022 +0100

    Add \text_map_... functions
    
    Currently the regional indicator rule is not implemented:
    I am not clear on the exact requirement.


>---------------------------------------------------------------

c7fdb58d225536dbe06ad885ca8a5499bc5784b7
 l3kernel/CHANGELOG.md                              |   2 +
 l3kernel/doc/source3body.tex                       |   1 +
 l3kernel/l3.ins                                    |   1 +
 l3kernel/l3text-map.dtx                            | 623 +++++++++++++++++++++
 l3kernel/l3text.dtx                                |  49 ++
 l3kernel/l3unicode.dtx                             |  48 ++
 .../{m3token001.tlg => m3text006.luatex.tlg}       |   6 +-
 .../m3color005.lvt => testfiles/m3text006.lvt}     |  22 +-
 .../{m3text005.ptex.tlg => m3text006.tlg}          |  13 +-
 .../{m3token001.tlg => m3text006.xetex.tlg}        |   6 +-
 10 files changed, 743 insertions(+), 28 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 9fcaa43d5..50cc62d5f 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,8 @@ this project uses date-based 'snapshot' version identifiers.
 - Support for case changing Croatian diagraph with 8-bit engines
 - Support accent removal when uppercasing Greek with 8-bit engines
 - Function `\sys_ensure_backend:`
+- `\text_map_function:nN` and `\text_map_inline:nn` for mapping to
+  graphemes in textual input
 
 ### Fixed
 - Behavior of `\color_math:nn` in alignments
diff --git a/l3kernel/doc/source3body.tex b/l3kernel/doc/source3body.tex
index 20e187536..c9b0f3448 100644
--- a/l3kernel/doc/source3body.tex
+++ b/l3kernel/doc/source3body.tex
@@ -589,6 +589,7 @@ used on top of \LaTeXe{} if \cs{outer} tokens are used in the arguments.
 \clist_gput_right:Nn \g_docinput_clist
   {
     l3text-case.dtx ,
+    l3text-map.dtx  ,
     l3text-purify.dtx
   }
 \ExplSyntaxOff
diff --git a/l3kernel/l3.ins b/l3kernel/l3.ins
index fbcc815d9..08949c097 100644
--- a/l3kernel/l3.ins
+++ b/l3kernel/l3.ins
@@ -106,6 +106,7 @@ and all files in that bundle must be distributed together.
         \from{l3unicode.dtx}    {package}
         \from{l3text.dtx}       {package}
         \from{l3text-case.dtx}  {package}
+        \from{l3text-map.dtx}   {package}
         \from{l3text-purify.dtx}{package}
         \from{l3candidates.dtx} {package}
         \from{l3legacy.dtx}     {package}
diff --git a/l3kernel/l3text-map.dtx b/l3kernel/l3text-map.dtx
new file mode 100644
index 000000000..22ed5aae6
--- /dev/null
+++ b/l3kernel/l3text-map.dtx
@@ -0,0 +1,623 @@
+% \iffalse meta-comment
+%
+%% File: l3text-map.dtx
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+% It may be distributed and/or modified under the conditions of the
+% LaTeX Project Public License (LPPL), either version 1.3c of this
+% license or (at your option) any later version.  The latest version
+% of this license is in the file
+%
+%    https://www.latex-project.org/lppl.txt
+%
+% This file is part of the "l3kernel bundle" (The Work in LPPL)
+% and all files in that bundle must be distributed together.
+%
+% -----------------------------------------------------------------------
+%
+% The development version of the bundle can be found at
+%
+%    https://github.com/latex3/latex3
+%
+% for those people who are interested.
+%
+%<*driver>
+\documentclass[full,kernel]{l3doc}
+\begin{document}
+  \DocInput{\jobname.dtx}
+\end{document}
+%</driver>
+% \fi
+%
+% \title{^^A
+%   The \textsf{l3text-map} package: text processing (mapping)^^A
+% }
+%
+% \author{^^A
+%  The \LaTeX{} Project\thanks
+%    {^^A
+%      E-mail:
+%        \href{mailto:latex-team at latex-project.org}
+%          {latex-team at latex-project.org}^^A
+%    }^^A
+% }
+%
+% \date{Released 2022-07-15}
+%
+% \maketitle
+%
+% \begin{documentation}
+%
+% \end{documentation}
+%
+% \begin{implementation}
+%
+% \section{\pkg{l3text-map} implementation}
+%
+%    \begin{macrocode}
+%<*package>
+%    \end{macrocode}
+%
+%    \begin{macrocode}
+%<@@=text>
+%    \end{macrocode}
+%
+% \subsection{Mapping to text}
+%
+% \begin{macro}[EXP]{\text_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_loop:Nnw}
+% \begin{macro}[EXP]{\@@_map_group:Nnn}
+% \begin{macro}[EXP]{\@@_map_space:Nnw}
+% \begin{macro}[EXP]{\@@_map_N_type:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNNN}
+% \begin{macro}[EXP]{\@@_map_codepoint:Nnn}
+% \begin{macro}[EXP]{\@@_map_CR:Nnw}
+% \begin{macro}[EXP]{\@@_map_CR:NnN}
+% \begin{macro}[EXP]{\@@_map_class:Nnnn}
+% \begin{macro}[EXP]{\@@_map_class:nNnnn}
+% \begin{macro}[EXP]{\@@_map_class_loop:Nnnnw}
+% \begin{macro}[EXP]{\@@_map_class_end:nw}
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_Control:Nnn     ,
+%     \@@_map_Extend:Nnn      ,
+%     \@@_map_SpacingMark:Nnn ,
+%     \@@_map_Prepend:Nnn
+%   }
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_not_Control:Nnn     ,
+%     \@@_map_not_Extend:Nnn      ,
+%     \@@_map_not_SpacingMark:Nnn ,
+%     \@@_map_not_Prepend:Nnn     ,
+%     \@@_map_not_L:Nnn           ,
+%     \@@_map_not_LV:Nnn          ,
+%     \@@_map_not_LVT:Nnn
+%   }
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_L:Nnn  ,
+%     \@@_map_LV:Nnn  ,
+%     \@@_map_LVT:Nnn
+%   }
+% \begin{macro}[EXP]{\@@_map_hangul:Nnnw}
+% \begin{macro}[EXP]{\@@_map_hangul:NnnN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNNN}
+% \begin{macro}[EXP]{\@@_map_hangul_char:NnnNNNN}
+% \begin{macro}[EXP]{\@@_map_hangul:Nnnnw}
+% \begin{macro}[EXP]{\@@_map_hangul:nNnnnn}
+% \begin{macro}[EXP]{\@@_map_hangul_loop:Nnnnnw}
+% \begin{macro}[EXP]{\@@_map_hangul_next:Nnnn}
+% \begin{macro}[EXP]{\@@_map_hangul_end:nw}
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_hangul_L:Nnn   ,
+%     \@@_map_hangul_LV:Nnn  ,
+%     \@@_map_hangul_V:Nnn   ,
+%     \@@_map_hangul_LVT:Nnn ,
+%     \@@_map_hangul_T:Nnn
+%   }
+% \begin{macro}[EXP]{\@@_map_output:Nn}
+% \begin{macro}[EXP]{\text_map_break:}
+% \begin{macro}[EXP]{\text_map_break:n}
+%   The standard lead-off for an action loop.
+%    \begin{macrocode}
+\cs_new:Npn \text_map_function:nN #1#2 % #1 = text, #2 = function applied to each item
+  { \exp_args:Ne \@@_map_function:nN { \text_expand:n {#1} } #2 } % expand the text, then map over the result
+\cs_new:Npn \@@_map_function:nN #1#2 % #1 = expanded text, #2 = function
+  {
+    \@@_map_loop:Nnw #2 { } #1 % the loop starts with nothing collected
+      \q_@@_recursion_tail \q_@@_recursion_stop % end markers tested for by the loop
+    \prg_break_point:Nn \text_map_break: { } % where \text_map_break: jumps to
+  }
+%    \end{macrocode}
+%  The standard set up for an \enquote{action} loop. Groups are handled by
+%  recursion, spaces are treated similarly: both count as grapheme boundaries.
+%  For \texttt{N}-type tokens, we filter out control sequences (again
+%  a boundary), then move on to further analysis.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_loop:Nnw #1#2#3 \q_@@_recursion_stop % #1 = function, #2 = collected so far, #3 = remaining input
+  {
+    \tl_if_head_is_N_type:nTF {#3} % choose the handler from the kind of item at the head of #3
+      { \@@_map_N_type:NnN }
+      {
+        \tl_if_head_is_group:nTF {#3}
+          { \@@_map_group:Nnn }
+          { \@@_map_space:Nnw } % neither N-type nor a group: handled as a space
+      }
+    #1 {#2} #3 \q_@@_recursion_stop % the chosen handler receives everything back
+  }
+\cs_new:Npn \@@_map_group:Nnn #1#2#3 % #1 = function, #2 = collected so far, #3 = content of the group
+  {
+    \@@_map_output:Nn #1 {#2} % a group is a boundary: pass on what was collected
+    { % the results for the group content stay inside a brace group
+      \@@_map_loop:Nnw #1 { } #3 % recurse over the group (#3); using #2 here re-mapped the flushed text and lost the group
+        \q_@@_recursion_tail \q_@@_recursion_stop
+      \prg_break_point:Nn \text_map_break: { } % the inner loop finishes by breaking to here
+    }
+    \@@_map_loop:Nnw #1 { } % continue with the input after the group
+  }
+\use:x % x-expansion turns \c_space_tl into the space delimiting the second argument
+  { \cs_new:Npn \exp_not:N \@@_map_space:Nnw ##1##2 \c_space_tl }
+  {
+    \@@_map_output:Nn #1 {#2} % a space is a boundary: pass on what was collected
+    #1 { ~ } % the space itself is given to the function
+    \@@_map_loop:Nnw #1 { } % continue with nothing collected
+  }
+\cs_new:Npn \@@_map_N_type:NnN #1#2#3 % #1 = function, #2 = collected so far, #3 = next token
+  {
+    \@@_if_q_recursion_tail_stop_do:Nn #3 % end of input: pass on what is left and leave the loop
+      {
+        \@@_map_output:Nn #1 {#2}
+        \text_map_break:
+      }
+    \token_if_cs:NTF #3
+      { % a control sequence is a boundary and is passed on by itself
+        \@@_map_output:Nn #1 {#2}
+        #1 {#3}
+        \@@_map_loop:Nnw #1 { }
+      }
+      { \@@_map_char:NnN #1 {#2} #3 } % a character: collect the whole codepoint first
+  }
+%    \end{macrocode}
+%  We want to keep common code paths, so collect up one Unicode codepoint
+%  as a single argument in an engine-independent way.
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  { % LuaTeX or XeTeX: the single token #3 is handed on as the codepoint
+    \cs_new:Npn \@@_map_char:NnN #1#2#3
+      { \@@_map_codepoint:Nnn #1 {#2} #3 }
+  }
+  { % other engines: the value of the first byte selects how many further bytes to grab
+    \cs_new:Npn \@@_map_char:NnN #1#2#3
+      {
+        \int_compare:nNnTF { `#3 } > { "80 }
+          {
+            \int_compare:nNnTF { `#3 } < { "E0 }
+              { \@@_map_char:NnNN } % two bytes in total
+              {
+                 \int_compare:nNnTF { `#3 } < { "F0 }
+                   { \@@_map_char:NnNNN } % three bytes in total
+                   { \@@_map_char:NnNNNN } % four bytes in total
+              }
+          }
+          { \@@_map_codepoint:Nnn } % first byte not above "80: used as it stands
+            #1 {#2} #3
+      }
+    \cs_new:Npn \@@_map_char:NnNN #1#2#3#4 % the bytes are braced into one argument
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4} }
+    \cs_new:Npn \@@_map_char:NnNNN #1#2#3#4#5
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5} }
+    \cs_new:Npn \@@_map_char:NnNNNN #1#2#3#4#5#6
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5#6} }
+  }
+%    \end{macrocode}
+%  With one complete Unicode codepoint collected as a single argument, we
+%  can pull out the special cases. These are hard-coded for speed, so they
+%  do not actually use the grapheme data. The carriage return case needs a
+%  bit of context handling so has an auxiliary. Codepoint U+200D is the
+%  zero-width joiner, which has no context to concern us: just don't
+%  break.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_codepoint:Nnn #1#2#3 % #1 = function, #2 = collected so far, #3 = codepoint
+  {
+    \@@_map_codepoint_compare:nNnTF {#3} =  { "0D } % carriage return?
+      {
+        \@@_map_output:Nn #1 {#2} % pass on what was collected before it
+        \@@_map_CR:Nnw #1 {#3} % what happens next depends on the following item
+      }
+      {
+        \@@_map_codepoint_compare:nNnTF {#3} = { "200D } % zero-width joiner?
+          { \@@_map_loop:Nnw #1 {#2#3} } % keep collecting
+          { \@@_map_class:Nnnn #1 {#2} {#3} { Control } } % otherwise test the classes, Control first
+      }
+  }
+%    \end{macrocode}
+%   A carriage return is a boundary unless it is immediately followed by
+%   a line feed, in which case that pair is a boundary.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_CR:Nnw #1#2#3 \q_@@_recursion_stop % #1 = function, #2 = the carriage return, #3 = remaining input
+  {
+    \tl_if_head_is_N_type:nTF {#3}
+      { \@@_map_CR:NnN #1 {#2} } % an N-type token follows: examine it
+      { % a group or space follows: the carriage return is passed on by itself
+        #1 {#2}
+        \@@_map_loop:Nnw #1 { }
+      }
+        #3 \q_@@_recursion_stop % the remaining input goes back after the chosen code
+  }
+\cs_new:Npn \@@_map_CR:NnN #1#2#3 % #1 = function, #2 = the carriage return, #3 = next token
+  {
+    \@@_if_q_recursion_tail_stop_do:Nn #3 % end of input: the carriage return is passed on by itself
+      {
+        #1 {#2}
+        \text_map_break:
+      }
+    \bool_lazy_and:nnTF
+      { ! \token_if_cs_p:N #3 } % only take the character code of something that is not a control sequence
+      { \int_compare_p:nNn { `#3 } = { "0A } }
+      { % carriage return plus line feed: passed on together as one item
+        \@@_map_output:Nn #1 {#2#3}
+        \@@_map_loop:Nnw #1 { }
+      }
+      { #1 {#2} \@@_map_loop:Nnw #1 { } #3 } % pass on the lone carriage return (it used to be dropped), then re-read #3
+  }
+%    \end{macrocode}
+%   There are various classes of character, and we deal with them all in
+%   the same general way. We need to examine the relevant list of codepoints:
+%   if we get a hit, then we do whatever the relevant action is. Otherwise
+%   we loop, but only if the current codepoint could still match: the
+%   loop stops early otherwise and we move forward.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_class:Nnnn #1#2#3#4 % #1 = function, #2 = collected so far, #3 = codepoint, #4 = class name
+  {
+    \exp_args:Nv \@@_map_class:nNnnn { c_@@_grapheme_ #4 _clist } % fetch the list stored for class #4
+      #1 {#2} {#3} {#4}
+  }
+\cs_new:Npn \@@_map_class:nNnnn #1#2#3#4#5 % #1 = list content, then the four arguments as above
+  {
+    \@@_map_class_loop:Nnnnw #2 {#3} {#4} {#5}
+      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop % end markers in the same "start .. end ," shape as a list entry
+  }
+\cs_new:Npn \@@_map_class_loop:Nnnnw #1#2#3#4 #5 .. #6 , % #5 .. #6 = one entry from the list
+  {
+    \@@_if_q_recursion_tail_stop_do:nn {#5} % list exhausted: not in class #4
+      { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+    \@@_map_codepoint_compare:nNnTF {#3} < { "#5 }
+      { % below the start of this entry: stop looking, not in class #4
+        \@@_map_class_end:nw
+          { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+      }
+      {
+        \@@_map_codepoint_compare:nNnTF {#3} > { "#6 }
+          { \@@_map_class_loop:Nnnnw #1 {#2} {#3} {#4} } % above the end of this entry: try the next one
+          { % inside this entry: use the handler for class #4
+            \@@_map_class_end:nw
+              { \use:c { @@_map_ #4 :Nnn } #1 {#2} {#3} }
+          }
+      }
+  }
+\cs_new:Npn \@@_map_class_end:nw #1#2 \q_@@_recursion_stop {#1} % drop everything up to the stop marker, keep #1
+%    \end{macrocode}
+%   Break before \emph{and} after.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Control:Nnn #1#2#3 % #1 = function, #2 = collected so far, #3 = codepoint
+  {
+    \@@_map_output:Nn #1 {#2} % what came before is passed on first
+    \@@_map_output:Nn #1 {#3} % then the codepoint by itself
+    \@@_map_loop:Nnw #1 { } % and the loop restarts with nothing collected
+  }
+%    \end{macrocode}
+%   Keep collecting.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Extend:Nnn #1#2#3
+  { \@@_map_loop:Nnw #1 {#2#3} } % the codepoint is appended to what was collected
+\cs_new_eq:NN \@@_map_SpacingMark:Nnn \@@_map_Extend:Nnn % same action as Extend
+%    \end{macrocode}
+%   Retain and loop, outputting anything earlier.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Prepend:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2} % what came before is passed on
+    \@@_map_loop:Nnw #1 {#3} % the codepoint starts the next collection
+  }
+%    \end{macrocode}
+%   Dealing with end-of-class is done such that we can be flexible.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_not_Control:Nnn #1#2#3 % each of these tries the next class in turn
+  { \@@_map_class:Nnnn #1 {#2} {#3} { Extend } }
+\cs_new:Npn \@@_map_not_Extend:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { SpacingMark } }
+\cs_new:Npn \@@_map_not_SpacingMark:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { Prepend } }
+\cs_new:Npn \@@_map_not_Prepend:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { L } }
+\cs_new:Npn \@@_map_not_L:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { LV } }
+\cs_new:Npn \@@_map_not_LV:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { LVT } }
+\cs_new:Npn \@@_map_not_LVT:Nnn #1#2#3 % last class tested: nothing matched
+  {
+    \@@_map_output:Nn #1 {#2} % pass on what was collected
+    \@@_map_loop:Nnw #1 {#3} % the codepoint starts the next collection
+  }
+%    \end{macrocode}
+%   Hangul needs additional treatment. First we have to deal with
+%   the start-of-Hangul position: output what we had up to now, then
+%   move to the specialist handler. The idea here is to pick off the
+%   different codepoint types one at a time, tracking what else can be
+%   considered at each stage until we hit the end of the viable types.
+%   Other than that, we just keep building up the Hangul codepoints
+%   using a dedicated version of the loop from above.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_L:Nnn #1#2#3 % #1 = function, #2 = collected so far, #3 = codepoint
+  {
+    \@@_map_output:Nn #1 {#2} % pass on what was collected before the Hangul
+    \@@_map_hangul:Nnnw
+      #1 {#3} { L V { LV } { LVT } } % last argument: classes to test the next codepoint against
+  }
+\cs_new:Npn \@@_map_LV:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    \@@_map_hangul:Nnnw
+      #1 {#3} { V  T } % after LV only V and T are tested
+  }
+\cs_new:Npn \@@_map_LVT:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    \@@_map_hangul:Nnnw
+      #1 {#3} { T } % after LVT only T is tested
+  }
+\cs_new:Npn \@@_map_hangul:Nnnw #1#2#3#4 \q_@@_recursion_stop % #1 = function, #2 = Hangul collected, #3 = classes to test, #4 = remaining input
+  {
+    \tl_if_head_is_N_type:nTF {#4}
+      { \@@_map_hangul:NnnN #1 {#2} {#3} } % an N-type token follows: examine it
+      { % a group or space follows: pass on the Hangul and return to the main loop
+        #1 {#2}
+        \@@_map_loop:Nnw #1 { }
+      }
+    #4 \q_@@_recursion_stop % the remaining input goes back after the chosen code
+  }
+\cs_new:Npn \@@_map_hangul:NnnN #1#2#3#4 % #1 = function, #2 = Hangul collected, #3 = classes to test, #4 = next token
+  {
+    \@@_if_q_recursion_tail_stop_do:Nn #4 % end of input: pass on the Hangul and leave the loop
+      {
+        #1 {#2}
+        \text_map_break:
+      }
+    \token_if_cs:NTF #4
+      { % a control sequence ends the Hangul sequence
+        #1 {#2}
+        \@@_map_loop:Nnw #1 { } #4 % put #4 back for the main loop: it was grabbed as an argument and used to be lost
+      }
+      { \@@_map_hangul_char:NnnN #1 {#2} {#3} #4 } % a character: collect the whole codepoint first
+  }
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  { % LuaTeX or XeTeX: the single token #4 is handed on as the codepoint
+    \cs_new:Npn \@@_map_hangul_char:NnnN #1#2#3#4
+      { \@@_map_hangul:Nnnnw #1 {#2} {#4} #3 ; } % the class list #3 is unbraced and ended by the semicolon
+  }
+  { % other engines: the value of the first byte selects how many further bytes to grab
+    \cs_new:Npn \@@_map_hangul_char:NnnN #1#2#3#4
+      {
+        \int_compare:nNnTF { `#4 } > { "80 }
+          {
+            \int_compare:nNnTF { `#4 } < { "E0 }
+              { \@@_map_hangul_char:NnnNN } % two bytes in total
+              {
+                 \int_compare:nNnTF { `#4 } < { "F0 }
+                   { \@@_map_hangul_char:NnnNNN } % three bytes in total
+                   { \@@_map_hangul_char:NnnNNNN } % four bytes in total
+              }
+                #1 {#2} {#3} #4 
+          }
+          { \@@_map_hangul:Nnnnw #1 {#2} #4 #3 ; } % first byte not above "80: used as it stands
+      }
+    \cs_new:Npn \@@_map_hangul_char:NnnNN #1#2#3#4#5 % the bytes are braced into one argument
+      { \@@_map_hangul:Nnnnw #1 {#2} {#4#5} #3 ; }
+    \cs_new:Npn \@@_map_hangul_char:NnnNNN #1#2#3#4#5#6
+      { \@@_map_hangul:Nnnnw #1 {#2} {#4#5#6} #3 ; }
+    \cs_new:Npn \@@_map_hangul_char:NnnNNNN #1#2#3#4#5#6#7
+      { \@@_map_hangul:Nnnnw #1 {#2} {#4#5#6#7} #3 ; }
+  }
+\cs_new:Npn \@@_map_hangul:Nnnnw #1#2#3#4#5 ; % #2 = Hangul collected, #3 = codepoint, #4 = class to test now, #5 = classes left for later
+  {
+    \exp_args:Nv \@@_map_hangul:nNnnnn { c_@@_grapheme_ #4 _clist } % fetch the list stored for class #4
+      #1 {#2} {#3} {#4} {#5}
+  }
+\cs_new:Npn \@@_map_hangul:nNnnnn #1#2#3#4#5#6 % #1 = list content, then the five arguments as above
+  {
+    \@@_map_hangul_loop:Nnnnnw #2 {#3} {#4} {#5} {#6}
+      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop % end markers in the same "start .. end ," shape as a list entry
+  }
+\cs_new:Npn \@@_map_hangul_loop:Nnnnnw #1#2#3#4#5 #6 .. #7 , % #2 = Hangul collected, #3 = codepoint, #4 = class, #5 = classes left, #6 .. #7 = one list entry
+  {
+    \@@_if_q_recursion_tail_stop_do:nn {#6} % list exhausted: not in class #4, try the next class
+      { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} }
+    \@@_map_codepoint_compare:nNnTF {#3} < { "#6 }
+      { \@@_map_hangul_end:nw { \@@_map_hangul_next:Nnnn #1 {#2} {#3} {#5} } } % below this entry: discard the rest of the list (it used to stay in the input) and try the next class
+      {
+        \@@_map_codepoint_compare:nNnTF {#3} > { "#7 }
+          { \@@_map_hangul_loop:Nnnnnw #1 {#2} {#3} {#4} {#5} } % above this entry: try the next one
+          { % inside this entry: use the handler for class #4
+            \@@_map_hangul_end:nw
+              { \use:c { @@_map_hangul_ #4 :Nnn } #1 {#2} {#3} }
+          }
+      }
+  }
+\cs_new:Npn \@@_map_hangul_next:Nnnn #1#2#3#4 % #1 = function, #2 = Hangul collected, #3 = codepoint, #4 = classes left to test
+  {
+    \tl_if_blank:nTF {#4}
+      { % no class left: #3 does not continue the Hangul sequence
+        #1 {#2}
+        \@@_map_loop:Nnw #1 { } #3 % put #3 back for the main loop: it used to be discarded here
+      }
+      { \@@_map_hangul:Nnnnw #1 {#2} {#3} #4 ; } % test #3 against the next class in the list
+  }
+\cs_new:Npn \@@_map_hangul_end:nw #1#2 \q_@@_recursion_stop {#1} % drop everything up to the stop marker, keep #1
+\cs_new:Npn \@@_map_hangul_L:Nnn #1#2#3 % #1 = function, #2 = Hangul collected, #3 = codepoint
+  {
+    \@@_map_hangul:Nnnw
+      #1 {#2#3} { L V { LV } { LVT } } % append the codepoint; last argument: classes to test next
+  }
+\cs_new:Npn \@@_map_hangul_LV:Nnn #1#2#3
+  {
+    \@@_map_hangul:Nnnw
+      #1 {#2#3} { V  T }
+  }
+\cs_new_eq:NN \@@_map_hangul_V:Nnn \@@_map_hangul_LV:Nnn % V is handled exactly as LV
+\cs_new:Npn \@@_map_hangul_LVT:Nnn #1#2#3
+  {
+    \@@_map_hangul:Nnnw
+      #1 {#2#3} { T }
+  }
+\cs_new_eq:NN \@@_map_hangul_T:Nnn \@@_map_hangul_LVT:Nnn % T is handled exactly as LVT
+%    \end{macrocode}
+%   For the end of the process.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_output:Nn #1#2 % #1 = function, #2 = collected material
+  { \tl_if_blank:nF {#2} { #1 {#2} } } % the function is only applied when #2 is not blank
+\cs_new:Npn \text_map_break:
+  { \prg_map_break:Nn \text_map_break: { } } % break with no further code
+\cs_new:Npn \text_map_break:n
+  { \prg_map_break:Nn \text_map_break: } % the argument that follows in the input is the code for the break
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}[EXP, TF]{\@@_map_codepoint_compare:nNn}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:N, \@@_map_codepoint_compare_aux:N}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNNN}
+%   Allows comparison for all engines using a first \enquote{character} followed
+%   by a codepoint.
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  { % LuaTeX or XeTeX: the character code of #1 is compared directly
+    \prg_new_conditional:Npnn
+      \@@_map_codepoint_compare:nNn #1#2#3 { TF } % #1 = character, #2 = relation, #3 = codepoint value
+      {
+        \int_compare:nNnTF { `#1 } #2 {#3}
+          \prg_return_true: \prg_return_false:
+      }
+  }
+  { % other engines: #1 may be several bytes, turned into one number first
+    \prg_new_conditional:Npnn
+      \@@_map_codepoint_compare:nNn #1#2#3 { TF }
+      {
+        \int_compare:nNnTF { \@@_map_codepoint_compare:N #1 }
+            #2 {#3}
+          \prg_return_true: \prg_return_false:
+      }
+    \cs_new:Npn \@@_map_codepoint_compare:N #1 % #1 = first byte; the rest follow in the input
+      {
+        \if_int_compare:w `#1 > "80 \exp_stop_f:
+          \if_int_compare:w `#1 < "E0 \exp_stop_f:
+            \exp_after:wN \exp_after:wN \exp_after:wN % close both conditionals before the auxiliary runs
+              \@@_map_codepoint_compare:NN
+          \else:
+            \if_int_compare:w `#1 < "F0 \exp_stop_f:
+              \exp_after:wN \exp_after:wN \exp_after:wN % three conditional levels to close here
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \@@_map_codepoint_compare:NNN
+            \else:
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \@@_map_codepoint_compare:NNNN
+            \fi:
+          \fi:
+        \else:
+          \exp_after:wN \@@_map_codepoint_compare_aux:N
+        \fi:
+          #1 % the first byte is the first argument of whichever auxiliary was selected
+      }
+    \cs_new:Npn \@@_map_codepoint_compare_aux:N #1 { `#1 } % single byte: its character code
+    \cs_new:Npn \@@_map_codepoint_compare:NN #1#2 % two bytes
+      { (`#1 - "C0) * "40 + `#2 - "80 }
+    \cs_new:Npn \@@_map_codepoint_compare:NNN #1#2#3 % three bytes
+      { (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
+    \cs_new:Npn \@@_map_codepoint_compare:NNNN #1#2#3#4 % four bytes
+      {
+          (`#1 - "F0) * "40000 
+        + (`#2 - "80) * "1000
+        + (`#3 - "80) * "40
+        + `#4 - "80
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}{\text_map_inline:nn}
+%   The standard non-expandable inline version.
+%    \begin{macrocode}
+\cs_new_protected:Npn \text_map_inline:nn #1#2 % #1 = text, #2 = inline code using ##1 for each item
+  {
+    \int_gincr:N \g__kernel_prg_map_int % a fresh number so that each mapping gets its own function name
+    \cs_gset_protected:cpn
+      { @@_map_ \int_use:N \g__kernel_prg_map_int :w } ##1 {#2} % the inline code becomes a one-argument function
+    \exp_args:Nnc \text_map_function:nN {#1}
+      { @@_map_ \int_use:N \g__kernel_prg_map_int :w }
+    \prg_break_point:Nn \text_map_break:
+      { \int_gdecr:N \g__kernel_prg_map_int } % undo the increment once the mapping is over
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+%    \begin{macrocode}
+%</package>
+%    \end{macrocode}
+%
+% \end{implementation}
+%
+% \PrintIndex
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 7bba36eff..99f4a5364 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -299,6 +299,52 @@
 %   \texttt{true}.
 % \end{variable}
 %
+% \section{Mapping to graphemes}
+%
+% \begin{function}[rEXP, added = 2022-08-04]{\text_map_function:nN}
+%   \begin{syntax}
+%     \cs{text_map_function:nN} \Arg{text} \meta{function}
+%   \end{syntax}
+%   Takes user input \meta{text} and expands as described for
+%   \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+%   result, passing each grapheme to the \meta{function}.
+%   Broadly a grapheme is a \enquote{user perceived character}:
+%   the Unicode Consortium describe the decomposition of input to
+%   graphemes in depth, and the approach used here implements that
+%   algorithm. The \meta{function} should accept one argument as \meta{balanced
+%   text}: this may be a single codepoint, multiple codepoints (or with an
+%   $8$-bit engine bytes) or may be a control sequence.
+%   See also \cs{text_map_inline:nn}.
+% \end{function}
+%
+% \begin{function}[added = 2022-08-04]{\text_map_inline:nn}
+%   \begin{syntax}
+%     \cs{text_map_inline:nn} \Arg{text} \Arg{inline function}
+%   \end{syntax}
+%   Takes user input \meta{text} and expands as described for
+%   \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+%   result, passing each grapheme to the \meta{inline function}.
+%   Broadly a grapheme is a \enquote{user perceived character}:
+%   the Unicode Consortium describe the decomposition of input to
+%   graphemes in depth, and the approach used here implements that
+%   algorithm. The \meta{inline function} should consist of code which
+%   receives the grapheme as \meta{balanced
+%   text}: this may be a single codepoint, multiple codepoints (or with an
+%   8-bit engine bytes) or may be a control sequence.
+%   See also \cs{text_map_function:nN}.
+% \end{function}
+%
+% \begin{function}[rEXP, added = 2022-08-04]
+%   {\text_map_break:, \text_map_break:n}
+%   \begin{syntax}
+%     \cs{text_map_break:}
+%     \cs{text_map_break:n} \Arg{code}
+%   \end{syntax}
+%   Used to terminate a \cs[no-index]{text_map_\ldots} function before all
+%   entries in the \meta{text} have been processed. This
+%   normally takes place within a conditional statement.
+% \end{function}
+%
 % \end{documentation}
 %
 % \begin{implementation}
@@ -357,11 +403,14 @@
 % \end{macro}
 %
 % \begin{macro}[EXP]{\@@_if_q_recursion_tail_stop_do:Nn}
+% \begin{macro}[EXP]{\@@_if_q_recursion_tail_stop_do:nn}
 %   Functions to query recursion quarks.
 %    \begin{macrocode}
 \__kernel_quark_new_test:N \@@_if_q_recursion_tail_stop_do:Nn
+\__kernel_quark_new_test:N \@@_if_q_recursion_tail_stop_do:nn
 %    \end{macrocode}
 % \end{macro}
+% \end{macro}
 %
 % \begin{variable}{\s_@@_recursion_tail,\s_@@_recursion_stop}
 %   Internal scan marks quarks.
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index fbf1d0bd6..86bd39817 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -252,6 +252,54 @@
 %    \end{macrocode}
 %
 %    \begin{macrocode}
+%<@@=text>
+%    \end{macrocode}
+%
+%  Read the Unicode grapheme data. This is quite easy to handle and we only need
+%  codepoints, not characters, so there is no need to worry about the engine in use.
+%  As reading as a string is most convenient, we have to do some work to remove
+%  spaces: the hardest part of the entire process!
+%    \begin{macrocode}
+\ior_new:N \g_@@_data_ior
+\group_begin:
+  \ior_open:Nn \g_@@_data_ior { GraphemeBreakProperty.txt }
+  \cs_set_nopar:Npn \l_@@_tmpa_str { } % entries gathered for the current class
+  \cs_set_nopar:Npn \l_@@_tmpb_str { } % name of the current class
+  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ~ #3 \q_stop % #1 = text before the semicolon, #2 = class name, #3 = rest of the line
+    {
+      \str_if_eq:VnF \l_@@_tmpb_str {#2} % has the class name changed?
+        {
+          \str_if_empty:NF \l_@@_tmpb_str % nothing to store before the first class
+            {
+              \clist_const:cx { c_@@_grapheme_ \l_@@_tmpb_str _clist }
+                { \exp_after:wN \use_none:n \l_@@_tmpa_str } % \use_none:n removes the leading comma
+              \cs_set_nopar:Npn \l_@@_tmpa_str { }
+            }
+          \cs_set_nopar:Npn \l_@@_tmpb_str {#2}
+        }
+      \@@_data_auxii:w #1 .. #1 .. #1 \q_stop % when #1 has no ".." both ends of the entry are #1 itself
+    }
+  \cs_set_protected:Npn \@@_data_auxii:w #1 .. #2 .. #3 \q_stop % #1 = start, #2 = end, #3 = discarded
+    {
+      \cs_set_nopar:Npx \l_@@_tmpa_str
+        {
+          \l_@@_tmpa_str ,
+          \tl_trim_spaces:n {#1} .. \tl_trim_spaces:n {#2} % entry stored as "start .. end" without surrounding spaces
+        }
+    }
+  \ior_str_map_inline:Nn \g_@@_data_ior
+    {
+      \str_if_eq:eeF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str } % skip lines starting with a hash
+        {
+          \tl_if_blank:nF {#1} % skip blank lines
+            { \@@_data_auxi:w #1 \q_stop }
+        }
+    }
+  \ior_close:N \g_@@_data_ior
+\group_end:    
+%    \end{macrocode}
+%
+%    \begin{macrocode}
 %</package>
 %    \end{macrocode}
 %
diff --git a/l3kernel/testfiles/m3token001.tlg b/l3kernel/testfiles/m3text006.luatex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3token001.tlg
copy to l3kernel/testfiles/m3text006.luatex.tlg
index cd844648d..9b3a47f3f 100644
--- a/l3kernel/testfiles/m3token001.tlg
+++ b/l3kernel/testfiles/m3text006.luatex.tlg
@@ -2,7 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
 ============================================================
-A
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
+(가)
 ============================================================
diff --git a/l3kernel/testfiles-backend/m3color005.lvt b/l3kernel/testfiles/m3text006.lvt
similarity index 55%
copy from l3kernel/testfiles-backend/m3color005.lvt
copy to l3kernel/testfiles/m3text006.lvt
index e43d9a6aa..7030d17aa 100644
--- a/l3kernel/testfiles-backend/m3color005.lvt
+++ b/l3kernel/testfiles/m3text006.lvt
@@ -1,9 +1,6 @@
 %
 % Copyright (C) 2022 The LaTeX Project
 %
-
-\documentclass{minimal}
-
 \input{regression-test}
 
 \RequirePackage[enable-debug]{expl3}
@@ -11,27 +8,22 @@
 \debug_on:n { check-declarations , deprecation , log-functions }
 \ExplSyntaxOff
 
-\START
+\documentclass{minimal}
 
+\START
 \AUTHOR{Joseph Wright}
 
 \ExplSyntaxOn
 
 \OMIT
-  \cs_set_protected:Npn \test:n #1
-    {
-      \hbox_set:Nn \l_tmpa_box
-        {
-          #1
-          Hello
-        }
-      \box_show:N \l_tmpa_box
-    }
+  \cs_set:Npn \test:n #1 { (#1) }
 \TIMO
 
-\TEST { Current~color }
+\TESTEXP { Grapheme~mapping }
   {
-    \test:n { \color_ensure_current: }
+    \text_map_function:nN { Hello } \test:n \NEWLINE
+    \text_map_function:nN { Spın̈al~Tap } \test:n \NEWLINE
+    \text_map_function:nN { 각 } \test:n \NEWLINE
   }
 
 \END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3text005.ptex.tlg b/l3kernel/testfiles/m3text006.tlg
similarity index 73%
copy from l3kernel/testfiles/m3text005.ptex.tlg
copy to l3kernel/testfiles/m3text006.tlg
index 5735cebe5..b397eac19 100644
--- a/l3kernel/testfiles/m3text005.ptex.tlg
+++ b/l3kernel/testfiles/m3text006.tlg
@@ -2,14 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: \@uclclist\ entries
+TEST 1: Grapheme mapping
 ============================================================
-Ё\cyryo 
-Ё\CYRYO 
-Ё\CYRYO 
-Ё\CYRYO 
-ё\cyryo 
-ё\CYRYO 
-ё\CYRYO 
-ё\CYRYO 
+(H)(e)(l)(l)(o)
+(S)(p)(^^c4^^b1)(n^^cc^^88)(a)(l)( )(T)(a)(p)
+(^^e1^^84^^80^^e1^^85^^a1)
 ============================================================
diff --git a/l3kernel/testfiles/m3token001.tlg b/l3kernel/testfiles/m3text006.xetex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3token001.tlg
copy to l3kernel/testfiles/m3text006.xetex.tlg
index cd844648d..9b3a47f3f 100644
--- a/l3kernel/testfiles/m3token001.tlg
+++ b/l3kernel/testfiles/m3text006.xetex.tlg
@@ -2,7 +2,9 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
 ============================================================
-A
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
+(가)
 ============================================================





More information about the latex3-commits mailing list.