[latex3-commits] [git/LaTeX3-latex3-latex3] text-map: Add \text_map_... functions (e001783e7)

Thu Aug 4 21:45:14 CEST 2022

Repository : https://github.com/latex3/latex3
On branch  : text-map
Link       : https://github.com/latex3/latex3/commit/e001783e71d69c888e888c068f439d8abcba9979

>---------------------------------------------------------------

commit e001783e71d69c888e888c068f439d8abcba9979
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Thu Aug 4 20:34:50 2022 +0100

    Add \text_map_... functions
    
    Currently the regional indicator rule is not implemented:
    I am not clear on the exact requirement.


>---------------------------------------------------------------

e001783e71d69c888e888c068f439d8abcba9979
 l3kernel/CHANGELOG.md                              |   2 +
 l3kernel/doc/source3body.tex                       |   1 +
 l3kernel/l3.ins                                    |   1 +
 l3kernel/l3text-map.dtx                            | 443 +++++++++++++++++++++
 l3kernel/l3text.dtx                                |  49 +++
 l3kernel/l3unicode.dtx                             |  48 +++
 ...{m3token001.luatex.tlg => m3text006.luatex.tlg} |   7 +-
 .../m3color005.lvt => testfiles/m3text006.lvt}     |  23 +-
 .../testfiles/{m3token001.tlg => m3text006.tlg}    |   5 +-
 .../{m3token001.luatex.tlg => m3text006.xetex.tlg} |   7 +-
 10 files changed, 560 insertions(+), 26 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 3a4e44ad3..9733c15d1 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,8 @@ this project uses date-based 'snapshot' version identifiers.
 - Support for case changing Croatian diagraph with 8-bit engines
 - Support accent removal when uppercasing Greek with 8-bit engines
 - Function `\sys_ensure_backend:`
+- `\text_map_function:nN` and `\text_map_inline:nn` for mapping to
+  graphemes in textual input
 
 ### Fixed
 - Behavior of `\color_math:nn` in alignments
diff --git a/l3kernel/doc/source3body.tex b/l3kernel/doc/source3body.tex
index 20e187536..c9b0f3448 100644
--- a/l3kernel/doc/source3body.tex
+++ b/l3kernel/doc/source3body.tex
@@ -589,6 +589,7 @@ used on top of \LaTeXe{} if \cs{outer} tokens are used in the arguments.
 \clist_gput_right:Nn \g_docinput_clist
   {
     l3text-case.dtx ,
+    l3text-map.dtx  ,
     l3text-purify.dtx
   }
 \ExplSyntaxOff
diff --git a/l3kernel/l3.ins b/l3kernel/l3.ins
index fbcc815d9..08949c097 100644
--- a/l3kernel/l3.ins
+++ b/l3kernel/l3.ins
@@ -106,6 +106,7 @@ and all files in that bundle must be distributed together.
         \from{l3unicode.dtx}    {package}
         \from{l3text.dtx}       {package}
         \from{l3text-case.dtx}  {package}
+        \from{l3text-map.dtx}   {package}
         \from{l3text-purify.dtx}{package}
         \from{l3candidates.dtx} {package}
         \from{l3legacy.dtx}     {package}
diff --git a/l3kernel/l3text-map.dtx b/l3kernel/l3text-map.dtx
new file mode 100644
index 000000000..a99b8deb7
--- /dev/null
+++ b/l3kernel/l3text-map.dtx
@@ -0,0 +1,443 @@
+% \iffalse meta-comment
+%
+%% File: l3text-map.dtx
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+% It may be distributed and/or modified under the conditions of the
+% LaTeX Project Public License (LPPL), either version 1.3c of this
+% license or (at your option) any later version.  The latest version
+% of this license is in the file
+%
+%    https://www.latex-project.org/lppl.txt
+%
+% This file is part of the "l3kernel bundle" (The Work in LPPL)
+% and all files in that bundle must be distributed together.
+%
+% -----------------------------------------------------------------------
+%
+% The development version of the bundle can be found at
+%
+%    https://github.com/latex3/latex3
+%
+% for those people who are interested.
+%
+%<*driver>
+\documentclass[full,kernel]{l3doc}
+\begin{document}
+  \DocInput{\jobname.dtx}
+\end{document}
+%</driver>
+% \fi
+%
+% \title{^^A
+%   The \textsf{l3text-map} package: text processing (mapping)^^A
+% }
+%
+% \author{^^A
+%  The \LaTeX{} Project\thanks
+%    {^^A
+%      E-mail:
+%        \href{mailto:latex-team at latex-project.org}
+%          {latex-team at latex-project.org}^^A
+%    }^^A
+% }
+%
+% \date{Released 2022-07-15}
+%
+% \maketitle
+%
+% \begin{documentation}
+%
+% \end{documentation}
+%
+% \begin{implementation}
+%
+% \section{\pkg{l3text-map} implementation}
+%
+%    \begin{macrocode}
+%<*package>
+%    \end{macrocode}
+%
+%    \begin{macrocode}
+%<@@=text>
+%    \end{macrocode}
+%
+% \subsection{Mapping to text}
+%
+% \begin{macro}[EXP]{\text_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_function:nN}
+% \begin{macro}[EXP]{\@@_map_loop:Nnw}
+% \begin{macro}[EXP]{\@@_map_group:Nnn}
+% \begin{macro}[EXP]{\@@_map_space:Nnw}
+% \begin{macro}[EXP]{\@@_map_N_type:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnN}
+% \begin{macro}[EXP]{\@@_map_char:NnNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNN}
+% \begin{macro}[EXP]{\@@_map_char:NnNNNN}
+% \begin{macro}[EXP]{\@@_map_codepoint:Nnn}
+% \begin{macro}[EXP]{\@@_map_CR:Nnw}
+% \begin{macro}[EXP]{\@@_map_CR:NnN}
+% \begin{macro}[EXP]{\@@_map_class:Nnnn}
+% \begin{macro}[EXP]{\@@_map_class:nNnnn}
+% \begin{macro}[EXP]{\@@_map_class_loop:Nnnnw}
+% \begin{macro}[EXP]{\@@_map_class_end:nw}
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_Control:Nnn     ,
+%     \@@_map_Extend:Nnn      ,
+%     \@@_map_Prepend:Nnn     ,
+%     \@@_map_SpacingMark:Nnn
+%   }
+% \begin{macro}[EXP]
+%   {
+%     \@@_map_not_Control:Nnn     ,
+%     \@@_map_not_Extend:Nnn      ,
+%     \@@_map_not_Prepend:Nnn     ,
+%     \@@_map_not_SpacingMark:Nnn
+%   }
+% \begin{macro}[EXP]{\@@_map_output:Nn}
+% \begin{macro}[EXP]{\text_map_break:}
+% \begin{macro}[EXP]{\text_map_break:n}
+%   The standard lead-off for an action loop.
+%    \begin{macrocode}
+\cs_new:Npn \text_map_function:nN #1#2
+  { \exp_args:Ne \@@_map_function:nN { \text_expand:n {#1} } #2 }
+\cs_new:Npn \@@_map_function:nN #1#2
+  {
+    \@@_map_loop:Nnw #2 { } #1
+      \q_@@_recursion_tail \q_@@_recursion_stop
+    \prg_break_point:Nn \text_map_break: { }
+  }
+%    \end{macrocode}
+%  The standard set up for an \enquote{action} loop. Groups are handled by
+%  recursion over their content, spaces are treated similarly: both count as
+%  grapheme boundaries, so pending material is output first. For \texttt{N}-type
+%  tokens, we filter out control sequences (again a boundary), then analyse further.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_loop:Nnw #1#2#3 \q_@@_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#3}
+      { \@@_map_N_type:NnN }
+      {
+        \tl_if_head_is_group:nTF {#3}
+          { \@@_map_group:Nnn }
+          { \@@_map_space:Nnw }
+      }
+    #1 {#2} #3 \q_@@_recursion_stop
+  }
+\cs_new:Npn \@@_map_group:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    {
+      \@@_map_loop:Nnw #1 { } #3
+        \q_@@_recursion_tail \q_@@_recursion_stop
+      \prg_break_point:Nn \text_map_break: { }
+    }
+    \@@_map_loop:Nnw #1 { }
+  }
+\use:x
+  { \cs_new:Npn \exp_not:N \@@_map_space:Nnw ##1##2 \c_space_tl }
+  {
+    \@@_map_output:Nn #1 {#2}
+    #1 { ~ }
+    \@@_map_loop:Nnw #1 { }
+  }
+\cs_new:Npn \@@_map_N_type:NnN #1#2#3
+  {
+    \@@_if_recursion_tail_stop_do:Nn #3
+      {
+        \@@_map_output:Nn #1 {#2}
+        \text_map_break:
+      }
+    \token_if_cs:NTF #3
+      {
+        \@@_map_output:Nn #1 {#2}
+        #1 {#3}
+        \@@_map_loop:Nnw #1 { }
+      }
+      { \@@_map_char:NnN #1 {#2} #3 }
+  }
+%    \end{macrocode}
+%  We want to keep common code paths, so collect up one Unicode codepoint
+%  as a single argument in an engine-independent way.
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_new:Npn \@@_map_char:NnN #1#2#3
+      { \@@_map_codepoint:Nnn #1 {#2} #3 }
+  }
+  {
+    \cs_new:Npn \@@_map_char:NnN #1#2#3
+      {
+        \int_compare:nNnTF { `#3 } > { "80 }
+          {
+            \int_compare:nNnTF { `#3 } < { "E0 }
+              { \@@_map_char:NnNN }
+              {
+                 \int_compare:nNnTF { `#3 } < { "F0 }
+                   { \@@_map_char:NnNNN }
+                   { \@@_map_char:NnNNNN }
+              }
+          }
+          { \@@_map_codepoint:Nnn }
+            #1 {#2} #3
+      }
+    \cs_new:Npn \@@_map_char:NnNN #1#2#3#4
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4} }
+    \cs_new:Npn \@@_map_char:NnNNN #1#2#3#4#5
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5} }
+    \cs_new:Npn \@@_map_char:NnNNNN #1#2#3#4#5#6
+      { \@@_map_codepoint:Nnn #1 {#2} {#3#4#5#6} }
+  }
+%    \end{macrocode}
+%  With one complete Unicode codepoint now available as a single argument,
+%  whatever the engine in use, we can pull out
+%  the special cases: hard-coded for speed so not actually using the
+%  grapheme data. The carriage return case needs a bit of context handling
+%  so has an auxiliary. Codepoint U+200D is the zero-width joiner, which has
+%  no context to concern us: just don't break.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_codepoint:Nnn #1#2#3
+  {
+    \@@_map_codepoint_compare:nNnTF {#3} =  { "0D }
+      {
+        \@@_map_output:Nn #1 {#2}
+        \@@_map_CR:Nnw #1 {#3}
+      }
+      {
+        \@@_map_codepoint_compare:nNnTF {#3} = { "200D }
+          { \@@_map_loop:Nnw #1 {#2#3} }
+          { \@@_map_class:Nnnn #1 {#2} {#3} { Control } }
+      }
+  }
+%    \end{macrocode}
+%   A carriage return is a grapheme on its own unless it is immediately
+%   followed by a line feed, in which case the pair is output as one grapheme.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_CR:Nnw #1#2#3 \q_@@_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#3}
+      { \@@_map_CR:NnN #1 {#2} }
+      {
+        #1 {#2}
+        \@@_map_loop:Nnw #1 { }
+      }
+        #3 \q_@@_recursion_stop
+  }
+\cs_new:Npn \@@_map_CR:NnN #1#2#3
+  {
+    \@@_if_recursion_tail_stop_do:Nn #3
+      {
+        #1 {#2}
+        \text_map_break:
+      }
+    \bool_lazy_and:nnTF
+      { ! \token_if_cs_p:N #3 }
+      { \int_compare_p:nNn { `#3 } = { "0A } }
+      {
+        \@@_map_output:Nn #1 {#2#3}
+        \@@_map_loop:Nnw #1 { }
+      }
+      { #1 {#2} \@@_map_loop:Nnw #1 { } #3 }
+  }
+%    \end{macrocode}
+%   There are various classes of character, and we deal with them all in
+%   the same general way. We need to examine the relevant list of codepoints:
+%   if we get a hit, then we do whatever the relevant action is. Otherwise
+%   we loop, but only if the current codepoint could still match: the
+%   loop stops early otherwise and we move forward.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_class:Nnnn #1#2#3#4
+  {
+    \exp_args:Nv \@@_map_class:nNnnn { c_@@_grapheme_ #4 _clist }
+      #1 {#2} {#3} {#4}
+  }
+\cs_new:Npn \@@_map_class:nNnnn #1#2#3#4#5
+  {
+    \@@_map_class_loop:Nnnnw #2 {#3} {#4} {#5}
+      #1 , \q_@@_recursion_tail .. , \q_@@_recursion_stop
+  }
+\cs_new:Npn \@@_map_class_loop:Nnnnw #1#2#3#4 #5 .. #6 ,
+  {
+    \@@_if_recursion_tail_stop_do:nn {#5}
+      { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+    \@@_map_codepoint_compare:nNnTF {#3} < { "#5 }
+      {
+        \@@_map_class_end:nw
+          { \use:c { @@_map_not_ #4 :Nnn } #1 {#2} {#3} }
+      }
+      {
+        \@@_map_codepoint_compare:nNnTF {#3} > { "#6 }
+          { \@@_map_class_loop:Nnnnw #1 {#2} {#3} {#4} }
+          {
+            \@@_map_class_end:nw
+              { \use:c { @@_map_ #4 :Nnn } #1 {#2} {#3} }
+          }
+      }
+  }
+\cs_new:Npn \@@_map_class_end:nw #1#2 \q_@@_recursion_stop {#1}
+%    \end{macrocode}
+%   Break before \emph{and} after.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Control:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    \@@_map_output:Nn #1 {#3}
+    \@@_map_loop:Nnw #1 { }
+  }
+%    \end{macrocode}
+%   Keep collecting.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Extend:Nnn #1#2#3
+  { \@@_map_loop:Nnw #1 {#2#3} }
+\cs_new_eq:NN \@@_map_SpacingMark:Nnn \@@_map_Extend:Nnn
+%    \end{macrocode}
+%   Retain and loop, outputting anything earlier.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_Prepend:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    \@@_map_loop:Nnw #1 {#3}
+  }
+%    \end{macrocode}
+%   Dealing with end-of-class is done such that we can be flexible.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_not_Control:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { Extend } }
+\cs_new:Npn \@@_map_not_Extend:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { SpacingMark } }
+\cs_new:Npn \@@_map_not_SpacingMark:Nnn #1#2#3
+  { \@@_map_class:Nnnn #1 {#2} {#3} { Prepend } }
+\cs_new:Npn \@@_map_not_Prepend:Nnn #1#2#3
+  {
+    \@@_map_output:Nn #1 {#2}
+    \@@_map_loop:Nnw #1 {#3}
+  }
+%    \end{macrocode}
+%   For the end of the process.
+%    \begin{macrocode}
+\cs_new:Npn \@@_map_output:Nn #1#2
+  { \tl_if_blank:nF {#2} { #1 {#2} } }
+\cs_new:Npn \text_map_break:
+  { \prg_map_break:Nn \text_map_break: { } }
+\cs_new:Npn \text_map_break:n
+  { \prg_map_break:Nn \text_map_break: }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}[EXP, TF]{\@@_map_codepoint_compare:nNn}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:N, \@@_map_codepoint_compare_aux:N}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNN}
+% \begin{macro}[EXP]{\@@_map_codepoint_compare:NNNN}
+%   Allows comparison for all engines using a first \enquote{character} followed
+%   by a codepoint.
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \prg_new_conditional:Npnn
+      \@@_map_codepoint_compare:nNn #1#2#3 { TF }
+      {
+        \int_compare:nNnTF { `#1 } #2 {#3}
+          \prg_return_true: \prg_return_false:
+      }
+  }
+  {
+    \prg_new_conditional:Npnn
+      \@@_map_codepoint_compare:nNn #1#2#3 { TF }
+      {
+        \int_compare:nNnTF { \@@_map_codepoint_compare:N #1 }
+            #2 {#3}
+          \prg_return_true: \prg_return_false:
+      }
+    \cs_new:Npn \@@_map_codepoint_compare:N #1
+      {
+        \if_int_compare:w `#1 > "80 \exp_stop_f:
+          \if_int_compare:w `#1 < "E0 \exp_stop_f:
+            \exp_after:wN \exp_after:wN \exp_after:wN
+              \@@_map_codepoint_compare:NN
+          \else:
+            \if_int_compare:w `#1 < "F0 \exp_stop_f:
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \@@_map_codepoint_compare:NNN
+            \else:
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \exp_after:wN \exp_after:wN
+              \exp_after:wN \@@_map_codepoint_compare:NNNN
+            \fi:
+          \fi:
+        \else:
+          \exp_after:wN \@@_map_codepoint_compare_aux:N
+        \fi:
+          #1
+      }
+    \cs_new:Npn \@@_map_codepoint_compare_aux:N #1 { `#1 }
+    \cs_new:Npn \@@_map_codepoint_compare:NN #1#2
+      { (`#1 - "C0) * "40 + `#2 - "80 }
+    \cs_new:Npn \@@_map_codepoint_compare:NNN #1#2#3
+      { (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
+    \cs_new:Npn \@@_map_codepoint_compare:NNNN #1#2#3#4
+      {
+          (`#1 - "F0) * "40000 
+        + (`#2 - "80) * "1000
+        + (`#3 - "80) * "40
+        + `#4 - "80
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}{\text_map_inline:nn}
+%   The standard non-expandable inline version.
+%    \begin{macrocode}
+\cs_new_protected:Npn \text_map_inline:nn #1#2
+  {
+    \int_gincr:N \g__kernel_prg_map_int
+    \cs_gset_protected:cpn
+      { @@_map_ \int_use:N \g__kernel_prg_map_int :w } ##1 {#2}
+    \exp_args:Nnc \text_map_function:nN {#1}
+      { @@_map_ \int_use:N \g__kernel_prg_map_int :w }
+    \prg_break_point:Nn \text_map_break:
+      { \int_gdecr:N \g__kernel_prg_map_int }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+%    \begin{macrocode}
+%</package>
+%    \end{macrocode}
+%
+% \end{implementation}
+%
+% \PrintIndex
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index a98eb3cae..f1110bdf2 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -299,6 +299,52 @@
 %   \texttt{true}.
 % \end{variable}
 %
+% \section{Mapping to graphemes}
+%
+% \begin{function}[rEXP, added = 2022-08-04]
+%   \begin{syntax}
+%     \cs{text_map_function:nN} \Arg{text} \meta{function}
+%   \end{syntax}
+%   Takes user input \meta{text} and expands as described for
+%   \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+%   result, passing each grapheme to the \meta{function}.
+%   Broadly a grapheme is a \enquote{user perceived character}:
+%   the Unicode Consortium describe the decomposition of input to
+%   graphemes in depth, and the approach used here implements that
+%   algorithm. The \meta{function} should accept one argument as \meta{balanced
+%   text}: this may be a single codepoint, multiple codepoints (or, with an
+%   $8$-bit engine, bytes) or may be a control sequence.
+%   See also \cs{text_map_inline:nn}.
+% \end{function}
+%
+% \begin{function}[added = 2022-08-04]
+%   \begin{syntax}
+%     \cs{text_map_inline:nn} \Arg{text} \Arg{inline function}
+%   \end{syntax}
+%   Takes user input \meta{text} and expands as described for
+%   \cs{text_expand:n}, then maps over the \emph{graphemes} within the
+%   result, passing each grapheme to the \meta{inline function}.
+%   Broadly a grapheme is a \enquote{user perceived character}:
+%   the Unicode Consortium describe the decomposition of input to
+%   graphemes in depth, and the approach used here implements that
+%   algorithm. The \meta{inline function} should consist of code which
+%   receives the grapheme as \meta{balanced
+%   text}: this may be a single codepoint, multiple codepoints (or, with an
+%   $8$-bit engine, bytes) or may be a control sequence.
+%   See also \cs{text_map_function:nN}.
+% \end{function}
+%
+% \begin{function}[rEXP, added = 2022-08-04]
+%   {\text_map_break:, \text_map_break:n}
+%   \begin{syntax}
+%     \cs{text_map_break:}
+%     \cs{text_map_break:n} \Arg{code}
+%   \end{syntax}
+%   Used to terminate a \cs[no-index]{text_map_\ldots} function before all
+%   entries in the \meta{text} have been processed. This
+%   normally takes place within a conditional statement.
+% \end{function}
+%
 % \end{documentation}
 %
 % \begin{implementation}
@@ -357,11 +403,14 @@
 % \end{macro}
 %
 % \begin{macro}[EXP]{\@@_if_recursion_tail_stop_do:Nn}
+% \begin{macro}[EXP]{\@@_if_recursion_tail_stop_do:nn}
 %   Functions to query recursion quarks.
 %    \begin{macrocode}
 \__kernel_quark_new_test:N \@@_if_recursion_tail_stop_do:Nn
+\__kernel_quark_new_test:N \@@_if_recursion_tail_stop_do:nn
 %    \end{macrocode}
 % \end{macro}
+% \end{macro}
 %
 % \subsection{Utilities}
 %
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index fbf1d0bd6..86bd39817 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -252,6 +252,54 @@
 %    \end{macrocode}
 %
 %    \begin{macrocode}
+%<@@=text>
+%    \end{macrocode}
+%
+%  Read the Unicode grapheme data. This is quite easy to handle and we only need
+%  codepoints, not characters, so there is no need to worry about the engine in use.
+%  As reading as a string is most convenient, we have to do some work to remove
+%  spaces: the hardest part of the entire process!
+%    \begin{macrocode}
+\ior_new:N \g_@@_data_ior
+\group_begin:
+  \ior_open:Nn \g_@@_data_ior { GraphemeBreakProperty.txt }
+  \cs_set_nopar:Npn \l_@@_tmpa_str { }
+  \cs_set_nopar:Npn \l_@@_tmpb_str { }
+  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ~ #3 \q_stop
+    {
+      \str_if_eq:VnF \l_@@_tmpb_str {#2}
+        {
+          \str_if_empty:NF \l_@@_tmpb_str
+            {
+              \clist_const:cx { c_@@_grapheme_ \l_@@_tmpb_str _clist }
+                { \exp_after:wN \use_none:n \l_@@_tmpa_str }
+              \cs_set_nopar:Npn \l_@@_tmpa_str { }
+            }
+          \cs_set_nopar:Npn \l_@@_tmpb_str {#2}
+        }
+      \@@_data_auxii:w #1 .. #1 .. #1 \q_stop
+    }
+  \cs_set_protected:Npn \@@_data_auxii:w #1 .. #2 .. #3 \q_stop
+    {
+      \cs_set_nopar:Npx \l_@@_tmpa_str
+        {
+          \l_@@_tmpa_str ,
+          \tl_trim_spaces:n {#1} .. \tl_trim_spaces:n {#2}
+        }
+    }
+  \ior_str_map_inline:Nn \g_@@_data_ior
+    {
+      \str_if_eq:eeF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str }
+        {
+          \tl_if_blank:nF {#1}
+            { \@@_data_auxi:w #1 \q_stop }
+        }
+    }
+  \ior_close:N \g_@@_data_ior
+\group_end:    
+%    \end{macrocode}
+%
+%    \begin{macrocode}
 %</package>
 %    \end{macrocode}
 %
diff --git a/l3kernel/testfiles/m3token001.luatex.tlg b/l3kernel/testfiles/m3text006.luatex.tlg
similarity index 80%
copy from l3kernel/testfiles/m3token001.luatex.tlg
copy to l3kernel/testfiles/m3text006.luatex.tlg
index bdf4776d3..a7c25ab1a 100644
--- a/l3kernel/testfiles/m3token001.luatex.tlg
+++ b/l3kernel/testfiles/m3text006.luatex.tlg
@@ -2,9 +2,8 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
 ============================================================
-A
-á
-ῒ
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
 ============================================================
diff --git a/l3kernel/testfiles-backend/m3color005.lvt b/l3kernel/testfiles/m3text006.lvt
similarity index 55%
copy from l3kernel/testfiles-backend/m3color005.lvt
copy to l3kernel/testfiles/m3text006.lvt
index e43d9a6aa..e18e34cdb 100644
--- a/l3kernel/testfiles-backend/m3color005.lvt
+++ b/l3kernel/testfiles/m3text006.lvt
@@ -1,9 +1,6 @@
 %
 % Copyright (C) 2022 The LaTeX Project
 %
-
-\documentclass{minimal}
-
 \input{regression-test}
 
 \RequirePackage[enable-debug]{expl3}
@@ -11,27 +8,21 @@
 \debug_on:n { check-declarations , deprecation , log-functions }
 \ExplSyntaxOff
 
-\START
+\documentclass{minimal}
 
+\START
 \AUTHOR{Joseph Wright}
 
 \ExplSyntaxOn
 
 \OMIT
-  \cs_set_protected:Npn \test:n #1
-    {
-      \hbox_set:Nn \l_tmpa_box
-        {
-          #1
-          Hello
-        }
-      \box_show:N \l_tmpa_box
-    }
+  \cs_set:Npn \test:n #1 { (#1) }
 \TIMO
 
-\TEST { Current~color }
-  {
-    \test:n { \color_ensure_current: }
+\TESTEXP{Grapheme~mapping}
+  {%
+    \text_map_function:nN { Hello } \test:n \NEWLINE
+    \text_map_function:nN { Spın̈al~Tap } \test:n
   }
 
 \END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3token001.tlg b/l3kernel/testfiles/m3text006.tlg
similarity index 78%
copy from l3kernel/testfiles/m3token001.tlg
copy to l3kernel/testfiles/m3text006.tlg
index cd844648d..d2e8e1d97 100644
--- a/l3kernel/testfiles/m3token001.tlg
+++ b/l3kernel/testfiles/m3text006.tlg
@@ -2,7 +2,8 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
 ============================================================
-A
+(H)(e)(l)(l)(o)
+(S)(p)(^^c4^^b1)(n^^cc^^88)(a)(l)( )(T)(a)(p)
 ============================================================
diff --git a/l3kernel/testfiles/m3token001.luatex.tlg b/l3kernel/testfiles/m3text006.xetex.tlg
similarity index 80%
copy from l3kernel/testfiles/m3token001.luatex.tlg
copy to l3kernel/testfiles/m3text006.xetex.tlg
index bdf4776d3..a7c25ab1a 100644
--- a/l3kernel/testfiles/m3token001.luatex.tlg
+++ b/l3kernel/testfiles/m3text006.xetex.tlg
@@ -2,9 +2,8 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Unicode NFD
+TEST 1: Grapheme mapping
 ============================================================
-A
-á
-ῒ
+(H)(e)(l)(l)(o)
+(S)(p)(ı)(n̈)(a)(l)( )(T)(a)(p)
 ============================================================