[latex3-commits] [git/LaTeX3-latex3-latex3] unicode-data: Rename \char_to_utfviii_bytes:n to \codepoint_to_bytes:n (31a1124e2)

Sun Oct 9 19:35:35 CEST 2022

Repository : https://github.com/latex3/latex3
On branch  : unicode-data
Link       : https://github.com/latex3/latex3/commit/31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613

>---------------------------------------------------------------

commit 31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Sun Oct 9 17:44:09 2022 +0100

    Rename \char_to_utfviii_bytes:n to \codepoint_to_bytes:n
    
    This fits a more general pattern of moving functions:
    several commits will address this.
    
    There is still the question of byte order here:
    do we want to change and 'fill from the bottom'?


>---------------------------------------------------------------

31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613
 l3kernel/CHANGELOG.md                      |   4 +
 l3kernel/doc/l3obsolete.txt                |   1 +
 l3kernel/l3candidates.dtx                  |  12 --
 l3kernel/l3deprecation.dtx                 |   7 ++
 l3kernel/l3str-convert.dtx                 |   2 +-
 l3kernel/l3str.dtx                         |   2 +-
 l3kernel/l3text-case.dtx                   |  16 +--
 l3kernel/l3text-purify.dtx                 |   4 +-
 l3kernel/l3token.dtx                       | 105 ----------------
 l3kernel/l3unicode.dtx                     | 192 +++++++++++++++++++++++------
 l3kernel/testfiles/m3char001.luatex.tlg    |  16 +--
 l3kernel/testfiles/m3char001.lvt           |   8 --
 l3kernel/testfiles/m3char001.ptex.tlg      |  16 +--
 l3kernel/testfiles/m3char001.tlg           |  16 +--
 l3kernel/testfiles/m3char001.uptex.tlg     |  16 +--
 l3kernel/testfiles/m3char001.xetex.tlg     |  16 +--
 l3kernel/testfiles/m3text006.lvt           |   2 +-
 l3kernel/testfiles/m3token006.luatex.tlg   |  12 +-
 l3kernel/testfiles/m3token006.lvt          |  10 --
 l3kernel/testfiles/m3token006.tlg          |  12 +-
 l3kernel/testfiles/m3token006.xetex.tlg    |  12 +-
 l3kernel/testfiles/m3unicode001.luatex.tlg |  10 ++
 l3kernel/testfiles/m3unicode001.lvt        |  10 ++
 l3kernel/testfiles/m3unicode001.tlg        |  10 ++
 l3kernel/testfiles/m3unicode001.xetex.tlg  |  10 ++
 25 files changed, 244 insertions(+), 277 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 6e55f71e0..0d32ba176 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers.
 ## [Unreleased]
 
 ### Added
+- `\codepoint_to_bytes:n`
 - `\codepoint_str_generate:n`
 
 ### Changed
@@ -19,6 +20,9 @@ this project uses date-based 'snapshot' version identifiers.
   tokens (issue [\#1110](https://github.com/latex3/latex3/issues/1110)), and an
   esoteric case (issue [\#1113](https://github.com/latex3/latex3/issues/1113))
 
+### Deprecated
+- `\char_to_utfviii_bytes:n`
+
 ## [2022-09-28]
 
 ### Added
diff --git a/l3kernel/doc/l3obsolete.txt b/l3kernel/doc/l3obsolete.txt
index 6f8844ce6..2377bb985 100644
--- a/l3kernel/doc/l3obsolete.txt
+++ b/l3kernel/doc/l3obsolete.txt
@@ -22,6 +22,7 @@ Function                            Date deprecated
 \char_str_lower_case:N                   2020-01-03
 \char_str_mixed_case:N                   2020-01-03
 \char_str_upper_case:N                   2020-01-03
+\char_to_utfviii_bytes:n                 2022-10-09
 \cs_argument_spec:N                      2022-06-24
 \l_keys_key_tl                           2020-02-08
 \l_keys_path_tl                          2020-02-08
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 1bcbfc4ec..13a0e9b5c 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -626,18 +626,6 @@
 %   (\enquote{active}), and character code $32$ (space).
 % \end{variable}
 %
-% \begin{function}[added = 2020-01-09, EXP]{\char_to_utfviii_bytes:n}
-%   \begin{syntax}
-%     \cs{char_to_utfviii_bytes:n} \Arg{codepoint}
-%   \end{syntax}
-%   Converts the (Unicode) \meta{codepoint} to UTF-8 bytes. The expansion
-%   of this function comprises four brace groups, each of which will contain
-%   a hexadecimal value: the appropriate byte. As UTF-8 is a variable-length,
-%   one or more of the groups may be empty: the bytes read in the logical order,
-%   such that a two-byte codepoint will have groups |#1| and |#2| filled
-%   and |#3| and |#4| empty.
-% \end{function}
-%
 % \begin{function}[added = 2020-01-02, rEXP]{\char_to_nfd:N}
 %   \begin{syntax}
 %     \cs{char_to_nfd:N} \meta{char}
diff --git a/l3kernel/l3deprecation.dtx b/l3kernel/l3deprecation.dtx
index 02f353516..85c532113 100644
--- a/l3kernel/l3deprecation.dtx
+++ b/l3kernel/l3deprecation.dtx
@@ -549,6 +549,13 @@
 %
 % \subsection{Deprecated \pkg{l3token} functions}
 %
+% \begin{macro}[EXP]{\char_to_utfviii_bytes:n}
+%    \begin{macrocode}
+\__kernel_patch_deprecation:nnNNpn { 2022-10-09 } { \codepoint_to_bytes:n } % date as listed in l3obsolete.txt; second argument names the replacement
+\cs_gset:Npn \char_to_utfviii_bytes:n { \codepoint_to_bytes:n } % old name simply expands to the new function, which then reads the argument
+%    \end{macrocode}
+% \end{macro}
+%
 % \begin{macro}[EXP]
 %   {
 %     \char_lower_case:N, \char_upper_case:N,
diff --git a/l3kernel/l3str-convert.dtx b/l3kernel/l3str-convert.dtx
index f2d379346..3b5df6dda 100644
--- a/l3kernel/l3str-convert.dtx
+++ b/l3kernel/l3str-convert.dtx
@@ -2635,7 +2635,7 @@
     \cs_new:Npn \@@_convert_pdfname_bytes:n #1
       {
         \exp_args:Ne \@@_convert_pdfname_bytes_aux:n
-          { \char_to_utfviii_bytes:n {`#1} }
+          { \codepoint_to_bytes:n {`#1} }
       }
     \cs_new:Npn \@@_convert_pdfname_bytes_aux:n #1
       { \@@_convert_pdfname_bytes_aux:nnnn #1 }
diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
index 82e907b88..c7237635c 100644
--- a/l3kernel/l3str.dtx
+++ b/l3kernel/l3str.dtx
@@ -2056,7 +2056,7 @@
       \use:e
         {
           \exp_not:N \@@_change_case_generate:nnnn
-            \char_to_utfviii_bytes:n {#1}
+            \codepoint_to_bytes:n {#1}
         }
     }
   \cs_new:Npn \@@_change_case_generate:nnnn #1#2#3#4
diff --git a/l3kernel/l3text-case.dtx b/l3kernel/l3text-case.dtx
index cad0ed295..18c2188e6 100644
--- a/l3kernel/l3text-case.dtx
+++ b/l3kernel/l3text-case.dtx
@@ -1868,7 +1868,7 @@
                   }
               }
             \use:x
-              { \@@_tmp:w \char_to_utfviii_bytes:n { "#2 } }  
+              { \@@_tmp:w \codepoint_to_bytes:n { "#2 } }  
           \group_end:
         }
       \@@_tmp:w \c_@@_dotless_i_tl      { 0131 }
@@ -1902,8 +1902,8 @@
           \use:x
             {
               \@@_tmp:w
-                \char_to_utfviii_bytes:n { "#1 }
-                \char_to_utfviii_bytes:n { "#2 }
+                \codepoint_to_bytes:n { "#1 }
+                \codepoint_to_bytes:n { "#2 }
             }
           \@@_loop:nn
         }
@@ -2183,8 +2183,8 @@
             \use:x
               {
                 \@@_tmp:w
-                  \char_to_utfviii_bytes:n { "#1 }
-                  \char_to_utfviii_bytes:n { "#2 }
+                  \codepoint_to_bytes:n { "#1 }
+                  \codepoint_to_bytes:n { "#2 }
               }
           \group_end:
         }
@@ -2238,7 +2238,7 @@
                     {#2}
               }
             \use:x
-              { \@@_tmp:w \char_to_utfviii_bytes:n { "#1 } }
+              { \@@_tmp:w \codepoint_to_bytes:n { "#1 } }
           \group_end:
         }
       \@@_tmp:w { 00DF } { SS } { upper }
@@ -2463,8 +2463,8 @@
             \use:x
               {
                 \@@_tmp:w
-                  \char_to_utfviii_bytes:n { "#1 }
-                  \char_to_utfviii_bytes:n { "#2 }
+                  \codepoint_to_bytes:n { "#1 }
+                  \codepoint_to_bytes:n { "#2 }
               }
           \group_end:
         }
diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
index 7caaeb0b5..dcec40bca 100644
--- a/l3kernel/l3text-purify.dtx
+++ b/l3kernel/l3text-purify.dtx
@@ -486,7 +486,7 @@
         \text_declare_purify_equivalent:Nx #1
           {
             \exp_args:Ne \@@_tmp:n
-              { \char_to_utfviii_bytes:n { "#2 } }
+              { \codepoint_to_bytes:n { "#2 } }
           }
         \@@_loop:Nn
       }
@@ -574,7 +574,7 @@
       \cs_set:Npn \@@_tmp:n #1
         {
           \exp_args:Ne \@@_tmp_aux:n
-            { \char_to_utfviii_bytes:n { "#1 } }
+            { \codepoint_to_bytes:n { "#1 } }
         }
       \cs_set:Npn \@@_tmp_aux:n #1 { \@@_tmp:nnnn #1 }
       \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 307cb8d08..356d88d57 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1690,111 +1690,6 @@
 % \end{macro}
 % \end{macro}
 %
-% \begin{macro}[EXP]{\char_to_utfviii_bytes:n}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxi:n}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxii:Nnn}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxiii:n}
-% \begin{macro}[EXP]
-%   {
-%     \@@_to_utfviii_bytes_outputi:nw   ,
-%     \@@_to_utfviii_bytes_outputii:nw  ,
-%     \@@_to_utfviii_bytes_outputiii:nw ,
-%     \@@_to_utfviii_bytes_outputiv:nw
-%   }
-% \begin{macro}[EXP]
-%   {\@@_to_utfviii_bytes_output:nnn, \@@_to_utfviii_bytes_output:fnn}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_end:}
-%   This code converts a codepoint into the correct UTF-8 representation.
-%   In terms of the algorithm itself, see
-%   \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
-%    \begin{macrocode}
-\cs_new:Npn \char_to_utfviii_bytes:n #1
-  {
-    \exp_args:Nf \@@_to_utfviii_bytes_auxi:n
-      { \int_eval:n {#1} }
-  }
-\cs_new:Npn \@@_to_utfviii_bytes_auxi:n #1
-  {
-    \if_int_compare:w #1 > "80 \exp_stop_f:
-      \if_int_compare:w #1 < "800 \exp_stop_f:
-        \@@_to_utfviii_bytes_outputi:nw
-          { \@@_to_utfviii_bytes_auxii:Nnn C {#1} { 64 } }
-        \@@_to_utfviii_bytes_outputii:nw
-          { \@@_to_utfviii_bytes_auxiii:n {#1} }
-      \else:
-        \if_int_compare:w #1 < "10000 \exp_stop_f:
-          \@@_to_utfviii_bytes_outputi:nw
-            { \@@_to_utfviii_bytes_auxii:Nnn E {#1} { 64 * 64 } }
-          \@@_to_utfviii_bytes_outputii:nw
-            {
-              \@@_to_utfviii_bytes_auxiii:n
-                { \int_div_truncate:nn {#1} { 64 } }
-            }
-          \@@_to_utfviii_bytes_outputiii:nw
-            { \@@_to_utfviii_bytes_auxiii:n {#1} }
-        \else:
-          \@@_to_utfviii_bytes_outputi:nw
-            {
-              \@@_to_utfviii_bytes_auxii:Nnn F
-                 {#1} { 64 * 64 * 64 }
-            }
-          \@@_to_utfviii_bytes_outputii:nw
-            {
-              \@@_to_utfviii_bytes_auxiii:n
-                { \int_div_truncate:nn {#1} { 64 * 64 } }
-            }
-          \@@_to_utfviii_bytes_outputiii:nw
-            {
-              \@@_to_utfviii_bytes_auxiii:n
-                { \int_div_truncate:nn {#1} { 64 } }
-            }
-          \@@_to_utfviii_bytes_outputiv:nw
-            { \@@_to_utfviii_bytes_auxiii:n {#1} }
-        \fi:
-      \fi:
-    \else:
-      \@@_to_utfviii_bytes_outputi:nw {#1}
-    \fi:
-    \@@_to_utfviii_bytes_end: { } { } { } { }
-  }
-\cs_new:Npn \@@_to_utfviii_bytes_auxii:Nnn #1#2#3
-  {  "#10 + \int_div_truncate:nn {#2} {#3} }
-\cs_new:Npn \@@_to_utfviii_bytes_auxiii:n #1
-  { \int_mod:nn {#1} { 64 } + 128 }
-\cs_new:Npn \@@_to_utfviii_bytes_outputi:nw
-  #1 #2 \@@_to_utfviii_bytes_end: #3
-  { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { } {#2} }
-\cs_new:Npn \@@_to_utfviii_bytes_outputii:nw
-  #1 #2 \@@_to_utfviii_bytes_end: #3#4
-  { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} }
-\cs_new:Npn \@@_to_utfviii_bytes_outputiii:nw
-  #1 #2 \@@_to_utfviii_bytes_end: #3#4#5
-  {
-    \@@_to_utfviii_bytes_output:fnn
-      { \int_eval:n {#1} } { {#3} {#4} } {#2}
-  }
-\cs_new:Npn \@@_to_utfviii_bytes_outputiv:nw
-  #1 #2 \@@_to_utfviii_bytes_end: #3#4#5#6
-  {
-    \@@_to_utfviii_bytes_output:fnn
-      { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2}
-  }
-\cs_new:Npn \@@_to_utfviii_bytes_output:nnn #1#2#3
-  {
-    #3
-    \@@_to_utfviii_bytes_end: #2 {#1}
-  }
-\cs_generate_variant:Nn \@@_to_utfviii_bytes_output:nnn { f }
-\cs_new:Npn \@@_to_utfviii_bytes_end: { }
-%    \end{macrocode}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-%
 % \begin{macro}[EXP]{\char_to_nfd:N}
 % \begin{macro}[EXP]{\char_to_nfd:n}
 % \begin{macro}[EXP]{\@@_to_nfd:nn}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index 0974cae51..a4cd8ae96 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -99,6 +99,18 @@
 %   category code $10$.
 % \end{function}
 %
+% \begin{function}[added = 2022-10-09, EXP]{\codepoint_to_bytes:n}
+%   \begin{syntax}
+%     \cs{codepoint_to_bytes:n} \Arg{codepoint}
+%   \end{syntax}
+%   Converts the \meta{codepoint} to UTF-8 bytes. The expansion
+%   of this function comprises four brace groups, each of which will contain
+%   a decimal value: the appropriate byte. As UTF-8 is a variable-length
+%   encoding, one or more of the groups may be empty: the bytes are given in
+%   logical order, such that a two-byte codepoint will have groups |#1| and
+%   |#2| filled and |#3| and |#4| empty.
+% \end{function}
+%
 % \end{documentation}
 %
 % \begin{implementation}
@@ -113,39 +125,7 @@
 %<@@=codepoint>
 %    \end{macrocode}
 %
-% Text operations requires data from the Unicode Consortium. Data read into
-% Unicode engine formats is at best a small part of what we need, so there
-% is a loader here to set  up the appropriate data structures.
-%
-% Where we need data for most or all of the Unicode range, we use the two-stage
-% table approach recommended by the Unicode Consortium and demonstrated in a
-% model implementation in Python in
-% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
-% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
-% random access and avoids significant hash table usage. In contrast, where
-% only a small subset of codepoints are required, storage as macros is
-% preferable. There is also some consideration of the effort needed to load
-% data: see for example the grapheme breaking information, which would be 
-% problematic to convert into a two-stage table but which can be used with
-% reasonable performance in a small number of comma lists (at the cost that
-% breaking at higher codepoint Hangul characters will be slightly slow).
-%
-% \begin{variable}{\c_@@_block_size_int}
-%   Choosing the block size for the blocks in the two-stage approach is
-%   non-trivial: depending on the data stored, the optimal size for
-%   memory usage will vary. At the same time, for us there is also the
-%   question of load-time: larger blocks require longer comma lists
-%   as intermediates, so are slower. As this is going to be needed
-%   to use the data, we set it up outside of the group for clarity.
-%    \begin{macrocode}
-\int_const:Nn \c_@@_block_size_int { 64 }
-%    \end{macrocode}
-% \end{variable}
-%
-% Parsing the data files can be the same way for all engines, but where they
-% are stored as character tokens, the construction method depends on whether
-% they are Unicode or $8$-bit internally. Parsing is therefore done by common
-% functions, with some data storage using engine-specific auxiliaries.
+% \subsection{User functions}
 %
 % \begin{macro}[EXP]{\codepoint_str_generate:n}
 % \begin{macro}[EXP]{\@@_str_generate:nnnn}
@@ -189,7 +169,7 @@
             \use:e
               {
                 \exp_not:N \@@_str_generate:nnnn
-                  \char_to_utfviii_bytes:n {#1}
+                  \codepoint_to_bytes:n {#1}
               }
           }
       }
@@ -221,7 +201,7 @@
                 \use:e
                   {
                     \exp_not:N \@@_generate:nnnn
-                      \char_to_utfviii_bytes:n {#1}
+                      \codepoint_to_bytes:n {#1}
                   }
               }
           }
@@ -256,6 +236,147 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}[EXP]{\codepoint_to_bytes:n}
+% \begin{macro}[EXP]{\@@_to_bytes_auxi:n}
+% \begin{macro}[EXP]{\@@_to_bytes_auxii:Nnn}
+% \begin{macro}[EXP]{\@@_to_bytes_auxiii:n}
+% \begin{macro}[EXP]
+%   {
+%     \@@_to_bytes_outputi:nw   ,
+%     \@@_to_bytes_outputii:nw  ,
+%     \@@_to_bytes_outputiii:nw ,
+%     \@@_to_bytes_outputiv:nw
+%   }
+% \begin{macro}[EXP]
+%   {\@@_to_bytes_output:nnn, \@@_to_bytes_output:fnn}
+% \begin{macro}[EXP]{\@@_to_bytes_end:}
+%   This code converts a codepoint into the correct UTF-8 representation.
+%   In terms of the algorithm itself, see
+%   \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
+%    \begin{macrocode}
+\cs_new:Npn \codepoint_to_bytes:n #1 % #1: integer expression giving the codepoint
+  {
+    \exp_args:Nf \@@_to_bytes_auxi:n
+      { \int_eval:n {#1} } % evaluate once: the auxiliaries reuse the plain number
+  }
+\cs_new:Npn \@@_to_bytes_auxi:n #1 % choose the 1-, 2-, 3- or 4-byte form by range
+  {
+    \if_int_compare:w #1 > "7F \exp_stop_f: % U+0080 is the first codepoint needing two bytes
+      \if_int_compare:w #1 < "800 \exp_stop_f: % two bytes: lead "C0 + one continuation
+        \@@_to_bytes_outputi:nw
+          { \@@_to_bytes_auxii:Nnn C {#1} { 64 } }
+        \@@_to_bytes_outputii:nw
+          { \@@_to_bytes_auxiii:n {#1} }
+      \else:
+        \if_int_compare:w #1 < "10000 \exp_stop_f: % three bytes: lead "E0 + two continuations
+          \@@_to_bytes_outputi:nw
+            { \@@_to_bytes_auxii:Nnn E {#1} { 64 * 64 } }
+          \@@_to_bytes_outputii:nw
+            {
+              \@@_to_bytes_auxiii:n
+                { \int_div_truncate:nn {#1} { 64 } }
+            }
+          \@@_to_bytes_outputiii:nw
+            { \@@_to_bytes_auxiii:n {#1} }
+        \else: % four bytes: lead "F0 + three continuations
+          \@@_to_bytes_outputi:nw
+            {
+              \@@_to_bytes_auxii:Nnn F
+                 {#1} { 64 * 64 * 64 }
+            }
+          \@@_to_bytes_outputii:nw
+            {
+              \@@_to_bytes_auxiii:n
+                { \int_div_truncate:nn {#1} { 64 * 64 } }
+            }
+          \@@_to_bytes_outputiii:nw
+            {
+              \@@_to_bytes_auxiii:n
+                { \int_div_truncate:nn {#1} { 64 } }
+            }
+          \@@_to_bytes_outputiv:nw
+            { \@@_to_bytes_auxiii:n {#1} }
+        \fi:
+      \fi:
+    \else: % single byte: the codepoint itself
+      \@@_to_bytes_outputi:nw {#1}
+    \fi:
+    \@@_to_bytes_end: { } { } { } { } % four empty slots, filled in by the output functions
+  }
+\cs_new:Npn \@@_to_bytes_auxii:Nnn #1#2#3 % lead byte: hex digit #1 followed by 0, plus #2 div #3
+  {  "#10 + \int_div_truncate:nn {#2} {#3} }
+\cs_new:Npn \@@_to_bytes_auxiii:n #1 % continuation byte: low six bits of #1 plus 128
+  { \int_mod:nn {#1} { 64 } + 128 }
+\cs_new:Npn \@@_to_bytes_outputi:nw % #1 = byte expression, #2 = tokens up to the end marker
+  #1 #2 \@@_to_bytes_end: #3
+  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { } {#2} } % drops slot 1 (#3) and stores the byte there
+\cs_new:Npn \@@_to_bytes_outputii:nw
+  #1 #2 \@@_to_bytes_end: #3#4
+  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} } % keeps slot 1, replaces slot 2
+\cs_new:Npn \@@_to_bytes_outputiii:nw
+  #1 #2 \@@_to_bytes_end: #3#4#5
+  {
+    \@@_to_bytes_output:fnn
+      { \int_eval:n {#1} } { {#3} {#4} } {#2} % keeps slots 1-2, replaces slot 3
+  }
+\cs_new:Npn \@@_to_bytes_outputiv:nw
+  #1 #2 \@@_to_bytes_end: #3#4#5#6
+  {
+    \@@_to_bytes_output:fnn
+      { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2} % keeps slots 1-3, replaces slot 4
+  }
+\cs_new:Npn \@@_to_bytes_output:nnn #1#2#3 % #1 = new byte, #2 = earlier slots, #3 = pending tokens
+  {
+    #3
+    \@@_to_bytes_end: #2 {#1} % pending tokens go back in front of the marker, new byte after the kept slots
+  }
+\cs_generate_variant:Nn \@@_to_bytes_output:nnn { f }
+\cs_new:Npn \@@_to_bytes_end: { } % the marker expands to nothing, leaving just the four groups
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \subsection{Data loader}
+%
+% Text operations require data from the Unicode Consortium. Data read into
+% Unicode engine formats is at best a small part of what we need, so there
+% is a loader here to set up the appropriate data structures.
+%
+% Where we need data for most or all of the Unicode range, we use the two-stage
+% table approach recommended by the Unicode Consortium and demonstrated in a
+% model implementation in Python in
+% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
+% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
+% random access and avoids significant hash table usage. In contrast, where
+% only a small subset of codepoints are required, storage as macros is
+% preferable. There is also some consideration of the effort needed to load
+% data: see for example the grapheme breaking information, which would be 
+% problematic to convert into a two-stage table but which can be used with
+% reasonable performance in a small number of comma lists (at the cost that
+% breaking at higher codepoint Hangul characters will be slightly slow).
+%
+% \begin{variable}{\c_@@_block_size_int}
+%   Choosing the block size for the blocks in the two-stage approach is
+%   non-trivial: depending on the data stored, the optimal size for
+%   memory usage will vary. At the same time, for us there is also the
+%   question of load-time: larger blocks require longer comma lists
+%   as intermediates, so are slower. As this is going to be needed
+%   to use the data, we set it up outside of the group for clarity.
+%    \begin{macrocode}
+\int_const:Nn \c_@@_block_size_int { 64 }
+%    \end{macrocode}
+% \end{variable}
+%
+% Parsing the data files can be the same way for all engines, but where they
+% are stored as character tokens, the construction method depends on whether
+% they are Unicode or $8$-bit internally. Parsing is therefore done by common
+% functions, with some data storage using engine-specific auxiliaries.
+%
 % As only the data needs to remain at the end of this process, everything
 % is set up inside a group. The only thing that is outside is creating a
 % stream: they are global anyway and it is best to force a stream for
@@ -721,7 +842,6 @@
 % \end{macro}
 % \end{macro}
 %
-%
 % \begin{macro}[EXP]{\__kernel_codepoint_nfd:n}
 % \begin{macro}[EXP]{\@@_nfd:nn}
 %   A simple interface.
diff --git a/l3kernel/testfiles/m3char001.luatex.tlg b/l3kernel/testfiles/m3char001.luatex.tlg
index 0b33b0a9d..fe8482250 100644
--- a/l3kernel/testfiles/m3char001.luatex.tlg
+++ b/l3kernel/testfiles/m3char001.luatex.tlg
@@ -489,15 +489,7 @@ cell 2
 C
 ============================================================
 ============================================================
-TEST 7: \char_to_utfviii_bytes:n 
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
 ============================================================
 begin-group character A
 end-group character A
@@ -511,7 +503,7 @@ the character A
 undefined
 ============================================================
 ============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the letter a).
@@ -595,7 +587,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the character a).
@@ -679,7 +671,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 11: Changing \lccode and \uccode 
+TEST 10: Changing \lccode and \uccode 
 ============================================================
 The token list contains the tokens:
 >  q (the character q).
diff --git a/l3kernel/testfiles/m3char001.lvt b/l3kernel/testfiles/m3char001.lvt
index 3a217d4e4..775ba9c86 100644
--- a/l3kernel/testfiles/m3char001.lvt
+++ b/l3kernel/testfiles/m3char001.lvt
@@ -143,14 +143,6 @@
       }
   }
 
-\TESTEXP { \char_to_utfviii_bytes:n }
-  {
-    \char_to_utfviii_bytes:n { `A } \NEWLINE
-    \char_to_utfviii_bytes:n { "03A9 } \NEWLINE
-    \char_to_utfviii_bytes:n { "1200 } \NEWLINE
-    \char_to_utfviii_bytes:n { "10000 }
-  }
-
 \OMIT
 \cs_gset:Npn \test:nn #1#2
   {
diff --git a/l3kernel/testfiles/m3char001.ptex.tlg b/l3kernel/testfiles/m3char001.ptex.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.ptex.tlg
+++ b/l3kernel/testfiles/m3char001.ptex.tlg
@@ -518,15 +518,7 @@ cell 2
 C
 ============================================================
 ============================================================
-TEST 7: \char_to_utfviii_bytes:n 
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
 ============================================================
 begin-group character A
 end-group character A
@@ -540,7 +532,7 @@ the character A
 undefined
 ============================================================
 ============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 11: Changing \lccode and \uccode 
+TEST 10: Changing \lccode and \uccode 
 ============================================================
 The token list contains the tokens:
 >  q (the character q).
diff --git a/l3kernel/testfiles/m3char001.tlg b/l3kernel/testfiles/m3char001.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.tlg
+++ b/l3kernel/testfiles/m3char001.tlg
@@ -518,15 +518,7 @@ cell 2
 C
 ============================================================
 ============================================================
-TEST 7: \char_to_utfviii_bytes:n 
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
 ============================================================
 begin-group character A
 end-group character A
@@ -540,7 +532,7 @@ the character A
 undefined
 ============================================================
 ============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 11: Changing \lccode and \uccode 
+TEST 10: Changing \lccode and \uccode 
 ============================================================
 The token list contains the tokens:
 >  q (the character q).
diff --git a/l3kernel/testfiles/m3char001.uptex.tlg b/l3kernel/testfiles/m3char001.uptex.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.uptex.tlg
+++ b/l3kernel/testfiles/m3char001.uptex.tlg
@@ -518,15 +518,7 @@ cell 2
 C
 ============================================================
 ============================================================
-TEST 7: \char_to_utfviii_bytes:n 
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
 ============================================================
 begin-group character A
 end-group character A
@@ -540,7 +532,7 @@ the character A
 undefined
 ============================================================
 ============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 11: Changing \lccode and \uccode 
+TEST 10: Changing \lccode and \uccode 
 ============================================================
 The token list contains the tokens:
 >  q (the character q).
diff --git a/l3kernel/testfiles/m3char001.xetex.tlg b/l3kernel/testfiles/m3char001.xetex.tlg
index 8854c31f0..5e42184bd 100644
--- a/l3kernel/testfiles/m3char001.xetex.tlg
+++ b/l3kernel/testfiles/m3char001.xetex.tlg
@@ -489,15 +489,7 @@ cell 2
 C
 ============================================================
 ============================================================
-TEST 7: \char_to_utfviii_bytes:n 
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
 ============================================================
 begin-group character A
 end-group character A
@@ -511,7 +503,7 @@ the character A
 undefined
 ============================================================
 ============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the letter a).
@@ -595,7 +587,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
 ============================================================
 The token list contains the tokens:
 >  a (the character a).
@@ -679,7 +671,7 @@ The token list contains the tokens:
 l. ...    }
 ============================================================
 ============================================================
-TEST 11: Changing \lccode and \uccode 
+TEST 10: Changing \lccode and \uccode 
 ============================================================
 The token list contains the tokens:
 >  q (the character q).
diff --git a/l3kernel/testfiles/m3text006.lvt b/l3kernel/testfiles/m3text006.lvt
index 2357e6bb5..5c2d8b86a 100644
--- a/l3kernel/testfiles/m3text006.lvt
+++ b/l3kernel/testfiles/m3text006.lvt
@@ -127,7 +127,7 @@
           {
             \exp_args:Ne \test_generate_aux:n
               {
-                \exp_args:Ne \char_to_utfviii_bytes:n
+                \exp_args:Ne \codepoint_to_bytes:n
                   { " \tl_trim_spaces:n {#1} }
               }
           }
diff --git a/l3kernel/testfiles/m3token006.luatex.tlg b/l3kernel/testfiles/m3token006.luatex.tlg
index a1c542abe..f5a12fbb0 100644
--- a/l3kernel/testfiles/m3token006.luatex.tlg
+++ b/l3kernel/testfiles/m3token006.luatex.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
 ============================================================
 A
 Î
diff --git a/l3kernel/testfiles/m3token006.lvt b/l3kernel/testfiles/m3token006.lvt
index 9aeea764c..cdb8df767 100644
--- a/l3kernel/testfiles/m3token006.lvt
+++ b/l3kernel/testfiles/m3token006.lvt
@@ -15,16 +15,6 @@
 
 \ExplSyntaxOn
 
-\TESTEXP { Byte_decomposition }
-  {
-    \char_to_utfviii_bytes:n { `A } \NEWLINE
-    \char_to_utfviii_bytes:n { "00CE } \NEWLINE
-    \char_to_utfviii_bytes:n { "0389 } \NEWLINE
-    \char_to_utfviii_bytes:n { "1DAD } \NEWLINE
-    \char_to_utfviii_bytes:n { "FFFD } \NEWLINE
-    \char_to_utfviii_bytes:n { "10000 }
-  }
-
 \TESTEXP { Character~decomposition }
   {
     \char_to_nfd:n { `A } \NEWLINE
diff --git a/l3kernel/testfiles/m3token006.tlg b/l3kernel/testfiles/m3token006.tlg
index 45256798d..7ab6b11e2 100644
--- a/l3kernel/testfiles/m3token006.tlg
+++ b/l3kernel/testfiles/m3token006.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
 ============================================================
 A
 I^^cc^^82
diff --git a/l3kernel/testfiles/m3token006.xetex.tlg b/l3kernel/testfiles/m3token006.xetex.tlg
index a1c542abe..f5a12fbb0 100644
--- a/l3kernel/testfiles/m3token006.xetex.tlg
+++ b/l3kernel/testfiles/m3token006.xetex.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
 ============================================================
 A
 Î
diff --git a/l3kernel/testfiles/m3unicode001.luatex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg
index a812f4345..0e243b747 100644
--- a/l3kernel/testfiles/m3unicode001.luatex.tlg
+++ b/l3kernel/testfiles/m3unicode001.luatex.tlg
@@ -21,3 +21,13 @@ X X
 X X
 XaX
 ============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================
diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt
index ed724c47e..73e6a41df 100644
--- a/l3kernel/testfiles/m3unicode001.lvt
+++ b/l3kernel/testfiles/m3unicode001.lvt
@@ -42,4 +42,14 @@
     X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE
   }
 
+\TESTEXP { Byte~decomposition }
+  {
+    \codepoint_to_bytes:n { `A } \NEWLINE
+    \codepoint_to_bytes:n { "00CE } \NEWLINE
+    \codepoint_to_bytes:n { "0389 } \NEWLINE
+    \codepoint_to_bytes:n { "1DAD } \NEWLINE
+    \codepoint_to_bytes:n { "FFFD } \NEWLINE
+    \codepoint_to_bytes:n { "10000 }
+  }
+
 \END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3unicode001.tlg b/l3kernel/testfiles/m3unicode001.tlg
index 2924a5588..21244eb8b 100644
--- a/l3kernel/testfiles/m3unicode001.tlg
+++ b/l3kernel/testfiles/m3unicode001.tlg
@@ -21,3 +21,13 @@ X X
 X X
 XaX
 ============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================
diff --git a/l3kernel/testfiles/m3unicode001.xetex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg
index a812f4345..0e243b747 100644
--- a/l3kernel/testfiles/m3unicode001.xetex.tlg
+++ b/l3kernel/testfiles/m3unicode001.xetex.tlg
@@ -21,3 +21,13 @@ X X
 X X
 XaX
 ============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================