[latex3-commits] [git/LaTeX3-latex3-latex3] unicode-data: Rename \char_to_utfviii_bytes:n to \codepoint_to_bytes:n (31a1124e2)
Joseph Wright
joseph.wright at morningstar2.co.uk
Sun Oct 9 19:35:35 CEST 2022
Repository : https://github.com/latex3/latex3
On branch : unicode-data
Link : https://github.com/latex3/latex3/commit/31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613
>---------------------------------------------------------------
commit 31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Sun Oct 9 17:44:09 2022 +0100
Rename \char_to_utfviii_bytes:n to \codepoint_to_bytes:n
This fits a more general pattern of moving functions:
several commits will address this.
There is still the question of byte order here:
do we want to change this and 'fill from the bottom'?
>---------------------------------------------------------------
31a1124e2fe6d1f19dba3d1fb921dc1cf91e9613
l3kernel/CHANGELOG.md | 4 +
l3kernel/doc/l3obsolete.txt | 1 +
l3kernel/l3candidates.dtx | 12 --
l3kernel/l3deprecation.dtx | 7 ++
l3kernel/l3str-convert.dtx | 2 +-
l3kernel/l3str.dtx | 2 +-
l3kernel/l3text-case.dtx | 16 +--
l3kernel/l3text-purify.dtx | 4 +-
l3kernel/l3token.dtx | 105 ----------------
l3kernel/l3unicode.dtx | 192 +++++++++++++++++++++++------
l3kernel/testfiles/m3char001.luatex.tlg | 16 +--
l3kernel/testfiles/m3char001.lvt | 8 --
l3kernel/testfiles/m3char001.ptex.tlg | 16 +--
l3kernel/testfiles/m3char001.tlg | 16 +--
l3kernel/testfiles/m3char001.uptex.tlg | 16 +--
l3kernel/testfiles/m3char001.xetex.tlg | 16 +--
l3kernel/testfiles/m3text006.lvt | 2 +-
l3kernel/testfiles/m3token006.luatex.tlg | 12 +-
l3kernel/testfiles/m3token006.lvt | 10 --
l3kernel/testfiles/m3token006.tlg | 12 +-
l3kernel/testfiles/m3token006.xetex.tlg | 12 +-
l3kernel/testfiles/m3unicode001.luatex.tlg | 10 ++
l3kernel/testfiles/m3unicode001.lvt | 10 ++
l3kernel/testfiles/m3unicode001.tlg | 10 ++
l3kernel/testfiles/m3unicode001.xetex.tlg | 10 ++
25 files changed, 244 insertions(+), 277 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 6e55f71e0..0d32ba176 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers.
## [Unreleased]
### Added
+- `\codepoint_to_bytes:n`
- `\codepoint_str_generate:n`
### Changed
@@ -19,6 +20,9 @@ this project uses date-based 'snapshot' version identifiers.
tokens (issue [\#1110](https://github.com/latex3/latex3/issues/1110)), and an
esoteric case (issue [\#1113](https://github.com/latex3/latex3/issues/1113))
+### Deprecated
+- `\char_to_utfviii_bytes:n`
+
## [2022-09-28]
### Added
diff --git a/l3kernel/doc/l3obsolete.txt b/l3kernel/doc/l3obsolete.txt
index 6f8844ce6..2377bb985 100644
--- a/l3kernel/doc/l3obsolete.txt
+++ b/l3kernel/doc/l3obsolete.txt
@@ -22,6 +22,7 @@ Function Date deprecated
\char_str_lower_case:N 2020-01-03
\char_str_mixed_case:N 2020-01-03
\char_str_upper_case:N 2020-01-03
+\char_to_utfviii_bytes:n 2022-10-09
\cs_argument_spec:N 2022-06-24
\l_keys_key_tl 2020-02-08
\l_keys_path_tl 2020-02-08
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 1bcbfc4ec..13a0e9b5c 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -626,18 +626,6 @@
% (\enquote{active}), and character code $32$ (space).
% \end{variable}
%
-% \begin{function}[added = 2020-01-09, EXP]{\char_to_utfviii_bytes:n}
-% \begin{syntax}
-% \cs{char_to_utfviii_bytes:n} \Arg{codepoint}
-% \end{syntax}
-% Converts the (Unicode) \meta{codepoint} to UTF-8 bytes. The expansion
-% of this function comprises four brace groups, each of which will contain
-% a hexadecimal value: the appropriate byte. As UTF-8 is a variable-length,
-% one or more of the groups may be empty: the bytes read in the logical order,
-% such that a two-byte codepoint will have groups |#1| and |#2| filled
-% and |#3| and |#4| empty.
-% \end{function}
-%
% \begin{function}[added = 2020-01-02, rEXP]{\char_to_nfd:N}
% \begin{syntax}
% \cs{char_to_nfd:N} \meta{char}
diff --git a/l3kernel/l3deprecation.dtx b/l3kernel/l3deprecation.dtx
index 02f353516..85c532113 100644
--- a/l3kernel/l3deprecation.dtx
+++ b/l3kernel/l3deprecation.dtx
@@ -549,6 +549,13 @@
%
% \subsection{Deprecated \pkg{l3token} functions}
%
+% \begin{macro}[EXP]{\char_to_utfviii_bytes:n}
+% \begin{macrocode}
+\__kernel_patch_deprecation:nnNNpn { 2022-10-09 } { \codepoint_to_bytes:n }
+\cs_gset:Npn \char_to_utfviii_bytes:n { \codepoint_to_bytes:n }
+% \end{macrocode}
+% \end{macro}
+%
% \begin{macro}[EXP]
% {
% \char_lower_case:N, \char_upper_case:N,
diff --git a/l3kernel/l3str-convert.dtx b/l3kernel/l3str-convert.dtx
index f2d379346..3b5df6dda 100644
--- a/l3kernel/l3str-convert.dtx
+++ b/l3kernel/l3str-convert.dtx
@@ -2635,7 +2635,7 @@
\cs_new:Npn \@@_convert_pdfname_bytes:n #1
{
\exp_args:Ne \@@_convert_pdfname_bytes_aux:n
- { \char_to_utfviii_bytes:n {`#1} }
+ { \codepoint_to_bytes:n {`#1} }
}
\cs_new:Npn \@@_convert_pdfname_bytes_aux:n #1
{ \@@_convert_pdfname_bytes_aux:nnnn #1 }
diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
index 82e907b88..c7237635c 100644
--- a/l3kernel/l3str.dtx
+++ b/l3kernel/l3str.dtx
@@ -2056,7 +2056,7 @@
\use:e
{
\exp_not:N \@@_change_case_generate:nnnn
- \char_to_utfviii_bytes:n {#1}
+ \codepoint_to_bytes:n {#1}
}
}
\cs_new:Npn \@@_change_case_generate:nnnn #1#2#3#4
diff --git a/l3kernel/l3text-case.dtx b/l3kernel/l3text-case.dtx
index cad0ed295..18c2188e6 100644
--- a/l3kernel/l3text-case.dtx
+++ b/l3kernel/l3text-case.dtx
@@ -1868,7 +1868,7 @@
}
}
\use:x
- { \@@_tmp:w \char_to_utfviii_bytes:n { "#2 } }
+ { \@@_tmp:w \codepoint_to_bytes:n { "#2 } }
\group_end:
}
\@@_tmp:w \c_@@_dotless_i_tl { 0131 }
@@ -1902,8 +1902,8 @@
\use:x
{
\@@_tmp:w
- \char_to_utfviii_bytes:n { "#1 }
- \char_to_utfviii_bytes:n { "#2 }
+ \codepoint_to_bytes:n { "#1 }
+ \codepoint_to_bytes:n { "#2 }
}
\@@_loop:nn
}
@@ -2183,8 +2183,8 @@
\use:x
{
\@@_tmp:w
- \char_to_utfviii_bytes:n { "#1 }
- \char_to_utfviii_bytes:n { "#2 }
+ \codepoint_to_bytes:n { "#1 }
+ \codepoint_to_bytes:n { "#2 }
}
\group_end:
}
@@ -2238,7 +2238,7 @@
{#2}
}
\use:x
- { \@@_tmp:w \char_to_utfviii_bytes:n { "#1 } }
+ { \@@_tmp:w \codepoint_to_bytes:n { "#1 } }
\group_end:
}
\@@_tmp:w { 00DF } { SS } { upper }
@@ -2463,8 +2463,8 @@
\use:x
{
\@@_tmp:w
- \char_to_utfviii_bytes:n { "#1 }
- \char_to_utfviii_bytes:n { "#2 }
+ \codepoint_to_bytes:n { "#1 }
+ \codepoint_to_bytes:n { "#2 }
}
\group_end:
}
diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
index 7caaeb0b5..dcec40bca 100644
--- a/l3kernel/l3text-purify.dtx
+++ b/l3kernel/l3text-purify.dtx
@@ -486,7 +486,7 @@
\text_declare_purify_equivalent:Nx #1
{
\exp_args:Ne \@@_tmp:n
- { \char_to_utfviii_bytes:n { "#2 } }
+ { \codepoint_to_bytes:n { "#2 } }
}
\@@_loop:Nn
}
@@ -574,7 +574,7 @@
\cs_set:Npn \@@_tmp:n #1
{
\exp_args:Ne \@@_tmp_aux:n
- { \char_to_utfviii_bytes:n { "#1 } }
+ { \codepoint_to_bytes:n { "#1 } }
}
\cs_set:Npn \@@_tmp_aux:n #1 { \@@_tmp:nnnn #1 }
\cs_set:Npn \@@_tmp:nnnn #1#2#3#4
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 307cb8d08..356d88d57 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1690,111 +1690,6 @@
% \end{macro}
% \end{macro}
%
-% \begin{macro}[EXP]{\char_to_utfviii_bytes:n}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxi:n}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxii:Nnn}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxiii:n}
-% \begin{macro}[EXP]
-% {
-% \@@_to_utfviii_bytes_outputi:nw ,
-% \@@_to_utfviii_bytes_outputii:nw ,
-% \@@_to_utfviii_bytes_outputiii:nw ,
-% \@@_to_utfviii_bytes_outputiv:nw
-% }
-% \begin{macro}[EXP]
-% {\@@_to_utfviii_bytes_output:nnn, \@@_to_utfviii_bytes_output:fnn}
-% \begin{macro}[EXP]{\@@_to_utfviii_bytes_end:}
-% This code converts a codepoint into the correct UTF-8 representation.
-% In terms of the algorithm itself, see
-% \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
-% \begin{macrocode}
-\cs_new:Npn \char_to_utfviii_bytes:n #1
- {
- \exp_args:Nf \@@_to_utfviii_bytes_auxi:n
- { \int_eval:n {#1} }
- }
-\cs_new:Npn \@@_to_utfviii_bytes_auxi:n #1
- {
- \if_int_compare:w #1 > "80 \exp_stop_f:
- \if_int_compare:w #1 < "800 \exp_stop_f:
- \@@_to_utfviii_bytes_outputi:nw
- { \@@_to_utfviii_bytes_auxii:Nnn C {#1} { 64 } }
- \@@_to_utfviii_bytes_outputii:nw
- { \@@_to_utfviii_bytes_auxiii:n {#1} }
- \else:
- \if_int_compare:w #1 < "10000 \exp_stop_f:
- \@@_to_utfviii_bytes_outputi:nw
- { \@@_to_utfviii_bytes_auxii:Nnn E {#1} { 64 * 64 } }
- \@@_to_utfviii_bytes_outputii:nw
- {
- \@@_to_utfviii_bytes_auxiii:n
- { \int_div_truncate:nn {#1} { 64 } }
- }
- \@@_to_utfviii_bytes_outputiii:nw
- { \@@_to_utfviii_bytes_auxiii:n {#1} }
- \else:
- \@@_to_utfviii_bytes_outputi:nw
- {
- \@@_to_utfviii_bytes_auxii:Nnn F
- {#1} { 64 * 64 * 64 }
- }
- \@@_to_utfviii_bytes_outputii:nw
- {
- \@@_to_utfviii_bytes_auxiii:n
- { \int_div_truncate:nn {#1} { 64 * 64 } }
- }
- \@@_to_utfviii_bytes_outputiii:nw
- {
- \@@_to_utfviii_bytes_auxiii:n
- { \int_div_truncate:nn {#1} { 64 } }
- }
- \@@_to_utfviii_bytes_outputiv:nw
- { \@@_to_utfviii_bytes_auxiii:n {#1} }
- \fi:
- \fi:
- \else:
- \@@_to_utfviii_bytes_outputi:nw {#1}
- \fi:
- \@@_to_utfviii_bytes_end: { } { } { } { }
- }
-\cs_new:Npn \@@_to_utfviii_bytes_auxii:Nnn #1#2#3
- { "#10 + \int_div_truncate:nn {#2} {#3} }
-\cs_new:Npn \@@_to_utfviii_bytes_auxiii:n #1
- { \int_mod:nn {#1} { 64 } + 128 }
-\cs_new:Npn \@@_to_utfviii_bytes_outputi:nw
- #1 #2 \@@_to_utfviii_bytes_end: #3
- { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { } {#2} }
-\cs_new:Npn \@@_to_utfviii_bytes_outputii:nw
- #1 #2 \@@_to_utfviii_bytes_end: #3#4
- { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} }
-\cs_new:Npn \@@_to_utfviii_bytes_outputiii:nw
- #1 #2 \@@_to_utfviii_bytes_end: #3#4#5
- {
- \@@_to_utfviii_bytes_output:fnn
- { \int_eval:n {#1} } { {#3} {#4} } {#2}
- }
-\cs_new:Npn \@@_to_utfviii_bytes_outputiv:nw
- #1 #2 \@@_to_utfviii_bytes_end: #3#4#5#6
- {
- \@@_to_utfviii_bytes_output:fnn
- { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2}
- }
-\cs_new:Npn \@@_to_utfviii_bytes_output:nnn #1#2#3
- {
- #3
- \@@_to_utfviii_bytes_end: #2 {#1}
- }
-\cs_generate_variant:Nn \@@_to_utfviii_bytes_output:nnn { f }
-\cs_new:Npn \@@_to_utfviii_bytes_end: { }
-% \end{macrocode}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-% \end{macro}
-%
% \begin{macro}[EXP]{\char_to_nfd:N}
% \begin{macro}[EXP]{\char_to_nfd:n}
% \begin{macro}[EXP]{\@@_to_nfd:nn}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index 0974cae51..a4cd8ae96 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -99,6 +99,18 @@
% category code $10$.
% \end{function}
%
+% \begin{function}[added = 2022-10-09, EXP]{\codepoint_to_bytes:n}
+% \begin{syntax}
+% \cs{codepoint_to_bytes:n} \Arg{codepoint}
+% \end{syntax}
+% Converts the \meta{codepoint} to UTF-8 bytes. The expansion
+% of this function comprises four brace groups, each of which will contain
+% a decimal value: the appropriate byte. As UTF-8 is a variable-length
+% encoding, one or more of the groups may be empty: the bytes are read in logical order,
+% such that a two-byte codepoint will have groups |#1| and |#2| filled
+% and |#3| and |#4| empty.
+% \end{function}
+%
% \end{documentation}
%
% \begin{implementation}
@@ -113,39 +125,7 @@
%<@@=codepoint>
% \end{macrocode}
%
-% Text operations requires data from the Unicode Consortium. Data read into
-% Unicode engine formats is at best a small part of what we need, so there
-% is a loader here to set up the appropriate data structures.
-%
-% Where we need data for most or all of the Unicode range, we use the two-stage
-% table approach recommended by the Unicode Consortium and demonstrated in a
-% model implementation in Python in
-% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
-% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
-% random access and avoids significant hash table usage. In contrast, where
-% only a small subset of codepoints are required, storage as macros is
-% preferable. There is also some consideration of the effort needed to load
-% data: see for example the grapheme breaking information, which would be
-% problematic to convert into a two-stage table but which can be used with
-% reasonable performance in a small number of comma lists (at the cost that
-% breaking at higher codepoint Hangul characters will be slightly slow).
-%
-% \begin{variable}{\c_@@_block_size_int}
-% Choosing the block size for the blocks in the two-stage approach is
-% non-trivial: depending on the data stored, the optimal size for
-% memory usage will vary. At the same time, for us there is also the
-% question of load-time: larger blocks require longer comma lists
-% as intermediates, so are slower. As this is going to be needed
-% to use the data, we set it up outside of the group for clarity.
-% \begin{macrocode}
-\int_const:Nn \c_@@_block_size_int { 64 }
-% \end{macrocode}
-% \end{variable}
-%
-% Parsing the data files can be the same way for all engines, but where they
-% are stored as character tokens, the construction method depends on whether
-% they are Unicode or $8$-bit internally. Parsing is therefore done by common
-% functions, with some data storage using engine-specific auxiliaries.
+% \subsection{User functions}
%
% \begin{macro}[EXP]{\codepoint_str_generate:n}
% \begin{macro}[EXP]{\@@_str_generate:nnnn}
@@ -189,7 +169,7 @@
\use:e
{
\exp_not:N \@@_str_generate:nnnn
- \char_to_utfviii_bytes:n {#1}
+ \codepoint_to_bytes:n {#1}
}
}
}
@@ -221,7 +201,7 @@
\use:e
{
\exp_not:N \@@_generate:nnnn
- \char_to_utfviii_bytes:n {#1}
+ \codepoint_to_bytes:n {#1}
}
}
}
@@ -256,6 +236,147 @@
% \end{macro}
% \end{macro}
%
+% \begin{macro}[EXP]{\codepoint_to_bytes:n}
+% \begin{macro}[EXP]{\@@_to_bytes_auxi:n}
+% \begin{macro}[EXP]{\@@_to_bytes_auxii:Nnn}
+% \begin{macro}[EXP]{\@@_to_bytes_auxiii:n}
+% \begin{macro}[EXP]
+% {
+% \@@_to_bytes_outputi:nw ,
+% \@@_to_bytes_outputii:nw ,
+% \@@_to_bytes_outputiii:nw ,
+% \@@_to_bytes_outputiv:nw
+% }
+% \begin{macro}[EXP]
+% {\@@_to_bytes_output:nnn, \@@_to_bytes_output:fnn}
+% \begin{macro}[EXP]{\@@_to_bytes_end:}
+% This code converts a codepoint into the correct UTF-8 representation.
+% In terms of the algorithm itself, see
+% \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
+% \begin{macrocode}
+\cs_new:Npn \codepoint_to_bytes:n #1
+ {
+ \exp_args:Nf \@@_to_bytes_auxi:n
+ { \int_eval:n {#1} }
+ }
+\cs_new:Npn \@@_to_bytes_auxi:n #1
+ {
+ \if_int_compare:w #1 > "80 \exp_stop_f:
+ \if_int_compare:w #1 < "800 \exp_stop_f:
+ \@@_to_bytes_outputi:nw
+ { \@@_to_bytes_auxii:Nnn C {#1} { 64 } }
+ \@@_to_bytes_outputii:nw
+ { \@@_to_bytes_auxiii:n {#1} }
+ \else:
+ \if_int_compare:w #1 < "10000 \exp_stop_f:
+ \@@_to_bytes_outputi:nw
+ { \@@_to_bytes_auxii:Nnn E {#1} { 64 * 64 } }
+ \@@_to_bytes_outputii:nw
+ {
+ \@@_to_bytes_auxiii:n
+ { \int_div_truncate:nn {#1} { 64 } }
+ }
+ \@@_to_bytes_outputiii:nw
+ { \@@_to_bytes_auxiii:n {#1} }
+ \else:
+ \@@_to_bytes_outputi:nw
+ {
+ \@@_to_bytes_auxii:Nnn F
+ {#1} { 64 * 64 * 64 }
+ }
+ \@@_to_bytes_outputii:nw
+ {
+ \@@_to_bytes_auxiii:n
+ { \int_div_truncate:nn {#1} { 64 * 64 } }
+ }
+ \@@_to_bytes_outputiii:nw
+ {
+ \@@_to_bytes_auxiii:n
+ { \int_div_truncate:nn {#1} { 64 } }
+ }
+ \@@_to_bytes_outputiv:nw
+ { \@@_to_bytes_auxiii:n {#1} }
+ \fi:
+ \fi:
+ \else:
+ \@@_to_bytes_outputi:nw {#1}
+ \fi:
+ \@@_to_bytes_end: { } { } { } { }
+ }
+\cs_new:Npn \@@_to_bytes_auxii:Nnn #1#2#3
+ { "#10 + \int_div_truncate:nn {#2} {#3} }
+\cs_new:Npn \@@_to_bytes_auxiii:n #1
+ { \int_mod:nn {#1} { 64 } + 128 }
+\cs_new:Npn \@@_to_bytes_outputi:nw
+ #1 #2 \@@_to_bytes_end: #3
+ { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { } {#2} }
+\cs_new:Npn \@@_to_bytes_outputii:nw
+ #1 #2 \@@_to_bytes_end: #3#4
+ { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} }
+\cs_new:Npn \@@_to_bytes_outputiii:nw
+ #1 #2 \@@_to_bytes_end: #3#4#5
+ {
+ \@@_to_bytes_output:fnn
+ { \int_eval:n {#1} } { {#3} {#4} } {#2}
+ }
+\cs_new:Npn \@@_to_bytes_outputiv:nw
+ #1 #2 \@@_to_bytes_end: #3#4#5#6
+ {
+ \@@_to_bytes_output:fnn
+ { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2}
+ }
+\cs_new:Npn \@@_to_bytes_output:nnn #1#2#3
+ {
+ #3
+ \@@_to_bytes_end: #2 {#1}
+ }
+\cs_generate_variant:Nn \@@_to_bytes_output:nnn { f }
+\cs_new:Npn \@@_to_bytes_end: { }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \subsection{Data loader}
+%
+% Text operations require data from the Unicode Consortium. Data read into
+% Unicode engine formats is at best a small part of what we need, so there
+% is a loader here to set up the appropriate data structures.
+%
+% Where we need data for most or all of the Unicode range, we use the two-stage
+% table approach recommended by the Unicode Consortium and demonstrated in a
+% model implementation in Python in
+% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
+% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
+% random access and avoids significant hash table usage. In contrast, where
+% only a small subset of codepoints are required, storage as macros is
+% preferable. There is also some consideration of the effort needed to load
+% data: see for example the grapheme breaking information, which would be
+% problematic to convert into a two-stage table but which can be used with
+% reasonable performance in a small number of comma lists (at the cost that
+% breaking at higher codepoint Hangul characters will be slightly slow).
+%
+% \begin{variable}{\c_@@_block_size_int}
+% Choosing the block size for the blocks in the two-stage approach is
+% non-trivial: depending on the data stored, the optimal size for
+% memory usage will vary. At the same time, for us there is also the
+% question of load-time: larger blocks require longer comma lists
+% as intermediates, so are slower. As this is going to be needed
+% to use the data, we set it up outside of the group for clarity.
+% \begin{macrocode}
+\int_const:Nn \c_@@_block_size_int { 64 }
+% \end{macrocode}
+% \end{variable}
+%
+% Parsing the data files can be done in the same way for all engines, but where they
+% are stored as character tokens, the construction method depends on whether
+% they are Unicode or $8$-bit internally. Parsing is therefore done by common
+% functions, with some data storage using engine-specific auxiliaries.
+%
% As only the data needs to remain at the end of this process, everything
% is set up inside a group. The only thing that is outside is creating a
% stream: they are global anyway and it is best to force a stream for
@@ -721,7 +842,6 @@
% \end{macro}
% \end{macro}
%
-%
% \begin{macro}[EXP]{\__kernel_codepoint_nfd:n}
% \begin{macro}[EXP]{\@@_nfd:nn}
% A simple interface.
diff --git a/l3kernel/testfiles/m3char001.luatex.tlg b/l3kernel/testfiles/m3char001.luatex.tlg
index 0b33b0a9d..fe8482250 100644
--- a/l3kernel/testfiles/m3char001.luatex.tlg
+++ b/l3kernel/testfiles/m3char001.luatex.tlg
@@ -489,15 +489,7 @@ cell 2
C
============================================================
============================================================
-TEST 7: \char_to_utfviii_bytes:n
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
============================================================
begin-group character A
end-group character A
@@ -511,7 +503,7 @@ the character A
undefined
============================================================
============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the letter a).
@@ -595,7 +587,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the character a).
@@ -679,7 +671,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 11: Changing \lccode and \uccode
+TEST 10: Changing \lccode and \uccode
============================================================
The token list contains the tokens:
> q (the character q).
diff --git a/l3kernel/testfiles/m3char001.lvt b/l3kernel/testfiles/m3char001.lvt
index 3a217d4e4..775ba9c86 100644
--- a/l3kernel/testfiles/m3char001.lvt
+++ b/l3kernel/testfiles/m3char001.lvt
@@ -143,14 +143,6 @@
}
}
-\TESTEXP { \char_to_utfviii_bytes:n }
- {
- \char_to_utfviii_bytes:n { `A } \NEWLINE
- \char_to_utfviii_bytes:n { "03A9 } \NEWLINE
- \char_to_utfviii_bytes:n { "1200 } \NEWLINE
- \char_to_utfviii_bytes:n { "10000 }
- }
-
\OMIT
\cs_gset:Npn \test:nn #1#2
{
diff --git a/l3kernel/testfiles/m3char001.ptex.tlg b/l3kernel/testfiles/m3char001.ptex.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.ptex.tlg
+++ b/l3kernel/testfiles/m3char001.ptex.tlg
@@ -518,15 +518,7 @@ cell 2
C
============================================================
============================================================
-TEST 7: \char_to_utfviii_bytes:n
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
============================================================
begin-group character A
end-group character A
@@ -540,7 +532,7 @@ the character A
undefined
============================================================
============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 11: Changing \lccode and \uccode
+TEST 10: Changing \lccode and \uccode
============================================================
The token list contains the tokens:
> q (the character q).
diff --git a/l3kernel/testfiles/m3char001.tlg b/l3kernel/testfiles/m3char001.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.tlg
+++ b/l3kernel/testfiles/m3char001.tlg
@@ -518,15 +518,7 @@ cell 2
C
============================================================
============================================================
-TEST 7: \char_to_utfviii_bytes:n
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
============================================================
begin-group character A
end-group character A
@@ -540,7 +532,7 @@ the character A
undefined
============================================================
============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 11: Changing \lccode and \uccode
+TEST 10: Changing \lccode and \uccode
============================================================
The token list contains the tokens:
> q (the character q).
diff --git a/l3kernel/testfiles/m3char001.uptex.tlg b/l3kernel/testfiles/m3char001.uptex.tlg
index 446889d96..09526d1b5 100644
--- a/l3kernel/testfiles/m3char001.uptex.tlg
+++ b/l3kernel/testfiles/m3char001.uptex.tlg
@@ -518,15 +518,7 @@ cell 2
C
============================================================
============================================================
-TEST 7: \char_to_utfviii_bytes:n
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
============================================================
begin-group character A
end-group character A
@@ -540,7 +532,7 @@ the character A
undefined
============================================================
============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the letter a).
@@ -1042,7 +1034,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the character a).
@@ -1136,7 +1128,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 11: Changing \lccode and \uccode
+TEST 10: Changing \lccode and \uccode
============================================================
The token list contains the tokens:
> q (the character q).
diff --git a/l3kernel/testfiles/m3char001.xetex.tlg b/l3kernel/testfiles/m3char001.xetex.tlg
index 8854c31f0..5e42184bd 100644
--- a/l3kernel/testfiles/m3char001.xetex.tlg
+++ b/l3kernel/testfiles/m3char001.xetex.tlg
@@ -489,15 +489,7 @@ cell 2
C
============================================================
============================================================
-TEST 7: \char_to_utfviii_bytes:n
-============================================================
-{65}{}{}{}
-{206}{169}{}{}
-{225}{136}{128}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 8: Number of expansions
+TEST 7: Number of expansions
============================================================
begin-group character A
end-group character A
@@ -511,7 +503,7 @@ the character A
undefined
============================================================
============================================================
-TEST 9: \char_ <thing>case:N
+TEST 8: \char_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the letter a).
@@ -595,7 +587,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 10: \char_str_ <thing>case:N
+TEST 9: \char_str_ <thing>case:N
============================================================
The token list contains the tokens:
> a (the character a).
@@ -679,7 +671,7 @@ The token list contains the tokens:
l. ... }
============================================================
============================================================
-TEST 11: Changing \lccode and \uccode
+TEST 10: Changing \lccode and \uccode
============================================================
The token list contains the tokens:
> q (the character q).
diff --git a/l3kernel/testfiles/m3text006.lvt b/l3kernel/testfiles/m3text006.lvt
index 2357e6bb5..5c2d8b86a 100644
--- a/l3kernel/testfiles/m3text006.lvt
+++ b/l3kernel/testfiles/m3text006.lvt
@@ -127,7 +127,7 @@
{
\exp_args:Ne \test_generate_aux:n
{
- \exp_args:Ne \char_to_utfviii_bytes:n
+ \exp_args:Ne \codepoint_to_bytes:n
{ " \tl_trim_spaces:n {#1} }
}
}
diff --git a/l3kernel/testfiles/m3token006.luatex.tlg b/l3kernel/testfiles/m3token006.luatex.tlg
index a1c542abe..f5a12fbb0 100644
--- a/l3kernel/testfiles/m3token006.luatex.tlg
+++ b/l3kernel/testfiles/m3token006.luatex.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
============================================================
A
Î
diff --git a/l3kernel/testfiles/m3token006.lvt b/l3kernel/testfiles/m3token006.lvt
index 9aeea764c..cdb8df767 100644
--- a/l3kernel/testfiles/m3token006.lvt
+++ b/l3kernel/testfiles/m3token006.lvt
@@ -15,16 +15,6 @@
\ExplSyntaxOn
-\TESTEXP { Byte_decomposition }
- {
- \char_to_utfviii_bytes:n { `A } \NEWLINE
- \char_to_utfviii_bytes:n { "00CE } \NEWLINE
- \char_to_utfviii_bytes:n { "0389 } \NEWLINE
- \char_to_utfviii_bytes:n { "1DAD } \NEWLINE
- \char_to_utfviii_bytes:n { "FFFD } \NEWLINE
- \char_to_utfviii_bytes:n { "10000 }
- }
-
\TESTEXP { Character~decomposition }
{
\char_to_nfd:n { `A } \NEWLINE
diff --git a/l3kernel/testfiles/m3token006.tlg b/l3kernel/testfiles/m3token006.tlg
index 45256798d..7ab6b11e2 100644
--- a/l3kernel/testfiles/m3token006.tlg
+++ b/l3kernel/testfiles/m3token006.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
============================================================
A
I^^cc^^82
diff --git a/l3kernel/testfiles/m3token006.xetex.tlg b/l3kernel/testfiles/m3token006.xetex.tlg
index a1c542abe..f5a12fbb0 100644
--- a/l3kernel/testfiles/m3token006.xetex.tlg
+++ b/l3kernel/testfiles/m3token006.xetex.tlg
@@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: Byte_decomposition
-============================================================
-{65}{}{}{}
-{195}{142}{}{}
-{206}{137}{}{}
-{225}{182}{173}{}
-{239}{191}{189}{}
-{240}{144}{128}{128}
-============================================================
-============================================================
-TEST 2: Character decomposition
+TEST 1: Character decomposition
============================================================
A
Î
diff --git a/l3kernel/testfiles/m3unicode001.luatex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg
index a812f4345..0e243b747 100644
--- a/l3kernel/testfiles/m3unicode001.luatex.tlg
+++ b/l3kernel/testfiles/m3unicode001.luatex.tlg
@@ -21,3 +21,13 @@ X X
X X
XaX
============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================
diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt
index ed724c47e..73e6a41df 100644
--- a/l3kernel/testfiles/m3unicode001.lvt
+++ b/l3kernel/testfiles/m3unicode001.lvt
@@ -42,4 +42,14 @@
X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE
}
+\TESTEXP { Byte~decomposition }
+ {
+ \codepoint_to_bytes:n { `A } \NEWLINE
+ \codepoint_to_bytes:n { "00CE } \NEWLINE
+ \codepoint_to_bytes:n { "0389 } \NEWLINE
+ \codepoint_to_bytes:n { "1DAD } \NEWLINE
+ \codepoint_to_bytes:n { "FFFD } \NEWLINE
+ \codepoint_to_bytes:n { "10000 }
+ }
+
\END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3unicode001.tlg b/l3kernel/testfiles/m3unicode001.tlg
index 2924a5588..21244eb8b 100644
--- a/l3kernel/testfiles/m3unicode001.tlg
+++ b/l3kernel/testfiles/m3unicode001.tlg
@@ -21,3 +21,13 @@ X X
X X
XaX
============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================
diff --git a/l3kernel/testfiles/m3unicode001.xetex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg
index a812f4345..0e243b747 100644
--- a/l3kernel/testfiles/m3unicode001.xetex.tlg
+++ b/l3kernel/testfiles/m3unicode001.xetex.tlg
@@ -21,3 +21,13 @@ X X
X X
XaX
============================================================
+============================================================
+TEST 3: Byte decomposition
+============================================================
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
+============================================================
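For reference, a minimal illustrative sketch of calling the renamed function (not taken from the commit itself; the expected byte groups are those shown in the test files above, and \tl_set:Nx, \tl_show:N and the scratch token lists are standard expl3):

  \ExplSyntaxOn
  % \codepoint_to_bytes:n is fully expandable, so an x-type assignment
  % captures its result: four brace groups of decimal byte values,
  % with any unused trailing groups left empty.
  \tl_set:Nx \l_tmpa_tl { \codepoint_to_bytes:n { "03A9 } }  % U+03A9, two bytes
  \tl_show:N \l_tmpa_tl   % > {206}{169}{}{}
  \tl_set:Nx \l_tmpb_tl { \codepoint_to_bytes:n { "10000 } } % U+10000, four bytes
  \tl_show:N \l_tmpb_tl   % > {240}{144}{128}{128}
  \ExplSyntaxOff

For the two-byte case the arithmetic in \@@_to_bytes_auxii:Nnn and
\@@_to_bytes_auxiii:n amounts to "C0 + floor(c/64) and "80 + (c mod 64):
for "03A9 that is 192 + 14 = 206 and 128 + 41 = 169, matching the test
output above.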