[latex3-commits] [git/LaTeX3-latex3-latex3] unicode-data: New \codepoint_(str_)generate:n(n) functions (d4f3d9b6b)
Joseph Wright
joseph.wright at morningstar2.co.uk
Sun Oct 9 18:28:02 CEST 2022
Repository : https://github.com/latex3/latex3
On branch : unicode-data
Link : https://github.com/latex3/latex3/commit/d4f3d9b6baeead4df2bf4db12a18c8918f8b242f
>---------------------------------------------------------------
commit d4f3d9b6baeead4df2bf4db12a18c8918f8b242f
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Sun Oct 9 13:45:15 2022 +0100
New \codepoint_(str_)generate:n(n) functions
>---------------------------------------------------------------
d4f3d9b6baeead4df2bf4db12a18c8918f8b242f
l3kernel/CHANGELOG.md | 3 +
l3kernel/l3token.dtx | 40 +-----
l3kernel/l3unicode.dtx | 140 ++++++++++++++++++---
...convert005.ptex.tlg => m3unicode001.luatex.tlg} | 20 ++-
l3kernel/testfiles/m3unicode001.lvt | 45 +++++++
...{m3str-convert005.ptex.tlg => m3unicode001.tlg} | 20 ++-
...-convert005.ptex.tlg => m3unicode001.xetex.tlg} | 20 ++-
7 files changed, 214 insertions(+), 74 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index a5b1842d9..6e55f71e0 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -7,6 +7,9 @@ this project uses date-based 'snapshot' version identifiers.
## [Unreleased]
+### Added
+- `\codepoint_generate:nn` and `\codepoint_str_generate:n`
+
### Changed
- Usage of `\exp_not:n`/`\exp_not:N` in `\peek_analysis_map_inline:n` output
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index b2b5e3468..307cb8d08 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1828,51 +1828,13 @@
\cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4
{
\int_compare:nNnTF {#1} = {#3}
- { \@@_to_nfd_generate:nn {#1} {#4} }
+ { \codepoint_generate:nn {#1} {#4} }
{
\@@_to_nfd:nn {#1} {#4}
\tl_if_blank:nF {#2}
{ \@@_to_nfd:nn {#2} {#4} }
}
}
-\bool_lazy_or:nnTF
- { \sys_if_engine_luatex_p: }
- { \sys_if_engine_xetex_p: }
- {
- \cs_new:Npn \@@_to_nfd_generate:nn
- { \char_generate:nn }
- }
- {
- \cs_new:Npn \@@_to_nfd_generate:nn #1#2
- {
- \exp_args:Ne \@@_to_nfd_generate:n
- { \char_to_utfviii_bytes:n {#1} }
- }
- \cs_new:Npn \@@_to_nfd_generate:n #1
- { \@@_to_nfd_generate:nnnn #1 }
- \cs_new:Npn \@@_to_nfd_generate:nnnn #1#2#3#4
- {
- \int_compare:nNnTF {#1} < { "80 }
- { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
- {
- \exp_after:wN \exp_after:wN \exp_after:wN
- \exp_not:N \char_generate:nn {#1} { 13 }
- \exp_after:wN \exp_after:wN \exp_after:wN
- \exp_not:N \char_generate:nn {#2} { 13 }
- \tl_if_blank:nF {#3}
- {
- \exp_after:wN \exp_after:wN \exp_after:wN
- \exp_not:N \char_generate:nn {#3} { 13 }
- \tl_if_blank:nF {#4}
- {
- \exp_after:wN \exp_after:wN \exp_after:wN
- \exp_not:N \char_generate:nn {#4} { 13 }
- }
- }
- }
-
- }
- }
% \end{macrocode}
% \end{macro}
% \end{macro}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index c9d3f51f2..0974cae51 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -50,8 +50,54 @@
% \begin{documentation}
%
% This module provides Unicode-specific functions along with loading data
-% from a range of Unicode Consortium files. At present, it provides no
-% public functions.
+% from a range of Unicode Consortium files. Most of the code here is
+% internal, but there is a small set of public functions. These work with
+% Unicode \meta{codepoints} and are designed to give usable results with
+% both Unicode-aware and $8$-bit engines.
+%
+% \begin{function}[EXP, added = 2022-10-09]
+% {\codepoint_generate:nn}
+% \begin{syntax}
+% \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode}
+% \end{syntax}
+% Generates one or more character tokens representing the \meta{codepoint}.
+% With Unicode engines, exactly one character token will be generated, and
+% this will have the \meta{catcode} specified as the second argument:
+% \begin{itemize}
+% \item $1$ (begin group)
+% \item $2$ (end group)
+% \item $3$ (math toggle)
+% \item $4$ (alignment)
+% \item $6$ (parameter)
+% \item $7$ (math superscript)
+% \item $8$ (math subscript)
+% \item $10$ (space)
+% \item $11$ (letter)
+% \item $12$ (other)
+% \item $13$ (active)
+% \end{itemize}
+% For $8$-bit engines, between one and four character tokens will be
+% produced: these will be the bytes of the UTF-8 representation of the
+% \meta{codepoint}. For all codepoints outside of the classical ASCII
+% range, the generated character tokens will be active (category code
+% $13$); the \meta{catcode} argument is only used for codepoints in the
+% ASCII range. To allow the result of this function to be used inside an
+% expansion context, the result is protected by \cs{exp_not:n}.
+% \end{function}
+%
+% \begin{function}[EXP, added = 2022-10-09]
+% {\codepoint_str_generate:n}
+% \begin{syntax}
+% \cs{codepoint_str_generate:n} \Arg{codepoint}
+% \end{syntax}
+% Generates one or more character tokens representing the \meta{codepoint}.
+% With Unicode engines, exactly one character token will be generated.
+% For $8$-bit engines, between one and four character tokens will be
+% produced: these will be the bytes of the UTF-8 representation of the
+% \meta{codepoint}. All of the generated character tokens will be of
+% category code $12$, except any spaces (codepoint $32$), which will be
+% category code $10$.
+% \end{function}
%
% \end{documentation}
%
@@ -101,8 +147,11 @@
% they are Unicode or $8$-bit internally. Parsing is therefore done by common
% functions, with some data storage using engine-specific auxiliaries.
%
-% \begin{macro}[EXP]{\@@_generate_str:n}
-% \begin{macro}[EXP]{\@@_generate_str:nnnn}
+% \begin{macro}[EXP]{\codepoint_str_generate:n}
+% \begin{macro}[EXP]{\@@_str_generate:nnnn}
+% \begin{macro}[EXP]{\codepoint_generate:nn}
+% \begin{macro}[EXP]{\@@_generate:nnnn}
+% \begin{macro}[EXP]{\@@_generate:n}
% Conversion of a codepoint to a character (Unicode engines) or to one
% or more bytes ($8$-bit engines) is required. For loading the data,
% all that is needed is the form which creates strings: these are outside
@@ -115,23 +164,36 @@
{ \sys_if_engine_luatex_p: }
{ \sys_if_engine_xetex_p: }
{
- \cs_new:Npn \@@_generate_str:n #1
+ \cs_new:Npn \codepoint_str_generate:n #1
{
\int_compare:nNnTF {#1} = { `\ }
{ ~ }
{ \char_generate:nn {#1} { 12 } }
}
+ \cs_new:Npn \codepoint_generate:nn #1#2
+ {
+ \int_compare:nNnTF {#1} = { `\ }
+ { ~ }
+ {
+ \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+ { \char_generate:nn {#1} {#2} }
+ }
+ }
}
{
- \cs_new:Npn \@@_generate_str:n #1
+ \cs_new:Npn \codepoint_str_generate:n #1
{
- \use:e
+ \int_compare:nNnTF {#1} = { `\ }
+ { ~ }
{
- \exp_not:N \@@_generate_str:nnnn
- \char_to_utfviii_bytes:n {#1}
+ \use:e
+ {
+ \exp_not:N \@@_str_generate:nnnn
+ \char_to_utfviii_bytes:n {#1}
+ }
}
}
- \cs_new:Npn \@@_generate_str:nnnn #1#2#3#4
+ \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4
{
\char_generate:nn {#1} { 12 }
\tl_if_blank:nF {#2}
@@ -145,10 +207,54 @@
}
}
}
+ \cs_new:Npn \codepoint_generate:nn #1#2
+ {
+ \int_compare:nNnTF {#1} = { `\ }
+ { ~ }
+ {
+ \int_compare:nNnTF {#1} < { "80 }
+ {
+ \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+ { \char_generate:nn {#1} {#2} }
+ }
+ {
+ \use:e
+ {
+ \exp_not:N \@@_generate:nnnn
+ \char_to_utfviii_bytes:n {#1}
+ }
+ }
+ }
+ }
+ \cs_new:Npn \@@_generate:nnnn #1#2#3#4
+ {
+ \__kernel_exp_not:w \exp_after:wN
+ {
+ \tex_expanded:D
+ {
+ \@@_generate:n {#1}
+ \@@_generate:n {#2}
+ \tl_if_blank:nF {#3}
+ {
+ \@@_generate:n {#3}
+ \tl_if_blank:nF {#4}
+ { \@@_generate:n {#4} }
+ }
+ }
+ }
+ }
+ \cs_new:Npn \@@_generate:n #1
+ {
+ \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+ { \char_generate:nn {#1} { 13 } }
+ }
}
% \end{macrocode}
% \end{macro}
% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
%
% As only the data needs to remain at the end of this process, everything
% is set up inside a group. The only thing that is outside is creating a
@@ -218,7 +324,7 @@
\cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
{
\tl_const:cx
- { c_@@_nfd_ \@@_generate_str:n {"#1} _tl }
+ { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl }
{
{"#2}
{ \tl_if_blank:nF {#3} {"#3} }
@@ -270,7 +376,7 @@
\int_compare:nNnF {#3} = { \@@_data_offset:nn {#1} {#5} }
{
\tl_const:cx
- { c_@@_titlecase_ \@@_generate_str:n {"#1} _tl }
+ { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl }
{ {"#5} { } { } }
}
\tl_set:Nx \l_@@_next_codepoint_fint_tl
@@ -504,7 +610,7 @@
\int_eval:n { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 }
= "#3 ~
\tl_const:cx
- { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+ { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
{ {"#3} { } { } }
\fi:
\else:
@@ -519,7 +625,7 @@
% \begin{macrocode}
\cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
{
- \tl_const:cx { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+ \tl_const:cx { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
{
{"#2}
{"#3}
@@ -552,7 +658,7 @@
{
\tl_if_empty:nF {#4}
{
- \tl_const:cx { c_@@_ #2 case_ \@@_generate_str:n {"#1} _tl }
+ \tl_const:cx { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl }
{
{"#3}
{"#4}
@@ -588,7 +694,7 @@
\cs_new:Npn \__kernel_codepoint_case:nn #1#2
{
\exp_args:Ne \@@_case:nnn
- { \@@_generate_str:n {#2} } {#1} {#2}
+ { \codepoint_str_generate:n {#2} } {#1} {#2}
}
\cs_new:Npn \@@_case:nnn #1#2#3
{
@@ -621,7 +727,7 @@
% A simple interface.
% \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_nfd:n #1
- { \exp_args:Ne \@@_nfd:nn { \@@_generate_str:n {#1} } {#1} }
+ { \exp_args:Ne \@@_nfd:nn { \codepoint_str_generate:n {#1} } {#1} }
\cs_new:Npn \@@_nfd:nn #1#2
{
\tl_if_exist:cTF { c_@@_nfd_ #1 _tl }
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg
similarity index 81%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.luatex.tlg
index e7b068398..a812f4345 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.luatex.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^ad^^ad
+įį
+ͰͰ
+ԠԠ
+પપ
+ᄤᄤ
+𐀉𐀉
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
============================================================
diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt
new file mode 100644
index 000000000..ed724c47e
--- /dev/null
+++ b/l3kernel/testfiles/m3unicode001.lvt
@@ -0,0 +1,45 @@
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+\documentclass{minimal}
+\input{regression-test}
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\begin{document}
+
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+\OMIT
+\cs_set:Npn \test:nn #1#2
+ {
+ \codepoint_generate:nn {#1} {#2}
+ \codepoint_str_generate:n {#1}
+ }
+\TIMO
+
+\TESTEXP { Codepoint~to~chars }
+ {
+ \test:nn { "0041 } { 11 } \NEWLINE
+ \test:nn { "0041 } { 12 } \NEWLINE
+ \test:nn { "00AD } { 12 } \NEWLINE
+ \test:nn { "012F } { 11 } \NEWLINE
+ \test:nn { "0370 } { 11 } \NEWLINE
+ \test:nn { "0520 } { 11 } \NEWLINE
+ \test:nn { "0AAA } { 11 } \NEWLINE
+ \test:nn { "1124 } { 11 } \NEWLINE
+ \test:nn { "10009 } { 11 } \NEWLINE
+ }
+
+\TESTEXP { Spaces }
+ {
+ X \codepoint_generate:nn { 32 } { 11 } X \NEWLINE
+ X \codepoint_generate:nn { 32 } { 12 } X \NEWLINE
+ X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE
+ }
+
+\END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.tlg
similarity index 70%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.tlg
index e7b068398..2924a5588 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^c2^^ad^^c2^^ad
+^^c4^^af^^c4^^af
+^^cd^^b0^^cd^^b0
+^^d4^^a0^^d4^^a0
+^^e0^^aa^^aa^^e0^^aa^^aa
+^^e1^^84^^a4^^e1^^84^^a4
+^^f0^^90^^80^^89^^f0^^90^^80^^89
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
============================================================
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg
similarity index 81%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.xetex.tlg
index e7b068398..a812f4345 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.xetex.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^ad^^ad
+įį
+ͰͰ
+ԠԠ
+પપ
+ᄤᄤ
+𐀉𐀉
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
============================================================
More information about the latex3-commits
mailing list.