[latex3-commits] [git/LaTeX3-latex3-latex3] unicode-data: New \codepoint_(str_)generate:n(n) functions (d4f3d9b6b)

Sun Oct 9 18:28:02 CEST 2022

Repository : https://github.com/latex3/latex3
On branch  : unicode-data
Link       : https://github.com/latex3/latex3/commit/d4f3d9b6baeead4df2bf4db12a18c8918f8b242f

>---------------------------------------------------------------

commit d4f3d9b6baeead4df2bf4db12a18c8918f8b242f
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Sun Oct 9 13:45:15 2022 +0100

    New \codepoint_(str_)generate:n(n) functions


>---------------------------------------------------------------

d4f3d9b6baeead4df2bf4db12a18c8918f8b242f
 l3kernel/CHANGELOG.md                              |   3 +
 l3kernel/l3token.dtx                               |  40 +-----
 l3kernel/l3unicode.dtx                             | 140 ++++++++++++++++++---
 ...convert005.ptex.tlg => m3unicode001.luatex.tlg} |  20 ++-
 l3kernel/testfiles/m3unicode001.lvt                |  45 +++++++
 ...{m3str-convert005.ptex.tlg => m3unicode001.tlg} |  20 ++-
 ...-convert005.ptex.tlg => m3unicode001.xetex.tlg} |  20 ++-
 7 files changed, 214 insertions(+), 74 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index a5b1842d9..6e55f71e0 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -7,6 +7,9 @@ this project uses date-based 'snapshot' version identifiers.
 
 ## [Unreleased]
 
+### Added
+- `\codepoint_generate:nn`, `\codepoint_str_generate:n`
+
 ### Changed
 - Usage of `\exp_not:n`/`\exp_not:N` in `\peek_analysis_map_inline:n` output
 
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index b2b5e3468..307cb8d08 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1828,51 +1828,13 @@
 \cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4
   {
     \int_compare:nNnTF {#1} = {#3}
-      { \@@_to_nfd_generate:nn {#1} {#4} }
+      { \codepoint_generate:nn {#1} {#4} }
       {
         \@@_to_nfd:nn {#1} {#4}
         \tl_if_blank:nF {#2}
           { \@@_to_nfd:nn {#2} {#4} }
       }
   }
-\bool_lazy_or:nnTF
-  { \sys_if_engine_luatex_p: }
-  { \sys_if_engine_xetex_p: }
-  {
-    \cs_new:Npn \@@_to_nfd_generate:nn
-      { \char_generate:nn }
-  }
-  {
-    \cs_new:Npn \@@_to_nfd_generate:nn #1#2
-      {
-        \exp_args:Ne \@@_to_nfd_generate:n
-          { \char_to_utfviii_bytes:n {#1} }
-      }
-    \cs_new:Npn \@@_to_nfd_generate:n #1
-      { \@@_to_nfd_generate:nnnn #1 }
-     \cs_new:Npn \@@_to_nfd_generate:nnnn #1#2#3#4
-        {
-          \int_compare:nNnTF {#1} < { "80 }
-            { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
-            {
-              \exp_after:wN \exp_after:wN \exp_after:wN
-                \exp_not:N \char_generate:nn {#1} { 13 }
-              \exp_after:wN \exp_after:wN \exp_after:wN
-                \exp_not:N \char_generate:nn {#2} { 13 }
-              \tl_if_blank:nF {#3}
-                {
-                  \exp_after:wN \exp_after:wN \exp_after:wN
-                    \exp_not:N \char_generate:nn {#3} { 13 }
-                  \tl_if_blank:nF {#4}
-                    {
-                      \exp_after:wN \exp_after:wN \exp_after:wN
-                        \exp_not:N \char_generate:nn {#4} { 13 }
-                    }
-                }
-            }
-           
-        }
-  }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index c9d3f51f2..0974cae51 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -50,8 +50,54 @@
 % \begin{documentation}
 %
 % This module provides Unicode-specific functions along with loading data
-% from a range of Unicode Consortium files. At present, it provides no
-% public functions.
+% from a range of Unicode Consortium files. Most of the code here is
+% internal, but there is a small set of public functions. These work with
+% Unicode \meta{codepoints} and are designed to give usable results with
+% both Unicode-aware and $8$-bit engines.
+%
+% \begin{function}[EXP, added = 2022-10-09]
+%   {\codepoint_generate:nn}
+%   \begin{syntax}
+%      \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode}
+%   \end{syntax}
+%   Generates one or more character tokens representing the \meta{codepoint}.
+%   With Unicode engines, exactly one character token will be generated, and
+%   this will have the \meta{catcode} specified as the second argument:
+%   \begin{itemize}
+%     \item $1$ (begin group)
+%     \item $2$ (end group)
+%     \item $3$ (math toggle)
+%     \item $4$ (alignment)
+%     \item $6$ (parameter)
+%     \item $7$ (math superscript)
+%     \item $8$ (math subscript)
+%     \item $10$ (space)
+%     \item $11$ (letter)
+%     \item $12$ (other)
+%     \item $13$ (active)
+%   \end{itemize}
+%   For $8$-bit engines, between one and four character tokens will be
+%   produced: these will be the bytes of the UTF-8 representation of the
+%   \meta{codepoint}. For all codepoints outside of the classical ASCII
+%   range, the generated character tokens will be active (category code
+%   $13$); the \meta{catcode} argument is only used for codepoints in the
+%   ASCII range. To allow the result of this function to be used inside an
+%   expansion context, the result is protected by \cs{exp_not:n}.
+% \end{function}
+%
+% \begin{function}[EXP, added = 2022-10-09]
+%   {\codepoint_str_generate:n}
+%   \begin{syntax}
+%      \cs{codepoint_str_generate:n} \Arg{codepoint}
+%   \end{syntax}
+%   Generates one or more character tokens representing the \meta{codepoint}.
+%   With Unicode engines, exactly one character token will be generated.
+%   For $8$-bit engines, between one and four character tokens will be
+%   produced: these will be the bytes of the UTF-8 representation of the
+%   \meta{codepoint}. All of the generated character tokens will be of
+%   category code $12$, except any spaces (codepoint $32$), which will be
+%   category code $10$.
+% \end{function}
 %
 % \end{documentation}
 %
@@ -101,8 +147,11 @@
 % they are Unicode or $8$-bit internally. Parsing is therefore done by common
 % functions, with some data storage using engine-specific auxiliaries.
 %
-% \begin{macro}[EXP]{\@@_generate_str:n}
-% \begin{macro}[EXP]{\@@_generate_str:nnnn}
+% \begin{macro}[EXP]{\codepoint_str_generate:n}
+% \begin{macro}[EXP]{\@@_str_generate:nnnn}
+% \begin{macro}[EXP]{\codepoint_generate:nn}
+% \begin{macro}[EXP]{\@@_generate:nnnn}
+% \begin{macro}[EXP]{\@@_generate:n}
 %   Conversion of a codepoint to a character (Unicode engines) or to one
 %   or more bytes ($8$-bit engines) is required. For loading the data,
 %   all that is needed is the form which creates strings: these are outside
@@ -115,23 +164,36 @@
   { \sys_if_engine_luatex_p: }
   { \sys_if_engine_xetex_p: }
   {
-    \cs_new:Npn \@@_generate_str:n #1
+    \cs_new:Npn \codepoint_str_generate:n #1
       {
         \int_compare:nNnTF {#1} = { `\  }
           { ~ }
           { \char_generate:nn {#1} { 12 } }
       }
+   \cs_new:Npn \codepoint_generate:nn #1#2
+      {
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ }
+          {
+            \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+              { \char_generate:nn {#1} {#2} }
+          }
+      }
   }
   {
-    \cs_new:Npn \@@_generate_str:n #1
+    \cs_new:Npn \codepoint_str_generate:n #1
       {
-        \use:e
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ }
           {
-            \exp_not:N \@@_generate_str:nnnn
-              \char_to_utfviii_bytes:n {#1}
+            \use:e
+              {
+                \exp_not:N \@@_str_generate:nnnn
+                  \char_to_utfviii_bytes:n {#1}
+              }
           }
       }
-    \cs_new:Npn \@@_generate_str:nnnn #1#2#3#4
+    \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4
       {
         \char_generate:nn {#1} { 12 }
         \tl_if_blank:nF {#2}
@@ -145,10 +207,54 @@
               }
           }
       }
+    \cs_new:Npn \codepoint_generate:nn #1#2
+      {
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ }
+          {
+            \int_compare:nNnTF {#1} < { "80 }
+              {
+                \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+                  { \char_generate:nn {#1} {#2} }
+              }
+              {
+                \use:e
+                  {
+                    \exp_not:N \@@_generate:nnnn
+                      \char_to_utfviii_bytes:n {#1}
+                  }
+              }
+          }
+      }
+    \cs_new:Npn \@@_generate:nnnn #1#2#3#4
+      {
+        \__kernel_exp_not:w \exp_after:wN
+          {
+            \tex_expanded:D
+              {
+                \@@_generate:n {#1}
+                \@@_generate:n {#2}
+                \tl_if_blank:nF {#3}
+                  {
+                    \@@_generate:n {#3}
+                    \tl_if_blank:nF {#4}
+                      { \@@_generate:n {#4} }
+                  }
+              }
+          }
+      }
+     \cs_new:Npn \@@_generate:n #1
+       {
+         \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
+           { \char_generate:nn {#1} { 13 } }
+       }
   }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
 %
 % As only the data needs to remain at the end of this process, everything
 % is set up inside a group. The only thing that is outside is creating a
@@ -218,7 +324,7 @@
   \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
     {
       \tl_const:cx
-        { c_@@_nfd_ \@@_generate_str:n {"#1} _tl }
+        { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl }
         {
           {"#2}
           { \tl_if_blank:nF {#3} {"#3} }
@@ -270,7 +376,7 @@
       \int_compare:nNnF {#3} = { \@@_data_offset:nn {#1} {#5} }
         {
           \tl_const:cx
-            { c_@@_titlecase_ \@@_generate_str:n {"#1} _tl }
+            { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl }
             { {"#5} { } { } }
         }
       \tl_set:Nx \l_@@_next_codepoint_fint_tl
@@ -504,7 +610,7 @@
           \int_eval:n { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 }
             = "#3 ~
           \tl_const:cx
-            { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+            { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
             { {"#3} { } { } }
         \fi:
       \else:
@@ -519,7 +625,7 @@
 %    \begin{macrocode}
   \cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
     {
-      \tl_const:cx { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+      \tl_const:cx { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
         {
           {"#2}
           {"#3}
@@ -552,7 +658,7 @@
     {
       \tl_if_empty:nF {#4}
         {
-          \tl_const:cx { c_@@_ #2 case_ \@@_generate_str:n {"#1} _tl }
+          \tl_const:cx { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl }
             {
               {"#3}
               {"#4}
@@ -588,7 +694,7 @@
 \cs_new:Npn \__kernel_codepoint_case:nn #1#2
   {
     \exp_args:Ne \@@_case:nnn
-      { \@@_generate_str:n {#2} } {#1} {#2}
+      { \codepoint_str_generate:n {#2} } {#1} {#2}
   }
 \cs_new:Npn \@@_case:nnn #1#2#3
   {
@@ -621,7 +727,7 @@
 %   A simple interface.
 %    \begin{macrocode}
 \cs_new:Npn \__kernel_codepoint_nfd:n #1
-  { \exp_args:Ne \@@_nfd:nn { \@@_generate_str:n {#1} } {#1} }
+  { \exp_args:Ne \@@_nfd:nn { \codepoint_str_generate:n {#1} } {#1} }
 \cs_new:Npn \@@_nfd:nn #1#2
   {
     \tl_if_exist:cTF { c_@@_nfd_ #1 _tl }
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg
similarity index 81%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.luatex.tlg
index e7b068398..a812f4345 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.luatex.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^ad^^ad
+įį
+ͰͰ
+ԠԠ
+પપ
+ᄤᄤ
+𐀉𐀉
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
 ============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
 ============================================================
diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt
new file mode 100644
index 000000000..ed724c47e
--- /dev/null
+++ b/l3kernel/testfiles/m3unicode001.lvt
@@ -0,0 +1,45 @@
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+\documentclass{minimal}
+\input{regression-test}
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\begin{document}
+
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+\OMIT
+\cs_set:Npn \test:nn #1#2
+  {
+    \codepoint_generate:nn {#1} {#2}
+    \codepoint_str_generate:n {#1}
+  }
+\TIMO
+
+\TESTEXP { Codepoint~to~chars }
+  {
+    \test:nn { "0041 }  { 11 } \NEWLINE
+    \test:nn { "0041 }  { 12 } \NEWLINE
+    \test:nn { "00AD }  { 12 } \NEWLINE
+    \test:nn { "012F }  { 11 } \NEWLINE 
+    \test:nn { "0370 }  { 11 } \NEWLINE
+    \test:nn { "0520 }  { 11 } \NEWLINE
+    \test:nn { "0AAA }  { 11 } \NEWLINE
+    \test:nn { "1124 }  { 11 } \NEWLINE
+    \test:nn { "10009 } { 11 } \NEWLINE
+  }
+
+\TESTEXP { Spaces }
+  {
+    X \codepoint_generate:nn { 32 } { 11 } X \NEWLINE
+    X \codepoint_generate:nn { 32 } { 12 } X \NEWLINE
+    X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE
+  }
+
+\END
\ No newline at end of file
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.tlg
similarity index 70%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.tlg
index e7b068398..2924a5588 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^c2^^ad^^c2^^ad
+^^c4^^af^^c4^^af
+^^cd^^b0^^cd^^b0
+^^d4^^a0^^d4^^a0
+^^e0^^aa^^aa^^e0^^aa^^aa
+^^e1^^84^^a4^^e1^^84^^a4
+^^f0^^90^^80^^89^^f0^^90^^80^^89
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
 ============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
 ============================================================
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg
similarity index 81%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3unicode001.xetex.tlg
index e7b068398..a812f4345 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3unicode001.xetex.tlg
@@ -2,14 +2,22 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Codepoint to chars
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+AA
+AA
+^^ad^^ad
+įį
+ͰͰ
+ԠԠ
+પપ
+ᄤᄤ
+𐀉𐀉
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Spaces
 ============================================================
-abc#20cde
-abc#20cde
+X X
+X X
+XaX
 ============================================================