[latex3-commits] [git/LaTeX3-latex3-latex3] main: Extend NFD support to 8-bit engines (069b6627b)

Mon Aug 29 17:37:30 CEST 2022

Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/069b6627b9df818c374e6ef557682c7a7dfc91ad

>---------------------------------------------------------------

commit 069b6627b9df818c374e6ef557682c7a7dfc91ad
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Mon Aug 29 17:37:20 2022 +0200

    Extend NFD support to 8-bit engines


>---------------------------------------------------------------

069b6627b9df818c374e6ef557682c7a7dfc91ad
 l3kernel/CHANGELOG.md                              |   1 +
 l3kernel/l3candidates.dtx                          |  18 +++-
 l3kernel/l3token.dtx                               | 108 +++++++++++++++++----
 l3kernel/l3unicode.dtx                             |   8 +-
 ...r-convert005.ptex.tlg => m3token006.luatex.tlg} |  17 ++--
 l3kernel/testfiles/m3token006.lvt                  |  35 +++++++
 .../{m3str-convert005.ptex.tlg => m3token006.tlg}  |  17 ++--
 ...tr-convert005.ptex.tlg => m3token006.xetex.tlg} |  17 ++--
 8 files changed, 180 insertions(+), 41 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 41623263a..9ad2257f0 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -14,6 +14,7 @@ this project uses date-based 'snapshot' version identifiers.
 - `\text_map_function:nN` and `\text_map_inline:nn` for mapping to
   graphemes in textual input
 - Support for medevial Latin case changing
+- `\char_to_nfd:n` to extend NFD support to 8-bit engines
 
 ## [2022-08-23]
 
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 155d51bc0..795322050 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -643,9 +643,21 @@
 %     \cs{char_to_nfd:N} \meta{char}
 %   \end{syntax}
 %   Converts the \meta{char} to the Unicode Normalization Form Canonical
-%   Decomposition. The category code of the generated character is the
-%   same as the \meta{char}. With $8$-bit engines, no change is made to the
-%   character.
+%   Decomposition. The category code of the \emph{first} generated character is
+%   the same as the \meta{char}; second and subsequent chars will have the
+%   current category code, as they would if typed in directly. For $8$-bit
+%   engines, no change will take place.
+% \end{function}
+%
+% \begin{function}[added = 2022-08-29, rEXP]{\char_to_nfd:n}
+%   \begin{syntax}
+%     \cs{char_to_nfd:n} \Arg{codepoint}
+%   \end{syntax}
+%   Converts the (Unicode) \meta{codepoint} to the Unicode Normalization
+%   Form Canonical Decomposition. The generated character(s) will have
+%   the current category code as they would if typed in directly. In contrast
+%   to \cs{char_to_nfd:N}, this function \emph{does} decompose codepoints
+%   with $8$-bit engines.
 % \end{function}
 %
 % \begin{function}[added = 2018-09-23]
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 5077a3114..439c8a1d5 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1795,36 +1795,110 @@
 % \end{macro}
 %
 % \begin{macro}[rEXP]{\char_to_nfd:N}
-% \begin{macro}[rEXP]{\@@_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:n, \@@_to_nfd:e}
 % \begin{macro}[rEXP]{\@@_to_nfd:Nw}
-%   Look up any \textsc{nfd} and recursively produce the result.
-%    \begin{macrocode}
-\cs_new:Npn \char_to_nfd:N #1
+% \begin{macro}[rEXP]
+%   {
+%     \char_to_nfd:n, \@@_to_nfd:w, \@@_nfd_generate:n, \@@_nfd_generate:nnnn
+%   }
+%   Look up any \textsc{nfd} and recursively produce the result. Having shared
+%   code between Unicode and $8$-bit engines would be ideal, but this would be
+%   awkward as we have completely different treatment of catcodes, numbers
+%   of tokens, etc. The apparent saving becomes more of a headache than it's
+%   worth \dots
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
   {
-    \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+    \cs_new:Npn \char_to_nfd:N #1
       {
-        \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
-          \exp_after:wN \exp_after:wN \exp_after:wN #1
-            \cs:w c_@@_nfd_ \token_to_str:N #1 _ tl \cs_end:
-              \s_@@_stop
+        \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+          {
+            \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
+              \exp_after:wN \exp_after:wN \exp_after:wN #1
+                \cs:w c_@@_nfd_ \token_to_str:N #1 _tl \cs_end:
+                  \s_@@_stop
+          }
+          { \exp_not:n {#1} }
+      }
+    \cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N
+    \cs_generate_variant:Nn \@@_to_nfd:n { e }
+    \cs_new:Npn \@@_to_nfd:Nw #1#2#3 \s_@@_stop
+      {
+        \@@_to_nfd:e
+          { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
+        \tl_if_blank:nF {#3}
+          {
+            \@@_to_nfd:e
+               { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+          }
+      }
+    \cs_new:Npn \char_to_nfd:n #1
+      {
+        \@@_to_nfd:e { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
       }
-      { \exp_not:n {#1} }
   }
-\cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N
-\cs_new:Npn \@@_to_nfd:Nw #1#2#3 \s_@@_stop
   {
-    \exp_args:Ne \@@_to_nfd:n
-      { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
-    \tl_if_blank:nF {#3}
+    \cs_new:Npn \char_to_nfd:N #1 { \exp_not:n {#1} }
+    \cs_new:Npn \char_to_nfd:n #1
+      {
+        \int_compare:nNnTF {#1} > { "80 }
+          { \exp_args:Ne \@@_to_nfd:n { \@@_nfd_generate:n {#1} } }
+          { \@@_nfd_generate:n {#1} }
+      }
+    \cs_new:Npn \@@_to_nfd:n #1
+      {
+        \cs_if_exist:cTF { c_@@_nfd_ \tl_to_str:n {#1} _ tl }
+          {
+            \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:w
+              \cs:w c_@@_nfd_ \tl_to_str:n {#1} _tl \cs_end:
+                \s_@@_stop
+          }
+          { \exp_not:n {#1} }
+      }
+    \cs_new:Npn \@@_to_nfd:w #1#2 \s_@@_stop
       {
-        \exp_args:Ne \@@_to_nfd:n
-          { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+        \@@_to_nfd:n {#1}
+        \tl_if_blank:nF {#2}
+          { \@@_to_nfd:n {#2} }
       }
+     \cs_new:Npn \@@_nfd_generate:n #1
+       {
+         \use:e
+           {
+             \exp_not:N \@@_nfd_generate:nnnn
+               \char_to_utfviii_bytes:n {#1}
+           }
+       }
+      \cs_new:Npn \@@_nfd_generate:nnnn #1#2#3#4
+        {
+          \tl_if_blank:nTF {#2}
+            { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
+            {
+              \exp_after:wN \exp_after:wN \exp_after:wN
+                \exp_not:N \char_generate:nn {#1} { 13 }
+              \exp_after:wN \exp_after:wN \exp_after:wN
+                \exp_not:N \char_generate:nn {#2} { 13 }
+              \tl_if_blank:nF {#3}
+                {
+                  \exp_after:wN \exp_after:wN \exp_after:wN
+                    \exp_not:N \char_generate:nn {#3} { 13 }
+                  \tl_if_blank:nF {#4}
+                    {
+                      \exp_after:wN \exp_after:wN \exp_after:wN
+                        \exp_not:N \char_generate:nn {#4} { 13 }
+                    }
+                }
+            }
+           
+        }
   }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
 % \end{macro}
+% \end{macro}
 %
 % \begin{macro}[EXP]
 %   {
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index 321ef95a8..0675e87f7 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -160,9 +160,11 @@
       \tl_const:cx
         { c_@@_nfd_ \@@_generate_other:n { "#1 } _tl }
         {
-          \@@_generate:n { "#2 }
-          \tl_if_blank:nF {#3}
-            { \@@_generate:n { "#3 } }
+          { \@@_generate:n { "#2 } }
+          {
+            \tl_if_blank:nF {#3}
+              { \@@_generate:n { "#3 } }
+            }
         }
     }
   \cs_set_protected:Npn \@@_data_auxiii:w
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.luatex.tlg
similarity index 74%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.luatex.tlg
index e7b068398..a1c542abe 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.luatex.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
 ============================================================
-abc#20cde
-abc#20cde
+A
+Î
+Ή
 ============================================================
diff --git a/l3kernel/testfiles/m3token006.lvt b/l3kernel/testfiles/m3token006.lvt
new file mode 100644
index 000000000..9aeea764c
--- /dev/null
+++ b/l3kernel/testfiles/m3token006.lvt
@@ -0,0 +1,35 @@
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\START
+\AUTHOR{Joseph Wright}
+
+\ExplSyntaxOn
+
+\TESTEXP { Byte_decomposition }
+  {
+    \char_to_utfviii_bytes:n { `A } \NEWLINE
+    \char_to_utfviii_bytes:n { "00CE } \NEWLINE
+    \char_to_utfviii_bytes:n { "0389 } \NEWLINE
+    \char_to_utfviii_bytes:n { "1DAD } \NEWLINE
+    \char_to_utfviii_bytes:n { "FFFD } \NEWLINE
+    \char_to_utfviii_bytes:n { "10000 }
+  }
+
+\TESTEXP { Character~decomposition }
+  {
+    \char_to_nfd:n { `A } \NEWLINE
+    \char_to_nfd:n { "00CE } \NEWLINE
+    \char_to_nfd:n { "0389 }
+  }
+
+\END
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.tlg
similarity index 72%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.tlg
index e7b068398..45256798d 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
 ============================================================
-abc#20cde
-abc#20cde
+A
+I^^cc^^82
+^^ce^^97^^cc^^81
 ============================================================
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.xetex.tlg
similarity index 74%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.xetex.tlg
index e7b068398..a1c542abe 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.xetex.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
 ============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
 ============================================================
 ============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
 ============================================================
-abc#20cde
-abc#20cde
+A
+Î
+Ή
 ============================================================