[latex3-commits] [git/LaTeX3-latex3-latex3] main: Extend \str_<type>case:n to UTF-8 with 8-bit engines (8a5559e9d)

Thu Mar 24 08:14:52 CET 2022

Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7

>---------------------------------------------------------------

commit 8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Thu Mar 24 07:14:52 2022 +0000

    Extend \str_<type>case:n to UTF-8 with 8-bit engines


>---------------------------------------------------------------

8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7
 l3kernel/CHANGELOG.md                 |   1 +
 l3kernel/l3str.dtx                    | 128 ++++++++++++++++++++++++++++------
 l3kernel/testfiles/m3str002.tlg       |   6 +-
 l3kernel/testfiles/m3str002.uptex.tlg |   6 +-
 4 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 19354e46d..794417ecc 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -15,6 +15,7 @@ this project uses date-based 'snapshot' version identifiers.
 
 ### Changed
 - Definition of `\legacy_if:n(TF)` to support primitive conditionals
+- `\str_<type>case:n` now case changes codepoints above 127 with all engines
 
 ## [2022-02-24]
 
diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
index be9dd9f07..1220d5182 100644
--- a/l3kernel/l3str.dtx
+++ b/l3kernel/l3str.dtx
@@ -747,16 +747,6 @@
 %       correctly deal with context-dependence and other factors appropriate
 %       to text case changing.
 %   \end{itemize}
-%
-%   \begin{texnote}
-%     As with all \pkg{expl3} functions, the input supported by
-%     \cs{str_foldcase:n} is \emph{engine-native} characters which are or
-%     interoperate with \textsc{utf-8}. As such, when used with \pdfTeX{}
-%     \emph{only} the Latin alphabet characters A--Z are case-folded
-%     (\emph{i.e.}~the \textsc{ascii} range which coincides with
-%     \textsc{utf-8}). Full \textsc{utf-8} support is available with both
-%     \XeTeX{} and \LuaTeX{}.
-%   \end{texnote}
 % \end{function}
 %
 % \begin{function}[EXP, added = 2019-11-26]
@@ -789,18 +779,6 @@
 %   a language-insensitive process, there is no special treatment of
 %   Turkic input (\emph{i.e.}~\texttt{I} always folds to \texttt{i} and
 %   not to \texttt{\i}).
-%
-%   \begin{texnote}
-%     As with all \pkg{expl3} functions, the input supported by
-%     \cs{str_foldcase:n} is \emph{engine-native} characters which are or
-%     interoperate with \textsc{utf-8}. As such, when used with \pdfTeX{}
-%     \emph{only} the Latin alphabet characters A--Z are case-folded
-%     (\emph{i.e.}~the \textsc{ascii} range which coincides with
-%     \textsc{utf-8}). Full \textsc{utf-8} support is available with both
-%     \XeTeX{} and \LuaTeX{}, subject only to the fact that \XeTeX{} in
-%     particular has issues with characters of code above hexadecimal
-%     $0\mathrm{xFFFF}$ when interacting with \cs{tl_to_str:n}.
-%   \end{texnote}
 % \end{function}
 %
 % \section{Viewing strings}
@@ -1924,10 +1902,21 @@
 % \begin{macro}[EXP]{\@@_change_case_loop:nw}
 % \begin{macro}[EXP]{\@@_change_case_space:n}
 % \begin{macro}[EXP]{\@@_change_case_char:nN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNNN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNNNN}
+% \begin{macro}[EXP]
+%   {
+%     \@@_change_case_char_UTFviii:nn ,
+%     \@@_change_case_char_UTFviii_lower:nn ,
+%     \@@_change_case_char_UTFviii_upper:nn ,
+%     \@@_change_case_char_UTFviii_fold:nn
+%   }
 %   Case changing for programmatic reasons is done by first detokenizing
 %   input then doing a simple loop that only has to worry about spaces
 %   and everything else. The output is detokenized to allow data sharing
-%   with text-based case changing.
+%   with text-based case changing. Similarly, for $8$-bit engines the
+%   multi-byte (\textsc{utf-8}) mapping data is shared with that code.
 %    \begin{macrocode}
 \cs_new:Npn \str_foldcase:n  #1 { \@@_change_case:nn {#1} { fold } }
 \cs_new:Npn \str_lowercase:n #1 { \@@_change_case:nn {#1} { lower } }
@@ -1971,7 +1960,100 @@
       { \use:c { char_str_ #1 case:N } #2 }
     \@@_change_case_loop:nw {#1}
   }
+\if_int_compare:w 0
+  \cs_if_exist:NT \tex_XeTeXversion:D { 1 }
+  \cs_if_exist:NT \tex_luatexversion:D { 1 }
+  > 0 \exp_stop_f: % true when at least one of the two primitives exists
+\else: % neither exists: install the byte-wise variant defined below
+  \cs_gset:Npn \@@_change_case_char:nN #1#2 % #1 = case type, #2 = next token
+    {
+      \@@_if_recursion_tail_stop_do:Nn #2
+        { \@@_change_case_end:wn }
+      \int_compare:nNnTF { `#2 } > { "80 } % char code above "80: multi-byte path
+        {
+          \int_compare:nNnTF { `#2 } < { "E0 } % below "E0: grab two tokens
+            { \@@_change_case_char_UTFviii:nNN }
+            {
+              \int_compare:nNnTF { `#2 } < { "F0 } % below "F0: three, else four
+                { \@@_change_case_char_UTFviii:nNNN }
+                { \@@_change_case_char_UTFviii:nNNNN }
+            }
+          {#1} #2 % arguments for whichever grabber was selected above
+        }
+        { % otherwise: single-token path via char_str_<#1>case:N
+          \@@_change_case_output:fw
+            { \use:c { char_str_ #1 case:N } #2 }
+          \@@_change_case_loop:nw {#1}
+        }
+    }
+  \cs_new:Npn \@@_change_case_char_UTFviii:nNN #1#2#3 % #1 = case type
+    { \@@_change_case_char_UTFviii:nn {#1} {#2#3} } % two tokens, one argument
+  \cs_new:Npn \@@_change_case_char_UTFviii:nNNN #1#2#3#4
+    { \@@_change_case_char_UTFviii:nn {#1} {#2#3#4} } % three tokens grouped
+  \cs_new:Npn \@@_change_case_char_UTFviii:nNNNN #1#2#3#4#5
+    { \@@_change_case_char_UTFviii:nn {#1} {#2#3#4#5} } % four tokens grouped
 %    \end{macrocode}
+%   For the Japanese engines, pass three- and four-byte sequences through as is.
+%    \begin{macrocode}
+  \cs_if_exist:NF \tex_pdftexversion:D % only if this primitive is undefined
+    {
+      \cs_gset:Npn \@@_change_case_char_UTFviii:nNNN #1#2#3#4
+        {
+          \@@_change_case_output:nw {#2#3#4} % no mapping lookup: output as is
+          \@@_change_case_loop:nw {#1} % carry on with the same case type
+        }
+      \cs_gset:Npn \@@_change_case_char_UTFviii:nNNNN #1#2#3#4#5
+        {
+          \@@_change_case_output:nw {#2#3#4#5} % likewise for four tokens
+          \@@_change_case_loop:nw {#1}
+        }
+    }
+  \cs_new:Npn \@@_change_case_char_UTFviii:nn #1#2 % #1 = type, #2 = byte tokens
+    { % select the _lower/_upper/_fold handler named by #1, forwarding both args
+      \use:c { @@_change_case_char_UTFviii_ #1 :nn } {#1} {#2}
+    }
+  \cs_new:Npn \@@_change_case_char_UTFviii_upper:nn #1#2 % #1 type, #2 bytes
+    {
+      \@@_change_case_output:fw
+        {
+          \cs_if_exist:cTF { c__kernel_ #1 case_ #2 _tl } % mapping tl for #2?
+            { % yes: expand the constructed name once, convert the result
+              \__kernel_tl_to_str:w \exp_after:wN \exp_after:wN \exp_after:wN
+                { \cs:w c__kernel_ #1 case_ #2 _tl \cs_end: }
+            }
+            {#2} % no: leave the sequence unchanged
+        }
+      \@@_change_case_loop:nw {#1} % continue with the rest of the input
+    }
+  \cs_new_eq:NN \@@_change_case_char_UTFviii_lower:nn % same body: the tl name
+    \@@_change_case_char_UTFviii_upper:nn % looked up is built from #1
+  \cs_new:Npn \@@_change_case_char_UTFviii_fold:nn #1#2 % #1 type, #2 bytes
+    {
+      \@@_change_case_output:fw
+        {
+          \cs_if_exist:cTF { c__kernel_ #1 case_ #2 _tl } % first: <#1>case tl
+            {
+              \__kernel_tl_to_str:w \exp_after:wN \exp_after:wN \exp_after:wN
+                { \cs:w c__kernel_ #1 case_ #2 _tl \cs_end: }
+            }
+            { % none: fall back to the lowercase tl for the same bytes
+              \cs_if_exist:cTF { c__kernel_lowercase_ #2 _tl }
+                {
+                  \__kernel_tl_to_str:w
+                    \exp_after:wN \exp_after:wN \exp_after:wN
+                    { \cs:w c__kernel_lowercase_ #2 _tl \cs_end: }
+                }
+                {#2} % neither exists: leave the sequence unchanged
+            }
+        }
+      \@@_change_case_loop:nw {#1} % continue with the rest of the input
+    }
+\fi: % closes the \if_int_compare:w engine test opened before these definitions
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
 % \end{macro}
 % \end{macro}
 % \end{macro}
diff --git a/l3kernel/testfiles/m3str002.tlg b/l3kernel/testfiles/m3str002.tlg
index 13bb39da3..54be80513 100644
--- a/l3kernel/testfiles/m3str002.tlg
+++ b/l3kernel/testfiles/m3str002.tlg
@@ -17,9 +17,9 @@ TRUE
 ============================================================
 TEST 3: Accented characters, etc.
 ============================================================
-"CAF^^c3^^a9"
-"^^c4^^86^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
-"^^e1^^be^^aa^^cf^^89^^ce^^9d"
+"CAF^^c3^^89"
+"^^c4^^87^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
+"^^e1^^be^^a2^^cf^^89^^ce^^bd"
 "^^e1^^bf^^a3^^e1^^bf^^a2^^ef^^ac^^86"
 "^^ea^^9a^^88^^ea^^9a^^87"
 "^^ef^^bc^^ba^^ea^^9d^^8e^^e2^^93^^8d"
diff --git a/l3kernel/testfiles/m3str002.uptex.tlg b/l3kernel/testfiles/m3str002.uptex.tlg
index 13bb39da3..61f1dd171 100644
--- a/l3kernel/testfiles/m3str002.uptex.tlg
+++ b/l3kernel/testfiles/m3str002.uptex.tlg
@@ -17,9 +17,9 @@ TRUE
 ============================================================
 TEST 3: Accented characters, etc.
 ============================================================
-"CAF^^c3^^a9"
-"^^c4^^86^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
-"^^e1^^be^^aa^^cf^^89^^ce^^9d"
+"CAF^^c3^^89"
+"^^c4^^87^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
+"^^e1^^be^^aa^^cf^^89^^ce^^bd"
 "^^e1^^bf^^a3^^e1^^bf^^a2^^ef^^ac^^86"
 "^^ea^^9a^^88^^ea^^9a^^87"
 "^^ef^^bc^^ba^^ea^^9d^^8e^^e2^^93^^8d"