[latex3-commits] [git/LaTeX3-latex3-latex3] main: Extend \str_<type>case:n to UTF-8 with 8-bit engines (8a5559e9d)
Joseph Wright
joseph.wright at morningstar2.co.uk
Thu Mar 24 08:14:52 CET 2022
Repository : https://github.com/latex3/latex3
On branch : main
Link : https://github.com/latex3/latex3/commit/8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7
>---------------------------------------------------------------
commit 8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Thu Mar 24 07:14:52 2022 +0000
Extend \str_<type>case:n to UTF-8 with 8-bit engines
>---------------------------------------------------------------
8a5559e9d0ee175e5c9c13af1f3a53006bb1bbb7
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3str.dtx | 128 ++++++++++++++++++++++++++++------
l3kernel/testfiles/m3str002.tlg | 6 +-
l3kernel/testfiles/m3str002.uptex.tlg | 6 +-
4 files changed, 112 insertions(+), 29 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 19354e46d..794417ecc 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -15,6 +15,7 @@ this project uses date-based 'snapshot' version identifiers.
### Changed
- Definition of `\legacy_if:n(TF)` to support primitive conditionals
+- `\str_<type>case:n` now case changes codepoints above 127 with all engines
## [2022-02-24]
diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
index be9dd9f07..1220d5182 100644
--- a/l3kernel/l3str.dtx
+++ b/l3kernel/l3str.dtx
@@ -747,16 +747,6 @@
% correctly deal with context-dependence and other factors appropriate
% to text case changing.
% \end{itemize}
-%
-% \begin{texnote}
-% As with all \pkg{expl3} functions, the input supported by
-% \cs{str_foldcase:n} is \emph{engine-native} characters which are or
-% interoperate with \textsc{utf-8}. As such, when used with \pdfTeX{}
-% \emph{only} the Latin alphabet characters A--Z are case-folded
-% (\emph{i.e.}~the \textsc{ascii} range which coincides with
-% \textsc{utf-8}). Full \textsc{utf-8} support is available with both
-% \XeTeX{} and \LuaTeX{}.
-% \end{texnote}
% \end{function}
%
% \begin{function}[EXP, added = 2019-11-26]
@@ -789,18 +779,6 @@
% a language-insensitive process, there is no special treatment of
% Turkic input (\emph{i.e.}~\texttt{I} always folds to \texttt{i} and
% not to \texttt{\i}).
-%
-% \begin{texnote}
-% As with all \pkg{expl3} functions, the input supported by
-% \cs{str_foldcase:n} is \emph{engine-native} characters which are or
-% interoperate with \textsc{utf-8}. As such, when used with \pdfTeX{}
-% \emph{only} the Latin alphabet characters A--Z are case-folded
-% (\emph{i.e.}~the \textsc{ascii} range which coincides with
-% \textsc{utf-8}). Full \textsc{utf-8} support is available with both
-% \XeTeX{} and \LuaTeX{}, subject only to the fact that \XeTeX{} in
-% particular has issues with characters of code above hexadecimal
-% $0\mathrm{xFFFF}$ when interacting with \cs{tl_to_str:n}.
-% \end{texnote}
% \end{function}
%
% \section{Viewing strings}
@@ -1924,10 +1902,21 @@
% \begin{macro}[EXP]{\@@_change_case_loop:nw}
% \begin{macro}[EXP]{\@@_change_case_space:n}
% \begin{macro}[EXP]{\@@_change_case_char:nN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNNN}
+% \begin{macro}[EXP]{\@@_change_case_char_UTFviii:nNNNN}
+% \begin{macro}[EXP]
+% {
+% \@@_change_case_char_UTFviii:nn ,
+% \@@_change_case_char_UTFviii_lower:nn ,
+% \@@_change_case_char_UTFviii_upper:nn ,
+% \@@_change_case_char_UTFviii_fold:nn
+% }
% Case changing for programmatic reasons is done by first detokenizing
% input then doing a simple loop that only has to worry about spaces
% and everything else. The output is detokenized to allow data sharing
-% with text-based case changing.
+% with text-based case changing. Similarly, for $8$-bit engines the
+% multi-byte information is shared.
% \begin{macrocode}
\cs_new:Npn \str_foldcase:n #1 { \@@_change_case:nn {#1} { fold } }
\cs_new:Npn \str_lowercase:n #1 { \@@_change_case:nn {#1} { lower } }
@@ -1971,7 +1960,100 @@
{ \use:c { char_str_ #1 case:N } #2 }
\@@_change_case_loop:nw {#1}
}
+\if_int_compare:w 0
+ \cs_if_exist:NT \tex_XeTeXversion:D { 1 }
+ \cs_if_exist:NT \tex_luatexversion:D { 1 }
+ > 0 \exp_stop_f:
+\else:
+ \cs_gset:Npn \@@_change_case_char:nN #1#2
+ {
+ \@@_if_recursion_tail_stop_do:Nn #2
+ { \@@_change_case_end:wn }
+ \int_compare:nNnTF { `#2 } > { "80 }
+ {
+ \int_compare:nNnTF { `#2 } < { "E0 }
+ { \@@_change_case_char_UTFviii:nNN }
+ {
+ \int_compare:nNnTF { `#2 } < { "F0 }
+ { \@@_change_case_char_UTFviii:nNNN }
+ { \@@_change_case_char_UTFviii:nNNNN }
+ }
+ {#1} #2
+ }
+ {
+ \@@_change_case_output:fw
+ { \use:c { char_str_ #1 case:N } #2 }
+ \@@_change_case_loop:nw {#1}
+ }
+ }
+ \cs_new:Npn \@@_change_case_char_UTFviii:nNN #1#2#3
+ { \@@_change_case_char_UTFviii:nn {#1} {#2#3} }
+ \cs_new:Npn \@@_change_case_char_UTFviii:nNNN #1#2#3#4
+ { \@@_change_case_char_UTFviii:nn {#1} {#2#3#4} }
+ \cs_new:Npn \@@_change_case_char_UTFviii:nNNNN #1#2#3#4#5
+ { \@@_change_case_char_UTFviii:nn {#1} {#2#3#4#5} }
% \end{macrocode}
+% Skip high chars for the Japanese engines.
+% \begin{macrocode}
+ \cs_if_exist:NF \tex_pdftexversion:D
+ {
+ \cs_gset:Npn \@@_change_case_char_UTFviii:nNNN #1#2#3#4
+ {
+ \@@_change_case_output:nw {#2#3#4}
+ \@@_change_case_loop:nw {#1}
+ }
+ \cs_gset:Npn \@@_change_case_char_UTFviii:nNNNN #1#2#3#4#5
+ {
+ \@@_change_case_output:nw {#2#3#4#5}
+ \@@_change_case_loop:nw {#1}
+ }
+ }
+ \cs_new:Npn \@@_change_case_char_UTFviii:nn #1#2
+ {
+ \use:c { @@_change_case_char_UTFviii_ #1 :nn } {#1} {#2}
+ }
+ \cs_new:Npn \@@_change_case_char_UTFviii_upper:nn #1#2
+ {
+ \@@_change_case_output:fw
+ {
+ \cs_if_exist:cTF { c__kernel_ #1 case_ #2 _tl }
+ {
+ \__kernel_tl_to_str:w \exp_after:wN \exp_after:wN \exp_after:wN
+ { \cs:w c__kernel_ #1 case_ #2 _tl \cs_end: }
+ }
+ {#2}
+ }
+ \@@_change_case_loop:nw {#1}
+ }
+ \cs_new_eq:NN \@@_change_case_char_UTFviii_lower:nn
+ \@@_change_case_char_UTFviii_upper:nn
+ \cs_new:Npn \@@_change_case_char_UTFviii_fold:nn #1#2
+ {
+ \@@_change_case_output:fw
+ {
+ \cs_if_exist:cTF { c__kernel_ #1 case_ #2 _tl }
+ {
+ \__kernel_tl_to_str:w \exp_after:wN \exp_after:wN \exp_after:wN
+ { \cs:w c__kernel_ #1 case_ #2 _tl \cs_end: }
+ }
+ {
+ \cs_if_exist:cTF { c__kernel_lowercase_ #2 _tl }
+ {
+ \__kernel_tl_to_str:w
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ { \cs:w c__kernel_lowercase_ #2 _tl \cs_end: }
+ }
+ {#2}
+ }
+ }
+ \@@_change_case_loop:nw {#1}
+ }
+\fi:
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
diff --git a/l3kernel/testfiles/m3str002.tlg b/l3kernel/testfiles/m3str002.tlg
index 13bb39da3..54be80513 100644
--- a/l3kernel/testfiles/m3str002.tlg
+++ b/l3kernel/testfiles/m3str002.tlg
@@ -17,9 +17,9 @@ TRUE
============================================================
TEST 3: Accented characters, etc.
============================================================
-"CAF^^c3^^a9"
-"^^c4^^86^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
-"^^e1^^be^^aa^^cf^^89^^ce^^9d"
+"CAF^^c3^^89"
+"^^c4^^87^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
+"^^e1^^be^^a2^^cf^^89^^ce^^bd"
"^^e1^^bf^^a3^^e1^^bf^^a2^^ef^^ac^^86"
"^^ea^^9a^^88^^ea^^9a^^87"
"^^ef^^bc^^ba^^ea^^9d^^8e^^e2^^93^^8d"
diff --git a/l3kernel/testfiles/m3str002.uptex.tlg b/l3kernel/testfiles/m3str002.uptex.tlg
index 13bb39da3..61f1dd171 100644
--- a/l3kernel/testfiles/m3str002.uptex.tlg
+++ b/l3kernel/testfiles/m3str002.uptex.tlg
@@ -17,9 +17,9 @@ TRUE
============================================================
TEST 3: Accented characters, etc.
============================================================
-"CAF^^c3^^a9"
-"^^c4^^86^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
-"^^e1^^be^^aa^^cf^^89^^ce^^9d"
+"CAF^^c3^^89"
+"^^c4^^87^^c4^^97^^c6^^8a^^e1^^b9^^90^^e1^^b9^^91"
+"^^e1^^be^^aa^^cf^^89^^ce^^bd"
"^^e1^^bf^^a3^^e1^^bf^^a2^^ef^^ac^^86"
"^^ea^^9a^^88^^ea^^9a^^87"
"^^ef^^bc^^ba^^ea^^9d^^8e^^e2^^93^^8d"
More information about the latex3-commits
mailing list.