[latex3-commits] [git/LaTeX3-latex3-latex3] main: Extend NFD support to 8-bit engines (069b6627b)
Joseph Wright
joseph.wright at morningstar2.co.uk
Mon Aug 29 17:37:30 CEST 2022
Repository : https://github.com/latex3/latex3
On branch : main
Link : https://github.com/latex3/latex3/commit/069b6627b9df818c374e6ef557682c7a7dfc91ad
>---------------------------------------------------------------
commit 069b6627b9df818c374e6ef557682c7a7dfc91ad
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Mon Aug 29 17:37:20 2022 +0200
Extend NFD support to 8-bit engines
>---------------------------------------------------------------
069b6627b9df818c374e6ef557682c7a7dfc91ad
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3candidates.dtx | 18 +++-
l3kernel/l3token.dtx | 108 +++++++++++++++++----
l3kernel/l3unicode.dtx | 8 +-
...r-convert005.ptex.tlg => m3token006.luatex.tlg} | 17 ++--
l3kernel/testfiles/m3token006.lvt | 35 +++++++
.../{m3str-convert005.ptex.tlg => m3token006.tlg} | 17 ++--
...tr-convert005.ptex.tlg => m3token006.xetex.tlg} | 17 ++--
8 files changed, 180 insertions(+), 41 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 41623263a..9ad2257f0 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -14,6 +14,7 @@ this project uses date-based 'snapshot' version identifiers.
- `\text_map_function:nN` and `\text_map_inline:nn` for mapping to
graphemes in textual input
- Support for medieval Latin case changing
+- `\char_to_nfd:n` to extend NFD support to 8-bit engines
## [2022-08-23]
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 155d51bc0..795322050 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -643,9 +643,21 @@
% \cs{char_to_nfd:N} \meta{char}
% \end{syntax}
% Converts the \meta{char} to the Unicode Normalization Form Canonical
-% Decomposition. The category code of the generated character is the
-% same as the \meta{char}. With $8$-bit engines, no change is made to the
-% character.
+% Decomposition. The category code of the \emph{first} generated character is
+% the same as the \meta{char}; second and subsequent chars will have the
+% current category code, as they would if typed in directly. For $8$-bit
+% engines, no change will take place.
+% \end{function}
+%
+% \begin{function}[added = 2022-08-29, rEXP]{\char_to_nfd:n}
+% \begin{syntax}
+% \cs{char_to_nfd:n} \Arg{codepoint}
+% \end{syntax}
+% Converts the (Unicode) \meta{codepoint} to the Unicode Normalization
+% Form Canonical Decomposition. The generated character(s) will have
+% the current category code as they would if typed in directly. In contrast
+% to \cs{char_to_nfd:N}, this function \emph{does} decompose codepoints
+% with $8$-bit engines.
% \end{function}
%
% \begin{function}[added = 2018-09-23]
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 5077a3114..439c8a1d5 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1795,36 +1795,110 @@
% \end{macro}
%
% \begin{macro}[rEXP]{\char_to_nfd:N}
-% \begin{macro}[rEXP]{\@@_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:n, \@@_to_nfd:e}
% \begin{macro}[rEXP]{\@@_to_nfd:Nw}
-% Look up any \textsc{nfd} and recursively produce the result.
-% \begin{macrocode}
-\cs_new:Npn \char_to_nfd:N #1
+% \begin{macro}[rEXP]{\char_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:w}
+% \begin{macro}[rEXP]{\@@_nfd_generate:n}
+% \begin{macro}[rEXP]{\@@_nfd_generate:nnnn}
+% Look up any \textsc{nfd} and recursively produce the result. Having shared
+% code between Unicode and $8$-bit engines would be ideal, but this would be
+% awkward as we have completely different treatment of catcodes, numbers
+% of tokens, etc. The apparent saving becomes more of a headache than it's
+% worth \dots
+% \begin{macrocode}
+\bool_lazy_or:nnTF
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
{
- \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+ \cs_new:Npn \char_to_nfd:N #1
{
- \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
- \exp_after:wN \exp_after:wN \exp_after:wN #1
- \cs:w c_@@_nfd_ \token_to_str:N #1 _ tl \cs_end:
- \s_@@_stop
+ \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
+ \exp_after:wN \exp_after:wN \exp_after:wN #1
+ \cs:w c_@@_nfd_ \token_to_str:N #1 _tl \cs_end:
+ \s_@@_stop
+ }
+ { \exp_not:n {#1} }
+ }
+ \cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N
+ \cs_generate_variant:Nn \@@_to_nfd:n { e }
+ \cs_new:Npn \@@_to_nfd:Nw #1#2#3 \s_@@_stop
+ {
+ \@@_to_nfd:e
+ { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
+ \tl_if_blank:nF {#3}
+ {
+ \@@_to_nfd:e
+ { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+ }
+ }
+ \cs_new:Npn \char_to_nfd:n #1
+ {
+ \@@_to_nfd:e { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
}
- { \exp_not:n {#1} }
}
-\cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N
-\cs_new:Npn \@@_to_nfd:Nw #1#2#3 \s_@@_stop
{
- \exp_args:Ne \@@_to_nfd:n
- { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
- \tl_if_blank:nF {#3}
+ \cs_new:Npn \char_to_nfd:N #1 { \exp_not:n {#1} }
+ \cs_new:Npn \char_to_nfd:n #1
+ {
+ \int_compare:nNnTF {#1} > { "80 }
+ { \exp_args:Ne \@@_to_nfd:n { \@@_nfd_generate:n {#1} } }
+ { \@@_nfd_generate:n {#1} }
+ }
+ \cs_new:Npn \@@_to_nfd:n #1
+ {
+ \cs_if_exist:cTF { c_@@_nfd_ \tl_to_str:n {#1} _ tl }
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:w
+ \cs:w c_@@_nfd_ \tl_to_str:n {#1} _tl \cs_end:
+ \s_@@_stop
+ }
+ { \exp_not:n {#1} }
+ }
+ \cs_new:Npn \@@_to_nfd:w #1#2 \s_@@_stop
{
- \exp_args:Ne \@@_to_nfd:n
- { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+ \@@_to_nfd:n {#1}
+ \tl_if_blank:nF {#2}
+ { \@@_to_nfd:n {#2} }
}
+ \cs_new:Npn \@@_nfd_generate:n #1
+ {
+ \use:e
+ {
+ \exp_not:N \@@_nfd_generate:nnnn
+ \char_to_utfviii_bytes:n {#1}
+ }
+ }
+ \cs_new:Npn \@@_nfd_generate:nnnn #1#2#3#4
+ {
+ \tl_if_blank:nTF {#2}
+ { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_not:N \char_generate:nn {#1} { 13 }
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_not:N \char_generate:nn {#2} { 13 }
+ \tl_if_blank:nF {#3}
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_not:N \char_generate:nn {#3} { 13 }
+ \tl_if_blank:nF {#4}
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN
+ \exp_not:N \char_generate:nn {#4} { 13 }
+ }
+ }
+ }
+
+ }
}
% \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
+% \end{macro}
%
% \begin{macro}[EXP]
% {
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index 321ef95a8..0675e87f7 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -160,9 +160,11 @@
\tl_const:cx
{ c_@@_nfd_ \@@_generate_other:n { "#1 } _tl }
{
- \@@_generate:n { "#2 }
- \tl_if_blank:nF {#3}
- { \@@_generate:n { "#3 } }
+ { \@@_generate:n { "#2 } }
+ {
+ \tl_if_blank:nF {#3}
+ { \@@_generate:n { "#3 } }
+ }
}
}
\cs_set_protected:Npn \@@_data_auxiii:w
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.luatex.tlg
similarity index 74%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.luatex.tlg
index e7b068398..a1c542abe 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.luatex.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
============================================================
-abc#20cde
-abc#20cde
+A
+Î
+Ή
============================================================
diff --git a/l3kernel/testfiles/m3token006.lvt b/l3kernel/testfiles/m3token006.lvt
new file mode 100644
index 000000000..9aeea764c
--- /dev/null
+++ b/l3kernel/testfiles/m3token006.lvt
@@ -0,0 +1,35 @@
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\START
+\AUTHOR{Joseph Wright}
+
+\ExplSyntaxOn
+
+\TESTEXP { Byte_decomposition }
+ {
+ \char_to_utfviii_bytes:n { `A } \NEWLINE
+ \char_to_utfviii_bytes:n { "00CE } \NEWLINE
+ \char_to_utfviii_bytes:n { "0389 } \NEWLINE
+ \char_to_utfviii_bytes:n { "1DAD } \NEWLINE
+ \char_to_utfviii_bytes:n { "FFFD } \NEWLINE
+ \char_to_utfviii_bytes:n { "10000 }
+ }
+
+\TESTEXP { Character~decomposition }
+ {
+ \char_to_nfd:n { `A } \NEWLINE
+ \char_to_nfd:n { "00CE } \NEWLINE
+ \char_to_nfd:n { "0389 }
+ }
+
+\END
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.tlg
similarity index 72%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.tlg
index e7b068398..45256798d 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
============================================================
-abc#20cde
-abc#20cde
+A
+I^^cc^^82
+^^ce^^97^^cc^^81
============================================================
diff --git a/l3kernel/testfiles/m3str-convert005.ptex.tlg b/l3kernel/testfiles/m3token006.xetex.tlg
similarity index 74%
copy from l3kernel/testfiles/m3str-convert005.ptex.tlg
copy to l3kernel/testfiles/m3token006.xetex.tlg
index e7b068398..a1c542abe 100644
--- a/l3kernel/testfiles/m3str-convert005.ptex.tlg
+++ b/l3kernel/testfiles/m3token006.xetex.tlg
@@ -2,14 +2,19 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
-TEST 1: PDF names
+TEST 1: Byte_decomposition
============================================================
-abczz
-brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+{65}{}{}{}
+{195}{142}{}{}
+{206}{137}{}{}
+{225}{182}{173}{}
+{239}{191}{189}{}
+{240}{144}{128}{128}
============================================================
============================================================
-TEST 2: PDF names with spaces
+TEST 2: Character decomposition
============================================================
-abc#20cde
-abc#20cde
+A
+Î
+Ή
============================================================
More information about the latex3-commits
mailing list.