[latex3-commits] [git/LaTeX3-latex3-latex3] l3text: Add \char_to_nfd:N (3d9cc0e80)

Mon Dec 2 21:43:17 CET 2019

Repository : https://github.com/latex3/latex3
On branch  : l3text
Link       : https://github.com/latex3/latex3/commit/3d9cc0e80453854e2f58d96c940c3f7d7216eecd

>---------------------------------------------------------------

commit 3d9cc0e80453854e2f58d96c940c3f7d7216eecd
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Mon Dec 2 20:42:48 2019 +0000

    Add \char_to_nfd:N
    
    This is needed for Greek case changing, but is potentially
    more general.


>---------------------------------------------------------------

3d9cc0e80453854e2f58d96c940c3f7d7216eecd
 l3kernel/CHANGELOG.md                              |  1 +
 l3kernel/l3token.dtx                               | 42 ++++++++++++++++++++++
 l3kernel/l3unicode.dtx                             | 30 ++++++++++++----
 ...{m3str-convert003.tlg => m3token001.luatex.tlg} |  8 +++--
 l3kernel/testfiles/m3token001.lvt                  | 30 ++++++++++++++++
 .../{m3str-convert003.tlg => m3token001.tlg}       |  6 ++--
 .../{m3str-convert003.tlg => m3token001.xetex.tlg} |  8 +++--
 7 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 510b57f19..a711ed31f 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers.
 ## [Unreleased]
 
 ### Added
+- `\char_to_nfd:N`
 - `\file_hex_dump:n(nn)` and `\file_get_hex_dump:n(nn)N(TF)`
 - `\str_foldcase:n`
 - `\str_lowercase:n`
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 854dd177d..05e239c2d 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -433,6 +433,16 @@
 %   the character code changes).
 % \end{function}
 %
+% \begin{function}[added = 2019-12-02, rEXP]{\char_to_nfd:N}
+%   \begin{syntax}
+%     \cs{char_to_nfd:N} \meta{char}
+%   \end{syntax}
+%   Converts the \meta{char} to its Unicode Normalization Form Canonical
+%   Decomposition, which may be more than one character. The first generated
+%   character has the category code of the \meta{char}; any further one has
+%   its current category code. With $8$-bit engines, no change is made.
+% \end{function}
+%
 % \section{Generic tokens}
 %
 % \begin{variable}
@@ -1843,6 +1853,38 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}[rEXP]{\char_to_nfd:N}
+% \begin{macro}[rEXP]{\@@_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:Nw}
+%   Look up any \textsc{nfd} mapping and decompose its result recursively.
+%    \begin{macrocode}
+\cs_new:Npn \char_to_nfd:N #1
+  {
+    \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+      {
+        \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
+          \exp_after:wN \exp_after:wN \exp_after:wN #1
+            \cs:w c_@@_nfd_ \token_to_str:N #1 _ tl \cs_end:
+              \q_stop
+      }
+      { \exp_not:n {#1} }
+  }
+\cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N
+\cs_new:Npn \@@_to_nfd:Nw #1#2#3 \q_stop
+  {
+    \exp_args:Ne \@@_to_nfd:n
+      { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
+    \tl_if_blank:nF {#3}
+      {
+        \exp_args:Ne \@@_to_nfd:n
+          { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
 % \begin{macro}{\c_catcode_other_space_tl}
 %   Create a space with category code $12$: an \enquote{other} space.
 %    \begin{macrocode}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index 31f51b812..8ae7cf37b 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -116,16 +116,34 @@
             }
         } 
 %    \end{macrocode}
-% Parse the main Unicode data file for title case exceptions (the one-to-one
-% lower- and uppercase mappings it contains are all be covered by the \TeX{}
-% data). There are no comments in the main data file so this can be done using
-% a standard mapping and no checks.
+% Parse the main Unicode data file for two things. First, we want the titlecase
+% exceptions: the one-to-one lower- and uppercase mappings it contains are all
+% covered by the \TeX{} data. Second, we need normalization data: at present,
+% just the canonical \textsc{nfd} mappings (entries starting |<| are skipped).
+% Those all yield either one or two codepoints, so the split is relatively easy.
 %    \begin{macrocode}
       \ior_open:Nn \g_@@_data_ior { UnicodeData.txt }
       \cs_set_protected:Npn \@@_data_auxi:w
         #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
-        { \@@_data_auxii:w #1 ; }
-      \cs_set_protected:Npn \@@_data_auxii:w
+        {
+          \tl_if_blank:nF {#6}
+            {
+              \tl_if_head_eq_charcode:nNF {#6}  < % >
+                { \@@_data_auxii:w #1 ; #6 ~ \q_stop }
+            }
+          \@@_data_auxiii:w #1 ;
+        }
+      \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
+        {
+          \tl_const:cx
+            { c_@@_nfd_ \@@_generate_char:n {#1} _tl }
+            {
+              \@@_generate:n { "#2 }
+              \tl_if_blank:nF {#3}
+                { \@@_generate:n { "#3 } }
+            }
+        }
+      \cs_set_protected:Npn \@@_data_auxiii:w
         #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ~ \q_stop
         {
           \cs_set_nopar:Npn \l_@@_tmpa_tl {#7}
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.luatex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.luatex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.luatex.tlg
@@ -1,8 +1,10 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
 ============================================================
diff --git a/l3kernel/testfiles/m3token001.lvt b/l3kernel/testfiles/m3token001.lvt
new file mode 100644
index 000000000..8f2a3b630
--- /dev/null
+++ b/l3kernel/testfiles/m3token001.lvt
@@ -0,0 +1,30 @@
+%
+% Copyright (C) 2019 The LaTeX Project
+%
+
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+\makeatletter
+
+\begin{document}
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% "A" is checked on every engine; the non-ASCII input only on LuaTeX and XeTeX.
+\TESTEXP{Unicode~NFD}{
+  \char_to_nfd:N A \NEWLINE
+  \bool_lazy_or:nnT { \sys_if_engine_luatex_p: } { \sys_if_engine_xetex_p: }
+    {
+      \char_to_nfd:N á \NEWLINE
+      \char_to_nfd:N ῒ
+    }
+}
+
+\END
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.tlg
index c89276a04..cd844648d 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.tlg
@@ -1,8 +1,8 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
 ============================================================
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.xetex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.xetex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.xetex.tlg
@@ -1,8 +1,10 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
 ============================================================