[latex3-commits] [git/LaTeX3-latex3-latex3] text-case, text-purify: Add \char_to_nfd:N (experimental) (38a5bb204)

Fri Jan 3 20:55:37 CET 2020

Repository : https://github.com/latex3/latex3
On branches: text-case,text-purify
Link       : https://github.com/latex3/latex3/commit/38a5bb2049aa291183b9ddb5e3815c4849e4aca9

>---------------------------------------------------------------

commit 38a5bb2049aa291183b9ddb5e3815c4849e4aca9
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Thu Jan 2 13:14:28 2020 +0000

    Add \char_to_nfd:N (experimental)
    
    The name may yet need work here: unicode?


>---------------------------------------------------------------

38a5bb2049aa291183b9ddb5e3815c4849e4aca9
 l3kernel/CHANGELOG.md                              |  1 +
 l3kernel/l3candidates.dtx                          | 10 +++++
 l3kernel/l3token.dtx                               | 32 ++++++++++++++
 l3kernel/l3unicode.dtx                             | 51 +++++++++++++++-------
 ...{m3str-convert003.tlg => m3token001.luatex.tlg} |  8 ++--
 l3kernel/testfiles/m3token001.lvt                  | 30 +++++++++++++
 .../{m3str-convert003.tlg => m3token001.tlg}       |  6 +--
 .../{m3str-convert003.tlg => m3token001.xetex.tlg} |  8 ++--
 8 files changed, 122 insertions(+), 24 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 87ec0f26d..3c7501e72 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers.
 ## [Unreleased]
 
 ### Added
+- `\char_to_nfd:N`
 - `\file_hex_dump:n(nn)` and `\file_get_hex_dump:n(nn)N(TF)`
 - `\str_<type>case:n`
 - `\text_expand:n` and supporting data structures
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 1c39b04d1..988d52859 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -880,6 +880,16 @@
 %   and |#3| and |#4| empty.
 % \end{function}
 %
+% \begin{function}[added = 2020-01-02, rEXP]{\char_to_nfd:N}
+%   \begin{syntax}
+%     \cs{char_to_nfd:N} \meta{char}
+%   \end{syntax}
+%   Converts the \meta{char} to its Unicode Normalization Form Canonical
+%   Decomposition (\textsc{nfd}), which may be more than one character. The
+%   first generated character has the same category code as the \meta{char}.
+%   With $8$-bit engines, no change is made to the character.
+% \end{function}
+%
 % \begin{function}[added = 2018-09-23]
 %   {
 %     \peek_catcode_collect_inline:Nn,
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 2f3d1805b..6addd0682 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1530,6 +1530,38 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}[rEXP]{\char_to_nfd:N}
+% \begin{macro}[rEXP]{\@@_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:Nw}
+%   Look up any stored \textsc{nfd} mapping and recursively decompose the result.
+%    \begin{macrocode}
+\cs_new:Npn \char_to_nfd:N #1
+  {
+    \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl } % mapping stored?
+      {
+        \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
+          \exp_after:wN \exp_after:wN \exp_after:wN #1
+            \cs:w c_@@_nfd_ \token_to_str:N #1 _ tl \cs_end: % expanded twice
+              \q_stop
+      }
+      { \exp_not:n {#1} } % no mapping: leave the character unchanged
+  }
+\cs_new_eq:NN \@@_to_nfd:n \char_to_nfd:N % braced-argument form for recursion
+\cs_new:Npn \@@_to_nfd:Nw #1#2#3 \q_stop
+  {
+    \exp_args:Ne \@@_to_nfd:n % first code point, catcode taken from #1
+      { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
+    \tl_if_blank:nF {#3} % second code point, if any, keeps its current catcode
+      {
+        \exp_args:Ne \@@_to_nfd:n
+          { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
 % \begin{macro}{\c_catcode_other_space_tl}
 %   Create a space with category code $12$: an \enquote{other} space.
 %    \begin{macrocode}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index f9cabd164..0faaa996e 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -116,17 +116,35 @@
             }
         } 
 %    \end{macrocode}
-% Parse the main Unicode data file for title case exceptions (the one-to-one
-% lower and upper case mappings it contains are all be covered by the \TeX{}
-% data). There are no comments in the main data file so this can be done using
-% a standard mapping and no checks.
+% Parse the main Unicode data file for two things. First, we want the titlecase
+% exceptions: the one-to-one lower- and uppercase mappings it contains are all
+% covered by the \TeX{} data. Second, we need normalization data: at present,
+% just the canonical \textsc{nfd} mappings. Those all yield either one or two
+% codepoints, so the split is relatively easy.
 %    \begin{macrocode}
       \ior_open:Nn \g_@@_data_ior { UnicodeData.txt }
       \cs_set_protected:Npn \@@_data_auxi:w
         #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
-        { \@@_data_auxii:w #1 ; }
-      \cs_set_protected:Npn \@@_data_auxii:w
-        #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 \q_stop
+        {
+          \tl_if_blank:nF {#6}
+            {
+              \tl_if_head_eq_charcode:nNF {#6}  < % >
+                { \@@_data_auxii:w #1 ; #6 ~ \q_stop }
+            }
+          \@@_data_auxiii:w #1 ;
+        }
+      \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop % #1 = code point, #2 #3 = decomposition (hex)
+        {
+          \tl_const:cx % one constant per code point that has a canonical decomposition
+            { c_@@_nfd_ \@@_generate_char:n {#1} _tl }
+            {
+              \@@_generate:n { "#2 } % first code point of the decomposition
+              \tl_if_blank:nF {#3} % #3 is blank when there is only one code point
+                { \@@_generate:n { "#3 } }
+            }
+        }
+      \cs_set_protected:Npn \@@_data_auxiii:w
+        #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ~ \q_stop
         {
           \cs_set_nopar:Npn \l_@@_tmpa_tl {#7}
           \reverse_if:N \if_meaning:w \l_@@_tmpa_tl \c_empty_tl
@@ -138,13 +156,16 @@
             \fi:
           \fi:
         }
-      \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
-        {
-          \if_meaning:w \l_@@_tmpa_tl \c_space_tl
-            \exp_after:wN \ior_map_break:
-          \fi:
-          \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
-        }
+      \group_begin:
+        \char_set_catcode_space:n { `\  }%
+        \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
+          {%
+            \if_meaning:w \l_@@_tmpa_tl \c_space_tl
+              \exp_after:wN \ior_map_break:
+            \fi:
+            \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
+          }%
+      \group_end:
       \ior_close:N \g_@@_data_ior
 %    \end{macrocode}
 % The other data files all use C-style comments so we have to worry about
@@ -188,7 +209,7 @@
         }
       \ior_close:N \g_@@_data_ior
 %    \end{macrocode}
-% For upper and lower casing special situations, there is a bit more to
+% For upper- and lowercasing special situations, there is a bit more to
 % do as we also have title casing to consider, plus we need to stop part-way
 % through the file.
 %    \begin{macrocode}
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.luatex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.luatex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.luatex.tlg
@@ -1,8 +1,10 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
 ============================================================
diff --git a/l3kernel/testfiles/m3token001.lvt b/l3kernel/testfiles/m3token001.lvt
new file mode 100644
index 000000000..2ea7a6691
--- /dev/null
+++ b/l3kernel/testfiles/m3token001.lvt
@@ -0,0 +1,30 @@
+%
+% Copyright (C) 2020 The LaTeX Project
+%
+
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+\makeatletter
+
+\begin{document}
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\TESTEXP{Unicode~NFD}{
+  \char_to_nfd:N A \NEWLINE
+  \bool_lazy_or:nnT { \sys_if_engine_luatex_p: } { \sys_if_engine_xetex_p: }
+    {
+      \char_to_nfd:N á \NEWLINE
+      \char_to_nfd:N ῒ
+    }
+}
+
+\END
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.tlg
index c89276a04..cd844648d 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.tlg
@@ -1,8 +1,8 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
 ============================================================
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.xetex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.xetex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.xetex.tlg
@@ -1,8 +1,10 @@
 This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
 ============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
 ============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
 ============================================================