[latex3-commits] [git/LaTeX3-latex3-latex3] text-case, text-purify: Add \char_to_nfd:N (experimental) (38a5bb204)
Joseph Wright
joseph.wright at morningstar2.co.uk
Fri Jan 3 20:55:37 CET 2020
Repository : https://github.com/latex3/latex3
On branches: text-case,text-purify
Link : https://github.com/latex3/latex3/commit/38a5bb2049aa291183b9ddb5e3815c4849e4aca9
>---------------------------------------------------------------
commit 38a5bb2049aa291183b9ddb5e3815c4849e4aca9
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Thu Jan 2 13:14:28 2020 +0000
Add \char_to_nfd:N (experimental)
The name may yet need work here: unicode?
>---------------------------------------------------------------
38a5bb2049aa291183b9ddb5e3815c4849e4aca9
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3candidates.dtx | 10 +++++
l3kernel/l3token.dtx | 32 ++++++++++++++
l3kernel/l3unicode.dtx | 51 +++++++++++++++-------
...{m3str-convert003.tlg => m3token001.luatex.tlg} | 8 ++--
l3kernel/testfiles/m3token001.lvt | 30 +++++++++++++
.../{m3str-convert003.tlg => m3token001.tlg} | 6 +--
.../{m3str-convert003.tlg => m3token001.xetex.tlg} | 8 ++--
8 files changed, 122 insertions(+), 24 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 87ec0f26d..3c7501e72 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers.
## [Unreleased]
### Added
+- `\char_to_nfd:N`
- `\file_hex_dump:n(nn)` and `\file_get_hex_dump:n(nn)N(TF)`
- `\str_<type>case:n`
- `\text_expand:n` and supporting data structures
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
index 1c39b04d1..988d52859 100644
--- a/l3kernel/l3candidates.dtx
+++ b/l3kernel/l3candidates.dtx
@@ -880,6 +880,16 @@
% and |#3| and |#4| empty.
% \end{function}
%
+% \begin{function}[added = 2020-01-02, rEXP]{\char_to_nfd:N}
+% \begin{syntax}
+% \cs{char_to_nfd:N} \meta{char}
+% \end{syntax}
+% Converts the \meta{char} to the Unicode Normalization Form Canonical
+% Decomposition. The category code of the generated character is the
+% same as the \meta{char}. With $8$-bit engines, no change is made to the
+% character.
+% \end{function}
+%
% \begin{function}[added = 2018-09-23]
% {
% \peek_catcode_collect_inline:Nn,
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 2f3d1805b..6addd0682 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1530,6 +1530,38 @@
% \end{macro}
% \end{macro}
%
+% \begin{macro}[rEXP]{\char_to_nfd:N}
+% \begin{macro}[rEXP]{\@@_to_nfd:n}
+% \begin{macro}[rEXP]{\@@_to_nfd:Nw}
+% Look up any \textsc{nfd} mapping and recursively decompose each part of it.
+% \begin{macrocode}
+\cs_new:Npn \char_to_nfd:N #1
+ {
+ \cs_if_exist:cTF { c_@@_nfd_ \token_to_str:N #1 _ tl }
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN \@@_to_nfd:Nw
+ \exp_after:wN \exp_after:wN \exp_after:wN #1
+ \cs:w c_@@_nfd_ \token_to_str:N #1 _ tl \cs_end:
+ \q_stop
+ }
+ { \exp_not:n {#1} }
+ }
+\cs_set_eq:NN \@@_to_nfd:n \char_to_nfd:N
+\cs_new:Npn \@@_to_nfd:Nw #1#2#3 \q_stop
+ {
+ \exp_args:Ne \@@_to_nfd:n
+ { \char_generate:nn { `#2 } { \@@_change_case_catcode:N #1 } }
+ \tl_if_blank:nF {#3}
+ {
+ \exp_args:Ne \@@_to_nfd:n
+ { \char_generate:nn { `#3 } { \char_value_catcode:n { `#3 } } }
+ }
+ }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
% \begin{macro}{\c_catcode_other_space_tl}
% Create a space with category code $12$: an \enquote{other} space.
% \begin{macrocode}
diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
index f9cabd164..0faaa996e 100644
--- a/l3kernel/l3unicode.dtx
+++ b/l3kernel/l3unicode.dtx
@@ -116,17 +116,35 @@
}
}
% \end{macrocode}
-% Parse the main Unicode data file for title case exceptions (the one-to-one
-% lower and upper case mappings it contains are all be covered by the \TeX{}
-% data). There are no comments in the main data file so this can be done using
-% a standard mapping and no checks.
+% Parse the main Unicode data file for two things. First, we want the titlecase
+% exceptions: the one-to-one lower- and uppercase mappings it contains are all
+% covered by the \TeX{} data. Second, we need normalization data: at present,
+% just the canonical \textsc{nfd} mappings. Those all yield either one or two
+% codepoints, so the split is relatively easy.
% \begin{macrocode}
\ior_open:Nn \g_@@_data_ior { UnicodeData.txt }
\cs_set_protected:Npn \@@_data_auxi:w
#1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
- { \@@_data_auxii:w #1 ; }
- \cs_set_protected:Npn \@@_data_auxii:w
- #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 \q_stop
+ {
+ \tl_if_blank:nF {#6}
+ {
+ \tl_if_head_eq_charcode:nNF {#6} < % >
+ { \@@_data_auxii:w #1 ; #6 ~ \q_stop }
+ }
+ \@@_data_auxiii:w #1 ;
+ }
+ \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
+ {
+ \tl_const:cx
+ { c_@@_nfd_ \@@_generate_char:n {#1} _tl }
+ {
+ \@@_generate:n { "#2 }
+ \tl_if_blank:nF {#3}
+ { \@@_generate:n { "#3 } }
+ }
+ }
+ \cs_set_protected:Npn \@@_data_auxiii:w
+ #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ~ \q_stop
{
\cs_set_nopar:Npn \l_@@_tmpa_tl {#7}
\reverse_if:N \if_meaning:w \l_@@_tmpa_tl \c_empty_tl
@@ -138,13 +156,16 @@
\fi:
\fi:
}
- \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
- {
- \if_meaning:w \l_@@_tmpa_tl \c_space_tl
- \exp_after:wN \ior_map_break:
- \fi:
- \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
- }
+ \group_begin:
+ \char_set_catcode_space:n { `\ }%
+ \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
+ {%
+ \if_meaning:w \l_@@_tmpa_tl \c_space_tl
+ \exp_after:wN \ior_map_break:
+ \fi:
+ \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
+ }%
+ \group_end:
\ior_close:N \g_@@_data_ior
% \end{macrocode}
% The other data files all use C-style comments so we have to worry about
@@ -188,7 +209,7 @@
}
\ior_close:N \g_@@_data_ior
% \end{macrocode}
-% For upper and lower casing special situations, there is a bit more to
+% For upper- and lowercasing special situations, there is a bit more to
% do as we also have title casing to consider, plus we need to stop part-way
% through the file.
% \begin{macrocode}
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.luatex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.luatex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.luatex.tlg
@@ -1,8 +1,10 @@
This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
============================================================
diff --git a/l3kernel/testfiles/m3token001.lvt b/l3kernel/testfiles/m3token001.lvt
new file mode 100644
index 000000000..2ea7a6691
--- /dev/null
+++ b/l3kernel/testfiles/m3token001.lvt
@@ -0,0 +1,30 @@
+%
+% Copyright (C) 2020 The LaTeX Project
+%
+
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+\makeatletter
+
+\begin{document}
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\TESTEXP{Unicode~NFD}{
+ \char_to_nfd:N A \NEWLINE
+ \bool_lazy_or:nnT { \sys_if_engine_luatex_p: } { \sys_if_engine_xetex_p: }
+ {
+ \char_to_nfd:N á \NEWLINE
+ \char_to_nfd:N ῒ
+ }
+}
+
+\END
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.tlg
index c89276a04..cd844648d 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.tlg
@@ -1,8 +1,8 @@
This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
============================================================
diff --git a/l3kernel/testfiles/m3str-convert003.tlg b/l3kernel/testfiles/m3token001.xetex.tlg
similarity index 78%
copy from l3kernel/testfiles/m3str-convert003.tlg
copy to l3kernel/testfiles/m3token001.xetex.tlg
index c89276a04..bdf4776d3 100644
--- a/l3kernel/testfiles/m3str-convert003.tlg
+++ b/l3kernel/testfiles/m3token001.xetex.tlg
@@ -1,8 +1,10 @@
This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
-Author: Bruno Le Floch
+Author: Joseph Wright
============================================================
-TEST 1: str if contains char
+TEST 1: Unicode NFD
============================================================
-FALSE TRUE FALSE FALSE TRUE
+A
+á
+ῒ
============================================================
More information about the latex3-commits
mailing list