[latex3-commits] [git/LaTeX3-latex3-latex3] gh621: Change catcode produced by default by l3regex replacement (fixes #621) (0555577ad)
Bruno Le Floch
blflatex at gmail.com
Sun Apr 25 01:27:23 CEST 2021
Repository : https://github.com/latex3/latex3
On branch : gh621
Link : https://github.com/latex3/latex3/commit/0555577adf43af7b5aa1cda8fbb29c461fb48d03
>---------------------------------------------------------------
commit 0555577adf43af7b5aa1cda8fbb29c461fb48d03
Author: Bruno Le Floch <blflatex at gmail.com>
Date: Sun Apr 25 01:27:23 2021 +0200
Change catcode produced by default by l3regex replacement (fixes #621)
>---------------------------------------------------------------
0555577adf43af7b5aa1cda8fbb29c461fb48d03
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3regex.dtx | 64 +++++++++++++++++++++++---------
l3kernel/testfiles/m3regex005.luatex.tlg | 48 ++++++++++++++++++++----
l3kernel/testfiles/m3regex005.lvt | 13 ++++++-
l3kernel/testfiles/m3regex005.tlg | 48 ++++++++++++++++++++----
l3kernel/testfiles/m3regex005.xetex.tlg | 48 ++++++++++++++++++++----
6 files changed, 180 insertions(+), 42 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 055589332..37ca37667 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -13,6 +13,7 @@ this project uses date-based 'snapshot' version identifiers.
- Color export in comma-separated format
### Changed
+- Use prevailing catcodes instead of string in regex replacement (issue #621)
- `\__kernel_file_name_sanitize:n` now uses a faster `\csname`-based
approach to expand the file name.
- `\pdf_version_gset:n` for `dvips`.
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 5569afdd8..b25a659a8 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -426,10 +426,17 @@
% the last match is used in the replacement text. Submatches always keep
% the same category codes as in the original token list.
%
-% The characters inserted by the replacement have category code $12$
-% (other) by default, with the exception of space characters. Spaces
-% inserted through \verb*|\ | have category code $10$, while spaces
-% inserted through |\x20| or |\x{20}| have category code $12$.
+% By default, the category codes of characters inserted by the
+% replacement are determined by the prevailing category code regime at
+% the time when the replacement is made, with two exceptions:
+% \begin{itemize}
+% \item space characters (with character code $32$) inserted with
+% \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless
+% of the prevailing category code regime;
+% \item if the category code would be $0$~(escape), $5$~(newline),
+% $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by
+% $12$~(other) instead.
+% \end{itemize}
% The escape sequence |\c| allows to insert characters
% with arbitrary category codes, as well as control sequences.
% \begin{l3regex-syntax}
@@ -5304,7 +5311,7 @@
% \end{macrocode}
% \end{macro}
%
-% \begin{macro}{\@@_replacement_normal:n}
+% \begin{macro}{\@@_replacement_normal:n, \@@_replacement_normal_aux:N}
% Most characters are simply sent to the output by
% \cs{tl_build_put_right:Nn}, unless a particular category code has been
% requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is
@@ -5313,13 +5320,16 @@
% sequence is non-empty there: it contains an empty entry
% corresponding to the initial value of
% \cs{l_@@_replacement_category_tl}.
-% The argument |#1| can be a space, otherwise it is a single
-% character.
+% The argument |#1| is a single character (including the case of a catcode-other space).
+% In case no specific catcode is requested, we take into account the
+% current catcode regime (at the time the replacement is performed)
+% as much as reasonable, with all impossible catcodes (escape,
+% newline, etc.) being mapped to \enquote{other}.
% \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_normal:n #1
{
\tl_if_empty:NTF \l_@@_replacement_category_tl
- { \@@_replacement_put:n {#1} }
+ { \@@_replacement_normal_aux:N #1 }
{ % (
\token_if_eq_charcode:NNTF #1 )
{
@@ -5327,15 +5337,37 @@
\l_@@_replacement_category_tl
}
{
- \use:c
- {
- @@_replacement_c_
- \l_@@_replacement_category_tl :w
- }
- \@@_replacement_normal:n {#1}
+ \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w }
+ ? #1
}
}
}
+\cs_new_protected:Npn \@@_replacement_normal_aux:N #1
+ {
+ \token_if_eq_charcode:NNTF #1 \c_space_token
+ { \@@_replacement_c_S:w }
+ {
+ \exp_after:wN \exp_after:wN
+ \if_case:w \tex_catcode:D `#1 \exp_stop_f:
+ \@@_replacement_c_O:w
+ \or: \@@_replacement_c_B:w
+ \or: \@@_replacement_c_E:w
+ \or: \@@_replacement_c_M:w
+ \or: \@@_replacement_c_T:w
+ \or: \@@_replacement_c_O:w
+ \or: \@@_replacement_c_P:w
+ \or: \@@_replacement_c_U:w
+ \or: \@@_replacement_c_D:w
+ \or: \@@_replacement_c_O:w
+ \or: \@@_replacement_c_S:w
+ \or: \@@_replacement_c_L:w
+ \or: \@@_replacement_c_O:w
+ \or: \@@_replacement_c_A:w
+ \else: \@@_replacement_c_O:w
+ \fi:
+ }
+ ? #1
+ }
% \end{macrocode}
% \end{macro}
%
@@ -5343,7 +5375,6 @@
% As in parsing a regular expression, we use an auxiliary built from
% |#1| if defined. Otherwise, check for escaped digits (standing from
% submatches from $0$ to $9$): anything else is a raw character.
-% We use \cs{token_to_str:N} to give spaces the right category code.
% \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_escaped:N #1
{
@@ -5352,8 +5383,7 @@
\if_int_compare:w 1 < 1#1 \exp_stop_f:
\@@_replacement_put_submatch:n {#1}
\else:
- \exp_args:No \@@_replacement_normal:n
- { \token_to_str:N #1 }
+ \@@_replacement_normal:n {#1}
\fi:
}
}
diff --git a/l3kernel/testfiles/m3regex005.luatex.tlg b/l3kernel/testfiles/m3regex005.luatex.tlg
index 0e3cf39a3..32057e3e5 100644
--- a/l3kernel/testfiles/m3regex005.luatex.tlg
+++ b/l3kernel/testfiles/m3regex005.luatex.tlg
@@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens:
<recently read> }
l. ... }
The token list \l_tmpa_tl contains the tokens:
-> x (the character x)
+> x (the letter x)
> \c_parameter_token (control sequence=macro parameter character #)
-> x (the character x).
+> x (the letter x).
<recently read> }
l. ... }
============================================================
@@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs
TEST 11: Braces
============================================================
|\{}|
+The token list \l_tmpa_tl contains the tokens:
+> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.)
+> } (the letter }).
+<recently read> }
+l. ... }
! LaTeX3 Error: Missing right brace inserted in replacement text.
For immediate help type H <return>.
...
@@ -297,8 +302,8 @@ TEST 12: More tests of cs
TEST 13: Replaced space catcode
============================================================
blank space
-the character
-the character
+blank space
+blank space
============================================================
============================================================
TEST 14: Catcode group in replacement
@@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the
letters 'ABCDELMOPSTU' representing the character category. Then, a character
must follow, not '\1'.
The token list \l_tmpa_tl contains the tokens:
-> q (the character q)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the letter a)
@@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f)
-> q (the character q)
+> f (the letter f)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the character a)
@@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f).
+> f (the letter f).
<recently read> }
l. ... }
! LaTeX3 Error: Missing right parenthesis inserted in replacement text.
@@ -347,3 +352,30 @@ l. ... }
There were 2 missing right parentheses.
> \l_tmpb_tl=.
============================================================
+============================================================
+TEST 15: Catcode used by default
+============================================================
+\g__cctab_next_cctab=\catcodetable...
+The token list \l_tmpa_tl contains the tokens:
+> ^^M (the character ^^M)
+> ! (the character !)
+> @ (the character @)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> $ (math shift character $)
+> % (the character %)
+> $ (math shift character $)
+> ^ (superscript character ^)
+> & (alignment tab character &)
+> * (the character *)
+> { (begin-group character {)
+> (blank space )
+> } (end-group character })
+> : (the character :)
+> _ (subscript character _)
+> ~ (active character=macro:->\nobreakspace {})
+> \ (the character \).
+<recently read> }
+l. ... }
+============================================================
diff --git a/l3kernel/testfiles/m3regex005.lvt b/l3kernel/testfiles/m3regex005.lvt
index 9dd59873b..602a50efa 100644
--- a/l3kernel/testfiles/m3regex005.lvt
+++ b/l3kernel/testfiles/m3regex005.lvt
@@ -141,8 +141,9 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\TEST { Braces }
{
- \regex_replace_once:nnN { .* } { \c{{}} } \l_tmpa_tl
+ \regex_replace_once:nnN { .* } { \c{{}\cL} } \l_tmpa_tl
\TYPE { | \tl_to_str:N \l_tmpa_tl | }
+ \tl_analysis_show:N \l_tmpa_tl
\exp_args:Nnx \regex_replace_once:nnN
{ .* } { \iow_char:N\\c\iow_char:N\{ } \l_tmpa_tl
\TYPE { | \tl_to_str:N \l_tmpa_tl | }
@@ -188,5 +189,15 @@
\tl_log:N \l_tmpb_tl
}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\TEST { Catcode~used~by~default }
+ {
+ \cctab_begin:N \c_document_cctab
+ \tl_clear:N \l_tmpa_tl
+ \regex_replace_all:nnN { } { \x0d!@#\#$\%$^&*{\ }:_\~\\ } \l_tmpa_tl
+ \tl_analysis_show:N \l_tmpa_tl
+ \cctab_end:
+ }
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\END
diff --git a/l3kernel/testfiles/m3regex005.tlg b/l3kernel/testfiles/m3regex005.tlg
index 29d78f86a..cc6c44cba 100644
--- a/l3kernel/testfiles/m3regex005.tlg
+++ b/l3kernel/testfiles/m3regex005.tlg
@@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens:
<recently read> }
l. ... }
The token list \l_tmpa_tl contains the tokens:
-> x (the character x)
+> x (the letter x)
> \c_parameter_token (control sequence=macro parameter character #)
-> x (the character x).
+> x (the letter x).
<recently read> }
l. ... }
============================================================
@@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs
TEST 11: Braces
============================================================
|\{}|
+The token list \l_tmpa_tl contains the tokens:
+> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.)
+> } (the letter }).
+<recently read> }
+l. ... }
! LaTeX3 Error: Missing right brace inserted in replacement text.
For immediate help type H <return>.
...
@@ -297,8 +302,8 @@ TEST 12: More tests of cs
TEST 13: Replaced space catcode
============================================================
blank space
-the character
-the character
+blank space
+blank space
============================================================
============================================================
TEST 14: Catcode group in replacement
@@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the
letters 'ABCDELMOPSTU' representing the character category. Then, a character
must follow, not '\1'.
The token list \l_tmpa_tl contains the tokens:
-> q (the character q)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the letter a)
@@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f)
-> q (the character q)
+> f (the letter f)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the character a)
@@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f).
+> f (the letter f).
<recently read> }
l. ... }
! LaTeX3 Error: Missing right parenthesis inserted in replacement text.
@@ -347,3 +352,30 @@ l. ... }
There were 2 missing right parentheses.
> \l_tmpb_tl=.
============================================================
+============================================================
+TEST 15: Catcode used by default
+============================================================
+Defining \g__cctab_1_cctab on line ...
+The token list \l_tmpa_tl contains the tokens:
+> ^^M (the character ^^M)
+> ! (the character !)
+> @ (the character @)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> $ (math shift character $)
+> % (the character %)
+> $ (math shift character $)
+> ^ (superscript character ^)
+> & (alignment tab character &)
+> * (the character *)
+> { (begin-group character {)
+> (blank space )
+> } (end-group character })
+> : (the character :)
+> _ (subscript character _)
+> ~ (active character=macro:->\nobreakspace {})
+> \ (the character \).
+<recently read> }
+l. ... }
+============================================================
diff --git a/l3kernel/testfiles/m3regex005.xetex.tlg b/l3kernel/testfiles/m3regex005.xetex.tlg
index 0e3cf39a3..b74de5b57 100644
--- a/l3kernel/testfiles/m3regex005.xetex.tlg
+++ b/l3kernel/testfiles/m3regex005.xetex.tlg
@@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens:
<recently read> }
l. ... }
The token list \l_tmpa_tl contains the tokens:
-> x (the character x)
+> x (the letter x)
> \c_parameter_token (control sequence=macro parameter character #)
-> x (the character x).
+> x (the letter x).
<recently read> }
l. ... }
============================================================
@@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs
TEST 11: Braces
============================================================
|\{}|
+The token list \l_tmpa_tl contains the tokens:
+> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.)
+> } (the letter }).
+<recently read> }
+l. ... }
! LaTeX3 Error: Missing right brace inserted in replacement text.
For immediate help type H <return>.
...
@@ -297,8 +302,8 @@ TEST 12: More tests of cs
TEST 13: Replaced space catcode
============================================================
blank space
-the character
-the character
+blank space
+blank space
============================================================
============================================================
TEST 14: Catcode group in replacement
@@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the
letters 'ABCDELMOPSTU' representing the character category. Then, a character
must follow, not '\1'.
The token list \l_tmpa_tl contains the tokens:
-> q (the character q)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the letter a)
@@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f)
-> q (the character q)
+> f (the letter f)
+> q (the letter q)
> e (subscript character e)
> t (superscript character t)
> a (the character a)
@@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens:
> 1 (math shift character 1)
> p (superscript character p)
> s (subscript character s)
-> f (the character f).
+> f (the letter f).
<recently read> }
l. ... }
! LaTeX3 Error: Missing right parenthesis inserted in replacement text.
@@ -347,3 +352,30 @@ l. ... }
There were 2 missing right parentheses.
> \l_tmpb_tl=.
============================================================
+============================================================
+TEST 15: Catcode used by default
+============================================================
+Defining \g__cctab_1_cctab on line ...
+The token list \l_tmpa_tl contains the tokens:
+> ^^M (the character ^^M)
+> ! (the character !)
+> @ (the character @)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> # (macro parameter character #)
+> $ (math shift character $)
+> % (the character %)
+> $ (math shift character $)
+> ^ (superscript character ^)
+> & (alignment tab character &)
+> * (the character *)
+> { (begin-group character {)
+> (blank space )
+> } (end-group character })
+> : (the character :)
+> _ (subscript character _)
+> ~ (active character=macro:->\nobreakspace {})
+> \ (the character \).
+<recently read> }
+l. ... }
+============================================================
More information about the latex3-commits
mailing list.