[latex3-commits] [git/LaTeX3-latex3-latex3] main: Reorganize l3regex doc a bit [ci skip] (5a344d84e)

Tue Apr 27 15:56:19 CEST 2021

Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/5a344d84e5ab0fdb8771de612667edf36307079c

>---------------------------------------------------------------

commit 5a344d84e5ab0fdb8771de612667edf36307079c
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Mon Apr 26 23:14:42 2021 +0200

    Reorganize l3regex doc a bit [ci skip]


>---------------------------------------------------------------

5a344d84e5ab0fdb8771de612667edf36307079c
 l3kernel/l3regex.dtx | 120 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 54 deletions(-)

diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index f8c64e401..166574165 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -98,6 +98,8 @@
 %
 % \section{Syntax of regular expressions}
 %
+% \subsection{Regex examples}
+%
 % We start with a few examples, and encourage the reader to apply
 % \cs{regex_show:n} to these regular expressions.
 % \begin{itemize}
@@ -147,6 +149,8 @@
 % other things all valid integer expressions (made only with explicit
 % integers).  One should follow it with further testing.
 %
+% \subsection{Characters in regular expressions}
+%
 % Most characters match exactly themselves,
 % with an arbitrary category code. Some characters are
 % special and must be escaped with a backslash (\emph{e.g.}, |\*|
@@ -191,6 +195,8 @@
 %   \item[\\t] Horizontal tab (hex 09).
 % \end{l3regex-syntax}
 %
+% \subsection{Character classes}
+%
 % Character types.
 % \begin{l3regex-syntax}
 %   \item[.] A single period matches any token.
@@ -235,6 +241,21 @@
 % except |p|, as well as control sequences (see below for a description
 % of |\c|).
 %
+% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
+% special, and should be escaped. Other non-alphanumeric characters can
+% still be escaped without harm. Any escape sequence which matches a
+% single character (|\d|, |\D|, \emph{etc.}) is supported in character
+% classes.  If the first character is |^|, then
+% the meaning of the character class is inverted; |^| appearing anywhere
+% else in the range is not special.  If the first character (possibly
+% following a leading |^|) is |]| then it does not need to be escaped
+% since ending the range there would make it empty.
+% Ranges of characters
+% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
+% equivalent.
+%
+% \subsection{Structure: alternatives, groups, repetitions}
+%
 % Quantifiers (repetition).
 % \begin{l3regex-syntax}
 %   \item[?] $0$ or $1$, greedy.
@@ -250,24 +271,6 @@
 %   \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy.
 % \end{l3regex-syntax}
 %
-% Anchors and simple assertions.
-% \begin{l3regex-syntax}
-%   \item[\\b] Word boundary: either the previous token is matched by
-%     |\w| and the next by |\W|, or the opposite. For this purpose,
-%     the ends of the token list are considered as |\W|.
-%   \item[\\B] Not a word boundary: between two |\w| tokens
-%     or two |\W| tokens (including the boundary).
-%   \item[\char`^ \textrm{or} \\A]
-%     Start of the subject token list.
-%   \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
-%     End of the subject token list.
-%   \item[\\G] Start of the current match. This is only different from |^|
-%     in the case of multiple matches: for instance
-%     |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
-%     replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
-%     value $1$.
-% \end{l3regex-syntax}
-%
 % Alternation and capturing groups.
 % \begin{l3regex-syntax}
 %   \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B},
@@ -280,6 +283,31 @@
 %     group number.
 % \end{l3regex-syntax}
 %
+% Capturing groups are a means of extracting information about the
+% match. Parenthesized groups are labelled in the order of their
+% opening parenthesis, starting at $1$. The contents of those groups
+% corresponding to the \enquote{best} match (leftmost longest)
+% can be extracted and stored in a sequence of token lists using for
+% instance \cs{regex_extract_once:nnNTF}.
+%
+% The |\K| escape sequence resets the beginning of the match to the
+% current position in the token list. This only affects what is reported
+% as the full match. For instance,
+% \begin{verbatim}
+%   \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
+% \end{verbatim}
+% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
+% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
+% |\K|. The |\K| command does not affect capturing groups: for instance,
+% \begin{verbatim}
+%   \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
+% \end{verbatim}
+% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
+% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
+% the beginning of the match to the last position where it appears.
+%
+% \subsection{Matching exact tokens}
+%
 % The |\c| escape sequence allows to test the category code of tokens,
 % and match control sequences. Each character category is represented
 % by a single uppercase letter:
@@ -351,6 +379,26 @@
 % |\ur{l_item_regex}+| matches one or more \enquote{words} separated by
 % optional commas.
 %
+% \subsection{Miscellaneous}
+%
+% Anchors and simple assertions.
+% \begin{l3regex-syntax}
+%   \item[\\b] Word boundary: either the previous token is matched by
+%     |\w| and the next by |\W|, or the opposite. For this purpose,
+%     the ends of the token list are considered as |\W|.
+%   \item[\\B] Not a word boundary: between two |\w| tokens
+%     or two |\W| tokens (including the boundary).
+%   \item[\char`^ \textrm{or} \\A]
+%     Start of the subject token list.
+%   \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
+%     End of the subject token list.
+%   \item[\\G] Start of the current match. This is only different from |^|
+%     in the case of multiple matches: for instance
+%     |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
+%     replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
+%     value $1$.
+% \end{l3regex-syntax}
+%
 % The option |(?i)| makes the match case insensitive (identifying
 % \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
 % yet). This applies until the end of the group in which it appears, and
@@ -363,42 +411,6 @@
 % |i| option.
 % ^^A \]
 %
-% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
-% special, and should be escaped. Other non-alphanumeric characters can
-% still be escaped without harm. Any escape sequence which matches a
-% single character (|\d|, |\D|, \emph{etc.}) is supported in character
-% classes.  If the first character is |^|, then
-% the meaning of the character class is inverted; |^| appearing anywhere
-% else in the range is not special.  If the first character (possibly
-% following a leading |^|) is |]| then it does not need to be escaped
-% since ending the range there would make it empty.
-% Ranges of characters
-% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
-% equivalent.
-%
-% Capturing groups are a means of extracting information about the
-% match. Parenthesized groups are labelled in the order of their
-% opening parenthesis, starting at $1$. The contents of those groups
-% corresponding to the \enquote{best} match (leftmost longest)
-% can be extracted and stored in a sequence of token lists using for
-% instance \cs{regex_extract_once:nnNTF}.
-%
-% The |\K| escape sequence resets the beginning of the match to the
-% current position in the token list. This only affects what is reported
-% as the full match. For instance,
-% \begin{verbatim}
-%   \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
-% \end{verbatim}
-% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
-% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
-% |\K|. The |\K| command does not affect capturing groups: for instance,
-% \begin{verbatim}
-%   \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
-% \end{verbatim}
-% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
-% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
-% the beginning of the match to the last position where it appears.
-%
 % \section{Syntax of the replacement text}
 %
 % Most of the features described in regular expressions do not make