[latex3-commits] [git/LaTeX3-latex3-latex3] gh590: Reorganize l3regex doc a bit [ci skip] (4308ea8e3)
Bruno Le Floch
blflatex at gmail.com
Mon Apr 26 23:14:42 CEST 2021
Repository : https://github.com/latex3/latex3
On branch : gh590
Link : https://github.com/latex3/latex3/commit/4308ea8e30808bcda5235337aa4d1fc31c4f81bd
>---------------------------------------------------------------
commit 4308ea8e30808bcda5235337aa4d1fc31c4f81bd
Author: Bruno Le Floch <blflatex at gmail.com>
Date: Mon Apr 26 23:14:42 2021 +0200
Reorganize l3regex doc a bit [ci skip]
>---------------------------------------------------------------
4308ea8e30808bcda5235337aa4d1fc31c4f81bd
l3kernel/l3regex.dtx | 120 ++++++++++++++++++++++++++++-----------------------
1 file changed, 66 insertions(+), 54 deletions(-)
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 7973ee431..09343f03f 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -98,6 +98,8 @@
%
% \section{Syntax of regular expressions}
%
+% \subsection{Regex examples}
+%
% We start with a few examples, and encourage the reader to apply
% \cs{regex_show:n} to these regular expressions.
% \begin{itemize}
@@ -147,6 +149,8 @@
% other things all valid integer expressions (made only with explicit
% integers). One should follow it with further testing.
%
+% \subsection{Characters in regular expressions}
+%
% Most characters match exactly themselves,
% with an arbitrary category code. Some characters are
% special and must be escaped with a backslash (\emph{e.g.}, |\*|
@@ -191,6 +195,8 @@
% \item[\\t] Horizontal tab (hex 09).
% \end{l3regex-syntax}
%
+% \subsection{Character classes}
+%
% Character types.
% \begin{l3regex-syntax}
% \item[.] A single period matches any token.
@@ -235,6 +241,21 @@
% except |p|, as well as control sequences (see below for a description
% of |\c|).
%
+% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
+% special, and should be escaped. Other non-alphanumeric characters can
+% still be escaped without harm. Any escape sequence which matches a
+% single character (|\d|, |\D|, \emph{etc.}) is supported in character
+% classes. If the first character is |^|, then
+% the meaning of the character class is inverted; |^| appearing anywhere
+% else in the range is not special. If the first character (possibly
+% following a leading |^|) is |]| then it does not need to be escaped
+% since ending the range there would make it empty.
+% Ranges of characters
+% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
+% equivalent.
+%
+% \subsection{Structure: alternatives, groups, repetitions}
+%
% Quantifiers (repetition).
% \begin{l3regex-syntax}
% \item[?] $0$ or $1$, greedy.
@@ -250,24 +271,6 @@
% \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy.
% \end{l3regex-syntax}
%
-% Anchors and simple assertions.
-% \begin{l3regex-syntax}
-% \item[\\b] Word boundary: either the previous token is matched by
-% |\w| and the next by |\W|, or the opposite. For this purpose,
-% the ends of the token list are considered as |\W|.
-% \item[\\B] Not a word boundary: between two |\w| tokens
-% or two |\W| tokens (including the boundary).
-% \item[\char`^ \textrm{or} \\A]
-% Start of the subject token list.
-% \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
-% End of the subject token list.
-% \item[\\G] Start of the current match. This is only different from |^|
-% in the case of multiple matches: for instance
-% |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
-% replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
-% value $1$.
-% \end{l3regex-syntax}
-%
% Alternation and capturing groups.
% \begin{l3regex-syntax}
% \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B},
@@ -280,6 +283,31 @@
% group number.
% \end{l3regex-syntax}
%
+% Capturing groups are a means of extracting information about the
+% match. Parenthesized groups are labelled in the order of their
+% opening parenthesis, starting at $1$. The contents of those groups
+% corresponding to the \enquote{best} match (leftmost longest)
+% can be extracted and stored in a sequence of token lists using for
+% instance \cs{regex_extract_once:nnNTF}.
+%
+% The |\K| escape sequence resets the beginning of the match to the
+% current position in the token list. This only affects what is reported
+% as the full match. For instance,
+% \begin{verbatim}
+% \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
+% \end{verbatim}
+% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
+% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
+% |\K|. The |\K| command does not affect capturing groups: for instance,
+% \begin{verbatim}
+% \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
+% \end{verbatim}
+% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
+% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
+% the beginning of the match to the last position where it appears.
+%
+% \subsection{Matching exact tokens}
+%
% The |\c| escape sequence allows testing the category code of tokens,
% and matching control sequences. Each character category is represented
% by a single uppercase letter:
@@ -351,6 +379,26 @@
% |\ur{l_item_regex}+| matches one or more \enquote{words} separated by
% optional commas.
%
+% \subsection{Miscellaneous}
+%
+% Anchors and simple assertions.
+% \begin{l3regex-syntax}
+% \item[\\b] Word boundary: either the previous token is matched by
+% |\w| and the next by |\W|, or the opposite. For this purpose,
+% the ends of the token list are considered as |\W|.
+% \item[\\B] Not a word boundary: between two |\w| tokens
+% or two |\W| tokens (including the boundary).
+% \item[\char`^ \textrm{or} \\A]
+% Start of the subject token list.
+% \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
+% End of the subject token list.
+% \item[\\G] Start of the current match. This is only different from |^|
+% in the case of multiple matches: for instance
+% |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
+% replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
+% value $1$.
+% \end{l3regex-syntax}
+%
% The option |(?i)| makes the match case insensitive (identifying
% \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
% yet). This applies until the end of the group in which it appears, and
@@ -363,42 +411,6 @@
% |i| option.
% ^^A \]
%
-% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
-% special, and should be escaped. Other non-alphanumeric characters can
-% still be escaped without harm. Any escape sequence which matches a
-% single character (|\d|, |\D|, \emph{etc.}) is supported in character
-% classes. If the first character is |^|, then
-% the meaning of the character class is inverted; |^| appearing anywhere
-% else in the range is not special. If the first character (possibly
-% following a leading |^|) is |]| then it does not need to be escaped
-% since ending the range there would make it empty.
-% Ranges of characters
-% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
-% equivalent.
-%
-% Capturing groups are a means of extracting information about the
-% match. Parenthesized groups are labelled in the order of their
-% opening parenthesis, starting at $1$. The contents of those groups
-% corresponding to the \enquote{best} match (leftmost longest)
-% can be extracted and stored in a sequence of token lists using for
-% instance \cs{regex_extract_once:nnNTF}.
-%
-% The |\K| escape sequence resets the beginning of the match to the
-% current position in the token list. This only affects what is reported
-% as the full match. For instance,
-% \begin{verbatim}
-% \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
-% \end{verbatim}
-% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
-% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
-% |\K|. The |\K| command does not affect capturing groups: for instance,
-% \begin{verbatim}
-% \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
-% \end{verbatim}
-% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
-% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
-% the beginning of the match to the last position where it appears.
-%
% \section{Syntax of the replacement text}
%
% Most of the features described in regular expressions do not make
More information about the latex3-commits
mailing list.