[latex3-commits] [l3svn] r6646 - Add complete list of tokens at the end of l3token
noreply at latex-project.org
noreply at latex-project.org
Sat Aug 13 23:35:41 CEST 2016
Author: bruno
Date: 2016-08-13 23:35:41 +0200 (Sat, 13 Aug 2016)
New Revision: 6646
Modified:
trunk/l3kernel/l3tl.dtx
trunk/l3kernel/l3token.dtx
Log:
Add complete list of tokens at the end of l3token
I've used the words "meaning" and "shape" to distinguish
what's compared by \token_if_eq_meaning:NNTF ... ... and
by \tl_if_eq:nnTF {...}{...} (or delimited arguments).
Better names than "shape" welcome.
Rather than an incomplete list at the start I put a
link pointing to a comprehensive one in the last section.
Modified: trunk/l3kernel/l3tl.dtx
===================================================================
--- trunk/l3kernel/l3tl.dtx 2016-08-13 20:23:45 UTC (rev 6645)
+++ trunk/l3kernel/l3tl.dtx 2016-08-13 21:35:41 UTC (rev 6646)
@@ -105,30 +105,6 @@
% Functions which act on items are often faster than their analogue acting
% directly on tokens.
%
-% ^^A todo: perhaps move to another module, l3token or l3basics?
-% \begin{texnote}
-% When \TeX{} fetches an undelimited argument from the input stream,
-% explicit character tokens with character code $32$ (space) and
-% category code $10$ (space), which we here call \enquote{explicit
-% space characters}, are ignored. If the following token is an
-% explicit character token with category code $1$ (begin-group) and an
-% arbitrary character code, then \TeX{} scans ahead to obtain an equal
-% number of explicit character tokens with category code $1$
-% (begin-group) and $2$ (end-group), and the resulting list of tokens
-% (with outer braces removed) becomes the argument. Otherwise, a
-% single token is taken as the argument for the macro: we call such
-% single tokens \enquote{N-type}, as they are suitable to be used as
-% an argument for a function with the signature~\texttt{:N}.
-%
-% When \TeX{} reads a character of category code $10$ for the first
-% time, it is converted to an explicit space character, with character
-% code $32$, regardless of the initial character code.
-% \enquote{Funny} spaces with a different category code, can be
-% produced using \tn{tex_lowercase:D} or \tn{tex_uppercase:D}.
-% Explicit space characters are also produced as a result of
-% \cs{token_to_str:N}, \cs{tl_to_str:n}, etc.
-% \end{texnote}
-%
% \section{Creating and initialising token list variables}
%
% \begin{function}{\tl_new:N, \tl_new:c}
Modified: trunk/l3kernel/l3token.dtx
===================================================================
--- trunk/l3kernel/l3token.dtx 2016-08-13 20:23:45 UTC (rev 6645)
+++ trunk/l3kernel/l3token.dtx 2016-08-13 21:35:41 UTC (rev 6646)
@@ -82,28 +82,32 @@
% categories: |\token_| for anything that deals with tokens and
% |\peek_| for looking ahead in the token stream.
%
-% Most of the time we will be using the term \enquote{token} but most of the
-% time the function we're describing can equally well by used on a
-% control sequence as such one is one token as well.
+% Most functions we will describe here can be used on control sequences,
+% as those are tokens as well.
%
-% \section{All possible tokens}
+% It is important to distinguish two aspects of a token: its
+% \enquote{shape} (for lack of a better word), which affects the
+% matching of delimited arguments and the comparison of token lists
+% containing this token, and its \enquote{meaning}, which affects
+% whether the token expands or what operation it performs. One can have
+% tokens of different shapes with the same meaning, but not the
+% converse.
%
-% Let us start by reviewing every case that a given token can fall into.
-% It is very important to distinguish two aspects of a token: its meaning,
-% and what it looks like.
-%
% For instance, \cs{if:w}, \cs{if_charcode:w}, and \cs{tex_if:D} are
-% three names for the same internal operation of \TeX{}, namely the primitive
-% testing the next two characters for equality of their character code.
-% They behave identically in many situations. However, \TeX{}
-% distinguishes them when searching for a delimited argument. Namely, the
-% example function |\show_until_if:w| defined below will take everything
-% until \cs{if:w} as an argument, despite the presence of other copies of
-% \cs{if:w} under different names.
+% three names for the same internal operation of \TeX{}, namely the
+% primitive testing the next two characters for equality of their
+% character code. They have the same meaning hence behave identically
+% in many situations. However, \TeX{} distinguishes them when searching
+% for a delimited argument. Namely, the example function
+% |\show_until_if:w| defined below will take everything until \cs{if:w}
+% as an argument, despite the presence of other copies of \cs{if:w}
+% under different names.
% \begin{verbatim}
% \cs_new:Npn \show_until_if:w #1 \if:w { \tl_show:n {#1} }
% \show_until_if:w \tex_if:D \if_charcode:w \if:w
% \end{verbatim}
+% A list of all possible shapes and a list of all possible meanings are
+% given in section~\ref{sec:l3token:all-tokens}.
%
% \section{Creating character tokens}
%
@@ -1007,6 +1011,117 @@
% not a macro then \cs{scan_stop:} will be left in the input stream
% \end{function}
%
+% \section{Description of all possible tokens}
+% \label{sec:l3token:all-tokens}
+%
+% Let us end by reviewing every case that a given token can fall into.
+% This section is quite technical and some details are only meant for
+% completeness. We distinguish the meaning of the token, which controls
+% the expansion of the token and its effect on \TeX{}'s state, and its
+% shape, which is used when comparing token lists such as for delimited
+% arguments. Two tokens of the same shape must have the same meaning,
+% but the converse does not hold.
+%
+% A token has one of the following shapes.
+% \begin{itemize}
+% \item A control sequence, characterized by the sequence of
+% characters that constitute its name: for instance, \cs{use:n} is a
+% five-letter control sequence.
+% \item An active character token, characterized by its character code
+% (between $0$ and $1114111$ for \LuaTeX{} and \XeTeX{} and less for
+% other engines) and category code~$13$.
+% \item A character token, characterized by its character code and
+% category code (one of $1$, $2$, $3$, $4$, $6$, $7$, $8$, $10$,
+% $11$ or~$12$ whose meaning is described below).\footnote{In
+% \LuaTeX{}, there is also the case of ``bytes'', which behave as
+% character tokens of category code $12$~(other) and character code
+% between $1114112$ and~$1114366$. They are used to output
+% individual bytes to files, rather than UTF-8.}
+% \end{itemize}
+% There are also a few internal tokens. The following list may be
+% incomplete in some engines.
+% \begin{itemize}
+% \item Expanding \tn{the}\tn{font} results in a token that looks
+% identical to the command that was used to select the current font
+% (such as \tn{tenrm}) but it differs from it in shape.
+% \item A ``frozen'' |\relax|, which differs from the primitive in
+% both shape and meaning, is inserted when the closing \tn{fi} of a
+% conditional is encountered before the conditional is evaluated.
+% \item An |\endtemplate| (which expands to |\outer endtemplate:|)
+% can be encountered when peeking ahead at the next token.
+% \item Tricky programming might access a frozen |\endwrite|.
+% \item Some frozen tokens can only be accessed in interactive
+% sessions: |\cr|, |\right|, |\endgroup|, |\fi|, |\inaccessible|.
+% \end{itemize}
+%
+% The meaning of a (non-active) character token is fixed by its category
+% code (and character code) and cannot be changed. We will call these
+% tokens \emph{explicit} character tokens. Category codes that a
+% character token can have are listed below by giving a sample output of
+% the \TeX{} primitive \tn{meaning}, together with their \LaTeX3 names
+% and most common example:
+% \begin{itemize}
+% \item[1] begin-group character (|group_begin|, often |{|),
+% \item[2] end-group character (|group_end|, often |}|),
+% \item[3] math shift character (|math_toggle|, often |$|),
+% \item[4] alignment tab character (|alignment|, often |&|),
+% \item[6] macro parameter character (|parameter|, often |#|),
+% \item[7] superscript character (|math_superscript|, often |^|),
+% \item[8] subscript character (|math_subscript|, often |_|),
+% \item[10] blank space (|space|, often character code~$32$),
+% \item[11] the letter (|letter|, such as |A|),
+% \item[12] the character (|other|, such as |0|).
+% \end{itemize}
+% Category code~$13$ (|active|) is discussed below. Input characters
+% can also have several other category codes which do not lead to
+% character tokens for later processing: $0$~(|escape|),
+% $5$~(|end_line|), $9$~(|ignore|), $14$~(|comment|), and
+% $15$~(|invalid|).
+%
+% The meaning of a control sequence or active character can be identical
+% to that of any character token listed above (with any character code),
+% and we will call such tokens \emph{implicit} character tokens. The
+% meaning is otherwise in the following list:
+% \begin{itemize}
+% \item a macro, used in \LaTeX3 for most functions and some variables
+% (|tl|, |fp|, |seq|, \ldots{}),
+% \item a primitive such as \tn{def} or \tn{topmark}, used in \LaTeX3
+% for some functions,
+% \item a register such as \tn{count}|123|, used in \LaTeX3{} for the
+% implementation of some variables (|int|, |dim|, \ldots{}),
+% \item a constant integer such as \tn{char}|"56| or \tn{mathchar}|"121|,
+% \item a font selection command,
+% \item undefined.
+% \end{itemize}
+% Macros can be \tn{protected} or not, \tn{long} or not (the opposite of
+% what \LaTeX3 calls |nopar|), and \tn{outer} or not (unused in
+% \LaTeX3). Their \tn{meaning} takes the form
+% \begin{quote}
+% \meta{properties} |macro:|\meta{parameters}|->|\meta{replacement}
+% \end{quote}
+% where \meta{properties} is among \tn{protected}\tn{long}\tn{outer},
+% \meta{parameters} describes parameters that the macro expects, such as
+% |#1#2#3|, and \meta{replacement} describes how the parameters are
+% manipulated, such as~|#2/#1/#3|.
+%
+% ^^A todo Bruno: discuss here some other subtleties of space tokens? when looking for numbers, when looking for equal signs in let, in expressions, etc.
+%
+% Now is perhaps a good time to mention some subtleties relating to
+% tokens with category code $10$ (space). Any input character with this
+% category code (normally, space and tab characters) becomes a normal
+% space, with character code~$32$ and category code~$10$.
+%
+% When a macro takes an undelimited argument, explicit space characters
+% (with character code $32$ and category code $10$) are ignored. If the
+% following token is an explicit character token with category code $1$
+% (begin-group) and an arbitrary character code, then \TeX{} scans ahead
+% to obtain an equal number of explicit character tokens with category
+% code $1$ (begin-group) and $2$ (end-group), and the resulting list of
+% tokens (with outer braces removed) becomes the argument. Otherwise, a
+% single token is taken as the argument for the macro: we call such
+% single tokens \enquote{N-type}, as they are suitable to be used as an
+% argument for a function with the signature~\texttt{:N}.
+%
% \section{Internal functions}
%
% \begin{function}[EXP, added = 2016-03-25]{\__char_generate:nn}
@@ -1016,7 +1131,8 @@
% This function is identical in operation to the public
% \cs{char_generate:nn} but omits various sanity tests. In particular, this
% means it is used in certain places where engine variations need to be
-% accounted for by the kernel.
+% accounted for by the kernel. The \meta{catcode} must give an explicit
+% integer after a single expansion.
% \end{function}
%
% \end{documentation}
@@ -1033,7 +1149,7 @@
%<@@=char>
% \end{macrocode}
%
-% \section{Manipulating and interrogating character tokens}
+% \subsection{Manipulating and interrogating character tokens}
%
% \begin{macro}{\char_set_catcode:nn}
% \begin{macro}{\char_value_catcode:n}
@@ -1245,10 +1361,10 @@
% \end{macrocode}
% \end{variable}
%
-% \section{Creating character tokens}
+% \subsection{Creating character tokens}
%
% \begin{macro}
-% {^^A
+% {
% \char_set_active_eq:NN, \char_gset_active_eq:NN,
% \char_set_active_eq:Nc, \char_gset_active_eq:Nc,
% \char_set_active_eq:nN, \char_gset_active_eq:nN,
More information about the latex3-commits
mailing list